27 #include "../common/assert.hpp"
37 namespace clustering {
38 namespace compressor {
42 struct compare_by_first {
43 template <
typename T,
typename S>
45 const std::pair<T, S>& p1,
46 const std::pair<T, S>& p2)
const {
47 return p1.first < p2.first;
53 const std::vector<double>& scores,
54 std::vector<T>& array,
58 "lengths of scores and data must be same.");
60 std::vector<std::pair<double, T> > pairs(scores.size());
61 for (
size_t i = 0; i < scores.size(); ++i) {
62 pairs[i].first = scores[i];
63 swap(pairs[i].second, array[i]);
65 size = std::min(size, pairs.size());
66 std::partial_sort(pairs.begin(),
71 for (
size_t i = 0; i < size; ++i) {
72 swap(pairs[i].second, array[i]);
76 void bicriteria_as_coreset(
83 typedef wplist::const_iterator citer;
84 typedef wplist::iterator iter;
85 bic.resize(dstsize - dst.size());
86 for (iter it = bic.begin(); it != bic.end(); ++it) {
89 for (citer it = dst.begin(); it != dst.end(); ++it) {
90 pair<int, double> m =
min_dist(*it, bic);
91 bic[m.first].weight -= it->weight;
93 for (citer it = src.begin(); it != src.end(); ++it) {
94 pair<int, double> m =
min_dist(*it, bic);
95 bic[m.first].weight += it->weight;
97 dst.insert(dst.end(), bic.begin(), bic.end());
98 for (iter it = dst.begin(); it != dst.end(); ++it) {
119 if (dstsize >= src.size()) {
125 if (bicriteria.size() < dstsize) {
130 dstsize - bicriteria.size(),
133 bicriteria_as_coreset(src, bicriteria, dstsize, dst);
143 vector<double> weights(src.size());
145 (1 - std::exp(bsize * (std::log(bsize) - std::log(src.size())) / dstsize))
148 std::vector<size_t> ind(bsize);
149 while (resid.size() > 1 && dst.size() < dstsize) {
150 weights.resize(resid.size());
151 for (wplist::const_iterator it = resid.begin(); it != resid.end(); ++it) {
152 weights[it - resid.begin()] = it->weight;
157 std::generate(ind.begin(), ind.end(), d);
159 std::sort(ind.begin(), ind.end());
160 ind.erase(std::unique(ind.begin(), ind.end()), ind.end());
162 for (std::vector<size_t>::iterator it = ind.begin();
163 it != ind.end(); ++it) {
164 dst.push_back(resid[*it]);
168 std::vector<double> distances;
169 for (wplist::iterator itr = resid.begin(); itr != resid.end(); ++itr) {
170 distances.push_back(-
min_dist(*itr, dst).second);
173 size_t size = std::min(resid.size(),
174 static_cast<size_t>(resid.size() * r));
179 partial_sort_by(distances, resid, size);
189 double squared_min_dist_sum) {
190 return std::ceil(weight_sum * (
191 min_dist * min_dist * p.
weight / squared_min_dist_sum)) + 1;
199 if (bicriteria.size() == 0) {
203 double weight_sum = 0;
204 double squared_min_dist_sum = 0;
205 std::vector<size_t> nearest_indexes(src.size());
206 std::vector<double> nearest_distances(src.size());
207 std::vector<double> bicriteria_scores(bicriteria.size());
208 for (
size_t i = 0; i < src.size(); ++i) {
209 double weight = src[i].weight;
210 std::pair<int, double> m =
min_dist(src[i], bicriteria);
211 nearest_indexes[i] = m.first;
212 nearest_distances[i] = m.second;
214 bicriteria_scores[m.first] += weight;
215 squared_min_dist_sum += m.second * m.second * weight;
216 weight_sum += weight;
218 squared_min_dist_sum = std::max(squared_min_dist_sum,
219 std::numeric_limits<double>::min());
220 std::vector<double> weights;
222 for (
size_t i = 0; i < src.size(); ++i) {
225 nearest_distances[i],
226 bicriteria[nearest_indexes[i]],
227 bicriteria_scores[nearest_indexes[i]],
229 squared_min_dist_sum);
230 weights.push_back(prob);
236 std::vector<size_t> ind(dstsize);
237 std::generate(ind.begin(), ind.end(), d);
239 for (std::vector<size_t>::iterator it = ind.begin(); it != ind.end(); ++it) {
242 double prob = weights[index] / sumw;
243 sample.
weight *= 1.0 / dstsize / prob;
244 dst.push_back(sample);
#define JUBATUS_ASSERT_EQ(a, b, messages)
void compress(const wplist &src, size_t bsize, size_t dstsize, wplist &dst)
void get_bicriteria(const wplist &src, size_t bsize, size_t dstsize, wplist &dst)
kmeans_compressor(const clustering_config &cfg)
void concat(const wplist &src, wplist &dst)
pair< size_t, double > min_dist(const common::sfv_t &p, const vector< common::sfv_t > &P)
#define JUBATUS_ASSERT_GE(a, b, messages)
virtual double get_probability(const weighted_point &p, double min_dist, const weighted_point &nearest_bp, double bp_score, double weight_sum, double squared_min_dist_sum)
void swap(weighted_point &p1, weighted_point &p2)
void bicriteria_to_coreset(const wplist &src, const wplist &bicriteria, size_t dstsize, wplist &dst)
std::vector< weighted_point > wplist