jubatus_core  0.1.2
Jubatus: Online machine learning framework for distributed environment
Public Member Functions | Private Member Functions | List of all members
jubatus::core::clustering::compressor::kmeans_compressor Class Reference

#include <kmeans_compressor.hpp>

Inheritance diagram for jubatus::core::clustering::compressor::kmeans_compressor:
Inheritance graph
Collaboration diagram for jubatus::core::clustering::compressor::kmeans_compressor:
Collaboration graph

Public Member Functions

void compress (const wplist &src, size_t bsize, size_t dstsize, wplist &dst)
 
virtual double get_probability (const weighted_point &p, double min_dist, const weighted_point &nearest_bp, double bp_score, double weight_sum, double squared_min_dist_sum)
 
 kmeans_compressor (const clustering_config &cfg)
 
 ~kmeans_compressor ()
 
- Public Member Functions inherited from jubatus::core::clustering::compressor::compressor
 compressor (const clustering_config &cfg)
 
 MSGPACK_DEFINE (config_)
 
virtual ~compressor ()
 

Private Member Functions

void bicriteria_to_coreset (const wplist &src, const wplist &bicriteria, size_t dstsize, wplist &dst)
 
void get_bicriteria (const wplist &src, size_t bsize, size_t dstsize, wplist &dst)
 

Detailed Description

Definition at line 27 of file kmeans_compressor.hpp.

Constructor & Destructor Documentation

jubatus::core::clustering::compressor::kmeans_compressor::kmeans_compressor ( const clustering_config cfg)
explicit

Definition at line 107 of file kmeans_compressor.cpp.

108  : compressor(cfg) {
109 }
compressor(const clustering_config &cfg)
Definition: compressor.hpp:32
jubatus::core::clustering::compressor::kmeans_compressor::~kmeans_compressor ( )

Definition at line 111 of file kmeans_compressor.cpp.

111  {
112 }

Member Function Documentation

void jubatus::core::clustering::compressor::kmeans_compressor::bicriteria_to_coreset ( const wplist src,
const wplist bicriteria,
size_t  dstsize,
wplist dst 
)
private

Definition at line 194 of file kmeans_compressor.cpp.

References get_probability(), jubatus::core::clustering::min_dist(), and jubatus::core::clustering::weighted_point::weight.

Referenced by compress().

198  {
199  if (bicriteria.size() == 0) {
200  dst = src;
201  return;
202  }
203  double weight_sum = 0;
204  double squared_min_dist_sum = 0;
205  std::vector<size_t> nearest_indexes(src.size());
206  std::vector<double> nearest_distances(src.size());
207  std::vector<double> bicriteria_scores(bicriteria.size());
208  for (size_t i = 0; i < src.size(); ++i) {
209  double weight = src[i].weight;
210  std::pair<int, double> m = min_dist(src[i], bicriteria);
211  nearest_indexes[i] = m.first;
212  nearest_distances[i] = m.second;
213 
214  bicriteria_scores[m.first] += weight;
215  squared_min_dist_sum += m.second * m.second * weight;
216  weight_sum += weight;
217  }
218  squared_min_dist_sum = std::max(squared_min_dist_sum,
219  std::numeric_limits<double>::min());
220  std::vector<double> weights;
221  double sumw = 0;
222  for (size_t i = 0; i < src.size(); ++i) {
223  double prob = get_probability(
224  src[i],
225  nearest_distances[i],
226  bicriteria[nearest_indexes[i]],
227  bicriteria_scores[nearest_indexes[i]],
228  weight_sum,
229  squared_min_dist_sum);
230  weights.push_back(prob);
231  sumw += prob;
232  }
233 
234  discrete_distribution d(weights.begin(), weights.end());
235 
236  std::vector<size_t> ind(dstsize);
237  std::generate(ind.begin(), ind.end(), d);
238 
239  for (std::vector<size_t>::iterator it = ind.begin(); it != ind.end(); ++it) {
240  size_t index = *it;
241  weighted_point sample = src[index];
242  double prob = weights[index] / sumw;
243  sample.weight *= 1.0 / dstsize / prob;
244  dst.push_back(sample);
245  }
246 }
pair< size_t, double > min_dist(const common::sfv_t &p, const vector< common::sfv_t > &P)
Definition: util.cpp:182
virtual double get_probability(const weighted_point &p, double min_dist, const weighted_point &nearest_bp, double bp_score, double weight_sum, double squared_min_dist_sum)

Here is the call graph for this function:

Here is the caller graph for this function:

void jubatus::core::clustering::compressor::kmeans_compressor::compress ( const wplist src,
size_t  bsize,
size_t  dstsize,
wplist dst 
)
virtual

Implements jubatus::core::clustering::compressor::compressor.

Definition at line 114 of file kmeans_compressor.cpp.

References bicriteria_to_coreset(), jubatus::core::clustering::concat(), and get_bicriteria().

118  {
119  if (dstsize >= src.size()) {
120  concat(src, dst);
121  return;
122  }
123  wplist bicriteria;
124  get_bicriteria(src, bsize, dstsize, bicriteria);
125  if (bicriteria.size() < dstsize) {
126  wplist srcclone = src;
128  srcclone,
129  bicriteria,
130  dstsize - bicriteria.size(),
131  dst);
132  }
133  bicriteria_as_coreset(src, bicriteria, dstsize, dst);
134 }
void get_bicriteria(const wplist &src, size_t bsize, size_t dstsize, wplist &dst)
void concat(const wplist &src, wplist &dst)
Definition: util.cpp:33
void bicriteria_to_coreset(const wplist &src, const wplist &bicriteria, size_t dstsize, wplist &dst)
std::vector< weighted_point > wplist
Definition: types.hpp:55

Here is the call graph for this function:

void jubatus::core::clustering::compressor::kmeans_compressor::get_bicriteria ( const wplist src,
size_t  bsize,
size_t  dstsize,
wplist dst 
)
private

Definition at line 136 of file kmeans_compressor.cpp.

References jubatus::core::clustering::min_dist().

Referenced by compress().

140  {
141  dst.clear();
142  wplist resid = src;
143  vector<double> weights(src.size());
144  double r =
145  (1 - std::exp(bsize * (std::log(bsize) - std::log(src.size())) / dstsize))
146  / 2;
147  r = max(0.1, r);
148  std::vector<size_t> ind(bsize);
149  while (resid.size() > 1 && dst.size() < dstsize) {
150  weights.resize(resid.size());
151  for (wplist::const_iterator it = resid.begin(); it != resid.end(); ++it) {
152  weights[it - resid.begin()] = it->weight;
153  }
154 
155  // Sample `bsize` points and insert them to the result
156  discrete_distribution d(weights.begin(), weights.end());
157  std::generate(ind.begin(), ind.end(), d);
158 
159  std::sort(ind.begin(), ind.end());
160  ind.erase(std::unique(ind.begin(), ind.end()), ind.end());
161 
162  for (std::vector<size_t>::iterator it = ind.begin();
163  it != ind.end(); ++it) {
164  dst.push_back(resid[*it]);
165  }
166 
167  // Remove `r` nearest points from `resid`
168  std::vector<double> distances;
169  for (wplist::iterator itr = resid.begin(); itr != resid.end(); ++itr) {
170  distances.push_back(-min_dist(*itr, dst).second);
171  }
172  // TODO(unno): Is `r` lesser than 1.0?
173  size_t size = std::min(resid.size(),
174  static_cast<size_t>(resid.size() * r));
175  if (size == 0) {
176  size = 1;
177  }
178 
179  partial_sort_by(distances, resid, size);
180  }
181 }
pair< size_t, double > min_dist(const common::sfv_t &p, const vector< common::sfv_t > &P)
Definition: util.cpp:182
std::vector< weighted_point > wplist
Definition: types.hpp:55

Here is the call graph for this function:

Here is the caller graph for this function:

double jubatus::core::clustering::compressor::kmeans_compressor::get_probability ( const weighted_point p,
double  min_dist,
const weighted_point nearest_bp,
double  bp_score,
double  weight_sum,
double  squared_min_dist_sum 
)
virtual

Reimplemented in jubatus::core::clustering::compressor::gmm_compressor.

Definition at line 183 of file kmeans_compressor.cpp.

References jubatus::core::clustering::weighted_point::weight.

Referenced by bicriteria_to_coreset().

189  {
190  return std::ceil(weight_sum * (
191  min_dist * min_dist * p.weight / squared_min_dist_sum)) + 1;
192 }
pair< size_t, double > min_dist(const common::sfv_t &p, const vector< common::sfv_t > &P)
Definition: util.cpp:182

Here is the caller graph for this function:


The documentation for this class was generated from the following files: