jubatus_core  0.1.2
Jubatus: Online machine learning framework for distributed environment
gmm.cpp
Go to the documentation of this file.
1 // Jubatus: Online machine learning framework for distributed environment
2 // Copyright (C) 2013 Preferred Networks and Nippon Telegraph and Telephone Corporation.
3 //
4 // This library is free software; you can redistribute it and/or
5 // modify it under the terms of the GNU Lesser General Public
6 // License version 2.1 as published by the Free Software Foundation.
7 //
8 // This library is distributed in the hope that it will be useful,
9 // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 // Lesser General Public License for more details.
12 //
13 // You should have received a copy of the GNU Lesser General Public
14 // License along with this library; if not, write to the Free Software
15 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16 
17 #include "gmm.hpp"
18 
19 #include <algorithm>
20 #include <ctime>
21 #include <cfloat>
22 #include <cmath>
23 #include <limits>
24 #include <iostream>
25 #include <utility>
26 #include <vector>
27 #include "jubatus/util/math/random.h"
28 #include "../common/exception.hpp"
29 
30 using jubatus::util::lang::shared_ptr;
31 using std::max;
32 using std::min;
33 using std::exp;
34 using std::vector;
35 
36 namespace jubatus {
37 namespace core {
38 namespace clustering {
39 
40 void gmm::batch(const eigen_wsvec_list_t& data, int d, int k) {
41  if (data.empty()) {
42  *this = gmm();
43  return;
44  }
45 
46  typedef eigen_wsvec_list_t::const_iterator data_iter;
47  initialize(data, d, k);
48 
49  eigen_svec_list_t old_means;
50  eigen_smat_list_t old_covs;
51  eigen_solver_list_t old_solvers;
52  double old_obj = 0, obj = 0;
53  vector<double> weights(k);
54 
55  bool converged = false;
56  int64_t niter = 1;
57  while (!converged) {
58  old_covs = covs_;
59  old_means = means_;
60  old_solvers = cov_solvers_;
61  old_obj = obj;
62  obj = 0;
63  fill(weights.begin(), weights.end(), 0);
64  fill(means_.begin(), means_.end(), eigen_svec_t(d));
65  fill(covs_.begin(), covs_.end(), eigen_smat_t(d, d));
66 
67  for (data_iter i = data.begin(); i != data.end(); ++i) {
68  eigen_svec_t cps =
69  cluster_probs(i->data, old_means, old_covs, old_solvers);
70  for (int c = 0; c < k; ++c) {
71  double cp = i->weight * cps.coeff(c);
72  means_[c] += cp * i->data;
73  covs_[c] += i->data * (i->data.transpose()) * cp;
74  weights[c] += cp;
75  obj -= std::log(std::max(cp, std::numeric_limits<double>::min()));
76  }
77  }
78  for (int c = 0; c < k; ++c) {
79  means_[c] /= weights[c];
80  covs_[c] /= weights[c];
81  double eps = 0.1;
82  covs_[c] -= means_[c] * means_[c].transpose();
83  covs_[c] += eps * eye_;
84  cov_solvers_[c] =
85  shared_ptr<eigen_solver_t>(new eigen_solver_t(covs_[c]));
86  }
87  converged = is_converged(niter++, means_, old_means, obj, old_obj);
88  }
89 }
90 
93 }
94 
96  if (means_.empty()) {
98  "clustering is not performed yet"));
99  }
100 
101  double max_prob = 0;
102  int64_t max_idx = 0;
104  for (int c = 0; c < k_; ++c) {
105  double cp = cps.coeff(c);
106  if (cp > max_prob) {
107  max_prob = cp;
108  max_idx = c;
109  }
110  }
111  return max_idx;
112 }
113 
114 void gmm::initialize(const eigen_wsvec_list_t& data, int d, int k) {
115  d_ = d;
116  k_ = k;
120  eye_ = eigen_smat_t(d, d);
121 
122  for (int i = 0; i < d; ++i) {
123  eye_.insert(i, i) = 1;
124  }
125 
126  jubatus::util::math::random::mtrand r(time(NULL));
127  for (int c = 0; c < k; ++c) {
128  means_[c] = data[r.next_int(0, data.size()-1)].data;
129  for (int i = 0; i < d; ++i) {
130  covs_[c].insert(i, i) = 1;
131  }
132  cov_solvers_[c] = shared_ptr<eigen_solver_t>(new eigen_solver_t(covs_[c]));
133  }
134 }
135 
137  int64_t niter,
138  const eigen_svec_list_t& means,
139  const eigen_svec_list_t& old_means,
140  double obj,
141  double old_obj) {
142  double max_dist = 0;
143  for (int c = 0; c < k_; ++c) {
144  max_dist = max(max_dist, (means[c] - old_means[c]).norm());
145  }
146  return (max_dist < 1e-09 || niter > 1e05);
147 }
148 
150  const eigen_svec_t& x,
151  const eigen_svec_list_t& means,
152  const eigen_smat_list_t& covs,
153  const eigen_solver_list_t& cov_solvers) const {
154  double den = DBL_MIN;
155  eigen_svec_t ret(k_);
156  for (int i = 0; i < k_; ++i) {
157  eigen_svec_t dif = x - means[i];
158  double det = std::abs(cov_solvers[i]->determinant());
159  double quad = (dif.transpose() * cov_solvers[i]->solve(dif)).sum();
160  double lp = -1 / 2. * (std::log(det) + quad);
161  ret.coeffRef(i) = lp;
162  den = (den == DBL_MIN) ?
163  lp : std::max(den, lp) +
164  std::log(1 + std::exp(min(den, lp) - std::max(den, lp)));
165  }
166  for (int i = 0; i < k_; ++i) {
167  ret.coeffRef(i) = std::exp(ret.coeff(i) - den);
168  }
169  return ret;
170 }
171 
172 } // namespace clustering
173 } // namespace core
174 } // namespace jubatus
void batch(const eigen_wsvec_list_t &data, int d, int k)
Definition: gmm.cpp:40
Eigen::SimplicialCholesky< eigen_smat_t > eigen_solver_t
Definition: gmm_types.hpp:31
int64_t get_nearest_center_index(const eigen_svec_t &p) const
Definition: gmm.cpp:95
std::vector< jubatus::util::lang::shared_ptr< eigen_solver_t > > eigen_solver_list_t
Definition: gmm_types.hpp:35
eigen_smat_list_t covs_
Definition: gmm.hpp:52
void initialize(const eigen_wsvec_list_t &data, int d, int k)
Definition: gmm.cpp:114
Eigen::SparseMatrix< double > eigen_smat_t
Definition: gmm_types.hpp:30
bool is_converged(int64_t niter, const eigen_svec_list_t &means, const eigen_svec_list_t &old_means, double obj, double old_obj)
Definition: gmm.cpp:136
#define JUBATUS_EXCEPTION(e)
Definition: exception.hpp:79
eigen_svec_t get_nearest_center(const eigen_svec_t &p) const
Definition: gmm.cpp:91
eigen_svec_t cluster_probs(const eigen_svec_t &x, const eigen_svec_list_t &mean, const eigen_smat_list_t &cov, const eigen_solver_list_t &solvers) const
Definition: gmm.cpp:149
std::vector< eigen_svec_t > eigen_svec_list_t
Definition: gmm_types.hpp:32
double sum(const common::sfv_t &p)
Definition: util.cpp:47
eigen_solver_list_t cov_solvers_
Definition: gmm.hpp:54
std::vector< eigen_wsvec_t > eigen_wsvec_list_t
Definition: gmm_types.hpp:42
Eigen::SparseVector< double > eigen_svec_t
Definition: gmm_types.hpp:29
eigen_svec_list_t means_
Definition: gmm.hpp:51
std::vector< eigen_smat_t > eigen_smat_list_t
Definition: gmm_types.hpp:33