jubatus_core  0.1.2
Jubatus: Online machine learning framework for distributed environment
light_lof.cpp
Go to the documentation of this file.
1 // Jubatus: Online machine learning framework for distributed environment
2 // Copyright (C) 2013 Preferred Networks and Nippon Telegraph and Telephone Corporation.
3 //
4 // This library is free software; you can redistribute it and/or
5 // modify it under the terms of the GNU Lesser General Public
6 // License version 2.1 as published by the Free Software Foundation.
7 //
8 // This library is distributed in the hope that it will be useful,
9 // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 // Lesser General Public License for more details.
12 //
13 // You should have received a copy of the GNU Lesser General Public
14 // License along with this library; if not, write to the Free Software
15 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16 
17 #include "light_lof.hpp"
18 
19 #include <algorithm>
20 #include <limits>
21 #include <numeric>
22 #include <string>
23 #include <utility>
24 #include <vector>
25 #include "jubatus/util/data/unordered_map.h"
26 #include "jubatus/util/lang/bind.h"
27 #include "jubatus/util/lang/shared_ptr.h"
28 #include "../common/exception.hpp"
29 #include "../storage/column_table.hpp"
30 #include "../framework/mixable_versioned_table.hpp"
31 #include "../nearest_neighbor/nearest_neighbor_base.hpp"
32 
33 using jubatus::util::data::unordered_map;
34 using jubatus::util::data::unordered_set;
35 using jubatus::util::lang::shared_ptr;
36 using jubatus::util::lang::bind;
39 
40 namespace jubatus {
41 namespace core {
42 namespace anomaly {
43 namespace {
44 
45 const uint32_t DEFAULT_NEIGHBOR_NUM = 10;
46 const uint32_t DEFAULT_REVERSE_NN_NUM = 30;
47 
48 const size_t KDIST_COLUMN_INDEX = 0;
49 const size_t LRD_COLUMN_INDEX = 1;
50 
51 shared_ptr<column_table> create_lof_table() {
52  shared_ptr<column_table> table(new column_table);
53  std::vector<storage::column_type> schema(
54  2, storage::column_type(storage::column_type::float_type));
55  table->init(schema);
56  return table;
57 }
58 
59 float calculate_lof(float lrd, const std::vector<float>& neighbor_lrds) {
60  if (neighbor_lrds.empty()) {
61  return lrd == 0 ? 1 : std::numeric_limits<float>::infinity();
62  }
63 
64  const float sum_neighbor_lrd = std::accumulate(
65  neighbor_lrds.begin(), neighbor_lrds.end(), 0.0f);
66 
67  if (std::isinf(sum_neighbor_lrd) && std::isinf(lrd)) {
68  return 1;
69  }
70 
71  return sum_neighbor_lrd / (neighbor_lrds.size() * lrd);
72 }
73 
74 } // namespace
75 
77  : nearest_neighbor_num(DEFAULT_NEIGHBOR_NUM),
78  reverse_nearest_neighbor_num(DEFAULT_REVERSE_NN_NUM) {
79 }
80 
82  const config& conf,
83  const std::string& id,
84  shared_ptr<nearest_neighbor_base> nearest_neighbor_engine)
85  : nearest_neighbor_engine_(nearest_neighbor_engine),
86  mixable_nearest_neighbor_(new framework::mixable_versioned_table),
87  mixable_scores_(new framework::mixable_versioned_table),
88  config_(conf),
89  my_id_(id) {
90  if (!(2 <= conf.nearest_neighbor_num)) {
91  throw JUBATUS_EXCEPTION(
92  common::invalid_parameter("2 <= nearest_neighbor_num"));
93  }
94 
95  if (!(conf.nearest_neighbor_num
97  throw JUBATUS_EXCEPTION(
99  "nearest_neighbor_num <= reverse_nearest_neighbor_num"));
100  }
101 
102  mixable_nearest_neighbor_->set_model(nearest_neighbor_engine_->get_table());
103  mixable_scores_->set_model(create_lof_table());
104 }
105 
107  const config& conf,
108  const std::string& id,
109  shared_ptr<nearest_neighbor_base> nearest_neighbor_engine,
110  shared_ptr<unlearner::unlearner_base> unlearner)
111  : nearest_neighbor_engine_(nearest_neighbor_engine),
112  unlearner_(unlearner),
113  mixable_nearest_neighbor_(new framework::mixable_versioned_table),
114  mixable_scores_(new framework::mixable_versioned_table),
115  config_(conf),
116  my_id_(id) {
117  shared_ptr<column_table> nn_table = nearest_neighbor_engine_->get_table();
118  shared_ptr<column_table> lof_table = create_lof_table();
119  unlearner_->set_callback(bind(
120  &light_lof::unlearn, this, jubatus::util::lang::_1));
121 
122  mixable_nearest_neighbor_->set_model(nn_table);
123  mixable_nearest_neighbor_->set_unlearner(unlearner_);
124  mixable_scores_->set_model(lof_table);
125  mixable_scores_->set_unlearner(unlearner_);
126 }
127 
129 }
130 
131 float light_lof::calc_anomaly_score(const common::sfv_t& query) const {
132  std::vector<float> neighbor_lrds;
133  const float lrd = collect_lrds(query, neighbor_lrds);
134 
135  return calculate_lof(lrd, neighbor_lrds);
136 }
137 
138 float light_lof::calc_anomaly_score(const std::string& id) const {
139  std::vector<float> neighbor_lrds;
140  const float lrd = collect_lrds(id, neighbor_lrds);
141 
142  return calculate_lof(lrd, neighbor_lrds);
143 }
144 
146  nearest_neighbor_engine_->clear();
147  mixable_scores_->get_model()->clear();
148  if (unlearner_) {
149  unlearner_->clear();
150  }
151 }
152 
153 void light_lof::clear_row(const std::string& id) {
155 }
156 
157 void light_lof::update_row(const std::string& id, const sfv_diff_t& diff) {
159 }
160 
161 void light_lof::set_row(const std::string& id, const common::sfv_t& sfv) {
162  unordered_set<std::string> update_set;
163 
164  shared_ptr<column_table> table = mixable_scores_->get_model();
165  if (table->exact_match(id).first) {
166  collect_neighbors(id, update_set);
167  }
168 
169  touch(id);
170  nearest_neighbor_engine_->set_row(id, sfv);
171  collect_neighbors(id, update_set);
172 
173  // Primarily add id to lof table with dummy parameters.
174  // update_entries() below overwrites this row.
175  table->add(id, storage::owner(my_id_), -1.f, -1.f);
176  update_set.insert(id);
177  update_entries(update_set);
178 }
179 
180 void light_lof::get_all_row_ids(std::vector<std::string>& ids) const {
181  nearest_neighbor_engine_->get_all_row_ids(ids);
182 }
183 
184 std::string light_lof::type() const {
185  return "light_lof";
186 }
187 
188 std::vector<framework::mixable*> light_lof::get_mixables() const {
189  std::vector<framework::mixable*> mixables;
190  mixables.push_back(nearest_neighbor_engine_->get_mixable());
191  mixables.push_back(mixable_scores_.get());
192  return mixables;
193 }
194 
195 // private
196 
197 void light_lof::touch(const std::string& id) {
198  if (unlearner_) {
199  if (!unlearner_->touch(id)) {
201  "no more space available to add new ID: " + id));
202  }
203  }
204 }
205 
206 // Unlearning callback function of light_lof.
207 //
208 // It removes a row and updates parameters of its reverse k-nearest neighbors.
209 // This behavior is similar to ``light_lof::set_row``, which updates reverse
210 // k-NNs if given id exists already.
211 //
212 // It is necessary to update reverse k-NNs of the point to be removed for model
213 // correctness: if this procedure is omitted, ``light_lof`` no longer runs
214 // correctly.
215 void light_lof::unlearn(const std::string& key) {
216  unordered_set<std::string> reverse_knn;
217  collect_neighbors(key, reverse_knn);
218  reverse_knn.erase(key);
219 
220  mixable_nearest_neighbor_->get_model()->delete_row(key);
221  mixable_scores_->get_model()->delete_row(key);
222 
223  update_entries(reverse_knn);
224 }
225 
227  const common::sfv_t& query,
228  std::vector<float>& neighbor_lrds) const {
229  std::vector<std::pair<std::string, float> > neighbors;
230  nearest_neighbor_engine_->neighbor_row(
231  query, neighbors, config_.nearest_neighbor_num);
232 
233  return collect_lrds_from_neighbors(neighbors, neighbor_lrds);
234 }
235 
237  const std::string& id,
238  std::vector<float>& neighbor_lrds) const {
239  std::vector<std::pair<std::string, float> > neighbors;
240  nearest_neighbor_engine_->neighbor_row(
241  id, neighbors, config_.nearest_neighbor_num + 1);
242 
243  // neighbors may contain given id. We ignore it.
244  for (size_t i = 0; i < neighbors.size(); ++i) {
245  if (neighbors[i].first == id) {
246  std::swap(neighbors[i], neighbors.back());
247  neighbors.pop_back();
248  break;
249  }
250  }
251  if (neighbors.size() > static_cast<size_t>(config_.nearest_neighbor_num)) {
252  neighbors.resize(config_.nearest_neighbor_num);
253  }
254 
255  return collect_lrds_from_neighbors(neighbors, neighbor_lrds);
256 }
257 
259  const std::vector<std::pair<std::string, float> >& neighbors,
260  std::vector<float>& neighbor_lrds) const {
261  neighbor_lrds.resize(neighbors.size());
262  if (neighbors.empty()) {
263  return std::numeric_limits<float>::infinity();
264  }
265 
266  // Collect parameters of given neighbors.
267  std::vector<parameter> parameters(neighbors.size());
268  for (size_t i = 0; i < neighbors.size(); ++i) {
269  parameters[i] = get_row_parameter(neighbors[i].first);
270  neighbor_lrds[i] = parameters[i].lrd;
271  }
272 
273  // Calculate LRD value of the query.
274  float sum_reachability = 0;
275  for (size_t i = 0; i < neighbors.size(); ++i) {
276  // Accumulate the reachability distance of the query and the i-th neighbor.
277  sum_reachability += std::max(neighbors[i].second, parameters[i].kdist);
278  }
279 
280  if (sum_reachability == 0) {
281  // All k-nearest neighbors are at the same point with given query.
282  return std::numeric_limits<float>::infinity();
283  }
284 
285  // LRD is an inverse of mean of reachability distances between the query and
286  // its k-nearest neighbors.
287  return neighbors.size() / sum_reachability;
288 }
289 
291  const std::string& query,
292  unordered_set<std::string>& neighbors) const {
293  std::vector<std::pair<std::string, float> > nn_result;
294  nearest_neighbor_engine_->neighbor_row(
295  query, nn_result, config_.reverse_nearest_neighbor_num);
296 
297  for (size_t i = 0; i < nn_result.size(); ++i) {
298  neighbors.insert(nn_result[i].first);
299  }
300 }
301 
302 void light_lof::update_entries(const unordered_set<std::string>& neighbors) {
303  shared_ptr<column_table> table = mixable_scores_->get_model();
304  storage::float_column& kdist_column =
305  table->get_float_column(KDIST_COLUMN_INDEX);
306  storage::float_column& lrd_column = table->get_float_column(LRD_COLUMN_INDEX);
307 
308  std::vector<uint64_t> ids;
309  ids.reserve(neighbors.size());
310  for (unordered_set<std::string>::const_iterator it = neighbors.begin();
311  it != neighbors.end(); ++it) {
312  const std::pair<bool, uint64_t> hit = table->exact_match(*it);
313  if (hit.first) {
314  ids.push_back(hit.second);
315  }
316  }
317 
318  unordered_map<uint64_t, std::vector<std::pair<uint64_t, float> > >
319  nested_neighbors;
320 
321  // Gather k-nearest neighbors of each member of neighbors and update their
322  // k-dists.
323  std::vector<std::pair<std::string, float> > nn_result;
324  for (std::vector<uint64_t>::const_iterator it = ids.begin();
325  it != ids.end(); ++it) {
326  nearest_neighbor_engine_->neighbor_row(
327  table->get_key(*it), nn_result, config_.nearest_neighbor_num);
328  std::vector<std::pair<uint64_t, float> >& nn_indexes =
329  nested_neighbors[*it];
330 
331  nn_indexes.reserve(nn_result.size());
332  for (size_t i = 0; i < nn_result.size(); ++i) {
333  const std::pair<bool, uint64_t> hit =
334  table->exact_match(nn_result[i].first);
335  if (hit.first) {
336  nn_indexes.push_back(std::make_pair(hit.second, nn_result[i].second));
337  }
338  }
339 
340  kdist_column[*it] = nn_result.back().second;
341  }
342 
343  // Calculate LRDs of neighbors.
344  const storage::owner owner(my_id_);
345  for (std::vector<uint64_t>::const_iterator it = ids.begin();
346  it != ids.end(); ++it) {
347  const std::vector<std::pair<uint64_t, float> >& nn = nested_neighbors[*it];
348  float lrd = 1;
349  if (!nn.empty()) {
350  const size_t length = std::min(
351  nn.size(), static_cast<size_t>(config_.nearest_neighbor_num));
352  float sum_reachability = 0;
353  for (size_t i = 0; i < length; ++i) {
354  sum_reachability += std::max(nn[i].second, kdist_column[nn[i].first]);
355  }
356 
357  if (sum_reachability == 0) {
358  lrd = std::numeric_limits<float>::infinity();
359  } else {
360  lrd = length / sum_reachability;
361  }
362  }
363  lrd_column[*it] = lrd;
364  table->update_clock(*it, owner);
365  }
366 }
367 
369  const {
370  shared_ptr<column_table> table = mixable_scores_->get_model();
371  std::pair<bool, uint64_t> hit = table->exact_match(row);
372  if (!hit.first) {
374  "row \"" + row + "\" not found in light_lof table"));
375  }
376  parameter param;
377  param.kdist = table->get_float_column(KDIST_COLUMN_INDEX)[hit.second];
378  param.lrd = table->get_float_column(LRD_COLUMN_INDEX)[hit.second];
379  return param;
380 }
381 
383  packer.pack_array(2);
384  nearest_neighbor_engine_->pack(packer);
385  mixable_scores_->get_model()->pack(packer);
386 }
387 
388 void light_lof::unpack(msgpack::object o) {
389  if (o.type != msgpack::type::ARRAY || o.via.array.size != 2) {
390  throw msgpack::type_error();
391  }
392 
393  // clear before load
394  clear();
395 
396  nearest_neighbor_engine_->unpack(o.via.array.ptr[0]);
397  mixable_scores_->get_model()->unpack(o.via.array.ptr[1]);
398 }
399 
400 } // namespace anomaly
401 } // namespace core
402 } // namespace jubatus
float collect_lrds_from_neighbors(const std::vector< std::pair< std::string, float > > &neighbors, std::vector< float > &neighbor_lrd) const
Definition: light_lof.cpp:258
jubatus::util::lang::shared_ptr< nearest_neighbor::nearest_neighbor_base > nearest_neighbor_engine_
Definition: light_lof.hpp:130
jubatus::util::lang::shared_ptr< framework::mixable_versioned_table > mixable_scores_
Definition: light_lof.hpp:138
void set_row(const std::string &id, const common::sfv_t &sfv)
Definition: light_lof.cpp:161
parameter get_row_parameter(const std::string &row) const
Definition: light_lof.cpp:368
void touch(const std::string &id)
Definition: light_lof.cpp:197
void pack(framework::packer &packer) const
Definition: light_lof.cpp:382
light_lof(const config &config, const std::string &id, jubatus::util::lang::shared_ptr< nearest_neighbor::nearest_neighbor_base > nearest_neighbor_engine)
void update_row(const std::string &id, const sfv_diff_t &diff)
Definition: light_lof.cpp:157
jubatus::util::lang::shared_ptr< framework::mixable_versioned_table > mixable_nearest_neighbor_
Definition: light_lof.hpp:135
int nearest_neighbor_num
#define JUBATUS_EXCEPTION(e)
Definition: exception.hpp:79
jubatus::util::data::optional< std::string > unlearner
common::sfv_t sfv_diff_t
void swap(weighted_point &p1, weighted_point &p2)
Definition: types.hpp:47
void clear_row(const std::string &id)
Definition: light_lof.cpp:153
msgpack::packer< jubatus_packer > packer
Definition: bandit_base.hpp:31
float collect_lrds(const common::sfv_t &query, std::vector< float > &neighbor_lrds) const
Definition: light_lof.cpp:226
float calc_anomaly_score(const common::sfv_t &query) const
Definition: light_lof.cpp:131
void update_entries(const jubatus::util::data::unordered_set< std::string > &neighbors)
Definition: light_lof.cpp:302
void unlearn(const std::string &id)
Definition: light_lof.cpp:215
std::vector< std::pair< std::string, float > > sfv_t
Definition: type.hpp:29
jubatus::util::lang::shared_ptr< unlearner::unlearner_base > unlearner_
Definition: light_lof.hpp:131
void collect_neighbors(const std::string &query, jubatus::util::data::unordered_set< std::string > &neighbors) const
Definition: light_lof.cpp:290
std::vector< framework::mixable * > get_mixables() const
Definition: light_lof.cpp:188
void get_all_row_ids(std::vector< std::string > &ids) const
Definition: light_lof.cpp:180
void unpack(msgpack::object o)
Definition: light_lof.cpp:388