jubatus_core  0.1.2
Jubatus: Online machine learning framework for distributed environment
lsh_util.cpp
Go to the documentation of this file.
1 // Jubatus: Online machine learning framework for distributed environment
2 // Copyright (C) 2012 Preferred Networks and Nippon Telegraph and Telephone Corporation.
3 //
4 // This library is free software; you can redistribute it and/or
5 // modify it under the terms of the GNU Lesser General Public
6 // License version 2.1 as published by the Free Software Foundation.
7 //
8 // This library is distributed in the hope that it will be useful,
9 // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 // Lesser General Public License for more details.
12 //
13 // You should have received a copy of the GNU Lesser General Public
14 // License along with this library; if not, write to the Free Software
15 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16 
17 #include "lsh_util.hpp"
18 
19 #include <algorithm>
20 #include <cmath>
21 #include <functional>
22 #include <queue>
23 #include <utility>
24 #include <vector>
25 
26 using std::copy;
27 using std::greater;
28 using std::make_pair;
29 using std::pair;
30 using std::sort;
31 using std::vector;
32 using std::priority_queue;
33 
34 namespace jubatus {
35 namespace core {
36 namespace storage {
37 
38 namespace {
39 
40 typedef pair<float, pair<int, vector<int> > > diff_type;
41 typedef priority_queue<
42  diff_type,
43  vector<diff_type>,
44  greater<diff_type> > heap_type;
45 
46 void partition(
47  const vector<float>& hash,
48  size_t num_hash_tables,
49  vector<vector<float> >& hashes) {
50  const size_t hash_size = hash.size() / num_hash_tables;
51  hashes.resize(num_hash_tables);
52  for (size_t i = 0; i < num_hash_tables; ++i) {
53  hashes[i].resize(hash_size);
54  std::vector<float>::const_iterator begin = hash.begin();
55  copy(begin + i*hash_size, begin + (i+1)*hash_size, hashes[i].begin());
56  }
57 }
58 
59 void threshold(const vector<float>& hash, lsh_vector& lv) {
60  lv.resize_and_clear(hash.size());
61  for (size_t j = 0; j < hash.size(); ++j) {
62  lv.set(j, std::floor(hash[j]));
63  }
64 }
65 
66 lsh_vector perturbe(
67  const lsh_vector& src,
68  const vector<int>& diff,
69  const vector<pair<float, int> >& cands) {
70  lsh_vector ret(src);
71  for (size_t i = 0; i < diff.size(); ++i) {
72  const int d_idx = cands[diff[i]].second;
73  if (d_idx >= 0) {
74  ret.set(d_idx, ret.get(d_idx) + 1);
75  } else {
76  ret.set(~d_idx, ret.get(~d_idx) - 1);
77  }
78  }
79  return ret;
80 }
81 
82 } // namespace
83 
85  const vector<float>& hash,
86  size_t num_hash_tables) {
87  partition(hash, num_hash_tables, hash_);
88  base_.resize(hash_.size());
89  for (size_t i = 0; i < hash_.size(); ++i) {
90  threshold(hash_[i], base_[i]);
91  }
92 }
93 
95  const int base_size = base_[0].size();
96 
97  perturbation_sets_.resize(base_.size());
98  for (size_t i = 0; i < base_.size(); ++i) {
99  vector<pair<float, int> >& cands = perturbation_sets_[i];
100  cands.resize(2 * base_size);
101  for (int j = 0; j < base_size; ++j) {
102  const float dist = hash_[i][j] - base_[i].get(j);
103  cands[j * 2] = make_pair((1 - dist) * (1 - dist), j);
104  cands[j * 2 + 1] = make_pair(dist * dist, ~j);
105  }
106  sort(cands.begin(), cands.end());
107  }
108 
109  for (size_t i = 0; i < base_.size(); ++i) {
110  if (!perturbation_sets_[i].empty()) {
111  heap_.push(
112  make_pair(perturbation_sets_[i][0].first,
113  make_pair(i, vector<int>(1))));
114  }
115  }
116 }
117 
119  const size_t base_size = base_[0].size();
120  lsh_vector lv(base_size * base_.size());
121  for (size_t i = 0; i < base_.size(); ++i) {
122  for (size_t j = 0; j < base_size; ++j) {
123  lv.set(i * base_size + j, base_[i].get(j));
124  }
125  }
126  return lv;
127 }
128 
130  if (heap_.empty()) {
131  return make_pair(-1ul, lsh_vector());
132  }
133  const size_t i = heap_.top().second.first;
134  pair<size_t, lsh_vector> ret(
135  i,
136  perturbe(base_[i], heap_.top().second.second, perturbation_sets_[i]));
138  return ret;
139 }
140 
142  const int index = heap_.top().second.first;
143  const vector<pair<float, int> >& cands = perturbation_sets_[index];
144 
145  vector<int> shifted = heap_.top().second.second;
146  ++shifted.back();
147 
148  vector<int> expanded = heap_.top().second.second;
149  expanded.push_back(expanded.back() + 1);
150 
151  const float score_base = heap_.top().first;
152  heap_.pop();
153 
154  if ((size_t) shifted.back() < cands.size()) {
155  const float score_diff = cands[shifted.back()].first
156  - cands[shifted.back() - 1].first;
157  heap_.push(make_pair(score_base + score_diff, make_pair(index, shifted)));
158  }
159 
160  if ((size_t) expanded.back() < cands.size()) {
161  const float score_diff = cands[expanded.back()].first;
162  heap_.push(make_pair(score_base + score_diff, make_pair(index, expanded)));
163  }
164 }
165 
166 } // namespace storage
167 } // namespace core
168 } // namespace jubatus
lsh_probe_generator(const std::vector< float > &hash, size_t num_hash_tables)
Definition: lsh_util.cpp:84
std::pair< size_t, lsh_vector > get_next_table_and_vector()
Definition: lsh_util.cpp:129
double dist(const common::sfv_t &p1, const common::sfv_t &p2)
Definition: util.cpp:151
fixed_size_heap< pair< uint64_t, string >, greater< pair< uint64_t, string > > > heap_type
std::vector< std::vector< float > > hash_
Definition: lsh_util.hpp:51
std::vector< std::vector< std::pair< float, int > > > perturbation_sets_
Definition: lsh_util.hpp:53
void set(size_t pos, int value)
Definition: lsh_vector.cpp:60