jubatus_core  0.1.2
Jubatus: Online machine learning framework for distributed environment
euclid_lsh.cpp
Go to the documentation of this file.
1 // Jubatus: Online machine learning framework for distributed environment
2 // Copyright (C) 2011 Preferred Networks and Nippon Telegraph and Telephone Corporation.
3 //
4 // This library is free software; you can redistribute it and/or
5 // modify it under the terms of the GNU Lesser General Public
6 // License version 2.1 as published by the Free Software Foundation.
7 //
8 // This library is distributed in the hope that it will be useful,
9 // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 // Lesser General Public License for more details.
12 //
13 // You should have received a copy of the GNU Lesser General Public
14 // License along with this library; if not, write to the Free Software
15 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16 
17 #include "euclid_lsh.hpp"
18 
19 #include <map>
20 #include <string>
21 #include <vector>
22 #include <utility>
23 #include <cmath>
24 #include "jubatus/util/lang/cast.h"
25 #include "../storage/fixed_size_heap.hpp"
26 #include "../storage/column_table.hpp"
27 #include "lsh_function.hpp"
28 
29 using std::map;
30 using std::pair;
31 using std::make_pair;
32 using std::string;
33 using std::vector;
34 using jubatus::util::lang::lexical_cast;
41 
42 namespace jubatus {
43 namespace core {
44 namespace nearest_neighbor {
45 namespace {
46 
47 float squared_l2norm(const common::sfv_t& sfv) {
48  float sqnorm = 0;
49  for (size_t i = 0; i < sfv.size(); ++i) {
50  sqnorm += sfv[i].second * sfv[i].second;
51  }
52  return sqnorm;
53 }
54 
55 float l2norm(const common::sfv_t& sfv) {
56  return std::sqrt(squared_l2norm(sfv));
57 }
58 
59 } // namespace
60 
62  const config& conf,
63  jubatus::util::lang::shared_ptr<column_table> table,
64  const std::string& id)
65  : nearest_neighbor_base(table, id) {
66  set_config(conf);
67 
68  vector<column_type> schema;
69  fill_schema(schema);
70  get_table()->init(schema);
71 }
72 
73 euclid_lsh::euclid_lsh(
74  const config& conf,
75  jubatus::util::lang::shared_ptr<column_table> table,
76  vector<column_type>& schema,
77  const std::string& id)
78  : nearest_neighbor_base(table, id) {
79  set_config(conf);
80  fill_schema(schema);
81 }
82 
83 void euclid_lsh::set_row(const string& id, const common::sfv_t& sfv) {
84  // TODO(beam2d): support nested algorithm, e.g. when used by lof and then we
85  // cannot suppose that the first two columns are assigned to euclid_lsh.
86  get_table()->add(id, owner(my_id_), cosine_lsh(sfv, hash_num_), l2norm(sfv));
87 }
88 
89 void euclid_lsh::neighbor_row(
90  const common::sfv_t& query,
91  vector<pair<string, float> >& ids,
92  uint64_t ret_num) const {
93  neighbor_row_from_hash(
94  cosine_lsh(query, hash_num_),
95  l2norm(query),
96  ids,
97  ret_num);
98 }
99 
100 void euclid_lsh::neighbor_row(
101  const std::string& query_id,
102  vector<pair<string, float> >& ids,
103  uint64_t ret_num) const {
104  const pair<bool, uint64_t> maybe_index =
105  get_const_table()->exact_match(query_id);
106  if (!maybe_index.first) {
107  ids.clear();
108  return;
109  }
110 
111  const bit_vector bv = lsh_column()[maybe_index.second];
112  const float norm = norm_column()[maybe_index.second];
113  neighbor_row_from_hash(bv, norm, ids, ret_num);
114 }
115 
116 void euclid_lsh::set_config(const config& conf) {
117  if (!(1 <= conf.hash_num)) {
118  throw JUBATUS_EXCEPTION(
119  common::invalid_parameter("1 <= hash_num"));
120  }
121 
122  hash_num_ = conf.hash_num;
123 }
124 
125 void euclid_lsh::fill_schema(vector<column_type>& schema) {
126  first_column_id_ = schema.size();
127  schema.push_back(column_type(column_type::bit_vector_type, hash_num_));
128  schema.push_back(column_type(column_type::float_type));
129 }
130 
131 const_bit_vector_column& euclid_lsh::lsh_column() const {
132  return get_const_table()->get_bit_vector_column(first_column_id_);
133 }
134 
135 const_float_column& euclid_lsh::norm_column() const {
136  return get_const_table()->get_float_column(first_column_id_ + 1);
137 }
138 
139 void euclid_lsh::neighbor_row_from_hash(
140  const bit_vector& bv,
141  float norm,
142  vector<pair<string, float> >& ids,
143  uint64_t ret_num) const {
144  jubatus::util::lang::shared_ptr<const column_table> table = get_const_table();
145 
147  {
148  const_bit_vector_column& bv_col = lsh_column();
149  const_float_column& norm_col = norm_column();
150 
151  const float denom = bv.bit_num();
152  for (size_t i = 0; i < table->size(); ++i) {
153  const size_t hamm_dist = bv.calc_hamming_distance(bv_col[i]);
154  const float theta = hamm_dist * M_PI / denom;
155  const float score =
156  norm_col[i] * (norm_col[i] - 2 * norm * std::cos(theta));
157  heap.push(make_pair(score, i));
158  }
159  }
160 
161  vector<pair<float, size_t> > sorted;
162  heap.get_sorted(sorted);
163 
164  ids.clear();
165  const float squared_norm = norm * norm;
166  for (size_t i = 0; i < sorted.size(); ++i) {
167  ids.push_back(make_pair(table->get_key(sorted[i].second),
168  std::sqrt(squared_norm + sorted[i].first)));
169  }
170 }
171 
172 } // namespace nearest_neighbor
173 } // namespace core
174 } // namespace jubatus
void get_sorted(std::vector< T > &v) const
static float squared_norm(const common::sfv_t &fv)
const bit_vector_column const_bit_vector_column
#define JUBATUS_EXCEPTION(e)
Definition: exception.hpp:79
bit_vector_base< uint64_t > bit_vector
std::vector< std::pair< std::string, float > > sfv_t
Definition: type.hpp:29
bit_vector cosine_lsh(const common::sfv_t &sfv, uint32_t hash_num)
const typed_column< float > const_float_column
euclid_lsh(const config &conf, jubatus::util::lang::shared_ptr< storage::column_table > table, const std::string &id)