jubatus_core  0.1.2
Jubatus: Online machine learning framework for distributed environment
minhash.cpp
Go to the documentation of this file.
1 // Jubatus: Online machine learning framework for distributed environment
2 // Copyright (C) 2011 Preferred Networks and Nippon Telegraph and Telephone Corporation.
3 //
4 // This library is free software; you can redistribute it and/or
5 // modify it under the terms of the GNU Lesser General Public
6 // License version 2.1 as published by the Free Software Foundation.
7 //
8 // This library is distributed in the hope that it will be useful,
9 // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 // Lesser General Public License for more details.
12 //
13 // You should have received a copy of the GNU Lesser General Public
14 // License along with this library; if not, write to the Free Software
15 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16 
17 #include "minhash.hpp"
18 
19 #include <algorithm>
20 #include <cmath>
21 #include <cfloat>
22 #include <string>
23 #include <utility>
24 #include <vector>
25 #include "../common/exception.hpp"
26 #include "../common/hash.hpp"
27 
28 using std::pair;
29 using std::string;
30 using std::vector;
31 using jubatus::util::lang::shared_ptr;
33 
34 namespace jubatus {
35 namespace core {
36 namespace recommender {
37 
38 const uint64_t minhash::hash_prime = 0xc3a5c85c97cb3127ULL;
39 
41  : hash_num_(64) {
43 }
44 
46  : hash_num_(config.hash_num) {
47 
48  if (!(1 <= config.hash_num)) {
49  throw JUBATUS_EXCEPTION(
50  common::invalid_parameter("1 <= hash_num"));
51  }
52 
54 }
55 
57 }
58 
60  const common::sfv_t& query,
61  vector<pair<string, float> >& ids,
62  size_t ret_num) const {
63  ids.clear();
64  if (ret_num == 0) {
65  return;
66  }
67 
68  bit_vector query_bv;
69  calc_minhash_values(query, query_bv);
70  mixable_storage_->get_model()->similar_row(query_bv, ids, ret_num);
71 }
72 
74  const common::sfv_t& query,
75  vector<pair<string, float> >& ids,
76  size_t ret_num) const {
77  similar_row(query, ids, ret_num);
78  for (size_t i = 0; i < ids.size(); ++i) {
79  ids[i].second = 1 - ids[i].second;
80  }
81 }
82 
84  orig_.clear();
85  mixable_storage_->get_model()->clear();
86 }
87 
88 void minhash::clear_row(const string& id) {
89  orig_.remove_row(id);
90  mixable_storage_->get_model()->remove_row(id);
91 }
92 
94  bit_vector& bv) const {
95  vector<float> min_values_buffer(hash_num_, FLT_MAX);
96  vector<uint64_t> hash_buffer(hash_num_);
97  for (size_t i = 0; i < sfv.size(); ++i) {
98  uint64_t key_hash = common::hash_util::calc_string_hash(sfv[i].first);
99  float val = sfv[i].second;
100  for (uint64_t j = 0; j < hash_num_; ++j) {
101  float hashval = calc_hash(key_hash, j, val);
102  if (hashval < min_values_buffer[j]) {
103  min_values_buffer[j] = hashval;
104  hash_buffer[j] = key_hash;
105  }
106  }
107  }
108 
109  bv.resize_and_clear(hash_num_);
110  for (size_t i = 0; i < hash_buffer.size(); ++i) {
111  if ((hash_buffer[i] & 1LLU) == 1) {
112  bv.set_bit(i);
113  }
114  }
115 }
116 
117 void minhash::update_row(const string& id, const sfv_diff_t& diff) {
118  orig_.set_row(id, diff);
119 
120  common::sfv_t row;
121  orig_.get_row(id, row);
122  bit_vector bv;
123  calc_minhash_values(row, bv);
124  mixable_storage_->get_model()->set_row(id, bv);
125 }
126 
127 void minhash::get_all_row_ids(std::vector<std::string>& ids) const {
128  mixable_storage_->get_model()->get_all_row_ids(ids);
129 }
130 
131 // original by Hash64 http://burtleburtle.net/bob/hash/evahash.html
132 void minhash::hash_mix64(uint64_t& a, uint64_t& b, uint64_t& c) {
133  a -= b;
134  a -= c;
135  a ^= (c >> 43);
136  b -= c;
137  b -= a;
138  b ^= (a << 9);
139  c -= a;
140  c -= b;
141  c ^= (b >> 8);
142  a -= b;
143  a -= c;
144  a ^= (c >> 38);
145  b -= c;
146  b -= a;
147  b ^= (a << 23);
148  c -= a;
149  c -= b;
150  c ^= (b >> 5);
151  a -= b;
152  a -= c;
153  a ^= (c >> 35);
154  b -= c;
155  b -= a;
156  b ^= (a << 49);
157  c -= a;
158  c -= b;
159  c ^= (b >> 11);
160  a -= b;
161  a -= c;
162  a ^= (c >> 12);
163  b -= c;
164  b -= a;
165  b ^= (a << 18);
166  c -= a;
167  c -= b;
168  c ^= (b >> 22);
169 }
170 
171 float minhash::calc_hash(uint64_t a, uint64_t b, float val) {
172  uint64_t c = hash_prime;
173  hash_mix64(a, b, c);
174  hash_mix64(a, b, c);
175  float r = static_cast<float>(a) / static_cast<float>(0xFFFFFFFFFFFFFFFFLLU);
176  return -std::log(r) / val;
177 }
178 
179 string minhash::type() const {
180  return string("minhash");
181 }
182 
184  return mixable_storage_.get();
185 }
186 
188  shared_ptr<storage::bit_index_storage> p(new storage::bit_index_storage);
190 }
191 
193  packer.pack_array(2);
194  orig_.pack(packer);
195  mixable_storage_->get_model()->pack(packer);
196 }
197 
198 void minhash::unpack(msgpack::object o) {
199  if (o.type != msgpack::type::ARRAY || o.via.array.size != 2) {
200  throw msgpack::type_error();
201  }
202  orig_.unpack(o.via.array.ptr[0]);
203  mixable_storage_->get_model()->unpack(o.via.array.ptr[1]);
204 }
205 
206 } // namespace recommender
207 } // namespace core
208 } // namespace jubatus
void get_row(const std::string &row, std::vector< std::pair< std::string, float > > &columns) const
void calc_minhash_values(const common::sfv_t &sfv, core::storage::bit_vector &bv) const
Definition: minhash.cpp:93
void update_row(const std::string &id, const sfv_diff_t &diff)
Definition: minhash.cpp:117
void similar_row(const common::sfv_t &query, std::vector< std::pair< std::string, float > > &ids, size_t ret_num) const
Definition: minhash.cpp:59
void neighbor_row(const common::sfv_t &query, std::vector< std::pair< std::string, float > > &ids, size_t ret_num) const
Definition: minhash.cpp:73
void pack(framework::packer &packer) const
Definition: minhash.cpp:192
jubatus::util::lang::shared_ptr< storage::mixable_bit_index_storage > mixable_storage_
Definition: minhash.hpp:87
core::common::sfv_t sfv_diff_t
#define JUBATUS_EXCEPTION(e)
Definition: exception.hpp:79
void pack(framework::packer &packer) const
void clear_row(const std::string &id)
Definition: minhash.cpp:88
framework::mixable * get_mixable() const
Definition: minhash.cpp:183
static const uint64_t hash_prime
Definition: minhash.hpp:84
void get_all_row_ids(std::vector< std::string > &ids) const
Definition: minhash.cpp:127
msgpack::packer< jubatus_packer > packer
Definition: bandit_base.hpp:31
bit_vector_base< uint64_t > bit_vector
static float calc_hash(uint64_t a, uint64_t b, float val)
Definition: minhash.cpp:171
void unpack(msgpack::object o)
Definition: minhash.cpp:198
static void hash_mix64(uint64_t &a, uint64_t &b, uint64_t &c)
Definition: minhash.cpp:132
core::storage::sparse_matrix_storage orig_
std::vector< std::pair< std::string, float > > sfv_t
Definition: type.hpp:29
void set_row(const std::string &row, const std::vector< std::pair< std::string, float > > &columns)
static uint64_t calc_string_hash(const std::string &s)
Definition: hash.hpp:29