jubatus_core  0.1.2
Jubatus: Online machine learning framework for distributed environment
minhash.cpp
Go to the documentation of this file.
1 // Jubatus: Online machine learning framework for distributed environment
2 // Copyright (C) 2011 Preferred Networks and Nippon Telegraph and Telephone Corporation.
3 //
4 // This library is free software; you can redistribute it and/or
5 // modify it under the terms of the GNU Lesser General Public
6 // License version 2.1 as published by the Free Software Foundation.
7 //
8 // This library is distributed in the hope that it will be useful,
9 // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 // Lesser General Public License for more details.
12 //
13 // You should have received a copy of the GNU Lesser General Public
14 // License along with this library; if not, write to the Free Software
15 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16 
17 #include "minhash.hpp"
18 #include <cfloat>
19 #include <cmath>
20 #include <map>
21 #include <string>
22 #include <vector>
23 
24 #include "../common/hash.hpp"
25 
26 using std::string;
27 using std::map;
28 using std::vector;
29 using std::pair;
30 using jubatus::util::lang::lexical_cast;
34 
35 
36 namespace jubatus {
37 namespace core {
38 namespace nearest_neighbor {
39 namespace {
40 
41 void hash_mix64(uint64_t& a, uint64_t& b, uint64_t& c) {
42  a -= b;
43  a -= c;
44  a ^= (c>>43);
45 
46  b -= c;
47  b -= a;
48  b ^= (a<<9);
49 
50  c -= a;
51  c -= b;
52  c ^= (b>>8);
53 
54  a -= b;
55  a -= c;
56  a ^= (c>>38);
57 
58  b -= c;
59  b -= a;
60  b ^= (a<<23);
61 
62  c -= a;
63  c -= b;
64  c ^= (b>>5);
65 
66  a -= b;
67  a -= c;
68  a ^= (c>>35);
69 
70  b -= c;
71  b -= a;
72  b ^= (a<<49);
73 
74  c -= a;
75  c -= b;
76  c ^= (b>>11);
77 
78  a -= b;
79  a -= c;
80  a ^= (c>>12);
81 
82  b -= c;
83  b -= a;
84  b ^= (a<<18);
85 
86  c -= a;
87  c -= b;
88  c ^= (b>>22);
89 }
90 
91 float calc_hash(uint64_t a, uint64_t b, float val) {
92  static const uint64_t HASH_PRIME = 0xc3a5c85c97cb3127ULL;
93 
94  uint64_t c = HASH_PRIME;
95  hash_mix64(a, b, c);
96  hash_mix64(a, b, c);
97  float r = static_cast<float>(a) / static_cast<float>(0xFFFFFFFFFFFFFFFFLLU);
98  return - std::log(r) / val;
99 }
100 
101 } // namespace
102 
104  const config& conf,
105  jubatus::util::lang::shared_ptr<column_table> table,
106  const std::string& id)
107  : bit_vector_nearest_neighbor_base(conf.hash_num, table, id) {
108 
109  if (!(1 <= conf.hash_num)) {
110  throw JUBATUS_EXCEPTION(
111  common::invalid_parameter("1 <= hash_num"));
112  }
113 }
114 
115 minhash::minhash(
116  const config& conf,
117  jubatus::util::lang::shared_ptr<column_table> table,
118  vector<column_type>& schema,
119  const std::string& id)
120  : bit_vector_nearest_neighbor_base(conf.hash_num, table, schema, id) {
121 
122  if (!(1 <= conf.hash_num)) {
123  throw JUBATUS_EXCEPTION(
124  common::invalid_parameter("1 <= hash_num"));
125  }
126 }
127 
128 bit_vector minhash::hash(const common::sfv_t& sfv) const {
129  vector<float> min_values_buffer(bitnum(), FLT_MAX);
130  vector<uint64_t> hash_buffer(bitnum());
131  for (size_t i = 0; i < sfv.size(); ++i) {
132  uint64_t key_hash = common::hash_util::calc_string_hash(sfv[i].first);
133  float val = sfv[i].second;
134  for (uint32_t j = 0; j < bitnum(); ++j) {
135  float hashval = calc_hash(key_hash, j, val);
136  if (hashval < min_values_buffer[j]) {
137  min_values_buffer[j] = hashval;
138  hash_buffer[j] = key_hash;
139  }
140  }
141  }
142 
143  bit_vector bv(bitnum());
144  for (size_t i = 0; i < hash_buffer.size(); ++i) {
145  if ((hash_buffer[i] & 1LLU) == 1) {
146  bv.set_bit(i);
147  }
148  }
149 
150  return bv;
151 }
152 
153 } // namespace nearest_neighbor
154 } // namespace core
155 } // namespace jubatus
#define JUBATUS_EXCEPTION(e)
Definition: exception.hpp:79
minhash(const config &conf, jubatus::util::lang::shared_ptr< storage::column_table > table, const std::string &id)
bit_vector_base< uint64_t > bit_vector
std::vector< std::pair< std::string, float > > sfv_t
Definition: type.hpp:29
static uint64_t calc_string_hash(const std::string &s)
Definition: hash.hpp:29