24 #include "jubatus/util/data/unordered_map.h"
25 #include "jubatus/util/data/unordered_set.h"
26 #include "jubatus/util/lang/cast.h"
27 #include "jubatus/util/math/random.h"
32 using std::ostringstream;
34 using std::istringstream;
40 using std::partial_sort;
41 using std::lower_bound;
42 using jubatus::util::data::unordered_map;
43 using jubatus::util::data::unordered_set;
44 using jubatus::util::math::random::mtrand;
52 struct greater_second {
54 bool operator()(
const P& l,
const P& r)
const {
55 return l.second > r.second;
59 uint64_t hash_lv(
const lsh_vector& lv) {
60 uint64_t hash = 14695981039346656037LLU;
61 for (
size_t i = 0; i < lv.size(); ++i) {
62 for (
int j = 0; j < 32; j += 8) {
63 hash *= 1099511628211LLU;
64 hash ^= (
static_cast<uint32_t
>(lv.get(i)) >> j) & 0xff;
70 void initialize_shift(uint32_t seed, vector<float>& shift) {
72 for (
size_t i = 0; i < shift.size(); ++i) {
73 shift[i] = rnd.next_double();
77 vector<float> shift_hash(
78 const vector<float>& hash,
79 const vector<float>& shift) {
80 vector<float> shifted(hash);
81 for (
size_t i = 0; i < shifted.size(); ++i) {
82 shifted[i] += shift[i];
90 for (
size_t i = 0; i < hash.size(); ++i) {
98 float calc_euclidean_distance(
99 const lsh_entry& entry,
102 const uint64_t hamm = bv.calc_hamming_similarity(entry.simhash_bv);
103 if (hamm == bv.bit_num()) {
105 return std::fabs(norm - entry.norm);
107 const float angle = (1 -
static_cast<float>(hamm) / bv.bit_num()) * M_PI;
108 const float dot = entry.norm * norm * std::cos(angle);
109 return std::sqrt(norm * norm + entry.norm * entry.norm - 2 * dot);
112 void retrieve_hit_rows_from_table(
115 unordered_set<uint64_t>& cands) {
116 lsh_table_t::const_iterator it = table.find(hash);
117 if (it != table.end()) {
118 const vector<uint64_t>& range = it->second;
119 for (
size_t j = 0; j < range.size(); ++j) {
120 cands.insert(range[j]);
134 : shift_(lsh_num * table_num),
135 table_num_(table_num) {
136 initialize_shift(seed,
shift_);
141 const vector<float>& shift)
143 table_num_(table_num) {
151 const vector<float>& hash,
160 const vector<uint64_t>& lsh_hash = it->second.lsh_hash;
161 for (
size_t i = 0; i < lsh_hash.size(); ++i) {
163 vector<uint64_t>::iterator it = lower_bound(range.begin(), range.end(), id);
164 if (it == range.end() ||
id != *it) {
165 range.insert(it,
id);
177 lsh_master_table_t::iterator entry_it =
master_table_.find(row);
204 unordered_set<std::string> id_set;
206 id_set.rehash(std::ceil(size_upper_bound / id_set.max_load_factor()));
208 for (lsh_master_table_t::const_iterator it =
master_table_.begin();
210 if (!it->second.lsh_hash.empty()) {
211 id_set.insert(it->first);
216 if (!it->second.lsh_hash.empty()) {
217 id_set.insert(it->first);
221 vector<string> ret(id_set.size());
222 copy(id_set.begin(), id_set.end(), ret.begin());
227 const vector<float>& hash,
231 vector<pair<string, float> >& ids)
const {
232 const vector<float> shifted = shift_hash(hash,
shift_);
236 unordered_set<uint64_t> cands;
248 for (uint64_t i = 0; i < probe_num; ++i) {
249 pair<size_t, lsh_vector> p = gen.get_next_table_and_vector();
250 p.second.push_back(p.first);
261 vector<pair<string, float> >& ids)
const {
270 unordered_set<uint64_t> cands;
271 for (
size_t i = 0; i < it->second.lsh_hash.size(); ++i) {
278 it->second.simhash_bv,
284 return "lsh_index_storage";
301 for (lsh_master_table_t::const_iterator it = diff.begin(); it != diff.end();
303 if (it->second.lsh_hash.empty()) {
323 for (lsh_master_table_t::const_iterator it = lhs.begin(); it != lhs.end();
325 rhs[it->first] = it->second;
339 lsh_master_table_t::iterator ret_it = entry_it;
356 for (
size_t i = 0; i < entry.
lsh_hash.size(); ++i) {
359 vector<uint64_t>& range = it->second;
360 vector<uint64_t>::iterator jt = lower_bound(range.begin(),
363 if (jt != range.end() && row_id == *jt) {
374 const vector<float>& hash,
377 const vector<float> shifted = shift_hash(hash,
shift_);
402 for (
size_t i = 0; i < entry->
lsh_hash.size(); ++i) {
405 vector<uint64_t>& range = it->second;
406 vector<uint64_t>::iterator jt = find(range.begin(), range.end(), row_id);
407 if (jt != range.end()) {
422 for (
size_t i = 0; i < entry.
lsh_hash.size(); ++i) {
430 unordered_set<uint64_t>& cands)
const {
432 retrieve_hit_rows_from_table(hash,
lsh_table_, cands);
433 return cands.size() >=
static_cast<uint64_t
>(ret_num);
437 const unordered_set<uint64_t>& cands,
441 vector<pair<string, float> >& ids)
const {
443 vector<pair<uint64_t, float> > scored;
444 scored.reserve(cands.size());
445 for (unordered_set<uint64_t>::const_iterator it = cands.begin();
446 it != cands.end(); ++it) {
448 if (!entry || entry->
lsh_hash.empty()) {
451 const float dist = calc_euclidean_distance(*entry, query_simhash,
453 scored.push_back(make_pair(*it, -dist));
456 if (scored.size() <= ret_num) {
457 sort(scored.begin(), scored.end(), greater_second());
459 partial_sort(scored.begin(),
460 scored.begin() + ret_num, scored.end(),
462 scored.resize(ret_num);
465 ids.resize(scored.size());
466 for (
size_t i = 0; i < scored.size(); ++i) {
468 ids[i].second = scored[i].second;
uint64_t get_id_const(const std::string &key) const
void get_sorted_similar_rows(const jubatus::util::data::unordered_set< uint64_t > &cands, const bit_vector &query_simhash, float query_norm, uint64_t ret_num, std::vector< std::pair< std::string, float > > &ids) const
bool put_diff(const lsh_master_table_t &mixed_diff)
void get_diff(lsh_master_table_t &diff) const
double dist(const common::sfv_t &p1, const common::sfv_t &p2)
void resize_and_clear(uint64_t bit_num)
void get_all_row_ids(std::vector< std::string > &ids) const
void pack(framework::packer &packer) const
bool retrieve_hit_rows(uint64_t hash, size_t ret_num, jubatus::util::data::unordered_set< uint64_t > &cands) const
bit_vector binarize(const vector< float > &proj)
void mix(const lsh_master_table_t &lhs, lsh_master_table_t &rhs) const
jubatus::util::data::unordered_map< std::string, lsh_entry > lsh_master_table_t
lsh_table_t lsh_table_diff_
std::vector< float > make_entry(const std::vector< float > &hash, float norm, lsh_entry &entry) const
const lsh_entry * get_lsh_entry(const std::string &row) const
const std::string & get_key(const uint64_t id) const
void set_row(const std::string &row, const std::vector< float > &hash, float norm)
msgpack::packer< jubatus_packer > packer
bit_vector_base< uint64_t > bit_vector
lsh_master_table_t master_table_diff_
const lsh_vector & base(size_t i) const
lsh_master_table_t::iterator remove_and_get_row(const std::string &row)
uint64_t get_id(const std::string &key)
void push_back(int value)
void set_mixed_row(const std::string &row, const lsh_entry &entry)
std::vector< float > shift_
jubatus::util::data::unordered_map< uint64_t, std::vector< uint64_t > > lsh_table_t
void put_empty_entry(uint64_t row_id, const lsh_entry &entry)
void unpack(msgpack::object o)
std::vector< uint64_t > lsh_hash
common::key_manager key_manager_
void similar_row(const std::vector< float > &hash, float norm, uint64_t probe_num, uint64_t ret_num, std::vector< std::pair< std::string, float > > &ids) const
void remove_row(const std::string &row)
lsh_master_table_t master_table_
virtual ~lsh_index_storage()
void remove_model_row(const std::string &row)