jubatus_core  0.1.2
Jubatus: Online machine learning framework for distributed environment
Public Member Functions | Private Member Functions | Private Attributes | List of all members
jubatus::core::storage::lsh_index_storage Class Reference

#include <lsh_index_storage.hpp>

Collaboration diagram for jubatus::core::storage::lsh_index_storage:
Collaboration graph

Public Member Functions

size_t all_lsh_num () const
 
void clear ()
 
void get_all_row_ids (std::vector< std::string > &ids) const
 
void get_diff (lsh_master_table_t &diff) const
 
storage::version get_version () const
 
 lsh_index_storage ()
 
 lsh_index_storage (size_t lsh_num, size_t table_num, uint32_t seed)
 
 lsh_index_storage (size_t table_num, const std::vector< float > &shift)
 
void mix (const lsh_master_table_t &lhs, lsh_master_table_t &rhs) const
 
 MSGPACK_DEFINE (master_table_, master_table_diff_, lsh_table_, lsh_table_diff_, shift_, table_num_, key_manager_)
 
std::string name () const
 
void pack (framework::packer &packer) const
 
bool put_diff (const lsh_master_table_t &mixed_diff)
 
void remove_row (const std::string &row)
 
void set_row (const std::string &row, const std::vector< float > &hash, float norm)
 
void similar_row (const std::vector< float > &hash, float norm, uint64_t probe_num, uint64_t ret_num, std::vector< std::pair< std::string, float > > &ids) const
 
void similar_row (const std::string &id, uint64_t ret_num, std::vector< std::pair< std::string, float > > &ids) const
 
size_t table_num () const
 
void unpack (msgpack::object o)
 
virtual ~lsh_index_storage ()
 

Private Member Functions

const lsh_entryget_lsh_entry (const std::string &row) const
 
void get_sorted_similar_rows (const jubatus::util::data::unordered_set< uint64_t > &cands, const bit_vector &query_simhash, float query_norm, uint64_t ret_num, std::vector< std::pair< std::string, float > > &ids) const
 
std::vector< float > make_entry (const std::vector< float > &hash, float norm, lsh_entry &entry) const
 
void put_empty_entry (uint64_t row_id, const lsh_entry &entry)
 
lsh_master_table_t::iterator remove_and_get_row (const std::string &row)
 
void remove_model_row (const std::string &row)
 
bool retrieve_hit_rows (uint64_t hash, size_t ret_num, jubatus::util::data::unordered_set< uint64_t > &cands) const
 
void set_mixed_row (const std::string &row, const lsh_entry &entry)
 

Private Attributes

common::key_manager key_manager_
 
lsh_table_t lsh_table_
 
lsh_table_t lsh_table_diff_
 
lsh_master_table_t master_table_
 
lsh_master_table_t master_table_diff_
 
std::vector< float > shift_
 
uint64_t table_num_
 

Detailed Description

Definition at line 51 of file lsh_index_storage.hpp.

Constructor & Destructor Documentation

jubatus::core::storage::lsh_index_storage::lsh_index_storage ( )

Definition at line 127 of file lsh_index_storage.cpp.

127  {
128 }
jubatus::core::storage::lsh_index_storage::lsh_index_storage ( size_t  lsh_num,
size_t  table_num,
uint32_t  seed 
)

Definition at line 130 of file lsh_index_storage.cpp.

References shift_.

134  : shift_(lsh_num * table_num),
136  initialize_shift(seed, shift_);
137 }
jubatus::core::storage::lsh_index_storage::lsh_index_storage ( size_t  table_num,
const std::vector< float > &  shift 
)
jubatus::core::storage::lsh_index_storage::~lsh_index_storage ( )
virtual

Definition at line 146 of file lsh_index_storage.cpp.

146  {
147 }

Member Function Documentation

size_t jubatus::core::storage::lsh_index_storage::all_lsh_num ( ) const
inline

Definition at line 83 of file lsh_index_storage.hpp.

References shift_.

83  {
84  return shift_.size();
85  }
void jubatus::core::storage::lsh_index_storage::clear ( )

Definition at line 192 of file lsh_index_storage.cpp.

References jubatus::core::common::key_manager::clear(), key_manager_, lsh_table_, lsh_table_diff_, master_table_, and master_table_diff_.

192  {
195  lsh_table_t().swap(lsh_table_);
198 }
jubatus::util::data::unordered_map< std::string, lsh_entry > lsh_master_table_t
Definition: euclid_lsh.hpp:39
jubatus::util::data::unordered_map< uint64_t, std::vector< uint64_t > > lsh_table_t

Here is the call graph for this function:

void jubatus::core::storage::lsh_index_storage::get_all_row_ids ( std::vector< std::string > &  ids) const

Definition at line 200 of file lsh_index_storage.cpp.

References master_table_, and master_table_diff_.

200  {
201  const size_t size_upper_bound = master_table_.size()
202  + master_table_diff_.size();
203 
204  unordered_set<std::string> id_set;
205  // equivalent to id_set.reserve(size_upper_bound) in C++11
206  id_set.rehash(std::ceil(size_upper_bound / id_set.max_load_factor()));
207 
208  for (lsh_master_table_t::const_iterator it = master_table_.begin();
209  it != master_table_.end(); ++it) {
210  if (!it->second.lsh_hash.empty()) {
211  id_set.insert(it->first);
212  }
213  }
214  for (lsh_master_table_t::const_iterator it = master_table_diff_.begin();
215  it != master_table_diff_.end(); ++it) {
216  if (!it->second.lsh_hash.empty()) {
217  id_set.insert(it->first);
218  }
219  }
220 
221  vector<string> ret(id_set.size());
222  copy(id_set.begin(), id_set.end(), ret.begin());
223  ids.swap(ret);
224 }
void jubatus::core::storage::lsh_index_storage::get_diff ( lsh_master_table_t diff) const

Definition at line 295 of file lsh_index_storage.cpp.

References master_table_diff_.

295  {
296  diff = master_table_diff_;
297 }
const lsh_entry * jubatus::core::storage::lsh_index_storage::get_lsh_entry ( const std::string &  row) const
private

Definition at line 472 of file lsh_index_storage.cpp.

References master_table_, and master_table_diff_.

Referenced by get_sorted_similar_rows(), and remove_model_row().

472  {
473  lsh_master_table_t::const_iterator it = master_table_diff_.find(row);
474  if (it == master_table_diff_.end()) {
475  it = master_table_.find(row);
476  if (it == master_table_.end()) {
477  return 0;
478  }
479  }
480  return &it->second;
481 }

Here is the caller graph for this function:

void jubatus::core::storage::lsh_index_storage::get_sorted_similar_rows ( const jubatus::util::data::unordered_set< uint64_t > &  cands,
const bit_vector query_simhash,
float  query_norm,
uint64_t  ret_num,
std::vector< std::pair< std::string, float > > &  ids 
) const
private

Definition at line 436 of file lsh_index_storage.cpp.

References jubatus::core::clustering::dist(), jubatus::core::common::key_manager::get_key(), get_lsh_entry(), key_manager_, and jubatus::core::storage::lsh_entry::lsh_hash.

441  {
442  // Avoid string copy as far as possible
443  vector<pair<uint64_t, float> > scored;
444  scored.reserve(cands.size());
445  for (unordered_set<uint64_t>::const_iterator it = cands.begin();
446  it != cands.end(); ++it) {
447  const lsh_entry* entry = get_lsh_entry(key_manager_.get_key(*it));
448  if (!entry || entry->lsh_hash.empty()) {
449  continue;
450  }
451  const float dist = calc_euclidean_distance(*entry, query_simhash,
452  query_norm);
453  scored.push_back(make_pair(*it, -dist));
454  }
455 
456  if (scored.size() <= ret_num) {
457  sort(scored.begin(), scored.end(), greater_second());
458  } else {
459  partial_sort(scored.begin(),
460  scored.begin() + ret_num, scored.end(),
461  greater_second());
462  scored.resize(ret_num);
463  }
464 
465  ids.resize(scored.size());
466  for (size_t i = 0; i < scored.size(); ++i) {
467  ids[i].first = key_manager_.get_key(scored[i].first);
468  ids[i].second = scored[i].second;
469  }
470 }
double dist(const common::sfv_t &p1, const common::sfv_t &p2)
Definition: util.cpp:151
const lsh_entry * get_lsh_entry(const std::string &row) const
const std::string & get_key(const uint64_t id) const
Definition: key_manager.cpp:78

Here is the call graph for this function:

storage::version jubatus::core::storage::lsh_index_storage::get_version ( ) const
inline

Definition at line 86 of file lsh_index_storage.hpp.

86  {
87  return storage::version();
88  }
vector< float > jubatus::core::storage::lsh_index_storage::make_entry ( const std::vector< float > &  hash,
float  norm,
lsh_entry entry 
) const
private

Definition at line 373 of file lsh_index_storage.cpp.

References jubatus::core::storage::lsh_probe_generator::base(), jubatus::core::nearest_neighbor::binarize(), jubatus::core::storage::lsh_entry::lsh_hash, jubatus::core::storage::lsh_entry::norm, jubatus::core::storage::lsh_vector::push_back(), shift_, jubatus::core::storage::lsh_entry::simhash_bv, and table_num_.

Referenced by set_row().

376  {
377  const vector<float> shifted = shift_hash(hash, shift_);
378  lsh_probe_generator gen(shifted, table_num_);
379 
380  entry.lsh_hash.resize(table_num_);
381  for (uint64_t i = 0; i < table_num_; ++i) {
382  lsh_vector key = gen.base(i);
383  key.push_back(i);
384  entry.lsh_hash[i] = hash_lv(key);
385  }
386 
387  entry.simhash_bv = binarize(hash);
388  entry.norm = norm;
389 
390  return shifted;
391 }
bit_vector binarize(const vector< float > &proj)

Here is the call graph for this function:

Here is the caller graph for this function:

void jubatus::core::storage::lsh_index_storage::mix ( const lsh_master_table_t lhs,
lsh_master_table_t rhs 
) const

Definition at line 320 of file lsh_index_storage.cpp.

322  {
323  for (lsh_master_table_t::const_iterator it = lhs.begin(); it != lhs.end();
324  ++it) {
325  rhs[it->first] = it->second;
326  }
327 }
jubatus::core::storage::lsh_index_storage::MSGPACK_DEFINE ( master_table_  ,
master_table_diff_  ,
lsh_table_  ,
lsh_table_diff_  ,
shift_  ,
table_num_  ,
key_manager_   
)
string jubatus::core::storage::lsh_index_storage::name ( ) const

Definition at line 283 of file lsh_index_storage.cpp.

283  {
284  return "lsh_index_storage";
285 }
void jubatus::core::storage::lsh_index_storage::pack ( framework::packer packer) const

Definition at line 287 of file lsh_index_storage.cpp.

287  {
288  packer.pack(*this);
289 }
msgpack::packer< jubatus_packer > packer
Definition: bandit_base.hpp:31
bool jubatus::core::storage::lsh_index_storage::put_diff ( const lsh_master_table_t mixed_diff)

Definition at line 299 of file lsh_index_storage.cpp.

References lsh_table_diff_, master_table_, master_table_diff_, remove_model_row(), and set_mixed_row().

300  {
301  for (lsh_master_table_t::const_iterator it = diff.begin(); it != diff.end();
302  ++it) {
303  if (it->second.lsh_hash.empty()) {
304  remove_model_row(it->first);
305  master_table_.erase(it->first);
306  } else {
307  remove_model_row(it->first);
308  set_mixed_row(it->first, it->second);
309  }
310  }
311 
312  master_table_diff_.clear();
313 
314  // lsh_table_diff_ is actually not MIXed, but must be cleared as well as diff
315  // of usual model.
316  lsh_table_diff_.clear();
317  return true;
318 }
void set_mixed_row(const std::string &row, const lsh_entry &entry)
void remove_model_row(const std::string &row)

Here is the call graph for this function:

void jubatus::core::storage::lsh_index_storage::put_empty_entry ( uint64_t  row_id,
const lsh_entry entry 
)
private

Definition at line 353 of file lsh_index_storage.cpp.

References jubatus::core::storage::lsh_entry::lsh_hash, and lsh_table_diff_.

Referenced by remove_and_get_row(), and remove_row().

355  {
356  for (size_t i = 0; i < entry.lsh_hash.size(); ++i) {
357  lsh_table_t::iterator it = lsh_table_diff_.find(entry.lsh_hash[i]);
358  if (it != lsh_table_diff_.end()) {
359  vector<uint64_t>& range = it->second;
360  vector<uint64_t>::iterator jt = lower_bound(range.begin(),
361  range.end(),
362  row_id);
363  if (jt != range.end() && row_id == *jt) {
364  range.erase(jt);
365  if (range.empty()) {
366  lsh_table_diff_.erase(it);
367  }
368  }
369  }
370  }
371 }

Here is the caller graph for this function:

lsh_master_table_t::iterator jubatus::core::storage::lsh_index_storage::remove_and_get_row ( const std::string &  row)
private

Definition at line 331 of file lsh_index_storage.cpp.

References jubatus::core::common::key_manager::get_id_const(), key_manager_, master_table_, master_table_diff_, jubatus::core::common::key_manager::NOTFOUND, and put_empty_entry().

Referenced by set_row().

332  {
333  const uint64_t row_id = key_manager_.get_id_const(row);
334  if (row_id == common::key_manager::NOTFOUND) {
335  return master_table_diff_.end();
336  }
337 
338  lsh_master_table_t::iterator entry_it = master_table_diff_.find(row);
339  lsh_master_table_t::iterator ret_it = entry_it;
340  if (entry_it == master_table_diff_.end()) {
341  ret_it = master_table_diff_.insert(make_pair(row, lsh_entry())).first;
342  entry_it = master_table_.find(row);
343  if (entry_it == master_table_.end()) {
344  return ret_it;
345  }
346  }
347  lsh_entry& entry = entry_it->second;
348  put_empty_entry(row_id, entry);
349 
350  return ret_it;
351 }
uint64_t get_id_const(const std::string &key) const
Definition: key_manager.cpp:67
void put_empty_entry(uint64_t row_id, const lsh_entry &entry)

Here is the call graph for this function:

Here is the caller graph for this function:

void jubatus::core::storage::lsh_index_storage::remove_model_row ( const std::string &  row)
private

Definition at line 395 of file lsh_index_storage.cpp.

References jubatus::core::common::key_manager::get_id_const(), get_lsh_entry(), key_manager_, jubatus::core::storage::lsh_entry::lsh_hash, and lsh_table_.

Referenced by put_diff().

395  {
396  const lsh_entry* entry = get_lsh_entry(row);
397  if (!entry) {
398  return;
399  }
400 
401  const uint64_t row_id = key_manager_.get_id_const(row);
402  for (size_t i = 0; i < entry->lsh_hash.size(); ++i) {
403  lsh_table_t::iterator it = lsh_table_.find(entry->lsh_hash[i]);
404  if (it != lsh_table_.end()) {
405  vector<uint64_t>& range = it->second;
406  vector<uint64_t>::iterator jt = find(range.begin(), range.end(), row_id);
407  if (jt != range.end()) {
408  range.erase(jt);
409  if (range.empty()) {
410  lsh_table_.erase(it);
411  }
412  }
413  }
414  }
415 }
uint64_t get_id_const(const std::string &key) const
Definition: key_manager.cpp:67
const lsh_entry * get_lsh_entry(const std::string &row) const

Here is the call graph for this function:

Here is the caller graph for this function:

void jubatus::core::storage::lsh_index_storage::remove_row ( const std::string &  row)

Definition at line 170 of file lsh_index_storage.cpp.

References jubatus::core::common::key_manager::get_id_const(), key_manager_, master_table_, master_table_diff_, jubatus::core::common::key_manager::NOTFOUND, and put_empty_entry().

170  {
171  const uint64_t row_id = key_manager_.get_id_const(row);
172  if (row_id == common::key_manager::NOTFOUND) {
173  // Non-existence row
174  return;
175  }
176 
177  lsh_master_table_t::iterator entry_it = master_table_.find(row);
178  if (entry_it == master_table_.end()) {
179  // Since the row is not yet mixed, it can be immediately erased.
180  master_table_diff_.erase(row);
181  return;
182  }
183 
184  // Otherwise, keep the row with empty entry until next MIX.
185  master_table_diff_.insert(make_pair(row, lsh_entry()));
186  lsh_entry& entry = entry_it->second;
187  put_empty_entry(row_id, entry);
188 
189  return;
190 }
uint64_t get_id_const(const std::string &key) const
Definition: key_manager.cpp:67
void put_empty_entry(uint64_t row_id, const lsh_entry &entry)

Here is the call graph for this function:

bool jubatus::core::storage::lsh_index_storage::retrieve_hit_rows ( uint64_t  hash,
size_t  ret_num,
jubatus::util::data::unordered_set< uint64_t > &  cands 
) const
private

Definition at line 427 of file lsh_index_storage.cpp.

References lsh_table_, and lsh_table_diff_.

430  {
431  retrieve_hit_rows_from_table(hash, lsh_table_diff_, cands);
432  retrieve_hit_rows_from_table(hash, lsh_table_, cands);
433  return cands.size() >= static_cast<uint64_t>(ret_num);
434 }
void jubatus::core::storage::lsh_index_storage::set_mixed_row ( const std::string &  row,
const lsh_entry entry 
)
private

Definition at line 417 of file lsh_index_storage.cpp.

References jubatus::core::common::key_manager::get_id(), key_manager_, jubatus::core::storage::lsh_entry::lsh_hash, lsh_table_, and master_table_.

Referenced by put_diff().

419  {
420  const uint64_t row_id = key_manager_.get_id(row);
421  master_table_[row] = entry;
422  for (size_t i = 0; i < entry.lsh_hash.size(); ++i) {
423  lsh_table_[entry.lsh_hash[i]].push_back(row_id);
424  }
425 }
uint64_t get_id(const std::string &key)
Definition: key_manager.cpp:48

Here is the call graph for this function:

Here is the caller graph for this function:

void jubatus::core::storage::lsh_index_storage::set_row ( const std::string &  row,
const std::vector< float > &  hash,
float  norm 
)

Definition at line 149 of file lsh_index_storage.cpp.

References jubatus::core::common::key_manager::get_id(), key_manager_, lsh_table_diff_, make_entry(), master_table_diff_, and remove_and_get_row().

Referenced by jubatus::core::recommender::euclid_lsh::update_row().

152  {
153  lsh_master_table_t::iterator it = remove_and_get_row(row);
154  if (it == master_table_diff_.end()) {
155  it = master_table_diff_.insert(make_pair(row, lsh_entry())).first;
156  }
157  make_entry(hash, norm, it->second);
158 
159  const uint64_t id = key_manager_.get_id(row);
160  const vector<uint64_t>& lsh_hash = it->second.lsh_hash;
161  for (size_t i = 0; i < lsh_hash.size(); ++i) {
162  vector<uint64_t>& range = lsh_table_diff_[lsh_hash[i]];
163  vector<uint64_t>::iterator it = lower_bound(range.begin(), range.end(), id);
164  if (it == range.end() || id != *it) {
165  range.insert(it, id);
166  }
167  }
168 }
std::vector< float > make_entry(const std::vector< float > &hash, float norm, lsh_entry &entry) const
lsh_master_table_t::iterator remove_and_get_row(const std::string &row)
uint64_t get_id(const std::string &key)
Definition: key_manager.cpp:48

Here is the call graph for this function:

Here is the caller graph for this function:

void jubatus::core::storage::lsh_index_storage::similar_row ( const std::vector< float > &  hash,
float  norm,
uint64_t  probe_num,
uint64_t  ret_num,
std::vector< std::pair< std::string, float > > &  ids 
) const
void jubatus::core::storage::lsh_index_storage::similar_row ( const std::string &  id,
uint64_t  ret_num,
std::vector< std::pair< std::string, float > > &  ids 
) const
size_t jubatus::core::storage::lsh_index_storage::table_num ( ) const
inline

Definition at line 79 of file lsh_index_storage.hpp.

References table_num_.

79  {
80  return table_num_;
81  }
void jubatus::core::storage::lsh_index_storage::unpack ( msgpack::object  o)

Definition at line 291 of file lsh_index_storage.cpp.

291  {
292  o.convert(this);
293 }

Member Data Documentation

common::key_manager jubatus::core::storage::lsh_index_storage::key_manager_
private
lsh_table_t jubatus::core::storage::lsh_index_storage::lsh_table_
private

Definition at line 127 of file lsh_index_storage.hpp.

Referenced by clear(), remove_model_row(), retrieve_hit_rows(), and set_mixed_row().

lsh_table_t jubatus::core::storage::lsh_index_storage::lsh_table_diff_
private

Definition at line 128 of file lsh_index_storage.hpp.

Referenced by clear(), put_diff(), put_empty_entry(), retrieve_hit_rows(), and set_row().

lsh_master_table_t jubatus::core::storage::lsh_index_storage::master_table_
private
lsh_master_table_t jubatus::core::storage::lsh_index_storage::master_table_diff_
private
std::vector<float> jubatus::core::storage::lsh_index_storage::shift_
private

Definition at line 130 of file lsh_index_storage.hpp.

Referenced by all_lsh_num(), lsh_index_storage(), and make_entry().

uint64_t jubatus::core::storage::lsh_index_storage::table_num_
private

Definition at line 131 of file lsh_index_storage.hpp.

Referenced by make_entry(), and table_num().


The documentation for this class was generated from the following files: