jubatus_core  0.1.2
Jubatus: Online machine learning framework for distributed environment
inverted_index_storage.cpp
Go to the documentation of this file.
1 // Jubatus: Online machine learning framework for distributed environment
2 // Copyright (C) 2011 Preferred Networks and Nippon Telegraph and Telephone Corporation.
3 //
4 // This library is free software; you can redistribute it and/or
5 // modify it under the terms of the GNU Lesser General Public
6 // License version 2.1 as published by the Free Software Foundation.
7 //
8 // This library is distributed in the hope that it will be useful,
9 // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 // Lesser General Public License for more details.
12 //
13 // You should have received a copy of the GNU Lesser General Public
14 // License along with this library; if not, write to the Free Software
15 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16 
18 #include <algorithm>
19 #include <cmath>
20 #include <functional>
21 #include <sstream>
22 #include <string>
23 #include <utility>
24 #include <vector>
25 
26 #include "../storage/fixed_size_heap.hpp"
27 
28 using std::istringstream;
29 using std::make_pair;
30 using std::ostringstream;
31 using std::pair;
32 using std::sort;
33 using std::sqrt;
34 using std::string;
35 using std::vector;
36 
37 namespace jubatus {
38 namespace core {
39 namespace storage {
40 
42 }
43 
45 }
46 
48  const std::string& row,
49  const std::string& column,
50  float val) {
51  uint64_t column_id = column2id_.get_id_const(column);
52 
53  if (column_id == common::key_manager::NOTFOUND) {
54  column_id = column2id_.get_id(column);
55  } else {
56  float cur_val = get(row, column);
57  column2norm_diff_[column_id] -= cur_val * cur_val;
58  }
59  inv_diff_[row][column_id] = val;
60  column2norm_diff_[column_id] += val * val;
61  if (column2norm_diff_[column_id] == 0) {
62  column2norm_diff_.erase(column_id);
63  }
64 }
65 
67  const string& row,
68  const string& column) const {
69  uint64_t column_id = column2id_.get_id_const(column);
70  if (column_id == common::key_manager::NOTFOUND) {
71  return 0.f;
72  }
73  {
74  bool exist = false;
75  float ret = get_from_tbl(row, column_id, inv_diff_, exist);
76  if (exist) {
77  return ret;
78  }
79  }
80  {
81  bool exist = false;
82  float ret = get_from_tbl(row, column_id, inv_, exist);
83  if (exist) {
84  return ret;
85  }
86  }
87  return 0.0;
88 }
89 
91  const std::string& row,
92  uint64_t column_id,
93  const tbl_t& tbl,
94  bool& exist) const {
95  exist = false;
96 
97  if (column_id == common::key_manager::NOTFOUND) {
98  return 0.f;
99  }
100  tbl_t::const_iterator it = tbl.find(row);
101  if (it == tbl.end()) {
102  return 0.f;
103  } else {
104  row_t::const_iterator it_row = it->second.find(column_id);
105  if (it_row == it->second.end()) {
106  return 0.f;
107  } else {
108  exist = true;
109  return it_row->second;
110  }
111  }
112 }
113 
115  const std::string& row,
116  const std::string& column) {
117  uint64_t column_id = column2id_.get_id_const(column);
118  if (column_id == common::key_manager::NOTFOUND) {
119  return;
120  }
121 
122  set(row, column, 0.f);
123 
124  // Test if the data exists in the master table.
125  bool exist = false;
126  get_from_tbl(row, column_id, inv_, exist);
127 
128  // If the data exists in the master table, we should
129  // keep it in the diff table until next MIX to propagate
130  // the removal of this data to other nodes.
131  // Otherwise we can immediately remove the row from
132  // the diff table.
133  if (!exist) {
134  tbl_t::iterator it = inv_diff_.find(row);
135  if (it != inv_diff_.end()) {
136  row_t::iterator it_row = it->second.find(column_id);
137  if (it_row != it->second.end()) {
138  it->second.erase(it_row);
139  if (it->second.empty()) {
140  // There are no columns that belongs to this row,
141  // so we can remove the row itself.
142  inv_diff_.erase(it);
143  }
144  }
145  }
146  }
147 }
148 
150  tbl_t().swap(inv_);
151  tbl_t().swap(inv_diff_);
152  imap_float_t().swap(column2norm_);
155 }
156 
158  std::vector<std::string>& ids) const {
159  ids.clear();
160  for (imap_float_t::const_iterator it = column2norm_.begin();
161  it != column2norm_.end(); ++it) {
162  ids.push_back(column2id_.get_key(it->first));
163  }
164  for (imap_float_t::const_iterator it = column2norm_diff_.begin();
165  it != column2norm_diff_.end(); ++it) {
166  if (column2norm_.find(it->first) == column2norm_.end()) {
167  ids.push_back(column2id_.get_key(it->first));
168  }
169  }
170 }
171 
173  for (tbl_t::const_iterator it = inv_diff_.begin(); it != inv_diff_.end();
174  ++it) {
175  vector<pair<string, float> > columns;
176  for (row_t::const_iterator it2 = it->second.begin();
177  it2 != it->second.end(); ++it2) {
178  columns.push_back(make_pair(column2id_.get_key(it2->first), it2->second));
179  }
180  diff.inv.set_row(it->first, columns);
181  }
182 
183  for (imap_float_t::const_iterator it = column2norm_diff_.begin();
184  it != column2norm_diff_.end(); ++it) {
185  diff.column2norm[column2id_.get_key(it->first)] = it->second;
186  }
187 }
188 
190  const diff_type& mixed_diff) {
191  vector<string> ids;
192  mixed_diff.inv.get_all_row_ids(ids);
193  for (size_t i = 0; i < ids.size(); ++i) {
194  const string& row = ids[i];
195  row_t& v = inv_[row];
196  vector<pair<string, float> > columns;
197  mixed_diff.inv.get_row(row, columns);
198  for (size_t j = 0; j < columns.size(); ++j) {
199  size_t id = column2id_.get_id(columns[j].first);
200  if (columns[j].second == 0.f) {
201  v.erase(id);
202  } else {
203  v[id] = columns[j].second;
204  }
205  }
206  }
207  inv_diff_.clear();
208 
209  for (map_float_t::const_iterator it = mixed_diff.column2norm.begin();
210  it != mixed_diff.column2norm.end(); ++it) {
211  uint64_t column_index = column2id_.get_id(it->first);
212  column2norm_[column_index] += it->second;
213  if (column2norm_[column_index] == 0.f) {
214  column2norm_.erase(column_index);
215  }
216  }
217  column2norm_diff_.clear();
218  return true;
219 }
220 
221 void inverted_index_storage::mix(const diff_type& lhs, diff_type& rhs) const {
222  // merge inv diffs
223  vector<string> ids;
224  lhs.inv.get_all_row_ids(ids);
225  for (size_t i = 0; i < ids.size(); ++i) {
226  const string& row = ids[i];
227 
228  vector<pair<string, float> > columns;
229  lhs.inv.get_row(row, columns);
230  rhs.inv.set_row(row, columns);
231  }
232 
233  // merge norm diffs
234  for (map_float_t::const_iterator it = lhs.column2norm.begin();
235  it != lhs.column2norm.end(); ++it) {
236  rhs.column2norm[it->first] += it->second;
237  }
238 }
239 
241  packer.pack(*this);
242 }
243 
244 void inverted_index_storage::unpack(msgpack::object o) {
245  o.convert(this);
246 }
247 
249  const common::sfv_t& query,
250  vector<pair<string, float> >& scores,
251  size_t ret_num) const {
252  float query_norm = calc_l2norm(query);
253  if (query_norm == 0.f) {
254  return;
255  }
256 
257  std::vector<float> i_scores(column2id_.get_max_id() + 1, 0.0);
258  for (size_t i = 0; i < query.size(); ++i) {
259  const string& fid = query[i].first;
260  float val = query[i].second;
261  add_inp_scores(fid, val, i_scores);
262  }
263 
265  std::greater<pair<float, uint64_t> > > heap(ret_num);
266  for (size_t i = 0; i < i_scores.size(); ++i) {
267  float score = i_scores[i];
268  if (score == 0.f)
269  continue;
270  float norm = calc_columnl2norm(i);
271  if (norm == 0.f)
272  continue;
273  float normed_score = score / norm / query_norm;
274  heap.push(make_pair(normed_score, i));
275  }
276  vector<pair<float, uint64_t> > sorted_scores;
277  heap.get_sorted(sorted_scores);
278 
279  for (size_t i = 0; i < sorted_scores.size() && i < ret_num; ++i) {
280  scores.push_back(
281  make_pair(column2id_.get_key(sorted_scores[i].second),
282  sorted_scores[i].first));
283  }
284 }
285 
287  float ret = 0.f;
288  for (size_t i = 0; i < sfv.size(); ++i) {
289  ret += sfv[i].second * sfv[i].second;
290  }
291  return std::sqrt(ret);
292 }
293 
294 float inverted_index_storage::calc_columnl2norm(uint64_t column_id) const {
295  float ret = 0.f;
296  imap_float_t::const_iterator it_diff = column2norm_diff_.find(column_id);
297  if (it_diff != column2norm_diff_.end()) {
298  ret += it_diff->second;
299  }
300  imap_float_t::const_iterator it = column2norm_.find(column_id);
301  if (it != column2norm_.end()) {
302  ret += it->second;
303  }
304  return std::sqrt(ret);
305 }
306 
308  const std::string& row,
309  float val,
310  std::vector<float>& scores) const {
311  tbl_t::const_iterator it_diff = inv_diff_.find(row);
312  if (it_diff != inv_diff_.end()) {
313  const row_t& row_v = it_diff->second;
314  for (row_t::const_iterator row_it = row_v.begin(); row_it != row_v.end();
315  ++row_it) {
316  scores[row_it->first] += row_it->second * val;
317  }
318  }
319 
320  tbl_t::const_iterator it = inv_.find(row);
321  if (it != inv_.end()) {
322  const row_t& row_v = it->second;
323  if (it_diff == inv_diff_.end()) {
324  for (row_t::const_iterator row_it = row_v.begin(); row_it != row_v.end();
325  ++row_it) {
326  scores[row_it->first] += row_it->second * val;
327  }
328  } else {
329  const row_t& row_diff_v = it_diff->second;
330  for (row_t::const_iterator row_it = row_v.begin(); row_it != row_v.end();
331  ++row_it) {
332  if (row_diff_v.find(row_it->first) == row_diff_v.end()) {
333  scores[row_it->first] += row_it->second * val;
334  }
335  }
336  }
337  }
338 }
339 
340 std::string inverted_index_storage::name() const {
341  return string("inverted_index_storage");
342 }
343 
344 } // namespace storage
345 } // namespace core
346 } // namespace jubatus
uint64_t get_id_const(const std::string &key) const
Definition: key_manager.cpp:67
const_iterator find(const K &key) const
void get_row(const std::string &row, std::vector< std::pair< std::string, float > > &columns) const
void get_sorted(std::vector< T > &v) const
void calc_scores(const common::sfv_t &sfv, std::vector< std::pair< std::string, float > > &scores, size_t ret_num) const
void set(const std::string &row, const std::string &column, float val)
const_iterator begin() const
data_type::const_iterator const_iterator
void pack(framework::packer &packer) const
void remove(const std::string &row, const std::string &column)
const std::string & get_key(const uint64_t id) const
Definition: key_manager.cpp:78
float get_from_tbl(const std::string &row, uint64_t column_id, const tbl_t &tbl, bool &exist) const
jubatus::util::data::unordered_map< uint64_t, float > imap_float_t
static float calc_l2norm(const common::sfv_t &sfv)
msgpack::packer< jubatus_packer > packer
Definition: bandit_base.hpp:31
void get_all_column_ids(std::vector< std::string > &ids) const
std::vector< T > v(size)
uint64_t get_id(const std::string &key)
Definition: key_manager.cpp:48
jubatus::util::data::unordered_map< std::string, row_t > tbl_t
std::vector< std::pair< std::string, float > > sfv_t
Definition: type.hpp:29
void mix(const diff_type &lhs_str, diff_type &rhs_str) const
void get_all_row_ids(std::vector< std::string > &ids) const
float get(const std::string &row, const std::string &column) const
void set_row(const std::string &row, const std::vector< std::pair< std::string, float > > &columns)
void add_inp_scores(const std::string &row, float val, std::vector< float > &scores) const