jubatus_core  0.1.2
Jubatus: Online machine learning framework for distributed environment
converter_config.cpp
Go to the documentation of this file.
1 // Jubatus: Online machine learning framework for distributed environment
2 // Copyright (C) 2011 Preferred Networks and Nippon Telegraph and Telephone Corporation.
3 //
4 // This library is free software; you can redistribute it and/or
5 // modify it under the terms of the GNU Lesser General Public
6 // License version 2.1 as published by the Free Software Foundation.
7 //
8 // This library is distributed in the hope that it will be useful,
9 // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 // Lesser General Public License for more details.
12 //
13 // You should have received a copy of the GNU Lesser General Public
14 // License along with this library; if not, write to the Free Software
15 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16 
17 #include "converter_config.hpp"
18 
19 #include <map>
20 #include <string>
21 #include <vector>
22 #include <sstream>
23 #include "jubatus/util/text/json.h"
24 #include "jubatus/util/lang/bind.h"
25 #include "jubatus/util/lang/function.h"
26 #include "binary_feature.hpp"
28 #include "combination_feature.hpp"
31 #include "except_match.hpp"
33 #include "exception.hpp"
34 #include "factory.hpp"
35 #include "key_matcher.hpp"
36 #include "key_matcher_factory.hpp"
37 #include "num_feature.hpp"
38 #include "num_feature_impl.hpp"
39 #include "num_feature_factory.hpp"
40 #include "num_filter.hpp"
41 #include "num_filter_factory.hpp"
42 #include "space_splitter.hpp"
44 #include "string_filter.hpp"
46 #include "without_split.hpp"
47 
48 using std::string;
49 using std::stringstream;
50 using std::vector;
51 using std::map;
52 
53 namespace jubatus {
54 namespace core {
55 namespace fv_converter {
56 
57 namespace {
58 
59 typedef jubatus::util::lang::shared_ptr<string_feature> string_feature_ptr;
60 typedef jubatus::util::lang::shared_ptr<key_matcher> matcher_ptr;
61 typedef jubatus::util::lang::shared_ptr<num_feature> num_feature_ptr;
62 typedef jubatus::util::lang::shared_ptr<binary_feature> binary_feature_ptr;
63 typedef jubatus::util::lang::shared_ptr<combination_feature>
64  combination_feature_ptr;
65 typedef jubatus::util::lang::shared_ptr<string_filter> string_filter_ptr;
66 typedef jubatus::util::lang::shared_ptr<num_filter> num_filter_ptr;
67 
68 splitter_weight_type make_weight_type(
69  const string& sample, const string& global) {
70  frequency_weight_type sample_type;
71  if (sample == "bin") {
72  sample_type = FREQ_BINARY;
73  } else if (sample == "tf") {
74  sample_type = TERM_FREQUENCY;
75  } else if (sample == "log_tf") {
76  sample_type = LOG_TERM_FREQUENCY;
77  } else {
78  throw JUBATUS_EXCEPTION(
79  converter_exception("unknown sample_weight: [" +
80  sample + "] in string_rules"));
81  }
82 
83  term_weight_type global_type;
84  if (global == "bin") {
85  global_type = TERM_BINARY;
86  } else if (global == "idf") {
87  global_type = IDF;
88  } else if (global == "weight") {
89  global_type = WITH_WEIGHT_FILE;
90  } else {
91  throw JUBATUS_EXCEPTION(
92  converter_exception("unknown global_weight: [" +
93  global + "] in string_rules"));
94  }
95  return splitter_weight_type(sample_type, global_type);
96 }
97 
98 string get_or_die(
99  const map<string, string>& m,
100  const string& key) {
101  map<string, string>::const_iterator it = m.find(key);
102  if (it == m.end()) {
103  throw JUBATUS_EXCEPTION(
104  converter_exception("parameter: [" + key + "] must be defined"));
105  } else {
106  return it->second;
107  }
108 }
109 
110 matcher_ptr create_key_matcher(
111  const string& key,
112  const jubatus::util::data::optional<string>& except) {
113  key_matcher_factory f;
114  if (except) {
115  matcher_ptr m1(f.create_matcher(key));
116  matcher_ptr m2(f.create_matcher(*except));
117  return matcher_ptr(new except_match(m1, m2));
118  } else {
119  return matcher_ptr(f.create_matcher(key));
120  }
121 }
122 
123 void init_string_filter_types(
124  const map<string, param_t>& filter_types,
125  map<string, string_filter_ptr>& filters,
127  string_filter_factory f(ext);
128  for (map<string, param_t>::const_iterator it = filter_types.begin();
129  it != filter_types.end(); ++it) {
130  const string& name = it->first;
131  const map<string, string>& param = it->second;
132 
133  string method = get_or_die(param, "method");
134  string_filter_ptr filter(f.create(method, param));
135  filters[name] = filter;
136  }
137 }
138 
139 void init_num_filter_types(
140  const map<string, param_t>& filter_types,
141  map<string, num_filter_ptr>& filters,
143  num_filter_factory f(ext);
144  for (map<string, param_t>::const_iterator it = filter_types.begin();
145  it != filter_types.end(); ++it) {
146  const string& name = it->first;
147  const map<string, string>& param = it->second;
148 
149  string method = get_or_die(param, "method");
150  num_filter_ptr filter(f.create(method, param));
151  filters[name] = filter;
152  }
153 }
154 
155 void init_num_filter_rules(
156  const vector<filter_rule>& filter_rules,
157  const map<string, num_filter_ptr>& filters,
158  datum_to_fv_converter& conv) {
159  for (size_t i = 0; i < filter_rules.size(); ++i) {
160  const filter_rule& rule = filter_rules[i];
161  map<string, num_filter_ptr>::const_iterator it =
162  filters.find(rule.type);
163  if (it == filters.end()) {
164  throw JUBATUS_EXCEPTION(
165  converter_exception("unknown type: [" +
166  rule.type + "] in num_filter_rules"));
167  }
168 
169  matcher_ptr m(create_key_matcher(rule.key, rule.except));
170  conv.register_num_filter(m, it->second, rule.suffix);
171  }
172 }
173 
174 void register_default_string_types(
175  map<string, string_feature_ptr>& splitters) {
176  splitters["str"] = string_feature_ptr(new without_split());
177  splitters["space"] = string_feature_ptr(new space_splitter());
178 }
179 
180 void init_string_types(
181  const map<string, param_t>& string_types,
182  map<string, string_feature_ptr>& splitters,
184  string_feature_factory f(ext);
185  for (map<string, param_t>::const_iterator it = string_types.begin();
186  it != string_types.end(); ++it) {
187  const string& name = it->first;
188  const map<string, string>& param = it->second;
189 
190  string method = get_or_die(param, "method");
191  string_feature_ptr splitter(f.create(method, param));
192  splitters[name] = splitter;
193  }
194 }
195 
196 void init_string_filter_rules(
197  const vector<filter_rule>& filter_rules,
198  const map<string, string_filter_ptr>& filters,
199  datum_to_fv_converter& conv) {
200  for (size_t i = 0; i < filter_rules.size(); ++i) {
201  const filter_rule& rule = filter_rules[i];
202  map<string, string_filter_ptr>::const_iterator it =
203  filters.find(rule.type);
204  if (it == filters.end()) {
205  throw JUBATUS_EXCEPTION(
206  converter_exception("unknown type: [" +
207  rule.type + "] in string_filter_rules"));
208  }
209 
210  matcher_ptr m(create_key_matcher(rule.key, rule.except));
211  conv.register_string_filter(m, it->second, rule.suffix);
212  }
213 }
214 
215 void init_string_rules(
216  const vector<string_rule>& string_rules,
217  const map<string, string_feature_ptr>& splitters,
218  datum_to_fv_converter& conv) {
219  for (size_t i = 0; i < string_rules.size(); ++i) {
220  const string_rule& rule = string_rules[i];
221  matcher_ptr m(create_key_matcher(rule.key, rule.except));
222  map<string, string_feature_ptr>::const_iterator it =
223  splitters.find(rule.type);
224  if (it == splitters.end()) {
225  throw JUBATUS_EXCEPTION(
226  converter_exception("unknown type: [" +
227  rule.type + "] in string_rules"));
228  }
229 
230  vector<splitter_weight_type> ws;
231  ws.push_back(make_weight_type(rule.sample_weight, rule.global_weight));
232  conv.register_string_rule(rule.type, m, it->second, ws);
233  }
234 }
235 
236 void register_default_num_types(
237  map<string, num_feature_ptr>& num_features) {
238  num_features["num"] = num_feature_ptr(new num_value_feature());
239  num_features["log"] = num_feature_ptr(new num_log_feature());
240  num_features["str"] = num_feature_ptr(new num_string_feature());
241 }
242 
243 void init_num_types(
244  const map<string, param_t>& num_types,
245  map<string, num_feature_ptr>& num_features,
247  num_feature_factory f(ext);
248  for (map<string, param_t>::const_iterator it = num_types.begin();
249  it != num_types.end(); ++it) {
250  const string& name = it->first;
251  const map<string, string>& param = it->second;
252 
253  string method = get_or_die(param, "method");
254  num_feature_ptr feature(f.create(method, param));
255  num_features[name] = feature;
256  }
257 }
258 
259 void init_num_rules(
260  const vector<num_rule>& num_rules,
261  const map<string, num_feature_ptr>& num_features,
262  datum_to_fv_converter& conv) {
263  for (size_t i = 0; i < num_rules.size(); ++i) {
264  const num_rule& rule = num_rules[i];
265  matcher_ptr m(create_key_matcher(rule.key, rule.except));
266  map<string, num_feature_ptr>::const_iterator it =
267  num_features.find(rule.type);
268  if (it == num_features.end()) {
269  throw JUBATUS_EXCEPTION(
270  converter_exception("unknown type: [" +
271  rule.type + "] in num_rules"));
272  }
273 
274  conv.register_num_rule(rule.type, m, it->second);
275  }
276 }
277 
278 void init_binary_types(
279  const map<string, param_t>& binary_types,
280  map<string, binary_feature_ptr>& binary_features,
282  binary_feature_factory f(ext);
283  for (map<string, param_t>::const_iterator it = binary_types.begin();
284  it != binary_types.end(); ++it) {
285  const string& name = it->first;
286  const map<string, string>& param = it->second;
287 
288  string method = get_or_die(param, "method");
289  binary_feature_ptr feature(f.create(method, param));
290  binary_features[name] = feature;
291  }
292 }
293 
294 void init_binary_rules(
295  const vector<binary_rule>& binary_rules,
296  const map<string, binary_feature_ptr>& binary_features,
297  datum_to_fv_converter& conv) {
298  key_matcher_factory f;
299  for (size_t i = 0; i < binary_rules.size(); ++i) {
300  const binary_rule& rule = binary_rules[i];
301  matcher_ptr m(f.create_matcher(rule.key));
302  map<string, binary_feature_ptr>::const_iterator it =
303  binary_features.find(rule.type);
304  if (it == binary_features.end()) {
305  throw JUBATUS_EXCEPTION(
306  converter_exception("unknown type: [" +
307  rule.type + "] in binary_rules"));
308  }
309 
310  conv.register_binary_rule(rule.type, m, it->second);
311  }
312 }
313 
314 void init_combination_types(
315  const map<string, param_t>& combination_types,
316  map<string, combination_feature_ptr>& combination_features,
318  combination_feature_factory f(ext);
319  for (
320  map<string, param_t>::const_iterator it =
321  combination_types.begin();
322  it != combination_types.end(); ++it) {
323  const string& name = it->first;
324  const map<string, string>& param = it->second;
325 
326  string method = get_or_die(param, "method");
327  combination_feature_ptr feature(f.create(method, param));
328  combination_features[name] = feature;
329  }
330 }
331 
332 void register_default_combination_types(
333  map<string, combination_feature_ptr>& combination_features) {
334  combination_features["add"] =
335  combination_feature_ptr(new combination_add_feature());
336  combination_features["mul"] =
337  combination_feature_ptr(new combination_mul_feature());
338 }
339 
340 void init_combination_rules(
341  const vector<combination_rule>& combination_rules,
342  const map<string, combination_feature_ptr>& combination_features,
343  datum_to_fv_converter& conv) {
344  key_matcher_factory f;
345  for (size_t i = 0; i < combination_rules.size(); ++i) {
346  const combination_rule& rule = combination_rules[i];
347  matcher_ptr m_left(f.create_matcher(rule.key_left));
348  matcher_ptr m_right(f.create_matcher(rule.key_right));
349  map<string, combination_feature_ptr>::const_iterator it =
350  combination_features.find(rule.type);
351  if (it == combination_features.end()) {
352  throw JUBATUS_EXCEPTION(
353  converter_exception("unknown type: [" + rule.type +
354  "] in combination_rules"));
355  }
356 
357  conv.register_combination_rule(rule.type, m_left, m_right, it->second);
358  }
359 }
360 
361 } // namespace
362 
364  const converter_config& config,
365  datum_to_fv_converter& conv,
366  const factory_extender* ext) {
367  using jubatus::util::lang::bind;
368  using jubatus::util::lang::_1;
369  using jubatus::util::lang::_2;
370 
371  if (config.hash_max_size.bool_test() && *config.hash_max_size.get() <= 0) {
372  stringstream msg;
373  msg << "hash_max_size must be positive, but is "
374  << *config.hash_max_size.get();
375  throw JUBATUS_EXCEPTION(converter_exception(msg.str()));
376  }
377 
378  map<string, string_filter_ptr> string_filters;
379  if (config.string_filter_types) {
381  if (ext) {
382  f = bind(&factory_extender::create_string_filter, ext, _1, _2);
383  }
384  init_string_filter_types(*config.string_filter_types, string_filters, f);
385  }
386 
387  map<string, num_filter_ptr> num_filters;
388  if (config.num_filter_types) {
390  if (ext) {
391  f = bind(&factory_extender::create_num_filter, ext, _1, _2);
392  }
393  init_num_filter_types(*config.num_filter_types, num_filters, f);
394  }
395 
396  map<string, string_feature_ptr> splitters;
397  register_default_string_types(splitters);
398  if (config.string_types) {
400  if (ext) {
401  f = bind(&factory_extender::create_string_feature, ext, _1, _2);
402  }
403  init_string_types(*config.string_types, splitters, f);
404  }
405 
406  map<string, num_feature_ptr> num_features;
407  register_default_num_types(num_features);
408  if (config.num_types) {
410  if (ext) {
411  f = bind(&factory_extender::create_num_feature, ext, _1, _2);
412  }
413  init_num_types(*config.num_types, num_features, f);
414  }
415 
416  map<string, binary_feature_ptr> binary_features;
417  if (config.binary_types) {
419  if (ext) {
420  f = bind(&factory_extender::create_binary_feature, ext, _1, _2);
421  }
422  init_binary_types(*config.binary_types, binary_features, f);
423  }
424 
425  map<string, combination_feature_ptr> combination_features;
426  register_default_combination_types(combination_features);
427  if (config.combination_types) {
429  if (ext) {
430  f = bind(&factory_extender::create_combination_feature, ext, _1, _2);
431  }
432  init_combination_types(*config.combination_types, combination_features, f);
433  }
434 
435  conv.clear_rules();
436  if (config.string_filter_rules) {
437  init_string_filter_rules(*config.string_filter_rules, string_filters, conv);
438  }
439  if (config.num_filter_rules) {
440  init_num_filter_rules(*config.num_filter_rules, num_filters, conv);
441  }
442  if (config.string_rules) {
443  init_string_rules(*config.string_rules, splitters, conv);
444  }
445  if (config.num_rules) {
446  init_num_rules(*config.num_rules, num_features, conv);
447  }
448  if (config.binary_rules) {
449  init_binary_rules(*config.binary_rules, binary_features, conv);
450  }
451  if (config.combination_rules) {
452  init_combination_rules(
453  *config.combination_rules,
454  combination_features, conv);
455  }
456 
457  if (config.hash_max_size.bool_test()) {
458  conv.set_hash_max_size(*config.hash_max_size.get());
459  }
460 }
461 
462 jubatus::util::lang::shared_ptr<datum_to_fv_converter> make_fv_converter(
463  const converter_config& config, const factory_extender* extender) {
464  jubatus::util::lang::shared_ptr<fv_converter::datum_to_fv_converter>
466  fv_converter::initialize_converter(config, *converter, extender);
467  return converter;
468 }
469 
470 } // namespace fv_converter
471 } // namespace core
472 } // namespace jubatus
void initialize_converter(const converter_config &config, datum_to_fv_converter &conv, const factory_extender *ext)
virtual combination_feature * create_combination_feature(const std::string &name, const param_t &) const =0
jubatus::util::data::optional< std::map< std::string, param_t > > num_filter_types
virtual num_feature * create_num_feature(const std::string &name, const param_t &) const =0
jubatus::util::lang::function< string_filter *(const std::string &, const param_t &)> create_function
virtual num_filter * create_num_filter(const std::string &name, const param_t &) const =0
jubatus::util::data::optional< std::map< std::string, param_t > > string_types
jubatus::util::lang::function< num_filter *(const std::string &, const param_t &)> create_function
jubatus::util::data::optional< std::vector< num_rule > > num_rules
#define JUBATUS_EXCEPTION(e)
Definition: exception.hpp:79
std::string method
jubatus::util::lang::function< string_feature *(const std::string &, const param_t &)> create_function
jubatus::util::data::optional< std::vector< filter_rule > > string_filter_rules
jubatus::util::data::optional< std::map< std::string, param_t > > binary_types
jubatus::util::data::optional< std::vector< string_rule > > string_rules
const std::string & get_or_die(const std::map< std::string, std::string > &params, const std::string &key)
Definition: util.cpp:28
jubatus::util::data::optional< std::map< std::string, param_t > > combination_types
jubatus::util::lang::function< num_feature *(const std::string &, const param_t &)> create_function
jubatus::util::lang::shared_ptr< datum_to_fv_converter > make_fv_converter(const converter_config &config, const factory_extender *extender)
virtual string_filter * create_string_filter(const std::string &name, const param_t &) const =0
jubatus::util::data::optional< int64_t > hash_max_size
jubatus::util::data::optional< std::vector< combination_rule > > combination_rules
virtual binary_feature * create_binary_feature(const std::string &name, const param_t &) const =0
jubatus::util::data::optional< std::map< std::string, param_t > > num_types
virtual string_feature * create_string_feature(const std::string &name, const param_t &) const =0
jubatus::util::data::optional< std::vector< filter_rule > > num_filter_rules
jubatus::util::data::optional< std::map< std::string, param_t > > string_filter_types
jubatus::util::data::optional< std::vector< binary_rule > > binary_rules
jubatus::util::lang::function< binary_feature *(const std::string &, const param_t &)> create_function
jubatus::util::lang::function< combination_feature *(const std::string &, const param_t &)> create_function