23 #include "jubatus/util/text/json.h"
24 #include "jubatus/util/lang/bind.h"
25 #include "jubatus/util/lang/function.h"
49 using std::stringstream;
55 namespace fv_converter {
59 typedef jubatus::util::lang::shared_ptr<string_feature> string_feature_ptr;
60 typedef jubatus::util::lang::shared_ptr<key_matcher> matcher_ptr;
61 typedef jubatus::util::lang::shared_ptr<num_feature> num_feature_ptr;
62 typedef jubatus::util::lang::shared_ptr<binary_feature> binary_feature_ptr;
63 typedef jubatus::util::lang::shared_ptr<combination_feature>
64 combination_feature_ptr;
65 typedef jubatus::util::lang::shared_ptr<string_filter> string_filter_ptr;
66 typedef jubatus::util::lang::shared_ptr<num_filter> num_filter_ptr;
// Translates the textual sample and global weight names of a string rule
// into a splitter_weight_type.
//
// Recognized sample names here: bin, tf, log_tf.
// Recognized global names here: bin, idf, weight.
// Any other name reaches the converter_exception construction below.
//
// NOTE(review): this listing omits several original lines (the embedded
// numbers skip). The assignments to sample_type / global_type and the
// statement that raises the exception are not visible -- confirm against the
// full file.
68 splitter_weight_type make_weight_type(
69 const string& sample,
const string& global) {
// Per-sample (local) weighting name.
71 if (sample ==
"bin") {
73 }
else if (sample ==
"tf") {
75 }
else if (sample ==
"log_tf") {
// Fallback for an unrecognized sample weight name.
79 converter_exception(
"unknown sample_weight: [" +
80 sample +
"] in string_rules"));
// Global (corpus-level) weighting name.
84 if (global ==
"bin") {
86 }
else if (global ==
"idf") {
88 }
else if (global ==
"weight") {
// Fallback for an unrecognized global weight name.
92 converter_exception(
"unknown global_weight: [" +
93 global +
"] in string_rules"));
// Both parts are combined into the returned weight descriptor.
95 return splitter_weight_type(sample_type, global_type);
// Fragment of a parameter lookup helper: searches the string map m for key
// and builds a converter_exception naming the key when the lookup fails.
//
// NOTE(review): the function header, the end() check, the raise and the
// return statement are not visible in this listing. The name is presumably
// get_or_die -- confirm against the full file.
99 const map<string, string>& m,
101 map<string, string>::const_iterator it = m.find(key);
// Error payload for a missing mandatory parameter.
104 converter_exception(
"parameter: [" + key +
"] must be defined"));
// Builds the key matcher for a rule.
//
// Two results are visible: an except_match wrapping a matcher for key and a
// matcher for *except, or a plain matcher for key alone.
//
// NOTE(review): the key parameter declaration and the condition that selects
// between the two returns are missing from this listing; presumably the
// except_match path is taken when except holds a value -- confirm.
110 matcher_ptr create_key_matcher(
112 const jubatus::util::data::optional<string>& except) {
113 key_matcher_factory f;
// m1 matches the rule key, m2 matches the exclusion pattern.
115 matcher_ptr m1(f.create_matcher(key));
116 matcher_ptr m2(f.create_matcher(*except));
117 return matcher_ptr(
new except_match(m1, m2));
// Plain matcher without an exclusion pattern.
119 return matcher_ptr(f.create_matcher(key));
// Instantiates one string filter per entry of filter_types and stores it in
// filters under the entry name. An existing entry with the same name is
// overwritten by the assignment.
//
// NOTE(review): the declaration of ext and the line that derives method from
// the parameter map are not visible in this listing.
123 void init_string_filter_types(
124 const map<string, param_t>& filter_types,
125 map<string, string_filter_ptr>& filters,
127 string_filter_factory f(ext);
128 for (map<string, param_t>::const_iterator it = filter_types.begin();
129 it != filter_types.end(); ++it) {
// Entry key is the user-visible type name; entry value is its parameter map.
130 const string& name = it->first;
131 const map<string, string>& param = it->second;
134 string_filter_ptr filter(f.create(method, param));
135 filters[name] = filter;
// Instantiates one numeric filter per entry of filter_types and stores it in
// filters under the entry name. An existing entry with the same name is
// overwritten by the assignment.
//
// NOTE(review): the declaration of ext and the line that derives method from
// the parameter map are not visible in this listing.
139 void init_num_filter_types(
140 const map<string, param_t>& filter_types,
141 map<string, num_filter_ptr>& filters,
143 num_filter_factory f(ext);
144 for (map<string, param_t>::const_iterator it = filter_types.begin();
145 it != filter_types.end(); ++it) {
// Entry key is the user-visible type name; entry value is its parameter map.
146 const string& name = it->first;
147 const map<string, string>& param = it->second;
150 num_filter_ptr filter(f.create(method, param));
151 filters[name] = filter;
// Registers every numeric filter rule on conv.
//
// For each rule the filter is looked up by rule.type in filters, a key
// matcher is built from rule.key and rule.except, and the triple
// (matcher, filter, rule.suffix) is passed to conv.register_num_filter.
//
// NOTE(review): the statement that raises the converter_exception for an
// unknown type is not visible in this listing.
155 void init_num_filter_rules(
156 const vector<filter_rule>& filter_rules,
157 const map<string, num_filter_ptr>& filters,
158 datum_to_fv_converter& conv) {
159 for (
size_t i = 0; i < filter_rules.size(); ++i) {
160 const filter_rule& rule = filter_rules[i];
161 map<string, num_filter_ptr>::const_iterator it =
162 filters.find(rule.type);
// The rule refers to a filter type that was never defined.
163 if (it == filters.end()) {
165 converter_exception(
"unknown type: [" +
166 rule.type +
"] in num_filter_rules"));
169 matcher_ptr m(create_key_matcher(rule.key, rule.except));
170 conv.register_num_filter(m, it->second, rule.suffix);
// Seeds splitters with the built-in string feature types: str maps to a
// without_split instance and space maps to a space_splitter instance.
// Existing entries under those names are overwritten by the assignments.
174 void register_default_string_types(
175 map<string, string_feature_ptr>& splitters) {
176 splitters[
"str"] = string_feature_ptr(
new without_split());
177 splitters[
"space"] = string_feature_ptr(
new space_splitter());
// Instantiates one string feature (splitter) per entry of string_types and
// stores it in splitters under the entry name. An existing entry with the
// same name is overwritten by the assignment.
//
// NOTE(review): the declaration of ext and the line that derives method from
// the parameter map are not visible in this listing.
180 void init_string_types(
181 const map<string, param_t>& string_types,
182 map<string, string_feature_ptr>& splitters,
184 string_feature_factory f(ext);
185 for (map<string, param_t>::const_iterator it = string_types.begin();
186 it != string_types.end(); ++it) {
// Entry key is the user-visible type name; entry value is its parameter map.
187 const string& name = it->first;
188 const map<string, string>& param = it->second;
191 string_feature_ptr splitter(f.create(method, param));
192 splitters[name] = splitter;
// Registers every string filter rule on conv.
//
// For each rule the filter is looked up by rule.type in filters, a key
// matcher is built from rule.key and rule.except, and the triple
// (matcher, filter, rule.suffix) is passed to conv.register_string_filter.
//
// NOTE(review): the statement that raises the converter_exception for an
// unknown type is not visible in this listing.
196 void init_string_filter_rules(
197 const vector<filter_rule>& filter_rules,
198 const map<string, string_filter_ptr>& filters,
199 datum_to_fv_converter& conv) {
200 for (
size_t i = 0; i < filter_rules.size(); ++i) {
201 const filter_rule& rule = filter_rules[i];
202 map<string, string_filter_ptr>::const_iterator it =
203 filters.find(rule.type);
// The rule refers to a filter type that was never defined.
204 if (it == filters.end()) {
206 converter_exception(
"unknown type: [" +
207 rule.type +
"] in string_filter_rules"));
210 matcher_ptr m(create_key_matcher(rule.key, rule.except));
211 conv.register_string_filter(m, it->second, rule.suffix);
// Registers every string rule on conv.
//
// For each rule a key matcher is built from rule.key and rule.except, the
// splitter is looked up by rule.type, and a one-element weight list made
// from rule.sample_weight and rule.global_weight is passed along to
// conv.register_string_rule.
//
// NOTE(review): the statement that raises the converter_exception for an
// unknown type is not visible in this listing.
215 void init_string_rules(
216 const vector<string_rule>& string_rules,
217 const map<string, string_feature_ptr>& splitters,
218 datum_to_fv_converter& conv) {
219 for (
size_t i = 0; i < string_rules.size(); ++i) {
220 const string_rule& rule = string_rules[i];
221 matcher_ptr m(create_key_matcher(rule.key, rule.except));
222 map<string, string_feature_ptr>::const_iterator it =
223 splitters.find(rule.type);
// The rule refers to a string type that was never defined.
224 if (it == splitters.end()) {
226 converter_exception(
"unknown type: [" +
227 rule.type +
"] in string_rules"));
// register_string_rule takes a vector of weights; only one is supplied here.
230 vector<splitter_weight_type> ws;
231 ws.push_back(make_weight_type(rule.sample_weight, rule.global_weight));
232 conv.register_string_rule(rule.type, m, it->second, ws);
// Seeds num_features with the built-in numeric feature types: num maps to
// num_value_feature, log to num_log_feature and str to num_string_feature.
// Existing entries under those names are overwritten by the assignments.
236 void register_default_num_types(
237 map<string, num_feature_ptr>& num_features) {
238 num_features[
"num"] = num_feature_ptr(
new num_value_feature());
239 num_features[
"log"] = num_feature_ptr(
new num_log_feature());
240 num_features[
"str"] = num_feature_ptr(
new num_string_feature());
// Fragment: instantiates one numeric feature per entry of num_types and
// stores it in num_features under the entry name.
//
// NOTE(review): the function header (presumably init_num_types), the
// declaration of ext and the line that derives method are not visible in
// this listing -- confirm against the full file.
244 const map<string, param_t>& num_types,
245 map<string, num_feature_ptr>& num_features,
247 num_feature_factory f(ext);
248 for (map<string, param_t>::const_iterator it = num_types.begin();
249 it != num_types.end(); ++it) {
// Entry key is the user-visible type name; entry value is its parameter map.
250 const string& name = it->first;
251 const map<string, string>& param = it->second;
254 num_feature_ptr feature(f.create(method, param));
255 num_features[name] = feature;
// Fragment: registers every numeric rule on conv.
//
// For each rule a key matcher is built from rule.key and rule.except, the
// feature is looked up by rule.type in num_features, and the result is passed
// to conv.register_num_rule.
//
// NOTE(review): the function header (presumably init_num_rules) and the
// statement that raises the converter_exception are not visible in this
// listing -- confirm against the full file.
260 const vector<num_rule>& num_rules,
261 const map<string, num_feature_ptr>& num_features,
262 datum_to_fv_converter& conv) {
263 for (
size_t i = 0; i < num_rules.size(); ++i) {
264 const num_rule& rule = num_rules[i];
265 matcher_ptr m(create_key_matcher(rule.key, rule.except));
266 map<string, num_feature_ptr>::const_iterator it =
267 num_features.find(rule.type);
// The rule refers to a numeric type that was never defined.
268 if (it == num_features.end()) {
270 converter_exception(
"unknown type: [" +
271 rule.type +
"] in num_rules"));
274 conv.register_num_rule(rule.type, m, it->second);
// Instantiates one binary feature per entry of binary_types and stores it in
// binary_features under the entry name. An existing entry with the same name
// is overwritten by the assignment.
//
// NOTE(review): the declaration of ext and the line that derives method from
// the parameter map are not visible in this listing.
278 void init_binary_types(
279 const map<string, param_t>& binary_types,
280 map<string, binary_feature_ptr>& binary_features,
282 binary_feature_factory f(ext);
283 for (map<string, param_t>::const_iterator it = binary_types.begin();
284 it != binary_types.end(); ++it) {
// Entry key is the user-visible type name; entry value is its parameter map.
285 const string& name = it->first;
286 const map<string, string>& param = it->second;
289 binary_feature_ptr feature(f.create(method, param));
290 binary_features[name] = feature;
// Registers every binary rule on conv.
//
// For each rule a matcher is created from rule.key, the feature is looked up
// by rule.type in binary_features, and the result is passed to
// conv.register_binary_rule.
//
// Unlike the string and numeric rule loaders in this file, the matcher is
// built directly with key_matcher_factory, so no except pattern is applied.
//
// NOTE(review): the statement that raises the converter_exception for an
// unknown type is not visible in this listing.
294 void init_binary_rules(
295 const vector<binary_rule>& binary_rules,
296 const map<string, binary_feature_ptr>& binary_features,
297 datum_to_fv_converter& conv) {
298 key_matcher_factory f;
299 for (
size_t i = 0; i < binary_rules.size(); ++i) {
300 const binary_rule& rule = binary_rules[i];
301 matcher_ptr m(f.create_matcher(rule.key));
302 map<string, binary_feature_ptr>::const_iterator it =
303 binary_features.find(rule.type);
// The rule refers to a binary type that was never defined.
304 if (it == binary_features.end()) {
306 converter_exception(
"unknown type: [" +
307 rule.type +
"] in binary_rules"));
310 conv.register_binary_rule(rule.type, m, it->second);
// Instantiates one combination feature per entry of combination_types and
// stores it in combination_features under the entry name. An existing entry
// with the same name is overwritten by the assignment.
//
// NOTE(review): the declaration of ext, the opening of the for statement and
// the line that derives method are not visible in this listing.
314 void init_combination_types(
315 const map<string, param_t>& combination_types,
316 map<string, combination_feature_ptr>& combination_features,
318 combination_feature_factory f(ext);
320 map<string, param_t>::const_iterator it =
321 combination_types.begin();
322 it != combination_types.end(); ++it) {
// Entry key is the user-visible type name; entry value is its parameter map.
323 const string& name = it->first;
324 const map<string, string>& param = it->second;
327 combination_feature_ptr feature(f.create(method, param));
328 combination_features[name] = feature;
// Seeds combination_features with the built-in combination types: add maps
// to combination_add_feature and mul maps to combination_mul_feature.
// Existing entries under those names are overwritten by the assignments.
332 void register_default_combination_types(
333 map<string, combination_feature_ptr>& combination_features) {
334 combination_features[
"add"] =
335 combination_feature_ptr(
new combination_add_feature());
336 combination_features[
"mul"] =
337 combination_feature_ptr(
new combination_mul_feature());
// Registers every combination rule on conv.
//
// For each rule two matchers are created, one from rule.key_left and one
// from rule.key_right, the feature is looked up by rule.type in
// combination_features, and all of them are passed to
// conv.register_combination_rule.
//
// The matchers are built directly with key_matcher_factory, so no except
// pattern is applied here.
//
// NOTE(review): the statement that raises the converter_exception for an
// unknown type is not visible in this listing.
340 void init_combination_rules(
341 const vector<combination_rule>& combination_rules,
342 const map<string, combination_feature_ptr>& combination_features,
343 datum_to_fv_converter& conv) {
344 key_matcher_factory f;
345 for (
size_t i = 0; i < combination_rules.size(); ++i) {
346 const combination_rule& rule = combination_rules[i];
347 matcher_ptr m_left(f.create_matcher(rule.key_left));
348 matcher_ptr m_right(f.create_matcher(rule.key_right));
349 map<string, combination_feature_ptr>::const_iterator it =
350 combination_features.find(rule.type);
// The rule refers to a combination type that was never defined.
351 if (it == combination_features.end()) {
353 converter_exception(
"unknown type: [" + rule.type +
354 "] in combination_rules"));
357 conv.register_combination_rule(rule.type, m_left, m_right, it->second);
// Fragment of the converter set-up routine (presumably initialize_converter;
// the header is not visible in this listing -- confirm against the full file).
//
// Visible steps, in order: a message about a non-positive hash_max_size is
// composed; per-kind registries (string filters, numeric filters, splitters,
// numeric, binary and combination features) are declared, with defaults
// registered for splitters, numeric features and combination features;
// configured numeric and binary types are loaded; finally string, numeric,
// binary and combination rules are registered on conv.
//
// NOTE(review): many original lines are missing here, including any checks
// that guard the dereferences of the optional config fields, the loading of
// filter / string / combination types, and the filter rule registration.
367 using jubatus::util::lang::bind;
368 using jubatus::util::lang::_1;
369 using jubatus::util::lang::_2;
// Diagnostic text for an invalid hash_max_size setting.
373 msg <<
"hash_max_size must be positive, but is "
// Type registries, keyed by the type name used in rules.
378 map<string, string_filter_ptr> string_filters;
387 map<string, num_filter_ptr> num_filters;
396 map<string, string_feature_ptr> splitters;
397 register_default_string_types(splitters);
406 map<string, num_feature_ptr> num_features;
407 register_default_num_types(num_features);
413 init_num_types(*config.
num_types, num_features, f);
416 map<string, binary_feature_ptr> binary_features;
422 init_binary_types(*config.
binary_types, binary_features, f);
425 map<string, combination_feature_ptr> combination_features;
426 register_default_combination_types(combination_features);
// Rule registration; each call resolves rule types against a registry above.
443 init_string_rules(*config.
string_rules, splitters, conv);
446 init_num_rules(*config.
num_rules, num_features, conv);
449 init_binary_rules(*config.
binary_rules, binary_features, conv);
452 init_combination_rules(
454 combination_features, conv);
464 jubatus::util::lang::shared_ptr<fv_converter::datum_to_fv_converter>
void initialize_converter(const converter_config &config, datum_to_fv_converter &conv, const factory_extender *ext)
virtual combination_feature * create_combination_feature(const std::string &name, const param_t &) const =0
jubatus::util::data::optional< std::map< std::string, param_t > > num_filter_types
virtual num_feature * create_num_feature(const std::string &name, const param_t &) const =0
jubatus::util::lang::function< string_filter *(const std::string &, const param_t &)> create_function
virtual num_filter * create_num_filter(const std::string &name, const param_t &) const =0
jubatus::util::data::optional< std::map< std::string, param_t > > string_types
jubatus::util::lang::function< num_filter *(const std::string &, const param_t &)> create_function
jubatus::util::data::optional< std::vector< num_rule > > num_rules
#define JUBATUS_EXCEPTION(e)
jubatus::util::lang::function< string_feature *(const std::string &, const param_t &)> create_function
jubatus::util::data::optional< std::vector< filter_rule > > string_filter_rules
jubatus::util::data::optional< std::map< std::string, param_t > > binary_types
jubatus::util::data::optional< std::vector< string_rule > > string_rules
const std::string & get_or_die(const std::map< std::string, std::string > &params, const std::string &key)
jubatus::util::data::optional< std::map< std::string, param_t > > combination_types
jubatus::util::lang::function< num_feature *(const std::string &, const param_t &)> create_function
jubatus::util::lang::shared_ptr< datum_to_fv_converter > make_fv_converter(const converter_config &config, const factory_extender *extender)
virtual string_filter * create_string_filter(const std::string &name, const param_t &) const =0
jubatus::util::data::optional< int64_t > hash_max_size
void set_hash_max_size(uint64_t hash_max_size)
jubatus::util::data::optional< std::vector< combination_rule > > combination_rules
virtual binary_feature * create_binary_feature(const std::string &name, const param_t &) const =0
jubatus::util::data::optional< std::map< std::string, param_t > > num_types
virtual string_feature * create_string_feature(const std::string &name, const param_t &) const =0
jubatus::util::data::optional< std::vector< filter_rule > > num_filter_rules
jubatus::util::data::optional< std::map< std::string, param_t > > string_filter_types
jubatus::util::data::optional< std::vector< binary_rule > > binary_rules
jubatus::util::lang::function< binary_feature *(const std::string &, const param_t &)> create_function
jubatus::util::lang::function< combination_feature *(const std::string &, const param_t &)> create_function