24 #include "jubatus/util/data/optional.h"
25 #include "jubatus/util/lang/shared_ptr.h"
44 namespace fv_converter {
50 typedef jubatus::util::data::unordered_map<std::string, float>
weight_t;
53 jubatus::util::lang::shared_ptr<key_matcher>
matcher_;
54 jubatus::util::lang::shared_ptr<string_filter>
filter_;
58 for (
size_t i = 0; i < string_values.size(); ++i) {
59 const std::pair<std::string, std::string>& value = string_values[i];
60 if (matcher_->match(value.first)) {
62 filter_->filter(value.second, out);
63 std::string dest = value.first +
suffix_;
64 filtered.push_back(std::make_pair(dest, out));
71 jubatus::util::lang::shared_ptr<key_matcher>
matcher_;
72 jubatus::util::lang::shared_ptr<num_filter>
filter_;
76 for (
size_t i = 0; i < num_values.size(); ++i) {
77 const std::pair<std::string, double>& value = num_values[i];
78 if (matcher_->match(value.first)) {
79 double out = filter_->filter(value.second);
80 std::string dest = value.first +
suffix_;
81 filtered.push_back(std::make_pair(dest, out));
89 jubatus::util::lang::shared_ptr<key_matcher>
matcher_;
90 jubatus::util::lang::shared_ptr<string_feature>
splitter_;
94 const std::string& name,
95 jubatus::util::lang::shared_ptr<key_matcher> matcher,
96 jubatus::util::lang::shared_ptr<string_feature> splitter,
97 const std::vector<splitter_weight_type>& weights)
107 jubatus::util::lang::shared_ptr<key_matcher>
matcher_;
111 const std::string& name,
112 jubatus::util::lang::shared_ptr<key_matcher> matcher,
113 jubatus::util::lang::shared_ptr<num_feature> feature_func)
116 feature_func_(feature_func) {
122 jubatus::util::lang::shared_ptr<key_matcher>
matcher_;
126 const std::string& name,
127 jubatus::util::lang::shared_ptr<key_matcher> matcher,
128 jubatus::util::lang::shared_ptr<binary_feature> feature_func)
131 feature_func_(feature_func) {
142 const std::string& name,
143 jubatus::util::lang::shared_ptr<key_matcher> matcher_left,
144 jubatus::util::lang::shared_ptr<key_matcher> matcher_right,
145 jubatus::util::lang::shared_ptr<combination_feature> feature_func)
147 matcher_left_(matcher_left),
148 matcher_right_(matcher_right),
149 feature_func_(feature_func) {
164 jubatus::util::data::optional<feature_hasher>
hasher_;
175 string_filter_rules_.clear();
176 num_filter_rules_.clear();
177 string_rules_.clear();
179 binary_rules_.clear();
180 combination_rules_.clear();
184 jubatus::util::lang::shared_ptr<key_matcher> matcher,
185 jubatus::util::lang::shared_ptr<string_filter> filter,
186 const std::string& suffix) {
188 string_filter_rules_.push_back(rule);
192 jubatus::util::lang::shared_ptr<key_matcher> matcher,
193 jubatus::util::lang::shared_ptr<num_filter> filter,
194 const std::string& suffix) {
196 num_filter_rules_.push_back(rule);
200 const std::string& name,
201 jubatus::util::lang::shared_ptr<key_matcher> matcher,
202 jubatus::util::lang::shared_ptr<string_feature> splitter,
203 const std::vector<splitter_weight_type>& weights) {
204 string_rules_.push_back(
209 const std::string& name,
210 jubatus::util::lang::shared_ptr<key_matcher> matcher,
211 jubatus::util::lang::shared_ptr<num_feature> feature_func) {
216 const std::string& name,
217 jubatus::util::lang::shared_ptr<key_matcher> matcher,
218 jubatus::util::lang::shared_ptr<binary_feature> feature_func) {
223 const std::string& name,
224 jubatus::util::lang::shared_ptr<key_matcher> matcher_left,
225 jubatus::util::lang::shared_ptr<key_matcher> matcher_right,
226 jubatus::util::lang::shared_ptr<combination_feature> feature_func) {
227 combination_rules_.push_back(
236 jubatus::util::lang::shared_ptr<weight_manager> weights =
237 mixable_weights_->get_model();
239 (*weights).add_weight(key, weight);
246 jubatus::util::lang::shared_ptr<weight_manager> weights =
247 mixable_weights_->get_model();
249 weights->get_weight(fv);
255 hasher_->hash_feature_keys(fv);
264 jubatus::util::lang::shared_ptr<weight_manager> weights =
265 mixable_weights_->get_model();
267 weights->update_weight(fv);
268 weights->get_weight(fv);
274 hasher_->hash_feature_keys(fv);
283 std::vector<std::pair<std::string, std::string> > filtered_strings;
288 std::vector<std::pair<std::string, double> > filtered_nums;
299 const std::string& feature,
300 std::pair<std::string, std::string>& expect)
const {
303 size_t sharp = feature.rfind(
'#');
304 if (sharp == std::string::npos) {
308 size_t at = feature.rfind(
'@', sharp);
309 if (at == std::string::npos) {
313 size_t dollar = feature.rfind(
'$', at);
314 if (dollar == std::string::npos) {
318 if (feature.substr(at + 1, sharp - at - 1) !=
"str") {
323 std::string key(feature.substr(0, dollar));
324 std::string value(feature.substr(dollar + 1, at - dollar - 1));
326 expect.first.swap(key);
327 expect.second.swap(value);
335 mixable_weights_->set_model(wm);
339 jubatus::util::lang::shared_ptr<weight_manager> weights =
340 mixable_weights_->get_model();
350 for (
size_t i = 0; i < string_filter_rules_.size(); ++i) {
352 string_filter_rules_[i].filter(string_values, update);
353 string_filter_rules_[i].filter(filtered_values, update);
355 filtered_values.insert(filtered_values.end(), update.begin(),
363 for (
size_t i = 0; i < num_filter_rules_.size(); ++i) {
365 num_filter_rules_[i].filter(num_values, update);
366 num_filter_rules_[i].filter(filtered_values, update);
368 filtered_values.insert(
369 filtered_values.end(), update.begin(), update.end());
375 for (
size_t i = 0; i < string_rules_.size(); ++i) {
381 for (
size_t i = 0; i < s.
weights_.size(); ++i) {
393 for (
size_t j = 0; j < string_values.size(); ++j) {
394 const std::string& key = string_values[j].first;
395 const std::string& value = string_values[j].second;
398 for (
size_t i = 0; i < splitter.
weights_.size(); ++i) {
407 for (
size_t i = 0; i < binary_rules_.size(); ++i) {
416 for (
size_t j = 0; j < binary_values.size(); ++j) {
417 const std::string& key = binary_values[j].first;
418 const std::string& value = binary_values[j].second;
427 const std::string& key,
428 const std::string& value,
429 const std::string& splitter,
430 const std::string& sample_weight,
431 const std::string& global_weight) {
433 return key +
"$" + value +
"@" + splitter +
"#" + sample_weight +
"/" +
438 const std::string& key,
439 const std::string& value,
440 const std::string& splitter) {
442 return key +
"$" + value +
"@" + splitter;
446 if (key.find(
'$') != std::string::npos) {
454 const std::string& key,
455 const std::string& value,
457 if (splitter.
matcher_->match(key)) {
458 std::vector<string_feature_element> elements;
459 splitter.
splitter_->extract(value, elements);
461 for (
size_t i = 0; i < elements.size(); i++) {
462 counter[elements[i].value] += elements[i].score;
470 std::string& name)
const {
482 return std::log(1. + tf);
500 "unknown global weight type"));
505 const std::string& key,
506 const std::string& splitter_name,
511 it != count.
end(); ++it) {
512 std::string sample_weight_name;
518 float v =
static_cast<float>(sample_weight);
521 key, it->first, splitter_name, sample_weight_name,
523 ret_fv.push_back(std::make_pair(f, v));
530 for (
size_t i = 0; i < num_values.size(); ++i) {
531 convert_num(num_values[i].first, num_values[i].second, ret_fv);
538 for (
size_t i = 0; i < num_rules_.size(); ++i) {
542 std::string k = key +
"@" + r.
name_;
549 const size_t original_size = ret_fv.size();
551 if (original_size < 2) {
556 for (
size_t i = 0; i < combination_rules_.size(); ++i) {
558 for (
size_t j = 0 ; j < original_size - 1; ++j) {
559 for (
size_t m = j + 1; m < original_size; ++m) {
563 ret_fv[j].first +
"&" + ret_fv[m].first +
"/" + r.
name_,
583 pimpl_->convert(datum, ret_fv);
589 pimpl_->convert_and_update_weight(datum, ret_fv);
597 jubatus::util::lang::shared_ptr<key_matcher> matcher,
598 jubatus::util::lang::shared_ptr<string_filter> filter,
599 const std::string& suffix) {
600 pimpl_->register_string_filter(matcher, filter, suffix);
604 jubatus::util::lang::shared_ptr<key_matcher> matcher,
605 jubatus::util::lang::shared_ptr<num_filter> filter,
606 const std::string& suffix) {
607 pimpl_->register_num_filter(matcher, filter, suffix);
611 const std::string& name,
612 jubatus::util::lang::shared_ptr<key_matcher> matcher,
613 jubatus::util::lang::shared_ptr<string_feature> splitter,
614 const std::vector<splitter_weight_type>& weights) {
615 pimpl_->register_string_rule(name, matcher, splitter, weights);
619 const std::string& name,
620 jubatus::util::lang::shared_ptr<key_matcher> matcher,
621 jubatus::util::lang::shared_ptr<num_feature> feature_func) {
622 pimpl_->register_num_rule(name, matcher, feature_func);
626 const std::string& name,
627 jubatus::util::lang::shared_ptr<key_matcher> matcher,
628 jubatus::util::lang::shared_ptr<binary_feature> feature_func) {
629 pimpl_->register_binary_rule(name, matcher, feature_func);
633 const std::string& name,
634 jubatus::util::lang::shared_ptr<key_matcher> matcher_left,
635 jubatus::util::lang::shared_ptr<key_matcher> matcher_right,
636 jubatus::util::lang::shared_ptr<combination_feature> feature_func) {
637 pimpl_->register_combination_rule(
645 pimpl_->add_weight(key, weight);
649 const std::string& feature,
650 std::pair<std::string, std::string>& expect)
const {
651 pimpl_->revert_feature(feature, expect);
655 pimpl_->set_hash_max_size(hash_max_size);
659 jubatus::util::lang::shared_ptr<weight_manager> wm) {
660 pimpl_->set_weight_manager(wm);
frequency_weight_type freq_weight_type_
void convert(const datum &datum, common::sfv_t &ret_fv) const
void make_string_features(const std::string &key, const std::string &splitter_name, const splitter_weight_type &weight_type, const counter< std::string > &count, common::sfv_t &ret_fv) const
void revert_feature(const std::string &feature, std::pair< std::string, std::string > &expect) const
std::vector< num_filter_rule > num_filter_rules_
string_feature_rule(const std::string &name, jubatus::util::lang::shared_ptr< key_matcher > matcher, jubatus::util::lang::shared_ptr< string_feature > splitter, const std::vector< splitter_weight_type > &weights)
std::vector< binary_feature_rule > binary_rules_
bool contains_idf(const string_feature_rule &s) const
jubatus::util::lang::shared_ptr< string_feature > splitter_
void convert(const datum &datum, common::sfv_t &ret_fv) const
void convert_strings(const string_feature_rule &splitter, const datum::sv_t &string_values, common::sfv_t &ret_fv) const
void convert_and_update_weight(const datum &datum, common::sfv_t &ret_fv)
void filter_strings(const datum::sv_t &string_values, datum::sv_t &filtered_values) const
void convert_binaries(const binary_feature_rule &feature, const datum::sv_t &binary_values, common::sfv_t &ret_fv) const
void set_weight_manager(jubatus::util::lang::shared_ptr< weight_manager > wm)
void register_string_rule(const std::string &name, jubatus::util::lang::shared_ptr< key_matcher > matcher, jubatus::util::lang::shared_ptr< string_feature > splitter, const std::vector< splitter_weight_type > &weights)
jubatus::util::lang::shared_ptr< combination_feature > feature_func_
static std::string make_feature(const std::string &key, const std::string &value, const std::string &splitter, const std::string &sample_weight, const std::string &global_weight)
void register_num_rule(const std::string &name, jubatus::util::lang::shared_ptr< key_matcher > matcher, jubatus::util::lang::shared_ptr< num_feature > feature_func)
const_iterator begin() const
void convert_combinations(common::sfv_t &ret_fv) const
void register_num_filter(jubatus::util::lang::shared_ptr< key_matcher > matcher, jubatus::util::lang::shared_ptr< num_filter > filter, const std::string &suffix)
std::vector< string_filter_rule > string_filter_rules_
static void check_key(const std::string &key)
void register_string_rule(const std::string &name, jubatus::util::lang::shared_ptr< key_matcher > matcher, jubatus::util::lang::shared_ptr< string_feature > splitter, const std::vector< splitter_weight_type > &weights)
void register_binary_rule(const std::string &name, jubatus::util::lang::shared_ptr< key_matcher > matcher, jubatus::util::lang::shared_ptr< binary_feature > feature_func)
#define JUBATUS_EXCEPTION(e)
void set_hash_max_size(uint64_t hash_max_size)
jubatus::util::lang::shared_ptr< num_feature > feature_func_
std::vector< splitter_weight_type > weights_
void register_string_filter(jubatus::util::lang::shared_ptr< key_matcher > matcher, jubatus::util::lang::shared_ptr< string_filter > filter, const std::string &suffix)
jubatus::util::lang::shared_ptr< key_matcher > matcher_
void revert_feature(const std::string &feature, std::pair< std::string, std::string > &expect) const
jubatus::util::data::optional< feature_hasher > hasher_
jubatus::util::lang::shared_ptr< key_matcher > matcher_
void register_combination_rule(const std::string &name, jubatus::util::lang::shared_ptr< key_matcher > matcher_left, jubatus::util::lang::shared_ptr< key_matcher > matcher_right, jubatus::util::lang::shared_ptr< combination_feature > feature_func)
num_feature_rule(const std::string &name, jubatus::util::lang::shared_ptr< key_matcher > matcher, jubatus::util::lang::shared_ptr< num_feature > feature_func)
void convert_and_update_weight(const datum &datum, common::sfv_t &ret_fv)
void add_weight(const std::string &key, float weight)
combination_feature_rule(const std::string &name, jubatus::util::lang::shared_ptr< key_matcher > matcher_left, jubatus::util::lang::shared_ptr< key_matcher > matcher_right, jubatus::util::lang::shared_ptr< combination_feature > feature_func)
void filter(const datum::nv_t &num_values, datum::nv_t &filtered) const
void register_num_rule(const std::string &name, jubatus::util::lang::shared_ptr< key_matcher > matcher, jubatus::util::lang::shared_ptr< num_feature > feature_func)
void convert_num(const std::string &key, double value, common::sfv_t &ret_fv) const
jubatus::util::lang::shared_ptr< string_filter > filter_
void add_weight(const std::string &key, float weight)
std::vector< std::pair< std::string, std::string > > sv_t
std::string get_global_weight_name(term_weight_type type) const
jubatus::util::lang::shared_ptr< mixable_weight_manager > mixable_weights_
jubatus::util::lang::shared_ptr< key_matcher > matcher_left_
void set_weight_manager(jubatus::util::lang::shared_ptr< weight_manager > wm)
std::vector< num_feature_rule > num_rules_
void register_binary_rule(const std::string &name, jubatus::util::lang::shared_ptr< key_matcher > matcher, jubatus::util::lang::shared_ptr< binary_feature > feature_func)
jubatus::util::lang::shared_ptr< num_filter > filter_
void register_combination_rule(const std::string &name, jubatus::util::lang::shared_ptr< key_matcher > matcher_left, jubatus::util::lang::shared_ptr< key_matcher > matcher_right, jubatus::util::lang::shared_ptr< combination_feature > feature_func)
jubatus::util::lang::shared_ptr< key_matcher > matcher_
void set_hash_max_size(uint64_t hash_max_size)
std::vector< combination_feature_rule > combination_rules_
jubatus::util::lang::shared_ptr< key_matcher > matcher_right_
std::vector< std::pair< std::string, float > > sfv_t
void convert_binaries(const datum::sv_t &binary_values, common::sfv_t &ret_fv) const
double get_sample_weight(frequency_weight_type type, double tf, std::string &name) const
jubatus::util::lang::shared_ptr< key_matcher > matcher_
void register_num_filter(jubatus::util::lang::shared_ptr< key_matcher > matcher, jubatus::util::lang::shared_ptr< num_filter > filter, const std::string &suffix)
jubatus::util::lang::scoped_ptr< datum_to_fv_converter_impl > pimpl_
void filter_nums(const datum::nv_t &num_values, datum::nv_t &filtered_values) const
datum_to_fv_converter_impl()
std::vector< std::pair< std::string, double > > nv_t
const_iterator end() const
jubatus::util::lang::shared_ptr< binary_feature > feature_func_
binary_feature_rule(const std::string &name, jubatus::util::lang::shared_ptr< key_matcher > matcher, jubatus::util::lang::shared_ptr< binary_feature > feature_func)
void count_words(const string_feature_rule &splitter, const std::string &key, const std::string &value, counter< std::string > &counter) const
jubatus::util::data::unordered_map< std::string, float > weight_t
jubatus::util::lang::shared_ptr< key_matcher > matcher_
void register_string_filter(jubatus::util::lang::shared_ptr< key_matcher > matcher, jubatus::util::lang::shared_ptr< string_filter > filter, const std::string &suffix)
term_weight_type term_weight_type_
void convert_strings(const datum::sv_t &string_values, common::sfv_t &ret_fv) const
static std::string make_feature_key(const std::string &key, const std::string &value, const std::string &splitter)
void filter(const datum::sv_t &string_values, datum::sv_t &filtered) const
void convert_nums(const datum::nv_t &num_values, common::sfv_t &ret_fv) const
void convert_unweighted(const datum &datum, common::sfv_t &ret_fv) const
std::vector< string_feature_rule > string_rules_