jubatus_core  0.1.2
Jubatus: Online machine learning framework for distributed environment
re2_splitter.cpp
Go to the documentation of this file.
1 // Jubatus: Online machine learning framework for distributed environment
2 // Copyright (C) 2012 Preferred Networks and Nippon Telegraph and Telephone Corporation.
3 //
4 // This library is free software; you can redistribute it and/or
5 // modify it under the terms of the GNU Lesser General Public
6 // License version 2.1 as published by the Free Software Foundation.
7 //
8 // This library is distributed in the hope that it will be useful,
9 // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 // Lesser General Public License for more details.
12 //
13 // You should have received a copy of the GNU Lesser General Public
14 // License along with this library; if not, write to the Free Software
15 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16 
17 #include "re2_splitter.hpp"
18 
19 #include <iostream>
20 #include <map>
21 #include <string>
22 #include <utility>
23 #include <vector>
24 #include "jubatus/util/lang/cast.h"
25 #include "exception.hpp"
26 
27 using jubatus::util::lang::lexical_cast;
29 
30 namespace jubatus {
31 namespace core {
32 namespace fv_converter {
33 
34 regexp_splitter::regexp_splitter(const std::string& regexp, int group)
35  : re_(regexp),
36  group_(group) {
37  if (group < 0) {
39  "'group' must be positive: " + lexical_cast<std::string>(group)));
40  }
41  if (!re_.ok()) {
42  throw JUBATUS_EXCEPTION(
43  converter_exception("invalid regular expression: " + regexp));
44  }
45  if (group > re_.NumberOfCapturingGroups()) {
46  std::string msg = "regexp '" + regexp + "' only contains "
47  + lexical_cast<std::string>(re_.NumberOfCapturingGroups())
48  + " groups, but 'group' is " + lexical_cast<std::string>(group);
50  }
51 }
52 
54  const std::string& str,
55  std::vector<std::pair<size_t, size_t> >& bounds) const {
56  re2::StringPiece input(str.c_str());
57  int groupSize = re_.NumberOfCapturingGroups() + 1;
58  std::vector<re2::StringPiece> words(groupSize);
59  size_t current = 0;
60  while (re_.Match(input, current, input.size(), re2::RE2::UNANCHORED,
61  &(words[0]), words.size())) {
62  size_t len = words[group_].length();
63  size_t pos = words[group_].begin() - input.begin();
64  bounds.push_back(std::make_pair(pos, len));
65 
66  size_t next = words[group_].end() - input.begin();
67  if (current == next) {
68  // did not match
69  current += 1;
70  } else {
71  current = next;
72  }
73  }
74 }
75 
76 } // namespace fv_converter
77 } // namespace core
78 } // namespace jubatus
regexp_splitter(const std::string &regexp, int group)
#define JUBATUS_EXCEPTION(e)
Definition: exception.hpp:79
void split(const std::string &str, std::vector< std::pair< size_t, size_t > > &bounds) const