jubatus_core  0.1.2
Jubatus: Online machine learning framework for distributed environment
onig_splitter.cpp
Go to the documentation of this file.
1 // Jubatus: Online machine learning framework for distributed environment
2 // Copyright (C) 2013 Preferred Networks and Nippon Telegraph and Telephone Corporation.
3 //
4 // This library is free software; you can redistribute it and/or
5 // modify it under the terms of the GNU Lesser General Public
6 // License version 2.1 as published by the Free Software Foundation.
7 //
8 // This library is distributed in the hope that it will be useful,
9 // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 // Lesser General Public License for more details.
12 //
13 // You should have received a copy of the GNU Lesser General Public
14 // License along with this library; if not, write to the Free Software
15 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16 
17 
18 #include "onig_splitter.hpp"
19 #include <string>
20 #include <utility>
21 #include <vector>
22 #include "jubatus/util/lang/cast.h"
23 #include "exception.hpp"
24 
25 using jubatus::util::lang::lexical_cast;
27 
28 namespace jubatus {
29 namespace core {
30 namespace fv_converter {
31 
32 regexp_splitter::regexp_splitter(const std::string& regexp, int group)
33  : reg_(NULL),
34  group_(group) {
35  if (group < 0) {
37  "'group' must be positive: " + lexical_cast<std::string>(group)));
38  }
39 
40  const UChar* pattern = reinterpret_cast<const UChar*>(regexp.data());
41  if (ONIG_NORMAL != onig_new(&reg_, pattern, pattern + regexp.size(),
42  ONIG_OPTION_DEFAULT, ONIG_ENCODING_UTF8, ONIG_SYNTAX_PERL, NULL)) {
44  "invalid regular expression: " + regexp));
45  }
46 
47  const int num_capture = onig_number_of_captures(reg_);
48  if (group > num_capture) {
49  std::string msg = "regexp '" + regexp + "' only contains "
50  + lexical_cast<std::string>(num_capture)
51  + " groups, but 'group' is " + lexical_cast<std::string>(group);
53  }
54 }
55 
57  if (reg_) {
58  onig_free(reg_);
59  }
60 }
61 
63  const std::string& str,
64  std::vector<std::pair<size_t, size_t> >& bounds) const {
65 
66  const UChar* head = reinterpret_cast<const UChar*>(str.data());
67  const UChar* end = head + str.size();
68 
69  OnigRegion* region = onig_region_new();
70  try {
71  int cur = 0;
72  while (head + cur < end) {
73  int match
74  = onig_match(reg_, head, end, head + cur, region, ONIG_OPTION_NONE);
75  if (match < 0) {
76  // did not match
77  cur++;
78  continue;
79  }
80 
81  const int pos = region->beg[group_];
82  const int len = region->end[group_] - pos;
83  bounds.push_back(std::make_pair(pos, len));
84 
85  if (len > 0) {
86  cur += len;
87  } else {
88  ++cur;
89  }
90  }
91  onig_region_free(region, 1);
92  } catch (...) {
93  onig_region_free(region, 1);
94  throw;
95  }
96 }
97 
98 } // namespace fv_converter
99 } // namespace core
100 } // namespace jubatus
regexp_splitter(const std::string &regexp, int group)
#define JUBATUS_EXCEPTION(e)
Definition: exception.hpp:79
void split(const std::string &str, std::vector< std::pair< size_t, size_t > > &bounds) const