jubatus_core  0.1.2
Jubatus: Online machine learning framework for distributed environment
character_ngram.cpp
Go to the documentation of this file.
1 // Jubatus: Online machine learning framework for distributed environment
2 // Copyright (C) 2011 Preferred Networks and Nippon Telegraph and Telephone Corporation.
3 //
4 // This library is free software; you can redistribute it and/or
5 // modify it under the terms of the GNU Lesser General Public
6 // License version 2.1 as published by the Free Software Foundation.
7 //
8 // This library is distributed in the hope that it will be useful,
9 // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 // Lesser General Public License for more details.
12 //
13 // You should have received a copy of the GNU Lesser General Public
14 // License along with this library; if not, write to the Free Software
15 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16 
17 #include "character_ngram.hpp"
18 
19 #include <string>
20 #include <utility>
21 #include <vector>
22 
23 namespace jubatus {
24 namespace core {
25 namespace fv_converter {
26 
27 namespace {
28 
// Tells whether the byte `c` may start a character.
//
// A byte whose two most significant bits are `10` (0x80..0xBF) is treated as
// the continuation of a multi-byte character; this is the bit pattern UTF-8
// uses for continuation bytes. Every other byte value is reported as the
// beginning of a character.
//
// @param c  byte to classify
// @return   false for bytes in 0x80..0xBF, true otherwise
bool is_begin_of_character(unsigned char c) {
  // Keep only the top two bits and compare them against the `10` marker.
  return (c & 0xC0) != 0x80;
}
32 
33 } // namespace
34 
// Parameter list and body of a const member function; the line carrying the
// return type and function name is not present in this listing.
//
// Scans `string` byte by byte and records, for every window of `length_`
// consecutive characters, a pair (byte offset of the window, byte length of
// the window). A character ends at index i when i is the end of the string or
// when is_begin_of_character(string[i]) is true.
//
// @param string          text to scan; only read
// @param ret_boundaries  receives the (offset, length) pairs; its previous
//                        contents are discarded. It ends up empty when the
//                        string holds fewer than `length_` characters.
//
// NOTE(review): `queue[p]` is accessed with p == 0, so `length_` must be at
// least 1 here; presumably that is enforced where `length_` is set -- confirm.
36  const std::string& string,
37  std::vector<std::pair<size_t, size_t> >& ret_boundaries) const {
38  const size_t len = length_;
    // Ring buffer holding the start offsets of the last `len` characters.
    // It is zero-initialised, so the first window starts at offset 0.
39  std::vector<size_t> queue(len);
    // p: ring-buffer slot to read/overwrite next; n: characters seen so far.
40  size_t p = 0;
41  size_t n = 0;
42 
    // Results are collected locally and handed over to the caller at the end.
43  std::vector<std::pair<size_t, size_t> > bounds;
    // i starts at 1 because index 0 can only be the start of the first
    // character, never the end of one. i == string.size() closes the last one.
44  for (size_t i = 1; i <= string.size(); ++i) {
    // The i == size() test comes first so string[i] is never read at size().
45  if (i == string.size() || is_begin_of_character(string[i])) {
46  ++n;
47  if (n >= len) {
    // queue[p] is the oldest stored offset, i.e. where the character that
    // lies `len` characters before position i begins.
48  size_t b = queue[p];
49  bounds.push_back(std::make_pair(b, i - b));
50  }
    // Position i is where the next character starts; remember it in the slot
    // just consumed and advance, wrapping around at `len`.
51  queue[p] = i;
52  ++p;
53  if (p == len) {
54  p = 0;
55  }
56  }
57  }
58 
    // Hand the result to the caller; the old contents of ret_boundaries move
    // into `bounds` and are destroyed when it goes out of scope.
59  bounds.swap(ret_boundaries);
60 }
61 
62 } // namespace fv_converter
63 } // namespace core
64 } // namespace jubatus
void split(const std::string &string, std::vector< std::pair< size_t, size_t > > &ret_boundaries) const