Пример #1
0
// Builds the katakana normalization table stored in normalize_map.
//
// The two literals below are parallel tables: the i-th character of
// `unnormalized` (voiced, semi-voiced and small katakana) is mapped to the
// i-th character of `normalized` (the corresponding plain, full-size
// katakana).
// NOTE(review): this relies on string_to_ustring yielding one element per
// character of the literal -- confirm the source-file encoding matches what
// string_to_ustring expects.
ReadNormalizer::ReadNormalizer()
    : normalize_map() {
  const ustring unnormalized = string_to_ustring(
      "ガギグゲゴザジズゼゾダヂヅデドバビブベボパピプペポァィゥェォャュョッ");
  // Small "ョ" must normalize to "ヨ"; mapping it to "ヤ" would make it
  // indistinguishable from small "ャ" after normalization.
  const ustring normalized = string_to_ustring(
      "カキクケコサシスセソタチツテトハヒフヘホハヒフヘホアイウエオヤユヨツ");
  // Bound the loop by both tables so that an edit which leaves them with
  // different lengths cannot index past the end of `normalized`.
  for (size_t i = 0; i < unnormalized.size() && i < normalized.size(); ++i) {
    normalize_map[unnormalized[i]] = normalized[i];
  }
}
Пример #2
0
void char_splitter::split(
    const std::string& string,
    std::vector<std::pair<size_t, size_t> >& ret_boundaries) const {
  std::vector<std::pair<size_t, size_t> > bounds;
  const jubatus::util::data::string::ustring target = string_to_ustring(string);

  size_t last = 0;
  while (true) {
    size_t begin = target.find_first_not_of(separator_, last);
    if (begin == std::string::npos) {
      break;
    }

    size_t begin_bytes = ustring_to_string(target.substr(0, begin)).size();
    size_t end = target.find_first_of(separator_, begin);
    if (end == std::string::npos) {
      size_t len_bytes = ustring_to_string(target.substr(begin)).size();
      bounds.push_back(std::make_pair(begin_bytes, len_bytes));
      break;
    } else {
      size_t len = end - begin;
      size_t len_bytes = ustring_to_string(target.substr(begin, len)).size();
      bounds.push_back(std::make_pair(begin_bytes, len_bytes));
      last = end;
    }
  }

  bounds.swap(ret_boundaries);
}