예제 #1
0
string_t Tokenizer::GetDelimiters(const TokenRange& range) const {
  string_t delimiters;

  auto is_delimiter = [&](const char_t& c) {
    if (!IsAlphanumericChar(c))
      if (options_.allowed_delimiters.find(c) != string_t::npos)
        if (delimiters.find(c) == string_t::npos)
          return true;
    return false;
  };

  std::copy_if(filename_.begin() + range.offset,
               filename_.begin() + range.offset + range.size,
               std::back_inserter(delimiters), is_delimiter);

  return delimiters;
}
예제 #2
0
파일: tokenizer.cpp 프로젝트: vevix/anitomy
char_t Tokenizer::GetDelimiter(TokenRange range) const {
  // Symbols are sorted by their precedence, in decreasing order. While the most
  // common delimiters are underscore, space and dot, we give comma the priority
  // to handle the case where words are separated by ", ". Besides, we'll be
  // trimming whitespace later on.
  static const string_t kDelimiterTable = L",_ .-+;&|~";

  // Trim whitespace so that it doesn't interfere with our frequency analysis.
  // This proves useful for handling some edge cases, and it doesn't seem to
  // have any side effects.
  if (!TrimWhitespace(filename_, range))
    return L' ';

  static std::map<char_t, size_t> frequency;

  if (frequency.empty()) {
    // Initialize frequency map
    for (const auto& character : kDelimiterTable) {
      frequency.insert(std::make_pair(character, 0));
    }
  } else {
    // Reset frequency map
    for (auto& pair : frequency) {
      pair.second = 0;
    }
  }

  // Count all possible delimiters
  for (size_t i = range.offset; i < range.offset + range.size; i++) {
    const char_t character = filename_.at(i);
    if (IsAlphanumericChar(character))
      continue;
    if (frequency.find(character) == frequency.end())
      continue;
    frequency.at(character) += 1;
  }

  char_t delimiter = L'\0';

  for (const auto& pair : frequency) {
    if (pair.second == 0)
      continue;

    // Initialize delimiter at first iteration
    if (delimiter == L'\0') {
      delimiter = pair.first;
      continue;
    }

    int character_distance =
        static_cast<int>(kDelimiterTable.find(pair.first)) -
        static_cast<int>(kDelimiterTable.find(delimiter));
    // If the distance is negative, then the new delimiter has higher priority
    if (character_distance < 0) {
      delimiter = pair.first;
      continue;
    }

    // Even if the new delimiter has lower priority, it may be much more common
    float frequency_ratio = static_cast<float>(pair.second) /
                            static_cast<float>(frequency[delimiter]);
    // The constant value was chosen by trial and error. There should be room
    // for improvement.
    if (frequency_ratio / abs(character_distance) > 0.8f)
      delimiter = pair.first;
  }

  return delimiter;
}