// Collects the distinct delimiter characters that occur within |range| of the
// filename, in order of first appearance. A character qualifies when it is not
// alphanumeric and is listed in options_.allowed_delimiters.
string_t Tokenizer::GetDelimiters(const TokenRange& range) const {
  const auto first = filename_.begin() + range.offset;
  const auto last = first + range.size;

  string_t found;
  for (auto it = first; it != last; ++it) {
    const char_t symbol = *it;
    const bool is_candidate =
        !IsAlphanumericChar(symbol) &&
        options_.allowed_delimiters.find(symbol) != string_t::npos;
    // Each delimiter is reported only once, no matter how often it occurs.
    if (is_candidate && found.find(symbol) == string_t::npos)
      found.push_back(symbol);
  }

  return found;
}
// Picks the single most plausible word delimiter within |range| of the
// filename.
//
// The range is first passed through TrimWhitespace(); if that returns false, a
// space is returned right away (presumably nothing is left to analyze --
// confirm against TrimWhitespace). Otherwise every non-alphanumeric character
// in the range that appears in the delimiter table is counted, and the
// candidates are weighed against each other by table precedence and relative
// frequency.
//
// Returns L'\0' if none of the known delimiters occurs in the range.
char_t Tokenizer::GetDelimiter(TokenRange range) const {
  // Symbols are sorted by their precedence, in decreasing order. While the most
  // common delimiters are underscore, space and dot, we give comma the priority
  // to handle the case where words are separated by ", ". Besides, we'll be
  // trimming whitespace later on.
  static const string_t kDelimiterTable = L",_ .-+;&|~";

  // Trim whitespace so that it doesn't interfere with our frequency analysis.
  // This proves useful for handling some edge cases, and it doesn't seem to
  // have any side effects.
  if (!TrimWhitespace(filename_, range))
    return L' ';

  // The counters are local to each call, so that this const member function
  // carries no hidden mutable state shared between calls, instances or
  // threads.
  std::map<char_t, size_t> frequency;
  for (const auto& character : kDelimiterTable) {
    frequency.insert(std::make_pair(character, 0));
  }

  // Count all possible delimiters
  for (size_t i = range.offset; i < range.offset + range.size; i++) {
    const char_t character = filename_.at(i);
    if (IsAlphanumericChar(character))
      continue;
    const auto it = frequency.find(character);
    if (it != frequency.end())
      it->second += 1;
  }

  // Note that the map is traversed in ascending character order, not in table
  // order; precedence only enters through the table distance computed below.
  char_t delimiter = L'\0';
  for (const auto& pair : frequency) {
    if (pair.second == 0)
      continue;

    // Initialize delimiter at first iteration
    if (delimiter == L'\0') {
      delimiter = pair.first;
      continue;
    }

    const int character_distance =
        static_cast<int>(kDelimiterTable.find(pair.first)) -
        static_cast<int>(kDelimiterTable.find(delimiter));
    // If the distance is negative, then the new delimiter has higher priority
    if (character_distance < 0) {
      delimiter = pair.first;
      continue;
    }

    // Even if the new delimiter has lower priority, it may be much more common.
    // The distance is strictly positive here: negative values were handled
    // above and two distinct map keys cannot share a table position.
    const float frequency_ratio =
        static_cast<float>(pair.second) /
        static_cast<float>(frequency.at(delimiter));
    // The constant value was chosen by trial and error. There should be room
    // for improvement.
    if (frequency_ratio / static_cast<float>(character_distance) > 0.8f)
      delimiter = pair.first;
  }

  return delimiter;
}