void KeywordManager::Peek(const string_t& filename, const TokenRange& range, Elements& elements, std::vector<TokenRange>& preidentified_tokens) const { typedef std::pair<ElementCategory, std::vector<string_t>> entry_t; static /*const*/ std::vector<entry_t> entries; { ElementCategory c = kElementAudioTerm; const char_t* k[] = {L"Dual Audio"}; entry_t e; e.first = c; e.second.assign(k, k + _countof(k)); entries.push_back(e); } { ElementCategory c = kElementVideoTerm; const char_t* k[] = {L"H264", L"H.264", L"h264", L"h.264"}; entry_t e; e.first = c; e.second.assign(k, k + _countof(k)); entries.push_back(e); } { ElementCategory c = kElementVideoResolution; const char_t* k[] = {L"480p", L"720p", L"1080p"}; entry_t e; e.first = c; e.second.assign(k, k + _countof(k)); entries.push_back(e); } { ElementCategory c = kElementSource; const char_t* k[] = {L"Blu-Ray"}; entry_t e; e.first = c; e.second.assign(k, k + _countof(k)); entries.push_back(e); } string_t::const_iterator it_begin = filename.begin() + range.offset; string_t::const_iterator it_end = it_begin + range.size; for(std::vector<entry_t>::const_iterator entry = entries.begin(); entry != entries.end(); ++entry) { for(std::vector<string_t>::const_iterator keyword = entry->second.begin(); keyword != entry->second.end(); ++keyword) { string_t::const_iterator it = std::search(it_begin, it_end, keyword->begin(), keyword->end()); if (it != it_end) { size_t offset = it - filename.begin(); elements.insert(entry->first, *keyword); preidentified_tokens.push_back(TokenRange(offset, keyword->size())); } } } }
void KeywordManager::Peek(const string_t& filename, const TokenRange& range, Elements& elements, std::vector<TokenRange>& preidentified_tokens) const { typedef std::pair<ElementCategory, std::vector<string_t>> entry_t; static const std::vector<entry_t> entries{ {kElementAudioTerm, {L"Dual Audio"}}, {kElementVideoTerm, {L"H264", L"H.264", L"h264", L"h.264"}}, {kElementVideoResolution, {L"480p", L"720p", L"1080p"}}, {kElementSource, {L"Blu-Ray"}} }; auto it_begin = filename.begin() + range.offset; auto it_end = it_begin + range.size; for (const auto& entry : entries) { for (const auto& keyword : entry.second) { auto it = std::search(it_begin, it_end, keyword.begin(), keyword.end()); if (it != it_end) { auto offset = it - filename.begin(); elements.insert(entry.first, keyword); preidentified_tokens.push_back(TokenRange(offset, keyword.size())); } } } }
void Tokenizer::TokenizeByDelimiter(bool enclosed, const TokenRange& range) { // Each group occasionally has a different delimiter, which is why we can't // analyze the whole filename in one go. const char_t delimiter = GetDelimiter(range); // TODO: Better handle groups with multiple delimiters if (!ValidateDelimiter(delimiter, enclosed, range)) { AddToken(kUnknown, enclosed, range); return; } TokenRange new_range(range.offset, 0); for (size_t offset = range.offset; offset < range.offset + range.size; offset++) { const char_t character = filename_.at(offset); if (character == delimiter) { // Add new unknown token if (new_range.offset < offset) { new_range.size = offset - new_range.offset; AddToken(kUnknown, enclosed, new_range); } // Add delimiter AddToken(kDelimiter, enclosed, TokenRange(offset, 1)); new_range.offset = offset + 1; } else if (offset == range.offset + range.size - 1) { // Add last unknown token new_range.size = offset - new_range.offset + 1; AddToken(kUnknown, enclosed, new_range); } } }
void Tokenizer::TokenizeByDelimiters(bool enclosed, const TokenRange& range) { const string_t delimiters = GetDelimiters(range); if (delimiters.empty()) { AddToken(kUnknown, enclosed, range); return; } auto char_begin = filename_.begin() + range.offset; const auto char_end = char_begin + range.size; auto current_char = char_begin; while (current_char != char_end) { current_char = std::find_first_of(current_char, char_end, delimiters.begin(), delimiters.end()); const TokenRange subrange(std::distance(filename_.begin(), char_begin), std::distance(char_begin, current_char)); if (subrange.size > 0) // Found unknown token AddToken(kUnknown, enclosed, subrange); if (current_char != char_end) { // Found delimiter AddToken(kDelimiter, enclosed, TokenRange(subrange.offset + subrange.size, 1)); char_begin = ++current_char; } } ValidateDelimiterTokens(); }
void Tokenizer::TokenizeByBrackets() { static const std::vector<std::pair<char_t, char_t>> brackets{ {L'(', L')'}, // U+0028-U+0029 Parenthesis {L'[', L']'}, // U+005B-U+005D Square bracket {L'{', L'}'}, // U+007B-U+007D Curly bracket {L'\u300C', L'\u300D'}, // Corner bracket {L'\u300E', L'\u300F'}, // White corner bracket {L'\u3010', L'\u3011'}, // Black lenticular bracket {L'\uFF08', L'\uFF09'}, // Fullwidth parenthesis }; bool is_bracket_open = false; char_t matching_bracket = L'\0'; auto char_begin = filename_.begin(); const auto char_end = filename_.end(); // This is basically std::find_first_of() customized to our needs auto find_first_bracket = [&]() -> string_t::const_iterator { for (auto it = char_begin; it != char_end; ++it) { for (const auto& bracket_pair : brackets) { if (*it == bracket_pair.first) { matching_bracket = bracket_pair.second; return it; } } } return char_end; }; auto current_char = char_begin; while (current_char != char_end && char_begin != char_end) { if (!is_bracket_open) { current_char = find_first_bracket(); } else { // Looking for the matching bracket allows us to better handle some rare // cases with nested brackets. current_char = std::find(char_begin, char_end, matching_bracket); } const TokenRange range(std::distance(filename_.begin(), char_begin), std::distance(char_begin, current_char)); if (range.size > 0) // Found unknown token TokenizeByPreidentified(is_bracket_open, range); if (current_char != char_end) { // Found bracket AddToken(kBracket, true, TokenRange(range.offset + range.size, 1)); is_bracket_open = !is_bracket_open; char_begin = ++current_char; } } }
void Tokenizer::TokenizeByBrackets() { static const string_t kOpeningBrackets = L"[({"; static const string_t kClosingBrackets = L"])}"; bool bracket_open = false; size_t last_bracket_index = 0; TokenRange range; for (size_t offset = 0; offset < filename_.size(); offset++) { const auto& brackets = bracket_open ? kClosingBrackets : kOpeningBrackets; const size_t index = brackets.find(filename_.at(offset)); // Character is a bracket if (index != string_t::npos) { // Check if it matches last open bracket if (bracket_open) { if (index != last_bracket_index) continue; } else { last_bracket_index = index; } // Add unknown token if (range.offset < offset) { range.size = offset - range.offset; TokenizeByDelimiter(bracket_open, range); } // Add bracket AddToken(kBracket, true, TokenRange(offset, 1)); bracket_open = !bracket_open; range.offset = offset + 1; // Character is not a bracket, and the loop reached the end } else if (offset == filename_.size() - 1) { // Add last unknown token range.size = offset - range.offset + 1; TokenizeByDelimiter(false, range); } } }