bool Parser::CheckAnimeSeasonKeyword(const token_iterator_t token) { auto set_anime_season = [&](token_iterator_t first, token_iterator_t second, const string_t& content) { elements_.insert(kElementAnimeSeason, content); first->category = kIdentifier; second->category = kIdentifier; }; auto previous_token = FindPreviousToken(tokens_, token, kFlagNotDelimiter); if (previous_token != tokens_.end()) { auto number = GetNumberFromOrdinal(previous_token->content); if (!number.empty()) { set_anime_season(previous_token, token, number); return true; } } auto next_token = FindNextToken(tokens_, token, kFlagNotDelimiter); if (next_token != tokens_.end() && IsNumericString(next_token->content)) { set_anime_season(token, next_token, next_token->content); return true; } return false; }
void Parser::SearchForReleaseGroup() { token_container_t::iterator token_begin = tokens_.begin(); token_container_t::iterator token_end = tokens_.begin(); do { // Find the first enclosed unknown token token_begin = FindToken(token_end, tokens_.end(), kFlagEnclosed | kFlagUnknown); if (token_begin == tokens_.end()) return; // Continue until a bracket or identifier is found token_end = FindToken(token_begin, tokens_.end(), kFlagBracket | kFlagIdentifier); if (token_end->category != kBracket) continue; // Ignore if it's not the first non-delimiter token in group token_container_t::iterator previous_token = FindPreviousToken(tokens_, token_begin, kFlagNotDelimiter); if (previous_token != tokens_.end() && previous_token->category != kBracket) { continue; } // Build release group BuildElement(kElementReleaseGroup, true, token_begin, token_end); return; } while (token_begin != tokens_.end()); }
bool Parser::IsTokenIsolated(const token_iterator_t token) const { auto previous_token = FindPreviousToken(tokens_, token, kFlagNotDelimiter); if (previous_token == tokens_.end() || previous_token->category != kBracket) return false; auto next_token = FindNextToken(tokens_, token, kFlagNotDelimiter); if (next_token == tokens_.end() || next_token->category != kBracket) return false; return true; }
bool Parser::SearchForSeparatedNumbers(std::vector<size_t>& tokens) { for (auto token_index = tokens.begin(); token_index != tokens.end(); ++token_index) { auto token = tokens_.begin() + *token_index; auto previous_token = FindPreviousToken(tokens_, token, kFlagNotDelimiter); // See if the number has a preceding "-" separator if (previous_token != tokens_.end() && previous_token->category == kUnknown && IsDashCharacter(previous_token->content)) { if (SetEpisodeNumber(token->content, *token, true)) { previous_token->category = kIdentifier; return true; } } } return false; }
bool Parser::SearchForLastNumber(std::vector<size_t>& tokens) { for (auto it = tokens.rbegin(); it != tokens.rend(); ++it) { size_t token_index = *it; auto token = tokens_.begin() + token_index; // Assuming that episode number always comes after the title, first token // cannot be what we're looking for if (token_index == 0) continue; // An enclosed token is unlikely to be the episode number at this point if (token->enclosed) continue; // Ignore if it's the first non-enclosed, non-delimiter token if (std::all_of(tokens_.begin(), token, [](const Token& token) { return token.enclosed || token.category == kDelimiter; })) continue; // Ignore if the previous token is "Movie" or "Part" auto previous_token = FindPreviousToken(tokens_, token, kFlagNotDelimiter); if (previous_token != tokens_.end() && previous_token->category == kUnknown) { if (IsStringEqualTo(previous_token->content, L"Movie") || IsStringEqualTo(previous_token->content, L"Part")) { continue; } } // We'll use this number after all if (SetEpisodeNumber(token->content, *token, true)) return true; } return false; }
void Parser::SearchForAnimeTitle() { bool enclosed_title = false; // Find the first non-enclosed unknown token auto token_begin = FindToken(tokens_.begin(), tokens_.end(), kFlagNotEnclosed | kFlagUnknown); // If that doesn't work, find the first unknown token in the second enclosed // group, assuming that the first one is the release group if (token_begin == tokens_.end()) { enclosed_title = true; token_begin = tokens_.begin(); bool skipped_previous_group = false; do { token_begin = FindToken(token_begin, tokens_.end(), kFlagUnknown); if (token_begin == tokens_.end()) break; // Ignore groups that are composed of non-Latin characters if (IsMostlyLatinString(token_begin->content)) if (skipped_previous_group) break; // Found it // Get the first unknown token of the next group token_begin = FindToken(token_begin, tokens_.end(), kFlagBracket); token_begin = FindToken(token_begin, tokens_.end(), kFlagUnknown); skipped_previous_group = true; } while (token_begin != tokens_.end()); } if (token_begin == tokens_.end()) return; // Continue until an identifier (or a bracket, if the title is enclosed) // is found auto token_end = FindToken(token_begin, tokens_.end(), kFlagIdentifier | (enclosed_title ? kFlagBracket : kFlagNone)); // If within the interval there's an open bracket without its matching pair, // move the upper endpoint back to the bracket if (!enclosed_title) { auto last_bracket = token_end; bool bracket_open = false; for (auto token = token_begin; token != token_end; ++token) { if (token->category == kBracket) { last_bracket = token; bracket_open = !bracket_open; } } if (bracket_open) token_end = last_bracket; } // If the interval ends with an enclosed group (e.g. "Anime Title [Fansub]"), // move the upper endpoint back to the beginning of the group. We ignore // parentheses in order to keep certain groups (e.g. "(TV)") intact. if (!enclosed_title) { auto token = FindPreviousToken(tokens_, token_end, kFlagNotDelimiter); while (CheckTokenCategory(token, kBracket) && token->content.front() != ')') { token = FindPreviousToken(tokens_, token, kFlagBracket); if (token != tokens_.end()) { token_end = token; token = FindPreviousToken(tokens_, token_end, kFlagNotDelimiter); } } } // Build anime title BuildElement(kElementAnimeTitle, false, token_begin, token_end); }
void Tokenizer::ValidateDelimiterTokens() { auto is_delimiter_token = [&](token_iterator_t it) { return it != tokens_.end() && it->category == kDelimiter; }; auto is_unknown_token = [&](token_iterator_t it) { return it != tokens_.end() && it->category == kUnknown; }; auto is_single_character_token = [&](token_iterator_t it) { return is_unknown_token(it) && it->content.size() == 1 && it->content.front() != L'-'; }; auto append_token_to = [](token_iterator_t token, token_iterator_t append_to) { append_to->content.append(token->content); token->category = kInvalid; }; for (auto token = tokens_.begin(); token != tokens_.end(); ++token) { if (token->category != kDelimiter) continue; auto delimiter = token->content.front(); auto prev_token = FindPreviousToken(tokens_, token, kFlagValid); auto next_token = FindNextToken(tokens_, token, kFlagValid); // Check for single-character tokens to prevent splitting group names, // keywords, episode number, etc. if (delimiter != L' ' && delimiter != L'_') { if (is_single_character_token(prev_token)) { append_token_to(token, prev_token); while (is_unknown_token(next_token)) { append_token_to(next_token, prev_token); next_token = FindNextToken(tokens_, next_token, kFlagValid); if (is_delimiter_token(next_token) && next_token->content.front() == delimiter) { append_token_to(next_token, prev_token); next_token = FindNextToken(tokens_, next_token, kFlagValid); } } continue; } if (is_single_character_token(next_token)) { append_token_to(token, prev_token); append_token_to(next_token, prev_token); continue; } } // Check for adjacent delimiters if (is_unknown_token(prev_token) && is_delimiter_token(next_token)) { auto next_delimiter = next_token->content.front(); if (delimiter != next_delimiter && delimiter != ',') { if (next_delimiter == ' ' || next_delimiter == '_') { append_token_to(token, prev_token); } } } } auto remove_if_invalid = std::remove_if(tokens_.begin(), tokens_.end(), [](const Token& token) -> bool { return token.category == kInvalid; }); tokens_.erase(remove_if_invalid, tokens_.end()); }