Ejemplo n.º 1
0
// Scans the part of |filename| described by |range| for a fixed set of
// well-known keywords. For every keyword that occurs in that part, the keyword
// is inserted into |elements| under its category and the matched span is
// appended to |preidentified_tokens|. Reported offsets are relative to the
// beginning of |filename|, not to the beginning of |range|.
//
// Matching is an exact (case-sensitive) character comparison, and only the
// first occurrence of each keyword inside the range is reported.
void KeywordManager::Peek(const string_t& filename,
                          const TokenRange& range,
                          Elements& elements,
                          std::vector<TokenRange>& preidentified_tokens) const {
  typedef std::pair<ElementCategory, std::vector<string_t>> entry_t;
  static std::vector<entry_t> entries;

  // The table has static storage duration, so it must be filled exactly once.
  // Filling it unconditionally would append four more entries on every call
  // and make each later call report every keyword multiple times.
  // Note that this lazy fill is not synchronized.
  if (entries.empty()) {
    { ElementCategory c = kElementAudioTerm;       const char_t* k[] = {L"Dual Audio"}; entry_t e; e.first = c; e.second.assign(k, k + _countof(k)); entries.push_back(e); }
    { ElementCategory c = kElementVideoTerm;       const char_t* k[] = {L"H264", L"H.264", L"h264", L"h.264"}; entry_t e; e.first = c; e.second.assign(k, k + _countof(k)); entries.push_back(e); }
    { ElementCategory c = kElementVideoResolution; const char_t* k[] = {L"480p", L"720p", L"1080p"}; entry_t e; e.first = c; e.second.assign(k, k + _countof(k)); entries.push_back(e); }
    { ElementCategory c = kElementSource;          const char_t* k[] = {L"Blu-Ray"}; entry_t e; e.first = c; e.second.assign(k, k + _countof(k)); entries.push_back(e); }
  }

  // Iterators delimiting the searched part of the filename.
  string_t::const_iterator it_begin = filename.begin() + range.offset;
  string_t::const_iterator it_end = it_begin + range.size;

  for(std::vector<entry_t>::const_iterator entry = entries.begin(); entry != entries.end(); ++entry) {
    for(std::vector<string_t>::const_iterator keyword = entry->second.begin(); keyword != entry->second.end(); ++keyword) {
      string_t::const_iterator it = std::search(it_begin, it_end, keyword->begin(), keyword->end());
      if (it != it_end) {
        // Convert the match position into an offset from the filename start.
        size_t offset = it - filename.begin();
        elements.insert(entry->first, *keyword);
        preidentified_tokens.push_back(TokenRange(offset, keyword->size()));
      }
    }
  }
}
Ejemplo n.º 2
0
void KeywordManager::Peek(const string_t& filename,
                          const TokenRange& range,
                          Elements& elements,
                          std::vector<TokenRange>& preidentified_tokens) const {
  typedef std::pair<ElementCategory, std::vector<string_t>> entry_t;
  static const std::vector<entry_t> entries{
    {kElementAudioTerm, {L"Dual Audio"}},
    {kElementVideoTerm, {L"H264", L"H.264", L"h264", L"h.264"}},
    {kElementVideoResolution, {L"480p", L"720p", L"1080p"}},
    {kElementSource, {L"Blu-Ray"}}
  };

  auto it_begin = filename.begin() + range.offset;
  auto it_end = it_begin + range.size;

  for (const auto& entry : entries) {
    for (const auto& keyword : entry.second) {
      auto it = std::search(it_begin, it_end, keyword.begin(), keyword.end());
      if (it != it_end) {
        auto offset = it - filename.begin();
        elements.insert(entry.first, keyword);
        preidentified_tokens.push_back(TokenRange(offset, keyword.size()));
      }
    }
  }
}
Ejemplo n.º 3
0
void Tokenizer::TokenizeByDelimiter(bool enclosed, const TokenRange& range) {
  // Each group occasionally has a different delimiter, which is why we can't
  // analyze the whole filename in one go.
  const char_t delimiter = GetDelimiter(range);

  // TODO: Better handle groups with multiple delimiters
  if (!ValidateDelimiter(delimiter, enclosed, range)) {
    AddToken(kUnknown, enclosed, range);
    return;
  }

  TokenRange new_range(range.offset, 0);

  for (size_t offset = range.offset;
       offset < range.offset + range.size; offset++) {
    const char_t character = filename_.at(offset);

    if (character == delimiter) {
      // Add new unknown token
      if (new_range.offset < offset) {
        new_range.size = offset - new_range.offset;
        AddToken(kUnknown, enclosed, new_range);
      }
      // Add delimiter
      AddToken(kDelimiter, enclosed, TokenRange(offset, 1));
      new_range.offset = offset + 1;
    } else if (offset == range.offset + range.size - 1) {
      // Add last unknown token
      new_range.size = offset - new_range.offset + 1;
      AddToken(kUnknown, enclosed, new_range);
    }
  }
}
Ejemplo n.º 4
0
void Tokenizer::TokenizeByDelimiters(bool enclosed, const TokenRange& range) {
  const string_t delimiters = GetDelimiters(range);

  if (delimiters.empty()) {
    AddToken(kUnknown, enclosed, range);
    return;
  }

  auto char_begin = filename_.begin() + range.offset;
  const auto char_end = char_begin + range.size;
  auto current_char = char_begin;

  while (current_char != char_end) {
    current_char = std::find_first_of(current_char, char_end,
                                      delimiters.begin(), delimiters.end());

    const TokenRange subrange(std::distance(filename_.begin(), char_begin),
                              std::distance(char_begin, current_char));

    if (subrange.size > 0)  // Found unknown token
      AddToken(kUnknown, enclosed, subrange);

    if (current_char != char_end) {  // Found delimiter
      AddToken(kDelimiter, enclosed,
               TokenRange(subrange.offset + subrange.size, 1));
      char_begin = ++current_char;
    }
  }

  ValidateDelimiterTokens();
}
Ejemplo n.º 5
0
void Tokenizer::TokenizeByBrackets() {
  static const std::vector<std::pair<char_t, char_t>> brackets{
      {L'(', L')'},  // U+0028-U+0029 Parenthesis
      {L'[', L']'},  // U+005B-U+005D Square bracket
      {L'{', L'}'},  // U+007B-U+007D Curly bracket
      {L'\u300C', L'\u300D'},  // Corner bracket
      {L'\u300E', L'\u300F'},  // White corner bracket
      {L'\u3010', L'\u3011'},  // Black lenticular bracket
      {L'\uFF08', L'\uFF09'},  // Fullwidth parenthesis
  };

  bool is_bracket_open = false;
  char_t matching_bracket = L'\0';

  auto char_begin = filename_.begin();
  const auto char_end = filename_.end();

  // This is basically std::find_first_of() customized to our needs
  auto find_first_bracket = [&]() -> string_t::const_iterator {
    for (auto it = char_begin; it != char_end; ++it) {
      for (const auto& bracket_pair : brackets) {
        if (*it == bracket_pair.first) {
          matching_bracket = bracket_pair.second;
          return it;
        }
      }
    }
    return char_end;
  };

  auto current_char = char_begin;

  while (current_char != char_end && char_begin != char_end) {
    if (!is_bracket_open) {
      current_char = find_first_bracket();
    } else {
      // Looking for the matching bracket allows us to better handle some rare
      // cases with nested brackets.
      current_char = std::find(char_begin, char_end, matching_bracket);
    }

    const TokenRange range(std::distance(filename_.begin(), char_begin),
                           std::distance(char_begin, current_char));

    if (range.size > 0)  // Found unknown token
      TokenizeByPreidentified(is_bracket_open, range);

    if (current_char != char_end) {  // Found bracket
      AddToken(kBracket, true, TokenRange(range.offset + range.size, 1));
      is_bracket_open = !is_bracket_open;
      char_begin = ++current_char;
    }
  }
}
Ejemplo n.º 6
0
void Tokenizer::TokenizeByBrackets() {
  static const string_t kOpeningBrackets = L"[({";
  static const string_t kClosingBrackets = L"])}";

  bool bracket_open = false;
  size_t last_bracket_index = 0;

  TokenRange range;

  for (size_t offset = 0; offset < filename_.size(); offset++) {
    const auto& brackets = bracket_open ? kClosingBrackets : kOpeningBrackets;
    const size_t index = brackets.find(filename_.at(offset));

    // Character is a bracket
    if (index != string_t::npos) {
      // Check if it matches last open bracket
      if (bracket_open) {
        if (index != last_bracket_index)
          continue;
      } else {
        last_bracket_index = index;
      }

      // Add unknown token
      if (range.offset < offset) {
        range.size = offset - range.offset;
        TokenizeByDelimiter(bracket_open, range);
      }
      // Add bracket
      AddToken(kBracket, true, TokenRange(offset, 1));
      bracket_open = !bracket_open;
      range.offset = offset + 1;

    // Character is not a bracket, and the loop reached the end
    } else if (offset == filename_.size() - 1) {
      // Add last unknown token
      range.size = offset - range.offset + 1;
      TokenizeByDelimiter(false, range);
    }
  }
}