// Sets up a search buffer for |target| under |options|: copies and folds the
// target text, sizes the working buffer, and installs the target as the
// pattern of the shared ICU searcher.
inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
    : m_options(options)
    , m_prefixLength(0)
    , m_numberOfCharactersJustAppended(0)
    , m_atBreak(true)
    , m_needsMoreContext(options & AtWordStarts)
    , m_targetRequiresKanaWorkaround(containsKanaLetters(target))
{
    ASSERT(!target.isEmpty());
    target.appendTo(m_target);

    // FIXME: Ideally the searcher itself would be tailored to fold quote marks,
    // making this separate in-place replacement pass unnecessary. As of this
    // writing ICU has no way to layer extra tailoring on top of the
    // locale-specific tailoring.
    foldQuoteMarksAndSoftHyphens(m_target.data(), m_target.size());

    // The buffer holds at least eight times the pattern length (or the minimum
    // size, whichever is larger); a quarter of its capacity is the overlap.
    size_t patternLength = m_target.size();
    m_buffer.reserveInitialCapacity(std::max(patternLength * 8, minimumSearchBufferSize));
    m_overlap = m_buffer.capacity() / 4;

    if ((m_options & AtWordStarts) && patternLength) {
        UChar32 leadingCodePoint;
        U16_GET(m_target.data(), 0, 0, patternLength, leadingCodePoint);
        // A word never really begins with a separator-category character, so a
        // target that starts with one simply drops the AtWordStarts option.
        if (isSeparator(leadingCodePoint)) {
            m_needsMoreContext = false;
            m_options &= ~AtWordStarts;
        }
    }

    // There is one global searcher; take hold of it. Running more than one
    // search buffer at a time would require moving to multiple searchers.
    lockSearcher();

    UStringSearch* sharedSearcher = blink::searcher();
    UCollator* sharedCollator = usearch_getCollator(sharedSearcher);

    // Case-insensitive searches compare at primary strength, all others at
    // tertiary. The searcher is reset whenever the strength actually changes.
    UCollationStrength wantedStrength = UCOL_TERTIARY;
    if (m_options & CaseInsensitive)
        wantedStrength = UCOL_PRIMARY;
    if (ucol_getStrength(sharedCollator) != wantedStrength) {
        ucol_setStrength(sharedCollator, wantedStrength);
        usearch_reset(sharedSearcher);
    }

    UErrorCode icuStatus = U_ZERO_ERROR;
    usearch_setPattern(sharedSearcher, m_target.data(), patternLength, &icuStatus);
    ASSERT(icuStatus == U_ZERO_ERROR);

    // Only the kana workaround needs an NFC-normalized copy of the target.
    if (!m_targetRequiresKanaWorkaround)
        return;
    normalizeCharactersIntoNFCForm(m_target.data(), m_target.size(), m_normalizedTarget);
}
// Kana workaround: reports whether a match accepted by usearch should be
// rejected anyway (a "bad match"). Always false when the target does not
// require the workaround.
inline bool SearchBuffer::isBadMatch(const UChar* match, size_t matchLength) const
{
    if (m_targetRequiresKanaWorkaround) {
        // m_normalizedMatch is a single reusable buffer, so no new buffer is
        // created for each candidate match.
        normalizeCharactersIntoNFCForm(match, matchLength, m_normalizedMatch);

        const bool kanaLettersAgree = checkOnlyKanaLettersInStrings(
            m_normalizedTarget.begin(), m_normalizedTarget.size(),
            m_normalizedMatch.begin(), m_normalizedMatch.size());
        return !kanaLettersAgree;
    }

    return false;
}
Example #3
0
// Sets up a search buffer for |target| under |options|: copies and folds the
// target text, sizes the working buffer, and hands the target to a freshly
// created TextSearcherICU as its pattern.
inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
    : m_options(options),
      m_prefixLength(0),
      m_numberOfCharactersJustAppended(0),
      m_atBreak(true),
      m_needsMoreContext(options & AtWordStarts),
      m_targetRequiresKanaWorkaround(containsKanaLetters(target)) {
  DCHECK(!target.isEmpty()) << target;
  target.appendTo(m_target);

  // FIXME: Ideally the searcher itself would be tailored to fold quote marks,
  // making this separate in-place replacement pass unnecessary. As of this
  // writing ICU has no way to layer extra tailoring on top of the
  // locale-specific tailoring.
  foldQuoteMarksAndSoftHyphens(m_target.data(), m_target.size());

  // The buffer holds at least eight times the pattern length (or the minimum
  // size, whichever is larger); a quarter of its capacity is the overlap.
  size_t patternLength = m_target.size();
  m_buffer.reserveInitialCapacity(
      std::max(patternLength * 8, kMinimumSearchBufferSize));
  m_overlap = m_buffer.capacity() / 4;

  if ((m_options & AtWordStarts) && patternLength) {
    // A word never really begins with a separator-category character, so a
    // target that starts with one simply drops the AtWordStarts option.
    if (isSeparator(getCodePointAt(m_target.data(), 0, patternLength))) {
      m_needsMoreContext = false;
      m_options &= ~AtWordStarts;
    }
  }

  // The flag given to the searcher is true exactly when CaseInsensitive is
  // not set in the (possibly adjusted) options.
  const bool caseSensitive = !(m_options & CaseInsensitive);
  m_textSearcher = WTF::makeUnique<TextSearcherICU>();
  m_textSearcher->setPattern(StringView(m_target.data(), m_target.size()),
                             caseSensitive);

  // Only the kana workaround needs an NFC-normalized copy of the target.
  if (!m_targetRequiresKanaWorkaround)
    return;
  normalizeCharactersIntoNFCForm(m_target.data(), m_target.size(),
                                 m_normalizedTarget);
}