void TextSearcherICU::setCaseSensitivity(bool caseSensitive) { const UCollationStrength strength = caseSensitive ? UCOL_TERTIARY : UCOL_PRIMARY; UCollator* const collator = usearch_getCollator(m_searcher); if (ucol_getStrength(collator) == strength) return; ucol_setStrength(collator, strength); usearch_reset(m_searcher); }
inline SearchBuffer::SearchBuffer(const String& target, FindOptions options) : m_options(options) , m_prefixLength(0) , m_numberOfCharactersJustAppended(0) , m_atBreak(true) , m_needsMoreContext(options & AtWordStarts) , m_targetRequiresKanaWorkaround(containsKanaLetters(target)) { ASSERT(!target.isEmpty()); target.appendTo(m_target); // FIXME: We'd like to tailor the searcher to fold quote marks for us instead // of doing it in a separate replacement pass here, but ICU doesn't offer a way // to add tailoring on top of the locale-specific tailoring as of this writing. foldQuoteMarksAndSoftHyphens(m_target.data(), m_target.size()); size_t targetLength = m_target.size(); m_buffer.reserveInitialCapacity(std::max(targetLength * 8, minimumSearchBufferSize)); m_overlap = m_buffer.capacity() / 4; if ((m_options & AtWordStarts) && targetLength) { UChar32 targetFirstCharacter; U16_GET(m_target.data(), 0, 0, targetLength, targetFirstCharacter); // Characters in the separator category never really occur at the beginning of a word, // so if the target begins with such a character, we just ignore the AtWordStart option. if (isSeparator(targetFirstCharacter)) { m_options &= ~AtWordStarts; m_needsMoreContext = false; } } // Grab the single global searcher. // If we ever have a reason to do more than once search buffer at once, we'll have // to move to multiple searchers. lockSearcher(); UStringSearch* searcher = blink::searcher(); UCollator* collator = usearch_getCollator(searcher); UCollationStrength strength = m_options & CaseInsensitive ? UCOL_PRIMARY : UCOL_TERTIARY; if (ucol_getStrength(collator) != strength) { ucol_setStrength(collator, strength); usearch_reset(searcher); } UErrorCode status = U_ZERO_ERROR; usearch_setPattern(searcher, m_target.data(), targetLength, &status); ASSERT(status == U_ZERO_ERROR); // The kana workaround requires a normalized copy of the target string. if (m_targetRequiresKanaWorkaround) normalizeCharactersIntoNFCForm(m_target.data(), m_target.size(), m_normalizedTarget); }
static void *engine_fixed_compile(error_t **error, UString *ustr, uint32_t flags) { UErrorCode status; fixed_pattern_t *p; p = mem_new(*p); p->pattern = ustr; // not needed with usearch ? p->flags = flags; p->ubrk = NULL; p->usearch = NULL; status = U_ZERO_ERROR; if (ustring_empty(ustr)) { if (IS_WORD_BOUNDED(flags)) { p->ubrk = ubrk_open(UBRK_WORD, NULL, NULL, 0, &status); } } else { if (!IS_WHOLE_LINE(flags)) { if (IS_WORD_BOUNDED(flags)) { p->ubrk = ubrk_open(UBRK_WORD, NULL, NULL, 0, &status); } else if (WITH_GRAPHEME()) { p->ubrk = ubrk_open(UBRK_CHARACTER, NULL, NULL, 0, &status); } if (U_FAILURE(status)) { fixed_pattern_destroy(p); icu_error_set(error, FATAL, status, "ubrk_open"); return NULL; } } if (IS_WORD_BOUNDED(flags) || (IS_CASE_INSENSITIVE(flags) && !IS_WHOLE_LINE(flags))) { p->usearch = usearch_open(ustr->ptr, ustr->len, USEARCH_FAKE_USTR, uloc_getDefault(), p->ubrk, &status); if (U_FAILURE(status)) { if (NULL != p->ubrk) { ubrk_close(p->ubrk); } fixed_pattern_destroy(p); icu_error_set(error, FATAL, status, "usearch_open"); return NULL; } if (IS_CASE_INSENSITIVE(flags)) { UCollator *ucol; ucol = usearch_getCollator(p->usearch); ucol_setStrength(ucol, (flags & ~OPT_MASK) > 1 ? UCOL_SECONDARY : UCOL_PRIMARY); } } } return p; }
/* {{{ grapheme_strpos_utf16 - strrpos using utf16*/ int32_t grapheme_strpos_utf16(char *haystack, size_t haystack_len, char *needle, size_t needle_len, int32_t offset, int32_t *puchar_pos, int f_ignore_case, int last) { UChar *uhaystack = NULL, *uneedle = NULL; int32_t uhaystack_len = 0, uneedle_len = 0, char_pos, ret_pos, offset_pos = 0; unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE]; UBreakIterator* bi = NULL; UErrorCode status; UStringSearch* src = NULL; UCollator *coll; if(puchar_pos) { *puchar_pos = -1; } /* convert the strings to UTF-16. */ status = U_ZERO_ERROR; intl_convert_utf8_to_utf16(&uhaystack, &uhaystack_len, haystack, haystack_len, &status ); STRPOS_CHECK_STATUS(status, "Error converting input string to UTF-16"); status = U_ZERO_ERROR; intl_convert_utf8_to_utf16(&uneedle, &uneedle_len, needle, needle_len, &status ); STRPOS_CHECK_STATUS(status, "Error converting needle string to UTF-16"); /* get a pointer to the haystack taking into account the offset */ status = U_ZERO_ERROR; bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status ); STRPOS_CHECK_STATUS(status, "Failed to get iterator"); status = U_ZERO_ERROR; ubrk_setText(bi, uhaystack, uhaystack_len, &status); STRPOS_CHECK_STATUS(status, "Failed to set up iterator"); status = U_ZERO_ERROR; src = usearch_open(uneedle, uneedle_len, uhaystack, uhaystack_len, "", bi, &status); STRPOS_CHECK_STATUS(status, "Error creating search object"); if(f_ignore_case) { coll = usearch_getCollator(src); status = U_ZERO_ERROR; ucol_setAttribute(coll, UCOL_STRENGTH, UCOL_SECONDARY, &status); STRPOS_CHECK_STATUS(status, "Error setting collation strength"); usearch_reset(src); } if(offset != 0) { offset_pos = grapheme_get_haystack_offset(bi, offset); if(offset_pos == -1) { status = U_ILLEGAL_ARGUMENT_ERROR; STRPOS_CHECK_STATUS(status, "Invalid search offset"); } status = U_ZERO_ERROR; usearch_setOffset(src, offset_pos, &status); STRPOS_CHECK_STATUS(status, "Invalid search offset"); } if(last) { char_pos = usearch_last(src, &status); if(char_pos < offset_pos) { /* last one is beyound our start offset */ char_pos = USEARCH_DONE; } } else { char_pos = usearch_next(src, &status); } STRPOS_CHECK_STATUS(status, "Error looking up string"); if(char_pos != USEARCH_DONE && ubrk_isBoundary(bi, char_pos)) { ret_pos = grapheme_count_graphemes(bi, uhaystack,char_pos); if(puchar_pos) { *puchar_pos = char_pos; } } else { ret_pos = -1; } if (uhaystack) { efree( uhaystack ); } if (uneedle) { efree( uneedle ); } ubrk_close (bi); usearch_close (src); return ret_pos; }