/*
 * Consume up to n further matches from usearch, continuing from wherever the
 * searcher currently stands.
 *
 * For every match found, *l is moved to the offset just past that match.
 * When array is non-NULL, the text lying between the previous *l and the
 * start of the match is first handed to add_match().
 *
 * Returns TRUE once exactly n matches have been consumed. Otherwise (search
 * exhausted, *status in failure, or n negative on entry) returns FALSE after
 * handing the remaining text [*l, subject->len) to add_match() when array is
 * non-NULL.
 */
static UBool usearch_fwd_n(
    UStringSearch *usearch,
    const UString *subject,
    DArray *array, /* NULL to skip n matches */
    int32_t n,
    int32_t *l,
    UErrorCode *status
) {
    int32_t remaining = n;

    while (remaining > 0) {
        int32_t match_start, match_end;

        if (!U_SUCCESS(*status)) {
            break;
        }
        match_start = usearch_next(usearch, status);
        if (USEARCH_DONE == match_start) {
            break;
        }
        --remaining;
        if (NULL != array) {
            add_match(array, subject, *l, match_start);
        }
        match_end = match_start + usearch_getMatchedLength(usearch);
        *l = match_end;
    }

    if (0 != remaining) {
        /* Fewer than n matches: flush whatever follows the last one. */
        if (NULL != array) {
            add_match(array, subject, *l, subject->len);
        }
        return FALSE;
    }

    return TRUE;
}
/*
 * Reports whether lpSource ends with lpTarget under the collator obtained
 * from pSortHandle/options: the last collation match of lpTarget in lpSource
 * must end exactly at cwSourceLength.
 *
 * Return value is a "Win32 BOOL" (1 = true, 0 = false). Any ICU failure while
 * obtaining the collator or opening the searcher yields FALSE.
 */
extern "C" int32_t EndsWith(
    SortHandle* pSortHandle,
    const UChar* lpTarget,
    int32_t cwTargetLength,
    const UChar* lpSource,
    int32_t cwSourceLength,
    int32_t options)
{
    UErrorCode err = U_ZERO_ERROR;
    const UCollator* pColl = GetCollatorFromSortHandle(pSortHandle, options, &err);
    if (!U_SUCCESS(err))
    {
        return FALSE;
    }

    UStringSearch* pSearch = usearch_openFromCollator(
        lpTarget, cwTargetLength, lpSource, cwSourceLength, pColl, nullptr, &err);
    if (!U_SUCCESS(err))
    {
        return FALSE;
    }

    int32_t result = FALSE;
    const int32_t lastMatch = usearch_last(pSearch, &err);
    if (lastMatch != USEARCH_DONE)
    {
        const int32_t matchEnd = lastMatch + usearch_getMatchedLength(pSearch);
        if (matchEnd == cwSourceLength)
        {
            result = TRUE;
        }

        // TODO (dotnet/corefx#3467): We should do something similar to what
        // StartsWith does where we can ignore
        // some collation elements at the end of the string if they are zero.
    }

    usearch_close(pSearch);
    return result;
}
static engine_return_t engine_fixed_whole_line_match(error_t **error, void *data, const UString *subject) { FETCH_DATA(data, p, fixed_pattern_t); if (ustring_empty(p->pattern)) { return ustring_empty(subject) ? ENGINE_WHOLE_LINE_MATCH : ENGINE_NO_MATCH; } else if (NULL != p->usearch) { int32_t ret; UErrorCode status; status = U_ZERO_ERROR; usearch_setText(p->usearch, subject->ptr, subject->len, &status); if (U_FAILURE(status)) { icu_error_set(error, FATAL, status, "usearch_setText"); return ENGINE_FAILURE; } ret = usearch_first(p->usearch, &status); if (U_FAILURE(status)) { icu_error_set(error, FATAL, status, "usearch_first"); return ENGINE_FAILURE; } usearch_unbindText(p->usearch); return (ret != USEARCH_DONE && ((size_t) usearch_getMatchedLength(p->usearch)) == subject->len ? ENGINE_WHOLE_LINE_MATCH : ENGINE_NO_MATCH); } else { if (IS_CASE_INSENSITIVE(p->flags)) { return (0 == u_strcasecmp(p->pattern->ptr, subject->ptr, 0) ? ENGINE_WHOLE_LINE_MATCH : ENGINE_NO_MATCH); } else { return (0 == u_strcmp(p->pattern->ptr, subject->ptr) ? ENGINE_WHOLE_LINE_MATCH : ENGINE_NO_MATCH); } } }
// Advances m_searcher to its next match and stores the match in |result|.
// Returns true with result.start/result.length filled in when the match
// start lies inside [0, m_textLength); otherwise zeroes |result| and returns
// false.
bool TextSearcherICU::nextMatchResult(MatchResult& result)
{
    UErrorCode status = U_ZERO_ERROR;
    const int matchStart = usearch_next(m_searcher, &status);
    DCHECK_EQ(status, U_ZERO_ERROR);

    // TODO(iceman): It is possible to use |usearch_getText| function
    // to retrieve text length and not store it explicitly.
    const bool startsInsideText =
        matchStart >= 0 && static_cast<size_t>(matchStart) < m_textLength;

    if (startsInsideText) {
        result.start = static_cast<size_t>(matchStart);
        result.length = usearch_getMatchedLength(m_searcher);
        return true;
    }

    // Anything outside the text is expected to be the "no more matches" marker.
    DCHECK_EQ(matchStart, USEARCH_DONE);
    result.start = 0;
    result.length = 0;
    return false;
}
UBool findPattern() { UErrorCode status = U_ZERO_ERROR; int32_t offset = usearch_next(search, &status); if (offset == USEARCH_DONE) { fprintf(stdout, "Pattern not found in source\n"); } while (offset != USEARCH_DONE) { fprintf(stdout, "Pattern found at offset %d size %d\n", offset, usearch_getMatchedLength(search)); offset = usearch_next(search, &status); } if (U_FAILURE(status)) { fprintf(stderr, "Error in searching for pattern %d\n", status); return FALSE; } fprintf(stdout, "End of search\n"); return TRUE; }
// Collator.find {{{ static PyObject * icu_Collator_find(icu_Collator *self, PyObject *args, PyObject *kwargs) { PyObject *a_, *b_; int32_t asz, bsz; UChar *a, *b; wchar_t *aw, *bw; UErrorCode status = U_ZERO_ERROR; UStringSearch *search = NULL; int32_t pos = -1, length = -1; if (!PyArg_ParseTuple(args, "UU", &a_, &b_)) return NULL; asz = (int32_t)PyUnicode_GetSize(a_); bsz = (int32_t)PyUnicode_GetSize(b_); a = (UChar*)calloc(asz*4 + 2, sizeof(UChar)); b = (UChar*)calloc(bsz*4 + 2, sizeof(UChar)); aw = (wchar_t*)calloc(asz*4 + 2, sizeof(wchar_t)); bw = (wchar_t*)calloc(bsz*4 + 2, sizeof(wchar_t)); if (a == NULL || b == NULL || aw == NULL || bw == NULL) return PyErr_NoMemory(); PyUnicode_AsWideChar((PyUnicodeObject*)a_, aw, asz*4+1); PyUnicode_AsWideChar((PyUnicodeObject*)b_, bw, bsz*4+1); u_strFromWCS(a, asz*4 + 1, NULL, aw, -1, &status); u_strFromWCS(b, bsz*4 + 1, NULL, bw, -1, &status); if (U_SUCCESS(status)) { search = usearch_openFromCollator(a, -1, b, -1, self->collator, NULL, &status); if (U_SUCCESS(status)) { pos = usearch_first(search, &status); if (pos != USEARCH_DONE) length = usearch_getMatchedLength(search); else pos = -1; } if (search != NULL) usearch_close(search); } free(a); free(b); free(aw); free(bw); return Py_BuildValue("ii", pos, length); } // }}}
// Collator.find {{{ static PyObject * icu_Collator_find(icu_Collator *self, PyObject *args, PyObject *kwargs) { #if PY_VERSION_HEX >= 0x03030000 #error Not implemented for python >= 3.3 #endif PyObject *a_ = NULL, *b_ = NULL; UChar *a = NULL, *b = NULL; int32_t asz = 0, bsz = 0, pos = -1, length = -1; UErrorCode status = U_ZERO_ERROR; UStringSearch *search = NULL; if (!PyArg_ParseTuple(args, "OO", &a_, &b_)) return NULL; a = python_to_icu(a_, &asz, 1); if (a == NULL) goto end; b = python_to_icu(b_, &bsz, 1); if (b == NULL) goto end; search = usearch_openFromCollator(a, asz, b, bsz, self->collator, NULL, &status); if (U_SUCCESS(status)) { pos = usearch_first(search, &status); if (pos != USEARCH_DONE) { length = usearch_getMatchedLength(search); #ifdef Py_UNICODE_WIDE // We have to return number of unicode characters since the string // could contain surrogate pairs which are represented as a single // character in python wide builds length = u_countChar32(b + pos, length); pos = u_countChar32(b, pos); #endif } else pos = -1; } end: if (search != NULL) usearch_close(search); if (a != NULL) free(a); if (b != NULL) free(b); return (PyErr_Occurred()) ? NULL : Py_BuildValue("ii", pos, length); } // }}}
Boolean CFStringFindWithOptionsAndLocale (CFStringRef str, CFStringRef stringToFind, CFRange rangeToSearch, CFStringCompareFlags searchOptions, CFLocaleRef locale, CFRange *result) { UniChar *pattern; UniChar *text; CFIndex patternLength; CFIndex textLength; CFIndex start; CFIndex end; CFAllocatorRef alloc; UCollator *ucol; UStringSearch *usrch; UErrorCode err = U_ZERO_ERROR; if (rangeToSearch.length == 0) return false; alloc = CFAllocatorGetDefault (); textLength = CFStringGetLength (stringToFind); if (textLength == 0) return false; patternLength = rangeToSearch.length; pattern = CFAllocatorAllocate (alloc, patternLength * sizeof(UniChar), 0); CFStringGetCharacters (str, rangeToSearch, pattern); text = CFAllocatorAllocate (alloc, textLength * sizeof(UniChar), 0); CFStringGetCharacters (stringToFind, CFRangeMake(0, textLength), text); ucol = CFStringICUCollatorOpen (searchOptions, locale); usrch = usearch_openFromCollator (text, textLength, pattern, patternLength, ucol, NULL, &err); if (U_FAILURE(err)) return false; /* FIXME: need to handle kCFCompareAnchored */ if (searchOptions & kCFCompareBackwards) { start = usearch_last (usrch, &err); } else { start = usearch_first (usrch, &err); } if (start == USEARCH_DONE) { CFAllocatorDeallocate (alloc, pattern); CFAllocatorDeallocate (alloc, text); return false; } end = usearch_getMatchedLength (usrch); usearch_close (usrch); CFStringICUCollatorClose (ucol); if (result) *result = CFRangeMake (start + rangeToSearch.location, end); CFAllocatorDeallocate (alloc, pattern); CFAllocatorDeallocate (alloc, text); return true; }
inline size_t SearchBuffer::search(size_t& start) { size_t size = m_buffer.size(); if (m_atBreak) { if (!size) return 0; } else { if (size != m_buffer.capacity()) return 0; } UStringSearch* searcher = blink::searcher(); UErrorCode status = U_ZERO_ERROR; usearch_setText(searcher, m_buffer.data(), size, &status); ASSERT(status == U_ZERO_ERROR); usearch_setOffset(searcher, m_prefixLength, &status); ASSERT(status == U_ZERO_ERROR); int matchStart = usearch_next(searcher, &status); ASSERT(status == U_ZERO_ERROR); nextMatch: if (!(matchStart >= 0 && static_cast<size_t>(matchStart) < size)) { ASSERT(matchStart == USEARCH_DONE); return 0; } // Matches that start in the overlap area are only tentative. // The same match may appear later, matching more characters, // possibly including a combining character that's not yet in the buffer. if (!m_atBreak && static_cast<size_t>(matchStart) >= size - m_overlap) { size_t overlap = m_overlap; if (m_options & AtWordStarts) { // Ensure that there is sufficient context before matchStart the next time around for // determining if it is at a word boundary. int wordBoundaryContextStart = matchStart; U16_BACK_1(m_buffer.data(), 0, wordBoundaryContextStart); wordBoundaryContextStart = startOfLastWordBoundaryContext(m_buffer.data(), wordBoundaryContextStart); overlap = std::min(size - 1, std::max(overlap, size - wordBoundaryContextStart)); } memcpy(m_buffer.data(), m_buffer.data() + size - overlap, overlap * sizeof(UChar)); m_prefixLength -= std::min(m_prefixLength, size - overlap); m_buffer.shrink(overlap); return 0; } size_t matchedLength = usearch_getMatchedLength(searcher); ASSERT_WITH_SECURITY_IMPLICATION(matchStart + matchedLength <= size); // If this match is "bad", move on to the next match. 
if (isBadMatch(m_buffer.data() + matchStart, matchedLength) || ((m_options & AtWordStarts) && !isWordStartMatch(matchStart, matchedLength))) { matchStart = usearch_next(searcher, &status); ASSERT(status == U_ZERO_ERROR); goto nextMatch; } size_t newSize = size - (matchStart + 1); memmove(m_buffer.data(), m_buffer.data() + matchStart + 1, newSize * sizeof(UChar)); m_prefixLength -= std::min<size_t>(m_prefixLength, matchStart + 1); m_buffer.shrink(newSize); start = size - matchStart; return matchedLength; }
static UBool engine_fixed_split(error_t **error, void *data, const UString *subject, DArray *array, interval_list_t *intervals) { UErrorCode status; int32_t l, lastU; dlist_element_t *el; FETCH_DATA(data, p, fixed_pattern_t); lastU = l = 0; status = U_ZERO_ERROR; if (NULL != p->usearch) { usearch_setText(p->usearch, subject->ptr, subject->len, &status); if (U_FAILURE(status)) { icu_error_set(error, FATAL, status, "usearch_setText"); return FALSE; } /* <X> */ if (NULL == intervals) { int32_t u; while (U_SUCCESS(status) && USEARCH_DONE != (u = usearch_next(p->usearch, &status))) { add_match(array, subject, l, u); l = u += usearch_getMatchedLength(p->usearch); } add_match(array, subject, l, subject->len); } else { /* </X> */ for (el = intervals->head; NULL != el; el = el->next) { FETCH_DATA(el->data, i, interval_t); if (i->lower_limit > 0) { if (!usearch_fwd_n(p->usearch, subject, NULL, i->lower_limit - lastU, &l, &status)) { break; } } if (!usearch_fwd_n(p->usearch, subject, array, i->upper_limit - i->lower_limit, &l, &status)) { break; } lastU = i->upper_limit; } /* <X> */ } /* </X> */ usearch_unbindText(p->usearch); if (U_FAILURE(status)) { icu_error_set(error, FATAL, status, "usearch_next"); return FALSE; } } else { if (NULL != p->ubrk) { ubrk_setText(p->ubrk, subject->ptr, subject->len, &status); if (U_FAILURE(status)) { icu_error_set(error, FATAL, status, "ubrk_setText"); return FALSE; } } /* <X> */ if (NULL == intervals) { UChar *m; int32_t u; u = 0; while (NULL != (m = u_strFindFirst(subject->ptr + u, subject->len - u, p->pattern->ptr, p->pattern->len))) { u = m - subject->ptr; if (NULL == p->ubrk || (ubrk_isBoundary(p->ubrk, u) && ubrk_isBoundary(p->ubrk, u + p->pattern->len))) { add_match(array, subject, l, u); } l = u = u + p->pattern->len; } add_match(array, subject, l, subject->len); } else { /* </X> */ for (el = intervals->head; NULL != el; el = el->next) { FETCH_DATA(el->data, i, interval_t); if (i->lower_limit > 0) { if (!binary_fwd_n(p->ubrk, 
p->pattern, subject, NULL, i->lower_limit - lastU, &l)) { break; } } if (!binary_fwd_n(p->ubrk, p->pattern, subject, array, i->upper_limit - i->lower_limit, &l)) { break; } lastU = i->upper_limit; } /* <X> */ } /* </X> */ ubrk_unbindText(p->ubrk); } return TRUE; }
static engine_return_t engine_fixed_match_all(error_t **error, void *data, const UString *subject, interval_list_t *intervals) { int32_t matches; UErrorCode status; FETCH_DATA(data, p, fixed_pattern_t); matches = 0; status = U_ZERO_ERROR; if (ustring_empty(p->pattern)) { if (IS_WORD_BOUNDED(p->flags)) { if (ustring_empty(subject)) { return ENGINE_MATCH_FOUND; } else { int32_t l, u, lastState, state; ubrk_setText(p->ubrk, subject->ptr, subject->len, &status); if (U_FAILURE(status)) { icu_error_set(error, FATAL, status, "ubrk_setText"); return ENGINE_FAILURE; } if (UBRK_DONE != (l = ubrk_first(p->ubrk))) { lastState = ubrk_getRuleStatus(p->ubrk); while (UBRK_DONE != (u = ubrk_next(p->ubrk))) { state = ubrk_getRuleStatus(p->ubrk); if (UBRK_WORD_NONE == lastState && lastState == state) { return ENGINE_MATCH_FOUND; } lastState = state; l = u; } } return ENGINE_NO_MATCH; } } else { return ENGINE_MATCH_FOUND; } } else if (NULL != p->usearch) { int32_t l, u; if (subject->len > 0) { usearch_setText(p->usearch, subject->ptr, subject->len, &status); if (U_FAILURE(status)) { icu_error_set(error, FATAL, status, "usearch_setText"); return ENGINE_FAILURE; } for (l = usearch_first(p->usearch, &status); U_SUCCESS(status) && USEARCH_DONE != l; l = usearch_next(p->usearch, &status)) { matches++; u = l + usearch_getMatchedLength(p->usearch); if (interval_list_add(intervals, subject->len, l, u)) { return ENGINE_WHOLE_LINE_MATCH; } } if (U_FAILURE(status)) { icu_error_set(error, FATAL, status, "usearch_[first|next]"); return ENGINE_FAILURE; } usearch_unbindText(p->usearch); return (matches ? 
ENGINE_MATCH_FOUND : ENGINE_NO_MATCH); } else { return ENGINE_NO_MATCH; } } else { UChar *m; int32_t pos; pos = 0; if (NULL != p->ubrk) { ubrk_setText(p->ubrk, subject->ptr, subject->len, &status); if (U_FAILURE(status)) { icu_error_set(error, FATAL, status, "ubrk_setText"); return ENGINE_FAILURE; } } while (NULL != (m = u_strFindFirst(subject->ptr + pos, subject->len - pos, p->pattern->ptr, p->pattern->len))) { pos = m - subject->ptr; if (NULL == p->ubrk || (ubrk_isBoundary(p->ubrk, pos) && ubrk_isBoundary(p->ubrk, pos + p->pattern->len))) { matches++; if (interval_list_add(intervals, subject->len, pos, pos + p->pattern->len)) { return ENGINE_WHOLE_LINE_MATCH; } } pos += p->pattern->len; } ubrk_unbindText(p->ubrk); return (matches ? ENGINE_MATCH_FOUND : ENGINE_NO_MATCH); } }