// BreakIterator.index {{{ static PyObject * icu_BreakIterator_index(icu_BreakIterator *self, PyObject *token) { #if PY_VERSION_HEX >= 0x03030000 #error Not implemented for python >= 3.3 #endif UChar *buf = NULL, *needle = NULL; int32_t word_start = 0, p = 0, sz = 0, ans = -1, leading_hyphen = 0, trailing_hyphen = 0; buf = python_to_icu(token, &sz); if (buf == NULL) return NULL; if (sz < 1) goto end; needle = buf; if (sz > 1 && IS_HYPHEN_CHAR(buf[0])) { needle = buf + 1; leading_hyphen = 1; sz -= 1; } if (sz > 1 && IS_HYPHEN_CHAR(buf[sz-1])) trailing_hyphen = 1; Py_BEGIN_ALLOW_THREADS; p = ubrk_first(self->break_iterator); while (p != UBRK_DONE) { word_start = p; p = ubrk_next(self->break_iterator); if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) continue; // We are not at the start of a word if (self->text_len >= word_start + sz && memcmp(self->text + word_start, needle, sz * sizeof(UChar)) == 0) { if (word_start > 0 && ( (leading_hyphen && !IS_HYPHEN_CHAR(self->text[word_start-1])) || (!leading_hyphen && IS_HYPHEN_CHAR(self->text[word_start-1])) )) continue; if (!trailing_hyphen && IS_HYPHEN_CHAR(self->text[word_start + sz])) continue; if (p == UBRK_DONE || self->text_len <= word_start + sz) { ans = word_start; break; } if ( // Check that the found word is followed by a word boundary ubrk_isBoundary(self->break_iterator, word_start + sz) && // If there is a leading hyphen check that the leading // hyphen is preceded by a word boundary (!leading_hyphen || (word_start > 1 && ubrk_isBoundary(self->break_iterator, word_start - 2))) && // Check that there is a word boundary *after* the trailing // hyphen. We cannot rely on ubrk_isBoundary() as that // always returns true because of the trailing hyphen. 
(!trailing_hyphen || ubrk_following(self->break_iterator, word_start + sz) == UBRK_DONE || ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) ) { ans = word_start; break; } if (p != UBRK_DONE) ubrk_isBoundary(self->break_iterator, p); // Reset the iterator to its position before the call to ubrk_isBoundary() } } if (leading_hyphen && ans > -1) ans -= 1; #ifdef Py_UNICODE_WIDE if (ans > 0) ans = u_countChar32(self->text, ans); #endif Py_END_ALLOW_THREADS; end: free(buf); return Py_BuildValue("l", (long)ans); } // }}}
// Starting at currentOffset, steps over the run of consecutive positions that
// the word break iterator reports as boundaries and returns the resulting
// offset. Returns currentOffset unchanged when no iterator is available.
static unsigned nextWordOffset(StringView text, unsigned currentOffset)
{
    // FIXME: avoid creating textIterator object here, it could be passed as a parameter.
    //        ubrk_isBoundary() leaves the iterator pointing to the first boundary position at
    //        or after "offset" (ubrk_isBoundary side effect).
    //        For many word separators, the method doesn't properly determine the boundaries
    //        without resetting the iterator.
    UBreakIterator* iterator = wordBreakIterator(text);
    if (!iterator)
        return currentOffset;

    unsigned candidate = currentOffset;
    for (; candidate < text.length(); ++candidate) {
        if (!ubrk_isBoundary(iterator, candidate))
            break;
    }

    // Do not treat the word's boundary as a separator.
    if (currentOffset == 0 && candidate == 1)
        return currentOffset;

    // Omit multiple separators.
    const unsigned advanced = candidate - currentOffset;
    return advanced > 1 ? candidate - 1 : candidate;
}
/*
 * Advance through up to n accepted occurrences of pattern in subject,
 * starting at offset *r. When ubrk is non-NULL an occurrence is accepted only
 * if both of its ends are boundaries. For every accepted occurrence the text
 * from *r up to the occurrence is handed to add_match() (unless array is
 * NULL) and *r moves just past the occurrence.
 *
 * Returns TRUE with *r set to the scan position when n reached zero.
 * Otherwise the remainder of subject is added (if array is non-NULL), *r is
 * set to USEARCH_DONE and FALSE is returned.
 */
UBool binary_fwd_n(
    UBreakIterator *ubrk,
    const UString *pattern, const UString *subject,
    DArray *array, /* NULL to skip n matches */
    int32_t n,
    int32_t *r
) {
    UChar *hit;
    int32_t cursor;

    cursor = *r;
    for (;;) {
        if (n <= 0) {
            break;
        }
        hit = u_strFindFirst(subject->ptr + cursor, subject->len - cursor, pattern->ptr, pattern->len);
        if (NULL == hit) {
            break;
        }
        cursor = hit - subject->ptr;
        if (NULL == ubrk || (ubrk_isBoundary(ubrk, cursor) && ubrk_isBoundary(ubrk, cursor + pattern->len))) {
            --n;
            if (NULL != array) {
                add_match(array, subject, *r, cursor);
            }
            /* TODO: the end-of-occurrence offset is computed again right below */
            *r = cursor + pattern->len;
        }
        cursor += pattern->len;
    }

    if (0 != n) {
        /* ran out of occurrences before n were consumed */
        if (NULL != array) {
            add_match(array, subject, *r, subject->len);
        }
        *r = USEARCH_DONE;
        return FALSE;
    }

    *r = cursor;
    return TRUE;
}
Vector<TextCheckingResult> TextChecker::checkTextOfParagraph(int64_t spellDocumentTag, StringView text, int32_t insertionPoint, uint64_t checkingTypes, bool) { UNUSED_PARAM(insertionPoint); Vector<TextCheckingResult> paragraphCheckingResult; #if ENABLE(SPELLCHECK) if (checkingTypes & TextCheckingTypeSpelling) { UBreakIterator* textIterator = wordBreakIterator(text); if (!textIterator) return paragraphCheckingResult; // Omit the word separators at the beginning/end of the text to don't unnecessarily // involve the client to check spelling for them. unsigned offset = nextWordOffset(text, 0); unsigned lengthStrip = text.length(); while (lengthStrip > 0 && ubrk_isBoundary(textIterator, lengthStrip - 1)) --lengthStrip; while (offset < lengthStrip) { int32_t misspellingLocation = -1; int32_t misspellingLength = 0; checkSpellingOfString(spellDocumentTag, text.substring(offset, lengthStrip - offset), misspellingLocation, misspellingLength); if (!misspellingLength) break; TextCheckingResult misspellingResult; misspellingResult.type = TextCheckingTypeSpelling; misspellingResult.location = offset + misspellingLocation; misspellingResult.length = misspellingLength; paragraphCheckingResult.append(misspellingResult); offset += misspellingLocation + misspellingLength; // Generally, we end up checking at the word separator, move to the adjacent word. offset = nextWordOffset(text.substring(0, lengthStrip), offset); } } #else UNUSED_PARAM(spellDocumentTag); UNUSED_PARAM(text); UNUSED_PARAM(insertionPoint); UNUSED_PARAM(checkingTypes); #endif return paragraphCheckingResult; }
/* {{{ grapheme_memnstr_grapheme: find needle in haystack using grapheme boundaries
 *
 * Scans [haystack, end) for needle (needle_len UChars). A candidate is
 * accepted only when the position right after it is reported as a boundary
 * by bi. Returns grapheme_count_graphemes() for the text before the match,
 * or -1 when nothing is found, the needle is empty, or the iterator cannot
 * be bound to the haystack.
 */
inline int32_t
grapheme_memnstr_grapheme(UBreakIterator *bi, UChar *haystack, UChar *needle, int32_t needle_len, UChar *end)
{
	UChar *p = haystack;
	UChar ne;
	UErrorCode status;
	int32_t grapheme_offset;

	/* needle[needle_len-1] below would read before the buffer for an empty needle */
	if (needle_len < 1) {
		return -1;
	}
	ne = needle[needle_len-1];

	/* last position at which a full needle still fits */
	end -= needle_len;

	while (p <= end) {

		if ((p = u_memchr(p, *needle, (end-p+1))) && ne == p[needle_len-1]) {

			if (!u_memcmp(needle, p, needle_len - 1)) {  /* needle_len - 1 works because if needle_len is 1, we've already tested the char */

				/* does the grapheme end here? */

				status = U_ZERO_ERROR;
				ubrk_setText (bi, haystack, (end - haystack) + needle_len, &status);
				if ( U_FAILURE(status) ) {
					/* the iterator is not bound to the haystack; its answers would be meaningless */
					return -1;
				}

				if ( ubrk_isBoundary (bi, (p - haystack) + needle_len) ) {

					/* found it, get grapheme count offset */
					grapheme_offset = grapheme_count_graphemes(bi, haystack, (p - haystack));

					return grapheme_offset;
				}
			}
		}

		/* u_memchr found no further occurrence of the first needle char */
		if (p == NULL) {
			return -1;
		}

		p++;
	}

	return -1;
}
// JNI entry point: asks the break iterator obtained from `address` whether
// `offset` is a boundary.
static jboolean isBoundaryImpl(JNIEnv*, jclass, jint address, jint offset) {
    const jboolean atBoundary = ubrk_isBoundary(breakIterator(address), offset);
    return atBoundary;
}
/* {{{ grapheme_strrpos_utf16 - strrpos using utf16
 *
 * Converts haystack and needle from UTF-8 to UTF-16 (case-folding both when
 * f_ignore_case is set), then walks the break iterator backwards from the end
 * of the haystack looking for the needle. A candidate counts only if the
 * position right after it is a boundary. Returns the result of
 * grapheme_count_graphemes() for the match position, or -1 on error or when
 * nothing is found.
 *
 * NOTE(review): offset is only validated through grapheme_get_haystack_offset();
 * the backwards scan itself always starts from the end of the haystack and is
 * not limited by offset - confirm this is intended.
 */
int
grapheme_strrpos_utf16(unsigned char *haystack, int32_t haystack_len, unsigned char*needle, int32_t needle_len, int32_t offset, int f_ignore_case TSRMLS_DC)
{
	UChar *uhaystack, *puhaystack, *uhaystack_end, *uneedle;
	int32_t uhaystack_len, uneedle_len;
	UErrorCode status;
	unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
	UBreakIterator* bi = NULL;
	int ret_pos, pos;

	/* convert the strings to UTF-16. */
	uhaystack = NULL;
	uhaystack_len = 0;
	status = U_ZERO_ERROR;
	intl_convert_utf8_to_utf16(&uhaystack, &uhaystack_len, (char *) haystack, haystack_len, &status );

	if ( U_FAILURE( status ) ) {
		/* Set global error code. */
		intl_error_set_code( NULL, status TSRMLS_CC );

		/* Set error messages. */
		intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
		efree( uhaystack );
		return -1;
	}

	if ( f_ignore_case ) {
		grapheme_intl_case_fold(&uhaystack, &uhaystack, &uhaystack_len, &status );
	}

	/* get a pointer to the haystack taking into account the offset */
	bi = NULL;
	status = U_ZERO_ERROR;
	bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC );

	if ( U_FAILURE( status ) ) {
		/* without a usable iterator none of the boundary checks below can work */
		intl_error_set_code( NULL, status TSRMLS_CC );
		intl_error_set_custom_msg( NULL, "Error creating break iterator", 0 TSRMLS_CC );
		efree( uhaystack );
		ubrk_close (bi);
		return -1;
	}

	puhaystack = grapheme_get_haystack_offset(bi, uhaystack, uhaystack_len, offset);

	if ( NULL == puhaystack ) {
		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
		efree( uhaystack );
		ubrk_close (bi);
		return -1;
	}

	uneedle = NULL;
	uneedle_len = 0;
	status = U_ZERO_ERROR;
	intl_convert_utf8_to_utf16(&uneedle, &uneedle_len, (char *) needle, needle_len, &status );

	if ( U_FAILURE( status ) ) {
		/* Set global error code. */
		intl_error_set_code( NULL, status TSRMLS_CC );

		/* Set error messages. */
		intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
		efree( uhaystack );
		efree( uneedle );
		ubrk_close (bi);
		return -1;
	}

	if ( f_ignore_case ) {
		grapheme_intl_case_fold(&uneedle, &uneedle, &uneedle_len, &status );
	}

	ret_pos = -1;   /* -1 represents 'not found' */

	/* back up until there's needle_len characters to compare */

	uhaystack_end = uhaystack + uhaystack_len;
	pos = ubrk_last(bi);
	puhaystack = uhaystack + pos;

	while ( uhaystack_end - puhaystack < uneedle_len ) {

		pos = ubrk_previous(bi);

		if ( UBRK_DONE == pos ) {
			break;
		}

		puhaystack = uhaystack + pos;
	}

	/* is there enough haystack left to hold the needle? */
	if ( ( uhaystack_end - puhaystack ) < uneedle_len ) {
		/* not enough, not found */
		goto exit;
	}

	while ( UBRK_DONE != pos ) {

		if (!u_memcmp(uneedle, puhaystack, uneedle_len)) {  /* needle_len - 1 in zend memnstr? */

			/* does the grapheme in the haystack end at the same place as the last grapheme in the needle? */

			if ( ubrk_isBoundary(bi, pos + uneedle_len) ) {

				/* found it, get grapheme count offset */
				ret_pos = grapheme_count_graphemes(bi, uhaystack, pos);
				break;
			}

			/* set position back */
			ubrk_isBoundary(bi, pos);
		}

		pos = ubrk_previous(bi);
		/* UBRK_DONE is not an offset, so do not turn it into a pointer */
		if ( UBRK_DONE != pos ) {
			puhaystack = uhaystack + pos;
		}
	}

exit:
	efree( uhaystack );
	efree( uneedle );
	ubrk_close (bi);

	return ret_pos;
}
// Reports whether `position` is a boundary for the ICU iterator that
// `iterator` is cast to.
bool isTextBreak(TextBreakIterator* iterator, int position)
{
    UBreakIterator* icuIterator = reinterpret_cast<UBreakIterator*>(iterator);
    return ubrk_isBoundary(icuIterator, position);
}
/* {{{ grapheme_strpos_utf16 - strpos/strrpos using utf16
 *
 * Converts haystack and needle to UTF-16 and searches with a UStringSearch
 * that is handed the break iterator bi.
 *
 *   offset        when non-zero, translated by grapheme_get_haystack_offset()
 *                 into offset_pos and used as the search start
 *   puchar_pos    optional out parameter; set to -1 up front and to the
 *                 UTF-16 position of the match when one is found
 *   f_ignore_case when non-zero the collator strength is set to UCOL_SECONDARY
 *   last          when non-zero usearch_last() is used instead of usearch_next()
 *
 * Returns grapheme_count_graphemes() for the match position, or -1 when no
 * match on a boundary was found.
 *
 * NOTE(review): STRPOS_CHECK_STATUS is defined outside this block; presumably
 * it records the error, releases uhaystack/uneedle/bi/src and returns -
 * confirm against its definition.
 */
int32_t grapheme_strpos_utf16(char *haystack, size_t haystack_len, char *needle, size_t needle_len, int32_t offset, int32_t *puchar_pos, int f_ignore_case, int last)
{
	UChar *uhaystack = NULL, *uneedle = NULL;
	int32_t uhaystack_len = 0, uneedle_len = 0, char_pos, ret_pos, offset_pos = 0;
	unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
	UBreakIterator* bi = NULL;
	UErrorCode status;
	UStringSearch* src = NULL;
	UCollator *coll;

	/* report "no position" unless a match is found below */
	if(puchar_pos) {
		*puchar_pos = -1;
	}
	/* convert the strings to UTF-16. */

	status = U_ZERO_ERROR;
	intl_convert_utf8_to_utf16(&uhaystack, &uhaystack_len, haystack, haystack_len, &status );
	STRPOS_CHECK_STATUS(status, "Error converting input string to UTF-16");

	status = U_ZERO_ERROR;
	intl_convert_utf8_to_utf16(&uneedle, &uneedle_len, needle, needle_len, &status );
	STRPOS_CHECK_STATUS(status, "Error converting needle string to UTF-16");

	/* get a break iterator and bind it to the converted haystack */
	status = U_ZERO_ERROR;
	bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status );
	STRPOS_CHECK_STATUS(status, "Failed to get iterator");
	status = U_ZERO_ERROR;
	ubrk_setText(bi, uhaystack, uhaystack_len, &status);
	STRPOS_CHECK_STATUS(status, "Failed to set up iterator");

	/* the search object is opened with an empty locale string and is handed bi */
	status = U_ZERO_ERROR;
	src = usearch_open(uneedle, uneedle_len, uhaystack, uhaystack_len, "", bi, &status);
	STRPOS_CHECK_STATUS(status, "Error creating search object");

	if(f_ignore_case) {
		/* lower the collation strength for case-insensitive search, then reset the search */
		coll = usearch_getCollator(src);
		status = U_ZERO_ERROR;
		ucol_setAttribute(coll, UCOL_STRENGTH, UCOL_SECONDARY, &status);
		STRPOS_CHECK_STATUS(status, "Error setting collation strength");
		usearch_reset(src);
	}

	if(offset != 0) {
		/* translate the caller's offset; -1 means it could not be mapped */
		offset_pos = grapheme_get_haystack_offset(bi, offset);
		if(offset_pos == -1) {
			status = U_ILLEGAL_ARGUMENT_ERROR;
			STRPOS_CHECK_STATUS(status, "Invalid search offset");
		}
		status = U_ZERO_ERROR;
		usearch_setOffset(src, offset_pos, &status);
		STRPOS_CHECK_STATUS(status, "Invalid search offset");
	}


	if(last) {
		char_pos = usearch_last(src, &status);
		if(char_pos < offset_pos) {
			/* the last match lies before our start offset, so it does not count */
			char_pos = USEARCH_DONE;
		}
	} else {
		char_pos = usearch_next(src, &status);
	}
	STRPOS_CHECK_STATUS(status, "Error looking up string");
	/* a match is only reported when it starts on a boundary of bi */
	if(char_pos != USEARCH_DONE && ubrk_isBoundary(bi, char_pos)) {
		ret_pos = grapheme_count_graphemes(bi, uhaystack,char_pos);
		if(puchar_pos) {
			*puchar_pos = char_pos;
		}
	} else {
		ret_pos = -1;
	}

	/* release the converted strings, the iterator and the search object */
	if (uhaystack) {
		efree( uhaystack );
	}
	if (uneedle) {
		efree( uneedle );
	}
	ubrk_close (bi);
	usearch_close (src);

	return ret_pos;
}
static void TestBreakIteratorCAPI() { UErrorCode status = U_ZERO_ERROR; UBreakIterator *word, *sentence, *line, *character, *b, *bogus; int32_t start,pos,end,to; int32_t i; int32_t count = 0; UChar text[50]; /* Note: the adjacent "" are concatenating strings, not adding a \" to the string, which is probably what whoever wrote this intended. Don't fix, because it would throw off the hard coded break positions in the following tests. */ u_uastrcpy(text, "He's from Africa. ""Mr. Livingston, I presume?"" Yeah"); /*test ubrk_open()*/ log_verbose("\nTesting BreakIterator open functions\n"); /* Use french for fun */ word = ubrk_open(UBRK_WORD, "en_US", text, u_strlen(text), &status); if(status == U_FILE_ACCESS_ERROR) { log_data_err("Check your data - it doesn't seem to be around\n"); return; } else if(U_FAILURE(status)){ log_err_status(status, "FAIL: Error in ubrk_open() for word breakiterator: %s\n", myErrorName(status)); } else{ log_verbose("PASS: Successfully opened word breakiterator\n"); } sentence = ubrk_open(UBRK_SENTENCE, "en_US", text, u_strlen(text), &status); if(U_FAILURE(status)){ log_err_status(status, "FAIL: Error in ubrk_open() for sentence breakiterator: %s\n", myErrorName(status)); return; } else{ log_verbose("PASS: Successfully opened sentence breakiterator\n"); } line = ubrk_open(UBRK_LINE, "en_US", text, u_strlen(text), &status); if(U_FAILURE(status)){ log_err("FAIL: Error in ubrk_open() for line breakiterator: %s\n", myErrorName(status)); return; } else{ log_verbose("PASS: Successfully opened line breakiterator\n"); } character = ubrk_open(UBRK_CHARACTER, "en_US", text, u_strlen(text), &status); if(U_FAILURE(status)){ log_err("FAIL: Error in ubrk_open() for character breakiterator: %s\n", myErrorName(status)); return; } else{ log_verbose("PASS: Successfully opened character breakiterator\n"); } /*trying to open an illegal iterator*/ bogus = ubrk_open((UBreakIteratorType)5, "en_US", text, u_strlen(text), &status); if(U_SUCCESS(status)){ log_err("FAIL: 
Error in ubrk_open() for BOGUS breakiterator. Expected U_ILLEGAL_ARGUMENT_ERROR\n"); } if(U_FAILURE(status)){ if(status != U_ILLEGAL_ARGUMENT_ERROR){ log_err("FAIL: Error in ubrk_open() for BOGUS breakiterator. Expected U_ILLEGAL_ARGUMENT_ERROR\n Got %s\n", myErrorName(status)); } } status=U_ZERO_ERROR; /* ======= Test ubrk_countAvialable() and ubrk_getAvialable() */ log_verbose("\nTesting ubrk_countAvailable() and ubrk_getAvailable()\n"); count=ubrk_countAvailable(); /* use something sensible w/o hardcoding the count */ if(count < 0){ log_err("FAIL: Error in ubrk_countAvialable() returned %d\n", count); } else{ log_verbose("PASS: ubrk_countAvialable() successful returned %d\n", count); } for(i=0;i<count;i++) { log_verbose("%s\n", ubrk_getAvailable(i)); if (ubrk_getAvailable(i) == 0) log_err("No locale for which breakiterator is applicable\n"); else log_verbose("A locale %s for which breakiterator is applicable\n",ubrk_getAvailable(i)); } /*========Test ubrk_first(), ubrk_last()...... and other functions*/ log_verbose("\nTesting the functions for word\n"); start = ubrk_first(word); if(start!=0) log_err("error ubrk_start(word) did not return 0\n"); log_verbose("first (word = %d\n", (int32_t)start); pos=ubrk_next(word); if(pos!=4) log_err("error ubrk_next(word) did not return 4\n"); log_verbose("next (word = %d\n", (int32_t)pos); pos=ubrk_following(word, 4); if(pos!=5) log_err("error ubrl_following(word,4) did not return 6\n"); log_verbose("next (word = %d\n", (int32_t)pos); end=ubrk_last(word); if(end!=49) log_err("error ubrk_last(word) did not return 49\n"); log_verbose("last (word = %d\n", (int32_t)end); pos=ubrk_previous(word); log_verbose("%d %d\n", end, pos); pos=ubrk_previous(word); log_verbose("%d \n", pos); if (ubrk_isBoundary(word, 2) != FALSE) { log_err("error ubrk_isBoundary(word, 2) did not return FALSE\n"); } pos=ubrk_current(word); if (pos != 4) { log_err("error ubrk_current() != 4 after ubrk_isBoundary(word, 2)\n"); } if (ubrk_isBoundary(word, 4) != 
TRUE) { log_err("error ubrk_isBoundary(word, 4) did not return TRUE\n"); } log_verbose("\nTesting the functions for character\n"); ubrk_first(character); pos = ubrk_following(character, 5); if(pos!=6) log_err("error ubrk_following(character,5) did not return 6\n"); log_verbose("Following (character,5) = %d\n", (int32_t)pos); pos=ubrk_following(character, 18); if(pos!=19) log_err("error ubrk_following(character,18) did not return 19\n"); log_verbose("Followingcharacter,18) = %d\n", (int32_t)pos); pos=ubrk_preceding(character, 22); if(pos!=21) log_err("error ubrk_preceding(character,22) did not return 21\n"); log_verbose("preceding(character,22) = %d\n", (int32_t)pos); log_verbose("\nTesting the functions for line\n"); pos=ubrk_first(line); if(pos != 0) log_err("error ubrk_first(line) returned %d, expected 0\n", (int32_t)pos); pos = ubrk_next(line); pos=ubrk_following(line, 18); if(pos!=22) log_err("error ubrk_following(line) did not return 22\n"); log_verbose("following (line) = %d\n", (int32_t)pos); log_verbose("\nTesting the functions for sentence\n"); ubrk_first(sentence); pos = ubrk_current(sentence); log_verbose("Current(sentence) = %d\n", (int32_t)pos); pos = ubrk_last(sentence); if(pos!=49) log_err("error ubrk_last for sentence did not return 49\n"); log_verbose("Last (sentence) = %d\n", (int32_t)pos); ubrk_first(sentence); to = ubrk_following( sentence, 0 ); if (to == 0) log_err("ubrk_following returned 0\n"); to = ubrk_preceding( sentence, to ); if (to != 0) log_err("ubrk_preceding didn't return 0\n"); if (ubrk_first(sentence)!=ubrk_current(sentence)) { log_err("error in ubrk_first() or ubrk_current()\n"); } /*---- */ /*Testing ubrk_open and ubrk_close()*/ log_verbose("\nTesting open and close for us locale\n"); b = ubrk_open(UBRK_WORD, "fr_FR", text, u_strlen(text), &status); if (U_FAILURE(status)) { log_err("ubrk_open for word returned NULL: %s\n", myErrorName(status)); } ubrk_close(b); /* Test setText and setUText */ { UChar s1[] = {0x41, 0x42, 0x20, 0}; 
UChar s2[] = {0x41, 0x42, 0x43, 0x44, 0x45, 0}; UText *ut = NULL; UBreakIterator *bb; int j; log_verbose("\nTesting ubrk_setText() and ubrk_setUText()\n"); status = U_ZERO_ERROR; bb = ubrk_open(UBRK_WORD, "en_US", NULL, 0, &status); TEST_ASSERT_SUCCESS(status); ubrk_setText(bb, s1, -1, &status); TEST_ASSERT_SUCCESS(status); ubrk_first(bb); j = ubrk_next(bb); TEST_ASSERT(j == 2); ut = utext_openUChars(ut, s2, -1, &status); ubrk_setUText(bb, ut, &status); TEST_ASSERT_SUCCESS(status); j = ubrk_next(bb); TEST_ASSERT(j == 5); ubrk_close(bb); utext_close(ut); } ubrk_close(word); ubrk_close(sentence); ubrk_close(line); ubrk_close(character); }
/*
 * Lua binding: icu_breakpoints(input, locale)
 *
 * Converts the UTF-8 string `input` to UTF-16 and walks every UTF-16 offset
 * from 0 to its length. For each offset that is a line or word boundary a
 * table is pushed with the fields:
 *   type    - "line" if the offset is a line boundary, otherwise "word"
 *   index   - the corresponding byte offset in the UTF-8 input
 *   subtype - (line boundaries only) "soft" or "hard" from the rule status
 *   token   - the input bytes between the previous boundary and this one
 * Returns the number of tables pushed. Raises a Lua error when the input
 * cannot be converted or a break iterator cannot be opened.
 */
int icu_breakpoints(lua_State *L) {
  const char* input = luaL_checkstring(L, 1);
  int input_l = strlen(input);
  const char* locale = luaL_checkstring(L, 2);
  UChar *buffer;
  int32_t l = 0, breakcount = 0;
  UErrorCode err = U_ZERO_ERROR;
  u_strFromUTF8(NULL, 0, &l, input, input_l, &err);
  /* The preflight call is expected to report a buffer overflow every time;
     anything else means the length in l cannot be trusted. */
  if (U_FAILURE(err) && err != U_BUFFER_OVERFLOW_ERROR) {
    return luaL_error(L, "UTF-8 conversion failure: %s", u_errorName(err));
  }
  err = U_ZERO_ERROR;
  /* One spare element so that an empty input never asks for zero bytes */
  buffer = malloc(((size_t)l + 1) * sizeof(UChar));
  if (buffer == NULL) {
    return luaL_error(L, "Out of memory converting input to UTF-16");
  }
  u_strFromUTF8(buffer, l, &l, input, input_l, &err);

  UBreakIterator* wordbreaks, *linebreaks;
  int32_t i, previous;
  wordbreaks = ubrk_open(UBRK_WORD, locale, buffer, l, &err);
  if(U_FAILURE(err)) {
    /* luaL_error does not return, so release everything first */
    free(buffer);
    return luaL_error(L, "Word break parser failure: %s", u_errorName(err));
  }

  linebreaks = ubrk_open(UBRK_LINE, locale, buffer, l, &err);
  if(U_FAILURE(err)) {
    ubrk_close(wordbreaks);
    free(buffer);
    return luaL_error(L, "Line break parser failure: %s", u_errorName(err));
  }

  previous = 0;
  i = 0;
  while (i <= l) {
    int32_t type;
    if (!ubrk_isBoundary(linebreaks, i) && !ubrk_isBoundary(wordbreaks,i)) {
      i++; continue;
    }
    lua_checkstack(L, 3);
    /* At some kind of boundary */
    lua_newtable(L);
    lua_pushstring(L, "type");
    lua_pushstring(L, ubrk_isBoundary(linebreaks,i) ? "line" : "word");
    lua_settable(L, -3);

    /* Preflight conversion of the first i UTF-16 units gives the byte offset */
    int32_t utf8_index = 0;
    err = U_ZERO_ERROR;
    u_strToUTF8(NULL, 0, &utf8_index, buffer, i, &err);
    assert(U_SUCCESS(err) || err == U_BUFFER_OVERFLOW_ERROR);

    lua_pushstring(L, "index");
    lua_pushinteger(L, utf8_index);
    lua_settable(L, -3);

    if (ubrk_isBoundary(linebreaks, i)) {
      lua_pushstring(L, "subtype");
      type = ubrk_getRuleStatus(linebreaks);
      if (type >= UBRK_LINE_SOFT && type < UBRK_LINE_SOFT_LIMIT) {
        lua_pushstring(L, "soft");
      } else {
        lua_pushstring(L, "hard");
      }
      lua_settable(L, -3);
    }
    lua_pushstring(L, "token");
    lua_pushlstring(L, input+previous, utf8_index-previous);

    lua_settable(L, -3);

    previous = utf8_index;
    breakcount++;
    i++;
  }
  ubrk_close(wordbreaks);
  ubrk_close(linebreaks);
  /* The iterators referenced buffer, so it is released only after they are closed */
  free(buffer);
  return breakcount;
}
static engine_return_t engine_fixed_match(error_t **error, void *data, const UString *subject) { int32_t ret; UErrorCode status; FETCH_DATA(data, p, fixed_pattern_t); status = U_ZERO_ERROR; if (ustring_empty(p->pattern)) { if (IS_WORD_BOUNDED(p->flags)) { if (ustring_empty(subject)) { return ENGINE_MATCH_FOUND; } else { int32_t l, u, lastState, state; ubrk_setText(p->ubrk, subject->ptr, subject->len, &status); if (U_FAILURE(status)) { icu_error_set(error, FATAL, status, "ubrk_setText"); return ENGINE_FAILURE; } if (UBRK_DONE != (l = ubrk_first(p->ubrk))) { lastState = ubrk_getRuleStatus(p->ubrk); while (UBRK_DONE != (u = ubrk_next(p->ubrk))) { state = ubrk_getRuleStatus(p->ubrk); if (UBRK_WORD_NONE == lastState && lastState == state) { return ENGINE_MATCH_FOUND; } lastState = state; l = u; } } return ENGINE_NO_MATCH; } } else { return ENGINE_MATCH_FOUND; } } else if (NULL != p->usearch) { if (subject->len > 0) { usearch_setText(p->usearch, subject->ptr, subject->len, &status); if (U_FAILURE(status)) { icu_error_set(error, FATAL, status, "usearch_setText"); return ENGINE_FAILURE; } ret = usearch_first(p->usearch, &status); if (U_FAILURE(status)) { icu_error_set(error, FATAL, status, "usearch_first"); return ENGINE_FAILURE; } usearch_unbindText(p->usearch); return (ret != USEARCH_DONE ? ENGINE_MATCH_FOUND : ENGINE_NO_MATCH); } else { return ENGINE_NO_MATCH; } } else { UChar *m; int32_t pos; pos = 0; ret = ENGINE_NO_MATCH; if (NULL != p->ubrk) { ubrk_setText(p->ubrk, subject->ptr, subject->len, &status); if (U_FAILURE(status)) { icu_error_set(error, FATAL, status, "ubrk_setText"); return ENGINE_FAILURE; } } while (NULL != (m = u_strFindFirst(subject->ptr + pos, subject->len - pos, p->pattern->ptr, p->pattern->len))) { pos = m - subject->ptr; if (NULL == p->ubrk || (ubrk_isBoundary(p->ubrk, pos) && ubrk_isBoundary(p->ubrk, pos + p->pattern->len))) { ret = ENGINE_MATCH_FOUND; } pos += p->pattern->len; } ubrk_unbindText(p->ubrk); return ret; } }
static UBool engine_fixed_split(error_t **error, void *data, const UString *subject, DArray *array, interval_list_t *intervals) { UErrorCode status; int32_t l, lastU; dlist_element_t *el; FETCH_DATA(data, p, fixed_pattern_t); lastU = l = 0; status = U_ZERO_ERROR; if (NULL != p->usearch) { usearch_setText(p->usearch, subject->ptr, subject->len, &status); if (U_FAILURE(status)) { icu_error_set(error, FATAL, status, "usearch_setText"); return FALSE; } /* <X> */ if (NULL == intervals) { int32_t u; while (U_SUCCESS(status) && USEARCH_DONE != (u = usearch_next(p->usearch, &status))) { add_match(array, subject, l, u); l = u += usearch_getMatchedLength(p->usearch); } add_match(array, subject, l, subject->len); } else { /* </X> */ for (el = intervals->head; NULL != el; el = el->next) { FETCH_DATA(el->data, i, interval_t); if (i->lower_limit > 0) { if (!usearch_fwd_n(p->usearch, subject, NULL, i->lower_limit - lastU, &l, &status)) { break; } } if (!usearch_fwd_n(p->usearch, subject, array, i->upper_limit - i->lower_limit, &l, &status)) { break; } lastU = i->upper_limit; } /* <X> */ } /* </X> */ usearch_unbindText(p->usearch); if (U_FAILURE(status)) { icu_error_set(error, FATAL, status, "usearch_next"); return FALSE; } } else { if (NULL != p->ubrk) { ubrk_setText(p->ubrk, subject->ptr, subject->len, &status); if (U_FAILURE(status)) { icu_error_set(error, FATAL, status, "ubrk_setText"); return FALSE; } } /* <X> */ if (NULL == intervals) { UChar *m; int32_t u; u = 0; while (NULL != (m = u_strFindFirst(subject->ptr + u, subject->len - u, p->pattern->ptr, p->pattern->len))) { u = m - subject->ptr; if (NULL == p->ubrk || (ubrk_isBoundary(p->ubrk, u) && ubrk_isBoundary(p->ubrk, u + p->pattern->len))) { add_match(array, subject, l, u); } l = u = u + p->pattern->len; } add_match(array, subject, l, subject->len); } else { /* </X> */ for (el = intervals->head; NULL != el; el = el->next) { FETCH_DATA(el->data, i, interval_t); if (i->lower_limit > 0) { if (!binary_fwd_n(p->ubrk, 
p->pattern, subject, NULL, i->lower_limit - lastU, &l)) { break; } } if (!binary_fwd_n(p->ubrk, p->pattern, subject, array, i->upper_limit - i->lower_limit, &l)) { break; } lastU = i->upper_limit; } /* <X> */ } /* </X> */ ubrk_unbindText(p->ubrk); } return TRUE; }
static engine_return_t engine_fixed_match_all(error_t **error, void *data, const UString *subject, interval_list_t *intervals) { int32_t matches; UErrorCode status; FETCH_DATA(data, p, fixed_pattern_t); matches = 0; status = U_ZERO_ERROR; if (ustring_empty(p->pattern)) { if (IS_WORD_BOUNDED(p->flags)) { if (ustring_empty(subject)) { return ENGINE_MATCH_FOUND; } else { int32_t l, u, lastState, state; ubrk_setText(p->ubrk, subject->ptr, subject->len, &status); if (U_FAILURE(status)) { icu_error_set(error, FATAL, status, "ubrk_setText"); return ENGINE_FAILURE; } if (UBRK_DONE != (l = ubrk_first(p->ubrk))) { lastState = ubrk_getRuleStatus(p->ubrk); while (UBRK_DONE != (u = ubrk_next(p->ubrk))) { state = ubrk_getRuleStatus(p->ubrk); if (UBRK_WORD_NONE == lastState && lastState == state) { return ENGINE_MATCH_FOUND; } lastState = state; l = u; } } return ENGINE_NO_MATCH; } } else { return ENGINE_MATCH_FOUND; } } else if (NULL != p->usearch) { int32_t l, u; if (subject->len > 0) { usearch_setText(p->usearch, subject->ptr, subject->len, &status); if (U_FAILURE(status)) { icu_error_set(error, FATAL, status, "usearch_setText"); return ENGINE_FAILURE; } for (l = usearch_first(p->usearch, &status); U_SUCCESS(status) && USEARCH_DONE != l; l = usearch_next(p->usearch, &status)) { matches++; u = l + usearch_getMatchedLength(p->usearch); if (interval_list_add(intervals, subject->len, l, u)) { return ENGINE_WHOLE_LINE_MATCH; } } if (U_FAILURE(status)) { icu_error_set(error, FATAL, status, "usearch_[first|next]"); return ENGINE_FAILURE; } usearch_unbindText(p->usearch); return (matches ? 
ENGINE_MATCH_FOUND : ENGINE_NO_MATCH); } else { return ENGINE_NO_MATCH; } } else { UChar *m; int32_t pos; pos = 0; if (NULL != p->ubrk) { ubrk_setText(p->ubrk, subject->ptr, subject->len, &status); if (U_FAILURE(status)) { icu_error_set(error, FATAL, status, "ubrk_setText"); return ENGINE_FAILURE; } } while (NULL != (m = u_strFindFirst(subject->ptr + pos, subject->len - pos, p->pattern->ptr, p->pattern->len))) { pos = m - subject->ptr; if (NULL == p->ubrk || (ubrk_isBoundary(p->ubrk, pos) && ubrk_isBoundary(p->ubrk, pos + p->pattern->len))) { matches++; if (interval_list_add(intervals, subject->len, pos, pos + p->pattern->len)) { return ENGINE_WHOLE_LINE_MATCH; } } pos += p->pattern->len; } ubrk_unbindText(p->ubrk); return (matches ? ENGINE_MATCH_FOUND : ENGINE_NO_MATCH); } }
// Reports whether `offset` is a boundary according to charBreakIterator.
UBool Target::isBreakBoundary(int32_t offset)
{
    const UBool onBoundary = ubrk_isBoundary(charBreakIterator, offset);
    return onBoundary;
}
// Reference search: walks the order list built from `target` (starting at
// `offset`) and looks for the first position where the order list built from
// `pattern` matches and both ends of the match fall on boundaries of a
// character break iterator. On success matchStart/matchEnd receive the match
// limits and TRUE is returned; otherwise both are -1 and FALSE is returned.
static UBool simpleSearch(UCollator *coll, const UnicodeString &target, int32_t offset, const UnicodeString &pattern, int32_t &matchStart, int32_t &matchEnd)
{
    UErrorCode status = U_ZERO_ERROR;
    OrderList targetOrders(coll, target, offset);
    OrderList patternOrders(coll, pattern);
    int32_t targetSize  = targetOrders.size() - 1;
    int32_t patternSize = patternOrders.size() - 1;
    UBreakIterator *graphemes = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status),
                                          target.getBuffer(), target.length(), &status);

    matchStart = matchEnd = -1;

    UBool found = FALSE;

    // Searching for an empty pattern always fails, so the scan is skipped.
    if (patternSize != 0) {
        for (int32_t ce = 0; ce < targetSize; ++ce) {
            if (!targetOrders.matchesAt(ce, patternOrders)) {
                continue;
            }

            const int32_t afterMatch = ce + patternSize;      // index of the CE following the match
            const int32_t lastInMatch = afterMatch - 1;       // index of the last CE of the match

            int32_t start    = targetOrders.getLowOffset(ce);
            int32_t maxLimit = targetOrders.getLowOffset(afterMatch);
            int32_t minLimit = targetOrders.getLowOffset(lastInMatch);

            // if the low and high offsets of the first CE in
            // the match are the same, it means that the match
            // starts in the middle of an expansion - all but
            // the first CE of the expansion will have the offset
            // of the following character.
            if (start == targetOrders.getHighOffset(ce)) {
                continue;
            }

            // Make sure match starts on a grapheme boundary
            if (!ubrk_isBoundary(graphemes, start)) {
                continue;
            }

            // If the low and high offsets of the CE after the match
            // are the same, it means that the match ends in the middle
            // of an expansion sequence.
            if (maxLimit == targetOrders.getHighOffset(afterMatch) &&
                targetOrders.getOrder(afterMatch) != UCOL_NULLORDER) {
                continue;
            }

            int32_t mend = maxLimit;

            // Find the first grapheme break after the character index
            // of the last CE in the match. If it's after character index
            // that's after the last CE in the match, use that index
            // as the end of the match.
            if (minLimit < maxLimit) {
                // When the last CE's low index is same with its high index, the CE is likely
                // a part of expansion. In this case, the index is located just after the
                // character corresponding to the CEs compared above. If the index is right
                // at the break boundary, move the position to the next boundary will result
                // incorrect match length when there are ignorable characters exist between
                // the position and the next character produces CE(s). See ticket#8482.
                if (minLimit == targetOrders.getHighOffset(lastInMatch) && ubrk_isBoundary(graphemes, minLimit)) {
                    mend = minLimit;
                } else {
                    int32_t nextBreak = ubrk_following(graphemes, minLimit);

                    if (nextBreak >= targetOrders.getHighOffset(lastInMatch)) {
                        mend = nextBreak;
                    }
                }
            }

            if (mend > maxLimit) {
                continue;
            }

            // The match must also end on a grapheme boundary
            if (!ubrk_isBoundary(graphemes, mend)) {
                continue;
            }

            matchStart = start;
            matchEnd   = mend;
            found = TRUE;
            break;
        }
    }

    ubrk_close(graphemes);

    return found;
}
/* Thin wrapper that forwards to ubrk_isBoundary() and returns its result. */
UBool __hs_ubrk_isBoundary(UBreakIterator *bi, int32_t offset)
{
    UBool at_boundary = ubrk_isBoundary(bi, offset);

    return at_boundary;
}