int findNextWordFromIndex(const UChar* chars, int len, int position, bool forward) { UBreakIterator* it = wordBreakIterator(chars, len); if (forward) { position = ubrk_following(it, position); while (position != UBRK_DONE) { // We stop searching when the character preceeding the break // is alphanumeric. if (position < len && u_isalnum(chars[position - 1])) return position; position = ubrk_following(it, position); } return len; } else { position = ubrk_preceding(it, position); while (position != UBRK_DONE) { // We stop searching when the character following the break // is alphanumeric. if (position > 0 && u_isalnum(chars[position])) return position; position = ubrk_preceding(it, position); } return 0; } }
void findWordBoundary(const UChar* chars, int len, int position, int* start, int* end) { UBreakIterator* it = wordBreakIterator(chars, len); *end = ubrk_following(it, position); if (*end < 0) *end = ubrk_last(it); *start = ubrk_previous(it); }
// BreakIterator.index {{{ static PyObject * icu_BreakIterator_index(icu_BreakIterator *self, PyObject *token) { #if PY_VERSION_HEX >= 0x03030000 #error Not implemented for python >= 3.3 #endif UChar *buf = NULL, *needle = NULL; int32_t word_start = 0, p = 0, sz = 0, ans = -1, leading_hyphen = 0, trailing_hyphen = 0; buf = python_to_icu(token, &sz); if (buf == NULL) return NULL; if (sz < 1) goto end; needle = buf; if (sz > 1 && IS_HYPHEN_CHAR(buf[0])) { needle = buf + 1; leading_hyphen = 1; sz -= 1; } if (sz > 1 && IS_HYPHEN_CHAR(buf[sz-1])) trailing_hyphen = 1; Py_BEGIN_ALLOW_THREADS; p = ubrk_first(self->break_iterator); while (p != UBRK_DONE) { word_start = p; p = ubrk_next(self->break_iterator); if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) continue; // We are not at the start of a word if (self->text_len >= word_start + sz && memcmp(self->text + word_start, needle, sz * sizeof(UChar)) == 0) { if (word_start > 0 && ( (leading_hyphen && !IS_HYPHEN_CHAR(self->text[word_start-1])) || (!leading_hyphen && IS_HYPHEN_CHAR(self->text[word_start-1])) )) continue; if (!trailing_hyphen && IS_HYPHEN_CHAR(self->text[word_start + sz])) continue; if (p == UBRK_DONE || self->text_len <= word_start + sz) { ans = word_start; break; } if ( // Check that the found word is followed by a word boundary ubrk_isBoundary(self->break_iterator, word_start + sz) && // If there is a leading hyphen check that the leading // hyphen is preceded by a word boundary (!leading_hyphen || (word_start > 1 && ubrk_isBoundary(self->break_iterator, word_start - 2))) && // Check that there is a word boundary *after* the trailing // hyphen. We cannot rely on ubrk_isBoundary() as that // always returns true because of the trailing hyphen. 
(!trailing_hyphen || ubrk_following(self->break_iterator, word_start + sz) == UBRK_DONE || ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) ) { ans = word_start; break; } if (p != UBRK_DONE) ubrk_isBoundary(self->break_iterator, p); // Reset the iterator to its position before the call to ubrk_isBoundary() } } if (leading_hyphen && ans > -1) ans -= 1; #ifdef Py_UNICODE_WIDE if (ans > 0) ans = u_countChar32(self->text, ans); #endif Py_END_ALLOW_THREADS; end: free(buf); return Py_BuildValue("l", (long)ans); } // }}}
int KWQFindNextWordFromIndex(const QChar *chars, int len, int position, bool forward) { int pos = 0; UErrorCode status = U_ZERO_ERROR; UBreakIterator *boundary = ubrk_open(UBRK_WORD, (const char*)currentTextBreakLocaleID().c_str(), const_cast<UChar *>(reinterpret_cast<const UChar *>(chars)), len, &status); if ( boundary && U_SUCCESS(status) ) { if (forward) { pos = ubrk_following(boundary, position); if (pos == UBRK_DONE) pos = len; } else { pos = ubrk_preceding(boundary, position); if (pos == UBRK_DONE) pos = 0; } ubrk_close(boundary); } return pos; }
void KWQFindSentenceBoundary(const QChar *chars, int len, int position, int *start, int *end) { int startPos = 0; int endPos = 0; UErrorCode status = U_ZERO_ERROR; UBreakIterator *boundary = ubrk_open(UBRK_SENTENCE, (const char*)currentTextBreakLocaleID().c_str(), const_cast<UChar *>(reinterpret_cast<const UChar *>(chars)), len, &status); if ( boundary && U_SUCCESS(status) ) { startPos = ubrk_preceding(boundary, position); if (startPos == UBRK_DONE) { startPos = 0; } endPos = ubrk_following(boundary, startPos); if (endPos == UBRK_DONE) endPos = len; ubrk_close(boundary); } *start = startPos; *end = endPos; }
/**
 * Brute-force collation-element search used as a reference implementation.
 *
 * Walks the collation elements of target (starting at offset) and looks for
 * the first position where the pattern's collation elements match and the
 * match starts and ends on character (grapheme) boundaries.
 *
 * @param coll        Collator used to produce the collation elements and to
 *                    choose the locale of the character break iterator.
 * @param target      Text to search in.
 * @param offset      Offset in target passed to the target OrderList.
 * @param pattern     Text to search for.
 * @param matchStart  Receives the start offset of the match, or -1.
 * @param matchEnd    Receives the end offset of the match, or -1.
 * @return TRUE when a match is found; FALSE otherwise, including when the
 *         pattern produces no collation elements or the break iterator
 *         cannot be opened.
 */
static UBool simpleSearch(UCollator *coll, const UnicodeString &target, int32_t offset, const UnicodeString &pattern, int32_t &matchStart, int32_t &matchEnd)
{
    UErrorCode status = U_ZERO_ERROR;
    OrderList targetOrders(coll, target, offset);
    OrderList patternOrders(coll, pattern);
    int32_t targetSize  = targetOrders.size() - 1;
    int32_t patternSize = patternOrders.size() - 1;

    matchStart = matchEnd = -1;

    if (patternSize == 0) {
        // Searching for an empty pattern always fails. Checked before the
        // break iterator is opened so nothing has to be released.
        return FALSE;
    }

    UBreakIterator *charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status),
                                                  target.getBuffer(), target.length(), &status);

    // Every boundary test below dereferences the iterator, so give up
    // early if it could not be created.
    if (charBreakIterator == NULL || U_FAILURE(status)) {
        if (charBreakIterator != NULL) {
            ubrk_close(charBreakIterator);
        }

        return FALSE;
    }

    for(int32_t i = 0; i < targetSize; i += 1) {
        if (targetOrders.matchesAt(i, patternOrders)) {
            int32_t start    = targetOrders.getLowOffset(i);
            int32_t maxLimit = targetOrders.getLowOffset(i + patternSize);
            int32_t minLimit = targetOrders.getLowOffset(i + patternSize - 1);

            // if the low and high offsets of the first CE in
            // the match are the same, it means that the match
            // starts in the middle of an expansion - all but
            // the first CE of the expansion will have the offset
            // of the following character.
            if (start == targetOrders.getHighOffset(i)) {
                continue;
            }

            // Make sure match starts on a grapheme boundary
            if (! ubrk_isBoundary(charBreakIterator, start)) {
                continue;
            }

            // If the low and high offsets of the CE after the match
            // are the same, it means that the match ends in the middle
            // of an expansion sequence.
            if (maxLimit == targetOrders.getHighOffset(i + patternSize) &&
                targetOrders.getOrder(i + patternSize) != UCOL_NULLORDER) {
                continue;
            }

            int32_t mend = maxLimit;

            // Find the first grapheme break after the character index
            // of the last CE in the match. If it's after character index
            // that's after the last CE in the match, use that index
            // as the end of the match.
            if (minLimit < maxLimit) {
                // When the last CE's low index is same with its high index, the CE is likely
                // a part of expansion. In this case, the index is located just after the
                // character corresponding to the CEs compared above. If the index is right
                // at the break boundary, move the position to the next boundary will result
                // incorrect match length when there are ignorable characters exist between
                // the position and the next character produces CE(s). See ticket#8482.
                if (minLimit == targetOrders.getHighOffset(i + patternSize - 1) && ubrk_isBoundary(charBreakIterator, minLimit)) {
                    mend = minLimit;
                } else {
                    int32_t nba = ubrk_following(charBreakIterator, minLimit);

                    if (nba >= targetOrders.getHighOffset(i + patternSize - 1)) {
                        mend = nba;
                    }
                }
            }

            // The candidate end must not run past the CE after the match
            // and must itself be a grapheme boundary.
            if (mend > maxLimit) {
                continue;
            }

            if (! ubrk_isBoundary(charBreakIterator, mend)) {
                continue;
            }

            matchStart = start;
            matchEnd   = mend;

            ubrk_close(charBreakIterator);
            return TRUE;
        }
    }

    ubrk_close(charBreakIterator);
    return FALSE;
}
/*
 * Print the range of str that lies between the first break after pos and
 * the break the iterator steps back to from there.
 */
void printAt(UBreakIterator* boundary, int32_t pos , UChar* str)
{
    /* Order matters: ubrk_previous() steps back from where ubrk_following() left the iterator. */
    const int32_t rangeEnd = ubrk_following(boundary, pos);
    const int32_t rangeStart = ubrk_previous(boundary);

    printTextRange(str, rangeStart, rangeEnd);
}
/**
 * Forwards to ubrk_following(), treating the TextBreakIterator pointer as a
 * UBreakIterator pointer.
 *
 * @param iterator Break iterator to query.
 * @param pos      Offset to search after.
 * @return Whatever ubrk_following() returns for that offset.
 */
int textBreakFollowing(TextBreakIterator* iterator, int pos)
{
    UBreakIterator* icuIterator = reinterpret_cast<UBreakIterator*>(iterator);
    return ubrk_following(icuIterator, pos);
}
// JNI entry point: looks up the break iterator for `address` and returns the
// result of ubrk_following() for `offset`. The JNIEnv and class arguments
// are unused.
static jint followingImpl(JNIEnv*, jclass, jint address, jint offset) {
    const jint nextBoundary = ubrk_following(breakIterator(address), offset);
    return nextBoundary;
}
static void TestBreakIteratorCAPI() { UErrorCode status = U_ZERO_ERROR; UBreakIterator *word, *sentence, *line, *character, *b, *bogus; int32_t start,pos,end,to; int32_t i; int32_t count = 0; UChar text[50]; /* Note: the adjacent "" are concatenating strings, not adding a \" to the string, which is probably what whoever wrote this intended. Don't fix, because it would throw off the hard coded break positions in the following tests. */ u_uastrcpy(text, "He's from Africa. ""Mr. Livingston, I presume?"" Yeah"); /*test ubrk_open()*/ log_verbose("\nTesting BreakIterator open functions\n"); /* Use french for fun */ word = ubrk_open(UBRK_WORD, "en_US", text, u_strlen(text), &status); if(status == U_FILE_ACCESS_ERROR) { log_data_err("Check your data - it doesn't seem to be around\n"); return; } else if(U_FAILURE(status)){ log_err_status(status, "FAIL: Error in ubrk_open() for word breakiterator: %s\n", myErrorName(status)); } else{ log_verbose("PASS: Successfully opened word breakiterator\n"); } sentence = ubrk_open(UBRK_SENTENCE, "en_US", text, u_strlen(text), &status); if(U_FAILURE(status)){ log_err_status(status, "FAIL: Error in ubrk_open() for sentence breakiterator: %s\n", myErrorName(status)); return; } else{ log_verbose("PASS: Successfully opened sentence breakiterator\n"); } line = ubrk_open(UBRK_LINE, "en_US", text, u_strlen(text), &status); if(U_FAILURE(status)){ log_err("FAIL: Error in ubrk_open() for line breakiterator: %s\n", myErrorName(status)); return; } else{ log_verbose("PASS: Successfully opened line breakiterator\n"); } character = ubrk_open(UBRK_CHARACTER, "en_US", text, u_strlen(text), &status); if(U_FAILURE(status)){ log_err("FAIL: Error in ubrk_open() for character breakiterator: %s\n", myErrorName(status)); return; } else{ log_verbose("PASS: Successfully opened character breakiterator\n"); } /*trying to open an illegal iterator*/ bogus = ubrk_open((UBreakIteratorType)5, "en_US", text, u_strlen(text), &status); if(U_SUCCESS(status)){ log_err("FAIL: 
Error in ubrk_open() for BOGUS breakiterator. Expected U_ILLEGAL_ARGUMENT_ERROR\n"); } if(U_FAILURE(status)){ if(status != U_ILLEGAL_ARGUMENT_ERROR){ log_err("FAIL: Error in ubrk_open() for BOGUS breakiterator. Expected U_ILLEGAL_ARGUMENT_ERROR\n Got %s\n", myErrorName(status)); } } status=U_ZERO_ERROR; /* ======= Test ubrk_countAvialable() and ubrk_getAvialable() */ log_verbose("\nTesting ubrk_countAvailable() and ubrk_getAvailable()\n"); count=ubrk_countAvailable(); /* use something sensible w/o hardcoding the count */ if(count < 0){ log_err("FAIL: Error in ubrk_countAvialable() returned %d\n", count); } else{ log_verbose("PASS: ubrk_countAvialable() successful returned %d\n", count); } for(i=0;i<count;i++) { log_verbose("%s\n", ubrk_getAvailable(i)); if (ubrk_getAvailable(i) == 0) log_err("No locale for which breakiterator is applicable\n"); else log_verbose("A locale %s for which breakiterator is applicable\n",ubrk_getAvailable(i)); } /*========Test ubrk_first(), ubrk_last()...... and other functions*/ log_verbose("\nTesting the functions for word\n"); start = ubrk_first(word); if(start!=0) log_err("error ubrk_start(word) did not return 0\n"); log_verbose("first (word = %d\n", (int32_t)start); pos=ubrk_next(word); if(pos!=4) log_err("error ubrk_next(word) did not return 4\n"); log_verbose("next (word = %d\n", (int32_t)pos); pos=ubrk_following(word, 4); if(pos!=5) log_err("error ubrl_following(word,4) did not return 6\n"); log_verbose("next (word = %d\n", (int32_t)pos); end=ubrk_last(word); if(end!=49) log_err("error ubrk_last(word) did not return 49\n"); log_verbose("last (word = %d\n", (int32_t)end); pos=ubrk_previous(word); log_verbose("%d %d\n", end, pos); pos=ubrk_previous(word); log_verbose("%d \n", pos); if (ubrk_isBoundary(word, 2) != FALSE) { log_err("error ubrk_isBoundary(word, 2) did not return FALSE\n"); } pos=ubrk_current(word); if (pos != 4) { log_err("error ubrk_current() != 4 after ubrk_isBoundary(word, 2)\n"); } if (ubrk_isBoundary(word, 4) != 
TRUE) { log_err("error ubrk_isBoundary(word, 4) did not return TRUE\n"); } log_verbose("\nTesting the functions for character\n"); ubrk_first(character); pos = ubrk_following(character, 5); if(pos!=6) log_err("error ubrk_following(character,5) did not return 6\n"); log_verbose("Following (character,5) = %d\n", (int32_t)pos); pos=ubrk_following(character, 18); if(pos!=19) log_err("error ubrk_following(character,18) did not return 19\n"); log_verbose("Followingcharacter,18) = %d\n", (int32_t)pos); pos=ubrk_preceding(character, 22); if(pos!=21) log_err("error ubrk_preceding(character,22) did not return 21\n"); log_verbose("preceding(character,22) = %d\n", (int32_t)pos); log_verbose("\nTesting the functions for line\n"); pos=ubrk_first(line); if(pos != 0) log_err("error ubrk_first(line) returned %d, expected 0\n", (int32_t)pos); pos = ubrk_next(line); pos=ubrk_following(line, 18); if(pos!=22) log_err("error ubrk_following(line) did not return 22\n"); log_verbose("following (line) = %d\n", (int32_t)pos); log_verbose("\nTesting the functions for sentence\n"); ubrk_first(sentence); pos = ubrk_current(sentence); log_verbose("Current(sentence) = %d\n", (int32_t)pos); pos = ubrk_last(sentence); if(pos!=49) log_err("error ubrk_last for sentence did not return 49\n"); log_verbose("Last (sentence) = %d\n", (int32_t)pos); ubrk_first(sentence); to = ubrk_following( sentence, 0 ); if (to == 0) log_err("ubrk_following returned 0\n"); to = ubrk_preceding( sentence, to ); if (to != 0) log_err("ubrk_preceding didn't return 0\n"); if (ubrk_first(sentence)!=ubrk_current(sentence)) { log_err("error in ubrk_first() or ubrk_current()\n"); } /*---- */ /*Testing ubrk_open and ubrk_close()*/ log_verbose("\nTesting open and close for us locale\n"); b = ubrk_open(UBRK_WORD, "fr_FR", text, u_strlen(text), &status); if (U_FAILURE(status)) { log_err("ubrk_open for word returned NULL: %s\n", myErrorName(status)); } ubrk_close(b); /* Test setText and setUText */ { UChar s1[] = {0x41, 0x42, 0x20, 0}; 
UChar s2[] = {0x41, 0x42, 0x43, 0x44, 0x45, 0}; UText *ut = NULL; UBreakIterator *bb; int j; log_verbose("\nTesting ubrk_setText() and ubrk_setUText()\n"); status = U_ZERO_ERROR; bb = ubrk_open(UBRK_WORD, "en_US", NULL, 0, &status); TEST_ASSERT_SUCCESS(status); ubrk_setText(bb, s1, -1, &status); TEST_ASSERT_SUCCESS(status); ubrk_first(bb); j = ubrk_next(bb); TEST_ASSERT(j == 2); ut = utext_openUChars(ut, s2, -1, &status); ubrk_setUText(bb, ut, &status); TEST_ASSERT_SUCCESS(status); j = ubrk_next(bb); TEST_ASSERT(j == 5); ubrk_close(bb); utext_close(ut); } ubrk_close(word); ubrk_close(sentence); ubrk_close(line); ubrk_close(character); }
StringSearchPerformanceTest::StringSearchPerformanceTest(int32_t argc, const char *argv[], UErrorCode &status) :UPerfTest(argc,argv,status){ int32_t start, end; #ifdef TEST_BOYER_MOORE_SEARCH bms = NULL; #else srch = NULL; #endif pttrn = NULL; if(status== U_ILLEGAL_ARGUMENT_ERROR || line_mode){ fprintf(stderr,gUsageString, "strsrchperf"); return; } /* Get the Text */ src = getBuffer(srcLen, status); #if 0 /* Get a word to find. Do this by selecting a random word with a word breakiterator. */ UBreakIterator* brk = ubrk_open(UBRK_WORD, locale, src, srcLen, &status); if(U_FAILURE(status)){ fprintf(stderr, "FAILED to create pattern for searching. Error: %s\n", u_errorName(status)); return; } start = ubrk_preceding(brk, 1000); end = ubrk_following(brk, start); pttrnLen = end - start; UChar* temp = (UChar*)malloc(sizeof(UChar)*(pttrnLen)); for (int i = 0; i < pttrnLen; i++) { temp[i] = src[start++]; } pttrn = temp; /* store word in pttrn */ ubrk_close(brk); #else /* The first line of the file contains the pattern */ start = 0; for(end = start; ; end += 1) { UChar ch = src[end]; if (ch == 0x000A || ch == 0x000D || ch == 0x2028) { break; } } pttrnLen = end - start; UChar* temp = (UChar*)malloc(sizeof(UChar)*(pttrnLen)); for (int i = 0; i < pttrnLen; i++) { temp[i] = src[start++]; } pttrn = temp; /* store word in pttrn */ #endif #ifdef TEST_BOYER_MOORE_SEARCH UnicodeString patternString(pttrn, pttrnLen); UCollator *coll = ucol_open(locale, &status); CollData *data = CollData::open(coll, status); targetString = new UnicodeString(src, srcLen); bms = new BoyerMooreSearch(data, patternString, targetString, status); #else /* Create the StringSearch object to be use in performance test. */ srch = usearch_open(pttrn, pttrnLen, src, srcLen, locale, NULL, &status); #endif if(U_FAILURE(status)){ fprintf(stderr, "FAILED to create UPerfTest object. Error: %s\n", u_errorName(status)); return; } }
/*
 * Versioned entry point that forwards to ubrk_following().
 *
 * imp: common/ubrk.cpp
 * hdr: common/unicode/ubrk.h
 * @stable ICU 2.0
 * #if !UCONFIG_NO_BREAK_ITERATION
 * (deliberately not conditionalized: if the underlying library is built
 * without break iteration, the failure should happen at build time rather
 * than at runtime)
 */
U_CAPI int32_t U_EXPORT2
ubrk_following_4_0(UBreakIterator *bi, int32_t offset)
{
    const int32_t boundary = ubrk_following(bi, offset);
    return boundary;
}
// Shim over ubrk_following(): converts the stdlib's iterator pointer with
// ptr_cast<UBreakIterator> and returns the ICU result unchanged.
int32_t swift::__swift_stdlib_ubrk_following(
    swift::__swift_stdlib_UBreakIterator *bi, int32_t offset) {
  const int32_t boundary =
      ubrk_following(ptr_cast<UBreakIterator>(bi), offset);
  return boundary;
}
/**
 * Asks this target's character break iterator for the first boundary after
 * offset and returns what ubrk_following() reports.
 */
int32_t Target::nextBreakBoundary(int32_t offset)
{
    const int32_t boundary = ubrk_following(charBreakIterator, offset);

    return boundary;
}
static void TestBreakIteratorSuppressions(void) { const TestBISuppressionsItem * itemPtr; for (itemPtr = testBISuppressionsItems; itemPtr->locale != NULL; itemPtr++) { UChar textU[kTextULenMax]; int32_t textULen = u_unescape(itemPtr->text, textU, kTextULenMax); UErrorCode status = U_ZERO_ERROR; UBreakIterator *bi = ubrk_open(UBRK_SENTENCE, itemPtr->locale, textU, textULen, &status); log_verbose("#%d: %s\n", (itemPtr-testBISuppressionsItems), itemPtr->locale); if (U_SUCCESS(status)) { int32_t offset, start; const int32_t * expOffsetPtr; const int32_t * expOffsetStart; expOffsetStart = expOffsetPtr = itemPtr->expFwdOffsets; ubrk_first(bi); for (; (offset = ubrk_next(bi)) != UBRK_DONE && *expOffsetPtr >= 0; expOffsetPtr++) { if (offset != *expOffsetPtr) { log_err("FAIL: ubrk_next loc \"%s\", expected %d, got %d\n", itemPtr->locale, *expOffsetPtr, offset); } } if (offset != UBRK_DONE || *expOffsetPtr >= 0) { log_err("FAIL: ubrk_next loc \"%s\", expected UBRK_DONE & expOffset -1, got %d and %d\n", itemPtr->locale, offset, *expOffsetPtr); } expOffsetStart = expOffsetPtr = itemPtr->expFwdOffsets; start = ubrk_first(bi) + 1; for (; (offset = ubrk_following(bi, start)) != UBRK_DONE && *expOffsetPtr >= 0; expOffsetPtr++) { if (offset != *expOffsetPtr) { log_err("FAIL: ubrk_following(%d) loc \"%s\", expected %d, got %d\n", start, itemPtr->locale, *expOffsetPtr, offset); } start = *expOffsetPtr + 1; } if (offset != UBRK_DONE || *expOffsetPtr >= 0) { log_err("FAIL: ubrk_following(%d) loc \"%s\", expected UBRK_DONE & expOffset -1, got %d and %d\n", start, itemPtr->locale, offset, *expOffsetPtr); } expOffsetStart = expOffsetPtr = itemPtr->expRevOffsets; offset = ubrk_last(bi); log_verbose("___ @%d ubrk_last\n", offset); if(offset == 0) { log_err("FAIL: ubrk_last loc \"%s\" unexpected %d\n", itemPtr->locale, offset); } for (; (offset = ubrk_previous(bi)) != UBRK_DONE && *expOffsetPtr >= 0; expOffsetPtr++) { if (offset != *expOffsetPtr) { log_err("FAIL: ubrk_previous loc \"%s\", 
expected %d, got %d\n", itemPtr->locale, *expOffsetPtr, offset); } else { log_verbose("[%d] @%d ubrk_previous()\n", (expOffsetPtr - expOffsetStart), offset); } } if (offset != UBRK_DONE || *expOffsetPtr >= 0) { log_err("FAIL: ubrk_previous loc \"%s\", expected UBRK_DONE & expOffset[%d] -1, got %d and %d\n", itemPtr->locale, expOffsetPtr - expOffsetStart, offset, *expOffsetPtr); } expOffsetStart = expOffsetPtr = itemPtr->expRevOffsets; start = ubrk_last(bi) - 1; for (; (offset = ubrk_preceding(bi, start)) != UBRK_DONE && *expOffsetPtr >= 0; expOffsetPtr++) { if (offset != *expOffsetPtr) { log_err("FAIL: ubrk_preceding(%d) loc \"%s\", expected %d, got %d\n", start, itemPtr->locale, *expOffsetPtr, offset); } start = *expOffsetPtr - 1; } if (start >=0 && (offset != UBRK_DONE || *expOffsetPtr >= 0)) { log_err("FAIL: ubrk_preceding loc(%d) \"%s\", expected UBRK_DONE & expOffset -1, got %d and %d\n", start, itemPtr->locale, offset, *expOffsetPtr); } ubrk_close(bi); } else { log_data_err("FAIL: ubrk_open(UBRK_SENTENCE, \"%s\", ...) status %s (Are you missing data?)\n", itemPtr->locale, u_errorName(status)); } } }
/*
 * Plain-function wrapper that forwards its arguments to ubrk_following()
 * and returns the result unchanged.
 */
int32_t __hs_ubrk_following(UBreakIterator *bi, int32_t offset)
{
    const int32_t boundary = ubrk_following(bi, offset);
    return boundary;
}