/*
 * Walk the break iterator forward from its first boundary and print every
 * segment [start, end) of str that lies between two consecutive boundaries.
 */
void printEachForward(UBreakIterator* boundary, UChar* str) {
    int32_t segStart = ubrk_first(boundary);
    int32_t segEnd = ubrk_next(boundary);

    while (segEnd != UBRK_DONE) {
        printTextRange(str, segStart, segEnd);
        segStart = segEnd;
        segEnd = ubrk_next(boundary);
    }
}
static void TestBug11665(void) { // The problem was with the incorrect breaking of Japanese text beginning // with Katakana characters when no prior Japanese or Chinese text had been // encountered. // // Tested here in cintltst, rather than in intltest, because only cintltst // tests have the ability to reset ICU, which is needed to get the bug // to manifest itself. static UChar japaneseText[] = {0x30A2, 0x30EC, 0x30EB, 0x30AE, 0x30FC, 0x6027, 0x7D50, 0x819C, 0x708E}; int32_t boundaries[10] = {0}; UBreakIterator *bi = NULL; int32_t brk; int32_t brkIdx = 0; int32_t totalBreaks = 0; UErrorCode status = U_ZERO_ERROR; ctest_resetICU(); bi = ubrk_open(UBRK_WORD, "en_US", japaneseText, UPRV_LENGTHOF(japaneseText), &status); TEST_ASSERT_SUCCESS(status); if (!bi) { return; } for (brk=ubrk_first(bi); brk != UBRK_DONE; brk=ubrk_next(bi)) { boundaries[brkIdx] = brk; if (++brkIdx >= UPRV_LENGTHOF(boundaries) - 1) { break; } } if (brkIdx <= 2 || brkIdx >= UPRV_LENGTHOF(boundaries)) { log_err("%s:%d too few or many breaks found.\n", __FILE__, __LINE__); } else { totalBreaks = brkIdx; brkIdx = 0; for (brk=ubrk_first(bi); brk != UBRK_DONE; brk=ubrk_next(bi)) { if (brk != boundaries[brkIdx]) { log_err("%s:%d Break #%d differs between first and second iteration.\n", __FILE__, __LINE__, brkIdx); break; } if (++brkIdx >= UPRV_LENGTHOF(boundaries) - 1) { log_err("%s:%d Too many breaks.\n", __FILE__, __LINE__); break; } } if (totalBreaks != brkIdx) { log_err("%s:%d Number of breaks differ between first and second iteration.\n", __FILE__, __LINE__); } } ubrk_close(bi); }
/* {{{ grapheme_extract_bytecount_iter - grapheme iterator for grapheme_extract MAXBYTES */
/*
 * Advance bi one boundary at a time, moving a byte offset through the UTF-8
 * buffer pstr by the same number of code points.  Returns the largest byte
 * offset reached on a boundary that does not exceed bsize.
 */
static inline int32_t grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
{
	int byte_off = 0;
	int last_boundary = 0;
	int boundary;

	for (boundary = ubrk_next(bi); boundary != UBRK_DONE; boundary = ubrk_next(bi)) {
		int byte_off_before = byte_off;

		U8_FWD_N(pstr, byte_off, str_len, boundary - last_boundary);

		if (byte_off > bsize) {
			/* this cluster does not fit: fall back to the previous boundary */
			return byte_off_before;
		}
		if (byte_off == byte_off_before) {
			/* no progress in the UTF-8 buffer - malformed utf8? */
			return byte_off;
		}

		last_boundary = boundary;
	}

	return byte_off;
}
// Returns the number of grapheme clusters in |string|.
unsigned numGraphemeClusters(StringView string)
{
    unsigned length = string.length();
    if (!length)
        return 0;

    if (string.is8Bit()) {
        // In Latin-1 the only multi-character grapheme cluster is CRLF, so
        // start from the character count and remove one for each CR LF pair.
        auto* characters = string.characters8();
        unsigned clusterCount = length;
        for (unsigned i = 0; i + 1 < length; ++i) {
            if (characters[i] == '\r' && characters[i + 1] == '\n')
                --clusterCount;
        }
        return clusterCount;
    }

    NonSharedCharacterBreakIterator iterator { string };
    if (!iterator) {
        ASSERT_NOT_REACHED();
        return length;
    }

    // Each successful ubrk_next() call steps over exactly one cluster.
    unsigned clusterCount = 0;
    for (auto position = ubrk_next(iterator); position != UBRK_DONE; position = ubrk_next(iterator))
        ++clusterCount;
    return clusterCount;
}
/* {{{ grapheme_split_string: find and optionally return grapheme boundaries */
/*
 * Iterates over the grapheme boundaries of text (text_length UChars).
 *
 * boundary_array      optional output; when non-NULL, the first
 *                     boundary_array_len boundary offsets are stored in it.
 * boundary_array_len  capacity of boundary_array, in elements.
 *
 * Returns the total number of boundaries found (which may exceed
 * boundary_array_len), or -1 if the break iterator could not be obtained
 * or the text could not be attached to it.
 */
int grapheme_split_string(const UChar *text, int32_t text_length, int boundary_array[], int boundary_array_len TSRMLS_DC )
{
	unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
	UErrorCode status = U_ZERO_ERROR;
	int ret_len = 0;
	int pos;
	UBreakIterator* bi;

	bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status TSRMLS_CC );
	if( U_FAILURE(status) ) {
		return -1;
	}

	ubrk_setText(bi, text, text_length, &status);
	if( U_FAILURE(status) ) {
		/* do not iterate over an iterator that has no text attached */
		ubrk_close(bi);
		return -1;
	}

	while ( (pos = ubrk_next(bi)) != UBRK_DONE ) {
		/* keep counting even once the caller's array is full */
		if ( NULL != boundary_array && ret_len < boundary_array_len ) {
			boundary_array[ret_len] = pos;
		}
		ret_len++;
	}

	ubrk_close(bi);

	return ret_len;
}
/* {{{ grapheme_extract_charcount_iter - grapheme iterator for grapheme_extract MAXCHARS */
/*
 * Advance bi while its boundaries stay within csize, moving a byte offset
 * through the UTF-8 buffer pstr by as many code points as each step covered.
 * Returns the byte offset of the last boundary accepted.
 */
static inline int32_t grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
{
	int byte_off = 0;
	int last_boundary = 0;
	int boundary;

	/* stop when the iterator is exhausted or the next boundary is past the limit */
	for (boundary = ubrk_next(bi);
	     boundary != UBRK_DONE && boundary <= csize;
	     boundary = ubrk_next(bi)) {
		int byte_off_before = byte_off;

		/* move through the original UTF-8 buffer by as many characters
		   as ubrk_next iterated over */
		U8_FWD_N(pstr, byte_off, str_len, boundary - last_boundary);

		if (byte_off == byte_off_before) {
			/* no progress in the UTF-8 buffer - malformed utf8? */
			break;
		}

		last_boundary = boundary;
	}

	return byte_off;
}
// BreakIterator.split {{{ static PyObject * icu_BreakIterator_split(icu_BreakIterator *self, PyObject *args, PyObject *kwargs) { int32_t prev = 0, p = 0, sz = 0; PyObject *ans = NULL, *token = NULL; ans = PyList_New(0); if (ans == NULL) return PyErr_NoMemory(); p = ubrk_first(self->break_iterator); while (p != UBRK_DONE) { prev = p; p = ubrk_next(self->break_iterator); if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) continue; // We are not at the start of a word sz = (p == UBRK_DONE) ? self->text_len - prev : p - prev; if (sz > 0) { token = icu_to_python(self->text + prev, sz); if (token == NULL) { Py_DECREF(ans); ans = NULL; break; } if (PyList_Append(ans, token) != 0) { Py_DECREF(token); Py_DECREF(ans); ans = NULL; break; } Py_DECREF(token); } } return ans; } // }}}
std::vector<lstring> convert_split_words(const lstring <) { std::vector<lstring> ret; UBreakIterator* bi; int prev = -1, pos; UErrorCode err = U_ZERO_ERROR; bi = ubrk_open(UBRK_WORD, get_locale(), (UChar *)lt.data(), lt.size(), &err); if (U_FAILURE(err)) return ret; pos = ubrk_first(bi); while (pos != UBRK_DONE) { int rules = ubrk_getRuleStatus(bi); if ((rules == UBRK_WORD_NONE) || (prev == -1)) { prev = pos; } else { ret.emplace_back(lt.substr(prev, pos - prev)); prev = -1; } pos = ubrk_next(bi); } ubrk_close(bi); return ret; }
// Returns how many code units of |string| are covered by its first
// |numGraphemeClusters| grapheme clusters.
unsigned numCodeUnitsInGraphemeClusters(StringView string, unsigned numGraphemeClusters)
{
    unsigned length = string.length();

    // A cluster is at least one code unit, so a short string is covered entirely.
    if (length <= numGraphemeClusters)
        return length;

    if (string.is8Bit()) {
        // In Latin-1 the only multi-character grapheme cluster is CRLF.
        auto* characters = string.characters8();
        unsigned clustersSeen = 0;
        unsigned offset = 0;
        while (clustersSeen < numGraphemeClusters && offset + 1 < length) {
            bool isCRLF = characters[offset] == '\r' && characters[offset + 1] == '\n';
            offset += isCRLF ? 2 : 1;
            ++clustersSeen;
        }
        // One more cluster still wanted: take one more code unit.
        if (clustersSeen < numGraphemeClusters)
            return offset + 1;
        return offset;
    }

    NonSharedCharacterBreakIterator iterator { string };
    if (!iterator) {
        ASSERT_NOT_REACHED();
        return length;
    }

    unsigned remaining = numGraphemeClusters;
    while (remaining) {
        if (ubrk_next(iterator) == UBRK_DONE)
            return length;
        --remaining;
    }
    return ubrk_current(iterator);
}
// Moves the break iterator identified by |address| by |n| boundaries:
// backwards for negative n, forwards for positive n, not at all for zero.
// Returns the result of the last ICU call made.
static jint nextImpl(JNIEnv* env, jclass, jint address, jint n) {
    UBreakIterator* bi = breakIterator(address);

    if (n == 0) {
        return ubrk_current(bi);
    }

    if (n < 0) {
        // Take all but the last backward step, then report the last one.
        for (jint step = n; step < -1; ++step) {
            ubrk_previous(bi);
        }
        return ubrk_previous(bi);
    }

    // Take all but the last forward step, then report the last one.
    for (jint step = n; step > 1; --step) {
        ubrk_next(bi);
    }
    return ubrk_next(bi);
}
// BreakIterator.index {{{
/*
 * Search self->text for the first segment that matches token as a word and
 * return its index.
 *
 * token is converted with python_to_icu(), which yields a UChar buffer and
 * its length (sz). A single leading hyphen is stripped from the needle and
 * remembered in leading_hyphen; trailing_hyphen records whether the needle
 * ends in a hyphen. The iterator is then walked boundary by boundary and
 * each segment start is compared against the needle with memcmp().
 *
 * Returns a Python integer built from ans: -1 when nothing matched,
 * otherwise the index of the match (moved back by one when a leading hyphen
 * was stripped). When Py_UNICODE_WIDE is defined, a positive index is
 * converted with u_countChar32(). Returns NULL if python_to_icu() returned
 * NULL.
 */
static PyObject *
icu_BreakIterator_index(icu_BreakIterator *self, PyObject *token) {
#if PY_VERSION_HEX >= 0x03030000
#error Not implemented for python >= 3.3
#endif

    UChar *buf = NULL, *needle = NULL;
    int32_t word_start = 0, p = 0, sz = 0, ans = -1, leading_hyphen = 0, trailing_hyphen = 0;

    buf = python_to_icu(token, &sz);
    if (buf == NULL) return NULL;
    if (sz < 1) goto end;  // Empty token: report -1
    needle = buf;
    // Strip one leading hyphen from the needle, sz now counts needle characters
    if (sz > 1 && IS_HYPHEN_CHAR(buf[0])) { needle = buf + 1; leading_hyphen = 1; sz -= 1; }
    // NOTE(review): once a leading hyphen has been stripped, sz was already
    // decremented, so buf[sz-1] is the character before the last one.
    // Confirm whether needle[sz-1] was intended.
    if (sz > 1 && IS_HYPHEN_CHAR(buf[sz-1])) trailing_hyphen = 1;

    Py_BEGIN_ALLOW_THREADS;
    p = ubrk_first(self->break_iterator);
    while (p != UBRK_DONE) {
        // word_start is the boundary we were on, p the one after it
        word_start = p; p = ubrk_next(self->break_iterator);
        if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE)
            continue;  // We are not at the start of a word

        // Candidate: enough text left and the needle matches at word_start
        if (self->text_len >= word_start + sz && memcmp(self->text + word_start, needle, sz * sizeof(UChar)) == 0) {
            // The character before the match must be a hyphen exactly when
            // the token had a leading hyphen
            if (word_start > 0 && (
                    (leading_hyphen && !IS_HYPHEN_CHAR(self->text[word_start-1])) ||
                    (!leading_hyphen && IS_HYPHEN_CHAR(self->text[word_start-1]))
            )) continue;
            // NOTE(review): when the match ends exactly at text_len this reads
            // self->text[text_len]; presumably the buffer has a readable
            // terminator there - TODO confirm.
            if (!trailing_hyphen && IS_HYPHEN_CHAR(self->text[word_start + sz])) continue;

            // The match runs to the end of the text, nothing more to verify
            if (p == UBRK_DONE || self->text_len <= word_start + sz) { ans = word_start; break; }

            if (
                    // Check that the found word is followed by a word boundary
                    ubrk_isBoundary(self->break_iterator, word_start + sz) &&
                    // If there is a leading hyphen check that the leading
                    // hyphen is preceded by a word boundary
                    (!leading_hyphen || (word_start > 1 && ubrk_isBoundary(self->break_iterator, word_start - 2))) &&
                    // Check that there is a word boundary *after* the trailing
                    // hyphen. We cannot rely on ubrk_isBoundary() as that
                    // always returns true because of the trailing hyphen.
                    (!trailing_hyphen || ubrk_following(self->break_iterator, word_start + sz) == UBRK_DONE || ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE)
            ) { ans = word_start; break; }

            if (p != UBRK_DONE) ubrk_isBoundary(self->break_iterator, p); // Reset the iterator to its position before the call to ubrk_isBoundary()
        }
    }
    // Report the position of the stripped leading hyphen, not of the needle
    if (leading_hyphen && ans > -1) ans -= 1;
#ifdef Py_UNICODE_WIDE
    // Convert the index with u_countChar32() for wide Py_UNICODE builds
    if (ans > 0) ans = u_countChar32(self->text, ans);
#endif
    Py_END_ALLOW_THREADS;

end:
    free(buf);
    return Py_BuildValue("l", (long)ans);
} // }}}
/* * static void TestBreakIteratorUText(void); * * Test that ubrk_setUText() is present and works for a simple case. */ static void TestBreakIteratorUText(void) { const char *UTF8Str = "\x41\xc3\x85\x5A\x20\x41\x52\x69\x6E\x67"; /* c3 85 is utf-8 for A with a ring on top */ /* 0 1 2 34567890 */ UErrorCode status = U_ZERO_ERROR; UBreakIterator *bi = NULL; int32_t pos = 0; UText *ut = utext_openUTF8(NULL, UTF8Str, -1, &status); TEST_ASSERT_SUCCESS(status); bi = ubrk_open(UBRK_WORD, "en_US", NULL, 0, &status); if (U_FAILURE(status)) { log_err_status(status, "Failure at file %s, line %d, error = %s\n", __FILE__, __LINE__, u_errorName(status)); return; } ubrk_setUText(bi, ut, &status); if (U_FAILURE(status)) { log_err("Failure at file %s, line %d, error = %s\n", __FILE__, __LINE__, u_errorName(status)); return; } pos = ubrk_first(bi); TEST_ASSERT(pos == 0); pos = ubrk_next(bi); TEST_ASSERT(pos == 4); pos = ubrk_next(bi); TEST_ASSERT(pos == 5); pos = ubrk_next(bi); TEST_ASSERT(pos == 10); pos = ubrk_next(bi); TEST_ASSERT(pos == UBRK_DONE); ubrk_close(bi); utext_close(ut); }
static void TestBreakIteratorTailoring(void) { const RBBITailoringTest * testPtr; for (testPtr = tailoringTests; testPtr->locale != NULL; ++testPtr) { UErrorCode status = U_ZERO_ERROR; UBreakIterator* ubrkiter = ubrk_open(testPtr->type, testPtr->locale, testPtr->test, -1, &status); if ( U_SUCCESS(status) ) { int32_t offset, offsindx; UBool foundError; foundError = FALSE; for (offsindx = 0; (offset = ubrk_next(ubrkiter)) != UBRK_DONE; ++offsindx) { if (!foundError && offsindx >= testPtr->numOffsets) { log_err("FAIL: locale %s, break type %d, ubrk_next expected UBRK_DONE, got %d\n", testPtr->locale, testPtr->type, offset); foundError = TRUE; } else if (!foundError && offset != testPtr->offsFwd[offsindx]) { log_err("FAIL: locale %s, break type %d, ubrk_next expected %d, got %d\n", testPtr->locale, testPtr->type, testPtr->offsFwd[offsindx], offset); foundError = TRUE; } } if (!foundError && offsindx < testPtr->numOffsets) { log_err("FAIL: locale %s, break type %d, ubrk_next expected %d, got UBRK_DONE\n", testPtr->locale, testPtr->type, testPtr->offsFwd[offsindx]); } foundError = FALSE; for (offsindx = 0; (offset = ubrk_previous(ubrkiter)) != UBRK_DONE; ++offsindx) { if (!foundError && offsindx >= testPtr->numOffsets) { log_err("FAIL: locale %s, break type %d, ubrk_previous expected UBRK_DONE, got %d\n", testPtr->locale, testPtr->type, offset); foundError = TRUE; } else if (!foundError && offset != testPtr->offsRev[offsindx]) { log_err("FAIL: locale %s, break type %d, ubrk_previous expected %d, got %d\n", testPtr->locale, testPtr->type, testPtr->offsRev[offsindx], offset); foundError = TRUE; } } if (!foundError && offsindx < testPtr->numOffsets) { log_err("FAIL: locale %s, break type %d, ubrk_previous expected %d, got UBRK_DONE\n", testPtr->locale, testPtr->type, testPtr->offsRev[offsindx]); } ubrk_close(ubrkiter); } else { log_err_status(status, "FAIL: locale %s, break type %d, ubrk_open status: %s\n", testPtr->locale, testPtr->type, u_errorName(status)); } } }
/* * TestBreakIteratorRules - Verify that a break iterator can be created from * a set of source rules. */ static void TestBreakIteratorRules() { /* Rules will keep together any run of letters not including 'a', OR * keep together 'abc', but only when followed by 'def', OTHERWISE * just return one char at a time. */ char rules[] = "abc{666}/def;\n [\\p{L} - [a]]* {2}; . {1};"; /* 0123456789012345678 */ char data[] = "abcdex abcdefgh-def"; /* the test data string */ char breaks[] = "** ** * ** *"; /* * the expected break positions */ char tags[] = "01 21 6 21 2"; /* expected tag values at break positions */ int32_t tagMap[] = {0, 1, 2, 3, 4, 5, 666}; UChar *uData; void *freeHook = NULL; UErrorCode status = U_ZERO_ERROR; int32_t pos; int i; UBreakIterator *bi = testOpenRules(rules); if (bi == NULL) {return;} uData = toUChar(data, &freeHook); ubrk_setText(bi, uData, -1, &status); pos = ubrk_first(bi); for (i=0; i<sizeof(breaks); i++) { if (pos == i && breaks[i] != '*') { log_err("FAIL: unexpected break at position %d found\n", pos); break; } if (pos != i && breaks[i] == '*') { log_err("FAIL: expected break at position %d not found.\n", i); break; } if (pos == i) { int32_t tag, expectedTag; tag = ubrk_getRuleStatus(bi); expectedTag = tagMap[tags[i]&0xf]; if (tag != expectedTag) { log_err("FAIL: incorrect tag value. Position = %d; expected tag %d, got %d", pos, expectedTag, tag); break; } pos = ubrk_next(bi); } } freeToUCharStrings(&freeHook); ubrk_close(bi); }
// Splits text at the boundaries of a clone of m_ubrk and adds one key per
// segment whose rule status is not UBRK_WORD_NONE to keysOut. With a
// collator the key is the segment's sort key, otherwise the segment text.
MojErr MojDbTextTokenizer::tokenize(const MojString& text, MojDbTextCollator* collator, KeySet& keysOut) const
{
	LOG_TRACE("Entering function %s", __FUNCTION__);
	MojAssert(m_ubrk.get());

	// convert the input to UChars
	MojDbTextUtils::UnicodeVec unicodeStr;
	MojErr err = MojDbTextUtils::strToUnicode(text, unicodeStr);
	MojErrCheck(err);

	// work on a private clone of the break iterator
	MojByte buf[U_BRK_SAFECLONE_BUFFERSIZE];
	UErrorCode status = U_ZERO_ERROR;
	MojInt32 size = sizeof(buf);
	IterPtr ubrk(ubrk_safeClone(m_ubrk.get(), buf, &size, &status));
	MojUnicodeErrCheck(status);
	MojAssert(ubrk.get());
	ubrk_setText(ubrk.get(), unicodeStr.begin(), (MojInt32) unicodeStr.size(), &status);
	MojUnicodeErrCheck(status);

	// tokBegin always trails pos by one boundary
	MojInt32 tokBegin = -1;
	for (MojInt32 pos = ubrk_first(ubrk.get());
	     pos != UBRK_DONE;
	     tokBegin = pos, pos = ubrk_next(ubrk.get())) {
		UWordBreak wordStatus = (UWordBreak) ubrk_getRuleStatus(ubrk.get());
		if (wordStatus == UBRK_WORD_NONE)
			continue;

		MojAssert(tokBegin != -1);
		const UChar* tokChars = unicodeStr.begin() + tokBegin;
		MojSize tokSize = (MojSize) (pos - tokBegin);

		MojDbKey key;
		if (collator) {
			err = collator->sortKey(tokChars, tokSize, key);
			MojErrCheck(err);
		} else {
			MojString tok;
			err = MojDbTextUtils::unicodeToStr(tokChars, tokSize, tok);
			MojErrCheck(err);
			err = key.assign(tok);
			MojErrCheck(err);
		}
		err = keysOut.put(key);
		MojErrCheck(err);
	}
	return MojErrNone;
}
/*
 * Fetch the next token from the tokenizer and copy it into word.
 *
 * Segments that consist of a single punctuation character, or that contain
 * any whitespace, are skipped.  The accepted segment is converted with
 * u_austrncpy() into word (at most size-1 characters, always
 * NUL-terminated) and a debug line describing it is printed to stdout.
 *
 * t     tokenizer state; t->start / t->end are advanced past the token.
 * word  destination buffer, must not be NULL.
 * size  capacity of word in bytes, must be at least 1.
 *
 * Returns 0 when a token was copied, -1 when the iterator is exhausted or
 * when word/size cannot hold even the terminating NUL (in that case the
 * iterator is left untouched).
 */
int tokenizer_next( tokenizer_t *t, char *word, size_t size ) {
  UChar savedEndChar;
  int32_t copyLimit;
  int skip;
  int k;

  // A zero-sized buffer cannot even hold the terminator; size-1 would wrap.
  if( word == NULL || size == 0 ) {
    return -1;
  }

  // start iterator
  if( t->end == 0 ) {
    t->start = ubrk_first(t->boundary);
  }

  // Find next word, skipping punctuation and whitespace segments
  for(;;) {
    t->end = ubrk_next(t->boundary);
    if( t->end == UBRK_DONE ) {
      return -1;
    }

    // Skip punct: a one-character segment that is punctuation
    skip = ( t->end - t->start == 1 && u_ispunct( t->str[t->start] ) );

    // Skip whitespace: any segment containing a whitespace character
    for( k=t->start; !skip && k<t->end; k++ ) {
      if( u_isspace( t->str[k] ) ) {
        skip = 1;
      }
    }

    if( !skip ) {
      break;
    }
    t->start = t->end;
  }

  // u_austrncpy takes an int32_t count; clamp instead of narrowing blindly
  copyLimit = ( size - 1 > (size_t)INT32_MAX ) ? INT32_MAX : (int32_t)( size - 1 );

  // Null terminate the segment in place so the copy stops at its end
  savedEndChar = t->str[t->end];
  t->str[t->end] = 0;

  // Copy to C buffer
  u_austrncpy(word, t->str+t->start, copyLimit);
  word[size-1] = 0;

  printf("string[%2d..%2d] \"%s\" %d\n", t->start, t->end-1, word, u_isspace( t->str[t->start]));

  // Restore the character that was overwritten by the terminator
  t->str[t->end] = savedEndChar;
  t->start = t->end;
  return 0;
}
/* * TestsBreakIteratorStatusVals() Test the ubrk_getRuleStatusVec() funciton */ static void TestBreakIteratorStatusVec() { #define RULE_STRING_LENGTH 200 UChar rules[RULE_STRING_LENGTH]; #define TEST_STRING_LENGTH 25 UChar testString[TEST_STRING_LENGTH]; UBreakIterator *bi = NULL; int32_t pos = 0; int32_t vals[10]; int32_t numVals; UErrorCode status = U_ZERO_ERROR; u_uastrncpy(rules, "[A-N]{100}; \n" "[a-w]{200}; \n" "[\\p{L}]{300}; \n" "[\\p{N}]{400}; \n" "[0-5]{500}; \n" "!.*;\n", RULE_STRING_LENGTH); u_uastrncpy(testString, "ABC", TEST_STRING_LENGTH); bi = ubrk_openRules(rules, -1, testString, -1, NULL, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(bi != NULL); /* The TEST_ASSERT above should change too... */ if (bi != NULL) { pos = ubrk_next(bi); TEST_ASSERT(pos == 1); memset(vals, -1, sizeof(vals)); numVals = ubrk_getRuleStatusVec(bi, vals, 10, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(numVals == 2); TEST_ASSERT(vals[0] == 100); TEST_ASSERT(vals[1] == 300); TEST_ASSERT(vals[2] == -1); numVals = ubrk_getRuleStatusVec(bi, vals, 0, &status); TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR); TEST_ASSERT(numVals == 2); } ubrk_close(bi); }
static void TestBreakIteratorRefresh(void) { /* * RefreshInput changes out the input of a Break Iterator without * changing anything else in the iterator's state. Used with Java JNI, * when Java moves the underlying string storage. This test * runs a ubrk_next() repeatedly, moving the text in the middle of the sequence. * The right set of boundaries should still be found. */ UChar testStr[] = {0x20, 0x41, 0x20, 0x42, 0x20, 0x43, 0x20, 0x44, 0x0}; /* = " A B C D" */ UChar movedStr[] = {0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0}; UErrorCode status = U_ZERO_ERROR; UBreakIterator *bi; UText ut1 = UTEXT_INITIALIZER; UText ut2 = UTEXT_INITIALIZER; bi = ubrk_open(UBRK_LINE, "en_US", NULL, 0, &status); TEST_ASSERT_SUCCESS(status); if (U_FAILURE(status)) { return; } utext_openUChars(&ut1, testStr, -1, &status); TEST_ASSERT_SUCCESS(status); ubrk_setUText(bi, &ut1, &status); TEST_ASSERT_SUCCESS(status); if (U_SUCCESS(status)) { /* Line boundaries will occur before each letter in the original string */ TEST_ASSERT(1 == ubrk_next(bi)); TEST_ASSERT(3 == ubrk_next(bi)); /* Move the string, kill the original string. */ u_strcpy(movedStr, testStr); u_memset(testStr, 0x20, u_strlen(testStr)); utext_openUChars(&ut2, movedStr, -1, &status); TEST_ASSERT_SUCCESS(status); ubrk_refreshUText(bi, &ut2, &status); TEST_ASSERT_SUCCESS(status); /* Find the following matches, now working in the moved string. */ TEST_ASSERT(5 == ubrk_next(bi)); TEST_ASSERT(7 == ubrk_next(bi)); TEST_ASSERT(8 == ubrk_next(bi)); TEST_ASSERT(UBRK_DONE == ubrk_next(bi)); TEST_ASSERT_SUCCESS(status); utext_close(&ut1); utext_close(&ut2); } ubrk_close(bi); }
/*
 * NIF: count the elements of a binary.
 *
 * argv[0] is an iterator resource, argv[1] the binary to scan.  The binary
 * is attached to the iterator obtained from the resource and every step of
 * ubrk_next() that is_valid_elem() accepts is counted.
 */
ERL_NIF_TERM len(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
    ErlNifBinary in;
    cloner* ptr;
    UBreakIterator* iter;
    int count = 0;
    UErrorCode status = U_ZERO_ERROR;

    if (argc != 2) {
        return enif_make_badarg(env);
    }

    /* Last argument must be a binary */
    if (!enif_inspect_binary(env, argv[1], &in)
        || !enif_get_resource(env, argv[0], iterator_type, (void**) &ptr)) {
        return enif_make_badarg(env);
    }

    iter = (UBreakIterator*) cloner_get(ptr);
    CHECK_RES(env, iter);
    if (iter == NULL) {
        return enif_make_badarg(env);
    }

    /* Do count */
    ubrk_setText(iter, (UChar *) in.data, TO_ULEN(in.size), &status);
    CHECK(env, status);

    if (ubrk_first(iter) != UBRK_DONE) {
        while (ubrk_next(iter) != UBRK_DONE) {
            if (is_valid_elem(ptr, iter)) {
                count++;
            }
        }
    }

    return enif_make_int(env, count);
}
/* {{{ grapheme_count_graphemes */
/*
 * Attach string (string_len UChars) to bi and return how many times
 * ubrk_next() advances before reporting UBRK_DONE.
 */
int32_t grapheme_count_graphemes(UBreakIterator *bi, UChar *string, int32_t string_len)
{
	int count = 0;
	UErrorCode status = U_ZERO_ERROR;

	ubrk_setText(bi, string, string_len, &status);

	while (ubrk_next(bi) != UBRK_DONE) {
		count++;
	}

	return count;
}
/*
 * Count the segments of ustr as seen by ubrk.
 *
 * The text is attached to the iterator, every boundary after the first one
 * is counted, and the text is unbound again before returning.  Returns -1
 * if the text could not be attached.
 */
int32_t grapheme_count(UBreakIterator *ubrk, const UString *ustr)
{
    UErrorCode status = U_ZERO_ERROR;
    int32_t count = 0;
    int32_t boundary;

    ubrk_setText(ubrk, ustr->ptr, ustr->len, &status);
    if (U_FAILURE(status)) {
        return -1;
    }

    boundary = ubrk_first(ubrk);
    if (boundary != UBRK_DONE) {
        for (boundary = ubrk_next(ubrk); boundary != UBRK_DONE; boundary = ubrk_next(ubrk)) {
            count++;
        }
    }

    ubrk_unbindText(ubrk);

    return count;
}
// BreakIterator.index {{{ static PyObject * icu_BreakIterator_index(icu_BreakIterator *self, PyObject *args, PyObject *kwargs) { #if PY_VERSION_HEX >= 0x03030000 #error Not implemented for python >= 3.3 #endif UChar *buf = NULL; int32_t prev = 0, p = 0, sz = 0, tsz = 0, ans = -1; PyObject *token = NULL; if (!PyArg_ParseTuple(args, "O", &token)) return NULL; buf = python_to_icu(token, &sz, 1); if (buf == NULL) return NULL; if (sz < 1) goto end; Py_BEGIN_ALLOW_THREADS; p = ubrk_first(self->break_iterator); while (p != UBRK_DONE) { prev = p; p = ubrk_next(self->break_iterator); if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) continue; // We are not at the start of a word tsz = (p == UBRK_DONE) ? self->text_len - prev : p - prev; if (sz == tsz && memcmp(self->text + prev, buf, sz * sizeof(UChar)) == 0) { #ifdef PY_UNICODE_WIDE ans = u_countChar32(self->text, prev); #else ans = prev; #endif break; } } Py_END_ALLOW_THREADS; end: free(buf); return Py_BuildValue("i", ans); } // }}}
// BreakIterator.split2 {{{ static PyObject * icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args, PyObject *kwargs) { #if PY_VERSION_HEX >= 0x03030000 #error Not implemented for python >= 3.3 #endif int32_t prev = 0, p = 0, sz = 0; PyObject *ans = NULL, *temp = NULL; ans = PyList_New(0); if (ans == NULL) return PyErr_NoMemory(); p = ubrk_first(self->break_iterator); while (p != UBRK_DONE) { prev = p; p = ubrk_next(self->break_iterator); if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) continue; // We are not at the start of a word sz = (p == UBRK_DONE) ? self->text_len - prev : p - prev; if (sz > 0) { #ifdef Py_UNICODE_WIDE sz = u_countChar32(self->text + prev, sz); prev = u_countChar32(self->text, prev); #endif temp = Py_BuildValue("II", prev, sz); if (temp == NULL) { Py_DECREF(ans); ans = NULL; break; } if (PyList_Append(ans, temp) != 0) { Py_DECREF(temp); Py_DECREF(ans); ans = NULL; break; } Py_DECREF(temp); } } return ans; } // }}}
/* {{{ grapheme_extract_count_iter - grapheme iterator for grapheme_extract COUNT */
/*
 * Step bi forward until size steps were taken or the iterator is exhausted,
 * then translate the boundary reached into a byte offset in the UTF-8
 * buffer pstr.
 */
static inline int32_t grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
{
	int boundary = 0;
	int byte_off = 0;

	for (; size != 0; size--) {
		int next_boundary = ubrk_next(bi);

		if (next_boundary == UBRK_DONE) {
			break;
		}
		boundary = next_boundary;
	}

	/* boundary is one past the last UChar - and represents the number of
	   code points to advance in the utf-8 buffer */
	U8_FWD_N(pstr, byte_off, str_len, boundary);

	return byte_off;
}
/*
** Extract the next token from a tokenization cursor.
**
** The cursor's break iterator is advanced until a segment is found that is
** not empty once its leading whitespace has been skipped.  That segment is
** converted from the cursor's UTF-16 buffer (aChar) to UTF-8 in zBuffer,
** which is grown as needed.
**
** Returns SQLITE_OK and fills the output parameters when a token was found,
** SQLITE_DONE when the iterator is exhausted, or SQLITE_NOMEM if the output
** buffer could not be enlarged.
*/
static int icuNext(
  sqlite3_tokenizer_cursor *pCursor,  /* Tokenizer cursor (an IcuCursor) */
  const char **ppToken,               /* OUT: *ppToken is the token text */
  int *pnBytes,                       /* OUT: Number of bytes in token */
  int *piStartOffset,                 /* OUT: Starting offset of token */
  int *piEndOffset,                   /* OUT: Ending offset of token */
  int *piPosition                     /* OUT: Position integer of token */
){
  IcuCursor *pCsr = (IcuCursor *)pCursor;

  int iStart = 0;
  int iEnd = 0;
  int nByte = 0;

  while( iStart==iEnd ){
    UChar32 c;

    iStart = ubrk_current(pCsr->pIter);
    iEnd = ubrk_next(pCsr->pIter);
    if( iEnd==UBRK_DONE ){
      return SQLITE_DONE;
    }

    /* Skip leading whitespace.  aChar holds UTF-16 code units (it is the
    ** input of u_strToUTF8() below), so it must be decoded with U16_NEXT;
    ** the UTF-8 macro would misread every code unit above 0x7F.
    */
    while( iStart<iEnd ){
      int iWhite = iStart;
      U16_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
      if( u_isspace(c) ){
        iStart = iWhite;
      }else{
        break;
      }
    }
    assert(iStart<=iEnd);
  }

  /* Convert the token.  On the first pass nByte is zero and the existing
  ** buffer is tried; if the required size exceeds the buffer, it is
  ** reallocated to that size and the conversion is repeated.
  */
  do {
    UErrorCode status = U_ZERO_ERROR;
    if( nByte ){
      char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
      if( !zNew ){
        return SQLITE_NOMEM;
      }
      pCsr->zBuffer = zNew;
      pCsr->nBuffer = nByte;
    }

    u_strToUTF8(
        pCsr->zBuffer, pCsr->nBuffer, &nByte,    /* Output vars */
        &pCsr->aChar[iStart], iEnd-iStart,       /* Input vars */
        &status                                  /* Output success/failure */
    );
  } while( nByte>pCsr->nBuffer );

  *ppToken = pCsr->zBuffer;
  *pnBytes = nByte;
  *piStartOffset = pCsr->aOffset[iStart];
  *piEndOffset = pCsr->aOffset[iEnd];
  *piPosition = pCsr->iToken++;

  return SQLITE_OK;
}
/*
 * Internal titlecasing function.
 *
 * Must get titleIter!=NULL.
 *
 * Walks src from one titleIter boundary to the next and, for each segment,
 * copies leading uncased characters, titlecases the first cased character
 * and lowercases the rest.
 *
 * csp           case properties passed to ucase_getType()/ucase_toFullTitle()
 * dest          output buffer
 * destCapacity  capacity of dest in UChars; output that does not fit is not
 *               written, but it is still counted
 * src           input text
 * csc           case context; cpStart/cpLimit are set to the titlecased
 *               character before it is mapped
 * srcLength     length of src in UChars
 * titleIter     break iterator that supplies the segment boundaries
 * locale, locCache  passed through to the case mapping functions
 * pErrorCode    set to U_BUFFER_OVERFLOW_ERROR if the result does not fit
 *
 * Returns the length of the complete result, which may be larger than
 * destCapacity.
 */
static int32_t
_toTitle(const UCaseProps *csp,
         UChar *dest, int32_t destCapacity,
         const UChar *src, UCaseContext *csc,
         int32_t srcLength,
         UBreakIterator *titleIter,
         const char *locale, int32_t *locCache,
         UErrorCode *pErrorCode) {
    const UChar *s;
    UChar32 c;
    int32_t prev, titleStart, titleLimit, index, destIndex, length;
    UBool isFirstIndex;

    /* set up local variables */
    destIndex=0;
    prev=0;
    isFirstIndex=TRUE;

    /* titlecasing loop */
    while(prev<srcLength) {
        /* find next index where to titlecase */
        if(isFirstIndex) {
            isFirstIndex=FALSE;
            index=ubrk_first(titleIter);
        } else {
            index=ubrk_next(titleIter);
        }
        /* an exhausted or out-of-range iterator ends the last segment at srcLength */
        if(index==UBRK_DONE || index>srcLength) {
            index=srcLength;
        }

        /*
         * Unicode 4 & 5 section 3.13 Default Case Operations:
         *
         * R3  toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
         * #29, "Text Boundaries." Between each pair of word boundaries, find the first
         * cased character F. If F exists, map F to default_title(F); then map each
         * subsequent character C to default_lower(C).
         *
         * In this implementation, segment [prev..index[ into 3 parts:
         * a) uncased characters (copy as-is) [prev..titleStart[
         * b) first case letter (titlecase)         [titleStart..titleLimit[
         * c) subsequent characters (lowercase)                 [titleLimit..index[
         */
        if(prev<index) {
            /* find and copy uncased characters [prev..titleStart[ */
            titleStart=titleLimit=prev;
            for(;;) {
                /* read one code point; titleLimit moves past it */
                U16_NEXT(src, titleLimit, srcLength, c);
                if(UCASE_NONE!=ucase_getType(csp, c)) {
                    break; /* cased letter at [titleStart..titleLimit[ */
                }
                titleStart=titleLimit;
                if(titleLimit==index) {
                    /*
                     * only uncased characters in [prev..index[
                     * stop with titleStart==titleLimit==index
                     */
                    break;
                }
            }
            length=titleStart-prev;
            if(length>0) {
                /* write only if it fits, but always advance destIndex */
                if((destIndex+length)<=destCapacity) {
                    uprv_memcpy(dest+destIndex, src+prev, length*U_SIZEOF_UCHAR);
                }
                destIndex+=length;
            }

            if(titleStart<titleLimit) {
                /* titlecase c which is from [titleStart..titleLimit[ */
                csc->cpStart=titleStart;
                csc->cpLimit=titleLimit;
                c=ucase_toFullTitle(csp, c, utf16_caseContextIterator, csc, &s, locale, locCache);
                destIndex=appendResult(dest, destIndex, destCapacity, c, s);

                /* lowercase [titleLimit..index[ */
                if(titleLimit<index) {
                    destIndex+=
                        _caseMap(
                            csp, ucase_toFullLower,
                            dest+destIndex, destCapacity-destIndex,
                            src, csc,
                            titleLimit, index,
                            locale, locCache,
                            pErrorCode);
                }
            }
        }

        prev=index;
    }

    /* destIndex counted everything, including what did not fit */
    if(destIndex>destCapacity) {
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    }
    return destIndex;
}
/*
 * Internal titlecasing function.
 *
 * Walks src from one word boundary to the next and, for each segment,
 * titlecases one character and lowercases (or copies) the rest, as
 * selected by csm->options.
 *
 * csm           case map; supplies the case properties, locale, options and
 *               the break iterator.  csm->iter is reused when present,
 *               otherwise a word iterator is opened and stored in it.
 * dest          output buffer
 * destCapacity  capacity of dest in UChars; output that does not fit is not
 *               written, but it is still counted
 * src           input text
 * csc           case context; cpStart/cpLimit are set to the titlecased
 *               character before it is mapped
 * srcLength     length of src in UChars
 * pErrorCode    receives iterator setup errors and is set to
 *               U_BUFFER_OVERFLOW_ERROR if the result does not fit
 *
 * Returns the length of the complete result, which may be larger than
 * destCapacity, or 0 if the iterator could not be set up.
 */
static int32_t
_toTitle(UCaseMap *csm,
         UChar *dest, int32_t destCapacity,
         const UChar *src, UCaseContext *csc,
         int32_t srcLength,
         UErrorCode *pErrorCode) {
    const UChar *s;
    UChar32 c;
    int32_t prev, titleStart, titleLimit, idx, destIndex, length;
    UBool isFirstIndex;

    /* attach the text to the existing iterator, or open one over it */
    if(csm->iter!=NULL) {
        ubrk_setText(csm->iter, src, srcLength, pErrorCode);
    } else {
        csm->iter=ubrk_open(UBRK_WORD, csm->locale,
                            src, srcLength,
                            pErrorCode);
    }
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }

    /* set up local variables */
    destIndex=0;
    prev=0;
    isFirstIndex=TRUE;

    /* titlecasing loop */
    while(prev<srcLength) {
        /* find next index where to titlecase */
        if(isFirstIndex) {
            isFirstIndex=FALSE;
            idx=ubrk_first(csm->iter);
        } else {
            idx=ubrk_next(csm->iter);
        }
        /* an exhausted or out-of-range iterator ends the last segment at srcLength */
        if(idx==UBRK_DONE || idx>srcLength) {
            idx=srcLength;
        }

        /*
         * Unicode 4 & 5 section 3.13 Default Case Operations:
         *
         * R3  toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
         * #29, "Text Boundaries." Between each pair of word boundaries, find the first
         * cased character F. If F exists, map F to default_title(F); then map each
         * subsequent character C to default_lower(C).
         *
         * In this implementation, segment [prev..index[ into 3 parts:
         * a) uncased characters (copy as-is) [prev..titleStart[
         * b) first case letter (titlecase)         [titleStart..titleLimit[
         * c) subsequent characters (lowercase)                 [titleLimit..index[
         */
        if(prev<idx) {
            /* find and copy uncased characters [prev..titleStart[ */
            titleStart=titleLimit=prev;
            /* read the first code point of the segment; titleLimit moves past it */
            U16_NEXT(src, titleLimit, idx, c);
            if((csm->options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) {
                /* Adjust the titlecasing index (titleStart) to the next cased character. */
                for(;;) {
                    titleStart=titleLimit;
                    if(titleLimit==idx) {
                        /*
                         * only uncased characters in [prev..index[
                         * stop with titleStart==titleLimit==index
                         */
                        break;
                    }
                    U16_NEXT(src, titleLimit, idx, c);
                    if(UCASE_NONE!=ucase_getType(csm->csp, c)) {
                        break; /* cased letter at [titleStart..titleLimit[ */
                    }
                }
                length=titleStart-prev;
                if(length>0) {
                    /* write only if it fits, but always advance destIndex */
                    if((destIndex+length)<=destCapacity) {
                        uprv_memcpy(dest+destIndex, src+prev, length*U_SIZEOF_UCHAR);
                    }
                    destIndex+=length;
                }
            }

            if(titleStart<titleLimit) {
                /* titlecase c which is from [titleStart..titleLimit[ */
                csc->cpStart=titleStart;
                csc->cpLimit=titleLimit;
                c=ucase_toFullTitle(csm->csp, c, utf16_caseContextIterator, csc, &s, csm->locale, &csm->locCache);
                destIndex=appendResult(dest, destIndex, destCapacity, c, s);

                /* Special case Dutch IJ titlecasing */
                /* 0x49/0x69 are 'I'/'i' and 0x4A/0x6A are 'J'/'j': an I directly
                 * followed by a J gets an uppercase J appended, and the J is
                 * consumed by advancing titleLimit */
                if ( titleStart+1 < idx &&
                     ucase_getCaseLocale(csm->locale,&csm->locCache) == UCASE_LOC_DUTCH &&
                     ( src[titleStart] == (UChar32) 0x0049 || src[titleStart] == (UChar32) 0x0069 ) &&
                     ( src[titleStart+1] == (UChar32) 0x004A || src[titleStart+1] == (UChar32) 0x006A )) {
                            c=(UChar32) 0x004A;
                            destIndex=appendResult(dest, destIndex, destCapacity, c, s);
                            titleLimit++;
                }

                /* lowercase [titleLimit..index[ */
                if(titleLimit<idx) {
                    if((csm->options&U_TITLECASE_NO_LOWERCASE)==0) {
                        /* Normal operation: Lowercase the rest of the word. */
                        destIndex+=
                            _caseMap(
                                csm, ucase_toFullLower,
                                dest+destIndex, destCapacity-destIndex,
                                src, csc,
                                titleLimit, idx,
                                pErrorCode);
                    } else {
                        /* Optionally just copy the rest of the word unchanged. */
                        length=idx-titleLimit;
                        if((destIndex+length)<=destCapacity) {
                            uprv_memcpy(dest+destIndex, src+titleLimit, length*U_SIZEOF_UCHAR);
                        }
                        destIndex+=length;
                    }
                }
            }
        }

        prev=idx;
    }

    /* destIndex counted everything, including what did not fit */
    if(destIndex>destCapacity) {
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    }
    return destIndex;
}
// BreakIterator.split2 {{{
/*
 * Build a list of (position, size) tuples, one per word found in
 * self->text, recombining words that the iterator split at a hyphen.
 *
 * For word iterators, segments whose rule status is UBRK_WORD_NONE are
 * skipped. A word that starts exactly one character after the end of the
 * previous word, with a hyphen in between, is merged into the previous
 * tuple by replacing that tuple's size. Otherwise a new tuple is appended
 * whose position and size are widened to include an adjacent leading
 * and/or trailing hyphen.
 *
 * When Py_UNICODE_WIDE is defined, positions and sizes are converted with
 * u_countChar32() before being stored.
 *
 * Returns the list, or NULL on error.
 */
static PyObject *
icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args) {
#if PY_VERSION_HEX >= 0x03030000
#error Not implemented for python >= 3.3
#endif

    // last_pos: boundary at which the previously accepted word ended
    // last_sz: size stored in the most recently written tuple
    int32_t word_start = 0, p = 0, sz = 0, last_pos = 0, last_sz = 0;
    int is_hyphen_sep = 0, leading_hyphen = 0, trailing_hyphen = 0;
    UChar sep = 0;
    PyObject *ans = NULL, *temp = NULL, *t = NULL;

    ans = PyList_New(0);
    if (ans == NULL) return PyErr_NoMemory();

    p = ubrk_first(self->break_iterator);
    while (p != UBRK_DONE) {
        // word_start is the boundary we were on, p the one after it
        word_start = p; p = ubrk_next(self->break_iterator);
        if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE)
            continue;  // We are not at the start of a word
        // The final segment runs to the end of the text
        sz = (p == UBRK_DONE) ? self->text_len - word_start : p - word_start;
        if (sz > 0) {
            // ICU breaks on words containing hyphens, we do not want that, so we recombine manually
            is_hyphen_sep = 0; leading_hyphen = 0; trailing_hyphen = 0;
            if (word_start > 0) { // Look for a leading hyphen
                sep = *(self->text + word_start - 1);
                if (IS_HYPHEN_CHAR(sep)) {
                    leading_hyphen = 1;
                    // The hyphen is the only character between the previous word and this one
                    if (last_pos > 0 && word_start - last_pos == 1) is_hyphen_sep = 1;
                }
            }
            if (word_start + sz < self->text_len) { // Look for a trailing hyphen
                sep = *(self->text + word_start + sz);
                if (IS_HYPHEN_CHAR(sep)) trailing_hyphen = 1;
            }
            last_pos = p;
#ifdef Py_UNICODE_WIDE
            // The size must be converted first, it needs the unconverted word_start
            sz = u_countChar32(self->text + word_start, sz);
            word_start = u_countChar32(self->text, word_start);
#endif
            if (is_hyphen_sep && PyList_GET_SIZE(ans) > 0) {
                // Extend the previous tuple instead of appending a new one
                sz = last_sz + sz + trailing_hyphen;
                last_sz = sz;
                t = PyInt_FromLong((long)sz);
                if (t == NULL) { Py_DECREF(ans); ans = NULL; break; }
                // temp is borrowed from the list. The old size object is
                // released by hand before t is stored in its slot.
                temp = PyList_GET_ITEM(ans, PyList_GET_SIZE(ans) - 1);
                Py_DECREF(PyTuple_GET_ITEM(temp, 1));
                PyTuple_SET_ITEM(temp, 1, t);
            } else {
                // New entry, widened to cover adjacent hyphens
                sz += leading_hyphen + trailing_hyphen;
                last_sz = sz;
                temp = Py_BuildValue("ll", (long)(word_start - leading_hyphen), (long)sz);
                if (temp == NULL) { Py_DECREF(ans); ans = NULL; break; }
                if (PyList_Append(ans, temp) != 0) { Py_DECREF(temp); Py_DECREF(ans); ans = NULL; break; }
                Py_DECREF(temp);  // The list holds its own reference
            }
        }
    }

    return ans;
} // }}}
// Advances |iterator| to its next boundary and returns the value reported
// by ubrk_next(). TextBreakIterator is treated as an ICU UBreakIterator.
int textBreakNext(TextBreakIterator* iterator)
{
    UBreakIterator* icuIterator = reinterpret_cast<UBreakIterator*>(iterator);
    return ubrk_next(icuIterator);
}
/*
 * Print the first element: the part of str between the iterator's first
 * boundary and the boundary that follows it.
 */
void printFirst(UBreakIterator* boundary, UChar* str) {
    const int32_t firstBoundary = ubrk_first(boundary);
    const int32_t secondBoundary = ubrk_next(boundary);

    printTextRange(str, firstBoundary, secondBoundary);
}