/// Shim around ICU's ubrk_open(): converts the Swift stdlib mirror types to
/// the corresponding ICU types, forwards the call, and converts the result
/// back to the mirror iterator type.
swift::__swift_stdlib_UBreakIterator *swift::__swift_stdlib_ubrk_open(
    swift::__swift_stdlib_UBreakIteratorType type, const char *locale,
    const uint16_t *text, int32_t textLength,
    __swift_stdlib_UErrorCode *status) {
#if defined(__CYGWIN__) || defined( _MSC_VER) || defined(__linux__)
  // On these platforms the text pointer is explicitly reinterpreted as UChar.
  const UChar *icuText = reinterpret_cast<const UChar *>(text);
#else
  // Elsewhere the pointer is handed to ICU unchanged.
  const uint16_t *icuText = text;
#endif
  const UBreakIteratorType icuType = static_cast<UBreakIteratorType>(type);
  auto *opened = ubrk_open(icuType, locale, icuText, textLength,
                           ptr_cast<UErrorCode>(status));
  return ptr_cast<swift::__swift_stdlib_UBreakIterator>(opened);
}
int c_main( void ) { UBreakIterator *boundary; char cStringToExamine[] = "Aaa bbb ccc. Ddd eee fff."; UChar stringToExamine[sizeof(cStringToExamine)+1]; UErrorCode status = U_ZERO_ERROR; printf("\n\n" "C Boundary Analysis\n" "-------------------\n\n"); printf("Examining: %s\n", cStringToExamine); u_uastrcpy(stringToExamine, cStringToExamine); /*print each sentence in forward and reverse order*/ boundary = ubrk_open(UBRK_SENTENCE, "en_us", stringToExamine, -1, &status); if (U_FAILURE(status)) { printf("ubrk_open error: %s\n", u_errorName(status)); exit(1); } printf("\n----- Sentence Boundaries, forward: -----------\n"); printEachForward(boundary, stringToExamine); printf("\n----- Sentence Boundaries, backward: ----------\n"); printEachBackward(boundary, stringToExamine); ubrk_close(boundary); /*print each word in order*/ boundary = ubrk_open(UBRK_WORD, "en_us", stringToExamine, u_strlen(stringToExamine), &status); printf("\n----- Word Boundaries, forward: -----------\n"); printEachForward(boundary, stringToExamine); printf("\n----- Word Boundaries, backward: ----------\n"); printEachBackward(boundary, stringToExamine); /*print first element*/ printf("\n----- first: -------------\n"); printFirst(boundary, stringToExamine); /*print last element*/ printf("\n----- last: --------------\n"); printLast(boundary, stringToExamine); /*print word at charpos 10 */ printf("\n----- at pos 10: ---------\n"); printAt(boundary, 10 , stringToExamine); ubrk_close(boundary); printf("\nEnd of C boundary analysis\n"); return 0; }
static void *engine_fixed_compile(error_t **error, UString *ustr, uint32_t flags) { UErrorCode status; fixed_pattern_t *p; p = mem_new(*p); p->pattern = ustr; // not needed with usearch ? p->flags = flags; p->ubrk = NULL; p->usearch = NULL; status = U_ZERO_ERROR; if (ustring_empty(ustr)) { if (IS_WORD_BOUNDED(flags)) { p->ubrk = ubrk_open(UBRK_WORD, NULL, NULL, 0, &status); } } else { if (!IS_WHOLE_LINE(flags)) { if (IS_WORD_BOUNDED(flags)) { p->ubrk = ubrk_open(UBRK_WORD, NULL, NULL, 0, &status); } else if (WITH_GRAPHEME()) { p->ubrk = ubrk_open(UBRK_CHARACTER, NULL, NULL, 0, &status); } if (U_FAILURE(status)) { fixed_pattern_destroy(p); icu_error_set(error, FATAL, status, "ubrk_open"); return NULL; } } if (IS_WORD_BOUNDED(flags) || (IS_CASE_INSENSITIVE(flags) && !IS_WHOLE_LINE(flags))) { p->usearch = usearch_open(ustr->ptr, ustr->len, USEARCH_FAKE_USTR, uloc_getDefault(), p->ubrk, &status); if (U_FAILURE(status)) { if (NULL != p->ubrk) { ubrk_close(p->ubrk); } fixed_pattern_destroy(p); icu_error_set(error, FATAL, status, "usearch_open"); return NULL; } if (IS_CASE_INSENSITIVE(flags)) { UCollator *ucol; ucol = usearch_getCollator(p->usearch); ucol_setStrength(ucol, (flags & ~OPT_MASK) > 1 ? UCOL_SECONDARY : UCOL_PRIMARY); } } } return p; }
// Opens a break iterator of the requested type for `locale` (the current
// text-break locale by default) with no text attached yet. A failed open
// trips the assertion; the value returned by ubrk_open() is passed through
// to the caller either way.
static UBreakIterator* initializeIterator(UBreakIteratorType type, const char* locale = currentTextBreakLocaleID())
{
    UErrorCode openStatus = U_ZERO_ERROR;
    UBreakIterator* const opened = ubrk_open(type, locale, 0, 0, &openStatus);

    ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);

    return opened;
}
void Target::setTargetString(const UnicodeString *target) { if (charBreakIterator != NULL) { ubrk_close(charBreakIterator); ucol_closeElements(elements); } targetString = target; if (targetString != NULL) { UErrorCode status = U_ZERO_ERROR; targetBuffer = targetString->getBuffer(); targetLength = targetString->length(); elements = ucol_openElements(coll, target->getBuffer(), target->length(), &status); ucol_forceHanImplicit(elements, &status); charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status), targetBuffer, targetLength, &status); } else { targetBuffer = NULL; targetLength = 0; } }
std::vector<lstring> convert_split_words(const lstring <) { std::vector<lstring> ret; UBreakIterator* bi; int prev = -1, pos; UErrorCode err = U_ZERO_ERROR; bi = ubrk_open(UBRK_WORD, get_locale(), (UChar *)lt.data(), lt.size(), &err); if (U_FAILURE(err)) return ret; pos = ubrk_first(bi); while (pos != UBRK_DONE) { int rules = ubrk_getRuleStatus(bi); if ((rules == UBRK_WORD_NONE) || (prev == -1)) { prev = pos; } else { ret.emplace_back(lt.substr(prev, pos - prev)); prev = -1; } pos = ubrk_next(bi); } ubrk_close(bi); return ret; }
// Opens an ICU break iterator of the given type for the Java locale string
// and returns its address as an integer handle.
//
// Returns 0 if the locale string cannot be converted to UTF chars. ICU open
// errors are reported through icu4jni_error().
static jint getIterator(JNIEnv* env, jstring locale, UBreakIteratorType type) {
    const char* localeChars = env->GetStringUTFChars(locale, NULL);
    if (localeChars == NULL) {
        // GetStringUTFChars failed; do not hand a NULL locale to ICU or
        // release a buffer that was never obtained.
        return 0;
    }

    UErrorCode status = U_ZERO_ERROR;
    UBreakIterator* it = ubrk_open(type, localeChars, NULL, 0, &status);
    env->ReleaseStringUTFChars(locale, localeChars);
    icu4jni_error(env, status);
    return reinterpret_cast<uintptr_t>(it);
}
static void *engine_bin_compile(error_t **error, UString *ustr, uint32_t flags) { UErrorCode status; bin_pattern_t *p; p = mem_new(*p); p->pattern = ustr; p->flags = flags; p->ubrk = NULL; p->tmp = NULL; status = U_ZERO_ERROR; if (ustring_empty(ustr)) { if (IS_WORD_BOUNDED(flags)) { p->ubrk = ubrk_open(UBRK_WORD, NULL, NULL, 0, &status); } } else { /** * Whole line matches are simplified: * 1) we don't need to check graphemes boundaries, * 2) case insensitivity is done directly by the whole_line_match callback **/ if (!IS_WHOLE_LINE(flags)) { if (IS_WORD_BOUNDED(flags)) { p->ubrk = ubrk_open(UBRK_WORD, NULL, NULL, 0, &status); } else if (WITH_GRAPHEME()) { p->ubrk = ubrk_open(UBRK_CHARACTER, NULL, NULL, 0, &status); } if (U_FAILURE(status)) { bin_pattern_destroy(p); icu_error_set(error, FATAL, status, "ubrk_open"); return NULL; } if (IS_CASE_INSENSITIVE(flags)) { p->tmp = ustring_new(); p->pattern = ustring_sized_new(ustr->len); if (!ustring_fullcase(p->pattern, ustr->ptr, ustr->len, UCASE_FOLD, error)) { bin_pattern_destroy(p); return NULL; } ustring_destroy(ustr); /* no more needed, throw (free) it now */ } } } return p; }
/*
 * Version-suffixed entry point that forwards straight to ubrk_open().
 *
 * imp: common/ubrk.cpp
 * hdr: common/unicode/ubrk.h
 * @stable ICU 2.0
 * #if !UCONFIG_NO_BREAK_ITERATION
 * (This is intentionally NOT conditionalized: if the underlying library was
 * built without break iteration, the failure should show up at build time
 * rather than at runtime.)
 */
U_CAPI UBreakIterator* U_EXPORT2
ubrk_open_4_0(UBreakIteratorType type, const char *locale, const UChar *text, int32_t textLength, UErrorCode *status)
{
    UBreakIterator *opened = ubrk_open(type, locale, text, textLength, status);
    return opened;
}
/// Shim around ICU's ubrk_open(): converts each Swift stdlib mirror type to
/// its ICU counterpart, forwards the call, and converts the resulting
/// iterator pointer back to the mirror type.
swift::__swift_stdlib_UBreakIterator *swift::__swift_stdlib_ubrk_open(
    swift::__swift_stdlib_UBreakIteratorType type, const char *locale,
    const __swift_stdlib_UChar *text, int32_t textLength,
    __swift_stdlib_UErrorCode *status) {
  const UBreakIteratorType icuType = static_cast<UBreakIteratorType>(type);
  const UChar *icuText = reinterpret_cast<const UChar *>(text);

  auto *opened = ubrk_open(icuType, locale, icuText, textLength,
                           ptr_cast<UErrorCode>(status));
  return ptr_cast<swift::__swift_stdlib_UBreakIterator>(opened);
}
/*
 * Initialize tokenizer `t` over the NUL-terminated string `input`.
 *
 * The structure is zeroed, `input` is copied into a newly allocated UChar
 * buffer (t->str), and a word break iterator for "en_us" is opened over
 * that buffer (t->boundary).
 *
 * If the buffer cannot be allocated, `t` is left zeroed.
 */
void tokenizer_init( tokenizer_t *t, const char *input )
{
    UErrorCode status = U_ZERO_ERROR;
    size_t len;

    memset( t, 0, sizeof(*t));

    len = strlen(input);
    /* +1: u_uastrcpy() also stores the terminating NUL, so the buffer must
     * hold one more code unit than strlen() reports. */
    t->str = malloc( sizeof(UChar) * (len + 1) );
    if( t->str == NULL ) {
        return;
    }

    u_uastrcpy(t->str, input);
    t->boundary = ubrk_open(UBRK_WORD, "en_us", t->str, -1, &status );
}
// Opens a line break iterator (no text attached) for `locale`.
//
// An empty locale means "use the current text-break locale". A non-empty
// locale that ICU rejects is retried once with the current text-break
// locale. Returns nullptr, after logging, if the final attempt fails.
UBreakIterator* openLineBreakIterator(const AtomicString& locale)
{
    UErrorCode openStatus = U_ZERO_ERROR;
    UBreakIterator* ubrkIter;

    if (locale.isEmpty()) {
        ubrkIter = ubrk_open(UBRK_LINE, currentTextBreakLocaleID(), 0, 0, &openStatus);
    } else {
        ubrkIter = ubrk_open(UBRK_LINE, locale.string().utf8().data(), 0, 0, &openStatus);
        // The locale originates from web content and may be invalid, which
        // makes ICU fail; in that case use the default locale instead.
        if (U_FAILURE(openStatus)) {
            openStatus = U_ZERO_ERROR;
            ubrkIter = ubrk_open(UBRK_LINE, currentTextBreakLocaleID(), 0, 0, &openStatus);
        }
    }

    if (U_FAILURE(openStatus)) {
        LOG_ERROR("ubrk_open failed with status %d", openStatus);
        return nullptr;
    }

    return ubrkIter;
}
// Opens an ICU break iterator of the given type for the Java locale string
// and returns its address as an integer handle. Returns 0 when the locale
// characters are unavailable; ICU errors go through icu4jni_error().
static jint getIterator(JNIEnv* env, jstring locale, UBreakIteratorType type) {
    ScopedUtfChars localeChars(env, locale);
    const char* const localeId = localeChars.c_str();
    if (NULL == localeId) {
        return 0;
    }

    UErrorCode status = U_ZERO_ERROR;
    UBreakIterator* const opened = ubrk_open(type, localeId, NULL, 0, &status);
    icu4jni_error(env, status);

    return reinterpret_cast<uintptr_t>(opened);
}
// Creates the word break iterator for `locale` and stores it in m_ubrk.
// The ICU status is checked with MojUnicodeErrCheck; MojErrNone is returned
// when everything succeeded.
MojErr MojDbTextTokenizer::init(const MojChar* locale)
{
    LOG_TRACE("Entering function %s", __FUNCTION__);
    MojAssert(locale);

    UErrorCode status = U_ZERO_ERROR;
    UBreakIterator* wordIterator = ubrk_open(UBRK_WORD, locale, NULL, 0, &status);
    m_ubrk.reset(wordIterator);

    MojUnicodeErrCheck(status);
    MojAssert(m_ubrk.get());

    return MojErrNone;
}
static void TestBreakIteratorRefresh(void) { /* * RefreshInput changes out the input of a Break Iterator without * changing anything else in the iterator's state. Used with Java JNI, * when Java moves the underlying string storage. This test * runs a ubrk_next() repeatedly, moving the text in the middle of the sequence. * The right set of boundaries should still be found. */ UChar testStr[] = {0x20, 0x41, 0x20, 0x42, 0x20, 0x43, 0x20, 0x44, 0x0}; /* = " A B C D" */ UChar movedStr[] = {0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0}; UErrorCode status = U_ZERO_ERROR; UBreakIterator *bi; UText ut1 = UTEXT_INITIALIZER; UText ut2 = UTEXT_INITIALIZER; bi = ubrk_open(UBRK_LINE, "en_US", NULL, 0, &status); TEST_ASSERT_SUCCESS(status); if (U_FAILURE(status)) { return; } utext_openUChars(&ut1, testStr, -1, &status); TEST_ASSERT_SUCCESS(status); ubrk_setUText(bi, &ut1, &status); TEST_ASSERT_SUCCESS(status); if (U_SUCCESS(status)) { /* Line boundaries will occur before each letter in the original string */ TEST_ASSERT(1 == ubrk_next(bi)); TEST_ASSERT(3 == ubrk_next(bi)); /* Move the string, kill the original string. */ u_strcpy(movedStr, testStr); u_memset(testStr, 0x20, u_strlen(testStr)); utext_openUChars(&ut2, movedStr, -1, &status); TEST_ASSERT_SUCCESS(status); ubrk_refreshUText(bi, &ut2, &status); TEST_ASSERT_SUCCESS(status); /* Find the following matches, now working in the moved string. */ TEST_ASSERT(5 == ubrk_next(bi)); TEST_ASSERT(7 == ubrk_next(bi)); TEST_ASSERT(8 == ubrk_next(bi)); TEST_ASSERT(UBRK_DONE == ubrk_next(bi)); TEST_ASSERT_SUCCESS(status); utext_close(&ut1); utext_close(&ut2); } ubrk_close(bi); }
/*
 * Regression test for bug 11665: iterates the word boundaries of a short
 * Japanese string twice with the same iterator, right after resetting ICU,
 * and reports an error if the two passes disagree.
 */
static void TestBug11665(void) {
    // The problem was with the incorrect breaking of Japanese text beginning
    // with Katakana characters when no prior Japanese or Chinese text had been
    // encountered.
    //
    // Tested here in cintltst, rather than in intltest, because only cintltst
    // tests have the ability to reset ICU, which is needed to get the bug
    // to manifest itself.

    static UChar japaneseText[] = {0x30A2, 0x30EC, 0x30EB, 0x30AE, 0x30FC, 0x6027, 0x7D50, 0x819C, 0x708E};
    int32_t boundaries[10] = {0};
    UBreakIterator *bi = NULL;
    int32_t brk;
    int32_t brkIdx = 0;
    int32_t totalBreaks = 0;
    UErrorCode status = U_ZERO_ERROR;

    ctest_resetICU();
    bi = ubrk_open(UBRK_WORD, "en_US", japaneseText, UPRV_LENGTHOF(japaneseText), &status);
    TEST_ASSERT_SUCCESS(status);
    if (!bi) {
        return;
    }
    // First pass: record every boundary, stopping early so that the
    // boundaries[] array can never be overrun.
    for (brk=ubrk_first(bi); brk != UBRK_DONE; brk=ubrk_next(bi)) {
        boundaries[brkIdx] = brk;
        if (++brkIdx >= UPRV_LENGTHOF(boundaries) - 1) {
            break;
        }
    }
    if (brkIdx <= 2 || brkIdx >= UPRV_LENGTHOF(boundaries)) {
        log_err("%s:%d too few or many breaks found.\n", __FILE__, __LINE__);
    } else {
        // Second pass: the same iterator must report the same boundaries,
        // in the same order and in the same number.
        totalBreaks = brkIdx;
        brkIdx = 0;
        for (brk=ubrk_first(bi); brk != UBRK_DONE; brk=ubrk_next(bi)) {
            if (brk != boundaries[brkIdx]) {
                log_err("%s:%d Break #%d differs between first and second iteration.\n", __FILE__, __LINE__, brkIdx);
                break;
            }
            if (++brkIdx >= UPRV_LENGTHOF(boundaries) - 1) {
                log_err("%s:%d Too many breaks.\n", __FILE__, __LINE__);
                break;
            }
        }
        if (totalBreaks != brkIdx) {
            log_err("%s:%d Number of breaks differ between first and second iteration.\n", __FILE__, __LINE__);
        }
    }
    ubrk_close(bi);
}
/*
 * Runs every entry of tailoringTests (terminated by a NULL locale): opens a
 * break iterator of the entry's type and locale over its test text, walks it
 * forward with ubrk_next() and then backward with ubrk_previous(), and
 * compares the offsets against the entry's offsFwd / offsRev tables.
 */
static void TestBreakIteratorTailoring(void) {
    const RBBITailoringTest * testPtr;
    for (testPtr = tailoringTests; testPtr->locale != NULL; ++testPtr) {
        UErrorCode status = U_ZERO_ERROR;
        UBreakIterator* ubrkiter = ubrk_open(testPtr->type, testPtr->locale, testPtr->test, -1, &status);
        if ( U_SUCCESS(status) ) {
            int32_t offset, offsindx;
            UBool foundError;

            /* Forward pass. Only the first mismatch is logged (foundError),
             * but iteration continues until UBRK_DONE so that the backward
             * pass below starts from the end of the text. */
            foundError = FALSE;
            for (offsindx = 0; (offset = ubrk_next(ubrkiter)) != UBRK_DONE; ++offsindx) {
                if (!foundError && offsindx >= testPtr->numOffsets) {
                    log_err("FAIL: locale %s, break type %d, ubrk_next expected UBRK_DONE, got %d\n",
                            testPtr->locale, testPtr->type, offset);
                    foundError = TRUE;
                } else if (!foundError && offset != testPtr->offsFwd[offsindx]) {
                    log_err("FAIL: locale %s, break type %d, ubrk_next expected %d, got %d\n",
                            testPtr->locale, testPtr->type, testPtr->offsFwd[offsindx], offset);
                    foundError = TRUE;
                }
            }
            /* Fewer boundaries than expected. */
            if (!foundError && offsindx < testPtr->numOffsets) {
                log_err("FAIL: locale %s, break type %d, ubrk_next expected %d, got UBRK_DONE\n",
                        testPtr->locale, testPtr->type, testPtr->offsFwd[offsindx]);
            }

            /* Backward pass, checked the same way against offsRev. */
            foundError = FALSE;
            for (offsindx = 0; (offset = ubrk_previous(ubrkiter)) != UBRK_DONE; ++offsindx) {
                if (!foundError && offsindx >= testPtr->numOffsets) {
                    log_err("FAIL: locale %s, break type %d, ubrk_previous expected UBRK_DONE, got %d\n",
                            testPtr->locale, testPtr->type, offset);
                    foundError = TRUE;
                } else if (!foundError && offset != testPtr->offsRev[offsindx]) {
                    log_err("FAIL: locale %s, break type %d, ubrk_previous expected %d, got %d\n",
                            testPtr->locale, testPtr->type, testPtr->offsRev[offsindx], offset);
                    foundError = TRUE;
                }
            }
            if (!foundError && offsindx < testPtr->numOffsets) {
                log_err("FAIL: locale %s, break type %d, ubrk_previous expected %d, got UBRK_DONE\n",
                        testPtr->locale, testPtr->type, testPtr->offsRev[offsindx]);
            }

            ubrk_close(ubrkiter);
        } else {
            log_err_status(status, "FAIL: locale %s, break type %d, ubrk_open status: %s\n", testPtr->locale, testPtr->type, u_errorName(status));
        }
    }
}
/*
 * Titlecases src[0..srcLength[ into dest using the settings in csm.
 *
 * The case map's break iterator is pointed at the source text first; if the
 * case map has none yet, a word break iterator for csm->locale is opened
 * and kept in csm->iter. The mapping itself is done by ustrcase_map().
 */
U_CAPI int32_t U_EXPORT2
ucasemap_toTitle(UCaseMap *csm,
                 UChar *dest, int32_t destCapacity,
                 const UChar *src, int32_t srcLength,
                 UErrorCode *pErrorCode) {
    if(csm->iter==NULL) {
        csm->iter=ubrk_open(UBRK_WORD, csm->locale,
                            src, srcLength,
                            pErrorCode);
    } else {
        ubrk_setText(csm->iter, src, srcLength, pErrorCode);
    }

    const int32_t mappedLength=ustrcase_map(
        csm,
        dest, destCapacity,
        src, srcLength,
        ustrcase_internalToTitle, pErrorCode);
    return mappedLength;
}
/* * static void TestBreakIteratorUText(void); * * Test that ubrk_setUText() is present and works for a simple case. */ static void TestBreakIteratorUText(void) { const char *UTF8Str = "\x41\xc3\x85\x5A\x20\x41\x52\x69\x6E\x67"; /* c3 85 is utf-8 for A with a ring on top */ /* 0 1 2 34567890 */ UErrorCode status = U_ZERO_ERROR; UBreakIterator *bi = NULL; int32_t pos = 0; UText *ut = utext_openUTF8(NULL, UTF8Str, -1, &status); TEST_ASSERT_SUCCESS(status); bi = ubrk_open(UBRK_WORD, "en_US", NULL, 0, &status); if (U_FAILURE(status)) { log_err_status(status, "Failure at file %s, line %d, error = %s\n", __FILE__, __LINE__, u_errorName(status)); return; } ubrk_setUText(bi, ut, &status); if (U_FAILURE(status)) { log_err("Failure at file %s, line %d, error = %s\n", __FILE__, __LINE__, u_errorName(status)); return; } pos = ubrk_first(bi); TEST_ASSERT(pos == 0); pos = ubrk_next(bi); TEST_ASSERT(pos == 4); pos = ubrk_next(bi); TEST_ASSERT(pos == 5); pos = ubrk_next(bi); TEST_ASSERT(pos == 10); pos = ubrk_next(bi); TEST_ASSERT(pos == UBRK_DONE); ubrk_close(bi); utext_close(ut); }
int KWQFindNextWordFromIndex(const QChar *chars, int len, int position, bool forward) { int pos = 0; UErrorCode status = U_ZERO_ERROR; UBreakIterator *boundary = ubrk_open(UBRK_WORD, (const char*)currentTextBreakLocaleID().c_str(), const_cast<UChar *>(reinterpret_cast<const UChar *>(chars)), len, &status); if ( boundary && U_SUCCESS(status) ) { if (forward) { pos = ubrk_following(boundary, position); if (pos == UBRK_DONE) pos = len; } else { pos = ubrk_preceding(boundary, position); if (pos == UBRK_DONE) pos = 0; } ubrk_close(boundary); } return pos; }
/* {{{ grapheme_get_break_iterator: return a clone of the global character
 * break iterator, creating and caching the global one on first use. The
 * clone is made with ubrk_safeClone() using the caller's stack_buffer. */
UBreakIterator* grapheme_get_break_iterator(void *stack_buffer, UErrorCode *status )
{
	int32_t buffer_size = U_BRK_SAFECLONE_BUFFERSIZE;
	UBreakIterator *shared_iterator = INTL_G( grapheme_iterator );

	if ( shared_iterator == NULL ) {
		/* Default ICU locale (the locale has no effect on this iterator);
		 * no text is attached to the global iterator. */
		shared_iterator = ubrk_open(UBRK_CHARACTER, NULL, NULL, 0, status);
		INTL_G(grapheme_iterator) = shared_iterator;
	}

	return ubrk_safeClone(shared_iterator, stack_buffer, &buffer_size, status);
}
void KWQFindSentenceBoundary(const QChar *chars, int len, int position, int *start, int *end) { int startPos = 0; int endPos = 0; UErrorCode status = U_ZERO_ERROR; UBreakIterator *boundary = ubrk_open(UBRK_SENTENCE, (const char*)currentTextBreakLocaleID().c_str(), const_cast<UChar *>(reinterpret_cast<const UChar *>(chars)), len, &status); if ( boundary && U_SUCCESS(status) ) { startPos = ubrk_preceding(boundary, position); if (startPos == UBRK_DONE) { startPos = 0; } endPos = ubrk_following(boundary, startPos); if (endPos == UBRK_DONE) endPos = len; ubrk_close(boundary); } *start = startPos; *end = endPos; }
// Lazily creates (at most once, tracked by `createdIterator`) the break
// iterator stored in `iterator`, then points it at string[0..length).
// Returns the iterator, or 0 if `string` is null, the iterator could not be
// created, or setting the text failed.
static TextBreakIterator* setUpIterator(bool& createdIterator, TextBreakIterator*& iterator,
    UBreakIteratorType type, const UChar* string, int length)
{
    if (!string)
        return 0;

    if (!createdIterator) {
        UErrorCode openStatus = U_ZERO_ERROR;
        UBreakIterator* opened = ubrk_open(type, currentTextBreakLocaleID(), 0, 0, &openStatus);
        iterator = reinterpret_cast<TextBreakIterator*>(opened);
        createdIterator = true;
        ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
    }
    if (!iterator)
        return 0;

    UErrorCode setTextStatus = U_ZERO_ERROR;
    UBreakIterator* icuIterator = reinterpret_cast<UBreakIterator*>(iterator);
    ubrk_setText(icuIterator, string, length, &setTextStatus);

    return U_FAILURE(setTextStatus) ? 0 : iterator;
}
/*
 * Titlecases the UTF-8 text src[0..srcLength[ into dest using csm.
 *
 * The source is wrapped in a stack UText that is attached to the case map's
 * break iterator (a word iterator for csm->locale is opened and cached in
 * csm->iter if there is none yet). Returns 0 without mapping if the UText
 * cannot be opened; otherwise returns the result of ucasemap_mapUTF8().
 */
U_CAPI int32_t U_EXPORT2
ucasemap_utf8ToTitle(UCaseMap *csm,
                    char *dest, int32_t destCapacity,
                    const char *src, int32_t srcLength,
                    UErrorCode *pErrorCode) {
    UText sourceText=UTEXT_INITIALIZER;
    utext_openUTF8(&sourceText, src, srcLength, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }

    if(NULL==csm->iter) {
        csm->iter=ubrk_open(UBRK_WORD, csm->locale,
                            NULL, 0,
                            pErrorCode);
    }
    ubrk_setUText(csm->iter, &sourceText, pErrorCode);

    const int32_t mappedLength=ucasemap_mapUTF8(csm,
                   (uint8_t *)dest, destCapacity,
                   (const uint8_t *)src, srcLength,
                   ucasemap_internalUTF8ToTitle, pErrorCode);

    utext_close(&sourceText);
    return mappedLength;
}
/*
 * Titlecases src[0..srcLength[ into dest for the given locale.
 *
 * A caller-supplied titleIter is pointed at the source text and used as is;
 * when it is NULL, a temporary word break iterator is opened for the call
 * and closed again before returning.
 */
U_CAPI int32_t U_EXPORT2
u_strToTitle(UChar *dest, int32_t destCapacity,
             const UChar *src, int32_t srcLength,
             UBreakIterator *titleIter,
             const char *locale,
             UErrorCode *pErrorCode) {
    UCaseMap csm=UCASEMAP_INITIALIZER;
    setTempCaseMap(&csm, locale);

    if(titleIter==NULL) {
        csm.iter=ubrk_open(UBRK_WORD, csm.locale,
                           src, srcLength,
                           pErrorCode);
    } else {
        csm.iter=titleIter;
        ubrk_setText(csm.iter, src, srcLength, pErrorCode);
    }

    const int32_t titleLength=ustrcase_map(
        &csm,
        dest, destCapacity,
        src, srcLength,
        ustrcase_internalToTitle, pErrorCode);

    /* Only an iterator opened by this function is closed here. */
    if(titleIter==NULL && csm.iter!=NULL) {
        ubrk_close(csm.iter);
    }
    return titleLength;
}
/*
 * Allocator for the BreakIterator Python type.
 *
 * Parses (break_iterator_type: int, locale: str) from `args`, opens the ICU
 * break iterator and stores it in a newly allocated object.
 *
 * Returns the new object, or NULL with a Python exception set if argument
 * parsing, opening the iterator, or allocating the object fails.
 */
static PyObject *
icu_BreakIterator_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
    icu_BreakIterator *self = NULL;
    const char *locale = NULL;
    int break_iterator_type = UBRK_WORD;
    UErrorCode status = U_ZERO_ERROR;
    UBreakIterator *break_iterator;

    if (!PyArg_ParseTuple(args, "is", &break_iterator_type, &locale)) return NULL;

    break_iterator = ubrk_open(break_iterator_type, locale, NULL, 0, &status);
    if (break_iterator == NULL || U_FAILURE(status)) {
        PyErr_SetString(PyExc_ValueError, u_errorName(status));
        return NULL;
    }

    self = (icu_BreakIterator *)type->tp_alloc(type, 0);
    if (self == NULL) {
        /* Allocation failed: release the iterator and bail out instead of
         * writing through a NULL pointer below. */
        ubrk_close(break_iterator);
        return NULL;
    }

    self->break_iterator = break_iterator;
    self->text = NULL;
    self->text_len = 0;
    self->type = break_iterator_type;

    return (PyObject *)self;
}
/*
** Prepare to begin tokenizing a particular string.  The input
** string to be tokenized is pInput[0..nBytes-1].  A cursor
** used to incrementally tokenize this string is returned in
** *ppCursor.
**
** The input is case-folded and converted to UTF-16 in IcuCursor.aChar[],
** with IcuCursor.aOffset[] recording byte offsets into zInput for the
** UTF-16 code units, and a word break iterator is opened over the result.
**
** A NULL zInput is treated as an empty string. A negative nInput means
** zInput is nul-terminated.
**
** Returns SQLITE_OK, SQLITE_NOMEM if the cursor cannot be allocated, or
** SQLITE_ERROR if the conversion or opening the iterator fails.
*/
static int icuOpen(
  sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
  const char *zInput,                    /* Input string */
  int nInput,                            /* Length of zInput in bytes */
  sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
){
  IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
  IcuCursor *pCsr;

  const int32_t opt = U_FOLD_CASE_DEFAULT;
  UErrorCode status = U_ZERO_ERROR;
  int nChar;

  UChar32 c;
  int iInput = 0;
  int iOut = 0;

  *ppCursor = 0;

  if( zInput==0 ){
    /* Guard against a NULL input: neither strlen() nor U8_NEXT() below
    ** may be handed a NULL pointer. */
    nInput = 0;
    zInput = "";
  }else if( nInput<0 ){
    nInput = (int)strlen(zInput);
  }
  nChar = nInput+1;
  pCsr = (IcuCursor *)sqlite3_malloc(
      sizeof(IcuCursor) +                /* IcuCursor */
      nChar * sizeof(UChar) +            /* IcuCursor.aChar[] */
      (nChar+1) * sizeof(int)            /* IcuCursor.aOffset[] */
  );
  if( !pCsr ){
    return SQLITE_NOMEM;
  }
  memset(pCsr, 0, sizeof(IcuCursor));
  /* The two arrays live in the same allocation, right after the cursor. */
  pCsr->aChar = (UChar *)&pCsr[1];
  pCsr->aOffset = (int *)&pCsr->aChar[nChar];

  pCsr->aOffset[iOut] = iInput;
  U8_NEXT(zInput, iInput, nInput, c);
  while( c>0 ){
    int isError = 0;
    c = u_foldCase(c, opt);
    U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);
    if( isError ){
      sqlite3_free(pCsr);
      return SQLITE_ERROR;
    }
    pCsr->aOffset[iOut] = iInput;

    if( iInput<nInput ){
      U8_NEXT(zInput, iInput, nInput, c);
    }else{
      c = 0;
    }
  }

  pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);
  if( !U_SUCCESS(status) ){
    sqlite3_free(pCsr);
    return SQLITE_ERROR;
  }
  pCsr->nChar = iOut;

  ubrk_first(pCsr->pIter);
  *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
  return SQLITE_OK;
}
// Reference search: looks for `pattern` in `target` (starting at `offset`)
// by comparing the OrderList built for each string, and accepts the first
// candidate whose start and end pass the expansion and grapheme-boundary
// checks below.
//
// On success returns TRUE with matchStart/matchEnd set to the match bounds;
// otherwise returns FALSE with both set to -1. An empty pattern never
// matches.
//
// NOTE(review): the status written by ubrk_open() is never inspected, so
// the iterator is used below without checking that it was opened - confirm
// this is acceptable for the test harness.
static UBool simpleSearch(UCollator *coll, const UnicodeString &target, int32_t offset, const UnicodeString &pattern, int32_t &matchStart, int32_t &matchEnd)
{
    UErrorCode status = U_ZERO_ERROR;
    OrderList targetOrders(coll, target, offset);
    OrderList patternOrders(coll, pattern);
    int32_t targetSize  = targetOrders.size() - 1;
    int32_t patternSize = patternOrders.size() - 1;
    UBreakIterator *charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status),
                                                  target.getBuffer(), target.length(), &status);

    if (patternSize == 0) {
        // Searching for an empty pattern always fails
        matchStart = matchEnd = -1;
        ubrk_close(charBreakIterator);
        return FALSE;
    }

    matchStart = matchEnd = -1;

    for(int32_t i = 0; i < targetSize; i += 1) {
        if (targetOrders.matchesAt(i, patternOrders)) {
            int32_t start    = targetOrders.getLowOffset(i);
            int32_t maxLimit = targetOrders.getLowOffset(i + patternSize);
            int32_t minLimit = targetOrders.getLowOffset(i + patternSize - 1);

            // if the low and high offsets of the first CE in
            // the match are the same, it means that the match
            // starts in the middle of an expansion - all but
            // the first CE of the expansion will have the offset
            // of the following character.
            if (start == targetOrders.getHighOffset(i)) {
                continue;
            }

            // Make sure match starts on a grapheme boundary
            if (! ubrk_isBoundary(charBreakIterator, start)) {
                continue;
            }

            // If the low and high offsets of the CE after the match
            // are the same, it means that the match ends in the middle
            // of an expansion sequence.
            if (maxLimit == targetOrders.getHighOffset(i + patternSize) &&
                targetOrders.getOrder(i + patternSize) != UCOL_NULLORDER) {
                continue;
            }

            int32_t mend = maxLimit;

            // Find the first grapheme break after the character index
            // of the last CE in the match. If it's after character index
            // that's after the last CE in the match, use that index
            // as the end of the match.
            if (minLimit < maxLimit) {
                // When the last CE's low index is same with its high index, the CE is likely
                // a part of expansion. In this case, the index is located just after the
                // character corresponding to the CEs compared above. If the index is right
                // at the break boundary, move the position to the next boundary will result
                // incorrect match length when there are ignorable characters exist between
                // the position and the next character produces CE(s). See ticket#8482.
                if (minLimit == targetOrders.getHighOffset(i + patternSize - 1) && ubrk_isBoundary(charBreakIterator, minLimit)) {
                    mend = minLimit;
                } else {
                    int32_t nba = ubrk_following(charBreakIterator, minLimit);

                    if (nba >= targetOrders.getHighOffset(i + patternSize - 1)) {
                        mend = nba;
                    }
                }
            }

            // A candidate end beyond maxLimit is rejected.
            if (mend > maxLimit) {
                continue;
            }

            // The match must also end on a grapheme boundary.
            if (! ubrk_isBoundary(charBreakIterator, mend)) {
                continue;
            }

            matchStart = start;
            matchEnd   = mend;

            ubrk_close(charBreakIterator);
            return TRUE;
        }
    }

    ubrk_close(charBreakIterator);
    return FALSE;
}
// Data-driven casing test: loads the "casing" test data module and, for
// every test kind (whichCase) and every case in its data set, reads the
// input, expected output, locale and options and passes them to
// TestCasingImpl(). Title-casing cases additionally get a break iterator
// chosen by the "Type" field. When break iteration is compiled out
// (UCONFIG_NO_BREAK_ITERATION), the title-casing kind is skipped.
void
StringCaseTest::TestCasing() {
    UErrorCode status = U_ZERO_ERROR;
#if !UCONFIG_NO_BREAK_ITERATION
    LocalUBreakIteratorPointer iter;
#endif
    char cLocaleID[100];
    UnicodeString locale, input, output, optionsString, result;
    uint32_t options;
    int32_t whichCase, type;
    LocalPointer<TestDataModule> driver(TestDataModule::getTestDataModule("casing", *this, status));
    if(U_SUCCESS(status)) {
        for(whichCase=0; whichCase<TEST_COUNT; ++whichCase) {
#if UCONFIG_NO_BREAK_ITERATION
            if(whichCase==TEST_TITLE) {
                continue;
            }
#endif
            LocalPointer<TestData> casingTest(driver->createTestData(dataNames[whichCase], status));
            if(U_FAILURE(status)) {
                errln("TestCasing failed to createTestData(%s) - %s", dataNames[whichCase], u_errorName(status));
                break;
            }
            const DataMap *myCase = NULL;
            while(casingTest->nextCase(myCase, status)) {
                input = myCase->getString("Input", status);
                output = myCase->getString("Output", status);

                // Case folding data carries no "Locale" field; for that kind
                // `locale` keeps whatever value it had before.
                if(whichCase!=TEST_FOLD) {
                    locale = myCase->getString("Locale", status);
                }
                locale.extract(0, 0x7fffffff, cLocaleID, sizeof(cLocaleID), "");

#if !UCONFIG_NO_BREAK_ITERATION
                if(whichCase==TEST_TITLE) {
                    // "Type" selects the break iterator: >=0 is used as a
                    // UBreakIteratorType, -2 requests the rule-based one
                    // below; for other values no iterator is adopted.
                    type = myCase->getInt("Type", status);
                    if(type>=0) {
                        iter.adoptInstead(ubrk_open((UBreakIteratorType)type, cLocaleID, NULL, 0, &status));
                    } else if(type==-2) {
                        // Open a trivial break iterator that only delivers { 0, length }
                        // or even just { 0 } as boundaries.
                        static const UChar rules[] = { 0x2e, 0x2a, 0x3b };  // ".*;"
                        UParseError parseError;
                        iter.adoptInstead(ubrk_openRules(rules, LENGTHOF(rules), NULL, 0, &parseError, &status));
                    }
                }
#endif
                // Translate the option letters of the test case into flags.
                options = 0;
                if(whichCase==TEST_TITLE || whichCase==TEST_FOLD) {
                    optionsString = myCase->getString("Options", status);
                    if(optionsString.indexOf((UChar)0x54)>=0) {  // T
                        options|=U_FOLD_CASE_EXCLUDE_SPECIAL_I;
                    }
                    if(optionsString.indexOf((UChar)0x4c)>=0) {  // L
                        options|=U_TITLECASE_NO_LOWERCASE;
                    }
                    if(optionsString.indexOf((UChar)0x41)>=0) {  // A
                        options|=U_TITLECASE_NO_BREAK_ADJUSTMENT;
                    }
                }

                if(U_FAILURE(status)) {
                    dataerrln("error: TestCasing() setup failed for %s test case from casing.res: %s", dataNames[whichCase], u_errorName(status));
                    // Reset so that the remaining cases can still run.
                    status = U_ZERO_ERROR;
                } else {
#if UCONFIG_NO_BREAK_ITERATION
                    // Stand-in so that iter.getAlias() below still compiles
                    // when break iteration is not available.
                    LocalPointer<UMemory> iter;
#endif
                    TestCasingImpl(input, output, whichCase, iter.getAlias(), cLocaleID, options);
                }

#if !UCONFIG_NO_BREAK_ITERATION
                // Drop the per-case iterator before the next case.
                iter.adoptInstead(NULL);
#endif
            }
        }
    }

#if !UCONFIG_NO_BREAK_ITERATION
    // more tests for API coverage
    status=U_ZERO_ERROR;
    input=UNICODE_STRING_SIMPLE("sTrA\\u00dfE").unescape();
    (result=input).toTitle(NULL);
    if(result!=UNICODE_STRING_SIMPLE("Stra\\u00dfe").unescape()) {
        dataerrln("UnicodeString::toTitle(NULL) failed.");
    }
#endif
}
/*
 * Internal titlecasing function.
 *
 * Titlecases src[0..srcLength[ into dest, segment by segment, where the
 * segments are delimited by the boundaries of csm->iter (a word break
 * iterator for csm->locale is opened and stored in csm->iter if there is
 * none yet).
 *
 * Returns the length of the result. Output that does not fit is not
 * written, but it is still counted, and U_BUFFER_OVERFLOW_ERROR is set
 * when the length exceeds destCapacity. Returns 0 if setting up the
 * iterator fails.
 */
static int32_t
_toTitle(UCaseMap *csm,
         UChar *dest, int32_t destCapacity,
         const UChar *src, UCaseContext *csc,
         int32_t srcLength,
         UErrorCode *pErrorCode) {
    const UChar *s;
    UChar32 c;
    int32_t prev, titleStart, titleLimit, idx, destIndex, length;
    UBool isFirstIndex;

    /* point the break iterator at the source text, creating it if needed */
    if(csm->iter!=NULL) {
        ubrk_setText(csm->iter, src, srcLength, pErrorCode);
    } else {
        csm->iter=ubrk_open(UBRK_WORD, csm->locale,
                            src, srcLength,
                            pErrorCode);
    }
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }

    /* set up local variables */
    destIndex=0;
    prev=0;
    isFirstIndex=TRUE;

    /* titlecasing loop */
    while(prev<srcLength) {
        /* find next index where to titlecase */
        if(isFirstIndex) {
            isFirstIndex=FALSE;
            idx=ubrk_first(csm->iter);
        } else {
            idx=ubrk_next(csm->iter);
        }
        /* no further boundary: treat the end of the text as the boundary */
        if(idx==UBRK_DONE || idx>srcLength) {
            idx=srcLength;
        }

        /*
         * Unicode 4 & 5 section 3.13 Default Case Operations:
         *
         * R3  toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
         * #29, "Text Boundaries." Between each pair of word boundaries, find the first
         * cased character F. If F exists, map F to default_title(F); then map each
         * subsequent character C to default_lower(C).
         *
         * In this implementation, segment [prev..index[ into 3 parts:
         * a) uncased characters (copy as-is) [prev..titleStart[
         * b) first case letter (titlecase)         [titleStart..titleLimit[
         * c) subsequent characters (lowercase)                 [titleLimit..index[
         */
        if(prev<idx) {
            /* find and copy uncased characters [prev..titleStart[ */
            titleStart=titleLimit=prev;
            U16_NEXT(src, titleLimit, idx, c);
            if((csm->options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) {
                /* Adjust the titlecasing index (titleStart) to the next cased character. */
                for(;;) {
                    titleStart=titleLimit;
                    if(titleLimit==idx) {
                        /*
                         * only uncased characters in [prev..index[
                         * stop with titleStart==titleLimit==index
                         */
                        break;
                    }
                    U16_NEXT(src, titleLimit, idx, c);
                    if(UCASE_NONE!=ucase_getType(csm->csp, c)) {
                        break; /* cased letter at [titleStart..titleLimit[ */
                    }
                }
                /* copy the skipped uncased prefix; it is counted even if it does not fit */
                length=titleStart-prev;
                if(length>0) {
                    if((destIndex+length)<=destCapacity) {
                        uprv_memcpy(dest+destIndex, src+prev, length*U_SIZEOF_UCHAR);
                    }
                    destIndex+=length;
                }
            }

            if(titleStart<titleLimit) {
                /* titlecase c which is from [titleStart..titleLimit[ */
                csc->cpStart=titleStart;
                csc->cpLimit=titleLimit;
                c=ucase_toFullTitle(csm->csp, c, utf16_caseContextIterator, csc, &s, csm->locale, &csm->locCache);
                destIndex=appendResult(dest, destIndex, destCapacity, c, s);

                /* Special case Dutch IJ titlecasing */
                /* (a following j/J is written as J and consumed as part of the title letter) */
                if ( titleStart+1 < idx &&
                     ucase_getCaseLocale(csm->locale,&csm->locCache) == UCASE_LOC_DUTCH &&
                     ( src[titleStart] == (UChar32) 0x0049 || src[titleStart] == (UChar32) 0x0069 ) &&
                     ( src[titleStart+1] == (UChar32) 0x004A || src[titleStart+1] == (UChar32) 0x006A )) {
                            c=(UChar32) 0x004A;
                            destIndex=appendResult(dest, destIndex, destCapacity, c, s);
                            titleLimit++;
                }

                /* lowercase [titleLimit..index[ */
                if(titleLimit<idx) {
                    if((csm->options&U_TITLECASE_NO_LOWERCASE)==0) {
                        /* Normal operation: Lowercase the rest of the word. */
                        destIndex+=
                            _caseMap(
                                csm, ucase_toFullLower,
                                dest+destIndex, destCapacity-destIndex,
                                src, csc,
                                titleLimit, idx,
                                pErrorCode);
                    } else {
                        /* Optionally just copy the rest of the word unchanged. */
                        length=idx-titleLimit;
                        if((destIndex+length)<=destCapacity) {
                            uprv_memcpy(dest+destIndex, src+titleLimit, length*U_SIZEOF_UCHAR);
                        }
                        destIndex+=length;
                    }
                }
            }
        }

        prev=idx;
    }

    /* destIndex counts everything that should have been written */
    if(destIndex>destCapacity) {
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    }
    return destIndex;
}