int c_main( void ) { UBreakIterator *boundary; char cStringToExamine[] = "Aaa bbb ccc. Ddd eee fff."; UChar stringToExamine[sizeof(cStringToExamine)+1]; UErrorCode status = U_ZERO_ERROR; printf("\n\n" "C Boundary Analysis\n" "-------------------\n\n"); printf("Examining: %s\n", cStringToExamine); u_uastrcpy(stringToExamine, cStringToExamine); /*print each sentence in forward and reverse order*/ boundary = ubrk_open(UBRK_SENTENCE, "en_us", stringToExamine, -1, &status); if (U_FAILURE(status)) { printf("ubrk_open error: %s\n", u_errorName(status)); exit(1); } printf("\n----- Sentence Boundaries, forward: -----------\n"); printEachForward(boundary, stringToExamine); printf("\n----- Sentence Boundaries, backward: ----------\n"); printEachBackward(boundary, stringToExamine); ubrk_close(boundary); /*print each word in order*/ boundary = ubrk_open(UBRK_WORD, "en_us", stringToExamine, u_strlen(stringToExamine), &status); printf("\n----- Word Boundaries, forward: -----------\n"); printEachForward(boundary, stringToExamine); printf("\n----- Word Boundaries, backward: ----------\n"); printEachBackward(boundary, stringToExamine); /*print first element*/ printf("\n----- first: -------------\n"); printFirst(boundary, stringToExamine); /*print last element*/ printf("\n----- last: --------------\n"); printLast(boundary, stringToExamine); /*print word at charpos 10 */ printf("\n----- at pos 10: ---------\n"); printAt(boundary, 10 , stringToExamine); ubrk_close(boundary); printf("\nEnd of C boundary analysis\n"); return 0; }
// Tears down a Target: closes the character break iterator and the
// collation element iterator, then frees the ceb array.
Target::~Target()
{
    ubrk_close(charBreakIterator);
    ucol_closeElements(elements);

    DELETE_ARRAY(ceb);
}
void Target::setTargetString(const UnicodeString *target) { if (charBreakIterator != NULL) { ubrk_close(charBreakIterator); ucol_closeElements(elements); } targetString = target; if (targetString != NULL) { UErrorCode status = U_ZERO_ERROR; targetBuffer = targetString->getBuffer(); targetLength = targetString->length(); elements = ucol_openElements(coll, target->getBuffer(), target->length(), &status); ucol_forceHanImplicit(elements, &status); charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status), targetBuffer, targetLength, &status); } else { targetBuffer = NULL; targetLength = 0; } }
/* {{{ grapheme_split_string: find and optionally return grapheme boundaries
 *
 * Walks `text` with the grapheme break iterator and counts every boundary
 * reported by ubrk_next(). When `boundary_array` is non-NULL, the first
 * `boundary_array_len` boundary offsets are also stored in it.
 *
 * Returns the number of boundaries found, or -1 if the break iterator
 * cannot be obtained or cannot be attached to the text.
 */
int grapheme_split_string(const UChar *text, int32_t text_length, int boundary_array[], int boundary_array_len TSRMLS_DC )
{
	unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
	UErrorCode status = U_ZERO_ERROR;
	int ret_len, pos;
	UBreakIterator* bi;

	bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status TSRMLS_CC );

	if( U_FAILURE(status) ) {
		return -1;
	}

	ubrk_setText(bi, text, text_length, &status);

	/* Do not iterate if the text could not be attached to the iterator. */
	if( U_FAILURE(status) ) {
		ubrk_close(bi);
		return -1;
	}

	pos = 0;

	for ( ret_len = 0; pos != UBRK_DONE; ) {

		pos = ubrk_next(bi);

		if ( pos != UBRK_DONE ) {

			/* Store only while there is room; keep counting regardless. */
			if ( NULL != boundary_array && ret_len < boundary_array_len ) {
				boundary_array[ret_len] = pos;
			}

			ret_len++;
		}
	}

	ubrk_close(bi);

	return ret_len;
}
/*
** Release a tokenization cursor that was created by icuOpen(): close its
** break iterator, free its text buffer and then free the cursor itself.
** Always returns SQLITE_OK.
*/
static int icuClose(sqlite3_tokenizer_cursor *pCursor){
  IcuCursor *pIcu = (IcuCursor *)pCursor;

  ubrk_close(pIcu->pIter);
  sqlite3_free(pIcu->zBuffer);
  sqlite3_free(pIcu);

  return SQLITE_OK;
}
static void TestBreakIteratorRuleError() { /* * TestBreakIteratorRuleError - Try to create a BI from rules with syntax errors, * check that the error is reported correctly. */ char rules[] = " # This is a rule comment on line 1\n" "[:L:]; # this rule is OK.\n" "abcdefg); # Error, mismatched parens\n"; UChar *uRules; void *freeHook = NULL; UErrorCode status = U_ZERO_ERROR; UParseError parseErr; UBreakIterator *bi; uRules = toUChar(rules, &freeHook); bi = ubrk_openRules(uRules, -1, /* The rules */ NULL, -1, /* The text to be iterated over. */ &parseErr, &status); if (U_SUCCESS(status)) { log_err("FAIL: construction of break iterator succeeded when it should have failed.\n"); ubrk_close(bi); } else { if (parseErr.line != 3 || parseErr.offset != 8) { log_data_err("FAIL: incorrect error position reported. Got line %d, char %d, expected line 3, char 7 (Are you missing data?)\n", parseErr.line, parseErr.offset); } } freeToUCharStrings(&freeHook); }
std::vector<lstring> convert_split_words(const lstring <) { std::vector<lstring> ret; UBreakIterator* bi; int prev = -1, pos; UErrorCode err = U_ZERO_ERROR; bi = ubrk_open(UBRK_WORD, get_locale(), (UChar *)lt.data(), lt.size(), &err); if (U_FAILURE(err)) return ret; pos = ubrk_first(bi); while (pos != UBRK_DONE) { int rules = ubrk_getRuleStatus(bi); if ((rules == UBRK_WORD_NONE) || (prev == -1)) { prev = pos; } else { ret.emplace_back(lt.substr(prev, pos - prev)); prev = -1; } pos = ubrk_next(bi); } ubrk_close(bi); return ret; }
/*
 * Titlecases src into dest using a temporary, stack-based UCaseMap that is
 * configured from csp, titleIter, options and locale.
 *
 * If the caller passed no break iterator (titleIter==NULL) and one is
 * present in the temporary case map after the mapping, it is closed here.
 * Returns the length reported by _toTitle().
 */
U_CFUNC int32_t
ustr_toTitle(const UCaseProps *csp,
             UChar *dest, int32_t destCapacity,
             const UChar *src, int32_t srcLength,
             UBreakIterator *titleIter,
             const char *locale, uint32_t options,
             UErrorCode *pErrorCode) {
    UCaseMap tempMap={ NULL };
    UCaseContext context={ NULL };
    int32_t resultLength;

    /* populate the temporary case map */
    tempMap.csp=csp;
    tempMap.iter=titleIter;
    tempMap.options=options;
    setTempCaseMap(&tempMap, locale, pErrorCode);

    /* describe the source text for context lookups */
    context.p=(void *)src;
    context.limit=srcLength;

    resultLength=_toTitle(&tempMap, dest, destCapacity, src, &context, srcLength, pErrorCode);

    if(titleIter==NULL) {
        if(tempMap.iter!=NULL) {
            ubrk_close(tempMap.iter);
        }
    }
    return resultLength;
}
// Closes the given line break iterator and resets the caller's pointer to
// nullptr. The iterator must be non-null (checked with ASSERT).
void closeLineBreakIterator(UBreakIterator*& iterator)
{
    ASSERT(iterator);
    ubrk_close(iterator);
    iterator = nullptr;
}
/*
 * Deallocator for icu_BreakIterator objects: closes the wrapped break
 * iterator (if any), frees the text buffer (if any), clears both fields
 * and then releases the object through its type's tp_free slot.
 */
static void
icu_BreakIterator_dealloc(icu_BreakIterator* self)
{
    if (self->break_iterator != NULL) {
        ubrk_close(self->break_iterator);
        self->break_iterator = NULL;
    }

    if (self->text != NULL) {
        free(self->text);
        self->text = NULL;
    }

    self->ob_type->tp_free((PyObject*)self);
}
/* {{{ grapheme_close_global_iterator - clean up
 *
 * Closes the break iterator stored in INTL_G(grapheme_iterator), if one
 * is set. Does nothing when it is NULL.
 */
void grapheme_close_global_iterator( TSRMLS_D )
{
	UBreakIterator *iter = INTL_G( grapheme_iterator );

	if ( iter == NULL ) {
		return;
	}

	ubrk_close(iter);
}
/*
 * Closes a UCaseMap: closes its break iterator (when break iteration is
 * compiled in) and frees the structure. A NULL csm is ignored.
 */
U_CAPI void U_EXPORT2
ucasemap_close(UCaseMap *csm) {
    if(csm==NULL) {
        return;
    }
#if !UCONFIG_NO_BREAK_ITERATION
    ubrk_close(csm->iter);
#endif
    uprv_free(csm);
}
/*
 * Frees a bin_pattern_t: destroys the optional tmp string, closes the
 * optional break iterator, destroys the pattern string and finally frees
 * the structure itself.
 */
static void bin_pattern_destroy(bin_pattern_t *p)
{
    if (p->tmp != NULL) {
        ustring_destroy(p->tmp);
    }

    if (p->ubrk != NULL) {
        ubrk_close(p->ubrk);
    }

    ustring_destroy(p->pattern);
    free(p);
}
/*
 * Frees a fixed_pattern_t: closes the optional string search object and
 * the optional break iterator, destroys the pattern string and finally
 * frees the structure itself.
 */
static void fixed_pattern_destroy(fixed_pattern_t *p)
{
    if (p->usearch != NULL) {
        usearch_close(p->usearch);
    }

    if (p->ubrk != NULL) {
        ubrk_close(p->ubrk);
    }

    ustring_destroy(p->pattern);
    free(p);
}
static void TestBreakIteratorRefresh(void) { /* * RefreshInput changes out the input of a Break Iterator without * changing anything else in the iterator's state. Used with Java JNI, * when Java moves the underlying string storage. This test * runs a ubrk_next() repeatedly, moving the text in the middle of the sequence. * The right set of boundaries should still be found. */ UChar testStr[] = {0x20, 0x41, 0x20, 0x42, 0x20, 0x43, 0x20, 0x44, 0x0}; /* = " A B C D" */ UChar movedStr[] = {0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0}; UErrorCode status = U_ZERO_ERROR; UBreakIterator *bi; UText ut1 = UTEXT_INITIALIZER; UText ut2 = UTEXT_INITIALIZER; bi = ubrk_open(UBRK_LINE, "en_US", NULL, 0, &status); TEST_ASSERT_SUCCESS(status); if (U_FAILURE(status)) { return; } utext_openUChars(&ut1, testStr, -1, &status); TEST_ASSERT_SUCCESS(status); ubrk_setUText(bi, &ut1, &status); TEST_ASSERT_SUCCESS(status); if (U_SUCCESS(status)) { /* Line boundaries will occur before each letter in the original string */ TEST_ASSERT(1 == ubrk_next(bi)); TEST_ASSERT(3 == ubrk_next(bi)); /* Move the string, kill the original string. */ u_strcpy(movedStr, testStr); u_memset(testStr, 0x20, u_strlen(testStr)); utext_openUChars(&ut2, movedStr, -1, &status); TEST_ASSERT_SUCCESS(status); ubrk_refreshUText(bi, &ut2, &status); TEST_ASSERT_SUCCESS(status); /* Find the following matches, now working in the moved string. */ TEST_ASSERT(5 == ubrk_next(bi)); TEST_ASSERT(7 == ubrk_next(bi)); TEST_ASSERT(8 == ubrk_next(bi)); TEST_ASSERT(UBRK_DONE == ubrk_next(bi)); TEST_ASSERT_SUCCESS(status); utext_close(&ut1); utext_close(&ut2); } ubrk_close(bi); }
static void TestBug11665(void) { // The problem was with the incorrect breaking of Japanese text beginning // with Katakana characters when no prior Japanese or Chinese text had been // encountered. // // Tested here in cintltst, rather than in intltest, because only cintltst // tests have the ability to reset ICU, which is needed to get the bug // to manifest itself. static UChar japaneseText[] = {0x30A2, 0x30EC, 0x30EB, 0x30AE, 0x30FC, 0x6027, 0x7D50, 0x819C, 0x708E}; int32_t boundaries[10] = {0}; UBreakIterator *bi = NULL; int32_t brk; int32_t brkIdx = 0; int32_t totalBreaks = 0; UErrorCode status = U_ZERO_ERROR; ctest_resetICU(); bi = ubrk_open(UBRK_WORD, "en_US", japaneseText, UPRV_LENGTHOF(japaneseText), &status); TEST_ASSERT_SUCCESS(status); if (!bi) { return; } for (brk=ubrk_first(bi); brk != UBRK_DONE; brk=ubrk_next(bi)) { boundaries[brkIdx] = brk; if (++brkIdx >= UPRV_LENGTHOF(boundaries) - 1) { break; } } if (brkIdx <= 2 || brkIdx >= UPRV_LENGTHOF(boundaries)) { log_err("%s:%d too few or many breaks found.\n", __FILE__, __LINE__); } else { totalBreaks = brkIdx; brkIdx = 0; for (brk=ubrk_first(bi); brk != UBRK_DONE; brk=ubrk_next(bi)) { if (brk != boundaries[brkIdx]) { log_err("%s:%d Break #%d differs between first and second iteration.\n", __FILE__, __LINE__, brkIdx); break; } if (++brkIdx >= UPRV_LENGTHOF(boundaries) - 1) { log_err("%s:%d Too many breaks.\n", __FILE__, __LINE__); break; } } if (totalBreaks != brkIdx) { log_err("%s:%d Number of breaks differ between first and second iteration.\n", __FILE__, __LINE__); } } ubrk_close(bi); }
/* * TestBreakIteratorRules - Verify that a break iterator can be created from * a set of source rules. */ static void TestBreakIteratorRules() { /* Rules will keep together any run of letters not including 'a', OR * keep together 'abc', but only when followed by 'def', OTHERWISE * just return one char at a time. */ char rules[] = "abc{666}/def;\n [\\p{L} - [a]]* {2}; . {1};"; /* 0123456789012345678 */ char data[] = "abcdex abcdefgh-def"; /* the test data string */ char breaks[] = "** ** * ** *"; /* * the expected break positions */ char tags[] = "01 21 6 21 2"; /* expected tag values at break positions */ int32_t tagMap[] = {0, 1, 2, 3, 4, 5, 666}; UChar *uData; void *freeHook = NULL; UErrorCode status = U_ZERO_ERROR; int32_t pos; int i; UBreakIterator *bi = testOpenRules(rules); if (bi == NULL) {return;} uData = toUChar(data, &freeHook); ubrk_setText(bi, uData, -1, &status); pos = ubrk_first(bi); for (i=0; i<sizeof(breaks); i++) { if (pos == i && breaks[i] != '*') { log_err("FAIL: unexpected break at position %d found\n", pos); break; } if (pos != i && breaks[i] == '*') { log_err("FAIL: expected break at position %d not found.\n", i); break; } if (pos == i) { int32_t tag, expectedTag; tag = ubrk_getRuleStatus(bi); expectedTag = tagMap[tags[i]&0xf]; if (tag != expectedTag) { log_err("FAIL: incorrect tag value. Position = %d; expected tag %d, got %d", pos, expectedTag, tag); break; } pos = ubrk_next(bi); } } freeToUCharStrings(&freeHook); ubrk_close(bi); }
static void *engine_fixed_compile(error_t **error, UString *ustr, uint32_t flags) { UErrorCode status; fixed_pattern_t *p; p = mem_new(*p); p->pattern = ustr; // not needed with usearch ? p->flags = flags; p->ubrk = NULL; p->usearch = NULL; status = U_ZERO_ERROR; if (ustring_empty(ustr)) { if (IS_WORD_BOUNDED(flags)) { p->ubrk = ubrk_open(UBRK_WORD, NULL, NULL, 0, &status); } } else { if (!IS_WHOLE_LINE(flags)) { if (IS_WORD_BOUNDED(flags)) { p->ubrk = ubrk_open(UBRK_WORD, NULL, NULL, 0, &status); } else if (WITH_GRAPHEME()) { p->ubrk = ubrk_open(UBRK_CHARACTER, NULL, NULL, 0, &status); } if (U_FAILURE(status)) { fixed_pattern_destroy(p); icu_error_set(error, FATAL, status, "ubrk_open"); return NULL; } } if (IS_WORD_BOUNDED(flags) || (IS_CASE_INSENSITIVE(flags) && !IS_WHOLE_LINE(flags))) { p->usearch = usearch_open(ustr->ptr, ustr->len, USEARCH_FAKE_USTR, uloc_getDefault(), p->ubrk, &status); if (U_FAILURE(status)) { if (NULL != p->ubrk) { ubrk_close(p->ubrk); } fixed_pattern_destroy(p); icu_error_set(error, FATAL, status, "usearch_open"); return NULL; } if (IS_CASE_INSENSITIVE(flags)) { UCollator *ucol; ucol = usearch_getCollator(p->usearch); ucol_setStrength(ucol, (flags & ~OPT_MASK) > 1 ? UCOL_SECONDARY : UCOL_PRIMARY); } } } return p; }
/*
 * Runs every entry of tailoringTests (the table ends at the entry whose
 * locale is NULL). For each entry a break iterator of the entry's type is
 * opened for the entry's locale over the entry's test text, and the offsets
 * returned by ubrk_next() and then by ubrk_previous() are compared against
 * the entry's offsFwd[] and offsRev[] arrays, each holding numOffsets
 * expected values.
 */
static void TestBreakIteratorTailoring(void) {
    const RBBITailoringTest * testPtr;
    for (testPtr = tailoringTests; testPtr->locale != NULL; ++testPtr) {
        UErrorCode status = U_ZERO_ERROR;
        UBreakIterator* ubrkiter = ubrk_open(testPtr->type, testPtr->locale, testPtr->test, -1, &status);
        if ( U_SUCCESS(status) ) {
            int32_t offset, offsindx;
            UBool foundError;

            /* Forward pass. Iteration always runs to UBRK_DONE; foundError
             * only suppresses further messages after the first mismatch. */
            foundError = FALSE;
            for (offsindx = 0; (offset = ubrk_next(ubrkiter)) != UBRK_DONE; ++offsindx) {
                if (!foundError && offsindx >= testPtr->numOffsets) {
                    /* more boundaries than expected */
                    log_err("FAIL: locale %s, break type %d, ubrk_next expected UBRK_DONE, got %d\n",
                            testPtr->locale, testPtr->type, offset);
                    foundError = TRUE;
                } else if (!foundError && offset != testPtr->offsFwd[offsindx]) {
                    log_err("FAIL: locale %s, break type %d, ubrk_next expected %d, got %d\n",
                            testPtr->locale, testPtr->type, testPtr->offsFwd[offsindx], offset);
                    foundError = TRUE;
                }
            }
            /* fewer boundaries than expected */
            if (!foundError && offsindx < testPtr->numOffsets) {
                log_err("FAIL: locale %s, break type %d, ubrk_next expected %d, got UBRK_DONE\n",
                        testPtr->locale, testPtr->type, testPtr->offsFwd[offsindx]);
            }

            /* Backward pass, continuing from where the forward pass stopped. */
            foundError = FALSE;
            for (offsindx = 0; (offset = ubrk_previous(ubrkiter)) != UBRK_DONE; ++offsindx) {
                if (!foundError && offsindx >= testPtr->numOffsets) {
                    log_err("FAIL: locale %s, break type %d, ubrk_previous expected UBRK_DONE, got %d\n",
                            testPtr->locale, testPtr->type, offset);
                    foundError = TRUE;
                } else if (!foundError && offset != testPtr->offsRev[offsindx]) {
                    log_err("FAIL: locale %s, break type %d, ubrk_previous expected %d, got %d\n",
                            testPtr->locale, testPtr->type, testPtr->offsRev[offsindx], offset);
                    foundError = TRUE;
                }
            }
            if (!foundError && offsindx < testPtr->numOffsets) {
                log_err("FAIL: locale %s, break type %d, ubrk_previous expected %d, got UBRK_DONE\n",
                        testPtr->locale, testPtr->type, testPtr->offsRev[offsindx]);
            }

            ubrk_close(ubrkiter);
        } else {
            log_err_status(status, "FAIL: locale %s, break type %d, ubrk_open status: %s\n", testPtr->locale, testPtr->type, u_errorName(status));
        }
    }
}
/*
 * TestBreakIteratorStatusVec()   Test the ubrk_getRuleStatusVec() function.
 *
 * Builds a rule-based iterator over "ABC" whose rules attach the status
 * values 100..500 to overlapping character classes, advances to the first
 * boundary and checks the status vector reported there, including the
 * buffer-overflow behavior when a zero-capacity vector is supplied.
 */
static void TestBreakIteratorStatusVec() {
    #define RULE_STRING_LENGTH 200
    UChar          rules[RULE_STRING_LENGTH];

    #define TEST_STRING_LENGTH 25
    UChar           testString[TEST_STRING_LENGTH];
    UBreakIterator *bi        = NULL;
    int32_t         pos       = 0;
    int32_t         vals[10];
    int32_t         numVals;
    UErrorCode      status    = U_ZERO_ERROR;

    u_uastrncpy(rules,  "[A-N]{100}; \n"
                             "[a-w]{200}; \n"
                             "[\\p{L}]{300}; \n"
                             "[\\p{N}]{400}; \n"
                             "[0-5]{500}; \n"
                              "!.*;\n", RULE_STRING_LENGTH);
    u_uastrncpy(testString, "ABC", TEST_STRING_LENGTH);


    bi = ubrk_openRules(rules, -1, testString, -1, NULL, &status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT(bi != NULL);

    /* The TEST_ASSERT above should change too... */
    if (bi != NULL) {
        pos = ubrk_next(bi);
        TEST_ASSERT(pos == 1);

        /* Fill with -1 (all bits set) so untouched slots can be detected. */
        memset(vals, -1, sizeof(vals));
        numVals = ubrk_getRuleStatusVec(bi, vals, 10, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT(numVals == 2);
        TEST_ASSERT(vals[0] == 100);
        TEST_ASSERT(vals[1] == 300);
        TEST_ASSERT(vals[2] == -1);   /* slot past the returned values is untouched */

        /* With zero capacity the count is still returned, along with an overflow error. */
        numVals = ubrk_getRuleStatusVec(bi, vals, 0, &status);
        TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR);
        TEST_ASSERT(numVals == 2);
    }

    ubrk_close(bi);
}
/*
 * Titlecases src into dest through a temporary, stack-based UCaseMap set up
 * for the given locale.
 *
 * titleIter may be NULL; in that case any iterator present in the temporary
 * case map after the mapping is closed here. A caller-supplied iterator is
 * never closed. Returns the length reported by caseMap().
 */
U_CAPI int32_t U_EXPORT2
u_strToTitle(UChar *dest, int32_t destCapacity,
             const UChar *src, int32_t srcLength,
             UBreakIterator *titleIter,
             const char *locale,
             UErrorCode *pErrorCode) {
    UCaseMap tempMap={ NULL };
    int32_t resultLength;

    tempMap.iter=titleIter;
    setTempCaseMap(&tempMap, locale, pErrorCode);

    resultLength=caseMap(&tempMap,
                         dest, destCapacity,
                         src, srcLength,
                         TO_TITLE, pErrorCode);

    if(titleIter==NULL) {
        if(tempMap.iter!=NULL) {
            ubrk_close(tempMap.iter);
        }
    }
    return resultLength;
}
/* * static void TestBreakIteratorUText(void); * * Test that ubrk_setUText() is present and works for a simple case. */ static void TestBreakIteratorUText(void) { const char *UTF8Str = "\x41\xc3\x85\x5A\x20\x41\x52\x69\x6E\x67"; /* c3 85 is utf-8 for A with a ring on top */ /* 0 1 2 34567890 */ UErrorCode status = U_ZERO_ERROR; UBreakIterator *bi = NULL; int32_t pos = 0; UText *ut = utext_openUTF8(NULL, UTF8Str, -1, &status); TEST_ASSERT_SUCCESS(status); bi = ubrk_open(UBRK_WORD, "en_US", NULL, 0, &status); if (U_FAILURE(status)) { log_err_status(status, "Failure at file %s, line %d, error = %s\n", __FILE__, __LINE__, u_errorName(status)); return; } ubrk_setUText(bi, ut, &status); if (U_FAILURE(status)) { log_err("Failure at file %s, line %d, error = %s\n", __FILE__, __LINE__, u_errorName(status)); return; } pos = ubrk_first(bi); TEST_ASSERT(pos == 0); pos = ubrk_next(bi); TEST_ASSERT(pos == 4); pos = ubrk_next(bi); TEST_ASSERT(pos == 5); pos = ubrk_next(bi); TEST_ASSERT(pos == 10); pos = ubrk_next(bi); TEST_ASSERT(pos == UBRK_DONE); ubrk_close(bi); utext_close(ut); }
int KWQFindNextWordFromIndex(const QChar *chars, int len, int position, bool forward) { int pos = 0; UErrorCode status = U_ZERO_ERROR; UBreakIterator *boundary = ubrk_open(UBRK_WORD, (const char*)currentTextBreakLocaleID().c_str(), const_cast<UChar *>(reinterpret_cast<const UChar *>(chars)), len, &status); if ( boundary && U_SUCCESS(status) ) { if (forward) { pos = ubrk_following(boundary, position); if (pos == UBRK_DONE) pos = len; } else { pos = ubrk_preceding(boundary, position); if (pos == UBRK_DONE) pos = 0; } ubrk_close(boundary); } return pos; }
void KWQFindSentenceBoundary(const QChar *chars, int len, int position, int *start, int *end) { int startPos = 0; int endPos = 0; UErrorCode status = U_ZERO_ERROR; UBreakIterator *boundary = ubrk_open(UBRK_SENTENCE, (const char*)currentTextBreakLocaleID().c_str(), const_cast<UChar *>(reinterpret_cast<const UChar *>(chars)), len, &status); if ( boundary && U_SUCCESS(status) ) { startPos = ubrk_preceding(boundary, position); if (startPos == UBRK_DONE) { startPos = 0; } endPos = ubrk_following(boundary, startPos); if (endPos == UBRK_DONE) endPos = len; ubrk_close(boundary); } *start = startPos; *end = endPos; }
/*
 * Titlecases src into dest using a temporary UCaseMap for the given locale.
 *
 * A caller-supplied titleIter is pointed at src and used as-is. When
 * titleIter is NULL a word break iterator is opened over src for the case
 * map's locale and closed again before returning. Returns the length
 * reported by ustrcase_map().
 */
U_CAPI int32_t U_EXPORT2
u_strToTitle(UChar *dest, int32_t destCapacity,
             const UChar *src, int32_t srcLength,
             UBreakIterator *titleIter,
             const char *locale,
             UErrorCode *pErrorCode) {
    UCaseMap csm=UCASEMAP_INITIALIZER;
    setTempCaseMap(&csm, locale);

    if(titleIter==NULL) {
        csm.iter=ubrk_open(UBRK_WORD, csm.locale, src, srcLength, pErrorCode);
    } else {
        csm.iter=titleIter;
        ubrk_setText(csm.iter, src, srcLength, pErrorCode);
    }

    int32_t length=ustrcase_map(
        &csm,
        dest, destCapacity,
        src, srcLength,
        ustrcase_internalToTitle, pErrorCode);

    /* Only an iterator opened here is closed here. */
    if(titleIter==NULL) {
        if(csm.iter!=NULL) {
            ubrk_close(csm.iter);
        }
    }
    return length;
}
void StringCaseTest::TestCasing() { UErrorCode status = U_ZERO_ERROR; void *iter; char cLocaleID[100]; UnicodeString locale, input, output, optionsString, result; uint32_t options; int32_t whichCase, type; TestDataModule *driver = TestDataModule::getTestDataModule("casing", *this, status); if(U_SUCCESS(status)) { for(whichCase=0; whichCase<TEST_COUNT; ++whichCase) { #if UCONFIG_NO_BREAK_ITERATION if(whichCase==TEST_TITLE) { continue; } #endif TestData *casingTest = driver->createTestData(dataNames[whichCase], status); if(U_FAILURE(status)) { errln("TestCasing failed to createTestData(%s) - %s", dataNames[whichCase], u_errorName(status)); break; } const DataMap *myCase = NULL; while(casingTest->nextCase(myCase, status)) { input = myCase->getString("Input", status); output = myCase->getString("Output", status); if(whichCase!=TEST_FOLD) { locale = myCase->getString("Locale", status); } locale.extract(0, 0x7fffffff, cLocaleID, sizeof(cLocaleID), ""); iter=NULL; #if !UCONFIG_NO_BREAK_ITERATION if(whichCase==TEST_TITLE) { type = myCase->getInt("Type", status); if(type>=0) { iter=ubrk_open((UBreakIteratorType)type, cLocaleID, NULL, 0, &status); } else if(type==-2) { // Open a trivial break iterator that only delivers { 0, length } // or even just { 0 } as boundaries. 
static const UChar rules[] = { 0x2e, 0x2a, 0x3b }; // ".*;" UParseError parseError; iter=ubrk_openRules(rules, LENGTHOF(rules), NULL, 0, &parseError, &status); } } #endif options = 0; if(whichCase==TEST_TITLE || whichCase==TEST_FOLD) { optionsString = myCase->getString("Options", status); if(optionsString.indexOf((UChar)0x54)>=0) { // T options|=U_FOLD_CASE_EXCLUDE_SPECIAL_I; } if(optionsString.indexOf((UChar)0x4c)>=0) { // L options|=U_TITLECASE_NO_LOWERCASE; } if(optionsString.indexOf((UChar)0x41)>=0) { // A options|=U_TITLECASE_NO_BREAK_ADJUSTMENT; } } if(U_FAILURE(status)) { errln("error: TestCasing() setup failed for %s test case from casing.res: %s", dataNames[whichCase], u_errorName(status)); status = U_ZERO_ERROR; } else { TestCasingImpl(input, output, whichCase, iter, cLocaleID, options); } #if !UCONFIG_NO_BREAK_ITERATION if(iter!=NULL) { ubrk_close(iter); } #endif } delete casingTest; } } delete driver; #if !UCONFIG_NO_BREAK_ITERATION // more tests for API coverage status=U_ZERO_ERROR; input=UNICODE_STRING_SIMPLE("sTrA\\u00dfE").unescape(); (result=input).toTitle(NULL); if(result!=UNICODE_STRING_SIMPLE("Stra\\u00dfe").unescape()) { errln("UnicodeString::toTitle(NULL) failed"); } #endif }
// Brute-force collation search used as a reference implementation.
//
// Builds the collation-element lists of `target` (from `offset`) and of
// `pattern`, then tries every CE position of the target. A candidate is
// accepted only if it starts and ends on grapheme boundaries and does not
// start or end inside an expansion.
//
// On success returns TRUE and stores the match limits in matchStart and
// matchEnd. Otherwise returns FALSE with both set to -1; this includes an
// empty pattern and a character break iterator that cannot be opened.
static UBool simpleSearch(UCollator *coll, const UnicodeString &target, int32_t offset, const UnicodeString &pattern, int32_t &matchStart, int32_t &matchEnd)
{
    UErrorCode status = U_ZERO_ERROR;
    OrderList targetOrders(coll, target, offset);
    OrderList patternOrders(coll, pattern);
    int32_t targetSize  = targetOrders.size() - 1;
    int32_t patternSize = patternOrders.size() - 1;

    matchStart = matchEnd = -1;

    if (patternSize == 0) {
        // Searching for an empty pattern always fails; bail out before
        // opening an iterator that would only be closed again.
        return FALSE;
    }

    UBreakIterator *charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status),
                                                  target.getBuffer(), target.length(), &status);

    if (U_FAILURE(status)) {
        // The boundary checks below cannot run without a usable iterator;
        // report "no match" rather than passing a failed handle to ICU.
        if (charBreakIterator != NULL) {
            ubrk_close(charBreakIterator);
        }
        return FALSE;
    }

    for(int32_t i = 0; i < targetSize; i += 1) {
        if (targetOrders.matchesAt(i, patternOrders)) {
            int32_t start    = targetOrders.getLowOffset(i);
            int32_t maxLimit = targetOrders.getLowOffset(i + patternSize);
            int32_t minLimit = targetOrders.getLowOffset(i + patternSize - 1);

            // if the low and high offsets of the first CE in
            // the match are the same, it means that the match
            // starts in the middle of an expansion - all but
            // the first CE of the expansion will have the offset
            // of the following character.
            if (start == targetOrders.getHighOffset(i)) {
                continue;
            }

            // Make sure match starts on a grapheme boundary
            if (! ubrk_isBoundary(charBreakIterator, start)) {
                continue;
            }

            // If the low and high offsets of the CE after the match
            // are the same, it means that the match ends in the middle
            // of an expansion sequence.
            if (maxLimit == targetOrders.getHighOffset(i + patternSize) &&
                targetOrders.getOrder(i + patternSize) != UCOL_NULLORDER) {
                continue;
            }

            int32_t mend = maxLimit;

            // Find the first grapheme break after the character index
            // of the last CE in the match. If it's after character index
            // that's after the last CE in the match, use that index
            // as the end of the match.
            if (minLimit < maxLimit) {
                // When the last CE's low index is same with its high index, the CE is likely
                // a part of expansion. In this case, the index is located just after the
                // character corresponding to the CEs compared above. If the index is right
                // at the break boundary, move the position to the next boundary will result
                // incorrect match length when there are ignorable characters exist between
                // the position and the next character produces CE(s). See ticket#8482.
                if (minLimit == targetOrders.getHighOffset(i + patternSize - 1) && ubrk_isBoundary(charBreakIterator, minLimit)) {
                    mend = minLimit;
                } else {
                    int32_t nba = ubrk_following(charBreakIterator, minLimit);

                    if (nba >= targetOrders.getHighOffset(i + patternSize - 1)) {
                        mend = nba;
                    }
                }
            }

            if (mend > maxLimit) {
                continue;
            }

            if (! ubrk_isBoundary(charBreakIterator, mend)) {
                continue;
            }

            matchStart = start;
            matchEnd   = mend;

            ubrk_close(charBreakIterator);
            return TRUE;
        }
    }

    ubrk_close(charBreakIterator);
    return FALSE;
}
static void TestCaseTitle(void) { static const UChar beforeTitle[]= { 0x61, 0x42, 0x20, 0x69, 0x3c2, 0x20, 0xdf, 0x3c3, 0x2f, 0xfb03, 0xd93f, 0xdfff }, titleWord[]= { 0x41, 0x62, 0x20, 0x49, 0x3c2, 0x20, 0x53, 0x73, 0x3c3, 0x2f, 0x46, 0x66, 0x69, 0xd93f, 0xdfff }, titleChar[]= { 0x41, 0x42, 0x20, 0x49, 0x3a3, 0x20, 0x53, 0x73, 0x3a3, 0x2f, 0x46, 0x66, 0x69, 0xd93f, 0xdfff }; UChar buffer[32]; UBreakIterator *titleIterChars; int32_t length; UErrorCode errorCode; errorCode=U_ZERO_ERROR; titleIterChars=ubrk_open(UBRK_CHARACTER, "", beforeTitle, sizeof(beforeTitle)/U_SIZEOF_UCHAR, &errorCode); if(U_FAILURE(errorCode)) { log_err_status(errorCode, "error: ubrk_open(UBRK_CHARACTER)->%s\n", u_errorName(errorCode)); return; } /* titlecase with standard break iterator and in the same buffer */ uprv_memcpy(buffer, beforeTitle, sizeof(beforeTitle)); errorCode=U_ZERO_ERROR; length=u_strToTitle(buffer, sizeof(buffer)/U_SIZEOF_UCHAR, buffer, sizeof(beforeTitle)/U_SIZEOF_UCHAR, NULL, "", &errorCode); if( U_FAILURE(errorCode) || length!=(sizeof(titleWord)/U_SIZEOF_UCHAR) || uprv_memcmp(titleWord, buffer, length*U_SIZEOF_UCHAR)!=0 || buffer[length]!=0 ) { log_err("error in u_strToTitle(standard iterator)=%ld error=%s string matches: %s\n", length, u_errorName(errorCode), uprv_memcmp(titleWord, buffer, length*U_SIZEOF_UCHAR)==0 && buffer[length]==0 ? "yes" : "no"); } /* titlecase with UBRK_CHARACTERS and separate buffers */ buffer[0]=0xabcd; errorCode=U_ZERO_ERROR; length=u_strToTitle(buffer, sizeof(buffer)/U_SIZEOF_UCHAR, beforeTitle, sizeof(beforeTitle)/U_SIZEOF_UCHAR, titleIterChars, "", &errorCode); if( U_FAILURE(errorCode) || length!=(sizeof(titleChar)/U_SIZEOF_UCHAR) || uprv_memcmp(titleChar, buffer, length*U_SIZEOF_UCHAR)!=0 || buffer[length]!=0 ) { log_err("error in u_strToTitle(UBRK_CHARACTERS)=%ld error=%s string matches: %s\n", length, u_errorName(errorCode), uprv_memcmp(titleChar, buffer, length*U_SIZEOF_UCHAR)==0 && buffer[length]==0 ? 
"yes" : "no"); } /* test preflighting */ errorCode=U_ZERO_ERROR; length=u_strToTitle(NULL, 0, beforeTitle, sizeof(beforeTitle)/U_SIZEOF_UCHAR, titleIterChars, "", &errorCode); if( errorCode!=U_BUFFER_OVERFLOW_ERROR || length!=(sizeof(titleChar)/U_SIZEOF_UCHAR) ) { log_err("error in u_strToTitle(UBRK_CHARACTERS pure preflighting)=%ld error=%s\n", length, u_errorName(errorCode)); } /* test error handling */ buffer[0]=0xabcd; errorCode=U_ZERO_ERROR; length=u_strToTitle(buffer, sizeof(buffer)/U_SIZEOF_UCHAR, NULL, sizeof(beforeTitle)/U_SIZEOF_UCHAR, titleIterChars, "", &errorCode); if( errorCode!=U_ILLEGAL_ARGUMENT_ERROR || buffer[0]!=0xabcd ) { log_err("error in u_strToTitle(UBRK_CHARACTERS src=NULL)=%ld error=%s buffer[0]==0x%lx\n", length, u_errorName(errorCode), buffer[0]); } buffer[0]=0xabcd; errorCode=U_ZERO_ERROR; length=u_strToTitle(buffer, sizeof(buffer)/U_SIZEOF_UCHAR, beforeTitle, -2, titleIterChars, "", &errorCode); if( errorCode!=U_ILLEGAL_ARGUMENT_ERROR || buffer[0]!=0xabcd ) { log_err("error in u_strToTitle(UBRK_CHARACTERS srcLength=-2)=%ld error=%s buffer[0]==0x%lx\n", length, u_errorName(errorCode), buffer[0]); } ubrk_close(titleIterChars); }
static void TestCaseDutchTitle(void) { static const UChar beforeTitle[]= { 0x69, 0x6A, 0x73, 0x73, 0x45, 0x6c, 0x20, 0x69, 0x67, 0x6c, 0x4f, 0x6f , 0x20 , 0x49, 0x4A, 0x53, 0x53, 0x45, 0x4C }, titleRoot[]= { 0x49, 0x6A, 0x73, 0x73, 0x65, 0x6c, 0x20, 0x49, 0x67, 0x6c, 0x6f, 0x6f , 0x20 , 0x49, 0x6A, 0x73, 0x73, 0x65, 0x6C }, titleDutch[]= { 0x49, 0x4A, 0x73, 0x73, 0x65, 0x6c, 0x20, 0x49, 0x67, 0x6c, 0x6f, 0x6f , 0x20 , 0x49, 0x4A, 0x73, 0x73, 0x65, 0x6C }; UChar buffer[32]; UBreakIterator *titleIterWord; int32_t length; UErrorCode errorCode; errorCode=U_ZERO_ERROR; titleIterWord=ubrk_open(UBRK_WORD, "", beforeTitle, sizeof(beforeTitle)/U_SIZEOF_UCHAR, &errorCode); if(U_FAILURE(errorCode)) { log_err_status(errorCode, "error: ubrk_open(UBRK_WORD)->%s\n", u_errorName(errorCode)); return; } /* titlecase with default locale */ buffer[0]=0xabcd; errorCode=U_ZERO_ERROR; length=u_strToTitle(buffer, sizeof(buffer)/U_SIZEOF_UCHAR, beforeTitle, sizeof(beforeTitle)/U_SIZEOF_UCHAR, titleIterWord, "", &errorCode); if( U_FAILURE(errorCode) || length!=(sizeof(titleRoot)/U_SIZEOF_UCHAR) || uprv_memcmp(titleRoot, buffer, length*U_SIZEOF_UCHAR)!=0 || buffer[length]!=0 ) { char charsOut[21]; u_UCharsToChars(buffer,charsOut,sizeof(charsOut)); log_err("error in u_strToTitle(UBRK_CHARACTERS)=%ld error=%s root locale string matches: %s\noutput buffer is {%s}\n", length, u_errorName(errorCode), uprv_memcmp(titleRoot, buffer, length*U_SIZEOF_UCHAR)==0 && buffer[length]==0 ? 
"yes" : "no", charsOut); } /* titlecase with Dutch locale */ buffer[0]=0xabcd; errorCode=U_ZERO_ERROR; length=u_strToTitle(buffer, sizeof(buffer)/U_SIZEOF_UCHAR, beforeTitle, sizeof(beforeTitle)/U_SIZEOF_UCHAR, titleIterWord, "nl", &errorCode); if( U_FAILURE(errorCode) || length!=(sizeof(titleDutch)/U_SIZEOF_UCHAR) || uprv_memcmp(titleDutch, buffer, length*U_SIZEOF_UCHAR)!=0 || buffer[length]!=0 ) { char charsOut[21]; u_UCharsToChars(buffer,charsOut,sizeof(charsOut)); log_err("error in u_strToTitle(UBRK_CHARACTERS)=%ld error=%s dutch locale string matches: %s\noutput buffer is {%s}\n", length, u_errorName(errorCode), uprv_memcmp(titleDutch, buffer, length*U_SIZEOF_UCHAR)==0 && buffer[length]==0 ? "yes" : "no", charsOut); } ubrk_close(titleIterWord); }
/* Try titlecasing with options. */ static void TestUCaseMapToTitle(void) { /* "a 'CaT. A 'dOg! 'eTc." where '=U+02BB */ /* * Note: The sentence BreakIterator does not recognize a '.' * as a sentence terminator if it is followed by lowercase. * That is why the example has the '!'. */ static const UChar beforeTitle[]= { 0x61, 0x20, 0x2bb, 0x43, 0x61, 0x54, 0x2e, 0x20, 0x41, 0x20, 0x2bb, 0x64, 0x4f, 0x67, 0x21, 0x20, 0x2bb, 0x65, 0x54, 0x63, 0x2e }, titleWord[]= { 0x41, 0x20, 0x2bb, 0x43, 0x61, 0x74, 0x2e, 0x20, 0x41, 0x20, 0x2bb, 0x44, 0x6f, 0x67, 0x21, 0x20, 0x2bb, 0x45, 0x74, 0x63, 0x2e }, titleWordNoAdjust[]={ 0x41, 0x20, 0x2bb, 0x63, 0x61, 0x74, 0x2e, 0x20, 0x41, 0x20, 0x2bb, 0x64, 0x6f, 0x67, 0x21, 0x20, 0x2bb, 0x65, 0x74, 0x63, 0x2e }, titleSentNoLower[]= { 0x41, 0x20, 0x2bb, 0x43, 0x61, 0x54, 0x2e, 0x20, 0x41, 0x20, 0x2bb, 0x64, 0x4f, 0x67, 0x21, 0x20, 0x2bb, 0x45, 0x54, 0x63, 0x2e }; UChar buffer[32]; UCaseMap *csm; UBreakIterator *sentenceIter; const UBreakIterator *iter; int32_t length; UErrorCode errorCode; errorCode=U_ZERO_ERROR; csm=ucasemap_open("", 0, &errorCode); if(U_FAILURE(errorCode)) { log_err("ucasemap_open(\"\") failed - %s\n", u_errorName(errorCode)); return; } iter=ucasemap_getBreakIterator(csm); if(iter!=NULL) { log_err("ucasemap_getBreakIterator() returns %p!=NULL before setting any iterator or titlecasing\n", iter); } /* Use default UBreakIterator: Word breaks. */ length=ucasemap_toTitle(csm, buffer, UPRV_LENGTHOF(buffer), beforeTitle, UPRV_LENGTHOF(beforeTitle), &errorCode); if( U_FAILURE(errorCode) || length!=UPRV_LENGTHOF(titleWord) || 0!=u_memcmp(buffer, titleWord, length) || buffer[length]!=0 ) { log_err_status(errorCode, "ucasemap_toTitle(default iterator)=%ld failed - %s\n", (long)length, u_errorName(errorCode)); } if (U_SUCCESS(errorCode)) { iter=ucasemap_getBreakIterator(csm); if(iter==NULL) { log_err("ucasemap_getBreakIterator() returns NULL after titlecasing\n"); } } /* Try U_TITLECASE_NO_BREAK_ADJUSTMENT. 
*/ ucasemap_setOptions(csm, U_TITLECASE_NO_BREAK_ADJUSTMENT, &errorCode); if(U_FAILURE(errorCode)) { log_err_status(errorCode, "error: ucasemap_setOptions(U_TITLECASE_NO_BREAK_ADJUSTMENT) failed - %s\n", u_errorName(errorCode)); return; } length=ucasemap_toTitle(csm, buffer, UPRV_LENGTHOF(buffer), beforeTitle, UPRV_LENGTHOF(beforeTitle), &errorCode); if( U_FAILURE(errorCode) || length!=UPRV_LENGTHOF(titleWordNoAdjust) || 0!=u_memcmp(buffer, titleWordNoAdjust, length) || buffer[length]!=0 ) { log_err("ucasemap_toTitle(default iterator, no break adjustment)=%ld failed - %s\n", (long)length, u_errorName(errorCode)); } /* Set a sentence break iterator. */ errorCode=U_ZERO_ERROR; sentenceIter=ubrk_open(UBRK_SENTENCE, "", NULL, 0, &errorCode); if(U_FAILURE(errorCode)) { log_err("error: ubrk_open(UBRK_SENTENCE) failed - %s\n", u_errorName(errorCode)); ucasemap_close(csm); return; } ucasemap_setBreakIterator(csm, sentenceIter, &errorCode); if(U_FAILURE(errorCode)) { log_err("error: ucasemap_setBreakIterator(sentence iterator) failed - %s\n", u_errorName(errorCode)); ubrk_close(sentenceIter); ucasemap_close(csm); return; } iter=ucasemap_getBreakIterator(csm); if(iter!=sentenceIter) { log_err("ucasemap_getBreakIterator() returns %p!=%p after setting the iterator\n", iter, sentenceIter); } ucasemap_setOptions(csm, U_TITLECASE_NO_LOWERCASE, &errorCode); if(U_FAILURE(errorCode)) { log_err("error: ucasemap_setOptions(U_TITLECASE_NO_LOWERCASE) failed - %s\n", u_errorName(errorCode)); return; } /* Use the sentence break iterator with the option. Preflight first. 
*/ length=ucasemap_toTitle(csm, NULL, 0, beforeTitle, UPRV_LENGTHOF(beforeTitle), &errorCode); if( errorCode!=U_BUFFER_OVERFLOW_ERROR || length!=UPRV_LENGTHOF(titleSentNoLower) ) { log_err("ucasemap_toTitle(preflight sentence break iterator, no lowercasing)=%ld failed - %s\n", (long)length, u_errorName(errorCode)); } errorCode=U_ZERO_ERROR; buffer[0]=0; length=ucasemap_toTitle(csm, buffer, UPRV_LENGTHOF(buffer), beforeTitle, UPRV_LENGTHOF(beforeTitle), &errorCode); if( U_FAILURE(errorCode) || length!=UPRV_LENGTHOF(titleSentNoLower) || 0!=u_memcmp(buffer, titleSentNoLower, length) || buffer[length]!=0 ) { log_err("ucasemap_toTitle(sentence break iterator, no lowercasing)=%ld failed - %s\n", (long)length, u_errorName(errorCode)); } /* UTF-8 C API coverage. More thorough test via C++ intltest's StringCaseTest::TestCasing(). */ { char utf8BeforeTitle[64], utf8TitleSentNoLower[64], utf8[64]; int32_t utf8BeforeTitleLength, utf8TitleSentNoLowerLength; errorCode=U_ZERO_ERROR; u_strToUTF8(utf8BeforeTitle, (int32_t)sizeof(utf8BeforeTitle), &utf8BeforeTitleLength, beforeTitle, UPRV_LENGTHOF(beforeTitle), &errorCode); u_strToUTF8(utf8TitleSentNoLower, (int32_t)sizeof(utf8TitleSentNoLower), &utf8TitleSentNoLowerLength, titleSentNoLower, UPRV_LENGTHOF(titleSentNoLower), &errorCode); length=ucasemap_utf8ToTitle(csm, utf8, (int32_t)sizeof(utf8), utf8BeforeTitle, utf8BeforeTitleLength, &errorCode); if( U_FAILURE(errorCode) || length!=utf8TitleSentNoLowerLength || 0!=uprv_memcmp(utf8, utf8TitleSentNoLower, length) || utf8[length]!=0 ) { log_err("ucasemap_utf8ToTitle(sentence break iterator, no lowercasing)=%ld failed - %s\n", (long)length, u_errorName(errorCode)); } } ucasemap_close(csm); }