void RBBIAPITest::RoundtripRule(const char *dataFile) { UErrorCode status = U_ZERO_ERROR; UParseError parseError; parseError.line = 0; parseError.offset = 0; LocalUDataMemoryPointer data(udata_open(U_ICUDATA_BRKITR, "brk", dataFile, &status)); uint32_t length; const UChar *builtSource; const uint8_t *rbbiRules; const uint8_t *builtRules; if (U_FAILURE(status)) { errcheckln(status, "Can't open \"%s\" - %s", dataFile, u_errorName(status)); return; } builtRules = (const uint8_t *)udata_getMemory(data.getAlias()); builtSource = (const UChar *)(builtRules + ((RBBIDataHeader*)builtRules)->fRuleSource); RuleBasedBreakIterator *brkItr = new RuleBasedBreakIterator(builtSource, parseError, status); if (U_FAILURE(status)) { errln("createRuleBasedBreakIterator: ICU Error \"%s\" at line %d, column %d\n", u_errorName(status), parseError.line, parseError.offset); return; }; rbbiRules = brkItr->getBinaryRules(length); logln("Comparing \"%s\" len=%d", dataFile, length); if (memcmp(builtRules, rbbiRules, (int32_t)length) != 0) { errln("Built rules and rebuilt rules are different %s", dataFile); return; } delete brkItr; }
// // Bug 2190 Regression test. Builder crash on rule consisting of only a // $variable reference void RBBIAPITest::TestBug2190() { UnicodeString rulesString1 = "$aaa = abcd;\n" "$bbb = $aaa;\n" "$bbb;\n"; UnicodeString testString1 = "abcdabcd"; // 01234567890 int32_t bounds1[] = {0, 4, 8}; UErrorCode status=U_ZERO_ERROR; UParseError parseError; RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status); if(U_FAILURE(status)) { dataerrln("Fail : in construction - %s", u_errorName(status)); } else { bi->setText(testString1); doBoundaryTest(*bi, testString1, bounds1); } delete bi; }
// // TestQuoteGrouping // Single quotes within rules imply a grouping, so that a modifier // following the quoted text (* or +) applies to all of the quoted chars. // void RBBIAPITest::TestQuoteGrouping() { UnicodeString rulesString1 = "#Here comes the rule...\n" "'$@!'*;\n" // (\$\@\!)* ".;\n"; UnicodeString testString1 = "$@!$@!X$@!!X"; // 0123456789012 int32_t bounds1[] = {0, 6, 7, 10, 11, 12}; UErrorCode status=U_ZERO_ERROR; UParseError parseError; RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status); if(U_FAILURE(status)) { dataerrln("Fail : in construction - %s", u_errorName(status)); } else { bi->setText(testString1); doBoundaryTest(*bi, testString1, bounds1); } delete bi; }
void RBBIAPITest::TestBuilder() { UnicodeString rulesString1 = "$Letters = [:L:];\n" "$Numbers = [:N:];\n" "$Letters+;\n" "$Numbers+;\n" "[^$Letters $Numbers];\n" "!.*;\n"; UnicodeString testString1 = "abc123..abc"; // 01234567890 int32_t bounds1[] = {0, 3, 6, 7, 8, 11}; UErrorCode status=U_ZERO_ERROR; UParseError parseError; RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status); if(U_FAILURE(status)) { dataerrln("Fail : in construction - %s", u_errorName(status)); } else { bi->setText(testString1); doBoundaryTest(*bi, testString1, bounds1); } delete bi; }
void RBBIAPITest::TestRefreshInputText() { /* * RefreshInput changes out the input of a Break Iterator without * changing anything else in the iterator's state. Used with Java JNI, * when Java moves the underlying string storage. This test * runs BreakIterator::next() repeatedly, moving the text in the middle of the sequence. * The right set of boundaries should still be found. */ UChar testStr[] = {0x20, 0x41, 0x20, 0x42, 0x20, 0x43, 0x20, 0x44, 0x0}; /* = " A B C D" */ UChar movedStr[] = {0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0}; UErrorCode status = U_ZERO_ERROR; UText ut1 = UTEXT_INITIALIZER; UText ut2 = UTEXT_INITIALIZER; RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status); TEST_ASSERT_SUCCESS(status); utext_openUChars(&ut1, testStr, -1, &status); TEST_ASSERT_SUCCESS(status); if (U_SUCCESS(status)) { bi->setText(&ut1, status); TEST_ASSERT_SUCCESS(status); /* Line boundaries will occur before each letter in the original string */ TEST_ASSERT(1 == bi->next()); TEST_ASSERT(3 == bi->next()); /* Move the string, kill the original string. */ u_strcpy(movedStr, testStr); u_memset(testStr, 0x20, u_strlen(testStr)); utext_openUChars(&ut2, movedStr, -1, &status); TEST_ASSERT_SUCCESS(status); RuleBasedBreakIterator *returnedBI = &bi->refreshInputText(&ut2, status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(bi == returnedBI); /* Find the following matches, now working in the moved string. */ TEST_ASSERT(5 == bi->next()); TEST_ASSERT(7 == bi->next()); TEST_ASSERT(8 == bi->next()); TEST_ASSERT(UBRK_DONE == bi->next()); utext_close(&ut1); utext_close(&ut2); } delete bi; }
//
//  doBoundaryTest
//     Calls bi.isBoundary() for every index in [0, text.length()) and reports
//     an error whenever the answer disagrees with the expected boundary list.
//
//     bi          iterator under test; the caller has already set its text.
//     text        the text being iterated; only its length is used here.
//     boundaries  expected boundary positions in ascending order.  The array
//                 carries no length, so the caller must supply an entry for
//                 every position this loop can match.
//
void RBBIAPITest::doBoundaryTest(RuleBasedBreakIterator& bi, UnicodeString& text, int32_t *boundaries){
    logln((UnicodeString)"testIsBoundary():");

    int32_t nextExpected = 0;       // index into boundaries[] of the next expected break
    for (int32_t i = 0; i < text.length(); i++) {
        const UBool isB = bi.isBoundary(i);
        logln((UnicodeString)"bi.isBoundary(" + i + ") -> " + isB);

        const UBool shouldBreak = (i == boundaries[nextExpected]);
        if (shouldBreak) {
            // This expected boundary is consumed whether or not the iterator agreed.
            nextExpected++;
        }

        if (shouldBreak && !isB) {
            errln((UnicodeString)"Wrong result from isBoundary() for " + i + (UnicodeString)": expected true, got false");
        } else if (!shouldBreak && isB) {
            errln((UnicodeString)"Wrong result from isBoundary() for " + i + (UnicodeString)": expected false, got true");
        }
    }
}
U_NAMESPACE_BEGIN

// -------------------------------------

/**
 * Creates a RuleBasedBreakIterator for a locale from pre-built rule data.
 *
 * Opens the break iterator resource bundle for the locale, looks up the
 * entry `type` in its "boundaries" table to obtain a "name.ext" data file
 * name, opens that data item, and wraps it in a RuleBasedBreakIterator.
 *
 * @param loc     Locale whose break rules are wanted.
 * @param type    Key inside the "boundaries" table naming the rules file.
 * @param kind    Break type recorded on the result via setBreakType().
 * @param status  In/out ICU error code.  Nothing is done if it already
 *                indicates failure.  Set to U_BUFFER_OVERFLOW_ERROR when the
 *                file name or its extension does not fit the local buffers,
 *                and to U_MEMORY_ALLOCATION_ERROR when the iterator could not
 *                be allocated.
 * @return        The new iterator, owned by the caller, or NULL on failure.
 */
BreakIterator*
BreakIterator::buildInstance(const Locale& loc, const char *type, int32_t kind, UErrorCode &status)
{
    char fnbuff[256];
    char ext[4]={'\0'};
    char actualLocale[ULOC_FULLNAME_CAPACITY];
    int32_t size;
    const UChar* brkfname = NULL;
    UResourceBundle brkRulesStack;
    UResourceBundle brkNameStack;
    UResourceBundle *brkRules = &brkRulesStack;
    UResourceBundle *brkName  = &brkNameStack;
    RuleBasedBreakIterator *result = NULL;

    if (U_FAILURE(status))
        return NULL;

    ures_initStackObject(brkRules);
    ures_initStackObject(brkName);

    // Get the locale
    UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, loc.getName(), &status);
    /* this is a hack for now. Should be fixed when the data is fetched from brk_index.txt */
    if(status==U_USING_DEFAULT_WARNING){
        status=U_ZERO_ERROR;
        ures_openFillIn(b, U_ICUDATA_BRKITR, "", &status);
    }

    // Get the "boundaries" array.
    if (U_SUCCESS(status)) {
        brkRules = ures_getByKeyWithFallback(b, "boundaries", brkRules, &status);
        // Get the string object naming the rules file
        brkName = ures_getByKeyWithFallback(brkRules, type, brkName, &status);
        // Get the actual string
        brkfname = ures_getString(brkName, &size, &status);
        U_ASSERT((size_t)size<sizeof(fnbuff));
        if ((size_t)size>=sizeof(fnbuff)) {
            size=0;
            if (U_SUCCESS(status)) {
                status = U_BUFFER_OVERFLOW_ERROR;
            }
        }

        // Use the string if we found it
        if (U_SUCCESS(status) && brkfname) {
            uprv_strncpy(actualLocale,
                ures_getLocale(brkName, &status),
                sizeof(actualLocale)/sizeof(actualLocale[0]));
            // strncpy does not terminate the destination when the source fills it.
            actualLocale[sizeof(actualLocale)/sizeof(actualLocale[0]) - 1] = 0;

            UChar* extStart=u_strchr(brkfname, 0x002e);
            int len = 0;
            if(extStart!=NULL){
                len = (int)(extStart-brkfname);
                // Number of UChars after the dot.  Convert exactly that many
                // instead of a fixed sizeof(ext), which would read past the
                // end of a short extension and leave a long one unterminated.
                const int32_t extLen = size - len - 1;
                if ((size_t)extLen >= sizeof(ext)) {
                    status = U_BUFFER_OVERFLOW_ERROR;
                } else {
                    u_UCharsToChars(extStart+1, ext, extLen);
                    ext[extLen] = 0; // nul terminate the extension
                    u_UCharsToChars(brkfname, fnbuff, len);
                }
            }
            fnbuff[len]=0; // nul terminate
        }
    }

    ures_close(brkRules);
    ures_close(brkName);

    UDataMemory* file = udata_open(U_ICUDATA_BRKITR, ext, fnbuff, &status);
    if (U_FAILURE(status)) {
        ures_close(b);
        return NULL;
    }

    // Create a RuleBasedBreakIterator
    result = new RuleBasedBreakIterator(file, status);

    // If there is a result, set the valid locale and actual locale, and the kind
    if (U_SUCCESS(status) && result != NULL) {
        U_LOCALE_BASED(locBased, *(BreakIterator*)result);
        locBased.setLocaleIDs(ures_getLocaleByType(b, ULOC_VALID_LOCALE, &status), actualLocale);
        result->setBreakType(kind);
    }

    ures_close(b);

    if (U_FAILURE(status) && result != NULL) {  // Sometimes redundant check, but simple
        delete result;
        return NULL;
    }

    if (result == NULL) {
        // No iterator took ownership of the data, so it is released here.
        udata_close(file);
        if (U_SUCCESS(status)) {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
    }

    return result;
}
// // TestRuleStatusVec // Test the vector form of break rule status. // void RBBIAPITest::TestRuleStatusVec() { UnicodeString rulesString( "[A-N]{100}; \n" "[a-w]{200}; \n" "[\\p{L}]{300}; \n" "[\\p{N}]{400}; \n" "[0-5]{500}; \n" "!.*;\n", -1, US_INV); UnicodeString testString1 = "Aapz5?"; int32_t statusVals[10]; int32_t numStatuses; int32_t pos; UErrorCode status=U_ZERO_ERROR; UParseError parseError; RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString, parseError, status); if (U_FAILURE(status)) { dataerrln("Failure at file %s, line %d, error = %s", __FILE__, __LINE__, u_errorName(status)); } else { bi->setText(testString1); // A pos = bi->next(); TEST_ASSERT(pos==1); numStatuses = bi->getRuleStatusVec(statusVals, 10, status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(numStatuses == 2); TEST_ASSERT(statusVals[0] == 100); TEST_ASSERT(statusVals[1] == 300); // a pos = bi->next(); TEST_ASSERT(pos==2); numStatuses = bi->getRuleStatusVec(statusVals, 10, status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(numStatuses == 2); TEST_ASSERT(statusVals[0] == 200); TEST_ASSERT(statusVals[1] == 300); // p pos = bi->next(); TEST_ASSERT(pos==3); numStatuses = bi->getRuleStatusVec(statusVals, 10, status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(numStatuses == 2); TEST_ASSERT(statusVals[0] == 200); TEST_ASSERT(statusVals[1] == 300); // z pos = bi->next(); TEST_ASSERT(pos==4); numStatuses = bi->getRuleStatusVec(statusVals, 10, status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(numStatuses == 1); TEST_ASSERT(statusVals[0] == 300); // 5 pos = bi->next(); TEST_ASSERT(pos==5); numStatuses = bi->getRuleStatusVec(statusVals, 10, status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(numStatuses == 2); TEST_ASSERT(statusVals[0] == 400); TEST_ASSERT(statusVals[1] == 500); // ? 
pos = bi->next(); TEST_ASSERT(pos==6); numStatuses = bi->getRuleStatusVec(statusVals, 10, status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(numStatuses == 1); TEST_ASSERT(statusVals[0] == 0); // // Check buffer overflow error handling. Char == A // bi->first(); pos = bi->next(); TEST_ASSERT(pos==1); memset(statusVals, -1, sizeof(statusVals)); numStatuses = bi->getRuleStatusVec(statusVals, 0, status); TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR); TEST_ASSERT(numStatuses == 2); TEST_ASSERT(statusVals[0] == -1); status = U_ZERO_ERROR; memset(statusVals, -1, sizeof(statusVals)); numStatuses = bi->getRuleStatusVec(statusVals, 1, status); TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR); TEST_ASSERT(numStatuses == 2); TEST_ASSERT(statusVals[0] == 100); TEST_ASSERT(statusVals[1] == -1); status = U_ZERO_ERROR; memset(statusVals, -1, sizeof(statusVals)); numStatuses = bi->getRuleStatusVec(statusVals, 2, status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(numStatuses == 2); TEST_ASSERT(statusVals[0] == 100); TEST_ASSERT(statusVals[1] == 300); TEST_ASSERT(statusVals[2] == -1); } delete bi; }
void RBBIAPITest::TestCloneEquals() { UErrorCode status=U_ZERO_ERROR; RuleBasedBreakIterator* bi1 = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status); RuleBasedBreakIterator* biequal = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status); RuleBasedBreakIterator* bi3 = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status); RuleBasedBreakIterator* bi2 = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createWordInstance(Locale::getDefault(), status); if(U_FAILURE(status)){ errcheckln(status, "Fail : in construction - %s", u_errorName(status)); return; } UnicodeString testString="Testing word break iterators's clone() and equals()"; bi1->setText(testString); bi2->setText(testString); biequal->setText(testString); bi3->setText("hello"); logln((UnicodeString)"Testing equals()"); logln((UnicodeString)"Testing == and !="); UBool b = (*bi1 != *biequal); b |= *bi1 == *bi2; b |= *bi1 == *bi3; if (b) { errln((UnicodeString)"ERROR:1 RBBI's == and != operator failed."); } if(*bi2 == *biequal || *bi2 == *bi1 || *biequal == *bi3) errln((UnicodeString)"ERROR:2 RBBI's == and != operator failed."); // Quick test of RulesBasedBreakIterator assignment - // Check that // two different iterators are != // they are == after assignment // source and dest iterator produce the same next() after assignment. // deleting one doesn't disable the other. 
logln("Testing assignment"); RuleBasedBreakIterator *bix = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status); if(U_FAILURE(status)){ errcheckln(status, "Fail : in construction - %s", u_errorName(status)); return; } RuleBasedBreakIterator biDefault, biDefault2; if(U_FAILURE(status)){ errln((UnicodeString)"FAIL : in construction of default iterator"); return; } if (biDefault == *bix) { errln((UnicodeString)"ERROR: iterators should not compare =="); return; } if (biDefault != biDefault2) { errln((UnicodeString)"ERROR: iterators should compare =="); return; } UnicodeString HelloString("Hello Kitty"); bix->setText(HelloString); if (*bix == *bi2) { errln(UnicodeString("ERROR: strings should not be equal before assignment.")); } *bix = *bi2; if (*bix != *bi2) { errln(UnicodeString("ERROR: strings should be equal before assignment.")); } int bixnext = bix->next(); int bi2next = bi2->next(); if (! (bixnext == bi2next && bixnext == 7)) { errln(UnicodeString("ERROR: iterators behaved differently after assignment.")); } delete bix; if (bi2->next() != 8) { errln(UnicodeString("ERROR: iterator.next() failed after deleting copy.")); } logln((UnicodeString)"Testing clone()"); RuleBasedBreakIterator* bi1clone=(RuleBasedBreakIterator*)bi1->clone(); RuleBasedBreakIterator* bi2clone=(RuleBasedBreakIterator*)bi2->clone(); if(*bi1clone != *bi1 || *bi1clone != *biequal || *bi1clone == *bi3 || *bi1clone == *bi2) errln((UnicodeString)"ERROR:1 RBBI's clone() method failed"); if(*bi2clone == *bi1 || *bi2clone == *biequal || *bi2clone == *bi3 || *bi2clone != *bi2) errln((UnicodeString)"ERROR:2 RBBI's clone() method failed"); if(bi1->getText() != bi1clone->getText() || bi2clone->getText() != bi2->getText() || *bi2clone == *bi1clone ) errln((UnicodeString)"ERROR: RBBI's clone() method failed"); delete bi1clone; delete bi2clone; delete bi1; delete bi3; delete bi2; delete biequal; }
void RBBIAPITest::TestIteration() { // This test just verifies that the API is present. // Testing for correct operation of the break rules happens elsewhere. UErrorCode status=U_ZERO_ERROR; RuleBasedBreakIterator* bi = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status); if (U_FAILURE(status) || bi == NULL) { errcheckln(status, "Failure creating character break iterator. Status = %s", u_errorName(status)); } delete bi; status=U_ZERO_ERROR; bi = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createWordInstance(Locale::getDefault(), status); if (U_FAILURE(status) || bi == NULL) { errcheckln(status, "Failure creating Word break iterator. Status = %s", u_errorName(status)); } delete bi; status=U_ZERO_ERROR; bi = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createLineInstance(Locale::getDefault(), status); if (U_FAILURE(status) || bi == NULL) { errcheckln(status, "Failure creating Line break iterator. Status = %s", u_errorName(status)); } delete bi; status=U_ZERO_ERROR; bi = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createSentenceInstance(Locale::getDefault(), status); if (U_FAILURE(status) || bi == NULL) { errcheckln(status, "Failure creating Sentence break iterator. Status = %s", u_errorName(status)); } delete bi; status=U_ZERO_ERROR; bi = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createTitleInstance(Locale::getDefault(), status); if (U_FAILURE(status) || bi == NULL) { errcheckln(status, "Failure creating Title break iterator. Status = %s", u_errorName(status)); } delete bi; status=U_ZERO_ERROR; bi = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status); if (U_FAILURE(status) || bi == NULL) { errcheckln(status, "Failure creating character break iterator. Status = %s", u_errorName(status)); return; // Skip the rest of these tests. 
} UnicodeString testString="0123456789"; bi->setText(testString); int32_t i; i = bi->first(); if (i != 0) { errln("Incorrect value from bi->first(). Expected 0, got %d.", i); } i = bi->last(); if (i != 10) { errln("Incorrect value from bi->last(). Expected 10, got %d", i); } // // Previous // bi->last(); i = bi->previous(); if (i != 9) { errln("Incorrect value from bi->last() at line %d. Expected 9, got %d", __LINE__, i); } bi->first(); i = bi->previous(); if (i != BreakIterator::DONE) { errln("Incorrect value from bi->previous() at line %d. Expected DONE, got %d", __LINE__, i); } // // next() // bi->first(); i = bi->next(); if (i != 1) { errln("Incorrect value from bi->next() at line %d. Expected 1, got %d", __LINE__, i); } bi->last(); i = bi->next(); if (i != BreakIterator::DONE) { errln("Incorrect value from bi->next() at line %d. Expected DONE, got %d", __LINE__, i); } // // current() // bi->first(); i = bi->current(); if (i != 0) { errln("Incorrect value from bi->previous() at line %d. Expected 0, got %d", __LINE__, i); } bi->next(); i = bi->current(); if (i != 1) { errln("Incorrect value from bi->previous() at line %d. Expected 1, got %d", __LINE__, i); } bi->last(); bi->next(); i = bi->current(); if (i != 10) { errln("Incorrect value from bi->previous() at line %d. Expected 10, got %d", __LINE__, i); } bi->first(); bi->previous(); i = bi->current(); if (i != 0) { errln("Incorrect value from bi->previous() at line %d. Expected 0, got %d", __LINE__, i); } // // Following() // i = bi->following(4); if (i != 5) { errln("Incorrect value from bi->following() at line %d. Expected 5, got %d", __LINE__, i); } i = bi->following(9); if (i != 10) { errln("Incorrect value from bi->following() at line %d. Expected 10, got %d", __LINE__, i); } i = bi->following(10); if (i != BreakIterator::DONE) { errln("Incorrect value from bi->following() at line %d. 
Expected DONE, got %d", __LINE__, i); } // // Preceding // i = bi->preceding(4); if (i != 3) { errln("Incorrect value from bi->preceding() at line %d. Expected 3, got %d", __LINE__, i); } i = bi->preceding(10); if (i != 9) { errln("Incorrect value from bi->preceding() at line %d. Expected 9, got %d", __LINE__, i); } i = bi->preceding(1); if (i != 0) { errln("Incorrect value from bi->preceding() at line %d. Expected 0, got %d", __LINE__, i); } i = bi->preceding(0); if (i != BreakIterator::DONE) { errln("Incorrect value from bi->preceding() at line %d. Expected DONE, got %d", __LINE__, i); } // // isBoundary() // bi->first(); if (bi->isBoundary(3) != TRUE) { errln("Incorrect value from bi->isBoudary() at line %d. Expected TRUE, got FALSE", __LINE__, i); } i = bi->current(); if (i != 3) { errln("Incorrect value from bi->current() at line %d. Expected 3, got %d", __LINE__, i); } if (bi->isBoundary(11) != FALSE) { errln("Incorrect value from bi->isBoudary() at line %d. Expected FALSE, got TRUE", __LINE__, i); } i = bi->current(); if (i != 10) { errln("Incorrect value from bi->current() at line %d. Expected 10, got %d", __LINE__, i); } // // next(n) // bi->first(); i = bi->next(4); if (i != 4) { errln("Incorrect value from bi->next() at line %d. Expected 4, got %d", __LINE__, i); } i = bi->next(6); if (i != 10) { errln("Incorrect value from bi->next() at line %d. Expected 10, got %d", __LINE__, i); } bi->first(); i = bi->next(11); if (i != BreakIterator::DONE) { errln("Incorrect value from bi->next() at line %d. Expected BreakIterator::DONE, got %d", __LINE__, i); } delete bi; }
// Try out the RuleBasedBreakIterator constructors that take RBBIDataHeader* // (these are protected so we access them via a local class RBBIWithProtectedFunctions). // This is just a sanity check, not a thorough test (e.g. we don't check that the // first delete actually frees rulesCopy). void RBBIAPITest::TestCreateFromRBBIData() { // Get some handy RBBIData const char *brkName = "word"; // or "sent", "line", "char", etc. UErrorCode status = U_ZERO_ERROR; LocalUDataMemoryPointer data(udata_open(U_ICUDATA_BRKITR, "brk", brkName, &status)); if ( U_SUCCESS(status) ) { const RBBIDataHeader * builtRules = (const RBBIDataHeader *)udata_getMemory(data.getAlias()); uint32_t length = builtRules->fLength; RBBIWithProtectedFunctions * brkItr; // Try the memory-adopting constructor, need to copy the data first RBBIDataHeader * rulesCopy = (RBBIDataHeader *) uprv_malloc(length); if ( rulesCopy ) { uprv_memcpy( rulesCopy, builtRules, length ); brkItr = new RBBIWithProtectedFunctions(rulesCopy, status); if ( U_SUCCESS(status) ) { delete brkItr; // this should free rulesCopy } else { errln("create RuleBasedBreakIterator from RBBIData (adopted): ICU Error \"%s\"\n", u_errorName(status) ); status = U_ZERO_ERROR;// reset for the next test uprv_free( rulesCopy ); } } // Now try the non-adopting constructor brkItr = new RBBIWithProtectedFunctions(builtRules, RBBIWithProtectedFunctions::kDontAdopt, status); if ( U_SUCCESS(status) ) { delete brkItr; // this should NOT attempt to free builtRules if (builtRules->fLength != length) { // sanity check errln("create RuleBasedBreakIterator from RBBIData (non-adopted): delete affects data\n" ); } } else { errln("create RuleBasedBreakIterator from RBBIData (non-adopted): ICU Error \"%s\"\n", u_errorName(status) ); } } // getBinaryRules() and RuleBasedBreakIterator(uint8_t binaryRules, ...) 
// status = U_ZERO_ERROR; RuleBasedBreakIterator *rb = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status); if (rb == NULL || U_FAILURE(status)) { dataerrln("Unable to create BreakIterator::createWordInstance (Locale::getEnglish) - %s", u_errorName(status)); } else { uint32_t length; const uint8_t *rules = rb->getBinaryRules(length); RuleBasedBreakIterator *rb2 = new RuleBasedBreakIterator(rules, length, status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(*rb == *rb2); UnicodeString words = "one two three "; rb2->setText(words); int wordCounter = 0; while (rb2->next() != UBRK_DONE) { wordCounter++; } TEST_ASSERT(wordCounter == 6); status = U_ZERO_ERROR; RuleBasedBreakIterator *rb3 = new RuleBasedBreakIterator(rules, length-1, status); TEST_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR); delete rb; delete rb2; delete rb3; } }
//---------------------------------------------------------------------------- // // main for genbrk // //---------------------------------------------------------------------------- int main(int argc, char **argv) { UErrorCode status = U_ZERO_ERROR; const char *ruleFileName; const char *outFileName; const char *outDir = NULL; const char *copyright = NULL; // // Pick up and check the command line arguments, // using the standard ICU tool utils option handling. // U_MAIN_INIT_ARGS(argc, argv); progName = argv[0]; argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); if(argc<0) { // Unrecognized option fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]); usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); } if(options[0].doesOccur || options[1].doesOccur) { // -? or -h for help. usageAndDie(0); } if (!(options[3].doesOccur && options[4].doesOccur)) { fprintf(stderr, "rule file and output file must both be specified.\n"); usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); } ruleFileName = options[3].value; outFileName = options[4].value; if (options[5].doesOccur) { u_setDataDirectory(options[5].value); } /* Initialize ICU */ u_init(&status); if (U_FAILURE(status)) { fprintf(stderr, "%s: can not initialize ICU. 
status = %s\n", argv[0], u_errorName(status)); exit(1); } status = U_ZERO_ERROR; /* Combine the directory with the file name */ if(options[6].doesOccur) { outDir = options[6].value; } if (options[7].doesOccur) { copyright = U_COPYRIGHT_STRING; } #if UCONFIG_NO_BREAK_ITERATION UNewDataMemory *pData; char msg[1024]; /* write message with just the name */ sprintf(msg, "genbrk writes dummy %s because of UCONFIG_NO_BREAK_ITERATION, see uconfig.h", outFileName); fprintf(stderr, "%s\n", msg); /* write the dummy data file */ pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status); udata_writeBlock(pData, msg, strlen(msg)); udata_finish(pData, &status); return (int)status; #else // // Read in the rule source file // long result; long ruleFileSize; FILE *file; char *ruleBufferC; file = fopen(ruleFileName, "rb"); if( file == 0 ) { fprintf(stderr, "Could not open file \"%s\"\n", ruleFileName); exit(-1); } fseek(file, 0, SEEK_END); ruleFileSize = ftell(file); fseek(file, 0, SEEK_SET); ruleBufferC = new char[ruleFileSize+10]; result = (long)fread(ruleBufferC, 1, ruleFileSize, file); if (result != ruleFileSize) { fprintf(stderr, "Error reading file \"%s\"\n", ruleFileName); exit (-1); } ruleBufferC[ruleFileSize]=0; fclose(file); // // Look for a Unicode Signature (BOM) on the rule file // int32_t signatureLength; const char * ruleSourceC = ruleBufferC; const char* encoding = ucnv_detectUnicodeSignature( ruleSourceC, ruleFileSize, &signatureLength, &status); if (U_FAILURE(status)) { exit(status); } if(encoding!=NULL ){ ruleSourceC += signatureLength; ruleFileSize -= signatureLength; } // // Open a converter to take the rule file to UTF-16 // UConverter* conv; conv = ucnv_open(encoding, &status); if (U_FAILURE(status)) { fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status)); exit(status); } // // Convert the rules to UChar. // Preflight first to determine required buffer size. 
// uint32_t destCap = ucnv_toUChars(conv, NULL, // dest, 0, // destCapacity, ruleSourceC, ruleFileSize, &status); if (status != U_BUFFER_OVERFLOW_ERROR) { fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); exit(status); }; status = U_ZERO_ERROR; UChar *ruleSourceU = new UChar[destCap+1]; ucnv_toUChars(conv, ruleSourceU, // dest, destCap+1, ruleSourceC, ruleFileSize, &status); if (U_FAILURE(status)) { fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); exit(status); }; ucnv_close(conv); // // Put the source rules into a UnicodeString // UnicodeString ruleSourceS(FALSE, ruleSourceU, destCap); // // Create the break iterator from the rules // This will compile the rules. // UParseError parseError; parseError.line = 0; parseError.offset = 0; RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(ruleSourceS, parseError, status); if (U_FAILURE(status)) { fprintf(stderr, "createRuleBasedBreakIterator: ICU Error \"%s\" at line %d, column %d\n", u_errorName(status), (int)parseError.line, (int)parseError.offset); exit(status); }; // // Get the compiled rule data from the break iterator. // uint32_t outDataSize; const uint8_t *outData; outData = bi->getBinaryRules(outDataSize); // Copy the data format version numbers from the RBBI data header into the UDataMemory header. uprv_memcpy(dh.info.formatVersion, ((RBBIDataHeader *)outData)->fFormatVersion, sizeof(dh.info.formatVersion)); // // Create the output file // size_t bytesWritten; UNewDataMemory *pData; pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status); if(U_FAILURE(status)) { fprintf(stderr, "genbrk: Could not open output file \"%s\", \"%s\"\n", outFileName, u_errorName(status)); exit(status); } // Write the data itself. 
udata_writeBlock(pData, outData, outDataSize); // finish up bytesWritten = udata_finish(pData, &status); if(U_FAILURE(status)) { fprintf(stderr, "genbrk: error %d writing the output file\n", status); exit(status); } if (bytesWritten != outDataSize) { fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName); exit(-1); } delete bi; delete[] ruleSourceU; delete[] ruleBufferC; u_cleanup(); printf("genbrk: tool completed successfully.\n"); return 0; #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ }
// // TestRuleStatus // Test word break rule status constants. // void RBBIAPITest::TestRuleStatus() { UChar str[30]; u_unescape("plain word 123.45 \\u9160\\u9161 \\u30a1\\u30a2 \\u3041\\u3094", // 012345678901234567 8 9 0 1 2 3 4 5 6 // Ideographic Katakana Hiragana str, 30); UnicodeString testString1(str); int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 19, 20, 21, 23, 24, 25, 26}; int32_t tag_lo[] = {UBRK_WORD_NONE, UBRK_WORD_LETTER, UBRK_WORD_NONE, UBRK_WORD_LETTER, UBRK_WORD_NONE, UBRK_WORD_NUMBER, UBRK_WORD_NONE, UBRK_WORD_IDEO, UBRK_WORD_IDEO, UBRK_WORD_NONE, UBRK_WORD_KANA, UBRK_WORD_NONE, UBRK_WORD_KANA, UBRK_WORD_KANA}; int32_t tag_hi[] = {UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_NUMBER_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_IDEO_LIMIT, UBRK_WORD_IDEO_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_KANA_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_KANA_LIMIT, UBRK_WORD_KANA_LIMIT}; UErrorCode status=U_ZERO_ERROR; RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status); if(U_FAILURE(status)) { errcheckln(status, "Fail : in construction - %s", u_errorName(status)); } else { bi->setText(testString1); // First test that the breaks are in the right spots. doBoundaryTest(*bi, testString1, bounds1); // Then go back and check tag values int32_t i = 0; int32_t pos, tag; for (pos = bi->first(); pos != BreakIterator::DONE; pos = bi->next(), i++) { if (pos != bounds1[i]) { errln("FAIL: unexpected word break at postion %d", pos); break; } tag = bi->getRuleStatus(); if (tag < tag_lo[i] || tag >= tag_hi[i]) { errln("FAIL: incorrect tag value %d at position %d", tag, pos); break; } // Check that we get the same tag values from getRuleStatusVec() int32_t vec[10]; int t = bi->getRuleStatusVec(vec, 10, status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(t==1); TEST_ASSERT(vec[0] == tag); } } delete bi; // Now test line break status. 
This test mostly is to confirm that the status constants // are correctly declared in the header. testString1 = "test line. \n"; // break type s s h bi = (RuleBasedBreakIterator *) BreakIterator::createLineInstance(Locale::getEnglish(), status); if(U_FAILURE(status)) { errcheckln(status, "failed to create word break iterator. - %s", u_errorName(status)); } else { int32_t i = 0; int32_t pos, tag; UBool success; bi->setText(testString1); pos = bi->current(); tag = bi->getRuleStatus(); for (i=0; i<3; i++) { switch (i) { case 0: success = pos==0 && tag==UBRK_LINE_SOFT; break; case 1: success = pos==5 && tag==UBRK_LINE_SOFT; break; case 2: success = pos==12 && tag==UBRK_LINE_HARD; break; default: success = FALSE; break; } if (success == FALSE) { errln("Fail: incorrect word break status or position. i=%d, pos=%d, tag=%d", i, pos, tag); break; } pos = bi->next(); tag = bi->getRuleStatus(); } if (UBRK_LINE_SOFT >= UBRK_LINE_SOFT_LIMIT || UBRK_LINE_HARD >= UBRK_LINE_HARD_LIMIT || (UBRK_LINE_HARD > UBRK_LINE_SOFT && UBRK_LINE_HARD < UBRK_LINE_SOFT_LIMIT)) { errln("UBRK_LINE_* constants from header are inconsistent."); } } delete bi; }