Example #1
0
void RBBIAPITest::RoundtripRule(const char *dataFile) {
    UErrorCode status = U_ZERO_ERROR;
    UParseError parseError;
    parseError.line = 0;
    parseError.offset = 0;
    LocalUDataMemoryPointer data(udata_open(U_ICUDATA_BRKITR, "brk", dataFile, &status));
    uint32_t length;
    const UChar *builtSource;
    const uint8_t *rbbiRules;
    const uint8_t *builtRules;

    if (U_FAILURE(status)) {
        errcheckln(status, "Can't open \"%s\" - %s", dataFile, u_errorName(status));
        return;
    }

    builtRules = (const uint8_t *)udata_getMemory(data.getAlias());
    builtSource = (const UChar *)(builtRules + ((RBBIDataHeader*)builtRules)->fRuleSource);
    RuleBasedBreakIterator *brkItr = new RuleBasedBreakIterator(builtSource, parseError, status);
    if (U_FAILURE(status)) {
        errln("createRuleBasedBreakIterator: ICU Error \"%s\"  at line %d, column %d\n",
                u_errorName(status), parseError.line, parseError.offset);
        return;
    };
    rbbiRules = brkItr->getBinaryRules(length);
    logln("Comparing \"%s\" len=%d", dataFile, length);
    if (memcmp(builtRules, rbbiRules, (int32_t)length) != 0) {
        errln("Built rules and rebuilt rules are different %s", dataFile);
        return;
    }
    delete brkItr;
}
Example #2
0
//
//   Bug 2190 Regression test.   Builder crash on rule consisting of only a
//                               $variable reference
void RBBIAPITest::TestBug2190() {
     UnicodeString rulesString1 = "$aaa = abcd;\n"
                                  "$bbb = $aaa;\n"
                                  "$bbb;\n";
     UnicodeString testString1  = "abcdabcd";
                                // 01234567890
     int32_t bounds1[] = {0, 4, 8};
     UErrorCode status=U_ZERO_ERROR;
     UParseError    parseError;

     RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
     if(U_FAILURE(status)) {
         dataerrln("Fail : in construction - %s", u_errorName(status));
     } else {
         bi->setText(testString1);
         doBoundaryTest(*bi, testString1, bounds1);
     }
     delete bi;
}
Example #3
0
//
//  TestQuoteGrouping
//       Single quotes within rules imply a grouping, so that a modifier
//       following the quoted text (* or +) applies to all of the quoted chars.
//
void RBBIAPITest::TestQuoteGrouping() {
     UnicodeString rulesString1 = "#Here comes the rule...\n"
                                  "'$@!'*;\n"   //  (\$\@\!)*
                                  ".;\n";

     UnicodeString testString1  = "$@!$@!X$@!!X";
                                // 0123456789012
     int32_t bounds1[] = {0, 6, 7, 10, 11, 12};
     UErrorCode status=U_ZERO_ERROR;
     UParseError    parseError;

     RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
     if(U_FAILURE(status)) {
         dataerrln("Fail : in construction - %s", u_errorName(status));
     } else {
         bi->setText(testString1);
         doBoundaryTest(*bi, testString1, bounds1);
     }
     delete bi;
}
Example #4
0
void RBBIAPITest::TestBuilder() {
     UnicodeString rulesString1 = "$Letters = [:L:];\n"
                                  "$Numbers = [:N:];\n"
                                  "$Letters+;\n"
                                  "$Numbers+;\n"
                                  "[^$Letters $Numbers];\n"
                                  "!.*;\n";
     UnicodeString testString1  = "abc123..abc";
                                // 01234567890
     int32_t bounds1[] = {0, 3, 6, 7, 8, 11};
     UErrorCode status=U_ZERO_ERROR;
     UParseError    parseError;

     RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
     if(U_FAILURE(status)) {
         dataerrln("Fail : in construction - %s", u_errorName(status));
     } else {
         bi->setText(testString1);
         doBoundaryTest(*bi, testString1, bounds1);
     }
     delete bi;
}
Example #5
0
void RBBIAPITest::TestRefreshInputText() {
    /*
     *  RefreshInput changes out the input of a Break Iterator without
     *    changing anything else in the iterator's state.  Used with Java JNI,
     *    when Java moves the underlying string storage.   This test
     *    runs BreakIterator::next() repeatedly, moving the text in the middle of the sequence.
     *    The right set of boundaries should still be found.
     */
    UChar testStr[]  = {0x20, 0x41, 0x20, 0x42, 0x20, 0x43, 0x20, 0x44, 0x0};  /* = " A B C D"  */
    UChar movedStr[] = {0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,  0};
    UErrorCode status = U_ZERO_ERROR;
    UText ut1 = UTEXT_INITIALIZER;
    UText ut2 = UTEXT_INITIALIZER;
    RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
    TEST_ASSERT_SUCCESS(status);

    utext_openUChars(&ut1, testStr, -1, &status);
    TEST_ASSERT_SUCCESS(status);

    if (U_SUCCESS(status)) {
        bi->setText(&ut1, status);
        TEST_ASSERT_SUCCESS(status);

        /* Line boundaries will occur before each letter in the original string */
        TEST_ASSERT(1 == bi->next());
        TEST_ASSERT(3 == bi->next());

        /* Move the string, kill the original string.  */
        u_strcpy(movedStr, testStr);
        u_memset(testStr, 0x20, u_strlen(testStr));
        utext_openUChars(&ut2, movedStr, -1, &status);
        TEST_ASSERT_SUCCESS(status);
        RuleBasedBreakIterator *returnedBI = &bi->refreshInputText(&ut2, status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT(bi == returnedBI);

        /* Find the following matches, now working in the moved string. */
        TEST_ASSERT(5 == bi->next());
        TEST_ASSERT(7 == bi->next());
        TEST_ASSERT(8 == bi->next());
        TEST_ASSERT(UBRK_DONE == bi->next());
    
        utext_close(&ut1);
        utext_close(&ut2);
    }
    delete bi;

}
//
//  doBoundaryTest
//      For every offset in [0, text.length()) compare bi.isBoundary(offset)
//      with the expected break list.  The list is consumed front to back: an
//      offset counts as an expected break only when it equals the next
//      unconsumed entry of boundaries.
//      (Assumes boundaries is ascending and has an entry >= text.length() -
//      TODO confirm with callers.)
//
void RBBIAPITest::doBoundaryTest(RuleBasedBreakIterator& bi, UnicodeString& text, int32_t *boundaries){
    logln((UnicodeString)"testIsBoundary():");

    int32_t nextExpected = 0;    // index in boundaries of the next break we expect to reach
    for (int32_t offset = 0; offset < text.length(); offset++) {
        UBool reported = bi.isBoundary(offset);
        logln((UnicodeString)"bi.isBoundary(" + offset + ") -> " + reported);

        if (offset != boundaries[nextExpected]) {
            // Not an expected break position: the iterator must not report one.
            if (reported) {
                errln((UnicodeString)"Wrong result from isBoundary() for " + offset + (UnicodeString)": expected false, got true");
            }
            continue;
        }

        // Expected break position: the iterator must report it.
        if (!reported) {
            errln((UnicodeString)"Wrong result from isBoundary() for " + offset + (UnicodeString)": expected true, got false");
        }
        nextExpected++;
    }
}
Example #7
0
U_NAMESPACE_BEGIN

// -------------------------------------

/**
 * Locate the break rule data for a locale and build a RuleBasedBreakIterator
 * from it.
 *
 * Steps performed below:
 *   1. Open the resource bundle for loc in U_ICUDATA_BRKITR.
 *   2. Look up "boundaries" and then the entry keyed by type, to obtain the
 *      name of the rules data file.
 *   3. Split that name at the first '.' into a base name and an extension and
 *      open the data item with udata_open().
 *   4. Construct the iterator from the opened data, then record its locale IDs
 *      and break kind.
 *
 * @param loc     locale whose bundle is opened (via loc.getName()).
 * @param type    key looked up inside the "boundaries" resource.
 * @param kind    value passed to setBreakType() on the new iterator.
 * @param status  ICU error code; nothing is done if it already indicates failure.
 * @return the new iterator, or NULL if any step failed.
 */
BreakIterator*
BreakIterator::buildInstance(const Locale& loc, const char *type, int32_t kind, UErrorCode &status)
{
    char fnbuff[256];       // base name of the rules data file (text before the '.')
    char ext[4]={'\0'};     // extension of the rules data file (text after the '.')
    char actualLocale[ULOC_FULLNAME_CAPACITY];
    int32_t size;
    const UChar* brkfname = NULL;
    UResourceBundle brkRulesStack;
    UResourceBundle brkNameStack;
    UResourceBundle *brkRules = &brkRulesStack;
    UResourceBundle *brkName  = &brkNameStack;
    RuleBasedBreakIterator *result = NULL;

    if (U_FAILURE(status))
        return NULL;

    ures_initStackObject(brkRules);
    ures_initStackObject(brkName);

    // Get the locale
    UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, loc.getName(), &status);
    /* this is a hack for now. Should be fixed when the data is fetched from
        brk_index.txt */
    // On U_USING_DEFAULT_WARNING the status is reset and the bundle is
    // re-opened in place with the empty locale name.
    if(status==U_USING_DEFAULT_WARNING){
        status=U_ZERO_ERROR;
        ures_openFillIn(b, U_ICUDATA_BRKITR, "", &status);
    }

    // Get the "boundaries" array.
    if (U_SUCCESS(status)) {
        brkRules = ures_getByKeyWithFallback(b, "boundaries", brkRules, &status);
        // Get the string object naming the rules file
        brkName = ures_getByKeyWithFallback(brkRules, type, brkName, &status);
        // Get the actual string
        brkfname = ures_getString(brkName, &size, &status);
        // The name must fit in fnbuff.  The assertion is backed by an explicit
        // check that turns an over-long name into U_BUFFER_OVERFLOW_ERROR.
        U_ASSERT((size_t)size<sizeof(fnbuff));
        if ((size_t)size>=sizeof(fnbuff)) {
            size=0;
            if (U_SUCCESS(status)) {
                status = U_BUFFER_OVERFLOW_ERROR;
            }
        }

        // Use the string if we found it
        if (U_SUCCESS(status) && brkfname) {
            // NOTE(review): if uprv_strncpy behaves like strncpy, actualLocale is
            // left without a terminating NUL when the source fills the buffer - confirm.
            uprv_strncpy(actualLocale,
                ures_getLocale(brkName, &status),
                sizeof(actualLocale)/sizeof(actualLocale[0]));

            // 0x002e is '.', separating the base name from the extension.
            UChar* extStart=u_strchr(brkfname, 0x002e);
            int len = 0;
            if(extStart!=NULL){
                len = (int)(extStart-brkfname);
                // NOTE(review): this converts sizeof(ext) UChars starting after the
                // '.'; NUL termination of ext appears to rely on the extension being
                // exactly three characters followed by the string terminator - confirm.
                u_UCharsToChars(extStart+1, ext, sizeof(ext)); // nul terminates the buff
                u_UCharsToChars(brkfname, fnbuff, len);
            }
            // With no '.', len stays 0 and fnbuff becomes the empty string.
            fnbuff[len]=0; // nul terminate
        }
    }

    // The two stack-based bundles are no longer needed once the name is copied out.
    ures_close(brkRules);
    ures_close(brkName);

    // If status already indicates failure here, fnbuff may be uninitialized;
    // presumably udata_open() returns without reading it in that case - TODO confirm.
    UDataMemory* file = udata_open(U_ICUDATA_BRKITR, ext, fnbuff, &status);
    if (U_FAILURE(status)) {
        ures_close(b);
        return NULL;
    }

    // Create a RuleBasedBreakIterator
    result = new RuleBasedBreakIterator(file, status);

    // If there is a result, set the valid locale and actual locale, and the kind
    if (U_SUCCESS(status) && result != NULL) {
        U_LOCALE_BASED(locBased, *(BreakIterator*)result);
        locBased.setLocaleIDs(ures_getLocaleByType(b, ULOC_VALID_LOCALE, &status), actualLocale);
        result->setBreakType(kind);
    }

    ures_close(b);

    // NOTE(review): file is not closed explicitly on this path; presumably the
    // iterator has taken ownership of it and releases it when deleted - confirm.
    if (U_FAILURE(status) && result != NULL) {  // Sometimes redundant check, but simple
        delete result;
        return NULL;
    }

    // Allocation of the iterator itself failed: release the data here and make
    // sure the caller sees an error.
    if (result == NULL) {
        udata_close(file);
        if (U_SUCCESS(status)) {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
    }

    return result;
}
Example #8
0
//
//  TestRuleStatusVec
//      Test the vector form of break rule status, getRuleStatusVec().
//
//      The rules below tag overlapping character sets with the status values
//      100..500.  The test steps through "Aapz5?" one boundary at a time and
//      checks the number of status values and the values themselves reported
//      at each boundary, then checks how getRuleStatusVec() behaves when the
//      caller's buffer capacity is 0, 1 and 2.
//
void RBBIAPITest::TestRuleStatusVec() {
    UnicodeString rulesString(   "[A-N]{100}; \n"
                                 "[a-w]{200}; \n"
                                 "[\\p{L}]{300}; \n"
                                 "[\\p{N}]{400}; \n"
                                 "[0-5]{500}; \n"
                                  "!.*;\n", -1, US_INV);
     UnicodeString testString1  = "Aapz5?";
     int32_t  statusVals[10];
     int32_t  numStatuses;
     int32_t  pos;

     UErrorCode status=U_ZERO_ERROR;
     UParseError    parseError;

     RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString, parseError, status);
     if (U_FAILURE(status)) {
         dataerrln("Failure at file %s, line %d, error = %s", __FILE__, __LINE__, u_errorName(status));
     } else {
         bi->setText(testString1);

         // A   expected status values: 100, 300
         pos = bi->next();
         TEST_ASSERT(pos==1);
         numStatuses = bi->getRuleStatusVec(statusVals, 10, status);
         TEST_ASSERT_SUCCESS(status);
         TEST_ASSERT(numStatuses == 2);
         TEST_ASSERT(statusVals[0] == 100);
         TEST_ASSERT(statusVals[1] == 300);

         // a   expected status values: 200, 300
         pos = bi->next();
         TEST_ASSERT(pos==2);
         numStatuses = bi->getRuleStatusVec(statusVals, 10, status);
         TEST_ASSERT_SUCCESS(status);
         TEST_ASSERT(numStatuses == 2);
         TEST_ASSERT(statusVals[0] == 200);
         TEST_ASSERT(statusVals[1] == 300);

         // p   expected status values: 200, 300
         pos = bi->next();
         TEST_ASSERT(pos==3);
         numStatuses = bi->getRuleStatusVec(statusVals, 10, status);
         TEST_ASSERT_SUCCESS(status);
         TEST_ASSERT(numStatuses == 2);
         TEST_ASSERT(statusVals[0] == 200);
         TEST_ASSERT(statusVals[1] == 300);

         // z   expected status value: 300 only
         pos = bi->next();
         TEST_ASSERT(pos==4);
         numStatuses = bi->getRuleStatusVec(statusVals, 10, status);
         TEST_ASSERT_SUCCESS(status);
         TEST_ASSERT(numStatuses == 1);
         TEST_ASSERT(statusVals[0] == 300);

         // 5   expected status values: 400, 500
         pos = bi->next();
         TEST_ASSERT(pos==5);
         numStatuses = bi->getRuleStatusVec(statusVals, 10, status);
         TEST_ASSERT_SUCCESS(status);
         TEST_ASSERT(numStatuses == 2);
         TEST_ASSERT(statusVals[0] == 400);
         TEST_ASSERT(statusVals[1] == 500);

         // ?   matches none of the tagged rules; a single status value of 0 is expected
         pos = bi->next();
         TEST_ASSERT(pos==6);
         numStatuses = bi->getRuleStatusVec(statusVals, 10, status);
         TEST_ASSERT_SUCCESS(status);
         TEST_ASSERT(numStatuses == 1);
         TEST_ASSERT(statusVals[0] == 0);

         //
         //  Check buffer overflow error handling.   Char == A
         //
         //  statusVals is refilled with -1 before each call so that untouched
         //  slots can be recognized afterwards.
         //
         //  Capacity 0: overflow error, full count (2) returned, nothing written.
         bi->first();
         pos = bi->next();
         TEST_ASSERT(pos==1);
         memset(statusVals, -1, sizeof(statusVals));
         numStatuses = bi->getRuleStatusVec(statusVals, 0, status);
         TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR);
         TEST_ASSERT(numStatuses == 2);
         TEST_ASSERT(statusVals[0] == -1);

         //  Capacity 1: overflow error, full count (2) returned, only the first value written.
         status = U_ZERO_ERROR;
         memset(statusVals, -1, sizeof(statusVals));
         numStatuses = bi->getRuleStatusVec(statusVals, 1, status);
         TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR);
         TEST_ASSERT(numStatuses == 2);
         TEST_ASSERT(statusVals[0] == 100);
         TEST_ASSERT(statusVals[1] == -1);

         //  Capacity 2: exact fit, success, and the slot after the data is untouched.
         status = U_ZERO_ERROR;
         memset(statusVals, -1, sizeof(statusVals));
         numStatuses = bi->getRuleStatusVec(statusVals, 2, status);
         TEST_ASSERT_SUCCESS(status);
         TEST_ASSERT(numStatuses == 2);
         TEST_ASSERT(statusVals[0] == 100);
         TEST_ASSERT(statusVals[1] == 300);
         TEST_ASSERT(statusVals[2] == -1);
     }
     delete bi;

}
Example #9
0
void RBBIAPITest::TestCloneEquals()
{

    UErrorCode status=U_ZERO_ERROR;
    RuleBasedBreakIterator* bi1     = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status);
    RuleBasedBreakIterator* biequal = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status);
    RuleBasedBreakIterator* bi3     = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status);
    RuleBasedBreakIterator* bi2     = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createWordInstance(Locale::getDefault(), status);
    if(U_FAILURE(status)){
        errcheckln(status, "Fail : in construction - %s", u_errorName(status));
        return;
    }


    UnicodeString testString="Testing word break iterators's clone() and equals()";
    bi1->setText(testString);
    bi2->setText(testString);
    biequal->setText(testString);

    bi3->setText("hello");

    logln((UnicodeString)"Testing equals()");

    logln((UnicodeString)"Testing == and !=");
    UBool b = (*bi1 != *biequal);
    b |= *bi1 == *bi2;
    b |= *bi1 == *bi3;
    if (b) {
        errln((UnicodeString)"ERROR:1 RBBI's == and != operator failed.");
    }

    if(*bi2 == *biequal || *bi2 == *bi1  || *biequal == *bi3)
        errln((UnicodeString)"ERROR:2 RBBI's == and != operator  failed.");


    // Quick test of RulesBasedBreakIterator assignment -
    // Check that
    //    two different iterators are !=
    //    they are == after assignment
    //    source and dest iterator produce the same next() after assignment.
    //    deleting one doesn't disable the other.
    logln("Testing assignment");
    RuleBasedBreakIterator *bix = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
    if(U_FAILURE(status)){
        errcheckln(status, "Fail : in construction - %s", u_errorName(status));
        return;
    }

    RuleBasedBreakIterator biDefault, biDefault2;
    if(U_FAILURE(status)){
        errln((UnicodeString)"FAIL : in construction of default iterator");
        return;
    }
    if (biDefault == *bix) {
        errln((UnicodeString)"ERROR: iterators should not compare ==");
        return;
    }
    if (biDefault != biDefault2) {
        errln((UnicodeString)"ERROR: iterators should compare ==");
        return;
    }


    UnicodeString   HelloString("Hello Kitty");
    bix->setText(HelloString);
    if (*bix == *bi2) {
        errln(UnicodeString("ERROR: strings should not be equal before assignment."));
    }
    *bix = *bi2;
    if (*bix != *bi2) {
        errln(UnicodeString("ERROR: strings should be equal before assignment."));
    }

    int bixnext = bix->next();
    int bi2next = bi2->next();
    if (! (bixnext == bi2next && bixnext == 7)) {
        errln(UnicodeString("ERROR: iterators behaved differently after assignment."));
    }
    delete bix;
    if (bi2->next() != 8) {
        errln(UnicodeString("ERROR: iterator.next() failed after deleting copy."));
    }



    logln((UnicodeString)"Testing clone()");
    RuleBasedBreakIterator* bi1clone=(RuleBasedBreakIterator*)bi1->clone();
    RuleBasedBreakIterator* bi2clone=(RuleBasedBreakIterator*)bi2->clone();

    if(*bi1clone != *bi1 || *bi1clone  != *biequal  ||
      *bi1clone == *bi3 || *bi1clone == *bi2)
        errln((UnicodeString)"ERROR:1 RBBI's clone() method failed");

    if(*bi2clone == *bi1 || *bi2clone == *biequal ||
       *bi2clone == *bi3 || *bi2clone != *bi2)
        errln((UnicodeString)"ERROR:2 RBBI's clone() method failed");

    if(bi1->getText() != bi1clone->getText()   ||
       bi2clone->getText() != bi2->getText()   ||
       *bi2clone == *bi1clone )
        errln((UnicodeString)"ERROR: RBBI's clone() method failed");

    delete bi1clone;
    delete bi2clone;
    delete bi1;
    delete bi3;
    delete bi2;
    delete biequal;
}
Example #10
0
//
//  TestIteration
//      Smoke test of the iteration API: first(), last(), previous(), next(),
//      current(), following(), preceding(), isBoundary() and next(n), run
//      with a character break iterator over the text "0123456789".
//      Each diagnostic names the function that was actually being checked.
//
void RBBIAPITest::TestIteration()
{
    // This test just verifies that the API is present.
    // Testing for correct operation of the break rules happens elsewhere.

    // First make sure that every factory function can produce an iterator.
    UErrorCode status=U_ZERO_ERROR;
    RuleBasedBreakIterator* bi  = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status);
    if (U_FAILURE(status) || bi == NULL)  {
        errcheckln(status, "Failure creating character break iterator.  Status = %s", u_errorName(status));
    }
    delete bi;

    status=U_ZERO_ERROR;
    bi  = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createWordInstance(Locale::getDefault(), status);
    if (U_FAILURE(status) || bi == NULL)  {
        errcheckln(status, "Failure creating Word break iterator.  Status = %s", u_errorName(status));
    }
    delete bi;

    status=U_ZERO_ERROR;
    bi  = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createLineInstance(Locale::getDefault(), status);
    if (U_FAILURE(status) || bi == NULL)  {
        errcheckln(status, "Failure creating Line break iterator.  Status = %s", u_errorName(status));
    }
    delete bi;

    status=U_ZERO_ERROR;
    bi  = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createSentenceInstance(Locale::getDefault(), status);
    if (U_FAILURE(status) || bi == NULL)  {
        errcheckln(status, "Failure creating Sentence break iterator.  Status = %s", u_errorName(status));
    }
    delete bi;

    status=U_ZERO_ERROR;
    bi  = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createTitleInstance(Locale::getDefault(), status);
    if (U_FAILURE(status) || bi == NULL)  {
        errcheckln(status, "Failure creating Title break iterator.  Status = %s", u_errorName(status));
    }
    delete bi;

    // The rest of the test runs on a character iterator.
    status=U_ZERO_ERROR;
    bi  = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status);
    if (U_FAILURE(status) || bi == NULL)  {
        errcheckln(status, "Failure creating character break iterator.  Status = %s", u_errorName(status));
        return;   // Skip the rest of these tests.
    }


    UnicodeString testString="0123456789";
    bi->setText(testString);

    int32_t i;
    i = bi->first();
    if (i != 0) {
        errln("Incorrect value from bi->first().  Expected 0, got %d.", i);
    }

    i = bi->last();
    if (i != 10) {
        errln("Incorrect value from bi->last().  Expected 10, got %d", i);
    }

    //
    // Previous
    //
    bi->last();
    i = bi->previous();
    if (i != 9) {
        errln("Incorrect value from bi->previous() at line %d.  Expected 9, got %d", __LINE__, i);
    }


    bi->first();
    i = bi->previous();
    if (i != BreakIterator::DONE) {
        errln("Incorrect value from bi->previous() at line %d.  Expected DONE, got %d", __LINE__, i);
    }

    //
    // next()
    //
    bi->first();
    i = bi->next();
    if (i != 1) {
        errln("Incorrect value from bi->next() at line %d.  Expected 1, got %d", __LINE__, i);
    }

    bi->last();
    i = bi->next();
    if (i != BreakIterator::DONE) {
        errln("Incorrect value from bi->next() at line %d.  Expected DONE, got %d", __LINE__, i);
    }


    //
    //  current()
    //
    bi->first();
    i = bi->current();
    if (i != 0) {
        errln("Incorrect value from bi->current() at line %d.  Expected 0, got %d", __LINE__, i);
    }

    bi->next();
    i = bi->current();
    if (i != 1) {
        errln("Incorrect value from bi->current() at line %d.  Expected 1, got %d", __LINE__, i);
    }

    // Stepping past either end must leave the current position at that end.
    bi->last();
    bi->next();
    i = bi->current();
    if (i != 10) {
        errln("Incorrect value from bi->current() at line %d.  Expected 10, got %d", __LINE__, i);
    }

    bi->first();
    bi->previous();
    i = bi->current();
    if (i != 0) {
        errln("Incorrect value from bi->current() at line %d.  Expected 0, got %d", __LINE__, i);
    }


    //
    // Following()
    //
    i = bi->following(4);
    if (i != 5) {
        errln("Incorrect value from bi->following() at line %d.  Expected 5, got %d", __LINE__, i);
    }

    i = bi->following(9);
    if (i != 10) {
        errln("Incorrect value from bi->following() at line %d.  Expected 10, got %d", __LINE__, i);
    }

    i = bi->following(10);
    if (i != BreakIterator::DONE) {
        errln("Incorrect value from bi->following() at line %d.  Expected DONE, got %d", __LINE__, i);
    }


    //
    // Preceding
    //
    i = bi->preceding(4);
    if (i != 3) {
        errln("Incorrect value from bi->preceding() at line %d.  Expected 3, got %d", __LINE__, i);
    }

    i = bi->preceding(10);
    if (i != 9) {
        errln("Incorrect value from bi->preceding() at line %d.  Expected 9, got %d", __LINE__, i);
    }

    i = bi->preceding(1);
    if (i != 0) {
        errln("Incorrect value from bi->preceding() at line %d.  Expected 0, got %d", __LINE__, i);
    }

    i = bi->preceding(0);
    if (i != BreakIterator::DONE) {
        errln("Incorrect value from bi->preceding() at line %d.  Expected DONE, got %d", __LINE__, i);
    }


    //
    // isBoundary()
    //    Besides its return value, the position left in current() is checked.
    //
    bi->first();
    if (bi->isBoundary(3) != TRUE) {
        errln("Incorrect value from bi->isBoundary() at line %d.  Expected TRUE, got FALSE", __LINE__);
    }
    i = bi->current();
    if (i != 3) {
        errln("Incorrect value from bi->current() at line %d.  Expected 3, got %d", __LINE__, i);
    }


    // An offset beyond the end of the text is not a boundary; current() is
    // expected to end up at the end of the text (10).
    if (bi->isBoundary(11) != FALSE) {
        errln("Incorrect value from bi->isBoundary() at line %d.  Expected FALSE, got TRUE", __LINE__);
    }
    i = bi->current();
    if (i != 10) {
        errln("Incorrect value from bi->current() at line %d.  Expected 10, got %d", __LINE__, i);
    }

    //
    // next(n)
    //
    bi->first();
    i = bi->next(4);
    if (i != 4) {
        errln("Incorrect value from bi->next() at line %d.  Expected 4, got %d", __LINE__, i);
    }

    i = bi->next(6);
    if (i != 10) {
        errln("Incorrect value from bi->next() at line %d.  Expected 10, got %d", __LINE__, i);
    }

    bi->first();
    i = bi->next(11);
    if (i != BreakIterator::DONE) {
        errln("Incorrect value from bi->next() at line %d.  Expected BreakIterator::DONE, got %d", __LINE__, i);
    }

    delete bi;

}
Example #11
0
// Try out the RuleBasedBreakIterator constructors that take RBBIDataHeader*
// (these are protected so we access them via a local class RBBIWithProtectedFunctions).
// This is just a sanity check, not a thorough test (e.g. we don't check that the
// first delete actually frees rulesCopy).
//
// The second half round-trips an English word iterator through
// getBinaryRules() and the (binaryRules, length) constructor, and checks that
// a length one byte too short is rejected with U_ILLEGAL_ARGUMENT_ERROR.
void RBBIAPITest::TestCreateFromRBBIData() {
    // Get some handy RBBIData
    const char *brkName = "word"; // or "sent", "line", "char", etc.
    UErrorCode status = U_ZERO_ERROR;
    LocalUDataMemoryPointer data(udata_open(U_ICUDATA_BRKITR, "brk", brkName, &status));
    // If the data cannot be opened this whole section is skipped without a diagnostic.
    if ( U_SUCCESS(status) ) {
        const RBBIDataHeader * builtRules = (const RBBIDataHeader *)udata_getMemory(data.getAlias());
        uint32_t length = builtRules->fLength;
        RBBIWithProtectedFunctions * brkItr;

        // Try the memory-adopting constructor, need to copy the data first
        RBBIDataHeader * rulesCopy = (RBBIDataHeader *) uprv_malloc(length);
        if ( rulesCopy ) {
            uprv_memcpy( rulesCopy, builtRules, length );

            brkItr = new RBBIWithProtectedFunctions(rulesCopy, status);
            if ( U_SUCCESS(status) ) {
                delete brkItr; // this should free rulesCopy
            } else {
                // NOTE(review): brkItr is not deleted on this path, which looks like
                // a leak.  Whether deleting it would also free rulesCopy (making the
                // uprv_free below a double free) is not visible here - confirm
                // before changing.
                errln("create RuleBasedBreakIterator from RBBIData (adopted): ICU Error \"%s\"\n", u_errorName(status) );
                status = U_ZERO_ERROR;// reset for the next test
                uprv_free( rulesCopy );
            }
        }
        
        // Now try the non-adopting constructor
        brkItr = new RBBIWithProtectedFunctions(builtRules, RBBIWithProtectedFunctions::kDontAdopt, status);
        if ( U_SUCCESS(status) ) {
            delete brkItr; // this should NOT attempt to free builtRules
            if (builtRules->fLength != length) { // sanity check
                errln("create RuleBasedBreakIterator from RBBIData (non-adopted): delete affects data\n" );
            }
        } else {
            // NOTE(review): brkItr is not deleted on this failure path either - confirm
            // whether that is intentional.
            errln("create RuleBasedBreakIterator from RBBIData (non-adopted): ICU Error \"%s\"\n", u_errorName(status) );
        }
    }

    // getBinaryRules() and RuleBasedBreakIterator(uint8_t binaryRules, ...)
    //
    status = U_ZERO_ERROR;
    RuleBasedBreakIterator *rb = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
    if (rb == NULL || U_FAILURE(status)) {
        dataerrln("Unable to create BreakIterator::createWordInstance (Locale::getEnglish) - %s", u_errorName(status));
    } else {
        uint32_t length;
        const uint8_t *rules = rb->getBinaryRules(length);
        // An iterator rebuilt from the binary rules must compare equal to the original.
        RuleBasedBreakIterator *rb2 = new RuleBasedBreakIterator(rules, length, status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT(*rb == *rb2);
        UnicodeString words = "one two three ";
        rb2->setText(words);
        // Count every boundary next() returns before UBRK_DONE; six are expected.
        int wordCounter = 0;
        while (rb2->next() != UBRK_DONE) {
            wordCounter++;
        }
        TEST_ASSERT(wordCounter == 6);

        // A length that is one byte short must be rejected.
        status = U_ZERO_ERROR;
        RuleBasedBreakIterator *rb3 = new RuleBasedBreakIterator(rules, length-1, status);
        TEST_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);

        delete rb;
        delete rb2;
        delete rb3;
    }
}
Example #12
0
//----------------------------------------------------------------------------
//
//  main      for genbrk
//
//      Reads a break rule source file, compiles it by constructing a
//      RuleBasedBreakIterator from the rules, and writes the resulting binary
//      rule data to an ICU data file.
//
//      Options are read from the file-level options[] table:
//          [0],[1]  help
//          [3]      rule file name          (required)
//          [4]      output file name        (required)
//          [5]      ICU data directory
//          [6]      output directory
//          [7]      include the copyright string in the output
//
//      Returns 0 on success.  Every failure terminates the process through
//      exit() or usageAndDie().
//
//----------------------------------------------------------------------------
int  main(int argc, char **argv) {
    UErrorCode  status = U_ZERO_ERROR;
    const char *ruleFileName;
    const char *outFileName;
    const char *outDir = NULL;
    const char *copyright = NULL;

    //
    // Pick up and check the command line arguments,
    //    using the standard ICU tool utils option handling.
    //
    U_MAIN_INIT_ARGS(argc, argv);
    progName = argv[0];
    argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
    if(argc<0) {
        // Unrecognized option
        fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
    }

    if(options[0].doesOccur || options[1].doesOccur) {
        //  -? or -h for help.
        usageAndDie(0);
    }

    if (!(options[3].doesOccur && options[4].doesOccur)) {
        fprintf(stderr, "rule file and output file must both be specified.\n");
        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
    }
    ruleFileName = options[3].value;
    outFileName  = options[4].value;

    if (options[5].doesOccur) {
        u_setDataDirectory(options[5].value);
    }

    /* Initialize ICU */
    u_init(&status);
    if (U_FAILURE(status)) {
        fprintf(stderr, "%s: can not initialize ICU.  status = %s\n",
            argv[0], u_errorName(status));
        exit(1);
    }
    status = U_ZERO_ERROR;

    /* Combine the directory with the file name */
    if(options[6].doesOccur) {
        outDir = options[6].value;
    }
    if (options[7].doesOccur) {
        copyright = U_COPYRIGHT_STRING;
    }

#if UCONFIG_NO_BREAK_ITERATION

    // Break iteration is configured out: write a placeholder data file that
    // contains only an explanatory message.
    UNewDataMemory *pData;
    char msg[1024];

    /* write message with just the name */
    sprintf(msg, "genbrk writes dummy %s because of UCONFIG_NO_BREAK_ITERATION, see uconfig.h", outFileName);
    fprintf(stderr, "%s\n", msg);

    /* write the dummy data file */
    pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status);
    udata_writeBlock(pData, msg, strlen(msg));
    udata_finish(pData, &status);
    return (int)status;

#else

    //
    //  Read in the rule source file
    //
    long        result;
    long        ruleFileSize;
    FILE        *file;
    char        *ruleBufferC;

    file = fopen(ruleFileName, "rb");
    if( file == 0 ) {
        fprintf(stderr, "Could not open file \"%s\"\n", ruleFileName);
        exit(-1);
    }

    // Find the file size by seeking to the end.  ftell() reports failure as
    // -1 (for example on a non-seekable stream); that value must never reach
    // the allocation and fread() below.
    ruleFileSize = -1;
    if (fseek(file, 0, SEEK_END) == 0) {
        ruleFileSize = ftell(file);
    }
    if (ruleFileSize < 0 || fseek(file, 0, SEEK_SET) != 0) {
        fprintf(stderr, "Error reading file \"%s\"\n", ruleFileName);
        fclose(file);
        exit(-1);
    }
    ruleBufferC = new char[ruleFileSize+10];

    result = (long)fread(ruleBufferC, 1, ruleFileSize, file);
    if (result != ruleFileSize)  {
        fprintf(stderr, "Error reading file \"%s\"\n", ruleFileName);
        exit (-1);
    }
    ruleBufferC[ruleFileSize]=0;
    fclose(file);

    //
    // Look for a Unicode Signature (BOM) on the rule file
    //
    int32_t        signatureLength;
    const char *   ruleSourceC = ruleBufferC;
    const char*    encoding = ucnv_detectUnicodeSignature(
                           ruleSourceC, ruleFileSize, &signatureLength, &status);
    if (U_FAILURE(status)) {
        exit(status);
    }
    if(encoding!=NULL ){
        // Skip over the signature; it is not part of the rules.
        ruleSourceC  += signatureLength;
        ruleFileSize -= signatureLength;
    }

    //
    // Open a converter to take the rule file to UTF-16
    //
    UConverter* conv;
    conv = ucnv_open(encoding, &status);
    if (U_FAILURE(status)) {
        fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
        exit(status);
    }

    //
    // Convert the rules to UChar.
    //  Preflight first to determine required buffer size.
    //  The preflight call is expected to end with U_BUFFER_OVERFLOW_ERROR;
    //  any other status is treated as a failure.
    //
    uint32_t destCap = ucnv_toUChars(conv,
                       NULL,           //  dest,
                       0,              //  destCapacity,
                       ruleSourceC,
                       ruleFileSize,
                       &status);
    if (status != U_BUFFER_OVERFLOW_ERROR) {
        fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
        exit(status);
    }

    status = U_ZERO_ERROR;
    UChar *ruleSourceU = new UChar[destCap+1];
    ucnv_toUChars(conv,
                  ruleSourceU,     //  dest,
                  destCap+1,
                  ruleSourceC,
                  ruleFileSize,
                  &status);
    if (U_FAILURE(status)) {
        fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
        exit(status);
    }
    ucnv_close(conv);


    //
    //  Put the source rules into a UnicodeString
    //
    UnicodeString ruleSourceS(FALSE, ruleSourceU, destCap);

    //
    //  Create the break iterator from the rules
    //     This will compile the rules.
    //
    UParseError parseError;
    parseError.line = 0;
    parseError.offset = 0;
    RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(ruleSourceS, parseError, status);
    if (U_FAILURE(status)) {
        fprintf(stderr, "createRuleBasedBreakIterator: ICU Error \"%s\"  at line %d, column %d\n",
                u_errorName(status), (int)parseError.line, (int)parseError.offset);
        exit(status);
    }


    //
    //  Get the compiled rule data from the break iterator.
    //
    uint32_t        outDataSize;
    const uint8_t  *outData;
    outData = bi->getBinaryRules(outDataSize);

    // Copy the data format version numbers from the RBBI data header into the UDataMemory header.
    uprv_memcpy(dh.info.formatVersion, ((RBBIDataHeader *)outData)->fFormatVersion, sizeof(dh.info.formatVersion));

    //
    //  Create the output file
    //
    size_t bytesWritten;
    UNewDataMemory *pData;
    pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status);
    if(U_FAILURE(status)) {
        fprintf(stderr, "genbrk: Could not open output file \"%s\", \"%s\"\n",
                         outFileName, u_errorName(status));
        exit(status);
    }


    //  Write the data itself.
    udata_writeBlock(pData, outData, outDataSize);
    // finish up
    bytesWritten = udata_finish(pData, &status);
    if(U_FAILURE(status)) {
        fprintf(stderr, "genbrk: error %d writing the output file\n", status);
        exit(status);
    }

    if (bytesWritten != outDataSize) {
        fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
        exit(-1);
    }

    delete bi;
    delete[] ruleSourceU;
    delete[] ruleBufferC;
    u_cleanup();


    printf("genbrk: tool completed successfully.\n");
    return 0;

#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
}
//
//  TestRuleStatus
//      Test word break rule status constants.
//
void RBBIAPITest::TestRuleStatus() {
     UChar str[30];
     u_unescape("plain word 123.45 \\u9160\\u9161 \\u30a1\\u30a2 \\u3041\\u3094",
              // 012345678901234567  8      9    0  1      2    3  4      5    6
              //                    Ideographic    Katakana       Hiragana
                str, 30);
     UnicodeString testString1(str);
     int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 19, 20, 21, 23, 24, 25, 26};
     int32_t tag_lo[]  = {UBRK_WORD_NONE,     UBRK_WORD_LETTER, UBRK_WORD_NONE,    UBRK_WORD_LETTER,
                          UBRK_WORD_NONE,     UBRK_WORD_NUMBER, UBRK_WORD_NONE,
                          UBRK_WORD_IDEO,     UBRK_WORD_IDEO,   UBRK_WORD_NONE,
                          UBRK_WORD_KANA,     UBRK_WORD_NONE,   UBRK_WORD_KANA,    UBRK_WORD_KANA};

     int32_t tag_hi[]  = {UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT,
                          UBRK_WORD_NONE_LIMIT, UBRK_WORD_NUMBER_LIMIT, UBRK_WORD_NONE_LIMIT,
                          UBRK_WORD_IDEO_LIMIT, UBRK_WORD_IDEO_LIMIT,   UBRK_WORD_NONE_LIMIT,
                          UBRK_WORD_KANA_LIMIT, UBRK_WORD_NONE_LIMIT,   UBRK_WORD_KANA_LIMIT, UBRK_WORD_KANA_LIMIT};

     UErrorCode status=U_ZERO_ERROR;

     RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
     if(U_FAILURE(status)) {
         errcheckln(status, "Fail : in construction - %s", u_errorName(status));
     } else {
         bi->setText(testString1);
         // First test that the breaks are in the right spots.
         doBoundaryTest(*bi, testString1, bounds1);

         // Then go back and check tag values
         int32_t i = 0;
         int32_t pos, tag;
         for (pos = bi->first(); pos != BreakIterator::DONE; pos = bi->next(), i++) {
             if (pos != bounds1[i]) {
                 errln("FAIL: unexpected word break at postion %d", pos);
                 break;
             }
             tag = bi->getRuleStatus();
             if (tag < tag_lo[i] || tag >= tag_hi[i]) {
                 errln("FAIL: incorrect tag value %d at position %d", tag, pos);
                 break;
             }

             // Check that we get the same tag values from getRuleStatusVec()
             int32_t vec[10];
             int t = bi->getRuleStatusVec(vec, 10, status);
             TEST_ASSERT_SUCCESS(status);
             TEST_ASSERT(t==1);
             TEST_ASSERT(vec[0] == tag);
         }
     }
     delete bi;

     // Now test line break status.  This test mostly is to confirm that the status constants
     //                              are correctly declared in the header.
     testString1 =   "test line. \n";
     // break type    s    s     h

     bi = (RuleBasedBreakIterator *)
         BreakIterator::createLineInstance(Locale::getEnglish(), status);
     if(U_FAILURE(status)) {
         errcheckln(status, "failed to create word break iterator. - %s", u_errorName(status));
     } else {
         int32_t i = 0;
         int32_t pos, tag;
         UBool   success;

         bi->setText(testString1);
         pos = bi->current();
         tag = bi->getRuleStatus();
         for (i=0; i<3; i++) {
             switch (i) {
             case 0:
                 success = pos==0  && tag==UBRK_LINE_SOFT; break;
             case 1:
                 success = pos==5  && tag==UBRK_LINE_SOFT; break;
             case 2:
                 success = pos==12 && tag==UBRK_LINE_HARD; break;
             default:
                 success = FALSE; break;
             }
             if (success == FALSE) {
                 errln("Fail: incorrect word break status or position.  i=%d, pos=%d, tag=%d",
                     i, pos, tag);
                 break;
             }
             pos = bi->next();
             tag = bi->getRuleStatus();
         }
         if (UBRK_LINE_SOFT >= UBRK_LINE_SOFT_LIMIT ||
             UBRK_LINE_HARD >= UBRK_LINE_HARD_LIMIT ||
             (UBRK_LINE_HARD > UBRK_LINE_SOFT && UBRK_LINE_HARD < UBRK_LINE_SOFT_LIMIT)) {
             errln("UBRK_LINE_* constants from header are inconsistent.");
         }
     }
     delete bi;

}