//---------------------------------------------------------------------------------------- // // ubrk_openRules open a break iterator from a set of break rules. // Invokes the rule builder. // //---------------------------------------------------------------------------------------- U_CAPI UBreakIterator* U_EXPORT2 ubrk_openRules( const UChar *rules, int32_t rulesLength, const UChar *text, int32_t textLength, UParseError *parseErr, UErrorCode *status) { if (status == NULL || U_FAILURE(*status)){ return 0; } BreakIterator *result = 0; UnicodeString ruleString(rules, rulesLength); result = RBBIRuleBuilder::createRuleBasedBreakIterator(ruleString, *parseErr, *status); if(U_FAILURE(*status)) { return 0; } if (text != NULL) { UCharCharacterIterator *iter = 0; iter = new UCharCharacterIterator(text, textLength); if(iter == 0) { *status = U_MEMORY_ALLOCATION_ERROR; delete result; return 0; } result->adoptText(iter); } return (UBreakIterator *)result; }
//---------------------------------------------------------------------------------------- // // ubrk_open Create a canned type of break iterator based on type (word, line, etc.) // and locale. // //---------------------------------------------------------------------------------------- U_CAPI UBreakIterator* U_EXPORT2 ubrk_open(UBreakIteratorType type, const char *locale, const UChar *text, int32_t textLength, UErrorCode *status) { if(U_FAILURE(*status)) return 0; BreakIterator *result = 0; switch(type) { case UBRK_CHARACTER: result = BreakIterator::createCharacterInstance(Locale(locale), *status); break; case UBRK_WORD: result = BreakIterator::createWordInstance(Locale(locale), *status); break; case UBRK_LINE: result = BreakIterator::createLineInstance(Locale(locale), *status); break; case UBRK_SENTENCE: result = BreakIterator::createSentenceInstance(Locale(locale), *status); break; case UBRK_TITLE: result = BreakIterator::createTitleInstance(Locale(locale), *status); break; } // check for allocation error if (U_FAILURE(*status)) { return 0; } if(result == 0) { *status = U_MEMORY_ALLOCATION_ERROR; return 0; } if (text != NULL) { UCharCharacterIterator *iter = 0; iter = new UCharCharacterIterator(text, textLength); if(iter == 0) { *status = U_MEMORY_ALLOCATION_ERROR; delete result; return 0; } result->adoptText(iter); } return (UBreakIterator*)result; }
/*
 * This method does the actual break comparison and reports the results.
 * It uses a SpaceBreakIterator to iterate over the text with spaces,
 * and a word instance of a Thai BreakIterator to iterate over the text
 * without spaces.
 *
 * Returns TRUE when every break position agrees, FALSE when a break was
 * missed or an invalid break was found. Also returns FALSE (leaving
 * fWordCount untouched) when the Thai break iterator cannot be set up.
 */
UBool ThaiWordbreakTest::compareWordBreaks(const UChar *spaces, int32_t spaceCount,
                                           const UChar *noSpaces, int32_t noSpaceCount)
{
    UBool result = TRUE;
    Locale thai("th");
    UErrorCode status = U_ZERO_ERROR;

    // Create the break iterator first so that nothing else has to be
    // cleaned up if creation fails.
    BreakIterator *breakIter = BreakIterator::createWordInstance(thai, status);

    if (U_FAILURE(status) || breakIter == NULL) {
        fprintf(stderr, "could not create Thai word break iterator: %s\n", u_errorName(status));
        delete breakIter;
        return FALSE;
    }

    UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(noSpaces, noSpaceCount);

    if (noSpaceIter == NULL) {
        fprintf(stderr, "could not allocate character iterator.\n");
        delete breakIter;
        return FALSE;
    }

    // breakIter owns noSpaceIter from here on.
    breakIter->adoptText(noSpaceIter);

    SpaceBreakIterator spaceIter(spaces, spaceCount);

    int32_t nextBreak = 0;
    int32_t nextSpaceBreak = 0;

    while (TRUE) {
        nextSpaceBreak = spaceIter.next();
        nextBreak = breakIter->next();

        if (nextSpaceBreak == BreakIterator::DONE || nextBreak == BreakIterator::DONE) {
            if (nextBreak != BreakIterator::DONE) {
                fprintf(stderr, "break iterator didn't end.\n");
            } else if (nextSpaceBreak != BreakIterator::DONE) {
                fprintf(stderr, "premature break iterator end.\n");
            }

            break;
        }

        // Resynchronize: advance whichever iterator is behind until both
        // report the same position or one of them runs out.
        while (nextSpaceBreak != nextBreak &&
               nextSpaceBreak != BreakIterator::DONE && nextBreak != BreakIterator::DONE) {
            if (nextSpaceBreak < nextBreak) {
                breakNotFound(nextSpaceBreak);
                result = FALSE;
                nextSpaceBreak = spaceIter.next();
            } else if (nextSpaceBreak > nextBreak) {
                foundInvalidBreak(nextBreak);
                result = FALSE;
                nextBreak = breakIter->next();
            }
        }

        if (fVerbose) {
            printf("%d   %d\n", nextSpaceBreak, nextBreak);
        }
    }

    fWordCount = spaceIter.getWordCount();

    delete breakIter;

    return result;
}
/* * Generate a text file with spaces in it from a file without. */ int generateFile(const UChar *chars, int32_t length) { Locale root(""); UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(chars, length); UErrorCode status = U_ZERO_ERROR; UnicodeSet complexContext(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status); BreakIterator *breakIter = BreakIterator::createWordInstance(root, status); breakIter->adoptText(noSpaceIter); char outbuf[1024]; int32_t strlength; UChar bom = 0xFEFF; printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &bom, 1, &status)); int32_t prevbreak = 0; while (U_SUCCESS(status)) { int32_t nextbreak = breakIter->next(); if (nextbreak == BreakIterator::DONE) { break; } printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &chars[prevbreak], nextbreak-prevbreak, &status)); if (nextbreak > 0 && complexContext.contains(chars[nextbreak-1]) && complexContext.contains(chars[nextbreak])) { printf(" "); } prevbreak = nextbreak; } if (U_FAILURE(status)) { fprintf(stderr, "generate failed: %s\n", u_errorName(status)); return status; } else { return 0; } }