static void TestBreakIteratorRefresh(void) { /* * RefreshInput changes out the input of a Break Iterator without * changing anything else in the iterator's state. Used with Java JNI, * when Java moves the underlying string storage. This test * runs a ubrk_next() repeatedly, moving the text in the middle of the sequence. * The right set of boundaries should still be found. */ UChar testStr[] = {0x20, 0x41, 0x20, 0x42, 0x20, 0x43, 0x20, 0x44, 0x0}; /* = " A B C D" */ UChar movedStr[] = {0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0}; UErrorCode status = U_ZERO_ERROR; UBreakIterator *bi; UText ut1 = UTEXT_INITIALIZER; UText ut2 = UTEXT_INITIALIZER; bi = ubrk_open(UBRK_LINE, "en_US", NULL, 0, &status); TEST_ASSERT_SUCCESS(status); if (U_FAILURE(status)) { return; } utext_openUChars(&ut1, testStr, -1, &status); TEST_ASSERT_SUCCESS(status); ubrk_setUText(bi, &ut1, &status); TEST_ASSERT_SUCCESS(status); if (U_SUCCESS(status)) { /* Line boundaries will occur before each letter in the original string */ TEST_ASSERT(1 == ubrk_next(bi)); TEST_ASSERT(3 == ubrk_next(bi)); /* Move the string, kill the original string. */ u_strcpy(movedStr, testStr); u_memset(testStr, 0x20, u_strlen(testStr)); utext_openUChars(&ut2, movedStr, -1, &status); TEST_ASSERT_SUCCESS(status); ubrk_refreshUText(bi, &ut2, &status); TEST_ASSERT_SUCCESS(status); /* Find the following matches, now working in the moved string. */ TEST_ASSERT(5 == ubrk_next(bi)); TEST_ASSERT(7 == ubrk_next(bi)); TEST_ASSERT(8 == ubrk_next(bi)); TEST_ASSERT(UBRK_DONE == ubrk_next(bi)); TEST_ASSERT_SUCCESS(status); utext_close(&ut1); utext_close(&ut2); } ubrk_close(bi); }
void RBBIAPITest::TestRefreshInputText() { /* * RefreshInput changes out the input of a Break Iterator without * changing anything else in the iterator's state. Used with Java JNI, * when Java moves the underlying string storage. This test * runs BreakIterator::next() repeatedly, moving the text in the middle of the sequence. * The right set of boundaries should still be found. */ UChar testStr[] = {0x20, 0x41, 0x20, 0x42, 0x20, 0x43, 0x20, 0x44, 0x0}; /* = " A B C D" */ UChar movedStr[] = {0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0}; UErrorCode status = U_ZERO_ERROR; UText ut1 = UTEXT_INITIALIZER; UText ut2 = UTEXT_INITIALIZER; RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status); TEST_ASSERT_SUCCESS(status); utext_openUChars(&ut1, testStr, -1, &status); TEST_ASSERT_SUCCESS(status); if (U_SUCCESS(status)) { bi->setText(&ut1, status); TEST_ASSERT_SUCCESS(status); /* Line boundaries will occur before each letter in the original string */ TEST_ASSERT(1 == bi->next()); TEST_ASSERT(3 == bi->next()); /* Move the string, kill the original string. */ u_strcpy(movedStr, testStr); u_memset(testStr, 0x20, u_strlen(testStr)); utext_openUChars(&ut2, movedStr, -1, &status); TEST_ASSERT_SUCCESS(status); RuleBasedBreakIterator *returnedBI = &bi->refreshInputText(&ut2, status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(bi == returnedBI); /* Find the following matches, now working in the moved string. */ TEST_ASSERT(5 == bi->next()); TEST_ASSERT(7 == bi->next()); TEST_ASSERT(8 == bi->next()); TEST_ASSERT(UBRK_DONE == bi->next()); utext_close(&ut1); utext_close(&ut2); } delete bi; }
static TextBreakIterator* setTextForIterator(TextBreakIterator& iterator, StringView string) { if (string.is8Bit()) { UTextWithBuffer textLocal; textLocal.text = UTEXT_INITIALIZER; textLocal.text.extraSize = sizeof(textLocal.buffer); textLocal.text.pExtra = textLocal.buffer; UErrorCode openStatus = U_ZERO_ERROR; UText* text = openLatin1UTextProvider(&textLocal, string.characters8(), string.length(), &openStatus); if (U_FAILURE(openStatus)) { LOG_ERROR("uTextOpenLatin1 failed with status %d", openStatus); return nullptr; } UErrorCode setTextStatus = U_ZERO_ERROR; ubrk_setUText(reinterpret_cast<UBreakIterator*>(&iterator), text, &setTextStatus); if (U_FAILURE(setTextStatus)) { LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus); return nullptr; } utext_close(text); } else { UErrorCode setTextStatus = U_ZERO_ERROR; ubrk_setText(reinterpret_cast<UBreakIterator*>(&iterator), string.characters16(), string.length(), &setTextStatus); if (U_FAILURE(setTextStatus)) return nullptr; } return &iterator; }
U_CAPI int32_t U_EXPORT2 ucasemap_utf8ToTitle(UCaseMap *csm, char *dest, int32_t destCapacity, const char *src, int32_t srcLength, UErrorCode *pErrorCode) { if (U_FAILURE(*pErrorCode)) { return 0; } UText utext=UTEXT_INITIALIZER; utext_openUTF8(&utext, (const char *)src, srcLength, pErrorCode); if(csm->iter==NULL) { csm->iter=BreakIterator::createWordInstance(Locale(csm->locale), *pErrorCode); } if (U_FAILURE(*pErrorCode)) { return 0; } csm->iter->setText(&utext, *pErrorCode); int32_t length=ucasemap_mapUTF8( csm->caseLocale, csm->options, csm->iter, (uint8_t *)dest, destCapacity, (const uint8_t *)src, srcLength, ucasemap_internalUTF8ToTitle, pErrorCode); utext_close(&utext); return length; }
//-------------------------------------------------------------------------- // // zap Delete everything owned by this RegexPattern. // //-------------------------------------------------------------------------- void RegexPattern::zap() { delete fCompiledPat; fCompiledPat = NULL; int i; for (i=1; i<fSets->size(); i++) { UnicodeSet *s; s = (UnicodeSet *)fSets->elementAt(i); if (s != NULL) { delete s; } } delete fSets; fSets = NULL; delete[] fSets8; fSets8 = NULL; delete fGroupMap; fGroupMap = NULL; delete fInitialChars; fInitialChars = NULL; delete fInitialChars8; fInitialChars8 = NULL; if (fPattern != NULL) { utext_close(fPattern); fPattern = NULL; } if (fPatternString != NULL) { delete fPatternString; fPatternString = NULL; } uhash_close(fNamedCaptureMap); fNamedCaptureMap = NULL; }
~MatcherAccessor() { utext_close(mUText); if (mJavaInput) { mEnv->ReleaseStringChars(mJavaInput, mChars); } maybeThrowIcuException(mEnv, mStatus); }
static UBreakIterator* setContextAwareTextForIterator(UBreakIterator& iterator, StringView string, const UChar* priorContext, unsigned priorContextLength) { if (string.is8Bit()) { UTextWithBuffer textLocal; textLocal.text = UTEXT_INITIALIZER; textLocal.text.extraSize = sizeof(textLocal.buffer); textLocal.text.pExtra = textLocal.buffer; UErrorCode openStatus = U_ZERO_ERROR; UText* text = openLatin1ContextAwareUTextProvider(&textLocal, string.characters8(), string.length(), priorContext, priorContextLength, &openStatus); if (U_FAILURE(openStatus)) { LOG_ERROR("openLatin1ContextAwareUTextProvider failed with status %d", openStatus); return nullptr; } UErrorCode setTextStatus = U_ZERO_ERROR; ubrk_setUText(&iterator, text, &setTextStatus); if (U_FAILURE(setTextStatus)) { LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus); return nullptr; } utext_close(text); } else { UText textLocal = UTEXT_INITIALIZER; UErrorCode openStatus = U_ZERO_ERROR; UText* text = openUTF16ContextAwareUTextProvider(&textLocal, string.characters16(), string.length(), priorContext, priorContextLength, &openStatus); if (U_FAILURE(openStatus)) { LOG_ERROR("openUTF16ContextAwareUTextProvider failed with status %d", openStatus); return 0; } UErrorCode setTextStatus = U_ZERO_ERROR; ubrk_setUText(&iterator, text, &setTextStatus); if (U_FAILURE(setTextStatus)) { LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus); return nullptr; } utext_close(text); } return &iterator; }
/** * Generic segmentation method that operates on the substring between * the given indices, using the given strategy for segmenting that * substring. * * @param first The index of the beginning of the string to work on * @param last The index of the end of the string to work on * @param type The type of segmentation to perform * @return a vector of segments (whose meaning depends on `type`) */ std::vector<segment> segments(int32_t first, int32_t last, segment_t type) const { std::vector<segment> results; icu::BreakIterator* iter; if (type == segment_t::SENTENCES) iter = sentence_iter_.get(); else if (type == segment_t::WORDS) iter = word_iter_.get(); else throw std::runtime_error{"Unknown segmentation type"}; auto status = U_ZERO_ERROR; UText utxt = UTEXT_INITIALIZER; utext_openUTF8(&utxt, text_.data() + first, last - first, &status); if (!U_SUCCESS(status)) { std::string err = "Failed to open UText: "; err += u_errorName(status); throw std::runtime_error{err}; } iter->setText(&utxt, status); if (!U_SUCCESS(status)) { utext_close(&utxt); std::string err = "Failed to setText: "; err += u_errorName(status); throw std::runtime_error{err}; } auto start = iter->first(); auto end = iter->next(); while (end != icu::BreakIterator::DONE) { results.emplace_back(first + start, first + end); start = end; end = iter->next(); } utext_close(&utxt); return results; }
RegexStaticSets::~RegexStaticSets() { int32_t i; for (i=0; i<URX_LAST_SET; i++) { delete fPropSets[i]; fPropSets[i] = NULL; } fRuleDigitsAlias = NULL; utext_close(fEmptyText); }
/* * static void TestBreakIteratorUText(void); * * Test that ubrk_setUText() is present and works for a simple case. */ static void TestBreakIteratorUText(void) { const char *UTF8Str = "\x41\xc3\x85\x5A\x20\x41\x52\x69\x6E\x67"; /* c3 85 is utf-8 for A with a ring on top */ /* 0 1 2 34567890 */ UErrorCode status = U_ZERO_ERROR; UBreakIterator *bi = NULL; int32_t pos = 0; UText *ut = utext_openUTF8(NULL, UTF8Str, -1, &status); TEST_ASSERT_SUCCESS(status); bi = ubrk_open(UBRK_WORD, "en_US", NULL, 0, &status); if (U_FAILURE(status)) { log_err_status(status, "Failure at file %s, line %d, error = %s\n", __FILE__, __LINE__, u_errorName(status)); return; } ubrk_setUText(bi, ut, &status); if (U_FAILURE(status)) { log_err("Failure at file %s, line %d, error = %s\n", __FILE__, __LINE__, u_errorName(status)); return; } pos = ubrk_first(bi); TEST_ASSERT(pos == 0); pos = ubrk_next(bi); TEST_ASSERT(pos == 4); pos = ubrk_next(bi); TEST_ASSERT(pos == 5); pos = ubrk_next(bi); TEST_ASSERT(pos == 10); pos = ubrk_next(bi); TEST_ASSERT(pos == UBRK_DONE); ubrk_close(bi); utext_close(ut); }
U_CAPI int32_t U_EXPORT2 ucasemap_utf8ToTitle(UCaseMap *csm, char *dest, int32_t destCapacity, const char *src, int32_t srcLength, UErrorCode *pErrorCode) { UText utext=UTEXT_INITIALIZER; utext_openUTF8(&utext, (const char *)src, srcLength, pErrorCode); if(U_FAILURE(*pErrorCode)) { return 0; } if(csm->iter==NULL) { csm->iter=ubrk_open(UBRK_WORD, csm->locale, NULL, 0, pErrorCode); } ubrk_setUText(csm->iter, &utext, pErrorCode); int32_t length=ucasemap_mapUTF8(csm, (uint8_t *)dest, destCapacity, (const uint8_t *)src, srcLength, ucasemap_internalUTF8ToTitle, pErrorCode); utext_close(&utext); return length; }
/** Word wrap text * * @param str character vector * @param width single integer * @param cost_exponent single double * @param indent single integer * @param exdent single integer * @param prefix single string * @param initial single string * @param locale locale identifier or NULL for default locale * @param use_length single logical value * * @return list * * @version 0.1-?? (Bartek Tartanus) * * @version 0.2-2 (Marek Gagolewski, 2014-04-27) * single function for wrap_greedy and wrap_dynamic * (dispatch inside); * use BreakIterator * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.4-1 (Marek Gagolewski, 2014-12-06) * new args: indent, exdent, prefix, initial * * @version 0.5-1 (Marek Gagolewski, 2014-12-19) * #133 allow width <= 0 * * @version 0.5-1 (Marek Gagolewski, 2015-02-28) * don't trim so many white spaces at the end of each word (normalize arg does that) * #139: allow a "whitespace" break iterator * * @version 0.5-1 (Marek Gagolewski, 2015-04-23) * `use_length` arg added * * * @version 0.5-1 (Marek Gagolewski, 2015-06-09) * BIGSKIP: no more CHARSXP on out on "" input */ SEXP stri_wrap(SEXP str, SEXP width, SEXP cost_exponent, SEXP indent, SEXP exdent, SEXP prefix, SEXP initial, SEXP whitespace_only, SEXP use_length, SEXP locale) { bool use_length_val = stri__prepare_arg_logical_1_notNA(use_length, "use_length"); double exponent_val = stri__prepare_arg_double_1_notNA(cost_exponent, "cost_exponent"); bool whitespace_only_val = stri__prepare_arg_logical_1_notNA(whitespace_only, "whitespace_only"); int width_val = stri__prepare_arg_integer_1_notNA(width, "width"); if (width_val <= 0) width_val = 0; int indent_val = stri__prepare_arg_integer_1_notNA(indent, "indent"); if (indent_val < 0) Rf_error(MSG__EXPECTED_POSITIVE, "indent"); int exdent_val = stri__prepare_arg_integer_1_notNA(exdent, "exdent"); if (exdent_val < 0) Rf_error(MSG__EXPECTED_POSITIVE, "exdent"); const char* qloc = stri__prepare_arg_locale(locale, "locale", true); /* this is R_alloc'ed */ Locale loc = Locale::createFromName(qloc); PROTECT(str = stri_prepare_arg_string(str, "str")); PROTECT(prefix = stri_prepare_arg_string_1(prefix, "prefix")); PROTECT(initial = stri_prepare_arg_string_1(initial, "initial")); BreakIterator* briter = NULL; UText* str_text = NULL; STRI__ERROR_HANDLER_BEGIN(3) UErrorCode status = U_ZERO_ERROR; briter = BreakIterator::createLineInstance(loc, status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) R_len_t str_length = LENGTH(str); StriContainerUTF8_indexable str_cont(str, str_length); StriContainerUTF8 prefix_cont(prefix, 1); StriContainerUTF8 initial_cont(initial, 1); // prepare indent/exdent/prefix/initial stuff: // 1st line, 1st para (i==0, u==0): initial+indent // nth line, 1st para (i==0, u> 0): prefix +exdent // 1st line, nth para (i> 0, u==0): prefix +indent // nth line, nth para (i> 0, u> 0): prefix +exdent StriWrapLineStart ii(initial_cont.get(0), indent_val); StriWrapLineStart pi(prefix_cont.get(0), indent_val); StriWrapLineStart pe(prefix_cont.get(0), exdent_val); status = U_ZERO_ERROR; //Unicode Newline Guidelines - Unicode Technical Report #13 UnicodeSet uset_linebreaks(UnicodeString::fromUTF8("[\\u000A-\\u000D\\u0085\\u2028\\u2029]"), status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) uset_linebreaks.freeze(); status = U_ZERO_ERROR; UnicodeSet uset_whitespaces(UnicodeString::fromUTF8("\\p{White_space}"), status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) uset_whitespaces.freeze(); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(VECSXP, str_length)); for (R_len_t i = 0; i < str_length; ++i) { if (str_cont.isNA(i) || prefix_cont.isNA(0) || initial_cont.isNA(0)) { SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1)); continue; } status = U_ZERO_ERROR; const char* str_cur_s = str_cont.get(i).c_str(); R_len_t str_cur_n = str_cont.get(i).length(); str_text = utext_openUTF8(str_text, str_cur_s, str_cont.get(i).length(), &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) status = U_ZERO_ERROR; briter->setText(str_text, status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) // all right, first let's generate a list of places at which we may do line breaks deque< R_len_t > occurrences_list; // this could be an R_len_t queue R_len_t match = briter->first(); while (match != BreakIterator::DONE) { if (!whitespace_only_val) occurrences_list.push_back(match); else { if (match > 0 && match < str_cur_n) { UChar32 c; U8_GET((const uint8_t*)str_cur_s, 0, match-1, str_cur_n, c); if (uset_whitespaces.contains(c)) occurrences_list.push_back(match); } else occurrences_list.push_back(match); } match = briter->next(); } R_len_t noccurrences = (R_len_t)occurrences_list.size(); // number of boundaries if (noccurrences <= 1) { // no match (1 boundary == 0) SET_VECTOR_ELT(ret, i, Rf_ScalarString(str_cont.toR(i))); continue; } // the number of "words" is: R_len_t nwords = noccurrences - 1; // convert occurrences_list to a vector // in order to obtain end positions (in a string) of each "words", // noting that occurrences_list.at(0) == 0 #ifndef NDEBUG if (occurrences_list.at(0) != 0) throw StriException("NDEBUG: stri_wrap: (occurrences_list.at(0) != 0)"); #endif std::vector<R_len_t> end_pos_orig(nwords); deque<R_len_t>::iterator iter = ++(occurrences_list.begin()); for (R_len_t j = 0; iter != occurrences_list.end(); ++iter, ++j) { end_pos_orig[j] = (*iter); // this is a UTF-8 index } // now: // we'll get the total widths/number of code points in each "word" std::vector<R_len_t> widths_orig(nwords); // we'll get the total widths/number of code points without trailing whitespaces std::vector<R_len_t> widths_trim(nwords); // we'll get the end positions without trailing whitespaces std::vector<R_len_t> end_pos_trim(nwords); // detect line endings (fail on a match) UChar32 c = 0; R_len_t j = 0; R_len_t cur_block = 0; R_len_t cur_width_orig = 0; R_len_t cur_width_trim = 0; R_len_t cur_count_orig = 0; R_len_t cur_count_trim = 0; R_len_t cur_end_pos_trim = 0; while (j < str_cur_n) { R_len_t jlast = j; U8_NEXT(str_cur_s, j, str_cur_n, c); if (c < 0) // invalid utf-8 sequence throw StriException(MSG__INVALID_UTF8); if (uset_linebreaks.contains(c)) throw StriException(MSG__NEWLINE_FOUND); cur_width_orig += stri__width_char(c); ++cur_count_orig; if (uset_whitespaces.contains(c)) { // OLD: trim all white spaces from the end: // ++cur_count_trim; // [we have the normalize arg for that] // NEW: trim just one white space at the end: cur_width_trim = stri__width_char(c); cur_count_trim = 1; cur_end_pos_trim = jlast; } else { cur_width_trim = 0; cur_count_trim = 0; cur_end_pos_trim = j; } if (j >= str_cur_n || end_pos_orig[cur_block] <= j) { // we'll start a new block in a moment if (use_length_val) { widths_orig[cur_block] = cur_count_orig; widths_trim[cur_block] = cur_count_orig-cur_count_trim; } else { widths_orig[cur_block] = cur_width_orig; widths_trim[cur_block] = cur_width_orig-cur_width_trim; } end_pos_trim[cur_block] = cur_end_pos_trim; cur_block++; cur_width_orig = 0; cur_width_trim = 0; cur_count_orig = 0; cur_count_trim = 0; cur_end_pos_trim = j; } } // do wrap std::deque<R_len_t> wrap_after; // wrap line after which word in {0..nwords-1}? if (exponent_val <= 0.0) { stri__wrap_greedy(wrap_after, nwords, width_val, widths_orig, widths_trim, (use_length_val)?((i==0)?ii.count:pi.count):((i==0)?ii.width:pi.width), (use_length_val)?pe.count:pe.width); } else { stri__wrap_dynamic(wrap_after, nwords, width_val, exponent_val, widths_orig, widths_trim, (use_length_val)?((i==0)?ii.count:pi.count):((i==0)?ii.width:pi.width), (use_length_val)?pe.count:pe.width); } // wrap_after.size() line breaks => wrap_after.size()+1 lines R_len_t nlines = (R_len_t)wrap_after.size()+1; R_len_t last_pos = 0; SEXP ans; STRI__PROTECT(ans = Rf_allocVector(STRSXP, nlines)); deque<R_len_t>::iterator iter_wrap = wrap_after.begin(); for (R_len_t u = 0; iter_wrap != wrap_after.end(); ++iter_wrap, ++u) { R_len_t wrap_after_cur = *iter_wrap; R_len_t cur_pos = end_pos_trim[wrap_after_cur]; std::string cs; if (i == 0 && u == 0) cs = ii.str; else if (i > 0 && u == 0) cs = pi.str; else cs = pe.str; cs.append(str_cur_s+last_pos, cur_pos-last_pos); SET_STRING_ELT(ans, u, Rf_mkCharLenCE(cs.c_str(), cs.size(), CE_UTF8)); last_pos = end_pos_orig[wrap_after_cur]; } // last line goes here: std::string cs; if (i == 0 && nlines-1 == 0) cs = ii.str; else if (i > 0 && nlines-1 == 0) cs = pi.str; else cs = pe.str; cs.append(str_cur_s+last_pos, end_pos_trim[nwords-1]-last_pos); SET_STRING_ELT(ans, nlines-1, Rf_mkCharLenCE(cs.c_str(), cs.size(), CE_UTF8)); SET_VECTOR_ELT(ret, i, ans); STRI__UNPROTECT(1); } if (briter) { delete briter; briter = NULL; } if (str_text) { utext_close(str_text); str_text = NULL; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({ if (briter) { delete briter; briter = NULL; } if (str_text) { utext_close(str_text); str_text = NULL; } }) }
~ScopedBreakIterator() { utext_close(mUText); delete mBreakIterator; }
/* * Internal titlecasing function. */ static int32_t _toTitle(UCaseMap *csm, uint8_t *dest, int32_t destCapacity, const uint8_t *src, UCaseContext *csc, int32_t srcLength, UErrorCode *pErrorCode) { UText utext=UTEXT_INITIALIZER; const UChar *s; UChar32 c; int32_t prev, titleStart, titleLimit, idx, destIndex, length; UBool isFirstIndex; utext_openUTF8(&utext, (const char *)src, srcLength, pErrorCode); if(U_FAILURE(*pErrorCode)) { return 0; } if(csm->iter==NULL) { csm->iter=ubrk_open(UBRK_WORD, csm->locale, NULL, 0, pErrorCode); } ubrk_setUText(csm->iter, &utext, pErrorCode); if(U_FAILURE(*pErrorCode)) { utext_close(&utext); return 0; } /* set up local variables */ destIndex=0; prev=0; isFirstIndex=TRUE; /* titlecasing loop */ while(prev<srcLength) { /* find next index where to titlecase */ if(isFirstIndex) { isFirstIndex=FALSE; idx=ubrk_first(csm->iter); } else { idx=ubrk_next(csm->iter); } if(idx==UBRK_DONE || idx>srcLength) { idx=srcLength; } /* * Unicode 4 & 5 section 3.13 Default Case Operations: * * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex * #29, "Text Boundaries." Between each pair of word boundaries, find the first * cased character F. If F exists, map F to default_title(F); then map each * subsequent character C to default_lower(C). * * In this implementation, segment [prev..index[ into 3 parts: * a) uncased characters (copy as-is) [prev..titleStart[ * b) first case letter (titlecase) [titleStart..titleLimit[ * c) subsequent characters (lowercase) [titleLimit..index[ */ if(prev<idx) { /* find and copy uncased characters [prev..titleStart[ */ titleStart=titleLimit=prev; U8_NEXT(src, titleLimit, idx, c); if((csm->options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) { /* Adjust the titlecasing index (titleStart) to the next cased character. */ for(;;) { titleStart=titleLimit; if(titleLimit==idx) { /* * only uncased characters in [prev..index[ * stop with titleStart==titleLimit==index */ break; } U8_NEXT(src, titleLimit, idx, c); if(UCASE_NONE!=ucase_getType(csm->csp, c)) { break; /* cased letter at [titleStart..titleLimit[ */ } } length=titleStart-prev; if(length>0) { if((destIndex+length)<=destCapacity) { uprv_memcpy(dest+destIndex, src+prev, length); } destIndex+=length; } } if(titleStart<titleLimit) { /* titlecase c which is from [titleStart..titleLimit[ */ csc->cpStart=titleStart; csc->cpLimit=titleLimit; c=ucase_toFullTitle(csm->csp, c, utf8_caseContextIterator, csc, &s, csm->locale, &csm->locCache); destIndex=appendResult(dest, destIndex, destCapacity, c, s); /* Special case Dutch IJ titlecasing */ if ( titleStart+1 < idx && ucase_getCaseLocale(csm->locale,&csm->locCache) == UCASE_LOC_DUTCH && ( src[titleStart] == 0x0049 || src[titleStart] == 0x0069 ) && ( src[titleStart+1] == 0x004A || src[titleStart+1] == 0x006A )) { c=0x004A; destIndex=appendResult(dest, destIndex, destCapacity, c, s); titleLimit++; } /* lowercase [titleLimit..index[ */ if(titleLimit<idx) { if((csm->options&U_TITLECASE_NO_LOWERCASE)==0) { /* Normal operation: Lowercase the rest of the word. */ destIndex+= _caseMap( csm, ucase_toFullLower, dest+destIndex, destCapacity-destIndex, src, csc, titleLimit, idx, pErrorCode); } else { /* Optionally just copy the rest of the word unchanged. */ length=idx-titleLimit; if((destIndex+length)<=destCapacity) { uprv_memcpy(dest+destIndex, src+titleLimit, length); } destIndex+=length; } } } } prev=idx; } if(destIndex>destCapacity) { *pErrorCode=U_BUFFER_OVERFLOW_ERROR; } utext_close(&utext); return destIndex; }
static void TestAPI(void) { UErrorCode status = U_ZERO_ERROR; UBool gFailed = FALSE; (void)gFailed; /* Suppress set but not used warning. */ /* Open */ { UText utLoc = UTEXT_INITIALIZER; const char * cString = "\x61\x62\x63\x64"; UChar uString[] = {0x41, 0x42, 0x43, 0}; UText *uta; UText *utb; UChar c; uta = utext_openUChars(NULL, uString, -1, &status); TEST_SUCCESS(status); c = utext_next32(uta); TEST_ASSERT(c == 0x41); utb = utext_close(uta); TEST_ASSERT(utb == NULL); uta = utext_openUTF8(&utLoc, cString, -1, &status); TEST_SUCCESS(status); TEST_ASSERT(uta == &utLoc); uta = utext_close(&utLoc); TEST_ASSERT(uta == &utLoc); } /* utext_clone() */ { UChar uString[] = {0x41, 0x42, 0x43, 0}; int64_t len; UText *uta; UText *utb; status = U_ZERO_ERROR; uta = utext_openUChars(NULL, uString, -1, &status); TEST_SUCCESS(status); utb = utext_clone(NULL, uta, FALSE, FALSE, &status); TEST_SUCCESS(status); TEST_ASSERT(utb != NULL); TEST_ASSERT(utb != uta); len = utext_nativeLength(uta); TEST_ASSERT(len == u_strlen(uString)); utext_close(uta); utext_close(utb); } /* basic access functions */ { UChar uString[] = {0x41, 0x42, 0x43, 0}; UText *uta; UChar32 c; int64_t len; UBool b; int64_t i; status = U_ZERO_ERROR; uta = utext_openUChars(NULL, uString, -1, &status); TEST_ASSERT(uta!=NULL); TEST_SUCCESS(status); b = utext_isLengthExpensive(uta); TEST_ASSERT(b==TRUE); len = utext_nativeLength(uta); TEST_ASSERT(len == u_strlen(uString)); b = utext_isLengthExpensive(uta); TEST_ASSERT(b==FALSE); c = utext_char32At(uta, 0); TEST_ASSERT(c==uString[0]); c = utext_current32(uta); TEST_ASSERT(c==uString[0]); c = utext_next32(uta); TEST_ASSERT(c==uString[0]); c = utext_current32(uta); TEST_ASSERT(c==uString[1]); c = utext_previous32(uta); TEST_ASSERT(c==uString[0]); c = utext_current32(uta); TEST_ASSERT(c==uString[0]); c = utext_next32From(uta, 1); TEST_ASSERT(c==uString[1]); c = utext_next32From(uta, u_strlen(uString)); TEST_ASSERT(c==U_SENTINEL); c = utext_previous32From(uta, 2); TEST_ASSERT(c==uString[1]); i = utext_getNativeIndex(uta); TEST_ASSERT(i == 1); utext_setNativeIndex(uta, 0); b = utext_moveIndex32(uta, 1); TEST_ASSERT(b==TRUE); i = utext_getNativeIndex(uta); TEST_ASSERT(i==1); b = utext_moveIndex32(uta, u_strlen(uString)-1); TEST_ASSERT(b==TRUE); i = utext_getNativeIndex(uta); TEST_ASSERT(i==u_strlen(uString)); b = utext_moveIndex32(uta, 1); TEST_ASSERT(b==FALSE); i = utext_getNativeIndex(uta); TEST_ASSERT(i==u_strlen(uString)); utext_setNativeIndex(uta, 0); c = UTEXT_NEXT32(uta); TEST_ASSERT(c==uString[0]); c = utext_current32(uta); TEST_ASSERT(c==uString[1]); c = UTEXT_PREVIOUS32(uta); TEST_ASSERT(c==uString[0]); c = UTEXT_PREVIOUS32(uta); TEST_ASSERT(c==U_SENTINEL); utext_close(uta); } { /* * UText opened on a NULL string with zero length */ UText *uta; UChar32 c; status = U_ZERO_ERROR; uta = utext_openUChars(NULL, NULL, 0, &status); TEST_SUCCESS(status); c = UTEXT_NEXT32(uta); TEST_ASSERT(c == U_SENTINEL); utext_close(uta); uta = utext_openUTF8(NULL, NULL, 0, &status); TEST_SUCCESS(status); c = UTEXT_NEXT32(uta); TEST_ASSERT(c == U_SENTINEL); utext_close(uta); } { /* * extract */ UText *uta; UChar uString[] = {0x41, 0x42, 0x43, 0}; UChar buf[100]; int32_t i; /* Test pinning of input bounds */ UChar uString2[] = {0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0}; UChar * uString2Ptr = uString2 + 5; status = U_ZERO_ERROR; uta = utext_openUChars(NULL, uString, -1, &status); TEST_SUCCESS(status); status = U_ZERO_ERROR; i = utext_extract(uta, 0, 100, NULL, 0, &status); TEST_ASSERT(status==U_BUFFER_OVERFLOW_ERROR); TEST_ASSERT(i == u_strlen(uString)); status = U_ZERO_ERROR; memset(buf, 0, sizeof(buf)); i = utext_extract(uta, 0, 100, buf, 100, &status); TEST_SUCCESS(status); TEST_ASSERT(i == u_strlen(uString)); i = u_strcmp(uString, buf); TEST_ASSERT(i == 0); utext_close(uta); /* Test pinning of input bounds */ status = U_ZERO_ERROR; uta = utext_openUChars(NULL, uString2Ptr, -1, &status); TEST_SUCCESS(status); status = U_ZERO_ERROR; memset(buf, 0, sizeof(buf)); i = utext_extract(uta, -3, 20, buf, 100, &status); TEST_SUCCESS(status); TEST_ASSERT(i == u_strlen(uString2Ptr)); i = u_strcmp(uString2Ptr, buf); TEST_ASSERT(i == 0); utext_close(uta); } { /* * Copy, Replace, isWritable * Can't create an editable UText from plain C, so all we * can easily do is check that errors returned. */ UText *uta; UChar uString[] = {0x41, 0x42, 0x43, 0}; UBool b; status = U_ZERO_ERROR; uta = utext_openUChars(NULL, uString, -1, &status); TEST_SUCCESS(status); b = utext_isWritable(uta); TEST_ASSERT(b == FALSE); b = utext_hasMetaData(uta); TEST_ASSERT(b == FALSE); utext_replace(uta, 0, 1, /* start, limit */ uString, -1, /* replacement, replacement length */ &status); TEST_ASSERT(status == U_NO_WRITE_PERMISSION); utext_copy(uta, 0, 1, /* start, limit */ 2, /* destination index */ FALSE, /* move flag */ &status); TEST_ASSERT(status == U_NO_WRITE_PERMISSION); utext_close(uta); } }
/* * @param text A UText representing the text * @param rangeStart The start of the range of dictionary characters * @param rangeEnd The end of the range of dictionary characters * @param foundBreaks Output of C array of int32_t break positions, or 0 * @return The number of breaks found */ int32_t CjkBreakEngine::divideUpDictionaryRange( UText *text, int32_t rangeStart, int32_t rangeEnd, UStack &foundBreaks ) const { if (rangeStart >= rangeEnd) { return 0; } const size_t defaultInputLength = 80; size_t inputLength = rangeEnd - rangeStart; // TODO: Replace by UnicodeString. AutoBuffer<UChar, defaultInputLength> charString(inputLength); // Normalize the input string and put it in normalizedText. // The map from the indices of the normalized input to the raw // input is kept in charPositions. UErrorCode status = U_ZERO_ERROR; utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, &status); if (U_FAILURE(status)) { return 0; } UnicodeString inputString(charString.elems(), inputLength); // TODO: Use Normalizer2. UNormalizationMode norm_mode = UNORM_NFKC; UBool isNormalized = Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES || Normalizer::isNormalized(inputString, norm_mode, status); // TODO: Replace by UVector32. AutoBuffer<int32_t, defaultInputLength> charPositions(inputLength + 1); int numChars = 0; UText normalizedText = UTEXT_INITIALIZER; // Needs to be declared here because normalizedText holds onto its buffer. UnicodeString normalizedString; if (isNormalized) { int32_t index = 0; charPositions[0] = 0; while(index < inputString.length()) { index = inputString.moveIndex32(index, 1); charPositions[++numChars] = index; } utext_openUnicodeString(&normalizedText, &inputString, &status); } else { Normalizer::normalize(inputString, norm_mode, 0, normalizedString, status); if (U_FAILURE(status)) { return 0; } charPositions.resize(normalizedString.length() + 1); Normalizer normalizer(charString.elems(), inputLength, norm_mode); int32_t index = 0; charPositions[0] = 0; while(index < normalizer.endIndex()){ /* UChar32 uc = */ normalizer.next(); charPositions[++numChars] = index = normalizer.getIndex(); } utext_openUnicodeString(&normalizedText, &normalizedString, &status); } if (U_FAILURE(status)) { return 0; } // From this point on, all the indices refer to the indices of // the normalized input string. // bestSnlp[i] is the snlp of the best segmentation of the first i // characters in the range to be matched. // TODO: Replace by UVector32. AutoBuffer<uint32_t, defaultInputLength> bestSnlp(numChars + 1); bestSnlp[0] = 0; for(int i = 1; i <= numChars; i++) { bestSnlp[i] = kuint32max; } // prev[i] is the index of the last CJK character in the previous word in // the best segmentation of the first i characters. // TODO: Replace by UVector32. AutoBuffer<int, defaultInputLength> prev(numChars + 1); for(int i = 0; i <= numChars; i++){ prev[i] = -1; } const size_t maxWordSize = 20; // TODO: Replace both with UVector32. AutoBuffer<int32_t, maxWordSize> values(numChars); AutoBuffer<int32_t, maxWordSize> lengths(numChars); // Dynamic programming to find the best segmentation. bool is_prev_katakana = false; for (int32_t i = 0; i < numChars; ++i) { //utext_setNativeIndex(text, rangeStart + i); utext_setNativeIndex(&normalizedText, i); if (bestSnlp[i] == kuint32max) continue; int32_t count; // limit maximum word length matched to size of current substring int32_t maxSearchLength = (i + maxWordSize < (size_t) numChars)? maxWordSize : (numChars - i); fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(), count, maxSearchLength, values.elems()); // if there are no single character matches found in the dictionary // starting with this charcter, treat character as a 1-character word // with the highest value possible, i.e. the least likely to occur. // Exclude Korean characters from this treatment, as they should be left // together by default. if((count == 0 || lengths[0] != 1) && !fHangulWordSet.contains(utext_current32(&normalizedText))) { values[count] = maxSnlp; lengths[count++] = 1; } for (int j = 0; j < count; j++) { uint32_t newSnlp = bestSnlp[i] + values[j]; if (newSnlp < bestSnlp[lengths[j] + i]) { bestSnlp[lengths[j] + i] = newSnlp; prev[lengths[j] + i] = i; } } // In Japanese, // Katakana word in single character is pretty rare. So we apply // the following heuristic to Katakana: any continuous run of Katakana // characters is considered a candidate word with a default cost // specified in the katakanaCost table according to its length. //utext_setNativeIndex(text, rangeStart + i); utext_setNativeIndex(&normalizedText, i); bool is_katakana = isKatakana(utext_current32(&normalizedText)); if (!is_prev_katakana && is_katakana) { int j = i + 1; utext_next32(&normalizedText); // Find the end of the continuous run of Katakana characters while (j < numChars && (j - i) < kMaxKatakanaGroupLength && isKatakana(utext_current32(&normalizedText))) { utext_next32(&normalizedText); ++j; } if ((j - i) < kMaxKatakanaGroupLength) { uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i); if (newSnlp < bestSnlp[j]) { bestSnlp[j] = newSnlp; prev[j] = i; } } } is_prev_katakana = is_katakana; } // Start pushing the optimal offset index into t_boundary (t for tentative). // prev[numChars] is guaranteed to be meaningful. // We'll first push in the reverse order, i.e., // t_boundary[0] = numChars, and afterwards do a swap. // TODO: Replace by UVector32. AutoBuffer<int, maxWordSize> t_boundary(numChars + 1); int numBreaks = 0; // No segmentation found, set boundary to end of range if (bestSnlp[numChars] == kuint32max) { t_boundary[numBreaks++] = numChars; } else { for (int i = numChars; i > 0; i = prev[i]) { t_boundary[numBreaks++] = i; } U_ASSERT(prev[t_boundary[numBreaks - 1]] == 0); } // Reverse offset index in t_boundary. // Don't add a break for the start of the dictionary range if there is one // there already. if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) { t_boundary[numBreaks++] = 0; } // Now that we're done, convert positions in t_bdry[] (indices in // the normalized input string) back to indices in the raw input string // while reversing t_bdry and pushing values to foundBreaks. for (int i = numBreaks-1; i >= 0; i--) { foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status); } utext_close(&normalizedText); return numBreaks; }
static void TestBreakIteratorCAPI() { UErrorCode status = U_ZERO_ERROR; UBreakIterator *word, *sentence, *line, *character, *b, *bogus; int32_t start,pos,end,to; int32_t i; int32_t count = 0; UChar text[50]; /* Note: the adjacent "" are concatenating strings, not adding a \" to the string, which is probably what whoever wrote this intended. Don't fix, because it would throw off the hard coded break positions in the following tests. */ u_uastrcpy(text, "He's from Africa. ""Mr. Livingston, I presume?"" Yeah"); /*test ubrk_open()*/ log_verbose("\nTesting BreakIterator open functions\n"); /* Use french for fun */ word = ubrk_open(UBRK_WORD, "en_US", text, u_strlen(text), &status); if(status == U_FILE_ACCESS_ERROR) { log_data_err("Check your data - it doesn't seem to be around\n"); return; } else if(U_FAILURE(status)){ log_err_status(status, "FAIL: Error in ubrk_open() for word breakiterator: %s\n", myErrorName(status)); } else{ log_verbose("PASS: Successfully opened word breakiterator\n"); } sentence = ubrk_open(UBRK_SENTENCE, "en_US", text, u_strlen(text), &status); if(U_FAILURE(status)){ log_err_status(status, "FAIL: Error in ubrk_open() for sentence breakiterator: %s\n", myErrorName(status)); return; } else{ log_verbose("PASS: Successfully opened sentence breakiterator\n"); } line = ubrk_open(UBRK_LINE, "en_US", text, u_strlen(text), &status); if(U_FAILURE(status)){ log_err("FAIL: Error in ubrk_open() for line breakiterator: %s\n", myErrorName(status)); return; } else{ log_verbose("PASS: Successfully opened line breakiterator\n"); } character = ubrk_open(UBRK_CHARACTER, "en_US", text, u_strlen(text), &status); if(U_FAILURE(status)){ log_err("FAIL: Error in ubrk_open() for character breakiterator: %s\n", myErrorName(status)); return; } else{ log_verbose("PASS: Successfully opened character breakiterator\n"); } /*trying to open an illegal iterator*/ bogus = ubrk_open((UBreakIteratorType)5, "en_US", text, u_strlen(text), &status); if(U_SUCCESS(status)){ log_err("FAIL: Error in ubrk_open() for BOGUS breakiterator. Expected U_ILLEGAL_ARGUMENT_ERROR\n"); } if(U_FAILURE(status)){ if(status != U_ILLEGAL_ARGUMENT_ERROR){ log_err("FAIL: Error in ubrk_open() for BOGUS breakiterator. Expected U_ILLEGAL_ARGUMENT_ERROR\n Got %s\n", myErrorName(status)); } } status=U_ZERO_ERROR; /* ======= Test ubrk_countAvialable() and ubrk_getAvialable() */ log_verbose("\nTesting ubrk_countAvailable() and ubrk_getAvailable()\n"); count=ubrk_countAvailable(); /* use something sensible w/o hardcoding the count */ if(count < 0){ log_err("FAIL: Error in ubrk_countAvialable() returned %d\n", count); } else{ log_verbose("PASS: ubrk_countAvialable() successful returned %d\n", count); } for(i=0;i<count;i++) { log_verbose("%s\n", ubrk_getAvailable(i)); if (ubrk_getAvailable(i) == 0) log_err("No locale for which breakiterator is applicable\n"); else log_verbose("A locale %s for which breakiterator is applicable\n",ubrk_getAvailable(i)); } /*========Test ubrk_first(), ubrk_last()...... and other functions*/ log_verbose("\nTesting the functions for word\n"); start = ubrk_first(word); if(start!=0) log_err("error ubrk_start(word) did not return 0\n"); log_verbose("first (word = %d\n", (int32_t)start); pos=ubrk_next(word); if(pos!=4) log_err("error ubrk_next(word) did not return 4\n"); log_verbose("next (word = %d\n", (int32_t)pos); pos=ubrk_following(word, 4); if(pos!=5) log_err("error ubrl_following(word,4) did not return 6\n"); log_verbose("next (word = %d\n", (int32_t)pos); end=ubrk_last(word); if(end!=49) log_err("error ubrk_last(word) did not return 49\n"); log_verbose("last (word = %d\n", (int32_t)end); pos=ubrk_previous(word); log_verbose("%d %d\n", end, pos); pos=ubrk_previous(word); log_verbose("%d \n", pos); if (ubrk_isBoundary(word, 2) != FALSE) { log_err("error ubrk_isBoundary(word, 2) did not return FALSE\n"); } pos=ubrk_current(word); if (pos != 4) { log_err("error ubrk_current() != 4 after ubrk_isBoundary(word, 2)\n"); } if (ubrk_isBoundary(word, 4) != TRUE) { log_err("error ubrk_isBoundary(word, 4) did not return TRUE\n"); } log_verbose("\nTesting the functions for character\n"); ubrk_first(character); pos = ubrk_following(character, 5); if(pos!=6) log_err("error ubrk_following(character,5) did not return 6\n"); log_verbose("Following (character,5) = %d\n", (int32_t)pos); pos=ubrk_following(character, 18); if(pos!=19) log_err("error ubrk_following(character,18) did not return 19\n"); log_verbose("Followingcharacter,18) = %d\n", (int32_t)pos); pos=ubrk_preceding(character, 22); if(pos!=21) log_err("error ubrk_preceding(character,22) did not return 21\n"); log_verbose("preceding(character,22) = %d\n", (int32_t)pos); log_verbose("\nTesting the functions for line\n"); pos=ubrk_first(line); if(pos != 0) log_err("error ubrk_first(line) returned %d, expected 0\n", (int32_t)pos); pos = ubrk_next(line); pos=ubrk_following(line, 18); if(pos!=22) log_err("error ubrk_following(line) did not return 22\n"); log_verbose("following (line) = %d\n", (int32_t)pos); log_verbose("\nTesting the functions for sentence\n"); ubrk_first(sentence); pos = ubrk_current(sentence); log_verbose("Current(sentence) = %d\n", (int32_t)pos); pos = ubrk_last(sentence); if(pos!=49) log_err("error ubrk_last for sentence did not return 49\n"); log_verbose("Last (sentence) = %d\n", (int32_t)pos); ubrk_first(sentence); to = ubrk_following( sentence, 0 ); if (to == 0) log_err("ubrk_following returned 0\n"); to = ubrk_preceding( sentence, to ); if (to != 0) log_err("ubrk_preceding didn't return 0\n"); if (ubrk_first(sentence)!=ubrk_current(sentence)) { log_err("error in ubrk_first() or ubrk_current()\n"); } /*---- */ /*Testing ubrk_open and ubrk_close()*/ log_verbose("\nTesting open and close for us locale\n"); b = ubrk_open(UBRK_WORD, "fr_FR", text, u_strlen(text), &status); if (U_FAILURE(status)) { log_err("ubrk_open for word returned NULL: %s\n", myErrorName(status)); } ubrk_close(b); /* Test setText and setUText */ { UChar s1[] = {0x41, 0x42, 0x20, 0}; UChar s2[] = {0x41, 0x42, 0x43, 0x44, 0x45, 0}; UText *ut = NULL; UBreakIterator *bb; int j; log_verbose("\nTesting ubrk_setText() and ubrk_setUText()\n"); status = U_ZERO_ERROR; bb = ubrk_open(UBRK_WORD, "en_US", NULL, 0, &status); TEST_ASSERT_SUCCESS(status); ubrk_setText(bb, s1, -1, &status); TEST_ASSERT_SUCCESS(status); ubrk_first(bb); j = ubrk_next(bb); TEST_ASSERT(j == 2); ut = utext_openUChars(ut, s2, -1, &status); ubrk_setUText(bb, ut, &status); TEST_ASSERT_SUCCESS(status); j = ubrk_next(bb); TEST_ASSERT(j == 5); ubrk_close(bb); utext_close(ut); } ubrk_close(word); ubrk_close(sentence); ubrk_close(line); ubrk_close(character); }