/* Creating and using text boundaries */ int main( void ) { puts("ICU Break Iterator Sample Program\n"); puts("C++ Break Iteration\n"); BreakIterator* boundary; UnicodeString stringToExamine("Aaa bbb ccc. Ddd eee fff."); printf("Examining: "); printUnicodeString(stringToExamine); puts(""); //print each sentence in forward and reverse order UErrorCode status = U_ZERO_ERROR; boundary = BreakIterator::createSentenceInstance( Locale::getUS(), status ); if (U_FAILURE(status)) { printf("failed to create sentence break iterator. status = %s", u_errorName(status)); exit(1); } boundary->setText(stringToExamine); puts("\n Sentence Boundaries... "); puts("----- forward: -----------"); printEachForward(*boundary); puts("----- backward: ----------"); printEachBackward(*boundary); delete boundary; //print each word in order printf("\n Word Boundaries... \n"); boundary = BreakIterator::createWordInstance( Locale::getUS(), status); boundary->setText(stringToExamine); puts("----- forward: -----------"); printEachForward(*boundary); //print first element puts("----- first: -------------"); printFirst(*boundary); //print last element puts("----- last: --------------"); printLast(*boundary); //print word at charpos 10 puts("----- at pos 10: ---------"); printAt(*boundary, 10 ); delete boundary; puts("\nEnd C++ Break Iteration"); // Call the C version return c_main(); }
U_NAMESPACE_USE

/* functions available in the common library (for unistr_case.cpp) */

/* public API functions */

/**
 * Titlecases src into dest using ustrcase_mapWithOverlap() with the
 * ustrcase_internalToTitle mapper.
 *
 * @param dest         output buffer
 * @param destCapacity capacity of dest
 * @param src          input text
 * @param srcLength    length of src; a negative value makes the temporary
 *                     UnicodeString alias treat src as NUL-terminated
 * @param titleIter    break iterator to use, or NULL to have a word break
 *                     iterator for locale created (and released) internally
 * @param locale       locale ID used both for the internally created
 *                     iterator and for the case locale lookup
 * @param pErrorCode   in/out ICU error code
 * @return 0 if *pErrorCode indicates a failure before mapping starts,
 *         otherwise the result of ustrcase_mapWithOverlap()
 */
U_CAPI int32_t U_EXPORT2
u_strToTitle(UChar *dest, int32_t destCapacity,
             const UChar *src, int32_t srcLength,
             UBreakIterator *titleIter,
             const char *locale,
             UErrorCode *pErrorCode) {
    // Holds the iterator only when it was created here, so that it is
    // released on every return path; a caller-supplied one is never adopted.
    LocalPointer<BreakIterator> ownedIter;
    BreakIterator *iter = reinterpret_cast<BreakIterator *>(titleIter);
    if (iter == NULL) {
        iter = BreakIterator::createWordInstance(Locale(locale), *pErrorCode);
        ownedIter.adoptInstead(iter);
    }

    if (U_FAILURE(*pErrorCode)) {
        return 0;
    }

    // Read-only alias of the source text for the iterator.
    UnicodeString s(srcLength < 0, src, srcLength);
    iter->setText(s);

    return ustrcase_mapWithOverlap(
        ustrcase_getCaseLocale(locale), 0, iter,
        dest, destCapacity,
        src, srcLength,
        ustrcase_internalToTitle, *pErrorCode);
}
/**
 * Points the break iterator behind the C handle bi at the given UText.
 *
 * @param bi     C handle of the break iterator
 * @param text   text to iterate over
 * @param status in/out ICU error code, forwarded to BreakIterator::setText()
 */
U_DRAFT void U_EXPORT2
ubrk_setUText(UBreakIterator *bi,
              UText          *text,
              UErrorCode     *status)
{
    reinterpret_cast<BreakIterator *>(bi)->setText(text, *status);
}
/**
 * Points the break iterator behind the C handle bi at a UChar string.
 *
 * The string is wrapped in a stack-allocated UText which is then passed to
 * BreakIterator::setText().
 *
 * @param bi         C handle of the break iterator
 * @param text       text to iterate over
 * @param textLength length of text, forwarded to utext_openUChars()
 * @param status     in/out ICU error code
 */
U_CAPI void U_EXPORT2
ubrk_setText(UBreakIterator* bi,
             const UChar*    text,
             int32_t         textLength,
             UErrorCode*     status)
{
    UText wrapper = UTEXT_INITIALIZER;
    utext_openUChars(&wrapper, text, textLength, status);

    reinterpret_cast<BreakIterator *>(bi)->setText(&wrapper, *status);
    // A stack allocated UText wrapping a UChar * string
    //   can be dumped without explicitly closing it.
}
void GetWordBoundaryPositions(const FunctionCallbackInfo<Value>& args) { Isolate* isolate = Isolate::GetCurrent(); HandleScope scope(isolate); if (args.Length() != 2) { isolate->ThrowException(Exception::TypeError(String::NewFromUtf8(isolate, "must supply locale and text"))); return; } if (!args[0]->IsString()) { isolate->ThrowException(Exception::TypeError(String::NewFromUtf8(isolate, "text is not specified"))); return; } if (!args[1]->IsString()) { isolate->ThrowException(Exception::TypeError(String::NewFromUtf8(isolate, "locale is not specified"))); return; } // convert v8 locale to ICU String::Utf8Value locale(args[1]->ToString()); const char* country = strtok(*locale, "_"), *language = strtok(NULL, "_"); Locale icuLocale(language, country); // create the BreakIterator instance UErrorCode err = U_ZERO_ERROR; BreakIterator *iterator = BreakIterator::createWordInstance(icuLocale, err); if (U_FAILURE(err)) { ErrorCode errCode; errCode.set(err); isolate->ThrowException(Exception::TypeError(String::NewFromUtf8(isolate, errCode.errorName()))); return; } // Convert v8 text to ICU Unicode value Local<String> textStr = args[0]->ToString(); String::Utf8Value textValue(textStr); UnicodeString uTextValue(*textValue, "UTF-8"); if (uTextValue.isBogus()) { isolate->ThrowException(Exception::TypeError(String::NewFromUtf8(isolate, "unable to create unicode string"))); return; } iterator->setText(uTextValue); // populate boundaries Local<Array> results = Array::New(isolate); int32_t arrayPosition = 0; int32_t currentBoundary = iterator->first(); int32_t previousBoundary = 0; while (currentBoundary != BreakIterator::DONE) { if (currentBoundary > 0) { Local<Object> boundaryResult = Object::New(isolate); boundaryResult->Set(String::NewFromUtf8(isolate, "start"), Number::New(isolate, previousBoundary)); boundaryResult->Set(String::NewFromUtf8(isolate, "end"), Number::New(isolate, currentBoundary)); results->Set(arrayPosition++, boundaryResult); } previousBoundary = currentBoundary; 
currentBoundary = iterator->next(); } // cleanup delete iterator; args.GetReturnValue().Set(results); }
void Hyphenator::slotHyphenate(PageItem* it) { if (!(it->asTextFrame()) || (it->itemText.length() == 0)) return; m_doc->DoDrawing = false; QString text = ""; int startC = 0; if (it->itemText.selectionLength() > 0) { startC = it->itemText.startOfSelection(); text = it->itemText.text(startC, it->itemText.selectionLength()); } else { text = it->itemText.text(0, it->itemText.length()); } rememberedWords.clear(); qApp->setOverrideCursor(QCursor(Qt::WaitCursor)); BreakIterator* bi = StoryText::getWordIterator(); bi->setText((const UChar*) text.utf16()); int pos = bi->first(); while (pos != BreakIterator::DONE) { int firstC = pos; pos = bi->next(); int lastC = pos; int countC = lastC - firstC; const CharStyle& style = it->itemText.charStyle(firstC); if (countC > 0 && countC > style.hyphenWordMin() - 1) { QString word = text.mid(firstC, countC); QString wordLower = QLocale(style.language()).toLower(word); if (wordLower.contains(SpecialChars::SHYPHEN)) break; bool ok = loadDict(style.language()); if (!ok) continue; QByteArray te = m_codec->fromUnicode(wordLower); char *buffer = static_cast<char*>(malloc(te.length() + 5)); if (buffer == nullptr) break; char **rep = nullptr; int *pos = nullptr; int *cut = nullptr; // TODO: support non-standard hyphenation, see hnj_hyphen_hyphenate2 docs if (!hnj_hyphen_hyphenate2(m_hdict, te.data(), te.length(), buffer, nullptr, &rep, &pos, &cut)) { int i = 0; buffer[te.length()] = '\0'; bool hasHyphen = false; for (i = 1; i < wordLower.length()-1; ++i) { if(buffer[i] & 1) { hasHyphen = true; break; } } QString outs = ""; QString input = ""; outs += word[0]; for (i = 1; i < wordLower.length()-1; ++i) { outs += word[i]; if(buffer[i] & 1) outs += "-"; } outs += word.rightRef(1); input = outs; if (!ignoredWords.contains(word)) { if (!hasHyphen) it->itemText.hyphenateWord(startC + firstC, wordLower.length(), nullptr); else if (m_automatic) { if (specialWords.contains(word)) { outs = specialWords.value(word); uint ii = 1; for (i = 1; i < 
outs.length()-1; ++i) { QChar cht = outs[i]; if (cht == '-') buffer[ii-1] = 1; else { buffer[ii] = 0; ++ii; } } } it->itemText.hyphenateWord(startC + firstC, wordLower.length(), buffer); } else { if (specialWords.contains(word)) { outs = specialWords.value(word); uint ii = 1; for (i = 1; i < outs.length()-1; ++i) { QChar cht = outs[i]; if (cht == '-') buffer[ii-1] = 1; else { buffer[ii] = 0; ++ii; } } } if (rememberedWords.contains(input)) { outs = rememberedWords.value(input); uint ii = 1; for (i = 1; i < outs.length()-1; ++i) { QChar cht = outs[i]; if (cht == '-') buffer[ii-1] = 1; else { buffer[ii] = 0; ++ii; } } it->itemText.hyphenateWord(firstC, wordLower.length(), buffer); } else { qApp->changeOverrideCursor(QCursor(Qt::ArrowCursor)); PrefsContext* prefs = PrefsManager::instance()->prefsFile->getContext("hyhpen_options"); int xpos = prefs->getInt("Xposition", -9999); int ypos = prefs->getInt("Yposition", -9999); HyAsk *dia = new HyAsk((QWidget*)parent(), outs); if ((xpos != -9999) && (ypos != -9999)) dia->move(xpos, ypos); qApp->processEvents(); if (dia->exec()) { outs = dia->Wort->text(); uint ii = 1; for (i = 1; i < outs.length()-1; ++i) { QChar cht = outs[i]; if (cht == '-') buffer[ii-1] = 1; else { buffer[ii] = 0; ++ii; } } if (!rememberedWords.contains(input)) rememberedWords.insert(input, outs); if (dia->addToIgnoreList->isChecked()) { if (!ignoredWords.contains(word)) ignoredWords.insert(word); } if (dia->addToExceptionList->isChecked()) { if (!specialWords.contains(word)) specialWords.insert(word, outs); } it->itemText.hyphenateWord(firstC, wordLower.length(), buffer); } else { free(buffer); buffer = nullptr; prefs->set("Xposition", dia->xpos); prefs->set("Yposition", dia->ypos); delete dia; break; } prefs->set("Xposition", dia->xpos); prefs->set("Yposition", dia->ypos); delete dia; qApp->changeOverrideCursor(QCursor(Qt::WaitCursor)); } } } } free(buffer); if (rep) { for (int i = 0; i < te.length() - 1; ++i) free(rep[i]); } free(rep); free(pos); 
free(cut); } } qApp->restoreOverrideCursor(); m_doc->DoDrawing = true; rememberedWords.clear(); }
/**
 * Splits a UTF-8 text into words using the ICU word break iterator for the
 * collator's valid locale.
 *
 * Segments shorter than minimalLength (measured in UTF-16 code units) are
 * dropped; segments longer than maximalLength are cut to maximalLength code
 * units. Each retained segment is converted back to UTF-8 and appended to
 * the result vector.
 *
 * @param text          input text (UTF-8)
 * @param textLength    length of text in bytes
 * @param minimalLength minimal length of a word to be returned
 * @param maximalLength maximal length of a returned word
 * @param lowerCase     if true, the text is lower-cased before splitting
 * @return a newly allocated string vector (to be released by the caller), or
 *         NULL if the input is empty or too short, no word was found, or an
 *         allocation or ICU call failed
 */
TRI_vector_string_t* Utf8Helper::getWords (const char* const text,
                                           const size_t textLength,
                                           const size_t minimalLength,
                                           const size_t maximalLength,
                                           bool lowerCase) {
  UErrorCode status = U_ZERO_ERROR;

  if (textLength == 0) {
    // input text is empty
    return NULL;
  }

  if (textLength < minimalLength) {
    // input text is shorter than required minimum length
    return NULL;
  }

  size_t textUtf16Length = 0;
  UChar* textUtf16 = NULL;

  if (lowerCase) {
    // lower case string
    int32_t lowerLength = 0;
    char* lower = tolower(TRI_UNKNOWN_MEM_ZONE, text, (int32_t) textLength, lowerLength);

    if (lower == NULL) {
      // out of memory
      return NULL;
    }

    if (lowerLength == 0) {
      TRI_Free(TRI_UNKNOWN_MEM_ZONE, lower);
      return NULL;
    }

    textUtf16 = TRI_Utf8ToUChar(TRI_UNKNOWN_MEM_ZONE, lower, lowerLength, &textUtf16Length);
    TRI_Free(TRI_UNKNOWN_MEM_ZONE, lower);
  }
  else {
    textUtf16 = TRI_Utf8ToUChar(TRI_UNKNOWN_MEM_ZONE, text, (int32_t) textLength, &textUtf16Length);
  }

  if (textUtf16 == NULL) {
    return NULL;
  }

  ULocDataLocaleType type = ULOC_VALID_LOCALE;
  const Locale& locale = _coll->getLocale(type, status);

  if (U_FAILURE(status)) {
    TRI_Free(TRI_UNKNOWN_MEM_ZONE, textUtf16);
    LOG_ERROR("error in Collator::getLocale(...): %s", u_errorName(status));
    return NULL;
  }

  // scratch buffer for one extracted word; no word can be longer than the text
  UChar* tempUtf16 = (UChar *) TRI_Allocate(TRI_UNKNOWN_MEM_ZONE, (textUtf16Length + 1) * sizeof(UChar), false);

  if (tempUtf16 == NULL) {
    TRI_Free(TRI_UNKNOWN_MEM_ZONE, textUtf16);
    return NULL;
  }

  TRI_vector_string_t* words = (TRI_vector_string_t*) TRI_Allocate(TRI_UNKNOWN_MEM_ZONE, sizeof(TRI_vector_string_t), false);

  if (words == NULL) {
    TRI_Free(TRI_UNKNOWN_MEM_ZONE, textUtf16);
    TRI_Free(TRI_UNKNOWN_MEM_ZONE, tempUtf16);
    return NULL;
  }

  // estimate an initial vector size. this is not accurate, but setting the initial size to some
  // value in the correct order of magnitude will save a lot of vector reallocations later
  size_t initialWordCount = textLength / (2 * (minimalLength + 1));

  if (initialWordCount < 32) {
    // alloc at least 32 pointers (= 256b)
    initialWordCount = 32;
  }
  else if (initialWordCount > 8192) {
    // alloc at most 8192 pointers (= 64kb)
    initialWordCount = 8192;
  }

  TRI_InitVectorString2(words, TRI_UNKNOWN_MEM_ZONE, initialWordCount);

  BreakIterator* wordIterator = BreakIterator::createWordInstance(locale, status);

  // the factory reports failure via status; without this check a failed
  // creation would be dereferenced by setText() below
  if (U_FAILURE(status) || wordIterator == NULL) {
    delete wordIterator;
    TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
    TRI_Free(TRI_UNKNOWN_MEM_ZONE, textUtf16);
    TRI_Free(TRI_UNKNOWN_MEM_ZONE, tempUtf16);
    LOG_ERROR("error in BreakIterator::createWordInstance(...): %s", u_errorName(status));
    return NULL;
  }

  // utext must stay alive while wordIterator is in use
  UnicodeString utext(textUtf16);
  wordIterator->setText(utext);

  int32_t start = wordIterator->first();

  for (int32_t end = wordIterator->next();
       end != BreakIterator::DONE;
       start = end, end = wordIterator->next()) {

    size_t tempUtf16Length = (size_t) (end - start);

    // end - start = word length
    if (tempUtf16Length >= minimalLength) {
      size_t chunkLength = tempUtf16Length;

      if (chunkLength > maximalLength) {
        chunkLength = maximalLength;
      }

      utext.extractBetween(start, (int32_t) (start + chunkLength), tempUtf16, 0);

      size_t utf8WordLength;
      char* utf8Word = TRI_UCharToUtf8(TRI_UNKNOWN_MEM_ZONE, tempUtf16, chunkLength, &utf8WordLength);

      if (utf8Word != 0) {
        TRI_PushBackVectorString(words, utf8Word);
      }
    }
  }

  delete wordIterator;

  TRI_Free(TRI_UNKNOWN_MEM_ZONE, textUtf16);
  TRI_Free(TRI_UNKNOWN_MEM_ZONE, tempUtf16);

  if (words->_length == 0) {
    // no words found
    TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
    return NULL;
  }

  return words;
}
// // TestRuleStatus // Test word break rule status constants. // void RBBIAPITest::TestRuleStatus() { UChar str[30]; //no longer test Han or hiragana breaking here: ruleStatusVec would return nothing // changed UBRK_WORD_KANA to UBRK_WORD_IDEO u_unescape("plain word 123.45 \\u30a1\\u30a2 ", // 012345678901234567 8 9 0 // Katakana str, 30); UnicodeString testString1(str); int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 20, 21}; int32_t tag_lo[] = {UBRK_WORD_NONE, UBRK_WORD_LETTER, UBRK_WORD_NONE, UBRK_WORD_LETTER, UBRK_WORD_NONE, UBRK_WORD_NUMBER, UBRK_WORD_NONE, UBRK_WORD_IDEO, UBRK_WORD_NONE}; int32_t tag_hi[] = {UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_NUMBER_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_IDEO_LIMIT, UBRK_WORD_NONE_LIMIT}; UErrorCode status=U_ZERO_ERROR; BreakIterator *bi = BreakIterator::createWordInstance(Locale::getEnglish(), status); if(U_FAILURE(status)) { errcheckln(status, "Fail : in construction - %s", u_errorName(status)); } else { bi->setText(testString1); // First test that the breaks are in the right spots. doBoundaryTest(*bi, testString1, bounds1); // Then go back and check tag values int32_t i = 0; int32_t pos, tag; for (pos = bi->first(); pos != BreakIterator::DONE; pos = bi->next(), i++) { if (pos != bounds1[i]) { errln("FAIL: unexpected word break at postion %d", pos); break; } tag = bi->getRuleStatus(); if (tag < tag_lo[i] || tag >= tag_hi[i]) { errln("FAIL: incorrect tag value %d at position %d", tag, pos); break; } // Check that we get the same tag values from getRuleStatusVec() int32_t vec[10]; int t = bi->getRuleStatusVec(vec, 10, status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(t==1); TEST_ASSERT(vec[0] == tag); } } delete bi; // Now test line break status. This test mostly is to confirm that the status constants // are correctly declared in the header. testString1 = "test line. 
\n"; // break type s s h bi = BreakIterator::createLineInstance(Locale::getEnglish(), status); if(U_FAILURE(status)) { errcheckln(status, "failed to create word break iterator. - %s", u_errorName(status)); } else { int32_t i = 0; int32_t pos, tag; UBool success; bi->setText(testString1); pos = bi->current(); tag = bi->getRuleStatus(); for (i=0; i<3; i++) { switch (i) { case 0: success = pos==0 && tag==UBRK_LINE_SOFT; break; case 1: success = pos==5 && tag==UBRK_LINE_SOFT; break; case 2: success = pos==12 && tag==UBRK_LINE_HARD; break; default: success = FALSE; break; } if (success == FALSE) { errln("Fail: incorrect word break status or position. i=%d, pos=%d, tag=%d", i, pos, tag); break; } pos = bi->next(); tag = bi->getRuleStatus(); } if (UBRK_LINE_SOFT >= UBRK_LINE_SOFT_LIMIT || UBRK_LINE_HARD >= UBRK_LINE_HARD_LIMIT || (UBRK_LINE_HARD > UBRK_LINE_SOFT && UBRK_LINE_HARD < UBRK_LINE_SOFT_LIMIT)) { errln("UBRK_LINE_* constants from header are inconsistent."); } } delete bi; }
// In the Unicode string characters are always stored in logical order. // This makes line breaking easy. One word is added to the current line at a time. Once the line is too long // we either go back one step or inset the line break at the current position (depending on "wrap_before" setting). // At the end everything that is left over is added as the final line. void text_layout::break_line(text_line & line, double wrap_width, unsigned text_ratio, bool wrap_before) { shape_text(line); if (!wrap_width || line.width() < wrap_width) { add_line(line); return; } if (text_ratio) { double wrap_at; double string_width = line.width(); double string_height = line.line_height(); for (double i = 1.0; ((wrap_at = string_width/i)/(string_height*i)) > text_ratio && (string_width/i) > wrap_width; i += 1.0) ; wrap_width = wrap_at; } mapnik::value_unicode_string const& text = itemizer_.text(); Locale locale; // TODO: Is the default constructor correct? UErrorCode status = U_ZERO_ERROR; BreakIterator *breakitr = BreakIterator::createLineInstance(locale, status); // Not breaking the text if an error occurs is probably the best thing we can do. // https://github.com/mapnik/mapnik/issues/2072 if (!U_SUCCESS(status)) { add_line(line); MAPNIK_LOG_ERROR(text_layout) << " could not create BreakIterator: " << u_errorName(status); return; } breakitr->setText(text); double current_line_length = 0; int last_break_position = static_cast<int>(line.first_char()); for (unsigned i=line.first_char(); i < line.last_char(); ++i) { // TODO: character_spacing std::map<unsigned, double>::const_iterator width_itr = width_map_.find(i); if (width_itr != width_map_.end()) { current_line_length += width_itr->second; } if (current_line_length <= wrap_width) continue; int break_position = wrap_before ? breakitr->preceding(i) : breakitr->following(i); // following() returns a break position after the last word. So DONE should only be returned // when calling preceding. 
if (break_position <= last_break_position || break_position == static_cast<int>(BreakIterator::DONE)) { // A single word is longer than the maximum line width. // Violate line width requirement and choose next break position break_position = breakitr->following(i); if (break_position == static_cast<int>(BreakIterator::DONE)) { break_position = line.last_char(); MAPNIK_LOG_ERROR(text_layout) << "Unexpected result in break_line. Trying to recover...\n"; } } // Break iterator operates on the whole string, while we only look at one line. So we need to // clamp break values. if (break_position < static_cast<int>(line.first_char())) { break_position = line.first_char(); } if (break_position > static_cast<int>(line.last_char())) { break_position = line.last_char(); } text_line new_line(last_break_position, break_position); clear_cluster_widths(last_break_position, break_position); shape_text(new_line); add_line(new_line); last_break_position = break_position; i = break_position - 1; current_line_length = 0; } if (last_break_position == static_cast<int>(line.first_char())) { // No line breaks => no reshaping required add_line(line); } else if (last_break_position != static_cast<int>(line.last_char())) { text_line new_line(last_break_position, line.last_char()); clear_cluster_widths(last_break_position, line.last_char()); shape_text(new_line); add_line(new_line); } }
/** Word wrap text
 *
 * For every string in str, the possible break positions are found with an
 * ICU line break iterator (optionally restricted to positions preceded by
 * a white space), the width of every resulting "word" is measured, and the
 * line breaks are chosen by the greedy (cost_exponent <= 0) or the dynamic
 * algorithm. An NA in str, prefix or initial yields an NA result; invalid
 * UTF-8 or a newline character in the input raises an error.
 *
 * @param str character vector
 * @param width single integer
 * @param cost_exponent single double
 * @param indent single integer
 * @param exdent single integer
 * @param prefix single string
 * @param initial single string
 * @param whitespace_only single logical value
 * @param use_length single logical value
 * @param locale locale identifier or NULL for default locale
 *
 * @return list
 *
 * @version 0.1-?? (Bartek Tartanus)
 *
 * @version 0.2-2 (Marek Gagolewski, 2014-04-27)
 *          single function for wrap_greedy and wrap_dynamic
 *          (dispatch inside);
 *          use BreakIterator
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-06)
 *    new args: indent, exdent, prefix, initial
 *
 * @version 0.5-1 (Marek Gagolewski, 2014-12-19)
 *    #133 allow width <= 0
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-02-28)
 *    don't trim so many white spaces at the end of each word (normalize arg does that)
 *    #139: allow a "whitespace" break iterator
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-04-23)
 *    `use_length` arg added
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-06-09)
 *    BIGSKIP: no more CHARSXP on out on "" input
 */
SEXP stri_wrap(SEXP str, SEXP width, SEXP cost_exponent,
   SEXP indent, SEXP exdent, SEXP prefix, SEXP initial, SEXP whitespace_only,
   SEXP use_length, SEXP locale)
{
   bool use_length_val      = stri__prepare_arg_logical_1_notNA(use_length, "use_length");
   double exponent_val      = stri__prepare_arg_double_1_notNA(cost_exponent, "cost_exponent");
   bool whitespace_only_val = stri__prepare_arg_logical_1_notNA(whitespace_only, "whitespace_only");

   int width_val = stri__prepare_arg_integer_1_notNA(width, "width");
   if (width_val <= 0) width_val = 0;

   int indent_val = stri__prepare_arg_integer_1_notNA(indent, "indent");
   if (indent_val < 0) Rf_error(MSG__EXPECTED_POSITIVE, "indent");

   int exdent_val = stri__prepare_arg_integer_1_notNA(exdent, "exdent");
   if (exdent_val < 0) Rf_error(MSG__EXPECTED_POSITIVE, "exdent");

   const char* qloc = stri__prepare_arg_locale(locale, "locale", true); /* this is R_alloc'ed */
   Locale loc = Locale::createFromName(qloc);

   PROTECT(str     = stri_prepare_arg_string(str, "str"));
   PROTECT(prefix  = stri_prepare_arg_string_1(prefix, "prefix"));
   PROTECT(initial = stri_prepare_arg_string_1(initial, "initial"));

   // declared outside of the error handler so that its cleanup code sees them
   BreakIterator* briter = NULL;
   UText* str_text = NULL;

   STRI__ERROR_HANDLER_BEGIN(3)
   UErrorCode status = U_ZERO_ERROR;
   briter = BreakIterator::createLineInstance(loc, status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

   R_len_t str_length = LENGTH(str);
   StriContainerUTF8_indexable str_cont(str, str_length);
   StriContainerUTF8 prefix_cont(prefix, 1);
   StriContainerUTF8 initial_cont(initial, 1);

   // prepare indent/exdent/prefix/initial stuff:
   // 1st line, 1st para (i==0, u==0): initial+indent
   // nth line, 1st para (i==0, u> 0): prefix +exdent
   // 1st line, nth para (i> 0, u==0): prefix +indent
   // nth line, nth para (i> 0, u> 0): prefix +exdent
   StriWrapLineStart ii(initial_cont.get(0), indent_val);
   StriWrapLineStart pi(prefix_cont.get(0), indent_val);
   StriWrapLineStart pe(prefix_cont.get(0), exdent_val);

   status = U_ZERO_ERROR;
   //Unicode Newline Guidelines - Unicode Technical Report #13
   UnicodeSet uset_linebreaks(UnicodeString::fromUTF8("[\\u000A-\\u000D\\u0085\\u2028\\u2029]"), status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
   uset_linebreaks.freeze();

   status = U_ZERO_ERROR;
   UnicodeSet uset_whitespaces(UnicodeString::fromUTF8("\\p{White_space}"), status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
   uset_whitespaces.freeze();

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(VECSXP, str_length));
   for (R_len_t i = 0; i < str_length; ++i)
   {
      if (str_cont.isNA(i) || prefix_cont.isNA(0) || initial_cont.isNA(0)) {
         SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1));
         continue;
      }

      status = U_ZERO_ERROR;
      const char* str_cur_s = str_cont.get(i).c_str();
      R_len_t str_cur_n = str_cont.get(i).length();
      str_text = utext_openUTF8(str_text, str_cur_s, str_cont.get(i).length(), &status);
      STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

      status = U_ZERO_ERROR;
      briter->setText(str_text, status);
      STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

      // all right, first let's generate a list of places at which we may do line breaks
      deque< R_len_t > occurrences_list; // this could be an R_len_t queue
      for (R_len_t match = briter->first();
           match != BreakIterator::DONE;
           match = briter->next())
      {
         // in whitespace-only mode an inner boundary is kept only if the
         // code unit sequence ending right before it is a white space;
         // the boundaries at both ends of the string are always kept
         bool keep = true;
         if (whitespace_only_val && match > 0 && match < str_cur_n) {
            UChar32 c;
            U8_GET((const uint8_t*)str_cur_s, 0, match-1, str_cur_n, c);
            keep = uset_whitespaces.contains(c);
         }
         if (keep)
            occurrences_list.push_back(match);
      }

      R_len_t noccurrences = (R_len_t)occurrences_list.size(); // number of boundaries
      if (noccurrences <= 1) { // no match (1 boundary == 0)
         SET_VECTOR_ELT(ret, i, Rf_ScalarString(str_cont.toR(i)));
         continue;
      }

      // the number of "words" is:
      R_len_t nwords = noccurrences - 1;

      // convert occurrences_list to a vector
      // in order to obtain end positions (in a string) of each "words",
      // noting that occurrences_list.at(0) == 0
#ifndef NDEBUG
      if (occurrences_list.at(0) != 0)
         throw StriException("NDEBUG: stri_wrap: (occurrences_list.at(0) != 0)");
#endif

      std::vector<R_len_t> end_pos_orig(nwords);
      for (R_len_t k = 0; k < nwords; ++k) {
         end_pos_orig[k] = occurrences_list[k+1]; // this is a UTF-8 index
      }

      // now:
      // we'll get the total widths/number of code points in each "word"
      std::vector<R_len_t> widths_orig(nwords);
      // we'll get the total widths/number of code points without trailing whitespaces
      std::vector<R_len_t> widths_trim(nwords);
      // we'll get the end positions without trailing whitespaces
      std::vector<R_len_t> end_pos_trim(nwords);
      // detect line endings (fail on a match)
      UChar32 c = 0;
      R_len_t j = 0;
      R_len_t cur_block = 0;
      R_len_t cur_width_orig = 0;
      R_len_t cur_width_trim = 0;
      R_len_t cur_count_orig = 0;
      R_len_t cur_count_trim = 0;
      R_len_t cur_end_pos_trim = 0;
      while (j < str_cur_n) {
         R_len_t jlast = j;
         U8_NEXT(str_cur_s, j, str_cur_n, c);
         if (c < 0) // invalid utf-8 sequence
            throw StriException(MSG__INVALID_UTF8);

         if (uset_linebreaks.contains(c))
            throw StriException(MSG__NEWLINE_FOUND);

         cur_width_orig += stri__width_char(c);
         ++cur_count_orig;
         if (uset_whitespaces.contains(c)) {
            // OLD: trim all white spaces from the end:
            // ++cur_count_trim;
            // [we have the normalize arg for that]

            // NEW: trim just one white space at the end:
            cur_width_trim = stri__width_char(c);
            cur_count_trim = 1;
            cur_end_pos_trim = jlast;
         }
         else {
            cur_width_trim = 0;
            cur_count_trim = 0;
            cur_end_pos_trim = j;
         }

         const bool block_ends_here = (j >= str_cur_n || end_pos_orig[cur_block] <= j);
         if (!block_ends_here)
            continue;

         // we'll start a new block in a moment
         if (use_length_val) {
            widths_orig[cur_block] = cur_count_orig;
            widths_trim[cur_block] = cur_count_orig-cur_count_trim;
         }
         else {
            widths_orig[cur_block] = cur_width_orig;
            widths_trim[cur_block] = cur_width_orig-cur_width_trim;
         }
         end_pos_trim[cur_block] = cur_end_pos_trim;
         cur_block++;
         cur_width_orig = 0;
         cur_width_trim = 0;
         cur_count_orig = 0;
         cur_count_trim = 0;
         cur_end_pos_trim = j;
      }

      // the first line of the first string starts with `initial`,
      // the first line of every other string with `prefix`
      const StriWrapLineStart& first_line = (i == 0) ? ii : pi;

      // do wrap
      std::deque<R_len_t> wrap_after; // wrap line after which word in {0..nwords-1}?
      if (exponent_val <= 0.0) {
         stri__wrap_greedy(wrap_after, nwords, width_val,
            widths_orig, widths_trim,
            (use_length_val)?first_line.count:first_line.width,
            (use_length_val)?pe.count:pe.width);
      }
      else {
         stri__wrap_dynamic(wrap_after, nwords, width_val, exponent_val,
            widths_orig, widths_trim,
            (use_length_val)?first_line.count:first_line.width,
            (use_length_val)?pe.count:pe.width);
      }

      // wrap_after.size() line breaks => wrap_after.size()+1 lines
      R_len_t nlines = (R_len_t)wrap_after.size()+1;
      R_len_t last_pos = 0;
      SEXP ans;
      STRI__PROTECT(ans = Rf_allocVector(STRSXP, nlines));

      for (R_len_t u = 0; u < nlines; ++u) {
         // every line but the last one ends after the word given by
         // wrap_after; the last line ends after the last word
         const bool is_last_line = (u == nlines-1);
         R_len_t cur_pos = is_last_line
            ? end_pos_trim[nwords-1]
            : end_pos_trim[wrap_after[u]];

         std::string cs;
         if (u == 0) cs = first_line.str;
         else        cs = pe.str;
         cs.append(str_cur_s+last_pos, cur_pos-last_pos);
         SET_STRING_ELT(ans, u, Rf_mkCharLenCE(cs.c_str(), cs.size(), CE_UTF8));

         if (!is_last_line)
            last_pos = end_pos_orig[wrap_after[u]];
      }

      SET_VECTOR_ELT(ret, i, ans);
      STRI__UNPROTECT(1);
   }

   if (briter) { delete briter; briter = NULL; }
   if (str_text) { utext_close(str_text); str_text = NULL; }
   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END({
      if (briter) { delete briter; briter = NULL; }
      if (str_text) { utext_close(str_text); str_text = NULL; }
   })
}