/**
 * Split a string into parts [byte compare]
 *
 * The pattern matches identify delimiters that separate the input into fields.
 * The input data between the matches becomes the fields themselves.
 *
 * @param str character vector
 * @param pattern character vector
 * @param n_max integer vector
 * @param omit_empty logical vector
 *
 *
 * @version 0.1 (Bartek Tartanus)
 * @version 0.2 (Marek Gagolewski, 2013-06-25) StriException friendly, use StriContainerUTF8
 * @version 0.3 (Marek Gagolewski, 2013-07-10) - BUGFIX: wrong behavior on empty str
 */
SEXP stri__split_fixed_byte(SEXP str, SEXP pattern, SEXP n_max, SEXP omit_empty)
{
    str = stri_prepare_arg_string(str, "str");
    pattern = stri_prepare_arg_string(pattern, "pattern");
    n_max = stri_prepare_arg_integer(n_max, "n_max");
    omit_empty = stri_prepare_arg_logical(omit_empty, "omit_empty");

    STRI__ERROR_HANDLER_BEGIN
    R_len_t vectorize_length = stri__recycling_rule(true, 4, LENGTH(str), LENGTH(pattern), LENGTH(n_max), LENGTH(omit_empty));
    StriContainerUTF8 str_cont(str, vectorize_length);
    StriContainerByteSearch pattern_cont(pattern, vectorize_length);
    StriContainerInteger   n_max_cont(n_max, vectorize_length);
    StriContainerLogical   omit_empty_cont(omit_empty, vectorize_length);

    SEXP ret;
    PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length));

    for (R_len_t i = pattern_cont.vectorize_init();
            i != pattern_cont.vectorize_end();
            i = pattern_cont.vectorize_next(i))
    {
        if (n_max_cont.isNA(i) || omit_empty_cont.isNA(i)) {
            SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1));
            continue;
        }
        int  n_max_cur        = n_max_cont.get(i);
        int  omit_empty_cur   = omit_empty_cont.get(i);

        STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont,
                SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1)); ,
                SET_VECTOR_ELT(ret, i, stri__vector_empty_strings((omit_empty_cur || n_max_cur == 0)?0:1));)
/**
 * Split a string into parts.
 *
 * The pattern matches identify delimiters that separate the input into fields.
 * The input data between the matches becomes the fields themselves.
 *
 * @param str character vector
 * @param pattern character vector
 * @param n_max integer vector
 * @param opts_regex
 * @return list of character vectors
 *
 * @version 0.1 (Marek Gagolewski, 2013-06-21)
 * @version 0.2 (Marek Gagolewski, 2013-07-10) - BUGFIX: wrong behavior on empty str
 */
SEXP stri_split_regex(SEXP str, SEXP pattern, SEXP n_max, SEXP omit_empty, SEXP opts_regex)
{
   str = stri_prepare_arg_string(str, "str");
   pattern = stri_prepare_arg_string(pattern, "pattern");
   n_max = stri_prepare_arg_integer(n_max, "n_max");
   omit_empty = stri_prepare_arg_logical(omit_empty, "omit_empty");
   R_len_t vectorize_length = stri__recycling_rule(true, 4, LENGTH(str), LENGTH(pattern), LENGTH(n_max), LENGTH(omit_empty));

   uint32_t pattern_flags = StriContainerRegexPattern::getRegexFlags(opts_regex);

   UText* str_text = NULL; // may potentially be slower, but definitely is more convenient!
   STRI__ERROR_HANDLER_BEGIN
   StriContainerUTF8      str_cont(str, vectorize_length);
   StriContainerInteger   n_max_cont(n_max, vectorize_length);
   StriContainerLogical   omit_empty_cont(omit_empty, vectorize_length);
   StriContainerRegexPattern pattern_cont(pattern, vectorize_length, pattern_flags);

   SEXP ret;
   PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length));

   for (R_len_t i = pattern_cont.vectorize_init();
         i != pattern_cont.vectorize_end();
         i = pattern_cont.vectorize_next(i))
   {
      if (n_max_cont.isNA(i) || omit_empty_cont.isNA(i)) {
         SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1));
         continue;
      }

      int  n_max_cur        = n_max_cont.get(i);
      int  omit_empty_cur   = omit_empty_cont.get(i);

      STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont,
         SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1));,
         SET_VECTOR_ELT(ret, i, stri__vector_empty_strings((omit_empty_cur || n_max_cur == 0)?0:1));)
/**
 * Split a string into text lines
 *
 * @param str character vector
 * @param n_max integer vector
 * @param omit_empty logical vector
 *
 * @return list of character vectors
 *
 * @version 0.1 (Marek Gagolewski, 2013-08-04)
 */
SEXP stri_split_lines(SEXP str, SEXP n_max, SEXP omit_empty)
{
   str = stri_prepare_arg_string(str, "str");
   n_max = stri_prepare_arg_integer(n_max, "n_max");
   omit_empty = stri_prepare_arg_logical(omit_empty, "omit_empty");
   R_len_t vectorize_length = stri__recycling_rule(true, 3, LENGTH(str), LENGTH(n_max), LENGTH(omit_empty));

   STRI__ERROR_HANDLER_BEGIN
   StriContainerUTF8 str_cont(str, vectorize_length);
   StriContainerInteger   n_max_cont(n_max, vectorize_length);
   StriContainerLogical   omit_empty_cont(omit_empty, vectorize_length);

   SEXP ret;
   PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length));

   for (R_len_t i = str_cont.vectorize_init();
         i != str_cont.vectorize_end();
         i = str_cont.vectorize_next(i))
   {
      if (str_cont.isNA(i)) {
         SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1));
         continue;
      }

      const char* str_cur_s = str_cont.get(i).c_str();
      R_len_t str_cur_n = str_cont.get(i).length();
      int  n_max_cur        = n_max_cont.get(i);
      int  omit_empty_cur   = omit_empty_cont.get(i);

      if (n_max_cur < 0)
         n_max_cur = INT_MAX;
      else if (n_max_cur == 0) {
         SET_VECTOR_ELT(ret, i, Rf_allocVector(STRSXP, 0));
         continue;
      }

//#define STRI_INDEX_NEWLINE_CR   0
//#define STRI_INDEX_NEWLINE_LF   1
//#define STRI_INDEX_NEWLINE_CRLF 2
//#define STRI_INDEX_NEWLINE_NEL  3
//#define STRI_INDEX_NEWLINE_VT   4
//#define STRI_INDEX_NEWLINE_FF   5
//#define STRI_INDEX_NEWLINE_LS   6
//#define STRI_INDEX_NEWLINE_PS   7
//#define STRI_INDEX_NEWLINE_LAST 8

//      int counts[STRI_INDEX_NEWLINE_LAST];
//      for (R_len_t j=0; j<STRI_INDEX_NEWLINE_LAST; ++j)
//         counts[j] = 0;

      UChar32 c;
      R_len_t jlast, k=1;
      deque<R_len_t_x2> occurences;
      occurences.push_back(R_len_t_x2(0, 0));
      for (R_len_t j=0; j < str_cur_n && k < n_max_cur; /* null */) {
         jlast = j;
         U8_NEXT(str_cur_s, j, str_cur_n, c);

         switch (c) {
            case ASCII_CR: /* CR */
//               counts[STRI_INDEX_NEWLINE_CR]++;
               /* check if next is LF */
               if (str_cur_s[j] == ASCII_LF) { // look ahead one byte
//                  counts[STRI_INDEX_NEWLINE_LF]++;
//                  counts[STRI_INDEX_NEWLINE_CRLF]++;
                  j++; // just one byte
               }
               break;

            case ASCII_LF: /* LF */
//               counts[STRI_INDEX_NEWLINE_LF]++;
               break;

            case UCHAR_NEL: /* NEL */
//               counts[STRI_INDEX_NEWLINE_NEL]++;
               break;

            case ASCII_VT: /* VT */
//               counts[STRI_INDEX_NEWLINE_VT]++;
               break;

            case ASCII_FF: /* FF */
//               counts[STRI_INDEX_NEWLINE_FF]++;
               break;

            case UCHAR_LS: /* LS */
//               counts[STRI_INDEX_NEWLINE_LS]++;
               break;

            case UCHAR_PS: /* PS */
//               counts[STRI_INDEX_NEWLINE_PS]++;
               break;

            default:
               /* not a newline character */
               occurences.back().v2 = j;
               continue;
         }

         // if here, then at newline
         if (omit_empty_cur && occurences.back().v2 == occurences.back().v1)
            occurences.back().v1 = occurences.back().v2 = j; // don't start new field
         else {
            occurences.back().v2 = jlast;
            occurences.push_back(R_len_t_x2(j, j));
            ++k; // another field
         }
      }

      if (k == n_max_cur)
         occurences.back().v2 = str_cur_n;
      if (omit_empty_cur && occurences.back().v1 == occurences.back().v2)
         occurences.pop_back();

      SEXP ans;
      PROTECT(ans = Rf_allocVector(STRSXP, (R_len_t)occurences.size()));

      deque<R_len_t_x2>::iterator iter = occurences.begin();
      for (R_len_t l = 0; iter != occurences.end(); ++iter, ++l) {
         R_len_t_x2 curoccur = *iter;
         SET_STRING_ELT(ans, l, Rf_mkCharLenCE(str_cur_s+curoccur.v1, curoccur.v2-curoccur.v1, CE_UTF8));
      }

      SET_VECTOR_ELT(ret, i, ans);
      UNPROTECT(1);
   }

   UNPROTECT(1);
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/**
 * Extract all occurences of a character class in each string
 *
 * @param str character vector
 * @param pattern character vector
 * @return list of character vectors
 *
 * @version 0.1 (Marek Gagolewski, 2013-06-08)
 * @version 0.2 (Marek Gagolewski, 2013-06-15) Use StrContainerCharClass
 * @version 0.3 (Marek Gagolewski, 2013-06-16) make StriException-friendly
 */
SEXP stri_extract_all_charclass(SEXP str, SEXP pattern, SEXP merge)
{
   str = stri_prepare_arg_string(str, "str");
   pattern = stri_prepare_arg_string(pattern, "pattern");
   merge = stri_prepare_arg_logical(merge, "merge");
   R_len_t vectorize_length = stri__recycling_rule(true, 3, LENGTH(str), LENGTH(pattern), LENGTH(merge));

   STRI__ERROR_HANDLER_BEGIN
   StriContainerUTF8 str_cont(str, vectorize_length);
   StriContainerCharClass pattern_cont(pattern, vectorize_length);
   StriContainerLogical merge_cont(merge, vectorize_length);

   SEXP notfound; // this vector will be set iff not found or NA
   PROTECT(notfound = stri__vector_NA_strings(1));

   SEXP ret;
   PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length));

   for (R_len_t i = pattern_cont.vectorize_init();
         i != pattern_cont.vectorize_end();
         i = pattern_cont.vectorize_next(i))
   {
      if (pattern_cont.isNA(i) || str_cont.isNA(i) || merge_cont.isNA(i)) {
         SET_VECTOR_ELT(ret, i, notfound);
         continue;
      }


      bool merge_cur = merge_cont.get(i);
      CharClass pattern_cur = pattern_cont.get(i);
      R_len_t     str_cur_n = str_cont.get(i).length();
      const char* str_cur_s = str_cont.get(i).c_str();
      R_len_t j, jlast;
      UChar32 chr;
      deque<R_len_t_x2> occurences; // codepoint based-indices

      for (jlast=j=0; j<str_cur_n; ) {
         U8_NEXT(str_cur_s, j, str_cur_n, chr);
         if (pattern_cur.test(chr)) {
            occurences.push_back(R_len_t_x2(jlast, j));
         }
         jlast = j;
      }

      R_len_t noccurences = (R_len_t)occurences.size();
      if (noccurences == 0)
         SET_VECTOR_ELT(ret, i, notfound);
      else if (merge_cur && noccurences > 1) {
         // do merge
         deque<R_len_t_x2> occurences2;
         deque<R_len_t_x2>::iterator iter = occurences.begin();
         occurences2.push_back(*iter);
         for (++iter; iter != occurences.end(); ++iter) {
            R_len_t_x2 curoccur = *iter;
            if (occurences2.back().v2 == curoccur.v1) { // continue seq
               occurences2.back().v2 = curoccur.v2;  // change `end`
            }
            else { // new seq
               occurences2.push_back(curoccur);
            }
         }

         // create resulting matrix from occurences2
         R_len_t noccurences2 = (R_len_t)occurences2.size();
         SEXP cur_res;
         PROTECT(cur_res = Rf_allocVector(STRSXP, noccurences2));
         iter = occurences2.begin();
         for (R_len_t f = 0; iter != occurences2.end(); ++iter, ++f) {
            R_len_t_x2 curo = *iter;
            SET_STRING_ELT(cur_res, f, Rf_mkCharLenCE(str_cur_s+curo.v1, curo.v2-curo.v1, CE_UTF8));
         }
         SET_VECTOR_ELT(ret, i, cur_res);
         UNPROTECT(1);
      }
      else {
         // do not merge
         SEXP cur_res;
         PROTECT(cur_res = Rf_allocVector(STRSXP, noccurences));
         deque<R_len_t_x2>::iterator iter = occurences.begin();
         for (R_len_t f = 0; iter != occurences.end(); ++iter, ++f) {
            R_len_t_x2 curo = *iter;
            SET_STRING_ELT(cur_res, f, Rf_mkCharLenCE(str_cur_s+curo.v1, curo.v2-curo.v1, CE_UTF8));
         }
         SET_VECTOR_ELT(ret, i, cur_res);
         UNPROTECT(1);
      }
   }

   UNPROTECT(2);
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
Example #5
0
/** Word wrap text
 *
 * @param str character vector
 * @param width single integer
 * @param cost_exponent single double
 * @param indent single integer
 * @param exdent single integer
 * @param prefix single string
 * @param initial single string
 * @param locale locale identifier or NULL for default locale
 * @param use_length single logical value
 *
 * @return list
 *
 * @version 0.1-?? (Bartek Tartanus)
 *
 * @version 0.2-2 (Marek Gagolewski, 2014-04-27)
 *          single function for wrap_greedy and wrap_dynamic
 *          (dispatch inside);
 *          use BreakIterator
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-06)
 *    new args: indent, exdent, prefix, initial
 *
 * @version 0.5-1 (Marek Gagolewski, 2014-12-19)
 *    #133 allow width <= 0
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-02-28)
 *    don't trim so many white spaces at the end of each word (normalize arg does that)
 *    #139: allow a "whitespace" break iterator
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-04-23)
 *    `use_length` arg added
 *
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-06-09)
 *    BIGSKIP: no more CHARSXP on out on "" input
 */
SEXP stri_wrap(SEXP str, SEXP width, SEXP cost_exponent,
   SEXP indent, SEXP exdent, SEXP prefix, SEXP initial, SEXP whitespace_only,
   SEXP use_length, SEXP locale)
{
   bool use_length_val      = stri__prepare_arg_logical_1_notNA(use_length, "use_length");
   double exponent_val      = stri__prepare_arg_double_1_notNA(cost_exponent, "cost_exponent");
   bool whitespace_only_val = stri__prepare_arg_logical_1_notNA(whitespace_only, "whitespace_only");

   int width_val = stri__prepare_arg_integer_1_notNA(width, "width");
   if (width_val <= 0) width_val = 0;

   int indent_val = stri__prepare_arg_integer_1_notNA(indent, "indent");
   if (indent_val < 0) Rf_error(MSG__EXPECTED_POSITIVE, "indent");

   int exdent_val = stri__prepare_arg_integer_1_notNA(exdent, "exdent");
   if (exdent_val < 0) Rf_error(MSG__EXPECTED_POSITIVE, "exdent");


   const char* qloc = stri__prepare_arg_locale(locale, "locale", true); /* this is R_alloc'ed */
   Locale loc = Locale::createFromName(qloc);
   PROTECT(str     = stri_prepare_arg_string(str, "str"));
   PROTECT(prefix  = stri_prepare_arg_string_1(prefix, "prefix"));
   PROTECT(initial = stri_prepare_arg_string_1(initial, "initial"));

   BreakIterator* briter = NULL;
   UText* str_text = NULL;

   STRI__ERROR_HANDLER_BEGIN(3)
   UErrorCode status = U_ZERO_ERROR;
   briter = BreakIterator::createLineInstance(loc, status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

   R_len_t str_length = LENGTH(str);
   StriContainerUTF8_indexable str_cont(str, str_length);
   StriContainerUTF8 prefix_cont(prefix, 1);
   StriContainerUTF8 initial_cont(initial, 1);


   // prepare indent/exdent/prefix/initial stuff:
   // 1st line, 1st para (i==0, u==0): initial+indent
   // nth line, 1st para (i==0, u> 0): prefix +exdent
   // 1st line, nth para (i> 0, u==0): prefix +indent
   // nth line, nth para (i> 0, u> 0): prefix +exdent
   StriWrapLineStart ii(initial_cont.get(0), indent_val);
   StriWrapLineStart pi(prefix_cont.get(0), indent_val);
   StriWrapLineStart pe(prefix_cont.get(0), exdent_val);


   status = U_ZERO_ERROR;
   //Unicode Newline Guidelines - Unicode Technical Report #13
   UnicodeSet uset_linebreaks(UnicodeString::fromUTF8("[\\u000A-\\u000D\\u0085\\u2028\\u2029]"), status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
   uset_linebreaks.freeze();

   status = U_ZERO_ERROR;
   UnicodeSet uset_whitespaces(UnicodeString::fromUTF8("\\p{White_space}"), status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
   uset_whitespaces.freeze();

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(VECSXP, str_length));
   for (R_len_t i = 0; i < str_length; ++i)
   {
      if (str_cont.isNA(i) || prefix_cont.isNA(0) || initial_cont.isNA(0)) {
         SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1));
         continue;
      }

      status = U_ZERO_ERROR;
      const char* str_cur_s = str_cont.get(i).c_str();
      R_len_t str_cur_n = str_cont.get(i).length();
      str_text = utext_openUTF8(str_text, str_cur_s, str_cont.get(i).length(), &status);
      STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

      status = U_ZERO_ERROR;
      briter->setText(str_text, status);
      STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

      // all right, first let's generate a list of places at which we may do line breaks
      deque< R_len_t > occurrences_list; // this could be an R_len_t queue
      R_len_t match = briter->first();
      while (match != BreakIterator::DONE) {

         if (!whitespace_only_val)
            occurrences_list.push_back(match);
         else {
            if (match > 0 && match < str_cur_n) {
               UChar32 c;
               U8_GET((const uint8_t*)str_cur_s, 0, match-1, str_cur_n, c);
               if (uset_whitespaces.contains(c))
                  occurrences_list.push_back(match);
            }
            else
               occurrences_list.push_back(match);
         }

         match = briter->next();
      }

      R_len_t noccurrences = (R_len_t)occurrences_list.size(); // number of boundaries
      if (noccurrences <= 1) { // no match (1 boundary == 0)
         SET_VECTOR_ELT(ret, i, Rf_ScalarString(str_cont.toR(i)));
         continue;
      }

      // the number of "words" is:
      R_len_t nwords = noccurrences - 1;

      // convert occurrences_list to a vector
      // in order to obtain end positions (in a string) of each "words",
      // noting that occurrences_list.at(0) == 0
#ifndef NDEBUG
      if (occurrences_list.at(0) != 0)
         throw StriException("NDEBUG: stri_wrap: (occurrences_list.at(0) != 0)");
#endif

      std::vector<R_len_t> end_pos_orig(nwords);
      deque<R_len_t>::iterator iter = ++(occurrences_list.begin());
      for (R_len_t j = 0; iter != occurrences_list.end(); ++iter, ++j) {
         end_pos_orig[j] = (*iter); // this is a UTF-8 index
      }


      // now:
      // we'll get the total widths/number of code points in each "word"
      std::vector<R_len_t> widths_orig(nwords);
      // we'll get the total widths/number of code points without trailing whitespaces
      std::vector<R_len_t> widths_trim(nwords);
      // we'll get the end positions without trailing whitespaces
      std::vector<R_len_t> end_pos_trim(nwords);
      // detect line endings (fail on a match)

      UChar32 c = 0;
      R_len_t j = 0;
      R_len_t cur_block = 0;
      R_len_t cur_width_orig = 0;
      R_len_t cur_width_trim = 0;
      R_len_t cur_count_orig = 0;
      R_len_t cur_count_trim = 0;
      R_len_t cur_end_pos_trim = 0;
      while (j < str_cur_n) {
         R_len_t jlast = j;
         U8_NEXT(str_cur_s, j, str_cur_n, c);
         if (c < 0) // invalid utf-8 sequence
            throw StriException(MSG__INVALID_UTF8);

         if (uset_linebreaks.contains(c))
            throw StriException(MSG__NEWLINE_FOUND);

         cur_width_orig += stri__width_char(c);
         ++cur_count_orig;
         if (uset_whitespaces.contains(c)) {
// OLD: trim all white spaces from the end:
//            ++cur_count_trim;
//           [we have the normalize arg for that]

// NEW: trim just one white space at the end:
            cur_width_trim = stri__width_char(c);
            cur_count_trim = 1;
            cur_end_pos_trim = jlast;
         }
         else {
            cur_width_trim = 0;
            cur_count_trim = 0;
            cur_end_pos_trim = j;
         }

         if (j >= str_cur_n || end_pos_orig[cur_block] <= j) {
            // we'll start a new block in a moment
            if (use_length_val) {
               widths_orig[cur_block] = cur_count_orig;
               widths_trim[cur_block] = cur_count_orig-cur_count_trim;
            }
            else {
               widths_orig[cur_block] = cur_width_orig;
               widths_trim[cur_block] = cur_width_orig-cur_width_trim;
            }
            end_pos_trim[cur_block] = cur_end_pos_trim;
            cur_block++;
            cur_width_orig = 0;
            cur_width_trim = 0;
            cur_count_orig = 0;
            cur_count_trim = 0;
            cur_end_pos_trim = j;
         }
      }

      // do wrap
      std::deque<R_len_t> wrap_after; // wrap line after which word in {0..nwords-1}?
      if (exponent_val <= 0.0) {
         stri__wrap_greedy(wrap_after, nwords, width_val,
            widths_orig, widths_trim,
               (use_length_val)?((i==0)?ii.count:pi.count):((i==0)?ii.width:pi.width),
               (use_length_val)?pe.count:pe.width);
      }
      else {
         stri__wrap_dynamic(wrap_after, nwords, width_val, exponent_val,
            widths_orig, widths_trim,
               (use_length_val)?((i==0)?ii.count:pi.count):((i==0)?ii.width:pi.width),
               (use_length_val)?pe.count:pe.width);
      }

      // wrap_after.size() line breaks => wrap_after.size()+1 lines
      R_len_t nlines = (R_len_t)wrap_after.size()+1;
      R_len_t last_pos = 0;
      SEXP ans;
      STRI__PROTECT(ans = Rf_allocVector(STRSXP, nlines));
      deque<R_len_t>::iterator iter_wrap = wrap_after.begin();
      for (R_len_t u = 0; iter_wrap != wrap_after.end(); ++iter_wrap, ++u) {
         R_len_t wrap_after_cur = *iter_wrap;
         R_len_t cur_pos = end_pos_trim[wrap_after_cur];

         std::string cs;
         if (i == 0 && u == 0)     cs = ii.str;
         else if (i > 0 && u == 0) cs = pi.str;
         else                      cs = pe.str;
         cs.append(str_cur_s+last_pos, cur_pos-last_pos);
         SET_STRING_ELT(ans, u, Rf_mkCharLenCE(cs.c_str(), cs.size(), CE_UTF8));

         last_pos = end_pos_orig[wrap_after_cur];
      }

      // last line goes here:
      std::string cs;
      if (i == 0 && nlines-1 == 0)     cs = ii.str;
      else if (i > 0 && nlines-1 == 0) cs = pi.str;
      else                             cs = pe.str;
      cs.append(str_cur_s+last_pos, end_pos_trim[nwords-1]-last_pos);
      SET_STRING_ELT(ans, nlines-1, Rf_mkCharLenCE(cs.c_str(), cs.size(), CE_UTF8));

      SET_VECTOR_ELT(ret, i, ans);
      STRI__UNPROTECT(1);
   }

   if (briter) { delete briter; briter = NULL; }
   if (str_text) { utext_close(str_text); str_text = NULL; }
   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END({
      if (briter) { delete briter; briter = NULL; }
      if (str_text) { utext_close(str_text); str_text = NULL; }
   })
}