Beispiel #1
0
/**
 * Get number of bytes in each string
 *
 * Note that ICU permits only strings of length < 2^31.
 * @param s R object coercible to a character vector
 * @return integer vector
 * @version 0.1 (Marcin Bujarski)
 */
SEXP stri_numbytes(SEXP str)
{
   str = stri_prepare_arg_string(str, "str"); // prepare string argument
   R_len_t n = LENGTH(str);
   SEXP ret;
   PROTECT(ret = Rf_allocVector(INTSXP, n));
   int* retint = INTEGER(ret);
   for (R_len_t i=0; i<n; ++i) {
      SEXP curs = STRING_ELT(str, i);
      /* INPUT ENCODING CHECK: this function does not need this. */
      if (curs == NA_STRING)
         retint[i] = NA_INTEGER;
      else
         retint[i] = LENGTH(curs); // O(1) - stored by R
   }
   UNPROTECT(1);
   return ret;
}
/**
 * Get number of bytes in each string
 *
 * Note that ICU permits only strings of length < 2^31.
 *
 * @param s R object coercible to a character vector
 * @return integer vector
 *
 * @version 0.1-?? (Marcin Bujarski)
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-01)
 *          StriException-friendly
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_numbytes(SEXP str)
{
    PROTECT(str = stri_prepare_arg_string(str, "str")); // prepare string argument
    R_len_t str_n = LENGTH(str);

    STRI__ERROR_HANDLER_BEGIN(1)
    SEXP ret;
    STRI__PROTECT(ret = Rf_allocVector(INTSXP, str_n));
    int* retint = LOGICAL(ret);
    for (R_len_t i=0; i<str_n; ++i) {
        SEXP curs = STRING_ELT(str, i);
        /* INPUT ENCODING CHECK: this function does not need this. */
        retint[i] = (curs == NA_STRING)?NA_INTEGER:LENGTH(curs); // O(1) - stored by R
    }
    STRI__UNPROTECT_ALL
    return ret;
    STRI__ERROR_HANDLER_END({ /* no special action on error */ })
}
/** Get Declared Encodings of Each String
 *
 * @param str a character vector or an object coercible to
 * @return a character vector
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-25)
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_enc_mark(SEXP str) {
   PROTECT(str = stri_prepare_arg_string(str, "str"));    // prepare string argument

   STRI__ERROR_HANDLER_BEGIN(1)
   R_len_t str_len = LENGTH(str);

   // some of them will not be used in this call, but we're lazy
   SEXP mark_ascii, mark_latin1, mark_utf8, mark_native, mark_bytes;
   STRI__PROTECT(mark_ascii  = Rf_mkChar("ASCII"));
   STRI__PROTECT(mark_latin1 = Rf_mkChar("latin1"));
   STRI__PROTECT(mark_utf8   = Rf_mkChar("UTF-8"));
   STRI__PROTECT(mark_native = Rf_mkChar("native"));
   STRI__PROTECT(mark_bytes  = Rf_mkChar("bytes"));

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, str_len));

   for (R_len_t i=0; i<str_len; ++i) {
      SEXP curs = STRING_ELT(str, i);
      if (curs == NA_STRING) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      if (IS_ASCII(curs))
         SET_STRING_ELT(ret, i, mark_ascii);
      else if (IS_UTF8(curs))
         SET_STRING_ELT(ret, i, mark_utf8);
      else if (IS_BYTES(curs))
         SET_STRING_ELT(ret, i, mark_bytes);
      else if (IS_LATIN1(curs))
         SET_STRING_ELT(ret, i, mark_latin1);
      else
         SET_STRING_ELT(ret, i, mark_native);
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/**
 * Convert character vector between a marked encodings and the encoding provided
 *
 * @param str     input character vector or list of raw vectors
 * @param to    target encoding, \code{NULL} or \code{""} for default enc
 * @param to_raw single logical, should list of raw vectors be returned?
 * @return a converted character vector or list of raw vectors
 *
 * @version 0.1 (Marek Gagolewski, 2013-11-12)
 */
SEXP stri_encode_from_marked(SEXP str, SEXP to, SEXP to_raw)
{
   str = stri_prepare_arg_string(str, "str");
   const char* selected_to   = stri__prepare_arg_enc(to, "to", true);
   bool to_raw_logical = stri__prepare_arg_logical_1_notNA(to_raw, "to_raw");

   UConverter* uconv_to = NULL;

   STRI__ERROR_HANDLER_BEGIN
   R_len_t str_n = LENGTH(str);
   StriContainerUTF16 str_cont(str, str_n);

   // get the number of strings to convert; if == 0, then you know what's the result
   if (str_n <= 0) return Rf_allocVector(to_raw_logical?VECSXP:STRSXP, 0);

   // Open converters
   uconv_to = stri__ucnv_open(selected_to);

   // Get target encoding mark
   UErrorCode err = U_ZERO_ERROR;
   const char* uconv_to_name = ucnv_getName(uconv_to, &err);
   if (U_FAILURE(err))
      throw StriException(err);
   cetype_t encmark_to = CE_BYTES; // all other cases than the below ones
      // - bytes enc (this is reasonable, isn't it?)
   if (!to_raw_logical) { // otherwise not needed
      if (!strcmp(uconv_to_name, "US-ASCII") || !strcmp(uconv_to_name, "UTF-8"))
         encmark_to = CE_UTF8; // no CE for ASCII, will be auto-detected by mkCharLenCE
      else if (!strcmp(uconv_to_name, "ISO-8859-1"))
         encmark_to = CE_LATIN1;
      else if (!strcmp(uconv_to_name, ucnv_getDefaultName()))
         encmark_to = CE_NATIVE;
   }

   // Prepare out val
   SEXP ret;
   PROTECT(ret = Rf_allocVector(to_raw_logical?VECSXP:STRSXP, str_n));

   String8 buf(0); // will be extended in a moment

   for (R_len_t i=0; i<str_n; ++i) {
      if (str_cont.isNA(i)) {
         if (to_raw_logical) SET_VECTOR_ELT(ret, i, R_NilValue);
         else                SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      R_len_t curn_tmp = str_cont.get(i).length();
      const UChar* curs_tmp = str_cont.get(i).getBuffer(); // The buffer contents is (probably) not NUL-terminated.
      if (!curs_tmp)
         throw StriException(MSG__INTERNAL_ERROR);

      R_len_t bufneed = UCNV_GET_MAX_BYTES_FOR_STRING(curn_tmp, ucnv_getMaxCharSize(uconv_to));
      // "The calculated size is guaranteed to be sufficient for this conversion."
      buf.resize(bufneed);

      err = U_ZERO_ERROR;
      ucnv_resetFromUnicode(uconv_to);
      bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(), curs_tmp, curn_tmp, &err);
      if (bufneed <= buf.size()) {
         if (U_FAILURE(err))
            throw StriException(err);
      }
      else {// larger buffer needed [this shouldn't happen?]
//         warning("buf extending");
         buf.resize(bufneed);
         err = U_ZERO_ERROR;
         bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(), curs_tmp, curn_tmp, &err);
         if (U_FAILURE(err))
            throw StriException(err);
         if (bufneed > buf.size())
            throw StriException(MSG__INTERNAL_ERROR);
      }

      if (to_raw_logical) {
         SEXP outobj = Rf_allocVector(RAWSXP, bufneed);
         memcpy(RAW(outobj), buf.data(), (size_t)bufneed);
         SET_VECTOR_ELT(ret, i, outobj);
      }
      else {
         SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), bufneed, encmark_to));
      }
   }

   if (uconv_to) {
      ucnv_close(uconv_to);
      uconv_to = NULL;
   }

   UNPROTECT(1);
   return ret;

   STRI__ERROR_HANDLER_END({
      if (uconv_to)
         ucnv_close(uconv_to);
   })
}
/**
 * Split a string into text lines
 *
 * @param str character vector
 * @param n_max integer vector
 * @param omit_empty logical vector
 *
 * @return list of character vectors
 *
 * @version 0.1 (Marek Gagolewski, 2013-08-04)
 */
SEXP stri_split_lines(SEXP str, SEXP n_max, SEXP omit_empty)
{
   str = stri_prepare_arg_string(str, "str");
   n_max = stri_prepare_arg_integer(n_max, "n_max");
   omit_empty = stri_prepare_arg_logical(omit_empty, "omit_empty");
   R_len_t vectorize_length = stri__recycling_rule(true, 3, LENGTH(str), LENGTH(n_max), LENGTH(omit_empty));

   STRI__ERROR_HANDLER_BEGIN
   StriContainerUTF8 str_cont(str, vectorize_length);
   StriContainerInteger   n_max_cont(n_max, vectorize_length);
   StriContainerLogical   omit_empty_cont(omit_empty, vectorize_length);

   SEXP ret;
   PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length));

   for (R_len_t i = str_cont.vectorize_init();
         i != str_cont.vectorize_end();
         i = str_cont.vectorize_next(i))
   {
      if (str_cont.isNA(i)) {
         SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1));
         continue;
      }

      const char* str_cur_s = str_cont.get(i).c_str();
      R_len_t str_cur_n = str_cont.get(i).length();
      int  n_max_cur        = n_max_cont.get(i);
      int  omit_empty_cur   = omit_empty_cont.get(i);

      if (n_max_cur < 0)
         n_max_cur = INT_MAX;
      else if (n_max_cur == 0) {
         SET_VECTOR_ELT(ret, i, Rf_allocVector(STRSXP, 0));
         continue;
      }

//#define STRI_INDEX_NEWLINE_CR   0
//#define STRI_INDEX_NEWLINE_LF   1
//#define STRI_INDEX_NEWLINE_CRLF 2
//#define STRI_INDEX_NEWLINE_NEL  3
//#define STRI_INDEX_NEWLINE_VT   4
//#define STRI_INDEX_NEWLINE_FF   5
//#define STRI_INDEX_NEWLINE_LS   6
//#define STRI_INDEX_NEWLINE_PS   7
//#define STRI_INDEX_NEWLINE_LAST 8

//      int counts[STRI_INDEX_NEWLINE_LAST];
//      for (R_len_t j=0; j<STRI_INDEX_NEWLINE_LAST; ++j)
//         counts[j] = 0;

      UChar32 c;
      R_len_t jlast, k=1;
      deque<R_len_t_x2> occurences;
      occurences.push_back(R_len_t_x2(0, 0));
      for (R_len_t j=0; j < str_cur_n && k < n_max_cur; /* null */) {
         jlast = j;
         U8_NEXT(str_cur_s, j, str_cur_n, c);

         switch (c) {
            case ASCII_CR: /* CR */
//               counts[STRI_INDEX_NEWLINE_CR]++;
               /* check if next is LF */
               if (str_cur_s[j] == ASCII_LF) { // look ahead one byte
//                  counts[STRI_INDEX_NEWLINE_LF]++;
//                  counts[STRI_INDEX_NEWLINE_CRLF]++;
                  j++; // just one byte
               }
               break;

            case ASCII_LF: /* LF */
//               counts[STRI_INDEX_NEWLINE_LF]++;
               break;

            case UCHAR_NEL: /* NEL */
//               counts[STRI_INDEX_NEWLINE_NEL]++;
               break;

            case ASCII_VT: /* VT */
//               counts[STRI_INDEX_NEWLINE_VT]++;
               break;

            case ASCII_FF: /* FF */
//               counts[STRI_INDEX_NEWLINE_FF]++;
               break;

            case UCHAR_LS: /* LS */
//               counts[STRI_INDEX_NEWLINE_LS]++;
               break;

            case UCHAR_PS: /* PS */
//               counts[STRI_INDEX_NEWLINE_PS]++;
               break;

            default:
               /* not a newline character */
               occurences.back().v2 = j;
               continue;
         }

         // if here, then at newline
         if (omit_empty_cur && occurences.back().v2 == occurences.back().v1)
            occurences.back().v1 = occurences.back().v2 = j; // don't start new field
         else {
            occurences.back().v2 = jlast;
            occurences.push_back(R_len_t_x2(j, j));
            ++k; // another field
         }
      }

      if (k == n_max_cur)
         occurences.back().v2 = str_cur_n;
      if (omit_empty_cur && occurences.back().v1 == occurences.back().v2)
         occurences.pop_back();

      SEXP ans;
      PROTECT(ans = Rf_allocVector(STRSXP, (R_len_t)occurences.size()));

      deque<R_len_t_x2>::iterator iter = occurences.begin();
      for (R_len_t l = 0; iter != occurences.end(); ++iter, ++l) {
         R_len_t_x2 curoccur = *iter;
         SET_STRING_ELT(ans, l, Rf_mkCharLenCE(str_cur_s+curoccur.v1, curoccur.v2-curoccur.v1, CE_UTF8));
      }

      SET_VECTOR_ELT(ret, i, ans);
      UNPROTECT(1);
   }

   UNPROTECT(1);
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
Beispiel #6
0
/** Word wrap text
 *
 * @param str character vector
 * @param width single integer
 * @param cost_exponent single double
 * @param indent single integer
 * @param exdent single integer
 * @param prefix single string
 * @param initial single string
 * @param locale locale identifier or NULL for default locale
 * @param use_length single logical value
 *
 * @return list
 *
 * @version 0.1-?? (Bartek Tartanus)
 *
 * @version 0.2-2 (Marek Gagolewski, 2014-04-27)
 *          single function for wrap_greedy and wrap_dynamic
 *          (dispatch inside);
 *          use BreakIterator
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-06)
 *    new args: indent, exdent, prefix, initial
 *
 * @version 0.5-1 (Marek Gagolewski, 2014-12-19)
 *    #133 allow width <= 0
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-02-28)
 *    don't trim so many white spaces at the end of each word (normalize arg does that)
 *    #139: allow a "whitespace" break iterator
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-04-23)
 *    `use_length` arg added
 *
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-06-09)
 *    BIGSKIP: no more CHARSXP on out on "" input
 */
SEXP stri_wrap(SEXP str, SEXP width, SEXP cost_exponent,
   SEXP indent, SEXP exdent, SEXP prefix, SEXP initial, SEXP whitespace_only,
   SEXP use_length, SEXP locale)
{
   bool use_length_val      = stri__prepare_arg_logical_1_notNA(use_length, "use_length");
   double exponent_val      = stri__prepare_arg_double_1_notNA(cost_exponent, "cost_exponent");
   bool whitespace_only_val = stri__prepare_arg_logical_1_notNA(whitespace_only, "whitespace_only");

   int width_val = stri__prepare_arg_integer_1_notNA(width, "width");
   if (width_val <= 0) width_val = 0;

   int indent_val = stri__prepare_arg_integer_1_notNA(indent, "indent");
   if (indent_val < 0) Rf_error(MSG__EXPECTED_POSITIVE, "indent");

   int exdent_val = stri__prepare_arg_integer_1_notNA(exdent, "exdent");
   if (exdent_val < 0) Rf_error(MSG__EXPECTED_POSITIVE, "exdent");


   const char* qloc = stri__prepare_arg_locale(locale, "locale", true); /* this is R_alloc'ed */
   Locale loc = Locale::createFromName(qloc);
   PROTECT(str     = stri_prepare_arg_string(str, "str"));
   PROTECT(prefix  = stri_prepare_arg_string_1(prefix, "prefix"));
   PROTECT(initial = stri_prepare_arg_string_1(initial, "initial"));

   BreakIterator* briter = NULL;
   UText* str_text = NULL;

   STRI__ERROR_HANDLER_BEGIN(3)
   UErrorCode status = U_ZERO_ERROR;
   briter = BreakIterator::createLineInstance(loc, status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

   R_len_t str_length = LENGTH(str);
   StriContainerUTF8_indexable str_cont(str, str_length);
   StriContainerUTF8 prefix_cont(prefix, 1);
   StriContainerUTF8 initial_cont(initial, 1);


   // prepare indent/exdent/prefix/initial stuff:
   // 1st line, 1st para (i==0, u==0): initial+indent
   // nth line, 1st para (i==0, u> 0): prefix +exdent
   // 1st line, nth para (i> 0, u==0): prefix +indent
   // nth line, nth para (i> 0, u> 0): prefix +exdent
   StriWrapLineStart ii(initial_cont.get(0), indent_val);
   StriWrapLineStart pi(prefix_cont.get(0), indent_val);
   StriWrapLineStart pe(prefix_cont.get(0), exdent_val);


   status = U_ZERO_ERROR;
   //Unicode Newline Guidelines - Unicode Technical Report #13
   UnicodeSet uset_linebreaks(UnicodeString::fromUTF8("[\\u000A-\\u000D\\u0085\\u2028\\u2029]"), status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
   uset_linebreaks.freeze();

   status = U_ZERO_ERROR;
   UnicodeSet uset_whitespaces(UnicodeString::fromUTF8("\\p{White_space}"), status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
   uset_whitespaces.freeze();

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(VECSXP, str_length));
   for (R_len_t i = 0; i < str_length; ++i)
   {
      if (str_cont.isNA(i) || prefix_cont.isNA(0) || initial_cont.isNA(0)) {
         SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1));
         continue;
      }

      status = U_ZERO_ERROR;
      const char* str_cur_s = str_cont.get(i).c_str();
      R_len_t str_cur_n = str_cont.get(i).length();
      str_text = utext_openUTF8(str_text, str_cur_s, str_cont.get(i).length(), &status);
      STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

      status = U_ZERO_ERROR;
      briter->setText(str_text, status);
      STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

      // all right, first let's generate a list of places at which we may do line breaks
      deque< R_len_t > occurrences_list; // this could be an R_len_t queue
      R_len_t match = briter->first();
      while (match != BreakIterator::DONE) {

         if (!whitespace_only_val)
            occurrences_list.push_back(match);
         else {
            if (match > 0 && match < str_cur_n) {
               UChar32 c;
               U8_GET((const uint8_t*)str_cur_s, 0, match-1, str_cur_n, c);
               if (uset_whitespaces.contains(c))
                  occurrences_list.push_back(match);
            }
            else
               occurrences_list.push_back(match);
         }

         match = briter->next();
      }

      R_len_t noccurrences = (R_len_t)occurrences_list.size(); // number of boundaries
      if (noccurrences <= 1) { // no match (1 boundary == 0)
         SET_VECTOR_ELT(ret, i, Rf_ScalarString(str_cont.toR(i)));
         continue;
      }

      // the number of "words" is:
      R_len_t nwords = noccurrences - 1;

      // convert occurrences_list to a vector
      // in order to obtain end positions (in a string) of each "words",
      // noting that occurrences_list.at(0) == 0
#ifndef NDEBUG
      if (occurrences_list.at(0) != 0)
         throw StriException("NDEBUG: stri_wrap: (occurrences_list.at(0) != 0)");
#endif

      std::vector<R_len_t> end_pos_orig(nwords);
      deque<R_len_t>::iterator iter = ++(occurrences_list.begin());
      for (R_len_t j = 0; iter != occurrences_list.end(); ++iter, ++j) {
         end_pos_orig[j] = (*iter); // this is a UTF-8 index
      }


      // now:
      // we'll get the total widths/number of code points in each "word"
      std::vector<R_len_t> widths_orig(nwords);
      // we'll get the total widths/number of code points without trailing whitespaces
      std::vector<R_len_t> widths_trim(nwords);
      // we'll get the end positions without trailing whitespaces
      std::vector<R_len_t> end_pos_trim(nwords);
      // detect line endings (fail on a match)

      UChar32 c = 0;
      R_len_t j = 0;
      R_len_t cur_block = 0;
      R_len_t cur_width_orig = 0;
      R_len_t cur_width_trim = 0;
      R_len_t cur_count_orig = 0;
      R_len_t cur_count_trim = 0;
      R_len_t cur_end_pos_trim = 0;
      while (j < str_cur_n) {
         R_len_t jlast = j;
         U8_NEXT(str_cur_s, j, str_cur_n, c);
         if (c < 0) // invalid utf-8 sequence
            throw StriException(MSG__INVALID_UTF8);

         if (uset_linebreaks.contains(c))
            throw StriException(MSG__NEWLINE_FOUND);

         cur_width_orig += stri__width_char(c);
         ++cur_count_orig;
         if (uset_whitespaces.contains(c)) {
// OLD: trim all white spaces from the end:
//            ++cur_count_trim;
//           [we have the normalize arg for that]

// NEW: trim just one white space at the end:
            cur_width_trim = stri__width_char(c);
            cur_count_trim = 1;
            cur_end_pos_trim = jlast;
         }
         else {
            cur_width_trim = 0;
            cur_count_trim = 0;
            cur_end_pos_trim = j;
         }

         if (j >= str_cur_n || end_pos_orig[cur_block] <= j) {
            // we'll start a new block in a moment
            if (use_length_val) {
               widths_orig[cur_block] = cur_count_orig;
               widths_trim[cur_block] = cur_count_orig-cur_count_trim;
            }
            else {
               widths_orig[cur_block] = cur_width_orig;
               widths_trim[cur_block] = cur_width_orig-cur_width_trim;
            }
            end_pos_trim[cur_block] = cur_end_pos_trim;
            cur_block++;
            cur_width_orig = 0;
            cur_width_trim = 0;
            cur_count_orig = 0;
            cur_count_trim = 0;
            cur_end_pos_trim = j;
         }
      }

      // do wrap
      std::deque<R_len_t> wrap_after; // wrap line after which word in {0..nwords-1}?
      if (exponent_val <= 0.0) {
         stri__wrap_greedy(wrap_after, nwords, width_val,
            widths_orig, widths_trim,
               (use_length_val)?((i==0)?ii.count:pi.count):((i==0)?ii.width:pi.width),
               (use_length_val)?pe.count:pe.width);
      }
      else {
         stri__wrap_dynamic(wrap_after, nwords, width_val, exponent_val,
            widths_orig, widths_trim,
               (use_length_val)?((i==0)?ii.count:pi.count):((i==0)?ii.width:pi.width),
               (use_length_val)?pe.count:pe.width);
      }

      // wrap_after.size() line breaks => wrap_after.size()+1 lines
      R_len_t nlines = (R_len_t)wrap_after.size()+1;
      R_len_t last_pos = 0;
      SEXP ans;
      STRI__PROTECT(ans = Rf_allocVector(STRSXP, nlines));
      deque<R_len_t>::iterator iter_wrap = wrap_after.begin();
      for (R_len_t u = 0; iter_wrap != wrap_after.end(); ++iter_wrap, ++u) {
         R_len_t wrap_after_cur = *iter_wrap;
         R_len_t cur_pos = end_pos_trim[wrap_after_cur];

         std::string cs;
         if (i == 0 && u == 0)     cs = ii.str;
         else if (i > 0 && u == 0) cs = pi.str;
         else                      cs = pe.str;
         cs.append(str_cur_s+last_pos, cur_pos-last_pos);
         SET_STRING_ELT(ans, u, Rf_mkCharLenCE(cs.c_str(), cs.size(), CE_UTF8));

         last_pos = end_pos_orig[wrap_after_cur];
      }

      // last line goes here:
      std::string cs;
      if (i == 0 && nlines-1 == 0)     cs = ii.str;
      else if (i > 0 && nlines-1 == 0) cs = pi.str;
      else                             cs = pe.str;
      cs.append(str_cur_s+last_pos, end_pos_trim[nwords-1]-last_pos);
      SET_STRING_ELT(ans, nlines-1, Rf_mkCharLenCE(cs.c_str(), cs.size(), CE_UTF8));

      SET_VECTOR_ELT(ret, i, ans);
      STRI__UNPROTECT(1);
   }

   if (briter) { delete briter; briter = NULL; }
   if (str_text) { utext_close(str_text); str_text = NULL; }
   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END({
      if (briter) { delete briter; briter = NULL; }
      if (str_text) { utext_close(str_text); str_text = NULL; }
   })
}
/** Convert character vector to ASCII
 *
 * All charcodes > 127 are replaced with subst chars (0x1A)
 *
 * @param str character vector
 * @return character vector
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-30)
 *          use single common buf;
 *          warn on invalid utf8 byte stream
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_enc_toascii(SEXP str)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   R_len_t n = LENGTH(str);

   STRI__ERROR_HANDLER_BEGIN(1)

   // get buf size
   R_len_t bufsize = 0;
   for (R_len_t i=0; i<n; ++i) {
      SEXP curs = STRING_ELT(str, i);
      if (curs == NA_STRING)
         continue;

      R_len_t ni = LENGTH(curs);
      if (ni > bufsize) bufsize = ni;
   }
   String8buf buf(bufsize); // no more bytes than this needed
   char* bufdata = buf.data();

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, n));
   for (R_len_t i=0; i<n; ++i) {
      SEXP curs = STRING_ELT(str, i);
      if (curs == NA_STRING || IS_ASCII(curs)) {
         // nothing to do
         SET_STRING_ELT(ret, i, curs);
         continue;
      }

      R_len_t curn = LENGTH(curs);
      const char* curs_tab = CHAR(curs);

      if (IS_UTF8(curs)) {
         R_len_t k = 0, j = 0;
         UChar32 c;
         while (j<curn) {
            U8_NEXT(curs_tab, j, curn, c);
            if (c < 0) {
               Rf_warning(MSG__INVALID_CODE_POINT_FIXING);
               bufdata[k++] = ASCII_SUBSTITUTE;
            }
            else if (c > ASCII_MAXCHARCODE)
               bufdata[k++] = ASCII_SUBSTITUTE;
            else
               bufdata[k++] = (char)c;
         }
         SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, k, CE_UTF8));
         // the string will be marked as ASCII anyway by mkCharLenCE
      }
      else { // some 8-bit encoding
         R_len_t k = 0;
         for (R_len_t j=0; j<curn; ++j) {
            if (U8_IS_SINGLE(curs_tab[j]))
               bufdata[k++] = curs_tab[j];
            else {
               bufdata[k++] = (char)ASCII_SUBSTITUTE; // subst char in ascii
            }
         }
         SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, k, CE_UTF8));
         // the string will be marked as ASCII anyway by mkCharLenCE
      }
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
Beispiel #8
0
/**
 * Extract multiple substrings
 *
 *
 * @param str character vector
 * @param from list
 * @param to list
 * @param length list
 * @return list of character vectors
 *
 * @version 1.3.2 (Marek Gagolewski, 2019-02-21)
 *    #30: new function
 */
SEXP stri_sub_all(SEXP str, SEXP from, SEXP to, SEXP length)
{
   PROTECT(str    = stri_prepare_arg_string(str, "str"));
   PROTECT(from   = stri_prepare_arg_list(from, "from"));
   PROTECT(to     = stri_prepare_arg_list(to, "to"));
   PROTECT(length = stri_prepare_arg_list(length, "length"));

   R_len_t str_len       = LENGTH(str);
   R_len_t from_len      = LENGTH(from);
   // R_len_t to_len        = LENGTH(to);
   // R_len_t length_len    = LENGTH(length);


   R_len_t vectorize_len;
   if (!isNull(to))
      vectorize_len = stri__recycling_rule(true, 3,
         str_len, from_len, LENGTH(to));
   else if (!isNull(length))
      vectorize_len = stri__recycling_rule(true, 3,
        str_len, from_len, LENGTH(length));
   else
      vectorize_len = stri__recycling_rule(true, 2, str_len, from_len);

   if (vectorize_len <= 0) {
      UNPROTECT(4);
      return Rf_allocVector(VECSXP, 0);
   }


  // no STRI__ERROR_HANDLER_BEGIN  block ---- stri_sub can longjmp with Rf_error...

   SEXP ret, str_tmp, tmp;
   PROTECT(ret = Rf_allocVector(VECSXP, vectorize_len)); //5
   PROTECT(str_tmp = Rf_allocVector(STRSXP, 1));         //6

   for (R_len_t i = 0; i<vectorize_len; ++i)
   {
      PROTECT(tmp = STRING_ELT(str, i%str_len));
      SET_STRING_ELT(str_tmp, 0, tmp);
      UNPROTECT(1); //tmp

      if (!isNull(to)) {
         PROTECT(tmp = stri_sub(str_tmp,
              VECTOR_ELT(from, i%from_len), VECTOR_ELT(to, i%LENGTH(to)), R_NilValue));
      }
      else if (!isNull(length)) {
         PROTECT(tmp = stri_sub(str_tmp,
              VECTOR_ELT(from, i%from_len), R_NilValue, VECTOR_ELT(length, i%LENGTH(length))));
      }
      else {
         PROTECT(tmp = stri_sub(str_tmp,
              VECTOR_ELT(from, i%from_len), R_NilValue, R_NilValue));
      }

      SET_VECTOR_ELT(ret, i, tmp);
      UNPROTECT(1); //tmp
   }

   UNPROTECT(6);
   return ret;
}
Beispiel #9
0
/**
 * Get substring
 *
 *
 * @param str character vector
 * @param from integer vector (possibly with negative indices)
 * @param to integer vector (possibly with negative indices) or NULL
 * @param length integer vector or NULL
 * @return character vector
 *
 * @version 0.1-?? (Bartek Tartanus)
 *          stri_sub
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          use StriContainerUTF8 and stri__UChar32_to_UTF8_index
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-01)
 *          use StriContainerUTF8's UChar32-to-UTF8 index
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-20)
 *          Use StriContainerUTF8_indexable
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-03)
 *          Use stri__sub_prepare_from_to_length()
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 0.5-9003 (Marek Gagolewski, 2015-08-05)
 *    Bugfix #183: floating point exception when to or length is an empty vector
 */
SEXP stri_sub(SEXP str, SEXP from, SEXP to, SEXP length)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));

   R_len_t str_len       = LENGTH(str);
   R_len_t from_len      = 0;
   R_len_t to_len        = 0;
   R_len_t length_len    = 0;
   int* from_tab         = 0;
   int* to_tab           = 0;
   int* length_tab       = 0;

   R_len_t sub_protected =  1+  /* how many objects to PROTECT on ret? */
      stri__sub_prepare_from_to_length(from, to, length,
      from_len, to_len, length_len, from_tab, to_tab, length_tab);

   R_len_t vectorize_len = stri__recycling_rule(true, 3,
      str_len, from_len, (to_len>length_len)?to_len:length_len);

   if (vectorize_len <= 0) {
      UNPROTECT(sub_protected);
      return Rf_allocVector(STRSXP, 0);
   }

   STRI__ERROR_HANDLER_BEGIN(sub_protected)
   StriContainerUTF8_indexable str_cont(str, vectorize_len);
   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_len));

   for (R_len_t i = str_cont.vectorize_init();
         i != str_cont.vectorize_end();
         i = str_cont.vectorize_next(i))
   {
      R_len_t cur_from     = from_tab[i % from_len];
      R_len_t cur_to       = (to_tab)?to_tab[i % to_len]:length_tab[i % length_len];
      if (str_cont.isNA(i) || cur_from == NA_INTEGER || cur_to == NA_INTEGER) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      if (length_tab) {
         if (cur_to <= 0) {
            SET_STRING_ELT(ret, i, R_BlankString);
            continue;
         }
         cur_to = cur_from + cur_to - 1;
         if (cur_from < 0 && cur_to >= 0) cur_to = -1;
      }

      const char* str_cur_s = str_cont.get(i).c_str();

      R_len_t cur_from2; // UTF-8 byte indices
      R_len_t cur_to2;   // UTF-8 byte indices

      stri__sub_get_indices(str_cont, i, cur_from, cur_to, cur_from2, cur_to2);

      if (cur_to2 > cur_from2) { // just copy
         SET_STRING_ELT(ret, i, Rf_mkCharLenCE(str_cur_s+cur_from2, cur_to2-cur_from2, CE_UTF8));
      }
      else {
         // maybe a warning here?
         SET_STRING_ELT(ret, i, Rf_mkCharLen(NULL, 0));
      }
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
Beispiel #10
0
/**
 * Pad a string
 *
 * vectorized over str, length and pad
 * if str or pad or length is NA the result will be NA
 *
 * @param str character vector
 * @param min_length integer vector
 * @param side [internal int]
 * @param pad character vector
 * @param use_length single logical value
 * @return character vector
 *
 * @version 0.1-?? (Bartlomiej Tartanus)
 *
 * @version 0.2-2 (Marek Gagolewski, 2014-04-20)
 *          use stri_error_handler, pad should be a single code point, not byte
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-04-22)
 *    `use_length` arg added,
 *    second argument renamed `width`
*/
SEXP stri_pad(SEXP str, SEXP width, SEXP side, SEXP pad, SEXP use_length)
{
   // this is an internal arg, check manually, error() allowed here
   if (!Rf_isInteger(side) || LENGTH(side) != 1)
      Rf_error(MSG__INCORRECT_INTERNAL_ARG);
   int _side = INTEGER(side)[0];
   if (_side < 0 || _side > 2)
      Rf_error(MSG__INCORRECT_INTERNAL_ARG);

   bool use_length_val = stri__prepare_arg_logical_1_notNA(use_length, "use_length");
   PROTECT(str         = stri_prepare_arg_string(str, "str"));
   PROTECT(width       = stri_prepare_arg_integer(width, "width"));
   PROTECT(pad         = stri_prepare_arg_string(pad, "pad"));

//   side       = stri_prepare_arg_string(side, "side");
//   const char* side_opts[] = {"left", "right", "both", NULL};

   R_len_t str_length     = LENGTH(str);
   R_len_t width_length  = LENGTH(width);
//   R_len_t side_length    = LENGTH(side);
   R_len_t pad_length     = LENGTH(pad);

   R_len_t vectorize_length = stri__recycling_rule(true, 3,
      str_length, width_length, /*side_length, */ pad_length);

   STRI__ERROR_HANDLER_BEGIN(3)
   StriContainerUTF8       str_cont(str, vectorize_length);
   StriContainerInteger  width_cont(width, vectorize_length);
//   StriContainerUTF8      side_cont(side, vectorize_length);
   StriContainerUTF8       pad_cont(pad, vectorize_length);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length));

   String8buf buf(0); // TODO: prealloc
   for (R_len_t i=0; i<vectorize_length; ++i) {
      if (str_cont.isNA(i) || pad_cont.isNA(i)
          || /*side_cont.isNA(i) ||*/ width_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      // get the current string
      R_len_t str_cur_n = str_cont.get(i).length();
      const char* str_cur_s = str_cont.get(i).c_str();
      R_len_t str_cur_width;

      // get the width/length of padding code point(s)
      R_len_t pad_cur_n = pad_cont.get(i).length();
      const char* pad_cur_s = pad_cont.get(i).c_str();
      R_len_t pad_cur_width;
      if (use_length_val) {
         pad_cur_width = 1;
         str_cur_width = str_cont.get(i).countCodePoints();
         R_len_t k = 0;
         UChar32 pad_cur = 0;
         U8_NEXT(pad_cur_s, k, pad_cur_n, pad_cur);
         if (pad_cur <= 0 || k < pad_cur_n)
            throw StriException(MSG__NOT_EQ_N_CODEPOINTS, "pad", 1);
      }
      else {
         pad_cur_width = stri__width_string(pad_cur_s, pad_cur_n);
         str_cur_width = stri__width_string(str_cur_s, str_cur_n);
         if (pad_cur_width != 1)
            throw StriException(MSG__NOT_EQ_N_WIDTH, "pad", 1);
      }

      // get the minimal width
      R_len_t width_cur = width_cont.get(i);

      if (str_cur_width >= width_cur)  {
         // no padding at all
         SET_STRING_ELT(ret, i, str_cont.toR(i));
         continue;
      }

      R_len_t padnum = width_cur-str_cur_width;
      buf.resize(str_cur_n+padnum*pad_cur_n, false);

      char* buftmp = buf.data();
      R_len_t k = 0;
      switch(_side) {

         case 0: // left
            for (k=0; k<padnum; ++k) {
               memcpy(buftmp, pad_cur_s, pad_cur_n);
               buftmp += pad_cur_n;
            }
            memcpy(buftmp, str_cur_s, str_cur_n);
            buftmp += str_cur_n;
            break;

         case 1: // right
            memcpy(buftmp, str_cur_s, str_cur_n);
            buftmp += str_cur_n;
            for (k=0; k<padnum; ++k) {
               memcpy(buftmp, pad_cur_s, pad_cur_n);
               buftmp += pad_cur_n;
            }
            break;

         case 2: // both
            for (k=0; k<padnum/2; ++k) {
               memcpy(buftmp, pad_cur_s, pad_cur_n);
               buftmp += pad_cur_n;
            }
            memcpy(buftmp, str_cur_s, str_cur_n);
            buftmp += str_cur_n;
            for (; k<padnum; ++k) {
               memcpy(buftmp, pad_cur_s, pad_cur_n);
               buftmp += pad_cur_n;
            }
            break;
      }

      SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), (int)(buftmp-buf.data()), CE_UTF8));
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/**
 * Extract all occurences of a character class in each string
 *
 * @param str character vector
 * @param pattern character vector
 * @return list of character vectors
 *
 * @version 0.1 (Marek Gagolewski, 2013-06-08)
 * @version 0.2 (Marek Gagolewski, 2013-06-15) Use StrContainerCharClass
 * @version 0.3 (Marek Gagolewski, 2013-06-16) make StriException-friendly
 */
SEXP stri_extract_all_charclass(SEXP str, SEXP pattern, SEXP merge)
{
   str = stri_prepare_arg_string(str, "str");
   pattern = stri_prepare_arg_string(pattern, "pattern");
   merge = stri_prepare_arg_logical(merge, "merge");
   R_len_t vectorize_length = stri__recycling_rule(true, 3, LENGTH(str), LENGTH(pattern), LENGTH(merge));

   STRI__ERROR_HANDLER_BEGIN
   StriContainerUTF8 str_cont(str, vectorize_length);
   StriContainerCharClass pattern_cont(pattern, vectorize_length);
   StriContainerLogical merge_cont(merge, vectorize_length);

   SEXP notfound; // this vector will be set iff not found or NA
   PROTECT(notfound = stri__vector_NA_strings(1));

   SEXP ret;
   PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length));

   for (R_len_t i = pattern_cont.vectorize_init();
         i != pattern_cont.vectorize_end();
         i = pattern_cont.vectorize_next(i))
   {
      if (pattern_cont.isNA(i) || str_cont.isNA(i) || merge_cont.isNA(i)) {
         SET_VECTOR_ELT(ret, i, notfound);
         continue;
      }


      bool merge_cur = merge_cont.get(i);
      CharClass pattern_cur = pattern_cont.get(i);
      R_len_t     str_cur_n = str_cont.get(i).length();
      const char* str_cur_s = str_cont.get(i).c_str();
      R_len_t j, jlast;
      UChar32 chr;
      deque<R_len_t_x2> occurences; // codepoint based-indices

      for (jlast=j=0; j<str_cur_n; ) {
         U8_NEXT(str_cur_s, j, str_cur_n, chr);
         if (pattern_cur.test(chr)) {
            occurences.push_back(R_len_t_x2(jlast, j));
         }
         jlast = j;
      }

      R_len_t noccurences = (R_len_t)occurences.size();
      if (noccurences == 0)
         SET_VECTOR_ELT(ret, i, notfound);
      else if (merge_cur && noccurences > 1) {
         // do merge
         deque<R_len_t_x2> occurences2;
         deque<R_len_t_x2>::iterator iter = occurences.begin();
         occurences2.push_back(*iter);
         for (++iter; iter != occurences.end(); ++iter) {
            R_len_t_x2 curoccur = *iter;
            if (occurences2.back().v2 == curoccur.v1) { // continue seq
               occurences2.back().v2 = curoccur.v2;  // change `end`
            }
            else { // new seq
               occurences2.push_back(curoccur);
            }
         }

         // create resulting matrix from occurences2
         R_len_t noccurences2 = (R_len_t)occurences2.size();
         SEXP cur_res;
         PROTECT(cur_res = Rf_allocVector(STRSXP, noccurences2));
         iter = occurences2.begin();
         for (R_len_t f = 0; iter != occurences2.end(); ++iter, ++f) {
            R_len_t_x2 curo = *iter;
            SET_STRING_ELT(cur_res, f, Rf_mkCharLenCE(str_cur_s+curo.v1, curo.v2-curo.v1, CE_UTF8));
         }
         SET_VECTOR_ELT(ret, i, cur_res);
         UNPROTECT(1);
      }
      else {
         // do not merge
         SEXP cur_res;
         PROTECT(cur_res = Rf_allocVector(STRSXP, noccurences));
         deque<R_len_t_x2>::iterator iter = occurences.begin();
         for (R_len_t f = 0; iter != occurences.end(); ++iter, ++f) {
            R_len_t_x2 curo = *iter;
            SET_STRING_ELT(cur_res, f, Rf_mkCharLenCE(str_cur_s+curo.v1, curo.v2-curo.v1, CE_UTF8));
         }
         SET_VECTOR_ELT(ret, i, cur_res);
         UNPROTECT(1);
      }
   }

   UNPROTECT(2);
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
Beispiel #12
0
/**
 *  Escape Unicode code points
 *
 *  @param str character vector
 *  @return character vector
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-08-17)
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-01)
 *          fail on incorrect utf8 byte seqs;
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
*/
SEXP stri_escape_unicode(SEXP str)
{
   PROTECT(str = stri_prepare_arg_string(str, "str")); // prepare string argument

   STRI__ERROR_HANDLER_BEGIN(1)
   R_len_t str_length = LENGTH(str);
   StriContainerUTF8 str_cont(str, str_length);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, str_length));

   std::string out; // @TODO: estimate len a priori?

   for (R_len_t i = str_cont.vectorize_init();
         i != str_cont.vectorize_end();
         i = str_cont.vectorize_next(i))
   {
      if (str_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      const char* str_cur_s = str_cont.get(i).c_str();
      R_len_t     str_cur_n = str_cont.get(i).length();

      // estimate buf size
      R_len_t bufsize = 0;
      UChar32 c;
      R_len_t j = 0;

      while (j < str_cur_n) {
         U8_NEXT(str_cur_s, j, str_cur_n, c);
         if (c < 0)
            throw StriException(MSG__INVALID_UTF8);
         else if ((char)c >= 32 || (char)c <= 126)
            bufsize += 1;
         else if (c <= 0xff)
            bufsize += 6; // for \a, \n this will be overestimated
         else
            bufsize += 10;
      }
      out.clear();
      if ((size_t)bufsize > (size_t)out.size())
         out.reserve(bufsize);

      // do escape
      j = 0;
      char buf[11];
      while (j < str_cur_n) {
         U8_NEXT(str_cur_s, j, str_cur_n, c);
         /* if (c < 0)
            throw StriException(MSG__INVALID_UTF8); // this has already been checked :)
         else */ if (c <= ASCII_MAXCHARCODE) {
            switch ((char)c) {
               case 0x07: out.append("\\a"); break;
               case 0x08: out.append("\\b"); break;
               case 0x09: out.append("\\t"); break;
               case 0x0a: out.append("\\n"); break;
               case 0x0b: out.append("\\v"); break;
               case 0x0c: out.append("\\f"); break;
               case 0x0d: out.append("\\r"); break;
//               case 0x1b: out.append("\\e"); break; // R doesn't know that
               case 0x22: out.append("\\\""); break;
               case 0x27: out.append("\\'"); break;
               case 0x5c: out.append("\\\\"); break;
               default:
                  if ((char)c >= 32 || (char)c <= 126) // printable characters
                     out.append(1, (char)c);
                  else {
                     sprintf(buf, "\\u%4.4x", (uint16_t)c);
                     out.append(buf, 6);
                  }
            }
         }
         else if (c <= 0xffff) {
            sprintf(buf, "\\u%4.4x", (uint16_t)c);
            out.append(buf, 6);
         }
         else {
            sprintf(buf, "\\U%8.8x", (uint32_t)c);
            out.append(buf, 10);
         }
      }

      SET_STRING_ELT(ret, i,
         Rf_mkCharLenCE(out.c_str(), (int)out.size(), (cetype_t)CE_UTF8)
      );
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
Beispiel #13
0
/**
 * Count the number of characters in a string
 *
 * Note that ICU permits only strings of length < 2^31.
 * @param s R character vector
 * @return integer vector
 * @version 0.1 (Marcin Bujarski)
 * @version 0.2 (Marek Gagolewski) Multiple input encoding support
 * @version 0.3 (Marek Gagolewski, 2013-06-16) make StriException-friendly
 */
SEXP stri_length(SEXP str)
{
   str = stri_prepare_arg_string(str, "str");
   R_len_t ns = LENGTH(str);
   SEXP ret;

   UConverter* uconv = NULL;
   bool uconv_8bit = false;
   bool uconv_utf8 = false;

   STRI__ERROR_HANDLER_BEGIN

/* Note: ICU50 permits only int-size strings in U8_NEXT and U8_FWD_1 */
#define STRI_LENGTH_CALCULATE_UTF8  \
   const char* qc = CHAR(q);        \
   R_len_t j = 0;                   \
   for (R_len_t i = 0; i < nq; j++) \
      U8_FWD_1(qc, i, nq);          \
   retint[k] = j;

   PROTECT(ret = Rf_allocVector(INTSXP, ns));
   int* retint = INTEGER(ret);
   for (R_len_t k = 0; k < ns; k++) {
      SEXP q = STRING_ELT(str, k);
      if (q == NA_STRING)
         retint[k] = NA_INTEGER;
      else {
         R_len_t nq = LENGTH(q);  // O(1) - stored by R

         // We trust (is that a wise assumption?)
         // R encoding marks; However, it there is no mark,
         // the string may have any encoding (ascii, latin1, utf8, native)
         if (IS_ASCII(q) || IS_LATIN1(q))
            retint[k] = nq;
         else if (IS_BYTES(q))
            throw StriException(MSG__BYTESENC);
         else if (IS_UTF8(q)) {
            STRI_LENGTH_CALCULATE_UTF8
         }
         else { // Any encoding - detection needed
//            UTF-8 strings can be fairly reliably recognized as such by a
//            simple algorithm, i.e., the probability that a string of
//            characters in any other encoding appears as valid UTF-8 is low,
//            diminishing with increasing string length.
//            We have two possibilities here:
//            1. Auto detect encoding: Is this ASCII or UTF-8? If not => use Native
//                This won't work correctly in some cases.
//                e.g. (c4,85) represents ("Polish a with ogonek") in UTF-8
//                and ("A umlaut", "Ellipsis") in WINDOWS-1250
//            2. Assume it's Native; this assumes the user working in an 8-bit environment
//                would convert strings to UTF-8 manually if needed - I think is's
//                a more reasonable approach (Native --> input via keyboard)

            if (!uconv) { // open ucnv on demand
               uconv = stri__ucnv_open((const char*)NULL); // native decoder
               if (!uconv) {
                  retint[k] = NA_INTEGER;
                  continue;
               }
               uconv_8bit = ((int)ucnv_getMaxCharSize(uconv) == 1);
               if (!uconv_8bit) {
                  UErrorCode err = U_ZERO_ERROR;
                  const char* name = ucnv_getName(uconv, &err);
                  if (U_FAILURE(err))
                     throw StriException("could not query default converter");
                  uconv_utf8 = !strncmp("UTF-8", name, 5);
               }
            }

            if (uconv_8bit) {
               retint[k] = nq; // it's an 8-bit encoding :-)
            }
            else if (uconv_utf8) { // it's UTF-8
               STRI_LENGTH_CALCULATE_UTF8
            }
            else {  // native encoding which is neither 8-bit, nor UTF-8 (e.g. 'Big5')
               UErrorCode err = U_ZERO_ERROR;
               const char* source = CHAR(q);
               const char* sourceLimit = source + nq;
               R_len_t j;
               for (j = 0; source != sourceLimit; j++) {
                  if (U_FAILURE(err)) break; // error from previous iteration
                  // iterate through each native-encoded character:
                  ucnv_getNextUChar(uconv, &source, sourceLimit, &err);
               }
               if (U_FAILURE(err)) {  // error from last iteration
                  Rf_warning("error determining length for native, neither 8-bit- nor UTF-8-encoded string.");
                  retint[k] = NA_INTEGER;
               }
               else
                  retint[k] = j; // all right, we got it!
            }
         }
      }
Beispiel #14
0
/**
 * Count the number of characters in a string
 *
 * Note that ICU permits only strings of length < 2^31.
 * @param s R character vector
 * @return integer vector
 *
 * @version 0.1-?? (Marcin Bujarski)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          Multiple input encoding support
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-27)
 *          using StriUcnv;
 *          warn on invalid utf-8 sequences
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_length(SEXP str)
{
    PROTECT(str = stri_prepare_arg_string(str, "str"));

    STRI__ERROR_HANDLER_BEGIN(1)

    R_len_t str_n = LENGTH(str);
    SEXP ret;
    STRI__PROTECT(ret = Rf_allocVector(INTSXP, str_n));
    int* retint = INTEGER(ret);

    StriUcnv ucnvNative(NULL);

    for (R_len_t k = 0; k < str_n; k++) {
        SEXP curs = STRING_ELT(str, k);
        if (curs == NA_STRING) {
            retint[k] = NA_INTEGER;
            continue;
        }

        R_len_t curs_n = LENGTH(curs);  // O(1) - stored by R
        if (IS_ASCII(curs) || IS_LATIN1(curs)) {
            retint[k] = curs_n;
        }
        else if (IS_BYTES(curs)) {
            throw StriException(MSG__BYTESENC);
        }
        else if (IS_UTF8(curs) || ucnvNative.isUTF8()) { // utf8 or native-utf8
            UChar32 c = 0;
            const char* curs_s = CHAR(curs);
            R_len_t j = 0;
            R_len_t i = 0;
            while (c >= 0 && j < curs_n) {
                U8_NEXT(curs_s, j, curs_n, c); // faster that U8_FWD_1 & gives bad UChar32s
                i++;
            }

            if (c < 0) { // invalid utf-8 sequence
                Rf_warning(MSG__INVALID_UTF8);
                retint[k] = NA_INTEGER;
            }
            else
                retint[k] = i;
        }
        else if (ucnvNative.is8bit()) { // native-8bit
            retint[k] = curs_n;
        }
        else { // native encoding, not 8 bit

            UConverter* uconv = ucnvNative.getConverter();

            // native encoding which is neither 8-bit, nor UTF-8 (e.g. 'Big5')
            // this is weird, but we'll face it
            UErrorCode status = U_ZERO_ERROR;
            const char* source = CHAR(curs);
            const char* sourceLimit = source + curs_n;
            R_len_t j;
            for (j = 0; source != sourceLimit; j++) {
                /*ignore_retval=*/ucnv_getNextUChar(uconv, &source, sourceLimit, &status);
                STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
            }
            retint[k] = j; // all right, we got it!
        }
    }

    STRI__UNPROTECT_ALL
    return ret;

    STRI__ERROR_HANDLER_END({ /* no special action on error */ })
}
Beispiel #15
0
/**
 *  Convert case (upper, lowercase)
 *
 *
 *  @param str character vector
 *  @param locale single string identifying
 *         the locale ("" or NULL for default locale)
 *  @return character vector
 *
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          use StriContainerUTF16
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-11-19)
 *          use UCaseMap + StriContainerUTF8
 *          **THIS DOES NOT WORK WITH ICU 4.8**, we have to revert the changes
 *          ** BTW, since stringi_0.1-25 we require ICU>=50 **
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-18)
 *          use UCaseMap + StriContainerUTF8
 *          (this is much faster for UTF-8 and slightly faster for 8bit enc)
 *          Estimates minimal buffer size.
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-10-24)
 *          Use a custom BreakIterator with stri_trans_totitle
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-03)
 *    use StriUBreakIterator
 *
 * @version 0.6-1 (Marek Gagolewski, 2015-07-11)
 *    now this is an internal function
*/
SEXP stri_trans_casemap(SEXP str, int _type, SEXP locale)
{
   if (_type < 1 || _type > 2) Rf_error(MSG__INCORRECT_INTERNAL_ARG);
   const char* qloc = stri__prepare_arg_locale(locale, "locale", true); /* this is R_alloc'ed */
   PROTECT(str = stri_prepare_arg_string(str, "str")); // prepare string argument

// version 0.2-1 - Does not work with ICU 4.8 (but we require ICU >= 50)
   UCaseMap* ucasemap = NULL;

   STRI__ERROR_HANDLER_BEGIN(1)
   UErrorCode status = U_ZERO_ERROR;
   ucasemap = ucasemap_open(qloc, U_FOLD_CASE_DEFAULT, &status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

   R_len_t str_n = LENGTH(str);
   StriContainerUTF8 str_cont(str, str_n);
   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, str_n));


   // STEP 1.
   // Estimate the required buffer length
   // Notice: The resulting number of codepoints may be larger or smaller than
   // the number before casefolding
   R_len_t bufsize = str_cont.getMaxNumBytes();
   bufsize += 10; // a small margin
   String8buf buf(bufsize);

   // STEP 2.
   // Do case folding
   for (R_len_t i = str_cont.vectorize_init();
         i != str_cont.vectorize_end();
         i = str_cont.vectorize_next(i))
   {
      if (str_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      R_len_t str_cur_n     = str_cont.get(i).length();
      const char* str_cur_s = str_cont.get(i).c_str();

      status = U_ZERO_ERROR;
      int buf_need;
      if (_type == 1) buf_need = ucasemap_utf8ToLower(ucasemap,
         buf.data(), buf.size(), (const char*)str_cur_s, str_cur_n, &status);
      else buf_need = ucasemap_utf8ToUpper(ucasemap,
         buf.data(), buf.size(), (const char*)str_cur_s, str_cur_n, &status);

      if (U_FAILURE(status)) { /* retry */
         buf.resize(buf_need, false/*destroy contents*/);
         status = U_ZERO_ERROR;
         if (_type == 1) buf_need = ucasemap_utf8ToLower(ucasemap,
            buf.data(), buf.size(), (const char*)str_cur_s, str_cur_n, &status);
         else buf_need = ucasemap_utf8ToUpper(ucasemap,
            buf.data(), buf.size(), (const char*)str_cur_s, str_cur_n, &status);

         STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) // this shouldn't happen
                                             // we do have the buffer size required to complete this op
      }

      SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), buf_need, CE_UTF8));
   }

   if (ucasemap) { ucasemap_close(ucasemap); ucasemap = NULL;}
   STRI__UNPROTECT_ALL
   return ret;

   STRI__ERROR_HANDLER_END({
      if (ucasemap) { ucasemap_close(ucasemap); ucasemap = NULL; }
   })
}
/**
 * Trim characters from a charclass from left AND/OR right side of the string
 *
 * @param str character vector
 * @param pattern character vector
 * @param left from left?
 * @param right from left?
 * @return character vector
 *
 * @version 0.1-?? (Bartek Tartanus)
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-04)
 *          Use StriContainerUTF8 and CharClass
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly & Use StrContainerCharClass
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-03)
 *          detects invalid UTF-8 byte stream
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-05)
 *          StriContainerCharClass now relies on UnicodeSet
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
*/
SEXP stri__trim_leftright(SEXP str, SEXP pattern, bool left, bool right)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));
   R_len_t vectorize_length =
      stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

   STRI__ERROR_HANDLER_BEGIN(2)
   StriContainerUTF8 str_cont(str, vectorize_length);
   StriContainerCharClass pattern_cont(pattern, vectorize_length);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length));

   for (R_len_t i = pattern_cont.vectorize_init();
         i != pattern_cont.vectorize_end();
         i = pattern_cont.vectorize_next(i))
   {
      if (str_cont.isNA(i) || pattern_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      const UnicodeSet* pattern_cur = &pattern_cont.get(i);
      R_len_t     str_cur_n = str_cont.get(i).length();
      const char* str_cur_s = str_cont.get(i).c_str();
      R_len_t jlast1 = 0;
      R_len_t jlast2 = str_cur_n;

      if (left) {
         UChar32 chr;
         for (R_len_t j=0; j<str_cur_n; ) {
            U8_NEXT(str_cur_s, j, str_cur_n, chr); // "look ahead"
            if (chr < 0) // invalid utf-8 sequence
               throw StriException(MSG__INVALID_UTF8);
            if (pattern_cur->contains(chr)) {
               break; // break at first occurrence
            }
            jlast1 = j;
         }
      }

      if (right && jlast1 < str_cur_n) {
         UChar32 chr;
         for (R_len_t j=str_cur_n; j>0; ) {
            U8_PREV(str_cur_s, 0, j, chr); // "look behind"
            if (chr < 0) // invalid utf-8 sequence
               throw StriException(MSG__INVALID_UTF8);
            if (pattern_cur->contains(chr)) {
               break; // break at first occurrence
            }
            jlast2 = j;
         }
      }

      // now jlast is the index, from which we start copying
      SET_STRING_ELT(ret, i,
         Rf_mkCharLenCE(str_cur_s+jlast1, (jlast2-jlast1), CE_UTF8));
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
Beispiel #17
0
/**
 *  Convert case (TitleCase)
 *
 *
 *  @param str character vector
 *  @param opts_brkiter list
 *  @return character vector
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-03)
 *    separated from stri_trans_casemap;
 *    use StriUBreakIterator
 */
SEXP stri_trans_totitle(SEXP str, SEXP opts_brkiter) {
   StriBrkIterOptions opts_brkiter2(opts_brkiter, "word");
   PROTECT(str = stri_prepare_arg_string(str, "str")); // prepare string argument

// version 0.2-1 - Does not work with ICU 4.8 (but we require ICU >= 50)
   UCaseMap* ucasemap = NULL;

   STRI__ERROR_HANDLER_BEGIN(1)
   StriUBreakIterator brkiter(opts_brkiter2);

   UErrorCode status = U_ZERO_ERROR;
   ucasemap = ucasemap_open(brkiter.getLocale(), U_FOLD_CASE_DEFAULT, &status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

   status = U_ZERO_ERROR;
   ucasemap_setBreakIterator(ucasemap, brkiter.getIterator(), &status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
   brkiter.free(false);
   // ucasemap_setOptions(ucasemap, U_TITLECASE_NO_LOWERCASE, &status); // to do?
   // now briter is owned by ucasemap.
   // it will be released on ucasemap_close
   // (checked with ICU man & src code)

   R_len_t str_n = LENGTH(str);
   StriContainerUTF8 str_cont(str, str_n);
   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, str_n));


   // STEP 1.
   // Estimate the required buffer length
   // Notice: The resulting number of codepoints may be larger or smaller than
   // the number before casefolding
   R_len_t bufsize = str_cont.getMaxNumBytes();
   bufsize += 10; // a small margin
   String8buf buf(bufsize);

   // STEP 2.
   // Do case folding
   for (R_len_t i = str_cont.vectorize_init();
         i != str_cont.vectorize_end();
         i = str_cont.vectorize_next(i))
   {
      if (str_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      R_len_t str_cur_n     = str_cont.get(i).length();
      const char* str_cur_s = str_cont.get(i).c_str();

      status = U_ZERO_ERROR;
      int buf_need = ucasemap_utf8ToTitle(ucasemap, buf.data(), buf.size(),
               (const char*)str_cur_s, str_cur_n, &status);

      if (U_FAILURE(status)) {
         buf.resize(buf_need, false/*destroy contents*/);
         status = U_ZERO_ERROR;
         buf_need = ucasemap_utf8ToTitle(ucasemap, buf.data(), buf.size(),
               (const char*)str_cur_s, str_cur_n, &status);

         STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) // this shouldn't happen
                                             // we do have the buffer size required to complete this op
      }

      SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), buf_need, CE_UTF8));
   }

   if (ucasemap) { ucasemap_close(ucasemap); ucasemap = NULL;}
   STRI__UNPROTECT_ALL
   return ret;

   STRI__ERROR_HANDLER_END({
      if (ucasemap) { ucasemap_close(ucasemap); ucasemap = NULL; }
   })
}
Beispiel #18
0
/**
 * Substring replacement function
 *
 *
 * @param str character vector
 * @param from integer vector (possibly with negative indices)
 * @param to integer vector (possibly with negative indices) or NULL
 * @param length integer vector or NULL
 * @param omit_na logical scalar
 * @param value character vector replacement
 * @return character vector
 *
 * @version 0.1-?? (Bartek Tartanus)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          use StriContainerUTF8 and stri__UChar32_to_UTF8_index
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-01)
 *          use StriContainerUTF8's UChar32-to-UTF8 index
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-20)
 *          Use StriContainerUTF8_indexable
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-03)
 *          Use stri__sub_prepare_from_to_length()
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 0.5-9003 (Marek Gagolewski, 2015-08-05)
 *    Bugfix #183: floating point exception when to or length is an empty vector
 *
 * @version 1.0-2 (Marek Gagolewski, 2016-01-31)
 *    FR #199: new arg: `omit_na`
 *    FR #207: allow insertions
 *
 *
 * @version 1.4.3 (Marek Gagolewski, 2019-03-12)
 *    #346: na_omit for `value`
 */
SEXP stri_sub_replacement(SEXP str, SEXP from, SEXP to, SEXP length, SEXP omit_na, SEXP value)
{
   PROTECT(str   = stri_prepare_arg_string(str, "str"));
   PROTECT(value = stri_prepare_arg_string(value, "value"));
   bool omit_na_1 = stri__prepare_arg_logical_1_notNA(omit_na, "omit_na");

   R_len_t value_len     = LENGTH(value);
   R_len_t str_len       = LENGTH(str);
   R_len_t from_len      = 0; // see below
   R_len_t to_len        = 0; // see below
   R_len_t length_len    = 0; // see below
   int* from_tab         = 0; // see below
   int* to_tab           = 0; // see below
   int* length_tab       = 0; // see below

   R_len_t sub_protected =  2+ /* how many objects to PROTECT on ret? */
      stri__sub_prepare_from_to_length(from, to, length,
      from_len, to_len, length_len, from_tab, to_tab, length_tab);

   R_len_t vectorize_len = stri__recycling_rule(true, 4,
      str_len, value_len, from_len, (to_len>length_len)?to_len:length_len);

   if (vectorize_len <= 0) {
      UNPROTECT(sub_protected);
      return Rf_allocVector(STRSXP, 0);
   }

   STRI__ERROR_HANDLER_BEGIN(sub_protected)
   StriContainerUTF8_indexable str_cont(str, vectorize_len);
   StriContainerUTF8 value_cont(value, vectorize_len);
   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_len));
   String8buf buf(0); // @TODO: estimate bufsize a priori

   for (R_len_t i = str_cont.vectorize_init();
         i != str_cont.vectorize_end();
         i = str_cont.vectorize_next(i))
   {
      R_len_t cur_from     = from_tab[i % from_len];
      R_len_t cur_to       = (to_tab)?to_tab[i % to_len]:length_tab[i % length_len];

      if (str_cont.isNA(i)) {
          SET_STRING_ELT(ret, i, NA_STRING);
          continue;
      }
      if (cur_from == NA_INTEGER || cur_to == NA_INTEGER || value_cont.isNA(i)) {
         if (omit_na_1) {
            SET_STRING_ELT(ret, i, str_cont.toR(i));
         }
         else {
            SET_STRING_ELT(ret, i, NA_STRING);
         }
         continue;
      }

      if (length_tab) {
         if (cur_to <= 0) {
            // SET_STRING_ELT(ret, i, R_BlankString);
            // continue;
            cur_to = 0;
         }
         else {
            cur_to = cur_from + cur_to - 1;
            if (cur_from < 0 && cur_to >= 0) cur_to = -1;
         }
      }

      const char* str_cur_s   = str_cont.get(i).c_str();
      R_len_t str_cur_n       = str_cont.get(i).length();
      const char* value_cur_s = value_cont.get(i).c_str();
      R_len_t value_cur_n     = value_cont.get(i).length();

      R_len_t cur_from2; // UTF-8 byte indices
      R_len_t cur_to2;   // UTF-8 byte indices

      stri__sub_get_indices(str_cont, i, cur_from, cur_to, cur_from2, cur_to2);
      if (cur_to2 < cur_from2) cur_to2 = cur_from2;

      R_len_t buflen = str_cur_n-(cur_to2-cur_from2)+value_cur_n;
      buf.resize(buflen, false/*destroy contents*/);
      memcpy(buf.data(), str_cur_s, (size_t)cur_from2);
      memcpy(buf.data()+cur_from2, value_cur_s, (size_t)value_cur_n);
      memcpy(buf.data()+cur_from2+value_cur_n, str_cur_s+cur_to2, (size_t)str_cur_n-cur_to2);
      SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), buflen, CE_UTF8));
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
Beispiel #19
0
/** Generate random strings
 *
 * @param n single integer
 * @param length integer vector
 * @param pattern character vector
 * @return character vector
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-04)
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-05)
 *          Use StriContainerCharClass which now contains UnicodeSets;
 *          vectorized also over pattern
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_rand_strings(SEXP n, SEXP length, SEXP pattern)
{
   int n_val = stri__prepare_arg_integer_1_notNA(n, "n");
   PROTECT(length    = stri_prepare_arg_integer(length, "length"));
   PROTECT(pattern   = stri_prepare_arg_string(pattern, "pattern"));

   if (n_val < 0) n_val = 0; /* that's not NA for sure now */

   R_len_t length_len = LENGTH(length);
   if (length_len <= 0) {
      UNPROTECT(2);
      Rf_error(MSG__ARG_EXPECTED_NOT_EMPTY, "length");
   }
   else if (length_len > n_val || n_val % length_len != 0)
      Rf_warning(MSG__WARN_RECYCLING_RULE2);

   R_len_t pattern_len = LENGTH(pattern);
   if (pattern_len <= 0) {
      UNPROTECT(2);
      Rf_error(MSG__ARG_EXPECTED_NOT_EMPTY, "pattern");
   }
   else if (pattern_len > n_val || n_val % pattern_len != 0)
      Rf_warning(MSG__WARN_RECYCLING_RULE2);

   GetRNGstate();
   STRI__ERROR_HANDLER_BEGIN(2)

   StriContainerCharClass pattern_cont(pattern, max(n_val, pattern_len));
   StriContainerInteger   length_cont(length, max(n_val, length_len));

   // get max required bufsize
   int*    length_tab = INTEGER(length);
   R_len_t bufsize = 0;
   for (R_len_t i=0; i<length_len; ++i) {
      if (length_tab[i] != NA_INTEGER && length_tab[i] > bufsize)
         bufsize = length_tab[i];
   }
   bufsize *= 4;  // 1 UChar32 -> max. 4 UTF-8 bytes
   String8buf buf(bufsize);
   char* bufdata = buf.data();

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, n_val));

   for (R_len_t i=0; i<n_val; ++i) {
      if (length_cont.isNA(i) || pattern_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      int length_cur = length_cont.get(i);
      if (length_cur < 0) length_cur = 0;

      const UnicodeSet* uset = &(pattern_cont.get(i));
      int32_t uset_size = uset->size();

      // generate string:
      R_len_t j = 0;
      UBool err = FALSE;
      for (R_len_t k=0; k<length_cur; ++k) {
         int32_t idx = (int32_t)floor(unif_rand()*(double)uset_size); /* 0..uset_size-1 */
         UChar32 c = uset->charAt(idx);
         if (c < 0) throw StriException(MSG__INTERNAL_ERROR);

         U8_APPEND((uint8_t*)bufdata, j, bufsize, c, err);
         if (err) throw StriException(MSG__INTERNAL_ERROR);
      }
      SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, j, CE_UTF8));
   }

   PutRNGstate();
   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END({
      PutRNGstate();
   })
}
/** Convert character vector to UTF-8
 *
 * @param str character vector
 * @param is_unknown_8bit single logical value;
 * if TRUE, then in case of ENC_NATIVE or ENC_LATIN1, UTF-8
 * REPLACEMENT CHARACTERs (U+FFFD) are
 * put for codes > 127
 * @param validate single logical value (or NA)
 *
 * @return character vector
 *
 * @version 0.1-XX (Marek Gagolewski)
 *
 * @version 0.1-XX (Marek Gagolewski, 2013-06-16)
 *                  make StriException-friendly
 *
 * @version 0.2-1  (Marek Gagolewski, 2014-03-26)
 *                 Use one String8buf;
 *                 is_unknown_8bit_logical and UTF-8 tries now to remove BOMs
 *
 * @version 0.2-1  (Marek Gagolewksi, 2014-03-30)
 *                 added validate arg
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_enc_toutf8(SEXP str, SEXP is_unknown_8bit, SEXP validate)
{
   PROTECT(validate = stri_prepare_arg_logical_1(validate, "validate"));
   bool is_unknown_8bit_logical =
      stri__prepare_arg_logical_1_notNA(is_unknown_8bit, "is_unknown_8bit");
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   R_len_t n = LENGTH(str);

   STRI__ERROR_HANDLER_BEGIN(2)
   SEXP ret;
   if (!is_unknown_8bit_logical) {
      // Trivial - everything we need is in StriContainerUTF8 :)
      // which removes BOMs silently
      StriContainerUTF8 str_cont(str, n);
      STRI__PROTECT(ret = str_cont.toR());
   }
   else {
      // get buf size
      R_len_t bufsize = 0;
      for (R_len_t i=0; i<n; ++i) {
         SEXP curs = STRING_ELT(str, i);
         if (curs == NA_STRING || IS_ASCII(curs) || IS_UTF8(curs))
            continue;

         R_len_t ni = LENGTH(curs);
         if (ni > bufsize) bufsize = ni;
      }
      String8buf buf(bufsize*3); // either 1 byte < 127 or U+FFFD == 3 bytes UTF-8
      char* bufdata = buf.data();

      STRI__PROTECT(ret = Rf_allocVector(STRSXP, n));
      for (R_len_t i=0; i<n; ++i) {
         SEXP curs = STRING_ELT(str, i);
         if (curs == NA_STRING) {
            SET_STRING_ELT(ret, i, NA_STRING);
            continue;
         }

         if (IS_ASCII(curs) || IS_UTF8(curs)) {
            R_len_t curs_n = LENGTH(curs);
            const char* curs_s = CHAR(curs);
            if (curs_n >= 3 &&
               (uint8_t)(curs_s[0]) == UTF8_BOM_BYTE1 &&
               (uint8_t)(curs_s[1]) == UTF8_BOM_BYTE2 &&
               (uint8_t)(curs_s[2]) == UTF8_BOM_BYTE3) {
               // has BOM - get rid of it
               SET_STRING_ELT(ret, i, Rf_mkCharLenCE(curs_s+3, curs_n-3, CE_UTF8));
            }
            else
               SET_STRING_ELT(ret, i, curs);

            continue;
         }

         // otherwise, we have an 8-bit encoding
         R_len_t curn = LENGTH(curs);
         const char* curs_tab = CHAR(curs);
         R_len_t k = 0;
         for (R_len_t j=0; j<curn; ++j) {
            if (U8_IS_SINGLE(curs_tab[j]))
               bufdata[k++] = curs_tab[j];
            else { // 0xEF 0xBF 0xBD
               bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE1;
               bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE2;
               bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE3;
            }
         }
         SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, k, CE_UTF8));
      }

   }

   // validate utf8 byte stream
   if (LOGICAL(validate)[0] != FALSE) { // NA or TRUE
      R_len_t ret_n = LENGTH(ret);
      for (R_len_t i=0; i<ret_n; ++i) {
         SEXP curs = STRING_ELT(ret, i);
         if (curs == NA_STRING) continue;

         const char* s = CHAR(curs);
         R_len_t sn = LENGTH(curs);
         R_len_t j = 0;
         UChar32 c = 0;
         while (c >= 0 && j < sn) {
            U8_NEXT(s, j, sn, c);
         }

         if (c >= 0) continue; // valid, nothing to do

         if (LOGICAL(validate)[0] == NA_LOGICAL) {
            Rf_warning(MSG__INVALID_CODE_POINT_REPLNA);
            SET_STRING_ELT(ret, i, NA_STRING);
         }
         else {
            int bufsize = sn*3; // maximum: 1 byte -> U+FFFD (3 bytes)
            String8buf buf(bufsize); // maximum: 1 byte -> U+FFFD (3 bytes)
            char* bufdata = buf.data();

            j = 0;
            R_len_t k = 0;
            UBool err = FALSE;
            while (!err && j < sn) {
               U8_NEXT(s, j, sn, c);
               if (c >= 0) {
                  U8_APPEND((uint8_t*)bufdata, k, bufsize, c, err);
               } else {
                  Rf_warning(MSG__INVALID_CODE_POINT_FIXING);
                  bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE1;
                  bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE2;
                  bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE3;
               }
            }

            if (err) throw StriException(MSG__INTERNAL_ERROR);
            SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, k, CE_UTF8));
         }
      }
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
Beispiel #21
0
/** Generate random permutations of code points in each string
 *
 * @param str character vector
 * @return character vector
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-04)
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 1.2.5 (Marek Gagolewski, 2019-07-23)
 *    #319: Fixed overflow in `stri_rand_shuffle()`.
 */
SEXP stri_rand_shuffle(SEXP str)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   R_len_t n = LENGTH(str);

   GetRNGstate();
   STRI__ERROR_HANDLER_BEGIN(1)
   StriContainerUTF8 str_cont(str, n);

   R_len_t bufsize = 0;
   for (R_len_t i=0; i<n; ++i) {
      if (str_cont.isNA(i)) continue;
      R_len_t ni = str_cont.get(i).length();
      if (ni > bufsize) bufsize = ni;
   }
   std::vector<UChar32> buf1(bufsize); // at most bufsize UChars32 (bufsize/4 min.)
   String8buf buf2(bufsize);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, n));

   for (R_len_t i=0; i<n; ++i) {

      if (str_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      // fill buf1
      UChar32 c = (UChar32)0;
      const char* s = str_cont.get(i).c_str();
      R_len_t sn = str_cont.get(i).length();
      R_len_t j = 0;
      R_len_t k = 0;
      while (c >= 0 && j < sn) {
         U8_NEXT(s, j, sn, c);
         buf1[k++] = (int)c;
      }

      if (c < 0) {
         Rf_warning(MSG__INVALID_UTF8);
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      // do shuffle buf1 at pos 0..k-1: (Fisher-Yates shuffle)
      R_len_t cur_n = k;
      for (j=0; j<cur_n-1; ++j) {
         // rand from i to cur_n-1
         R_len_t r = (R_len_t)floor(unif_rand()*(double)(cur_n-j)+(double)j);
         UChar32 tmp = buf1[r];
         buf1[r] = buf1[j];
         buf1[j] = tmp;
      }

      // create string:
      char* buf2data = buf2.data();
      c = (UChar32)0;
      j = 0;
      k = 0;
      UBool err = FALSE;
      while (!err && k < cur_n) {
         c = buf1[k++];
         U8_APPEND((uint8_t*)buf2data, j, bufsize, c, err);
      }

      if (err) throw StriException(MSG__INTERNAL_ERROR);

      SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf2data, j, CE_UTF8));
   }

   PutRNGstate();
   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END({
      PutRNGstate();
   })
}
/**
 * Convert character vector between marked encodings and the encoding provided
 *
 * @param str     input character vector or list of raw vectors
 * @param to    target encoding, \code{NULL} or \code{""} for default enc
 * @param to_raw single logical, should list of raw vectors be returned?
 * @return a converted character vector or list of raw vectors
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-11-12)
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-28)
 *          use StriUcnv
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-01)
 *          calc required buf size a priori
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_encode_from_marked(SEXP str, SEXP to, SEXP to_raw)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   const char* selected_to   = stri__prepare_arg_enc(to, "to", true); /* this is R_alloc'ed */
   bool to_raw_logical = stri__prepare_arg_logical_1_notNA(to_raw, "to_raw");

   STRI__ERROR_HANDLER_BEGIN(1)
   R_len_t str_n = LENGTH(str);
   StriContainerUTF16 str_cont(str, str_n);

   // get the number of strings to convert; if == 0, then you know what's the result
   if (str_n <= 0) return Rf_allocVector(to_raw_logical?VECSXP:STRSXP, 0);

   // Open converters
   StriUcnv ucnv(selected_to);
   UConverter* uconv_to = ucnv.getConverter(true /*register_callbacks*/);

   // Get target encoding mark
   cetype_t encmark_to = to_raw_logical?CE_BYTES:ucnv.getCE();

   // Prepare out val
   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(to_raw_logical?VECSXP:STRSXP, str_n));

   // calculate required buf size
   R_len_t bufsize = 0;
   for (R_len_t i=0; i<str_n; ++i) {
      if (!str_cont.isNA(i) && str_cont.get(i).length() > bufsize)
         bufsize = str_cont.get(i).length();
   }
   bufsize = UCNV_GET_MAX_BYTES_FOR_STRING(bufsize, ucnv_getMaxCharSize(uconv_to));
   // "The calculated size is guaranteed to be sufficient for this conversion."
   String8buf buf(bufsize);

   for (R_len_t i=0; i<str_n; ++i) {
      if (str_cont.isNA(i)) {
         if (to_raw_logical) SET_VECTOR_ELT(ret, i, R_NilValue);
         else                SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      R_len_t curn_tmp = str_cont.get(i).length();
      const UChar* curs_tmp = str_cont.get(i).getBuffer(); // The buffer content is (probably) not NUL-terminated.
      if (!curs_tmp)
         throw StriException(MSG__INTERNAL_ERROR);

      UErrorCode status = U_ZERO_ERROR;
      ucnv_resetFromUnicode(uconv_to);
      R_len_t bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(),
            curs_tmp, curn_tmp, &status);
      if (bufneed <= buf.size()) {
         STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
      }
      else {// larger buffer needed [this shouldn't happen?]
         buf.resize(bufneed, false/*destroy contents*/);
         status = U_ZERO_ERROR;
         bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(),
               curs_tmp, curn_tmp, &status);
         STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
      }

      if (to_raw_logical) {
         SEXP outobj;
         STRI__PROTECT(outobj = Rf_allocVector(RAWSXP, bufneed));
         memcpy(RAW(outobj), buf.data(), (size_t)bufneed);
         SET_VECTOR_ELT(ret, i, outobj);
         STRI__UNPROTECT(1);
      }
      else {
         SET_STRING_ELT(ret, i,
            Rf_mkCharLenCE(buf.data(), bufneed, encmark_to));
      }
   }

   STRI__UNPROTECT_ALL
   return ret;

   STRI__ERROR_HANDLER_END({/* nothing special on error */})
}
/**
 * Parse date-time objects
 *
 * @param str
 * @param format
 * @param tz
 * @param lenient
 * @param locale
 *
 * @return character vector
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-01-08)
 * @version 0.5-1 (Marek Gagolewski, 2015-01-11) lenient arg added
 * @version 0.5-1 (Marek Gagolewski, 2015-02-22) use tz
 * @version 0.5-1 (Marek Gagolewski, 2015-03-01) set tzone attrib on retval
 */
SEXP stri_datetime_parse(SEXP str, SEXP format, SEXP lenient, SEXP tz, SEXP locale) {
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   const char* locale_val = stri__prepare_arg_locale(locale, "locale", true);
   const char* format_val = stri__prepare_arg_string_1_notNA(format, "format");
   bool lenient_val = stri__prepare_arg_logical_1_notNA(lenient, "lenient");
   if (!isNull(tz)) PROTECT(tz = stri_prepare_arg_string_1(tz, "tz"));
   else             PROTECT(tz); /* needed to set tzone attrib */

   // "format" may be one of:
   const char* format_opts[] = {
      "date_full", "date_long", "date_medium", "date_short",
      "date_relative_full", "date_relative_long", "date_relative_medium", "date_relative_short",
      "time_full", "time_long", "time_medium", "time_short",
      "time_relative_full", "time_relative_long", "time_relative_medium", "time_relative_short",
      "datetime_full", "datetime_long", "datetime_medium", "datetime_short",
      "datetime_relative_full", "datetime_relative_long", "datetime_relative_medium", "datetime_relative_short",
      NULL};
   int format_cur = stri__match_arg(format_val, format_opts);

   TimeZone* tz_val = stri__prepare_arg_timezone(tz, "tz", true/*allowdefault*/);
   Calendar* cal = NULL;
   DateFormat* fmt = NULL;
   STRI__ERROR_HANDLER_BEGIN(2)
   R_len_t vectorize_length = LENGTH(str);
   StriContainerUTF16 str_cont(str, vectorize_length);
   UnicodeString format_str(format_val);

   UErrorCode status = U_ZERO_ERROR;
   if (format_cur >= 0) {
      DateFormat::EStyle style = DateFormat::kNone;
      switch (format_cur % 8) {
         case 0:  style = DateFormat::kFull; break;
         case 1:  style = DateFormat::kLong; break;
         case 2:  style = DateFormat::kMedium; break;
         case 3:  style = DateFormat::kShort; break;
         case 4:  style = DateFormat::kFullRelative; break;
         case 5:  style = DateFormat::kLongRelative; break;
         case 6:  style = DateFormat::kMediumRelative; break;
         case 7:  style = DateFormat::kShortRelative; break;
         default: style = DateFormat::kNone; break;
      }

      /* ICU 54.1: Relative time styles are not currently supported.  */
      switch (format_cur / 8) {
         case 0:
            fmt = DateFormat::createDateInstance(style, Locale::createFromName(locale_val));
            break;

         case 1:
            fmt = DateFormat::createTimeInstance((DateFormat::EStyle)(style & ~DateFormat::kRelative),
               Locale::createFromName(locale_val));
            break;

         case 2:
            fmt = DateFormat::createDateTimeInstance(style,
               (DateFormat::EStyle)(style & ~DateFormat::kRelative),
               Locale::createFromName(locale_val));
            break;

         default:
            fmt = NULL;
            break;

      }
   }
   else
      fmt = new SimpleDateFormat(format_str, Locale::createFromName(locale_val), status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

   status = U_ZERO_ERROR;
   cal = Calendar::createInstance(locale_val, status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

   cal->adoptTimeZone(tz_val);
   tz_val = NULL; /* The Calendar takes ownership of the TimeZone. */

   cal->setLenient(lenient_val);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(REALSXP, vectorize_length));
   for (R_len_t i=0; i<vectorize_length; ++i) {
      if (str_cont.isNA(i)) {
         REAL(ret)[i] = NA_REAL;
         continue;
      }

      status = U_ZERO_ERROR;
      ParsePosition pos;
      fmt->parse(str_cont.get(i), *cal, pos);

      if (pos.getErrorIndex() >= 0)
         REAL(ret)[i] = NA_REAL;
      else {
         status = U_ZERO_ERROR;
         REAL(ret)[i] = ((double)cal->getTime(status))/1000.0;
         if (U_FAILURE(status)) REAL(ret)[i] = NA_REAL;
      }
   }


   if (!isNull(tz)) Rf_setAttrib(ret, Rf_ScalarString(Rf_mkChar("tzone")), tz);
   stri__set_class_POSIXct(ret);
   if (tz_val) { delete tz_val; tz_val = NULL; }
   if (fmt) { delete fmt; fmt = NULL; }
   if (cal) { delete cal; cal = NULL; }
   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END({
      if (tz_val) { delete tz_val; tz_val = NULL; }
      if (fmt) { delete fmt; fmt = NULL; }
      if (cal) { delete cal; cal = NULL; }
   })
}