/** Locate all BreakIterator boundaries
 *
 * @param str character vector
 * @param omit_no_match logical
 * @param opts_brkiter named list
 * @return list
 *
 * @version 0.2-2 (Marek Gagolewski, 2014-04-22)
 *
 * @version 0.2-2 (Marek Gagolewski, 2014-04-23)
 *          removed "title": For Unicode 4.0 and above title boundary
 *          iteration, please use Word Boundary iterator.
 *
 * @version 0.2-2 (Marek Gagolewski, 2014-04-25)
 *          use stri__split_or_locate_boundaries
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-10-29)
 *          use opts_brkiter
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-11-28)
 *          new args: omit_no_match
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-02)
 *          use StriRuleBasedBreakIterator
 */
SEXP stri_locate_all_boundaries(SEXP str, SEXP omit_no_match, SEXP opts_brkiter)
{
   bool omit_no_match1 = stri__prepare_arg_logical_1_notNA(omit_no_match, "omit_no_match");
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   StriBrkIterOptions opts_brkiter2(opts_brkiter, "line_break");

   STRI__ERROR_HANDLER_BEGIN(1)
   R_len_t str_length = LENGTH(str);
   StriContainerUTF8_indexable str_cont(str, str_length);
   StriRuleBasedBreakIterator brkiter(opts_brkiter2);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(VECSXP, str_length));

   for (R_len_t i = 0; i < str_length; ++i)
   {
      if (str_cont.isNA(i)) {
         SET_VECTOR_ELT(ret, i, stri__matrix_NA_INTEGER(1, 2));
         continue;
      }

      brkiter.setupMatcher(str_cont.get(i).c_str(), str_cont.get(i).length());
      brkiter.first();

      deque< pair<R_len_t,R_len_t> > occurrences;
      pair<R_len_t,R_len_t> curpair;
      while (brkiter.next(curpair))
         occurrences.push_back(curpair);

      R_len_t noccurrences = (R_len_t)occurrences.size();
      if (noccurrences <= 0) {
         SET_VECTOR_ELT(ret, i, stri__matrix_NA_INTEGER(omit_no_match1?0:1, 2));
         continue;
      }

      SEXP ans;
      STRI__PROTECT(ans = Rf_allocMatrix(INTSXP, noccurrences, 2));
      int* ans_tab = INTEGER(ans);
      deque< pair<R_len_t, R_len_t> >::iterator iter = occurrences.begin();
      for (R_len_t j = 0; iter != occurrences.end(); ++iter, ++j) {
         pair<R_len_t, R_len_t> cur_match = *iter;
         ans_tab[j]             = cur_match.first;
         ans_tab[j+noccurrences] = cur_match.second;
      }

      // Adjust UChar index -> UChar32 index (1-2 byte UTF16 to 1 byte UTF32-code points)
      str_cont.UTF8_to_UChar32_index(i, ans_tab,
            ans_tab+noccurrences, noccurrences,
            1, // 0-based index -> 1-based
            0  // end returns position of next character after match
      );
      SET_VECTOR_ELT(ret, i, ans);
      STRI__UNPROTECT(1);
   }

   stri__locate_set_dimnames_list(ret);
   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END({ /* nothing special t.b.d. on error */ })
}
/** Convert character vector to UTF-32
 *
 * @param str character vector
 * @return list with integer vectors
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-26)
 *          use vector<UChar32> buf instead of R_alloc;
 *          warn and set NULL on improper UTF-8 byte sequences
 *
 * @version 0.2-3 (Marek Gagolewski, 2014-05-12)
 *          Use UChar32* instead of vector<UChar32> as ::data is C++11
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_enc_toutf32(SEXP str)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   R_len_t n = LENGTH(str);

   STRI__ERROR_HANDLER_BEGIN(1)
   StriContainerUTF8 str_cont(str, n);

   R_len_t bufsize = 1; // to avoid allocating an empty buffer
   for (R_len_t i=0; i<n; ++i) {
      if (str_cont.isNA(i)) continue;
      R_len_t ni = str_cont.get(i).length();
      if (ni > bufsize) bufsize = ni;
   }

   UChar32* buf = (UChar32*)R_alloc((size_t)bufsize, (int)sizeof(UChar32)); // at most bufsize UChars32 (bufsize/4 min.)
   if (!buf) throw StriException(MSG__MEM_ALLOC_ERROR);
   // deque<UChar32> was slower than using a common, over-sized buf

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(VECSXP, n)); // all

   for (R_len_t i=0; i<n; ++i) {

      if (str_cont.isNA(i)) {
         SET_VECTOR_ELT(ret, i, R_NilValue);
         continue;
      }

      UChar32 c = (UChar32)0;
      const char* s = str_cont.get(i).c_str();
      R_len_t sn = str_cont.get(i).length();
      R_len_t j = 0;
      R_len_t k = 0;
      while (c >= 0 && j < sn) {
         U8_NEXT(s, j, sn, c);
         buf[k++] = (int)c;
      }

      if (c < 0) {
         Rf_warning(MSG__INVALID_UTF8);
         SET_VECTOR_ELT(ret, i, R_NilValue);
         continue;
      }
      else {
         SEXP conv;
         STRI__PROTECT(conv = Rf_allocVector(INTSXP, k));
         memcpy(INTEGER(conv), buf, (size_t)sizeof(int)*k);
         SET_VECTOR_ELT(ret, i, conv);
         STRI__UNPROTECT(1);
      }
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END({ /* do nothing on error */ })
}
/** Count the number of BreakIterator boundaries
 *
 * @param str character vector
 * @param opts_brkiter identifier
 * @return character vector
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-10-30)
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-02)
 *          use StriRuleBasedBreakIterator
 */
SEXP stri_count_boundaries(SEXP str, SEXP opts_brkiter)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   StriBrkIterOptions opts_brkiter2(opts_brkiter, "line_break");

   STRI__ERROR_HANDLER_BEGIN(1)
   R_len_t str_length = LENGTH(str);
   StriContainerUTF8_indexable str_cont(str, str_length);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(INTSXP, str_length));
   StriRuleBasedBreakIterator brkiter(opts_brkiter2);

   for (R_len_t i = 0; i < str_length; ++i)
   {
      if (str_cont.isNA(i)) {
         INTEGER(ret)[i] = NA_INTEGER;
         continue;
      }

      brkiter.setupMatcher(str_cont.get(i).c_str(), str_cont.get(i).length());
      brkiter.first();

      R_len_t cur_count = 0;
      while (brkiter.next())
         ++cur_count;

      INTEGER(ret)[i] = cur_count;
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END({ /* no action */  })
}
Пример #4
0
/**
 * Reverse Each String
 * @param str character vector
 * @return character vector with every string reversed
 *
 *
 * @version 0.1-?? (Bartek Tartanus)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          use StriContainerUTF16
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly + StriContainerUTF8 (bug fix, do reversing manually)
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-01)
 *          detect incorrect utf8 byte stream
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_reverse(SEXP str)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));    // prepare string argument

   STRI__ERROR_HANDLER_BEGIN(1)
   R_len_t str_len = LENGTH(str);
   StriContainerUTF8 str_cont(str, str_len); // writable, no recycle

   // STEP 1.
   // Calculate the required buffer length
   R_len_t bufsize = 0;
   for (R_len_t i=0; i<str_len; ++i) {
      if (str_cont.isNA(i))
         continue;

      R_len_t cursize = str_cont.get(i).length();
      if (cursize > bufsize)
         bufsize = cursize;
   }

   // STEP 2.
   // Alloc buffer & result vector
   String8buf buf(bufsize);
   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, str_len));

   for (R_len_t i = str_cont.vectorize_init();
         i != str_cont.vectorize_end();
         i = str_cont.vectorize_next(i))
   {
      if (str_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      R_len_t str_cur_n = str_cont.get(i).length();
      const char* str_cur_s = str_cont.get(i).c_str();

      R_len_t j, k;
      UChar32 chr;
      UBool isError = FALSE;

      for (j=str_cur_n, k=0; !isError && j>0; ) {
         U8_PREV(str_cur_s, 0, j, chr); // go backwards
         if (chr < 0) {
            throw StriException(MSG__INVALID_UTF8);
         }
         U8_APPEND((uint8_t*)buf.data(), k, str_cur_n, chr, isError);
      }

      if (isError)
         throw StriException(MSG__INTERNAL_ERROR);

      SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), str_cur_n, CE_UTF8));
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/**
 * Get all available ICU charsets and their aliases (elems 2,3,...)
 *
 * @return R list object; element name == ICU charset canonical name;
 * elements are character vectors (aliases)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.2-1 (Marek Gagolewski)
 *          use StriUcnv; make StriException-friendly
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_enc_list()
{
   R_len_t c = (R_len_t)ucnv_countAvailable();

   STRI__ERROR_HANDLER_BEGIN(0)
   SEXP ret;
   SEXP names;
   STRI__PROTECT(ret = Rf_allocVector(VECSXP, c));
   STRI__PROTECT(names = Rf_allocVector(STRSXP, c));

   for (R_len_t i=0; i<c; ++i) {
      const char* canonical_name = ucnv_getAvailableName(i);
      if (!canonical_name) {
         SET_STRING_ELT(names, i, NA_STRING);
         continue;
      }

      SET_STRING_ELT(names, i, Rf_mkChar(canonical_name));

      UErrorCode status = U_ZERO_ERROR;
      R_len_t ci = (R_len_t)ucnv_countAliases(canonical_name, &status);
      if (U_FAILURE(status) || ci <= 0)
         SET_VECTOR_ELT(ret, i, Rf_ScalarString(NA_STRING));
      else {
         SEXP aliases;
         STRI__PROTECT(aliases = Rf_allocVector(STRSXP, ci));
         for (R_len_t j=0; j<ci; ++j) {
            status = U_ZERO_ERROR;
            const char* alias = ucnv_getAlias(canonical_name, j, &status);
            if (U_FAILURE(status) || !alias)
               SET_STRING_ELT(aliases, j, NA_STRING);
            else
               SET_STRING_ELT(aliases, j, Rf_mkChar(alias));
         }
         SET_VECTOR_ELT(ret, i, aliases);
         STRI__UNPROTECT(1);
      }
   }

   Rf_setAttrib(ret, R_NamesSymbol, names);
   STRI__UNPROTECT_ALL
   return ret;

   STRI__ERROR_HANDLER_END({/* no special action on error */})
}
/**
 * Detect if a character class occurs in a string
 *
 * @param str character vector
 * @param pattern character vector
 * @param omit_na single logical value
 * @return logical vector
 *
 * @version 0.3-1 (Bartek Tartanus, 2014-07-25)
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-10-17)
 *                using std::vector<int> to avoid mem-leaks
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-04)
 *    FR #122: omit_na arg added
 *
 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
 *    FR #216: `negate` arg added
 */
SEXP stri_subset_charclass(SEXP str, SEXP pattern, SEXP omit_na, SEXP negate)
{
   bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate");
   bool omit_na1 = stri__prepare_arg_logical_1_notNA(omit_na, "omit_na");
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));
   R_len_t vectorize_length =
      stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

   STRI__ERROR_HANDLER_BEGIN(2)
   StriContainerUTF8 str_cont(str, vectorize_length);
   StriContainerCharClass pattern_cont(pattern, vectorize_length);

   // BT: this cannot be done with deque, because pattern is reused so i does not
   // go like 0,1,2...n but 0,pat_len,2*pat_len,1,pat_len+1 and so on
   // MG: agreed
   std::vector<int> which(vectorize_length);
   int result_counter = 0;

   for (R_len_t i = pattern_cont.vectorize_init();
         i != pattern_cont.vectorize_end();
         i = pattern_cont.vectorize_next(i))
   {
      if (str_cont.isNA(i) || pattern_cont.isNA(i)) {
         if (omit_na1) which[i] = FALSE;
         else {
            which[i] = NA_LOGICAL;
            result_counter++;
         }
         continue;
      }

      const UnicodeSet* pattern_cur = &pattern_cont.get(i);
      R_len_t     str_cur_n = str_cont.get(i).length();
      const char* str_cur_s = str_cont.get(i).c_str();

      UChar32 chr = 0;
      which[i] = FALSE;
      for (R_len_t j=0; j<str_cur_n; ) {
         U8_NEXT(str_cur_s, j, str_cur_n, chr);
         if (chr < 0) // invalid utf-8 sequence
            throw StriException(MSG__INVALID_UTF8);
         if (pattern_cur->contains(chr)) {
            which[i] = TRUE;
            break;
         }
      }
      if (negate_1) which[i] = !which[i];
      if (which[i]) result_counter++;
   }

   SEXP ret;
   STRI__PROTECT(ret = stri__subset_by_logical(str_cont, which, result_counter));
   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/**
 * Substitutes vector elements if a pattern occurs in a string
 *
 * @param str character vector
 * @param pattern character vector
 * @param value character vector
 * @return character vector
 *
 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
 *   FR#124
 *
 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
 *    FR #216: `negate` arg added
 */
SEXP stri_subset_charclass_replacement(SEXP str, SEXP pattern, SEXP negate, SEXP value)
{
   bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate");
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   PROTECT(pattern = stri_prepare_arg_string_1(pattern, "pattern"));
   PROTECT(value = stri_prepare_arg_string(value, "value"));

   int vectorize_length = LENGTH(str);
   int value_length = LENGTH(value);
   if (value_length == 0)
      Rf_error(MSG__REPLACEMENT_ZERO);

   STRI__ERROR_HANDLER_BEGIN(3)
   StriContainerUTF8 str_cont(str, vectorize_length);
   StriContainerUTF8 value_cont(value, value_length);
   StriContainerCharClass pattern_cont(pattern, vectorize_length);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length));

   R_len_t k = 0;
   for (R_len_t i = str_cont.vectorize_init();
         i != str_cont.vectorize_end();
         i = str_cont.vectorize_next(i))
   {
      if (str_cont.isNA(i) || pattern_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      const UnicodeSet* pattern_cur = &pattern_cont.get(i);
      R_len_t     str_cur_n = str_cont.get(i).length();
      const char* str_cur_s = str_cont.get(i).c_str();

      UChar32 chr = 0;
      bool found = false;
      for (R_len_t j=0; j<str_cur_n; ) {
         U8_NEXT(str_cur_s, j, str_cur_n, chr);
         if (chr < 0) // invalid utf-8 sequence
            throw StriException(MSG__INVALID_UTF8);
         if (pattern_cur->contains(chr)) {
            found = true;
            break;
         }
      }

      if ((found && !negate_1) || (!found && negate_1))
         SET_STRING_ELT(ret, i, value_cont.toR((k++)%value_length));
      else
         SET_STRING_ELT(ret, i, str_cont.toR(i));
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
Пример #8
0
/**
 * Detect if a pattern occurs in a string
 *
 * @param str R character vector
 * @param pattern R character vector containing regular expressions
 * @param negate single bool
 * @param max_count single int
 * @param opts_regex list
 *
 * @version 0.1-?? (Marcin Bujarski)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          use StriContainerUTF16
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          use StriContainerUTF16's vectorization
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-18)
 *          use StriContainerRegexPattern + opts_regex
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-05)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 1.0-2 (Marek Gagolewski, 2016-01-29)
 *    Issue #214: allow a regex pattern like `.*`  to match an empty string
 *
 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
 *    FR #216: `negate` arg added
 *
 * @version 1.3.1 (Marek Gagolewski, 2019-02-08)
 *    #232: `max_count` arg added
 */
SEXP stri_detect_regex(SEXP str, SEXP pattern, SEXP negate,
    SEXP max_count, SEXP opts_regex)
{
   bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate");
   int max_count_1 = stri__prepare_arg_integer_1_notNA(max_count, "max_count");
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));
   R_len_t vectorize_length =
      stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

   uint32_t pattern_flags = StriContainerRegexPattern::getRegexFlags(opts_regex);

   STRI__ERROR_HANDLER_BEGIN(2)
   StriContainerUTF16 str_cont(str, vectorize_length);
//   StriContainerUTF8 str_cont(str, vectorize_length); // utext_openUTF8, see below
   StriContainerRegexPattern pattern_cont(pattern, vectorize_length, pattern_flags);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length));
   int* ret_tab = LOGICAL(ret);

   for (R_len_t i = pattern_cont.vectorize_init();
         i != pattern_cont.vectorize_end();
         i = pattern_cont.vectorize_next(i))
   {
      if (max_count_1 == 0) {
          ret_tab[i] = NA_LOGICAL;
          continue;
      }

      STRI__CONTINUE_ON_EMPTY_OR_NA_PATTERN(str_cont,
         pattern_cont, ret_tab[i] = NA_LOGICAL)

      RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically
      matcher->reset(str_cont.get(i));
      ret_tab[i] = (int)matcher->find(); // returns UBool
      if (negate_1) ret_tab[i] = !ret_tab[i];
      if (max_count_1 > 0 && ret_tab[i]) --max_count_1;

//      // mbmark-regex-detect1.R: UTF16 0.07171792 s; UText 0.10531605 s
//      UText* str_text = NULL;
//      UErrorCode status = U_ZERO_ERROR;
//      RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically
//      str_text = utext_openUTF8(str_text, str_cont.get(i).c_str(), str_cont.get(i).length(), &status);
//      STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
//      matcher->reset(str_text);
//      ret_tab[i] = (int)matcher->find(); // returns UBool
//      utext_close(str_text);
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/** Convert from UTF-32
 *
 * @param vec integer vector or list with integer vectors
 * @return character vector
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-25)
 *          StriException friently;
 *          use StriContainerListInt
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_enc_fromutf32(SEXP vec)
{
   PROTECT(vec = stri_prepare_arg_list_integer(vec, "vec"));

   STRI__ERROR_HANDLER_BEGIN(1)
   StriContainerListInt vec_cont(vec);
   R_len_t vec_n = vec_cont.get_n();

   // get required buf size
   R_len_t bufsize = 0;
   for (R_len_t i=0; i<vec_n; ++i) {
      if (!vec_cont.isNA(i) && vec_cont.get(i).size() > bufsize)
         bufsize = vec_cont.get(i).size();
   }
   bufsize = U8_MAX_LENGTH*bufsize+1; // this will surely be sufficient
   String8buf buf(bufsize);
   char* bufdata = buf.data();

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, vec_n));

   for (R_len_t i=0; i<vec_n; ++i) {
      if (vec_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      const int* cur_data = vec_cont.get(i).data();
      R_len_t    cur_n    = vec_cont.get(i).size();
      UChar32 c = (UChar32)0;
      R_len_t j = 0;
      R_len_t k = 0;
      UBool err = FALSE;
      while (!err && k < cur_n) {
         c = cur_data[k++];
         U8_APPEND((uint8_t*)bufdata, j, bufsize, c, err);

         // Rf_mkCharLenCE detects embedded nuls, but stops execution completely
         if (c == 0) err = TRUE;
      }

      if (err) {
         Rf_warning(MSG__INVALID_CODE_POINT, (int)c);
         SET_STRING_ELT(ret, i, NA_STRING);
      }
      else
         SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, j, CE_UTF8));
   }

   STRI__UNPROTECT_ALL;
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
Пример #10
0
/**
 * Compare elements in 2 character vectors, with collation
 *
 * @param e1 character vector
 * @param e2 character vector
 * @param opts_collator passed to stri__ucol_open()
 * @param type [internal] vector of length 2,
 * type[0]: 0 for ==, -1 for < and 1 for >,
 * type[1]: 0 or 1 (whether to negate the results)
 *
 * @return logical vector
 *
 * @version 0.2-1  (Marek Gagolewski, 2014-03-19)
 *
 * @version 0.2-3 (Marek Gagolewski, 2014-05-07)
 *          opts_collator == NA no longer allowed
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_cmp_logical(SEXP e1, SEXP e2, SEXP opts_collator, SEXP type)
{
   // we'll perform a collator-based cmp
   // type is an internal arg, check manually, error() allowed here
   if (!Rf_isInteger(type) || LENGTH(type) != 2)
      Rf_error(MSG__INCORRECT_INTERNAL_ARG);
   int _type = INTEGER(type)[0];
   int _negate = INTEGER(type)[1];
   if (_type > 1 || _type < -1 || _negate < 0 || _negate > 1)
      Rf_error(MSG__INCORRECT_INTERNAL_ARG);

   PROTECT(e1 = stri_prepare_arg_string(e1, "e1")); // prepare string argument
   PROTECT(e2 = stri_prepare_arg_string(e2, "e2")); // prepare string argument

   // call stri__ucol_open after prepare_arg:
   // if prepare_arg had failed, we would have a mem leak
   UCollator* col = NULL;
   col = stri__ucol_open(opts_collator);

   STRI__ERROR_HANDLER_BEGIN(2)

   R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(e1), LENGTH(e2));

   StriContainerUTF8 e1_cont(e1, vectorize_length);
   StriContainerUTF8 e2_cont(e2, vectorize_length);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length));
   int* ret_tab = LOGICAL(ret);

   for (R_len_t i = 0; i < vectorize_length; ++i)
   {
      if (e1_cont.isNA(i) || e2_cont.isNA(i)) {
         ret_tab[i] = NA_LOGICAL;
         continue;
      }

      R_len_t     cur1_n = e1_cont.get(i).length();
      const char* cur1_s = e1_cont.get(i).c_str();
      R_len_t     cur2_n = e2_cont.get(i).length();
      const char* cur2_s = e2_cont.get(i).c_str();

      // with collation
      UErrorCode status = U_ZERO_ERROR;
      ret_tab[i] = (_type == (int)ucol_strcollUTF8(col,
         cur1_s, cur1_n, cur2_s, cur2_n, &status
      ));
      STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

      if (_negate)
         ret_tab[i] = !ret_tab[i];
   }
Пример #11
0
/**
 * Detect if a character class occurs in a string
 *
 * @param str character vector
 * @param pattern character vector
 * @param negate single bool
 * @param max_count single int
 * @return logical vector
 *
 * @version 0.1-?? (Bartek Tartanus)
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-02)
 *          Use StrContainerUTF8 and CharClass classes
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-15)
 *          Use StrContainerCharClass
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-03)
 *          detects invalid UTF-8 byte stream
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-05)
 *          StriContainerCharClass now relies on UnicodeSet
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
 *    FR #216: `negate` arg added
 *
 * @version 1.3.1 (Marek Gagolewski, 2019-02-08)
 *    #232: `max_count` arg added
 */
SEXP stri_detect_charclass(SEXP str, SEXP pattern,
    SEXP negate, SEXP max_count)
{
   bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate");
   int max_count_1 = stri__prepare_arg_integer_1_notNA(max_count, "max_count");
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));
   R_len_t vectorize_length =
      stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

   STRI__ERROR_HANDLER_BEGIN(2)
   StriContainerUTF8 str_cont(str, vectorize_length);
   StriContainerCharClass pattern_cont(pattern, vectorize_length);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length));
   int* ret_tab = LOGICAL(ret);

   for (R_len_t i = pattern_cont.vectorize_init();
         i != pattern_cont.vectorize_end();
         i = pattern_cont.vectorize_next(i))
   {
      if (max_count_1 == 0 || str_cont.isNA(i) || pattern_cont.isNA(i)) {
         ret_tab[i] = NA_LOGICAL;
         continue;
      }

      const UnicodeSet* pattern_cur = &pattern_cont.get(i);
      R_len_t     str_cur_n = str_cont.get(i).length();
      const char* str_cur_s = str_cont.get(i).c_str();

      UChar32 chr = 0;
      ret_tab[i] = FALSE;
      for (R_len_t j=0; j<str_cur_n; ) {
         U8_NEXT(str_cur_s, j, str_cur_n, chr);
         if (chr < 0) // invalid UTF-8 sequence
            throw StriException(MSG__INVALID_UTF8);
         if (pattern_cur->contains(chr)) {
            ret_tab[i] = TRUE;
            break;
         }
      }
      if (negate_1) ret_tab[i] = !ret_tab[i];
      if (max_count_1 > 0 && ret_tab[i]) --max_count_1;
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
Пример #12
0
/**
 * Compare elements in 2 character vectors, without collation
 *
 * @param e1 character vector
 * @param e2 character vector
 * @param type [internal] integer; 0 or 1 (whether to negate the results)
 *
 * @return logical vector
 *
 * @version 0.2-3 (Marek Gagolewski, 2014-05-07)
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_cmp_codepoints(SEXP e1, SEXP e2, SEXP type)
{
   // type is an internal arg, check manually, error() allowed here
   if (!Rf_isInteger(type) || LENGTH(type) != 1)
      Rf_error(MSG__INCORRECT_INTERNAL_ARG);
   int _negate = INTEGER(type)[0];
   if (_negate < 0 || _negate > 1)
      Rf_error(MSG__INCORRECT_INTERNAL_ARG);

   PROTECT(e1 = stri_prepare_arg_string(e1, "e1")); // prepare string argument
   PROTECT(e2 = stri_prepare_arg_string(e2, "e2")); // prepare string argument

   STRI__ERROR_HANDLER_BEGIN(2)

   R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(e1), LENGTH(e2));

   StriContainerUTF8 e1_cont(e1, vectorize_length);
   StriContainerUTF8 e2_cont(e2, vectorize_length);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length));
   int* ret_tab = LOGICAL(ret);

   for (R_len_t i = 0; i < vectorize_length; ++i)
   {
      if (e1_cont.isNA(i) || e2_cont.isNA(i)) {
         ret_tab[i] = NA_LOGICAL;
         continue;
      }

      R_len_t     cur1_n = e1_cont.get(i).length();
      const char* cur1_s = e1_cont.get(i).c_str();
      R_len_t     cur2_n = e2_cont.get(i).length();
      const char* cur2_s = e2_cont.get(i).c_str();

      if (cur1_n != cur2_n) // different number of bytes => not equal
         ret_tab[i] = FALSE;
      else
         ret_tab[i] = (memcmp(cur1_s, cur2_s, cur1_n) == 0);

      if (_negate)
         ret_tab[i] = !ret_tab[i];
   }

   STRI__UNPROTECT_ALL
   return ret;

   STRI__ERROR_HANDLER_END({/* no-op on err */})
}
/**
 * Locate first or last boundaries
 *
 * @param str character vector
 * @param opts_brkiter list
 * @param first looking for first or last match?
 * @return integer matrix (2 columns)
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-05)
 */
SEXP stri__locate_firstlast_boundaries(SEXP str, SEXP opts_brkiter, bool first)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   StriBrkIterOptions opts_brkiter2(opts_brkiter, "line_break");

   STRI__ERROR_HANDLER_BEGIN(1)
   R_len_t str_length = LENGTH(str);
   StriContainerUTF8_indexable str_cont(str, str_length);
   StriRuleBasedBreakIterator brkiter(opts_brkiter2);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocMatrix(INTSXP, str_length, 2));
   stri__locate_set_dimnames_matrix(ret);
   int* ret_tab = INTEGER(ret);

   for (R_len_t i = 0; i < str_length; ++i)
   {
      ret_tab[i]            = NA_INTEGER;
      ret_tab[i+str_length] = NA_INTEGER;

      if (str_cont.isNA(i) || str_cont.get(i).length() == 0) continue;

      brkiter.setupMatcher(str_cont.get(i).c_str(), str_cont.get(i).length());
      pair<R_len_t,R_len_t> curpair;

      if (first) {
         brkiter.first();
         if (!brkiter.next(curpair)) continue;
      }
      else {
         brkiter.last();
         if (!brkiter.previous(curpair)) continue;
      }

      ret_tab[i]            = curpair.first;
      ret_tab[i+str_length] = curpair.second;

      // Adjust UTF8 byte index -> UChar32 index
      str_cont.UTF8_to_UChar32_index(i,
            ret_tab+i, ret_tab+i+str_length, 1,
            1, // 0-based index -> 1-based
            0  // end returns position of next character after match
      );
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END( ;/* do nothing special on error */ )
}
Пример #14
0
/**
 * Determine which strings are of length 0
 *
 * Note that ICU permits only strings of length < 2^31.
 *
 * @param s R character vector
 * @return logical vector
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-01)
 *          StriException-friendly
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_isempty(SEXP str)
{
    PROTECT(str = stri_prepare_arg_string(str, "str")); // prepare string argument
    R_len_t str_n = LENGTH(str);

    STRI__ERROR_HANDLER_BEGIN(1)
    SEXP ret;
    STRI__PROTECT(ret = Rf_allocVector(LGLSXP, str_n));
    int* retlog = LOGICAL(ret);
    for (R_len_t i=0; i<str_n; ++i) {
        SEXP curs = STRING_ELT(str, i);
        /* INPUT ENCODING CHECK: this function does not need this. */
        retlog[i] = (curs == NA_STRING)?NA_LOGICAL:(LENGTH(curs) <= 0);
    }
    STRI__UNPROTECT_ALL
    return ret;
    STRI__ERROR_HANDLER_END({ /* no special action on error */ })
}
Пример #15
0
/**
 * Get number of bytes in each string
 *
 * Note that ICU permits only strings of length < 2^31.
 *
 * @param s R object coercible to a character vector
 * @return integer vector
 *
 * @version 0.1-?? (Marcin Bujarski)
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-01)
 *          StriException-friendly
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_numbytes(SEXP str)
{
    PROTECT(str = stri_prepare_arg_string(str, "str")); // prepare string argument
    R_len_t str_n = LENGTH(str);

    STRI__ERROR_HANDLER_BEGIN(1)
    SEXP ret;
    STRI__PROTECT(ret = Rf_allocVector(INTSXP, str_n));
    int* retint = LOGICAL(ret);
    for (R_len_t i=0; i<str_n; ++i) {
        SEXP curs = STRING_ELT(str, i);
        /* INPUT ENCODING CHECK: this function does not need this. */
        retint[i] = (curs == NA_STRING)?NA_INTEGER:LENGTH(curs); // O(1) - stored by R
    }
    STRI__UNPROTECT_ALL
    return ret;
    STRI__ERROR_HANDLER_END({ /* no special action on error */ })
}
/**
 * Count pattern occurcess in a string [with collation]
 *
 * @param str character vector
 * @param pattern character vector
 * @param opts_collator passed to stri__ucol_open()
 * @return integer vector
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          corrected behavior on empty str/pattern
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-23)
 *          make StriException-friendly,
 *          use StriContainerUStringSearch
 *
 * @version 0.2-3 (Marek Gagolewski, 2014-05-08)
 *          new fun: stri_count_coll (opts_collator == NA not allowed)
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_count_coll(SEXP str, SEXP pattern, SEXP opts_collator)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));

   // call stri__ucol_open after prepare_arg:
   // if prepare_arg had failed, we would have a mem leak
   UCollator* collator = NULL;
   collator = stri__ucol_open(opts_collator);

   STRI__ERROR_HANDLER_BEGIN(2)
   R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));
   StriContainerUTF16 str_cont(str, vectorize_length);
   StriContainerUStringSearch pattern_cont(pattern, vectorize_length, collator);  // collator is not owned by pattern_cont

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(INTSXP, vectorize_length));
   int* ret_tab = INTEGER(ret);

   for (R_len_t i = pattern_cont.vectorize_init();
         i != pattern_cont.vectorize_end();
         i = pattern_cont.vectorize_next(i))
   {
      STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont,
         ret_tab[i] = NA_INTEGER,
         ret_tab[i] = 0)

      UStringSearch *matcher = pattern_cont.getMatcher(i, str_cont.get(i));
      usearch_reset(matcher);
      UErrorCode status = U_ZERO_ERROR;
      R_len_t found = 0;
      while (!U_FAILURE(status) && ((int)usearch_next(matcher, &status) != USEARCH_DONE))
         ++found;
      STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
      ret_tab[i] = found;
   }

   if (collator) { ucol_close(collator); collator=NULL; }
   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(
      if (collator) ucol_close(collator);
   )
}
/** Get Declared Encodings of Each String
 *
 * @param str a character vector or an object coercible to
 * @return a character vector
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-25)
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_enc_mark(SEXP str) {
   PROTECT(str = stri_prepare_arg_string(str, "str"));    // prepare string argument

   STRI__ERROR_HANDLER_BEGIN(1)
   R_len_t str_len = LENGTH(str);

   // some of them will not be used in this call, but we're lazy
   SEXP mark_ascii, mark_latin1, mark_utf8, mark_native, mark_bytes;
   STRI__PROTECT(mark_ascii  = Rf_mkChar("ASCII"));
   STRI__PROTECT(mark_latin1 = Rf_mkChar("latin1"));
   STRI__PROTECT(mark_utf8   = Rf_mkChar("UTF-8"));
   STRI__PROTECT(mark_native = Rf_mkChar("native"));
   STRI__PROTECT(mark_bytes  = Rf_mkChar("bytes"));

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, str_len));

   for (R_len_t i=0; i<str_len; ++i) {
      SEXP curs = STRING_ELT(str, i);
      if (curs == NA_STRING) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      if (IS_ASCII(curs))
         SET_STRING_ELT(ret, i, mark_ascii);
      else if (IS_UTF8(curs))
         SET_STRING_ELT(ret, i, mark_utf8);
      else if (IS_BYTES(curs))
         SET_STRING_ELT(ret, i, mark_bytes);
      else if (IS_LATIN1(curs))
         SET_STRING_ELT(ret, i, mark_latin1);
      else
         SET_STRING_ELT(ret, i, mark_native);
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
Пример #18
0
/** List available time zone IDs
 *
 * @param offset single numeric
 * @param region single string
 * @return character vector
 *
 * @version 0.5-1 (Marek Gagolewski, 2014-12-24)
 */
SEXP stri_timezone_list(SEXP region, SEXP offset)
{
   StringEnumeration* tz_enum = NULL;
   PROTECT(region = stri_prepare_arg_string_1(region, "region"));
   PROTECT(offset = stri_prepare_arg_double_1(offset, "offset"));

   STRI__ERROR_HANDLER_BEGIN(2)
   StriContainerUTF8 region_cont(region, 1);

   UErrorCode status = U_ZERO_ERROR;

   int32_t offset_hours = 0;
   const int32_t* o = NULL;
   const char* r = NULL;

   if (!ISNA(REAL(offset)[0])) {
      // 0.5 and 0.75 are represented exactly within the double type
      offset_hours = (int32_t)(REAL(offset)[0]*1000.0*3600.0);
      o = &offset_hours;
   }

   if (!region_cont.isNA(0))
      r = region_cont.get(0).c_str();

   tz_enum = TimeZone::createTimeZoneIDEnumeration(UCAL_ZONE_TYPE_ANY, r, o, status);
   STRI__CHECKICUSTATUS_RFERROR(status, {/* do nothing special on err */})

   status = U_ZERO_ERROR;
   tz_enum->reset(status);
   STRI__CHECKICUSTATUS_RFERROR(status, {/* do nothing special on err */})

   status = U_ZERO_ERROR;
   R_len_t n = (R_len_t)tz_enum->count(status);
   STRI__CHECKICUSTATUS_RFERROR(status, {/* do nothing special on err */})

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, n));

//   SEXP nam;
//   STRI__PROTECT(nam = Rf_allocVector(STRSXP, n));

   // MG: I reckon that IDs are more readable than DisplayNames (which are moreover localized)
   for (R_len_t i=0; i<n; ++i) {
      int len;
      status = U_ZERO_ERROR;
      const char* cur = tz_enum->next(&len, status);
      STRI__CHECKICUSTATUS_RFERROR(status, {/* do nothing special on err */})
      SET_STRING_ELT(ret, i, Rf_mkCharLenCE(cur, len, CE_UTF8));

//      TimeZone* curtz = TimeZone::createTimeZone(UnicodeString::fromUTF8(cur));
//      UnicodeString curdn;
//      curtz->getDisplayName(locale, curdn);
//      delete curtz;
//      string out;
//      curdn.toUTF8String(out);
//      SET_STRING_ELT(nam, i, Rf_mkCharCE(out.c_str(), CE_UTF8));
   }

//   Rf_setAttrib(ret, R_NamesSymbol, nam);

   if (tz_enum) { delete tz_enum; tz_enum = NULL; }
   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(
      if (tz_enum) { delete tz_enum; tz_enum = NULL; }
   )
}
/**
 * Convert character vector between marked encodings and the encoding provided
 *
 * @param str     input character vector or list of raw vectors
 * @param to    target encoding, \code{NULL} or \code{""} for default enc
 * @param to_raw single logical, should list of raw vectors be returned?
 * @return a converted character vector or list of raw vectors
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-11-12)
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-28)
 *          use StriUcnv
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-01)
 *          calc required buf size a priori
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_encode_from_marked(SEXP str, SEXP to, SEXP to_raw)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   const char* selected_to   = stri__prepare_arg_enc(to, "to", true); /* this is R_alloc'ed */
   bool to_raw_logical = stri__prepare_arg_logical_1_notNA(to_raw, "to_raw");

   STRI__ERROR_HANDLER_BEGIN(1)
   R_len_t str_n = LENGTH(str);
   StriContainerUTF16 str_cont(str, str_n);

   // get the number of strings to convert; if == 0, then you know what's the result
   if (str_n <= 0) return Rf_allocVector(to_raw_logical?VECSXP:STRSXP, 0);

   // Open converters
   StriUcnv ucnv(selected_to);
   UConverter* uconv_to = ucnv.getConverter(true /*register_callbacks*/);

   // Get target encoding mark
   cetype_t encmark_to = to_raw_logical?CE_BYTES:ucnv.getCE();

   // Prepare out val
   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(to_raw_logical?VECSXP:STRSXP, str_n));

   // calculate required buf size
   R_len_t bufsize = 0;
   for (R_len_t i=0; i<str_n; ++i) {
      if (!str_cont.isNA(i) && str_cont.get(i).length() > bufsize)
         bufsize = str_cont.get(i).length();
   }
   bufsize = UCNV_GET_MAX_BYTES_FOR_STRING(bufsize, ucnv_getMaxCharSize(uconv_to));
   // "The calculated size is guaranteed to be sufficient for this conversion."
   String8buf buf(bufsize);

   for (R_len_t i=0; i<str_n; ++i) {
      if (str_cont.isNA(i)) {
         if (to_raw_logical) SET_VECTOR_ELT(ret, i, R_NilValue);
         else                SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      R_len_t curn_tmp = str_cont.get(i).length();
      const UChar* curs_tmp = str_cont.get(i).getBuffer(); // The buffer content is (probably) not NUL-terminated.
      if (!curs_tmp)
         throw StriException(MSG__INTERNAL_ERROR);

      UErrorCode status = U_ZERO_ERROR;
      ucnv_resetFromUnicode(uconv_to);
      R_len_t bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(),
            curs_tmp, curn_tmp, &status);
      if (bufneed <= buf.size()) {
         STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
      }
      else {// larger buffer needed [this shouldn't happen?]
         buf.resize(bufneed, false/*destroy contents*/);
         status = U_ZERO_ERROR;
         bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(),
               curs_tmp, curn_tmp, &status);
         STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
      }

      if (to_raw_logical) {
         SEXP outobj;
         STRI__PROTECT(outobj = Rf_allocVector(RAWSXP, bufneed));
         memcpy(RAW(outobj), buf.data(), (size_t)bufneed);
         SET_VECTOR_ELT(ret, i, outobj);
         STRI__UNPROTECT(1);
      }
      else {
         SET_STRING_ELT(ret, i,
            Rf_mkCharLenCE(buf.data(), bufneed, encmark_to));
      }
   }

   STRI__UNPROTECT_ALL
   return ret;

   STRI__ERROR_HANDLER_END({/* nothing special on error */})
}
/** Convert character vector to ASCII
 *
 * All charcodes > 127 are replaced with subst chars (0x1A)
 *
 * @param str character vector
 * @return character vector
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-30)
 *          use single common buf;
 *          warn on invalid utf8 byte stream
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_enc_toascii(SEXP str)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   R_len_t n = LENGTH(str);

   STRI__ERROR_HANDLER_BEGIN(1)

   // get buf size
   R_len_t bufsize = 0;
   for (R_len_t i=0; i<n; ++i) {
      SEXP curs = STRING_ELT(str, i);
      if (curs == NA_STRING)
         continue;

      R_len_t ni = LENGTH(curs);
      if (ni > bufsize) bufsize = ni;
   }
   String8buf buf(bufsize); // no more bytes than this needed
   char* bufdata = buf.data();

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, n));
   for (R_len_t i=0; i<n; ++i) {
      SEXP curs = STRING_ELT(str, i);
      if (curs == NA_STRING || IS_ASCII(curs)) {
         // nothing to do
         SET_STRING_ELT(ret, i, curs);
         continue;
      }

      R_len_t curn = LENGTH(curs);
      const char* curs_tab = CHAR(curs);

      if (IS_UTF8(curs)) {
         R_len_t k = 0, j = 0;
         UChar32 c;
         while (j<curn) {
            U8_NEXT(curs_tab, j, curn, c);
            if (c < 0) {
               Rf_warning(MSG__INVALID_CODE_POINT_FIXING);
               bufdata[k++] = ASCII_SUBSTITUTE;
            }
            else if (c > ASCII_MAXCHARCODE)
               bufdata[k++] = ASCII_SUBSTITUTE;
            else
               bufdata[k++] = (char)c;
         }
         SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, k, CE_UTF8));
         // the string will be marked as ASCII anyway by mkCharLenCE
      }
      else { // some 8-bit encoding
         R_len_t k = 0;
         for (R_len_t j=0; j<curn; ++j) {
            if (U8_IS_SINGLE(curs_tab[j]))
               bufdata[k++] = curs_tab[j];
            else {
               bufdata[k++] = (char)ASCII_SUBSTITUTE; // subst char in ascii
            }
         }
         SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, k, CE_UTF8));
         // the string will be marked as ASCII anyway by mkCharLenCE
      }
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/** Convert character vector to UTF-8
 *
 * @param str character vector
 * @param is_unknown_8bit single logical value;
 * if TRUE, then in case of ENC_NATIVE or ENC_LATIN1, UTF-8
 * REPLACEMENT CHARACTERs (U+FFFD) are
 * put for codes > 127
 * @param validate single logical value (or NA)
 *
 * @return character vector
 *
 * @version 0.1-XX (Marek Gagolewski)
 *
 * @version 0.1-XX (Marek Gagolewski, 2013-06-16)
 *                  make StriException-friendly
 *
 * @version 0.2-1  (Marek Gagolewski, 2014-03-26)
 *                 Use one String8buf;
 *                 is_unknown_8bit_logical and UTF-8 tries now to remove BOMs
 *
 * @version 0.2-1  (Marek Gagolewksi, 2014-03-30)
 *                 added validate arg
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_enc_toutf8(SEXP str, SEXP is_unknown_8bit, SEXP validate)
{
   PROTECT(validate = stri_prepare_arg_logical_1(validate, "validate"));
   bool is_unknown_8bit_logical =
      stri__prepare_arg_logical_1_notNA(is_unknown_8bit, "is_unknown_8bit");
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   R_len_t n = LENGTH(str);

   STRI__ERROR_HANDLER_BEGIN(2)
   SEXP ret;
   if (!is_unknown_8bit_logical) {
      // Trivial - everything we need is in StriContainerUTF8 :)
      // which removes BOMs silently
      StriContainerUTF8 str_cont(str, n);
      STRI__PROTECT(ret = str_cont.toR());
   }
   else {
      // get buf size
      R_len_t bufsize = 0;
      for (R_len_t i=0; i<n; ++i) {
         SEXP curs = STRING_ELT(str, i);
         if (curs == NA_STRING || IS_ASCII(curs) || IS_UTF8(curs))
            continue;

         R_len_t ni = LENGTH(curs);
         if (ni > bufsize) bufsize = ni;
      }
      String8buf buf(bufsize*3); // either 1 byte < 127 or U+FFFD == 3 bytes UTF-8
      char* bufdata = buf.data();

      STRI__PROTECT(ret = Rf_allocVector(STRSXP, n));
      for (R_len_t i=0; i<n; ++i) {
         SEXP curs = STRING_ELT(str, i);
         if (curs == NA_STRING) {
            SET_STRING_ELT(ret, i, NA_STRING);
            continue;
         }

         if (IS_ASCII(curs) || IS_UTF8(curs)) {
            R_len_t curs_n = LENGTH(curs);
            const char* curs_s = CHAR(curs);
            if (curs_n >= 3 &&
               (uint8_t)(curs_s[0]) == UTF8_BOM_BYTE1 &&
               (uint8_t)(curs_s[1]) == UTF8_BOM_BYTE2 &&
               (uint8_t)(curs_s[2]) == UTF8_BOM_BYTE3) {
               // has BOM - get rid of it
               SET_STRING_ELT(ret, i, Rf_mkCharLenCE(curs_s+3, curs_n-3, CE_UTF8));
            }
            else
               SET_STRING_ELT(ret, i, curs);

            continue;
         }

         // otherwise, we have an 8-bit encoding
         R_len_t curn = LENGTH(curs);
         const char* curs_tab = CHAR(curs);
         R_len_t k = 0;
         for (R_len_t j=0; j<curn; ++j) {
            if (U8_IS_SINGLE(curs_tab[j]))
               bufdata[k++] = curs_tab[j];
            else { // 0xEF 0xBF 0xBD
               bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE1;
               bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE2;
               bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE3;
            }
         }
         SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, k, CE_UTF8));
      }

   }

   // validate utf8 byte stream
   if (LOGICAL(validate)[0] != FALSE) { // NA or TRUE
      R_len_t ret_n = LENGTH(ret);
      for (R_len_t i=0; i<ret_n; ++i) {
         SEXP curs = STRING_ELT(ret, i);
         if (curs == NA_STRING) continue;

         const char* s = CHAR(curs);
         R_len_t sn = LENGTH(curs);
         R_len_t j = 0;
         UChar32 c = 0;
         while (c >= 0 && j < sn) {
            U8_NEXT(s, j, sn, c);
         }

         if (c >= 0) continue; // valid, nothing to do

         if (LOGICAL(validate)[0] == NA_LOGICAL) {
            Rf_warning(MSG__INVALID_CODE_POINT_REPLNA);
            SET_STRING_ELT(ret, i, NA_STRING);
         }
         else {
            int bufsize = sn*3; // maximum: 1 byte -> U+FFFD (3 bytes)
            String8buf buf(bufsize); // maximum: 1 byte -> U+FFFD (3 bytes)
            char* bufdata = buf.data();

            j = 0;
            R_len_t k = 0;
            UBool err = FALSE;
            while (!err && j < sn) {
               U8_NEXT(s, j, sn, c);
               if (c >= 0) {
                  U8_APPEND((uint8_t*)bufdata, k, bufsize, c, err);
               } else {
                  Rf_warning(MSG__INVALID_CODE_POINT_FIXING);
                  bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE1;
                  bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE2;
                  bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE3;
               }
            }

            if (err) throw StriException(MSG__INTERNAL_ERROR);
            SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, k, CE_UTF8));
         }
      }
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
Пример #22
0
/**
 *  Convert case (TitleCase)
 *
 *
 *  @param str character vector
 *  @param opts_brkiter list
 *  @return character vector
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-03)
 *    separated from stri_trans_casemap;
 *    use StriUBreakIterator
 */
SEXP stri_trans_totitle(SEXP str, SEXP opts_brkiter) {
   StriBrkIterOptions opts_brkiter2(opts_brkiter, "word");
   PROTECT(str = stri_prepare_arg_string(str, "str")); // prepare string argument

// version 0.2-1 - Does not work with ICU 4.8 (but we require ICU >= 50)
   UCaseMap* ucasemap = NULL;

   STRI__ERROR_HANDLER_BEGIN(1)
   StriUBreakIterator brkiter(opts_brkiter2);

   UErrorCode status = U_ZERO_ERROR;
   ucasemap = ucasemap_open(brkiter.getLocale(), U_FOLD_CASE_DEFAULT, &status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

   status = U_ZERO_ERROR;
   ucasemap_setBreakIterator(ucasemap, brkiter.getIterator(), &status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
   brkiter.free(false);
   // ucasemap_setOptions(ucasemap, U_TITLECASE_NO_LOWERCASE, &status); // to do?
   // now briter is owned by ucasemap.
   // it will be released on ucasemap_close
   // (checked with ICU man & src code)

   R_len_t str_n = LENGTH(str);
   StriContainerUTF8 str_cont(str, str_n);
   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, str_n));


   // STEP 1.
   // Estimate the required buffer length
   // Notice: The resulting number of codepoints may be larger or smaller than
   // the number before casefolding
   R_len_t bufsize = str_cont.getMaxNumBytes();
   bufsize += 10; // a small margin
   String8buf buf(bufsize);

   // STEP 2.
   // Do case folding
   for (R_len_t i = str_cont.vectorize_init();
         i != str_cont.vectorize_end();
         i = str_cont.vectorize_next(i))
   {
      if (str_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      R_len_t str_cur_n     = str_cont.get(i).length();
      const char* str_cur_s = str_cont.get(i).c_str();

      status = U_ZERO_ERROR;
      int buf_need = ucasemap_utf8ToTitle(ucasemap, buf.data(), buf.size(),
               (const char*)str_cur_s, str_cur_n, &status);

      if (U_FAILURE(status)) {
         buf.resize(buf_need, false/*destroy contents*/);
         status = U_ZERO_ERROR;
         buf_need = ucasemap_utf8ToTitle(ucasemap, buf.data(), buf.size(),
               (const char*)str_cur_s, str_cur_n, &status);

         STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) // this shouldn't happen
                                             // we do have the buffer size required to complete this op
      }

      SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), buf_need, CE_UTF8));
   }

   if (ucasemap) { ucasemap_close(ucasemap); ucasemap = NULL;}
   STRI__UNPROTECT_ALL
   return ret;

   STRI__ERROR_HANDLER_END({
      if (ucasemap) { ucasemap_close(ucasemap); ucasemap = NULL; }
   })
}
Пример #23
0
/** internal function - replace multiple substrings in a single string
 * can raise Rf_error
 *
 *  @version 1.3.2 (Marek Gagolewski, 2019-02-23)
 *
 * @version 1.4.3 (Marek Gagolewski, 2019-03-12)
 *    #346: na_omit for `value`
 */
SEXP stri__sub_replacement_all_single(SEXP curs,
    SEXP from, SEXP to, SEXP length, bool omit_na_1, SEXP value)
{
    // curs is a CHARSXP in UTF-8

    PROTECT(value = stri_enc_toutf8(value,
        Rf_ScalarLogical(FALSE), Rf_ScalarLogical(FALSE)));
    R_len_t value_len     = LENGTH(value);

    R_len_t from_len      = 0; // see below
    R_len_t to_len        = 0; // see below
    R_len_t length_len    = 0; // see below
    int* from_tab         = 0; // see below
    int* to_tab           = 0; // see below
    int* length_tab       = 0; // see below

    R_len_t sub_protected = 1+ /* how many objects to PROTECT on ret? */
        stri__sub_prepare_from_to_length(from, to, length,
        from_len, to_len, length_len, from_tab, to_tab, length_tab);

    R_len_t vectorize_len = stri__recycling_rule(true, 2, // does not care about value_len
      from_len, (to_len>length_len)?to_len:length_len);

    if (vectorize_len <= 0) { // "nothing" is being replaced -> return the input as-is
        UNPROTECT(sub_protected);
        return curs;
    }
    if (value_len <= 0) { // things are supposed to be replaced with "nothing"...
        UNPROTECT(sub_protected);
        Rf_warning(MSG__REPLACEMENT_ZERO);
        return NA_STRING;
    }

    if (vectorize_len % value_len != 0)
        Rf_warning(MSG__WARN_RECYCLING_RULE2);


    const char* curs_s = CHAR(curs); // already in UTF-8
    R_len_t curs_n = LENGTH(curs);

    // first check for NAs....
    if (!omit_na_1) {
       for (R_len_t i=0; i<vectorize_len; ++i) {
           R_len_t cur_from     = from_tab[i % from_len];
           R_len_t cur_to       = (to_tab)?to_tab[i % to_len]:length_tab[i % length_len];
           if (cur_from == NA_INTEGER || cur_to == NA_INTEGER) {
               UNPROTECT(sub_protected);
               if (omit_na_1) return curs;
               else return NA_STRING;
           }
       }

       for (R_len_t i=0; i<vectorize_len; ++i) {
           if (STRING_ELT(value, i%value_len) == NA_STRING) {
               UNPROTECT(sub_protected);
               return NA_STRING;
           }
       }
    }

    // get the number of code points in curs, if required (for negative indexes)
    R_len_t curs_m = -1;
    if (IS_ASCII(curs)) curs_m = curs_n;
    else { // is UTF-8
        curs_m = 0;    // code points count
        R_len_t j = 0; // byte pos
        while (j < curs_n) {
            U8_FWD_1_UNSAFE(curs_s, j);
            ++curs_m;
        }
    }

    STRI__ERROR_HANDLER_BEGIN(sub_protected)
    std::vector<char> buf; // convenience >> speed

    R_len_t buf_size;
    R_len_t last_pos = 0;
    R_len_t byte_pos = 0, byte_pos_last;
    for (R_len_t i=0; i<vectorize_len; ++i) {
        R_len_t cur_from     = from_tab[i % from_len];
        R_len_t cur_to       = (to_tab)?to_tab[i % to_len]:length_tab[i % length_len];

        if (cur_from == NA_INTEGER || cur_to == NA_INTEGER || STRING_ELT(value, i%value_len) == NA_STRING) {
           continue;
        }

        if (cur_from < 0) cur_from = curs_m+cur_from+1;
        if (cur_from <= 0) cur_from = 1;
        cur_from--; // 1-based -> 0-based index
        if (cur_from >= curs_m) cur_from = curs_m;

        // cur_from is in [0, curs_m]

        if (length_tab) {
            if (cur_to < 0) cur_to = 0;
            cur_to = cur_from+cur_to;
        }
        else {
            if (cur_to < 0)  cur_to = curs_m+cur_to+1;
            if (cur_to < cur_from) cur_to = cur_from; // insertion
        }
        if (cur_to >= curs_m) cur_to = curs_m;

        // the chunk to replace is at code points [cur_from, cur_to)

        // Rprintf("orig [%d,%d) repl [%d,%d)\n", last_pos, cur_from, cur_from, cur_to);

        if (last_pos > cur_from)
            throw StriException(MSG__OVERLAPPING_OR_UNSORTED_INDEXES);

        // first, copy [last_pos, cur_from)
        byte_pos_last = byte_pos;
        while (last_pos < cur_from) {
            U8_FWD_1_UNSAFE(curs_s, byte_pos);
            ++last_pos;
        }
        buf_size = buf.size();
        buf.resize(buf_size+byte_pos-byte_pos_last);
        memcpy(buf.data()+buf_size, curs_s+byte_pos_last, byte_pos-byte_pos_last);

        // then, copy the corresponding replacement string
        SEXP value_cur = STRING_ELT(value, i%value_len);
        const char* value_s = CHAR(value_cur);
        R_len_t value_n = LENGTH(value_cur);
        buf_size = buf.size();
        buf.resize(buf_size+value_n);
        memcpy(buf.data()+buf_size, value_s, value_n);



        // lastly, update last_pos
        // ---> last_pos = cur_to;
        while (last_pos < cur_to) {
            U8_FWD_1_UNSAFE(curs_s, byte_pos);
            ++last_pos;
        }
      }

      // finally, copy [last_pos, curs_m)
    // Rprintf("orig [%d,%d)\n", last_pos, curs_m);
    buf_size = buf.size();
    buf.resize(buf_size+curs_n-byte_pos);
    memcpy(buf.data()+buf_size, curs_s+byte_pos, curs_n-byte_pos);

    SEXP ret;
    STRI__PROTECT(ret = Rf_mkCharLenCE(buf.data(), buf.size(), CE_UTF8));
    STRI__UNPROTECT_ALL
    return ret;
    STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
Пример #24
0
/**
 * Substring replacement function
 *
 *
 * @param str character vector
 * @param from integer vector (possibly with negative indices)
 * @param to integer vector (possibly with negative indices) or NULL
 * @param length integer vector or NULL
 * @param omit_na logical scalar
 * @param value character vector replacement
 * @return character vector
 *
 * @version 0.1-?? (Bartek Tartanus)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          use StriContainerUTF8 and stri__UChar32_to_UTF8_index
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-01)
 *          use StriContainerUTF8's UChar32-to-UTF8 index
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-20)
 *          Use StriContainerUTF8_indexable
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-03)
 *          Use stri__sub_prepare_from_to_length()
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 0.5-9003 (Marek Gagolewski, 2015-08-05)
 *    Bugfix #183: floating point exception when to or length is an empty vector
 *
 * @version 1.0-2 (Marek Gagolewski, 2016-01-31)
 *    FR #199: new arg: `omit_na`
 *    FR #207: allow insertions
 *
 *
 * @version 1.4.3 (Marek Gagolewski, 2019-03-12)
 *    #346: na_omit for `value`
 */
SEXP stri_sub_replacement(SEXP str, SEXP from, SEXP to, SEXP length, SEXP omit_na, SEXP value)
{
   PROTECT(str   = stri_prepare_arg_string(str, "str"));
   PROTECT(value = stri_prepare_arg_string(value, "value"));
   bool omit_na_1 = stri__prepare_arg_logical_1_notNA(omit_na, "omit_na");

   R_len_t value_len     = LENGTH(value);
   R_len_t str_len       = LENGTH(str);
   R_len_t from_len      = 0; // see below
   R_len_t to_len        = 0; // see below
   R_len_t length_len    = 0; // see below
   int* from_tab         = 0; // see below
   int* to_tab           = 0; // see below
   int* length_tab       = 0; // see below

   R_len_t sub_protected =  2+ /* how many objects to PROTECT on ret? */
      stri__sub_prepare_from_to_length(from, to, length,
      from_len, to_len, length_len, from_tab, to_tab, length_tab);

   R_len_t vectorize_len = stri__recycling_rule(true, 4,
      str_len, value_len, from_len, (to_len>length_len)?to_len:length_len);

   if (vectorize_len <= 0) {
      UNPROTECT(sub_protected);
      return Rf_allocVector(STRSXP, 0);
   }

   STRI__ERROR_HANDLER_BEGIN(sub_protected)
   StriContainerUTF8_indexable str_cont(str, vectorize_len);
   StriContainerUTF8 value_cont(value, vectorize_len);
   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_len));
   String8buf buf(0); // @TODO: estimate bufsize a priori

   for (R_len_t i = str_cont.vectorize_init();
         i != str_cont.vectorize_end();
         i = str_cont.vectorize_next(i))
   {
      R_len_t cur_from     = from_tab[i % from_len];
      R_len_t cur_to       = (to_tab)?to_tab[i % to_len]:length_tab[i % length_len];

      if (str_cont.isNA(i)) {
          SET_STRING_ELT(ret, i, NA_STRING);
          continue;
      }
      if (cur_from == NA_INTEGER || cur_to == NA_INTEGER || value_cont.isNA(i)) {
         if (omit_na_1) {
            SET_STRING_ELT(ret, i, str_cont.toR(i));
         }
         else {
            SET_STRING_ELT(ret, i, NA_STRING);
         }
         continue;
      }

      if (length_tab) {
         if (cur_to <= 0) {
            // SET_STRING_ELT(ret, i, R_BlankString);
            // continue;
            cur_to = 0;
         }
         else {
            cur_to = cur_from + cur_to - 1;
            if (cur_from < 0 && cur_to >= 0) cur_to = -1;
         }
      }

      const char* str_cur_s   = str_cont.get(i).c_str();
      R_len_t str_cur_n       = str_cont.get(i).length();
      const char* value_cur_s = value_cont.get(i).c_str();
      R_len_t value_cur_n     = value_cont.get(i).length();

      R_len_t cur_from2; // UTF-8 byte indices
      R_len_t cur_to2;   // UTF-8 byte indices

      stri__sub_get_indices(str_cont, i, cur_from, cur_to, cur_from2, cur_to2);
      if (cur_to2 < cur_from2) cur_to2 = cur_from2;

      R_len_t buflen = str_cur_n-(cur_to2-cur_from2)+value_cur_n;
      buf.resize(buflen, false/*destroy contents*/);
      memcpy(buf.data(), str_cur_s, (size_t)cur_from2);
      memcpy(buf.data()+cur_from2, value_cur_s, (size_t)value_cur_n);
      memcpy(buf.data()+cur_from2+value_cur_n, str_cur_s+cur_to2, (size_t)str_cur_n-cur_to2);
      SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), buflen, CE_UTF8));
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
Пример #25
0
/**
 * Get substring
 *
 *
 * @param str character vector
 * @param from integer vector (possibly with negative indices)
 * @param to integer vector (possibly with negative indices) or NULL
 * @param length integer vector or NULL
 * @return character vector
 *
 * @version 0.1-?? (Bartek Tartanus)
 *          stri_sub
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          use StriContainerUTF8 and stri__UChar32_to_UTF8_index
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-01)
 *          use StriContainerUTF8's UChar32-to-UTF8 index
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-20)
 *          Use StriContainerUTF8_indexable
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-03)
 *          Use stri__sub_prepare_from_to_length()
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 0.5-9003 (Marek Gagolewski, 2015-08-05)
 *    Bugfix #183: floating point exception when to or length is an empty vector
 */
SEXP stri_sub(SEXP str, SEXP from, SEXP to, SEXP length)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));

   R_len_t str_len       = LENGTH(str);
   R_len_t from_len      = 0;
   R_len_t to_len        = 0;
   R_len_t length_len    = 0;
   int* from_tab         = 0;
   int* to_tab           = 0;
   int* length_tab       = 0;

   R_len_t sub_protected =  1+  /* how many objects to PROTECT on ret? */
      stri__sub_prepare_from_to_length(from, to, length,
      from_len, to_len, length_len, from_tab, to_tab, length_tab);

   R_len_t vectorize_len = stri__recycling_rule(true, 3,
      str_len, from_len, (to_len>length_len)?to_len:length_len);

   if (vectorize_len <= 0) {
      UNPROTECT(sub_protected);
      return Rf_allocVector(STRSXP, 0);
   }

   STRI__ERROR_HANDLER_BEGIN(sub_protected)
   StriContainerUTF8_indexable str_cont(str, vectorize_len);
   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_len));

   for (R_len_t i = str_cont.vectorize_init();
         i != str_cont.vectorize_end();
         i = str_cont.vectorize_next(i))
   {
      R_len_t cur_from     = from_tab[i % from_len];
      R_len_t cur_to       = (to_tab)?to_tab[i % to_len]:length_tab[i % length_len];
      if (str_cont.isNA(i) || cur_from == NA_INTEGER || cur_to == NA_INTEGER) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      if (length_tab) {
         if (cur_to <= 0) {
            SET_STRING_ELT(ret, i, R_BlankString);
            continue;
         }
         cur_to = cur_from + cur_to - 1;
         if (cur_from < 0 && cur_to >= 0) cur_to = -1;
      }

      const char* str_cur_s = str_cont.get(i).c_str();

      R_len_t cur_from2; // UTF-8 byte indices
      R_len_t cur_to2;   // UTF-8 byte indices

      stri__sub_get_indices(str_cont, i, cur_from, cur_to, cur_from2, cur_to2);

      if (cur_to2 > cur_from2) { // just copy
         SET_STRING_ELT(ret, i, Rf_mkCharLenCE(str_cur_s+cur_from2, cur_to2-cur_from2, CE_UTF8));
      }
      else {
         // maybe a warning here?
         SET_STRING_ELT(ret, i, Rf_mkCharLen(NULL, 0));
      }
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/**
 * Trim characters from a charclass from left AND/OR right side of the string
 *
 * @param str character vector
 * @param pattern character vector
 * @param left from left?
 * @param right from left?
 * @return character vector
 *
 * @version 0.1-?? (Bartek Tartanus)
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-04)
 *          Use StriContainerUTF8 and CharClass
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly & Use StrContainerCharClass
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-03)
 *          detects invalid UTF-8 byte stream
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-05)
 *          StriContainerCharClass now relies on UnicodeSet
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
*/
SEXP stri__trim_leftright(SEXP str, SEXP pattern, bool left, bool right)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));
   R_len_t vectorize_length =
      stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

   STRI__ERROR_HANDLER_BEGIN(2)
   StriContainerUTF8 str_cont(str, vectorize_length);
   StriContainerCharClass pattern_cont(pattern, vectorize_length);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length));

   for (R_len_t i = pattern_cont.vectorize_init();
         i != pattern_cont.vectorize_end();
         i = pattern_cont.vectorize_next(i))
   {
      if (str_cont.isNA(i) || pattern_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      const UnicodeSet* pattern_cur = &pattern_cont.get(i);
      R_len_t     str_cur_n = str_cont.get(i).length();
      const char* str_cur_s = str_cont.get(i).c_str();
      R_len_t jlast1 = 0;
      R_len_t jlast2 = str_cur_n;

      if (left) {
         UChar32 chr;
         for (R_len_t j=0; j<str_cur_n; ) {
            U8_NEXT(str_cur_s, j, str_cur_n, chr); // "look ahead"
            if (chr < 0) // invalid utf-8 sequence
               throw StriException(MSG__INVALID_UTF8);
            if (pattern_cur->contains(chr)) {
               break; // break at first occurrence
            }
            jlast1 = j;
         }
      }

      if (right && jlast1 < str_cur_n) {
         UChar32 chr;
         for (R_len_t j=str_cur_n; j>0; ) {
            U8_PREV(str_cur_s, 0, j, chr); // "look behind"
            if (chr < 0) // invalid utf-8 sequence
               throw StriException(MSG__INVALID_UTF8);
            if (pattern_cur->contains(chr)) {
               break; // break at first occurrence
            }
            jlast2 = j;
         }
      }

      // now jlast is the index, from which we start copying
      SET_STRING_ELT(ret, i,
         Rf_mkCharLenCE(str_cur_s+jlast1, (jlast2-jlast1), CE_UTF8));
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
Пример #27
0
/**
 * Count the number of characters in a string
 *
 * Note that ICU permits only strings of length < 2^31.
 * @param s R character vector
 * @return integer vector
 *
 * @version 0.1-?? (Marcin Bujarski)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          Multiple input encoding support
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-27)
 *          using StriUcnv;
 *          warn on invalid utf-8 sequences
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_length(SEXP str)
{
    PROTECT(str = stri_prepare_arg_string(str, "str"));

    STRI__ERROR_HANDLER_BEGIN(1)

    R_len_t str_n = LENGTH(str);
    SEXP ret;
    STRI__PROTECT(ret = Rf_allocVector(INTSXP, str_n));
    int* retint = INTEGER(ret);

    StriUcnv ucnvNative(NULL);

    for (R_len_t k = 0; k < str_n; k++) {
        SEXP curs = STRING_ELT(str, k);
        if (curs == NA_STRING) {
            retint[k] = NA_INTEGER;
            continue;
        }

        R_len_t curs_n = LENGTH(curs);  // O(1) - stored by R
        if (IS_ASCII(curs) || IS_LATIN1(curs)) {
            retint[k] = curs_n;
        }
        else if (IS_BYTES(curs)) {
            throw StriException(MSG__BYTESENC);
        }
        else if (IS_UTF8(curs) || ucnvNative.isUTF8()) { // utf8 or native-utf8
            UChar32 c = 0;
            const char* curs_s = CHAR(curs);
            R_len_t j = 0;
            R_len_t i = 0;
            while (c >= 0 && j < curs_n) {
                U8_NEXT(curs_s, j, curs_n, c); // faster that U8_FWD_1 & gives bad UChar32s
                i++;
            }

            if (c < 0) { // invalid utf-8 sequence
                Rf_warning(MSG__INVALID_UTF8);
                retint[k] = NA_INTEGER;
            }
            else
                retint[k] = i;
        }
        else if (ucnvNative.is8bit()) { // native-8bit
            retint[k] = curs_n;
        }
        else { // native encoding, not 8 bit

            UConverter* uconv = ucnvNative.getConverter();

            // native encoding which is neither 8-bit, nor UTF-8 (e.g. 'Big5')
            // this is weird, but we'll face it
            UErrorCode status = U_ZERO_ERROR;
            const char* source = CHAR(curs);
            const char* sourceLimit = source + curs_n;
            R_len_t j;
            for (j = 0; source != sourceLimit; j++) {
                /*ignore_retval=*/ucnv_getNextUChar(uconv, &source, sourceLimit, &status);
                STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
            }
            retint[k] = j; // all right, we got it!
        }
    }

    STRI__UNPROTECT_ALL
    return ret;

    STRI__ERROR_HANDLER_END({ /* no special action on error */ })
}
Пример #28
0
/**
 *  Convert case (upper, lowercase)
 *
 *
 *  @param str character vector
 *  @param locale single string identifying
 *         the locale ("" or NULL for default locale)
 *  @return character vector
 *
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          use StriContainerUTF16
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-11-19)
 *          use UCaseMap + StriContainerUTF8
 *          **THIS DOES NOT WORK WITH ICU 4.8**, we have to revert the changes
 *          ** BTW, since stringi_0.1-25 we require ICU>=50 **
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-18)
 *          use UCaseMap + StriContainerUTF8
 *          (this is much faster for UTF-8 and slightly faster for 8bit enc)
 *          Estimates minimal buffer size.
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-10-24)
 *          Use a custom BreakIterator with stri_trans_totitle
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-03)
 *    use StriUBreakIterator
 *
 * @version 0.6-1 (Marek Gagolewski, 2015-07-11)
 *    now this is an internal function
*/
SEXP stri_trans_casemap(SEXP str, int _type, SEXP locale)
{
   if (_type < 1 || _type > 2) Rf_error(MSG__INCORRECT_INTERNAL_ARG);
   const char* qloc = stri__prepare_arg_locale(locale, "locale", true); /* this is R_alloc'ed */
   PROTECT(str = stri_prepare_arg_string(str, "str")); // prepare string argument

// version 0.2-1 - Does not work with ICU 4.8 (but we require ICU >= 50)
   UCaseMap* ucasemap = NULL;

   STRI__ERROR_HANDLER_BEGIN(1)
   UErrorCode status = U_ZERO_ERROR;
   ucasemap = ucasemap_open(qloc, U_FOLD_CASE_DEFAULT, &status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

   R_len_t str_n = LENGTH(str);
   StriContainerUTF8 str_cont(str, str_n);
   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, str_n));


   // STEP 1.
   // Estimate the required buffer length
   // Notice: The resulting number of codepoints may be larger or smaller than
   // the number before casefolding
   R_len_t bufsize = str_cont.getMaxNumBytes();
   bufsize += 10; // a small margin
   String8buf buf(bufsize);

   // STEP 2.
   // Do case folding
   for (R_len_t i = str_cont.vectorize_init();
         i != str_cont.vectorize_end();
         i = str_cont.vectorize_next(i))
   {
      if (str_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      R_len_t str_cur_n     = str_cont.get(i).length();
      const char* str_cur_s = str_cont.get(i).c_str();

      status = U_ZERO_ERROR;
      int buf_need;
      if (_type == 1) buf_need = ucasemap_utf8ToLower(ucasemap,
         buf.data(), buf.size(), (const char*)str_cur_s, str_cur_n, &status);
      else buf_need = ucasemap_utf8ToUpper(ucasemap,
         buf.data(), buf.size(), (const char*)str_cur_s, str_cur_n, &status);

      if (U_FAILURE(status)) { /* retry */
         buf.resize(buf_need, false/*destroy contents*/);
         status = U_ZERO_ERROR;
         if (_type == 1) buf_need = ucasemap_utf8ToLower(ucasemap,
            buf.data(), buf.size(), (const char*)str_cur_s, str_cur_n, &status);
         else buf_need = ucasemap_utf8ToUpper(ucasemap,
            buf.data(), buf.size(), (const char*)str_cur_s, str_cur_n, &status);

         STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) // this shouldn't happen
                                             // we do have the buffer size required to complete this op
      }

      SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), buf_need, CE_UTF8));
   }

   if (ucasemap) { ucasemap_close(ucasemap); ucasemap = NULL;}
   STRI__UNPROTECT_ALL
   return ret;

   STRI__ERROR_HANDLER_END({
      if (ucasemap) { ucasemap_close(ucasemap); ucasemap = NULL; }
   })
}
Пример #29
0
/** Generate random permutations of code points in each string
 *
 * @param str character vector
 * @return character vector
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-04)
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 1.2.5 (Marek Gagolewski, 2019-07-23)
 *    #319: Fixed overflow in `stri_rand_shuffle()`.
 */
SEXP stri_rand_shuffle(SEXP str)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   R_len_t n = LENGTH(str);

   GetRNGstate();
   STRI__ERROR_HANDLER_BEGIN(1)
   StriContainerUTF8 str_cont(str, n);

   R_len_t bufsize = 0;
   for (R_len_t i=0; i<n; ++i) {
      if (str_cont.isNA(i)) continue;
      R_len_t ni = str_cont.get(i).length();
      if (ni > bufsize) bufsize = ni;
   }
   std::vector<UChar32> buf1(bufsize); // at most bufsize UChars32 (bufsize/4 min.)
   String8buf buf2(bufsize);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, n));

   for (R_len_t i=0; i<n; ++i) {

      if (str_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      // fill buf1
      UChar32 c = (UChar32)0;
      const char* s = str_cont.get(i).c_str();
      R_len_t sn = str_cont.get(i).length();
      R_len_t j = 0;
      R_len_t k = 0;
      while (c >= 0 && j < sn) {
         U8_NEXT(s, j, sn, c);
         buf1[k++] = (int)c;
      }

      if (c < 0) {
         Rf_warning(MSG__INVALID_UTF8);
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      // do shuffle buf1 at pos 0..k-1: (Fisher-Yates shuffle)
      R_len_t cur_n = k;
      for (j=0; j<cur_n-1; ++j) {
         // rand from i to cur_n-1
         R_len_t r = (R_len_t)floor(unif_rand()*(double)(cur_n-j)+(double)j);
         UChar32 tmp = buf1[r];
         buf1[r] = buf1[j];
         buf1[j] = tmp;
      }

      // create string:
      char* buf2data = buf2.data();
      c = (UChar32)0;
      j = 0;
      k = 0;
      UBool err = FALSE;
      while (!err && k < cur_n) {
         c = buf1[k++];
         U8_APPEND((uint8_t*)buf2data, j, bufsize, c, err);
      }

      if (err) throw StriException(MSG__INTERNAL_ERROR);

      SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf2data, j, CE_UTF8));
   }

   PutRNGstate();
   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END({
      PutRNGstate();
   })
}
Пример #30
0
/** Generate random strings
 *
 * @param n single integer
 * @param length integer vector
 * @param pattern character vector
 * @return character vector
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-04)
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-05)
 *          Use StriContainerCharClass which now contains UnicodeSets;
 *          vectorized also over pattern
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_rand_strings(SEXP n, SEXP length, SEXP pattern)
{
   int n_val = stri__prepare_arg_integer_1_notNA(n, "n");
   PROTECT(length    = stri_prepare_arg_integer(length, "length"));
   PROTECT(pattern   = stri_prepare_arg_string(pattern, "pattern"));

   if (n_val < 0) n_val = 0; /* that's not NA for sure now */

   R_len_t length_len = LENGTH(length);
   if (length_len <= 0) {
      UNPROTECT(2);
      Rf_error(MSG__ARG_EXPECTED_NOT_EMPTY, "length");
   }
   else if (length_len > n_val || n_val % length_len != 0)
      Rf_warning(MSG__WARN_RECYCLING_RULE2);

   R_len_t pattern_len = LENGTH(pattern);
   if (pattern_len <= 0) {
      UNPROTECT(2);
      Rf_error(MSG__ARG_EXPECTED_NOT_EMPTY, "pattern");
   }
   else if (pattern_len > n_val || n_val % pattern_len != 0)
      Rf_warning(MSG__WARN_RECYCLING_RULE2);

   GetRNGstate();
   STRI__ERROR_HANDLER_BEGIN(2)

   StriContainerCharClass pattern_cont(pattern, max(n_val, pattern_len));
   StriContainerInteger   length_cont(length, max(n_val, length_len));

   // get max required bufsize
   int*    length_tab = INTEGER(length);
   R_len_t bufsize = 0;
   for (R_len_t i=0; i<length_len; ++i) {
      if (length_tab[i] != NA_INTEGER && length_tab[i] > bufsize)
         bufsize = length_tab[i];
   }
   bufsize *= 4;  // 1 UChar32 -> max. 4 UTF-8 bytes
   String8buf buf(bufsize);
   char* bufdata = buf.data();

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, n_val));

   for (R_len_t i=0; i<n_val; ++i) {
      if (length_cont.isNA(i) || pattern_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      int length_cur = length_cont.get(i);
      if (length_cur < 0) length_cur = 0;

      const UnicodeSet* uset = &(pattern_cont.get(i));
      int32_t uset_size = uset->size();

      // generate string:
      R_len_t j = 0;
      UBool err = FALSE;
      for (R_len_t k=0; k<length_cur; ++k) {
         int32_t idx = (int32_t)floor(unif_rand()*(double)uset_size); /* 0..uset_size-1 */
         UChar32 c = uset->charAt(idx);
         if (c < 0) throw StriException(MSG__INTERNAL_ERROR);

         U8_APPEND((uint8_t*)bufdata, j, bufsize, c, err);
         if (err) throw StriException(MSG__INTERNAL_ERROR);
      }
      SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, j, CE_UTF8));
   }

   PutRNGstate();
   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END({
      PutRNGstate();
   })
}