/**
 * Initialize object data
 *
 */
void StriContainerBase::init_Base(R_len_t _n, R_len_t _nrecycle, bool _shallowrecycle, SEXP _sexp)
{
#ifndef NDEBUG
   if (this->n != 0 || this->nrecycle != 0 || this->sexp != (SEXP)NULL)
      throw StriException("StriContainerBase::init_Base(...): already initialized");
   this->isShallow = _shallowrecycle;
#endif

   if (_n == 0 || _nrecycle == 0) {
      this->nrecycle = 0;
      this->n = 0;
      this->sexp = _sexp;
   }
   else {
      this->nrecycle = _nrecycle;
      this->n = (_shallowrecycle)?_n:_nrecycle;
      this->sexp = _sexp;

#ifndef NDEBUG
   if (this->n < _n)
      throw StriException("StriContainerBase::init_Base(...): this->n < _n");
   if (this->n > this->nrecycle)
      throw StriException("StriContainerBase::init_Base(...): this->n > this->nrecycle");
#endif
   }
}
예제 #2
0
/**
 * Reverse Each String
 * @param str character vector
 * @return character vector with every string reversed
 *
 *
 * @version 0.1-?? (Bartek Tartanus)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          use StriContainerUTF16
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly + StriContainerUTF8 (bug fix, do reversing manually)
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-01)
 *          detect incorrect utf8 byte stream
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_reverse(SEXP str)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));    // prepare string argument

   STRI__ERROR_HANDLER_BEGIN(1)
   R_len_t str_len = LENGTH(str);
   StriContainerUTF8 str_cont(str, str_len); // writable, no recycle

   // STEP 1.
   // Calculate the required buffer length
   R_len_t bufsize = 0;
   for (R_len_t i=0; i<str_len; ++i) {
      if (str_cont.isNA(i))
         continue;

      R_len_t cursize = str_cont.get(i).length();
      if (cursize > bufsize)
         bufsize = cursize;
   }

   // STEP 2.
   // Alloc buffer & result vector
   String8buf buf(bufsize);
   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, str_len));

   for (R_len_t i = str_cont.vectorize_init();
         i != str_cont.vectorize_end();
         i = str_cont.vectorize_next(i))
   {
      if (str_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      R_len_t str_cur_n = str_cont.get(i).length();
      const char* str_cur_s = str_cont.get(i).c_str();

      R_len_t j, k;
      UChar32 chr;
      UBool isError = FALSE;

      for (j=str_cur_n, k=0; !isError && j>0; ) {
         U8_PREV(str_cur_s, 0, j, chr); // go backwards
         if (chr < 0) {
            throw StriException(MSG__INVALID_UTF8);
         }
         U8_APPEND((uint8_t*)buf.data(), k, str_cur_n, chr, isError);
      }

      if (isError)
         throw StriException(MSG__INTERNAL_ERROR);

      SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), str_cur_n, CE_UTF8));
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/**
 * Construct Container from R cobject
 * @param rstr R object
 *
 * if you want nrecycle > n, call set_nrecycle
 */
StriContainerListInt::StriContainerListInt(SEXP rstr)
{
   this->data = NULL;

   if (isNull(rstr)) {
      this->init_Base(1, 1, true);
      this->data = new IntVec[this->n]; // 1 vector, NA/NULL
      if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR);
   }
   else if (Rf_isInteger(rstr)) {
      this->init_Base(1, 1, true);
      this->data = new IntVec[this->n];
      if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR);
      this->data[0].initialize((const int*)INTEGER(rstr), LENGTH(rstr)); // shallow copy
   }
   else // if (Rf_isVectorList(rstr)) -- args already checked
   {
      R_len_t nv = LENGTH(rstr);
      this->init_Base(nv, nv, true);
      this->data = new IntVec[this->n];
      if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR);
      for (R_len_t i=0; i<this->n; ++i) {
         SEXP cur = VECTOR_ELT(rstr, i);
         if (!isNull(cur))
            this->data[i].initialize((const int*)INTEGER(cur), LENGTH(cur)); // shallow copy
         // else leave as-is, i.e. NULL/NA
      }
   }
}
/** Copy operator
 * @param container source
 * @return *this
 */
StriContainerByteSearch& StriContainerByteSearch::operator=(StriContainerByteSearch& container)
{
   this->~StriContainerByteSearch();
   (StriContainerUTF8&) (*this) = (StriContainerUTF8&)container;
   this->patternLen = 0;
   this->patternStr = NULL;
   this->searchPos = -1;
   this->searchEnd = -1;
   this->searchStr = NULL;
   this->searchLen = 0;
   this->flags = container.flags;
#ifndef NDEBUG
   this->debugMatcherIndex = -1;
#endif
//#ifndef STRI__BYTESEARCH_DISABLE_KMP
   this->patternPos = -1;
   this->kmpMaxSize = container.kmpMaxSize;
   this->kmpNext = new int[kmpMaxSize];
   if (!this->kmpNext) throw StriException(MSG__MEM_ALLOC_ERROR);

   this->patternLenCaseInsensitive = 0;
   if (flags&BYTESEARCH_CASE_INSENSITIVE) {
      this->patternStrCaseInsensitive = new UChar32[this->kmpMaxSize];
      if (!this->patternStrCaseInsensitive) throw StriException(MSG__MEM_ALLOC_ERROR);
   }
   else
      this->patternStrCaseInsensitive = NULL;
//#endif
   return *this;
}
/**
 * Construct String Container from R character vector
 * @param rstr R character vector
 * @param _nrecycle extend length [vectorization]
 */
StriContainerByteSearch::StriContainerByteSearch(SEXP rstr, R_len_t _nrecycle, uint32_t _flags)
   : StriContainerUTF8(rstr, _nrecycle, true)
{
   this->flags = _flags;

   this->patternLen = 0;
   this->patternStr = NULL;

   this->searchPos = -1;
   this->searchEnd = -1;
   this->searchStr = NULL;
   this->searchLen = 0;

#ifndef NDEBUG
   this->debugMatcherIndex = -1;
#endif

//#ifndef STRI__BYTESEARCH_DISABLE_KMP
   this->patternPos = -1;
   this->kmpMaxSize = getMaxNumBytes()+1;
   this->kmpNext = new int[kmpMaxSize];
   if (!this->kmpNext) throw StriException(MSG__MEM_ALLOC_ERROR);
//#endif

   this->patternLenCaseInsensitive = 0;
   if (flags&BYTESEARCH_CASE_INSENSITIVE) {
      this->patternStrCaseInsensitive = new UChar32[this->kmpMaxSize];
      if (!this->patternStrCaseInsensitive) throw StriException(MSG__MEM_ALLOC_ERROR);
   }
   else
      this->patternStrCaseInsensitive = NULL;
}
/** setup/reuse matcher for findFirst/findNext
 *
 * @param i index
 * @param searchStr string to search in
 * @param searchLen string length in bytes
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.1-?? (Bartek Tartanus, 2013-08-15)
 *          uses KMP
 *
 * @version 0.2-3 (Marek Gagolewski, 2014-05-11)
 *          KMP upgrade;
 *          special procedure for patternLen <= 4
 */
void StriContainerByteSearch::setupMatcherFwd(R_len_t i, const char* _searchStr, R_len_t _searchLen)
{
   if (i >= n && this->patternStr == get(i).c_str()) {
#ifndef NDEBUG
      if ((debugMatcherIndex % n) != (i % n)) {
         throw StriException("DEBUG: vectorize_getMatcher - matcher reuse failed!");
      }
#endif
      // matcher reuse
   }
   else {
      this->patternStr = get(i).c_str();
      this->patternLen = get(i).length();

      if (flags & BYTESEARCH_CASE_INSENSITIVE) {
         upgradePatternCaseInsensitive();
         createKMPtableFwdCaseInsensitive();
      }
      else {
#ifndef STRI__BYTESEARCH_DISABLE_SHORTPAT
         if (this->patternLen > 4) // a short pattern => don't use KMP
            createKMPtableFwd();
#endif
      }
   }

   this->searchStr = _searchStr;
   this->searchLen = _searchLen;
   this->resetMatcher();

#ifndef NDEBUG
   debugMatcherIndex = (i % n);
#endif
}
/** Convert FORWARD UChar32-based index to UTF-8 based
 *
 * @param i string index (in container)
 * @param wh UChar32 character's position to look for,
 * counting starts from 0 == first character in i-th string
 * @return UTF-8 (byte) index
 *
 *
 * @version 0.1-?? (Bartek Tartanus)
 *          stri_sub
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          stri__UChar32_to_UTF8_index
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-01)
 *          moved to StriContainerUTF8
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-20)
 *          moved to StriContainerUTF8_indexable
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-02-14)
 *          use String8::isASCII
 */
R_len_t StriContainerUTF8_indexable::UChar32_to_UTF8_index_fwd(R_len_t i, R_len_t wh)
{
   if (wh <= 0) return 0;
   if (get(i).isASCII()) return std::min(wh, get(i).length());

   R_len_t cur_n = get(i).length();
   const char* cur_s = get(i).c_str();

#ifndef NDEBUG
   if (!cur_s)
      throw StriException("StriContainerUTF8::UChar32_to_UTF8_index_fwd: NULL cur_s");
#endif


   if (last_ind_fwd_str != cur_s) {
      // starting search in a different string
      last_ind_fwd_codepoint = 0;
      last_ind_fwd_utf8 = 0;
      last_ind_fwd_str = cur_s;
   }

   R_len_t j = 0;
   R_len_t jres = 0;

   if (last_ind_fwd_codepoint > 0) {
      if (wh < last_ind_fwd_codepoint) {
         // check if it makes sense to go backwards from last position,
         // or it is better to start from scratch
         if ((last_ind_fwd_codepoint-wh) < (wh-0)) {
            // less code points will be considered when going backwards
            j    = last_ind_fwd_codepoint;
            jres = last_ind_fwd_utf8;
            while (j > wh && jres > 0) {
               U8_BACK_1((const uint8_t*)cur_s, 0, jres);
               --j;
            }

            last_ind_fwd_codepoint = wh;
            last_ind_fwd_utf8 = jres;
            return jres; // stop right now
         }
         // else
      }
      else { //if (wh >= last_ind_fwd_codepoint)  // continue last search
         j    = last_ind_fwd_codepoint;
         jres = last_ind_fwd_utf8;
      }
   }

   // go forward
   while (j < wh && jres < cur_n) {
      U8_FWD_1((const uint8_t*)cur_s, jres, cur_n);
      ++j;
   }

   last_ind_fwd_codepoint = wh;
   last_ind_fwd_utf8 = jres;
   return jres;
}
/** Convert BACKWARD UChar32-based index to UTF-8 based
 *
 * @param i string index (in container)
 * @param wh UChar32 character's position to look for,
 * counting starts from 0 == byte after last character in the i-th string
 * @return UTF-8 (byte) index
 *
 *
 * @version 0.1-?? (Bartek Tartanus)
 *          stri_sub
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          stri__UChar32_to_UTF8_index
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-01)
 *          moved to StriContainerUTF8
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-20)
 *          moved to StriContainerUTF8_indexable
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-02-14)
 *          use String8::isASCII
 */
R_len_t StriContainerUTF8_indexable::UChar32_to_UTF8_index_back(R_len_t i, R_len_t wh)
{
   R_len_t cur_n = get(i).length();
   if (wh <= 0) return cur_n;
   if (get(i).isASCII()) return std::max(cur_n-wh, 0);
   const char* cur_s = get(i).c_str();

#ifndef NDEBUG
   if (!cur_s)
      throw StriException("StriContainerUTF8::UChar32_to_UTF8_index_back: NULL cur_s");
#endif

   if (last_ind_back_str != cur_s) {
      // starting search in a different string
      last_ind_back_codepoint = 0;
      last_ind_back_utf8 = cur_n;
      last_ind_back_str = cur_s;
   }


   R_len_t j = 0;
   R_len_t jres = cur_n;

   if (last_ind_back_codepoint > 0) {
      if (wh < last_ind_back_codepoint) {
         // check if it makes sense to go towards the end of the string
         // or maybe it will be better to start from the end and move backwards
         if ((last_ind_back_codepoint-wh) < (wh-0)) {
            // less code points will be considered when going backwards
            j    = last_ind_back_codepoint;
            jres = last_ind_back_utf8;
            while (j > wh && jres < cur_n) {
               U8_FWD_1((const uint8_t*)cur_s, jres, cur_n);
               --j;
            }

            last_ind_back_codepoint = wh;
            last_ind_back_utf8 = jres;
            return jres; // stop right now
         }
         // else
      }
      else { //if (wh >= last_ind_back_codepoint)  // continue last search
         j    = last_ind_back_codepoint;
         jres = last_ind_back_utf8;
      }
   }

   // go backward
   while (j < wh && jres > 0) {
      U8_BACK_1((const uint8_t*)cur_s, 0, jres);
      ++j;
   }

   last_ind_back_codepoint = wh;
   last_ind_back_utf8 = jres;

   return jres;
}
/**
 * Detect if a character class occurs in a string
 *
 * @param str character vector
 * @param pattern character vector
 * @param omit_na single logical value
 * @return logical vector
 *
 * @version 0.3-1 (Bartek Tartanus, 2014-07-25)
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-10-17)
 *                using std::vector<int> to avoid mem-leaks
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-04)
 *    FR #122: omit_na arg added
 *
 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
 *    FR #216: `negate` arg added
 */
SEXP stri_subset_charclass(SEXP str, SEXP pattern, SEXP omit_na, SEXP negate)
{
   bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate");
   bool omit_na1 = stri__prepare_arg_logical_1_notNA(omit_na, "omit_na");
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));
   R_len_t vectorize_length =
      stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

   STRI__ERROR_HANDLER_BEGIN(2)
   StriContainerUTF8 str_cont(str, vectorize_length);
   StriContainerCharClass pattern_cont(pattern, vectorize_length);

   // BT: this cannot be done with deque, because pattern is reused so i does not
   // go like 0,1,2...n but 0,pat_len,2*pat_len,1,pat_len+1 and so on
   // MG: agreed
   std::vector<int> which(vectorize_length);
   int result_counter = 0;

   for (R_len_t i = pattern_cont.vectorize_init();
         i != pattern_cont.vectorize_end();
         i = pattern_cont.vectorize_next(i))
   {
      if (str_cont.isNA(i) || pattern_cont.isNA(i)) {
         if (omit_na1) which[i] = FALSE;
         else {
            which[i] = NA_LOGICAL;
            result_counter++;
         }
         continue;
      }

      const UnicodeSet* pattern_cur = &pattern_cont.get(i);
      R_len_t     str_cur_n = str_cont.get(i).length();
      const char* str_cur_s = str_cont.get(i).c_str();

      UChar32 chr = 0;
      which[i] = FALSE;
      for (R_len_t j=0; j<str_cur_n; ) {
         U8_NEXT(str_cur_s, j, str_cur_n, chr);
         if (chr < 0) // invalid utf-8 sequence
            throw StriException(MSG__INVALID_UTF8);
         if (pattern_cur->contains(chr)) {
            which[i] = TRUE;
            break;
         }
      }
      if (negate_1) which[i] = !which[i];
      if (which[i]) result_counter++;
   }

   SEXP ret;
   STRI__PROTECT(ret = stri__subset_by_logical(str_cont, which, result_counter));
   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
예제 #10
0
/** container for nrecycle fresh, brand new, writable UnicodeStrings
 *
 * Each string is initially empty.
 *
 * @param nrecycle number of strings
 */
StriContainerUTF16::StriContainerUTF16(R_len_t _nrecycle)
{
   this->str = NULL;
   this->init_Base(_nrecycle, _nrecycle, false);
   if (this->n > 0) {
      this->str = new UnicodeString[this->n];
      if (!this->str) throw StriException(MSG__MEM_ALLOC_ERROR);
   }
}
/** Convert character vector to UTF-32
 *
 * @param str character vector
 * @return list with integer vectors
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-26)
 *          use vector<UChar32> buf instead of R_alloc;
 *          warn and set NULL on improper UTF-8 byte sequences
 *
 * @version 0.2-3 (Marek Gagolewski, 2014-05-12)
 *          Use UChar32* instead of vector<UChar32> as ::data is C++11
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_enc_toutf32(SEXP str)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   R_len_t n = LENGTH(str);

   STRI__ERROR_HANDLER_BEGIN(1)
   StriContainerUTF8 str_cont(str, n);

   R_len_t bufsize = 1; // to avoid allocating an empty buffer
   for (R_len_t i=0; i<n; ++i) {
      if (str_cont.isNA(i)) continue;
      R_len_t ni = str_cont.get(i).length();
      if (ni > bufsize) bufsize = ni;
   }

   UChar32* buf = (UChar32*)R_alloc((size_t)bufsize, (int)sizeof(UChar32)); // at most bufsize UChars32 (bufsize/4 min.)
   if (!buf) throw StriException(MSG__MEM_ALLOC_ERROR);
   // deque<UChar32> was slower than using a common, over-sized buf

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(VECSXP, n)); // all

   for (R_len_t i=0; i<n; ++i) {

      if (str_cont.isNA(i)) {
         SET_VECTOR_ELT(ret, i, R_NilValue);
         continue;
      }

      UChar32 c = (UChar32)0;
      const char* s = str_cont.get(i).c_str();
      R_len_t sn = str_cont.get(i).length();
      R_len_t j = 0;
      R_len_t k = 0;
      while (c >= 0 && j < sn) {
         U8_NEXT(s, j, sn, c);
         buf[k++] = (int)c;
      }

      if (c < 0) {
         Rf_warning(MSG__INVALID_UTF8);
         SET_VECTOR_ELT(ret, i, R_NilValue);
         continue;
      }
      else {
         SEXP conv;
         STRI__PROTECT(conv = Rf_allocVector(INTSXP, k));
         memcpy(INTEGER(conv), buf, (size_t)sizeof(int)*k);
         SET_VECTOR_ELT(ret, i, conv);
         STRI__UNPROTECT(1);
      }
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END({ /* do nothing on error */ })
}
/**
 * Substitutes vector elements if a pattern occurs in a string
 *
 * @param str character vector
 * @param pattern character vector
 * @param value character vector
 * @return character vector
 *
 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
 *   FR#124
 *
 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
 *    FR #216: `negate` arg added
 */
SEXP stri_subset_charclass_replacement(SEXP str, SEXP pattern, SEXP negate, SEXP value)
{
   bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate");
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   PROTECT(pattern = stri_prepare_arg_string_1(pattern, "pattern"));
   PROTECT(value = stri_prepare_arg_string(value, "value"));

   int vectorize_length = LENGTH(str);
   int value_length = LENGTH(value);
   if (value_length == 0)
      Rf_error(MSG__REPLACEMENT_ZERO);

   STRI__ERROR_HANDLER_BEGIN(3)
   StriContainerUTF8 str_cont(str, vectorize_length);
   StriContainerUTF8 value_cont(value, value_length);
   StriContainerCharClass pattern_cont(pattern, vectorize_length);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length));

   R_len_t k = 0;
   for (R_len_t i = str_cont.vectorize_init();
         i != str_cont.vectorize_end();
         i = str_cont.vectorize_next(i))
   {
      if (str_cont.isNA(i) || pattern_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      const UnicodeSet* pattern_cur = &pattern_cont.get(i);
      R_len_t     str_cur_n = str_cont.get(i).length();
      const char* str_cur_s = str_cont.get(i).c_str();

      UChar32 chr = 0;
      bool found = false;
      for (R_len_t j=0; j<str_cur_n; ) {
         U8_NEXT(str_cur_s, j, str_cur_n, chr);
         if (chr < 0) // invalid utf-8 sequence
            throw StriException(MSG__INVALID_UTF8);
         if (pattern_cur->contains(chr)) {
            found = true;
            break;
         }
      }

      if ((found && !negate_1) || (!found && negate_1))
         SET_STRING_ELT(ret, i, value_cont.toR((k++)%value_length));
      else
         SET_STRING_ELT(ret, i, str_cont.toR(i));
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/**
 * Construct String Container from R object
 * @param rstr R object
 *
 * if you want nrecycle > n, call set_nrecycle
 */
StriContainerListRaw::StriContainerListRaw(SEXP rstr)
{
   this->data = NULL;

   if (isNull(rstr)) {
      this->init_Base(1, 1, true);
      this->data = new String8[this->n]; // 1 string, NA
      if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR);
   }
   else if (isRaw(rstr)) {
      this->init_Base(1, 1, true);
      this->data = new String8[this->n];
      if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR);
      this->data[0].initialize((const char*)RAW(rstr), LENGTH(rstr),
         false/*memalloc*/, false/*killbom*/, false/*isASCII*/); // shallow copy
   }
   else if (Rf_isVectorList(rstr)) {
      R_len_t nv = LENGTH(rstr);
      this->init_Base(nv, nv, true);
      this->data = new String8[this->n];
      if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR);
      for (R_len_t i=0; i<this->n; ++i) {
         SEXP cur = VECTOR_ELT(rstr, i);
         if (!isNull(cur))
            this->data[i].initialize((const char*)RAW(cur), LENGTH(cur),
               false/*memalloc*/, false/*killbom*/, false/*isASCII*/); // shallow copy
         // else leave as-is, i.e. NA
      }
   }
   else { // it's surely a character vector (args have been checked)
      R_len_t nv = LENGTH(rstr);
      this->init_Base(nv, nv, true);
      this->data = new String8[this->n];
      if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR);
      for (R_len_t i=0; i<this->n; ++i) {
         SEXP cur = STRING_ELT(rstr, i);
         if (cur != NA_STRING)
            this->data[i].initialize(CHAR(cur), LENGTH(cur),
               false/*memalloc*/, false/*killbom*/, false/*isASCII*/); // shallow copy
         // else leave as-is, i.e. NA
      }
   }
}
StriContainerListUTF8::StriContainerListUTF8(StriContainerListUTF8& container)
   :    StriContainerBase((StriContainerBase&)container)
{
   if (container.data) {
      this->data = new StriContainerUTF8*[this->n];
      if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR);
      for (int i=0; i<container.n; ++i) {
         if (container.data[i]) {
            this->data[i] = new StriContainerUTF8(*container.data[i]);
            if (!this->data[i]) throw StriException(MSG__MEM_ALLOC_ERROR);
         }
         else
            this->data[i] = NULL;
      }
   }
   else {
      this->data = NULL;
   }
}
/** reset matcher
 *
 * will start search from the beginning next time
 */
void StriContainerByteSearch::resetMatcher()
{
#ifndef NDEBUG
   if (!this->searchStr || !this->patternStr)
      throw StriException("DEBUG: StriContainerByteSearch: setupMatcher() hasn't been called yet");
#endif

   this->searchPos = -1;
   this->searchEnd = -1;
   this->patternPos = -1;
}
/**
 * Construct String Container from R character vector
 * @param rvec R list vector
 * @param nrecycle extend length of each character vector stored [vectorization]
 * @param shallowrecycle will stored character vectors be ever modified?
 */
StriContainerListUTF8::StriContainerListUTF8(SEXP rvec, R_len_t _nrecycle, bool _shallowrecycle)
{
   this->data = NULL;
#ifndef NDEBUG
   if (!Rf_isVectorList(rvec))
      throw StriException("DEBUG: !isVectorList in StriContainerListUTF8::StriContainerListUTF8(SEXP rvec)");
#endif
   R_len_t rvec_length = LENGTH(rvec);
   this->init_Base(rvec_length, rvec_length, true);

   if (this->n > 0) {
      this->data = new StriContainerUTF8*[this->n];
      if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR);
      for (R_len_t i=0; i<this->n; ++i)
         this->data[i] = NULL; // in case it fails during conversion (this is "NA")

      for (R_len_t i=0; i<this->n; ++i) {
         this->data[i] = new StriContainerUTF8(VECTOR_ELT(rvec, i), _nrecycle, _shallowrecycle);
         if (!this->data[i]) throw StriException(MSG__MEM_ALLOC_ERROR);
      }
   }
}
StriContainerListInt::StriContainerListInt(StriContainerListInt& container)
   :    StriContainerBase((StriContainerBase&)container)
{
   if (container.data) {
      this->data = new IntVec[this->n];
      if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR);
      for (int i=0; i<this->n; ++i) {
         this->data[i] = container.data[i];
      }
   }
   else {
      this->data = NULL;
   }
}
예제 #18
0
/**
 * Detect if a character class occurs in a string
 *
 * @param str character vector
 * @param pattern character vector
 * @param negate single bool
 * @param max_count single int
 * @return logical vector
 *
 * @version 0.1-?? (Bartek Tartanus)
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-02)
 *          Use StrContainerUTF8 and CharClass classes
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-15)
 *          Use StrContainerCharClass
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-03)
 *          detects invalid UTF-8 byte stream
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-05)
 *          StriContainerCharClass now relies on UnicodeSet
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
 *    FR #216: `negate` arg added
 *
 * @version 1.3.1 (Marek Gagolewski, 2019-02-08)
 *    #232: `max_count` arg added
 */
SEXP stri_detect_charclass(SEXP str, SEXP pattern,
    SEXP negate, SEXP max_count)
{
   bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate");
   int max_count_1 = stri__prepare_arg_integer_1_notNA(max_count, "max_count");
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));
   R_len_t vectorize_length =
      stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

   STRI__ERROR_HANDLER_BEGIN(2)
   StriContainerUTF8 str_cont(str, vectorize_length);
   StriContainerCharClass pattern_cont(pattern, vectorize_length);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length));
   int* ret_tab = LOGICAL(ret);

   for (R_len_t i = pattern_cont.vectorize_init();
         i != pattern_cont.vectorize_end();
         i = pattern_cont.vectorize_next(i))
   {
      if (max_count_1 == 0 || str_cont.isNA(i) || pattern_cont.isNA(i)) {
         ret_tab[i] = NA_LOGICAL;
         continue;
      }

      const UnicodeSet* pattern_cur = &pattern_cont.get(i);
      R_len_t     str_cur_n = str_cont.get(i).length();
      const char* str_cur_s = str_cont.get(i).c_str();

      UChar32 chr = 0;
      ret_tab[i] = FALSE;
      for (R_len_t j=0; j<str_cur_n; ) {
         U8_NEXT(str_cur_s, j, str_cur_n, chr);
         if (chr < 0) // invalid UTF-8 sequence
            throw StriException(MSG__INVALID_UTF8);
         if (pattern_cur->contains(chr)) {
            ret_tab[i] = TRUE;
            break;
         }
      }
      if (negate_1) ret_tab[i] = !ret_tab[i];
      if (max_count_1 > 0 && ret_tab[i]) --max_count_1;
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
StriContainerListUTF8& StriContainerListUTF8::operator=(StriContainerListUTF8& container)
{
   this->~StriContainerListUTF8();
   (StriContainerBase&) (*this) = (StriContainerBase&)container;

   if (container.data) {
      this->data = new StriContainerUTF8*[this->n];
      if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR);
      for (int i=0; i<container.n; ++i) {
         if (container.data[i]) {
            this->data[i] = new StriContainerUTF8(*container.data[i]);
            if (!this->data[i]) throw StriException(MSG__MEM_ALLOC_ERROR);
         }
         else
            this->data[i] = NULL;
      }
   }
   else {
      this->data = NULL;
   }

   return *this;
}
/**
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-07)
 */
void StriContainerByteSearch::upgradePatternCaseInsensitive()
{
   UChar32 c = 0;
   R_len_t j = 0;
   patternLenCaseInsensitive = 0;
   while (j < patternLen) {
      U8_NEXT(patternStr, j, patternLen, c);
#ifndef NDEBUG
      if (patternLenCaseInsensitive >= this->kmpMaxSize)
         throw StriException("!NDEBUG: StriContainerByteSearch::upgradePatternCaseInsensitive()");
#endif
      patternStrCaseInsensitive[patternLenCaseInsensitive++] = u_toupper(c);
   }
   patternStrCaseInsensitive[patternLenCaseInsensitive] = 0;
}
예제 #21
0
/** Export string to R
 *  THE OUTPUT IS ALWAYS IN UTF-8
 *  @param i index [with recycle]
 *  @return CHARSXP
 */
SEXP StriContainerUTF16::toR(R_len_t i) const
{
#ifndef NDEBUG
   if (i < 0 || i >= nrecycle)
      throw StriException("StriContainerUTF16::toR(): INDEX OUT OF BOUNDS");
#endif

   if (str[i%n] == NULL)
      return NA_STRING;
   else {
      std::string s;
      str[i%n]->toUTF8String(s);
      return Rf_mkCharLenCE(s.c_str(), (int)s.length(), (cetype_t)CE_UTF8);
   }
}
StriContainerListInt& StriContainerListInt::operator=(StriContainerListInt& container)
{
   this->~StriContainerListInt();
   (StriContainerBase&) (*this) = (StriContainerBase&)container;

   if (container.data) {
      this->data = new IntVec[this->n];
      if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR);
      for (int i=0; i<this->n; ++i) {
         this->data[i] = container.data[i];
      }
   }
   else {
      this->data = NULL;
   }
   return *this;
}
예제 #23
0
/**
 * Count pattern occurcess in a string [with collation]
 *
 * @param str character vector
 * @param pattern character vector
 * @param collator_opts passed to stri__ucol_open(),
 * if \code{NA}, then \code{stri_detect_fixed_byte} is called
 * @return integer vector
 *
 * @version 0.1 (Marek Gagolewski)
 * @version 0.2 (Marek Gagolewski) - corrected behavior on empty str/pattern
 * @version 0.3 (Marek Gagolewski, 2013-06-23) make StriException-friendly,
 *    use StriContainerUStringSearch
 */
SEXP stri_count_fixed(SEXP str, SEXP pattern, SEXP collator_opts)
{
   str = stri_prepare_arg_string(str, "str");
   pattern = stri_prepare_arg_string(pattern, "pattern");

   // call stri__ucol_open after prepare_arg:
   // if prepare_arg had failed, we would have a mem leak
   UCollator* collator = stri__ucol_open(collator_opts);
   if (!collator)
      return stri__count_fixed_byte(str, pattern);

   STRI__ERROR_HANDLER_BEGIN
   R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));
   StriContainerUTF16 str_cont(str, vectorize_length);
   StriContainerUStringSearch pattern_cont(pattern, vectorize_length, collator);  // collator is not owned by pattern_cont

   SEXP ret;
   PROTECT(ret = Rf_allocVector(INTSXP, vectorize_length));
   int* ret_tab = INTEGER(ret);

   for (R_len_t i = pattern_cont.vectorize_init();
         i != pattern_cont.vectorize_end();
         i = pattern_cont.vectorize_next(i))
   {
      STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont,
         ret_tab[i] = NA_INTEGER,
         ret_tab[i] = 0)

      UStringSearch *matcher = pattern_cont.getMatcher(i, str_cont.get(i));
      usearch_reset(matcher);
      UErrorCode status = U_ZERO_ERROR;
      ret_tab[i] = 0;
      while (((int)usearch_next(matcher, &status) != USEARCH_DONE) && !U_FAILURE(status))
         ++ret_tab[i];
      if (U_FAILURE(status)) throw StriException(status);
   }

   if (collator) { ucol_close(collator); collator=NULL; }
   UNPROTECT(1);
   return ret;
   STRI__ERROR_HANDLER_END(
      if (collator) ucol_close(collator);
   )
}
/**
 * Convert character vector between marked encodings and the encoding provided
 *
 * @param str     input character vector or list of raw vectors
 * @param to    target encoding, \code{NULL} or \code{""} for default enc
 * @param to_raw single logical, should list of raw vectors be returned?
 * @return a converted character vector or list of raw vectors
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-11-12)
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-28)
 *          use StriUcnv
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-01)
 *          calc required buf size a priori
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_encode_from_marked(SEXP str, SEXP to, SEXP to_raw)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   const char* selected_to   = stri__prepare_arg_enc(to, "to", true); /* this is R_alloc'ed */
   bool to_raw_logical = stri__prepare_arg_logical_1_notNA(to_raw, "to_raw");

   STRI__ERROR_HANDLER_BEGIN(1)
   R_len_t str_n = LENGTH(str);
   StriContainerUTF16 str_cont(str, str_n);

   // get the number of strings to convert; if == 0, then you know what's the result
   if (str_n <= 0) return Rf_allocVector(to_raw_logical?VECSXP:STRSXP, 0);

   // Open converters
   StriUcnv ucnv(selected_to);
   UConverter* uconv_to = ucnv.getConverter(true /*register_callbacks*/);

   // Get target encoding mark
   cetype_t encmark_to = to_raw_logical?CE_BYTES:ucnv.getCE();

   // Prepare out val
   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(to_raw_logical?VECSXP:STRSXP, str_n));

   // calculate required buf size
   R_len_t bufsize = 0;
   for (R_len_t i=0; i<str_n; ++i) {
      if (!str_cont.isNA(i) && str_cont.get(i).length() > bufsize)
         bufsize = str_cont.get(i).length();
   }
   bufsize = UCNV_GET_MAX_BYTES_FOR_STRING(bufsize, ucnv_getMaxCharSize(uconv_to));
   // "The calculated size is guaranteed to be sufficient for this conversion."
   String8buf buf(bufsize);

   for (R_len_t i=0; i<str_n; ++i) {
      if (str_cont.isNA(i)) {
         if (to_raw_logical) SET_VECTOR_ELT(ret, i, R_NilValue);
         else                SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      R_len_t curn_tmp = str_cont.get(i).length();
      const UChar* curs_tmp = str_cont.get(i).getBuffer(); // The buffer content is (probably) not NUL-terminated.
      if (!curs_tmp)
         throw StriException(MSG__INTERNAL_ERROR);

      UErrorCode status = U_ZERO_ERROR;
      ucnv_resetFromUnicode(uconv_to);
      R_len_t bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(),
            curs_tmp, curn_tmp, &status);
      if (bufneed <= buf.size()) {
         STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
      }
      else {// larger buffer needed [this shouldn't happen?]
         buf.resize(bufneed, false/*destroy contents*/);
         status = U_ZERO_ERROR;
         bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(),
               curs_tmp, curn_tmp, &status);
         STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
      }

      if (to_raw_logical) {
         SEXP outobj;
         STRI__PROTECT(outobj = Rf_allocVector(RAWSXP, bufneed));
         memcpy(RAW(outobj), buf.data(), (size_t)bufneed);
         SET_VECTOR_ELT(ret, i, outobj);
         STRI__UNPROTECT(1);
      }
      else {
         SET_STRING_ELT(ret, i,
            Rf_mkCharLenCE(buf.data(), bufneed, encmark_to));
      }
   }

   STRI__UNPROTECT_ALL
   return ret;

   STRI__ERROR_HANDLER_END({/* nothing special on error */})
}
/** Convert character vector to UTF-8
 *
 * @param str character vector
 * @param is_unknown_8bit single logical value;
 * if TRUE, then in case of ENC_NATIVE or ENC_LATIN1, UTF-8
 * REPLACEMENT CHARACTERs (U+FFFD) are
 * put for codes > 127
 * @param validate single logical value (or NA)
 *
 * @return character vector
 *
 * @version 0.1-XX (Marek Gagolewski)
 *
 * @version 0.1-XX (Marek Gagolewski, 2013-06-16)
 *                  make StriException-friendly
 *
 * @version 0.2-1  (Marek Gagolewski, 2014-03-26)
 *                 Use one String8buf;
 *                 is_unknown_8bit_logical and UTF-8 tries now to remove BOMs
 *
 * @version 0.2-1  (Marek Gagolewksi, 2014-03-30)
 *                 added validate arg
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_enc_toutf8(SEXP str, SEXP is_unknown_8bit, SEXP validate)
{
   PROTECT(validate = stri_prepare_arg_logical_1(validate, "validate"));
   bool is_unknown_8bit_logical =
      stri__prepare_arg_logical_1_notNA(is_unknown_8bit, "is_unknown_8bit");
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   R_len_t n = LENGTH(str);

   STRI__ERROR_HANDLER_BEGIN(2)
   SEXP ret;
   if (!is_unknown_8bit_logical) {
      // Trivial - everything we need is in StriContainerUTF8 :)
      // which removes BOMs silently
      StriContainerUTF8 str_cont(str, n);
      STRI__PROTECT(ret = str_cont.toR());
   }
   else {
      // get buf size
      R_len_t bufsize = 0;
      for (R_len_t i=0; i<n; ++i) {
         SEXP curs = STRING_ELT(str, i);
         if (curs == NA_STRING || IS_ASCII(curs) || IS_UTF8(curs))
            continue;

         R_len_t ni = LENGTH(curs);
         if (ni > bufsize) bufsize = ni;
      }
      String8buf buf(bufsize*3); // either 1 byte < 127 or U+FFFD == 3 bytes UTF-8
      char* bufdata = buf.data();

      STRI__PROTECT(ret = Rf_allocVector(STRSXP, n));
      for (R_len_t i=0; i<n; ++i) {
         SEXP curs = STRING_ELT(str, i);
         if (curs == NA_STRING) {
            SET_STRING_ELT(ret, i, NA_STRING);
            continue;
         }

         if (IS_ASCII(curs) || IS_UTF8(curs)) {
            R_len_t curs_n = LENGTH(curs);
            const char* curs_s = CHAR(curs);
            if (curs_n >= 3 &&
               (uint8_t)(curs_s[0]) == UTF8_BOM_BYTE1 &&
               (uint8_t)(curs_s[1]) == UTF8_BOM_BYTE2 &&
               (uint8_t)(curs_s[2]) == UTF8_BOM_BYTE3) {
               // has BOM - get rid of it
               SET_STRING_ELT(ret, i, Rf_mkCharLenCE(curs_s+3, curs_n-3, CE_UTF8));
            }
            else
               SET_STRING_ELT(ret, i, curs);

            continue;
         }

         // otherwise, we have an 8-bit encoding
         R_len_t curn = LENGTH(curs);
         const char* curs_tab = CHAR(curs);
         R_len_t k = 0;
         for (R_len_t j=0; j<curn; ++j) {
            if (U8_IS_SINGLE(curs_tab[j]))
               bufdata[k++] = curs_tab[j];
            else { // 0xEF 0xBF 0xBD
               bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE1;
               bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE2;
               bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE3;
            }
         }
         SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, k, CE_UTF8));
      }

   }

   // validate utf8 byte stream
   if (LOGICAL(validate)[0] != FALSE) { // NA or TRUE
      R_len_t ret_n = LENGTH(ret);
      for (R_len_t i=0; i<ret_n; ++i) {
         SEXP curs = STRING_ELT(ret, i);
         if (curs == NA_STRING) continue;

         const char* s = CHAR(curs);
         R_len_t sn = LENGTH(curs);
         R_len_t j = 0;
         UChar32 c = 0;
         while (c >= 0 && j < sn) {
            U8_NEXT(s, j, sn, c);
         }

         if (c >= 0) continue; // valid, nothing to do

         if (LOGICAL(validate)[0] == NA_LOGICAL) {
            Rf_warning(MSG__INVALID_CODE_POINT_REPLNA);
            SET_STRING_ELT(ret, i, NA_STRING);
         }
         else {
            int bufsize = sn*3; // maximum: 1 byte -> U+FFFD (3 bytes)
            String8buf buf(bufsize); // maximum: 1 byte -> U+FFFD (3 bytes)
            char* bufdata = buf.data();

            j = 0;
            R_len_t k = 0;
            UBool err = FALSE;
            while (!err && j < sn) {
               U8_NEXT(s, j, sn, c);
               if (c >= 0) {
                  U8_APPEND((uint8_t*)bufdata, k, bufsize, c, err);
               } else {
                  Rf_warning(MSG__INVALID_CODE_POINT_FIXING);
                  bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE1;
                  bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE2;
                  bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE3;
               }
            }

            if (err) throw StriException(MSG__INTERNAL_ERROR);
            SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, k, CE_UTF8));
         }
      }
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
예제 #26
0
파일: stri_sub.cpp 프로젝트: cran/stringi
/** internal function - replace multiple substrings in a single string
 * can raise Rf_error
 *
 *  @version 1.3.2 (Marek Gagolewski, 2019-02-23)
 *
 * @version 1.4.3 (Marek Gagolewski, 2019-03-12)
 *    #346: na_omit for `value`
 */
SEXP stri__sub_replacement_all_single(SEXP curs,
    SEXP from, SEXP to, SEXP length, bool omit_na_1, SEXP value)
{
    // curs is a CHARSXP in UTF-8

    PROTECT(value = stri_enc_toutf8(value,
        Rf_ScalarLogical(FALSE), Rf_ScalarLogical(FALSE)));
    R_len_t value_len     = LENGTH(value);

    R_len_t from_len      = 0; // see below
    R_len_t to_len        = 0; // see below
    R_len_t length_len    = 0; // see below
    int* from_tab         = 0; // see below
    int* to_tab           = 0; // see below
    int* length_tab       = 0; // see below

    R_len_t sub_protected = 1+ /* how many objects to PROTECT on ret? */
        stri__sub_prepare_from_to_length(from, to, length,
        from_len, to_len, length_len, from_tab, to_tab, length_tab);

    R_len_t vectorize_len = stri__recycling_rule(true, 2, // does not care about value_len
      from_len, (to_len>length_len)?to_len:length_len);

    if (vectorize_len <= 0) { // "nothing" is being replaced -> return the input as-is
        UNPROTECT(sub_protected);
        return curs;
    }
    if (value_len <= 0) { // things are supposed to be replaced with "nothing"...
        UNPROTECT(sub_protected);
        Rf_warning(MSG__REPLACEMENT_ZERO);
        return NA_STRING;
    }

    if (vectorize_len % value_len != 0)
        Rf_warning(MSG__WARN_RECYCLING_RULE2);


    const char* curs_s = CHAR(curs); // already in UTF-8
    R_len_t curs_n = LENGTH(curs);

    // first check for NAs....
    if (!omit_na_1) {
       for (R_len_t i=0; i<vectorize_len; ++i) {
           R_len_t cur_from     = from_tab[i % from_len];
           R_len_t cur_to       = (to_tab)?to_tab[i % to_len]:length_tab[i % length_len];
           if (cur_from == NA_INTEGER || cur_to == NA_INTEGER) {
               UNPROTECT(sub_protected);
               if (omit_na_1) return curs;
               else return NA_STRING;
           }
       }

       for (R_len_t i=0; i<vectorize_len; ++i) {
           if (STRING_ELT(value, i%value_len) == NA_STRING) {
               UNPROTECT(sub_protected);
               return NA_STRING;
           }
       }
    }

    // get the number of code points in curs, if required (for negative indexes)
    R_len_t curs_m = -1;
    if (IS_ASCII(curs)) curs_m = curs_n;
    else { // is UTF-8
        curs_m = 0;    // code points count
        R_len_t j = 0; // byte pos
        while (j < curs_n) {
            U8_FWD_1_UNSAFE(curs_s, j);
            ++curs_m;
        }
    }

    STRI__ERROR_HANDLER_BEGIN(sub_protected)
    std::vector<char> buf; // convenience >> speed

    R_len_t buf_size;
    R_len_t last_pos = 0;
    R_len_t byte_pos = 0, byte_pos_last;
    for (R_len_t i=0; i<vectorize_len; ++i) {
        R_len_t cur_from     = from_tab[i % from_len];
        R_len_t cur_to       = (to_tab)?to_tab[i % to_len]:length_tab[i % length_len];

        if (cur_from == NA_INTEGER || cur_to == NA_INTEGER || STRING_ELT(value, i%value_len) == NA_STRING) {
           continue;
        }

        if (cur_from < 0) cur_from = curs_m+cur_from+1;
        if (cur_from <= 0) cur_from = 1;
        cur_from--; // 1-based -> 0-based index
        if (cur_from >= curs_m) cur_from = curs_m;

        // cur_from is in [0, curs_m]

        if (length_tab) {
            if (cur_to < 0) cur_to = 0;
            cur_to = cur_from+cur_to;
        }
        else {
            if (cur_to < 0)  cur_to = curs_m+cur_to+1;
            if (cur_to < cur_from) cur_to = cur_from; // insertion
        }
        if (cur_to >= curs_m) cur_to = curs_m;

        // the chunk to replace is at code points [cur_from, cur_to)

        // Rprintf("orig [%d,%d) repl [%d,%d)\n", last_pos, cur_from, cur_from, cur_to);

        if (last_pos > cur_from)
            throw StriException(MSG__OVERLAPPING_OR_UNSORTED_INDEXES);

        // first, copy [last_pos, cur_from)
        byte_pos_last = byte_pos;
        while (last_pos < cur_from) {
            U8_FWD_1_UNSAFE(curs_s, byte_pos);
            ++last_pos;
        }
        buf_size = buf.size();
        buf.resize(buf_size+byte_pos-byte_pos_last);
        memcpy(buf.data()+buf_size, curs_s+byte_pos_last, byte_pos-byte_pos_last);

        // then, copy the corresponding replacement string
        SEXP value_cur = STRING_ELT(value, i%value_len);
        const char* value_s = CHAR(value_cur);
        R_len_t value_n = LENGTH(value_cur);
        buf_size = buf.size();
        buf.resize(buf_size+value_n);
        memcpy(buf.data()+buf_size, value_s, value_n);



        // lastly, update last_pos
        // ---> last_pos = cur_to;
        while (last_pos < cur_to) {
            U8_FWD_1_UNSAFE(curs_s, byte_pos);
            ++last_pos;
        }
      }

      // finally, copy [last_pos, curs_m)
    // Rprintf("orig [%d,%d)\n", last_pos, curs_m);
    buf_size = buf.size();
    buf.resize(buf_size+curs_n-byte_pos);
    memcpy(buf.data()+buf_size, curs_s+byte_pos, curs_n-byte_pos);

    SEXP ret;
    STRI__PROTECT(ret = Rf_mkCharLenCE(buf.data(), buf.size(), CE_UTF8));
    STRI__UNPROTECT_ALL
    return ret;
    STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/**
 * Trim characters from a charclass from left AND/OR right side of the string
 *
 * @param str character vector
 * @param pattern character vector
 * @param left from left?
 * @param right from left?
 * @return character vector
 *
 * @version 0.1-?? (Bartek Tartanus)
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-04)
 *          Use StriContainerUTF8 and CharClass
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly & Use StrContainerCharClass
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-03)
 *          detects invalid UTF-8 byte stream
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-05)
 *          StriContainerCharClass now relies on UnicodeSet
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
*/
SEXP stri__trim_leftright(SEXP str, SEXP pattern, bool left, bool right)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));
   R_len_t vectorize_length =
      stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

   STRI__ERROR_HANDLER_BEGIN(2)
   StriContainerUTF8 str_cont(str, vectorize_length);
   StriContainerCharClass pattern_cont(pattern, vectorize_length);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length));

   for (R_len_t i = pattern_cont.vectorize_init();
         i != pattern_cont.vectorize_end();
         i = pattern_cont.vectorize_next(i))
   {
      if (str_cont.isNA(i) || pattern_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      const UnicodeSet* pattern_cur = &pattern_cont.get(i);
      R_len_t     str_cur_n = str_cont.get(i).length();
      const char* str_cur_s = str_cont.get(i).c_str();
      R_len_t jlast1 = 0;
      R_len_t jlast2 = str_cur_n;

      if (left) {
         UChar32 chr;
         for (R_len_t j=0; j<str_cur_n; ) {
            U8_NEXT(str_cur_s, j, str_cur_n, chr); // "look ahead"
            if (chr < 0) // invalid utf-8 sequence
               throw StriException(MSG__INVALID_UTF8);
            if (pattern_cur->contains(chr)) {
               break; // break at first occurrence
            }
            jlast1 = j;
         }
      }

      if (right && jlast1 < str_cur_n) {
         UChar32 chr;
         for (R_len_t j=str_cur_n; j>0; ) {
            U8_PREV(str_cur_s, 0, j, chr); // "look behind"
            if (chr < 0) // invalid utf-8 sequence
               throw StriException(MSG__INVALID_UTF8);
            if (pattern_cur->contains(chr)) {
               break; // break at first occurrence
            }
            jlast2 = j;
         }
      }

      // now jlast is the index, from which we start copying
      SET_STRING_ELT(ret, i,
         Rf_mkCharLenCE(str_cur_s+jlast1, (jlast2-jlast1), CE_UTF8));
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
예제 #28
0
/**
 * Count the number of characters in a string
 *
 * Note that ICU permits only strings of length < 2^31.
 * @param s R character vector
 * @return integer vector
 *
 * @version 0.1-?? (Marcin Bujarski)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          Multiple input encoding support
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-27)
 *          using StriUcnv;
 *          warn on invalid utf-8 sequences
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_length(SEXP str)
{
    PROTECT(str = stri_prepare_arg_string(str, "str"));

    STRI__ERROR_HANDLER_BEGIN(1)

    R_len_t str_n = LENGTH(str);
    SEXP ret;
    STRI__PROTECT(ret = Rf_allocVector(INTSXP, str_n));
    int* retint = INTEGER(ret);

    StriUcnv ucnvNative(NULL);

    for (R_len_t k = 0; k < str_n; k++) {
        SEXP curs = STRING_ELT(str, k);
        if (curs == NA_STRING) {
            retint[k] = NA_INTEGER;
            continue;
        }

        R_len_t curs_n = LENGTH(curs);  // O(1) - stored by R
        if (IS_ASCII(curs) || IS_LATIN1(curs)) {
            retint[k] = curs_n;
        }
        else if (IS_BYTES(curs)) {
            throw StriException(MSG__BYTESENC);
        }
        else if (IS_UTF8(curs) || ucnvNative.isUTF8()) { // utf8 or native-utf8
            UChar32 c = 0;
            const char* curs_s = CHAR(curs);
            R_len_t j = 0;
            R_len_t i = 0;
            while (c >= 0 && j < curs_n) {
                U8_NEXT(curs_s, j, curs_n, c); // faster that U8_FWD_1 & gives bad UChar32s
                i++;
            }

            if (c < 0) { // invalid utf-8 sequence
                Rf_warning(MSG__INVALID_UTF8);
                retint[k] = NA_INTEGER;
            }
            else
                retint[k] = i;
        }
        else if (ucnvNative.is8bit()) { // native-8bit
            retint[k] = curs_n;
        }
        else { // native encoding, not 8 bit

            UConverter* uconv = ucnvNative.getConverter();

            // native encoding which is neither 8-bit, nor UTF-8 (e.g. 'Big5')
            // this is weird, but we'll face it
            UErrorCode status = U_ZERO_ERROR;
            const char* source = CHAR(curs);
            const char* sourceLimit = source + curs_n;
            R_len_t j;
            for (j = 0; source != sourceLimit; j++) {
                /*ignore_retval=*/ucnv_getNextUChar(uconv, &source, sourceLimit, &status);
                STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
            }
            retint[k] = j; // all right, we got it!
        }
    }

    STRI__UNPROTECT_ALL
    return ret;

    STRI__ERROR_HANDLER_END({ /* no special action on error */ })
}
예제 #29
0
/**
 * Convert character vector between given encodings
 *
 * @param str     input character vector or list of raw vectors
 * @param from  source encoding, \code{NULL} or \code{""} for default enc
 * @param to    target encoding, \code{NULL} or \code{""} for default enc
 * @param to_raw single logical, should list of raw vectors be returned?
 * @return a converted character vector or list of raw vectors
 *
 * @version 0.1 (Marek Gagolewski)
 * @version 0.2 (Marek Gagolewski) arg to_raw_added, encoding marking
 * @version 0.3 (Marek Gagolewski, 2013-06-16) make StriException-friendly
 * @version 0.4 (Marek Gagolewski, 2013-08-08) use StriContainerListRaw
 * @version 0.5 (Marek Gagolewski, 2013-11-20) BUGFIX call stri_encode_from_marked if necessary
 */
SEXP stri_encode(SEXP str, SEXP from, SEXP to, SEXP to_raw)
{
   const char* selected_from = stri__prepare_arg_enc(from, "from", true);
   if (!selected_from && Rf_isVectorAtomic(str))
      return stri_encode_from_marked(str, to, to_raw);

   str = stri_prepare_arg_list_raw(str, "str");
   const char* selected_to   = stri__prepare_arg_enc(to, "to", true);
   bool to_raw_logical = stri__prepare_arg_logical_1_notNA(to_raw, "to_raw");

   UConverter* uconv_from = NULL;
   UConverter* uconv_to = NULL;

   STRI__ERROR_HANDLER_BEGIN
   StriContainerListRaw str_cont(str);
   R_len_t str_n = str_cont.get_n();

   // get the number of strings to convert; if == 0, then you know what's the result
   if (str_n <= 0) return Rf_allocVector(to_raw_logical?VECSXP:STRSXP, 0);

   // Open converters
   uconv_from = stri__ucnv_open(selected_from);
   uconv_to = stri__ucnv_open(selected_to);


   // Get target encoding mark
   UErrorCode err = U_ZERO_ERROR;
   const char* uconv_to_name = ucnv_getName(uconv_to, &err);
   if (U_FAILURE(err))
      throw StriException(err);
   cetype_t encmark_to = CE_BYTES; // all other cases than the below ones
      // - bytes enc (this is reasonable, isn't it?)
   if (!to_raw_logical) { // otherwise not needed
      if (!strcmp(uconv_to_name, "US-ASCII") || !strcmp(uconv_to_name, "UTF-8"))
         encmark_to = CE_UTF8; // no CE for ASCII, will be auto-detected by mkCharLenCE
      else if (!strcmp(uconv_to_name, "ISO-8859-1"))
         encmark_to = CE_LATIN1;
      else if (!strcmp(uconv_to_name, ucnv_getDefaultName()))
         encmark_to = CE_NATIVE;
   }

   // Prepare out val
   SEXP ret;
   PROTECT(ret = Rf_allocVector(to_raw_logical?VECSXP:STRSXP, str_n));

   String8 buf(0); // will be extended in a moment

   for (R_len_t i=0; i<str_n; ++i) {
      if (str_cont.isNA(i)) {
         if (to_raw_logical) SET_VECTOR_ELT(ret, i, R_NilValue);
         else                SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      const char* curd = str_cont.get(i).c_str();
      R_len_t curn     = str_cont.get(i).length();

      err = U_ZERO_ERROR;
      UnicodeString encs(curd, curn, uconv_from, err); // FROM -> UTF-16 [this is the slow part]
      if (U_FAILURE(err))
         throw StriException(err);

      R_len_t curn_tmp = encs.length();
      const UChar* curs_tmp = encs.getBuffer(); // The buffer contents is (probably) not NUL-terminated.
      if (!curs_tmp)
         throw StriException(MSG__INTERNAL_ERROR);

      R_len_t bufneed = UCNV_GET_MAX_BYTES_FOR_STRING(curn_tmp, ucnv_getMaxCharSize(uconv_to));
      // "The calculated size is guaranteed to be sufficient for this conversion."
      buf.resize(bufneed);

      err = U_ZERO_ERROR;
//      bufneed = encs.extract(buf.data(), buf.size(), uconv_to, err); // UTF-16 -> TO
      ucnv_resetFromUnicode(uconv_to);
      bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(), curs_tmp, curn_tmp, &err);
      if (bufneed <= buf.size()) {
         if (U_FAILURE(err))
            throw StriException(err);
      }
      else {// larger buffer needed [this shouldn't happen?]
//         warning("buf extending");
         buf.resize(bufneed);
         err = U_ZERO_ERROR;
         bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(), curs_tmp, curn_tmp, &err);
         if (U_FAILURE(err))
            throw StriException(err);
         if (bufneed > buf.size())
            throw StriException(MSG__INTERNAL_ERROR);
      }

      if (to_raw_logical) {
         SEXP outobj = Rf_allocVector(RAWSXP, bufneed);
         memcpy(RAW(outobj), buf.data(), (size_t)bufneed);
         SET_VECTOR_ELT(ret, i, outobj);
      }
      else {
         SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), bufneed, encmark_to));
      }
   }

   if (uconv_from) {
      ucnv_close(uconv_from);
      uconv_from = NULL;
   }
   if (uconv_to) {
      ucnv_close(uconv_to);
      uconv_to = NULL;
   }

   UNPROTECT(1);
   return ret;

   STRI__ERROR_HANDLER_END({
      if (uconv_from)
         ucnv_close(uconv_from);
      if (uconv_to)
         ucnv_close(uconv_to);
   })
}
예제 #30
0
/**
 * Construct String Container from R character vector
 * @param rstr R character vector
 * @param nrecycle extend length [vectorization]
 * @param shallowrecycle will \code{this->str} be ever modified?
 */
StriContainerUTF16::StriContainerUTF16(SEXP rstr, R_len_t _nrecycle, bool _shallowrecycle)
{
   this->str = NULL;
#ifndef NDEBUG
   if (!isString(rstr))
      throw StriException("DEBUG: !isString in StriContainerUTF16::StriContainerUTF16(SEXP rstr)");
#endif
   R_len_t nrstr = LENGTH(rstr);
   this->init_Base(nrstr, _nrecycle, _shallowrecycle); // calling LENGTH(rstr) fails on constructor call


   if (this->n > 0) {
      this->str = new UnicodeString*[this->n];
      for (R_len_t i=0; i<this->n; ++i)
         this->str[i] = NULL; // in case it fails during conversion (this is NA)

      UConverter* ucnvASCII = NULL;
//      UConverter* ucnvUTF8 = NULL;
      UConverter* ucnvLatin1 = NULL;
      UConverter* ucnvNative = NULL;

      for (R_len_t i=0; i<nrstr; ++i) {
         SEXP curs = STRING_ELT(rstr, i);
         if (curs == NA_STRING) {
            continue; // keep NA
         }
         else {
            if (IS_ASCII(curs)) {
               if (!ucnvASCII) ucnvASCII = stri__ucnv_open("ASCII");
               UErrorCode status = U_ZERO_ERROR;
               this->str[i] = new UnicodeString(CHAR(curs), LENGTH(curs),
                  ucnvASCII, status);
               if (U_FAILURE(status))
                  throw StriException(status);

               // Performance improvement attempt #1:
               // this->str[i] = new UnicodeString(UnicodeString::fromUTF8(CHAR(curs))); // slower than the above

               // Performance improvement attempt #2:
               // Create UChar buf with LENGTH(curs) items, fill it with (CHAR(curs)[i], 0x00), i=1,...
               // This wasn't faster tham the ucnvASCII approach.
            }
            else if (IS_UTF8(curs)) {
               // the above ASCII-approach (but with ucnvUTF8) is slower for UTF-8
               this->str[i] = new UnicodeString(UnicodeString::fromUTF8(CHAR(curs)));
            }
            else if (IS_LATIN1(curs)) {
               if (!ucnvLatin1) ucnvLatin1 = stri__ucnv_open("ISO-8859-1");
               UErrorCode status = U_ZERO_ERROR;
               this->str[i] = new UnicodeString(CHAR(curs), LENGTH(curs),
                  ucnvLatin1, status);
               if (U_FAILURE(status))
                  throw StriException(status);
            }
            else if (IS_BYTES(curs))
               throw StriException(MSG__BYTESENC);
            else {
//             Any encoding - detection needed
//             Assume it's Native; this assumes the user working in an 8-bit environment
//             would convert strings to UTF-8 manually if needed - I think is's
//             a more reasonable approach (Native --> input via keyboard)
               if (!ucnvNative) ucnvNative = stri__ucnv_open((char*)NULL);
               UErrorCode status = U_ZERO_ERROR;
               this->str[i] = new UnicodeString(CHAR(curs), LENGTH(curs),
                  ucnvNative, status);
               if (U_FAILURE(status))
                  throw StriException(status);
            }
         }
      }

      if (ucnvASCII) ucnv_close(ucnvASCII);
//      if (ucnvUTF8)  ucnv_close(ucnvUTF8);
      if (ucnvLatin1) ucnv_close(ucnvLatin1);
      if (ucnvNative) ucnv_close(ucnvNative);

      if (!_shallowrecycle) {
         for (R_len_t i=nrstr; i<this->n; ++i) {
            if (this->str[i%nrstr] == NULL)
               this->str[i] = NULL;
            else
               this->str[i] = new UnicodeString(*this->str[i%nrstr]);
         }
      }
   }
}