/** * Initialize object data * */ void StriContainerBase::init_Base(R_len_t _n, R_len_t _nrecycle, bool _shallowrecycle, SEXP _sexp) { #ifndef NDEBUG if (this->n != 0 || this->nrecycle != 0 || this->sexp != (SEXP)NULL) throw StriException("StriContainerBase::init_Base(...): already initialized"); this->isShallow = _shallowrecycle; #endif if (_n == 0 || _nrecycle == 0) { this->nrecycle = 0; this->n = 0; this->sexp = _sexp; } else { this->nrecycle = _nrecycle; this->n = (_shallowrecycle)?_n:_nrecycle; this->sexp = _sexp; #ifndef NDEBUG if (this->n < _n) throw StriException("StriContainerBase::init_Base(...): this->n < _n"); if (this->n > this->nrecycle) throw StriException("StriContainerBase::init_Base(...): this->n > this->nrecycle"); #endif } }
/**
 * Reverse each string, code point by code point
 *
 * @param str character vector
 * @return character vector with every string reversed;
 *    NA inputs give NA outputs
 *
 * Throws (via StriException) on an invalid UTF-8 byte sequence.
 *
 * @version 0.1-?? (Bartek Tartanus)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          use StriContainerUTF16
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly + StriContainerUTF8 (bug fix, do reversing manually)
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-01)
 *          detect incorrect utf8 byte stream
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_reverse(SEXP str)
{
    PROTECT(str = stri_prepare_arg_string(str, "str"));

    STRI__ERROR_HANDLER_BEGIN(1)
    R_len_t str_len = LENGTH(str);
    StriContainerUTF8 str_cont(str, str_len);

    // Pass 1: the longest non-NA string determines the work buffer size
    R_len_t bufsize = 0;
    for (R_len_t i = 0; i < str_len; ++i) {
        if (!str_cont.isNA(i)) {
            R_len_t len_i = str_cont.get(i).length();
            if (bufsize < len_i)
                bufsize = len_i;
        }
    }

    // Pass 2: one shared buffer, one output vector
    String8buf buf(bufsize);
    SEXP ret;
    STRI__PROTECT(ret = Rf_allocVector(STRSXP, str_len));

    for (R_len_t i = str_cont.vectorize_init();
            i != str_cont.vectorize_end();
            i = str_cont.vectorize_next(i))
    {
        if (str_cont.isNA(i)) {
            SET_STRING_ELT(ret, i, NA_STRING);
            continue;
        }

        const char* src = str_cont.get(i).c_str();
        R_len_t src_n = str_cont.get(i).length();

        R_len_t rd = src_n;  // read cursor, moves towards 0
        R_len_t wr = 0;      // write cursor, moves towards src_n
        UChar32 cp;
        UBool overflow = FALSE;

        while (!overflow && rd > 0) {
            U8_PREV(src, 0, rd, cp); // step one code point backwards
            if (cp < 0)
                throw StriException(MSG__INVALID_UTF8);
            U8_APPEND((uint8_t*)buf.data(), wr, src_n, cp, overflow);
        }

        if (overflow)
            throw StriException(MSG__INTERNAL_ERROR);

        SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), src_n, CE_UTF8));
    }

    STRI__UNPROTECT_ALL
    return ret;
    STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/**
 * Build a container of integer vectors from an R object
 *
 * - NULL: one default-constructed IntVec (i.e. NULL/NA);
 * - integer vector: one IntVec referring to its data;
 * - anything else is treated as a list (arguments are assumed to have
 *   been checked by the caller): one IntVec per element, NULL elements
 *   are left default-constructed.
 *
 * @param rstr R object
 *
 * if you want nrecycle > n, call set_nrecycle
 */
StriContainerListInt::StriContainerListInt(SEXP rstr)
{
    this->data = NULL;

    const bool is_null = isNull(rstr);
    if (is_null || Rf_isInteger(rstr)) {
        // a single vector, either missing or given directly
        this->init_Base(1, 1, true);
        this->data = new IntVec[this->n];
        if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR);
        if (!is_null)
            this->data[0].initialize((const int*)INTEGER(rstr), LENGTH(rstr)); // shallow copy
        return;
    }

    // a list of vectors -- args already checked
    R_len_t nv = LENGTH(rstr);
    this->init_Base(nv, nv, true);
    this->data = new IntVec[this->n];
    if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR);

    for (R_len_t k = 0; k < this->n; ++k) {
        SEXP elem = VECTOR_ELT(rstr, k);
        if (isNull(elem))
            continue; // stays NULL/NA
        this->data[k].initialize((const int*)INTEGER(elem), LENGTH(elem)); // shallow copy
    }
}
/** Copy assignment operator
 *
 * Destroys the current contents, copies the StriContainerUTF8 part of
 * `container`, takes over its `flags` and `kmpMaxSize`, and allocates
 * fresh work buffers of that size. The matcher state (pattern, search
 * string, positions) is reset; it is not copied.
 *
 * Assigning an object to itself is a no-op: without this guard the
 * object would be destroyed first and then "copied" from its own
 * destroyed state.
 *
 * @param container source
 * @return *this
 */
StriContainerByteSearch& StriContainerByteSearch::operator=(StriContainerByteSearch& container)
{
    if (this == &container)
        return *this; // self-assignment: nothing to do

    this->~StriContainerByteSearch();
    (StriContainerUTF8&) (*this) = (StriContainerUTF8&)container;

    // Give both owned buffers a definite value before any allocation,
    // so that neither pointer is left indeterminate if `new` throws.
    this->kmpNext = NULL;
    this->patternStrCaseInsensitive = NULL;

    this->patternLen = 0;
    this->patternStr = NULL;
    this->searchPos = -1;
    this->searchEnd = -1;
    this->searchStr = NULL;
    this->searchLen = 0;
    this->flags = container.flags;
#ifndef NDEBUG
    this->debugMatcherIndex = -1;
#endif

//#ifndef STRI__BYTESEARCH_DISABLE_KMP
    this->patternPos = -1;
    this->kmpMaxSize = container.kmpMaxSize;
    this->kmpNext = new int[kmpMaxSize];
    if (!this->kmpNext) throw StriException(MSG__MEM_ALLOC_ERROR);

    this->patternLenCaseInsensitive = 0;
    if (flags&BYTESEARCH_CASE_INSENSITIVE) {
        // the upper-cased pattern is only needed for case-insensitive search
        this->patternStrCaseInsensitive = new UChar32[this->kmpMaxSize];
        if (!this->patternStrCaseInsensitive) throw StriException(MSG__MEM_ALLOC_ERROR);
    }
//#endif

    return *this;
}
/**
 * Build a byte-search pattern container from an R character vector
 *
 * The patterns are handed to the StriContainerUTF8 base (shallow
 * recycle). No matcher is set up yet; the work buffers are sized as
 * getMaxNumBytes()+1 elements.
 *
 * @param rstr R character vector
 * @param _nrecycle extend length [vectorization]
 * @param _flags search flags; BYTESEARCH_CASE_INSENSITIVE additionally
 *    allocates the buffer for the upper-cased pattern
 */
StriContainerByteSearch::StriContainerByteSearch(SEXP rstr, R_len_t _nrecycle, uint32_t _flags)
    : StriContainerUTF8(rstr, _nrecycle, true)
{
    this->flags = _flags;

    // no pattern selected yet
    this->patternStr = NULL;
    this->patternLen = 0;
    this->patternPos = -1;

    // no search string selected yet
    this->searchStr = NULL;
    this->searchLen = 0;
    this->searchPos = -1;
    this->searchEnd = -1;

#ifndef NDEBUG
    this->debugMatcherIndex = -1;
#endif

//#ifndef STRI__BYTESEARCH_DISABLE_KMP
    this->kmpMaxSize = getMaxNumBytes()+1;
    this->kmpNext = new int[kmpMaxSize];
    if (!this->kmpNext) throw StriException(MSG__MEM_ALLOC_ERROR);
//#endif

    this->patternLenCaseInsensitive = 0;
    this->patternStrCaseInsensitive = NULL;
    if (flags&BYTESEARCH_CASE_INSENSITIVE) {
        this->patternStrCaseInsensitive = new UChar32[this->kmpMaxSize];
        if (!this->patternStrCaseInsensitive) throw StriException(MSG__MEM_ALLOC_ERROR);
    }
}
/** Set up (or reuse) the matcher for findFirst/findNext
 *
 * The pattern-dependent tables are rebuilt unless `i` is a recycled
 * index (i >= n) whose pattern is the one currently installed.
 * The search string is always replaced and the matcher is reset.
 *
 * @param i index
 * @param _searchStr string to search in
 * @param _searchLen string length in bytes
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.1-?? (Bartek Tartanus, 2013-08-15)
 *          uses KMP
 *
 * @version 0.2-3 (Marek Gagolewski, 2014-05-11)
 *          KMP upgrade;
 *          special procedure for patternLen <= 4
 */
void StriContainerByteSearch::setupMatcherFwd(R_len_t i, const char* _searchStr, R_len_t _searchLen)
{
    const bool sameAsInstalled =
        (i >= n) && (this->patternStr == get(i).c_str());

    if (!sameAsInstalled) {
        // a new pattern: install it and rebuild its tables
        this->patternStr = get(i).c_str();
        this->patternLen = get(i).length();

        if (flags & BYTESEARCH_CASE_INSENSITIVE) {
            upgradePatternCaseInsensitive();
            createKMPtableFwdCaseInsensitive();
        }
#ifndef STRI__BYTESEARCH_DISABLE_SHORTPAT
        // a short pattern => don't use KMP
        // NOTE(review): with STRI__BYTESEARCH_DISABLE_SHORTPAT defined no
        // KMP table is built on this path at all -- confirm this is intended.
        else if (this->patternLen > 4) {
            createKMPtableFwd();
        }
#endif
    }
#ifndef NDEBUG
    else if ((debugMatcherIndex % n) != (i % n)) {
        // matcher reuse was expected, but a different pattern is installed
        throw StriException("DEBUG: vectorize_getMatcher - matcher reuse failed!");
    }
#endif

    this->searchStr = _searchStr;
    this->searchLen = _searchLen;
    this->resetMatcher();

#ifndef NDEBUG
    debugMatcherIndex = (i % n);
#endif
}
/** Convert FORWARD UChar32-based index to UTF-8 based
 *
 * The last (code point index, byte index) pair found in the most
 * recently queried string is cached, so that a subsequent query on the
 * same string can start from there, going forwards or backwards,
 * instead of from the beginning.
 *
 * The cached code point index is always the number of code points
 * actually traversed. When `wh` lies past the end of the string the
 * walk stops early; recording `wh` itself in that case would pair the
 * end-of-string byte offset with a too-large code point index and make
 * a later backward walk (smaller `wh`, same string) return a byte
 * offset that is too small.
 *
 * @param i string index (in container)
 * @param wh UChar32 character's position to look for,
 * counting starts from 0 == first character in i-th string
 * @return UTF-8 (byte) index; the string's byte length if `wh` is
 * past the last code point, 0 if `wh` <= 0
 *
 *
 * @version 0.1-?? (Bartek Tartanus)
 *          stri_sub
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          stri__UChar32_to_UTF8_index
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-01)
 *          moved to StriContainerUTF8
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-20)
 *          moved to StriContainerUTF8_indexable
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-02-14)
 *    use String8::isASCII
 */
R_len_t StriContainerUTF8_indexable::UChar32_to_UTF8_index_fwd(R_len_t i, R_len_t wh)
{
    if (wh <= 0) return 0;

    // ASCII: one byte per code point, no walking (and no caching) needed
    if (get(i).isASCII())
        return std::min(wh, get(i).length());

    R_len_t cur_n = get(i).length();
    const char* cur_s = get(i).c_str();

#ifndef NDEBUG
    if (!cur_s)
        throw StriException("StriContainerUTF8::UChar32_to_UTF8_index_fwd: NULL cur_s");
#endif

    if (last_ind_fwd_str != cur_s) {
        // starting search in a different string
        last_ind_fwd_codepoint = 0;
        last_ind_fwd_utf8 = 0;
        last_ind_fwd_str = cur_s;
    }

    R_len_t j = 0;     // code points consumed so far
    R_len_t jres = 0;  // corresponding byte offset

    if (last_ind_fwd_codepoint > 0) {
        if (wh < last_ind_fwd_codepoint) {
            // check if it makes sense to go backwards from last position,
            // or it is better to start from scratch
            if ((last_ind_fwd_codepoint-wh) < (wh-0)) {
                // less code points will be considered when going backwards
                j    = last_ind_fwd_codepoint;
                jres = last_ind_fwd_utf8;
                while (j > wh && jres > 0) {
                    U8_BACK_1((const uint8_t*)cur_s, 0, jres);
                    --j;
                }

                last_ind_fwd_codepoint = j; // position actually reached
                last_ind_fwd_utf8 = jres;
                return jres; // stop right now
            }
            // else: start from the beginning (j == 0, jres == 0)
        }
        else { //if (wh >= last_ind_fwd_codepoint)  // continue last search
            j    = last_ind_fwd_codepoint;
            jres = last_ind_fwd_utf8;
        }
    }

    // go forward
    while (j < wh && jres < cur_n) {
        U8_FWD_1((const uint8_t*)cur_s, jres, cur_n);
        ++j;
    }

    // j < wh here means the end of the string was hit first;
    // cache the code point count that really corresponds to jres
    last_ind_fwd_codepoint = j;
    last_ind_fwd_utf8 = jres;
    return jres;
}
/** Convert BACKWARD UChar32-based index to UTF-8 based
 *
 * The last (code point index, byte index) pair found in the most
 * recently queried string is cached, so that a subsequent query on the
 * same string can start from there instead of from the end.
 *
 * The cached code point index is always the number of code points
 * actually traversed. When `wh` exceeds the number of code points the
 * walk stops early at byte 0; recording `wh` itself in that case would
 * pair byte offset 0 with a too-large code point index and make a
 * later walk towards the end (smaller `wh`, same string) return a byte
 * offset that is too large.
 *
 * @param i string index (in container)
 * @param wh UChar32 character's position to look for,
 * counting starts from 0 == byte after last character in the i-th string
 * @return UTF-8 (byte) index; 0 if `wh` reaches before the first code
 * point, the string's byte length if `wh` <= 0
 *
 *
 * @version 0.1-?? (Bartek Tartanus)
 *          stri_sub
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          stri__UChar32_to_UTF8_index
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-01)
 *          moved to StriContainerUTF8
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-20)
 *          moved to StriContainerUTF8_indexable
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-02-14)
 *    use String8::isASCII
 */
R_len_t StriContainerUTF8_indexable::UChar32_to_UTF8_index_back(R_len_t i, R_len_t wh)
{
    R_len_t cur_n = get(i).length();
    if (wh <= 0) return cur_n;

    // ASCII: one byte per code point, no walking (and no caching) needed
    if (get(i).isASCII())
        return std::max(cur_n-wh, 0);

    const char* cur_s = get(i).c_str();

#ifndef NDEBUG
    if (!cur_s)
        throw StriException("StriContainerUTF8::UChar32_to_UTF8_index_back: NULL cur_s");
#endif

    if (last_ind_back_str != cur_s) {
        // starting search in a different string
        last_ind_back_codepoint = 0;
        last_ind_back_utf8 = cur_n;
        last_ind_back_str = cur_s;
    }

    R_len_t j = 0;         // code points consumed so far, counted from the end
    R_len_t jres = cur_n;  // corresponding byte offset

    if (last_ind_back_codepoint > 0) {
        if (wh < last_ind_back_codepoint) {
            // check if it makes sense to go towards the end of the string
            // or maybe it will be better to start from the end and move backwards
            if ((last_ind_back_codepoint-wh) < (wh-0)) {
                // less code points will be considered when going
                // from the cached position towards the end
                j    = last_ind_back_codepoint;
                jres = last_ind_back_utf8;
                while (j > wh && jres < cur_n) {
                    U8_FWD_1((const uint8_t*)cur_s, jres, cur_n);
                    --j;
                }

                last_ind_back_codepoint = j; // position actually reached
                last_ind_back_utf8 = jres;
                return jres; // stop right now
            }
            // else: start from the end (j == 0, jres == cur_n)
        }
        else { //if (wh >= last_ind_back_codepoint)  // continue last search
            j    = last_ind_back_codepoint;
            jres = last_ind_back_utf8;
        }
    }

    // go backward
    while (j < wh && jres > 0) {
        U8_BACK_1((const uint8_t*)cur_s, 0, jres);
        ++j;
    }

    // j < wh here means the start of the string was hit first;
    // cache the code point count that really corresponds to jres
    last_ind_back_codepoint = j;
    last_ind_back_utf8 = jres;

    return jres;
}
/**
 * Select the strings in which a character class occurs
 *
 * For every (str, pattern) pair a match flag is computed; the flags
 * are then handed to stri__subset_by_logical() together with the
 * number of elements to keep.
 *
 * @param str character vector
 * @param pattern character vector
 * @param omit_na single logical value; if TRUE, pairs with a missing
 *    str or pattern are dropped, otherwise they are flagged as NA
 * @param negate single logical value; if TRUE, the match flag of
 *    every non-missing pair is inverted
 * @return the value of stri__subset_by_logical()
 *    (presumably the selected elements of str -- defined elsewhere)
 *
 * @version 0.3-1 (Bartek Tartanus, 2014-07-25)
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-10-17)
 *                using std::vector<int> to avoid mem-leaks
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-04)
 *    FR #122: omit_na arg added
 *
 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
 *    FR #216: `negate` arg added
 */
SEXP stri_subset_charclass(SEXP str, SEXP pattern, SEXP omit_na, SEXP negate)
{
    bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate");
    bool omit_na1 = stri__prepare_arg_logical_1_notNA(omit_na, "omit_na");
    PROTECT(str = stri_prepare_arg_string(str, "str"));
    PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));
    R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

    STRI__ERROR_HANDLER_BEGIN(2)
    StriContainerUTF8 str_cont(str, vectorize_length);
    StriContainerCharClass pattern_cont(pattern, vectorize_length);

    // BT: this cannot be done with deque, because pattern is reused so i does not
    // go like 0,1,2...n but 0,pat_len,2*pat_len,1,pat_len+1 and so on
    // MG: agreed
    std::vector<int> which(vectorize_length);
    int result_counter = 0; // how many elements end up in the result

    for (R_len_t i = pattern_cont.vectorize_init();
            i != pattern_cont.vectorize_end();
            i = pattern_cont.vectorize_next(i))
    {
        if (str_cont.isNA(i) || pattern_cont.isNA(i)) {
            if (!omit_na1) {
                which[i] = NA_LOGICAL;
                ++result_counter;
            }
            else {
                which[i] = FALSE;
            }
            continue;
        }

        const UnicodeSet* pattern_cur = &pattern_cont.get(i);
        const char* bytes = str_cont.get(i).c_str();
        R_len_t nbytes = str_cont.get(i).length();

        // scan code points until the first member of the set is seen
        bool hit = false;
        UChar32 chr = 0;
        R_len_t pos = 0;
        while (!hit && pos < nbytes) {
            U8_NEXT(bytes, pos, nbytes, chr);
            if (chr < 0) // invalid utf-8 sequence
                throw StriException(MSG__INVALID_UTF8);
            if (pattern_cur->contains(chr))
                hit = true;
        }

        // negate flips the outcome; the flag is stored as 0/1
        which[i] = (int)(hit != negate_1);
        if (which[i])
            ++result_counter;
    }

    SEXP ret;
    STRI__PROTECT(ret = stri__subset_by_logical(str_cont, which, result_counter));
    STRI__UNPROTECT_ALL
    return ret;
    STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/** Container of freshly default-constructed UnicodeStrings
 *
 * The container is initialized with n == nrecycle == _nrecycle
 * (non-shallow). Nothing is allocated when the resulting n is 0.
 *
 * @param _nrecycle number of strings
 */
StriContainerUTF16::StriContainerUTF16(R_len_t _nrecycle)
{
    this->str = NULL;
    this->init_Base(_nrecycle, _nrecycle, false);

    if (this->n <= 0)
        return; // empty container: str stays NULL

    this->str = new UnicodeString[this->n];
    if (!this->str)
        throw StriException(MSG__MEM_ALLOC_ERROR);
}
/** Convert character vector to UTF-32
 *
 * Each string is decoded from UTF-8 into a vector of code points.
 * NA strings give NULL; a string with an invalid UTF-8 byte sequence
 * gives NULL as well and raises a warning.
 *
 * @param str character vector
 * @return list with integer vectors
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-26)
 *          use vector<UChar32> buf instead of R_alloc;
 *          warn and set NULL on improper UTF-8 byte sequences
 *
 * @version 0.2-3 (Marek Gagolewski, 2014-05-12)
 *          Use UChar32* instead of vector<UChar32> as ::data is C++11
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_enc_toutf32(SEXP str)
{
    PROTECT(str = stri_prepare_arg_string(str, "str"));
    R_len_t n = LENGTH(str);

    STRI__ERROR_HANDLER_BEGIN(1)
    StriContainerUTF8 str_cont(str, n);

    // a string of b bytes holds at most b code points, so the longest
    // byte length bounds the buffer; start at 1 to avoid an empty buffer
    R_len_t bufsize = 1;
    for (R_len_t i = 0; i < n; ++i) {
        if (!str_cont.isNA(i)) {
            R_len_t nbytes = str_cont.get(i).length();
            if (bufsize < nbytes)
                bufsize = nbytes;
        }
    }

    UChar32* buf = (UChar32*)R_alloc((size_t)bufsize, (int)sizeof(UChar32));
    if (!buf) throw StriException(MSG__MEM_ALLOC_ERROR);
    // deque<UChar32> was slower than using a common, over-sized buf

    SEXP ret;
    STRI__PROTECT(ret = Rf_allocVector(VECSXP, n));

    for (R_len_t i = 0; i < n; ++i) {
        if (str_cont.isNA(i)) {
            SET_VECTOR_ELT(ret, i, R_NilValue);
            continue;
        }

        const char* bytes = str_cont.get(i).c_str();
        R_len_t nbytes = str_cont.get(i).length();

        UChar32 cp = (UChar32)0;
        R_len_t rd = 0; // bytes consumed
        R_len_t wr = 0; // code points produced
        while (cp >= 0 && rd < nbytes) {
            U8_NEXT(bytes, rd, nbytes, cp);
            buf[wr++] = (int)cp;
        }

        if (cp < 0) {
            // decoding stopped at an invalid sequence
            Rf_warning(MSG__INVALID_UTF8);
            SET_VECTOR_ELT(ret, i, R_NilValue);
            continue;
        }

        SEXP conv;
        STRI__PROTECT(conv = Rf_allocVector(INTSXP, wr));
        memcpy(INTEGER(conv), buf, (size_t)sizeof(int)*wr);
        SET_VECTOR_ELT(ret, i, conv);
        STRI__UNPROTECT(1);
    }

    STRI__UNPROTECT_ALL
    return ret;
    STRI__ERROR_HANDLER_END({ /* do nothing on error */ })
}
/**
 * Substitutes vector elements if a pattern occurs in a string
 *
 * Every selected element of str is replaced with the next element of
 * value (values are consumed in order and recycled); elements that
 * are not selected are copied unchanged. A missing str or pattern
 * gives NA. An empty value raises an R error.
 *
 * @param str character vector
 * @param pattern character vector (a single string is used)
 * @param negate single logical value; if TRUE, the elements
 *    WITHOUT a match are the ones replaced
 * @param value character vector
 * @return character vector
 *
 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
 *    FR#124
 *
 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
 *    FR #216: `negate` arg added
 */
SEXP stri_subset_charclass_replacement(SEXP str, SEXP pattern, SEXP negate, SEXP value)
{
    bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate");
    PROTECT(str = stri_prepare_arg_string(str, "str"));
    PROTECT(pattern = stri_prepare_arg_string_1(pattern, "pattern"));
    PROTECT(value = stri_prepare_arg_string(value, "value"));
    int vectorize_length = LENGTH(str);
    int value_length = LENGTH(value);
    if (value_length == 0)
        Rf_error(MSG__REPLACEMENT_ZERO);

    STRI__ERROR_HANDLER_BEGIN(3)
    StriContainerUTF8 str_cont(str, vectorize_length);
    StriContainerUTF8 value_cont(value, value_length);
    StriContainerCharClass pattern_cont(pattern, vectorize_length);

    SEXP ret;
    STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length));

    R_len_t used = 0; // number of replacement values consumed so far
    for (R_len_t i = str_cont.vectorize_init();
            i != str_cont.vectorize_end();
            i = str_cont.vectorize_next(i))
    {
        if (str_cont.isNA(i) || pattern_cont.isNA(i)) {
            SET_STRING_ELT(ret, i, NA_STRING);
            continue;
        }

        const UnicodeSet* pattern_cur = &pattern_cont.get(i);
        const char* bytes = str_cont.get(i).c_str();
        R_len_t nbytes = str_cont.get(i).length();

        // scan code points until the first member of the set is seen
        bool found = false;
        UChar32 chr = 0;
        R_len_t pos = 0;
        while (!found && pos < nbytes) {
            U8_NEXT(bytes, pos, nbytes, chr);
            if (chr < 0) // invalid utf-8 sequence
                throw StriException(MSG__INVALID_UTF8);
            if (pattern_cur->contains(chr))
                found = true;
        }

        // selected <=> (found and not negated) or (not found and negated)
        if (found != negate_1) {
            SET_STRING_ELT(ret, i, value_cont.toR(used%value_length));
            ++used;
        }
        else {
            SET_STRING_ELT(ret, i, str_cont.toR(i));
        }
    }

    STRI__UNPROTECT_ALL
    return ret;
    STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/**
 * Build a container of byte strings from an R object
 *
 * - NULL: one default-constructed String8 (NA);
 * - raw vector: one String8 referring to its bytes;
 * - list: one String8 per element, each non-NULL element is read
 *   as a raw vector, NULL elements stay NA;
 * - anything else is taken to be a character vector (arguments are
 *   assumed to have been checked): one String8 per string, NA_STRING
 *   elements stay NA.
 *
 * All strings are initialized without allocating a copy.
 *
 * @param rstr R object
 *
 * if you want nrecycle > n, call set_nrecycle
 */
StriContainerListRaw::StriContainerListRaw(SEXP rstr)
{
    this->data = NULL;

    // classify the input first; the allocation below is the same for all kinds
    const bool is_null = isNull(rstr);
    const bool is_raw  = !is_null && isRaw(rstr);
    const bool is_list = !is_null && !is_raw && Rf_isVectorList(rstr);

    R_len_t nv = (is_null || is_raw) ? 1 : LENGTH(rstr);
    this->init_Base(nv, nv, true);
    this->data = new String8[this->n];
    if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR);

    if (is_null)
        return; // 1 string, NA

    if (is_raw) {
        this->data[0].initialize((const char*)RAW(rstr), LENGTH(rstr),
            false/*memalloc*/, false/*killbom*/, false/*isASCII*/); // shallow copy
        return;
    }

    if (is_list) {
        for (R_len_t k = 0; k < this->n; ++k) {
            SEXP elem = VECTOR_ELT(rstr, k);
            if (isNull(elem))
                continue; // leave as-is, i.e. NA
            this->data[k].initialize((const char*)RAW(elem), LENGTH(elem),
                false/*memalloc*/, false/*killbom*/, false/*isASCII*/); // shallow copy
        }
        return;
    }

    // it's surely a character vector (args have been checked)
    for (R_len_t k = 0; k < this->n; ++k) {
        SEXP elem = STRING_ELT(rstr, k);
        if (elem == NA_STRING)
            continue; // leave as-is, i.e. NA
        this->data[k].initialize(CHAR(elem), LENGTH(elem),
            false/*memalloc*/, false/*killbom*/, false/*isASCII*/); // shallow copy
    }
}
/** Copy constructor
 *
 * Copies the base part and then makes a separate copy of every
 * non-NULL StriContainerUTF8 held by `container`; NULL slots stay NULL.
 *
 * If copying an element throws, everything allocated so far is
 * released before the exception is passed on: the destructor is not
 * run for an object whose constructor throws, so the memory would
 * otherwise leak.
 *
 * @param container source
 */
StriContainerListUTF8::StriContainerListUTF8(StriContainerListUTF8& container)
    :    StriContainerBase((StriContainerBase&)container)
{
    this->data = NULL;
    if (!container.data)
        return; // nothing to copy

    this->data = new StriContainerUTF8*[this->n];
    if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR);

    // make every slot safely deletable before anything below can throw
    for (int i=0; i<this->n; ++i)
        this->data[i] = NULL;

    try {
        for (int i=0; i<container.n; ++i) {
            if (container.data[i]) {
                this->data[i] = new StriContainerUTF8(*container.data[i]);
                if (!this->data[i]) throw StriException(MSG__MEM_ALLOC_ERROR);
            }
        }
    }
    catch (...) {
        // roll back: free the copies made so far and the pointer array
        for (int i=0; i<this->n; ++i)
            delete this->data[i];
        delete [] this->data;
        this->data = NULL;
        throw;
    }
}
/** reset matcher * * will start search from the beginning next time */ void StriContainerByteSearch::resetMatcher() { #ifndef NDEBUG if (!this->searchStr || !this->patternStr) throw StriException("DEBUG: StriContainerByteSearch: setupMatcher() hasn't been called yet"); #endif this->searchPos = -1; this->searchEnd = -1; this->patternPos = -1; }
/**
 * Construct a list of String Containers from an R list
 *
 * One StriContainerUTF8 is created per list element.
 *
 * If the construction of an element throws, the containers created so
 * far and the pointer array are released before the exception is
 * passed on: the destructor is not run for an object whose
 * constructor throws, so the memory would otherwise leak.
 *
 * @param rvec R list vector
 * @param _nrecycle extend length of each character vector stored [vectorization]
 * @param _shallowrecycle will stored character vectors be ever modified?
 */
StriContainerListUTF8::StriContainerListUTF8(SEXP rvec, R_len_t _nrecycle, bool _shallowrecycle)
{
    this->data = NULL;
#ifndef NDEBUG
    if (!Rf_isVectorList(rvec))
        throw StriException("DEBUG: !isVectorList in StriContainerListUTF8::StriContainerListUTF8(SEXP rvec)");
#endif
    R_len_t rvec_length = LENGTH(rvec);
    this->init_Base(rvec_length, rvec_length, true);

    if (this->n <= 0)
        return; // empty list: nothing to allocate

    this->data = new StriContainerUTF8*[this->n];
    if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR);

    // make every slot safely deletable before anything below can throw
    for (R_len_t i=0; i<this->n; ++i)
        this->data[i] = NULL;

    try {
        for (R_len_t i=0; i<this->n; ++i) {
            this->data[i] = new StriContainerUTF8(VECTOR_ELT(rvec, i), _nrecycle, _shallowrecycle);
            if (!this->data[i]) throw StriException(MSG__MEM_ALLOC_ERROR);
        }
    }
    catch (...) {
        // roll back: free the containers built so far and the pointer array
        for (R_len_t i=0; i<this->n; ++i)
            delete this->data[i];
        delete [] this->data;
        this->data = NULL;
        throw;
    }
}
/** Copy constructor
 *
 * Copies the base part, then assigns every IntVec of `container`
 * to a newly allocated array of the same length.
 *
 * @param container source
 */
StriContainerListInt::StriContainerListInt(StriContainerListInt& container)
    :    StriContainerBase((StriContainerBase&)container)
{
    if (!container.data) {
        // the source holds no vectors
        this->data = NULL;
        return;
    }

    this->data = new IntVec[this->n];
    if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR);

    int k = 0;
    while (k < this->n) {
        this->data[k] = container.data[k];
        ++k;
    }
}
/**
 * Detect if a character class occurs in a string
 *
 * @param str character vector
 * @param pattern character vector
 * @param negate single bool; if TRUE, the result of every
 *    non-missing pair is inverted
 * @param max_count single int; a positive value is decremented with
 *    every TRUE result and once it reaches 0 all remaining elements
 *    are set to NA; a negative value is never decremented
 * @return logical vector
 *
 * @version 0.1-?? (Bartek Tartanus)
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-02)
 *          Use StrContainerUTF8 and CharClass classes
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-15)
 *          Use StrContainerCharClass
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-03)
 *          detects invalid UTF-8 byte stream
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-05)
 *          StriContainerCharClass now relies on UnicodeSet
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
 *    FR #216: `negate` arg added
 *
 * @version 1.3.1 (Marek Gagolewski, 2019-02-08)
 *    #232: `max_count` arg added
 */
SEXP stri_detect_charclass(SEXP str, SEXP pattern, SEXP negate, SEXP max_count)
{
    bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate");
    int max_count_1 = stri__prepare_arg_integer_1_notNA(max_count, "max_count");
    PROTECT(str = stri_prepare_arg_string(str, "str"));
    PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));
    R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

    STRI__ERROR_HANDLER_BEGIN(2)
    StriContainerUTF8 str_cont(str, vectorize_length);
    StriContainerCharClass pattern_cont(pattern, vectorize_length);

    SEXP ret;
    STRI__PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length));
    int* ret_tab = LOGICAL(ret);

    for (R_len_t i = pattern_cont.vectorize_init();
            i != pattern_cont.vectorize_end();
            i = pattern_cont.vectorize_next(i))
    {
        // budget exhausted, or nothing to test
        if (max_count_1 == 0 || str_cont.isNA(i) || pattern_cont.isNA(i)) {
            ret_tab[i] = NA_LOGICAL;
            continue;
        }

        const UnicodeSet* pattern_cur = &pattern_cont.get(i);
        const char* bytes = str_cont.get(i).c_str();
        R_len_t nbytes = str_cont.get(i).length();

        // scan code points until the first member of the set is seen
        bool found = false;
        UChar32 chr = 0;
        R_len_t pos = 0;
        while (!found && pos < nbytes) {
            U8_NEXT(bytes, pos, nbytes, chr);
            if (chr < 0) // invalid UTF-8 sequence
                throw StriException(MSG__INVALID_UTF8);
            if (pattern_cur->contains(chr))
                found = true;
        }

        // negate flips the outcome; the flag is stored as 0/1
        ret_tab[i] = (int)(found != negate_1);

        if (ret_tab[i] && max_count_1 > 0)
            --max_count_1;
    }

    STRI__UNPROTECT_ALL
    return ret;
    STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/** Copy assignment operator
 *
 * Destroys the current contents, copies the base part and then makes
 * a separate copy of every non-NULL StriContainerUTF8 held by
 * `container`; NULL slots stay NULL.
 *
 * Assigning an object to itself is a no-op: without this guard the
 * object would be destroyed first and then "copied" from its own
 * destroyed state.
 *
 * @param container source
 * @return *this
 */
StriContainerListUTF8& StriContainerListUTF8::operator=(StriContainerListUTF8& container)
{
    if (this == &container)
        return *this; // self-assignment: nothing to do

    this->~StriContainerListUTF8();
    (StriContainerBase&) (*this) = (StriContainerBase&)container;

    // definite value even if the array allocation below throws
    this->data = NULL;

    if (container.data) {
        this->data = new StriContainerUTF8*[this->n];
        if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR);

        // no slot may hold an indeterminate pointer
        // if copying one of the elements throws
        for (int i=0; i<this->n; ++i)
            this->data[i] = NULL;

        for (int i=0; i<container.n; ++i) {
            if (container.data[i]) {
                this->data[i] = new StriContainerUTF8(*container.data[i]);
                if (!this->data[i]) throw StriException(MSG__MEM_ALLOC_ERROR);
            }
        }
    }

    return *this;
}
/** * * @version 0.4-1 (Marek Gagolewski, 2014-12-07) */ void StriContainerByteSearch::upgradePatternCaseInsensitive() { UChar32 c = 0; R_len_t j = 0; patternLenCaseInsensitive = 0; while (j < patternLen) { U8_NEXT(patternStr, j, patternLen, c); #ifndef NDEBUG if (patternLenCaseInsensitive >= this->kmpMaxSize) throw StriException("!NDEBUG: StriContainerByteSearch::upgradePatternCaseInsensitive()"); #endif patternStrCaseInsensitive[patternLenCaseInsensitive++] = u_toupper(c); } patternStrCaseInsensitive[patternLenCaseInsensitive] = 0; }
/** Export string to R
 *  THE OUTPUT IS ALWAYS IN UTF-8
 *
 *  In debug builds an out-of-range index throws a StriException.
 *
 *  @param i index [with recycle]
 *  @return CHARSXP; NA_STRING for a missing string
 */
SEXP StriContainerUTF16::toR(R_len_t i) const
{
#ifndef NDEBUG
    if (i < 0 || i >= nrecycle)
        throw StriException("StriContainerUTF16::toR(): INDEX OUT OF BOUNDS");
#endif

    // NOTE(review): str is used here as an array of pointers, a NULL
    // entry meaning NA -- confirm against the class declaration.
    const R_len_t slot = i%n; // recycled index

    if (str[slot] == NULL)
        return NA_STRING;

    std::string utf8;
    str[slot]->toUTF8String(utf8);
    return Rf_mkCharLenCE(utf8.c_str(), (int)utf8.length(), (cetype_t)CE_UTF8);
}
/** Copy assignment operator
 *
 * Destroys the current contents, copies the base part, then assigns
 * every IntVec of `container` to a newly allocated array of the
 * same length.
 *
 * Assigning an object to itself is a no-op: without this guard the
 * object would be destroyed first and then "copied" from its own
 * destroyed state.
 *
 * @param container source
 * @return *this
 */
StriContainerListInt& StriContainerListInt::operator=(StriContainerListInt& container)
{
    if (this == &container)
        return *this; // self-assignment: nothing to do

    this->~StriContainerListInt();
    (StriContainerBase&) (*this) = (StriContainerBase&)container;

    // definite value even if the allocation below throws
    this->data = NULL;

    if (container.data) {
        this->data = new IntVec[this->n];
        if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR);
        for (int i=0; i<this->n; ++i) {
            this->data[i] = container.data[i];
        }
    }

    return *this;
}
/**
 * Count pattern occurrences in a string [with collation]
 *
 * The prepared `str` and `pattern` are PROTECTed for the whole call:
 * stri_prepare_arg_string() may return a newly allocated object, which
 * could otherwise be garbage-collected by any later allocation
 * (opening the collator, building the containers, allocating the
 * result).
 *
 * @param str character vector
 * @param pattern character vector
 * @param collator_opts passed to stri__ucol_open();
 * if that yields no collator, \code{stri__count_fixed_byte} is called
 * @return integer vector
 *
 * @version 0.1 (Marek Gagolewski)
 * @version 0.2 (Marek Gagolewski) - corrected behavior on empty str/pattern
 * @version 0.3 (Marek Gagolewski, 2013-06-23) make StriException-friendly,
 *    use StriContainerUStringSearch
 */
SEXP stri_count_fixed(SEXP str, SEXP pattern, SEXP collator_opts)
{
    PROTECT(str = stri_prepare_arg_string(str, "str"));
    PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));

    // call stri__ucol_open after prepare_arg:
    // if prepare_arg had failed, we would have a mem leak
    UCollator* collator = stri__ucol_open(collator_opts);
    if (!collator) {
        // no collator: byte-wise search
        SEXP ret_byte = stri__count_fixed_byte(str, pattern);
        UNPROTECT(2); // str, pattern (UNPROTECT itself does not allocate)
        return ret_byte;
    }

    STRI__ERROR_HANDLER_BEGIN
    R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));
    StriContainerUTF16 str_cont(str, vectorize_length);
    StriContainerUStringSearch pattern_cont(pattern, vectorize_length, collator);  // collator is not owned by pattern_cont

    SEXP ret;
    PROTECT(ret = Rf_allocVector(INTSXP, vectorize_length));
    int* ret_tab = INTEGER(ret);

    for (R_len_t i = pattern_cont.vectorize_init();
            i != pattern_cont.vectorize_end();
            i = pattern_cont.vectorize_next(i))
    {
        STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont,
            ret_tab[i] = NA_INTEGER,
            ret_tab[i] = 0)

        UStringSearch *matcher = pattern_cont.getMatcher(i, str_cont.get(i));
        usearch_reset(matcher);

        UErrorCode status = U_ZERO_ERROR;
        ret_tab[i] = 0;
        while (((int)usearch_next(matcher, &status) != USEARCH_DONE) && !U_FAILURE(status))
            ++ret_tab[i];
        if (U_FAILURE(status)) throw StriException(status);
    }

    if (collator) { ucol_close(collator); collator=NULL; }
    UNPROTECT(3); // str, pattern, ret
    return ret;

    // On the error path the protection stack is left as it is, exactly as
    // the pre-existing PROTECT(ret) already was; this relies on the error
    // handler raising an R error, which unwinds the protection stack.
    STRI__ERROR_HANDLER_END(
        if (collator) ucol_close(collator);
    )
}
/**
 * Convert character vector between marked encodings and the encoding provided
 *
 * The input is first brought to UTF-16 (StriContainerUTF16) and each
 * string is then converted with an ICU converter for the target
 * encoding. One buffer, sized for the longest string, is shared by
 * all conversions.
 *
 * @param str     input character vector or list of raw vectors
 * @param to    target encoding, \code{NULL} or \code{""} for default enc
 * @param to_raw single logical, should list of raw vectors be returned?
 * @return a converted character vector or list of raw vectors;
 *    NA inputs give NA (or NULL in a list of raw vectors)
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-11-12)
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-28)
 *          use StriUcnv
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-01)
 *          calc required buf size a priori
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_encode_from_marked(SEXP str, SEXP to, SEXP to_raw)
{
    PROTECT(str = stri_prepare_arg_string(str, "str"));
    const char* selected_to   = stri__prepare_arg_enc(to, "to", true); /* this is R_alloc'ed */
    bool to_raw_logical = stri__prepare_arg_logical_1_notNA(to_raw, "to_raw");

    STRI__ERROR_HANDLER_BEGIN(1)
    R_len_t str_n = LENGTH(str);
    StriContainerUTF16 str_cont(str, str_n);

    // get the number of strings to convert; if == 0, then you know what's the result
    if (str_n <= 0) {
        // leave through the regular unprotect path: a bare `return` here
        // would leave the PROTECTed `str` on the protection stack
        SEXP empty;
        STRI__PROTECT(empty = Rf_allocVector(to_raw_logical?VECSXP:STRSXP, 0));
        STRI__UNPROTECT_ALL
        return empty;
    }

    // Open converters
    StriUcnv ucnv(selected_to);
    UConverter* uconv_to = ucnv.getConverter(true /*register_callbacks*/);

    // Get target encoding mark
    cetype_t encmark_to = to_raw_logical?CE_BYTES:ucnv.getCE();

    // Prepare out val
    SEXP ret;
    STRI__PROTECT(ret = Rf_allocVector(to_raw_logical?VECSXP:STRSXP, str_n));

    // calculate required buf size
    R_len_t bufsize = 0;
    for (R_len_t i=0; i<str_n; ++i) {
        if (!str_cont.isNA(i) && str_cont.get(i).length() > bufsize)
            bufsize = str_cont.get(i).length();
    }
    bufsize = UCNV_GET_MAX_BYTES_FOR_STRING(bufsize, ucnv_getMaxCharSize(uconv_to));
    // "The calculated size is guaranteed to be sufficient for this conversion."
    String8buf buf(bufsize);

    for (R_len_t i=0; i<str_n; ++i) {
        if (str_cont.isNA(i)) {
            if (to_raw_logical) SET_VECTOR_ELT(ret, i, R_NilValue);
            else                SET_STRING_ELT(ret, i, NA_STRING);
            continue;
        }

        R_len_t curn_tmp = str_cont.get(i).length();
        const UChar* curs_tmp = str_cont.get(i).getBuffer(); // The buffer content is (probably) not NUL-terminated.
        if (!curs_tmp)
            throw StriException(MSG__INTERNAL_ERROR);

        UErrorCode status = U_ZERO_ERROR;
        ucnv_resetFromUnicode(uconv_to);
        R_len_t bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(),
                                          curs_tmp, curn_tmp, &status);
        if (bufneed <= buf.size()) {
            STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
        }
        else {// larger buffer needed [this shouldn't happen?]
            buf.resize(bufneed, false/*destroy contents*/);
            status = U_ZERO_ERROR;
            bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(),
                                      curs_tmp, curn_tmp, &status);
            STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
        }

        if (to_raw_logical) {
            SEXP outobj;
            STRI__PROTECT(outobj = Rf_allocVector(RAWSXP, bufneed));
            memcpy(RAW(outobj), buf.data(), (size_t)bufneed);
            SET_VECTOR_ELT(ret, i, outobj);
            STRI__UNPROTECT(1);
        }
        else {
            SET_STRING_ELT(ret, i,
                           Rf_mkCharLenCE(buf.data(), bufneed, encmark_to));
        }
    }

    STRI__UNPROTECT_ALL
    return ret;

    STRI__ERROR_HANDLER_END({/* nothing special on error */})
}
/** Convert character vector to UTF-8
 *
 * Two stages:
 *  1. conversion -- either delegated to StriContainerUTF8, or, when
 *     is_unknown_8bit is TRUE, done by hand: strings marked ASCII/UTF-8
 *     are passed through (minus a leading BOM), in all other strings
 *     every non-single-byte unit becomes U+FFFD;
 *  2. optional validation of the resulting UTF-8 byte streams.
 *
 * @param str character vector
 * @param is_unknown_8bit single logical value;
 * if TRUE, then in case of ENC_NATIVE or ENC_LATIN1, UTF-8
 * REPLACEMENT CHARACTERs (U+FFFD) are
 * put for codes > 127
 * @param validate single logical value (or NA);
 * FALSE: no validation; NA: invalid strings become NA (with a
 * warning); TRUE: invalid sequences are replaced with U+FFFD
 * (with a warning each)
 *
 * @return character vector
 *
 * @version 0.1-XX (Marek Gagolewski)
 *
 * @version 0.1-XX (Marek Gagolewski, 2013-06-16)
 *                  make StriException-friendly
 *
 * @version 0.2-1  (Marek Gagolewski, 2014-03-26)
 *                 Use one String8buf;
 *                 is_unknown_8bit_logical and UTF-8 tries now to remove BOMs
 *
 * @version 0.2-1  (Marek Gagolewksi, 2014-03-30)
 *                 added validate arg
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_enc_toutf8(SEXP str, SEXP is_unknown_8bit, SEXP validate)
{
    PROTECT(validate = stri_prepare_arg_logical_1(validate, "validate"));
    bool is_unknown_8bit_logical =
        stri__prepare_arg_logical_1_notNA(is_unknown_8bit, "is_unknown_8bit");
    PROTECT(str = stri_prepare_arg_string(str, "str"));
    R_len_t n = LENGTH(str);

    STRI__ERROR_HANDLER_BEGIN(2)
    SEXP ret;

    // ---- stage 1: conversion -------------------------------------------
    if (is_unknown_8bit_logical) {
        // only strings that are neither NA nor marked ASCII/UTF-8 need the buffer
        R_len_t longest = 0;
        for (R_len_t i = 0; i < n; ++i) {
            SEXP elem = STRING_ELT(str, i);
            if (elem == NA_STRING || IS_ASCII(elem) || IS_UTF8(elem))
                continue;
            R_len_t elem_n = LENGTH(elem);
            if (longest < elem_n)
                longest = elem_n;
        }

        String8buf buf(longest*3); // either 1 byte < 127 or U+FFFD == 3 bytes UTF-8
        char* out = buf.data();

        STRI__PROTECT(ret = Rf_allocVector(STRSXP, n));
        for (R_len_t i = 0; i < n; ++i) {
            SEXP elem = STRING_ELT(str, i);
            if (elem == NA_STRING) {
                SET_STRING_ELT(ret, i, NA_STRING);
                continue;
            }

            const char* in = CHAR(elem);
            R_len_t in_n = LENGTH(elem);

            if (IS_ASCII(elem) || IS_UTF8(elem)) {
                const bool has_bom = in_n >= 3 &&
                    (uint8_t)(in[0]) == UTF8_BOM_BYTE1 &&
                    (uint8_t)(in[1]) == UTF8_BOM_BYTE2 &&
                    (uint8_t)(in[2]) == UTF8_BOM_BYTE3;
                if (has_bom) {
                    // has BOM - get rid of it
                    SET_STRING_ELT(ret, i, Rf_mkCharLenCE(in+3, in_n-3, CE_UTF8));
                }
                else {
                    SET_STRING_ELT(ret, i, elem);
                }
                continue;
            }

            // otherwise, we have an 8-bit encoding
            R_len_t out_n = 0;
            for (R_len_t j = 0; j < in_n; ++j) {
                if (U8_IS_SINGLE(in[j])) {
                    out[out_n++] = in[j];
                }
                else { // 0xEF 0xBF 0xBD
                    out[out_n++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE1;
                    out[out_n++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE2;
                    out[out_n++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE3;
                }
            }
            SET_STRING_ELT(ret, i, Rf_mkCharLenCE(out, out_n, CE_UTF8));
        }
    }
    else {
        // Trivial - everything we need is in StriContainerUTF8 :)
        // which removes BOMs silently
        StriContainerUTF8 str_cont(str, n);
        STRI__PROTECT(ret = str_cont.toR());
    }

    // ---- stage 2: validate utf8 byte stream ----------------------------
    const int validate_flag = LOGICAL(validate)[0];
    if (validate_flag != FALSE) { // NA or TRUE
        R_len_t ret_n = LENGTH(ret);
        for (R_len_t i = 0; i < ret_n; ++i) {
            SEXP elem = STRING_ELT(ret, i);
            if (elem == NA_STRING)
                continue;

            const char* s = CHAR(elem);
            R_len_t sn = LENGTH(elem);

            // look for the first invalid sequence, if any
            R_len_t j = 0;
            UChar32 c = 0;
            while (c >= 0 && j < sn) {
                U8_NEXT(s, j, sn, c);
            }
            if (c >= 0)
                continue; // valid, nothing to do

            if (validate_flag == NA_LOGICAL) {
                Rf_warning(MSG__INVALID_CODE_POINT_REPLNA);
                SET_STRING_ELT(ret, i, NA_STRING);
                continue;
            }

            // validate is TRUE: re-encode, substituting U+FFFD where needed
            int fixsize = sn*3; // maximum: 1 byte -> U+FFFD (3 bytes)
            String8buf fixbuf(fixsize);
            char* fixed = fixbuf.data();

            R_len_t k = 0;
            UBool err = FALSE;
            j = 0;
            while (!err && j < sn) {
                U8_NEXT(s, j, sn, c);
                if (c < 0) {
                    Rf_warning(MSG__INVALID_CODE_POINT_FIXING);
                    fixed[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE1;
                    fixed[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE2;
                    fixed[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE3;
                }
                else {
                    U8_APPEND((uint8_t*)fixed, k, fixsize, c, err);
                }
            }

            if (err)
                throw StriException(MSG__INTERNAL_ERROR);
            SET_STRING_ELT(ret, i, Rf_mkCharLenCE(fixed, k, CE_UTF8));
        }
    }

    STRI__UNPROTECT_ALL
    return ret;
    STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/** internal function - replace multiple substrings in a single string
 *
 * can raise Rf_error
 *
 * The chunks to replace are given as code point ranges, either by
 * (from, to) or by (from, length); they are processed in the given order
 * and must be sorted and non-overlapping, otherwise a StriException
 * (MSG__OVERLAPPING_OR_UNSORTED_INDEXES) is thrown (and handed over
 * to the STRI__ERROR_HANDLER_* machinery).
 *
 * @param curs CHARSXP in UTF-8; the string to modify
 * @param from,to,length index specifiers, decoded by
 *    stri__sub_prepare_from_to_length(); negative from/to values count
 *    from the end of the string, negative lengths are treated as 0
 * @param omit_na_1 if false, any NA in from/to/length/value makes
 *    the result NA_STRING; if true, chunks with NAs are skipped
 * @param value replacement strings (recycled); converted to UTF-8 here
 *
 * @return a CHARSXP: curs itself if there is nothing to replace,
 *    NA_STRING if value is empty (with a warning) or an NA was found
 *    (see omit_na_1), a new UTF-8 string otherwise
 *
 * @version 1.3.2 (Marek Gagolewski, 2019-02-23)
 *
 * @version 1.4.3 (Marek Gagolewski, 2019-03-12)
 *    #346: na_omit for `value`
 */
SEXP stri__sub_replacement_all_single(SEXP curs,
   SEXP from, SEXP to, SEXP length, bool omit_na_1, SEXP value)
{
   // curs is a CHARSXP in UTF-8

   PROTECT(value = stri_enc_toutf8(value, Rf_ScalarLogical(FALSE), Rf_ScalarLogical(FALSE)));
   R_len_t value_len = LENGTH(value);

   R_len_t from_len   = 0; // see below
   R_len_t to_len     = 0; // see below
   R_len_t length_len = 0; // see below
   int* from_tab      = 0; // see below
   int* to_tab        = 0; // see below
   int* length_tab    = 0; // see below

   R_len_t sub_protected = 1+ /* how many objects to PROTECT on ret? */
      stri__sub_prepare_from_to_length(from, to, length,
         from_len, to_len, length_len, from_tab, to_tab, length_tab);

   R_len_t vectorize_len = stri__recycling_rule(true, 2, // does not care about value_len
      from_len, (to_len>length_len)?to_len:length_len);

   if (vectorize_len <= 0) {
      // "nothing" is being replaced -> return the input as-is
      UNPROTECT(sub_protected);
      return curs;
   }

   if (value_len <= 0) {
      // things are supposed to be replaced with "nothing"...
      UNPROTECT(sub_protected);
      Rf_warning(MSG__REPLACEMENT_ZERO);
      return NA_STRING;
   }

   if (vectorize_len % value_len != 0)
      Rf_warning(MSG__WARN_RECYCLING_RULE2);

   const char* curs_s = CHAR(curs); // already in UTF-8
   R_len_t curs_n = LENGTH(curs);

   // first check for NAs: unless they are to be omitted,
   // a single NA index or NA replacement makes the whole result NA
   if (!omit_na_1) {
      for (R_len_t i=0; i<vectorize_len; ++i) {
         R_len_t cur_from = from_tab[i % from_len];
         R_len_t cur_to   = (to_tab)?to_tab[i % to_len]:length_tab[i % length_len];
         if (cur_from == NA_INTEGER || cur_to == NA_INTEGER) {
            UNPROTECT(sub_protected);
            return NA_STRING;
         }
      }

      for (R_len_t i=0; i<vectorize_len; ++i) {
         if (STRING_ELT(value, i%value_len) == NA_STRING) {
            UNPROTECT(sub_protected);
            return NA_STRING;
         }
      }
   }

   // get the number of code points in curs
   // (for negative indexes and for clamping the ranges)
   R_len_t curs_m;
   if (IS_ASCII(curs))
      curs_m = curs_n;
   else { // is UTF-8
      curs_m = 0;    // code points count
      R_len_t j = 0; // byte pos
      while (j < curs_n) {
         U8_FWD_1_UNSAFE(curs_s, j);
         ++curs_m;
      }
   }

   STRI__ERROR_HANDLER_BEGIN(sub_protected)
   std::vector<char> buf; // convenience >> speed

   R_len_t last_pos = 0; // code point index up to which curs was consumed
   R_len_t byte_pos = 0; // byte offset in curs_s corresponding to last_pos

   for (R_len_t i=0; i<vectorize_len; ++i) {
      R_len_t cur_from = from_tab[i % from_len];
      R_len_t cur_to   = (to_tab)?to_tab[i % to_len]:length_tab[i % length_len];

      // reachable only if omit_na_1 is true (see the NA check above)
      if (cur_from == NA_INTEGER || cur_to == NA_INTEGER
            || STRING_ELT(value, i%value_len) == NA_STRING) {
         continue;
      }

      if (cur_from < 0) cur_from = curs_m+cur_from+1;
      if (cur_from <= 0) cur_from = 1;
      cur_from--; // 1-based -> 0-based index
      if (cur_from >= curs_m) cur_from = curs_m; // cur_from is in [0, curs_m]

      if (length_tab) {
         if (cur_to < 0) cur_to = 0;
         // cur_from+cur_to may exceed INT_MAX (signed overflow),
         // whereas curs_m-cur_from is always in [0, curs_m]
         if (cur_to >= curs_m-cur_from) cur_to = curs_m;
         else cur_to = cur_from+cur_to;
      }
      else {
         if (cur_to < 0) cur_to = curs_m+cur_to+1;
         if (cur_to < cur_from) cur_to = cur_from; // insertion
      }
      if (cur_to >= curs_m) cur_to = curs_m;

      // the chunk to replace is at code points [cur_from, cur_to)
      if (last_pos > cur_from)
         throw StriException(MSG__OVERLAPPING_OR_UNSORTED_INDEXES);

      // first, copy [last_pos, cur_from)
      const R_len_t byte_pos_last = byte_pos;
      while (last_pos < cur_from) {
         U8_FWD_1_UNSAFE(curs_s, byte_pos);
         ++last_pos;
      }
      buf.insert(buf.end(), curs_s+byte_pos_last, curs_s+byte_pos);

      // then, copy the corresponding replacement string
      SEXP value_cur = STRING_ELT(value, i%value_len);
      const char* value_s = CHAR(value_cur);
      buf.insert(buf.end(), value_s, value_s+LENGTH(value_cur));

      // lastly, skip the replaced chunk, i.e., move last_pos to cur_to
      while (last_pos < cur_to) {
         U8_FWD_1_UNSAFE(curs_s, byte_pos);
         ++last_pos;
      }
   }

   // finally, copy [last_pos, curs_m);
   // the guard matters only if the UNSAFE stepping ran past the end of curs_s
   if (byte_pos < curs_n)
      buf.insert(buf.end(), curs_s+byte_pos, curs_s+curs_n);

   SEXP ret;
   // data() of an empty vector may be a null pointer
   STRI__PROTECT(ret = Rf_mkCharLenCE(buf.empty()?"":buf.data(), (int)buf.size(), CE_UTF8));
   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/**
 * Strip code points from the left AND/OR the right side of each string
 *
 * Scanning starts at the selected end(s) of a string and stops at
 * the first code point that is contained in the UnicodeSet obtained
 * from \code{pattern}; everything skipped on the way is dropped.
 *
 * Throws StriException (MSG__INVALID_UTF8) when an ill-formed UTF-8
 * sequence is met while scanning.
 *
 * @param str character vector
 * @param pattern character vector (accessed via StriContainerCharClass)
 * @param left strip from the left side?
 * @param right strip from the right side?
 * @return character vector; NA wherever str or pattern is NA
 *
 * @version 0.1-?? (Bartek Tartanus)
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-04)
 *          Use StriContainerUTF8 and CharClass
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly & Use StrContainerCharClass
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-03)
 *          detects invalid UTF-8 byte stream
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-05)
 *          StriContainerCharClass now relies on UnicodeSet
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri__trim_leftright(SEXP str, SEXP pattern, bool left, bool right)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));
   R_len_t vectorize_length =
      stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

   STRI__ERROR_HANDLER_BEGIN(2)
   StriContainerUTF8 str_cont(str, vectorize_length);
   StriContainerCharClass pattern_cont(pattern, vectorize_length);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length));

   R_len_t i = pattern_cont.vectorize_init();
   while (i != pattern_cont.vectorize_end())
   {
      if (str_cont.isNA(i) || pattern_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
      }
      else {
         const UnicodeSet* stop_set = &pattern_cont.get(i);
         const char* bytes = str_cont.get(i).c_str();
         R_len_t nbytes = str_cont.get(i).length();

         R_len_t keep_begin = 0;      // byte offset of the first byte to keep
         R_len_t keep_end   = nbytes; // byte offset one past the last byte to keep

         if (left) {
            R_len_t pos = 0;
            while (pos < nbytes) {
               UChar32 cp;
               U8_NEXT(bytes, pos, nbytes, cp); // "look ahead"
               if (cp < 0) // invalid utf-8 sequence
                  throw StriException(MSG__INVALID_UTF8);
               if (stop_set->contains(cp))
                  break; // first member of the set - stop here
               keep_begin = pos;
            }
         }

         // nothing to do on the right if the whole string is already gone
         if (right && keep_begin < nbytes) {
            R_len_t pos = nbytes;
            while (pos > 0) {
               UChar32 cp;
               U8_PREV(bytes, 0, pos, cp); // "look behind"
               if (cp < 0) // invalid utf-8 sequence
                  throw StriException(MSG__INVALID_UTF8);
               if (stop_set->contains(cp))
                  break; // first member of the set - stop here
               keep_end = pos;
            }
         }

         // copy the bytes in [keep_begin, keep_end)
         SET_STRING_ELT(ret, i,
            Rf_mkCharLenCE(bytes+keep_begin, (keep_end-keep_begin), CE_UTF8));
      }

      i = pattern_cont.vectorize_next(i);
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/**
 * Get the number of characters in each string
 *
 * Note that ICU permits only strings of length < 2^31.
 *
 * How a string is counted depends on its encoding mark:
 * ASCII and Latin-1 strings as well as strings in a native 8-bit
 * encoding have as many characters as bytes; UTF-8 (marked, or native
 * UTF-8) strings are walked with U8_NEXT; strings in any other native
 * encoding are walked with ucnv_getNextUChar. Bytes-marked strings
 * cause a StriException (MSG__BYTESENC). An ill-formed UTF-8 string
 * yields NA together with a warning.
 *
 * @param str R character vector
 * @return integer vector; NA_integer_ for NA strings
 *
 * @version 0.1-?? (Marcin Bujarski)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          Multiple input encoding support
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-27)
 *          using StriUcnv;
 *          warn on invalid utf-8 sequences
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_length(SEXP str)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));

   STRI__ERROR_HANDLER_BEGIN(1)
   R_len_t str_n = LENGTH(str);
   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(INTSXP, str_n));
   int* counts = INTEGER(ret);

   StriUcnv ucnvNative(NULL);

   for (R_len_t k = 0; k < str_n; k++) {
      SEXP elem = STRING_ELT(str, k);
      if (elem == NA_STRING) {
         counts[k] = NA_INTEGER;
         continue;
      }

      R_len_t nbytes = LENGTH(elem); // O(1) - stored by R

      // 1 byte == 1 character
      if (IS_ASCII(elem) || IS_LATIN1(elem)) {
         counts[k] = nbytes;
         continue;
      }

      if (IS_BYTES(elem))
         throw StriException(MSG__BYTESENC);

      // utf8 or native-utf8
      if (IS_UTF8(elem) || ucnvNative.isUTF8()) {
         const char* bytes = CHAR(elem);
         UChar32 cp = 0;
         R_len_t pos = 0;
         R_len_t ncp = 0;
         // U8_NEXT reports an ill-formed sequence via cp < 0
         while (cp >= 0 && pos < nbytes) {
            U8_NEXT(bytes, pos, nbytes, cp);
            ncp++;
         }

         if (cp >= 0)
            counts[k] = ncp;
         else { // invalid utf-8 sequence
            Rf_warning(MSG__INVALID_UTF8);
            counts[k] = NA_INTEGER;
         }
         continue;
      }

      // native-8bit
      if (ucnvNative.is8bit()) {
         counts[k] = nbytes;
         continue;
      }

      // native encoding which is neither 8-bit, nor UTF-8 (e.g. 'Big5'):
      // decode character by character, discarding the decoded values
      UConverter* uconv = ucnvNative.getConverter();
      UErrorCode status = U_ZERO_ERROR;
      const char* source = CHAR(elem);
      const char* sourceLimit = source + nbytes;
      R_len_t nchars = 0;
      while (source != sourceLimit) {
         /*ignore_retval=*/ucnv_getNextUChar(uconv, &source, sourceLimit, &status);
         STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
         nchars++;
      }
      counts[k] = nchars;
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END({ /* no special action on error */ })
}
/**
 * Convert character vector between given encodings
 *
 * Each input string is first decoded to UTF-16 with the `from` converter
 * and then encoded with the `to` converter. If no source encoding is
 * selected and `str` is an atomic vector, the call is forwarded to
 * stri_encode_from_marked().
 *
 * The resulting strings are marked as UTF-8 (targets named "US-ASCII"
 * or "UTF-8"), Latin-1 ("ISO-8859-1"), native (ICU's default converter
 * name), or bytes (any other target).
 *
 * @param str input character vector or list of raw vectors
 * @param from source encoding, \code{NULL} or \code{""} for default enc
 * @param to target encoding, \code{NULL} or \code{""} for default enc
 * @param to_raw single logical, should list of raw vectors be returned?
 * @return a converted character vector or list of raw vectors;
 *    NA inputs give NA_character_ (or NULL list elements if to_raw)
 *
 * @version 0.1 (Marek Gagolewski)
 * @version 0.2 (Marek Gagolewski) arg to_raw_added, encoding marking
 * @version 0.3 (Marek Gagolewski, 2013-06-16) make StriException-friendly
 * @version 0.4 (Marek Gagolewski, 2013-08-08) use StriContainerListRaw
 * @version 0.5 (Marek Gagolewski, 2013-11-20) BUGFIX call stri_encode_from_marked if necessary
 */
SEXP stri_encode(SEXP str, SEXP from, SEXP to, SEXP to_raw)
{
   const char* selected_from = stri__prepare_arg_enc(from, "from", true);
   if (!selected_from && Rf_isVectorAtomic(str))
      return stri_encode_from_marked(str, to, to_raw);

   // the prepared object may be freshly allocated: it must survive
   // all the allocations below (Rf_allocVector, Rf_mkCharLenCE, ...)
   PROTECT(str = stri_prepare_arg_list_raw(str, "str"));
   const char* selected_to   = stri__prepare_arg_enc(to, "to", true);
   bool to_raw_logical = stri__prepare_arg_logical_1_notNA(to_raw, "to_raw");

   UConverter* uconv_from = NULL;
   UConverter* uconv_to = NULL;

   STRI__ERROR_HANDLER_BEGIN
   StriContainerListRaw str_cont(str);
   R_len_t str_n = str_cont.get_n();

   // get the number of strings to convert; if == 0, then you know what's the result
   if (str_n <= 0) {
      SEXP empty = Rf_allocVector(to_raw_logical?VECSXP:STRSXP, 0);
      UNPROTECT(1); // str
      return empty;
   }

   // Open converters
   uconv_from = stri__ucnv_open(selected_from);
   uconv_to = stri__ucnv_open(selected_to);

   // Get target encoding mark
   UErrorCode err = U_ZERO_ERROR;
   const char* uconv_to_name = ucnv_getName(uconv_to, &err);
   if (U_FAILURE(err))
      throw StriException(err);
   cetype_t encmark_to = CE_BYTES; // all other cases than the below ones
      // - bytes enc (this is reasonable, isn't it?)
   if (!to_raw_logical) { // otherwise not needed
      if (!strcmp(uconv_to_name, "US-ASCII") || !strcmp(uconv_to_name, "UTF-8"))
         encmark_to = CE_UTF8; // no CE for ASCII, will be auto-detected by mkCharLenCE
      else if (!strcmp(uconv_to_name, "ISO-8859-1"))
         encmark_to = CE_LATIN1;
      else if (!strcmp(uconv_to_name, ucnv_getDefaultName()))
         encmark_to = CE_NATIVE;
   }

   // Prepare out val
   SEXP ret;
   PROTECT(ret = Rf_allocVector(to_raw_logical?VECSXP:STRSXP, str_n));

   String8 buf(0); // will be extended in a moment

   for (R_len_t i=0; i<str_n; ++i) {
      if (str_cont.isNA(i)) {
         if (to_raw_logical) SET_VECTOR_ELT(ret, i, R_NilValue);
         else                SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      const char* curd = str_cont.get(i).c_str();
      R_len_t curn     = str_cont.get(i).length();

      err = U_ZERO_ERROR;
      UnicodeString encs(curd, curn, uconv_from, err); // FROM -> UTF-16 [this is the slow part]
      if (U_FAILURE(err))
         throw StriException(err);

      R_len_t curn_tmp = encs.length();
      const UChar* curs_tmp = encs.getBuffer(); // The buffer contents is (probably) not NUL-terminated.
      if (!curs_tmp)
         throw StriException(MSG__INTERNAL_ERROR);

      R_len_t bufneed = UCNV_GET_MAX_BYTES_FOR_STRING(curn_tmp, ucnv_getMaxCharSize(uconv_to));
      // "The calculated size is guaranteed to be sufficient for this conversion."
      buf.resize(bufneed);

      err = U_ZERO_ERROR;
      // UTF-16 -> TO
      ucnv_resetFromUnicode(uconv_to);
      bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(), curs_tmp, curn_tmp, &err);
      if (bufneed <= buf.size()) {
         if (U_FAILURE(err))
            throw StriException(err);
      }
      else { // larger buffer needed [this shouldn't happen?]
         buf.resize(bufneed);
         err = U_ZERO_ERROR;
         bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(), curs_tmp, curn_tmp, &err);
         if (U_FAILURE(err))
            throw StriException(err);
         if (bufneed > buf.size())
            throw StriException(MSG__INTERNAL_ERROR);
      }

      if (to_raw_logical) {
         // no allocation takes place between Rf_allocVector and
         // SET_VECTOR_ELT, hence outobj does not need to be PROTECTed
         SEXP outobj = Rf_allocVector(RAWSXP, bufneed);
         memcpy(RAW(outobj), buf.data(), (size_t)bufneed);
         SET_VECTOR_ELT(ret, i, outobj);
      }
      else {
         SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), bufneed, encmark_to));
      }
   }

   if (uconv_from) {
      ucnv_close(uconv_from);
      uconv_from = NULL;
   }
   if (uconv_to) {
      ucnv_close(uconv_to);
      uconv_to = NULL;
   }

   UNPROTECT(2); // str, ret
   return ret;
   STRI__ERROR_HANDLER_END({
      if (uconv_from)
         ucnv_close(uconv_from);
      if (uconv_to)
         ucnv_close(uconv_to);
   })
}
/**
 * Construct String Container from R character vector
 *
 * Every non-NA string is converted to a UnicodeString according to its
 * encoding mark (ASCII, UTF-8, Latin-1; unmarked strings are assumed
 * to be in the native encoding); NA strings are represented by NULL.
 * Bytes-marked strings cause a StriException (MSG__BYTESENC).
 *
 * If a conversion fails, all resources acquired so far (ICU converters,
 * the already created UnicodeString objects, and the array holding them)
 * are released before the exception is propagated to the caller.
 *
 * @param rstr R character vector
 * @param _nrecycle extend length [vectorization]
 * @param _shallowrecycle will \code{this->str} be ever modified?
 *    if false, the slots beyond LENGTH(rstr) get their own copies
 *    of the recycled strings
 */
StriContainerUTF16::StriContainerUTF16(SEXP rstr, R_len_t _nrecycle, bool _shallowrecycle)
{
   this->str = NULL;
#ifndef NDEBUG
   if (!isString(rstr))
      throw StriException("DEBUG: !isString in StriContainerUTF16::StriContainerUTF16(SEXP rstr)");
#endif
   R_len_t nrstr = LENGTH(rstr);
   this->init_Base(nrstr, _nrecycle, _shallowrecycle); // calling LENGTH(rstr) fails on constructor call

   if (this->n > 0) {
      this->str = new UnicodeString*[this->n];
      for (R_len_t i=0; i<this->n; ++i)
         this->str[i] = NULL; // in case it fails during conversion (this is NA)

      // converters are opened lazily, on first use
      UConverter* ucnvASCII = NULL;
      UConverter* ucnvLatin1 = NULL;
      UConverter* ucnvNative = NULL;

      try {
         for (R_len_t i=0; i<nrstr; ++i) {
            SEXP curs = STRING_ELT(rstr, i);
            if (curs == NA_STRING)
               continue; // keep NA

            if (IS_ASCII(curs)) {
               if (!ucnvASCII) ucnvASCII = stri__ucnv_open("ASCII");
               UErrorCode status = U_ZERO_ERROR;
               this->str[i] = new UnicodeString(CHAR(curs), LENGTH(curs),
                  ucnvASCII, status);
               if (U_FAILURE(status))
                  throw StriException(status);

               // Performance improvement attempt #1:
               // this->str[i] = new UnicodeString(UnicodeString::fromUTF8(CHAR(curs)));
               // slower than the above

               // Performance improvement attempt #2:
               // Create UChar buf with LENGTH(curs) items, fill it with (CHAR(curs)[i], 0x00), i=1,...
               // This wasn't faster than the ucnvASCII approach.
            }
            else if (IS_UTF8(curs)) {
               // the above ASCII-approach (but with ucnvUTF8) is slower for UTF-8
               this->str[i] = new UnicodeString(UnicodeString::fromUTF8(CHAR(curs)));
            }
            else if (IS_LATIN1(curs)) {
               if (!ucnvLatin1) ucnvLatin1 = stri__ucnv_open("ISO-8859-1");
               UErrorCode status = U_ZERO_ERROR;
               this->str[i] = new UnicodeString(CHAR(curs), LENGTH(curs),
                  ucnvLatin1, status);
               if (U_FAILURE(status))
                  throw StriException(status);
            }
            else if (IS_BYTES(curs))
               throw StriException(MSG__BYTESENC);
            else {
               // Any encoding - detection needed
               // Assume it's Native; this assumes the user working in an 8-bit environment
               // would convert strings to UTF-8 manually if needed - I think it's
               // a more reasonable approach (Native --> input via keyboard)
               if (!ucnvNative) ucnvNative = stri__ucnv_open((char*)NULL);
               UErrorCode status = U_ZERO_ERROR;
               this->str[i] = new UnicodeString(CHAR(curs), LENGTH(curs),
                  ucnvNative, status);
               if (U_FAILURE(status))
                  throw StriException(status);
            }
         }

         if (!_shallowrecycle) {
            // deep recycling: each extra slot owns a copy of its source string
            for (R_len_t i=nrstr; i<this->n; ++i) {
               if (this->str[i%nrstr] == NULL)
                  this->str[i] = NULL;
               else
                  this->str[i] = new UnicodeString(*this->str[i%nrstr]);
            }
         }
      }
      catch (...) {
         // the destructor is not run for a partially constructed object,
         // hence everything acquired above has to be released here
         if (ucnvASCII) ucnv_close(ucnvASCII);
         if (ucnvLatin1) ucnv_close(ucnvLatin1);
         if (ucnvNative) ucnv_close(ucnvNative);
         for (R_len_t i=0; i<this->n; ++i)
            delete this->str[i]; // unset slots are NULL; deleting NULL is a no-op
         delete [] this->str;
         this->str = NULL;
         throw;
      }

      if (ucnvASCII) ucnv_close(ucnvASCII);
      if (ucnvLatin1) ucnv_close(ucnvLatin1);
      if (ucnvNative) ucnv_close(ucnvNative);
   }
}