/**
 * Locate all BreakIterator boundaries
 *
 * For each string a 2-column integer matrix is produced, holding one row
 * per (start, end) pair reported by the break iterator. Missing strings get
 * the value of stri__matrix_NA_INTEGER(1, 2); strings for which the iterator
 * reports nothing get stri__matrix_NA_INTEGER with 0 or 1 rows, depending
 * on `omit_no_match`.
 *
 * @param str character vector
 * @param omit_no_match logical
 * @param opts_brkiter named list
 * @return list
 *
 * @version 0.2-2 (Marek Gagolewski, 2014-04-22)
 *
 * @version 0.2-2 (Marek Gagolewski, 2014-04-23)
 *          removed "title": For Unicode 4.0 and above title boundary
 *          iteration, please use Word Boundary iterator.
 *
 * @version 0.2-2 (Marek Gagolewski, 2014-04-25)
 *          use stri__split_or_locate_boundaries
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-10-29)
 *          use opts_brkiter
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-11-28)
 *          new args: omit_no_match
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-02)
 *          use StriRuleBasedBreakIterator
 */
SEXP stri_locate_all_boundaries(SEXP str, SEXP omit_no_match, SEXP opts_brkiter)
{
   bool drop_unmatched = stri__prepare_arg_logical_1_notNA(omit_no_match, "omit_no_match");
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   StriBrkIterOptions brkiter_opts(opts_brkiter, "line_break");

   STRI__ERROR_HANDLER_BEGIN(1)
   R_len_t n = LENGTH(str);
   StriContainerUTF8_indexable str_cont(str, n);
   StriRuleBasedBreakIterator brkiter(brkiter_opts);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(VECSXP, n));

   for (R_len_t i = 0; i < n; ++i)
   {
      if (str_cont.isNA(i)) {
         SET_VECTOR_ELT(ret, i, stri__matrix_NA_INTEGER(1, 2));
         continue;
      }

      brkiter.setupMatcher(str_cont.get(i).c_str(), str_cont.get(i).length());
      brkiter.first();

      // the number of rows is unknown up front, so gather all pairs first
      deque< pair<R_len_t,R_len_t> > chunks;
      pair<R_len_t,R_len_t> chunk;
      while (brkiter.next(chunk))
         chunks.push_back(chunk);

      R_len_t nchunks = (R_len_t)chunks.size();
      if (nchunks <= 0) {
         SET_VECTOR_ELT(ret, i, stri__matrix_NA_INTEGER(drop_unmatched ? 0 : 1, 2));
         continue;
      }

      SEXP ans;
      STRI__PROTECT(ans = Rf_allocMatrix(INTSXP, nchunks, 2));
      int* starts = INTEGER(ans);
      int* ends   = starts + nchunks; // second column of the matrix

      R_len_t row = 0;
      for (deque< pair<R_len_t,R_len_t> >::const_iterator it = chunks.begin();
            it != chunks.end(); ++it)
      {
         starts[row] = it->first;
         ends[row]   = it->second;
         ++row;
      }

      // Adjust UTF-8 byte index -> UChar32 (code point) index
      str_cont.UTF8_to_UChar32_index(i, starts, ends, nchunks,
            1, // 0-based index -> 1-based
            0  // end returns position of next character after match
      );

      SET_VECTOR_ELT(ret, i, ans);
      STRI__UNPROTECT(1);
   }

   stri__locate_set_dimnames_list(ret);
   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END({ /* nothing special t.b.d. on error */ })
}
/**
 * Convert character vector to UTF-32
 *
 * Every non-missing string is decoded code point by code point. Missing
 * strings, and strings with an improper UTF-8 byte sequence (a warning is
 * issued for those), give NULL in the resulting list.
 *
 * @param str character vector
 * @return list with integer vectors
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-26)
 *          use vector<UChar32> buf instead of R_alloc;
 *          warn and set NULL on improper UTF-8 byte sequences
 *
 * @version 0.2-3 (Marek Gagolewski, 2014-05-12)
 *          Use UChar32* instead of vector<UChar32> as ::data is C++11
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *          Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_enc_toutf32(SEXP str)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   R_len_t n = LENGTH(str);

   STRI__ERROR_HANDLER_BEGIN(1)
   StriContainerUTF8 str_cont(str, n);

   // One shared work buffer, sized for the longest input (in bytes);
   // start from 1 to avoid allocating an empty buffer.
   R_len_t bufsize = 1;
   for (R_len_t i = 0; i < n; ++i) {
      if (!str_cont.isNA(i)) {
         R_len_t nbytes = str_cont.get(i).length();
         if (bufsize < nbytes)
            bufsize = nbytes;
      }
   }

   // a string of bufsize bytes decodes to at most bufsize code points
   UChar32* buf = (UChar32*)R_alloc((size_t)bufsize, (int)sizeof(UChar32));
   if (!buf)
      throw StriException(MSG__MEM_ALLOC_ERROR);
   // deque<UChar32> was slower than using a common, over-sized buf

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(VECSXP, n));

   for (R_len_t i = 0; i < n; ++i) {
      if (str_cont.isNA(i)) {
         SET_VECTOR_ELT(ret, i, R_NilValue);
         continue;
      }

      const char* bytes  = str_cont.get(i).c_str();
      R_len_t     nbytes = str_cont.get(i).length();

      UChar32 c = (UChar32)0;
      R_len_t pos = 0;    // read position in bytes
      R_len_t ncp = 0;    // number of code points stored so far
      while (pos < nbytes && c >= 0) {
         U8_NEXT(bytes, pos, nbytes, c);  // c < 0 signals a malformed sequence
         buf[ncp++] = (int)c;
      }

      if (c < 0) {
         Rf_warning(MSG__INVALID_UTF8);
         SET_VECTOR_ELT(ret, i, R_NilValue);
         continue;
      }

      SEXP conv;
      STRI__PROTECT(conv = Rf_allocVector(INTSXP, ncp));
      memcpy(INTEGER(conv), buf, (size_t)sizeof(int)*ncp);
      SET_VECTOR_ELT(ret, i, conv);
      STRI__UNPROTECT(1);
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END({ /* do nothing on error */ })
}
/**
 * Count the number of BreakIterator boundaries
 *
 * The count for a string is the number of times the break iterator's
 * next() succeeds after first(); missing strings give NA.
 *
 * @param str character vector
 * @param opts_brkiter identifier
 * @return integer vector
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-10-30)
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *          Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-02)
 *          use StriRuleBasedBreakIterator
 */
SEXP stri_count_boundaries(SEXP str, SEXP opts_brkiter)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   StriBrkIterOptions brkiter_opts(opts_brkiter, "line_break");

   STRI__ERROR_HANDLER_BEGIN(1)
   R_len_t n = LENGTH(str);
   StriContainerUTF8_indexable str_cont(str, n);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(INTSXP, n));

   StriRuleBasedBreakIterator brkiter(brkiter_opts);

   for (R_len_t i = 0; i < n; ++i)
   {
      if (str_cont.isNA(i)) {
         INTEGER(ret)[i] = NA_INTEGER;
         continue;
      }

      brkiter.setupMatcher(str_cont.get(i).c_str(), str_cont.get(i).length());
      brkiter.first();

      R_len_t total = 0;
      for (; brkiter.next(); ++total)
         ;

      INTEGER(ret)[i] = total;
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END({ /* no action */ })
}
/**
 * Reverse Each String
 *
 * Code points are read from the end of each string towards its beginning
 * and appended to a scratch buffer in that order. Missing strings stay
 * missing; an invalid UTF-8 sequence raises an error.
 *
 * @param str character vector
 * @return character vector with every string reversed
 *
 * @version 0.1-?? (Bartek Tartanus)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          use StriContainerUTF16
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly + StriContainerUTF8 (bug fix, do reversing manually)
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-01)
 *          detect incorrect utf8 byte stream
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *          Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_reverse(SEXP str)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));

   STRI__ERROR_HANDLER_BEGIN(1)
   R_len_t n = LENGTH(str);
   StriContainerUTF8 str_cont(str, n);

   // STEP 1. The scratch buffer must hold the longest input (in bytes)
   R_len_t bufsize = 0;
   for (R_len_t i = 0; i < n; ++i) {
      if (!str_cont.isNA(i)) {
         R_len_t nbytes = str_cont.get(i).length();
         if (bufsize < nbytes)
            bufsize = nbytes;
      }
   }

   // STEP 2. Allocate the buffer and the result vector
   String8buf buf(bufsize);
   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, n));

   for (R_len_t i = str_cont.vectorize_init();
         i != str_cont.vectorize_end();
         i = str_cont.vectorize_next(i))
   {
      if (str_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      R_len_t     nbytes = str_cont.get(i).length();
      const char* src    = str_cont.get(i).c_str();

      R_len_t rd = nbytes;  // read offset, moves towards 0
      R_len_t wr = 0;       // write offset in buf, moves forward
      UChar32 chr;
      UBool failed = FALSE;

      while (!failed && rd > 0) {
         U8_PREV(src, 0, rd, chr); // go backwards
         if (chr < 0)
            throw StriException(MSG__INVALID_UTF8);
         U8_APPEND((uint8_t*)buf.data(), wr, nbytes, chr, failed);
      }

      if (failed)
         throw StriException(MSG__INTERNAL_ERROR);

      SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), nbytes, CE_UTF8));
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/**
 * Get all available ICU charsets and their aliases (elems 2,3,...)
 *
 * A converter whose canonical name is unavailable gets an NA name and its
 * list slot is left untouched; a converter whose aliases cannot be counted
 * gets a single NA string.
 *
 * @return R list object; element name == ICU charset canonical name;
 * elements are character vectors (aliases)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.2-1 (Marek Gagolewski)
 *          use StriUcnv; make StriException-friendly
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *          Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_enc_list()
{
   R_len_t c = (R_len_t)ucnv_countAvailable();

   STRI__ERROR_HANDLER_BEGIN(0)
   SEXP ret;
   SEXP names;
   STRI__PROTECT(ret = Rf_allocVector(VECSXP, c));
   STRI__PROTECT(names = Rf_allocVector(STRSXP, c));

   for (R_len_t i = 0; i < c; ++i) {
      const char* canonical = ucnv_getAvailableName(i);
      if (canonical == NULL) {
         SET_STRING_ELT(names, i, NA_STRING);
         continue;
      }
      SET_STRING_ELT(names, i, Rf_mkChar(canonical));

      UErrorCode status = U_ZERO_ERROR;
      R_len_t nalias = (R_len_t)ucnv_countAliases(canonical, &status);

      if (!U_FAILURE(status) && nalias > 0) {
         SEXP aliases;
         STRI__PROTECT(aliases = Rf_allocVector(STRSXP, nalias));
         for (R_len_t j = 0; j < nalias; ++j) {
            status = U_ZERO_ERROR;
            const char* alias = ucnv_getAlias(canonical, j, &status);
            bool usable = (!U_FAILURE(status) && alias);
            SET_STRING_ELT(aliases, j, usable ? Rf_mkChar(alias) : NA_STRING);
         }
         SET_VECTOR_ELT(ret, i, aliases);
         STRI__UNPROTECT(1);
      }
      else {
         SET_VECTOR_ELT(ret, i, Rf_ScalarString(NA_STRING));
      }
   }

   Rf_setAttrib(ret, R_NamesSymbol, names);
   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END({/* no special action on error */})
}
/**
 * Subset a character vector: keep strings in which a character class occurs
 *
 * A string is selected when any of its code points belongs to the
 * corresponding pattern set (or when none does, if `negate` is TRUE).
 * Pairs with a missing string or pattern are dropped if `omit_na` is TRUE
 * and marked NA otherwise.
 *
 * @param str character vector
 * @param pattern character vector
 * @param omit_na single logical value
 * @param negate single logical value
 * @return value of stri__subset_by_logical for the computed selection
 *
 * @version 0.3-1 (Bartek Tartanus, 2014-07-25)
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-10-17)
 *          using std::vector<int> to avoid mem-leaks
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *          Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-04)
 *          FR #122: omit_na arg added
 *
 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
 *          FR #216: `negate` arg added
 */
SEXP stri_subset_charclass(SEXP str, SEXP pattern, SEXP omit_na, SEXP negate)
{
   bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate");
   bool omit_na1 = stri__prepare_arg_logical_1_notNA(omit_na, "omit_na");
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));
   R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

   STRI__ERROR_HANDLER_BEGIN(2)
   StriContainerUTF8 str_cont(str, vectorize_length);
   StriContainerCharClass pattern_cont(pattern, vectorize_length);

   // A random-access vector is needed here (not a deque filled in order):
   // the pattern is reused, so i does not go like 0,1,2...n
   // but 0,pat_len,2*pat_len,1,pat_len+1 and so on.
   std::vector<int> keep(vectorize_length);
   int nkept = 0;

   for (R_len_t i = pattern_cont.vectorize_init();
         i != pattern_cont.vectorize_end();
         i = pattern_cont.vectorize_next(i))
   {
      if (str_cont.isNA(i) || pattern_cont.isNA(i)) {
         if (!omit_na1) {
            keep[i] = NA_LOGICAL;
            ++nkept;
         }
         else {
            keep[i] = FALSE;
         }
         continue;
      }

      const UnicodeSet& charset = pattern_cont.get(i);
      R_len_t     nbytes = str_cont.get(i).length();
      const char* bytes  = str_cont.get(i).c_str();

      bool found = false;
      UChar32 chr = 0;
      R_len_t pos = 0;
      while (pos < nbytes) {
         U8_NEXT(bytes, pos, nbytes, chr);
         if (chr < 0) // invalid utf-8 sequence
            throw StriException(MSG__INVALID_UTF8);
         if (charset.contains(chr)) {
            found = true;
            break;
         }
      }

      bool selected = (found != negate_1);
      keep[i] = selected ? TRUE : FALSE;
      if (selected)
         ++nkept;
   }

   SEXP ret;
   STRI__PROTECT(ret = stri__subset_by_logical(str_cont, keep, nkept));
   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/**
 * Substitutes vector elements if a pattern occurs in a string
 *
 * Selected elements of `str` are replaced by consecutive elements of
 * `value` (reused cyclically); the others are copied unchanged. An element
 * is NA in the result when the string or the pattern is missing.
 *
 * @param str character vector
 * @param pattern character vector
 * @param negate single logical value
 * @param value character vector
 * @return character vector
 *
 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
 *          FR#124
 *
 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
 *          FR #216: `negate` arg added
 */
SEXP stri_subset_charclass_replacement(SEXP str, SEXP pattern, SEXP negate, SEXP value)
{
   bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate");
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   PROTECT(pattern = stri_prepare_arg_string_1(pattern, "pattern"));
   PROTECT(value = stri_prepare_arg_string(value, "value"));

   int vectorize_length = LENGTH(str);
   int value_length = LENGTH(value);
   if (value_length == 0)
      Rf_error(MSG__REPLACEMENT_ZERO);

   STRI__ERROR_HANDLER_BEGIN(3)
   StriContainerUTF8 str_cont(str, vectorize_length);
   StriContainerUTF8 value_cont(value, value_length);
   StriContainerCharClass pattern_cont(pattern, vectorize_length);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length));

   R_len_t nreplaced = 0; // how many elements of `value` were consumed so far
   for (R_len_t i = str_cont.vectorize_init();
         i != str_cont.vectorize_end();
         i = str_cont.vectorize_next(i))
   {
      if (str_cont.isNA(i) || pattern_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      const UnicodeSet& charset = pattern_cont.get(i);
      R_len_t     nbytes = str_cont.get(i).length();
      const char* bytes  = str_cont.get(i).c_str();

      bool found = false;
      UChar32 chr = 0;
      R_len_t pos = 0;
      while (pos < nbytes) {
         U8_NEXT(bytes, pos, nbytes, chr);
         if (chr < 0) // invalid utf-8 sequence
            throw StriException(MSG__INVALID_UTF8);
         if (charset.contains(chr)) {
            found = true;
            break;
         }
      }

      if (found != negate_1) {
         SET_STRING_ELT(ret, i, value_cont.toR(nreplaced % value_length));
         ++nreplaced;
      }
      else {
         SET_STRING_ELT(ret, i, str_cont.toR(i));
      }
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/**
 * Detect if a pattern occurs in a string
 *
 * Once `max_count` TRUE results have been produced (a countdown that is
 * only active for positive `max_count`), or when `max_count` is 0, the
 * remaining elements are set to NA without running the matcher.
 *
 * @param str R character vector
 * @param pattern R character vector containing regular expressions
 * @param negate single bool
 * @param max_count single int
 * @param opts_regex list
 * @return logical vector
 *
 * @version 0.1-?? (Marcin Bujarski)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          use StriContainerUTF16
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          use StriContainerUTF16's vectorization
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-18)
 *          use StriContainerRegexPattern + opts_regex
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-05)
 *          Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 1.0-2 (Marek Gagolewski, 2016-01-29)
 *          Issue #214: allow a regex pattern like `.*` to match an empty string
 *
 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
 *          FR #216: `negate` arg added
 *
 * @version 1.3.1 (Marek Gagolewski, 2019-02-08)
 *          #232: `max_count` arg added
 */
SEXP stri_detect_regex(SEXP str, SEXP pattern, SEXP negate, SEXP max_count, SEXP opts_regex)
{
   bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate");
   int max_count_1 = stri__prepare_arg_integer_1_notNA(max_count, "max_count");
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));
   R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

   uint32_t pattern_flags = StriContainerRegexPattern::getRegexFlags(opts_regex);

   STRI__ERROR_HANDLER_BEGIN(2)
   // UTF-16 input is used on purpose; a utext_openUTF8-based variant was
   // benchmarked as slower
   // (mbmark-regex-detect1.R: UTF16 0.07171792 s; UText 0.10531605 s).
   StriContainerUTF16 str_cont(str, vectorize_length);
   StriContainerRegexPattern pattern_cont(pattern, vectorize_length, pattern_flags);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length));
   int* ret_tab = LOGICAL(ret);

   for (R_len_t i = pattern_cont.vectorize_init();
         i != pattern_cont.vectorize_end();
         i = pattern_cont.vectorize_next(i))
   {
      if (max_count_1 == 0) {
         // quota exhausted (or zero from the start)
         ret_tab[i] = NA_LOGICAL;
         continue;
      }

      STRI__CONTINUE_ON_EMPTY_OR_NA_PATTERN(str_cont, pattern_cont,
         ret_tab[i] = NA_LOGICAL)

      RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically
      matcher->reset(str_cont.get(i));

      int detected = (int)matcher->find(); // returns UBool
      ret_tab[i] = negate_1 ? !detected : detected;

      if (ret_tab[i] && max_count_1 > 0)
         --max_count_1;
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/**
 * Convert from UTF-32
 *
 * Each integer vector is encoded as one UTF-8 string. A vector containing
 * a code point that cannot be appended, or a 0, yields NA together with
 * a warning.
 *
 * @param vec integer vector or list with integer vectors
 * @return character vector
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-25)
 *          StriException friendly;
 *          use StriContainerListInt
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *          Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_enc_fromutf32(SEXP vec)
{
   PROTECT(vec = stri_prepare_arg_list_integer(vec, "vec"));

   STRI__ERROR_HANDLER_BEGIN(1)
   StriContainerListInt vec_cont(vec);
   R_len_t vec_n = vec_cont.get_n();

   // get required buf size: the longest vector decides
   R_len_t bufsize = 0;
   for (R_len_t i = 0; i < vec_n; ++i) {
      if (vec_cont.isNA(i))
         continue;
      if (vec_cont.get(i).size() > bufsize)
         bufsize = vec_cont.get(i).size();
   }

   bufsize = U8_MAX_LENGTH*bufsize+1; // this will surely be sufficient
   String8buf buf(bufsize);
   char* out = buf.data();

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, vec_n));

   for (R_len_t i = 0; i < vec_n; ++i) {
      if (vec_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      const int* codes  = vec_cont.get(i).data();
      R_len_t    ncodes = vec_cont.get(i).size();

      UChar32 c = (UChar32)0;
      R_len_t nout = 0;   // bytes written to the buffer
      R_len_t rd = 0;     // code points consumed
      UBool err = FALSE;

      while (rd < ncodes && !err) {
         c = codes[rd++];
         U8_APPEND((uint8_t*)out, nout, bufsize, c, err);

         // Rf_mkCharLenCE detects embedded nuls, but stops execution completely
         if (c == 0)
            err = TRUE;
      }

      if (!err) {
         SET_STRING_ELT(ret, i, Rf_mkCharLenCE(out, nout, CE_UTF8));
      }
      else {
         Rf_warning(MSG__INVALID_CODE_POINT, (int)c);
         SET_STRING_ELT(ret, i, NA_STRING);
      }
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/**
 * Compare elements in 2 character vectors, with collation
 *
 * For each pair of strings the result is TRUE when the collator's
 * comparison outcome equals type[0]; the result is then negated if
 * type[1] is 1. A pair with a missing string gives NA.
 *
 * @param e1 character vector
 * @param e2 character vector
 * @param opts_collator passed to stri__ucol_open()
 * @param type [internal] vector of length 2,
 * type[0]: 0 for ==, -1 for < and 1 for >,
 * type[1]: 0 or 1 (whether to negate the results)
 *
 * @return logical vector
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-19)
 *
 * @version 0.2-3 (Marek Gagolewski, 2014-05-07)
 *          opts_collator == NA no longer allowed
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *          Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_cmp_logical(SEXP e1, SEXP e2, SEXP opts_collator, SEXP type)
{
   // we'll perform a collator-based cmp
   // type is an internal arg, check manually, error() allowed here
   if (!Rf_isInteger(type) || LENGTH(type) != 2)
      Rf_error(MSG__INCORRECT_INTERNAL_ARG);
   int _type = INTEGER(type)[0];
   int _negate = INTEGER(type)[1];
   if (_type > 1 || _type < -1 || _negate < 0 || _negate > 1)
      Rf_error(MSG__INCORRECT_INTERNAL_ARG);

   PROTECT(e1 = stri_prepare_arg_string(e1, "e1")); // prepare string argument
   PROTECT(e2 = stri_prepare_arg_string(e2, "e2")); // prepare string argument

   // call stri__ucol_open after prepare_arg:
   // if prepare_arg had failed, we would have a mem leak
   UCollator* col = NULL;
   col = stri__ucol_open(opts_collator);

   STRI__ERROR_HANDLER_BEGIN(2)

   R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(e1), LENGTH(e2));

   StriContainerUTF8 e1_cont(e1, vectorize_length);
   StriContainerUTF8 e2_cont(e2, vectorize_length);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length));
   int* ret_tab = LOGICAL(ret);

   for (R_len_t i = 0; i < vectorize_length; ++i)
   {
      if (e1_cont.isNA(i) || e2_cont.isNA(i)) {
         ret_tab[i] = NA_LOGICAL;
         continue;
      }

      R_len_t     cur1_n = e1_cont.get(i).length();
      const char* cur1_s = e1_cont.get(i).c_str();
      R_len_t     cur2_n = e2_cont.get(i).length();
      const char* cur2_s = e2_cont.get(i).c_str();

      // with collation
      UErrorCode status = U_ZERO_ERROR;
      ret_tab[i] = (_type == (int)ucol_strcollUTF8(col,
            cur1_s, cur1_n, cur2_s, cur2_n, &status));
      STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

      if (_negate)
         ret_tab[i] = !ret_tab[i];
   }

   // the collator was opened by this function, so it is released here
   // on success and in the error handler below on failure
   if (col) {
      ucol_close(col);
      col = NULL;
   }

   STRI__UNPROTECT_ALL
   return ret;

   STRI__ERROR_HANDLER_END(
      if (col) ucol_close(col);
   )
}
/**
 * Detect if a character class occurs in a string
 *
 * An element is TRUE when any code point of the string belongs to the
 * corresponding pattern set (inverted when `negate` is TRUE). Elements are
 * NA for missing inputs and, when `max_count` is 0 or a positive
 * `max_count` has been used up by TRUE results, for all remaining elements.
 *
 * @param str character vector
 * @param pattern character vector
 * @param negate single bool
 * @param max_count single int
 * @return logical vector
 *
 * @version 0.1-?? (Bartek Tartanus)
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-02)
 *          Use StrContainerUTF8 and CharClass classes
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-15)
 *          Use StrContainerCharClass
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-03)
 *          detects invalid UTF-8 byte stream
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-05)
 *          StriContainerCharClass now relies on UnicodeSet
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *          Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
 *          FR #216: `negate` arg added
 *
 * @version 1.3.1 (Marek Gagolewski, 2019-02-08)
 *          #232: `max_count` arg added
 */
SEXP stri_detect_charclass(SEXP str, SEXP pattern, SEXP negate, SEXP max_count)
{
   bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate");
   int max_count_1 = stri__prepare_arg_integer_1_notNA(max_count, "max_count");
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));
   R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

   STRI__ERROR_HANDLER_BEGIN(2)
   StriContainerUTF8 str_cont(str, vectorize_length);
   StriContainerCharClass pattern_cont(pattern, vectorize_length);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length));
   int* ret_tab = LOGICAL(ret);

   for (R_len_t i = pattern_cont.vectorize_init();
         i != pattern_cont.vectorize_end();
         i = pattern_cont.vectorize_next(i))
   {
      if (max_count_1 == 0 || str_cont.isNA(i) || pattern_cont.isNA(i)) {
         ret_tab[i] = NA_LOGICAL;
         continue;
      }

      const UnicodeSet& charset = pattern_cont.get(i);
      R_len_t     nbytes = str_cont.get(i).length();
      const char* bytes  = str_cont.get(i).c_str();

      bool found = false;
      UChar32 chr = 0;
      R_len_t pos = 0;
      while (pos < nbytes) {
         U8_NEXT(bytes, pos, nbytes, chr);
         if (chr < 0) // invalid UTF-8 sequence
            throw StriException(MSG__INVALID_UTF8);
         if (charset.contains(chr)) {
            found = true;
            break;
         }
      }

      bool outcome = (found != negate_1);
      ret_tab[i] = outcome ? TRUE : FALSE;

      if (outcome && max_count_1 > 0)
         --max_count_1;
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/**
 * Compare elements in 2 character vectors, without collation
 *
 * Two strings are deemed equal when their UTF-8 representations have the
 * same number of bytes and identical contents; the outcome is negated if
 * `type` is 1. A pair with a missing string gives NA.
 *
 * @param e1 character vector
 * @param e2 character vector
 * @param type [internal] integer; 0 or 1 (whether to negate the results)
 *
 * @return logical vector
 *
 * @version 0.2-3 (Marek Gagolewski, 2014-05-07)
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *          Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_cmp_codepoints(SEXP e1, SEXP e2, SEXP type)
{
   // type is an internal arg, check manually, error() allowed here
   if (!Rf_isInteger(type) || LENGTH(type) != 1)
      Rf_error(MSG__INCORRECT_INTERNAL_ARG);
   int _negate = INTEGER(type)[0];
   if (_negate < 0 || _negate > 1)
      Rf_error(MSG__INCORRECT_INTERNAL_ARG);

   PROTECT(e1 = stri_prepare_arg_string(e1, "e1"));
   PROTECT(e2 = stri_prepare_arg_string(e2, "e2"));

   STRI__ERROR_HANDLER_BEGIN(2)

   R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(e1), LENGTH(e2));

   StriContainerUTF8 e1_cont(e1, vectorize_length);
   StriContainerUTF8 e2_cont(e2, vectorize_length);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length));
   int* ret_tab = LOGICAL(ret);

   for (R_len_t i = 0; i < vectorize_length; ++i)
   {
      if (e1_cont.isNA(i) || e2_cont.isNA(i)) {
         ret_tab[i] = NA_LOGICAL;
         continue;
      }

      R_len_t     n1 = e1_cont.get(i).length();
      const char* s1 = e1_cont.get(i).c_str();
      R_len_t     n2 = e2_cont.get(i).length();
      const char* s2 = e2_cont.get(i).c_str();

      // different number of bytes => not equal (memcmp is then skipped)
      bool same = (n1 == n2) && (memcmp(s1, s2, n1) == 0);

      ret_tab[i] = _negate ? !same : same;
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END({/* no-op on err */})
}
/**
 * Locate first or last boundaries
 *
 * Rows stay NA for missing or empty strings and for strings in which the
 * break iterator reports nothing.
 *
 * @param str character vector
 * @param opts_brkiter list
 * @param first looking for first or last match?
 * @return integer matrix (2 columns)
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-05)
 */
SEXP stri__locate_firstlast_boundaries(SEXP str, SEXP opts_brkiter, bool first)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   StriBrkIterOptions brkiter_opts(opts_brkiter, "line_break");

   STRI__ERROR_HANDLER_BEGIN(1)
   R_len_t n = LENGTH(str);
   StriContainerUTF8_indexable str_cont(str, n);
   StriRuleBasedBreakIterator brkiter(brkiter_opts);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocMatrix(INTSXP, n, 2));
   stri__locate_set_dimnames_matrix(ret);
   int* ret_tab = INTEGER(ret);

   for (R_len_t i = 0; i < n; ++i)
   {
      int* start = ret_tab + i;       // row i, first column
      int* end   = ret_tab + i + n;   // row i, second column
      *start = NA_INTEGER;
      *end   = NA_INTEGER;

      if (str_cont.isNA(i) || str_cont.get(i).length() == 0)
         continue;

      brkiter.setupMatcher(str_cont.get(i).c_str(), str_cont.get(i).length());
      pair<R_len_t,R_len_t> bounds;

      bool located;
      if (first) {
         brkiter.first();
         located = brkiter.next(bounds);
      }
      else {
         brkiter.last();
         located = brkiter.previous(bounds);
      }
      if (!located)
         continue;

      *start = bounds.first;
      *end   = bounds.second;

      // Adjust UTF8 byte index -> UChar32 index
      str_cont.UTF8_to_UChar32_index(i, start, end, 1,
            1, // 0-based index -> 1-based
            0  // end returns position of next character after match
      );
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END( ;/* do nothing special on error */ )
}
/**
 * Determine which strings are of length 0
 *
 * Note that ICU permits only strings of length < 2^31.
 * The length stored with each R string is inspected directly; the
 * string contents are not read.
 *
 * @param str R character vector
 * @return logical vector
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-01)
 *          StriException-friendly
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *          Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_isempty(SEXP str)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   R_len_t n = LENGTH(str);

   STRI__ERROR_HANDLER_BEGIN(1)
   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(LGLSXP, n));
   int* out = LOGICAL(ret);

   for (R_len_t i = 0; i < n; ++i) {
      SEXP elem = STRING_ELT(str, i);
      /* INPUT ENCODING CHECK: this function does not need this. */
      if (elem == NA_STRING)
         out[i] = NA_LOGICAL;
      else
         out[i] = (LENGTH(elem) <= 0);
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END({ /* no special action on error */ })
}
/**
 * Get number of bytes in each string
 *
 * Note that ICU permits only strings of length < 2^31.
 * The length stored with each R string is reported as-is; missing strings
 * give NA.
 *
 * @param str R object coercible to a character vector
 * @return integer vector
 *
 * @version 0.1-?? (Marcin Bujarski)
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-01)
 *          StriException-friendly
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *          Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_numbytes(SEXP str)
{
   PROTECT(str = stri_prepare_arg_string(str, "str")); // prepare string argument
   R_len_t str_n = LENGTH(str);

   STRI__ERROR_HANDLER_BEGIN(1)
   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(INTSXP, str_n));
   // ret is an INTSXP, so its data must be accessed with INTEGER();
   // LOGICAL() is only valid for LGLSXP vectors
   int* retint = INTEGER(ret);

   for (R_len_t i = 0; i < str_n; ++i) {
      SEXP curs = STRING_ELT(str, i);
      /* INPUT ENCODING CHECK: this function does not need this. */
      retint[i] = (curs == NA_STRING) ? NA_INTEGER : LENGTH(curs); // O(1) - stored by R
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END({ /* no special action on error */ })
}
/**
 * Count pattern occurrences in a string [with collation]
 *
 * The collator opened here is shared by all matchers and closed by this
 * function, both on the regular path and in the error handler.
 *
 * @param str character vector
 * @param pattern character vector
 * @param opts_collator passed to stri__ucol_open()
 * @return integer vector
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          corrected behavior on empty str/pattern
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-23)
 *          make StriException-friendly,
 *          use StriContainerUStringSearch
 *
 * @version 0.2-3 (Marek Gagolewski, 2014-05-08)
 *          new fun: stri_count_coll (opts_collator == NA not allowed)
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *          Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_count_coll(SEXP str, SEXP pattern, SEXP opts_collator)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));

   // call stri__ucol_open after prepare_arg:
   // if prepare_arg had failed, we would have a mem leak
   UCollator* collator = NULL;
   collator = stri__ucol_open(opts_collator);

   STRI__ERROR_HANDLER_BEGIN(2)
   R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));
   StriContainerUTF16 str_cont(str, vectorize_length);
   // collator is not owned by pattern_cont
   StriContainerUStringSearch pattern_cont(pattern, vectorize_length, collator);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(INTSXP, vectorize_length));
   int* ret_tab = INTEGER(ret);

   for (R_len_t i = pattern_cont.vectorize_init();
         i != pattern_cont.vectorize_end();
         i = pattern_cont.vectorize_next(i))
   {
      STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont,
         ret_tab[i] = NA_INTEGER,
         ret_tab[i] = 0)

      UStringSearch *matcher = pattern_cont.getMatcher(i, str_cont.get(i));
      usearch_reset(matcher);

      UErrorCode status = U_ZERO_ERROR;
      R_len_t nmatches = 0;
      for (;;) {
         if (U_FAILURE(status))
            break;
         if ((int)usearch_next(matcher, &status) == USEARCH_DONE)
            break;
         ++nmatches;
      }
      STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

      ret_tab[i] = nmatches;
   }

   if (collator) {
      ucol_close(collator);
      collator = NULL;
   }
   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(
      if (collator) ucol_close(collator);
   )
}
/**
 * Get Declared Encodings of Each String
 *
 * The marks are tested in this order: ASCII, UTF-8, bytes, latin1;
 * a string matching none of them is reported as "native".
 * Missing strings give NA.
 *
 * @param str a character vector or an object coercible to
 * @return a character vector
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-25)
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *          Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_enc_mark(SEXP str)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));

   STRI__ERROR_HANDLER_BEGIN(1)
   R_len_t n = LENGTH(str);

   // All labels are created up front, even though
   // some of them will not be used in this call
   SEXP mark_ascii, mark_latin1, mark_utf8, mark_native, mark_bytes;
   STRI__PROTECT(mark_ascii  = Rf_mkChar("ASCII"));
   STRI__PROTECT(mark_latin1 = Rf_mkChar("latin1"));
   STRI__PROTECT(mark_utf8   = Rf_mkChar("UTF-8"));
   STRI__PROTECT(mark_native = Rf_mkChar("native"));
   STRI__PROTECT(mark_bytes  = Rf_mkChar("bytes"));

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, n));

   for (R_len_t i = 0; i < n; ++i) {
      SEXP elem = STRING_ELT(str, i);

      SEXP label;
      if (elem == NA_STRING)
         label = NA_STRING;
      else if (IS_ASCII(elem))
         label = mark_ascii;
      else if (IS_UTF8(elem))
         label = mark_utf8;
      else if (IS_BYTES(elem))
         label = mark_bytes;
      else if (IS_LATIN1(elem))
         label = mark_latin1;
      else
         label = mark_native;

      SET_STRING_ELT(ret, i, label);
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/**
 * List available time zone IDs
 *
 * A missing `region` or `offset` is passed to ICU as a NULL filter.
 * The offset is given in hours and converted to milliseconds before
 * being handed to ICU.
 *
 * ICU failures are reported with STRI__CHECKICUSTATUS_THROW, as elsewhere
 * in this file, so that the error handler at the end of the function
 * gets a chance to delete the enumeration.
 *
 * @param region single string
 * @param offset single numeric
 * @return character vector
 *
 * @version 0.5-1 (Marek Gagolewski, 2014-12-24)
 */
SEXP stri_timezone_list(SEXP region, SEXP offset)
{
   StringEnumeration* tz_enum = NULL;
   PROTECT(region = stri_prepare_arg_string_1(region, "region"));
   PROTECT(offset = stri_prepare_arg_double_1(offset, "offset"));

   STRI__ERROR_HANDLER_BEGIN(2)
   StriContainerUTF8 region_cont(region, 1);

   UErrorCode status = U_ZERO_ERROR;

   int32_t offset_ms = 0;
   const int32_t* o = NULL;
   const char* r = NULL;

   if (!ISNA(REAL(offset)[0])) {
      // hours -> milliseconds;
      // 0.5 and 0.75 are represented exactly within the double type
      offset_ms = (int32_t)(REAL(offset)[0]*1000.0*3600.0);
      o = &offset_ms;
   }

   if (!region_cont.isNA(0))
      r = region_cont.get(0).c_str();

   tz_enum = TimeZone::createTimeZoneIDEnumeration(UCAL_ZONE_TYPE_ANY, r, o, status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

   status = U_ZERO_ERROR;
   tz_enum->reset(status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

   status = U_ZERO_ERROR;
   R_len_t n = (R_len_t)tz_enum->count(status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, n));

   // MG: I reckon that IDs are more readable than DisplayNames
   // (which are moreover localized), hence no names attribute is set
   for (R_len_t i = 0; i < n; ++i) {
      int len;
      status = U_ZERO_ERROR;
      const char* cur = tz_enum->next(&len, status);
      STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
      SET_STRING_ELT(ret, i, Rf_mkCharLenCE(cur, len, CE_UTF8));
   }

   if (tz_enum) {
      delete tz_enum;
      tz_enum = NULL;
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(
      if (tz_enum) { delete tz_enum; tz_enum = NULL; }
   )
}
/**
 * Convert character vector between marked encodings and the encoding provided
 *
 * Every string is first brought to UTF-16 (by StriContainerUTF16) and then
 * converted to the target encoding with a single, shared output buffer.
 * Missing values yield NA_STRING, or NULL list elements when raw vectors
 * are requested.
 *
 * @param str     input character vector or list of raw vectors
 * @param to    target encoding, \code{NULL} or \code{""} for default enc
 * @param to_raw single logical, should list of raw vectors be returned?
 * @return a converted character vector or list of raw vectors
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-11-12)
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-28)
 *          use StriUcnv
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-01)
 *          calc required buf size a priori
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_encode_from_marked(SEXP str, SEXP to, SEXP to_raw)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   const char* selected_to = stri__prepare_arg_enc(to, "to", true); /* this is R_alloc'ed */
   bool to_raw_logical = stri__prepare_arg_logical_1_notNA(to_raw, "to_raw");

   STRI__ERROR_HANDLER_BEGIN(1)
   R_len_t str_n = LENGTH(str);

   // Nothing to convert: the result is an empty vector of the right type.
   // The protection stack must be balanced before we leave.
   if (str_n <= 0) {
      STRI__UNPROTECT_ALL
      return Rf_allocVector(to_raw_logical?VECSXP:STRSXP, 0);
   }

   StriContainerUTF16 str_cont(str, str_n);

   // Open converters
   StriUcnv ucnv(selected_to);
   UConverter* uconv_to = ucnv.getConverter(true /*register_callbacks*/);

   // Get target encoding mark
   cetype_t encmark_to = to_raw_logical?CE_BYTES:ucnv.getCE();

   // Prepare out val
   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(to_raw_logical?VECSXP:STRSXP, str_n));

   // calculate required buf size: it is determined by the longest input
   R_len_t bufsize = 0;
   for (R_len_t i=0; i<str_n; ++i) {
      if (!str_cont.isNA(i) && str_cont.get(i).length() > bufsize)
         bufsize = str_cont.get(i).length();
   }
   bufsize = UCNV_GET_MAX_BYTES_FOR_STRING(bufsize, ucnv_getMaxCharSize(uconv_to));
   // "The calculated size is guaranteed to be sufficient for this conversion."
   String8buf buf(bufsize);

   for (R_len_t i=0; i<str_n; ++i) {
      if (str_cont.isNA(i)) {
         if (to_raw_logical) SET_VECTOR_ELT(ret, i, R_NilValue);
         else                SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      R_len_t curn_tmp = str_cont.get(i).length();
      const UChar* curs_tmp = str_cont.get(i).getBuffer(); // The buffer content is (probably) not NUL-terminated.
      if (!curs_tmp)
         throw StriException(MSG__INTERNAL_ERROR);

      UErrorCode status = U_ZERO_ERROR;
      ucnv_resetFromUnicode(uconv_to);
      R_len_t bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(),
            curs_tmp, curn_tmp, &status);
      if (bufneed <= buf.size()) {
         STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
      }
      else { // larger buffer needed [this shouldn't happen?]
         buf.resize(bufneed, false/*destroy contents*/);
         status = U_ZERO_ERROR;
         bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(),
               curs_tmp, curn_tmp, &status);
         STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
      }

      if (to_raw_logical) {
         SEXP outobj;
         STRI__PROTECT(outobj = Rf_allocVector(RAWSXP, bufneed));
         memcpy(RAW(outobj), buf.data(), (size_t)bufneed);
         SET_VECTOR_ELT(ret, i, outobj);
         STRI__UNPROTECT(1);
      }
      else {
         SET_STRING_ELT(ret, i,
            Rf_mkCharLenCE(buf.data(), bufneed, encmark_to));
      }
   }

   STRI__UNPROTECT_ALL
   return ret;

   STRI__ERROR_HANDLER_END({/* nothing special on error */})
}
/**
 * Convert character vector to ASCII
 *
 * Missing values and strings already marked as ASCII are passed through
 * untouched. In strings marked as UTF-8, every code point above
 * ASCII_MAXCHARCODE and every ill-formed byte sequence (the latter with
 * a warning) becomes a single ASCII_SUBSTITUTE character. In all other
 * strings each byte that is not a single-byte UTF-8 unit becomes
 * ASCII_SUBSTITUTE.
 *
 * @param str character vector
 * @return character vector
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-30)
 *          use single common buf;
 *          warn on invalid utf8 byte stream
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_enc_toascii(SEXP str)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   R_len_t n = LENGTH(str);

   STRI__ERROR_HANDLER_BEGIN(1)

   // The output never has more bytes than the input, hence the longest
   // non-missing input string determines the size of the shared buffer.
   R_len_t longest = 0;
   for (R_len_t i = 0; i < n; ++i) {
      SEXP elem = STRING_ELT(str, i);
      if (elem != NA_STRING && LENGTH(elem) > longest)
         longest = LENGTH(elem);
   }
   String8buf buf(longest);
   char* out = buf.data();

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, n));

   for (R_len_t i = 0; i < n; ++i) {
      SEXP elem = STRING_ELT(str, i);
      if (elem == NA_STRING || IS_ASCII(elem)) {
         SET_STRING_ELT(ret, i, elem);   // nothing to convert
         continue;
      }

      const R_len_t in_n = LENGTH(elem);
      const char* in_s = CHAR(elem);
      R_len_t out_n = 0;

      if (IS_UTF8(elem)) {
         // decode code point by code point
         R_len_t in_pos = 0;
         while (in_pos < in_n) {
            UChar32 cp;
            U8_NEXT(in_s, in_pos, in_n, cp);
            if (cp < 0) {
               Rf_warning(MSG__INVALID_CODE_POINT_FIXING);
               out[out_n++] = ASCII_SUBSTITUTE;
            }
            else if (cp > ASCII_MAXCHARCODE) {
               out[out_n++] = ASCII_SUBSTITUTE;
            }
            else {
               out[out_n++] = (char)cp;
            }
         }
      }
      else {
         // some 8-bit encoding: proceed byte by byte
         for (R_len_t in_pos = 0; in_pos < in_n; ++in_pos) {
            out[out_n++] = U8_IS_SINGLE(in_s[in_pos])
               ? in_s[in_pos]
               : (char)ASCII_SUBSTITUTE;
         }
      }

      // the string will be marked as ASCII anyway by mkCharLenCE
      SET_STRING_ELT(ret, i, Rf_mkCharLenCE(out, out_n, CE_UTF8));
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/**
 * Convert character vector to UTF-8
 *
 * The function works in two stages. First, the strings are brought to
 * UTF-8: either by StriContainerUTF8, or (if is_unknown_8bit is TRUE)
 * by replacing each non-single-byte unit in strings that are neither
 * ASCII nor UTF-8 with U+FFFD; a leading BOM is dropped from ASCII/UTF-8
 * strings. Second, unless validate is FALSE, each resulting string is
 * scanned for ill-formed byte sequences.
 *
 * @param str character vector
 * @param is_unknown_8bit single logical value;
 * if TRUE, then strings not marked as ASCII or UTF-8 get UTF-8
 * REPLACEMENT CHARACTERs (U+FFFD) in place of codes > 127
 * @param validate single logical value (or NA): FALSE skips validation,
 * NA turns invalid strings into missing values (with a warning),
 * TRUE replaces invalid sequences with U+FFFD (with a warning)
 *
 * @return character vector
 *
 * @version 0.1-XX (Marek Gagolewski)
 *
 * @version 0.1-XX (Marek Gagolewski, 2013-06-16)
 *                  make StriException-friendly
 *
 * @version 0.2-1  (Marek Gagolewski, 2014-03-26)
 *                 Use one String8buf;
 *                 is_unknown_8bit_logical and UTF-8 tries now to remove BOMs
 *
 * @version 0.2-1  (Marek Gagolewksi, 2014-03-30)
 *                 added validate arg
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_enc_toutf8(SEXP str, SEXP is_unknown_8bit, SEXP validate)
{
   PROTECT(validate = stri_prepare_arg_logical_1(validate, "validate"));
   bool treat_8bit_as_unknown =
      stri__prepare_arg_logical_1_notNA(is_unknown_8bit, "is_unknown_8bit");
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   R_len_t n = LENGTH(str);

   STRI__ERROR_HANDLER_BEGIN(2)
   SEXP ret;

   if (treat_8bit_as_unknown) {
      // Only strings that are neither ASCII nor UTF-8 are rewritten;
      // the longest of them determines the size of the shared buffer.
      R_len_t longest = 0;
      for (R_len_t i = 0; i < n; ++i) {
         SEXP elem = STRING_ELT(str, i);
         if (elem == NA_STRING || IS_ASCII(elem) || IS_UTF8(elem))
            continue;
         if (LENGTH(elem) > longest)
            longest = LENGTH(elem);
      }
      String8buf buf(longest*3); // either 1 byte < 127 or U+FFFD == 3 bytes UTF-8
      char* out = buf.data();

      STRI__PROTECT(ret = Rf_allocVector(STRSXP, n));
      for (R_len_t i = 0; i < n; ++i) {
         SEXP elem = STRING_ELT(str, i);
         if (elem == NA_STRING) {
            SET_STRING_ELT(ret, i, NA_STRING);
            continue;
         }

         const R_len_t in_n = LENGTH(elem);
         const char* in_s = CHAR(elem);

         if (IS_ASCII(elem) || IS_UTF8(elem)) {
            const bool has_bom = in_n >= 3
               && (uint8_t)(in_s[0]) == UTF8_BOM_BYTE1
               && (uint8_t)(in_s[1]) == UTF8_BOM_BYTE2
               && (uint8_t)(in_s[2]) == UTF8_BOM_BYTE3;
            if (has_bom) // get rid of it
               SET_STRING_ELT(ret, i, Rf_mkCharLenCE(in_s+3, in_n-3, CE_UTF8));
            else
               SET_STRING_ELT(ret, i, elem);
            continue;
         }

         // otherwise, we have an 8-bit encoding
         R_len_t out_n = 0;
         for (R_len_t in_pos = 0; in_pos < in_n; ++in_pos) {
            if (U8_IS_SINGLE(in_s[in_pos])) {
               out[out_n++] = in_s[in_pos];
            }
            else { // 0xEF 0xBF 0xBD
               out[out_n++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE1;
               out[out_n++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE2;
               out[out_n++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE3;
            }
         }
         SET_STRING_ELT(ret, i, Rf_mkCharLenCE(out, out_n, CE_UTF8));
      }
   }
   else {
      // Trivial - everything we need is in StriContainerUTF8 :)
      // which removes BOMs silently
      StriContainerUTF8 str_cont(str, n);
      STRI__PROTECT(ret = str_cont.toR());
   }

   // validate utf8 byte stream
   const int validate_mode = LOGICAL(validate)[0];
   if (validate_mode != FALSE) { // NA or TRUE
      R_len_t ret_n = LENGTH(ret);
      for (R_len_t i = 0; i < ret_n; ++i) {
         SEXP elem = STRING_ELT(ret, i);
         if (elem == NA_STRING) continue;

         const char* s = CHAR(elem);
         R_len_t sn = LENGTH(elem);

         // scan until the end or until the first ill-formed sequence
         R_len_t pos = 0;
         UChar32 cp = 0;
         while (cp >= 0 && pos < sn) {
            U8_NEXT(s, pos, sn, cp);
         }
         if (cp >= 0) continue; // valid, nothing to do

         if (validate_mode == NA_LOGICAL) {
            Rf_warning(MSG__INVALID_CODE_POINT_REPLNA);
            SET_STRING_ELT(ret, i, NA_STRING);
            continue;
         }

         // validate is TRUE: rebuild the string with U+FFFD substituted
         int fixed_cap = sn*3; // maximum: 1 byte -> U+FFFD (3 bytes)
         String8buf fixed(fixed_cap);
         char* fixed_s = fixed.data();

         pos = 0;
         R_len_t fixed_n = 0;
         UBool err = FALSE;
         while (!err && pos < sn) {
            U8_NEXT(s, pos, sn, cp);
            if (cp >= 0) {
               U8_APPEND((uint8_t*)fixed_s, fixed_n, fixed_cap, cp, err);
            }
            else {
               Rf_warning(MSG__INVALID_CODE_POINT_FIXING);
               fixed_s[fixed_n++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE1;
               fixed_s[fixed_n++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE2;
               fixed_s[fixed_n++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE3;
            }
         }

         if (err) throw StriException(MSG__INTERNAL_ERROR);
         SET_STRING_ELT(ret, i, Rf_mkCharLenCE(fixed_s, fixed_n, CE_UTF8));
      }
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/**
 *  Convert case (TitleCase)
 *
 *  A UCaseMap is opened for the locale of the break iterator built from
 *  opts_brkiter (a "word" iterator by default) and that iterator is handed
 *  over to the case map. Each string is then title-cased into a shared
 *  buffer, which is enlarged and the call repeated once if ICU reports
 *  a failure.
 *
 *  @param str character vector
 *  @param opts_brkiter list
 *  @return character vector
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-03)
 *    separated from stri_trans_casemap;
 *    use StriUBreakIterator
 */
SEXP stri_trans_totitle(SEXP str, SEXP opts_brkiter) {
   StriBrkIterOptions opts_brkiter2(opts_brkiter, "word");
   PROTECT(str = stri_prepare_arg_string(str, "str"));

   // version 0.2-1 - Does not work with ICU 4.8 (but we require ICU >= 50)
   UCaseMap* casemap = NULL;

   STRI__ERROR_HANDLER_BEGIN(1)
   StriUBreakIterator brkiter(opts_brkiter2);

   UErrorCode status = U_ZERO_ERROR;
   casemap = ucasemap_open(brkiter.getLocale(), U_FOLD_CASE_DEFAULT, &status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

   status = U_ZERO_ERROR;
   ucasemap_setBreakIterator(casemap, brkiter.getIterator(), &status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
   // From now on the iterator is owned by the case map and will be
   // released on ucasemap_close (checked with ICU man & src code),
   // so the wrapper must let go of it without destroying it.
   brkiter.free(false);

   R_len_t str_n = LENGTH(str);
   StriContainerUTF8 str_cont(str, str_n);
   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, str_n));

   // The result may take more or fewer bytes than the input; start with
   // the longest input plus a small margin and grow on demand.
   String8buf buf(str_cont.getMaxNumBytes() + 10);

   for (R_len_t i = str_cont.vectorize_init();
         i != str_cont.vectorize_end();
         i = str_cont.vectorize_next(i))
   {
      if (str_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      const R_len_t src_n = str_cont.get(i).length();
      const char* src_s = str_cont.get(i).c_str();

      status = U_ZERO_ERROR;
      int out_n = ucasemap_utf8ToTitle(casemap,
         buf.data(), buf.size(), src_s, src_n, &status);

      if (U_FAILURE(status)) {
         // second (and last) attempt, with the capacity ICU asked for
         buf.resize(out_n, false/*destroy contents*/);
         status = U_ZERO_ERROR;
         out_n = ucasemap_utf8ToTitle(casemap,
            buf.data(), buf.size(), src_s, src_n, &status);

         // this shouldn't happen
         // we do have the buffer size required to complete this op
         STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
      }

      SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), out_n, CE_UTF8));
   }

   if (casemap) {
      ucasemap_close(casemap);
      casemap = NULL;
   }
   STRI__UNPROTECT_ALL
   return ret;

   STRI__ERROR_HANDLER_END({
      if (casemap) { ucasemap_close(casemap); casemap = NULL; }
   })
}
/**
 * internal function - replace multiple substrings in a single string
 *
 * The chunks given by (from, to) or (from, length) are replaced, in order,
 * with the (recycled) elements of value. The chunks must be sorted and must
 * not overlap; otherwise an error is generated.
 *
 * can raise Rf_error
 *
 * @param curs a CHARSXP in UTF-8
 * @param from integer vector (possibly with negative indices)
 * @param to integer vector (possibly with negative indices) or NULL
 * @param length integer vector or NULL
 * @param omit_na_1 if true, chunks with a missing index or a missing
 *    replacement are skipped; if false, any of them makes the result NA
 * @param value character vector of replacements
 * @return a CHARSXP; curs itself if there is nothing to replace,
 *    NA_STRING as described above or when value is empty
 *
 * @version 1.3.2 (Marek Gagolewski, 2019-02-23)
 *
 * @version 1.4.3 (Marek Gagolewski, 2019-03-12)
 *    #346: na_omit for `value`
 */
SEXP stri__sub_replacement_all_single(SEXP curs,
   SEXP from, SEXP to, SEXP length, bool omit_na_1, SEXP value)
{
   // curs is a CHARSXP in UTF-8

   PROTECT(value = stri_enc_toutf8(value, Rf_ScalarLogical(FALSE), Rf_ScalarLogical(FALSE)));
   R_len_t value_len = LENGTH(value);

   R_len_t from_len   = 0;  // set by stri__sub_prepare_from_to_length()
   R_len_t to_len     = 0;  // as above
   R_len_t length_len = 0;  // as above
   int* from_tab      = 0;  // as above
   int* to_tab        = 0;  // as above
   int* length_tab    = 0;  // as above

   R_len_t sub_protected =  1+ /* how many objects to PROTECT on ret? */
      stri__sub_prepare_from_to_length(from, to, length,
         from_len, to_len, length_len, from_tab, to_tab, length_tab);

   R_len_t vectorize_len = stri__recycling_rule(true, 2, // does not care about value_len
      from_len, (to_len>length_len)?to_len:length_len);

   if (vectorize_len <= 0) {
      // "nothing" is being replaced -> return the input as-is
      UNPROTECT(sub_protected);
      return curs;
   }

   if (value_len <= 0) {
      // things are supposed to be replaced with "nothing"...
      UNPROTECT(sub_protected);
      Rf_warning(MSG__REPLACEMENT_ZERO);
      return NA_STRING;
   }

   if (vectorize_len % value_len != 0)
      Rf_warning(MSG__WARN_RECYCLING_RULE2);

   const char* curs_s = CHAR(curs); // already in UTF-8
   R_len_t curs_n = LENGTH(curs);

   // first check for NAs: unless they are to be omitted,
   // a single missing index or replacement yields a missing result
   if (!omit_na_1) {
      for (R_len_t i=0; i<vectorize_len; ++i) {
         R_len_t cur_from = from_tab[i % from_len];
         R_len_t cur_to   = (to_tab)?to_tab[i % to_len]:length_tab[i % length_len];
         if (cur_from == NA_INTEGER || cur_to == NA_INTEGER) {
            UNPROTECT(sub_protected);
            return NA_STRING;
         }
      }

      for (R_len_t i=0; i<vectorize_len; ++i) {
         if (STRING_ELT(value, i%value_len) == NA_STRING) {
            UNPROTECT(sub_protected);
            return NA_STRING;
         }
      }
   }

   // get the number of code points in curs (needed for negative indexes
   // and for clamping)
   R_len_t curs_m = -1;
   if (IS_ASCII(curs)) curs_m = curs_n;
   else { // is UTF-8
      curs_m = 0;     // code points count
      R_len_t j = 0;  // byte pos
      while (j < curs_n) {
         U8_FWD_1_UNSAFE(curs_s, j);
         ++curs_m;
      }
   }

   STRI__ERROR_HANDLER_BEGIN(sub_protected)
   std::vector<char> buf; // convenience >> speed

   R_len_t last_pos = 0;                  // code point index already consumed
   R_len_t byte_pos = 0, byte_pos_last;   // the corresponding byte indexes
   for (R_len_t i=0; i<vectorize_len; ++i) {
      R_len_t cur_from = from_tab[i % from_len];
      R_len_t cur_to   = (to_tab)?to_tab[i % to_len]:length_tab[i % length_len];

      // only reachable with missing values if omit_na_1 is set
      if (cur_from == NA_INTEGER || cur_to == NA_INTEGER || STRING_ELT(value, i%value_len) == NA_STRING) {
         continue;
      }

      if (cur_from < 0) cur_from = curs_m+cur_from+1;
      if (cur_from <= 0) cur_from = 1;
      cur_from--; // 1-based -> 0-based index
      if (cur_from >= curs_m) cur_from = curs_m; // cur_from is in [0, curs_m]

      if (length_tab) {
         if (cur_to < 0) cur_to = 0;
         // cur_from+cur_to may not fit in an int (e.g., for
         // length == INT_MAX); compare against the room left instead
         if (cur_to > curs_m-cur_from) cur_to = curs_m;
         else cur_to = cur_from+cur_to;
      }
      else {
         if (cur_to < 0) cur_to = curs_m+cur_to+1;
         if (cur_to < cur_from) cur_to = cur_from; // insertion
      }
      if (cur_to >= curs_m) cur_to = curs_m;

      // the chunk to replace is at code points [cur_from, cur_to)
      if (last_pos > cur_from)
         throw StriException(MSG__OVERLAPPING_OR_UNSORTED_INDEXES);

      // first, copy [last_pos, cur_from)
      byte_pos_last = byte_pos;
      while (last_pos < cur_from) {
         U8_FWD_1_UNSAFE(curs_s, byte_pos);
         ++last_pos;
      }
      buf.insert(buf.end(), curs_s+byte_pos_last, curs_s+byte_pos);

      // then, copy the corresponding replacement string
      SEXP value_cur = STRING_ELT(value, i%value_len);
      const char* value_s = CHAR(value_cur);
      R_len_t value_n = LENGTH(value_cur);
      buf.insert(buf.end(), value_s, value_s+value_n);

      // lastly, skip the replaced chunk, i.e., move last_pos to cur_to
      while (last_pos < cur_to) {
         U8_FWD_1_UNSAFE(curs_s, byte_pos);
         ++last_pos;
      }
   }

   // finally, copy [last_pos, curs_m)
   buf.insert(buf.end(), curs_s+byte_pos, curs_s+curs_n);

   SEXP ret;
   // an empty vector may have no storage at all
   STRI__PROTECT(ret = Rf_mkCharLenCE(buf.empty() ? "" : buf.data(),
      (int)buf.size(), CE_UTF8));
   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/**
 * Substring replacement function
 *
 * For each (recycled) tuple of arguments, the chunk of str determined by
 * from and either to or length is replaced with value. An empty chunk
 * results in an insertion.
 *
 * @param str character vector
 * @param from integer vector (possibly with negative indices)
 * @param to integer vector (possibly with negative indices) or NULL
 * @param length integer vector or NULL
 * @param omit_na logical scalar; if TRUE, a missing index or replacement
 *    leaves the corresponding string unchanged instead of giving NA
 * @param value character vector replacement
 * @return character vector
 *
 * @version 0.1-?? (Bartek Tartanus)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          use StriContainerUTF8 and stri__UChar32_to_UTF8_index
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-01)
 *          use StriContainerUTF8's UChar32-to-UTF8 index
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-20)
 *          Use StriContainerUTF8_indexable
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-03)
 *          Use stri__sub_prepare_from_to_length()
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 0.5-9003 (Marek Gagolewski, 2015-08-05)
 *    Bugfix #183: floating point exception when to or length is an empty vector
 *
 * @version 1.0-2 (Marek Gagolewski, 2016-01-31)
 *    FR #199: new arg: `omit_na`
 *    FR #207: allow insertions
 *
 * @version 1.4.3 (Marek Gagolewski, 2019-03-12)
 *    #346: na_omit for `value`
 */
SEXP stri_sub_replacement(SEXP str, SEXP from, SEXP to, SEXP length, SEXP omit_na, SEXP value)
{
   PROTECT(str   = stri_prepare_arg_string(str, "str"));
   PROTECT(value = stri_prepare_arg_string(value, "value"));
   bool omit_na_1 = stri__prepare_arg_logical_1_notNA(omit_na, "omit_na");

   R_len_t value_len  = LENGTH(value);
   R_len_t str_len    = LENGTH(str);
   R_len_t from_len   = 0;  // set by stri__sub_prepare_from_to_length()
   R_len_t to_len     = 0;  // as above
   R_len_t length_len = 0;  // as above
   int* from_tab      = 0;  // as above
   int* to_tab        = 0;  // as above
   int* length_tab    = 0;  // as above

   R_len_t sub_protected =  2+ /* how many objects to PROTECT on ret? */
      stri__sub_prepare_from_to_length(from, to, length,
         from_len, to_len, length_len, from_tab, to_tab, length_tab);

   R_len_t vectorize_len = stri__recycling_rule(true, 4,
      str_len, value_len, from_len, (to_len>length_len)?to_len:length_len);

   if (vectorize_len <= 0) {
      UNPROTECT(sub_protected);
      return Rf_allocVector(STRSXP, 0);
   }

   STRI__ERROR_HANDLER_BEGIN(sub_protected)
   StriContainerUTF8_indexable str_cont(str, vectorize_len);
   StriContainerUTF8 value_cont(value, vectorize_len);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_len));

   String8buf buf(0); // @TODO: estimate bufsize a priori

   for (R_len_t i = str_cont.vectorize_init();
         i != str_cont.vectorize_end();
         i = str_cont.vectorize_next(i))
   {
      R_len_t cur_from     = from_tab[i % from_len];
      R_len_t cur_to       = (to_tab)?to_tab[i % to_len]:length_tab[i % length_len];

      if (str_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      if (cur_from == NA_INTEGER || cur_to == NA_INTEGER || value_cont.isNA(i)) {
         if (omit_na_1) {
            SET_STRING_ELT(ret, i, str_cont.toR(i));
         }
         else {
            SET_STRING_ELT(ret, i, NA_STRING);
         }
         continue;
      }

      // convert (from, length) to (from, to)
      if (length_tab) {
         if (cur_to <= 0) {
            cur_to = 0; // empty chunk, i.e., an insertion
         }
         else if (cur_from > 0 && cur_to - 1 > R_LEN_T_MAX - cur_from) {
            // from+length-1 does not fit in an int (signed overflow is
            // undefined behavior); saturate at the largest valid index
            cur_to = R_LEN_T_MAX;
         }
         else {
            cur_to = cur_from + (cur_to - 1);
            // a chunk that starts at a negative (from-the-end) index
            // must not run past the last code point
            if (cur_from < 0 && cur_to >= 0) cur_to = -1;
         }
      }

      const char* str_cur_s   = str_cont.get(i).c_str();
      R_len_t str_cur_n       = str_cont.get(i).length();
      const char* value_cur_s = value_cont.get(i).c_str();
      R_len_t value_cur_n     = value_cont.get(i).length();

      R_len_t cur_from2; // UTF-8 byte indices
      R_len_t cur_to2;   // UTF-8 byte indices

      stri__sub_get_indices(str_cont, i, cur_from, cur_to, cur_from2, cur_to2);
      if (cur_to2 < cur_from2) cur_to2 = cur_from2; // insertion

      // result == prefix + replacement + suffix
      R_len_t buflen = str_cur_n-(cur_to2-cur_from2)+value_cur_n;
      buf.resize(buflen, false/*destroy contents*/);
      memcpy(buf.data(), str_cur_s, (size_t)cur_from2);
      memcpy(buf.data()+cur_from2, value_cur_s, (size_t)value_cur_n);
      memcpy(buf.data()+cur_from2+value_cur_n, str_cur_s+cur_to2, (size_t)str_cur_n-cur_to2);
      SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), buflen, CE_UTF8));
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/**
 * Get substring
 *
 * For each (recycled) tuple of arguments, extracts the chunk of str
 * determined by from and either to or length. A missing string or index
 * gives a missing value; an empty chunk gives an empty string.
 *
 * @param str character vector
 * @param from integer vector (possibly with negative indices)
 * @param to integer vector (possibly with negative indices) or NULL
 * @param length integer vector or NULL
 * @return character vector
 *
 * @version 0.1-?? (Bartek Tartanus)
 *          stri_sub
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          use StriContainerUTF8 and stri__UChar32_to_UTF8_index
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-01)
 *          use StriContainerUTF8's UChar32-to-UTF8 index
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-20)
 *          Use StriContainerUTF8_indexable
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-03)
 *          Use stri__sub_prepare_from_to_length()
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 0.5-9003 (Marek Gagolewski, 2015-08-05)
 *    Bugfix #183: floating point exception when to or length is an empty vector
 */
SEXP stri_sub(SEXP str, SEXP from, SEXP to, SEXP length)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));

   R_len_t str_len       = LENGTH(str);
   R_len_t from_len      = 0;  // set by stri__sub_prepare_from_to_length()
   R_len_t to_len        = 0;  // as above
   R_len_t length_len    = 0;  // as above
   int* from_tab         = 0;  // as above
   int* to_tab           = 0;  // as above
   int* length_tab       = 0;  // as above

   R_len_t sub_protected =  1+ /* how many objects to PROTECT on ret? */
      stri__sub_prepare_from_to_length(from, to, length,
         from_len, to_len, length_len, from_tab, to_tab, length_tab);

   R_len_t vectorize_len = stri__recycling_rule(true, 3,
      str_len, from_len, (to_len>length_len)?to_len:length_len);

   if (vectorize_len <= 0) {
      UNPROTECT(sub_protected);
      return Rf_allocVector(STRSXP, 0);
   }

   STRI__ERROR_HANDLER_BEGIN(sub_protected)
   StriContainerUTF8_indexable str_cont(str, vectorize_len);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_len));

   for (R_len_t i = str_cont.vectorize_init();
         i != str_cont.vectorize_end();
         i = str_cont.vectorize_next(i))
   {
      R_len_t cur_from     = from_tab[i % from_len];
      R_len_t cur_to       = (to_tab)?to_tab[i % to_len]:length_tab[i % length_len];

      if (str_cont.isNA(i) || cur_from == NA_INTEGER || cur_to == NA_INTEGER) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      // convert (from, length) to (from, to)
      if (length_tab) {
         if (cur_to <= 0) {
            SET_STRING_ELT(ret, i, R_BlankString);
            continue;
         }

         if (cur_from > 0 && cur_to - 1 > R_LEN_T_MAX - cur_from) {
            // from+length-1 does not fit in an int (signed overflow is
            // undefined behavior); saturate at the largest valid index
            cur_to = R_LEN_T_MAX;
         }
         else {
            cur_to = cur_from + (cur_to - 1);
            // a chunk that starts at a negative (from-the-end) index
            // must not run past the last code point
            if (cur_from < 0 && cur_to >= 0) cur_to = -1;
         }
      }

      const char* str_cur_s = str_cont.get(i).c_str();

      R_len_t cur_from2; // UTF-8 byte indices
      R_len_t cur_to2;   // UTF-8 byte indices

      stri__sub_get_indices(str_cont, i, cur_from, cur_to, cur_from2, cur_to2);

      if (cur_to2 > cur_from2) { // just copy
         SET_STRING_ELT(ret, i, Rf_mkCharLenCE(str_cur_s+cur_from2, cur_to2-cur_from2, CE_UTF8));
      }
      else {
         // empty chunk (maybe a warning here?); use the cached empty
         // string rather than passing a NULL buffer to Rf_mkCharLen
         SET_STRING_ELT(ret, i, R_BlankString);
      }
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/**
 * Trim a string from the left AND/OR the right side
 *
 * Code points are dropped from the requested side(s) until the first
 * code point that belongs to the given character class is encountered.
 * An ill-formed UTF-8 byte sequence met while scanning raises an error.
 *
 * @param str character vector
 * @param pattern character vector (character classes)
 * @param left trim from the left?
 * @param right trim from the right?
 * @return character vector
 *
 * @version 0.1-?? (Bartek Tartanus)
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-04)
 *          Use StriContainerUTF8 and CharClass
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly & Use StrContainerCharClass
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-03)
 *          detects invalid UTF-8 byte stream
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-05)
 *          StriContainerCharClass now relies on UnicodeSet
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri__trim_leftright(SEXP str, SEXP pattern, bool left, bool right)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));
   R_len_t vectorize_length =
      stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

   STRI__ERROR_HANDLER_BEGIN(2)
   StriContainerUTF8 str_cont(str, vectorize_length);
   StriContainerCharClass pattern_cont(pattern, vectorize_length);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length));

   for (R_len_t i = pattern_cont.vectorize_init();
         i != pattern_cont.vectorize_end();
         i = pattern_cont.vectorize_next(i))
   {
      if (str_cont.isNA(i) || pattern_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      const UnicodeSet& stop_set = pattern_cont.get(i);
      const R_len_t nbytes = str_cont.get(i).length();
      const char* bytes = str_cont.get(i).c_str();

      // the bytes in [keep_from, keep_to) make up the result
      R_len_t keep_from = 0;
      R_len_t keep_to = nbytes;

      if (left) {
         R_len_t pos = 0;
         while (pos < nbytes) {
            UChar32 chr;
            U8_NEXT(bytes, pos, nbytes, chr); // "look ahead"
            if (chr < 0) // invalid utf-8 sequence
               throw StriException(MSG__INVALID_UTF8);
            if (stop_set.contains(chr))
               break; // break at first occurrence
            keep_from = pos;
         }
      }

      // nothing more to do if the whole string has already been dropped
      if (right && keep_from < nbytes) {
         R_len_t pos = nbytes;
         while (pos > 0) {
            UChar32 chr;
            U8_PREV(bytes, 0, pos, chr); // "look behind"
            if (chr < 0) // invalid utf-8 sequence
               throw StriException(MSG__INVALID_UTF8);
            if (stop_set.contains(chr))
               break; // break at first occurrence
            keep_to = pos;
         }
      }

      SET_STRING_ELT(ret, i,
         Rf_mkCharLenCE(bytes+keep_from, (keep_to-keep_from), CE_UTF8));
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/**
 * Count the number of characters in a string
 *
 * How a string is counted depends on its encoding mark:
 * ASCII and latin1 - the number of bytes; bytes - an error;
 * UTF-8 (marked or native) - the number of code points, or NA with
 * a warning if the byte sequence is ill-formed; native 8-bit - the number
 * of bytes; any other native encoding - the number of characters reported
 * by the ICU converter.
 *
 * Note that ICU permits only strings of length < 2^31.
 * @param str R character vector
 * @return integer vector
 *
 * @version 0.1-?? (Marcin Bujarski)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          Multiple input encoding support
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-27)
 *          using StriUcnv;
 *          warn on invalid utf-8 sequences
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_length(SEXP str)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));

   STRI__ERROR_HANDLER_BEGIN(1)

   R_len_t str_n = LENGTH(str);
   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(INTSXP, str_n));
   int* counts = INTEGER(ret);

   StriUcnv ucnvNative(NULL);

   for (R_len_t k = 0; k < str_n; k++) {
      SEXP elem = STRING_ELT(str, k);
      if (elem == NA_STRING) {
         counts[k] = NA_INTEGER;
         continue;
      }

      const R_len_t nbytes = LENGTH(elem);  // O(1) - stored by R

      if (IS_ASCII(elem) || IS_LATIN1(elem)) {
         counts[k] = nbytes; // one byte per character
         continue;
      }

      if (IS_BYTES(elem))
         throw StriException(MSG__BYTESENC);

      if (IS_UTF8(elem) || ucnvNative.isUTF8()) { // utf8 or native-utf8
         const char* s = CHAR(elem);
         UChar32 cp = 0;
         R_len_t pos = 0;
         R_len_t ncodepoints = 0;
         while (cp >= 0 && pos < nbytes) {
            U8_NEXT(s, pos, nbytes, cp); // faster that U8_FWD_1 & gives bad UChar32s
            ncodepoints++;
         }

         if (cp < 0) { // invalid utf-8 sequence
            Rf_warning(MSG__INVALID_UTF8);
            counts[k] = NA_INTEGER;
         }
         else {
            counts[k] = ncodepoints;
         }
         continue;
      }

      if (ucnvNative.is8bit()) { // native-8bit
         counts[k] = nbytes;
         continue;
      }

      // native encoding which is neither 8-bit, nor UTF-8 (e.g. 'Big5')
      // this is weird, but we'll face it
      UConverter* uconv = ucnvNative.getConverter();
      UErrorCode status = U_ZERO_ERROR;
      const char* source = CHAR(elem);
      const char* sourceLimit = source + nbytes;
      R_len_t nchars = 0;
      while (source != sourceLimit) {
         /*ignore_retval=*/ucnv_getNextUChar(uconv, &source, sourceLimit, &status);
         STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
         nchars++;
      }
      counts[k] = nchars; // all right, we got it!
   }

   STRI__UNPROTECT_ALL
   return ret;

   STRI__ERROR_HANDLER_END({ /* no special action on error */ })
}
/**
 *  Convert case (upper, lowercase)
 *
 *  Each string is mapped into a shared buffer; if ICU reports a failure,
 *  the buffer is resized to the size ICU returned and the mapping is
 *  tried once more. A failure of the second attempt raises an error.
 *
 *  @param str character vector
 *  @param _type internal selector: 1 for lowercase, 2 for uppercase
 *  @param locale single string identifying
 *         the locale ("" or NULL for default locale)
 *  @return character vector
 *
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          use StriContainerUTF16
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-11-19)
 *          use UCaseMap + StriContainerUTF8
 *          **THIS DOES NOT WORK WITH ICU 4.8**, we have to revert the changes
 *          ** BTW, since stringi_0.1-25 we require ICU>=50 **
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-18)
 *          use UCaseMap + StriContainerUTF8
 *          (this is much faster for UTF-8 and slightly faster for 8bit enc)
 *          Estimates minimal buffer size.
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-10-24)
 *          Use a custom BreakIterator with stri_trans_totitle
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-03)
 *    use StriUBreakIterator
 *
 * @version 0.6-1 (Marek Gagolewski, 2015-07-11)
 *    now this is an internal function
 */
SEXP stri_trans_casemap(SEXP str, int _type, SEXP locale)
{
   if (_type < 1 || _type > 2)
      Rf_error(MSG__INCORRECT_INTERNAL_ARG);

   const char* qloc = stri__prepare_arg_locale(locale, "locale", true); /* this is R_alloc'ed */
   PROTECT(str = stri_prepare_arg_string(str, "str"));

   // version 0.2-1 - Does not work with ICU 4.8 (but we require ICU >= 50)
   UCaseMap* casemap = NULL;

   STRI__ERROR_HANDLER_BEGIN(1)

   UErrorCode status = U_ZERO_ERROR;
   casemap = ucasemap_open(qloc, U_FOLD_CASE_DEFAULT, &status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

   R_len_t str_n = LENGTH(str);
   StriContainerUTF8 str_cont(str, str_n);
   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, str_n));

   // The result may take more or fewer bytes than the input; start with
   // the longest input plus a small margin and grow on demand.
   String8buf buf(str_cont.getMaxNumBytes() + 10);

   const bool to_lower = (_type == 1);

   for (R_len_t i = str_cont.vectorize_init();
         i != str_cont.vectorize_end();
         i = str_cont.vectorize_next(i))
   {
      if (str_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      const R_len_t src_n = str_cont.get(i).length();
      const char* src_s = str_cont.get(i).c_str();

      // at most two attempts: the second one uses the capacity
      // reported by the first one
      int out_n = 0;
      for (int attempt = 1; ; ++attempt) {
         status = U_ZERO_ERROR;
         out_n = to_lower
            ? ucasemap_utf8ToLower(casemap, buf.data(), buf.size(),
                  src_s, src_n, &status)
            : ucasemap_utf8ToUpper(casemap, buf.data(), buf.size(),
                  src_s, src_n, &status);

         if (!U_FAILURE(status))
            break;

         if (attempt >= 2) {
            // this shouldn't happen
            // we do have the buffer size required to complete this op
            STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
         }

         buf.resize(out_n, false/*destroy contents*/); /* retry */
      }

      SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), out_n, CE_UTF8));
   }

   if (casemap) {
      ucasemap_close(casemap);
      casemap = NULL;
   }
   STRI__UNPROTECT_ALL
   return ret;

   STRI__ERROR_HANDLER_END({
      if (casemap) { ucasemap_close(casemap); casemap = NULL; }
   })
}
/** Generate random permutations of code points in each string
 *
 * Each non-missing string is decoded from UTF-8 into code points,
 * the code points are permuted with a Fisher-Yates shuffle driven by
 * R's `unif_rand()`, and the result is encoded back to UTF-8.
 * A missing input gives NA; an input with an invalid UTF-8 byte
 * sequence gives NA together with a warning.
 *
 * @param str character vector
 * @return character vector
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-04)
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 1.2.5 (Marek Gagolewski, 2019-07-23)
 *    #319: Fixed overflow in `stri_rand_shuffle()`.
 */
SEXP stri_rand_shuffle(SEXP str)
{
    PROTECT(str = stri_prepare_arg_string(str, "str"));
    R_len_t n = LENGTH(str);

    GetRNGstate();
    STRI__ERROR_HANDLER_BEGIN(1)
    StriContainerUTF8 str_cont(str, n);

    // the byte length of the longest non-NA string bounds both buffers
    R_len_t max_bytes = 0;
    for (R_len_t i=0; i<n; ++i) {
        if (!str_cont.isNA(i)) {
            R_len_t nbytes = str_cont.get(i).length();
            if (max_bytes < nbytes) max_bytes = nbytes;
        }
    }
    std::vector<UChar32> codepoints(max_bytes); // at most max_bytes UChars32 (max_bytes/4 min.)
    String8buf outbuf(max_bytes);

    SEXP ret;
    STRI__PROTECT(ret = Rf_allocVector(STRSXP, n));

    for (R_len_t i=0; i<n; ++i) {
        if (str_cont.isNA(i)) {
            SET_STRING_ELT(ret, i, NA_STRING);
            continue;
        }

        // decode: UTF-8 bytes -> code points
        const char* src = str_cont.get(i).c_str();
        R_len_t src_len = str_cont.get(i).length();
        UChar32 c = (UChar32)0;
        R_len_t ncp = 0; // number of code points stored so far
        R_len_t pos = 0; // byte offset in src
        while (pos < src_len) {
            U8_NEXT(src, pos, src_len, c);
            codepoints[ncp++] = (int)c;
            if (c < 0) break; // U8_NEXT signals an ill-formed sequence
        }

        if (c < 0) {
            Rf_warning(MSG__INVALID_UTF8);
            SET_STRING_ELT(ret, i, NA_STRING);
            continue;
        }

        // shuffle codepoints[0..ncp-1] (Fisher-Yates):
        // position `lo` is exchanged with a random position in lo..ncp-1
        for (R_len_t lo = 0; lo < ncp-1; ++lo) {
            R_len_t pick = (R_len_t)floor(unif_rand()*(double)(ncp-lo)+(double)lo);
            UChar32 held = codepoints[pick];
            codepoints[pick] = codepoints[lo];
            codepoints[lo] = held;
        }

        // encode: code points -> UTF-8 bytes
        char* out = outbuf.data();
        R_len_t out_len = 0;
        UBool err = FALSE;
        for (R_len_t k = 0; k < ncp && !err; ++k) {
            c = codepoints[k];
            U8_APPEND((uint8_t*)out, out_len, max_bytes, c, err);
        }

        if (err) throw StriException(MSG__INTERNAL_ERROR);

        SET_STRING_ELT(ret, i, Rf_mkCharLenCE(out, out_len, CE_UTF8));
    }

    PutRNGstate();
    STRI__UNPROTECT_ALL
    return ret;
    STRI__ERROR_HANDLER_END({
        PutRNGstate();
    })
}
/** Generate random strings
 *
 * Produces `n` strings. The i-th string consists of `length[i]` code
 * points, each drawn independently (via R's `unif_rand()`) from
 * the character class given by `pattern[i]`; `length` and `pattern`
 * are recycled. If the current `length` or `pattern` element is missing,
 * the result is NA. Negative lengths are treated as 0, and so is
 * a negative `n`.
 *
 * An R error is raised if `length` or `pattern` is empty. A recycling
 * warning is issued if either of them is longer than `n` or if `n`
 * is not a multiple of its length. An exception is thrown
 * if the work buffer size (4 bytes per requested code point) does not
 * fit in `R_len_t`.
 *
 * @param n single integer
 * @param length integer vector
 * @param pattern character vector
 * @return character vector
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-04)
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-05)
 *          Use StriContainerCharClass which now contains UnicodeSets;
 *          vectorized also over pattern
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_rand_strings(SEXP n, SEXP length, SEXP pattern)
{
    int n_val = stri__prepare_arg_integer_1_notNA(n, "n");
    PROTECT(length = stri_prepare_arg_integer(length, "length"));
    PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));

    if (n_val < 0) n_val = 0; /* that's not NA for sure now */

    R_len_t length_len = LENGTH(length);
    if (length_len <= 0) {
        UNPROTECT(2);
        Rf_error(MSG__ARG_EXPECTED_NOT_EMPTY, "length");
    }
    else if (length_len > n_val || n_val % length_len != 0)
        Rf_warning(MSG__WARN_RECYCLING_RULE2);

    R_len_t pattern_len = LENGTH(pattern);
    if (pattern_len <= 0) {
        UNPROTECT(2);
        Rf_error(MSG__ARG_EXPECTED_NOT_EMPTY, "pattern");
    }
    else if (pattern_len > n_val || n_val % pattern_len != 0)
        Rf_warning(MSG__WARN_RECYCLING_RULE2);

    GetRNGstate();
    STRI__ERROR_HANDLER_BEGIN(2)

    StriContainerCharClass pattern_cont(pattern, max(n_val, pattern_len));
    StriContainerInteger   length_cont(length, max(n_val, length_len));

    // get max required bufsize
    int* length_tab = INTEGER(length);
    R_len_t bufsize = 0;
    for (R_len_t i=0; i<length_len; ++i) {
        if (length_tab[i] != NA_INTEGER && length_tab[i] > bufsize)
            bufsize = length_tab[i];
    }

    // 1 UChar32 -> max. 4 UTF-8 bytes.
    // Signed overflow is undefined behavior, so refuse requests for which
    // 4*bufsize is not representable instead of allocating a bogus buffer.
    if (bufsize > INT_MAX/4)
        throw StriException(MSG__INTERNAL_ERROR);
    bufsize *= 4;

    String8buf buf(bufsize);
    char* bufdata = buf.data();

    SEXP ret;
    STRI__PROTECT(ret = Rf_allocVector(STRSXP, n_val));

    for (R_len_t i=0; i<n_val; ++i) {
        if (length_cont.isNA(i) || pattern_cont.isNA(i)) {
            SET_STRING_ELT(ret, i, NA_STRING);
            continue;
        }

        int length_cur = length_cont.get(i);
        if (length_cur < 0) length_cur = 0;

        const UnicodeSet* uset = &(pattern_cont.get(i));
        int32_t uset_size = uset->size();

        // generate string:
        R_len_t j = 0; // number of bytes written to bufdata so far
        UBool err = FALSE;
        for (R_len_t k=0; k<length_cur; ++k) {
            int32_t idx = (int32_t)floor(unif_rand()*(double)uset_size); /* 0..uset_size-1 */
            UChar32 c = uset->charAt(idx);
            // a negative code point is treated as an error
            if (c < 0) throw StriException(MSG__INTERNAL_ERROR);

            U8_APPEND((uint8_t*)bufdata, j, bufsize, c, err);
            if (err) throw StriException(MSG__INTERNAL_ERROR);
        }
        SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, j, CE_UTF8));
    }

    PutRNGstate();
    STRI__UNPROTECT_ALL
    return ret;
    STRI__ERROR_HANDLER_END({
        PutRNGstate();
    })
}