/** * Get number of bytes in each string * * Note that ICU permits only strings of length < 2^31. * @param s R object coercible to a character vector * @return integer vector * @version 0.1 (Marcin Bujarski) */ SEXP stri_numbytes(SEXP str) { str = stri_prepare_arg_string(str, "str"); // prepare string argument R_len_t n = LENGTH(str); SEXP ret; PROTECT(ret = Rf_allocVector(INTSXP, n)); int* retint = INTEGER(ret); for (R_len_t i=0; i<n; ++i) { SEXP curs = STRING_ELT(str, i); /* INPUT ENCODING CHECK: this function does not need this. */ if (curs == NA_STRING) retint[i] = NA_INTEGER; else retint[i] = LENGTH(curs); // O(1) - stored by R } UNPROTECT(1); return ret; }
/** * Get number of bytes in each string * * Note that ICU permits only strings of length < 2^31. * * @param s R object coercible to a character vector * @return integer vector * * @version 0.1-?? (Marcin Bujarski) * * @version 0.2-1 (Marek Gagolewski, 2014-04-01) * StriException-friendly * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_numbytes(SEXP str) { PROTECT(str = stri_prepare_arg_string(str, "str")); // prepare string argument R_len_t str_n = LENGTH(str); STRI__ERROR_HANDLER_BEGIN(1) SEXP ret; STRI__PROTECT(ret = Rf_allocVector(INTSXP, str_n)); int* retint = LOGICAL(ret); for (R_len_t i=0; i<str_n; ++i) { SEXP curs = STRING_ELT(str, i); /* INPUT ENCODING CHECK: this function does not need this. */ retint[i] = (curs == NA_STRING)?NA_INTEGER:LENGTH(curs); // O(1) - stored by R } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({ /* no special action on error */ }) }
/** Get Declared Encodings of Each String * * @param str a character vector or an object coercible to * @return a character vector * * @version 0.2-1 (Marek Gagolewski, 2014-03-25) * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_enc_mark(SEXP str) { PROTECT(str = stri_prepare_arg_string(str, "str")); // prepare string argument STRI__ERROR_HANDLER_BEGIN(1) R_len_t str_len = LENGTH(str); // some of them will not be used in this call, but we're lazy SEXP mark_ascii, mark_latin1, mark_utf8, mark_native, mark_bytes; STRI__PROTECT(mark_ascii = Rf_mkChar("ASCII")); STRI__PROTECT(mark_latin1 = Rf_mkChar("latin1")); STRI__PROTECT(mark_utf8 = Rf_mkChar("UTF-8")); STRI__PROTECT(mark_native = Rf_mkChar("native")); STRI__PROTECT(mark_bytes = Rf_mkChar("bytes")); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, str_len)); for (R_len_t i=0; i<str_len; ++i) { SEXP curs = STRING_ELT(str, i); if (curs == NA_STRING) { SET_STRING_ELT(ret, i, NA_STRING); continue; } if (IS_ASCII(curs)) SET_STRING_ELT(ret, i, mark_ascii); else if (IS_UTF8(curs)) SET_STRING_ELT(ret, i, mark_utf8); else if (IS_BYTES(curs)) SET_STRING_ELT(ret, i, mark_bytes); else if (IS_LATIN1(curs)) SET_STRING_ELT(ret, i, mark_latin1); else SET_STRING_ELT(ret, i, mark_native); } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) }
/** * Convert character vector between a marked encodings and the encoding provided * * @param str input character vector or list of raw vectors * @param to target encoding, \code{NULL} or \code{""} for default enc * @param to_raw single logical, should list of raw vectors be returned? * @return a converted character vector or list of raw vectors * * @version 0.1 (Marek Gagolewski, 2013-11-12) */ SEXP stri_encode_from_marked(SEXP str, SEXP to, SEXP to_raw) { str = stri_prepare_arg_string(str, "str"); const char* selected_to = stri__prepare_arg_enc(to, "to", true); bool to_raw_logical = stri__prepare_arg_logical_1_notNA(to_raw, "to_raw"); UConverter* uconv_to = NULL; STRI__ERROR_HANDLER_BEGIN R_len_t str_n = LENGTH(str); StriContainerUTF16 str_cont(str, str_n); // get the number of strings to convert; if == 0, then you know what's the result if (str_n <= 0) return Rf_allocVector(to_raw_logical?VECSXP:STRSXP, 0); // Open converters uconv_to = stri__ucnv_open(selected_to); // Get target encoding mark UErrorCode err = U_ZERO_ERROR; const char* uconv_to_name = ucnv_getName(uconv_to, &err); if (U_FAILURE(err)) throw StriException(err); cetype_t encmark_to = CE_BYTES; // all other cases than the below ones // - bytes enc (this is reasonable, isn't it?) if (!to_raw_logical) { // otherwise not needed if (!strcmp(uconv_to_name, "US-ASCII") || !strcmp(uconv_to_name, "UTF-8")) encmark_to = CE_UTF8; // no CE for ASCII, will be auto-detected by mkCharLenCE else if (!strcmp(uconv_to_name, "ISO-8859-1")) encmark_to = CE_LATIN1; else if (!strcmp(uconv_to_name, ucnv_getDefaultName())) encmark_to = CE_NATIVE; } // Prepare out val SEXP ret; PROTECT(ret = Rf_allocVector(to_raw_logical?VECSXP:STRSXP, str_n)); String8 buf(0); // will be extended in a moment for (R_len_t i=0; i<str_n; ++i) { if (str_cont.isNA(i)) { if (to_raw_logical) SET_VECTOR_ELT(ret, i, R_NilValue); else SET_STRING_ELT(ret, i, NA_STRING); continue; } R_len_t curn_tmp = str_cont.get(i).length(); const UChar* curs_tmp = str_cont.get(i).getBuffer(); // The buffer contents is (probably) not NUL-terminated. if (!curs_tmp) throw StriException(MSG__INTERNAL_ERROR); R_len_t bufneed = UCNV_GET_MAX_BYTES_FOR_STRING(curn_tmp, ucnv_getMaxCharSize(uconv_to)); // "The calculated size is guaranteed to be sufficient for this conversion." buf.resize(bufneed); err = U_ZERO_ERROR; ucnv_resetFromUnicode(uconv_to); bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(), curs_tmp, curn_tmp, &err); if (bufneed <= buf.size()) { if (U_FAILURE(err)) throw StriException(err); } else {// larger buffer needed [this shouldn't happen?] // warning("buf extending"); buf.resize(bufneed); err = U_ZERO_ERROR; bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(), curs_tmp, curn_tmp, &err); if (U_FAILURE(err)) throw StriException(err); if (bufneed > buf.size()) throw StriException(MSG__INTERNAL_ERROR); } if (to_raw_logical) { SEXP outobj = Rf_allocVector(RAWSXP, bufneed); memcpy(RAW(outobj), buf.data(), (size_t)bufneed); SET_VECTOR_ELT(ret, i, outobj); } else { SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), bufneed, encmark_to)); } } if (uconv_to) { ucnv_close(uconv_to); uconv_to = NULL; } UNPROTECT(1); return ret; STRI__ERROR_HANDLER_END({ if (uconv_to) ucnv_close(uconv_to); }) }
/** * Split a string into text lines * * @param str character vector * @param n_max integer vector * @param omit_empty logical vector * * @return list of character vectors * * @version 0.1 (Marek Gagolewski, 2013-08-04) */ SEXP stri_split_lines(SEXP str, SEXP n_max, SEXP omit_empty) { str = stri_prepare_arg_string(str, "str"); n_max = stri_prepare_arg_integer(n_max, "n_max"); omit_empty = stri_prepare_arg_logical(omit_empty, "omit_empty"); R_len_t vectorize_length = stri__recycling_rule(true, 3, LENGTH(str), LENGTH(n_max), LENGTH(omit_empty)); STRI__ERROR_HANDLER_BEGIN StriContainerUTF8 str_cont(str, vectorize_length); StriContainerInteger n_max_cont(n_max, vectorize_length); StriContainerLogical omit_empty_cont(omit_empty, vectorize_length); SEXP ret; PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length)); for (R_len_t i = str_cont.vectorize_init(); i != str_cont.vectorize_end(); i = str_cont.vectorize_next(i)) { if (str_cont.isNA(i)) { SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1)); continue; } const char* str_cur_s = str_cont.get(i).c_str(); R_len_t str_cur_n = str_cont.get(i).length(); int n_max_cur = n_max_cont.get(i); int omit_empty_cur = omit_empty_cont.get(i); if (n_max_cur < 0) n_max_cur = INT_MAX; else if (n_max_cur == 0) { SET_VECTOR_ELT(ret, i, Rf_allocVector(STRSXP, 0)); continue; } //#define STRI_INDEX_NEWLINE_CR 0 //#define STRI_INDEX_NEWLINE_LF 1 //#define STRI_INDEX_NEWLINE_CRLF 2 //#define STRI_INDEX_NEWLINE_NEL 3 //#define STRI_INDEX_NEWLINE_VT 4 //#define STRI_INDEX_NEWLINE_FF 5 //#define STRI_INDEX_NEWLINE_LS 6 //#define STRI_INDEX_NEWLINE_PS 7 //#define STRI_INDEX_NEWLINE_LAST 8 // int counts[STRI_INDEX_NEWLINE_LAST]; // for (R_len_t j=0; j<STRI_INDEX_NEWLINE_LAST; ++j) // counts[j] = 0; UChar32 c; R_len_t jlast, k=1; deque<R_len_t_x2> occurences; occurences.push_back(R_len_t_x2(0, 0)); for (R_len_t j=0; j < str_cur_n && k < n_max_cur; /* null */) { jlast = j; U8_NEXT(str_cur_s, j, str_cur_n, c); switch (c) { case ASCII_CR: /* CR */ // counts[STRI_INDEX_NEWLINE_CR]++; /* check if next is LF */ if (str_cur_s[j] == ASCII_LF) { // look ahead one byte // counts[STRI_INDEX_NEWLINE_LF]++; // counts[STRI_INDEX_NEWLINE_CRLF]++; j++; // just one byte } break; case ASCII_LF: /* LF */ // counts[STRI_INDEX_NEWLINE_LF]++; break; case UCHAR_NEL: /* NEL */ // counts[STRI_INDEX_NEWLINE_NEL]++; break; case ASCII_VT: /* VT */ // counts[STRI_INDEX_NEWLINE_VT]++; break; case ASCII_FF: /* FF */ // counts[STRI_INDEX_NEWLINE_FF]++; break; case UCHAR_LS: /* LS */ // counts[STRI_INDEX_NEWLINE_LS]++; break; case UCHAR_PS: /* PS */ // counts[STRI_INDEX_NEWLINE_PS]++; break; default: /* not a newline character */ occurences.back().v2 = j; continue; } // if here, then at newline if (omit_empty_cur && occurences.back().v2 == occurences.back().v1) occurences.back().v1 = occurences.back().v2 = j; // don't start new field else { occurences.back().v2 = jlast; occurences.push_back(R_len_t_x2(j, j)); ++k; // another field } } if (k == n_max_cur) occurences.back().v2 = str_cur_n; if (omit_empty_cur && occurences.back().v1 == occurences.back().v2) occurences.pop_back(); SEXP ans; PROTECT(ans = Rf_allocVector(STRSXP, (R_len_t)occurences.size())); deque<R_len_t_x2>::iterator iter = occurences.begin(); for (R_len_t l = 0; iter != occurences.end(); ++iter, ++l) { R_len_t_x2 curoccur = *iter; SET_STRING_ELT(ans, l, Rf_mkCharLenCE(str_cur_s+curoccur.v1, curoccur.v2-curoccur.v1, CE_UTF8)); } SET_VECTOR_ELT(ret, i, ans); UNPROTECT(1); } UNPROTECT(1); return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) }
/** Word wrap text * * @param str character vector * @param width single integer * @param cost_exponent single double * @param indent single integer * @param exdent single integer * @param prefix single string * @param initial single string * @param locale locale identifier or NULL for default locale * @param use_length single logical value * * @return list * * @version 0.1-?? (Bartek Tartanus) * * @version 0.2-2 (Marek Gagolewski, 2014-04-27) * single function for wrap_greedy and wrap_dynamic * (dispatch inside); * use BreakIterator * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.4-1 (Marek Gagolewski, 2014-12-06) * new args: indent, exdent, prefix, initial * * @version 0.5-1 (Marek Gagolewski, 2014-12-19) * #133 allow width <= 0 * * @version 0.5-1 (Marek Gagolewski, 2015-02-28) * don't trim so many white spaces at the end of each word (normalize arg does that) * #139: allow a "whitespace" break iterator * * @version 0.5-1 (Marek Gagolewski, 2015-04-23) * `use_length` arg added * * * @version 0.5-1 (Marek Gagolewski, 2015-06-09) * BIGSKIP: no more CHARSXP on out on "" input */ SEXP stri_wrap(SEXP str, SEXP width, SEXP cost_exponent, SEXP indent, SEXP exdent, SEXP prefix, SEXP initial, SEXP whitespace_only, SEXP use_length, SEXP locale) { bool use_length_val = stri__prepare_arg_logical_1_notNA(use_length, "use_length"); double exponent_val = stri__prepare_arg_double_1_notNA(cost_exponent, "cost_exponent"); bool whitespace_only_val = stri__prepare_arg_logical_1_notNA(whitespace_only, "whitespace_only"); int width_val = stri__prepare_arg_integer_1_notNA(width, "width"); if (width_val <= 0) width_val = 0; int indent_val = stri__prepare_arg_integer_1_notNA(indent, "indent"); if (indent_val < 0) Rf_error(MSG__EXPECTED_POSITIVE, "indent"); int exdent_val = stri__prepare_arg_integer_1_notNA(exdent, "exdent"); if (exdent_val < 0) Rf_error(MSG__EXPECTED_POSITIVE, "exdent"); const char* qloc = stri__prepare_arg_locale(locale, "locale", true); /* this is R_alloc'ed */ Locale loc = Locale::createFromName(qloc); PROTECT(str = stri_prepare_arg_string(str, "str")); PROTECT(prefix = stri_prepare_arg_string_1(prefix, "prefix")); PROTECT(initial = stri_prepare_arg_string_1(initial, "initial")); BreakIterator* briter = NULL; UText* str_text = NULL; STRI__ERROR_HANDLER_BEGIN(3) UErrorCode status = U_ZERO_ERROR; briter = BreakIterator::createLineInstance(loc, status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) R_len_t str_length = LENGTH(str); StriContainerUTF8_indexable str_cont(str, str_length); StriContainerUTF8 prefix_cont(prefix, 1); StriContainerUTF8 initial_cont(initial, 1); // prepare indent/exdent/prefix/initial stuff: // 1st line, 1st para (i==0, u==0): initial+indent // nth line, 1st para (i==0, u> 0): prefix +exdent // 1st line, nth para (i> 0, u==0): prefix +indent // nth line, nth para (i> 0, u> 0): prefix +exdent StriWrapLineStart ii(initial_cont.get(0), indent_val); StriWrapLineStart pi(prefix_cont.get(0), indent_val); StriWrapLineStart pe(prefix_cont.get(0), exdent_val); status = U_ZERO_ERROR; //Unicode Newline Guidelines - Unicode Technical Report #13 UnicodeSet uset_linebreaks(UnicodeString::fromUTF8("[\\u000A-\\u000D\\u0085\\u2028\\u2029]"), status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) uset_linebreaks.freeze(); status = U_ZERO_ERROR; UnicodeSet uset_whitespaces(UnicodeString::fromUTF8("\\p{White_space}"), status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) uset_whitespaces.freeze(); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(VECSXP, str_length)); for (R_len_t i = 0; i < str_length; ++i) { if (str_cont.isNA(i) || prefix_cont.isNA(0) || initial_cont.isNA(0)) { SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1)); continue; } status = U_ZERO_ERROR; const char* str_cur_s = str_cont.get(i).c_str(); R_len_t str_cur_n = str_cont.get(i).length(); str_text = utext_openUTF8(str_text, str_cur_s, str_cont.get(i).length(), &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) status = U_ZERO_ERROR; briter->setText(str_text, status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) // all right, first let's generate a list of places at which we may do line breaks deque< R_len_t > occurrences_list; // this could be an R_len_t queue R_len_t match = briter->first(); while (match != BreakIterator::DONE) { if (!whitespace_only_val) occurrences_list.push_back(match); else { if (match > 0 && match < str_cur_n) { UChar32 c; U8_GET((const uint8_t*)str_cur_s, 0, match-1, str_cur_n, c); if (uset_whitespaces.contains(c)) occurrences_list.push_back(match); } else occurrences_list.push_back(match); } match = briter->next(); } R_len_t noccurrences = (R_len_t)occurrences_list.size(); // number of boundaries if (noccurrences <= 1) { // no match (1 boundary == 0) SET_VECTOR_ELT(ret, i, Rf_ScalarString(str_cont.toR(i))); continue; } // the number of "words" is: R_len_t nwords = noccurrences - 1; // convert occurrences_list to a vector // in order to obtain end positions (in a string) of each "words", // noting that occurrences_list.at(0) == 0 #ifndef NDEBUG if (occurrences_list.at(0) != 0) throw StriException("NDEBUG: stri_wrap: (occurrences_list.at(0) != 0)"); #endif std::vector<R_len_t> end_pos_orig(nwords); deque<R_len_t>::iterator iter = ++(occurrences_list.begin()); for (R_len_t j = 0; iter != occurrences_list.end(); ++iter, ++j) { end_pos_orig[j] = (*iter); // this is a UTF-8 index } // now: // we'll get the total widths/number of code points in each "word" std::vector<R_len_t> widths_orig(nwords); // we'll get the total widths/number of code points without trailing whitespaces std::vector<R_len_t> widths_trim(nwords); // we'll get the end positions without trailing whitespaces std::vector<R_len_t> end_pos_trim(nwords); // detect line endings (fail on a match) UChar32 c = 0; R_len_t j = 0; R_len_t cur_block = 0; R_len_t cur_width_orig = 0; R_len_t cur_width_trim = 0; R_len_t cur_count_orig = 0; R_len_t cur_count_trim = 0; R_len_t cur_end_pos_trim = 0; while (j < str_cur_n) { R_len_t jlast = j; U8_NEXT(str_cur_s, j, str_cur_n, c); if (c < 0) // invalid utf-8 sequence throw StriException(MSG__INVALID_UTF8); if (uset_linebreaks.contains(c)) throw StriException(MSG__NEWLINE_FOUND); cur_width_orig += stri__width_char(c); ++cur_count_orig; if (uset_whitespaces.contains(c)) { // OLD: trim all white spaces from the end: // ++cur_count_trim; // [we have the normalize arg for that] // NEW: trim just one white space at the end: cur_width_trim = stri__width_char(c); cur_count_trim = 1; cur_end_pos_trim = jlast; } else { cur_width_trim = 0; cur_count_trim = 0; cur_end_pos_trim = j; } if (j >= str_cur_n || end_pos_orig[cur_block] <= j) { // we'll start a new block in a moment if (use_length_val) { widths_orig[cur_block] = cur_count_orig; widths_trim[cur_block] = cur_count_orig-cur_count_trim; } else { widths_orig[cur_block] = cur_width_orig; widths_trim[cur_block] = cur_width_orig-cur_width_trim; } end_pos_trim[cur_block] = cur_end_pos_trim; cur_block++; cur_width_orig = 0; cur_width_trim = 0; cur_count_orig = 0; cur_count_trim = 0; cur_end_pos_trim = j; } } // do wrap std::deque<R_len_t> wrap_after; // wrap line after which word in {0..nwords-1}? if (exponent_val <= 0.0) { stri__wrap_greedy(wrap_after, nwords, width_val, widths_orig, widths_trim, (use_length_val)?((i==0)?ii.count:pi.count):((i==0)?ii.width:pi.width), (use_length_val)?pe.count:pe.width); } else { stri__wrap_dynamic(wrap_after, nwords, width_val, exponent_val, widths_orig, widths_trim, (use_length_val)?((i==0)?ii.count:pi.count):((i==0)?ii.width:pi.width), (use_length_val)?pe.count:pe.width); } // wrap_after.size() line breaks => wrap_after.size()+1 lines R_len_t nlines = (R_len_t)wrap_after.size()+1; R_len_t last_pos = 0; SEXP ans; STRI__PROTECT(ans = Rf_allocVector(STRSXP, nlines)); deque<R_len_t>::iterator iter_wrap = wrap_after.begin(); for (R_len_t u = 0; iter_wrap != wrap_after.end(); ++iter_wrap, ++u) { R_len_t wrap_after_cur = *iter_wrap; R_len_t cur_pos = end_pos_trim[wrap_after_cur]; std::string cs; if (i == 0 && u == 0) cs = ii.str; else if (i > 0 && u == 0) cs = pi.str; else cs = pe.str; cs.append(str_cur_s+last_pos, cur_pos-last_pos); SET_STRING_ELT(ans, u, Rf_mkCharLenCE(cs.c_str(), cs.size(), CE_UTF8)); last_pos = end_pos_orig[wrap_after_cur]; } // last line goes here: std::string cs; if (i == 0 && nlines-1 == 0) cs = ii.str; else if (i > 0 && nlines-1 == 0) cs = pi.str; else cs = pe.str; cs.append(str_cur_s+last_pos, end_pos_trim[nwords-1]-last_pos); SET_STRING_ELT(ans, nlines-1, Rf_mkCharLenCE(cs.c_str(), cs.size(), CE_UTF8)); SET_VECTOR_ELT(ret, i, ans); STRI__UNPROTECT(1); } if (briter) { delete briter; briter = NULL; } if (str_text) { utext_close(str_text); str_text = NULL; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({ if (briter) { delete briter; briter = NULL; } if (str_text) { utext_close(str_text); str_text = NULL; } }) }
/** Convert character vector to ASCII * * All charcodes > 127 are replaced with subst chars (0x1A) * * @param str character vector * @return character vector * * @version 0.1-?? (Marek Gagolewski) * * @version 0.1-?? (Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 0.2-1 (Marek Gagolewski, 2014-03-30) * use single common buf; * warn on invalid utf8 byte stream * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_enc_toascii(SEXP str) { PROTECT(str = stri_prepare_arg_string(str, "str")); R_len_t n = LENGTH(str); STRI__ERROR_HANDLER_BEGIN(1) // get buf size R_len_t bufsize = 0; for (R_len_t i=0; i<n; ++i) { SEXP curs = STRING_ELT(str, i); if (curs == NA_STRING) continue; R_len_t ni = LENGTH(curs); if (ni > bufsize) bufsize = ni; } String8buf buf(bufsize); // no more bytes than this needed char* bufdata = buf.data(); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, n)); for (R_len_t i=0; i<n; ++i) { SEXP curs = STRING_ELT(str, i); if (curs == NA_STRING || IS_ASCII(curs)) { // nothing to do SET_STRING_ELT(ret, i, curs); continue; } R_len_t curn = LENGTH(curs); const char* curs_tab = CHAR(curs); if (IS_UTF8(curs)) { R_len_t k = 0, j = 0; UChar32 c; while (j<curn) { U8_NEXT(curs_tab, j, curn, c); if (c < 0) { Rf_warning(MSG__INVALID_CODE_POINT_FIXING); bufdata[k++] = ASCII_SUBSTITUTE; } else if (c > ASCII_MAXCHARCODE) bufdata[k++] = ASCII_SUBSTITUTE; else bufdata[k++] = (char)c; } SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, k, CE_UTF8)); // the string will be marked as ASCII anyway by mkCharLenCE } else { // some 8-bit encoding R_len_t k = 0; for (R_len_t j=0; j<curn; ++j) { if (U8_IS_SINGLE(curs_tab[j])) bufdata[k++] = curs_tab[j]; else { bufdata[k++] = (char)ASCII_SUBSTITUTE; // subst char in ascii } } SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, k, CE_UTF8)); // the string will be marked as ASCII anyway by mkCharLenCE } } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) }
/** * Extract multiple substrings * * * @param str character vector * @param from list * @param to list * @param length list * @return list of character vectors * * @version 1.3.2 (Marek Gagolewski, 2019-02-21) * #30: new function */ SEXP stri_sub_all(SEXP str, SEXP from, SEXP to, SEXP length) { PROTECT(str = stri_prepare_arg_string(str, "str")); PROTECT(from = stri_prepare_arg_list(from, "from")); PROTECT(to = stri_prepare_arg_list(to, "to")); PROTECT(length = stri_prepare_arg_list(length, "length")); R_len_t str_len = LENGTH(str); R_len_t from_len = LENGTH(from); // R_len_t to_len = LENGTH(to); // R_len_t length_len = LENGTH(length); R_len_t vectorize_len; if (!isNull(to)) vectorize_len = stri__recycling_rule(true, 3, str_len, from_len, LENGTH(to)); else if (!isNull(length)) vectorize_len = stri__recycling_rule(true, 3, str_len, from_len, LENGTH(length)); else vectorize_len = stri__recycling_rule(true, 2, str_len, from_len); if (vectorize_len <= 0) { UNPROTECT(4); return Rf_allocVector(VECSXP, 0); } // no STRI__ERROR_HANDLER_BEGIN block ---- stri_sub can longjmp with Rf_error... SEXP ret, str_tmp, tmp; PROTECT(ret = Rf_allocVector(VECSXP, vectorize_len)); //5 PROTECT(str_tmp = Rf_allocVector(STRSXP, 1)); //6 for (R_len_t i = 0; i<vectorize_len; ++i) { PROTECT(tmp = STRING_ELT(str, i%str_len)); SET_STRING_ELT(str_tmp, 0, tmp); UNPROTECT(1); //tmp if (!isNull(to)) { PROTECT(tmp = stri_sub(str_tmp, VECTOR_ELT(from, i%from_len), VECTOR_ELT(to, i%LENGTH(to)), R_NilValue)); } else if (!isNull(length)) { PROTECT(tmp = stri_sub(str_tmp, VECTOR_ELT(from, i%from_len), R_NilValue, VECTOR_ELT(length, i%LENGTH(length)))); } else { PROTECT(tmp = stri_sub(str_tmp, VECTOR_ELT(from, i%from_len), R_NilValue, R_NilValue)); } SET_VECTOR_ELT(ret, i, tmp); UNPROTECT(1); //tmp } UNPROTECT(6); return ret; }
/** * Get substring * * * @param str character vector * @param from integer vector (possibly with negative indices) * @param to integer vector (possibly with negative indices) or NULL * @param length integer vector or NULL * @return character vector * * @version 0.1-?? (Bartek Tartanus) * stri_sub * * @version 0.1-?? (Marek Gagolewski) * use StriContainerUTF8 and stri__UChar32_to_UTF8_index * * @version 0.1-?? (Marek Gagolewski, 2013-06-01) * use StriContainerUTF8's UChar32-to-UTF8 index * * @version 0.1-?? (Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 0.2-1 (Marek Gagolewski, 2014-03-20) * Use StriContainerUTF8_indexable * * @version 0.2-1 (Marek Gagolewski, 2014-04-03) * Use stri__sub_prepare_from_to_length() * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.5-9003 (Marek Gagolewski, 2015-08-05) * Bugfix #183: floating point exception when to or length is an empty vector */ SEXP stri_sub(SEXP str, SEXP from, SEXP to, SEXP length) { PROTECT(str = stri_prepare_arg_string(str, "str")); R_len_t str_len = LENGTH(str); R_len_t from_len = 0; R_len_t to_len = 0; R_len_t length_len = 0; int* from_tab = 0; int* to_tab = 0; int* length_tab = 0; R_len_t sub_protected = 1+ /* how many objects to PROTECT on ret? */ stri__sub_prepare_from_to_length(from, to, length, from_len, to_len, length_len, from_tab, to_tab, length_tab); R_len_t vectorize_len = stri__recycling_rule(true, 3, str_len, from_len, (to_len>length_len)?to_len:length_len); if (vectorize_len <= 0) { UNPROTECT(sub_protected); return Rf_allocVector(STRSXP, 0); } STRI__ERROR_HANDLER_BEGIN(sub_protected) StriContainerUTF8_indexable str_cont(str, vectorize_len); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_len)); for (R_len_t i = str_cont.vectorize_init(); i != str_cont.vectorize_end(); i = str_cont.vectorize_next(i)) { R_len_t cur_from = from_tab[i % from_len]; R_len_t cur_to = (to_tab)?to_tab[i % to_len]:length_tab[i % length_len]; if (str_cont.isNA(i) || cur_from == NA_INTEGER || cur_to == NA_INTEGER) { SET_STRING_ELT(ret, i, NA_STRING); continue; } if (length_tab) { if (cur_to <= 0) { SET_STRING_ELT(ret, i, R_BlankString); continue; } cur_to = cur_from + cur_to - 1; if (cur_from < 0 && cur_to >= 0) cur_to = -1; } const char* str_cur_s = str_cont.get(i).c_str(); R_len_t cur_from2; // UTF-8 byte indices R_len_t cur_to2; // UTF-8 byte indices stri__sub_get_indices(str_cont, i, cur_from, cur_to, cur_from2, cur_to2); if (cur_to2 > cur_from2) { // just copy SET_STRING_ELT(ret, i, Rf_mkCharLenCE(str_cur_s+cur_from2, cur_to2-cur_from2, CE_UTF8)); } else { // maybe a warning here? SET_STRING_ELT(ret, i, Rf_mkCharLen(NULL, 0)); } } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) }
/** * Pad a string * * vectorized over str, length and pad * if str or pad or length is NA the result will be NA * * @param str character vector * @param min_length integer vector * @param side [internal int] * @param pad character vector * @param use_length single logical value * @return character vector * * @version 0.1-?? (Bartlomiej Tartanus) * * @version 0.2-2 (Marek Gagolewski, 2014-04-20) * use stri_error_handler, pad should be a single code point, not byte * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.5-1 (Marek Gagolewski, 2015-04-22) * `use_length` arg added, * second argument renamed `width` */ SEXP stri_pad(SEXP str, SEXP width, SEXP side, SEXP pad, SEXP use_length) { // this is an internal arg, check manually, error() allowed here if (!Rf_isInteger(side) || LENGTH(side) != 1) Rf_error(MSG__INCORRECT_INTERNAL_ARG); int _side = INTEGER(side)[0]; if (_side < 0 || _side > 2) Rf_error(MSG__INCORRECT_INTERNAL_ARG); bool use_length_val = stri__prepare_arg_logical_1_notNA(use_length, "use_length"); PROTECT(str = stri_prepare_arg_string(str, "str")); PROTECT(width = stri_prepare_arg_integer(width, "width")); PROTECT(pad = stri_prepare_arg_string(pad, "pad")); // side = stri_prepare_arg_string(side, "side"); // const char* side_opts[] = {"left", "right", "both", NULL}; R_len_t str_length = LENGTH(str); R_len_t width_length = LENGTH(width); // R_len_t side_length = LENGTH(side); R_len_t pad_length = LENGTH(pad); R_len_t vectorize_length = stri__recycling_rule(true, 3, str_length, width_length, /*side_length, */ pad_length); STRI__ERROR_HANDLER_BEGIN(3) StriContainerUTF8 str_cont(str, vectorize_length); StriContainerInteger width_cont(width, vectorize_length); // StriContainerUTF8 side_cont(side, vectorize_length); StriContainerUTF8 pad_cont(pad, vectorize_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length)); String8buf buf(0); // TODO: prealloc for (R_len_t i=0; i<vectorize_length; ++i) { if (str_cont.isNA(i) || pad_cont.isNA(i) || /*side_cont.isNA(i) ||*/ width_cont.isNA(i)) { SET_STRING_ELT(ret, i, NA_STRING); continue; } // get the current string R_len_t str_cur_n = str_cont.get(i).length(); const char* str_cur_s = str_cont.get(i).c_str(); R_len_t str_cur_width; // get the width/length of padding code point(s) R_len_t pad_cur_n = pad_cont.get(i).length(); const char* pad_cur_s = pad_cont.get(i).c_str(); R_len_t pad_cur_width; if (use_length_val) { pad_cur_width = 1; str_cur_width = str_cont.get(i).countCodePoints(); R_len_t k = 0; UChar32 pad_cur = 0; U8_NEXT(pad_cur_s, k, pad_cur_n, pad_cur); if (pad_cur <= 0 || k < pad_cur_n) throw StriException(MSG__NOT_EQ_N_CODEPOINTS, "pad", 1); } else { pad_cur_width = stri__width_string(pad_cur_s, pad_cur_n); str_cur_width = stri__width_string(str_cur_s, str_cur_n); if (pad_cur_width != 1) throw StriException(MSG__NOT_EQ_N_WIDTH, "pad", 1); } // get the minimal width R_len_t width_cur = width_cont.get(i); if (str_cur_width >= width_cur) { // no padding at all SET_STRING_ELT(ret, i, str_cont.toR(i)); continue; } R_len_t padnum = width_cur-str_cur_width; buf.resize(str_cur_n+padnum*pad_cur_n, false); char* buftmp = buf.data(); R_len_t k = 0; switch(_side) { case 0: // left for (k=0; k<padnum; ++k) { memcpy(buftmp, pad_cur_s, pad_cur_n); buftmp += pad_cur_n; } memcpy(buftmp, str_cur_s, str_cur_n); buftmp += str_cur_n; break; case 1: // right memcpy(buftmp, str_cur_s, str_cur_n); buftmp += str_cur_n; for (k=0; k<padnum; ++k) { memcpy(buftmp, pad_cur_s, pad_cur_n); buftmp += pad_cur_n; } break; case 2: // both for (k=0; k<padnum/2; ++k) { memcpy(buftmp, pad_cur_s, pad_cur_n); buftmp += pad_cur_n; } memcpy(buftmp, str_cur_s, str_cur_n); buftmp += str_cur_n; for (; k<padnum; ++k) { memcpy(buftmp, pad_cur_s, pad_cur_n); buftmp += pad_cur_n; } break; } SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), (int)(buftmp-buf.data()), CE_UTF8)); } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) }
/** * Extract all occurences of a character class in each string * * @param str character vector * @param pattern character vector * @return list of character vectors * * @version 0.1 (Marek Gagolewski, 2013-06-08) * @version 0.2 (Marek Gagolewski, 2013-06-15) Use StrContainerCharClass * @version 0.3 (Marek Gagolewski, 2013-06-16) make StriException-friendly */ SEXP stri_extract_all_charclass(SEXP str, SEXP pattern, SEXP merge) { str = stri_prepare_arg_string(str, "str"); pattern = stri_prepare_arg_string(pattern, "pattern"); merge = stri_prepare_arg_logical(merge, "merge"); R_len_t vectorize_length = stri__recycling_rule(true, 3, LENGTH(str), LENGTH(pattern), LENGTH(merge)); STRI__ERROR_HANDLER_BEGIN StriContainerUTF8 str_cont(str, vectorize_length); StriContainerCharClass pattern_cont(pattern, vectorize_length); StriContainerLogical merge_cont(merge, vectorize_length); SEXP notfound; // this vector will be set iff not found or NA PROTECT(notfound = stri__vector_NA_strings(1)); SEXP ret; PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length)); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (pattern_cont.isNA(i) || str_cont.isNA(i) || merge_cont.isNA(i)) { SET_VECTOR_ELT(ret, i, notfound); continue; } bool merge_cur = merge_cont.get(i); CharClass pattern_cur = pattern_cont.get(i); R_len_t str_cur_n = str_cont.get(i).length(); const char* str_cur_s = str_cont.get(i).c_str(); R_len_t j, jlast; UChar32 chr; deque<R_len_t_x2> occurences; // codepoint based-indices for (jlast=j=0; j<str_cur_n; ) { U8_NEXT(str_cur_s, j, str_cur_n, chr); if (pattern_cur.test(chr)) { occurences.push_back(R_len_t_x2(jlast, j)); } jlast = j; } R_len_t noccurences = (R_len_t)occurences.size(); if (noccurences == 0) SET_VECTOR_ELT(ret, i, notfound); else if (merge_cur && noccurences > 1) { // do merge deque<R_len_t_x2> occurences2; deque<R_len_t_x2>::iterator iter = occurences.begin(); occurences2.push_back(*iter); for (++iter; iter != occurences.end(); ++iter) { R_len_t_x2 curoccur = *iter; if (occurences2.back().v2 == curoccur.v1) { // continue seq occurences2.back().v2 = curoccur.v2; // change `end` } else { // new seq occurences2.push_back(curoccur); } } // create resulting matrix from occurences2 R_len_t noccurences2 = (R_len_t)occurences2.size(); SEXP cur_res; PROTECT(cur_res = Rf_allocVector(STRSXP, noccurences2)); iter = occurences2.begin(); for (R_len_t f = 0; iter != occurences2.end(); ++iter, ++f) { R_len_t_x2 curo = *iter; SET_STRING_ELT(cur_res, f, Rf_mkCharLenCE(str_cur_s+curo.v1, curo.v2-curo.v1, CE_UTF8)); } SET_VECTOR_ELT(ret, i, cur_res); UNPROTECT(1); } else { // do not merge SEXP cur_res; PROTECT(cur_res = Rf_allocVector(STRSXP, noccurences)); deque<R_len_t_x2>::iterator iter = occurences.begin(); for (R_len_t f = 0; iter != occurences.end(); ++iter, ++f) { R_len_t_x2 curo = *iter; SET_STRING_ELT(cur_res, f, Rf_mkCharLenCE(str_cur_s+curo.v1, curo.v2-curo.v1, CE_UTF8)); } SET_VECTOR_ELT(ret, i, cur_res); UNPROTECT(1); } } UNPROTECT(2); return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) }
/** * Escape Unicode code points * * @param str character vector * @return character vector * * @version 0.1-?? (Marek Gagolewski, 2013-08-17) * * @version 0.2-1 (Marek Gagolewski, 2014-04-01) * fail on incorrect utf8 byte seqs; * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_escape_unicode(SEXP str) { PROTECT(str = stri_prepare_arg_string(str, "str")); // prepare string argument STRI__ERROR_HANDLER_BEGIN(1) R_len_t str_length = LENGTH(str); StriContainerUTF8 str_cont(str, str_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, str_length)); std::string out; // @TODO: estimate len a priori? for (R_len_t i = str_cont.vectorize_init(); i != str_cont.vectorize_end(); i = str_cont.vectorize_next(i)) { if (str_cont.isNA(i)) { SET_STRING_ELT(ret, i, NA_STRING); continue; } const char* str_cur_s = str_cont.get(i).c_str(); R_len_t str_cur_n = str_cont.get(i).length(); // estimate buf size R_len_t bufsize = 0; UChar32 c; R_len_t j = 0; while (j < str_cur_n) { U8_NEXT(str_cur_s, j, str_cur_n, c); if (c < 0) throw StriException(MSG__INVALID_UTF8); else if ((char)c >= 32 || (char)c <= 126) bufsize += 1; else if (c <= 0xff) bufsize += 6; // for \a, \n this will be overestimated else bufsize += 10; } out.clear(); if ((size_t)bufsize > (size_t)out.size()) out.reserve(bufsize); // do escape j = 0; char buf[11]; while (j < str_cur_n) { U8_NEXT(str_cur_s, j, str_cur_n, c); /* if (c < 0) throw StriException(MSG__INVALID_UTF8); // this has already been checked :) else */ if (c <= ASCII_MAXCHARCODE) { switch ((char)c) { case 0x07: out.append("\\a"); break; case 0x08: out.append("\\b"); break; case 0x09: out.append("\\t"); break; case 0x0a: out.append("\\n"); break; case 0x0b: out.append("\\v"); break; case 0x0c: out.append("\\f"); break; case 0x0d: out.append("\\r"); break; // case 0x1b: out.append("\\e"); break; // R doesn't know that case 0x22: out.append("\\\""); break; case 0x27: out.append("\\'"); break; case 0x5c: out.append("\\\\"); break; default: if ((char)c >= 32 || (char)c <= 126) // printable characters out.append(1, (char)c); else { sprintf(buf, "\\u%4.4x", (uint16_t)c); out.append(buf, 6); } } } else if (c <= 0xffff) { sprintf(buf, "\\u%4.4x", (uint16_t)c); out.append(buf, 6); } else { sprintf(buf, "\\U%8.8x", (uint32_t)c); out.append(buf, 10); } } SET_STRING_ELT(ret, i, Rf_mkCharLenCE(out.c_str(), (int)out.size(), (cetype_t)CE_UTF8) ); } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) }
/** * Count the number of characters in a string * * Note that ICU permits only strings of length < 2^31. * @param s R character vector * @return integer vector * @version 0.1 (Marcin Bujarski) * @version 0.2 (Marek Gagolewski) Multiple input encoding support * @version 0.3 (Marek Gagolewski, 2013-06-16) make StriException-friendly */ SEXP stri_length(SEXP str) { str = stri_prepare_arg_string(str, "str"); R_len_t ns = LENGTH(str); SEXP ret; UConverter* uconv = NULL; bool uconv_8bit = false; bool uconv_utf8 = false; STRI__ERROR_HANDLER_BEGIN /* Note: ICU50 permits only int-size strings in U8_NEXT and U8_FWD_1 */ #define STRI_LENGTH_CALCULATE_UTF8 \ const char* qc = CHAR(q); \ R_len_t j = 0; \ for (R_len_t i = 0; i < nq; j++) \ U8_FWD_1(qc, i, nq); \ retint[k] = j; PROTECT(ret = Rf_allocVector(INTSXP, ns)); int* retint = INTEGER(ret); for (R_len_t k = 0; k < ns; k++) { SEXP q = STRING_ELT(str, k); if (q == NA_STRING) retint[k] = NA_INTEGER; else { R_len_t nq = LENGTH(q); // O(1) - stored by R // We trust (is that a wise assumption?) // R encoding marks; However, it there is no mark, // the string may have any encoding (ascii, latin1, utf8, native) if (IS_ASCII(q) || IS_LATIN1(q)) retint[k] = nq; else if (IS_BYTES(q)) throw StriException(MSG__BYTESENC); else if (IS_UTF8(q)) { STRI_LENGTH_CALCULATE_UTF8 } else { // Any encoding - detection needed // UTF-8 strings can be fairly reliably recognized as such by a // simple algorithm, i.e., the probability that a string of // characters in any other encoding appears as valid UTF-8 is low, // diminishing with increasing string length. // We have two possibilities here: // 1. Auto detect encoding: Is this ASCII or UTF-8? If not => use Native // This won't work correctly in some cases. // e.g. (c4,85) represents ("Polish a with ogonek") in UTF-8 // and ("A umlaut", "Ellipsis") in WINDOWS-1250 // 2. Assume it's Native; this assumes the user working in an 8-bit environment // would convert strings to UTF-8 manually if needed - I think is's // a more reasonable approach (Native --> input via keyboard) if (!uconv) { // open ucnv on demand uconv = stri__ucnv_open((const char*)NULL); // native decoder if (!uconv) { retint[k] = NA_INTEGER; continue; } uconv_8bit = ((int)ucnv_getMaxCharSize(uconv) == 1); if (!uconv_8bit) { UErrorCode err = U_ZERO_ERROR; const char* name = ucnv_getName(uconv, &err); if (U_FAILURE(err)) throw StriException("could not query default converter"); uconv_utf8 = !strncmp("UTF-8", name, 5); } } if (uconv_8bit) { retint[k] = nq; // it's an 8-bit encoding :-) } else if (uconv_utf8) { // it's UTF-8 STRI_LENGTH_CALCULATE_UTF8 } else { // native encoding which is neither 8-bit, nor UTF-8 (e.g. 'Big5') UErrorCode err = U_ZERO_ERROR; const char* source = CHAR(q); const char* sourceLimit = source + nq; R_len_t j; for (j = 0; source != sourceLimit; j++) { if (U_FAILURE(err)) break; // error from previous iteration // iterate through each native-encoded character: ucnv_getNextUChar(uconv, &source, sourceLimit, &err); } if (U_FAILURE(err)) { // error from last iteration Rf_warning("error determining length for native, neither 8-bit- nor UTF-8-encoded string."); retint[k] = NA_INTEGER; } else retint[k] = j; // all right, we got it! } } }
/** * Count the number of characters in a string * * Note that ICU permits only strings of length < 2^31. * @param s R character vector * @return integer vector * * @version 0.1-?? (Marcin Bujarski) * * @version 0.1-?? (Marek Gagolewski) * Multiple input encoding support * * @version 0.1-?? (Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 0.2-1 (Marek Gagolewski, 2014-03-27) * using StriUcnv; * warn on invalid utf-8 sequences * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_length(SEXP str) { PROTECT(str = stri_prepare_arg_string(str, "str")); STRI__ERROR_HANDLER_BEGIN(1) R_len_t str_n = LENGTH(str); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(INTSXP, str_n)); int* retint = INTEGER(ret); StriUcnv ucnvNative(NULL); for (R_len_t k = 0; k < str_n; k++) { SEXP curs = STRING_ELT(str, k); if (curs == NA_STRING) { retint[k] = NA_INTEGER; continue; } R_len_t curs_n = LENGTH(curs); // O(1) - stored by R if (IS_ASCII(curs) || IS_LATIN1(curs)) { retint[k] = curs_n; } else if (IS_BYTES(curs)) { throw StriException(MSG__BYTESENC); } else if (IS_UTF8(curs) || ucnvNative.isUTF8()) { // utf8 or native-utf8 UChar32 c = 0; const char* curs_s = CHAR(curs); R_len_t j = 0; R_len_t i = 0; while (c >= 0 && j < curs_n) { U8_NEXT(curs_s, j, curs_n, c); // faster that U8_FWD_1 & gives bad UChar32s i++; } if (c < 0) { // invalid utf-8 sequence Rf_warning(MSG__INVALID_UTF8); retint[k] = NA_INTEGER; } else retint[k] = i; } else if (ucnvNative.is8bit()) { // native-8bit retint[k] = curs_n; } else { // native encoding, not 8 bit UConverter* uconv = ucnvNative.getConverter(); // native encoding which is neither 8-bit, nor UTF-8 (e.g. 'Big5') // this is weird, but we'll face it UErrorCode status = U_ZERO_ERROR; const char* source = CHAR(curs); const char* sourceLimit = source + curs_n; R_len_t j; for (j = 0; source != sourceLimit; j++) { /*ignore_retval=*/ucnv_getNextUChar(uconv, &source, sourceLimit, &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } retint[k] = j; // all right, we got it! } } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({ /* no special action on error */ }) }
/** * Convert case (upper, lowercase) * * * @param str character vector * @param locale single string identifying * the locale ("" or NULL for default locale) * @return character vector * * * @version 0.1-?? (Marek Gagolewski) * * @version 0.1-?? (Marek Gagolewski) * use StriContainerUTF16 * * @version 0.1-?? (Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 0.1-?? (Marek Gagolewski, 2013-11-19) * use UCaseMap + StriContainerUTF8 * **THIS DOES NOT WORK WITH ICU 4.8**, we have to revert the changes * ** BTW, since stringi_0.1-25 we require ICU>=50 ** * * @version 0.2-1 (Marek Gagolewski, 2014-03-18) * use UCaseMap + StriContainerUTF8 * (this is much faster for UTF-8 and slightly faster for 8bit enc) * Estimates minimal buffer size. * * @version 0.3-1 (Marek Gagolewski, 2014-10-24) * Use a custom BreakIterator with stri_trans_totitle * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.4-1 (Marek Gagolewski, 2014-12-03) * use StriUBreakIterator * * @version 0.6-1 (Marek Gagolewski, 2015-07-11) * now this is an internal function */ SEXP stri_trans_casemap(SEXP str, int _type, SEXP locale) { if (_type < 1 || _type > 2) Rf_error(MSG__INCORRECT_INTERNAL_ARG); const char* qloc = stri__prepare_arg_locale(locale, "locale", true); /* this is R_alloc'ed */ PROTECT(str = stri_prepare_arg_string(str, "str")); // prepare string argument // version 0.2-1 - Does not work with ICU 4.8 (but we require ICU >= 50) UCaseMap* ucasemap = NULL; STRI__ERROR_HANDLER_BEGIN(1) UErrorCode status = U_ZERO_ERROR; ucasemap = ucasemap_open(qloc, U_FOLD_CASE_DEFAULT, &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) R_len_t str_n = LENGTH(str); StriContainerUTF8 str_cont(str, str_n); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, str_n)); // STEP 1. // Estimate the required buffer length // Notice: The resulting number of codepoints may be larger or smaller than // the number before casefolding R_len_t bufsize = str_cont.getMaxNumBytes(); bufsize += 10; // a small margin String8buf buf(bufsize); // STEP 2. // Do case folding for (R_len_t i = str_cont.vectorize_init(); i != str_cont.vectorize_end(); i = str_cont.vectorize_next(i)) { if (str_cont.isNA(i)) { SET_STRING_ELT(ret, i, NA_STRING); continue; } R_len_t str_cur_n = str_cont.get(i).length(); const char* str_cur_s = str_cont.get(i).c_str(); status = U_ZERO_ERROR; int buf_need; if (_type == 1) buf_need = ucasemap_utf8ToLower(ucasemap, buf.data(), buf.size(), (const char*)str_cur_s, str_cur_n, &status); else buf_need = ucasemap_utf8ToUpper(ucasemap, buf.data(), buf.size(), (const char*)str_cur_s, str_cur_n, &status); if (U_FAILURE(status)) { /* retry */ buf.resize(buf_need, false/*destroy contents*/); status = U_ZERO_ERROR; if (_type == 1) buf_need = ucasemap_utf8ToLower(ucasemap, buf.data(), buf.size(), (const char*)str_cur_s, str_cur_n, &status); else buf_need = ucasemap_utf8ToUpper(ucasemap, buf.data(), buf.size(), (const char*)str_cur_s, str_cur_n, &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) // this shouldn't happen // we do have the buffer size required to complete this op } SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), buf_need, CE_UTF8)); } if (ucasemap) { ucasemap_close(ucasemap); ucasemap = NULL;} STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({ if (ucasemap) { ucasemap_close(ucasemap); ucasemap = NULL; } }) }
/** * Trim characters from a charclass from left AND/OR right side of the string * * @param str character vector * @param pattern character vector * @param left from left? * @param right from left? * @return character vector * * @version 0.1-?? (Bartek Tartanus) * * @version 0.1-?? (Marek Gagolewski, 2013-06-04) * Use StriContainerUTF8 and CharClass * * @version 0.1-?? (Marek Gagolewski, 2013-06-16) * make StriException-friendly & Use StrContainerCharClass * * @version 0.2-1 (Marek Gagolewski, 2014-04-03) * detects invalid UTF-8 byte stream * * @version 0.2-1 (Marek Gagolewski, 2014-04-05) * StriContainerCharClass now relies on UnicodeSet * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri__trim_leftright(SEXP str, SEXP pattern, bool left, bool right) { PROTECT(str = stri_prepare_arg_string(str, "str")); PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern")); R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); STRI__ERROR_HANDLER_BEGIN(2) StriContainerUTF8 str_cont(str, vectorize_length); StriContainerCharClass pattern_cont(pattern, vectorize_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length)); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (str_cont.isNA(i) || pattern_cont.isNA(i)) { SET_STRING_ELT(ret, i, NA_STRING); continue; } const UnicodeSet* pattern_cur = &pattern_cont.get(i); R_len_t str_cur_n = str_cont.get(i).length(); const char* str_cur_s = str_cont.get(i).c_str(); R_len_t jlast1 = 0; R_len_t jlast2 = str_cur_n; if (left) { UChar32 chr; for (R_len_t j=0; j<str_cur_n; ) { U8_NEXT(str_cur_s, j, str_cur_n, chr); // "look ahead" if (chr < 0) // invalid utf-8 sequence throw StriException(MSG__INVALID_UTF8); if (pattern_cur->contains(chr)) { break; // break at first occurrence } jlast1 = j; } } if (right && jlast1 < str_cur_n) { UChar32 chr; for (R_len_t j=str_cur_n; j>0; ) { U8_PREV(str_cur_s, 0, j, chr); // "look behind" if (chr < 0) // invalid utf-8 sequence throw StriException(MSG__INVALID_UTF8); if (pattern_cur->contains(chr)) { break; // break at first occurrence } jlast2 = j; } } // now jlast is the index, from which we start copying SET_STRING_ELT(ret, i, Rf_mkCharLenCE(str_cur_s+jlast1, (jlast2-jlast1), CE_UTF8)); } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) }
/** * Convert case (TitleCase) * * * @param str character vector * @param opts_brkiter list * @return character vector * * @version 0.4-1 (Marek Gagolewski, 2014-12-03) * separated from stri_trans_casemap; * use StriUBreakIterator */ SEXP stri_trans_totitle(SEXP str, SEXP opts_brkiter) { StriBrkIterOptions opts_brkiter2(opts_brkiter, "word"); PROTECT(str = stri_prepare_arg_string(str, "str")); // prepare string argument // version 0.2-1 - Does not work with ICU 4.8 (but we require ICU >= 50) UCaseMap* ucasemap = NULL; STRI__ERROR_HANDLER_BEGIN(1) StriUBreakIterator brkiter(opts_brkiter2); UErrorCode status = U_ZERO_ERROR; ucasemap = ucasemap_open(brkiter.getLocale(), U_FOLD_CASE_DEFAULT, &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) status = U_ZERO_ERROR; ucasemap_setBreakIterator(ucasemap, brkiter.getIterator(), &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) brkiter.free(false); // ucasemap_setOptions(ucasemap, U_TITLECASE_NO_LOWERCASE, &status); // to do? // now briter is owned by ucasemap. // it will be released on ucasemap_close // (checked with ICU man & src code) R_len_t str_n = LENGTH(str); StriContainerUTF8 str_cont(str, str_n); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, str_n)); // STEP 1. // Estimate the required buffer length // Notice: The resulting number of codepoints may be larger or smaller than // the number before casefolding R_len_t bufsize = str_cont.getMaxNumBytes(); bufsize += 10; // a small margin String8buf buf(bufsize); // STEP 2. // Do case folding for (R_len_t i = str_cont.vectorize_init(); i != str_cont.vectorize_end(); i = str_cont.vectorize_next(i)) { if (str_cont.isNA(i)) { SET_STRING_ELT(ret, i, NA_STRING); continue; } R_len_t str_cur_n = str_cont.get(i).length(); const char* str_cur_s = str_cont.get(i).c_str(); status = U_ZERO_ERROR; int buf_need = ucasemap_utf8ToTitle(ucasemap, buf.data(), buf.size(), (const char*)str_cur_s, str_cur_n, &status); if (U_FAILURE(status)) { buf.resize(buf_need, false/*destroy contents*/); status = U_ZERO_ERROR; buf_need = ucasemap_utf8ToTitle(ucasemap, buf.data(), buf.size(), (const char*)str_cur_s, str_cur_n, &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) // this shouldn't happen // we do have the buffer size required to complete this op } SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), buf_need, CE_UTF8)); } if (ucasemap) { ucasemap_close(ucasemap); ucasemap = NULL;} STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({ if (ucasemap) { ucasemap_close(ucasemap); ucasemap = NULL; } }) }
/** * Substring replacement function * * * @param str character vector * @param from integer vector (possibly with negative indices) * @param to integer vector (possibly with negative indices) or NULL * @param length integer vector or NULL * @param omit_na logical scalar * @param value character vector replacement * @return character vector * * @version 0.1-?? (Bartek Tartanus) * * @version 0.1-?? (Marek Gagolewski) * use StriContainerUTF8 and stri__UChar32_to_UTF8_index * * @version 0.1-?? (Marek Gagolewski, 2013-06-01) * use StriContainerUTF8's UChar32-to-UTF8 index * * @version 0.1-?? (Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 0.2-1 (Marek Gagolewski, 2014-03-20) * Use StriContainerUTF8_indexable * * @version 0.2-1 (Marek Gagolewski, 2014-04-03) * Use stri__sub_prepare_from_to_length() * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.5-9003 (Marek Gagolewski, 2015-08-05) * Bugfix #183: floating point exception when to or length is an empty vector * * @version 1.0-2 (Marek Gagolewski, 2016-01-31) * FR #199: new arg: `omit_na` * FR #207: allow insertions * * * @version 1.4.3 (Marek Gagolewski, 2019-03-12) * #346: na_omit for `value` */ SEXP stri_sub_replacement(SEXP str, SEXP from, SEXP to, SEXP length, SEXP omit_na, SEXP value) { PROTECT(str = stri_prepare_arg_string(str, "str")); PROTECT(value = stri_prepare_arg_string(value, "value")); bool omit_na_1 = stri__prepare_arg_logical_1_notNA(omit_na, "omit_na"); R_len_t value_len = LENGTH(value); R_len_t str_len = LENGTH(str); R_len_t from_len = 0; // see below R_len_t to_len = 0; // see below R_len_t length_len = 0; // see below int* from_tab = 0; // see below int* to_tab = 0; // see below int* length_tab = 0; // see below R_len_t sub_protected = 2+ /* how many objects to PROTECT on ret? */ stri__sub_prepare_from_to_length(from, to, length, from_len, to_len, length_len, from_tab, to_tab, length_tab); R_len_t vectorize_len = stri__recycling_rule(true, 4, str_len, value_len, from_len, (to_len>length_len)?to_len:length_len); if (vectorize_len <= 0) { UNPROTECT(sub_protected); return Rf_allocVector(STRSXP, 0); } STRI__ERROR_HANDLER_BEGIN(sub_protected) StriContainerUTF8_indexable str_cont(str, vectorize_len); StriContainerUTF8 value_cont(value, vectorize_len); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_len)); String8buf buf(0); // @TODO: estimate bufsize a priori for (R_len_t i = str_cont.vectorize_init(); i != str_cont.vectorize_end(); i = str_cont.vectorize_next(i)) { R_len_t cur_from = from_tab[i % from_len]; R_len_t cur_to = (to_tab)?to_tab[i % to_len]:length_tab[i % length_len]; if (str_cont.isNA(i)) { SET_STRING_ELT(ret, i, NA_STRING); continue; } if (cur_from == NA_INTEGER || cur_to == NA_INTEGER || value_cont.isNA(i)) { if (omit_na_1) { SET_STRING_ELT(ret, i, str_cont.toR(i)); } else { SET_STRING_ELT(ret, i, NA_STRING); } continue; } if (length_tab) { if (cur_to <= 0) { // SET_STRING_ELT(ret, i, R_BlankString); // continue; cur_to = 0; } else { cur_to = cur_from + cur_to - 1; if (cur_from < 0 && cur_to >= 0) cur_to = -1; } } const char* str_cur_s = str_cont.get(i).c_str(); R_len_t str_cur_n = str_cont.get(i).length(); const char* value_cur_s = value_cont.get(i).c_str(); R_len_t value_cur_n = value_cont.get(i).length(); R_len_t cur_from2; // UTF-8 byte indices R_len_t cur_to2; // UTF-8 byte indices stri__sub_get_indices(str_cont, i, cur_from, cur_to, cur_from2, cur_to2); if (cur_to2 < cur_from2) cur_to2 = cur_from2; R_len_t buflen = str_cur_n-(cur_to2-cur_from2)+value_cur_n; buf.resize(buflen, false/*destroy contents*/); memcpy(buf.data(), str_cur_s, (size_t)cur_from2); memcpy(buf.data()+cur_from2, value_cur_s, (size_t)value_cur_n); memcpy(buf.data()+cur_from2+value_cur_n, str_cur_s+cur_to2, (size_t)str_cur_n-cur_to2); SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), buflen, CE_UTF8)); } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) }
/** Generate random strings * * @param n single integer * @param length integer vector * @param pattern character vector * @return character vector * * @version 0.2-1 (Marek Gagolewski, 2014-04-04) * * @version 0.2-1 (Marek Gagolewski, 2014-04-05) * Use StriContainerCharClass which now contains UnicodeSets; * vectorized also over pattern * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_rand_strings(SEXP n, SEXP length, SEXP pattern) { int n_val = stri__prepare_arg_integer_1_notNA(n, "n"); PROTECT(length = stri_prepare_arg_integer(length, "length")); PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern")); if (n_val < 0) n_val = 0; /* that's not NA for sure now */ R_len_t length_len = LENGTH(length); if (length_len <= 0) { UNPROTECT(2); Rf_error(MSG__ARG_EXPECTED_NOT_EMPTY, "length"); } else if (length_len > n_val || n_val % length_len != 0) Rf_warning(MSG__WARN_RECYCLING_RULE2); R_len_t pattern_len = LENGTH(pattern); if (pattern_len <= 0) { UNPROTECT(2); Rf_error(MSG__ARG_EXPECTED_NOT_EMPTY, "pattern"); } else if (pattern_len > n_val || n_val % pattern_len != 0) Rf_warning(MSG__WARN_RECYCLING_RULE2); GetRNGstate(); STRI__ERROR_HANDLER_BEGIN(2) StriContainerCharClass pattern_cont(pattern, max(n_val, pattern_len)); StriContainerInteger length_cont(length, max(n_val, length_len)); // get max required bufsize int* length_tab = INTEGER(length); R_len_t bufsize = 0; for (R_len_t i=0; i<length_len; ++i) { if (length_tab[i] != NA_INTEGER && length_tab[i] > bufsize) bufsize = length_tab[i]; } bufsize *= 4; // 1 UChar32 -> max. 4 UTF-8 bytes String8buf buf(bufsize); char* bufdata = buf.data(); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, n_val)); for (R_len_t i=0; i<n_val; ++i) { if (length_cont.isNA(i) || pattern_cont.isNA(i)) { SET_STRING_ELT(ret, i, NA_STRING); continue; } int length_cur = length_cont.get(i); if (length_cur < 0) length_cur = 0; const UnicodeSet* uset = &(pattern_cont.get(i)); int32_t uset_size = uset->size(); // generate string: R_len_t j = 0; UBool err = FALSE; for (R_len_t k=0; k<length_cur; ++k) { int32_t idx = (int32_t)floor(unif_rand()*(double)uset_size); /* 0..uset_size-1 */ UChar32 c = uset->charAt(idx); if (c < 0) throw StriException(MSG__INTERNAL_ERROR); U8_APPEND((uint8_t*)bufdata, j, bufsize, c, err); if (err) throw StriException(MSG__INTERNAL_ERROR); } SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, j, CE_UTF8)); } PutRNGstate(); STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({ PutRNGstate(); }) }
/** Convert character vector to UTF-8 * * @param str character vector * @param is_unknown_8bit single logical value; * if TRUE, then in case of ENC_NATIVE or ENC_LATIN1, UTF-8 * REPLACEMENT CHARACTERs (U+FFFD) are * put for codes > 127 * @param validate single logical value (or NA) * * @return character vector * * @version 0.1-XX (Marek Gagolewski) * * @version 0.1-XX (Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 0.2-1 (Marek Gagolewski, 2014-03-26) * Use one String8buf; * is_unknown_8bit_logical and UTF-8 tries now to remove BOMs * * @version 0.2-1 (Marek Gagolewksi, 2014-03-30) * added validate arg * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_enc_toutf8(SEXP str, SEXP is_unknown_8bit, SEXP validate) { PROTECT(validate = stri_prepare_arg_logical_1(validate, "validate")); bool is_unknown_8bit_logical = stri__prepare_arg_logical_1_notNA(is_unknown_8bit, "is_unknown_8bit"); PROTECT(str = stri_prepare_arg_string(str, "str")); R_len_t n = LENGTH(str); STRI__ERROR_HANDLER_BEGIN(2) SEXP ret; if (!is_unknown_8bit_logical) { // Trivial - everything we need is in StriContainerUTF8 :) // which removes BOMs silently StriContainerUTF8 str_cont(str, n); STRI__PROTECT(ret = str_cont.toR()); } else { // get buf size R_len_t bufsize = 0; for (R_len_t i=0; i<n; ++i) { SEXP curs = STRING_ELT(str, i); if (curs == NA_STRING || IS_ASCII(curs) || IS_UTF8(curs)) continue; R_len_t ni = LENGTH(curs); if (ni > bufsize) bufsize = ni; } String8buf buf(bufsize*3); // either 1 byte < 127 or U+FFFD == 3 bytes UTF-8 char* bufdata = buf.data(); STRI__PROTECT(ret = Rf_allocVector(STRSXP, n)); for (R_len_t i=0; i<n; ++i) { SEXP curs = STRING_ELT(str, i); if (curs == NA_STRING) { SET_STRING_ELT(ret, i, NA_STRING); continue; } if (IS_ASCII(curs) || IS_UTF8(curs)) { R_len_t curs_n = LENGTH(curs); const char* curs_s = CHAR(curs); if (curs_n >= 3 && (uint8_t)(curs_s[0]) == UTF8_BOM_BYTE1 && (uint8_t)(curs_s[1]) == UTF8_BOM_BYTE2 && (uint8_t)(curs_s[2]) == UTF8_BOM_BYTE3) { // has BOM - get rid of it SET_STRING_ELT(ret, i, Rf_mkCharLenCE(curs_s+3, curs_n-3, CE_UTF8)); } else SET_STRING_ELT(ret, i, curs); continue; } // otherwise, we have an 8-bit encoding R_len_t curn = LENGTH(curs); const char* curs_tab = CHAR(curs); R_len_t k = 0; for (R_len_t j=0; j<curn; ++j) { if (U8_IS_SINGLE(curs_tab[j])) bufdata[k++] = curs_tab[j]; else { // 0xEF 0xBF 0xBD bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE1; bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE2; bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE3; } } SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, k, CE_UTF8)); } } // validate utf8 byte stream if (LOGICAL(validate)[0] != FALSE) { // NA or TRUE R_len_t ret_n = LENGTH(ret); for (R_len_t i=0; i<ret_n; ++i) { SEXP curs = STRING_ELT(ret, i); if (curs == NA_STRING) continue; const char* s = CHAR(curs); R_len_t sn = LENGTH(curs); R_len_t j = 0; UChar32 c = 0; while (c >= 0 && j < sn) { U8_NEXT(s, j, sn, c); } if (c >= 0) continue; // valid, nothing to do if (LOGICAL(validate)[0] == NA_LOGICAL) { Rf_warning(MSG__INVALID_CODE_POINT_REPLNA); SET_STRING_ELT(ret, i, NA_STRING); } else { int bufsize = sn*3; // maximum: 1 byte -> U+FFFD (3 bytes) String8buf buf(bufsize); // maximum: 1 byte -> U+FFFD (3 bytes) char* bufdata = buf.data(); j = 0; R_len_t k = 0; UBool err = FALSE; while (!err && j < sn) { U8_NEXT(s, j, sn, c); if (c >= 0) { U8_APPEND((uint8_t*)bufdata, k, bufsize, c, err); } else { Rf_warning(MSG__INVALID_CODE_POINT_FIXING); bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE1; bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE2; bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE3; } } if (err) throw StriException(MSG__INTERNAL_ERROR); SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, k, CE_UTF8)); } } } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) }
/** Generate random permutations of code points in each string * * @param str character vector * @return character vector * * @version 0.2-1 (Marek Gagolewski, 2014-04-04) * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 1.2.5 (Marek Gagolewski, 2019-07-23) * #319: Fixed overflow in `stri_rand_shuffle()`. */ SEXP stri_rand_shuffle(SEXP str) { PROTECT(str = stri_prepare_arg_string(str, "str")); R_len_t n = LENGTH(str); GetRNGstate(); STRI__ERROR_HANDLER_BEGIN(1) StriContainerUTF8 str_cont(str, n); R_len_t bufsize = 0; for (R_len_t i=0; i<n; ++i) { if (str_cont.isNA(i)) continue; R_len_t ni = str_cont.get(i).length(); if (ni > bufsize) bufsize = ni; } std::vector<UChar32> buf1(bufsize); // at most bufsize UChars32 (bufsize/4 min.) String8buf buf2(bufsize); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, n)); for (R_len_t i=0; i<n; ++i) { if (str_cont.isNA(i)) { SET_STRING_ELT(ret, i, NA_STRING); continue; } // fill buf1 UChar32 c = (UChar32)0; const char* s = str_cont.get(i).c_str(); R_len_t sn = str_cont.get(i).length(); R_len_t j = 0; R_len_t k = 0; while (c >= 0 && j < sn) { U8_NEXT(s, j, sn, c); buf1[k++] = (int)c; } if (c < 0) { Rf_warning(MSG__INVALID_UTF8); SET_STRING_ELT(ret, i, NA_STRING); continue; } // do shuffle buf1 at pos 0..k-1: (Fisher-Yates shuffle) R_len_t cur_n = k; for (j=0; j<cur_n-1; ++j) { // rand from i to cur_n-1 R_len_t r = (R_len_t)floor(unif_rand()*(double)(cur_n-j)+(double)j); UChar32 tmp = buf1[r]; buf1[r] = buf1[j]; buf1[j] = tmp; } // create string: char* buf2data = buf2.data(); c = (UChar32)0; j = 0; k = 0; UBool err = FALSE; while (!err && k < cur_n) { c = buf1[k++]; U8_APPEND((uint8_t*)buf2data, j, bufsize, c, err); } if (err) throw StriException(MSG__INTERNAL_ERROR); SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf2data, j, CE_UTF8)); } PutRNGstate(); STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({ PutRNGstate(); }) }
/** * Convert character vector between marked encodings and the encoding provided * * @param str input character vector or list of raw vectors * @param to target encoding, \code{NULL} or \code{""} for default enc * @param to_raw single logical, should list of raw vectors be returned? * @return a converted character vector or list of raw vectors * * @version 0.1-?? (Marek Gagolewski, 2013-11-12) * * @version 0.2-1 (Marek Gagolewski, 2014-03-28) * use StriUcnv * * @version 0.2-1 (Marek Gagolewski, 2014-04-01) * calc required buf size a priori * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_encode_from_marked(SEXP str, SEXP to, SEXP to_raw) { PROTECT(str = stri_prepare_arg_string(str, "str")); const char* selected_to = stri__prepare_arg_enc(to, "to", true); /* this is R_alloc'ed */ bool to_raw_logical = stri__prepare_arg_logical_1_notNA(to_raw, "to_raw"); STRI__ERROR_HANDLER_BEGIN(1) R_len_t str_n = LENGTH(str); StriContainerUTF16 str_cont(str, str_n); // get the number of strings to convert; if == 0, then you know what's the result if (str_n <= 0) return Rf_allocVector(to_raw_logical?VECSXP:STRSXP, 0); // Open converters StriUcnv ucnv(selected_to); UConverter* uconv_to = ucnv.getConverter(true /*register_callbacks*/); // Get target encoding mark cetype_t encmark_to = to_raw_logical?CE_BYTES:ucnv.getCE(); // Prepare out val SEXP ret; STRI__PROTECT(ret = Rf_allocVector(to_raw_logical?VECSXP:STRSXP, str_n)); // calculate required buf size R_len_t bufsize = 0; for (R_len_t i=0; i<str_n; ++i) { if (!str_cont.isNA(i) && str_cont.get(i).length() > bufsize) bufsize = str_cont.get(i).length(); } bufsize = UCNV_GET_MAX_BYTES_FOR_STRING(bufsize, ucnv_getMaxCharSize(uconv_to)); // "The calculated size is guaranteed to be sufficient for this conversion." String8buf buf(bufsize); for (R_len_t i=0; i<str_n; ++i) { if (str_cont.isNA(i)) { if (to_raw_logical) SET_VECTOR_ELT(ret, i, R_NilValue); else SET_STRING_ELT(ret, i, NA_STRING); continue; } R_len_t curn_tmp = str_cont.get(i).length(); const UChar* curs_tmp = str_cont.get(i).getBuffer(); // The buffer content is (probably) not NUL-terminated. if (!curs_tmp) throw StriException(MSG__INTERNAL_ERROR); UErrorCode status = U_ZERO_ERROR; ucnv_resetFromUnicode(uconv_to); R_len_t bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(), curs_tmp, curn_tmp, &status); if (bufneed <= buf.size()) { STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } else {// larger buffer needed [this shouldn't happen?] buf.resize(bufneed, false/*destroy contents*/); status = U_ZERO_ERROR; bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(), curs_tmp, curn_tmp, &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } if (to_raw_logical) { SEXP outobj; STRI__PROTECT(outobj = Rf_allocVector(RAWSXP, bufneed)); memcpy(RAW(outobj), buf.data(), (size_t)bufneed); SET_VECTOR_ELT(ret, i, outobj); STRI__UNPROTECT(1); } else { SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), bufneed, encmark_to)); } } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({/* nothing special on error */}) }
/** * Parse date-time objects * * @param str * @param format * @param tz * @param lenient * @param locale * * @return character vector * * @version 0.5-1 (Marek Gagolewski, 2015-01-08) * @version 0.5-1 (Marek Gagolewski, 2015-01-11) lenient arg added * @version 0.5-1 (Marek Gagolewski, 2015-02-22) use tz * @version 0.5-1 (Marek Gagolewski, 2015-03-01) set tzone attrib on retval */ SEXP stri_datetime_parse(SEXP str, SEXP format, SEXP lenient, SEXP tz, SEXP locale) { PROTECT(str = stri_prepare_arg_string(str, "str")); const char* locale_val = stri__prepare_arg_locale(locale, "locale", true); const char* format_val = stri__prepare_arg_string_1_notNA(format, "format"); bool lenient_val = stri__prepare_arg_logical_1_notNA(lenient, "lenient"); if (!isNull(tz)) PROTECT(tz = stri_prepare_arg_string_1(tz, "tz")); else PROTECT(tz); /* needed to set tzone attrib */ // "format" may be one of: const char* format_opts[] = { "date_full", "date_long", "date_medium", "date_short", "date_relative_full", "date_relative_long", "date_relative_medium", "date_relative_short", "time_full", "time_long", "time_medium", "time_short", "time_relative_full", "time_relative_long", "time_relative_medium", "time_relative_short", "datetime_full", "datetime_long", "datetime_medium", "datetime_short", "datetime_relative_full", "datetime_relative_long", "datetime_relative_medium", "datetime_relative_short", NULL}; int format_cur = stri__match_arg(format_val, format_opts); TimeZone* tz_val = stri__prepare_arg_timezone(tz, "tz", true/*allowdefault*/); Calendar* cal = NULL; DateFormat* fmt = NULL; STRI__ERROR_HANDLER_BEGIN(2) R_len_t vectorize_length = LENGTH(str); StriContainerUTF16 str_cont(str, vectorize_length); UnicodeString format_str(format_val); UErrorCode status = U_ZERO_ERROR; if (format_cur >= 0) { DateFormat::EStyle style = DateFormat::kNone; switch (format_cur % 8) { case 0: style = DateFormat::kFull; break; case 1: style = DateFormat::kLong; break; case 2: style = DateFormat::kMedium; break; case 3: style = DateFormat::kShort; break; case 4: style = DateFormat::kFullRelative; break; case 5: style = DateFormat::kLongRelative; break; case 6: style = DateFormat::kMediumRelative; break; case 7: style = DateFormat::kShortRelative; break; default: style = DateFormat::kNone; break; } /* ICU 54.1: Relative time styles are not currently supported. */ switch (format_cur / 8) { case 0: fmt = DateFormat::createDateInstance(style, Locale::createFromName(locale_val)); break; case 1: fmt = DateFormat::createTimeInstance((DateFormat::EStyle)(style & ~DateFormat::kRelative), Locale::createFromName(locale_val)); break; case 2: fmt = DateFormat::createDateTimeInstance(style, (DateFormat::EStyle)(style & ~DateFormat::kRelative), Locale::createFromName(locale_val)); break; default: fmt = NULL; break; } } else fmt = new SimpleDateFormat(format_str, Locale::createFromName(locale_val), status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) status = U_ZERO_ERROR; cal = Calendar::createInstance(locale_val, status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) cal->adoptTimeZone(tz_val); tz_val = NULL; /* The Calendar takes ownership of the TimeZone. */ cal->setLenient(lenient_val); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(REALSXP, vectorize_length)); for (R_len_t i=0; i<vectorize_length; ++i) { if (str_cont.isNA(i)) { REAL(ret)[i] = NA_REAL; continue; } status = U_ZERO_ERROR; ParsePosition pos; fmt->parse(str_cont.get(i), *cal, pos); if (pos.getErrorIndex() >= 0) REAL(ret)[i] = NA_REAL; else { status = U_ZERO_ERROR; REAL(ret)[i] = ((double)cal->getTime(status))/1000.0; if (U_FAILURE(status)) REAL(ret)[i] = NA_REAL; } } if (!isNull(tz)) Rf_setAttrib(ret, Rf_ScalarString(Rf_mkChar("tzone")), tz); stri__set_class_POSIXct(ret); if (tz_val) { delete tz_val; tz_val = NULL; } if (fmt) { delete fmt; fmt = NULL; } if (cal) { delete cal; cal = NULL; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({ if (tz_val) { delete tz_val; tz_val = NULL; } if (fmt) { delete fmt; fmt = NULL; } if (cal) { delete cal; cal = NULL; } }) }