/**
 * Reverse Each String
 *
 * Each non-NA element is walked from its last byte towards its first with
 * U8_PREV, and every code point obtained that way is re-encoded with
 * U8_APPEND, so the output has the same byte length as the input.
 * A negative code point reported by U8_PREV raises
 * StriException(MSG__INVALID_UTF8).
 *
 * @param str character vector
 * @return character vector with every string reversed
 *
 * @version 0.1-?? (Bartek Tartanus)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          use StriContainerUTF16
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly + StriContainerUTF8
 *          (bug fix, do reversing manually)
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-01)
 *          detect incorrect utf8 byte stream
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *          Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_reverse(SEXP str)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));

   STRI__ERROR_HANDLER_BEGIN(1)
   R_len_t n = LENGTH(str);
   StriContainerUTF8 str_cont(str, n); // no recycling: container length == n

   // Pass 1: the longest input (in bytes) determines the scratch buffer size.
   R_len_t longest = 0;
   for (R_len_t i = 0; i < n; ++i) {
      if (!str_cont.isNA(i)) {
         R_len_t len = str_cont.get(i).length();
         if (longest < len) longest = len;
      }
   }

   // Pass 2: one shared scratch buffer, one result vector.
   String8buf buf(longest);
   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, n));

   for (R_len_t i = str_cont.vectorize_init();
         i != str_cont.vectorize_end();
         i = str_cont.vectorize_next(i))
   {
      if (str_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      const char* src = str_cont.get(i).c_str();
      R_len_t src_n = str_cont.get(i).length();

      R_len_t rpos = src_n;        // read cursor, moves towards 0
      R_len_t wpos = 0;            // write cursor, moves towards src_n
      UChar32 cp;
      UBool append_failed = FALSE;

      while (!append_failed && rpos > 0) {
         U8_PREV(src, 0, rpos, cp); // decode the code point ending at rpos
         if (cp < 0) {
            throw StriException(MSG__INVALID_UTF8);
         }
         U8_APPEND((uint8_t*)buf.data(), wpos, src_n, cp, append_failed);
      }

      // the output can never be longer than the input, so this is unexpected
      if (append_failed) {
         throw StriException(MSG__INTERNAL_ERROR);
      }

      SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), src_n, CE_UTF8));
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
static void TestAppend() { static const UChar32 codePoints[]={ 0x61, 0xdf, 0x901, 0x3040, 0xac00, 0xd800, 0xdbff, 0xdcde, 0xdffd, 0xe000, 0xffff, 0x10000, 0x12345, 0xe0021, 0x10ffff, 0x110000, 0x234567, 0x7fffffff, -1, -1000, 0, 0x400 }; static const uint8_t expectUnsafe[]={ 0x61, 0xc3, 0x9f, 0xe0, 0xa4, 0x81, 0xe3, 0x81, 0x80, 0xea, 0xb0, 0x80, 0xed, 0xa0, 0x80, 0xed, 0xaf, 0xbf, 0xed, 0xb3, 0x9e, 0xed, 0xbf, 0xbd, 0xee, 0x80, 0x80, 0xef, 0xbf, 0xbf, 0xf0, 0x90, 0x80, 0x80, 0xf0, 0x92, 0x8d, 0x85, 0xf3, 0xa0, 0x80, 0xa1, 0xf4, 0x8f, 0xbf, 0xbf, /* not 0x110000 */ /* none from this line */ 0, 0xd0, 0x80 }, expectSafe[]={ 0x61, 0xc3, 0x9f, 0xe0, 0xa4, 0x81, 0xe3, 0x81, 0x80, 0xea, 0xb0, 0x80, /* no surrogates */ /* no surrogates */ 0xee, 0x80, 0x80, 0xef, 0xbf, 0xbf, 0xf0, 0x90, 0x80, 0x80, 0xf0, 0x92, 0x8d, 0x85, 0xf3, 0xa0, 0x80, 0xa1, 0xf4, 0x8f, 0xbf, 0xbf, /* not 0x110000 */ /* none from this line */ 0, 0xd0, 0x80 }; uint8_t buffer[100]; UChar32 c; int32_t i, length; UBool isError, expectIsError, wrongIsError; length=0; for(i=0; i<LENGTHOF(codePoints); ++i) { c=codePoints[i]; if(c<0 || 0x10ffff<c) { continue; /* skip non-code points for U8_APPEND_UNSAFE */ } U8_APPEND_UNSAFE(buffer, length, c); } if(length!=LENGTHOF(expectUnsafe) || 0!=memcmp(buffer, expectUnsafe, length)) { log_err("U8_APPEND_UNSAFE did not generate the expected output\n"); } length=0; wrongIsError=FALSE; for(i=0; i<LENGTHOF(codePoints); ++i) { c=codePoints[i]; expectIsError= c<0 || 0x10ffff<c || U_IS_SURROGATE(c); isError=FALSE; U8_APPEND(buffer, length, LENGTHOF(buffer), c, isError); wrongIsError|= isError!=expectIsError; } if(wrongIsError) { log_err("U8_APPEND did not set isError correctly\n"); } if(length!=LENGTHOF(expectSafe) || 0!=memcmp(buffer, expectSafe, length)) { log_err("U8_APPEND did not generate the expected output\n"); } }
/*
 * Append a full case mapping result, see UCASE_MAX_STRING_LENGTH.
 *
 * `result` is decoded as follows:
 *   - negative:                       the code point ~result
 *   - 0..UCASE_MAX_STRING_LENGTH:     a string of `result` UChars starting at s
 *   - anything larger:                the code point `result` itself
 *
 * When destIndex is still below destCapacity the output is written to dest;
 * otherwise (or when a code point does not fit) only the length is counted.
 * Returns the updated destIndex, which may exceed destCapacity.
 */
static U_INLINE int32_t
appendResult(uint8_t *dest, int32_t destIndex, int32_t destCapacity,
             int32_t result, const UChar *s) {
    UChar32 c;
    int32_t length, destLength;
    UErrorCode errorCode;
    UBool hasRoom;

    /* decode the result */
    if(result<0) {
        /* (not) original code point */
        c=~result;
        length=-1;
    } else if(result>UCASE_MAX_STRING_LENGTH) {
        c=result;
        length=-1;
    } else {
        c=U_SENTINEL;
        length=result;
    }

    hasRoom=(UBool)(destIndex<destCapacity);

    if(length<0) {
        /* single code point */
        UBool written=FALSE;
        if(hasRoom) {
            UBool isError=FALSE;
            U8_APPEND(dest, destIndex, destCapacity, c, isError);
            written=(UBool)!isError;
        }
        if(!written) {
            /* preflight, or overflow with nothing written: count only */
            destIndex+=U8_LENGTH(c);
        }
    } else {
        /* string: convert into the remaining space, or preflight with NULL/0 */
        char *target= hasRoom ? (char *)(dest+destIndex) : NULL;
        int32_t room= hasRoom ? destCapacity-destIndex : 0;

        errorCode=U_ZERO_ERROR;
        u_strToUTF8(target, room, &destLength, s, length, &errorCode);
        /* we might have an overflow, but we know the actual length */
        destIndex+=destLength;
    }

    return destIndex;
}
/** Convert from UTF-32
 *
 * Every list element (an integer vector of code points) is encoded as one
 * UTF-8 string. If U8_APPEND reports an error for a code point, or a code
 * point equals 0, a warning is issued and the corresponding result is NA.
 *
 * @param vec integer vector or list with integer vectors
 * @return character vector
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-25)
 *          StriException friendly;
 *          use StriContainerListInt
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *          Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_enc_fromutf32(SEXP vec)
{
   PROTECT(vec = stri_prepare_arg_list_integer(vec, "vec"));

   STRI__ERROR_HANDLER_BEGIN(1)
   StriContainerListInt vec_cont(vec);
   R_len_t vec_n = vec_cont.get_n();

   // The longest non-NA input vector determines the buffer size.
   R_len_t bufsize = 0;
   for (R_len_t i = 0; i < vec_n; ++i) {
      if (vec_cont.isNA(i)) continue;
      if (vec_cont.get(i).size() > bufsize)
         bufsize = vec_cont.get(i).size();
   }
   bufsize = U8_MAX_LENGTH*bufsize+1; // this will surely be sufficient

   String8buf buf(bufsize);
   char* out = buf.data();

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, vec_n));

   for (R_len_t i = 0; i < vec_n; ++i) {
      if (vec_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      const int* codes = vec_cont.get(i).data();
      R_len_t codes_n = vec_cont.get(i).size();

      UChar32 cp = (UChar32)0;   // last code point looked at (used in the warning)
      R_len_t out_n = 0;         // bytes written so far
      UBool failed = FALSE;

      for (R_len_t k = 0; !failed && k < codes_n; ++k) {
         cp = codes[k];
         U8_APPEND((uint8_t*)out, out_n, bufsize, cp, failed);

         // Rf_mkCharLenCE detects embedded nuls, but stops execution completely
         if (cp == 0) failed = TRUE;
      }

      if (!failed) {
         SET_STRING_ELT(ret, i, Rf_mkCharLenCE(out, out_n, CE_UTF8));
      }
      else {
         Rf_warning(MSG__INVALID_CODE_POINT, (int)cp);
         SET_STRING_ELT(ret, i, NA_STRING);
      }
   }

   STRI__UNPROTECT_ALL;
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
std::size_t copy_uchar_as_utf8_(char32_t const c, std::uint8_t (&buf)[4u]) { std::int32_t i = 0; constexpr std::int32_t capacity = sizeof(buf) / sizeof(buf[0u]); static_assert(capacity == 4u, ""); UBool is_error = FALSE; U8_APPEND(buf, i, capacity, c, is_error); if (is_error != FALSE) { CRADLE_THROW_EXCEPTION((std::invalid_argument(""))); } return i; }
/** Convert from UTF-32 [single string, internal]
 *
 * Code points are appended to buf one by one with U8_APPEND. As soon as
 * U8_APPEND reports an error, a warning naming the offending code point is
 * generated and -1 is returned.
 *
 * @param data UTF-32 codes
 * @param ndata number of codes
 * @param buf [out] output buffer
 * @param bufsize buffer size
 * @return number of bytes written, or -1 on error
 *
 * @version 0.1 (Marek Gagolewski)
 */
R_len_t stri__enc_fromutf32(int* data, R_len_t ndata, char* buf, R_len_t bufsize)
{
   R_len_t written = 0;
   UBool failed = FALSE;

   for (R_len_t k = 0; k < ndata; ++k) {
      UChar32 cp = data[k];
      U8_APPEND((uint8_t*)buf, written, bufsize, cp, failed);
      if (failed) {
         Rf_warning(MSG__INVALID_CODE_POINT, (int)cp);
         return -1;
      }
   }

   return written;
}
void utf8_append_codepoint(std::string& dest, UChar32 codepoint) { std::array<uint8_t, U8_MAX_LENGTH> buf; int32_t len = 0; UBool err = FALSE; // ICU has some conversions within this macro, which we can't control #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wconversion" U8_APPEND(&buf[0], len, U8_MAX_LENGTH, codepoint, err); #pragma GCC diagnostic pop if (err) throw std::runtime_error{"failed to add codepoint to string"}; dest.append(reinterpret_cast<char*>(&buf[0]), static_cast<std::size_t>(len)); }
/**
 * Obtains the first UNICODE letter from the supplied string, normalizes and
 * returns it.
 *
 * SQL function taking two arguments: argv[0] is the source text and argv[1]
 * is the locale name. The result is the phonebook index as UTF-8 text, or
 * SQL NULL when:
 *   - the argument count is not 2,
 *   - the source text is NULL or empty, or the locale is NULL,
 *   - android::GetPhonebookIndex reports an error,
 *   - a returned code unit cannot be appended to the output buffer
 *     (U8_APPEND reports an error), or
 *   - the resulting text is empty.
 */
static void get_phonebook_index(
    sqlite3_context * context, int argc, sqlite3_value ** argv)
{
    if (argc != 2) {
        sqlite3_result_null(context);
        return;
    }

    char const * src = (char const *)sqlite3_value_text(argv[0]);
    char const * locale = (char const *)sqlite3_value_text(argv[1]);
    if (src == NULL || src[0] == 0 || locale == NULL) {
        sqlite3_result_null(context);
        return;
    }

    UCharIterator iter;
    uiter_setUTF8(&iter, src, -1);

    UBool isError = FALSE;
    UChar index[SMALL_BUFFER_SIZE];

    // Pass the capacity as a number of UChar elements. sizeof(index) alone is
    // a byte count, i.e. sizeof(UChar) times larger than the array really is,
    // which would let the callee write past the end of `index`.
    // NOTE(review): GetPhonebookIndex is defined outside this file; its size
    // argument is assumed to be counted in UChars -- confirm against its
    // declaration.
    uint32_t len = android::GetPhonebookIndex(&iter, locale, index,
            sizeof(index) / sizeof(index[0]), &isError);
    if (isError) {
        sqlite3_result_null(context);
        return;
    }

    // Re-encode the UTF-16 code units as UTF-8; U8_APPEND is bounds-checked
    // against sizeof(out) and sets isError when a unit does not fit or is not
    // encodable on its own.
    uint32_t outlen = 0;
    uint8_t out[SMALL_BUFFER_SIZE];
    for (uint32_t i = 0; i < len; i++) {
        U8_APPEND(out, outlen, sizeof(out), index[i], isError);
        if (isError) {
            sqlite3_result_null(context);
            return;
        }
    }

    if (outlen == 0) {
        sqlite3_result_null(context);
        return;
    }

    // SQLITE_TRANSIENT: sqlite copies `out`, which lives on this stack frame.
    sqlite3_result_text(context, (const char*)out, outlen, SQLITE_TRANSIENT);
}
/** Convert character vector to UTF-8
 *
 * Two stages:
 *  1. Conversion. If is_unknown_8bit is FALSE, StriContainerUTF8 does all the
 *     work. Otherwise strings marked ASCII or UTF-8 are passed through (minus
 *     a leading BOM), and in any other string each byte that is not a
 *     single-byte UTF-8 unit is replaced by U+FFFD.
 *  2. Validation (skipped when validate is FALSE). Each result is decoded
 *     with U8_NEXT; an ill-formed string becomes NA (validate is NA) or has
 *     its ill-formed parts replaced by U+FFFD (validate is TRUE). A warning
 *     is issued in both cases.
 *
 * @param str character vector
 * @param is_unknown_8bit single logical value;
 * if TRUE, then in case of ENC_NATIVE or ENC_LATIN1, UTF-8
 * REPLACEMENT CHARACTERs (U+FFFD) are
 * put for codes > 127
 * @param validate single logical value (or NA)
 *
 * @return character vector
 *
 * @version 0.1-XX (Marek Gagolewski)
 *
 * @version 0.1-XX (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-26)
 *          Use one String8buf;
 *          is_unknown_8bit_logical and UTF-8 tries now to remove BOMs
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-30)
 *          added validate arg
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *          Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_enc_toutf8(SEXP str, SEXP is_unknown_8bit, SEXP validate)
{
   PROTECT(validate = stri_prepare_arg_logical_1(validate, "validate"));
   bool is_unknown_8bit_logical =
      stri__prepare_arg_logical_1_notNA(is_unknown_8bit, "is_unknown_8bit");
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   R_len_t n = LENGTH(str);

   STRI__ERROR_HANDLER_BEGIN(2)
   SEXP ret;

   if (is_unknown_8bit_logical) {
      // Scratch buffer is sized by the longest string that is neither NA,
      // ASCII nor UTF-8 -- only those are rewritten byte by byte below.
      R_len_t longest = 0;
      for (R_len_t i = 0; i < n; ++i) {
         SEXP elem = STRING_ELT(str, i);
         if (elem != NA_STRING && !IS_ASCII(elem) && !IS_UTF8(elem)) {
            R_len_t elem_n = LENGTH(elem);
            if (longest < elem_n) longest = elem_n;
         }
      }

      String8buf buf(longest*3); // either 1 byte < 127 or U+FFFD == 3 bytes UTF-8
      char* out = buf.data();

      STRI__PROTECT(ret = Rf_allocVector(STRSXP, n));
      for (R_len_t i = 0; i < n; ++i) {
         SEXP elem = STRING_ELT(str, i);
         if (elem == NA_STRING) {
            SET_STRING_ELT(ret, i, NA_STRING);
            continue;
         }

         R_len_t elem_n = LENGTH(elem);
         const char* elem_s = CHAR(elem);

         if (IS_ASCII(elem) || IS_UTF8(elem)) {
            bool has_bom = elem_n >= 3 &&
               (uint8_t)(elem_s[0]) == UTF8_BOM_BYTE1 &&
               (uint8_t)(elem_s[1]) == UTF8_BOM_BYTE2 &&
               (uint8_t)(elem_s[2]) == UTF8_BOM_BYTE3;

            if (has_bom) {
               // has BOM - get rid of it
               SET_STRING_ELT(ret, i, Rf_mkCharLenCE(elem_s+3, elem_n-3, CE_UTF8));
            }
            else {
               SET_STRING_ELT(ret, i, elem);
            }
            continue;
         }

         // otherwise, we have an 8-bit encoding
         R_len_t out_n = 0;
         for (R_len_t j = 0; j < elem_n; ++j) {
            if (!U8_IS_SINGLE(elem_s[j])) {
               // U+FFFD: 0xEF 0xBF 0xBD
               out[out_n++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE1;
               out[out_n++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE2;
               out[out_n++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE3;
               continue;
            }
            out[out_n++] = elem_s[j];
         }
         SET_STRING_ELT(ret, i, Rf_mkCharLenCE(out, out_n, CE_UTF8));
      }
   }
   else {
      // Trivial - everything we need is in StriContainerUTF8 :)
      // which removes BOMs silently
      StriContainerUTF8 str_cont(str, n);
      STRI__PROTECT(ret = str_cont.toR());
   }

   // validate utf8 byte stream
   const int validate_flag = LOGICAL(validate)[0];
   if (validate_flag != FALSE) { // NA or TRUE
      R_len_t ret_n = LENGTH(ret);
      for (R_len_t i = 0; i < ret_n; ++i) {
         SEXP elem = STRING_ELT(ret, i);
         if (elem == NA_STRING) continue;

         const char* s = CHAR(elem);
         R_len_t sn = LENGTH(elem);

         // scan until the end or until the first ill-formed sequence
         R_len_t pos = 0;
         UChar32 cp = 0;
         while (cp >= 0 && pos < sn) {
            U8_NEXT(s, pos, sn, cp);
         }
         if (cp >= 0) continue; // valid, nothing to do

         if (validate_flag == NA_LOGICAL) {
            Rf_warning(MSG__INVALID_CODE_POINT_REPLNA);
            SET_STRING_ELT(ret, i, NA_STRING);
            continue;
         }

         // validate is TRUE: rebuild the string, patching ill-formed parts
         int fix_cap = sn*3;         // maximum: 1 byte -> U+FFFD (3 bytes)
         String8buf fix_buf(fix_cap);
         char* fixed = fix_buf.data();

         R_len_t fixed_n = 0;
         UBool failed = FALSE;
         pos = 0;
         while (!failed && pos < sn) {
            U8_NEXT(s, pos, sn, cp);
            if (cp < 0) {
               Rf_warning(MSG__INVALID_CODE_POINT_FIXING);
               fixed[fixed_n++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE1;
               fixed[fixed_n++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE2;
               fixed[fixed_n++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE3;
            }
            else {
               U8_APPEND((uint8_t*)fixed, fixed_n, fix_cap, cp, failed);
            }
         }

         if (failed) {
            throw StriException(MSG__INTERNAL_ERROR);
         }
         SET_STRING_ELT(ret, i, Rf_mkCharLenCE(fixed, fixed_n, CE_UTF8));
      }
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/** Generate random permutations of code points in each string
 *
 * Each non-NA string is decoded into code points, the code points are
 * shuffled in place using draws from unif_rand(), and the result is
 * re-encoded as UTF-8. A string for which U8_NEXT yields a negative code
 * point produces a warning and an NA result.
 *
 * @param str character vector
 * @return character vector
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-04)
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *          Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 1.2.5 (Marek Gagolewski, 2019-07-23)
 *          #319: Fixed overflow in `stri_rand_shuffle()`.
 */
SEXP stri_rand_shuffle(SEXP str)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   R_len_t n = LENGTH(str);

   GetRNGstate();
   STRI__ERROR_HANDLER_BEGIN(1)
   StriContainerUTF8 str_cont(str, n);

   // longest input in bytes
   R_len_t bufsize = 0;
   for (R_len_t i = 0; i < n; ++i) {
      if (!str_cont.isNA(i)) {
         R_len_t len = str_cont.get(i).length();
         if (bufsize < len) bufsize = len;
      }
   }

   // a string of b bytes holds at most b code points,
   // and shuffling does not change the total number of bytes
   std::vector<UChar32> points(bufsize);
   String8buf outbuf(bufsize);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, n));

   for (R_len_t i = 0; i < n; ++i) {
      if (str_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      // 1. decode into `points`
      const char* s = str_cont.get(i).c_str();
      R_len_t sn = str_cont.get(i).length();
      UChar32 cp = (UChar32)0;
      R_len_t pos = 0;
      R_len_t npoints = 0;
      while (cp >= 0 && pos < sn) {
         U8_NEXT(s, pos, sn, cp);
         points[npoints++] = (int)cp;
      }

      if (cp < 0) {
         Rf_warning(MSG__INVALID_UTF8);
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      // 2. shuffle points[0..npoints-1]
      for (R_len_t j = 0; j < npoints-1; ++j) {
         // pick an index in j..npoints-1 and swap it into position j
         R_len_t pick = (R_len_t)floor(unif_rand()*(double)(npoints-j)+(double)j);
         UChar32 held = points[pick];
         points[pick] = points[j];
         points[j] = held;
      }

      // 3. encode back to UTF-8
      char* out = outbuf.data();
      R_len_t out_n = 0;
      UBool failed = FALSE;
      for (R_len_t k = 0; !failed && k < npoints; ++k) {
         cp = points[k];
         U8_APPEND((uint8_t*)out, out_n, bufsize, cp, failed);
      }

      if (failed) {
         throw StriException(MSG__INTERNAL_ERROR);
      }

      SET_STRING_ELT(ret, i, Rf_mkCharLenCE(out, out_n, CE_UTF8));
   }

   PutRNGstate();
   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END({
      PutRNGstate();
   })
}
/** Generate random strings
 *
 * Produces n strings; string i consists of length[i] code points, each drawn
 * through unif_rand() from the character class pattern[i]. Both length
 * and pattern are recycled up to n. An NA length or pattern gives an NA
 * result; a negative length is treated as 0.
 *
 * @param n single integer
 * @param length integer vector
 * @param pattern character vector
 * @return character vector
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-04)
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-05)
 *          Use StriContainerCharClass which now contains UnicodeSets;
 *          vectorized also over pattern
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *          Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_rand_strings(SEXP n, SEXP length, SEXP pattern)
{
   int n_val = stri__prepare_arg_integer_1_notNA(n, "n");
   PROTECT(length = stri_prepare_arg_integer(length, "length"));
   PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));

   if (n_val < 0) n_val = 0; /* that's not NA for sure now */

   R_len_t length_len = LENGTH(length);
   if (length_len <= 0) {
      UNPROTECT(2);
      Rf_error(MSG__ARG_EXPECTED_NOT_EMPTY, "length");
   }
   else if (length_len > n_val || n_val % length_len != 0) {
      Rf_warning(MSG__WARN_RECYCLING_RULE2);
   }

   R_len_t pattern_len = LENGTH(pattern);
   if (pattern_len <= 0) {
      UNPROTECT(2);
      Rf_error(MSG__ARG_EXPECTED_NOT_EMPTY, "pattern");
   }
   else if (pattern_len > n_val || n_val % pattern_len != 0) {
      Rf_warning(MSG__WARN_RECYCLING_RULE2);
   }

   GetRNGstate();
   STRI__ERROR_HANDLER_BEGIN(2)
   StriContainerCharClass pattern_cont(pattern, max(n_val, pattern_len));
   StriContainerInteger   length_cont(length, max(n_val, length_len));

   // The largest requested (non-NA) length determines the buffer size.
   int* requested = INTEGER(length);
   R_len_t bufsize = 0;
   for (R_len_t i = 0; i < length_len; ++i) {
      if (requested[i] == NA_INTEGER) continue;
      if (bufsize < requested[i]) bufsize = requested[i];
   }
   bufsize *= 4; // 1 UChar32 -> max. 4 UTF-8 bytes

   String8buf buf(bufsize);
   char* out = buf.data();

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, n_val));

   for (R_len_t i = 0; i < n_val; ++i) {
      if (length_cont.isNA(i) || pattern_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      int want = length_cont.get(i);
      if (want < 0) want = 0;

      const UnicodeSet* uset = &(pattern_cont.get(i));
      int32_t uset_size = uset->size();

      // draw `want` code points and encode them one after another
      R_len_t out_n = 0;
      UBool failed = FALSE;
      for (R_len_t k = 0; k < want; ++k) {
         int32_t pick = (int32_t)floor(unif_rand()*(double)uset_size); /* 0..uset_size-1 */
         UChar32 cp = uset->charAt(pick);
         if (cp < 0) {
            throw StriException(MSG__INTERNAL_ERROR);
         }

         U8_APPEND((uint8_t*)out, out_n, bufsize, cp, failed);
         if (failed) {
            throw StriException(MSG__INTERNAL_ERROR);
         }
      }

      SET_STRING_ELT(ret, i, Rf_mkCharLenCE(out, out_n, CE_UTF8));
   }

   PutRNGstate();
   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END({
      PutRNGstate();
   })
}