/*
 * Checks that the single/lead/trail classification macros -- both the
 * UTF8_* family and the U8_* family -- agree with the expected class of a
 * small set of sample code units.
 *
 * The sample table is laid out in three groups of four entries:
 *   entries 0..3   are expected to be single-byte code units,
 *   entries 4..7   are expected to be lead bytes,
 *   entries 8..11  are expected to be trail bytes.
 *
 * Any disagreement is reported through log_err(); the message shows the
 * results of the UTF8_* macros only.
 */
static void TestCodeUnitValues()
{
    static const uint8_t codeunit[]={
        0x00, 0x65, 0x7e, 0x7f,   /* group 0: single bytes */
        0xc0, 0xc4, 0xf0, 0xfd,   /* group 1: lead bytes   */
        0x80, 0x81, 0xbc, 0xbe,   /* group 2: trail bytes  */
    };
    int16_t idx;

    for(idx=0; idx<sizeof(codeunit)/sizeof(codeunit[0]); idx++){
        uint8_t unit=codeunit[idx];

        /* Evaluate every classifier once; the macros have no side effects. */
        int oldSingle = UTF8_IS_SINGLE(unit) ? 1 : 0;
        int oldLead   = UTF8_IS_LEAD(unit)   ? 1 : 0;
        int oldTrail  = UTF8_IS_TRAIL(unit)  ? 1 : 0;
        int newSingle = U8_IS_SINGLE(unit)   ? 1 : 0;
        int newLead   = U8_IS_LEAD(unit)     ? 1 : 0;
        int newTrail  = U8_IS_TRAIL(unit)    ? 1 : 0;

        log_verbose("Testing code unit value of %x\n", unit);

        /* idx/4 selects the group the sample belongs to. */
        switch(idx/4){
        case 0:
            if(!(oldSingle && !oldLead && !oldTrail &&
                 newSingle && !newLead && !newTrail)){
                log_err("ERROR: 0x%02x is a single byte but results in single: %c lead: %c trail: %c\n",
                        unit,
                        oldSingle ? 'y' : 'n',
                        oldLead ? 'y' : 'n',
                        oldTrail ? 'y' : 'n');
            }
            break;
        case 1:
            if(!(oldLead && !oldSingle && !oldTrail &&
                 newLead && !newSingle && !newTrail)){
                log_err("ERROR: 0x%02x is a lead byte but results in single: %c lead: %c trail: %c\n",
                        unit,
                        oldSingle ? 'y' : 'n',
                        oldLead ? 'y' : 'n',
                        oldTrail ? 'y' : 'n');
            }
            break;
        case 2:
            if(!(oldTrail && !oldSingle && !oldLead &&
                 newTrail && !newSingle && !newLead)){
                log_err("ERROR: 0x%02x is a trail byte but results in single: %c lead: %c trail: %c\n",
                        unit,
                        oldSingle ? 'y' : 'n',
                        oldLead ? 'y' : 'n',
                        oldTrail ? 'y' : 'n');
            }
            break;
        }
    }
}
/** Convert character vector to ASCII
 *
 * All charcodes > 127 are replaced with subst chars (0x1A).
 * Ill-formed UTF-8 byte sequences (for which U8_NEXT reports a negative
 * code point) are replaced with the substitute character as well, so
 * that no non-ASCII byte can leak into the result.
 *
 * @param str character vector
 * @return character vector
 *
 * @version 0.1 (Marek Gagolewski)
 * @version 0.2 (Marek Gagolewski, 2013-06-16) make StriException-friendly
 */
SEXP stri_enc_toascii(SEXP str)
{
   // The prepared argument may be a freshly allocated object; keep it
   // protected, since the allocations below may trigger the garbage collector.
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   R_len_t n = LENGTH(str);

   STRI__ERROR_HANDLER_BEGIN
   SEXP ret;
   PROTECT(ret = Rf_allocVector(STRSXP, n));

   for (R_len_t i=0; i<n; ++i) {
      SEXP curs = STRING_ELT(str, i);

      if (curs == NA_STRING) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }
      if (IS_ASCII(curs)) {
         // nothing to convert, reuse the input string
         SET_STRING_ELT(ret, i, curs);
         continue;
      }

      R_len_t curn = LENGTH(curs);
      const char* curs_tab = CHAR(curs);
      // TODO: buffer reuse....
      // At most one output byte is produced per input byte.
      String8 buf(curn+1);
      R_len_t k = 0;

      if (IS_UTF8(curs)) {
         UChar32 c;
         for (int j=0; j<curn; ) {
            U8_NEXT(curs_tab, j, curn, c);
            // c < 0 signals an ill-formed sequence; casting it to char
            // would emit a byte outside of the ASCII range
            if (c < 0 || c > ASCII_MAXCHARCODE)
               buf.data()[k++] = ASCII_SUBSTITUTE;
            else
               buf.data()[k++] = (char)c;
         }
      }
      else { // some 8-bit encoding
         for (R_len_t j=0; j<curn; ++j) {
            if (U8_IS_SINGLE(curs_tab[j]))
               buf.data()[k++] = curs_tab[j];
            else
               buf.data()[k++] = (char)ASCII_SUBSTITUTE; // subst char in ascii
         }
      }

      // will be marked as ASCII anyway by mkCharLenCE
      SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), k, CE_UTF8));
   }

   UNPROTECT(2); // str, ret
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/** Convert character vector to UTF-8
 *
 * @param str character vector
 * @param is_unknown_8bit single logical value;
 * if TRUE, then in case of ENC_NATIVE or ENC_LATIN1, UTF-8
 * REPLACEMENT CHARACTERs (U+FFFD) are
 * put for codes > 127
 * @return character vector
 *
 * @version 0.1 (Marek Gagolewski)
 * @version 0.2 (Marek Gagolewski, 2013-06-16) make StriException-friendly
 */
SEXP stri_enc_toutf8(SEXP str, SEXP is_unknown_8bit)
{
   // The prepared argument may be a freshly allocated object; keep it
   // protected, since the calls below may allocate and hence trigger
   // the garbage collector.
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   R_len_t n = LENGTH(str);
   bool is_unknown_8bit_logical = stri__prepare_arg_logical_1_notNA(is_unknown_8bit, "is_unknown_8bit");

   STRI__ERROR_HANDLER_BEGIN
   if (!is_unknown_8bit_logical) {
      // Trivial - everything we need is in StriContainerUTF8 :)
      StriContainerUTF8 str_cont(str, n);
      SEXP ret = str_cont.toR();
      UNPROTECT(1); // str
      return ret;
   }

   SEXP ret;
   PROTECT(ret = Rf_allocVector(STRSXP, n));
   for (R_len_t i=0; i<n; ++i) {
      SEXP curs = STRING_ELT(str, i);

      if (curs == NA_STRING) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }
      if (IS_ASCII(curs) || IS_UTF8(curs)) {
         // already in the target encoding, reuse the input string
         SET_STRING_ELT(ret, i, curs);
         continue;
      }

      // some 8-bit encoding
      R_len_t curn = LENGTH(curs);
      const char* curs_tab = CHAR(curs);
      // TODO: buffer reuse....
      String8 buf(curn*3+1); // one byte -> either one byte or FFFD, which is 3 bytes in UTF-8
      R_len_t k = 0;
      for (R_len_t j=0; j<curn; ++j) {
         if (U8_IS_SINGLE(curs_tab[j]))
            buf.data()[k++] = curs_tab[j];
         else { // 0xEF 0xBF 0xBD
            buf.data()[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE1;
            buf.data()[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE2;
            buf.data()[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE3;
         }
      }
      SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), k, CE_UTF8));
   }

   UNPROTECT(2); // str, ret
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/** Convert character vector to ASCII
 *
 * All charcodes > 127 are replaced with subst chars (0x1A).
 * A warning is emitted for each ill-formed UTF-8 sequence encountered
 * (it is replaced with the substitute character too).
 *
 * @param str character vector
 * @return character vector
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-30)
 *          use single common buf;
 *          warn on invalid utf8 byte stream
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *          Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_enc_toascii(SEXP str)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   R_len_t n = LENGTH(str);

   STRI__ERROR_HANDLER_BEGIN(1)

   // One scratch buffer serves all the strings: each input byte yields
   // at most one output byte, so the longest input determines its size.
   R_len_t maxlen = 0;
   for (R_len_t i=0; i<n; ++i) {
      SEXP elem = STRING_ELT(str, i);
      if (elem != NA_STRING) {
         R_len_t len = LENGTH(elem);
         if (len > maxlen) maxlen = len;
      }
   }
   String8buf buf(maxlen);
   char* out = buf.data();

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, n));

   for (R_len_t i=0; i<n; ++i) {
      SEXP elem = STRING_ELT(str, i);

      // missing values and ASCII strings are passed through untouched
      if (elem == NA_STRING || IS_ASCII(elem)) {
         SET_STRING_ELT(ret, i, elem);
         continue;
      }

      R_len_t len = LENGTH(elem);
      const char* in = CHAR(elem);
      R_len_t outlen = 0;

      if (IS_UTF8(elem)) {
         R_len_t pos = 0;
         UChar32 cp;
         while (pos < len) {
            U8_NEXT(in, pos, len, cp);
            if (cp >= 0 && cp <= ASCII_MAXCHARCODE) {
               out[outlen++] = (char)cp;
               continue;
            }
            // a negative code point denotes an ill-formed byte sequence
            if (cp < 0)
               Rf_warning(MSG__INVALID_CODE_POINT_FIXING);
            out[outlen++] = ASCII_SUBSTITUTE;
         }
      }
      else { // some 8-bit encoding
         for (R_len_t pos=0; pos<len; ++pos) {
            out[outlen++] = U8_IS_SINGLE(in[pos])
               ? in[pos]
               : (char)ASCII_SUBSTITUTE; // subst char in ascii
         }
      }

      // the string will be marked as ASCII anyway by mkCharLenCE
      SET_STRING_ELT(ret, i, Rf_mkCharLenCE(out, outlen, CE_UTF8));
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/** Convert character vector to UTF-8
 *
 * @param str character vector
 * @param is_unknown_8bit single logical value;
 * if TRUE, then in case of ENC_NATIVE or ENC_LATIN1, UTF-8
 * REPLACEMENT CHARACTERs (U+FFFD) are
 * put for codes > 127
 * @param validate single logical value (or NA);
 * FALSE - the result is not checked;
 * NA - strings with ill-formed UTF-8 sequences become NA (with a warning);
 * TRUE - ill-formed sequences are replaced with U+FFFD (with a warning)
 *
 * @return character vector
 *
 * @version 0.1-XX (Marek Gagolewski)
 *
 * @version 0.1-XX (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-26)
 *          Use one String8buf;
 *          is_unknown_8bit_logical and UTF-8 tries now to remove BOMs
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-30)
 *          added validate arg
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *          Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_enc_toutf8(SEXP str, SEXP is_unknown_8bit, SEXP validate)
{
   PROTECT(validate = stri_prepare_arg_logical_1(validate, "validate"));
   bool is_unknown_8bit_logical = stri__prepare_arg_logical_1_notNA(is_unknown_8bit, "is_unknown_8bit");
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   R_len_t n = LENGTH(str);

   STRI__ERROR_HANDLER_BEGIN(2)
   SEXP ret;

   if (is_unknown_8bit_logical) {
      // The scratch buffer is needed only for strings in an 8-bit
      // encoding; find the longest one of these.
      R_len_t maxlen = 0;
      for (R_len_t i=0; i<n; ++i) {
         SEXP elem = STRING_ELT(str, i);
         if (elem == NA_STRING || IS_ASCII(elem) || IS_UTF8(elem))
            continue;
         R_len_t len = LENGTH(elem);
         if (len > maxlen) maxlen = len;
      }
      String8buf buf(maxlen*3); // either 1 byte < 127 or U+FFFD == 3 bytes UTF-8
      char* out = buf.data();

      STRI__PROTECT(ret = Rf_allocVector(STRSXP, n));
      for (R_len_t i=0; i<n; ++i) {
         SEXP elem = STRING_ELT(str, i);

         if (elem == NA_STRING) {
            SET_STRING_ELT(ret, i, NA_STRING);
            continue;
         }

         R_len_t len = LENGTH(elem);
         const char* in = CHAR(elem);

         if (IS_ASCII(elem) || IS_UTF8(elem)) {
            bool has_bom = (len >= 3
               && (uint8_t)(in[0]) == UTF8_BOM_BYTE1
               && (uint8_t)(in[1]) == UTF8_BOM_BYTE2
               && (uint8_t)(in[2]) == UTF8_BOM_BYTE3);
            if (has_bom) // get rid of the BOM
               SET_STRING_ELT(ret, i, Rf_mkCharLenCE(in+3, len-3, CE_UTF8));
            else
               SET_STRING_ELT(ret, i, elem);
            continue;
         }

         // otherwise, we have an 8-bit encoding
         R_len_t outlen = 0;
         for (R_len_t pos=0; pos<len; ++pos) {
            if (U8_IS_SINGLE(in[pos])) {
               out[outlen++] = in[pos];
               continue;
            }
            // U+FFFD, i.e., 0xEF 0xBF 0xBD
            out[outlen++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE1;
            out[outlen++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE2;
            out[outlen++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE3;
         }
         SET_STRING_ELT(ret, i, Rf_mkCharLenCE(out, outlen, CE_UTF8));
      }
   }
   else {
      // Trivial - everything we need is in StriContainerUTF8 :)
      // which removes BOMs silently
      StriContainerUTF8 str_cont(str, n);
      STRI__PROTECT(ret = str_cont.toR());
   }

   // validate utf8 byte stream
   int validate_mode = LOGICAL(validate)[0];
   if (validate_mode != FALSE) { // NA or TRUE
      R_len_t ret_n = LENGTH(ret);
      for (R_len_t i=0; i<ret_n; ++i) {
         SEXP elem = STRING_ELT(ret, i);
         if (elem == NA_STRING) continue;

         const char* in = CHAR(elem);
         R_len_t len = LENGTH(elem);

         // scan until the first ill-formed sequence (negative code point)
         R_len_t pos = 0;
         UChar32 cp = 0;
         while (cp >= 0 && pos < len) {
            U8_NEXT(in, pos, len, cp);
         }
         if (cp >= 0) continue; // valid, nothing to do

         if (validate_mode == NA_LOGICAL) {
            Rf_warning(MSG__INVALID_CODE_POINT_REPLNA);
            SET_STRING_ELT(ret, i, NA_STRING);
            continue;
         }

         // validate is TRUE: rewrite the string, fixing what is broken
         int capacity = len*3; // maximum: 1 byte -> U+FFFD (3 bytes)
         String8buf fixbuf(capacity);
         char* fixed = fixbuf.data();

         pos = 0;
         R_len_t outlen = 0;
         UBool err = FALSE;
         while (!err && pos < len) {
            U8_NEXT(in, pos, len, cp);
            if (cp < 0) {
               Rf_warning(MSG__INVALID_CODE_POINT_FIXING);
               fixed[outlen++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE1;
               fixed[outlen++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE2;
               fixed[outlen++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE3;
            }
            else {
               U8_APPEND((uint8_t*)fixed, outlen, capacity, cp, err);
            }
         }

         if (err) throw StriException(MSG__INTERNAL_ERROR);
         SET_STRING_ELT(ret, i, Rf_mkCharLenCE(fixed, outlen, CE_UTF8));
      }
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}