wchar_t *filenameToWchar_wcc(const SEXP fn, const Rboolean expand){ static wchar_t filename[BSIZE + 1]; void *obj; const char *from = "", *inbuf; char *outbuf; size_t inb, outb, res; if(!strlen(CHAR(fn))){ wcscpy(filename, L""); return filename; } if(IS_LATIN1(fn)) from = "latin1"; if(IS_UTF8(fn)) from = "UTF-8"; if(IS_BYTES(fn)) REprintf("encoding of a filename cannot be 'bytes'"); obj = Riconv_open("UCS-2LE", from); if(obj == (void *)(-1)) REprintf("unsupported conversion from '%s' in shellexec_wcc.c", from); if(expand) inbuf = R_ExpandFileName(CHAR(fn)); else inbuf = CHAR(fn); inb = strlen(inbuf)+1; outb = 2*BSIZE; outbuf = (char *) filename; res = Riconv(obj, &inbuf , &inb, &outbuf, &outb); Riconv_close(obj); if(inb > 0) REprintf("file name conversion problem -- name too long?"); if(res == -1) REprintf("file name conversion problem"); return filename; } /* End of filenameToWchar_wcc(). */
int muste_get_R_string_vec(char *dest,char *sour,int length,int element) { SEXP enc; SEXP avar=R_NilValue; char *hakuapu,*hakubuf; int len; hakuapu=strchr(sour,'$')+1; if (hakuapu==NULL) hakuapu=sour; avar = findVar(install(hakuapu),muste_environment); // RS CHA R_GlobalEnv); if (!isString(avar)) // RS 29.8.2013 { *dest=EOS; return(0); } enc=STRING_ELT(avar,element); // RS 20.12.2012 Convert automatically to CP850 hakuapu=(char *)CHAR(enc); len=strlen(hakuapu); hakubuf=(char *)malloc(len+2); if (hakubuf==NULL) return(0); strcpy(hakubuf,hakuapu); if (IS_UTF8(enc)) { muste_iconv(hakubuf,"CP850","UTF-8"); } else muste_iconv(hakubuf,"CP850",""); snprintf(dest,length,"%s",hakubuf); free(hakubuf); return(1); }
/** Check R encoding marking *for testing only* * This function should not be exported * * @param s character vector * * Results are printed on STDERR * * @version 0.1 (Marek Gagolewski) */ SEXP stri_test_Rmark(SEXP s) { #ifndef NDEBUG s = stri_prepare_arg_string(s, "str"); int ns = LENGTH(s); for (int i=0; i < ns; ++i) { fprintf(stdout, "!NDEBUG: Element #%d:\n", i); SEXP curs = STRING_ELT(s, i); if (curs == NA_STRING){ fprintf(stdout, "!NDEBUG: \tNA\n"); continue; } //const char* string = CHAR(curs); fprintf(stdout, "!NDEBUG: \tMARK_ASCII = %d\n", (IS_ASCII(curs) > 0)); fprintf(stdout, "!NDEBUG: \tMARK_UTF8 = %d\n", (IS_UTF8(curs) > 0)); fprintf(stdout, "!NDEBUG: \tMARK_LATIN1= %d\n", (IS_LATIN1(curs) > 0)); fprintf(stdout, "!NDEBUG: \tMARK_BYTES = %d\n", (IS_BYTES(curs) > 0)); fprintf(stdout, "!NDEBUG: \n"); } return R_NilValue; #else Rf_error("This function is enabled only if NDEBUG is undef."); return s; // s here avoids compiler warning #endif }
cetype_t getCharCE(SEXP x) { if(TYPEOF(x) != CHARSXP) error(_("'%s' must be called on a CHARSXP"), "getCharCE"); if(IS_UTF8(x)) return CE_UTF8; else if(IS_LATIN1(x)) return CE_LATIN1; else if(IS_BYTES(x)) return CE_BYTES; else return CE_NATIVE; }
R_xlen_t get_first_reencode_pos(const CharacterVector& xc) { R_xlen_t len = xc.length(); for (R_xlen_t i = 0; i < len; ++i) { SEXP xci = xc[i]; if (xci != NA_STRING && !IS_ASCII(xci) && !IS_UTF8(xci)) { return i; } } return len; }
/* This may return a R_alloc-ed result, so the caller has to manage the R_alloc stack */ const char *translateCharUTF8(SEXP x) { void *obj; const char *inbuf, *ans = CHAR(x); char *outbuf, *p; size_t inb, outb, res; R_StringBuffer cbuff = {NULL, 0, MAXELTSIZE}; if(TYPEOF(x) != CHARSXP) error(_("'%s' must be called on a CHARSXP"), "translateCharUTF8"); if(x == NA_STRING) return ans; if(IS_UTF8(x)) return ans; if(IS_ASCII(x)) return ans; if(IS_BYTES(x)) error(_("translating strings with \"bytes\" encoding is not allowed")); obj = Riconv_open("UTF-8", IS_LATIN1(x) ? "latin1" : ""); if(obj == (void *)(-1)) #ifdef Win32 error(_("unsupported conversion from '%s' in codepage %d"), "latin1", localeCP); #else error(_("unsupported conversion from '%s' to '%s'"), "latin1", "UTF-8"); #endif R_AllocStringBuffer(0, &cbuff); top_of_loop: inbuf = ans; inb = strlen(inbuf); outbuf = cbuff.data; outb = cbuff.bufsize - 1; /* First initialize output */ Riconv (obj, NULL, NULL, &outbuf, &outb); next_char: /* Then convert input */ res = Riconv(obj, &inbuf , &inb, &outbuf, &outb); if(res == -1 && errno == E2BIG) { R_AllocStringBuffer(2*cbuff.bufsize, &cbuff); goto top_of_loop; } else if(res == -1 && (errno == EILSEQ || errno == EINVAL)) { if(outb < 5) { R_AllocStringBuffer(2*cbuff.bufsize, &cbuff); goto top_of_loop; } snprintf(outbuf, 5, "<%02x>", (unsigned char)*inbuf); outbuf += 4; outb -= 4; inbuf++; inb--; goto next_char; } *outbuf = '\0'; Riconv_close(obj); res = strlen(cbuff.data) + 1; p = R_alloc(res, 1); memcpy(p, cbuff.data, res); R_FreeStringBuffer(&cbuff); return p; }
/** Convert character vector to ASCII * * All charcodes > 127 are replaced with subst chars (0x1A) * * @param str character vector * @return character vector * * @version 0.1 (Marek Gagolewski) * @version 0.2 (Marek Gagolewski, 2013-06-16) make StriException-friendly */ SEXP stri_enc_toascii(SEXP str) { str = stri_prepare_arg_string(str, "str"); R_len_t n = LENGTH(str); STRI__ERROR_HANDLER_BEGIN SEXP ret; PROTECT(ret = Rf_allocVector(STRSXP, n)); for (R_len_t i=0; i<n; ++i) { SEXP curs = STRING_ELT(str, i); if (curs == NA_STRING) { SET_STRING_ELT(ret, i, NA_STRING); continue; } else if (IS_ASCII(curs)) { SET_STRING_ELT(ret, i, curs); } else if (IS_UTF8(curs)) { R_len_t curn = LENGTH(curs); const char* curs_tab = CHAR(curs); // TODO: buffer reuse.... String8 buf(curn+1); // this may be 4 times too much R_len_t k = 0; UChar32 c; for (int j=0; j<curn; ) { U8_NEXT(curs_tab, j, curn, c); if (c > ASCII_MAXCHARCODE) buf.data()[k++] = ASCII_SUBSTITUTE; else buf.data()[k++] = (char)c; } SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), k, CE_UTF8)); // will be marked as ASCII anyway by mkCharLenCE } else { // some 8-bit encoding R_len_t curn = LENGTH(curs); const char* curs_tab = CHAR(curs); // TODO: buffer reuse.... String8 buf(curn+1); R_len_t k = 0; for (R_len_t j=0; j<curn; ++j) { if (U8_IS_SINGLE(curs_tab[j])) buf.data()[k++] = curs_tab[j]; else { buf.data()[k++] = (char)ASCII_SUBSTITUTE; // subst char in ascii } } SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), k, CE_UTF8)); // will be marked as ASCII anyway by mkCharLenCE } } UNPROTECT(1); return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) }
/** Convert character vector to UTF-8 * * @param str character vector * @param is_unknown_8bit single logical value; * if TRUE, then in case of ENC_NATIVE or ENC_LATIN1, UTF-8 * REPLACEMENT CHARACTERs (U+FFFD) are * put for codes > 127 * @return character vector * * @version 0.1 (Marek Gagolewski) * @version 0.2 (Marek Gagolewski, 2013-06-16) make StriException-friendly */ SEXP stri_enc_toutf8(SEXP str, SEXP is_unknown_8bit) { str = stri_prepare_arg_string(str, "str"); R_len_t n = LENGTH(str); bool is_unknown_8bit_logical = stri__prepare_arg_logical_1_notNA(is_unknown_8bit, "is_unknown_8bit"); STRI__ERROR_HANDLER_BEGIN if (is_unknown_8bit_logical) { SEXP ret; PROTECT(ret = Rf_allocVector(STRSXP, n)); for (R_len_t i=0; i<n; ++i) { SEXP curs = STRING_ELT(str, i); if (curs == NA_STRING) { SET_STRING_ELT(ret, i, NA_STRING); continue; } else if (IS_ASCII(curs) || IS_UTF8(curs)) { SET_STRING_ELT(ret, i, curs); } else { // some 8-bit encoding R_len_t curn = LENGTH(curs); const char* curs_tab = CHAR(curs); // TODO: buffer reuse.... String8 buf(curn*3+1); // one byte -> either one byte or FFFD, which is 3 bytes in UTF-8 R_len_t k = 0; for (R_len_t j=0; j<curn; ++j) { if (U8_IS_SINGLE(curs_tab[j])) buf.data()[k++] = curs_tab[j]; else { // 0xEF 0xBF 0xBD buf.data()[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE1; buf.data()[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE2; buf.data()[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE3; } } SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), k, CE_UTF8)); } } UNPROTECT(1); return ret; } else { // Trivial - everything we need is in StriContainerUTF8 :) StriContainerUTF8 str_cont(str, n); return str_cont.toR(); } STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) }
CharacterVector reencode_char(SEXP x) { if (Rf_isFactor(x)) return reencode_factor(x); CharacterVector xc(x); R_xlen_t first = get_first_reencode_pos(xc); if (first >= xc.length()) return x; CharacterVector ret(Rf_duplicate(xc)); R_xlen_t len = ret.length(); for (R_xlen_t i = first; i < len; ++i) { SEXP reti = ret[i]; if (reti != NA_STRING && !IS_ASCII(reti) && !IS_UTF8(reti)) { ret[i] = String(Rf_translateCharUTF8(reti), CE_UTF8); } } return ret; }
/** Get Declared Encodings of Each String * * @param str a character vector or an object coercible to * @return a character vector * * @version 0.2-1 (Marek Gagolewski, 2014-03-25) * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_enc_mark(SEXP str) { PROTECT(str = stri_prepare_arg_string(str, "str")); // prepare string argument STRI__ERROR_HANDLER_BEGIN(1) R_len_t str_len = LENGTH(str); // some of them will not be used in this call, but we're lazy SEXP mark_ascii, mark_latin1, mark_utf8, mark_native, mark_bytes; STRI__PROTECT(mark_ascii = Rf_mkChar("ASCII")); STRI__PROTECT(mark_latin1 = Rf_mkChar("latin1")); STRI__PROTECT(mark_utf8 = Rf_mkChar("UTF-8")); STRI__PROTECT(mark_native = Rf_mkChar("native")); STRI__PROTECT(mark_bytes = Rf_mkChar("bytes")); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, str_len)); for (R_len_t i=0; i<str_len; ++i) { SEXP curs = STRING_ELT(str, i); if (curs == NA_STRING) { SET_STRING_ELT(ret, i, NA_STRING); continue; } if (IS_ASCII(curs)) SET_STRING_ELT(ret, i, mark_ascii); else if (IS_UTF8(curs)) SET_STRING_ELT(ret, i, mark_utf8); else if (IS_BYTES(curs)) SET_STRING_ELT(ret, i, mark_bytes); else if (IS_LATIN1(curs)) SET_STRING_ELT(ret, i, mark_latin1); else SET_STRING_ELT(ret, i, mark_native); } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) }
SEXP attribute_hidden do_nchar(SEXP call, SEXP op, SEXP args, SEXP env) { SEXP d, s, x, stype; int i, len, allowNA; size_t ntype; int nc; const char *type; const char *xi; wchar_t *wc; const void *vmax; checkArity(op, args); if (isFactor(CAR(args))) error(_("'%s' requires a character vector"), "nchar()"); PROTECT(x = coerceVector(CAR(args), STRSXP)); if (!isString(x)) error(_("'%s' requires a character vector"), "nchar()"); len = LENGTH(x); stype = CADR(args); if (!isString(stype) || LENGTH(stype) != 1) error(_("invalid '%s' argument"), "type"); type = CHAR(STRING_ELT(stype, 0)); /* always ASCII */ ntype = strlen(type); if (ntype == 0) error(_("invalid '%s' argument"), "type"); allowNA = asLogical(CADDR(args)); if (allowNA == NA_LOGICAL) allowNA = 0; PROTECT(s = allocVector(INTSXP, len)); vmax = vmaxget(); for (i = 0; i < len; i++) { SEXP sxi = STRING_ELT(x, i); if (sxi == NA_STRING) { INTEGER(s)[i] = 2; continue; } if (strncmp(type, "bytes", ntype) == 0) { INTEGER(s)[i] = LENGTH(sxi); } else if (strncmp(type, "chars", ntype) == 0) { if (IS_UTF8(sxi)) { /* assume this is valid */ const char *p = CHAR(sxi); nc = 0; for( ; *p; p += utf8clen(*p)) nc++; INTEGER(s)[i] = nc; } else if (IS_BYTES(sxi)) { if (!allowNA) /* could do chars 0 */ error(_("number of characters is not computable for element %d in \"bytes\" encoding"), i+1); INTEGER(s)[i] = NA_INTEGER; } else if (mbcslocale) { nc = mbstowcs(NULL, translateChar(sxi), 0); if (!allowNA && nc < 0) error(_("invalid multibyte string %d"), i+1); INTEGER(s)[i] = nc >= 0 ? nc : NA_INTEGER; } else INTEGER(s)[i] = strlen(translateChar(sxi)); } else if (strncmp(type, "width", ntype) == 0) { if (IS_UTF8(sxi)) { /* assume this is valid */ const char *p = CHAR(sxi); wchar_t wc1; nc = 0; for( ; *p; p += utf8clen(*p)) { utf8toucs(&wc1, p); nc += Ri18n_wcwidth(wc1); } INTEGER(s)[i] = nc; } else if (IS_BYTES(sxi)) { if (!allowNA) /* could do width 0 */ error(_("width is not computable for element %d in \"bytes\" encoding"), i+1); INTEGER(s)[i] = NA_INTEGER; } else if (mbcslocale) { xi = translateChar(sxi); nc = mbstowcs(NULL, xi, 0); if (nc >= 0) { wc = (wchar_t *) R_AllocStringBuffer((nc+1)*sizeof(wchar_t), &cbuff); mbstowcs(wc, xi, nc + 1); INTEGER(s)[i] = Ri18n_wcswidth(wc, 2147483647); if (INTEGER(s)[i] < 1) INTEGER(s)[i] = nc; } else if (allowNA) error(_("invalid multibyte string %d"), i+1); else INTEGER(s)[i] = NA_INTEGER; } else INTEGER(s)[i] = strlen(translateChar(sxi)); } else error(_("invalid '%s' argument"), "type"); vmaxset(vmax); } R_FreeStringBufferL(&cbuff); if ((d = getAttrib(x, R_NamesSymbol)) != R_NilValue) setAttrib(s, R_NamesSymbol, d); if ((d = getAttrib(x, R_DimSymbol)) != R_NilValue) setAttrib(s, R_DimSymbol, d); if ((d = getAttrib(x, R_DimNamesSymbol)) != R_NilValue) setAttrib(s, R_DimNamesSymbol, d); UNPROTECT(2); return s; }
/* Note that NA_STRING is not handled separately here. This is deliberate -- see ?paste -- and implicitly coerces it to "NA" */ SEXP attribute_hidden do_paste(SEXP call, SEXP op, SEXP args, SEXP env) { SEXP ans, collapse, sep, x; int sepw, u_sepw, ienc; R_xlen_t i, j, k, maxlen, nx, pwidth; const char *s, *cbuf, *csep=NULL, *u_csep=NULL; char *buf; Rboolean allKnown, anyKnown, use_UTF8, use_Bytes, sepASCII = TRUE, sepUTF8 = FALSE, sepBytes = FALSE, sepKnown = FALSE, use_sep = (PRIMVAL(op) == 0); const void *vmax; checkArity(op, args); /* We use formatting and so we must initialize printing. */ PrintDefaults(); /* Check the arguments */ x = CAR(args); if (!isVectorList(x)) error(_("invalid first argument")); nx = xlength(x); if(use_sep) { /* paste(..., sep, .) */ sep = CADR(args); if (!isString(sep) || LENGTH(sep) <= 0 || STRING_ELT(sep, 0) == NA_STRING) error(_("invalid separator")); sep = STRING_ELT(sep, 0); csep = translateChar(sep); u_sepw = sepw = (int) strlen(csep); // will be short sepASCII = strIsASCII(csep); sepKnown = ENC_KNOWN(sep) > 0; sepUTF8 = IS_UTF8(sep); sepBytes = IS_BYTES(sep); collapse = CADDR(args); } else { /* paste0(..., .) */ u_sepw = sepw = 0; sep = R_NilValue;/* -Wall */ collapse = CADR(args); } if (!isNull(collapse)) if(!isString(collapse) || LENGTH(collapse) <= 0 || STRING_ELT(collapse, 0) == NA_STRING) error(_("invalid '%s' argument"), "collapse"); if(nx == 0) return (!isNull(collapse)) ? mkString("") : allocVector(STRSXP, 0); /* Maximum argument length, coerce if needed */ maxlen = 0; for (j = 0; j < nx; j++) { if (!isString(VECTOR_ELT(x, j))) { /* formerly in R code: moved to C for speed */ SEXP call, xj = VECTOR_ELT(x, j); if(OBJECT(xj)) { /* method dispatch */ PROTECT(call = lang2(install("as.character"), xj)); SET_VECTOR_ELT(x, j, eval(call, env)); UNPROTECT(1); } else if (isSymbol(xj)) SET_VECTOR_ELT(x, j, ScalarString(PRINTNAME(xj))); else SET_VECTOR_ELT(x, j, coerceVector(xj, STRSXP)); if (!isString(VECTOR_ELT(x, j))) error(_("non-string argument to internal 'paste'")); } if(xlength(VECTOR_ELT(x, j)) > maxlen) maxlen = xlength(VECTOR_ELT(x, j)); } if(maxlen == 0) return (!isNull(collapse)) ? mkString("") : allocVector(STRSXP, 0); PROTECT(ans = allocVector(STRSXP, maxlen)); for (i = 0; i < maxlen; i++) { /* Strategy for marking the encoding: if all inputs (including * the separator) are ASCII, so is the output and we don't * need to mark. Otherwise if all non-ASCII inputs are of * declared encoding, we should mark. * Need to be careful only to include separator if it is used. */ anyKnown = FALSE; allKnown = TRUE; use_UTF8 = FALSE; use_Bytes = FALSE; if(nx > 1) { allKnown = sepKnown || sepASCII; anyKnown = sepKnown; use_UTF8 = sepUTF8; use_Bytes = sepBytes; } pwidth = 0; for (j = 0; j < nx; j++) { k = xlength(VECTOR_ELT(x, j)); if (k > 0) { SEXP cs = STRING_ELT(VECTOR_ELT(x, j), i % k); if(IS_UTF8(cs)) use_UTF8 = TRUE; if(IS_BYTES(cs)) use_Bytes = TRUE; } } if (use_Bytes) use_UTF8 = FALSE; vmax = vmaxget(); for (j = 0; j < nx; j++) { k = xlength(VECTOR_ELT(x, j)); if (k > 0) { if(use_Bytes) pwidth += strlen(CHAR(STRING_ELT(VECTOR_ELT(x, j), i % k))); else if(use_UTF8) pwidth += strlen(translateCharUTF8(STRING_ELT(VECTOR_ELT(x, j), i % k))); else pwidth += strlen(translateChar(STRING_ELT(VECTOR_ELT(x, j), i % k))); vmaxset(vmax); } } if(use_sep) { if (use_UTF8 && !u_csep) { u_csep = translateCharUTF8(sep); u_sepw = (int) strlen(u_csep); // will be short } pwidth += (nx - 1) * (use_UTF8 ? u_sepw : sepw); } if (pwidth > INT_MAX) error(_("result would exceed 2^31-1 bytes")); cbuf = buf = R_AllocStringBuffer(pwidth, &cbuff); vmax = vmaxget(); for (j = 0; j < nx; j++) { k = xlength(VECTOR_ELT(x, j)); if (k > 0) { SEXP cs = STRING_ELT(VECTOR_ELT(x, j), i % k); if (use_UTF8) { s = translateCharUTF8(cs); strcpy(buf, s); buf += strlen(s); } else { s = use_Bytes ? CHAR(cs) : translateChar(cs); strcpy(buf, s); buf += strlen(s); allKnown = allKnown && (strIsASCII(s) || (ENC_KNOWN(cs)> 0)); anyKnown = anyKnown || (ENC_KNOWN(cs)> 0); } } if (sepw != 0 && j != nx - 1) { if (use_UTF8) { strcpy(buf, u_csep); buf += u_sepw; } else { strcpy(buf, csep); buf += sepw; } } vmax = vmaxget(); } ienc = 0; if(use_UTF8) ienc = CE_UTF8; else if(use_Bytes) ienc = CE_BYTES; else if(anyKnown && allKnown) { if(known_to_be_latin1) ienc = CE_LATIN1; if(known_to_be_utf8) ienc = CE_UTF8; } SET_STRING_ELT(ans, i, mkCharCE(cbuf, ienc)); } /* Now collapse, if required. */ if(collapse != R_NilValue && (nx = XLENGTH(ans)) > 0) { sep = STRING_ELT(collapse, 0); use_UTF8 = IS_UTF8(sep); use_Bytes = IS_BYTES(sep); for (i = 0; i < nx; i++) { if(IS_UTF8(STRING_ELT(ans, i))) use_UTF8 = TRUE; if(IS_BYTES(STRING_ELT(ans, i))) use_Bytes = TRUE; } if(use_Bytes) { csep = CHAR(sep); use_UTF8 = FALSE; } else if(use_UTF8) csep = translateCharUTF8(sep); else csep = translateChar(sep); sepw = (int) strlen(csep); anyKnown = ENC_KNOWN(sep) > 0; allKnown = anyKnown || strIsASCII(csep); pwidth = 0; vmax = vmaxget(); for (i = 0; i < nx; i++) if(use_UTF8) { pwidth += strlen(translateCharUTF8(STRING_ELT(ans, i))); vmaxset(vmax); } else /* already translated */ pwidth += strlen(CHAR(STRING_ELT(ans, i))); pwidth += (nx - 1) * sepw; if (pwidth > INT_MAX) error(_("result would exceed 2^31-1 bytes")); cbuf = buf = R_AllocStringBuffer(pwidth, &cbuff); vmax = vmaxget(); for (i = 0; i < nx; i++) { if(i > 0) { strcpy(buf, csep); buf += sepw; } if(use_UTF8) s = translateCharUTF8(STRING_ELT(ans, i)); else /* already translated */ s = CHAR(STRING_ELT(ans, i)); strcpy(buf, s); while (*buf) buf++; allKnown = allKnown && (strIsASCII(s) || (ENC_KNOWN(STRING_ELT(ans, i)) > 0)); anyKnown = anyKnown || (ENC_KNOWN(STRING_ELT(ans, i)) > 0); if(use_UTF8) vmaxset(vmax); } UNPROTECT(1); ienc = CE_NATIVE; if(use_UTF8) ienc = CE_UTF8; else if(use_Bytes) ienc = CE_BYTES; else if(anyKnown && allKnown) { if(known_to_be_latin1) ienc = CE_LATIN1; if(known_to_be_utf8) ienc = CE_UTF8; } PROTECT(ans = allocVector(STRSXP, 1)); SET_STRING_ELT(ans, 0, mkCharCE(cbuf, ienc)); } R_FreeStringBufferL(&cbuff); UNPROTECT(1); return ans; }
/** Convert character vector to UTF-8 * * @param str character vector * @param is_unknown_8bit single logical value; * if TRUE, then in case of ENC_NATIVE or ENC_LATIN1, UTF-8 * REPLACEMENT CHARACTERs (U+FFFD) are * put for codes > 127 * @param validate single logical value (or NA) * * @return character vector * * @version 0.1-XX (Marek Gagolewski) * * @version 0.1-XX (Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 0.2-1 (Marek Gagolewski, 2014-03-26) * Use one String8buf; * is_unknown_8bit_logical and UTF-8 tries now to remove BOMs * * @version 0.2-1 (Marek Gagolewksi, 2014-03-30) * added validate arg * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_enc_toutf8(SEXP str, SEXP is_unknown_8bit, SEXP validate) { PROTECT(validate = stri_prepare_arg_logical_1(validate, "validate")); bool is_unknown_8bit_logical = stri__prepare_arg_logical_1_notNA(is_unknown_8bit, "is_unknown_8bit"); PROTECT(str = stri_prepare_arg_string(str, "str")); R_len_t n = LENGTH(str); STRI__ERROR_HANDLER_BEGIN(2) SEXP ret; if (!is_unknown_8bit_logical) { // Trivial - everything we need is in StriContainerUTF8 :) // which removes BOMs silently StriContainerUTF8 str_cont(str, n); STRI__PROTECT(ret = str_cont.toR()); } else { // get buf size R_len_t bufsize = 0; for (R_len_t i=0; i<n; ++i) { SEXP curs = STRING_ELT(str, i); if (curs == NA_STRING || IS_ASCII(curs) || IS_UTF8(curs)) continue; R_len_t ni = LENGTH(curs); if (ni > bufsize) bufsize = ni; } String8buf buf(bufsize*3); // either 1 byte < 127 or U+FFFD == 3 bytes UTF-8 char* bufdata = buf.data(); STRI__PROTECT(ret = Rf_allocVector(STRSXP, n)); for (R_len_t i=0; i<n; ++i) { SEXP curs = STRING_ELT(str, i); if (curs == NA_STRING) { SET_STRING_ELT(ret, i, NA_STRING); continue; } if (IS_ASCII(curs) || IS_UTF8(curs)) { R_len_t curs_n = LENGTH(curs); const char* curs_s = CHAR(curs); if (curs_n >= 3 && (uint8_t)(curs_s[0]) == UTF8_BOM_BYTE1 && (uint8_t)(curs_s[1]) == UTF8_BOM_BYTE2 && (uint8_t)(curs_s[2]) == UTF8_BOM_BYTE3) { // has BOM - get rid of it SET_STRING_ELT(ret, i, Rf_mkCharLenCE(curs_s+3, curs_n-3, CE_UTF8)); } else SET_STRING_ELT(ret, i, curs); continue; } // otherwise, we have an 8-bit encoding R_len_t curn = LENGTH(curs); const char* curs_tab = CHAR(curs); R_len_t k = 0; for (R_len_t j=0; j<curn; ++j) { if (U8_IS_SINGLE(curs_tab[j])) bufdata[k++] = curs_tab[j]; else { // 0xEF 0xBF 0xBD bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE1; bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE2; bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE3; } } SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, k, CE_UTF8)); } } // validate utf8 byte stream if (LOGICAL(validate)[0] != FALSE) { // NA or TRUE R_len_t ret_n = LENGTH(ret); for (R_len_t i=0; i<ret_n; ++i) { SEXP curs = STRING_ELT(ret, i); if (curs == NA_STRING) continue; const char* s = CHAR(curs); R_len_t sn = LENGTH(curs); R_len_t j = 0; UChar32 c = 0; while (c >= 0 && j < sn) { U8_NEXT(s, j, sn, c); } if (c >= 0) continue; // valid, nothing to do if (LOGICAL(validate)[0] == NA_LOGICAL) { Rf_warning(MSG__INVALID_CODE_POINT_REPLNA); SET_STRING_ELT(ret, i, NA_STRING); } else { int bufsize = sn*3; // maximum: 1 byte -> U+FFFD (3 bytes) String8buf buf(bufsize); // maximum: 1 byte -> U+FFFD (3 bytes) char* bufdata = buf.data(); j = 0; R_len_t k = 0; UBool err = FALSE; while (!err && j < sn) { U8_NEXT(s, j, sn, c); if (c >= 0) { U8_APPEND((uint8_t*)bufdata, k, bufsize, c, err); } else { Rf_warning(MSG__INVALID_CODE_POINT_FIXING); bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE1; bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE2; bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE3; } } if (err) throw StriException(MSG__INTERNAL_ERROR); SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, k, CE_UTF8)); } } } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) }
int R_nchar(SEXP string, nchar_type type_, Rboolean allowNA, Rboolean keepNA, const char* msg_name) { if (string == NA_STRING) return keepNA ? NA_INTEGER : 2; // else : switch(type_) { case Bytes: return LENGTH(string); break; case Chars: if (IS_UTF8(string)) { const char *p = CHAR(string); if (!utf8Valid(p)) { if (!allowNA) error(_("invalid multibyte string, %s"), msg_name); return NA_INTEGER; } else { int nc = 0; for( ; *p; p += utf8clen(*p)) nc++; return nc; } } else if (IS_BYTES(string)) { if (!allowNA) /* could do chars 0 */ error(_("number of characters is not computable in \"bytes\" encoding, %s"), msg_name); return NA_INTEGER; } else if (mbcslocale) { int nc = (int) mbstowcs(NULL, translateChar(string), 0); if (!allowNA && nc < 0) error(_("invalid multibyte string, %s"), msg_name); return (nc >= 0 ? nc : NA_INTEGER); } else return ((int) strlen(translateChar(string))); break; case Width: if (IS_UTF8(string)) { const char *p = CHAR(string); if (!utf8Valid(p)) { if (!allowNA) error(_("invalid multibyte string, %s"), msg_name); return NA_INTEGER; } else { wchar_t wc1; int nc = 0; for( ; *p; p += utf8clen(*p)) { utf8toucs(&wc1, p); nc += Ri18n_wcwidth(wc1); } return nc; } } else if (IS_BYTES(string)) { if (!allowNA) /* could do width 0 */ error(_("width is not computable for %s in \"bytes\" encoding"), msg_name); return NA_INTEGER; } else if (mbcslocale) { const char *xi = translateChar(string); int nc = (int) mbstowcs(NULL, xi, 0); if (nc >= 0) { const void *vmax = vmaxget(); wchar_t *wc = (wchar_t *) R_AllocStringBuffer((nc+1)*sizeof(wchar_t), &cbuff); mbstowcs(wc, xi, nc + 1); int nci18n = Ri18n_wcswidth(wc, 2147483647); vmaxset(vmax); return (nci18n < 1) ? nc : nci18n; } else if (allowNA) error(_("invalid multibyte string, %s"), msg_name); else return NA_INTEGER; } else return (int) strlen(translateChar(string)); } // switch return NA_INTEGER; // -Wall } // R_nchar()
SEXP installTrChar(SEXP x) { void * obj; const char *inbuf, *ans = CHAR(x); char *outbuf; size_t inb, outb, res; cetype_t ienc = getCharCE(x); R_StringBuffer cbuff = {NULL, 0, MAXELTSIZE}; if(TYPEOF(x) != CHARSXP) error(_("'%s' must be called on a CHARSXP"), "installTrChar"); if(x == NA_STRING || !(ENC_KNOWN(x))) return install(ans); if(IS_BYTES(x)) error(_("translating strings with \"bytes\" encoding is not allowed")); if(utf8locale && IS_UTF8(x)) return install(ans); if(latin1locale && IS_LATIN1(x)) return install(ans); if(IS_ASCII(x)) return install(ans); if(IS_LATIN1(x)) { if(!latin1_obj) { obj = Riconv_open("", "latin1"); /* should never happen */ if(obj == (void *)(-1)) #ifdef Win32 error(_("unsupported conversion from '%s' in codepage %d"), "latin1", localeCP); #else error(_("unsupported conversion from '%s' to '%s'"), "latin1", ""); #endif latin1_obj = obj; } obj = latin1_obj; } else { if(!utf8_obj) { obj = Riconv_open("", "UTF-8"); /* should never happen */ if(obj == (void *)(-1)) #ifdef Win32 error(_("unsupported conversion from '%s' in codepage %d"), "latin1", localeCP); #else error(_("unsupported conversion from '%s' to '%s'"), "latin1", ""); #endif utf8_obj = obj; } obj = utf8_obj; } R_AllocStringBuffer(0, &cbuff); top_of_loop: inbuf = ans; inb = strlen(inbuf); outbuf = cbuff.data; outb = cbuff.bufsize - 1; /* First initialize output */ Riconv (obj, NULL, NULL, &outbuf, &outb); next_char: /* Then convert input */ res = Riconv(obj, &inbuf , &inb, &outbuf, &outb); if(res == -1 && errno == E2BIG) { R_AllocStringBuffer(2*cbuff.bufsize, &cbuff); goto top_of_loop; } else if(res == -1 && (errno == EILSEQ || errno == EINVAL)) { if(outb < 13) { R_AllocStringBuffer(2*cbuff.bufsize, &cbuff); goto top_of_loop; } if (ienc == CE_UTF8) { /* if starting in UTF-8, use \uxxxx */ /* This must be the first byte */ size_t clen; wchar_t wc; clen = utf8toucs(&wc, inbuf); if(clen > 0 && inb >= clen) { inbuf += clen; inb -= clen; # ifndef Win32 if((unsigned int) wc < 65536) { # endif snprintf(outbuf, 9, "<U+%04X>", (unsigned int) wc); outbuf += 8; outb -= 8; # ifndef Win32 } else { snprintf(outbuf, 13, "<U+%08X>", (unsigned int) wc); outbuf += 12; outb -= 12; } # endif } else { snprintf(outbuf, 5, "<%02x>", (unsigned char)*inbuf); outbuf += 4; outb -= 4; inbuf++; inb--; } } else { snprintf(outbuf, 5, "<%02x>", (unsigned char)*inbuf); outbuf += 4; outb -= 4; inbuf++; inb--; } goto next_char; } *outbuf = '\0'; SEXP Sans = install(cbuff.data); R_FreeStringBuffer(&cbuff); return Sans; }
/* This may return a R_alloc-ed result, so the caller has to manage the R_alloc stack */ attribute_hidden /* but not hidden on Windows, where it was used in tcltk.c */ const wchar_t *wtransChar(SEXP x) { void * obj; const char *inbuf, *ans = CHAR(x); char *outbuf; wchar_t *p; size_t inb, outb, res, top; Rboolean knownEnc = FALSE; R_StringBuffer cbuff = {NULL, 0, MAXELTSIZE}; if(TYPEOF(x) != CHARSXP) error(_("'%s' must be called on a CHARSXP"), "wtransChar"); if(IS_BYTES(x)) error(_("translating strings with \"bytes\" encoding is not allowed")); if(IS_LATIN1(x)) { if(!latin1_wobj) { obj = Riconv_open(TO_WCHAR, "latin1"); if(obj == (void *)(-1)) error(_("unsupported conversion from '%s' to '%s'"), "latin1", TO_WCHAR); latin1_wobj = obj; } else obj = latin1_wobj; knownEnc = TRUE; } else if(IS_UTF8(x)) { if(!utf8_wobj) { obj = Riconv_open(TO_WCHAR, "UTF-8"); if(obj == (void *)(-1)) error(_("unsupported conversion from '%s' to '%s'"), "latin1", TO_WCHAR); utf8_wobj = obj; } else obj = utf8_wobj; knownEnc = TRUE; } else { obj = Riconv_open(TO_WCHAR, ""); if(obj == (void *)(-1)) #ifdef Win32 error(_("unsupported conversion to '%s' from codepage %d"), TO_WCHAR, localeCP); #else error(_("unsupported conversion from '%s' to '%s'"), "", TO_WCHAR); #endif } R_AllocStringBuffer(0, &cbuff); top_of_loop: inbuf = ans; inb = strlen(inbuf); outbuf = cbuff.data; top = outb = cbuff.bufsize - 1; /* First initialize output */ Riconv (obj, NULL, NULL, &outbuf, &outb); next_char: /* Then convert input */ res = Riconv(obj, &inbuf , &inb, &outbuf, &outb); if(res == -1 && errno == E2BIG) { R_AllocStringBuffer(2*cbuff.bufsize, &cbuff); goto top_of_loop; } else if(res == -1 && (errno == EILSEQ || errno == EINVAL)) { if(outb < 5) { R_AllocStringBuffer(2*cbuff.bufsize, &cbuff); goto top_of_loop; } snprintf(outbuf, 5, "<%02x>", (unsigned char)*inbuf); outbuf += 4; outb -= 4; inbuf++; inb--; goto next_char; /* if(!knownEnc) Riconv_close(obj); error(_("invalid input in wtransChar")); */ } if(!knownEnc) Riconv_close(obj); res = (top - outb); /* terminator is 2 or 4 null bytes */ p = (wchar_t *) R_alloc(res+4, 1); memset(p, 0, res+4); memcpy(p, cbuff.data, res); R_FreeStringBuffer(&cbuff); return p; }
// If we find a non-ASCII, non-NA, non-UTF8 encoding, we try to convert it to UTF8. That is, marked non-ascii/non-UTF8 encodings will always be checked in UTF8 locale. This seems to be the best fix I could think of to put the encoding issues to rest.. // Since the if-statement will fail with the first condition check in "normal" ASCII cases, there shouldn't be huge penalty issues for default setup. // Fix for #66, #69, #469 and #1293 // TODO: compare 1.9.6 performance with 1.9.7 with huge number of ASCII strings. SEXP ENC2UTF8(SEXP s) { if (!IS_ASCII(s) && s != NA_STRING && !IS_UTF8(s)) s = mkCharCE(translateCharUTF8(s), CE_UTF8); return (s); }
/** * Count the number of characters in a string * * Note that ICU permits only strings of length < 2^31. * @param s R character vector * @return integer vector * * @version 0.1-?? (Marcin Bujarski) * * @version 0.1-?? (Marek Gagolewski) * Multiple input encoding support * * @version 0.1-?? (Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 0.2-1 (Marek Gagolewski, 2014-03-27) * using StriUcnv; * warn on invalid utf-8 sequences * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_length(SEXP str) { PROTECT(str = stri_prepare_arg_string(str, "str")); STRI__ERROR_HANDLER_BEGIN(1) R_len_t str_n = LENGTH(str); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(INTSXP, str_n)); int* retint = INTEGER(ret); StriUcnv ucnvNative(NULL); for (R_len_t k = 0; k < str_n; k++) { SEXP curs = STRING_ELT(str, k); if (curs == NA_STRING) { retint[k] = NA_INTEGER; continue; } R_len_t curs_n = LENGTH(curs); // O(1) - stored by R if (IS_ASCII(curs) || IS_LATIN1(curs)) { retint[k] = curs_n; } else if (IS_BYTES(curs)) { throw StriException(MSG__BYTESENC); } else if (IS_UTF8(curs) || ucnvNative.isUTF8()) { // utf8 or native-utf8 UChar32 c = 0; const char* curs_s = CHAR(curs); R_len_t j = 0; R_len_t i = 0; while (c >= 0 && j < curs_n) { U8_NEXT(curs_s, j, curs_n, c); // faster that U8_FWD_1 & gives bad UChar32s i++; } if (c < 0) { // invalid utf-8 sequence Rf_warning(MSG__INVALID_UTF8); retint[k] = NA_INTEGER; } else retint[k] = i; } else if (ucnvNative.is8bit()) { // native-8bit retint[k] = curs_n; } else { // native encoding, not 8 bit UConverter* uconv = ucnvNative.getConverter(); // native encoding which is neither 8-bit, nor UTF-8 (e.g. 'Big5') // this is weird, but we'll face it UErrorCode status = U_ZERO_ERROR; const char* source = CHAR(curs); const char* sourceLimit = source + curs_n; R_len_t j; for (j = 0; source != sourceLimit; j++) { /*ignore_retval=*/ucnv_getNextUChar(uconv, &source, sourceLimit, &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } retint[k] = j; // all right, we got it! } } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({ /* no special action on error */ }) }
/** * Construct String Container from an R character vector * * @param rstr R character vector * @param nrecycle extend length [vectorization] * @param shallowrecycle will \code{this->str} be ever modified? */ StriContainerUTF16::StriContainerUTF16(SEXP rstr, R_len_t _nrecycle, bool _shallowrecycle) { this->str = NULL; #ifndef NDEBUG if (!isString(rstr)) throw StriException("DEBUG: !isString in StriContainerUTF16::StriContainerUTF16(SEXP rstr)"); #endif R_len_t nrstr = LENGTH(rstr); this->init_Base(nrstr, _nrecycle, _shallowrecycle); // calling LENGTH(rstr) fails on constructor call if (this->n == 0) return; /* nothing more to do */ this->str = new UnicodeString[this->n]; if (!this->str) throw StriException(MSG__MEM_ALLOC_ERROR); for (R_len_t i=0; i<this->n; ++i) this->str[i].setToBogus(); // in case it fails during conversion (this is NA) /* Important: ICU provides full internationalization functionality without any conversion table data. The common library contains code to handle several important encodings algorithmically: US-ASCII, ISO-8859-1, UTF-7/8/16/32, SCSU, BOCU-1, CESU-8, and IMAP-mailbox-name */ StriUcnv ucnvASCII("US-ASCII"); StriUcnv ucnvLatin1("ISO-8859-1"); StriUcnv ucnvNative(NULL); for (R_len_t i=0; i<nrstr; ++i) { SEXP curs = STRING_ELT(rstr, i); if (curs == NA_STRING) { continue; // keep NA } if (IS_ASCII(curs)) { // Version 1: UConverter* ucnv = ucnvASCII.getConverter(); UErrorCode status = U_ZERO_ERROR; this->str[i].setTo( UnicodeString((const char*)CHAR(curs), (int32_t)LENGTH(curs), ucnv, status) ); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) // Performance improvement attempt #1: // this->str[i] = new UnicodeString(UnicodeString::fromUTF8(CHAR(curs))); // if (!this->str) throw StriException(MSG__MEM_ALLOC_ERROR); // slower than the above // Performance improvement attempt #2: // Create UChar buf with LENGTH(curs) items, fill it with (CHAR(curs)[i], 0x00), i=1,... // This wasn't faster than the ucnvASCII approach. // Performance improvement attempt #3: // slightly slower than ucnvASCII // R_len_t curs_n = LENGTH(curs); // const char* curs_s = CHAR(curs); // this->str[i].remove(); // unset bogus (NA) // UChar* buf = this->str[i].getBuffer(curs_n); // for (R_len_t k=0; k<curs_n; ++k) // buf[k] = (UChar)curs_s[k]; // well, this is ASCII :) // this->str[i].releaseBuffer(curs_n); } else if (IS_UTF8(curs)) { // using ucnvUTF8 is slower for UTF-8 // the same is done for native encoding && ucnvNative_isUTF8 this->str[i].setTo(UnicodeString::fromUTF8(CHAR(curs))); } else if (IS_LATIN1(curs)) { UConverter* ucnv = ucnvLatin1.getConverter(); UErrorCode status = U_ZERO_ERROR; this->str[i].setTo( UnicodeString((const char*)CHAR(curs), (int32_t)LENGTH(curs), ucnv, status) ); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } else if (IS_BYTES(curs)) {
/** Convert character vector to ASCII * * All charcodes > 127 are replaced with subst chars (0x1A) * * @param str character vector * @return character vector * * @version 0.1-?? (Marek Gagolewski) * * @version 0.1-?? (Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 0.2-1 (Marek Gagolewski, 2014-03-30) * use single common buf; * warn on invalid utf8 byte stream * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_enc_toascii(SEXP str) { PROTECT(str = stri_prepare_arg_string(str, "str")); R_len_t n = LENGTH(str); STRI__ERROR_HANDLER_BEGIN(1) // get buf size R_len_t bufsize = 0; for (R_len_t i=0; i<n; ++i) { SEXP curs = STRING_ELT(str, i); if (curs == NA_STRING) continue; R_len_t ni = LENGTH(curs); if (ni > bufsize) bufsize = ni; } String8buf buf(bufsize); // no more bytes than this needed char* bufdata = buf.data(); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, n)); for (R_len_t i=0; i<n; ++i) { SEXP curs = STRING_ELT(str, i); if (curs == NA_STRING || IS_ASCII(curs)) { // nothing to do SET_STRING_ELT(ret, i, curs); continue; } R_len_t curn = LENGTH(curs); const char* curs_tab = CHAR(curs); if (IS_UTF8(curs)) { R_len_t k = 0, j = 0; UChar32 c; while (j<curn) { U8_NEXT(curs_tab, j, curn, c); if (c < 0) { Rf_warning(MSG__INVALID_CODE_POINT_FIXING); bufdata[k++] = ASCII_SUBSTITUTE; } else if (c > ASCII_MAXCHARCODE) bufdata[k++] = ASCII_SUBSTITUTE; else bufdata[k++] = (char)c; } SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, k, CE_UTF8)); // the string will be marked as ASCII anyway by mkCharLenCE } else { // some 8-bit encoding R_len_t k = 0; for (R_len_t j=0; j<curn; ++j) { if (U8_IS_SINGLE(curs_tab[j])) bufdata[k++] = curs_tab[j]; else { bufdata[k++] = (char)ASCII_SUBSTITUTE; // subst char in ascii } } SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, k, CE_UTF8)); // the string will be marked as ASCII anyway by mkCharLenCE } } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) }
/** * Construct String Container from R character vector * @param rstr R character vector * @param nrecycle extend length [vectorization] * @param shallowrecycle will \code{this->str} be ever modified? */ StriContainerUTF16::StriContainerUTF16(SEXP rstr, R_len_t _nrecycle, bool _shallowrecycle) { this->str = NULL; #ifndef NDEBUG if (!isString(rstr)) throw StriException("DEBUG: !isString in StriContainerUTF16::StriContainerUTF16(SEXP rstr)"); #endif R_len_t nrstr = LENGTH(rstr); this->init_Base(nrstr, _nrecycle, _shallowrecycle); // calling LENGTH(rstr) fails on constructor call if (this->n > 0) { this->str = new UnicodeString*[this->n]; for (R_len_t i=0; i<this->n; ++i) this->str[i] = NULL; // in case it fails during conversion (this is NA) UConverter* ucnvASCII = NULL; // UConverter* ucnvUTF8 = NULL; UConverter* ucnvLatin1 = NULL; UConverter* ucnvNative = NULL; for (R_len_t i=0; i<nrstr; ++i) { SEXP curs = STRING_ELT(rstr, i); if (curs == NA_STRING) { continue; // keep NA } else { if (IS_ASCII(curs)) { if (!ucnvASCII) ucnvASCII = stri__ucnv_open("ASCII"); UErrorCode status = U_ZERO_ERROR; this->str[i] = new UnicodeString(CHAR(curs), LENGTH(curs), ucnvASCII, status); if (U_FAILURE(status)) throw StriException(status); // Performance improvement attempt #1: // this->str[i] = new UnicodeString(UnicodeString::fromUTF8(CHAR(curs))); // slower than the above // Performance improvement attempt #2: // Create UChar buf with LENGTH(curs) items, fill it with (CHAR(curs)[i], 0x00), i=1,... // This wasn't faster tham the ucnvASCII approach. } else if (IS_UTF8(curs)) { // the above ASCII-approach (but with ucnvUTF8) is slower for UTF-8 this->str[i] = new UnicodeString(UnicodeString::fromUTF8(CHAR(curs))); } else if (IS_LATIN1(curs)) { if (!ucnvLatin1) ucnvLatin1 = stri__ucnv_open("ISO-8859-1"); UErrorCode status = U_ZERO_ERROR; this->str[i] = new UnicodeString(CHAR(curs), LENGTH(curs), ucnvLatin1, status); if (U_FAILURE(status)) throw StriException(status); } else if (IS_BYTES(curs)) throw StriException(MSG__BYTESENC); else { // Any encoding - detection needed // Assume it's Native; this assumes the user working in an 8-bit environment // would convert strings to UTF-8 manually if needed - I think is's // a more reasonable approach (Native --> input via keyboard) if (!ucnvNative) ucnvNative = stri__ucnv_open((char*)NULL); UErrorCode status = U_ZERO_ERROR; this->str[i] = new UnicodeString(CHAR(curs), LENGTH(curs), ucnvNative, status); if (U_FAILURE(status)) throw StriException(status); } } } if (ucnvASCII) ucnv_close(ucnvASCII); // if (ucnvUTF8) ucnv_close(ucnvUTF8); if (ucnvLatin1) ucnv_close(ucnvLatin1); if (ucnvNative) ucnv_close(ucnvNative); if (!_shallowrecycle) { for (R_len_t i=nrstr; i<this->n; ++i) { if (this->str[i%nrstr] == NULL) this->str[i] = NULL; else this->str[i] = new UnicodeString(*this->str[i%nrstr]); } } } }