/* used in plot.c for non-UTF-8 MBCS */
/*
 * Convert the start of the multibyte string 's' (source encoding "") with
 * the UNICODE converter and store the first resulting unit in *wc.
 *
 * 'n' is part of the interface but is not consulted: the whole
 * NUL-terminated string is offered to the converter.
 *
 * Returns 1 on success (for the empty string *wc is set to 0),
 * (size_t)-1 if the converter cannot be opened or the input is invalid
 * (EILSEQ, or any unexpected errno, which is then reset to EILSEQ), and
 * (size_t)-2 if the input ends in an incomplete sequence (EINVAL).
 * E2BIG is not an error here: the output buffer is deliberately tiny and
 * only its first element is wanted.
 */
size_t attribute_hidden
mbtoucs(unsigned int *wc, const char *s, size_t n)
{
    unsigned int wcs[2] = {0, 0};
    void *cd;
    const char *inbuf = s;
    size_t inbytesleft = strlen(s);
    char *outbuf = (char *) wcs;
    /* The converter writes into wcs, so that is the size it may use. */
    size_t outbytesleft = sizeof(wcs);
    size_t status;
    int err;

    if(s[0] == 0) {*wc = 0; return 1;}

    if((void *)(-1) == (cd = Riconv_open(UNICODE, ""))) return (size_t)(-1);
    status = Riconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
    /* Save errno before closing: the descriptor must be released on every
       path, and closing may overwrite errno. */
    err = errno;
    Riconv_close(cd);

    if (status == (size_t) -1) {
        errno = err;
        switch(err){
        case EINVAL:
            return (size_t) -2;
        case EILSEQ:
            return (size_t) -1;
        case E2BIG:
            break;
        default:
            errno = EILSEQ;
            return (size_t) -1;
        }
    }
    *wc = wcs[0];
    return (size_t) 1;
}
/*
 * Convert the file name held in the CHARSXP 'fn' to a wide string using
 * the "UCS-2LE" converter, optionally passing it through
 * R_ExpandFileName() first when 'expand' is true.
 *
 * The source encoding is "latin1" or "UTF-8" when 'fn' is marked as such,
 * otherwise "".  Problems are reported with REprintf() rather than raised.
 *
 * Returns a pointer to a static buffer that is overwritten by the next
 * call.  If the converter cannot be opened the result is the empty string;
 * if the conversion fails or the name does not fit, the result is whatever
 * was converted so far, NUL-terminated.
 *
 * NOTE(review): the UCS-2LE target and the 2*BSIZE byte budget presuppose
 * a 2-byte wchar_t -- confirm for the platforms this is built on.
 */
wchar_t *filenameToWchar_wcc(const SEXP fn, const Rboolean expand)
{
    static wchar_t filename[BSIZE + 1];
    void *obj;
    const char *from = "", *inbuf;
    char *outbuf;
    size_t inb, outb, res;

    if(!strlen(CHAR(fn))) {
        wcscpy(filename, L"");
        return filename;
    }

    if(IS_LATIN1(fn)) from = "latin1";
    if(IS_UTF8(fn)) from = "UTF-8";
    if(IS_BYTES(fn)) REprintf("encoding of a filename cannot be 'bytes'");

    obj = Riconv_open("UCS-2LE", from);
    if(obj == (void *)(-1)) {
        REprintf("unsupported conversion from '%s' in shellexec_wcc.c", from);
        /* REprintf() returns, so bail out instead of handing an invalid
           descriptor to Riconv()/Riconv_close(). */
        filename[0] = L'\0';
        return filename;
    }

    if(expand) inbuf = R_ExpandFileName(CHAR(fn));
    else inbuf = CHAR(fn);

    /* +1 so that the terminator is converted along with the name */
    inb = strlen(inbuf)+1;
    outb = 2*BSIZE;
    outbuf = (char *) filename;
    res = Riconv(obj, &inbuf , &inb, &outbuf, &outb);
    Riconv_close(obj);

    if(inb > 0) REprintf("file name conversion problem -- name too long?");
    if(res == (size_t) -1) REprintf("file name conversion problem");
    if(inb > 0 || res == (size_t) -1) {
        /* The terminator was not (reliably) converted: terminate at the
           current output position so no stale static data is exposed.
           The buffer has BSIZE + 1 elements, so this stays in bounds. */
        memset(outbuf, 0, sizeof(wchar_t));
    }
    return filename;
} /* End of filenameToWchar_wcc(). */
/* This may return a R_alloc-ed result, so the caller has to manage the
   R_alloc stack */
/*
 * Return the contents of the CHARSXP 'x' converted to UTF-8.
 *
 * NA_STRING and strings marked as UTF-8 or ASCII are returned as CHAR(x)
 * without copying.  Strings marked "bytes" and non-CHARSXP arguments raise
 * an error.  Everything else is converted from "latin1" (if so marked) or
 * from the "" encoding; bytes the converter rejects are replaced by "<xx>"
 * with xx the byte in hex.  The converted string is copied into R_alloc-ed
 * memory.
 */
const char *translateCharUTF8(SEXP x)
{
    void *obj;
    const char *inbuf, *ans, *from;
    char *outbuf, *p;
    size_t inb, outb, res;
    R_StringBuffer cbuff = {NULL, 0, MAXELTSIZE};

    /* Validate the type before touching the string data. */
    if(TYPEOF(x) != CHARSXP)
        error(_("'%s' must be called on a CHARSXP"), "translateCharUTF8");
    ans = CHAR(x);
    if(x == NA_STRING) return ans;
    if(IS_UTF8(x)) return ans;
    if(IS_ASCII(x)) return ans;
    if(IS_BYTES(x))
        error(_("translating strings with \"bytes\" encoding is not allowed"));

    from = IS_LATIN1(x) ? "latin1" : "";
    obj = Riconv_open("UTF-8", from);
    if(obj == (void *)(-1))
        /* report the encoding that was actually requested */
#ifdef Win32
        error(_("unsupported conversion from '%s' in codepage %d"),
              from, localeCP);
#else
        error(_("unsupported conversion from '%s' to '%s'"), from, "UTF-8");
#endif

    R_AllocStringBuffer(0, &cbuff);
top_of_loop:
    /* (Re)start from the beginning; reached again after the buffer grows. */
    inbuf = ans;
    inb = strlen(inbuf);
    outbuf = cbuff.data;
    outb = cbuff.bufsize - 1;   /* keep one byte for the terminator */
    /* First initialize output */
    Riconv (obj, NULL, NULL, &outbuf, &outb);
next_char:
    /* Then convert input */
    res = Riconv(obj, &inbuf , &inb, &outbuf, &outb);
    if(res == (size_t) -1 && errno == E2BIG) {
        R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
        goto top_of_loop;
    } else if(res == (size_t) -1 && (errno == EILSEQ || errno == EINVAL)) {
        /* snprintf writes 4 characters plus a NUL, hence 5 bytes needed */
        if(outb < 5) {
            R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
            goto top_of_loop;
        }
        snprintf(outbuf, 5, "<%02x>", (unsigned char)*inbuf);
        outbuf += 4;
        outb -= 4;
        inbuf++;
        inb--;
        goto next_char;
    }
    *outbuf = '\0';
    Riconv_close(obj);

    res = strlen(cbuff.data) + 1;
    p = R_alloc(res, 1);
    memcpy(p, cbuff.data, res);
    R_FreeStringBuffer(&cbuff);
    return p;
}
/* iconv(x, from, to, sub, mark, toRaw) */
/*
 * Arguments, in order:
 *   x      NULL, a character vector, or a list of raw vectors / NULLs
 *   from   length-1 string naming the source encoding
 *   to     length-1 string naming the target encoding
 *   sub    length-1 string; NA means no substitution, "byte" replaces each
 *          rejected byte by "<xx>", anything else is inserted literally
 *   mark   logical: mark results as latin1/UTF-8 when 'to' implies it
 *   toRaw  logical: return a list of raw vectors instead of strings
 *
 * With x = NULL the result is what iconvlist() reports (or NULL when
 * HAVE_ICONVLIST is not defined).  Otherwise each element is converted
 * separately; an element that cannot be converted becomes NA_STRING, or
 * NULL when toRaw is true.
 */
SEXP attribute_hidden do_iconv(SEXP call, SEXP op, SEXP args, SEXP env)
{
    SEXP ans, x = CAR(args), si;
    void * obj;
    const char *inbuf;
    char *outbuf;
    const char *sub;
    size_t inb, outb, res;
    R_StringBuffer cbuff = {NULL, 0, MAXELTSIZE};
    Rboolean isRawlist = FALSE;

    checkArity(op, args);
    if(isNull(x)) {  /* list locales */
#ifdef HAVE_ICONVLIST
        /* two passes: count the entries, then fill the vector */
        cnt = 0;
        iconvlist(count_one, NULL);
        PROTECT(ans = allocVector(STRSXP, cnt));
        cnt = 0;
        iconvlist(write_one, (void *)ans);
#else
        PROTECT(ans = R_NilValue);
#endif
    } else {
        int mark, toRaw;
        const char *from, *to;
        Rboolean isLatin1 = FALSE, isUTF8 = FALSE;

        args = CDR(args);
        if(!isString(CAR(args)) || length(CAR(args)) != 1)
            error(_("invalid '%s' argument"), "from");
        from = CHAR(STRING_ELT(CAR(args), 0)); /* ASCII */
        args = CDR(args);
        if(!isString(CAR(args)) || length(CAR(args)) != 1)
            error(_("invalid '%s' argument"), "to");
        to = CHAR(STRING_ELT(CAR(args), 0));
        args = CDR(args);
        if(!isString(CAR(args)) || length(CAR(args)) != 1)
            error(_("invalid '%s' argument"), "sub");
        if(STRING_ELT(CAR(args), 0) == NA_STRING) sub = NULL;
        else sub = translateChar(STRING_ELT(CAR(args), 0));
        args = CDR(args);
        mark = asLogical(CAR(args));
        if(mark == NA_LOGICAL)
            error(_("invalid '%s' argument"), "mark");
        args = CDR(args);
        toRaw = asLogical(CAR(args));
        if(toRaw == NA_LOGICAL)
            error(_("invalid '%s' argument"), "toRaw");
        /* some iconv's allow "UTF8", but libiconv does not */
        if(streql(from, "UTF8") || streql(from, "utf8") ) from = "UTF-8";
        if(streql(to, "UTF8") || streql(to, "utf8") ) to = "UTF-8";
        /* Should we do something about marked CHARSXPs in 'from = ""'? */
        if(streql(to, "UTF-8")) isUTF8 = TRUE;
        if(streql(to, "latin1") || streql(to, "ISO_8859-1")
           || streql(to, "CP1252")) isLatin1 = TRUE;
        if(streql(to, "") && known_to_be_latin1) isLatin1 = TRUE;
        if(streql(to, "") && known_to_be_utf8) isUTF8 = TRUE;

        obj = Riconv_open(to, from);
        if(obj == (iconv_t)(-1))
#ifdef Win32
            error(_("unsupported conversion from '%s' to '%s' in codepage %d"),
                  from, to, localeCP);
#else
            error(_("unsupported conversion from '%s' to '%s'"), from, to);
#endif

        isRawlist = (TYPEOF(x) == VECSXP);
        if(isRawlist) {
            if(toRaw)
                PROTECT(ans = duplicate(x));
            else {
                PROTECT(ans = allocVector(STRSXP, LENGTH(x)));
                DUPLICATE_ATTRIB(ans, x);
            }
        } else {
            if(TYPEOF(x) != STRSXP) {
                /* do not leak the converter when bailing out */
                Riconv_close(obj);
                error(_("'x' must be a character vector"));
            }
            if(toRaw) {
                PROTECT(ans = allocVector(VECSXP, LENGTH(x)));
                DUPLICATE_ATTRIB(ans, x);
            } else
                PROTECT(ans = duplicate(x));
        }

        R_AllocStringBuffer(0, &cbuff);  /* 0 -> default */
        for(R_xlen_t i = 0; i < XLENGTH(x); i++) {
            if (isRawlist) {
                si = VECTOR_ELT(x, i);
                if (TYPEOF(si) == NILSXP) {
                    if (!toRaw) SET_STRING_ELT(ans, i, NA_STRING);
                    continue;
                } else if (TYPEOF(si) != RAWSXP) {
                    /* release the converter and buffer before erroring */
                    Riconv_close(obj);
                    R_FreeStringBuffer(&cbuff);
                    error(_("'x' must be a list of NULL or raw vectors"));
                }
            } else {
                si = STRING_ELT(x, i);
                if (si == NA_STRING) {
                    if(!toRaw) SET_STRING_ELT(ans, i, NA_STRING);
                    continue;
                }
            }
        top_of_loop:
            /* (Re)start this element; reached again after the buffer grows */
            inbuf = isRawlist ? (const char *) RAW(si) : CHAR(si);
            inb = LENGTH(si);
            outbuf = cbuff.data;
            outb = cbuff.bufsize - 1;  /* keep one byte for the terminator */
            /* First initialize output */
            Riconv (obj, NULL, NULL, &outbuf, &outb);
        next_char:
            /* Then convert input */
            res = Riconv(obj, &inbuf , &inb, &outbuf, &outb);
            *outbuf = '\0';
            /* other possible error conditions are incomplete
               and invalid multibyte chars */
            if(res == -1 && errno == E2BIG) {
                R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
                goto top_of_loop;
            } else if(res == -1 && sub &&
                      (errno == EILSEQ || errno == EINVAL)) {
                /* it seems this gets thrown for non-convertible input too */
                if(strcmp(sub, "byte") == 0) {
                    /* "<xx>" plus the NUL snprintf appends */
                    if(outb < 5) {
                        R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
                        goto top_of_loop;
                    }
                    snprintf(outbuf, 5, "<%02x>", (unsigned char)*inbuf);
                    outbuf += 4;
                    outb -= 4;
                } else {
                    size_t j;
                    if(outb < strlen(sub)) {
                        R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
                        goto top_of_loop;
                    }
                    memcpy(outbuf, sub, j = strlen(sub));
                    outbuf += j;
                    outb -= j;
                }
                /* skip the offending byte and carry on */
                inbuf++;
                inb--;
                goto next_char;
            }

            if(toRaw) {
                if(res != -1 && inb == 0) {
                    size_t nout = cbuff.bufsize - 1 - outb;
                    SEXP el = allocVector(RAWSXP, nout);
                    memcpy(RAW(el), cbuff.data, nout);
                    SET_VECTOR_ELT(ans, i, el);
                } else {
                    /* For a raw-list input 'ans' started as a copy of 'x',
                       so the slot still holds the input bytes: clear it
                       explicitly to signal the failed conversion. */
                    SET_VECTOR_ELT(ans, i, R_NilValue);
                }
            } else {
                if(res != -1 && inb == 0) {
                    cetype_t ienc = CE_NATIVE;
                    size_t nout = cbuff.bufsize - 1 - outb;
                    if(mark) {
                        if(isLatin1) ienc = CE_LATIN1;
                        else if(isUTF8) ienc = CE_UTF8;
                    }
                    SET_STRING_ELT(ans, i,
                                   mkCharLenCE(cbuff.data, (int) nout, ienc));
                } else
                    SET_STRING_ELT(ans, i, NA_STRING);
            }
        }
        Riconv_close(obj);
        R_FreeStringBuffer(&cbuff);
    }
    UNPROTECT(1);
    return ans;
}
/* A version avoiding R_alloc for use in the Rgui editor */
/*
 * Re-encode the string 'x' from encoding 'ce_in' to 'ce_out', writing the
 * result into the caller's buffer 'y' of 'ny' bytes.
 *
 * 'y' first receives a (possibly truncated) copy of 'x'; that copy is the
 * result whenever no conversion is needed or possible (same or CE_ANY
 * encodings, native encoding already matching, pure ASCII input,
 * unsupported encoding, converter unavailable).
 *
 * 'subst' selects what happens to bytes the converter rejects:
 *   1  replace by "<xx>" (hex),  2  replace by '.',  3  replace by '?',
 *   anything else: drop the byte.
 *
 * Raises an error if the converted string (with terminator) does not fit
 * in 'ny' bytes.  Nothing is written when 'ny' is not positive.
 */
void reEnc2(const char *x, char *y, int ny,
            cetype_t ce_in, cetype_t ce_out, int subst)
{
    void * obj;
    const char *inbuf;
    char *outbuf;
    size_t inb, outb, res, top;
    const char *tocode = NULL, *fromcode = NULL;
    char buf[20];
    R_StringBuffer cbuff = {NULL, 0, MAXELTSIZE};

    /* No room even for a terminator: y[ny - 1] would be out of bounds. */
    if(ny <= 0) return;

    strncpy(y, x, ny);
    y[ny - 1] = '\0';

    if(ce_in == ce_out || ce_in == CE_ANY || ce_out == CE_ANY) return;
    if(utf8locale && ce_in == CE_NATIVE && ce_out == CE_UTF8) return;
    if(utf8locale && ce_out == CE_NATIVE && ce_in == CE_UTF8) return;
    if(latin1locale && ce_in == CE_NATIVE && ce_out == CE_LATIN1) return;
    if(latin1locale && ce_out == CE_NATIVE && ce_in == CE_LATIN1) return;

    if(strIsASCII(x)) return;

    /* 'buf' is shared by both switches: ce_in == ce_out has already
       returned, so at most one side can be CE_NATIVE. */
    switch(ce_in) {
    case CE_NATIVE:
    {
        /* Looks like CP1252 is treated as Latin-1 by iconv */
        snprintf(buf, 20, "CP%d", localeCP);
        fromcode = buf;
        break;
    }
    case CE_LATIN1: fromcode = "CP1252"; break;
    case CE_UTF8:   fromcode = "UTF-8"; break;
    default: return;
    }

    switch(ce_out) {
    case CE_NATIVE:
    {
        /* avoid possible misidentification of CP1250 as LATIN-2 */
        snprintf(buf, 20, "CP%d", localeCP);
        tocode = buf;
        break;
    }
    case CE_LATIN1: tocode = "latin1"; break;
    case CE_UTF8:   tocode = "UTF-8"; break;
    default: return;
    }

    obj = Riconv_open(tocode, fromcode);
    if(obj == (void *)(-1)) return;
    R_AllocStringBuffer(0, &cbuff);
top_of_loop:
    /* (Re)start from the beginning; reached again after the buffer grows. */
    inbuf = x;
    inb = strlen(inbuf);
    outbuf = cbuff.data;
    top = outb = cbuff.bufsize - 1;  /* keep one byte for the terminator */
    /* First initialize output */
    Riconv (obj, NULL, NULL, &outbuf, &outb);
next_char:
    /* Then convert input */
    res = Riconv(obj, &inbuf , &inb, &outbuf, &outb);
    if(res == (size_t) -1 && errno == E2BIG) {
        R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
        goto top_of_loop;
    } else if(res == (size_t) -1 && (errno == EILSEQ || errno == EINVAL)) {
        switch(subst) {
        case 1: /* substitute hex */
            if(outb < 5) {
                R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
                goto top_of_loop;
            }
            snprintf(outbuf, 5, "<%02x>", (unsigned char)*inbuf);
            outbuf += 4; outb -= 4;
            inbuf++; inb--;
            goto next_char;
        case 2: /* substitute . */
            if(outb < 1) {
                R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
                goto top_of_loop;
            }
            *outbuf++ = '.'; inbuf++; outb--; inb--;
            goto next_char;
        case 3: /* substitute ? */
            if(outb < 1) {
                R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
                goto top_of_loop;
            }
            *outbuf++ = '?'; inbuf++; outb--; inb--;
            goto next_char;
        default: /* skip byte */
            inbuf++; inb--;
            goto next_char;
        }
    }
    Riconv_close(obj);
    *outbuf = '\0';
    res = (top-outb)+1; /* strlen(cbuff.data) + 1; */
    if (res > (size_t) ny) {
        /* release the work buffer before raising the error */
        R_FreeStringBuffer(&cbuff);
        error("converted string too long for buffer");
    }
    memcpy(y, cbuff.data, res);
    R_FreeStringBuffer(&cbuff);
}
/* This may return a R_alloc-ed result, so the caller has to manage the
   R_alloc stack */
/*
 * Re-encode 'x' from 'ce_in' to 'ce_out'.
 *
 * 'x' itself is returned when nothing needs doing or nothing can be done:
 * identical or CE_ANY encodings, a CE_SYMBOL target, a CE_SYMBOL source
 * with a non-UTF-8 target, a native encoding that already matches, pure
 * ASCII input, an unsupported encoding, or an unavailable converter.
 * Otherwise the result lives in R_alloc-ed memory.
 *
 * 'subst' selects the treatment of bytes the converter rejects:
 *   1  "<xx>" (hex),  2  '.',  3  '?',  anything else: the byte is dropped.
 */
const char *reEnc(const char *x, cetype_t ce_in, cetype_t ce_out, int subst)
{
    void *cd;
    const char *src;
    char *dst, *out;
    size_t srcleft, dstleft, rc, cap, need, nbytes;
    int grow;
    const char *tocode = NULL, *fromcode = NULL;
#ifdef Win32
    char buf[20];
#endif
    R_StringBuffer sbuf = {NULL, 0, MAXELTSIZE};

    /* We can only encode from Symbol to UTF-8 */
    if(ce_in == ce_out || ce_out == CE_SYMBOL ||
       ce_in == CE_ANY || ce_out == CE_ANY)
        return x;
    if(ce_in == CE_SYMBOL) {
        if(ce_out != CE_UTF8) return x;
        size_t nc = 3*strlen(x)+1; /* all in BMP */
        out = R_alloc(nc, 1);
        Rf_AdobeSymbol2utf8(out, x, nc);
        return out;
    }

    /* Native encoding already is the requested one: nothing to convert. */
    if(utf8locale &&
       ((ce_in == CE_NATIVE && ce_out == CE_UTF8) ||
        (ce_out == CE_NATIVE && ce_in == CE_UTF8)))
        return x;
    if(latin1locale &&
       ((ce_in == CE_NATIVE && ce_out == CE_LATIN1) ||
        (ce_out == CE_NATIVE && ce_in == CE_LATIN1)))
        return x;

    if(strIsASCII(x)) return x;

    switch(ce_in) {
#ifdef Win32
    case CE_NATIVE:
        /* Looks like CP1252 is treated as Latin-1 by iconv */
        snprintf(buf, 20, "CP%d", localeCP);
        fromcode = buf;
        break;
    case CE_LATIN1:
        fromcode = "CP1252";
        break;
#else
    case CE_NATIVE:
        fromcode = "";
        break;
    case CE_LATIN1:
        fromcode = "latin1";
        break;
#endif
    case CE_UTF8:
        fromcode = "UTF-8";
        break;
    default:
        return x;
    }

    switch(ce_out) {
#ifdef Win32
    case CE_NATIVE:
        /* avoid possible misidentification of CP1250 as LATIN-2 */
        snprintf(buf, 20, "CP%d", localeCP);
        tocode = buf;
        break;
#else
    case CE_NATIVE:
        tocode = "";
        break;
#endif
    case CE_LATIN1:
        tocode = "latin1";
        break;
    case CE_UTF8:
        tocode = "UTF-8";
        break;
    default:
        return x;
    }

    cd = Riconv_open(tocode, fromcode);
    if(cd == (void *)(-1)) return x;
    R_AllocStringBuffer(0, &sbuf);

    /* Outer loop: one full conversion attempt per buffer size. */
    for(;;) {
        src = x;
        srcleft = strlen(src);
        dst = sbuf.data;
        cap = dstleft = sbuf.bufsize - 1;  /* one byte kept for the NUL */
        /* First initialize output */
        Riconv(cd, NULL, NULL, &dst, &dstleft);

        /* Inner loop: convert, stepping over each rejected byte. */
        grow = 0;
        for(;;) {
            rc = Riconv(cd, &src, &srcleft, &dst, &dstleft);
            if(rc != (size_t) -1) break;
            if(errno == E2BIG) { grow = 1; break; }
            if(errno != EILSEQ && errno != EINVAL) break;

            /* room the chosen substitution needs ("<xx>" plus snprintf's
               NUL, a single character, or nothing when skipping) */
            if(subst == 1) need = 5;
            else if(subst == 2 || subst == 3) need = 1;
            else need = 0;
            if(dstleft < need) { grow = 1; break; }

            if(subst == 1) {
                snprintf(dst, 5, "<%02x>", (unsigned char)*src);
                dst += 4;
                dstleft -= 4;
            } else if(need == 1) {
                *dst++ = (subst == 2) ? '.' : '?';
                dstleft--;
            }
            src++;
            srcleft--;
        }

        if(!grow) break;
        R_AllocStringBuffer(2*sbuf.bufsize, &sbuf);
    }

    Riconv_close(cd);
    *dst = '\0';
    nbytes = (cap - dstleft) + 1;   /* converted bytes plus terminator */
    out = R_alloc(nbytes, 1);
    memcpy(out, sbuf.data, nbytes);
    R_FreeStringBuffer(&sbuf);
    return out;
}
/* This may return a R_alloc-ed result, so the caller has to manage the
   R_alloc stack */
/*
 * Convert the CHARSXP 'x' to a wide string using the TO_WCHAR converter.
 *
 * Strings marked latin1 or UTF-8 use converters that are opened once and
 * cached in latin1_wobj / utf8_wobj; other strings use a converter from
 * the "" encoding that is opened and closed on each call.  Non-CHARSXP
 * arguments and strings marked "bytes" raise an error.  Bytes the
 * converter rejects are replaced by the four wide characters "<xx>"
 * (xx = the byte in hex).  The result is R_alloc-ed and zero-terminated.
 */
attribute_hidden /* but not hidden on Windows, where it was used in tcltk.c */
const wchar_t *wtransChar(SEXP x)
{
    void * obj;
    const char *inbuf, *ans;
    char *outbuf;
    wchar_t *p;
    size_t inb, outb, res, top;
    Rboolean knownEnc = FALSE;
    R_StringBuffer cbuff = {NULL, 0, MAXELTSIZE};

    /* Validate the type before touching the string data. */
    if(TYPEOF(x) != CHARSXP)
        error(_("'%s' must be called on a CHARSXP"), "wtransChar");
    ans = CHAR(x);

    if(IS_BYTES(x))
        error(_("translating strings with \"bytes\" encoding is not allowed"));

    if(IS_LATIN1(x)) {
        if(!latin1_wobj) {
            obj = Riconv_open(TO_WCHAR, "latin1");
            if(obj == (void *)(-1))
                error(_("unsupported conversion from '%s' to '%s'"),
                      "latin1", TO_WCHAR);
            latin1_wobj = obj;
        } else
            obj = latin1_wobj;
        knownEnc = TRUE;
    } else if(IS_UTF8(x)) {
        if(!utf8_wobj) {
            obj = Riconv_open(TO_WCHAR, "UTF-8");
            if(obj == (void *)(-1))
                /* name the encoding that actually failed */
                error(_("unsupported conversion from '%s' to '%s'"),
                      "UTF-8", TO_WCHAR);
            utf8_wobj = obj;
        } else
            obj = utf8_wobj;
        knownEnc = TRUE;
    } else {
        obj = Riconv_open(TO_WCHAR, "");
        if(obj == (void *)(-1))
#ifdef Win32
            error(_("unsupported conversion to '%s' from codepage %d"),
                  TO_WCHAR, localeCP);
#else
            error(_("unsupported conversion from '%s' to '%s'"), "", TO_WCHAR);
#endif
    }

    R_AllocStringBuffer(0, &cbuff);
top_of_loop:
    /* (Re)start from the beginning; reached again after the buffer grows. */
    inbuf = ans;
    inb = strlen(inbuf);
    outbuf = cbuff.data;
    top = outb = cbuff.bufsize - 1;
    /* First initialize output */
    Riconv (obj, NULL, NULL, &outbuf, &outb);
next_char:
    /* Then convert input */
    res = Riconv(obj, &inbuf , &inb, &outbuf, &outb);
    if(res == (size_t) -1 && errno == E2BIG) {
        R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
        goto top_of_loop;
    } else if(res == (size_t) -1 && (errno == EILSEQ || errno == EINVAL)) {
        /* The output is a wide-character stream, so the "<xx>" marker has
           to be emitted as four wchar_t units, not four narrow bytes.
           This relies on TO_WCHAR producing native wchar_t units, which
           the cast of the result to wchar_t * already presupposes. */
        char hex[5];
        if(outb < 4 * sizeof(wchar_t)) {
            R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
            goto top_of_loop;
        }
        snprintf(hex, 5, "<%02x>", (unsigned char)*inbuf);
        for(int k = 0; k < 4; k++) {
            wchar_t wc = (wchar_t) hex[k];
            memcpy(outbuf, &wc, sizeof(wchar_t));
            outbuf += sizeof(wchar_t);
            outb -= sizeof(wchar_t);
        }
        inbuf++;
        inb--;
        goto next_char;
    }
    /* cached converters stay open for later calls */
    if(!knownEnc) Riconv_close(obj);
    res = (top - outb);
    /* terminator is 2 or 4 null bytes */
    p = (wchar_t *) R_alloc(res+4, 1);
    memset(p, 0, res+4);
    memcpy(p, cbuff.data, res);
    R_FreeStringBuffer(&cbuff);
    return p;
}