/*
 * ucstomb: convert one Unicode code point into a multibyte string.
 * (Used in gram.c and devX11.c.)
 *
 * s   destination; receives a NUL-terminated string of at most
 *     MB_CUR_MAX bytes plus the terminator.
 * wc  code point to convert; 0 produces the empty string.
 *
 * The iconv descriptor (target "" from UNICODE) is opened on first use
 * and cached in ucsmb_obj.
 *
 * Returns 1 for wc == 0, otherwise the length of the string stored in s.
 * Returns (size_t)-1 when no converter can be opened or when Riconv fails
 * with EILSEQ or an unexpected errno (errno is then forced to EILSEQ),
 * and (size_t)-2 when Riconv fails with EINVAL.  E2BIG is not treated as
 * a failure: whatever was written is returned.
 */
size_t ucstomb(char *s, const unsigned int wc)
{
    if (wc == 0) {
        *s = '\0';
        return 1;
    }

    char mb[MB_CUR_MAX + 1];
    unsigned int ucs[2] = { wc, 0 };

    memset(mb, 0, sizeof(mb));

    if (ucsmb_obj == NULL) {
        void *conv = Riconv_open("", UNICODE);
#ifndef Win32
        if (conv == (void *)(-1)) {
            /* locale set fuzzy case */
            char target[128];

            strncpy(target, locale2charset(NULL), sizeof(target));
            target[sizeof(target) - 1] = '\0';
            conv = Riconv_open(target, UNICODE);
        }
#endif
        if (conv == (void *)(-1))
            return (size_t)(-1);
        ucsmb_obj = conv;
    }

    const char *src = (const char *) ucs;
    size_t srcleft = sizeof(unsigned int); /* better be 4 */
    char *dst = mb;
    size_t dstleft = sizeof(mb);

    if (Riconv(ucsmb_obj, &src, &srcleft, &dst, &dstleft) == (size_t) -1) {
        if (errno == EINVAL)
            return (size_t) -2;
        if (errno == EILSEQ)
            return (size_t) -1;
        if (errno != E2BIG) {
            errno = EILSEQ;
            return (size_t) -1;
        }
        /* E2BIG: fall through and return what fitted */
    }

    mb[MB_CUR_MAX] = '\0'; /* safety measure */
    strcpy(s, mb);
    return strlen(mb);
}
/*
 * filenameToWchar_wcc: convert the file name held in 'fn' to a wide string
 * by running it through an iconv conversion to "UCS-2LE".
 *
 * fn      CHARSXP with the file name; its encoding mark selects the source
 *         encoding ("latin1", "UTF-8", or "" when unmarked).
 * expand  when true the name is passed through R_ExpandFileName() first.
 *
 * Returns a pointer to a function-static buffer of BSIZE + 1 wide
 * characters, overwritten by every call.  Problems are reported with
 * REprintf() rather than raised.  When the converter cannot be opened the
 * empty string is returned; the descriptor value (void *)-1 is never
 * handed to Riconv()/Riconv_close().
 */
wchar_t *filenameToWchar_wcc(const SEXP fn, const Rboolean expand)
{
    static wchar_t filename[BSIZE + 1];
    void *obj;
    const char *from = "", *inbuf;
    char *outbuf;
    size_t inb, outb, res;

    /* Zero the whole buffer so that an empty name, an unavailable converter
       or a conversion that stops early always leaves a terminated string
       and never exposes the previous call's contents. */
    memset(filename, 0, sizeof(filename));

    if (!strlen(CHAR(fn)))
        return filename;

    if (IS_LATIN1(fn)) from = "latin1";
    if (IS_UTF8(fn)) from = "UTF-8";
    if (IS_BYTES(fn)) REprintf("encoding of a filename cannot be 'bytes'");

    obj = Riconv_open("UCS-2LE", from);
    if (obj == (void *)(-1)) {
        REprintf("unsupported conversion from '%s' in shellexec_wcc.c", from);
        return filename;        /* nothing to convert with */
    }

    inbuf = expand ? R_ExpandFileName(CHAR(fn)) : CHAR(fn);

    inb = strlen(inbuf) + 1;    /* include the terminator in the conversion */
    outb = 2 * BSIZE;           /* output limit in bytes */
    outbuf = (char *) filename;
    res = Riconv(obj, &inbuf, &inb, &outbuf, &outb);
    Riconv_close(obj);

    if (inb > 0) REprintf("file name conversion problem -- name too long?");
    if (res == (size_t) -1) REprintf("file name conversion problem");

    return filename;
} /* End of filenameToWchar_wcc(). */
/*
 * mbtoucs: convert the first character of the multibyte string 's' to a
 * Unicode code point.  (Used in plot.c for non-UTF-8 MBCS.)
 *
 * wc  receives the code point.
 * s   NUL-terminated input; the conversion source encoding is "".
 * n   accepted for interface compatibility; it is not consulted.
 *
 * Returns 1 on success (also for the empty string, with *wc = 0),
 * (size_t)-2 when Riconv reports EINVAL, and (size_t)-1 when the converter
 * cannot be opened, on EILSEQ, or on any other errno (errno is then set
 * to EILSEQ).  E2BIG is expected, because only the first character is
 * wanted, and is not an error.
 *
 * The output limit is the size of the array actually written to, and the
 * descriptor is closed on every path.
 */
size_t attribute_hidden mbtoucs(unsigned int *wc, const char *s, size_t n)
{
    unsigned int wcs[2] = { 0, 0 };
    void *cd;
    const char *inbuf = s;
    size_t inbytesleft = strlen(s);
    char *outbuf = (char *) wcs;
    size_t outbytesleft = sizeof(wcs);
    size_t status;
    int saved_errno;

    if (s[0] == 0) {
        *wc = 0;
        return 1;
    }

    cd = Riconv_open(UNICODE, "");
    if (cd == (void *)(-1))
        return (size_t)(-1);

    status = Riconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
    saved_errno = errno;        /* closing may overwrite errno */
    Riconv_close(cd);

    if (status == (size_t) -1) {
        switch (saved_errno) {
        case EINVAL:
            errno = saved_errno;
            return (size_t) -2;
        case EILSEQ:
            errno = saved_errno;
            return (size_t) -1;
        case E2BIG:
            break;
        default:
            errno = EILSEQ;
            return (size_t) -1;
        }
    }

    *wc = wcs[0];
    return (size_t) 1;
}
/*
 * translateCharUTF8: return the contents of the CHARSXP 'x' as UTF-8.
 *
 * NA_STRING and strings marked UTF-8 or ASCII are returned as-is (the
 * CHARSXP's own storage).  Otherwise the bytes are converted from "latin1"
 * (if so marked) or from the "" encoding; input bytes the converter
 * rejects (EILSEQ/EINVAL) are replaced by "<xx>" hex escapes.
 *
 * This may return a R_alloc-ed result, so the caller has to manage the
 * R_alloc stack.
 *
 * Raises an R error if x is not a CHARSXP, if it is marked "bytes", or if
 * the converter cannot be opened.  The open-failure message names the
 * source encoding actually requested.
 */
const char *translateCharUTF8(SEXP x)
{
    void *obj;
    const char *inbuf, *ans, *from;
    char *outbuf, *p;
    size_t inb, outb, res;
    R_StringBuffer cbuff = {NULL, 0, MAXELTSIZE};

    if (TYPEOF(x) != CHARSXP)
        error(_("'%s' must be called on a CHARSXP"), "translateCharUTF8");
    ans = CHAR(x);
    if (x == NA_STRING) return ans;
    if (IS_UTF8(x)) return ans;
    if (IS_ASCII(x)) return ans;
    if (IS_BYTES(x))
        error(_("translating strings with \"bytes\" encoding is not allowed"));

    from = IS_LATIN1(x) ? "latin1" : "";
    obj = Riconv_open("UTF-8", from);
    if (obj == (void *)(-1))
#ifdef Win32
        error(_("unsupported conversion from '%s' in codepage %d"),
              from, localeCP);
#else
        error(_("unsupported conversion from '%s' to '%s'"), from, "UTF-8");
#endif

    R_AllocStringBuffer(0, &cbuff);
top_of_loop:
    /* (Re)start the conversion from the beginning of the input. */
    inbuf = ans; inb = strlen(inbuf);
    outbuf = cbuff.data; outb = cbuff.bufsize - 1;
    /* First initialize output */
    Riconv(obj, NULL, NULL, &outbuf, &outb);
next_char:
    /* Then convert input */
    res = Riconv(obj, &inbuf, &inb, &outbuf, &outb);
    if (res == (size_t) -1 && errno == E2BIG) {
        /* output buffer full: double it and start over */
        R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
        goto top_of_loop;
    } else if (res == (size_t) -1 && (errno == EILSEQ || errno == EINVAL)) {
        if (outb < 5) {
            R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
            goto top_of_loop;
        }
        /* replace the offending byte by <xx> and carry on after it */
        snprintf(outbuf, 5, "<%02x>", (unsigned char)*inbuf);
        outbuf += 4; outb -= 4;
        inbuf++; inb--;
        goto next_char;
    }
    *outbuf = '\0';
    Riconv_close(obj);

    /* Hand back an R_alloc-ed copy and release the work buffer. */
    res = strlen(cbuff.data) + 1;
    p = R_alloc(res, 1);
    memcpy(p, cbuff.data, res);
    R_FreeStringBuffer(&cbuff);
    return p;
}
static int FileReadConsole(const char *prompt, char *buf, int len, int addhistory) { int ll, err = 0; if (!R_Slave) { fputs(prompt, stdout); fflush(stdout); } if (fgets(buf, len, ifp ? ifp : stdin) == NULL) return 0; /* translate if necessary */ if(strlen(R_StdinEnc) && strcmp(R_StdinEnc, "native.enc")) { size_t res, inb = strlen(buf), onb = len; const char *ib = buf; char obuf[len+1], *ob = obuf; if(!cd) { cd = Riconv_open("", R_StdinEnc); if(cd == (void *)-1) error(_("encoding '%s' is not recognised"), R_StdinEnc); } res = Riconv(cd, &ib, &inb, &ob, &onb); *ob = '\0'; err = (res == (size_t)(-1)); /* errors lead to part of the input line being ignored */ if(err) printf(_("<ERROR: re-encoding failure from encoding '%s'>\n"), R_StdinEnc); strncpy(buf, obuf, len); } /* according to system.txt, should be terminated in \n, so check this at eof or error */ ll = strlen(buf); if ((err || feof(ifp ? ifp: stdin)) && buf[ll - 1] != '\n' && ll < len) { buf[ll++] = '\n'; buf[ll] = '\0'; } if (!R_Interactive && !R_Slave) { fputs(buf, stdout); fflush(stdout); } return 1; }
/* made available for use in graphics devices */ size_t ucstoutf8(char *s, const unsigned int wc) { char buf[16]; void *cd = NULL ; unsigned int wcs[2]; const char *inbuf = (const char *) wcs; size_t inbytesleft = sizeof(unsigned int); /* better be 4 */ char *outbuf = buf; size_t outbytesleft = sizeof(buf); size_t status; if(wc == 0) {*s = '\0'; return 1;} memset(buf, 0, sizeof(buf)); wcs[0] = wc; wcs[1] = 0; if(ucsutf8_obj == NULL) { if((void *)(-1) == (cd = Riconv_open("UTF-8", UNICODE))) { error(_("unsupported conversion from '%s' to '%s'"), UNICODE, "UTF-8"); return (size_t)(-1); } ucsutf8_obj = cd; } status = Riconv(ucsutf8_obj, &inbuf, &inbytesleft, &outbuf, &outbytesleft); if (status == (size_t) -1) { switch(errno){ case E2BIG: break; default: error(_("invalid Unicode point %u"), wc); return (size_t) -1; // Not reached } } *outbuf = '\0'; strcpy(s, buf); return strlen(buf); }
static int FileReadConsole(char *prompt, char *buf, int len, int addhistory) { int ll, err = 0; char inbuf[1001]; if (!R_Slave) { fputs(prompt, stdout); fflush(stdout); } if (fgets(inbuf, len, stdin) == NULL) return 0; /* translate if necessary */ if(strlen(R_StdinEnc) && strcmp(R_StdinEnc, "native.enc")) { size_t res, inb = strlen(inbuf), onb = len; char *ib = inbuf, *ob = buf; if(!cd) { cd = Riconv_open("", R_StdinEnc); if(!cd) error(_("encoding '%s' is not recognised"), R_StdinEnc); } res = Riconv(cd, &ib, &inb, &ob, &onb); *ob = '\0'; err = res == (size_t)(-1); /* errors lead to part of the input line being ignored */ if(err) fputs(_("<ERROR: invalid input in encoding> "), stdout); } else strncpy(buf, inbuf, strlen(inbuf)+1); /* according to system.txt, should be terminated in \n, so check this at eof or error */ ll = strlen((char *)buf); if ((err || feof(stdin)) && buf[ll - 1] != '\n' && ll < len) { buf[ll++] = '\n'; buf[ll] = '\0'; } if (!R_Interactive && !R_Slave) fputs(buf, stdout); return 1; }
/*
 * Rstd_ReadConsole: obtain one line of console input in buf.
 *
 * prompt        text shown before reading.
 * buf, len      destination and its size as passed to fgets().
 * addtohistory  forwarded to the readline machinery (interactive use).
 *
 * Non-interactive use (R_Interactive unset): reads a line from 'ifp' (or
 * stdin), collapses a trailing CRLF to LF, re-encodes from R_StdinEnc when
 * that is set and is not "native.enc", makes sure the line ends in '\n'
 * at EOF or after a conversion error, and echoes it unless R_Slave is set.
 *
 * Interactive use: waits in the input-handler loop until stdin is ready
 * and then reads either through readline callbacks (when built with
 * HAVE_LIBREADLINE and UsingReadline is set) or with fgets().
 *
 * Returns 1 when a line was delivered and 0 on end of input.  Raises an R
 * error when the stdin encoding is not recognised.
 *
 * Safety properties of the non-interactive path:
 *  - the line length is recomputed after re-encoding, so the newline check
 *    looks at the converted text rather than the original byte count;
 *  - the conversion output is capped both by len - 1 and by the size of
 *    the local staging buffer, so a large len cannot overrun it and the
 *    copy back into buf is always NUL-terminated;
 *  - a converter that failed to open is not cached in 'cd'.
 */
int attribute_hidden
Rstd_ReadConsole(const char *prompt, unsigned char *buf, int len,
                 int addtohistory)
{
    if(!R_Interactive) {
        size_t ll;
        int err = 0;
        if (!R_Slave) {
            fputs(prompt, stdout);
            fflush(stdout); /* make sure prompt is output */
        }
        if (fgets((char *)buf, len, ifp ? ifp: stdin) == NULL)
            return 0;
        ll = strlen((char *)buf);
        /* remove CR in CRLF ending */
        if (ll >= 2 && buf[ll - 1] == '\n' && buf[ll - 2] == '\r') {
            buf[ll - 2] = '\n';
            buf[--ll] = '\0';
        }
        /* translate if necessary */
        if(strlen(R_StdinEnc) && strcmp(R_StdinEnc, "native.enc")) {
            size_t res, inb = strlen((char *)buf), onb;
            char obuf[CONSOLE_BUFFER_SIZE+1];
            const char *ib = (const char *)buf;
            char *ob = obuf;

            /* Contributed code may pass a len larger than R's own callers
               do, so bound the output by the staging buffer as well as by
               the room available in buf (keeping one byte for the NUL). */
            onb = (len > 0) ? (size_t)(len - 1) : 0;
            if (onb > (size_t) CONSOLE_BUFFER_SIZE)
                onb = (size_t) CONSOLE_BUFFER_SIZE;

            if(!cd) {
                void *newcd = Riconv_open("", R_StdinEnc);
                if(newcd == (void *)-1)
                    error(_("encoding '%s' is not recognised"), R_StdinEnc);
                cd = newcd;     /* cache only a usable descriptor */
            }
            res = Riconv(cd, &ib, &inb, &ob, &onb);
            *ob = '\0';
            err = res == (size_t)(-1);
            /* errors lead to part of the input line being ignored */
            if(err)
                printf(_("<ERROR: re-encoding failure from encoding '%s'>\n"),
                       R_StdinEnc);
            strncpy((char *)buf, obuf, len);
            ll = strlen((char *)buf);   /* length of the converted line */
        }
        /* according to system.txt, should be terminated in \n, so check this
           at eof and error */
        /* NOTE(review): with ll == len - 1 the terminator below lands at
           buf[len]; presumably callers pass a buffer of len + 1 bytes --
           confirm before tightening this bound. */
        if ((err || feof(ifp ? ifp : stdin))
            && (ll == 0 || buf[ll - 1] != '\n') && ll < (size_t) len) {
            buf[ll++] = '\n';
            buf[ll] = '\0';
        }
        if (!R_Slave) {
            fputs((char *)buf, stdout);
            fflush(stdout);
        }
        return 1;
    }
    else {
#ifdef HAVE_LIBREADLINE
        R_ReadlineData rl_data;
        if (UsingReadline) {
            /* Push a new readline state on the rl_top stack. */
            rl_data.readline_gotaline = 0;
            rl_data.readline_buf = buf;
            rl_data.readline_addtohistory = addtohistory;
            rl_data.readline_len = len;
            rl_data.readline_eof = 0;
            rl_data.prev = rl_top;
            rl_top = &rl_data;
            /* Allow conditional parsing of the ~/.inputrc file. */
            rl_readline_name = "R";
            pushReadline(prompt, readline_handler);
#ifdef HAVE_RL_COMPLETION_MATCHES
            initialize_rlcompletion();
#endif
        }
        else
#endif /* HAVE_LIBREADLINE */
        {
            fputs(prompt, stdout);
            fflush(stdout);
        }

        if(R_InputHandlers == NULL)
            initStdinHandler();

        for (;;) {
            fd_set *what;

            /* Use the smaller of the two positive wait intervals. */
            int wt = -1;
            if (R_wait_usec > 0) wt = R_wait_usec;
            if (Rg_wait_usec > 0 && (wt < 0 || wt > Rg_wait_usec))
                wt = Rg_wait_usec;
            what = R_checkActivityEx(wt, 0, handleInterrupt);
            /* This is slightly clumsy. We have advertised the
             * convention that R_wait_usec == 0 means "wait forever",
             * but we also need to enable R_checkActivity to return
             * immediately. */

            R_runHandlers(R_InputHandlers, what);
            if (what == NULL)
                continue;
            if (FD_ISSET(fileno(stdin), what)) {
                /* We could make this a regular handler, but we need
                 * to pass additional arguments. */
#ifdef HAVE_LIBREADLINE
                if (UsingReadline) {
                    rl_callback_read_char();
                    if(rl_data.readline_eof || rl_data.readline_gotaline) {
                        rl_top = rl_data.prev;
                        return(rl_data.readline_eof ? 0 : 1);
                    }
                }
                else
#endif /* HAVE_LIBREADLINE */
                {
                    if(fgets((char *)buf, len, stdin) == NULL)
                        return 0;
                    else
                        return 1;
                }
            }
        }
    }
}
/*
 * installTrChar: install() the contents of the CHARSXP 'x' as a symbol,
 * translating from its marked encoding first when needed.
 *
 * The bytes are installed unchanged when x is NA_STRING, has no known
 * encoding, is ASCII, or is already in the session encoding (UTF-8 mark
 * with utf8locale, latin1 mark with latin1locale).  Otherwise they are
 * converted to the "" encoding using a converter cached in latin1_obj or
 * utf8_obj.  Input that cannot be converted is written as "<U+XXXX>"
 * (or "<U+XXXXXXXX>" for code points >= 65536 on non-Windows builds) when
 * the source is UTF-8 and a character can be decoded, and as a "<xx>" byte
 * escape otherwise.
 *
 * Raises an R error if x is not a CHARSXP, is marked "bytes", or the
 * converter cannot be opened.  The open-failure message names the source
 * encoding actually requested.
 */
SEXP installTrChar(SEXP x)
{
    void * obj;
    const char *inbuf, *ans = CHAR(x);
    char *outbuf;
    size_t inb, outb, res;
    cetype_t ienc = getCharCE(x);
    R_StringBuffer cbuff = {NULL, 0, MAXELTSIZE};

    if(TYPEOF(x) != CHARSXP)
        error(_("'%s' must be called on a CHARSXP"), "installTrChar");
    if(x == NA_STRING || !(ENC_KNOWN(x))) return install(ans);
    if(IS_BYTES(x))
        error(_("translating strings with \"bytes\" encoding is not allowed"));
    if(utf8locale && IS_UTF8(x)) return install(ans);
    if(latin1locale && IS_LATIN1(x)) return install(ans);
    if(IS_ASCII(x)) return install(ans);

    /* Pick (and lazily create) the cached converter for the source. */
    if(IS_LATIN1(x)) {
        if(!latin1_obj) {
            obj = Riconv_open("", "latin1");
            /* should never happen */
            if(obj == (void *)(-1))
#ifdef Win32
                error(_("unsupported conversion from '%s' in codepage %d"),
                      "latin1", localeCP);
#else
                error(_("unsupported conversion from '%s' to '%s'"),
                      "latin1", "");
#endif
            latin1_obj = obj;
        }
        obj = latin1_obj;
    } else {
        if(!utf8_obj) {
            obj = Riconv_open("", "UTF-8");
            /* should never happen */
            if(obj == (void *)(-1))
#ifdef Win32
                error(_("unsupported conversion from '%s' in codepage %d"),
                      "UTF-8", localeCP);
#else
                error(_("unsupported conversion from '%s' to '%s'"),
                      "UTF-8", "");
#endif
            utf8_obj = obj;
        }
        obj = utf8_obj;
    }

    R_AllocStringBuffer(0, &cbuff);
top_of_loop:
    /* (Re)start the conversion from the beginning of the input. */
    inbuf = ans; inb = strlen(inbuf);
    outbuf = cbuff.data; outb = cbuff.bufsize - 1;
    /* First initialize output */
    Riconv (obj, NULL, NULL, &outbuf, &outb);
next_char:
    /* Then convert input */
    res = Riconv(obj, &inbuf , &inb, &outbuf, &outb);
    if(res == (size_t) -1 && errno == E2BIG) {
        R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
        goto top_of_loop;
    } else if(res == (size_t) -1 && (errno == EILSEQ || errno == EINVAL)) {
        int escaped = 0;    /* set once a <U+...> escape has been written */

        /* 13 bytes covers the longest escape, "<U+XXXXXXXX>" plus NUL */
        if(outb < 13) {
            R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
            goto top_of_loop;
        }
        if (ienc == CE_UTF8) {
            /* if starting in UTF-8, use \uxxxx */
            /* This must be the first byte */
            size_t clen;
            wchar_t wc;
            clen = utf8toucs(&wc, inbuf);
            if(clen > 0 && inb >= clen) {
                inbuf += clen; inb -= clen;
# ifndef Win32
                if((unsigned int) wc < 65536) {
# endif
                    snprintf(outbuf, 9, "<U+%04X>", (unsigned int) wc);
                    outbuf += 8; outb -= 8;
# ifndef Win32
                } else {
                    snprintf(outbuf, 13, "<U+%08X>", (unsigned int) wc);
                    outbuf += 12; outb -= 12;
                }
# endif
                escaped = 1;
            }
        }
        if (!escaped) {
            /* fall back to escaping a single byte */
            snprintf(outbuf, 5, "<%02x>", (unsigned char)*inbuf);
            outbuf += 4; outb -= 4;
            inbuf++; inb--;
        }
        goto next_char;
    }
    *outbuf = '\0';
    SEXP Sans = install(cbuff.data);
    R_FreeStringBuffer(&cbuff);
    return Sans;
}
/*
 * iconv(x, from, to, sub, mark, toRaw)
 *
 * With x NULL: returns the encoding names reported by iconvlist() when
 * built with HAVE_ICONVLIST, otherwise R_NilValue.
 *
 * Otherwise x is a character vector or a list of NULL/raw vectors and each
 * element is converted from encoding 'from' to encoding 'to':
 *   sub    NA: an element that fails to convert becomes NA (or is left
 *          untouched in the list result, see the note in the loop);
 *          "byte": each offending input byte is replaced by "<xx>";
 *          anything else: each offending byte is replaced by that string.
 *   mark   when true, results are marked latin1/UTF-8 according to 'to'.
 *   toRaw  when true the result is a list of raw vectors instead of a
 *          character vector.
 *
 * Raises an R error for invalid arguments, an unsupported conversion, or
 * an x of the wrong type.
 */
SEXP attribute_hidden do_iconv(SEXP call, SEXP op, SEXP args, SEXP env)
{
    SEXP ans, x = CAR(args), si;
    void * obj;
    const char *inbuf;
    char *outbuf;
    const char *sub;
    size_t inb, outb, res;
    R_StringBuffer cbuff = {NULL, 0, MAXELTSIZE};
    Rboolean isRawlist = FALSE;

    checkArity(op, args);
    if(isNull(x)) {  /* list locales */
#ifdef HAVE_ICONVLIST
        /* two passes over iconvlist(): count the entries, then fill ans */
        cnt = 0;
        iconvlist(count_one, NULL);
        PROTECT(ans = allocVector(STRSXP, cnt));
        cnt = 0;
        iconvlist(write_one, (void *)ans);
#else
        PROTECT(ans = R_NilValue);
#endif
    } else {
        int mark, toRaw;
        const char *from, *to;
        Rboolean isLatin1 = FALSE, isUTF8 = FALSE;

        /* Walk the remaining arguments in order: from, to, sub, mark,
           toRaw. */
        args = CDR(args);
        if(!isString(CAR(args)) || length(CAR(args)) != 1)
            error(_("invalid '%s' argument"), "from");
        from = CHAR(STRING_ELT(CAR(args), 0)); /* ASCII */
        args = CDR(args);
        if(!isString(CAR(args)) || length(CAR(args)) != 1)
            error(_("invalid '%s' argument"), "to");
        to = CHAR(STRING_ELT(CAR(args), 0));
        args = CDR(args);
        if(!isString(CAR(args)) || length(CAR(args)) != 1)
            error(_("invalid '%s' argument"), "sub");
        if(STRING_ELT(CAR(args), 0) == NA_STRING) sub = NULL;
        else sub = translateChar(STRING_ELT(CAR(args), 0));
        args = CDR(args);
        mark = asLogical(CAR(args));
        if(mark == NA_LOGICAL)
            error(_("invalid '%s' argument"), "mark");
        args = CDR(args);
        toRaw = asLogical(CAR(args));
        if(toRaw == NA_LOGICAL)
            error(_("invalid '%s' argument"), "toRaw");
        /* some iconv's allow "UTF8", but libiconv does not */
        if(streql(from, "UTF8") || streql(from, "utf8") ) from = "UTF-8";
        if(streql(to, "UTF8") || streql(to, "utf8") ) to = "UTF-8";
        /* Should we do something about marked CHARSXPs in 'from = ""'? */
        /* Work out which encoding mark (if any) the results can carry. */
        if(streql(to, "UTF-8")) isUTF8 = TRUE;
        if(streql(to, "latin1") || streql(to, "ISO_8859-1")
           || streql(to, "CP1252")) isLatin1 = TRUE;
        if(streql(to, "") && known_to_be_latin1) isLatin1 = TRUE;
        if(streql(to, "") && known_to_be_utf8) isUTF8 = TRUE;
        obj = Riconv_open(to, from);
        if(obj == (iconv_t)(-1))
#ifdef Win32
            error(_("unsupported conversion from '%s' to '%s' in codepage %d"),
                  from, to, localeCP);
#else
            error(_("unsupported conversion from '%s' to '%s'"), from, to);
#endif
        /* Allocate the result: same container type as x when the element
           type is unchanged, otherwise a fresh vector carrying x's
           attributes. */
        isRawlist = (TYPEOF(x) == VECSXP);
        if(isRawlist) {
            if(toRaw)
                PROTECT(ans = duplicate(x));
            else {
                PROTECT(ans = allocVector(STRSXP, LENGTH(x)));
                DUPLICATE_ATTRIB(ans, x);
            }
        } else {
            if(TYPEOF(x) != STRSXP)
                error(_("'x' must be a character vector"));
            if(toRaw) {
                PROTECT(ans = allocVector(VECSXP, LENGTH(x)));
                DUPLICATE_ATTRIB(ans, x);
            } else
                PROTECT(ans = duplicate(x));
        }
        R_AllocStringBuffer(0, &cbuff);  /* 0 -> default */
        for(R_xlen_t i = 0; i < XLENGTH(x); i++) {
            /* Fetch element i; NULL / NA elements are passed through. */
            if (isRawlist) {
                si = VECTOR_ELT(x, i);
                if (TYPEOF(si) == NILSXP) {
                    if (!toRaw) SET_STRING_ELT(ans, i, NA_STRING);
                    continue;
                } else if (TYPEOF(si) != RAWSXP)
                    error(_("'x' must be a list of NULL or raw vectors"));
            } else {
                si = STRING_ELT(x, i);
                if (si == NA_STRING) {
                    if(!toRaw) SET_STRING_ELT(ans, i, NA_STRING);
                    continue;
                }
            }
        top_of_loop:
            /* (Re)start the conversion of this element from its first
               byte; reached again whenever the work buffer was enlarged. */
            inbuf = isRawlist ? (const char *) RAW(si) : CHAR(si);
            inb = LENGTH(si);
            outbuf = cbuff.data; outb = cbuff.bufsize - 1;
            /* First initialize output */
            Riconv (obj, NULL, NULL, &outbuf, &outb);
        next_char:
            /* Then convert input */
            res = Riconv(obj, &inbuf , &inb, &outbuf, &outb);
            *outbuf = '\0';
            /* other possible error conditions are incomplete
               and invalid multibyte chars */
            if(res == -1 && errno == E2BIG) {
                /* out of room: double the buffer and redo the element */
                R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
                goto top_of_loop;
            } else if(res == -1 && sub &&
                      (errno == EILSEQ || errno == EINVAL)) {
                /* it seems this gets thrown for non-convertible input too */
                if(strcmp(sub, "byte") == 0) {
                    if(outb < 5) {
                        R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
                        goto top_of_loop;
                    }
                    snprintf(outbuf, 5, "<%02x>", (unsigned char)*inbuf);
                    outbuf += 4; outb -= 4;
                } else {
                    size_t j;
                    if(outb < strlen(sub)) {
                        R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
                        goto top_of_loop;
                    }
                    memcpy(outbuf, sub, j = strlen(sub));
                    outbuf += j;
                    outb -= j;
                }
                /* skip the offending input byte and resume after it */
                inbuf++; inb--;
                goto next_char;
            }

            /* Store the element only if all of its input was consumed
               without error. */
            if(toRaw) {
                if(res != -1 && inb == 0) {
                    size_t nout = cbuff.bufsize - 1 - outb;
                    SEXP el = allocVector(RAWSXP, nout);
                    memcpy(RAW(el), cbuff.data, nout);
                    SET_VECTOR_ELT(ans, i, el);
                } /* otherwise is already NULL */
                /* NOTE(review): when x is itself a list and toRaw is set,
                   ans was created by duplicate(x), so a failed element
                   appears to keep a copy of its input bytes rather than
                   NULL -- confirm intended behaviour. */
            } else {
                if(res != -1 && inb == 0) {
                    cetype_t ienc = CE_NATIVE;

                    size_t nout = cbuff.bufsize - 1 - outb;
                    if(mark) {
                        if(isLatin1) ienc = CE_LATIN1;
                        else if(isUTF8) ienc = CE_UTF8;
                    }
                    SET_STRING_ELT(ans, i,
                                   mkCharLenCE(cbuff.data, (int) nout, ienc));
                } else SET_STRING_ELT(ans, i, NA_STRING);
            }
        }
        Riconv_close(obj);
        R_FreeStringBuffer(&cbuff);
    }
    UNPROTECT(1);
    return ans;
}
/*
 * reEnc2: re-encode the string x from encoding ce_in to ce_out, writing
 * the result into the caller's buffer y of ny bytes.
 * (A version avoiding R_alloc for use in the Rgui editor.)
 *
 * y is first filled with a (possibly truncated) NUL-terminated copy of x;
 * that copy is the result whenever no conversion is needed or possible:
 * equal or CE_ANY encodings, a session encoding that already matches, an
 * all-ASCII x, an encoding other than native/latin1/UTF-8, or a converter
 * that cannot be opened.
 *
 * subst selects what happens to input the converter rejects:
 *   1  replace the byte by "<xx>" (hex)
 *   2  replace the byte by '.'
 *   3  replace the byte by '?'
 *   any other value: drop the byte
 *
 * Raises an R error if the converted string, including its terminator,
 * does not fit in ny bytes.  The temporary conversion buffer is released
 * before that error is raised so it does not leak.
 */
void reEnc2(const char *x, char *y, int ny,
            cetype_t ce_in, cetype_t ce_out, int subst)
{
    void * obj;
    const char *inbuf;
    char *outbuf;
    size_t inb, outb, res, top;
    char *tocode = NULL, *fromcode = NULL;
    char buf[20];
    R_StringBuffer cbuff = {NULL, 0, MAXELTSIZE};

    strncpy(y, x, ny);
    y[ny - 1] = '\0';

    if(ce_in == ce_out || ce_in == CE_ANY || ce_out == CE_ANY) return;
    if(utf8locale && ce_in == CE_NATIVE && ce_out == CE_UTF8) return;
    if(utf8locale && ce_out == CE_NATIVE && ce_in == CE_UTF8) return;
    if(latin1locale && ce_in == CE_NATIVE && ce_out == CE_LATIN1) return;
    if(latin1locale && ce_out == CE_NATIVE && ce_in == CE_LATIN1) return;
    if(strIsASCII(x)) return;

    /* buf is shared by both switches; ce_in == ce_out returned above, so
       at most one of them is CE_NATIVE and uses it. */
    switch(ce_in) {
    case CE_NATIVE:
        /* Looks like CP1252 is treated as Latin-1 by iconv */
        snprintf(buf, 20, "CP%d", localeCP);
        fromcode = buf;
        break;
    case CE_LATIN1: fromcode = "CP1252"; break;
    case CE_UTF8:   fromcode = "UTF-8"; break;
    default: return;
    }

    switch(ce_out) {
    case CE_NATIVE:
        /* avoid possible misidentification of CP1250 as LATIN-2 */
        snprintf(buf, 20, "CP%d", localeCP);
        tocode = buf;
        break;
    case CE_LATIN1: tocode = "latin1"; break;
    case CE_UTF8:   tocode = "UTF-8"; break;
    default: return;
    }

    obj = Riconv_open(tocode, fromcode);
    if(obj == (void *)(-1)) return;
    R_AllocStringBuffer(0, &cbuff);
top_of_loop:
    /* (Re)start the conversion from the beginning of the input. */
    inbuf = x; inb = strlen(inbuf);
    outbuf = cbuff.data; top = outb = cbuff.bufsize - 1;
    /* First initialize output */
    Riconv (obj, NULL, NULL, &outbuf, &outb);
next_char:
    /* Then convert input */
    res = Riconv(obj, &inbuf , &inb, &outbuf, &outb);
    if(res == (size_t) -1 && errno == E2BIG) {
        R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
        goto top_of_loop;
    } else if(res == (size_t) -1 && (errno == EILSEQ || errno == EINVAL)) {
        switch(subst) {
        case 1: /* substitute hex */
            if(outb < 5) {
                R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
                goto top_of_loop;
            }
            snprintf(outbuf, 5, "<%02x>", (unsigned char)*inbuf);
            outbuf += 4; outb -= 4;
            break;
        case 2: /* substitute . */
        case 3: /* substitute ? */
            if(outb < 1) {
                R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
                goto top_of_loop;
            }
            *outbuf++ = (subst == 2) ? '.' : '?';
            outb--;
            break;
        default: /* skip byte */
            break;
        }
        /* step over the offending input byte and resume */
        inbuf++; inb--;
        goto next_char;
    }
    Riconv_close(obj);
    *outbuf = '\0';
    res = (top-outb)+1; /* strlen(cbuff.data) + 1; */
    if (res > (size_t) ny) {
        /* error() does not come back here, so release the buffer first */
        R_FreeStringBuffer(&cbuff);
        error("converted string too long for buffer");
    }
    memcpy(y, cbuff.data, res);
    R_FreeStringBuffer(&cbuff);
}
/*
 * reEnc: re-encode the string x from encoding ce_in to encoding ce_out.
 *
 * x itself is returned when nothing needs doing or nothing can be done:
 * equal encodings, CE_ANY on either side, a CE_SYMBOL target, a CE_SYMBOL
 * source with a target other than UTF-8, a session encoding that already
 * matches, an all-ASCII x, an unsupported encoding, or a converter that
 * cannot be opened.  Otherwise the result is R_alloc-ed, so the caller has
 * to manage the R_alloc stack.
 *
 * subst selects what happens to input the converter rejects:
 *   1  replace the byte by "<xx>" (hex)
 *   2  replace the byte by '.'
 *   3  replace the byte by '?'
 *   any other value: drop the byte
 */
const char *reEnc(const char *x, cetype_t ce_in, cetype_t ce_out, int subst)
{
    void * obj;
    const char *inbuf;
    char *outbuf, *p;
    size_t inb, outb, res, top;
    char *tocode = NULL, *fromcode = NULL;
#ifdef Win32
    char buf[20];
#endif
    R_StringBuffer cbuff = {NULL, 0, MAXELTSIZE};
    int finished = 0;

    /* We can only encode from Symbol to UTF-8 */
    if(ce_in == ce_out || ce_out == CE_SYMBOL ||
       ce_in == CE_ANY || ce_out == CE_ANY)
        return x;
    if(ce_in == CE_SYMBOL) {
        if(ce_out != CE_UTF8)
            return x;
        size_t nc = 3*strlen(x)+1; /* all in BMP */
        p = R_alloc(nc, 1);
        Rf_AdobeSymbol2utf8(p, x, nc);
        return p;
    }

    if(utf8locale) {
        if(ce_in == CE_NATIVE && ce_out == CE_UTF8) return x;
        if(ce_out == CE_NATIVE && ce_in == CE_UTF8) return x;
    }
    if(latin1locale) {
        if(ce_in == CE_NATIVE && ce_out == CE_LATIN1) return x;
        if(ce_out == CE_NATIVE && ce_in == CE_LATIN1) return x;
    }

    if(strIsASCII(x)) return x;

    /* Name of the source encoding. */
    switch(ce_in) {
#ifdef Win32
    case CE_NATIVE:
        /* Looks like CP1252 is treated as Latin-1 by iconv */
        snprintf(buf, 20, "CP%d", localeCP);
        fromcode = buf;
        break;
    case CE_LATIN1: fromcode = "CP1252"; break;
#else
    case CE_NATIVE: fromcode = ""; break;
    case CE_LATIN1: fromcode = "latin1"; break;
#endif
    case CE_UTF8:   fromcode = "UTF-8"; break;
    default: return x;
    }

    /* Name of the target encoding. */
    switch(ce_out) {
#ifdef Win32
    case CE_NATIVE:
        /* avoid possible misidentification of CP1250 as LATIN-2 */
        snprintf(buf, 20, "CP%d", localeCP);
        tocode = buf;
        break;
#else
    case CE_NATIVE: tocode = ""; break;
#endif
    case CE_LATIN1: tocode = "latin1"; break;
    case CE_UTF8:   tocode = "UTF-8"; break;
    default: return x;
    }

    obj = Riconv_open(tocode, fromcode);
    if(obj == (void *)(-1)) return x;
    R_AllocStringBuffer(0, &cbuff);

    /* Outer loop: one pass per work-buffer size.  Inner loop: convert,
       patching over rejected bytes, until done or out of room. */
    while(!finished) {
        inbuf = x;
        inb = strlen(inbuf);
        outbuf = cbuff.data;
        top = outb = cbuff.bufsize - 1;
        /* First initialize output */
        Riconv (obj, NULL, NULL, &outbuf, &outb);

        for(;;) {
            size_t need;

            /* Then convert input */
            res = Riconv(obj, &inbuf , &inb, &outbuf, &outb);
            if(res != (size_t) -1) {
                finished = 1;
                break;
            }
            if(errno == E2BIG) {
                R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
                break;          /* start over with the larger buffer */
            }
            if(errno != EILSEQ && errno != EINVAL) {
                finished = 1;   /* any other failure: keep what we have */
                break;
            }

            /* Room required by the chosen substitution. */
            need = (subst == 1) ? 5 : (subst == 2 || subst == 3) ? 1 : 0;
            if(outb < need) {
                R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
                break;          /* start over with the larger buffer */
            }
            if(subst == 1) {            /* substitute hex */
                snprintf(outbuf, 5, "<%02x>", (unsigned char)*inbuf);
                outbuf += 4;
                outb -= 4;
            } else if(subst == 2) {     /* substitute . */
                *outbuf++ = '.';
                outb--;
            } else if(subst == 3) {     /* substitute ? */
                *outbuf++ = '?';
                outb--;
            }                           /* otherwise: skip byte */
            inbuf++;
            inb--;
        }
    }

    Riconv_close(obj);
    *outbuf = '\0';
    res = (top-outb)+1; /* strlen(cbuff.data) + 1; */
    p = R_alloc(res, 1);
    memcpy(p, cbuff.data, res);
    R_FreeStringBuffer(&cbuff);
    return p;
}
/*
 * wtransChar: convert the contents of the CHARSXP 'x' to a wide string.
 *
 * The source encoding is "latin1" or "UTF-8" according to the mark on x
 * (using converters cached in latin1_wobj / utf8_wobj), otherwise "" with
 * a converter opened and closed for this call.  The target is TO_WCHAR.
 * Input bytes the converter rejects are replaced by the four wide
 * characters "<xx>" (lower-case hex of the byte).
 *
 * This may return a R_alloc-ed result, so the caller has to manage the
 * R_alloc stack.  (Hidden, but not on Windows, where it was used in
 * tcltk.c.)
 *
 * Raises an R error if x is not a CHARSXP, is marked "bytes", or the
 * converter cannot be opened.  The open-failure message names the source
 * encoding actually requested.
 */
attribute_hidden /* but not hidden on Windows, where it was used in tcltk.c */
const wchar_t *wtransChar(SEXP x)
{
    static const wchar_t hexdigit[] = L"0123456789abcdef";
    void * obj;
    const char *inbuf, *ans = CHAR(x);
    char *outbuf;
    wchar_t *p;
    size_t inb, outb, res, top;
    Rboolean knownEnc = FALSE;
    R_StringBuffer cbuff = {NULL, 0, MAXELTSIZE};

    if(TYPEOF(x) != CHARSXP)
        error(_("'%s' must be called on a CHARSXP"), "wtransChar");

    if(IS_BYTES(x))
        error(_("translating strings with \"bytes\" encoding is not allowed"));

    if(IS_LATIN1(x)) {
        if(!latin1_wobj) {
            obj = Riconv_open(TO_WCHAR, "latin1");
            if(obj == (void *)(-1))
                error(_("unsupported conversion from '%s' to '%s'"),
                      "latin1", TO_WCHAR);
            latin1_wobj = obj;
        } else
            obj = latin1_wobj;
        knownEnc = TRUE;
    } else if(IS_UTF8(x)) {
        if(!utf8_wobj) {
            obj = Riconv_open(TO_WCHAR, "UTF-8");
            if(obj == (void *)(-1))
                error(_("unsupported conversion from '%s' to '%s'"),
                      "UTF-8", TO_WCHAR);
            utf8_wobj = obj;
        } else
            obj = utf8_wobj;
        knownEnc = TRUE;
    } else {
        obj = Riconv_open(TO_WCHAR, "");
        if(obj == (void *)(-1))
#ifdef Win32
            error(_("unsupported conversion to '%s' from codepage %d"),
                  TO_WCHAR, localeCP);
#else
            error(_("unsupported conversion from '%s' to '%s'"), "", TO_WCHAR);
#endif
    }

    R_AllocStringBuffer(0, &cbuff);
top_of_loop:
    /* (Re)start the conversion from the beginning of the input. */
    inbuf = ans; inb = strlen(inbuf);
    outbuf = cbuff.data; top = outb = cbuff.bufsize - 1;
    /* First initialize output */
    Riconv (obj, NULL, NULL, &outbuf, &outb);
next_char:
    /* Then convert input */
    res = Riconv(obj, &inbuf , &inb, &outbuf, &outb);
    if(res == (size_t) -1 && errno == E2BIG) {
        R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
        goto top_of_loop;
    } else if(res == (size_t) -1 && (errno == EILSEQ || errno == EINVAL)) {
        /* The output is a sequence of wchar_t, so the escape has to be
           written as four wide characters, not four bytes. */
        const size_t need = 4 * sizeof(wchar_t);
        unsigned char byte;
        wchar_t esc[4];

        if(outb < need) {
            R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
            goto top_of_loop;
        }
        byte = (unsigned char)*inbuf;
        esc[0] = L'<';
        esc[1] = hexdigit[byte >> 4];
        esc[2] = hexdigit[byte & 0x0F];
        esc[3] = L'>';
        memcpy(outbuf, esc, need);
        outbuf += need; outb -= need;
        inbuf++; inb--;
        goto next_char;
    }
    /* only the uncached converter belongs to this call */
    if(!knownEnc) Riconv_close(obj);
    res = (top - outb);
    /* terminator is 2 or 4 null bytes */
    p = (wchar_t *) R_alloc(res+4, 1);
    memset(p, 0, res+4);
    memcpy(p, cbuff.data, res);
    R_FreeStringBuffer(&cbuff);
    return p;
}