static void substr(char *buf, const char *str, int ienc, int sa, int so) { /* Store the substring str [sa:so] into buf[] */ int i, j, used; if (ienc == CE_UTF8) { const char *end = str + strlen(str); for (i = 0; i < so && str < end; i++) { int used = utf8clen(*str); if (i < sa - 1) { str += used; continue; } for (j = 0; j < used; j++) *buf++ = *str++; } } else if (ienc == CE_LATIN1 || ienc == CE_BYTES) { for (str += (sa - 1), i = sa; i <= so; i++) *buf++ = *str++; } else { if (mbcslocale && !strIsASCII(str)) { const char *end = str + strlen(str); mbstate_t mb_st; mbs_init(&mb_st); for (i = 1; i < sa; i++) str += Mbrtowc(NULL, str, MB_CUR_MAX, &mb_st); for (i = sa; i <= so && str < end; i++) { used = (int) Mbrtowc(NULL, str, MB_CUR_MAX, &mb_st); for (j = 0; j < used; j++) *buf++ = *str++; } } else for (str += (sa - 1), i = sa; i <= so; i++) *buf++ = *str++; } *buf = '\0'; }
SEXP readClipboard(SEXP sformat, SEXP sraw) { SEXP ans = R_NilValue; HGLOBAL hglb; const char *pc; int j, format, raw, size; format = asInteger(sformat); raw = asLogical(sraw); if(OpenClipboard(NULL)) { if(IsClipboardFormatAvailable(format) && (hglb = GetClipboardData(format)) && (pc = (const char *) GlobalLock(hglb))) { if(raw) { Rbyte *pans; size = GlobalSize(hglb); ans = allocVector(RAWSXP, size); /* no R allocation below */ pans = RAW(ans); for (j = 0; j < size; j++) pans[j] = *pc++; } else if (format == CF_UNICODETEXT) { int n, ienc = CE_NATIVE; const wchar_t *wpc = (wchar_t *) pc; n = wcslen(wpc); char text[2 * (n+1)]; /* UTF-8 is at most 1.5x longer */ R_CheckStack(); wcstoutf8(text, wpc, n+1); if(!strIsASCII(text)) ienc = CE_UTF8; ans = splitClipboardText(text, ienc); } else if (format == CF_TEXT || format == CF_OEMTEXT || format == CF_DIF) { /* can we get the encoding out of a CF_LOCALE entry? */ ans = splitClipboardText(pc, 0); } else error("'raw = FALSE' and format is a not a known text format"); GlobalUnlock(hglb); } CloseClipboard(); } return ans; }
SEXP attribute_hidden do_abbrev(SEXP call, SEXP op, SEXP args, SEXP env) { SEXP x, ans; R_xlen_t i, len; int minlen; Rboolean warn = FALSE; const char *s; const void *vmax; checkArity(op,args); x = CAR(args); if (!isString(x)) error(_("the first argument must be a character vector")); len = XLENGTH(x); PROTECT(ans = allocVector(STRSXP, len)); minlen = asInteger(CADR(args)); vmax = vmaxget(); for (i = 0 ; i < len ; i++) { if (STRING_ELT(x, i) == NA_STRING) SET_STRING_ELT(ans, i, NA_STRING); else { s = translateChar(STRING_ELT(x, i)); if(strlen(s) > minlen) { warn = warn | !strIsASCII(s); R_AllocStringBuffer(strlen(s), &cbuff); SET_STRING_ELT(ans, i, stripchars(s, minlen)); } else SET_STRING_ELT(ans, i, mkChar(s)); } vmaxset(vmax); } if (warn) warning(_("abbreviate used with non-ASCII chars")); DUPLICATE_ATTRIB(ans, x); /* This copied the class, if any */ R_FreeStringBufferL(&cbuff); UNPROTECT(1); return(ans); }
SEXP attribute_hidden do_substrgets(SEXP call, SEXP op, SEXP args, SEXP env) { SEXP s, x, sa, so, value, el, v_el; R_xlen_t i, len; int start, stop, k, l, v; size_t slen; cetype_t ienc, venc; const char *ss, *v_ss; char *buf; const void *vmax; checkArity(op, args); x = CAR(args); sa = CADR(args); so = CADDR(args); value = CADDDR(args); k = LENGTH(sa); l = LENGTH(so); if (!isString(x)) error(_("replacing substrings in a non-character object")); len = LENGTH(x); PROTECT(s = allocVector(STRSXP, len)); if (len > 0) { if (!isInteger(sa) || !isInteger(so) || k == 0 || l == 0) error(_("invalid substring arguments")); v = LENGTH(value); if (!isString(value) || v == 0) error(_("invalid value")); vmax = vmaxget(); for (i = 0; i < len; i++) { el = STRING_ELT(x, i); v_el = STRING_ELT(value, i % v); start = INTEGER(sa)[i % k]; stop = INTEGER(so)[i % l]; if (el == NA_STRING || v_el == NA_STRING || start == NA_INTEGER || stop == NA_INTEGER) { SET_STRING_ELT(s, i, NA_STRING); continue; } ienc = getCharCE(el); ss = CHAR(el); slen = strlen(ss); if (start < 1) start = 1; if (stop > slen) stop = (int) slen; /* SBCS optimization */ if (start > stop) { /* just copy element across */ SET_STRING_ELT(s, i, STRING_ELT(x, i)); } else { int ienc2 = ienc; v_ss = CHAR(v_el); /* is the value in the same encoding? FIXME: could prefer UTF-8 here */ venc = getCharCE(v_el); if (venc != ienc && !strIsASCII(v_ss)) { ss = translateChar(el); slen = strlen(ss); v_ss = translateChar(v_el); ienc2 = CE_NATIVE; } /* might expand under MBCS */ buf = R_AllocStringBuffer(slen+strlen(v_ss), &cbuff); strcpy(buf, ss); substrset(buf, v_ss, ienc2, start, stop); SET_STRING_ELT(s, i, mkCharCE(buf, ienc2)); } vmaxset(vmax); } R_FreeStringBufferL(&cbuff); } UNPROTECT(1); return s; }
/* A version avoiding R_alloc for use in the Rgui editor */ void reEnc2(const char *x, char *y, int ny, cetype_t ce_in, cetype_t ce_out, int subst) { void * obj; const char *inbuf; char *outbuf; size_t inb, outb, res, top; char *tocode = NULL, *fromcode = NULL; char buf[20]; R_StringBuffer cbuff = {NULL, 0, MAXELTSIZE}; strncpy(y, x, ny); y[ny - 1] = '\0'; if(ce_in == ce_out || ce_in == CE_ANY || ce_out == CE_ANY) return; if(utf8locale && ce_in == CE_NATIVE && ce_out == CE_UTF8) return; if(utf8locale && ce_out == CE_NATIVE && ce_in == CE_UTF8) return; if(latin1locale && ce_in == CE_NATIVE && ce_out == CE_LATIN1) return; if(latin1locale && ce_out == CE_NATIVE && ce_in == CE_LATIN1) return; if(strIsASCII(x)) return; switch(ce_in) { case CE_NATIVE: { /* Looks like CP1252 is treated as Latin-1 by iconv */ snprintf(buf, 20, "CP%d", localeCP); fromcode = buf; break; } case CE_LATIN1: fromcode = "CP1252"; break; case CE_UTF8: fromcode = "UTF-8"; break; default: return; } switch(ce_out) { case CE_NATIVE: { /* avoid possible misidentification of CP1250 as LATIN-2 */ snprintf(buf, 20, "CP%d", localeCP); tocode = buf; break; } case CE_LATIN1: tocode = "latin1"; break; case CE_UTF8: tocode = "UTF-8"; break; default: return; } obj = Riconv_open(tocode, fromcode); if(obj == (void *)(-1)) return; R_AllocStringBuffer(0, &cbuff); top_of_loop: inbuf = x; inb = strlen(inbuf); outbuf = cbuff.data; top = outb = cbuff.bufsize - 1; /* First initialize output */ Riconv (obj, NULL, NULL, &outbuf, &outb); next_char: /* Then convert input */ res = Riconv(obj, &inbuf , &inb, &outbuf, &outb); if(res == -1 && errno == E2BIG) { R_AllocStringBuffer(2*cbuff.bufsize, &cbuff); goto top_of_loop; } else if(res == -1 && (errno == EILSEQ || errno == EINVAL)) { switch(subst) { case 1: /* substitute hex */ if(outb < 5) { R_AllocStringBuffer(2*cbuff.bufsize, &cbuff); goto top_of_loop; } snprintf(outbuf, 5, "<%02x>", (unsigned char)*inbuf); outbuf += 4; outb -= 4; inbuf++; inb--; goto next_char; break; case 2: /* substitute . */ if(outb < 1) { R_AllocStringBuffer(2*cbuff.bufsize, &cbuff); goto top_of_loop; } *outbuf++ = '.'; inbuf++; outb--; inb--; goto next_char; break; case 3: /* substitute ? */ if(outb < 1) { R_AllocStringBuffer(2*cbuff.bufsize, &cbuff); goto top_of_loop; } *outbuf++ = '?'; inbuf++; outb--; inb--; goto next_char; break; default: /* skip byte */ inbuf++; inb--; goto next_char; } } Riconv_close(obj); *outbuf = '\0'; res = (top-outb)+1; /* strlen(cbuff.data) + 1; */ if (res > ny) error("converted string too long for buffer"); memcpy(y, cbuff.data, res); R_FreeStringBuffer(&cbuff); }
/* This may return a R_alloc-ed result, so the caller has to manage the R_alloc stack */ const char *reEnc(const char *x, cetype_t ce_in, cetype_t ce_out, int subst) { void * obj; const char *inbuf; char *outbuf, *p; size_t inb, outb, res, top; char *tocode = NULL, *fromcode = NULL; #ifdef Win32 char buf[20]; #endif R_StringBuffer cbuff = {NULL, 0, MAXELTSIZE}; /* We can only encode from Symbol to UTF-8 */ if(ce_in == ce_out || ce_out == CE_SYMBOL || ce_in == CE_ANY || ce_out == CE_ANY) return x; if(ce_in == CE_SYMBOL) { if(ce_out == CE_UTF8) { size_t nc = 3*strlen(x)+1; /* all in BMP */ p = R_alloc(nc, 1); Rf_AdobeSymbol2utf8(p, x, nc); return p; } else return x; } if(utf8locale && ce_in == CE_NATIVE && ce_out == CE_UTF8) return x; if(utf8locale && ce_out == CE_NATIVE && ce_in == CE_UTF8) return x; if(latin1locale && ce_in == CE_NATIVE && ce_out == CE_LATIN1) return x; if(latin1locale && ce_out == CE_NATIVE && ce_in == CE_LATIN1) return x; if(strIsASCII(x)) return x; switch(ce_in) { #ifdef Win32 case CE_NATIVE: { /* Looks like CP1252 is treated as Latin-1 by iconv */ snprintf(buf, 20, "CP%d", localeCP); fromcode = buf; break; } case CE_LATIN1: fromcode = "CP1252"; break; #else case CE_NATIVE: fromcode = ""; break; case CE_LATIN1: fromcode = "latin1"; break; #endif case CE_UTF8: fromcode = "UTF-8"; break; default: return x; } switch(ce_out) { #ifdef Win32 case CE_NATIVE: { /* avoid possible misidentification of CP1250 as LATIN-2 */ snprintf(buf, 20, "CP%d", localeCP); tocode = buf; break; } #else case CE_NATIVE: tocode = ""; break; #endif case CE_LATIN1: tocode = "latin1"; break; case CE_UTF8: tocode = "UTF-8"; break; default: return x; } obj = Riconv_open(tocode, fromcode); if(obj == (void *)(-1)) return x; R_AllocStringBuffer(0, &cbuff); top_of_loop: inbuf = x; inb = strlen(inbuf); outbuf = cbuff.data; top = outb = cbuff.bufsize - 1; /* First initialize output */ Riconv (obj, NULL, NULL, &outbuf, &outb); next_char: /* Then convert input */ res = Riconv(obj, &inbuf , &inb, &outbuf, &outb); if(res == -1 && errno == E2BIG) { R_AllocStringBuffer(2*cbuff.bufsize, &cbuff); goto top_of_loop; } else if(res == -1 && (errno == EILSEQ || errno == EINVAL)) { switch(subst) { case 1: /* substitute hex */ if(outb < 5) { R_AllocStringBuffer(2*cbuff.bufsize, &cbuff); goto top_of_loop; } snprintf(outbuf, 5, "<%02x>", (unsigned char)*inbuf); outbuf += 4; outb -= 4; inbuf++; inb--; goto next_char; break; case 2: /* substitute . */ if(outb < 1) { R_AllocStringBuffer(2*cbuff.bufsize, &cbuff); goto top_of_loop; } *outbuf++ = '.'; inbuf++; outb--; inb--; goto next_char; break; case 3: /* substitute ? */ if(outb < 1) { R_AllocStringBuffer(2*cbuff.bufsize, &cbuff); goto top_of_loop; } *outbuf++ = '?'; inbuf++; outb--; inb--; goto next_char; break; default: /* skip byte */ inbuf++; inb--; goto next_char; } } Riconv_close(obj); *outbuf = '\0'; res = (top-outb)+1; /* strlen(cbuff.data) + 1; */ p = R_alloc(res, 1); memcpy(p, cbuff.data, res); R_FreeStringBuffer(&cbuff); return p; }
/* Note that NA_STRING is not handled separately here. This is deliberate -- see ?paste -- and implicitly coerces it to "NA" */ SEXP attribute_hidden do_paste(SEXP call, SEXP op, SEXP args, SEXP env) { SEXP ans, collapse, sep, x; int sepw, u_sepw, ienc; R_xlen_t i, j, k, maxlen, nx, pwidth; const char *s, *cbuf, *csep=NULL, *u_csep=NULL; char *buf; Rboolean allKnown, anyKnown, use_UTF8, use_Bytes, sepASCII = TRUE, sepUTF8 = FALSE, sepBytes = FALSE, sepKnown = FALSE, use_sep = (PRIMVAL(op) == 0); const void *vmax; checkArity(op, args); /* We use formatting and so we must initialize printing. */ PrintDefaults(); /* Check the arguments */ x = CAR(args); if (!isVectorList(x)) error(_("invalid first argument")); nx = xlength(x); if(use_sep) { /* paste(..., sep, .) */ sep = CADR(args); if (!isString(sep) || LENGTH(sep) <= 0 || STRING_ELT(sep, 0) == NA_STRING) error(_("invalid separator")); sep = STRING_ELT(sep, 0); csep = translateChar(sep); u_sepw = sepw = (int) strlen(csep); // will be short sepASCII = strIsASCII(csep); sepKnown = ENC_KNOWN(sep) > 0; sepUTF8 = IS_UTF8(sep); sepBytes = IS_BYTES(sep); collapse = CADDR(args); } else { /* paste0(..., .) */ u_sepw = sepw = 0; sep = R_NilValue;/* -Wall */ collapse = CADR(args); } if (!isNull(collapse)) if(!isString(collapse) || LENGTH(collapse) <= 0 || STRING_ELT(collapse, 0) == NA_STRING) error(_("invalid '%s' argument"), "collapse"); if(nx == 0) return (!isNull(collapse)) ? mkString("") : allocVector(STRSXP, 0); /* Maximum argument length, coerce if needed */ maxlen = 0; for (j = 0; j < nx; j++) { if (!isString(VECTOR_ELT(x, j))) { /* formerly in R code: moved to C for speed */ SEXP call, xj = VECTOR_ELT(x, j); if(OBJECT(xj)) { /* method dispatch */ PROTECT(call = lang2(install("as.character"), xj)); SET_VECTOR_ELT(x, j, eval(call, env)); UNPROTECT(1); } else if (isSymbol(xj)) SET_VECTOR_ELT(x, j, ScalarString(PRINTNAME(xj))); else SET_VECTOR_ELT(x, j, coerceVector(xj, STRSXP)); if (!isString(VECTOR_ELT(x, j))) error(_("non-string argument to internal 'paste'")); } if(xlength(VECTOR_ELT(x, j)) > maxlen) maxlen = xlength(VECTOR_ELT(x, j)); } if(maxlen == 0) return (!isNull(collapse)) ? mkString("") : allocVector(STRSXP, 0); PROTECT(ans = allocVector(STRSXP, maxlen)); for (i = 0; i < maxlen; i++) { /* Strategy for marking the encoding: if all inputs (including * the separator) are ASCII, so is the output and we don't * need to mark. Otherwise if all non-ASCII inputs are of * declared encoding, we should mark. * Need to be careful only to include separator if it is used. */ anyKnown = FALSE; allKnown = TRUE; use_UTF8 = FALSE; use_Bytes = FALSE; if(nx > 1) { allKnown = sepKnown || sepASCII; anyKnown = sepKnown; use_UTF8 = sepUTF8; use_Bytes = sepBytes; } pwidth = 0; for (j = 0; j < nx; j++) { k = xlength(VECTOR_ELT(x, j)); if (k > 0) { SEXP cs = STRING_ELT(VECTOR_ELT(x, j), i % k); if(IS_UTF8(cs)) use_UTF8 = TRUE; if(IS_BYTES(cs)) use_Bytes = TRUE; } } if (use_Bytes) use_UTF8 = FALSE; vmax = vmaxget(); for (j = 0; j < nx; j++) { k = xlength(VECTOR_ELT(x, j)); if (k > 0) { if(use_Bytes) pwidth += strlen(CHAR(STRING_ELT(VECTOR_ELT(x, j), i % k))); else if(use_UTF8) pwidth += strlen(translateCharUTF8(STRING_ELT(VECTOR_ELT(x, j), i % k))); else pwidth += strlen(translateChar(STRING_ELT(VECTOR_ELT(x, j), i % k))); vmaxset(vmax); } } if(use_sep) { if (use_UTF8 && !u_csep) { u_csep = translateCharUTF8(sep); u_sepw = (int) strlen(u_csep); // will be short } pwidth += (nx - 1) * (use_UTF8 ? u_sepw : sepw); } if (pwidth > INT_MAX) error(_("result would exceed 2^31-1 bytes")); cbuf = buf = R_AllocStringBuffer(pwidth, &cbuff); vmax = vmaxget(); for (j = 0; j < nx; j++) { k = xlength(VECTOR_ELT(x, j)); if (k > 0) { SEXP cs = STRING_ELT(VECTOR_ELT(x, j), i % k); if (use_UTF8) { s = translateCharUTF8(cs); strcpy(buf, s); buf += strlen(s); } else { s = use_Bytes ? CHAR(cs) : translateChar(cs); strcpy(buf, s); buf += strlen(s); allKnown = allKnown && (strIsASCII(s) || (ENC_KNOWN(cs)> 0)); anyKnown = anyKnown || (ENC_KNOWN(cs)> 0); } } if (sepw != 0 && j != nx - 1) { if (use_UTF8) { strcpy(buf, u_csep); buf += u_sepw; } else { strcpy(buf, csep); buf += sepw; } } vmax = vmaxget(); } ienc = 0; if(use_UTF8) ienc = CE_UTF8; else if(use_Bytes) ienc = CE_BYTES; else if(anyKnown && allKnown) { if(known_to_be_latin1) ienc = CE_LATIN1; if(known_to_be_utf8) ienc = CE_UTF8; } SET_STRING_ELT(ans, i, mkCharCE(cbuf, ienc)); } /* Now collapse, if required. */ if(collapse != R_NilValue && (nx = XLENGTH(ans)) > 0) { sep = STRING_ELT(collapse, 0); use_UTF8 = IS_UTF8(sep); use_Bytes = IS_BYTES(sep); for (i = 0; i < nx; i++) { if(IS_UTF8(STRING_ELT(ans, i))) use_UTF8 = TRUE; if(IS_BYTES(STRING_ELT(ans, i))) use_Bytes = TRUE; } if(use_Bytes) { csep = CHAR(sep); use_UTF8 = FALSE; } else if(use_UTF8) csep = translateCharUTF8(sep); else csep = translateChar(sep); sepw = (int) strlen(csep); anyKnown = ENC_KNOWN(sep) > 0; allKnown = anyKnown || strIsASCII(csep); pwidth = 0; vmax = vmaxget(); for (i = 0; i < nx; i++) if(use_UTF8) { pwidth += strlen(translateCharUTF8(STRING_ELT(ans, i))); vmaxset(vmax); } else /* already translated */ pwidth += strlen(CHAR(STRING_ELT(ans, i))); pwidth += (nx - 1) * sepw; if (pwidth > INT_MAX) error(_("result would exceed 2^31-1 bytes")); cbuf = buf = R_AllocStringBuffer(pwidth, &cbuff); vmax = vmaxget(); for (i = 0; i < nx; i++) { if(i > 0) { strcpy(buf, csep); buf += sepw; } if(use_UTF8) s = translateCharUTF8(STRING_ELT(ans, i)); else /* already translated */ s = CHAR(STRING_ELT(ans, i)); strcpy(buf, s); while (*buf) buf++; allKnown = allKnown && (strIsASCII(s) || (ENC_KNOWN(STRING_ELT(ans, i)) > 0)); anyKnown = anyKnown || (ENC_KNOWN(STRING_ELT(ans, i)) > 0); if(use_UTF8) vmaxset(vmax); } UNPROTECT(1); ienc = CE_NATIVE; if(use_UTF8) ienc = CE_UTF8; else if(use_Bytes) ienc = CE_BYTES; else if(anyKnown && allKnown) { if(known_to_be_latin1) ienc = CE_LATIN1; if(known_to_be_utf8) ienc = CE_UTF8; } PROTECT(ans = allocVector(STRSXP, 1)); SET_STRING_ELT(ans, 0, mkCharCE(cbuf, ienc)); } R_FreeStringBufferL(&cbuff); UNPROTECT(1); return ans; }