SEXP attribute_hidden do_nchar(SEXP call, SEXP op, SEXP args, SEXP env) { SEXP d, s, x, stype; int nargs = length(args); #ifdef R_version_3_4_or_so checkArity(op, args); #else // will work also for code byte-compiled *before* 'keepNA' was introduced if (nargs < 3 || nargs > 4) errorcall(call, ngettext("%d argument passed to '%s' which requires %d to %d", "%d arguments passed to '%s' which requires %d to %d", (unsigned long) nargs), nargs, PRIMNAME(op), 3, 4); #endif if (isFactor(CAR(args))) error(_("'%s' requires a character vector"), "nchar()"); PROTECT(x = coerceVector(CAR(args), STRSXP)); if (!isString(x)) error(_("'%s' requires a character vector"), "nchar()"); R_xlen_t len = XLENGTH(x); stype = CADR(args); if (!isString(stype) || LENGTH(stype) != 1) error(_("invalid '%s' argument"), "type"); const char *type = CHAR(STRING_ELT(stype, 0)); /* always ASCII */ size_t ntype = strlen(type); if (ntype == 0) error(_("invalid '%s' argument"), "type"); nchar_type type_; if (strncmp(type, "bytes", ntype) == 0) type_ = Bytes; else if (strncmp(type, "chars", ntype) == 0) type_ = Chars; else if (strncmp(type, "width", ntype) == 0) type_ = Width; else error(_("invalid '%s' argument"), "type"); int allowNA = asLogical(CADDR(args)); if (allowNA == NA_LOGICAL) allowNA = 0; int keepNA; if(nargs >= 4) { keepNA = asLogical(CADDDR(args)); if (keepNA == NA_LOGICAL) // default keepNA = (type_ == Width) ? FALSE : TRUE; } else keepNA = FALSE; // default PROTECT(s = allocVector(INTSXP, len)); int *s_ = INTEGER(s); for (R_xlen_t i = 0; i < len; i++) { SEXP sxi = STRING_ELT(x, i); char msg_i[20]; sprintf(msg_i, "element %ld", (long)i+1); s_[i] = R_nchar(sxi, type_, allowNA, keepNA, msg_i); } R_FreeStringBufferL(&cbuff); if ((d = getAttrib(x, R_NamesSymbol)) != R_NilValue) setAttrib(s, R_NamesSymbol, d); if ((d = getAttrib(x, R_DimSymbol)) != R_NilValue) setAttrib(s, R_DimSymbol, d); if ((d = getAttrib(x, R_DimNamesSymbol)) != R_NilValue) setAttrib(s, R_DimNamesSymbol, d); UNPROTECT(2); return s; }
SEXP attribute_hidden do_substr(SEXP call, SEXP op, SEXP args, SEXP env) { SEXP s, x, sa, so, el; R_xlen_t i, len; int start, stop, k, l; size_t slen; cetype_t ienc; const char *ss; char *buf; checkArity(op, args); x = CAR(args); sa = CADR(args); so = CADDR(args); k = LENGTH(sa); l = LENGTH(so); if (!isString(x)) error(_("extracting substrings from a non-character object")); len = XLENGTH(x); PROTECT(s = allocVector(STRSXP, len)); if (len > 0) { if (!isInteger(sa) || !isInteger(so) || k == 0 || l == 0) error(_("invalid substring arguments")); for (i = 0; i < len; i++) { start = INTEGER(sa)[i % k]; stop = INTEGER(so)[i % l]; el = STRING_ELT(x,i); if (el == NA_STRING || start == NA_INTEGER || stop == NA_INTEGER) { SET_STRING_ELT(s, i, NA_STRING); continue; } ienc = getCharCE(el); ss = CHAR(el); slen = strlen(ss); /* FIXME -- should handle embedded nuls */ buf = R_AllocStringBuffer(slen+1, &cbuff); if (start < 1) start = 1; if (start > stop || start > slen) { buf[0] = '\0'; } else { if (stop > slen) stop = (int) slen; substr(buf, ss, ienc, start, stop); } SET_STRING_ELT(s, i, mkCharCE(buf, ienc)); } R_FreeStringBufferL(&cbuff); } DUPLICATE_ATTRIB(s, x); /* This copied the class, if any */ UNPROTECT(1); return s; }
static SEXP cross_colon(SEXP call, SEXP s, SEXP t) { SEXP a, la, ls, lt, rs, rt; int i, j, k, n, nls, nlt; char *cbuf; const void *vmax = vmaxget(); if (length(s) != length(t)) errorcall(call, _("unequal factor lengths")); n = length(s); ls = getLevelsAttrib(s); lt = getLevelsAttrib(t); nls = LENGTH(ls); nlt = LENGTH(lt); PROTECT(a = allocVector(INTSXP, n)); PROTECT(rs = coerceVector(s, INTSXP)); PROTECT(rt = coerceVector(t, INTSXP)); for (i = 0; i < n; i++) { int vs = INTEGER(rs)[i]; int vt = INTEGER(rt)[i]; if ((vs == NA_INTEGER) || (vt == NA_INTEGER)) INTEGER(a)[i] = NA_INTEGER; else INTEGER(a)[i] = vt + (vs - 1) * nlt; } UNPROTECT(2); if (!isNull(ls) && !isNull(lt)) { PROTECT(la = allocVector(STRSXP, nls * nlt)); k = 0; /* FIXME: possibly UTF-8 version */ for (i = 0; i < nls; i++) { const char *vi = translateChar(STRING_ELT(ls, i)); size_t vs = strlen(vi); for (j = 0; j < nlt; j++) { const char *vj = translateChar(STRING_ELT(lt, j)); size_t vt = strlen(vj), len = vs + vt + 2; cbuf = R_AllocStringBuffer(len, &cbuff); snprintf(cbuf, len, "%s:%s", vi, vj); SET_STRING_ELT(la, k, mkChar(cbuf)); k++; } } setAttrib(a, R_LevelsSymbol, la); UNPROTECT(1); } PROTECT(la = mkString("factor")); setAttrib(a, R_ClassSymbol, la); UNPROTECT(2); R_FreeStringBufferL(&cbuff); vmaxset(vmax); return a; }
SEXP attribute_hidden do_abbrev(SEXP call, SEXP op, SEXP args, SEXP env) { SEXP x, ans; R_xlen_t i, len; int minlen; Rboolean warn = FALSE; const char *s; const void *vmax; checkArity(op,args); x = CAR(args); if (!isString(x)) error(_("the first argument must be a character vector")); len = XLENGTH(x); PROTECT(ans = allocVector(STRSXP, len)); minlen = asInteger(CADR(args)); vmax = vmaxget(); for (i = 0 ; i < len ; i++) { if (STRING_ELT(x, i) == NA_STRING) SET_STRING_ELT(ans, i, NA_STRING); else { s = translateChar(STRING_ELT(x, i)); if(strlen(s) > minlen) { warn = warn | !strIsASCII(s); R_AllocStringBuffer(strlen(s), &cbuff); SET_STRING_ELT(ans, i, stripchars(s, minlen)); } else SET_STRING_ELT(ans, i, mkChar(s)); } vmaxset(vmax); } if (warn) warning(_("abbreviate used with non-ASCII chars")); DUPLICATE_ATTRIB(ans, x); /* This copied the class, if any */ R_FreeStringBufferL(&cbuff); UNPROTECT(1); return(ans); }
SEXP attribute_hidden do_substrgets(SEXP call, SEXP op, SEXP args, SEXP env) { SEXP s, x, sa, so, value, el, v_el; R_xlen_t i, len; int start, stop, k, l, v; size_t slen; cetype_t ienc, venc; const char *ss, *v_ss; char *buf; const void *vmax; checkArity(op, args); x = CAR(args); sa = CADR(args); so = CADDR(args); value = CADDDR(args); k = LENGTH(sa); l = LENGTH(so); if (!isString(x)) error(_("replacing substrings in a non-character object")); len = LENGTH(x); PROTECT(s = allocVector(STRSXP, len)); if (len > 0) { if (!isInteger(sa) || !isInteger(so) || k == 0 || l == 0) error(_("invalid substring arguments")); v = LENGTH(value); if (!isString(value) || v == 0) error(_("invalid value")); vmax = vmaxget(); for (i = 0; i < len; i++) { el = STRING_ELT(x, i); v_el = STRING_ELT(value, i % v); start = INTEGER(sa)[i % k]; stop = INTEGER(so)[i % l]; if (el == NA_STRING || v_el == NA_STRING || start == NA_INTEGER || stop == NA_INTEGER) { SET_STRING_ELT(s, i, NA_STRING); continue; } ienc = getCharCE(el); ss = CHAR(el); slen = strlen(ss); if (start < 1) start = 1; if (stop > slen) stop = (int) slen; /* SBCS optimization */ if (start > stop) { /* just copy element across */ SET_STRING_ELT(s, i, STRING_ELT(x, i)); } else { int ienc2 = ienc; v_ss = CHAR(v_el); /* is the value in the same encoding? FIXME: could prefer UTF-8 here */ venc = getCharCE(v_el); if (venc != ienc && !strIsASCII(v_ss)) { ss = translateChar(el); slen = strlen(ss); v_ss = translateChar(v_el); ienc2 = CE_NATIVE; } /* might expand under MBCS */ buf = R_AllocStringBuffer(slen+strlen(v_ss), &cbuff); strcpy(buf, ss); substrset(buf, v_ss, ienc2, start, stop); SET_STRING_ELT(s, i, mkCharCE(buf, ienc2)); } vmaxset(vmax); } R_FreeStringBufferL(&cbuff); } UNPROTECT(1); return s; }
SEXP attribute_hidden do_nchar(SEXP call, SEXP op, SEXP args, SEXP env) { SEXP d, s, x, stype; int i, len, allowNA; size_t ntype; int nc; const char *type; const char *xi; wchar_t *wc; const void *vmax; checkArity(op, args); if (isFactor(CAR(args))) error(_("'%s' requires a character vector"), "nchar()"); PROTECT(x = coerceVector(CAR(args), STRSXP)); if (!isString(x)) error(_("'%s' requires a character vector"), "nchar()"); len = LENGTH(x); stype = CADR(args); if (!isString(stype) || LENGTH(stype) != 1) error(_("invalid '%s' argument"), "type"); type = CHAR(STRING_ELT(stype, 0)); /* always ASCII */ ntype = strlen(type); if (ntype == 0) error(_("invalid '%s' argument"), "type"); allowNA = asLogical(CADDR(args)); if (allowNA == NA_LOGICAL) allowNA = 0; PROTECT(s = allocVector(INTSXP, len)); vmax = vmaxget(); for (i = 0; i < len; i++) { SEXP sxi = STRING_ELT(x, i); if (sxi == NA_STRING) { INTEGER(s)[i] = 2; continue; } if (strncmp(type, "bytes", ntype) == 0) { INTEGER(s)[i] = LENGTH(sxi); } else if (strncmp(type, "chars", ntype) == 0) { if (IS_UTF8(sxi)) { /* assume this is valid */ const char *p = CHAR(sxi); nc = 0; for( ; *p; p += utf8clen(*p)) nc++; INTEGER(s)[i] = nc; } else if (IS_BYTES(sxi)) { if (!allowNA) /* could do chars 0 */ error(_("number of characters is not computable for element %d in \"bytes\" encoding"), i+1); INTEGER(s)[i] = NA_INTEGER; } else if (mbcslocale) { nc = mbstowcs(NULL, translateChar(sxi), 0); if (!allowNA && nc < 0) error(_("invalid multibyte string %d"), i+1); INTEGER(s)[i] = nc >= 0 ? nc : NA_INTEGER; } else INTEGER(s)[i] = strlen(translateChar(sxi)); } else if (strncmp(type, "width", ntype) == 0) { if (IS_UTF8(sxi)) { /* assume this is valid */ const char *p = CHAR(sxi); wchar_t wc1; nc = 0; for( ; *p; p += utf8clen(*p)) { utf8toucs(&wc1, p); nc += Ri18n_wcwidth(wc1); } INTEGER(s)[i] = nc; } else if (IS_BYTES(sxi)) { if (!allowNA) /* could do width 0 */ error(_("width is not computable for element %d in \"bytes\" encoding"), i+1); INTEGER(s)[i] = NA_INTEGER; } else if (mbcslocale) { xi = translateChar(sxi); nc = mbstowcs(NULL, xi, 0); if (nc >= 0) { wc = (wchar_t *) R_AllocStringBuffer((nc+1)*sizeof(wchar_t), &cbuff); mbstowcs(wc, xi, nc + 1); INTEGER(s)[i] = Ri18n_wcswidth(wc, 2147483647); if (INTEGER(s)[i] < 1) INTEGER(s)[i] = nc; } else if (allowNA) error(_("invalid multibyte string %d"), i+1); else INTEGER(s)[i] = NA_INTEGER; } else INTEGER(s)[i] = strlen(translateChar(sxi)); } else error(_("invalid '%s' argument"), "type"); vmaxset(vmax); } R_FreeStringBufferL(&cbuff); if ((d = getAttrib(x, R_NamesSymbol)) != R_NilValue) setAttrib(s, R_NamesSymbol, d); if ((d = getAttrib(x, R_DimSymbol)) != R_NilValue) setAttrib(s, R_DimSymbol, d); if ((d = getAttrib(x, R_DimNamesSymbol)) != R_NilValue) setAttrib(s, R_DimNamesSymbol, d); UNPROTECT(2); return s; }
/* Note that NA_STRING is not handled separately here. This is deliberate -- see ?paste -- and implicitly coerces it to "NA" */ SEXP attribute_hidden do_paste(SEXP call, SEXP op, SEXP args, SEXP env) { SEXP ans, collapse, sep, x; int sepw, u_sepw, ienc; R_xlen_t i, j, k, maxlen, nx, pwidth; const char *s, *cbuf, *csep=NULL, *u_csep=NULL; char *buf; Rboolean allKnown, anyKnown, use_UTF8, use_Bytes, sepASCII = TRUE, sepUTF8 = FALSE, sepBytes = FALSE, sepKnown = FALSE, use_sep = (PRIMVAL(op) == 0); const void *vmax; checkArity(op, args); /* We use formatting and so we must initialize printing. */ PrintDefaults(); /* Check the arguments */ x = CAR(args); if (!isVectorList(x)) error(_("invalid first argument")); nx = xlength(x); if(use_sep) { /* paste(..., sep, .) */ sep = CADR(args); if (!isString(sep) || LENGTH(sep) <= 0 || STRING_ELT(sep, 0) == NA_STRING) error(_("invalid separator")); sep = STRING_ELT(sep, 0); csep = translateChar(sep); u_sepw = sepw = (int) strlen(csep); // will be short sepASCII = strIsASCII(csep); sepKnown = ENC_KNOWN(sep) > 0; sepUTF8 = IS_UTF8(sep); sepBytes = IS_BYTES(sep); collapse = CADDR(args); } else { /* paste0(..., .) */ u_sepw = sepw = 0; sep = R_NilValue;/* -Wall */ collapse = CADR(args); } if (!isNull(collapse)) if(!isString(collapse) || LENGTH(collapse) <= 0 || STRING_ELT(collapse, 0) == NA_STRING) error(_("invalid '%s' argument"), "collapse"); if(nx == 0) return (!isNull(collapse)) ? mkString("") : allocVector(STRSXP, 0); /* Maximum argument length, coerce if needed */ maxlen = 0; for (j = 0; j < nx; j++) { if (!isString(VECTOR_ELT(x, j))) { /* formerly in R code: moved to C for speed */ SEXP call, xj = VECTOR_ELT(x, j); if(OBJECT(xj)) { /* method dispatch */ PROTECT(call = lang2(install("as.character"), xj)); SET_VECTOR_ELT(x, j, eval(call, env)); UNPROTECT(1); } else if (isSymbol(xj)) SET_VECTOR_ELT(x, j, ScalarString(PRINTNAME(xj))); else SET_VECTOR_ELT(x, j, coerceVector(xj, STRSXP)); if (!isString(VECTOR_ELT(x, j))) error(_("non-string argument to internal 'paste'")); } if(xlength(VECTOR_ELT(x, j)) > maxlen) maxlen = xlength(VECTOR_ELT(x, j)); } if(maxlen == 0) return (!isNull(collapse)) ? mkString("") : allocVector(STRSXP, 0); PROTECT(ans = allocVector(STRSXP, maxlen)); for (i = 0; i < maxlen; i++) { /* Strategy for marking the encoding: if all inputs (including * the separator) are ASCII, so is the output and we don't * need to mark. Otherwise if all non-ASCII inputs are of * declared encoding, we should mark. * Need to be careful only to include separator if it is used. */ anyKnown = FALSE; allKnown = TRUE; use_UTF8 = FALSE; use_Bytes = FALSE; if(nx > 1) { allKnown = sepKnown || sepASCII; anyKnown = sepKnown; use_UTF8 = sepUTF8; use_Bytes = sepBytes; } pwidth = 0; for (j = 0; j < nx; j++) { k = xlength(VECTOR_ELT(x, j)); if (k > 0) { SEXP cs = STRING_ELT(VECTOR_ELT(x, j), i % k); if(IS_UTF8(cs)) use_UTF8 = TRUE; if(IS_BYTES(cs)) use_Bytes = TRUE; } } if (use_Bytes) use_UTF8 = FALSE; vmax = vmaxget(); for (j = 0; j < nx; j++) { k = xlength(VECTOR_ELT(x, j)); if (k > 0) { if(use_Bytes) pwidth += strlen(CHAR(STRING_ELT(VECTOR_ELT(x, j), i % k))); else if(use_UTF8) pwidth += strlen(translateCharUTF8(STRING_ELT(VECTOR_ELT(x, j), i % k))); else pwidth += strlen(translateChar(STRING_ELT(VECTOR_ELT(x, j), i % k))); vmaxset(vmax); } } if(use_sep) { if (use_UTF8 && !u_csep) { u_csep = translateCharUTF8(sep); u_sepw = (int) strlen(u_csep); // will be short } pwidth += (nx - 1) * (use_UTF8 ? u_sepw : sepw); } if (pwidth > INT_MAX) error(_("result would exceed 2^31-1 bytes")); cbuf = buf = R_AllocStringBuffer(pwidth, &cbuff); vmax = vmaxget(); for (j = 0; j < nx; j++) { k = xlength(VECTOR_ELT(x, j)); if (k > 0) { SEXP cs = STRING_ELT(VECTOR_ELT(x, j), i % k); if (use_UTF8) { s = translateCharUTF8(cs); strcpy(buf, s); buf += strlen(s); } else { s = use_Bytes ? CHAR(cs) : translateChar(cs); strcpy(buf, s); buf += strlen(s); allKnown = allKnown && (strIsASCII(s) || (ENC_KNOWN(cs)> 0)); anyKnown = anyKnown || (ENC_KNOWN(cs)> 0); } } if (sepw != 0 && j != nx - 1) { if (use_UTF8) { strcpy(buf, u_csep); buf += u_sepw; } else { strcpy(buf, csep); buf += sepw; } } vmax = vmaxget(); } ienc = 0; if(use_UTF8) ienc = CE_UTF8; else if(use_Bytes) ienc = CE_BYTES; else if(anyKnown && allKnown) { if(known_to_be_latin1) ienc = CE_LATIN1; if(known_to_be_utf8) ienc = CE_UTF8; } SET_STRING_ELT(ans, i, mkCharCE(cbuf, ienc)); } /* Now collapse, if required. */ if(collapse != R_NilValue && (nx = XLENGTH(ans)) > 0) { sep = STRING_ELT(collapse, 0); use_UTF8 = IS_UTF8(sep); use_Bytes = IS_BYTES(sep); for (i = 0; i < nx; i++) { if(IS_UTF8(STRING_ELT(ans, i))) use_UTF8 = TRUE; if(IS_BYTES(STRING_ELT(ans, i))) use_Bytes = TRUE; } if(use_Bytes) { csep = CHAR(sep); use_UTF8 = FALSE; } else if(use_UTF8) csep = translateCharUTF8(sep); else csep = translateChar(sep); sepw = (int) strlen(csep); anyKnown = ENC_KNOWN(sep) > 0; allKnown = anyKnown || strIsASCII(csep); pwidth = 0; vmax = vmaxget(); for (i = 0; i < nx; i++) if(use_UTF8) { pwidth += strlen(translateCharUTF8(STRING_ELT(ans, i))); vmaxset(vmax); } else /* already translated */ pwidth += strlen(CHAR(STRING_ELT(ans, i))); pwidth += (nx - 1) * sepw; if (pwidth > INT_MAX) error(_("result would exceed 2^31-1 bytes")); cbuf = buf = R_AllocStringBuffer(pwidth, &cbuff); vmax = vmaxget(); for (i = 0; i < nx; i++) { if(i > 0) { strcpy(buf, csep); buf += sepw; } if(use_UTF8) s = translateCharUTF8(STRING_ELT(ans, i)); else /* already translated */ s = CHAR(STRING_ELT(ans, i)); strcpy(buf, s); while (*buf) buf++; allKnown = allKnown && (strIsASCII(s) || (ENC_KNOWN(STRING_ELT(ans, i)) > 0)); anyKnown = anyKnown || (ENC_KNOWN(STRING_ELT(ans, i)) > 0); if(use_UTF8) vmaxset(vmax); } UNPROTECT(1); ienc = CE_NATIVE; if(use_UTF8) ienc = CE_UTF8; else if(use_Bytes) ienc = CE_BYTES; else if(anyKnown && allKnown) { if(known_to_be_latin1) ienc = CE_LATIN1; if(known_to_be_utf8) ienc = CE_UTF8; } PROTECT(ans = allocVector(STRSXP, 1)); SET_STRING_ELT(ans, 0, mkCharCE(cbuf, ienc)); } R_FreeStringBufferL(&cbuff); UNPROTECT(1); return ans; }
SEXP attribute_hidden do_filepath(SEXP call, SEXP op, SEXP args, SEXP env) { SEXP ans, sep, x; int i, j, k, ln, maxlen, nx, nzero, pwidth, sepw; const char *s, *csep, *cbuf; char *buf; checkArity(op, args); /* Check the arguments */ x = CAR(args); if (!isVectorList(x)) error(_("invalid first argument")); nx = length(x); if(nx == 0) return allocVector(STRSXP, 0); sep = CADR(args); if (!isString(sep) || LENGTH(sep) <= 0 || STRING_ELT(sep, 0) == NA_STRING) error(_("invalid separator")); sep = STRING_ELT(sep, 0); csep = CHAR(sep); sepw = (int) strlen(csep); /* hopefully 1 */ /* Any zero-length argument gives zero-length result */ maxlen = 0; nzero = 0; for (j = 0; j < nx; j++) { if (!isString(VECTOR_ELT(x, j))) { /* formerly in R code: moved to C for speed */ SEXP call, xj = VECTOR_ELT(x, j); if(OBJECT(xj)) { /* method dispatch */ PROTECT(call = lang2(install("as.character"), xj)); SET_VECTOR_ELT(x, j, eval(call, env)); UNPROTECT(1); } else if (isSymbol(xj)) SET_VECTOR_ELT(x, j, ScalarString(PRINTNAME(xj))); else SET_VECTOR_ELT(x, j, coerceVector(xj, STRSXP)); if (!isString(VECTOR_ELT(x, j))) error(_("non-string argument to Internal paste")); } ln = length(VECTOR_ELT(x, j)); if(ln > maxlen) maxlen = ln; if(ln == 0) {nzero++; break;} } if(nzero || maxlen == 0) return allocVector(STRSXP, 0); PROTECT(ans = allocVector(STRSXP, maxlen)); for (i = 0; i < maxlen; i++) { pwidth = 0; for (j = 0; j < nx; j++) { k = length(VECTOR_ELT(x, j)); pwidth += (int) strlen(translateChar(STRING_ELT(VECTOR_ELT(x, j), i % k))); } pwidth += (nx - 1) * sepw; cbuf = buf = R_AllocStringBuffer(pwidth, &cbuff); for (j = 0; j < nx; j++) { k = length(VECTOR_ELT(x, j)); if (k > 0) { s = translateChar(STRING_ELT(VECTOR_ELT(x, j), i % k)); strcpy(buf, s); buf += strlen(s); } if (j != nx - 1 && sepw != 0) { strcpy(buf, csep); buf += sepw; } } #ifdef Win32 // Trailing seps are invalid for file paths except for / and d:/ if(streql(csep, "/") || streql(csep, "\\")) { if(buf > cbuf) { buf--; if(*buf == csep[0] && buf > cbuf && (buf != cbuf+2 || cbuf[1] != ':')) *buf = '\0'; } } #endif SET_STRING_ELT(ans, i, mkChar(cbuf)); } R_FreeStringBufferL(&cbuff); UNPROTECT(1); return ans; }