Beispiel #1
0
/* This may return a R_alloc-ed result, so the caller has to manage the
   R_alloc stack */
const char *translateCharUTF8(SEXP x)
{
    void *obj;
    const char *inbuf, *ans = CHAR(x);
    char *outbuf, *p;
    size_t inb, outb, res;
    R_StringBuffer cbuff = {NULL, 0, MAXELTSIZE};

    if(TYPEOF(x) != CHARSXP)
	error(_("'%s' must be called on a CHARSXP"), "translateCharUTF8");
    if(x == NA_STRING) return ans;
    if(IS_UTF8(x)) return ans;
    if(IS_ASCII(x)) return ans;
    if(IS_BYTES(x))
	error(_("translating strings with \"bytes\" encoding is not allowed"));

    obj = Riconv_open("UTF-8", IS_LATIN1(x) ? "latin1" : "");
    if(obj == (void *)(-1)) 
#ifdef Win32
	error(_("unsupported conversion from '%s' in codepage %d"),
	      "latin1", localeCP);
#else
       error(_("unsupported conversion from '%s' to '%s'"), "latin1", "UTF-8");
#endif
    R_AllocStringBuffer(0, &cbuff);
top_of_loop:
    inbuf = ans; inb = strlen(inbuf);
    outbuf = cbuff.data; outb = cbuff.bufsize - 1;
    /* First initialize output */
    Riconv (obj, NULL, NULL, &outbuf, &outb);
next_char:
    /* Then convert input  */
    res = Riconv(obj, &inbuf , &inb, &outbuf, &outb);
    if(res == -1 && errno == E2BIG) {
	R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
	goto top_of_loop;
    } else if(res == -1 && (errno == EILSEQ || errno == EINVAL)) {
	if(outb < 5) {
	    R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
	    goto top_of_loop;
	}
	snprintf(outbuf, 5, "<%02x>", (unsigned char)*inbuf);
	outbuf += 4; outb -= 4;
	inbuf++; inb--;
	goto next_char;
    }
    *outbuf = '\0';
    Riconv_close(obj);
    res = strlen(cbuff.data) + 1;
    p = R_alloc(res, 1);
    memcpy(p, cbuff.data, res);
    R_FreeStringBuffer(&cbuff);
    return p;
}
Beispiel #2
0
SEXP attribute_hidden do_substr(SEXP call, SEXP op, SEXP args, SEXP env)
{
    SEXP s, x, sa, so, el;
    R_xlen_t i, len;
    int start, stop, k, l;
    size_t slen;
    cetype_t ienc;
    const char *ss;
    char *buf;

    checkArity(op, args);
    x = CAR(args);
    sa = CADR(args);
    so = CADDR(args);
    k = LENGTH(sa);
    l = LENGTH(so);

    if (!isString(x))
	error(_("extracting substrings from a non-character object"));
    len = XLENGTH(x);
    PROTECT(s = allocVector(STRSXP, len));
    if (len > 0) {
	if (!isInteger(sa) || !isInteger(so) || k == 0 || l == 0)
	    error(_("invalid substring arguments"));

	for (i = 0; i < len; i++) {
	    start = INTEGER(sa)[i % k];
	    stop = INTEGER(so)[i % l];
	    el = STRING_ELT(x,i);
	    if (el == NA_STRING || start == NA_INTEGER || stop == NA_INTEGER) {
		SET_STRING_ELT(s, i, NA_STRING);
		continue;
	    }
	    ienc = getCharCE(el);
	    ss = CHAR(el);
	    slen = strlen(ss); /* FIXME -- should handle embedded nuls */
	    buf = R_AllocStringBuffer(slen+1, &cbuff);
	    if (start < 1) start = 1;
	    if (start > stop || start > slen) {
		buf[0] = '\0';
	    } else {
		if (stop > slen) stop = (int) slen;
		substr(buf, ss, ienc, start, stop);
	    }
	    SET_STRING_ELT(s, i, mkCharCE(buf, ienc));
	}
	R_FreeStringBufferL(&cbuff);
    }
    DUPLICATE_ATTRIB(s, x);
    /* This copied the class, if any */
    UNPROTECT(1);
    return s;
}
Beispiel #3
0
static SEXP cross_colon(SEXP call, SEXP s, SEXP t)
{
    SEXP a, la, ls, lt, rs, rt;
    int i, j, k, n, nls, nlt;
    char *cbuf;
    const void *vmax = vmaxget();

    if (length(s) != length(t))
	errorcall(call, _("unequal factor lengths"));
    n = length(s);
    ls = getLevelsAttrib(s);
    lt = getLevelsAttrib(t);
    nls = LENGTH(ls);
    nlt = LENGTH(lt);
    PROTECT(a = allocVector(INTSXP, n));
    PROTECT(rs = coerceVector(s, INTSXP));
    PROTECT(rt = coerceVector(t, INTSXP));
    for (i = 0; i < n; i++) {
	int vs = INTEGER(rs)[i];
	int vt = INTEGER(rt)[i];
	if ((vs == NA_INTEGER) || (vt == NA_INTEGER))
	    INTEGER(a)[i] = NA_INTEGER;
	else
	    INTEGER(a)[i] = vt + (vs - 1) * nlt;
    }
    UNPROTECT(2);
    if (!isNull(ls) && !isNull(lt)) {
	PROTECT(la = allocVector(STRSXP, nls * nlt));
	k = 0;
	/* FIXME: possibly UTF-8 version */
	for (i = 0; i < nls; i++) {
	    const char *vi = translateChar(STRING_ELT(ls, i));
	    size_t vs = strlen(vi);
	    for (j = 0; j < nlt; j++) {
		const char *vj = translateChar(STRING_ELT(lt, j));
		size_t vt = strlen(vj), len = vs + vt + 2;
		cbuf = R_AllocStringBuffer(len, &cbuff);
		snprintf(cbuf, len, "%s:%s", vi, vj);
		SET_STRING_ELT(la, k, mkChar(cbuf));
		k++;
	    }
	}
	setAttrib(a, R_LevelsSymbol, la);
	UNPROTECT(1);
    }
    PROTECT(la = mkString("factor"));
    setAttrib(a, R_ClassSymbol, la);
    UNPROTECT(2);
    R_FreeStringBufferL(&cbuff);
    vmaxset(vmax);
    return a;
}
Beispiel #4
0
SEXP attribute_hidden do_abbrev(SEXP call, SEXP op, SEXP args, SEXP env)
{
    SEXP x, ans;
    R_xlen_t i, len;
    int minlen;
    Rboolean warn = FALSE;
    const char *s;
    const void *vmax;

    checkArity(op,args);
    x = CAR(args);

    if (!isString(x))
	error(_("the first argument must be a character vector"));
    len = XLENGTH(x);

    PROTECT(ans = allocVector(STRSXP, len));
    minlen = asInteger(CADR(args));
    vmax = vmaxget();
    for (i = 0 ; i < len ; i++) {
	if (STRING_ELT(x, i) == NA_STRING)
	    SET_STRING_ELT(ans, i, NA_STRING);
	else {
	    s = translateChar(STRING_ELT(x, i));
	    if(strlen(s) > minlen) {
		warn = warn | !strIsASCII(s);
		R_AllocStringBuffer(strlen(s), &cbuff);
		SET_STRING_ELT(ans, i, stripchars(s, minlen));
	    } else SET_STRING_ELT(ans, i, mkChar(s));
	}
	vmaxset(vmax);
    }
    if (warn) warning(_("abbreviate used with non-ASCII chars"));
    DUPLICATE_ATTRIB(ans, x);
    /* This copied the class, if any */
    R_FreeStringBufferL(&cbuff);
    UNPROTECT(1);
    return(ans);
}
Beispiel #5
0
/* a version of EncodeElement with different escaping of char strings */
static const char
*EncodeElement2(SEXP x, int indx, Rboolean quote,
		Rboolean qmethod, R_StringBuffer *buff, char cdec)
{
    int nbuf;
    char *q;
    const char *p, *p0;

    if(TYPEOF(x) == STRSXP) {
	p0 = translateChar(STRING_ELT(x, indx));
	if(!quote) return p0;
	for(nbuf = 2, p = p0; *p; p++) /* find buffer length needed */
	    nbuf += (*p == '"') ? 2 : 1;
	R_AllocStringBuffer(nbuf, buff);
	q = buff->data; *q++ = '"';
	for(p = p0; *p;) {
	    if(*p == '"') *q++ = qmethod ? '\\' : '"';
	    *q++ = *p++;
	}
	*q++ = '"'; *q = '\0';
	return buff->data;
    }
    return EncodeElement(x, indx, quote ? '"' : 0, cdec);
}
Beispiel #6
0
SEXP installTrChar(SEXP x)
{
    void * obj;
    const char *inbuf, *ans = CHAR(x);
    char *outbuf;
    size_t inb, outb, res;
    cetype_t ienc = getCharCE(x);
    R_StringBuffer cbuff = {NULL, 0, MAXELTSIZE};

    if(TYPEOF(x) != CHARSXP)
	error(_("'%s' must be called on a CHARSXP"), "installTrChar");
    if(x == NA_STRING || !(ENC_KNOWN(x))) return install(ans);
    if(IS_BYTES(x))
	error(_("translating strings with \"bytes\" encoding is not allowed"));
    if(utf8locale && IS_UTF8(x)) return install(ans);
    if(latin1locale && IS_LATIN1(x)) return install(ans);
    if(IS_ASCII(x)) return install(ans);

    if(IS_LATIN1(x)) {
	if(!latin1_obj) {
	    obj = Riconv_open("", "latin1");
	    /* should never happen */
	    if(obj == (void *)(-1))
#ifdef Win32
		error(_("unsupported conversion from '%s' in codepage %d"),
		      "latin1", localeCP);
#else
	        error(_("unsupported conversion from '%s' to '%s'"),
		      "latin1", "");
#endif
	    latin1_obj = obj;
	}
	obj = latin1_obj;
    } else {
	if(!utf8_obj) {
	    obj = Riconv_open("", "UTF-8");
	    /* should never happen */
	    if(obj == (void *)(-1)) 
#ifdef Win32
		error(_("unsupported conversion from '%s' in codepage %d"),
		      "latin1", localeCP);
#else
	        error(_("unsupported conversion from '%s' to '%s'"),
		      "latin1", "");
#endif
	    utf8_obj = obj;
	}
	obj = utf8_obj;
    }

    R_AllocStringBuffer(0, &cbuff);
top_of_loop:
    inbuf = ans; inb = strlen(inbuf);
    outbuf = cbuff.data; outb = cbuff.bufsize - 1;
    /* First initialize output */
    Riconv (obj, NULL, NULL, &outbuf, &outb);
next_char:
    /* Then convert input  */
    res = Riconv(obj, &inbuf , &inb, &outbuf, &outb);
    if(res == -1 && errno == E2BIG) {
	R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
	goto top_of_loop;
    } else if(res == -1 && (errno == EILSEQ || errno == EINVAL)) {
	if(outb < 13) {
	    R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
	    goto top_of_loop;
	}
	if (ienc == CE_UTF8) {
	    /* if starting in UTF-8, use \uxxxx */
	    /* This must be the first byte */
	    size_t clen;
	    wchar_t wc;
	    clen = utf8toucs(&wc, inbuf);
	    if(clen > 0 && inb >= clen) {
		inbuf += clen; inb -= clen;
# ifndef Win32
		if((unsigned int) wc < 65536) {
# endif
		    snprintf(outbuf, 9, "<U+%04X>", (unsigned int) wc);
		    outbuf += 8; outb -= 8;
# ifndef Win32
		} else {
		    snprintf(outbuf, 13, "<U+%08X>", (unsigned int) wc);
		    outbuf += 12; outb -= 12;
		}
# endif
	    } else {
		snprintf(outbuf, 5, "<%02x>", (unsigned char)*inbuf);
		outbuf += 4; outb -= 4;
		inbuf++; inb--;
	    }
	} else {
	    snprintf(outbuf, 5, "<%02x>", (unsigned char)*inbuf);
	    outbuf += 4; outb -= 4;
	    inbuf++; inb--;
	}
	goto next_char;
    }
    *outbuf = '\0';
    SEXP Sans = install(cbuff.data);
    R_FreeStringBuffer(&cbuff);
    return Sans;
}
Beispiel #7
0
/* iconv(x, from, to, sub, mark) */
SEXP attribute_hidden do_iconv(SEXP call, SEXP op, SEXP args, SEXP env)
{
    SEXP ans, x = CAR(args), si;
    void * obj;
    const char *inbuf;
    char *outbuf;
    const char *sub;
    size_t inb, outb, res;
    R_StringBuffer cbuff = {NULL, 0, MAXELTSIZE};
    Rboolean isRawlist = FALSE;

    checkArity(op, args);
    if(isNull(x)) {  /* list locales */
#ifdef HAVE_ICONVLIST
	cnt = 0;
	iconvlist(count_one, NULL);
	PROTECT(ans = allocVector(STRSXP, cnt));
	cnt = 0;
	iconvlist(write_one, (void *)ans);
#else
	PROTECT(ans = R_NilValue);
#endif
    } else {
	int mark, toRaw;
	const char *from, *to;
	Rboolean isLatin1 = FALSE, isUTF8 = FALSE;

	args = CDR(args);
	if(!isString(CAR(args)) || length(CAR(args)) != 1)
	    error(_("invalid '%s' argument"), "from");
	from = CHAR(STRING_ELT(CAR(args), 0)); /* ASCII */
	args = CDR(args);
	if(!isString(CAR(args)) || length(CAR(args)) != 1)
	    error(_("invalid '%s' argument"), "to");
	to = CHAR(STRING_ELT(CAR(args), 0));
	args = CDR(args);
	if(!isString(CAR(args)) || length(CAR(args)) != 1)
	    error(_("invalid '%s' argument"), "sub");
	if(STRING_ELT(CAR(args), 0) == NA_STRING) sub = NULL;
	else sub = translateChar(STRING_ELT(CAR(args), 0));
	args = CDR(args);
	mark = asLogical(CAR(args));
	if(mark == NA_LOGICAL)
	    error(_("invalid '%s' argument"), "mark");	
	args = CDR(args);
	toRaw = asLogical(CAR(args));
	if(toRaw == NA_LOGICAL)
	    error(_("invalid '%s' argument"), "toRaw");	
	/* some iconv's allow "UTF8", but libiconv does not */
	if(streql(from, "UTF8") || streql(from, "utf8") ) from = "UTF-8";
	if(streql(to, "UTF8") || streql(to, "utf8") ) to = "UTF-8";
	/* Should we do something about marked CHARSXPs in 'from = ""'? */
	if(streql(to, "UTF-8")) isUTF8 = TRUE;
	if(streql(to, "latin1") || streql(to, "ISO_8859-1")
	    || streql(to, "CP1252")) isLatin1 = TRUE;
	if(streql(to, "") && known_to_be_latin1) isLatin1 = TRUE;
	if(streql(to, "") && known_to_be_utf8) isUTF8 = TRUE;
	obj = Riconv_open(to, from);
	if(obj == (iconv_t)(-1))
#ifdef Win32
	    error(_("unsupported conversion from '%s' to '%s' in codepage %d"), 
		  from, to, localeCP);
#else
	    error(_("unsupported conversion from '%s' to '%s'"), from, to);
#endif
	isRawlist = (TYPEOF(x) == VECSXP);
	if(isRawlist) {
	    if(toRaw)
		PROTECT(ans = duplicate(x));
	    else {
		PROTECT(ans = allocVector(STRSXP, LENGTH(x)));
		DUPLICATE_ATTRIB(ans, x);
	    }
	} else {   
	    if(TYPEOF(x) != STRSXP)
		error(_("'x' must be a character vector"));
	    if(toRaw) {
		PROTECT(ans = allocVector(VECSXP, LENGTH(x)));
		DUPLICATE_ATTRIB(ans, x);
	    } else 
		PROTECT(ans = duplicate(x));
	}
	R_AllocStringBuffer(0, &cbuff);  /* 0 -> default */
	for(R_xlen_t i = 0; i < XLENGTH(x); i++) {
	    if (isRawlist) {
		si = VECTOR_ELT(x, i);
		if (TYPEOF(si) == NILSXP) {
		    if (!toRaw) SET_STRING_ELT(ans, i, NA_STRING);
		    continue;
		} else if (TYPEOF(si) != RAWSXP)
		    error(_("'x' must be a list of NULL or raw vectors"));
	    } else {
		si = STRING_ELT(x, i);
		if (si == NA_STRING) {
		    if(!toRaw) SET_STRING_ELT(ans, i, NA_STRING);
		    continue;
		}
	    }
	top_of_loop:
	    inbuf = isRawlist ? (const char *) RAW(si) : CHAR(si); 
	    inb = LENGTH(si);
	    outbuf = cbuff.data; outb = cbuff.bufsize - 1;
	    /* First initialize output */
	    Riconv (obj, NULL, NULL, &outbuf, &outb);
	next_char:
	    /* Then convert input  */
	    res = Riconv(obj, &inbuf , &inb, &outbuf, &outb);
	    *outbuf = '\0';
	    /* other possible error conditions are incomplete
	       and invalid multibyte chars */
	    if(res == -1 && errno == E2BIG) {
		R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
		goto top_of_loop;
	    } else if(res == -1 && sub && 
		      (errno == EILSEQ || errno == EINVAL)) {
		/* it seems this gets thrown for non-convertible input too */
		if(strcmp(sub, "byte") == 0) {
		    if(outb < 5) {
			R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
			goto top_of_loop;
		    }
		    snprintf(outbuf, 5, "<%02x>", (unsigned char)*inbuf);
		    outbuf += 4; outb -= 4;
		} else {
		    size_t j;
		    if(outb < strlen(sub)) {
			R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
			goto top_of_loop;
		    }
		    memcpy(outbuf, sub, j = strlen(sub));
		    outbuf += j;
		    outb -= j;
		}
		inbuf++; inb--;
		goto next_char;
	    }

	    if(toRaw) {
		if(res != -1 && inb == 0) {
		    size_t nout = cbuff.bufsize - 1 - outb;
		    SEXP el = allocVector(RAWSXP, nout);
		    memcpy(RAW(el), cbuff.data, nout);
		    SET_VECTOR_ELT(ans, i, el);
		} /* otherwise is already NULL */
	    } else {
		if(res != -1 && inb == 0) {
		    cetype_t ienc = CE_NATIVE;
		    
		    size_t nout = cbuff.bufsize - 1 - outb;
		    if(mark) {
			if(isLatin1) ienc = CE_LATIN1;
			else if(isUTF8) ienc = CE_UTF8;
		    }
		    SET_STRING_ELT(ans, i, 
				   mkCharLenCE(cbuff.data, (int) nout, ienc));
		} else SET_STRING_ELT(ans, i, NA_STRING);
	    }
	}
	Riconv_close(obj);
	R_FreeStringBuffer(&cbuff);
    }
    UNPROTECT(1);
    return ans;
}
Beispiel #8
0
/* A version avoiding R_alloc for use in the Rgui editor */
void reEnc2(const char *x, char *y, int ny,
	    cetype_t ce_in, cetype_t ce_out, int subst)
{
    void * obj;
    const char *inbuf;
    char *outbuf;
    size_t inb, outb, res, top;
    char *tocode = NULL, *fromcode = NULL;
    char buf[20];
    R_StringBuffer cbuff = {NULL, 0, MAXELTSIZE};

    strncpy(y, x, ny);
    y[ny - 1] = '\0';

    if(ce_in == ce_out || ce_in == CE_ANY || ce_out == CE_ANY) return;
    if(utf8locale && ce_in == CE_NATIVE && ce_out == CE_UTF8) return;
    if(utf8locale && ce_out == CE_NATIVE && ce_in == CE_UTF8) return;
    if(latin1locale && ce_in == CE_NATIVE && ce_out == CE_LATIN1) return;
    if(latin1locale && ce_out == CE_NATIVE && ce_in == CE_LATIN1) return;

    if(strIsASCII(x)) return;

    switch(ce_in) {
    case CE_NATIVE:
	{
	    /* Looks like CP1252 is treated as Latin-1 by iconv */
	    snprintf(buf, 20, "CP%d", localeCP);
	    fromcode = buf;
	    break;
	}
    case CE_LATIN1: fromcode = "CP1252"; break;
    case CE_UTF8:   fromcode = "UTF-8"; break;
    default: return;
    }

    switch(ce_out) {
    case CE_NATIVE:
	{
	    /* avoid possible misidentification of CP1250 as LATIN-2 */
	    snprintf(buf, 20, "CP%d", localeCP);
	    tocode = buf;
	    break;
	}
    case CE_LATIN1: tocode = "latin1"; break;
    case CE_UTF8:   tocode = "UTF-8"; break;
    default: return;
    }

    obj = Riconv_open(tocode, fromcode);
    if(obj == (void *)(-1)) return;
    R_AllocStringBuffer(0, &cbuff);
top_of_loop:
    inbuf = x; inb = strlen(inbuf);
    outbuf = cbuff.data; top = outb = cbuff.bufsize - 1;
    /* First initialize output */
    Riconv (obj, NULL, NULL, &outbuf, &outb);
next_char:
    /* Then convert input  */
    res = Riconv(obj, &inbuf , &inb, &outbuf, &outb);
    if(res == -1 && errno == E2BIG) {
	R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
	goto top_of_loop;
    } else if(res == -1 && (errno == EILSEQ || errno == EINVAL)) {
	switch(subst) {
	case 1: /* substitute hex */
	    if(outb < 5) {
		R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
		goto top_of_loop;
	    }
	    snprintf(outbuf, 5, "<%02x>", (unsigned char)*inbuf);
	    outbuf += 4; outb -= 4;
	    inbuf++; inb--;
	    goto next_char;
	    break;
	case 2: /* substitute . */
	    if(outb < 1) {
		R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
		goto top_of_loop;
	    }
	    *outbuf++ = '.'; inbuf++; outb--; inb--;
	    goto next_char;
	    break;
	case 3: /* substitute ? */
	    if(outb < 1) {
		R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
		goto top_of_loop;
	    }
	    *outbuf++ = '?'; inbuf++; outb--; inb--;
	    goto next_char;
	    break;
	default: /* skip byte */
	    inbuf++; inb--;
	    goto next_char;
	}
    }
    Riconv_close(obj);
    *outbuf = '\0';
    res = (top-outb)+1; /* strlen(cbuff.data) + 1; */
    if (res > ny) error("converted string too long for buffer");
    memcpy(y, cbuff.data, res);
    R_FreeStringBuffer(&cbuff);
}
Beispiel #9
0
/* Note that NA_STRING is not handled separately here.  This is
   deliberate -- see ?paste -- and implicitly coerces it to "NA"
*/
SEXP attribute_hidden do_paste(SEXP call, SEXP op, SEXP args, SEXP env)
{
    SEXP ans, collapse, sep, x;
    int sepw, u_sepw, ienc;
    R_xlen_t i, j, k, maxlen, nx, pwidth;
    const char *s, *cbuf, *csep=NULL, *u_csep=NULL;
    char *buf;
    Rboolean allKnown, anyKnown, use_UTF8, use_Bytes,
	sepASCII = TRUE, sepUTF8 = FALSE, sepBytes = FALSE, sepKnown = FALSE,
	use_sep = (PRIMVAL(op) == 0);
    const void *vmax;

    checkArity(op, args);

    /* We use formatting and so we must initialize printing. */

    PrintDefaults();

    /* Check the arguments */

    x = CAR(args);
    if (!isVectorList(x))
	error(_("invalid first argument"));
    nx = xlength(x);

    if(use_sep) { /* paste(..., sep, .) */
	sep = CADR(args);
	if (!isString(sep) || LENGTH(sep) <= 0 || STRING_ELT(sep, 0) == NA_STRING)
	    error(_("invalid separator"));
	sep = STRING_ELT(sep, 0);
	csep = translateChar(sep);
	u_sepw = sepw = (int) strlen(csep); // will be short
	sepASCII = strIsASCII(csep);
	sepKnown = ENC_KNOWN(sep) > 0;
	sepUTF8 = IS_UTF8(sep);
	sepBytes = IS_BYTES(sep);
	collapse = CADDR(args);
    } else { /* paste0(..., .) */
	u_sepw = sepw = 0; sep = R_NilValue;/* -Wall */
	collapse = CADR(args);
    }
    if (!isNull(collapse))
	if(!isString(collapse) || LENGTH(collapse) <= 0 ||
	   STRING_ELT(collapse, 0) == NA_STRING)
	    error(_("invalid '%s' argument"), "collapse");
    if(nx == 0)
	return (!isNull(collapse)) ? mkString("") : allocVector(STRSXP, 0);


    /* Maximum argument length, coerce if needed */

    maxlen = 0;
    for (j = 0; j < nx; j++) {
	if (!isString(VECTOR_ELT(x, j))) {
	    /* formerly in R code: moved to C for speed */
	    SEXP call, xj = VECTOR_ELT(x, j);
	    if(OBJECT(xj)) { /* method dispatch */
		PROTECT(call = lang2(install("as.character"), xj));
		SET_VECTOR_ELT(x, j, eval(call, env));
		UNPROTECT(1);
	    } else if (isSymbol(xj))
		SET_VECTOR_ELT(x, j, ScalarString(PRINTNAME(xj)));
	    else
		SET_VECTOR_ELT(x, j, coerceVector(xj, STRSXP));

	    if (!isString(VECTOR_ELT(x, j)))
		error(_("non-string argument to internal 'paste'"));
	}
	if(xlength(VECTOR_ELT(x, j)) > maxlen)
	    maxlen = xlength(VECTOR_ELT(x, j));
    }
    if(maxlen == 0)
	return (!isNull(collapse)) ? mkString("") : allocVector(STRSXP, 0);

    PROTECT(ans = allocVector(STRSXP, maxlen));

    for (i = 0; i < maxlen; i++) {
	/* Strategy for marking the encoding: if all inputs (including
	 * the separator) are ASCII, so is the output and we don't
	 * need to mark.  Otherwise if all non-ASCII inputs are of
	 * declared encoding, we should mark.
	 * Need to be careful only to include separator if it is used.
	 */
	anyKnown = FALSE; allKnown = TRUE; use_UTF8 = FALSE; use_Bytes = FALSE;
	if(nx > 1) {
	    allKnown = sepKnown || sepASCII;
	    anyKnown = sepKnown;
	    use_UTF8 = sepUTF8;
	    use_Bytes = sepBytes;
	}

	pwidth = 0;
	for (j = 0; j < nx; j++) {
	    k = xlength(VECTOR_ELT(x, j));
	    if (k > 0) {
		SEXP cs = STRING_ELT(VECTOR_ELT(x, j), i % k);
		if(IS_UTF8(cs)) use_UTF8 = TRUE;
		if(IS_BYTES(cs)) use_Bytes = TRUE;
	    }
	}
	if (use_Bytes) use_UTF8 = FALSE;
	vmax = vmaxget();
	for (j = 0; j < nx; j++) {
	    k = xlength(VECTOR_ELT(x, j));
	    if (k > 0) {
		if(use_Bytes)
		    pwidth += strlen(CHAR(STRING_ELT(VECTOR_ELT(x, j), i % k)));
		else if(use_UTF8)
		    pwidth += strlen(translateCharUTF8(STRING_ELT(VECTOR_ELT(x, j), i % k)));
		else
		    pwidth += strlen(translateChar(STRING_ELT(VECTOR_ELT(x, j), i % k)));
		vmaxset(vmax);
	    }
	}
	if(use_sep) {
	    if (use_UTF8 && !u_csep) {
		u_csep = translateCharUTF8(sep);
		u_sepw = (int) strlen(u_csep); // will be short
	    }
	    pwidth += (nx - 1) * (use_UTF8 ? u_sepw : sepw);
	}
	if (pwidth > INT_MAX)
	    error(_("result would exceed 2^31-1 bytes"));
	cbuf = buf = R_AllocStringBuffer(pwidth, &cbuff);
	vmax = vmaxget();
	for (j = 0; j < nx; j++) {
	    k = xlength(VECTOR_ELT(x, j));
	    if (k > 0) {
		SEXP cs = STRING_ELT(VECTOR_ELT(x, j), i % k);
		if (use_UTF8) {
		    s = translateCharUTF8(cs);
		    strcpy(buf, s);
		    buf += strlen(s);
		} else {
		    s = use_Bytes ? CHAR(cs) : translateChar(cs);
		    strcpy(buf, s);
		    buf += strlen(s);
		    allKnown = allKnown && (strIsASCII(s) || (ENC_KNOWN(cs)> 0));
		    anyKnown = anyKnown || (ENC_KNOWN(cs)> 0);
		}
	    }
	    if (sepw != 0 && j != nx - 1) {
		if (use_UTF8) {
		    strcpy(buf, u_csep);
		    buf += u_sepw;
		} else {
		    strcpy(buf, csep);
		    buf += sepw;
		}
	    }
	    vmax = vmaxget();
	}
	ienc = 0;
	if(use_UTF8) ienc = CE_UTF8;
	else if(use_Bytes) ienc = CE_BYTES;
	else if(anyKnown && allKnown) {
	    if(known_to_be_latin1) ienc = CE_LATIN1;
	    if(known_to_be_utf8) ienc = CE_UTF8;
	}
	SET_STRING_ELT(ans, i, mkCharCE(cbuf, ienc));
    }

    /* Now collapse, if required. */

    if(collapse != R_NilValue && (nx = XLENGTH(ans)) > 0) {
	sep = STRING_ELT(collapse, 0);
	use_UTF8 = IS_UTF8(sep);
	use_Bytes = IS_BYTES(sep);
	for (i = 0; i < nx; i++) {
	    if(IS_UTF8(STRING_ELT(ans, i))) use_UTF8 = TRUE;
	    if(IS_BYTES(STRING_ELT(ans, i))) use_Bytes = TRUE;
	}
	if(use_Bytes) {
	    csep = CHAR(sep);
	    use_UTF8 = FALSE;
	} else if(use_UTF8)
	    csep = translateCharUTF8(sep);
	else
	    csep = translateChar(sep);
	sepw = (int) strlen(csep);
	anyKnown = ENC_KNOWN(sep) > 0;
	allKnown = anyKnown || strIsASCII(csep);
	pwidth = 0;
	vmax = vmaxget();
	for (i = 0; i < nx; i++)
	    if(use_UTF8) {
		pwidth += strlen(translateCharUTF8(STRING_ELT(ans, i)));
		vmaxset(vmax);
	    } else /* already translated */
		pwidth += strlen(CHAR(STRING_ELT(ans, i)));
	pwidth += (nx - 1) * sepw;
	if (pwidth > INT_MAX)
	    error(_("result would exceed 2^31-1 bytes"));
	cbuf = buf = R_AllocStringBuffer(pwidth, &cbuff);
	vmax = vmaxget();
	for (i = 0; i < nx; i++) {
	    if(i > 0) {
		strcpy(buf, csep);
		buf += sepw;
	    }
	    if(use_UTF8)
		s = translateCharUTF8(STRING_ELT(ans, i));
	    else /* already translated */
		s = CHAR(STRING_ELT(ans, i));
	    strcpy(buf, s);
	    while (*buf)
		buf++;
	    allKnown = allKnown &&
		(strIsASCII(s) || (ENC_KNOWN(STRING_ELT(ans, i)) > 0));
	    anyKnown = anyKnown || (ENC_KNOWN(STRING_ELT(ans, i)) > 0);
	    if(use_UTF8) vmaxset(vmax);
	}
	UNPROTECT(1);
	ienc = CE_NATIVE;
	if(use_UTF8) ienc = CE_UTF8;
	else if(use_Bytes) ienc = CE_BYTES;
	else if(anyKnown && allKnown) {
	    if(known_to_be_latin1) ienc = CE_LATIN1;
	    if(known_to_be_utf8) ienc = CE_UTF8;
	}
	PROTECT(ans = allocVector(STRSXP, 1));
	SET_STRING_ELT(ans, 0, mkCharCE(cbuf, ienc));
    }
    R_FreeStringBufferL(&cbuff);
    UNPROTECT(1);
    return ans;
}
Beispiel #10
0
SEXP attribute_hidden do_nchar(SEXP call, SEXP op, SEXP args, SEXP env)
{
    SEXP d, s, x, stype;
    int i, len, allowNA;
    size_t ntype;
    int nc;
    const char *type;
    const char *xi;
    wchar_t *wc;
    const void *vmax;

    checkArity(op, args);
    if (isFactor(CAR(args)))
	error(_("'%s' requires a character vector"), "nchar()");
    PROTECT(x = coerceVector(CAR(args), STRSXP));
    if (!isString(x))
	error(_("'%s' requires a character vector"), "nchar()");
    len = LENGTH(x);
    stype = CADR(args);
    if (!isString(stype) || LENGTH(stype) != 1)
	error(_("invalid '%s' argument"), "type");
    type = CHAR(STRING_ELT(stype, 0)); /* always ASCII */
    ntype = strlen(type);
    if (ntype == 0) error(_("invalid '%s' argument"), "type");
    allowNA = asLogical(CADDR(args));
    if (allowNA == NA_LOGICAL) allowNA = 0;

    PROTECT(s = allocVector(INTSXP, len));
    vmax = vmaxget();
    for (i = 0; i < len; i++) {
	SEXP sxi = STRING_ELT(x, i);
	if (sxi == NA_STRING) {
	    INTEGER(s)[i] = 2;
	    continue;
	}
	if (strncmp(type, "bytes", ntype) == 0) {
	    INTEGER(s)[i] = LENGTH(sxi);
	} else if (strncmp(type, "chars", ntype) == 0) {
	    if (IS_UTF8(sxi)) { /* assume this is valid */
		const char *p = CHAR(sxi);
		nc = 0;
		for( ; *p; p += utf8clen(*p)) nc++;
		INTEGER(s)[i] = nc;
	    } else if (IS_BYTES(sxi)) {
		if (!allowNA) /* could do chars 0 */
		    error(_("number of characters is not computable for element %d in \"bytes\" encoding"), i+1);
		INTEGER(s)[i] = NA_INTEGER;
	    } else if (mbcslocale) {
		nc = mbstowcs(NULL, translateChar(sxi), 0);
		if (!allowNA && nc < 0)
		    error(_("invalid multibyte string %d"), i+1);
		INTEGER(s)[i] = nc >= 0 ? nc : NA_INTEGER;
	    } else
		INTEGER(s)[i] = strlen(translateChar(sxi));
	} else if (strncmp(type, "width", ntype) == 0) {
	    if (IS_UTF8(sxi)) { /* assume this is valid */
		const char *p = CHAR(sxi);
		wchar_t wc1;
		nc = 0;
		for( ; *p; p += utf8clen(*p)) {
		    utf8toucs(&wc1, p);
		    nc += Ri18n_wcwidth(wc1);
		}
		INTEGER(s)[i] = nc;
	    } else if (IS_BYTES(sxi)) {
		if (!allowNA) /* could do width 0 */
		    error(_("width is not computable for element %d in \"bytes\" encoding"), i+1);
		INTEGER(s)[i] = NA_INTEGER;
	    } else if (mbcslocale) {
		xi = translateChar(sxi);
		nc = mbstowcs(NULL, xi, 0);
		if (nc >= 0) {
		    wc = (wchar_t *) R_AllocStringBuffer((nc+1)*sizeof(wchar_t), &cbuff);

		    mbstowcs(wc, xi, nc + 1);
		    INTEGER(s)[i] = Ri18n_wcswidth(wc, 2147483647);
		    if (INTEGER(s)[i] < 1) INTEGER(s)[i] = nc;
		} else if (allowNA)
		    error(_("invalid multibyte string %d"), i+1);
		else
		    INTEGER(s)[i] = NA_INTEGER;
	    } else
		INTEGER(s)[i] = strlen(translateChar(sxi));
	} else
	    error(_("invalid '%s' argument"), "type");
	vmaxset(vmax);
    }
    R_FreeStringBufferL(&cbuff);
    if ((d = getAttrib(x, R_NamesSymbol)) != R_NilValue)
	setAttrib(s, R_NamesSymbol, d);
    if ((d = getAttrib(x, R_DimSymbol)) != R_NilValue)
	setAttrib(s, R_DimSymbol, d);
    if ((d = getAttrib(x, R_DimNamesSymbol)) != R_NilValue)
	setAttrib(s, R_DimNamesSymbol, d);
    UNPROTECT(2);
    return s;
}
SEXP
RS_PostgreSQL_CopyInDataframe(Con_Handle * conHandle, SEXP x, SEXP nrow, SEXP ncol)
{
    S_EVALUATOR RS_DBI_connection * con;
    int nr, nc, i, j;
    const char *cna ="\\N", *tmp=NULL /* -Wall */;
    char cdec = '.';

    PGconn *my_connection;
    int pqretcode;
    nr = asInteger(nrow);
    nc = asInteger(ncol);
    const int buff_threshold = 8000;

    con = RS_DBI_getConnection(conHandle);
    my_connection = (PGconn *) con->drvConnection;

    if(isVectorList(x)) { /* A data frame */
        R_StringBuffer rstrbuf = {NULL, 0, 10000};
        
        char *strBuf  = Calloc(buff_threshold * 2 + 2, char); /* + 2 for '\t' or '\n' plus '\0'*/
        char *strendp = strBuf;
        SEXP *levels;
        *strendp = '\0';

        R_AllocStringBuffer(10000, &rstrbuf);
	/* handle factors internally, check integrity */
	levels = (SEXP *) R_alloc(nc, sizeof(SEXP));
	for(j = 0; j < nc; j++) {
            SEXP xj;
	    xj = VECTOR_ELT(x, j);
	    if(LENGTH(xj) != nr)
		error(("corrupt data frame -- length of column %d does not not match nrows"), j+1);
	    if(inherits(xj, "factor")) {
		levels[j] = getAttrib(xj, R_LevelsSymbol);
	    } else levels[j] = R_NilValue;
	}

	for(i = 0; i < nr; i++) {
	    for(j = 0; j < nc; j++) {
                SEXP xj;
		xj = VECTOR_ELT(x, j);
		if(j > 0){
                    *strendp++ =  '\t';/*need no size count check here*/
                }
		if(isna(xj, i)) tmp = cna;
		else {
		    if(!isNull(levels[j])) {
			/* We cannot assume factors have integer levels */
			if(TYPEOF(xj) == INTSXP){
                            tmp = EncodeElementS(levels[j], INTEGER(xj)[i] - 1,
                                                 &rstrbuf, cdec);
			}else if(TYPEOF(xj) == REALSXP){
                            tmp = EncodeElementS(levels[j], REAL(xj)[i] - 1,
                                                 &rstrbuf, cdec);
			}else
			    error("column %s claims to be a factor but does not have numeric codes", j+1);
		    } else {
			tmp = EncodeElementS(xj, i, 
					     &rstrbuf, cdec);
		    }
		}
                {
                    size_t n;
                    size_t len = strendp - strBuf;
                    n = strlen(tmp);
                    if (len + n < buff_threshold){
                        memcpy(strendp, tmp, n);/* we already know the length */
                        strendp += n;
                    }else if(n < buff_threshold){ /*copy and flush*/
                        memcpy(strendp, tmp, n);/* we already know the length */
                        pqretcode = PQputCopyData(my_connection, strBuf, len + n);
              	        chkpqcopydataerr(my_connection, pqretcode);
                        strendp = strBuf;
                    }else{ /*flush and copy current*/
                        if(len > 0){
                            pqretcode = PQputCopyData(my_connection, strBuf, len);
                            chkpqcopydataerr(my_connection, pqretcode);
                            strendp = strBuf;
                        }
                        pqretcode = PQputCopyData(my_connection, tmp, n);
                        chkpqcopydataerr(my_connection, pqretcode);
                    }
                }
	    }
            *strendp = '\n'; strendp +=1; *strendp='\0';
	}
        pqretcode = PQputCopyData(my_connection, strBuf, strendp - strBuf);
        chkpqcopydataerr(my_connection, pqretcode);
        Free(strBuf);
        R_FreeStringBuffer(&rstrbuf);
    }
    PQputCopyEnd(my_connection, NULL);
    return R_NilValue;
}
/* a version of EncodeElement with different escaping of char strings */
static const char
*EncodeElementS(SEXP x, int indx,
                R_StringBuffer *buff, char cdec)
{
    switch(TYPEOF(x)) {
       case STRSXP:
       {
	    const char *s = translateCharUTF8(STRING_ELT(x, indx));
	    char *u, *cbuf;
            int j, len, blen, offset;
	    len = strlen(s);
	    blen = len * 2 + 1;
            R_AllocStringBuffer(blen, buff);
	    u = cbuf = buff->data;
	    offset = 0;
	    for (j = 0; j < len; j++){
                switch(s[offset+j]){
/* http://www.postgresql.org/docs/8.1/static/sql-copy.html */
                    case '\b':
                        *u++ = '\\';
                        *u++ = 'b';
                        break;
                    case '\f':
                        *u++ = '\\';
                        *u++ = 'f';
                        break;
                    case '\n':
                        *u++ = '\\';
                        *u++ = 'n';
                        break;
                    case '\r':
                        *u++ = '\\';
                        *u++ = 'r';
                        break;
                    case '\t':
                        *u++ = '\\';
                        *u++ = 't';
                        break;
                    case '\v':
                        *u++ = '\\';
                        *u++ = 'v';
                        break;
                    case '\\':
                        *u++ = '\\';
                        *u++ = '\\';
                        break;
                    default:
		        *u++ = s[offset+j];
                }
            }
            *u = '\0';
            return buff->data;
        }
        case LGLSXP:{
            int value;
            value = LOGICAL(x)[indx];
            if(value == TRUE) return "true";
            if(value == FALSE) return "false";
            return "\\N";
        }
        case INTSXP:{
            int value;
            value = INTEGER(x)[indx];
            if(ISNA(value)) return "\\N";
            snprintf(buff->data, buff->bufsize, "%d", value);
            return buff->data;
        }
        case REALSXP:{
            double value = REAL(x)[indx];
            if (!R_FINITE(value)) {
                if(ISNA(value)) return "\\N";
                else if(ISNAN(value)) return "NaN";
                else if(value > 0) return "Inf";
                else return "-Inf";
            }
            snprintf(buff->data, buff->bufsize, "%.15g", value);
            return buff->data;
        }
        default:
            return NULL; 
    }
    return NULL; 
}
Beispiel #13
0
SEXP writetable(SEXP call, SEXP op, SEXP args, SEXP env)
{
    SEXP x, sep, rnames, eol, na, dec, quote, xj;
    int nr, nc, i, j, qmethod;
    Rboolean wasopen, quote_rn = FALSE, *quote_col;
    Rconnection con;
    const char *csep, *ceol, *cna, *sdec, *tmp=NULL /* -Wall */;
    char cdec;
    SEXP *levels;
    R_StringBuffer strBuf = {NULL, 0, MAXELTSIZE};
    wt_info wi;
    RCNTXT cntxt;

    args = CDR(args);

    x = CAR(args);		   args = CDR(args);
    /* this is going to be a connection open or openable for writing */
    if(!inherits(CAR(args), "connection"))
	error(_("'file' is not a connection"));
    con = getConnection(asInteger(CAR(args))); args = CDR(args);
    if(!con->canwrite)
	error(_("cannot write to this connection"));
    wasopen = con->isopen;
    if(!wasopen) {
	strcpy(con->mode, "wt");
	if(!con->open(con)) error(_("cannot open the connection"));
    }
    nr = asInteger(CAR(args));	   args = CDR(args);
    nc = asInteger(CAR(args));	   args = CDR(args);
    rnames = CAR(args);		   args = CDR(args);
    sep = CAR(args);		   args = CDR(args);
    eol = CAR(args);		   args = CDR(args);
    na = CAR(args);		   args = CDR(args);
    dec = CAR(args);		   args = CDR(args);
    quote = CAR(args);		   args = CDR(args);
    qmethod = asLogical(CAR(args));

    if(nr == NA_INTEGER) error(_("invalid '%s' argument"), "nr");
    if(nc == NA_INTEGER) error(_("invalid '%s' argument"), "nc");
    if(!isNull(rnames) && !isString(rnames))
	error(_("invalid '%s' argument"), "rnames");
    if(!isString(sep)) error(_("invalid '%s' argument"), "sep");
    if(!isString(eol)) error(_("invalid '%s' argument"), "eol");
    if(!isString(na)) error(_("invalid '%s' argument"), "na");
    if(!isString(dec)) error(_("invalid '%s' argument"), "dec");
    if(qmethod == NA_LOGICAL) error(_("invalid '%s' argument"), "qmethod");
    csep = translateChar(STRING_ELT(sep, 0));
    ceol = translateChar(STRING_ELT(eol, 0));
    cna = translateChar(STRING_ELT(na, 0));
    sdec = translateChar(STRING_ELT(dec, 0));
    if(strlen(sdec) != 1)
	error(_("'dec' must be a single character"));
    cdec = sdec[0];
    quote_col = (Rboolean *) R_alloc(nc, sizeof(Rboolean));
    for(j = 0; j < nc; j++) quote_col[j] = FALSE;
    for(i = 0; i < length(quote); i++) { /* NB, quote might be NULL */
	int this = INTEGER(quote)[i];
	if(this == 0) quote_rn = TRUE;
	if(this >  0) quote_col[this - 1] = TRUE;
    }
    R_AllocStringBuffer(0, &strBuf);
    PrintDefaults();
    wi.savedigits = R_print.digits; R_print.digits = DBL_DIG;/* MAX precision */
    wi.con = con;
    wi.wasopen = wasopen;
    wi.buf = &strBuf;
    begincontext(&cntxt, CTXT_CCODE, call, R_BaseEnv, R_BaseEnv,
		 R_NilValue, R_NilValue);
    cntxt.cend = &wt_cleanup;
    cntxt.cenddata = &wi;

    if(isVectorList(x)) { /* A data frame */

	/* handle factors internally, check integrity */
	levels = (SEXP *) R_alloc(nc, sizeof(SEXP));
	for(j = 0; j < nc; j++) {
	    xj = VECTOR_ELT(x, j);
	    if(LENGTH(xj) != nr)
		error(_("corrupt data frame -- length of column %d does not not match nrows"), j+1);
	    if(inherits(xj, "factor")) {
		levels[j] = getAttrib(xj, R_LevelsSymbol);
	    } else levels[j] = R_NilValue;
	}

	for(i = 0; i < nr; i++) {
	    if(i % 1000 == 999) R_CheckUserInterrupt();
	    if(!isNull(rnames))
		Rconn_printf(con, "%s%s",
			     EncodeElement2(rnames, i, quote_rn, qmethod,
					    &strBuf, cdec), csep);
	    for(j = 0; j < nc; j++) {
		xj = VECTOR_ELT(x, j);
		if(j > 0) Rconn_printf(con, "%s", csep);
		if(isna(xj, i)) tmp = cna;
		else {
		    if(!isNull(levels[j])) {
			/* We do not assume factors have integer levels,
			   although they should. */
			if(TYPEOF(xj) == INTSXP)
			    tmp = EncodeElement2(levels[j], INTEGER(xj)[i] - 1,
						 quote_col[j], qmethod,
						 &strBuf, cdec);
			else if(TYPEOF(xj) == REALSXP)
			    tmp = EncodeElement2(levels[j], 
						 (int) (REAL(xj)[i] - 1),
						 quote_col[j], qmethod,
						 &strBuf, cdec);
			else
			    error("column %s claims to be a factor but does not have numeric codes", j+1);
		    } else {
			tmp = EncodeElement2(xj, i, quote_col[j], qmethod,
					     &strBuf, cdec);
		    }
		    /* if(cdec) change_dec(tmp, cdec, TYPEOF(xj)); */
		}
		Rconn_printf(con, "%s", tmp);
	    }
	    Rconn_printf(con, "%s", ceol);
	}

    } else { /* A matrix */

	if(!isVectorAtomic(x))
	    UNIMPLEMENTED_TYPE("write.table, matrix method", x);
	/* quick integrity check */
	if(LENGTH(x) != nr * nc)
	    error(_("corrupt matrix -- dims not not match length"));

	for(i = 0; i < nr; i++) {
	    if(i % 1000 == 999) R_CheckUserInterrupt();
	    if(!isNull(rnames))
		Rconn_printf(con, "%s%s",
			     EncodeElement2(rnames, i, quote_rn, qmethod,
					    &strBuf, cdec), csep);
	    for(j = 0; j < nc; j++) {
		if(j > 0) Rconn_printf(con, "%s", csep);
		if(isna(x, i + j*nr)) tmp = cna;
		else {
		    tmp = EncodeElement2(x, i + j*nr, quote_col[j], qmethod,
					&strBuf, cdec);
		    /* if(cdec) change_dec(tmp, cdec, TYPEOF(x)); */
		}
		Rconn_printf(con, "%s", tmp);
	    }
	    Rconn_printf(con, "%s", ceol);
	}

    }
    endcontext(&cntxt);
    wt_cleanup(&wi);
    return R_NilValue;
}
Beispiel #14
0
static SEXP scanVector(SEXPTYPE type, int maxitems, int maxlines,
		       int flush, SEXP stripwhite, int blskip, LocalData *d)
{
    SEXP ans, bns;
    int blocksize, c, i, n, linesread, nprev,strip, bch;
    char *buffer;
    R_StringBuffer strBuf = {NULL, 0, MAXELTSIZE};

    if (maxitems > 0) blocksize = maxitems;
    else blocksize = SCAN_BLOCKSIZE;

    R_AllocStringBuffer(0, &strBuf);
    PROTECT(ans = allocVector(type, blocksize));

    nprev = 0; n = 0; linesread = 0; bch = 1;

    if (d->ttyflag) sprintf(ConsolePrompt, "1: ");

    strip = asLogical(stripwhite);

    for (;;) {
	if(n % 10000 == 9999) R_CheckUserInterrupt();
	if (bch == R_EOF) {
	    if (d->ttyflag) R_ClearerrConsole();
	    break;
	}
	else if (bch == '\n') {
	    linesread++;
	    if (linesread == maxlines)
		break;
	    if (d->ttyflag) sprintf(ConsolePrompt, "%d: ", n + 1);
	    nprev = n;
	}
	if (n == blocksize) {
	    /* enlarge the vector*/
	    bns = ans;
	    if(blocksize > INT_MAX/2) error(_("too many items"));
	    blocksize = 2 * blocksize;
	    ans = allocVector(type, blocksize);
	    UNPROTECT(1);
	    PROTECT(ans);
	    copyVector(ans, bns);
	}
	buffer = fillBuffer(type, strip, &bch, d, &strBuf);
	if (nprev == n && strlen(buffer)==0 &&
	    ((blskip && bch =='\n') || bch == R_EOF)) {
	    if (d->ttyflag || bch == R_EOF)
		break;
	}
	else {
	    extractItem(buffer, ans, n, d);
	    if (++n == maxitems) {
		if (d->ttyflag && bch != '\n') { /* MBCS-safe */
		    while ((c = scanchar(FALSE, d)) != '\n')
			;
		}
		break;
	    }
	}
	if (flush && (bch != '\n') && (bch != R_EOF)) { /* MBCS-safe */
	    while ((c = scanchar(FALSE, d)) != '\n' && (c != R_EOF));
	    bch = c;
	}
    }
    if (!d->quiet) REprintf("Read %d item%s\n", n, (n == 1) ? "" : "s");
    if (d->ttyflag) ConsolePrompt[0] = '\0';

    if (n == 0) {
	UNPROTECT(1);
	R_FreeStringBuffer(&strBuf);
	return allocVector(type,0);
    }
    if (n == maxitems) {
	UNPROTECT(1);
	R_FreeStringBuffer(&strBuf);
	return ans;
    }

    bns = allocVector(type, n);
    switch (type) {
    case LGLSXP:
    case INTSXP:
	for (i = 0; i < n; i++)
	    INTEGER(bns)[i] = INTEGER(ans)[i];
	break;
    case REALSXP:
	for (i = 0; i < n; i++)
	    REAL(bns)[i] = REAL(ans)[i];
	break;
    case CPLXSXP:
	for (i = 0; i < n; i++)
	    COMPLEX(bns)[i] = COMPLEX(ans)[i];
	break;
    case STRSXP:
	for (i = 0; i < n; i++)
	    SET_STRING_ELT(bns, i, STRING_ELT(ans, i));
	break;
    case RAWSXP:
	for (i = 0; i < n; i++)
	    RAW(bns)[i] = RAW(ans)[i];
	break;
    default:
	UNIMPLEMENTED_TYPEt("scanVector", type);
    }
    UNPROTECT(1);
    R_FreeStringBuffer(&strBuf);
    return bns;
}
Beispiel #15
0
/*XX  Can we pass this routine an R_StringBuffer? appears so.
   But do we have to worry about continuation lines and whatever
   is currently in the buffer before we call this? In other words,
   what if this appends to the existing content. Appears it writes in
   directly at position 0.
 */
static char *
fillBuffer(SEXPTYPE type, int strip, int *bch, LocalData *d,
	   R_StringBuffer *buffer)
{
/* The basic reader function, called from scanVector() and scanFrame().
   Reads into _buffer_	which later will be read out by extractItem().

   bch is used to distinguish \r, \n and EOF from more input available.
*/
    char *bufp;
    int c, quote, filled, nbuf = MAXELTSIZE, m, mm = 0;
    Rboolean dbcslocale = (MB_CUR_MAX == 2);

    m = 0;
    filled = 1;
    if (d->sepchar == 0) {
	/* skip all space or tabs: only look at lead bytes here */
	strip = 0; /* documented to be ignored in this case */
	while ((c = scanchar(FALSE, d)) == ' ' || c == '\t') ;
	if (c == '\n' || c == '\r' || c == R_EOF) {
	    filled = c;
	    goto donefill;
	}
	if ((type == STRSXP || type == NILSXP) && strchr(d->quoteset, c)) {
	    quote = c;
	    while ((c = scanchar(TRUE, d)) != R_EOF && c != quote) {
		if (m >= nbuf - 3) {
		    nbuf *= 2;
		    R_AllocStringBuffer(nbuf, buffer);
		}
		if (c == '\\') {
		    /* If this is an embedded quote, unquote it, but
		       otherwise keep backslashes */
		    c = scanchar(TRUE, d);
		    if (c == R_EOF) break;
		    if(c != quote) buffer->data[m++] = '\\';
		}
		buffer->data[m++] = (char) c;
		if(dbcslocale && btowc(c) == WEOF)
		    buffer->data[m++] = (char) scanchar2(d);
	    }
	    if (c == R_EOF)
		warning(_("EOF within quoted string"));
	    c = scanchar(FALSE, d);
	    mm = m;
	}
	else { /* not a quoted char string */
	    do {
		if (m >= nbuf - 3) {
		    nbuf *= 2;
		    R_AllocStringBuffer(nbuf, buffer);
		}
		buffer->data[m++] = (char) c;
		if(dbcslocale && btowc(c) == WEOF)
		    buffer->data[m++] = (char) scanchar2(d);
		c = scanchar(FALSE, d);
	    } while (!Rspace(c) && c != R_EOF);
	}
	/* skip all space or tabs: only look at lead bytes here */
	while (c == ' ' || c == '\t') c = scanchar(FALSE, d);
	if (c == '\n' || c == '\r' || c == R_EOF)
	    filled = c;
	else
	    unscanchar(c, d);
    }
    else { /* have separator */
	while ((c = scanchar(FALSE, d)) != d->sepchar &&
	       c != '\n' && c != '\r' && c != R_EOF)
	    {
		/* eat white space */
		if (type != STRSXP)
		    while (c == ' ' || c == '\t')
			if ((c = scanchar(FALSE, d)) == d->sepchar
			    || c == '\n' || c == '\r' || c == R_EOF) {
			    filled = c;
			    goto donefill;
			}
		/* CSV style quoted string handling */
		if ((type == STRSXP || type == NILSXP)
		    && c != 0 && strchr(d->quoteset, c)) {
		    quote = c;
		inquote:
		    while ((c = scanchar(TRUE, d)) != R_EOF && c != quote) {
			if (m >= nbuf - 3) {
			    nbuf *= 2;
			    R_AllocStringBuffer(nbuf, buffer);
			}
			buffer->data[m++] = (char) c;
			if(dbcslocale && btowc(c) == WEOF)
			    buffer->data[m++] = (char) scanchar2(d);
		    }
		    if (c == R_EOF)
			warning(_("EOF within quoted string"));
		    c = scanchar(TRUE, d); /* only peek at lead byte
					      unless ASCII */
		    if (c == quote) {
			if (m >= nbuf - 3) {
			    nbuf *= 2;
			    R_AllocStringBuffer(nbuf, buffer);
			}
			buffer->data[m++] = (char) quote;
			goto inquote; /* FIXME: Ick! Clean up logic */
		    }
		    mm = m;
		    if (c == d->sepchar || c == '\n' || c == '\r' || c == R_EOF){
			filled = c;
			goto donefill;
		    }
		    else {
			unscanchar(c, d);
			continue;
		    }
		} /* end of CSV-style quote handling */
		if (!strip || m > 0 || !Rspace(c)) { /* only lead byte */
		    if (m >= nbuf - 3) {
			nbuf *= 2;
			R_AllocStringBuffer(nbuf, buffer);
		    }
		    buffer->data[m++] = (char) c;
		    if(dbcslocale && btowc(c) == WEOF)
			buffer->data[m++] = (char) scanchar2(d);
		}
	    }
	filled = c; /* last lead byte in a DBCS */
    }
 donefill:
    /* strip trailing white space, if desired and if item is non-null */
    bufp = &buffer->data[m];
    if (strip && m > mm) {
	do {c = (int)*--bufp;} while(m-- > mm && Rspace(c));
	bufp++;
    }
    *bufp = '\0';
    /* Remove UTF-8 BOM */
    if(d->atStart && utf8locale &&
       !memcmp(buffer->data, "\xef\xbb\xbf", 3))
	memmove(buffer->data, buffer->data+3, strlen(buffer->data) + 1);
    d->atStart = FALSE;
    *bch = filled;
    return buffer->data;
}
Beispiel #16
0
const char *EncodeString(SEXP s, int w, int quote, Rprt_adj justify)
{
    int b, b0, i, j, cnt;
    const char *p; char *q, buf[11];
    cetype_t ienc = CE_NATIVE;

    /* We have to do something like this as the result is returned, and
       passed on by EncodeElement -- so no way could be end user be
       responsible for freeing it.  However, this is not thread-safe. */

    static R_StringBuffer gBuffer = {NULL, 0, BUFSIZE};
    R_StringBuffer *buffer = &gBuffer;

    if (s == NA_STRING) {
	p = quote ? CHAR(R_print.na_string) : CHAR(R_print.na_string_noquote);
	cnt = i = (int)(quote ? strlen(CHAR(R_print.na_string)) :
			strlen(CHAR(R_print.na_string_noquote)));
	quote = 0;
    } else {
#ifdef Win32
	if(WinUTF8out) {
	    ienc = getCharCE(s);
	    if(ienc == CE_UTF8) {
		p = CHAR(s);
		i = Rstrlen(s, quote);
		cnt = LENGTH(s);
	    } else {
		p = translateChar0(s);
		if(p == CHAR(s)) {
		    i = Rstrlen(s, quote);
		    cnt = LENGTH(s);
		} else {
		    cnt = strlen(p);
		    i = Rstrwid(p, cnt, CE_NATIVE, quote);
		}
		ienc = CE_NATIVE;
	    }
	} else
#endif
	{
	    if(IS_BYTES(s)) {
		p = CHAR(s);
		cnt = (int) strlen(p);
		const char *q;
		char *pp = R_alloc(4*cnt+1, 1), *qq = pp, buf[5];
		for (q = p; *q; q++) {
		    unsigned char k = (unsigned char) *q;
		    if (k >= 0x20 && k < 0x80) {
			*qq++ = *q;
			if (quote && *q == '"') cnt++;
		    } else {
			snprintf(buf, 5, "\\x%02x", k);
			for(j = 0; j < 4; j++) *qq++ = buf[j];
			cnt += 3;
		    }
		}
		*qq = '\0';
		p = pp;
		i = cnt;
	    } else {
		p = translateChar(s);
		if(p == CHAR(s)) {
		    i = Rstrlen(s, quote);
		    cnt = LENGTH(s);
		} else {
		    cnt = (int) strlen(p);
		    i = Rstrwid(p, cnt, CE_NATIVE, quote);
		}
	    }
	}
    }

    /* We need enough space for the encoded string, including escapes.
       Octal encoding turns one byte into four.
       \u encoding can turn a multibyte into six or ten,
       but it turns 2/3 into 6, and 4 (and perhaps 5/6) into 10.
       Let's be wasteful here (the worst case appears to be an MBCS with
       one byte for an upper-plane Unicode point output as ten bytes,
       but I doubt that such an MBCS exists: two bytes is plausible).

       +2 allows for quotes, +6 for UTF_8 escapes.
     */
    q = R_AllocStringBuffer(imax2(5*cnt+8, w), buffer);
    b = w - i - (quote ? 2 : 0); /* total amount of padding */
    if(justify == Rprt_adj_none) b = 0;
    if(b > 0 && justify != Rprt_adj_left) {
	b0 = (justify == Rprt_adj_centre) ? b/2 : b;
	for(i = 0 ; i < b0 ; i++) *q++ = ' ';
	b -= b0;
    }
    if(quote) *q++ = (char) quote;
    if(mbcslocale || ienc == CE_UTF8) {
	int j, res;
	mbstate_t mb_st;
	wchar_t wc;
	unsigned int k; /* not wint_t as it might be signed */
#ifndef __STDC_ISO_10646__
	Rboolean Unicode_warning = FALSE;
#endif
	if(ienc != CE_UTF8)  mbs_init(&mb_st);
#ifdef Win32
	else if(WinUTF8out) { memcpy(q, UTF8in, 3); q += 3; }
#endif
	for (i = 0; i < cnt; i++) {
	    res = (int)((ienc == CE_UTF8) ? utf8toucs(&wc, p):
			mbrtowc(&wc, p, MB_CUR_MAX, NULL));
	    if(res >= 0) { /* res = 0 is a terminator */
		k = wc;
		/* To be portable, treat \0 explicitly */
		if(res == 0) {k = 0; wc = L'\0';}
		if(0x20 <= k && k < 0x7f && iswprint(wc)) {
		    switch(wc) {
		    case L'\\': *q++ = '\\'; *q++ = '\\'; p++;
			break;
		    case L'\'':
		    case L'"':
			if(quote == *p)  *q++ = '\\'; *q++ = *p++;
			break;
		    default:
			for(j = 0; j < res; j++) *q++ = *p++;
			break;
		    }
		} else if (k < 0x80) {
		    /* ANSI Escapes */
		    switch(wc) {
		    case L'\a': *q++ = '\\'; *q++ = 'a'; break;
		    case L'\b': *q++ = '\\'; *q++ = 'b'; break;
		    case L'\f': *q++ = '\\'; *q++ = 'f'; break;
		    case L'\n': *q++ = '\\'; *q++ = 'n'; break;
		    case L'\r': *q++ = '\\'; *q++ = 'r'; break;
		    case L'\t': *q++ = '\\'; *q++ = 't'; break;
		    case L'\v': *q++ = '\\'; *q++ = 'v'; break;
		    case L'\0': *q++ = '\\'; *q++ = '0'; break;

		    default:
			/* print in octal */
			snprintf(buf, 5, "\\%03o", k);
			for(j = 0; j < 4; j++) *q++ = buf[j];
			break;
		    }
		    p++;
		} else {
		    if(iswprint(wc)) {
			/* The problem here is that wc may be
			   printable according to the Unicode tables,
			   but it may not be printable on the output
			   device concerned. */
			for(j = 0; j < res; j++) *q++ = *p++;
		    } else {
#ifndef Win32
# ifndef __STDC_ISO_10646__
			Unicode_warning = TRUE;
# endif
			if(k > 0xffff)
			    snprintf(buf, 11, "\\U%08x", k);
			else
#endif
			    snprintf(buf, 11, "\\u%04x", k);
			j = (int) strlen(buf);
			memcpy(q, buf, j);
			q += j;
			p += res;
		    }
		    i += (res - 1);
		}

	    } else { /* invalid char */
		snprintf(q, 5, "\\x%02x", *((unsigned char *)p));
		q += 4; p++;
	    }
	}
#ifndef __STDC_ISO_10646__
	if(Unicode_warning)
	    warning(_("it is not known that wchar_t is Unicode on this platform"));
#endif

    } else
	for (i = 0; i < cnt; i++) {

	    /* ASCII */
	    if((unsigned char) *p < 0x80) {
		if(*p != '\t' && isprint((int)*p)) { /* Windows has \t as printable */
		    switch(*p) {
		    case '\\': *q++ = '\\'; *q++ = '\\'; break;
		    case '\'':
		    case '"':
			if(quote == *p)  *q++ = '\\'; *q++ = *p; break;
		    default: *q++ = *p; break;
		    }
		} else switch(*p) {
			/* ANSI Escapes */
		    case '\a': *q++ = '\\'; *q++ = 'a'; break;
		    case '\b': *q++ = '\\'; *q++ = 'b'; break;
		    case '\f': *q++ = '\\'; *q++ = 'f'; break;
		    case '\n': *q++ = '\\'; *q++ = 'n'; break;
		    case '\r': *q++ = '\\'; *q++ = 'r'; break;
		    case '\t': *q++ = '\\'; *q++ = 't'; break;
		    case '\v': *q++ = '\\'; *q++ = 'v'; break;
		    case '\0': *q++ = '\\'; *q++ = '0'; break;

		    default:
			/* print in octal */
			snprintf(buf, 5, "\\%03o", (unsigned char) *p);
			for(j = 0; j < 4; j++) *q++ = buf[j];
			break;
		    }
		p++;
	    } else {  /* 8 bit char */
#ifdef Win32 /* It seems Windows does not know what is printable! */
		*q++ = *p++;
#else
		if(!isprint((int)*p & 0xff)) {
		    /* print in octal */
		    snprintf(buf, 5, "\\%03o", (unsigned char) *p);
		    for(j = 0; j < 4; j++) *q++ = buf[j];
		    p++;
		} else *q++ = *p++;
#endif
	    }
	}

#ifdef Win32
    if(WinUTF8out && ienc == CE_UTF8)  { memcpy(q, UTF8out, 3); q += 3; }
#endif
    if(quote) *q++ = (char) quote;
    if(b > 0 && justify != Rprt_adj_right) {
	for(i = 0 ; i < b ; i++) *q++ = ' ';
    }
    *q = '\0';
    return buffer->data;
}
Beispiel #17
0
SEXP attribute_hidden do_sprintf(SEXP call, SEXP op, SEXP args, SEXP env)
{
    int i, nargs, cnt, v, thislen, nfmt, nprotect = 0;
    /* fmt2 is a copy of fmt with '*' expanded.
       bit will hold numeric formats and %<w>s, so be quite small. */
    char fmt[MAXLINE+1], fmt2[MAXLINE+10], *fmtp, bit[MAXLINE+1],
	*outputString;
    const char *formatString;
    size_t n, cur, chunk;

    SEXP format, _this, a[MAXNARGS], ans /* -Wall */ = R_NilValue;
    int ns, maxlen, lens[MAXNARGS], nthis, nstar, star_arg = 0;
    static R_StringBuffer outbuff = {NULL, 0, MAXELTSIZE};
    Rboolean has_star, use_UTF8;

#define _my_sprintf(_X_)						\
    {									\
	int nc = snprintf(bit, MAXLINE+1, fmtp, _X_);			\
	if (nc > MAXLINE)						\
	    error(_("required resulting string length %d is greater than maximal %d"), \
		  nc, MAXLINE);						\
    }

    nargs = length(args);
    /* grab the format string */
    format = CAR(args);
    if (!isString(format))
	error(_("'fmt' is not a character vector"));
    nfmt = length(format);
    if (nfmt == 0) return allocVector(STRSXP, 0);
    args = CDR(args); nargs--;
    if(nargs >= MAXNARGS)
	error(_("only %d arguments are allowed"), MAXNARGS);

    /* record the args for possible coercion and later re-ordering */
    for(i = 0; i < nargs; i++, args = CDR(args)) {
	SEXPTYPE t_ai;
	a[i] = CAR(args);
	if((t_ai = TYPEOF(a[i])) == LANGSXP || t_ai == SYMSXP) /* << maybe add more .. */
	    error(_("invalid type of argument[%d]: '%s'"),
		  i+1, CHAR(type2str(t_ai)));
	lens[i] = length(a[i]);
	if(lens[i] == 0) return allocVector(STRSXP, 0);
    }

#define CHECK_maxlen							\
    maxlen = nfmt;							\
    for(i = 0; i < nargs; i++)						\
	if(maxlen < lens[i]) maxlen = lens[i];				\
    if(maxlen % nfmt)							\
	error(_("arguments cannot be recycled to the same length"));	\
    for(i = 0; i < nargs; i++)						\
	if(maxlen % lens[i])						\
	    error(_("arguments cannot be recycled to the same length"))

    CHECK_maxlen;

    outputString = R_AllocStringBuffer(0, &outbuff);

    /* We do the format analysis a row at a time */
    for(ns = 0; ns < maxlen; ns++) {
	outputString[0] = '\0';
	use_UTF8 = getCharCE(STRING_ELT(format, ns % nfmt)) == CE_UTF8;
	if (!use_UTF8) {
	    for(i = 0; i < nargs; i++) {
		if (!isString(a[i])) continue;
		if (getCharCE(STRING_ELT(a[i], ns % lens[i])) == CE_UTF8) {
		    use_UTF8 = TRUE; break;
		}
	    }
	}

	formatString = TRANSLATE_CHAR(format, ns % nfmt);
	n = strlen(formatString);
	if (n > MAXLINE)
	    error(_("'fmt' length exceeds maximal format length %d"), MAXLINE);
	/* process the format string */
	for (cur = 0, cnt = 0; cur < n; cur += chunk) {
	    const char *curFormat = formatString + cur, *ss;
	    char *starc;
	    ss = NULL;
	    if (formatString[cur] == '%') { /* handle special format command */

		if (cur < n - 1 && formatString[cur + 1] == '%') {
		    /* take care of %% in the format */
		    chunk = 2;
		    strcpy(bit, "%");
		}
		else {
		    /* recognise selected types from Table B-1 of K&R */
		    /* NB: we deal with "%%" in branch above. */
		    /* This is MBCS-OK, as we are in a format spec */
		    chunk = strcspn(curFormat + 1, "diosfeEgGxXaA") + 2;
		    if (cur + chunk > n)
			error(_("unrecognised format specification '%s'"), curFormat);

		    strncpy(fmt, curFormat, chunk);
		    fmt[chunk] = '\0';

		    nthis = -1;
		    /* now look for %n$ or %nn$ form */
		    if (strlen(fmt) > 3 && fmt[1] >= '1' && fmt[1] <= '9') {
			v = fmt[1] - '0';
			if(fmt[2] == '$') {
			    if(v > nargs)
				error(_("reference to non-existent argument %d"), v);
			    nthis = v-1;
			    memmove(fmt+1, fmt+3, strlen(fmt)-2);
			} else if(fmt[2] >= '0' && fmt[2] <= '9' && fmt[3] == '$') {
			    v = 10*v + fmt[2] - '0';
			    if(v > nargs)
				error(_("reference to non-existent argument %d"), v);
			    nthis = v-1;
			    memmove(fmt+1, fmt+4, strlen(fmt)-3);
			}
		    }

		    starc = Rf_strchr(fmt, '*');
		    if (starc) { /* handle  *  format if present */
			nstar = -1;
			if (strlen(starc) > 3 && starc[1] >= '1' && starc[1] <= '9') {
			    v = starc[1] - '0';
			    if(starc[2] == '$') {
				if(v > nargs)
				    error(_("reference to non-existent argument %d"), v);
				nstar = v-1;
				memmove(starc+1, starc+3, strlen(starc)-2);
			    } else if(starc[2] >= '0' && starc[2] <= '9'
				      && starc[3] == '$') {
				v = 10*v + starc[2] - '0';
				if(v > nargs)
				    error(_("reference to non-existent argument %d"), v);
				nstar = v-1;
				memmove(starc+1, starc+4, strlen(starc)-3);
			    }
			}

			if(nstar < 0) {
			    if (cnt >= nargs) error(_("too few arguments"));
			    nstar = cnt++;
			}

			if (Rf_strchr(starc+1, '*'))
			    error(_("at most one asterisk '*' is supported in each conversion specification"));

			_this = a[nstar];
			if(ns == 0 && TYPEOF(_this) == REALSXP) {
			    _this = coerceVector(_this, INTSXP);
			    PROTECT(a[nstar] = _this);
			    nprotect++;
			}
			if(TYPEOF(_this) != INTSXP || LENGTH(_this)<1 ||
			   INTEGER(_this)[ns % LENGTH(_this)] == NA_INTEGER)
			    error(_("argument for '*' conversion specification must be a number"));
			star_arg = INTEGER(_this)[ns % LENGTH(_this)];
			has_star = TRUE;
		    }
		    else
			has_star = FALSE;

		    if (fmt[strlen(fmt) - 1] == '%') {
			/* handle % with formatting options */
			if (has_star)
			    snprintf(bit, MAXLINE+1, fmt, star_arg);
			else
			    strcpy(bit, fmt);
			/* was sprintf(..)  for which some compiler warn */
		    } else {
			Rboolean did_this = FALSE;
			if(nthis < 0) {
			    if (cnt >= nargs) error(_("too few arguments"));
			    nthis = cnt++;
			}
			_this = a[nthis];
			if (has_star) {
			    size_t nf; char *p, *q = fmt2;
			    for (p = fmt; *p; p++)
				if (*p == '*') q += sprintf(q, "%d", star_arg);
				else *q++ = *p;
			    *q = '\0';
			    nf = strlen(fmt2);
			    if (nf > MAXLINE)
				error(_("'fmt' length exceeds maximal format length %d"),
				      MAXLINE);
			    fmtp = fmt2;
			} else fmtp = fmt;

#define CHECK_this_length						\
			PROTECT(_this);					\
			thislen = length(_this);			\
			if(thislen == 0)				\
			    error(_("coercion has changed vector length to 0"))

			/* Now let us see if some minimal coercion
			   would be sensible, but only do so once, for ns = 0: */
			if(ns == 0) {
			    SEXP tmp; Rboolean do_check;
			    switch(*findspec(fmtp)) {
			    case 'd':
			    case 'i':
			    case 'o':
			    case 'x':
			    case 'X':
				if(TYPEOF(_this) == REALSXP) {
				    double r = REAL(_this)[0];
				    if((double)((int) r) == r)
					_this = coerceVector(_this, INTSXP);
				    PROTECT(a[nthis] = _this);
				    nprotect++;
				}
				break;
			    case 'a':
			    case 'A':
			    case 'e':
			    case 'f':
			    case 'g':
			    case 'E':
			    case 'G':
				if(TYPEOF(_this) != REALSXP &&
				   /* no automatic as.double(<string>) : */
				   TYPEOF(_this) != STRSXP) {
				    PROTECT(tmp = lang2(install("as.double"), _this));
#define COERCE_THIS_TO_A						\
				    _this = eval(tmp, env);		\
				    UNPROTECT(1);			\
				    PROTECT(a[nthis] = _this);		\
				    nprotect++;				\
				    did_this = TRUE;			\
				    CHECK_this_length;			\
				    do_check = (lens[nthis] == maxlen);	\
				    lens[nthis] = thislen; /* may have changed! */ \
				    if(do_check && thislen < maxlen) {	\
					CHECK_maxlen;			\
				    }

				    COERCE_THIS_TO_A
				}
				break;
			    case 's':
				if(TYPEOF(_this) != STRSXP) {
				    /* as.character method might call sprintf() */
				    size_t nc = strlen(outputString);
				    char *z = Calloc(nc+1, char);
				    strcpy(z, outputString);
				    PROTECT(tmp = lang2(install("as.character"), _this));

				    COERCE_THIS_TO_A
				    strcpy(outputString, z);
				    Free(z);
				}
				break;
			    default:
				break;
			    }
			} /* ns == 0 (first-time only) */

			if(!did_this)
			    CHECK_this_length;

			switch(TYPEOF(_this)) {
			case LGLSXP:
			    {
				int x = LOGICAL(_this)[ns % thislen];
				if (checkfmt(fmtp, "di"))
				    error(_("invalid format '%s'; %s"), fmtp,
					  _("use format %d or %i for logical objects"));
				if (x == NA_LOGICAL) {
				    fmtp[strlen(fmtp)-1] = 's';
				    _my_sprintf("NA")
				} else {
				    _my_sprintf(x)
				}
				break;
			    }
			case INTSXP:
			    {
				int x = INTEGER(_this)[ns % thislen];
				if (checkfmt(fmtp, "dioxX"))
				    error(_("invalid format '%s'; %s"), fmtp,
					  _("use format %d, %i, %o, %x or %X for integer objects"));
				if (x == NA_INTEGER) {
				    fmtp[strlen(fmtp)-1] = 's';
				    _my_sprintf("NA")
				} else {
				    _my_sprintf(x)
				}
				break;
			    }
Beispiel #18
0
/* This may return a R_alloc-ed result, so the caller has to manage the
   R_alloc stack */
attribute_hidden /* but not hidden on Windows, where it was used in tcltk.c */
const wchar_t *wtransChar(SEXP x)
{
    void * obj;
    const char *inbuf, *ans = CHAR(x);
    char *outbuf;
    wchar_t *p;
    size_t inb, outb, res, top;
    Rboolean knownEnc = FALSE;
    R_StringBuffer cbuff = {NULL, 0, MAXELTSIZE};

    if(TYPEOF(x) != CHARSXP)
	error(_("'%s' must be called on a CHARSXP"), "wtransChar");

    if(IS_BYTES(x))
	error(_("translating strings with \"bytes\" encoding is not allowed"));

    if(IS_LATIN1(x)) {
	if(!latin1_wobj) {
	    obj = Riconv_open(TO_WCHAR, "latin1");
	    if(obj == (void *)(-1))
		error(_("unsupported conversion from '%s' to '%s'"),
		      "latin1", TO_WCHAR);
	    latin1_wobj = obj;
	} else
	    obj = latin1_wobj;
	knownEnc = TRUE;
    } else if(IS_UTF8(x)) {
	if(!utf8_wobj) {
	    obj = Riconv_open(TO_WCHAR, "UTF-8");
	    if(obj == (void *)(-1)) 
		error(_("unsupported conversion from '%s' to '%s'"),
		      "latin1", TO_WCHAR);
	    utf8_wobj = obj;
	} else
	    obj = utf8_wobj;
	knownEnc = TRUE;
    } else {
	obj = Riconv_open(TO_WCHAR, "");
	if(obj == (void *)(-1))
#ifdef Win32
	    error(_("unsupported conversion to '%s' from codepage %d"),
		  TO_WCHAR, localeCP);
#else
	    error(_("unsupported conversion from '%s' to '%s'"), "", TO_WCHAR);
#endif
    }

    R_AllocStringBuffer(0, &cbuff);
top_of_loop:
    inbuf = ans; inb = strlen(inbuf);
    outbuf = cbuff.data; top = outb = cbuff.bufsize - 1;
    /* First initialize output */
    Riconv (obj, NULL, NULL, &outbuf, &outb);
next_char:
    /* Then convert input  */
    res = Riconv(obj, &inbuf , &inb, &outbuf, &outb);
    if(res == -1 && errno == E2BIG) {
	R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
	goto top_of_loop;
    } else if(res == -1 && (errno == EILSEQ || errno == EINVAL)) {
	if(outb < 5) {
	    R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
	    goto top_of_loop;
	}
	snprintf(outbuf, 5, "<%02x>", (unsigned char)*inbuf);
	outbuf += 4; outb -= 4;
	inbuf++; inb--;
	goto next_char;
	/* if(!knownEnc) Riconv_close(obj);
	   error(_("invalid input in wtransChar")); */
    }
    if(!knownEnc) Riconv_close(obj);
    res = (top - outb);
    /* terminator is 2 or 4 null bytes */
    p = (wchar_t *) R_alloc(res+4, 1);
    memset(p, 0, res+4);
    memcpy(p, cbuff.data, res);
    R_FreeStringBuffer(&cbuff);
    return p;
}
Beispiel #19
0
int R_nchar(SEXP string, nchar_type type_,
	    Rboolean allowNA, Rboolean keepNA, const char* msg_name)
{
    if (string == NA_STRING)
	return keepNA ? NA_INTEGER : 2;
    // else :
    switch(type_) {
    case Bytes:
	return LENGTH(string);
	break;
    case Chars:
	if (IS_UTF8(string)) {
	    const char *p = CHAR(string);
	    if (!utf8Valid(p)) {
		if (!allowNA)
		    error(_("invalid multibyte string, %s"), msg_name);
		return NA_INTEGER;
	    } else {
		int nc = 0;
		for( ; *p; p += utf8clen(*p)) nc++;
		return nc;
	    }
	} else if (IS_BYTES(string)) {
	    if (!allowNA) /* could do chars 0 */
		error(_("number of characters is not computable in \"bytes\" encoding, %s"),
		      msg_name);
	    return NA_INTEGER;
	} else if (mbcslocale) {
	    int nc = (int) mbstowcs(NULL, translateChar(string), 0);
	    if (!allowNA && nc < 0)
		error(_("invalid multibyte string, %s"), msg_name);
	    return (nc >= 0 ? nc : NA_INTEGER);
	} else
	    return ((int) strlen(translateChar(string)));
	break;
    case Width:
	if (IS_UTF8(string)) {
	    const char *p = CHAR(string);
	    if (!utf8Valid(p)) {
		if (!allowNA)
		    error(_("invalid multibyte string, %s"), msg_name);
		return NA_INTEGER;
	    } else {
		wchar_t wc1;
		int nc = 0;
		for( ; *p; p += utf8clen(*p)) {
		    utf8toucs(&wc1, p);
		    nc += Ri18n_wcwidth(wc1);
		}
		return nc;
	    }
	} else if (IS_BYTES(string)) {
	    if (!allowNA) /* could do width 0 */
		error(_("width is not computable for %s in \"bytes\" encoding"),
		      msg_name);
	    return NA_INTEGER;
	} else if (mbcslocale) {
	    const char *xi = translateChar(string);
	    int nc = (int) mbstowcs(NULL, xi, 0);
	    if (nc >= 0) {
		const void *vmax = vmaxget();
		wchar_t *wc = (wchar_t *)
		    R_AllocStringBuffer((nc+1)*sizeof(wchar_t), &cbuff);
		mbstowcs(wc, xi, nc + 1);
		int nci18n = Ri18n_wcswidth(wc, 2147483647);
		vmaxset(vmax);
		return (nci18n < 1) ? nc : nci18n;
	    } else if (allowNA)
		error(_("invalid multibyte string, %s"), msg_name);
	    else
		return NA_INTEGER;
	} else
	    return (int) strlen(translateChar(string));

    } // switch
    return NA_INTEGER; // -Wall
} // R_nchar()
Beispiel #20
0
/* This may return a R_alloc-ed result, so the caller has to manage the
   R_alloc stack */
const char *reEnc(const char *x, cetype_t ce_in, cetype_t ce_out, int subst)
{
    void * obj;
    const char *inbuf;
    char *outbuf, *p;
    size_t inb, outb, res, top;
    char *tocode = NULL, *fromcode = NULL;
#ifdef Win32
    char buf[20];
#endif
    R_StringBuffer cbuff = {NULL, 0, MAXELTSIZE};

    /* We can only encode from Symbol to UTF-8 */
    if(ce_in == ce_out || ce_out == CE_SYMBOL ||
       ce_in == CE_ANY || ce_out == CE_ANY) return x;
    if(ce_in == CE_SYMBOL) {
	if(ce_out == CE_UTF8) {
	    size_t nc = 3*strlen(x)+1; /* all in BMP */
	    p = R_alloc(nc, 1);
	    Rf_AdobeSymbol2utf8(p, x, nc);
	    return p;
	} else return x;
    }
    if(utf8locale && ce_in == CE_NATIVE && ce_out == CE_UTF8) return x;
    if(utf8locale && ce_out == CE_NATIVE && ce_in == CE_UTF8) return x;
    if(latin1locale && ce_in == CE_NATIVE && ce_out == CE_LATIN1) return x;
    if(latin1locale && ce_out == CE_NATIVE && ce_in == CE_LATIN1) return x;

    if(strIsASCII(x)) return x;

    switch(ce_in) {
#ifdef Win32
    case CE_NATIVE:
	{
	    /* Looks like CP1252 is treated as Latin-1 by iconv */
	    snprintf(buf, 20, "CP%d", localeCP);
	    fromcode = buf;
	    break;
	}
    case CE_LATIN1: fromcode = "CP1252"; break;
#else
    case CE_NATIVE: fromcode = ""; break;
    case CE_LATIN1: fromcode = "latin1"; break;
#endif
    case CE_UTF8:   fromcode = "UTF-8"; break;
    default: return x;
    }

    switch(ce_out) {
 #ifdef Win32
    case CE_NATIVE:
	{
	    /* avoid possible misidentification of CP1250 as LATIN-2 */
	    snprintf(buf, 20, "CP%d", localeCP);
	    tocode = buf;
	    break;
	}
#else
    case CE_NATIVE: tocode = ""; break;
#endif
    case CE_LATIN1: tocode = "latin1"; break;
    case CE_UTF8:   tocode = "UTF-8"; break;
    default: return x;
    }

    obj = Riconv_open(tocode, fromcode);
    if(obj == (void *)(-1)) return x;
    R_AllocStringBuffer(0, &cbuff);
top_of_loop:
    inbuf = x; inb = strlen(inbuf);
    outbuf = cbuff.data; top = outb = cbuff.bufsize - 1;
    /* First initialize output */
    Riconv (obj, NULL, NULL, &outbuf, &outb);
next_char:
    /* Then convert input  */
    res = Riconv(obj, &inbuf , &inb, &outbuf, &outb);
    if(res == -1 && errno == E2BIG) {
	R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
	goto top_of_loop;
    } else if(res == -1 && (errno == EILSEQ || errno == EINVAL)) {
	switch(subst) {
	case 1: /* substitute hex */
	    if(outb < 5) {
		R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
		goto top_of_loop;
	    }
	    snprintf(outbuf, 5, "<%02x>", (unsigned char)*inbuf);
	    outbuf += 4; outb -= 4;
	    inbuf++; inb--;
	    goto next_char;
	    break;
	case 2: /* substitute . */
	    if(outb < 1) {
		R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
		goto top_of_loop;
	    }
	    *outbuf++ = '.'; inbuf++; outb--; inb--;
	    goto next_char;
	    break;
	case 3: /* substitute ? */
	    if(outb < 1) {
		R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
		goto top_of_loop;
	    }
	    *outbuf++ = '?'; inbuf++; outb--; inb--;
	    goto next_char;
	    break;
	default: /* skip byte */
	    inbuf++; inb--;
	    goto next_char;
	}
    }
    Riconv_close(obj);
    *outbuf = '\0';
    res = (top-outb)+1; /* strlen(cbuff.data) + 1; */
    p = R_alloc(res, 1);
    memcpy(p, cbuff.data, res);
    R_FreeStringBuffer(&cbuff);
    return p;
}
Beispiel #21
0
SEXP attribute_hidden do_substrgets(SEXP call, SEXP op, SEXP args, SEXP env)
{
    SEXP s, x, sa, so, value, el, v_el;
    R_xlen_t i, len;
    int start, stop, k, l, v;
    size_t slen;
    cetype_t ienc, venc;
    const char *ss, *v_ss;
    char *buf;
    const void *vmax;

    checkArity(op, args);
    x = CAR(args);
    sa = CADR(args);
    so = CADDR(args);
    value = CADDDR(args);
    k = LENGTH(sa);
    l = LENGTH(so);

    if (!isString(x))
	error(_("replacing substrings in a non-character object"));
    len = LENGTH(x);
    PROTECT(s = allocVector(STRSXP, len));
    if (len > 0) {
	if (!isInteger(sa) || !isInteger(so) || k == 0 || l == 0)
	    error(_("invalid substring arguments"));

	v = LENGTH(value);
	if (!isString(value) || v == 0) error(_("invalid value"));

	vmax = vmaxget();
	for (i = 0; i < len; i++) {
	    el = STRING_ELT(x, i);
	    v_el = STRING_ELT(value, i % v);
	    start = INTEGER(sa)[i % k];
	    stop = INTEGER(so)[i % l];
	    if (el == NA_STRING || v_el == NA_STRING ||
		start == NA_INTEGER || stop == NA_INTEGER) {
		SET_STRING_ELT(s, i, NA_STRING);
		continue;
	    }
	    ienc = getCharCE(el);
	    ss = CHAR(el);
	    slen = strlen(ss);
	    if (start < 1) start = 1;
	    if (stop > slen) stop = (int) slen; /* SBCS optimization */
	    if (start > stop) {
		/* just copy element across */
		SET_STRING_ELT(s, i, STRING_ELT(x, i));
	    } else {
		int ienc2 = ienc;
		v_ss = CHAR(v_el);
		/* is the value in the same encoding?
		   FIXME: could prefer UTF-8 here
		 */
		venc = getCharCE(v_el);
		if (venc != ienc && !strIsASCII(v_ss)) {
		    ss = translateChar(el);
		    slen = strlen(ss);
		    v_ss = translateChar(v_el);
		    ienc2 = CE_NATIVE;
		}
		/* might expand under MBCS */
		buf = R_AllocStringBuffer(slen+strlen(v_ss), &cbuff);
		strcpy(buf, ss);
		substrset(buf, v_ss, ienc2, start, stop);
		SET_STRING_ELT(s, i, mkCharCE(buf, ienc2));
	    }
	    vmaxset(vmax);
	}
	R_FreeStringBufferL(&cbuff);
    }
    UNPROTECT(1);
    return s;
}
Beispiel #22
0
SEXP attribute_hidden do_filepath(SEXP call, SEXP op, SEXP args, SEXP env)
{
    SEXP ans, sep, x;
    int i, j, k, ln, maxlen, nx, nzero, pwidth, sepw;
    const char *s, *csep, *cbuf;
    char *buf;

    checkArity(op, args);

    /* Check the arguments */

    x = CAR(args);
    if (!isVectorList(x))
	error(_("invalid first argument"));
    nx = length(x);
    if(nx == 0) return allocVector(STRSXP, 0);


    sep = CADR(args);
    if (!isString(sep) || LENGTH(sep) <= 0 || STRING_ELT(sep, 0) == NA_STRING)
	error(_("invalid separator"));
    sep = STRING_ELT(sep, 0);
    csep = CHAR(sep);
    sepw = (int) strlen(csep); /* hopefully 1 */

    /* Any zero-length argument gives zero-length result */
    maxlen = 0; nzero = 0;
    for (j = 0; j < nx; j++) {
	if (!isString(VECTOR_ELT(x, j))) {
	    /* formerly in R code: moved to C for speed */
	    SEXP call, xj = VECTOR_ELT(x, j);
	    if(OBJECT(xj)) { /* method dispatch */
		PROTECT(call = lang2(install("as.character"), xj));
		SET_VECTOR_ELT(x, j, eval(call, env));
		UNPROTECT(1);
	    } else if (isSymbol(xj))
		SET_VECTOR_ELT(x, j, ScalarString(PRINTNAME(xj)));
	    else
		SET_VECTOR_ELT(x, j, coerceVector(xj, STRSXP));

	    if (!isString(VECTOR_ELT(x, j)))
		error(_("non-string argument to Internal paste"));
	}
	ln = length(VECTOR_ELT(x, j));
	if(ln > maxlen) maxlen = ln;
	if(ln == 0) {nzero++; break;}
    }
    if(nzero || maxlen == 0) return allocVector(STRSXP, 0);

    PROTECT(ans = allocVector(STRSXP, maxlen));

    for (i = 0; i < maxlen; i++) {
	pwidth = 0;
	for (j = 0; j < nx; j++) {
	    k = length(VECTOR_ELT(x, j));
	    pwidth += (int) strlen(translateChar(STRING_ELT(VECTOR_ELT(x, j), i % k)));
	}
	pwidth += (nx - 1) * sepw;
	cbuf = buf = R_AllocStringBuffer(pwidth, &cbuff);
	for (j = 0; j < nx; j++) {
	    k = length(VECTOR_ELT(x, j));
	    if (k > 0) {
		s = translateChar(STRING_ELT(VECTOR_ELT(x, j), i % k));
		strcpy(buf, s);
		buf += strlen(s);
	    }
	    if (j != nx - 1 && sepw != 0) {
		strcpy(buf, csep);
		buf += sepw;
	    }
	}
#ifdef Win32
	// Trailing seps are invalid for file paths except for / and d:/
	if(streql(csep, "/") || streql(csep, "\\")) {
	    if(buf > cbuf) {
		buf--;
		if(*buf == csep[0] && buf > cbuf &&
		   (buf != cbuf+2 || cbuf[1] != ':')) *buf = '\0';
	    }
	}
#endif
	SET_STRING_ELT(ans, i, mkChar(cbuf));
    }
    R_FreeStringBufferL(&cbuff);
    UNPROTECT(1);
    return ans;
}