Example #1
0
static void substr(char *buf, const char *str, int ienc, int sa, int so)
{
/* Store the substring	str [sa:so]  into buf[] */
    int i, j, used;

    if (ienc == CE_UTF8) {
	const char *end = str + strlen(str);
	for (i = 0; i < so && str < end; i++) {
	    int used = utf8clen(*str);
	    if (i < sa - 1) { str += used; continue; }
	    for (j = 0; j < used; j++) *buf++ = *str++;
	}
    } else if (ienc == CE_LATIN1 || ienc == CE_BYTES) {
	for (str += (sa - 1), i = sa; i <= so; i++) *buf++ = *str++;
    } else {
	if (mbcslocale && !strIsASCII(str)) {
	    const char *end = str + strlen(str);
	    mbstate_t mb_st;
	    mbs_init(&mb_st);
	    for (i = 1; i < sa; i++) str += Mbrtowc(NULL, str, MB_CUR_MAX, &mb_st);
	    for (i = sa; i <= so && str < end; i++) {
		used = (int) Mbrtowc(NULL, str, MB_CUR_MAX, &mb_st);
		for (j = 0; j < used; j++) *buf++ = *str++;
	    }
	} else
	    for (str += (sa - 1), i = sa; i <= so; i++) *buf++ = *str++;
    }
    *buf = '\0';
}
Example #2
0
SEXP readClipboard(SEXP sformat, SEXP sraw)
{
    SEXP ans = R_NilValue;
    HGLOBAL hglb;
    const char *pc;
    int j, format, raw, size;

    format = asInteger(sformat);
    raw = asLogical(sraw);

    if(OpenClipboard(NULL)) {
	if(IsClipboardFormatAvailable(format) &&
	   (hglb = GetClipboardData(format)) &&
	   (pc = (const char *) GlobalLock(hglb))) {
	    if(raw) {
		Rbyte *pans;
		size = GlobalSize(hglb);
		ans = allocVector(RAWSXP, size); /* no R allocation below */
		pans = RAW(ans);
		for (j = 0; j < size; j++) pans[j] = *pc++;
	    } else if (format == CF_UNICODETEXT) {
		int n, ienc = CE_NATIVE;
		const wchar_t *wpc = (wchar_t *) pc;
		n = wcslen(wpc);
		char text[2 * (n+1)];  /* UTF-8 is at most 1.5x longer */
		R_CheckStack();
		wcstoutf8(text, wpc, n+1);
		if(!strIsASCII(text)) ienc = CE_UTF8;
		ans = splitClipboardText(text, ienc);
	    } else if (format == CF_TEXT || format == CF_OEMTEXT || format == CF_DIF) {
		/* can we get the encoding out of a CF_LOCALE entry? */
		ans = splitClipboardText(pc, 0);
	    } else
		error("'raw = FALSE' and format is a not a known text format");
	    GlobalUnlock(hglb);
	}
	CloseClipboard();
    }
    return ans;
}
Example #3
0
SEXP attribute_hidden do_abbrev(SEXP call, SEXP op, SEXP args, SEXP env)
{
    SEXP x, ans;
    R_xlen_t i, len;
    int minlen;
    Rboolean warn = FALSE;
    const char *s;
    const void *vmax;

    checkArity(op,args);
    x = CAR(args);

    if (!isString(x))
	error(_("the first argument must be a character vector"));
    len = XLENGTH(x);

    PROTECT(ans = allocVector(STRSXP, len));
    minlen = asInteger(CADR(args));
    vmax = vmaxget();
    for (i = 0 ; i < len ; i++) {
	if (STRING_ELT(x, i) == NA_STRING)
	    SET_STRING_ELT(ans, i, NA_STRING);
	else {
	    s = translateChar(STRING_ELT(x, i));
	    if(strlen(s) > minlen) {
		warn = warn | !strIsASCII(s);
		R_AllocStringBuffer(strlen(s), &cbuff);
		SET_STRING_ELT(ans, i, stripchars(s, minlen));
	    } else SET_STRING_ELT(ans, i, mkChar(s));
	}
	vmaxset(vmax);
    }
    if (warn) warning(_("abbreviate used with non-ASCII chars"));
    DUPLICATE_ATTRIB(ans, x);
    /* This copied the class, if any */
    R_FreeStringBufferL(&cbuff);
    UNPROTECT(1);
    return(ans);
}
Example #4
0
SEXP attribute_hidden do_substrgets(SEXP call, SEXP op, SEXP args, SEXP env)
{
    SEXP s, x, sa, so, value, el, v_el;
    R_xlen_t i, len;
    int start, stop, k, l, v;
    size_t slen;
    cetype_t ienc, venc;
    const char *ss, *v_ss;
    char *buf;
    const void *vmax;

    checkArity(op, args);
    x = CAR(args);
    sa = CADR(args);
    so = CADDR(args);
    value = CADDDR(args);
    k = LENGTH(sa);
    l = LENGTH(so);

    if (!isString(x))
	error(_("replacing substrings in a non-character object"));
    len = LENGTH(x);
    PROTECT(s = allocVector(STRSXP, len));
    if (len > 0) {
	if (!isInteger(sa) || !isInteger(so) || k == 0 || l == 0)
	    error(_("invalid substring arguments"));

	v = LENGTH(value);
	if (!isString(value) || v == 0) error(_("invalid value"));

	vmax = vmaxget();
	for (i = 0; i < len; i++) {
	    el = STRING_ELT(x, i);
	    v_el = STRING_ELT(value, i % v);
	    start = INTEGER(sa)[i % k];
	    stop = INTEGER(so)[i % l];
	    if (el == NA_STRING || v_el == NA_STRING ||
		start == NA_INTEGER || stop == NA_INTEGER) {
		SET_STRING_ELT(s, i, NA_STRING);
		continue;
	    }
	    ienc = getCharCE(el);
	    ss = CHAR(el);
	    slen = strlen(ss);
	    if (start < 1) start = 1;
	    if (stop > slen) stop = (int) slen; /* SBCS optimization */
	    if (start > stop) {
		/* just copy element across */
		SET_STRING_ELT(s, i, STRING_ELT(x, i));
	    } else {
		int ienc2 = ienc;
		v_ss = CHAR(v_el);
		/* is the value in the same encoding?
		   FIXME: could prefer UTF-8 here
		 */
		venc = getCharCE(v_el);
		if (venc != ienc && !strIsASCII(v_ss)) {
		    ss = translateChar(el);
		    slen = strlen(ss);
		    v_ss = translateChar(v_el);
		    ienc2 = CE_NATIVE;
		}
		/* might expand under MBCS */
		buf = R_AllocStringBuffer(slen+strlen(v_ss), &cbuff);
		strcpy(buf, ss);
		substrset(buf, v_ss, ienc2, start, stop);
		SET_STRING_ELT(s, i, mkCharCE(buf, ienc2));
	    }
	    vmaxset(vmax);
	}
	R_FreeStringBufferL(&cbuff);
    }
    UNPROTECT(1);
    return s;
}
Example #5
0
/* A version avoiding R_alloc for use in the Rgui editor */
void reEnc2(const char *x, char *y, int ny,
	    cetype_t ce_in, cetype_t ce_out, int subst)
{
    void * obj;
    const char *inbuf;
    char *outbuf;
    size_t inb, outb, res, top;
    char *tocode = NULL, *fromcode = NULL;
    char buf[20];
    R_StringBuffer cbuff = {NULL, 0, MAXELTSIZE};

    strncpy(y, x, ny);
    y[ny - 1] = '\0';

    if(ce_in == ce_out || ce_in == CE_ANY || ce_out == CE_ANY) return;
    if(utf8locale && ce_in == CE_NATIVE && ce_out == CE_UTF8) return;
    if(utf8locale && ce_out == CE_NATIVE && ce_in == CE_UTF8) return;
    if(latin1locale && ce_in == CE_NATIVE && ce_out == CE_LATIN1) return;
    if(latin1locale && ce_out == CE_NATIVE && ce_in == CE_LATIN1) return;

    if(strIsASCII(x)) return;

    switch(ce_in) {
    case CE_NATIVE:
	{
	    /* Looks like CP1252 is treated as Latin-1 by iconv */
	    snprintf(buf, 20, "CP%d", localeCP);
	    fromcode = buf;
	    break;
	}
    case CE_LATIN1: fromcode = "CP1252"; break;
    case CE_UTF8:   fromcode = "UTF-8"; break;
    default: return;
    }

    switch(ce_out) {
    case CE_NATIVE:
	{
	    /* avoid possible misidentification of CP1250 as LATIN-2 */
	    snprintf(buf, 20, "CP%d", localeCP);
	    tocode = buf;
	    break;
	}
    case CE_LATIN1: tocode = "latin1"; break;
    case CE_UTF8:   tocode = "UTF-8"; break;
    default: return;
    }

    obj = Riconv_open(tocode, fromcode);
    if(obj == (void *)(-1)) return;
    R_AllocStringBuffer(0, &cbuff);
top_of_loop:
    inbuf = x; inb = strlen(inbuf);
    outbuf = cbuff.data; top = outb = cbuff.bufsize - 1;
    /* First initialize output */
    Riconv (obj, NULL, NULL, &outbuf, &outb);
next_char:
    /* Then convert input  */
    res = Riconv(obj, &inbuf , &inb, &outbuf, &outb);
    if(res == -1 && errno == E2BIG) {
	R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
	goto top_of_loop;
    } else if(res == -1 && (errno == EILSEQ || errno == EINVAL)) {
	switch(subst) {
	case 1: /* substitute hex */
	    if(outb < 5) {
		R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
		goto top_of_loop;
	    }
	    snprintf(outbuf, 5, "<%02x>", (unsigned char)*inbuf);
	    outbuf += 4; outb -= 4;
	    inbuf++; inb--;
	    goto next_char;
	    break;
	case 2: /* substitute . */
	    if(outb < 1) {
		R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
		goto top_of_loop;
	    }
	    *outbuf++ = '.'; inbuf++; outb--; inb--;
	    goto next_char;
	    break;
	case 3: /* substitute ? */
	    if(outb < 1) {
		R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
		goto top_of_loop;
	    }
	    *outbuf++ = '?'; inbuf++; outb--; inb--;
	    goto next_char;
	    break;
	default: /* skip byte */
	    inbuf++; inb--;
	    goto next_char;
	}
    }
    Riconv_close(obj);
    *outbuf = '\0';
    res = (top-outb)+1; /* strlen(cbuff.data) + 1; */
    if (res > ny) error("converted string too long for buffer");
    memcpy(y, cbuff.data, res);
    R_FreeStringBuffer(&cbuff);
}
Example #6
0
/* This may return a R_alloc-ed result, so the caller has to manage the
   R_alloc stack */
const char *reEnc(const char *x, cetype_t ce_in, cetype_t ce_out, int subst)
{
    void * obj;
    const char *inbuf;
    char *outbuf, *p;
    size_t inb, outb, res, top;
    char *tocode = NULL, *fromcode = NULL;
#ifdef Win32
    char buf[20];
#endif
    R_StringBuffer cbuff = {NULL, 0, MAXELTSIZE};

    /* We can only encode from Symbol to UTF-8 */
    if(ce_in == ce_out || ce_out == CE_SYMBOL ||
       ce_in == CE_ANY || ce_out == CE_ANY) return x;
    if(ce_in == CE_SYMBOL) {
	if(ce_out == CE_UTF8) {
	    size_t nc = 3*strlen(x)+1; /* all in BMP */
	    p = R_alloc(nc, 1);
	    Rf_AdobeSymbol2utf8(p, x, nc);
	    return p;
	} else return x;
    }
    if(utf8locale && ce_in == CE_NATIVE && ce_out == CE_UTF8) return x;
    if(utf8locale && ce_out == CE_NATIVE && ce_in == CE_UTF8) return x;
    if(latin1locale && ce_in == CE_NATIVE && ce_out == CE_LATIN1) return x;
    if(latin1locale && ce_out == CE_NATIVE && ce_in == CE_LATIN1) return x;

    if(strIsASCII(x)) return x;

    switch(ce_in) {
#ifdef Win32
    case CE_NATIVE:
	{
	    /* Looks like CP1252 is treated as Latin-1 by iconv */
	    snprintf(buf, 20, "CP%d", localeCP);
	    fromcode = buf;
	    break;
	}
    case CE_LATIN1: fromcode = "CP1252"; break;
#else
    case CE_NATIVE: fromcode = ""; break;
    case CE_LATIN1: fromcode = "latin1"; break;
#endif
    case CE_UTF8:   fromcode = "UTF-8"; break;
    default: return x;
    }

    switch(ce_out) {
 #ifdef Win32
    case CE_NATIVE:
	{
	    /* avoid possible misidentification of CP1250 as LATIN-2 */
	    snprintf(buf, 20, "CP%d", localeCP);
	    tocode = buf;
	    break;
	}
#else
    case CE_NATIVE: tocode = ""; break;
#endif
    case CE_LATIN1: tocode = "latin1"; break;
    case CE_UTF8:   tocode = "UTF-8"; break;
    default: return x;
    }

    obj = Riconv_open(tocode, fromcode);
    if(obj == (void *)(-1)) return x;
    R_AllocStringBuffer(0, &cbuff);
top_of_loop:
    inbuf = x; inb = strlen(inbuf);
    outbuf = cbuff.data; top = outb = cbuff.bufsize - 1;
    /* First initialize output */
    Riconv (obj, NULL, NULL, &outbuf, &outb);
next_char:
    /* Then convert input  */
    res = Riconv(obj, &inbuf , &inb, &outbuf, &outb);
    if(res == -1 && errno == E2BIG) {
	R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
	goto top_of_loop;
    } else if(res == -1 && (errno == EILSEQ || errno == EINVAL)) {
	switch(subst) {
	case 1: /* substitute hex */
	    if(outb < 5) {
		R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
		goto top_of_loop;
	    }
	    snprintf(outbuf, 5, "<%02x>", (unsigned char)*inbuf);
	    outbuf += 4; outb -= 4;
	    inbuf++; inb--;
	    goto next_char;
	    break;
	case 2: /* substitute . */
	    if(outb < 1) {
		R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
		goto top_of_loop;
	    }
	    *outbuf++ = '.'; inbuf++; outb--; inb--;
	    goto next_char;
	    break;
	case 3: /* substitute ? */
	    if(outb < 1) {
		R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
		goto top_of_loop;
	    }
	    *outbuf++ = '?'; inbuf++; outb--; inb--;
	    goto next_char;
	    break;
	default: /* skip byte */
	    inbuf++; inb--;
	    goto next_char;
	}
    }
    Riconv_close(obj);
    *outbuf = '\0';
    res = (top-outb)+1; /* strlen(cbuff.data) + 1; */
    p = R_alloc(res, 1);
    memcpy(p, cbuff.data, res);
    R_FreeStringBuffer(&cbuff);
    return p;
}
Example #7
0
/* Note that NA_STRING is not handled separately here.  This is
   deliberate -- see ?paste -- and implicitly coerces it to "NA"
*/
SEXP attribute_hidden do_paste(SEXP call, SEXP op, SEXP args, SEXP env)
{
    SEXP ans, collapse, sep, x;
    int sepw, u_sepw, ienc;
    R_xlen_t i, j, k, maxlen, nx, pwidth;
    const char *s, *cbuf, *csep=NULL, *u_csep=NULL;
    char *buf;
    Rboolean allKnown, anyKnown, use_UTF8, use_Bytes,
	sepASCII = TRUE, sepUTF8 = FALSE, sepBytes = FALSE, sepKnown = FALSE,
	use_sep = (PRIMVAL(op) == 0);
    const void *vmax;

    checkArity(op, args);

    /* We use formatting and so we must initialize printing. */

    PrintDefaults();

    /* Check the arguments */

    x = CAR(args);
    if (!isVectorList(x))
	error(_("invalid first argument"));
    nx = xlength(x);

    if(use_sep) { /* paste(..., sep, .) */
	sep = CADR(args);
	if (!isString(sep) || LENGTH(sep) <= 0 || STRING_ELT(sep, 0) == NA_STRING)
	    error(_("invalid separator"));
	sep = STRING_ELT(sep, 0);
	csep = translateChar(sep);
	u_sepw = sepw = (int) strlen(csep); // will be short
	sepASCII = strIsASCII(csep);
	sepKnown = ENC_KNOWN(sep) > 0;
	sepUTF8 = IS_UTF8(sep);
	sepBytes = IS_BYTES(sep);
	collapse = CADDR(args);
    } else { /* paste0(..., .) */
	u_sepw = sepw = 0; sep = R_NilValue;/* -Wall */
	collapse = CADR(args);
    }
    if (!isNull(collapse))
	if(!isString(collapse) || LENGTH(collapse) <= 0 ||
	   STRING_ELT(collapse, 0) == NA_STRING)
	    error(_("invalid '%s' argument"), "collapse");
    if(nx == 0)
	return (!isNull(collapse)) ? mkString("") : allocVector(STRSXP, 0);


    /* Maximum argument length, coerce if needed */

    maxlen = 0;
    for (j = 0; j < nx; j++) {
	if (!isString(VECTOR_ELT(x, j))) {
	    /* formerly in R code: moved to C for speed */
	    SEXP call, xj = VECTOR_ELT(x, j);
	    if(OBJECT(xj)) { /* method dispatch */
		PROTECT(call = lang2(install("as.character"), xj));
		SET_VECTOR_ELT(x, j, eval(call, env));
		UNPROTECT(1);
	    } else if (isSymbol(xj))
		SET_VECTOR_ELT(x, j, ScalarString(PRINTNAME(xj)));
	    else
		SET_VECTOR_ELT(x, j, coerceVector(xj, STRSXP));

	    if (!isString(VECTOR_ELT(x, j)))
		error(_("non-string argument to internal 'paste'"));
	}
	if(xlength(VECTOR_ELT(x, j)) > maxlen)
	    maxlen = xlength(VECTOR_ELT(x, j));
    }
    if(maxlen == 0)
	return (!isNull(collapse)) ? mkString("") : allocVector(STRSXP, 0);

    PROTECT(ans = allocVector(STRSXP, maxlen));

    for (i = 0; i < maxlen; i++) {
	/* Strategy for marking the encoding: if all inputs (including
	 * the separator) are ASCII, so is the output and we don't
	 * need to mark.  Otherwise if all non-ASCII inputs are of
	 * declared encoding, we should mark.
	 * Need to be careful only to include separator if it is used.
	 */
	anyKnown = FALSE; allKnown = TRUE; use_UTF8 = FALSE; use_Bytes = FALSE;
	if(nx > 1) {
	    allKnown = sepKnown || sepASCII;
	    anyKnown = sepKnown;
	    use_UTF8 = sepUTF8;
	    use_Bytes = sepBytes;
	}

	pwidth = 0;
	for (j = 0; j < nx; j++) {
	    k = xlength(VECTOR_ELT(x, j));
	    if (k > 0) {
		SEXP cs = STRING_ELT(VECTOR_ELT(x, j), i % k);
		if(IS_UTF8(cs)) use_UTF8 = TRUE;
		if(IS_BYTES(cs)) use_Bytes = TRUE;
	    }
	}
	if (use_Bytes) use_UTF8 = FALSE;
	vmax = vmaxget();
	for (j = 0; j < nx; j++) {
	    k = xlength(VECTOR_ELT(x, j));
	    if (k > 0) {
		if(use_Bytes)
		    pwidth += strlen(CHAR(STRING_ELT(VECTOR_ELT(x, j), i % k)));
		else if(use_UTF8)
		    pwidth += strlen(translateCharUTF8(STRING_ELT(VECTOR_ELT(x, j), i % k)));
		else
		    pwidth += strlen(translateChar(STRING_ELT(VECTOR_ELT(x, j), i % k)));
		vmaxset(vmax);
	    }
	}
	if(use_sep) {
	    if (use_UTF8 && !u_csep) {
		u_csep = translateCharUTF8(sep);
		u_sepw = (int) strlen(u_csep); // will be short
	    }
	    pwidth += (nx - 1) * (use_UTF8 ? u_sepw : sepw);
	}
	if (pwidth > INT_MAX)
	    error(_("result would exceed 2^31-1 bytes"));
	cbuf = buf = R_AllocStringBuffer(pwidth, &cbuff);
	vmax = vmaxget();
	for (j = 0; j < nx; j++) {
	    k = xlength(VECTOR_ELT(x, j));
	    if (k > 0) {
		SEXP cs = STRING_ELT(VECTOR_ELT(x, j), i % k);
		if (use_UTF8) {
		    s = translateCharUTF8(cs);
		    strcpy(buf, s);
		    buf += strlen(s);
		} else {
		    s = use_Bytes ? CHAR(cs) : translateChar(cs);
		    strcpy(buf, s);
		    buf += strlen(s);
		    allKnown = allKnown && (strIsASCII(s) || (ENC_KNOWN(cs)> 0));
		    anyKnown = anyKnown || (ENC_KNOWN(cs)> 0);
		}
	    }
	    if (sepw != 0 && j != nx - 1) {
		if (use_UTF8) {
		    strcpy(buf, u_csep);
		    buf += u_sepw;
		} else {
		    strcpy(buf, csep);
		    buf += sepw;
		}
	    }
	    vmax = vmaxget();
	}
	ienc = 0;
	if(use_UTF8) ienc = CE_UTF8;
	else if(use_Bytes) ienc = CE_BYTES;
	else if(anyKnown && allKnown) {
	    if(known_to_be_latin1) ienc = CE_LATIN1;
	    if(known_to_be_utf8) ienc = CE_UTF8;
	}
	SET_STRING_ELT(ans, i, mkCharCE(cbuf, ienc));
    }

    /* Now collapse, if required. */

    if(collapse != R_NilValue && (nx = XLENGTH(ans)) > 0) {
	sep = STRING_ELT(collapse, 0);
	use_UTF8 = IS_UTF8(sep);
	use_Bytes = IS_BYTES(sep);
	for (i = 0; i < nx; i++) {
	    if(IS_UTF8(STRING_ELT(ans, i))) use_UTF8 = TRUE;
	    if(IS_BYTES(STRING_ELT(ans, i))) use_Bytes = TRUE;
	}
	if(use_Bytes) {
	    csep = CHAR(sep);
	    use_UTF8 = FALSE;
	} else if(use_UTF8)
	    csep = translateCharUTF8(sep);
	else
	    csep = translateChar(sep);
	sepw = (int) strlen(csep);
	anyKnown = ENC_KNOWN(sep) > 0;
	allKnown = anyKnown || strIsASCII(csep);
	pwidth = 0;
	vmax = vmaxget();
	for (i = 0; i < nx; i++)
	    if(use_UTF8) {
		pwidth += strlen(translateCharUTF8(STRING_ELT(ans, i)));
		vmaxset(vmax);
	    } else /* already translated */
		pwidth += strlen(CHAR(STRING_ELT(ans, i)));
	pwidth += (nx - 1) * sepw;
	if (pwidth > INT_MAX)
	    error(_("result would exceed 2^31-1 bytes"));
	cbuf = buf = R_AllocStringBuffer(pwidth, &cbuff);
	vmax = vmaxget();
	for (i = 0; i < nx; i++) {
	    if(i > 0) {
		strcpy(buf, csep);
		buf += sepw;
	    }
	    if(use_UTF8)
		s = translateCharUTF8(STRING_ELT(ans, i));
	    else /* already translated */
		s = CHAR(STRING_ELT(ans, i));
	    strcpy(buf, s);
	    while (*buf)
		buf++;
	    allKnown = allKnown &&
		(strIsASCII(s) || (ENC_KNOWN(STRING_ELT(ans, i)) > 0));
	    anyKnown = anyKnown || (ENC_KNOWN(STRING_ELT(ans, i)) > 0);
	    if(use_UTF8) vmaxset(vmax);
	}
	UNPROTECT(1);
	ienc = CE_NATIVE;
	if(use_UTF8) ienc = CE_UTF8;
	else if(use_Bytes) ienc = CE_BYTES;
	else if(anyKnown && allKnown) {
	    if(known_to_be_latin1) ienc = CE_LATIN1;
	    if(known_to_be_utf8) ienc = CE_UTF8;
	}
	PROTECT(ans = allocVector(STRSXP, 1));
	SET_STRING_ELT(ans, 0, mkCharCE(cbuf, ienc));
    }
    R_FreeStringBufferL(&cbuff);
    UNPROTECT(1);
    return ans;
}