Exemplo n.º 1
0
static int utf8towcs(wchar_t *wc, const char *s, int n)
{
    ssize_t m, res = 0;
    const char *t;
    wchar_t *p;
    wchar_t local;

    if(wc)
        for(p = wc, t = s; ; p++, t += m)
        {
            m  = (ssize_t) utf8toucs(p, t);
            if (m < 0) Rf_error("invalid input '%s' in 'utf8towcs'", s);
            if (m == 0) break;
            res ++;
            if (res >= n) break;
        }
    else
        for(t = s; ; res++, t += m)
        {
            m  = (ssize_t) utf8toucs(&local, t);
            if (m < 0) Rf_error("invalid input '%s' in 'utf8towcs'", s);
            if (m == 0) break;
        }
    return (int) res;
}
Exemplo n.º 2
0
SEXP attribute_hidden do_nchar(SEXP call, SEXP op, SEXP args, SEXP env)
{
    SEXP d, s, x, stype;
    int i, len, allowNA;
    size_t ntype;
    int nc;
    const char *type;
    const char *xi;
    wchar_t *wc;
    const void *vmax;

    checkArity(op, args);
    if (isFactor(CAR(args)))
	error(_("'%s' requires a character vector"), "nchar()");
    PROTECT(x = coerceVector(CAR(args), STRSXP));
    if (!isString(x))
	error(_("'%s' requires a character vector"), "nchar()");
    len = LENGTH(x);
    stype = CADR(args);
    if (!isString(stype) || LENGTH(stype) != 1)
	error(_("invalid '%s' argument"), "type");
    type = CHAR(STRING_ELT(stype, 0)); /* always ASCII */
    ntype = strlen(type);
    if (ntype == 0) error(_("invalid '%s' argument"), "type");
    allowNA = asLogical(CADDR(args));
    if (allowNA == NA_LOGICAL) allowNA = 0;

    PROTECT(s = allocVector(INTSXP, len));
    vmax = vmaxget();
    for (i = 0; i < len; i++) {
	SEXP sxi = STRING_ELT(x, i);
	if (sxi == NA_STRING) {
	    INTEGER(s)[i] = 2;
	    continue;
	}
	if (strncmp(type, "bytes", ntype) == 0) {
	    INTEGER(s)[i] = LENGTH(sxi);
	} else if (strncmp(type, "chars", ntype) == 0) {
	    if (IS_UTF8(sxi)) { /* assume this is valid */
		const char *p = CHAR(sxi);
		nc = 0;
		for( ; *p; p += utf8clen(*p)) nc++;
		INTEGER(s)[i] = nc;
	    } else if (IS_BYTES(sxi)) {
		if (!allowNA) /* could do chars 0 */
		    error(_("number of characters is not computable for element %d in \"bytes\" encoding"), i+1);
		INTEGER(s)[i] = NA_INTEGER;
	    } else if (mbcslocale) {
		nc = mbstowcs(NULL, translateChar(sxi), 0);
		if (!allowNA && nc < 0)
		    error(_("invalid multibyte string %d"), i+1);
		INTEGER(s)[i] = nc >= 0 ? nc : NA_INTEGER;
	    } else
		INTEGER(s)[i] = strlen(translateChar(sxi));
	} else if (strncmp(type, "width", ntype) == 0) {
	    if (IS_UTF8(sxi)) { /* assume this is valid */
		const char *p = CHAR(sxi);
		wchar_t wc1;
		nc = 0;
		for( ; *p; p += utf8clen(*p)) {
		    utf8toucs(&wc1, p);
		    nc += Ri18n_wcwidth(wc1);
		}
		INTEGER(s)[i] = nc;
	    } else if (IS_BYTES(sxi)) {
		if (!allowNA) /* could do width 0 */
		    error(_("width is not computable for element %d in \"bytes\" encoding"), i+1);
		INTEGER(s)[i] = NA_INTEGER;
	    } else if (mbcslocale) {
		xi = translateChar(sxi);
		nc = mbstowcs(NULL, xi, 0);
		if (nc >= 0) {
		    wc = (wchar_t *) R_AllocStringBuffer((nc+1)*sizeof(wchar_t), &cbuff);

		    mbstowcs(wc, xi, nc + 1);
		    INTEGER(s)[i] = Ri18n_wcswidth(wc, 2147483647);
		    if (INTEGER(s)[i] < 1) INTEGER(s)[i] = nc;
		} else if (allowNA)
		    error(_("invalid multibyte string %d"), i+1);
		else
		    INTEGER(s)[i] = NA_INTEGER;
	    } else
		INTEGER(s)[i] = strlen(translateChar(sxi));
	} else
	    error(_("invalid '%s' argument"), "type");
	vmaxset(vmax);
    }
    R_FreeStringBufferL(&cbuff);
    if ((d = getAttrib(x, R_NamesSymbol)) != R_NilValue)
	setAttrib(s, R_NamesSymbol, d);
    if ((d = getAttrib(x, R_DimSymbol)) != R_NilValue)
	setAttrib(s, R_DimSymbol, d);
    if ((d = getAttrib(x, R_DimNamesSymbol)) != R_NilValue)
	setAttrib(s, R_DimNamesSymbol, d);
    UNPROTECT(2);
    return s;
}
Exemplo n.º 3
0
int R_nchar(SEXP string, nchar_type type_,
	    Rboolean allowNA, Rboolean keepNA, const char* msg_name)
{
    if (string == NA_STRING)
	return keepNA ? NA_INTEGER : 2;
    // else :
    switch(type_) {
    case Bytes:
	return LENGTH(string);
	break;
    case Chars:
	if (IS_UTF8(string)) {
	    const char *p = CHAR(string);
	    if (!utf8Valid(p)) {
		if (!allowNA)
		    error(_("invalid multibyte string, %s"), msg_name);
		return NA_INTEGER;
	    } else {
		int nc = 0;
		for( ; *p; p += utf8clen(*p)) nc++;
		return nc;
	    }
	} else if (IS_BYTES(string)) {
	    if (!allowNA) /* could do chars 0 */
		error(_("number of characters is not computable in \"bytes\" encoding, %s"),
		      msg_name);
	    return NA_INTEGER;
	} else if (mbcslocale) {
	    int nc = (int) mbstowcs(NULL, translateChar(string), 0);
	    if (!allowNA && nc < 0)
		error(_("invalid multibyte string, %s"), msg_name);
	    return (nc >= 0 ? nc : NA_INTEGER);
	} else
	    return ((int) strlen(translateChar(string)));
	break;
    case Width:
	if (IS_UTF8(string)) {
	    const char *p = CHAR(string);
	    if (!utf8Valid(p)) {
		if (!allowNA)
		    error(_("invalid multibyte string, %s"), msg_name);
		return NA_INTEGER;
	    } else {
		wchar_t wc1;
		int nc = 0;
		for( ; *p; p += utf8clen(*p)) {
		    utf8toucs(&wc1, p);
		    nc += Ri18n_wcwidth(wc1);
		}
		return nc;
	    }
	} else if (IS_BYTES(string)) {
	    if (!allowNA) /* could do width 0 */
		error(_("width is not computable for %s in \"bytes\" encoding"),
		      msg_name);
	    return NA_INTEGER;
	} else if (mbcslocale) {
	    const char *xi = translateChar(string);
	    int nc = (int) mbstowcs(NULL, xi, 0);
	    if (nc >= 0) {
		const void *vmax = vmaxget();
		wchar_t *wc = (wchar_t *)
		    R_AllocStringBuffer((nc+1)*sizeof(wchar_t), &cbuff);
		mbstowcs(wc, xi, nc + 1);
		int nci18n = Ri18n_wcswidth(wc, 2147483647);
		vmaxset(vmax);
		return (nci18n < 1) ? nc : nci18n;
	    } else if (allowNA)
		error(_("invalid multibyte string, %s"), msg_name);
	    else
		return NA_INTEGER;
	} else
	    return (int) strlen(translateChar(string));

    } // switch
    return NA_INTEGER; // -Wall
} // R_nchar()
Exemplo n.º 4
0
int main(int argc, char *argv[])
{
  int argNo, ucs16_opt, utf8_opt, j;
  int result, remaining, encoded, decoded;

  if (argc < 2) {
    printf("Usage: %s (-ucs16 <unicode-hex-string>|-utf8 <utf8-hex-string>|<ascii-chars>)\n",
           argv[0]);
    exit(1);
  }

  ucs16_opt = 0;
  utf8_opt = 0;
  argNo = 1;

  if (argc > 1) {
    argNo = 2;
    if (strcmp(argv[1],"-ucs16") == 0) {
      ucs16_opt = 1;
    } else if (strcmp(argv[1],"-utf8") == 0) {
      utf8_opt = 1;
    }
  }

  if (ucs16_opt) {
    char* tems = argv[argNo];
    int nus = strlen(argv[argNo])/4;
    int ns = 3*nus;
    UNICHAR* us = (UNICHAR*) malloc((nus+1)*sizeof(UNICHAR));
    unsigned char* s = (unsigned char*) malloc(ns+1);
    char temus[5] = {0,0,0,0,0};

    for (j=0; j<nus; j++) {
      temus[0] = tems[4*j];
      temus[1] = tems[4*j+1];
      temus[2] = tems[4*j+2];
      temus[3] = tems[4*j+3];
      sscanf(temus, "%x", &us[j]);
    }
    us[nus] = U_NUL;

    ucsprint(stderr, us, nus);
    fprintf(stderr, "\n");

    ucstoutf8(us, nus, s, ns, &remaining, &encoded);
    assert(encoded <= ns);
    ns = encoded;
    s[ns] = '\0';

    printf("UTF8(%d)=0x", ns);
    for (j=0; j<ns; j++) {
      printf("%02x",s[j]);
    }
    printf(", remaining=%d\n", remaining);

    result = utf8toucs(s, ns, us, nus, 0, &remaining, &decoded);
    assert(decoded <= nus);
    us[decoded] = U_NUL;

    printf("UCS(%d)=0x", decoded);

    for (j=0; j<decoded; j++) {
      printf("%04x",us[j]);
    }
    printf(", remaining=%d, result=%d\n", remaining, result);

    free(us);
    free(s);

  } else {
    char* tems = argv[argNo];
    int ns, nus;
    char* s;
    UNICHAR* us;

    ns = (utf8_opt) ? strlen(argv[argNo])/2 : strlen(argv[argNo]);
    s = (char*) malloc(ns+1);
    nus = ns;
    us = (UNICHAR*) malloc((nus+1)*sizeof(UNICHAR));

    if (utf8_opt) {
      char temstr[3] = {0,0,0};
      int ival;
      for (j=0; j<ns; j++) {
        temstr[0] = tems[2*j];
        temstr[1] = tems[2*j+1];
        sscanf(temstr, "%x", &ival);
        s[j] = (unsigned char) ival;
      }
    } else {
      for (j=0; j<ns; j++)
        s[j] = tems[j];
    }
    s[ns] = '\0';

    result = utf8toucs(s, ns, us, nus, 0, &remaining, &decoded);
    assert(decoded <= nus);
    nus = decoded;
    us[nus] = U_NUL;

    printf("UCS(%d)=0x", nus);

    for (j=0; j<nus; j++) {
      printf("%04x",us[j]);
    }

    printf(", remaining=%d, result=%d\n", remaining, result);

    ucstoutf8(us, nus, s, ns, &remaining, &encoded);

    assert(encoded <= ns);
    s[encoded] = '\0';

    if (utf8_opt) {
      printf("UTF8(%d)=0x", encoded);

      for (j=0; j<encoded; j++)
        printf("%02x", s[j]);

      printf(", remaining=%d\n", remaining);

    } else {
      printf("UTF8(%d)='%s', remaining=%d\n", encoded, s, remaining);
    }

    free(us);
    free(s);
  }

  return 0;
}
Exemplo n.º 5
0
const char *EncodeString(SEXP s, int w, int quote, Rprt_adj justify)
{
    int b, b0, i, j, cnt;
    const char *p; char *q, buf[11];
    cetype_t ienc = CE_NATIVE;

    /* We have to do something like this as the result is returned, and
       passed on by EncodeElement -- so no way could be end user be
       responsible for freeing it.  However, this is not thread-safe. */

    static R_StringBuffer gBuffer = {NULL, 0, BUFSIZE};
    R_StringBuffer *buffer = &gBuffer;

    if (s == NA_STRING) {
	p = quote ? CHAR(R_print.na_string) : CHAR(R_print.na_string_noquote);
	cnt = i = (int)(quote ? strlen(CHAR(R_print.na_string)) :
			strlen(CHAR(R_print.na_string_noquote)));
	quote = 0;
    } else {
#ifdef Win32
	if(WinUTF8out) {
	    ienc = getCharCE(s);
	    if(ienc == CE_UTF8) {
		p = CHAR(s);
		i = Rstrlen(s, quote);
		cnt = LENGTH(s);
	    } else {
		p = translateChar0(s);
		if(p == CHAR(s)) {
		    i = Rstrlen(s, quote);
		    cnt = LENGTH(s);
		} else {
		    cnt = strlen(p);
		    i = Rstrwid(p, cnt, CE_NATIVE, quote);
		}
		ienc = CE_NATIVE;
	    }
	} else
#endif
	{
	    if(IS_BYTES(s)) {
		p = CHAR(s);
		cnt = (int) strlen(p);
		const char *q;
		char *pp = R_alloc(4*cnt+1, 1), *qq = pp, buf[5];
		for (q = p; *q; q++) {
		    unsigned char k = (unsigned char) *q;
		    if (k >= 0x20 && k < 0x80) {
			*qq++ = *q;
			if (quote && *q == '"') cnt++;
		    } else {
			snprintf(buf, 5, "\\x%02x", k);
			for(j = 0; j < 4; j++) *qq++ = buf[j];
			cnt += 3;
		    }
		}
		*qq = '\0';
		p = pp;
		i = cnt;
	    } else {
		p = translateChar(s);
		if(p == CHAR(s)) {
		    i = Rstrlen(s, quote);
		    cnt = LENGTH(s);
		} else {
		    cnt = (int) strlen(p);
		    i = Rstrwid(p, cnt, CE_NATIVE, quote);
		}
	    }
	}
    }

    /* We need enough space for the encoded string, including escapes.
       Octal encoding turns one byte into four.
       \u encoding can turn a multibyte into six or ten,
       but it turns 2/3 into 6, and 4 (and perhaps 5/6) into 10.
       Let's be wasteful here (the worst case appears to be an MBCS with
       one byte for an upper-plane Unicode point output as ten bytes,
       but I doubt that such an MBCS exists: two bytes is plausible).

       +2 allows for quotes, +6 for UTF_8 escapes.
     */
    q = R_AllocStringBuffer(imax2(5*cnt+8, w), buffer);
    b = w - i - (quote ? 2 : 0); /* total amount of padding */
    if(justify == Rprt_adj_none) b = 0;
    if(b > 0 && justify != Rprt_adj_left) {
	b0 = (justify == Rprt_adj_centre) ? b/2 : b;
	for(i = 0 ; i < b0 ; i++) *q++ = ' ';
	b -= b0;
    }
    if(quote) *q++ = (char) quote;
    if(mbcslocale || ienc == CE_UTF8) {
	int j, res;
	mbstate_t mb_st;
	wchar_t wc;
	unsigned int k; /* not wint_t as it might be signed */
#ifndef __STDC_ISO_10646__
	Rboolean Unicode_warning = FALSE;
#endif
	if(ienc != CE_UTF8)  mbs_init(&mb_st);
#ifdef Win32
	else if(WinUTF8out) { memcpy(q, UTF8in, 3); q += 3; }
#endif
	for (i = 0; i < cnt; i++) {
	    res = (int)((ienc == CE_UTF8) ? utf8toucs(&wc, p):
			mbrtowc(&wc, p, MB_CUR_MAX, NULL));
	    if(res >= 0) { /* res = 0 is a terminator */
		k = wc;
		/* To be portable, treat \0 explicitly */
		if(res == 0) {k = 0; wc = L'\0';}
		if(0x20 <= k && k < 0x7f && iswprint(wc)) {
		    switch(wc) {
		    case L'\\': *q++ = '\\'; *q++ = '\\'; p++;
			break;
		    case L'\'':
		    case L'"':
			if(quote == *p)  *q++ = '\\'; *q++ = *p++;
			break;
		    default:
			for(j = 0; j < res; j++) *q++ = *p++;
			break;
		    }
		} else if (k < 0x80) {
		    /* ANSI Escapes */
		    switch(wc) {
		    case L'\a': *q++ = '\\'; *q++ = 'a'; break;
		    case L'\b': *q++ = '\\'; *q++ = 'b'; break;
		    case L'\f': *q++ = '\\'; *q++ = 'f'; break;
		    case L'\n': *q++ = '\\'; *q++ = 'n'; break;
		    case L'\r': *q++ = '\\'; *q++ = 'r'; break;
		    case L'\t': *q++ = '\\'; *q++ = 't'; break;
		    case L'\v': *q++ = '\\'; *q++ = 'v'; break;
		    case L'\0': *q++ = '\\'; *q++ = '0'; break;

		    default:
			/* print in octal */
			snprintf(buf, 5, "\\%03o", k);
			for(j = 0; j < 4; j++) *q++ = buf[j];
			break;
		    }
		    p++;
		} else {
		    if(iswprint(wc)) {
			/* The problem here is that wc may be
			   printable according to the Unicode tables,
			   but it may not be printable on the output
			   device concerned. */
			for(j = 0; j < res; j++) *q++ = *p++;
		    } else {
#ifndef Win32
# ifndef __STDC_ISO_10646__
			Unicode_warning = TRUE;
# endif
			if(k > 0xffff)
			    snprintf(buf, 11, "\\U%08x", k);
			else
#endif
			    snprintf(buf, 11, "\\u%04x", k);
			j = (int) strlen(buf);
			memcpy(q, buf, j);
			q += j;
			p += res;
		    }
		    i += (res - 1);
		}

	    } else { /* invalid char */
		snprintf(q, 5, "\\x%02x", *((unsigned char *)p));
		q += 4; p++;
	    }
	}
#ifndef __STDC_ISO_10646__
	if(Unicode_warning)
	    warning(_("it is not known that wchar_t is Unicode on this platform"));
#endif

    } else
	for (i = 0; i < cnt; i++) {

	    /* ASCII */
	    if((unsigned char) *p < 0x80) {
		if(*p != '\t' && isprint((int)*p)) { /* Windows has \t as printable */
		    switch(*p) {
		    case '\\': *q++ = '\\'; *q++ = '\\'; break;
		    case '\'':
		    case '"':
			if(quote == *p)  *q++ = '\\'; *q++ = *p; break;
		    default: *q++ = *p; break;
		    }
		} else switch(*p) {
			/* ANSI Escapes */
		    case '\a': *q++ = '\\'; *q++ = 'a'; break;
		    case '\b': *q++ = '\\'; *q++ = 'b'; break;
		    case '\f': *q++ = '\\'; *q++ = 'f'; break;
		    case '\n': *q++ = '\\'; *q++ = 'n'; break;
		    case '\r': *q++ = '\\'; *q++ = 'r'; break;
		    case '\t': *q++ = '\\'; *q++ = 't'; break;
		    case '\v': *q++ = '\\'; *q++ = 'v'; break;
		    case '\0': *q++ = '\\'; *q++ = '0'; break;

		    default:
			/* print in octal */
			snprintf(buf, 5, "\\%03o", (unsigned char) *p);
			for(j = 0; j < 4; j++) *q++ = buf[j];
			break;
		    }
		p++;
	    } else {  /* 8 bit char */
#ifdef Win32 /* It seems Windows does not know what is printable! */
		*q++ = *p++;
#else
		if(!isprint((int)*p & 0xff)) {
		    /* print in octal */
		    snprintf(buf, 5, "\\%03o", (unsigned char) *p);
		    for(j = 0; j < 4; j++) *q++ = buf[j];
		    p++;
		} else *q++ = *p++;
#endif
	    }
	}

#ifdef Win32
    if(WinUTF8out && ienc == CE_UTF8)  { memcpy(q, UTF8out, 3); q += 3; }
#endif
    if(quote) *q++ = (char) quote;
    if(b > 0 && justify != Rprt_adj_right) {
	for(i = 0 ; i < b ; i++) *q++ = ' ';
    }
    *q = '\0';
    return buffer->data;
}
Exemplo n.º 6
0
/* strlen() using escaped rather than literal form,
   and allowing for embedded nuls.
   In MBCS locales it works in characters, and reports in display width.
   Also used in printarray.c.
 */
attribute_hidden
int Rstrwid(const char *str, int slen, cetype_t ienc, int quote)
{
    const char *p = str;
    int len = 0, i;

    if(mbcslocale || ienc == CE_UTF8) {
	int res;
	mbstate_t mb_st;
	wchar_t wc;
	unsigned int k; /* not wint_t as it might be signed */

	if(ienc != CE_UTF8)  mbs_init(&mb_st);
	for (i = 0; i < slen; i++) {
	    res = (ienc == CE_UTF8) ? (int) utf8toucs(&wc, p):
		(int) mbrtowc(&wc, p, MB_CUR_MAX, NULL);
	    if(res >= 0) {
		k = wc;
		if(0x20 <= k && k < 0x7f && iswprint(wc)) {
		    switch(wc) {
		    case L'\\':
			len += 2;
			break;
		    case L'\'':
		    case L'"':
			len += (quote == *p) ? 2 : 1;
			break;
		    default:
			len++; /* assumes these are all width 1 */
			break;
		    }
		    p++;
		} else if (k < 0x80) {
		    switch(wc) {
		    case L'\a':
		    case L'\b':
		    case L'\f':
		    case L'\n':
		    case L'\r':
		    case L'\t':
		    case L'\v':
		    case L'\0':
			len += 2; break;
		    default:
			/* print in octal */
			len += 4; break;
		    }
		    p++;
		} else {
		    len += iswprint((wint_t)wc) ? Ri18n_wcwidth(wc) :
#ifdef Win32
			6;
#else
		    (k > 0xffff ? 10 : 6);
#endif
		    i += (res - 1);
		    p += res;
		}
	    } else {
		len += 4;
		p++;
	    }
	}
    } else
	for (i = 0; i < slen; i++) {
	    /* ASCII */
	    if((unsigned char) *p < 0x80) {
		if(isprint((int)*p)) {
		    switch(*p) {
		    case '\\':
			len += 2; break;
		    case '\'':
		    case '"':
			len += (quote == *p)? 2 : 1; break;
		    default:
			len++; break;
		    }
		} else switch(*p) {
		    case '\a':
		    case '\b':
		    case '\f':
		    case '\n':
		    case '\r':
		    case '\t':
		    case '\v':
		    case '\0':
			len += 2; break;
		    default:
			/* print in octal */
			len += 4; break;
		    }
		p++;
	    } else { /* 8 bit char */
#ifdef Win32 /* It seems Windows does not know what is printable! */
		len++;
#else
		len += isprint((int)*p) ? 1 : 4;
#endif
		p++;
	    }
	}

    return len;
}
Exemplo n.º 7
0
/* strlen() using escaped rather than literal form.
   In MBCS locales it works in characters, and reports in display width.
   Rstrwid is also used in printarray.c.

   This supported embedded nuls when we had those.
 */
attribute_hidden
int Rstrwid(const char *str, int slen, cetype_t ienc, int quote)
{
    const char *p = str;
    int len = 0, i;

    if(ienc == CE_BYTES) { // not currently used for that encoding
	for (i = 0; i < slen; i++) {
	    unsigned char k = str[i];
	    if (k >= 0x20 && k < 0x80) len += 1;
	    else len += 4;
	}
	return len;
    }
    /* Future-proof: currently that is all Rstrlen calls it with,
       and printarray has CE_NATIVE explicitly */
    if(ienc > 2) // CE_NATIVE, CE_UTF8, CE_BYTES are supported
	warning("unsupported encoding (%d) in Rstrwid", ienc);
    if(mbcslocale || ienc == CE_UTF8) {
	int res;
	mbstate_t mb_st;
	wchar_t wc;
	unsigned int k; /* not wint_t as it might be signed */

	if(ienc != CE_UTF8)  mbs_init(&mb_st);
	for (i = 0; i < slen; i++) {
	    res = (ienc == CE_UTF8) ? (int) utf8toucs(&wc, p):
		(int) mbrtowc(&wc, p, MB_CUR_MAX, NULL);
	    if(res >= 0) {
		k = wc;
		if(0x20 <= k && k < 0x7f && iswprint(wc)) {
		    switch(wc) {
		    case L'\\':
			len += 2;
			break;
		    case L'\'':
		    case L'"':
		    case L'`':
			len += (quote == *p) ? 2 : 1;
			break;
		    default:
			len++; /* assumes these are all width 1 */
			break;
		    }
		    p++;
		} else if (k < 0x80) {
		    switch(wc) {
		    case L'\a':
		    case L'\b':
		    case L'\f':
		    case L'\n':
		    case L'\r':
		    case L'\t':
		    case L'\v':
		    case L'\0':
			len += 2; break;
		    default:
			/* print in octal */
			len += 4; break;
		    }
		    p++;
		} else {
		    len += iswprint((wint_t)wc) ? Ri18n_wcwidth(wc) :
#ifdef Win32
			6;
#else
		    (k > 0xffff ? 10 : 6);
#endif
		    i += (res - 1);
		    p += res;
		}
	    } else {
		len += 4;
		p++;
	    }
	}
    } else // not MBCS nor marked as UTF-8
	for (i = 0; i < slen; i++) {
	    if((unsigned char) *p < 0x80) {
		/* ASCII */
		if(isprint((int)*p)) {
		    switch(*p) {
		    case '\\':
			len += 2; break;
		    case '\'':
		    case '"':
		    case '`':
			len += (quote == *p)? 2 : 1; break;
		    default:
			len++; break;
		    }
		} else switch(*p) {
		    case '\a':
		    case '\b':
		    case '\f':
		    case '\n':
		    case '\r':
		    case '\t':
		    case '\v':
		    case '\0':
			len += 2; break;
		    default:
			/* print in octal */
			len += 4; break;
		    }
		p++;
	    } else { /* 8 bit char */
#ifdef Win32 /* It seems Windows does not know what is printable! */
		len++;
#else
		len += isprint((int)*p) ? 1 : 4;
#endif
		p++;
	    }
	}

    return len;
}
Exemplo n.º 8
0
SEXP installTrChar(SEXP x)
{
    void * obj;
    const char *inbuf, *ans = CHAR(x);
    char *outbuf;
    size_t inb, outb, res;
    cetype_t ienc = getCharCE(x);
    R_StringBuffer cbuff = {NULL, 0, MAXELTSIZE};

    if(TYPEOF(x) != CHARSXP)
	error(_("'%s' must be called on a CHARSXP"), "installTrChar");
    if(x == NA_STRING || !(ENC_KNOWN(x))) return install(ans);
    if(IS_BYTES(x))
	error(_("translating strings with \"bytes\" encoding is not allowed"));
    if(utf8locale && IS_UTF8(x)) return install(ans);
    if(latin1locale && IS_LATIN1(x)) return install(ans);
    if(IS_ASCII(x)) return install(ans);

    if(IS_LATIN1(x)) {
	if(!latin1_obj) {
	    obj = Riconv_open("", "latin1");
	    /* should never happen */
	    if(obj == (void *)(-1))
#ifdef Win32
		error(_("unsupported conversion from '%s' in codepage %d"),
		      "latin1", localeCP);
#else
	        error(_("unsupported conversion from '%s' to '%s'"),
		      "latin1", "");
#endif
	    latin1_obj = obj;
	}
	obj = latin1_obj;
    } else {
	if(!utf8_obj) {
	    obj = Riconv_open("", "UTF-8");
	    /* should never happen */
	    if(obj == (void *)(-1)) 
#ifdef Win32
		error(_("unsupported conversion from '%s' in codepage %d"),
		      "latin1", localeCP);
#else
	        error(_("unsupported conversion from '%s' to '%s'"),
		      "latin1", "");
#endif
	    utf8_obj = obj;
	}
	obj = utf8_obj;
    }

    R_AllocStringBuffer(0, &cbuff);
top_of_loop:
    inbuf = ans; inb = strlen(inbuf);
    outbuf = cbuff.data; outb = cbuff.bufsize - 1;
    /* First initialize output */
    Riconv (obj, NULL, NULL, &outbuf, &outb);
next_char:
    /* Then convert input  */
    res = Riconv(obj, &inbuf , &inb, &outbuf, &outb);
    if(res == -1 && errno == E2BIG) {
	R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
	goto top_of_loop;
    } else if(res == -1 && (errno == EILSEQ || errno == EINVAL)) {
	if(outb < 13) {
	    R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
	    goto top_of_loop;
	}
	if (ienc == CE_UTF8) {
	    /* if starting in UTF-8, use \uxxxx */
	    /* This must be the first byte */
	    size_t clen;
	    wchar_t wc;
	    clen = utf8toucs(&wc, inbuf);
	    if(clen > 0 && inb >= clen) {
		inbuf += clen; inb -= clen;
# ifndef Win32
		if((unsigned int) wc < 65536) {
# endif
		    snprintf(outbuf, 9, "<U+%04X>", (unsigned int) wc);
		    outbuf += 8; outb -= 8;
# ifndef Win32
		} else {
		    snprintf(outbuf, 13, "<U+%08X>", (unsigned int) wc);
		    outbuf += 12; outb -= 12;
		}
# endif
	    } else {
		snprintf(outbuf, 5, "<%02x>", (unsigned char)*inbuf);
		outbuf += 4; outb -= 4;
		inbuf++; inb--;
	    }
	} else {
	    snprintf(outbuf, 5, "<%02x>", (unsigned char)*inbuf);
	    outbuf += 4; outb -= 4;
	    inbuf++; inb--;
	}
	goto next_char;
    }
    *outbuf = '\0';
    SEXP Sans = install(cbuff.data);
    R_FreeStringBuffer(&cbuff);
    return Sans;
}