/* Match what EncodeString does with encodings */ attribute_hidden int Rstrlen(SEXP s, int quote) { cetype_t ienc = getCharCE(s); if (ienc == CE_UTF8 || ienc == CE_BYTES) return Rstrwid(CHAR(s), LENGTH(s), ienc, quote); const void *vmax = vmaxget(); const char *p = translateChar(s); int len = Rstrwid(p, (int)strlen(p), CE_NATIVE, quote); vmaxset(vmax); return len; }
const char *EncodeString(SEXP s, int w, int quote, Rprt_adj justify) { int b, b0, i, j, cnt; const char *p; char *q, buf[11]; cetype_t ienc = CE_NATIVE; /* We have to do something like this as the result is returned, and passed on by EncodeElement -- so no way could be end user be responsible for freeing it. However, this is not thread-safe. */ static R_StringBuffer gBuffer = {NULL, 0, BUFSIZE}; R_StringBuffer *buffer = &gBuffer; if (s == NA_STRING) { p = quote ? CHAR(R_print.na_string) : CHAR(R_print.na_string_noquote); cnt = i = (int)(quote ? strlen(CHAR(R_print.na_string)) : strlen(CHAR(R_print.na_string_noquote))); quote = 0; } else { #ifdef Win32 if(WinUTF8out) { ienc = getCharCE(s); if(ienc == CE_UTF8) { p = CHAR(s); i = Rstrlen(s, quote); cnt = LENGTH(s); } else { p = translateChar0(s); if(p == CHAR(s)) { i = Rstrlen(s, quote); cnt = LENGTH(s); } else { cnt = strlen(p); i = Rstrwid(p, cnt, CE_NATIVE, quote); } ienc = CE_NATIVE; } } else #endif { if(IS_BYTES(s)) { p = CHAR(s); cnt = (int) strlen(p); const char *q; char *pp = R_alloc(4*cnt+1, 1), *qq = pp, buf[5]; for (q = p; *q; q++) { unsigned char k = (unsigned char) *q; if (k >= 0x20 && k < 0x80) { *qq++ = *q; if (quote && *q == '"') cnt++; } else { snprintf(buf, 5, "\\x%02x", k); for(j = 0; j < 4; j++) *qq++ = buf[j]; cnt += 3; } } *qq = '\0'; p = pp; i = cnt; } else { p = translateChar(s); if(p == CHAR(s)) { i = Rstrlen(s, quote); cnt = LENGTH(s); } else { cnt = (int) strlen(p); i = Rstrwid(p, cnt, CE_NATIVE, quote); } } } } /* We need enough space for the encoded string, including escapes. Octal encoding turns one byte into four. \u encoding can turn a multibyte into six or ten, but it turns 2/3 into 6, and 4 (and perhaps 5/6) into 10. Let's be wasteful here (the worst case appears to be an MBCS with one byte for an upper-plane Unicode point output as ten bytes, but I doubt that such an MBCS exists: two bytes is plausible). +2 allows for quotes, +6 for UTF_8 escapes. */ q = R_AllocStringBuffer(imax2(5*cnt+8, w), buffer); b = w - i - (quote ? 2 : 0); /* total amount of padding */ if(justify == Rprt_adj_none) b = 0; if(b > 0 && justify != Rprt_adj_left) { b0 = (justify == Rprt_adj_centre) ? b/2 : b; for(i = 0 ; i < b0 ; i++) *q++ = ' '; b -= b0; } if(quote) *q++ = (char) quote; if(mbcslocale || ienc == CE_UTF8) { int j, res; mbstate_t mb_st; wchar_t wc; unsigned int k; /* not wint_t as it might be signed */ #ifndef __STDC_ISO_10646__ Rboolean Unicode_warning = FALSE; #endif if(ienc != CE_UTF8) mbs_init(&mb_st); #ifdef Win32 else if(WinUTF8out) { memcpy(q, UTF8in, 3); q += 3; } #endif for (i = 0; i < cnt; i++) { res = (int)((ienc == CE_UTF8) ? utf8toucs(&wc, p): mbrtowc(&wc, p, MB_CUR_MAX, NULL)); if(res >= 0) { /* res = 0 is a terminator */ k = wc; /* To be portable, treat \0 explicitly */ if(res == 0) {k = 0; wc = L'\0';} if(0x20 <= k && k < 0x7f && iswprint(wc)) { switch(wc) { case L'\\': *q++ = '\\'; *q++ = '\\'; p++; break; case L'\'': case L'"': if(quote == *p) *q++ = '\\'; *q++ = *p++; break; default: for(j = 0; j < res; j++) *q++ = *p++; break; } } else if (k < 0x80) { /* ANSI Escapes */ switch(wc) { case L'\a': *q++ = '\\'; *q++ = 'a'; break; case L'\b': *q++ = '\\'; *q++ = 'b'; break; case L'\f': *q++ = '\\'; *q++ = 'f'; break; case L'\n': *q++ = '\\'; *q++ = 'n'; break; case L'\r': *q++ = '\\'; *q++ = 'r'; break; case L'\t': *q++ = '\\'; *q++ = 't'; break; case L'\v': *q++ = '\\'; *q++ = 'v'; break; case L'\0': *q++ = '\\'; *q++ = '0'; break; default: /* print in octal */ snprintf(buf, 5, "\\%03o", k); for(j = 0; j < 4; j++) *q++ = buf[j]; break; } p++; } else { if(iswprint(wc)) { /* The problem here is that wc may be printable according to the Unicode tables, but it may not be printable on the output device concerned. */ for(j = 0; j < res; j++) *q++ = *p++; } else { #ifndef Win32 # ifndef __STDC_ISO_10646__ Unicode_warning = TRUE; # endif if(k > 0xffff) snprintf(buf, 11, "\\U%08x", k); else #endif snprintf(buf, 11, "\\u%04x", k); j = (int) strlen(buf); memcpy(q, buf, j); q += j; p += res; } i += (res - 1); } } else { /* invalid char */ snprintf(q, 5, "\\x%02x", *((unsigned char *)p)); q += 4; p++; } } #ifndef __STDC_ISO_10646__ if(Unicode_warning) warning(_("it is not known that wchar_t is Unicode on this platform")); #endif } else for (i = 0; i < cnt; i++) { /* ASCII */ if((unsigned char) *p < 0x80) { if(*p != '\t' && isprint((int)*p)) { /* Windows has \t as printable */ switch(*p) { case '\\': *q++ = '\\'; *q++ = '\\'; break; case '\'': case '"': if(quote == *p) *q++ = '\\'; *q++ = *p; break; default: *q++ = *p; break; } } else switch(*p) { /* ANSI Escapes */ case '\a': *q++ = '\\'; *q++ = 'a'; break; case '\b': *q++ = '\\'; *q++ = 'b'; break; case '\f': *q++ = '\\'; *q++ = 'f'; break; case '\n': *q++ = '\\'; *q++ = 'n'; break; case '\r': *q++ = '\\'; *q++ = 'r'; break; case '\t': *q++ = '\\'; *q++ = 't'; break; case '\v': *q++ = '\\'; *q++ = 'v'; break; case '\0': *q++ = '\\'; *q++ = '0'; break; default: /* print in octal */ snprintf(buf, 5, "\\%03o", (unsigned char) *p); for(j = 0; j < 4; j++) *q++ = buf[j]; break; } p++; } else { /* 8 bit char */ #ifdef Win32 /* It seems Windows does not know what is printable! */ *q++ = *p++; #else if(!isprint((int)*p & 0xff)) { /* print in octal */ snprintf(buf, 5, "\\%03o", (unsigned char) *p); for(j = 0; j < 4; j++) *q++ = buf[j]; p++; } else *q++ = *p++; #endif } } #ifdef Win32 if(WinUTF8out && ienc == CE_UTF8) { memcpy(q, UTF8out, 3); q += 3; } #endif if(quote) *q++ = (char) quote; if(b > 0 && justify != Rprt_adj_right) { for(i = 0 ; i < b ; i++) *q++ = ' '; } *q = '\0'; return buffer->data; }
attribute_hidden int Rstrlen(SEXP s, int quote) { return Rstrwid(CHAR(s), LENGTH(s), getCharCE(s), quote); }