/** Check R encoding marking *for testing only* * This function should not be exported * * @param s character vector * * Results are printed on STDERR * * @version 0.1 (Marek Gagolewski) */ SEXP stri_test_Rmark(SEXP s) { #ifndef NDEBUG s = stri_prepare_arg_string(s, "str"); int ns = LENGTH(s); for (int i=0; i < ns; ++i) { fprintf(stdout, "!NDEBUG: Element #%d:\n", i); SEXP curs = STRING_ELT(s, i); if (curs == NA_STRING){ fprintf(stdout, "!NDEBUG: \tNA\n"); continue; } //const char* string = CHAR(curs); fprintf(stdout, "!NDEBUG: \tMARK_ASCII = %d\n", (IS_ASCII(curs) > 0)); fprintf(stdout, "!NDEBUG: \tMARK_UTF8 = %d\n", (IS_UTF8(curs) > 0)); fprintf(stdout, "!NDEBUG: \tMARK_LATIN1= %d\n", (IS_LATIN1(curs) > 0)); fprintf(stdout, "!NDEBUG: \tMARK_BYTES = %d\n", (IS_BYTES(curs) > 0)); fprintf(stdout, "!NDEBUG: \n"); } return R_NilValue; #else Rf_error("This function is enabled only if NDEBUG is undef."); return s; // s here avoids compiler warning #endif }
/* This may return a R_alloc-ed result, so the caller has to manage the R_alloc stack */ const char *translateChar0(SEXP x) { if(TYPEOF(x) != CHARSXP) error(_("'%s' must be called on a CHARSXP"), "translateChar0"); if(IS_BYTES(x)) return CHAR(x); return translateChar(x); }
wchar_t *filenameToWchar_wcc(const SEXP fn, const Rboolean expand){ static wchar_t filename[BSIZE + 1]; void *obj; const char *from = "", *inbuf; char *outbuf; size_t inb, outb, res; if(!strlen(CHAR(fn))){ wcscpy(filename, L""); return filename; } if(IS_LATIN1(fn)) from = "latin1"; if(IS_UTF8(fn)) from = "UTF-8"; if(IS_BYTES(fn)) REprintf("encoding of a filename cannot be 'bytes'"); obj = Riconv_open("UCS-2LE", from); if(obj == (void *)(-1)) REprintf("unsupported conversion from '%s' in shellexec_wcc.c", from); if(expand) inbuf = R_ExpandFileName(CHAR(fn)); else inbuf = CHAR(fn); inb = strlen(inbuf)+1; outb = 2*BSIZE; outbuf = (char *) filename; res = Riconv(obj, &inbuf , &inb, &outbuf, &outb); Riconv_close(obj); if(inb > 0) REprintf("file name conversion problem -- name too long?"); if(res == -1) REprintf("file name conversion problem"); return filename; } /* End of filenameToWchar_wcc(). */
cetype_t getCharCE(SEXP x) { if(TYPEOF(x) != CHARSXP) error(_("'%s' must be called on a CHARSXP"), "getCharCE"); if(IS_UTF8(x)) return CE_UTF8; else if(IS_LATIN1(x)) return CE_LATIN1; else if(IS_BYTES(x)) return CE_BYTES; else return CE_NATIVE; }
/* This may return a R_alloc-ed result, so the caller has to manage the R_alloc stack */ const char *translateCharUTF8(SEXP x) { void *obj; const char *inbuf, *ans = CHAR(x); char *outbuf, *p; size_t inb, outb, res; R_StringBuffer cbuff = {NULL, 0, MAXELTSIZE}; if(TYPEOF(x) != CHARSXP) error(_("'%s' must be called on a CHARSXP"), "translateCharUTF8"); if(x == NA_STRING) return ans; if(IS_UTF8(x)) return ans; if(IS_ASCII(x)) return ans; if(IS_BYTES(x)) error(_("translating strings with \"bytes\" encoding is not allowed")); obj = Riconv_open("UTF-8", IS_LATIN1(x) ? "latin1" : ""); if(obj == (void *)(-1)) #ifdef Win32 error(_("unsupported conversion from '%s' in codepage %d"), "latin1", localeCP); #else error(_("unsupported conversion from '%s' to '%s'"), "latin1", "UTF-8"); #endif R_AllocStringBuffer(0, &cbuff); top_of_loop: inbuf = ans; inb = strlen(inbuf); outbuf = cbuff.data; outb = cbuff.bufsize - 1; /* First initialize output */ Riconv (obj, NULL, NULL, &outbuf, &outb); next_char: /* Then convert input */ res = Riconv(obj, &inbuf , &inb, &outbuf, &outb); if(res == -1 && errno == E2BIG) { R_AllocStringBuffer(2*cbuff.bufsize, &cbuff); goto top_of_loop; } else if(res == -1 && (errno == EILSEQ || errno == EINVAL)) { if(outb < 5) { R_AllocStringBuffer(2*cbuff.bufsize, &cbuff); goto top_of_loop; } snprintf(outbuf, 5, "<%02x>", (unsigned char)*inbuf); outbuf += 4; outb -= 4; inbuf++; inb--; goto next_char; } *outbuf = '\0'; Riconv_close(obj); res = strlen(cbuff.data) + 1; p = R_alloc(res, 1); memcpy(p, cbuff.data, res); R_FreeStringBuffer(&cbuff); return p; }
/** Get Declared Encodings of Each String * * @param str a character vector or an object coercible to * @return a character vector * * @version 0.2-1 (Marek Gagolewski, 2014-03-25) * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_enc_mark(SEXP str) { PROTECT(str = stri_prepare_arg_string(str, "str")); // prepare string argument STRI__ERROR_HANDLER_BEGIN(1) R_len_t str_len = LENGTH(str); // some of them will not be used in this call, but we're lazy SEXP mark_ascii, mark_latin1, mark_utf8, mark_native, mark_bytes; STRI__PROTECT(mark_ascii = Rf_mkChar("ASCII")); STRI__PROTECT(mark_latin1 = Rf_mkChar("latin1")); STRI__PROTECT(mark_utf8 = Rf_mkChar("UTF-8")); STRI__PROTECT(mark_native = Rf_mkChar("native")); STRI__PROTECT(mark_bytes = Rf_mkChar("bytes")); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, str_len)); for (R_len_t i=0; i<str_len; ++i) { SEXP curs = STRING_ELT(str, i); if (curs == NA_STRING) { SET_STRING_ELT(ret, i, NA_STRING); continue; } if (IS_ASCII(curs)) SET_STRING_ELT(ret, i, mark_ascii); else if (IS_UTF8(curs)) SET_STRING_ELT(ret, i, mark_utf8); else if (IS_BYTES(curs)) SET_STRING_ELT(ret, i, mark_bytes); else if (IS_LATIN1(curs)) SET_STRING_ELT(ret, i, mark_latin1); else SET_STRING_ELT(ret, i, mark_native); } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) }
/** * Count the number of characters in a string * * Note that ICU permits only strings of length < 2^31. * @param s R character vector * @return integer vector * * @version 0.1-?? (Marcin Bujarski) * * @version 0.1-?? (Marek Gagolewski) * Multiple input encoding support * * @version 0.1-?? (Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 0.2-1 (Marek Gagolewski, 2014-03-27) * using StriUcnv; * warn on invalid utf-8 sequences * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_length(SEXP str) { PROTECT(str = stri_prepare_arg_string(str, "str")); STRI__ERROR_HANDLER_BEGIN(1) R_len_t str_n = LENGTH(str); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(INTSXP, str_n)); int* retint = INTEGER(ret); StriUcnv ucnvNative(NULL); for (R_len_t k = 0; k < str_n; k++) { SEXP curs = STRING_ELT(str, k); if (curs == NA_STRING) { retint[k] = NA_INTEGER; continue; } R_len_t curs_n = LENGTH(curs); // O(1) - stored by R if (IS_ASCII(curs) || IS_LATIN1(curs)) { retint[k] = curs_n; } else if (IS_BYTES(curs)) { throw StriException(MSG__BYTESENC); } else if (IS_UTF8(curs) || ucnvNative.isUTF8()) { // utf8 or native-utf8 UChar32 c = 0; const char* curs_s = CHAR(curs); R_len_t j = 0; R_len_t i = 0; while (c >= 0 && j < curs_n) { U8_NEXT(curs_s, j, curs_n, c); // faster that U8_FWD_1 & gives bad UChar32s i++; } if (c < 0) { // invalid utf-8 sequence Rf_warning(MSG__INVALID_UTF8); retint[k] = NA_INTEGER; } else retint[k] = i; } else if (ucnvNative.is8bit()) { // native-8bit retint[k] = curs_n; } else { // native encoding, not 8 bit UConverter* uconv = ucnvNative.getConverter(); // native encoding which is neither 8-bit, nor UTF-8 (e.g. 'Big5') // this is weird, but we'll face it UErrorCode status = U_ZERO_ERROR; const char* source = CHAR(curs); const char* sourceLimit = source + curs_n; R_len_t j; for (j = 0; source != sourceLimit; j++) { /*ignore_retval=*/ucnv_getNextUChar(uconv, &source, sourceLimit, &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } retint[k] = j; // all right, we got it! } } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({ /* no special action on error */ }) }
int R_nchar(SEXP string, nchar_type type_, Rboolean allowNA, Rboolean keepNA, const char* msg_name) { if (string == NA_STRING) return keepNA ? NA_INTEGER : 2; // else : switch(type_) { case Bytes: return LENGTH(string); break; case Chars: if (IS_UTF8(string)) { const char *p = CHAR(string); if (!utf8Valid(p)) { if (!allowNA) error(_("invalid multibyte string, %s"), msg_name); return NA_INTEGER; } else { int nc = 0; for( ; *p; p += utf8clen(*p)) nc++; return nc; } } else if (IS_BYTES(string)) { if (!allowNA) /* could do chars 0 */ error(_("number of characters is not computable in \"bytes\" encoding, %s"), msg_name); return NA_INTEGER; } else if (mbcslocale) { int nc = (int) mbstowcs(NULL, translateChar(string), 0); if (!allowNA && nc < 0) error(_("invalid multibyte string, %s"), msg_name); return (nc >= 0 ? nc : NA_INTEGER); } else return ((int) strlen(translateChar(string))); break; case Width: if (IS_UTF8(string)) { const char *p = CHAR(string); if (!utf8Valid(p)) { if (!allowNA) error(_("invalid multibyte string, %s"), msg_name); return NA_INTEGER; } else { wchar_t wc1; int nc = 0; for( ; *p; p += utf8clen(*p)) { utf8toucs(&wc1, p); nc += Ri18n_wcwidth(wc1); } return nc; } } else if (IS_BYTES(string)) { if (!allowNA) /* could do width 0 */ error(_("width is not computable for %s in \"bytes\" encoding"), msg_name); return NA_INTEGER; } else if (mbcslocale) { const char *xi = translateChar(string); int nc = (int) mbstowcs(NULL, xi, 0); if (nc >= 0) { const void *vmax = vmaxget(); wchar_t *wc = (wchar_t *) R_AllocStringBuffer((nc+1)*sizeof(wchar_t), &cbuff); mbstowcs(wc, xi, nc + 1); int nci18n = Ri18n_wcswidth(wc, 2147483647); vmaxset(vmax); return (nci18n < 1) ? nc : nci18n; } else if (allowNA) error(_("invalid multibyte string, %s"), msg_name); else return NA_INTEGER; } else return (int) strlen(translateChar(string)); } // switch return NA_INTEGER; // -Wall } // R_nchar()
SEXP attribute_hidden do_nchar(SEXP call, SEXP op, SEXP args, SEXP env) { SEXP d, s, x, stype; int i, len, allowNA; size_t ntype; int nc; const char *type; const char *xi; wchar_t *wc; const void *vmax; checkArity(op, args); if (isFactor(CAR(args))) error(_("'%s' requires a character vector"), "nchar()"); PROTECT(x = coerceVector(CAR(args), STRSXP)); if (!isString(x)) error(_("'%s' requires a character vector"), "nchar()"); len = LENGTH(x); stype = CADR(args); if (!isString(stype) || LENGTH(stype) != 1) error(_("invalid '%s' argument"), "type"); type = CHAR(STRING_ELT(stype, 0)); /* always ASCII */ ntype = strlen(type); if (ntype == 0) error(_("invalid '%s' argument"), "type"); allowNA = asLogical(CADDR(args)); if (allowNA == NA_LOGICAL) allowNA = 0; PROTECT(s = allocVector(INTSXP, len)); vmax = vmaxget(); for (i = 0; i < len; i++) { SEXP sxi = STRING_ELT(x, i); if (sxi == NA_STRING) { INTEGER(s)[i] = 2; continue; } if (strncmp(type, "bytes", ntype) == 0) { INTEGER(s)[i] = LENGTH(sxi); } else if (strncmp(type, "chars", ntype) == 0) { if (IS_UTF8(sxi)) { /* assume this is valid */ const char *p = CHAR(sxi); nc = 0; for( ; *p; p += utf8clen(*p)) nc++; INTEGER(s)[i] = nc; } else if (IS_BYTES(sxi)) { if (!allowNA) /* could do chars 0 */ error(_("number of characters is not computable for element %d in \"bytes\" encoding"), i+1); INTEGER(s)[i] = NA_INTEGER; } else if (mbcslocale) { nc = mbstowcs(NULL, translateChar(sxi), 0); if (!allowNA && nc < 0) error(_("invalid multibyte string %d"), i+1); INTEGER(s)[i] = nc >= 0 ? nc : NA_INTEGER; } else INTEGER(s)[i] = strlen(translateChar(sxi)); } else if (strncmp(type, "width", ntype) == 0) { if (IS_UTF8(sxi)) { /* assume this is valid */ const char *p = CHAR(sxi); wchar_t wc1; nc = 0; for( ; *p; p += utf8clen(*p)) { utf8toucs(&wc1, p); nc += Ri18n_wcwidth(wc1); } INTEGER(s)[i] = nc; } else if (IS_BYTES(sxi)) { if (!allowNA) /* could do width 0 */ error(_("width is not computable for element %d in \"bytes\" encoding"), i+1); INTEGER(s)[i] = NA_INTEGER; } else if (mbcslocale) { xi = translateChar(sxi); nc = mbstowcs(NULL, xi, 0); if (nc >= 0) { wc = (wchar_t *) R_AllocStringBuffer((nc+1)*sizeof(wchar_t), &cbuff); mbstowcs(wc, xi, nc + 1); INTEGER(s)[i] = Ri18n_wcswidth(wc, 2147483647); if (INTEGER(s)[i] < 1) INTEGER(s)[i] = nc; } else if (allowNA) error(_("invalid multibyte string %d"), i+1); else INTEGER(s)[i] = NA_INTEGER; } else INTEGER(s)[i] = strlen(translateChar(sxi)); } else error(_("invalid '%s' argument"), "type"); vmaxset(vmax); } R_FreeStringBufferL(&cbuff); if ((d = getAttrib(x, R_NamesSymbol)) != R_NilValue) setAttrib(s, R_NamesSymbol, d); if ((d = getAttrib(x, R_DimSymbol)) != R_NilValue) setAttrib(s, R_DimSymbol, d); if ((d = getAttrib(x, R_DimNamesSymbol)) != R_NilValue) setAttrib(s, R_DimNamesSymbol, d); UNPROTECT(2); return s; }
/** * Construct String Container from R character vector * @param rstr R character vector * @param nrecycle extend length [vectorization] * @param shallowrecycle will \code{this->str} be ever modified? */ StriContainerUTF16::StriContainerUTF16(SEXP rstr, R_len_t _nrecycle, bool _shallowrecycle) { this->str = NULL; #ifndef NDEBUG if (!isString(rstr)) throw StriException("DEBUG: !isString in StriContainerUTF16::StriContainerUTF16(SEXP rstr)"); #endif R_len_t nrstr = LENGTH(rstr); this->init_Base(nrstr, _nrecycle, _shallowrecycle); // calling LENGTH(rstr) fails on constructor call if (this->n > 0) { this->str = new UnicodeString*[this->n]; for (R_len_t i=0; i<this->n; ++i) this->str[i] = NULL; // in case it fails during conversion (this is NA) UConverter* ucnvASCII = NULL; // UConverter* ucnvUTF8 = NULL; UConverter* ucnvLatin1 = NULL; UConverter* ucnvNative = NULL; for (R_len_t i=0; i<nrstr; ++i) { SEXP curs = STRING_ELT(rstr, i); if (curs == NA_STRING) { continue; // keep NA } else { if (IS_ASCII(curs)) { if (!ucnvASCII) ucnvASCII = stri__ucnv_open("ASCII"); UErrorCode status = U_ZERO_ERROR; this->str[i] = new UnicodeString(CHAR(curs), LENGTH(curs), ucnvASCII, status); if (U_FAILURE(status)) throw StriException(status); // Performance improvement attempt #1: // this->str[i] = new UnicodeString(UnicodeString::fromUTF8(CHAR(curs))); // slower than the above // Performance improvement attempt #2: // Create UChar buf with LENGTH(curs) items, fill it with (CHAR(curs)[i], 0x00), i=1,... // This wasn't faster tham the ucnvASCII approach. } else if (IS_UTF8(curs)) { // the above ASCII-approach (but with ucnvUTF8) is slower for UTF-8 this->str[i] = new UnicodeString(UnicodeString::fromUTF8(CHAR(curs))); } else if (IS_LATIN1(curs)) { if (!ucnvLatin1) ucnvLatin1 = stri__ucnv_open("ISO-8859-1"); UErrorCode status = U_ZERO_ERROR; this->str[i] = new UnicodeString(CHAR(curs), LENGTH(curs), ucnvLatin1, status); if (U_FAILURE(status)) throw StriException(status); } else if (IS_BYTES(curs)) throw StriException(MSG__BYTESENC); else { // Any encoding - detection needed // Assume it's Native; this assumes the user working in an 8-bit environment // would convert strings to UTF-8 manually if needed - I think is's // a more reasonable approach (Native --> input via keyboard) if (!ucnvNative) ucnvNative = stri__ucnv_open((char*)NULL); UErrorCode status = U_ZERO_ERROR; this->str[i] = new UnicodeString(CHAR(curs), LENGTH(curs), ucnvNative, status); if (U_FAILURE(status)) throw StriException(status); } } } if (ucnvASCII) ucnv_close(ucnvASCII); // if (ucnvUTF8) ucnv_close(ucnvUTF8); if (ucnvLatin1) ucnv_close(ucnvLatin1); if (ucnvNative) ucnv_close(ucnvNative); if (!_shallowrecycle) { for (R_len_t i=nrstr; i<this->n; ++i) { if (this->str[i%nrstr] == NULL) this->str[i] = NULL; else this->str[i] = new UnicodeString(*this->str[i%nrstr]); } } } }
const char *EncodeString(SEXP s, int w, int quote, Rprt_adj justify) { int b, b0, i, j, cnt; const char *p; char *q, buf[11]; cetype_t ienc = CE_NATIVE; /* We have to do something like this as the result is returned, and passed on by EncodeElement -- so no way could be end user be responsible for freeing it. However, this is not thread-safe. */ static R_StringBuffer gBuffer = {NULL, 0, BUFSIZE}; R_StringBuffer *buffer = &gBuffer; if (s == NA_STRING) { p = quote ? CHAR(R_print.na_string) : CHAR(R_print.na_string_noquote); cnt = i = (int)(quote ? strlen(CHAR(R_print.na_string)) : strlen(CHAR(R_print.na_string_noquote))); quote = 0; } else { #ifdef Win32 if(WinUTF8out) { ienc = getCharCE(s); if(ienc == CE_UTF8) { p = CHAR(s); i = Rstrlen(s, quote); cnt = LENGTH(s); } else { p = translateChar0(s); if(p == CHAR(s)) { i = Rstrlen(s, quote); cnt = LENGTH(s); } else { cnt = strlen(p); i = Rstrwid(p, cnt, CE_NATIVE, quote); } ienc = CE_NATIVE; } } else #endif { if(IS_BYTES(s)) { p = CHAR(s); cnt = (int) strlen(p); const char *q; char *pp = R_alloc(4*cnt+1, 1), *qq = pp, buf[5]; for (q = p; *q; q++) { unsigned char k = (unsigned char) *q; if (k >= 0x20 && k < 0x80) { *qq++ = *q; if (quote && *q == '"') cnt++; } else { snprintf(buf, 5, "\\x%02x", k); for(j = 0; j < 4; j++) *qq++ = buf[j]; cnt += 3; } } *qq = '\0'; p = pp; i = cnt; } else { p = translateChar(s); if(p == CHAR(s)) { i = Rstrlen(s, quote); cnt = LENGTH(s); } else { cnt = (int) strlen(p); i = Rstrwid(p, cnt, CE_NATIVE, quote); } } } } /* We need enough space for the encoded string, including escapes. Octal encoding turns one byte into four. \u encoding can turn a multibyte into six or ten, but it turns 2/3 into 6, and 4 (and perhaps 5/6) into 10. Let's be wasteful here (the worst case appears to be an MBCS with one byte for an upper-plane Unicode point output as ten bytes, but I doubt that such an MBCS exists: two bytes is plausible). +2 allows for quotes, +6 for UTF_8 escapes. */ q = R_AllocStringBuffer(imax2(5*cnt+8, w), buffer); b = w - i - (quote ? 2 : 0); /* total amount of padding */ if(justify == Rprt_adj_none) b = 0; if(b > 0 && justify != Rprt_adj_left) { b0 = (justify == Rprt_adj_centre) ? b/2 : b; for(i = 0 ; i < b0 ; i++) *q++ = ' '; b -= b0; } if(quote) *q++ = (char) quote; if(mbcslocale || ienc == CE_UTF8) { int j, res; mbstate_t mb_st; wchar_t wc; unsigned int k; /* not wint_t as it might be signed */ #ifndef __STDC_ISO_10646__ Rboolean Unicode_warning = FALSE; #endif if(ienc != CE_UTF8) mbs_init(&mb_st); #ifdef Win32 else if(WinUTF8out) { memcpy(q, UTF8in, 3); q += 3; } #endif for (i = 0; i < cnt; i++) { res = (int)((ienc == CE_UTF8) ? utf8toucs(&wc, p): mbrtowc(&wc, p, MB_CUR_MAX, NULL)); if(res >= 0) { /* res = 0 is a terminator */ k = wc; /* To be portable, treat \0 explicitly */ if(res == 0) {k = 0; wc = L'\0';} if(0x20 <= k && k < 0x7f && iswprint(wc)) { switch(wc) { case L'\\': *q++ = '\\'; *q++ = '\\'; p++; break; case L'\'': case L'"': if(quote == *p) *q++ = '\\'; *q++ = *p++; break; default: for(j = 0; j < res; j++) *q++ = *p++; break; } } else if (k < 0x80) { /* ANSI Escapes */ switch(wc) { case L'\a': *q++ = '\\'; *q++ = 'a'; break; case L'\b': *q++ = '\\'; *q++ = 'b'; break; case L'\f': *q++ = '\\'; *q++ = 'f'; break; case L'\n': *q++ = '\\'; *q++ = 'n'; break; case L'\r': *q++ = '\\'; *q++ = 'r'; break; case L'\t': *q++ = '\\'; *q++ = 't'; break; case L'\v': *q++ = '\\'; *q++ = 'v'; break; case L'\0': *q++ = '\\'; *q++ = '0'; break; default: /* print in octal */ snprintf(buf, 5, "\\%03o", k); for(j = 0; j < 4; j++) *q++ = buf[j]; break; } p++; } else { if(iswprint(wc)) { /* The problem here is that wc may be printable according to the Unicode tables, but it may not be printable on the output device concerned. */ for(j = 0; j < res; j++) *q++ = *p++; } else { #ifndef Win32 # ifndef __STDC_ISO_10646__ Unicode_warning = TRUE; # endif if(k > 0xffff) snprintf(buf, 11, "\\U%08x", k); else #endif snprintf(buf, 11, "\\u%04x", k); j = (int) strlen(buf); memcpy(q, buf, j); q += j; p += res; } i += (res - 1); } } else { /* invalid char */ snprintf(q, 5, "\\x%02x", *((unsigned char *)p)); q += 4; p++; } } #ifndef __STDC_ISO_10646__ if(Unicode_warning) warning(_("it is not known that wchar_t is Unicode on this platform")); #endif } else for (i = 0; i < cnt; i++) { /* ASCII */ if((unsigned char) *p < 0x80) { if(*p != '\t' && isprint((int)*p)) { /* Windows has \t as printable */ switch(*p) { case '\\': *q++ = '\\'; *q++ = '\\'; break; case '\'': case '"': if(quote == *p) *q++ = '\\'; *q++ = *p; break; default: *q++ = *p; break; } } else switch(*p) { /* ANSI Escapes */ case '\a': *q++ = '\\'; *q++ = 'a'; break; case '\b': *q++ = '\\'; *q++ = 'b'; break; case '\f': *q++ = '\\'; *q++ = 'f'; break; case '\n': *q++ = '\\'; *q++ = 'n'; break; case '\r': *q++ = '\\'; *q++ = 'r'; break; case '\t': *q++ = '\\'; *q++ = 't'; break; case '\v': *q++ = '\\'; *q++ = 'v'; break; case '\0': *q++ = '\\'; *q++ = '0'; break; default: /* print in octal */ snprintf(buf, 5, "\\%03o", (unsigned char) *p); for(j = 0; j < 4; j++) *q++ = buf[j]; break; } p++; } else { /* 8 bit char */ #ifdef Win32 /* It seems Windows does not know what is printable! */ *q++ = *p++; #else if(!isprint((int)*p & 0xff)) { /* print in octal */ snprintf(buf, 5, "\\%03o", (unsigned char) *p); for(j = 0; j < 4; j++) *q++ = buf[j]; p++; } else *q++ = *p++; #endif } } #ifdef Win32 if(WinUTF8out && ienc == CE_UTF8) { memcpy(q, UTF8out, 3); q += 3; } #endif if(quote) *q++ = (char) quote; if(b > 0 && justify != Rprt_adj_right) { for(i = 0 ; i < b ; i++) *q++ = ' '; } *q = '\0'; return buffer->data; }
SEXP installTrChar(SEXP x) { void * obj; const char *inbuf, *ans = CHAR(x); char *outbuf; size_t inb, outb, res; cetype_t ienc = getCharCE(x); R_StringBuffer cbuff = {NULL, 0, MAXELTSIZE}; if(TYPEOF(x) != CHARSXP) error(_("'%s' must be called on a CHARSXP"), "installTrChar"); if(x == NA_STRING || !(ENC_KNOWN(x))) return install(ans); if(IS_BYTES(x)) error(_("translating strings with \"bytes\" encoding is not allowed")); if(utf8locale && IS_UTF8(x)) return install(ans); if(latin1locale && IS_LATIN1(x)) return install(ans); if(IS_ASCII(x)) return install(ans); if(IS_LATIN1(x)) { if(!latin1_obj) { obj = Riconv_open("", "latin1"); /* should never happen */ if(obj == (void *)(-1)) #ifdef Win32 error(_("unsupported conversion from '%s' in codepage %d"), "latin1", localeCP); #else error(_("unsupported conversion from '%s' to '%s'"), "latin1", ""); #endif latin1_obj = obj; } obj = latin1_obj; } else { if(!utf8_obj) { obj = Riconv_open("", "UTF-8"); /* should never happen */ if(obj == (void *)(-1)) #ifdef Win32 error(_("unsupported conversion from '%s' in codepage %d"), "latin1", localeCP); #else error(_("unsupported conversion from '%s' to '%s'"), "latin1", ""); #endif utf8_obj = obj; } obj = utf8_obj; } R_AllocStringBuffer(0, &cbuff); top_of_loop: inbuf = ans; inb = strlen(inbuf); outbuf = cbuff.data; outb = cbuff.bufsize - 1; /* First initialize output */ Riconv (obj, NULL, NULL, &outbuf, &outb); next_char: /* Then convert input */ res = Riconv(obj, &inbuf , &inb, &outbuf, &outb); if(res == -1 && errno == E2BIG) { R_AllocStringBuffer(2*cbuff.bufsize, &cbuff); goto top_of_loop; } else if(res == -1 && (errno == EILSEQ || errno == EINVAL)) { if(outb < 13) { R_AllocStringBuffer(2*cbuff.bufsize, &cbuff); goto top_of_loop; } if (ienc == CE_UTF8) { /* if starting in UTF-8, use \uxxxx */ /* This must be the first byte */ size_t clen; wchar_t wc; clen = utf8toucs(&wc, inbuf); if(clen > 0 && inb >= clen) { inbuf += clen; inb -= clen; # ifndef Win32 if((unsigned int) wc < 65536) { # endif snprintf(outbuf, 9, "<U+%04X>", (unsigned int) wc); outbuf += 8; outb -= 8; # ifndef Win32 } else { snprintf(outbuf, 13, "<U+%08X>", (unsigned int) wc); outbuf += 12; outb -= 12; } # endif } else { snprintf(outbuf, 5, "<%02x>", (unsigned char)*inbuf); outbuf += 4; outb -= 4; inbuf++; inb--; } } else { snprintf(outbuf, 5, "<%02x>", (unsigned char)*inbuf); outbuf += 4; outb -= 4; inbuf++; inb--; } goto next_char; } *outbuf = '\0'; SEXP Sans = install(cbuff.data); R_FreeStringBuffer(&cbuff); return Sans; }
/* This may return a R_alloc-ed result, so the caller has to manage the R_alloc stack */ attribute_hidden /* but not hidden on Windows, where it was used in tcltk.c */ const wchar_t *wtransChar(SEXP x) { void * obj; const char *inbuf, *ans = CHAR(x); char *outbuf; wchar_t *p; size_t inb, outb, res, top; Rboolean knownEnc = FALSE; R_StringBuffer cbuff = {NULL, 0, MAXELTSIZE}; if(TYPEOF(x) != CHARSXP) error(_("'%s' must be called on a CHARSXP"), "wtransChar"); if(IS_BYTES(x)) error(_("translating strings with \"bytes\" encoding is not allowed")); if(IS_LATIN1(x)) { if(!latin1_wobj) { obj = Riconv_open(TO_WCHAR, "latin1"); if(obj == (void *)(-1)) error(_("unsupported conversion from '%s' to '%s'"), "latin1", TO_WCHAR); latin1_wobj = obj; } else obj = latin1_wobj; knownEnc = TRUE; } else if(IS_UTF8(x)) { if(!utf8_wobj) { obj = Riconv_open(TO_WCHAR, "UTF-8"); if(obj == (void *)(-1)) error(_("unsupported conversion from '%s' to '%s'"), "latin1", TO_WCHAR); utf8_wobj = obj; } else obj = utf8_wobj; knownEnc = TRUE; } else { obj = Riconv_open(TO_WCHAR, ""); if(obj == (void *)(-1)) #ifdef Win32 error(_("unsupported conversion to '%s' from codepage %d"), TO_WCHAR, localeCP); #else error(_("unsupported conversion from '%s' to '%s'"), "", TO_WCHAR); #endif } R_AllocStringBuffer(0, &cbuff); top_of_loop: inbuf = ans; inb = strlen(inbuf); outbuf = cbuff.data; top = outb = cbuff.bufsize - 1; /* First initialize output */ Riconv (obj, NULL, NULL, &outbuf, &outb); next_char: /* Then convert input */ res = Riconv(obj, &inbuf , &inb, &outbuf, &outb); if(res == -1 && errno == E2BIG) { R_AllocStringBuffer(2*cbuff.bufsize, &cbuff); goto top_of_loop; } else if(res == -1 && (errno == EILSEQ || errno == EINVAL)) { if(outb < 5) { R_AllocStringBuffer(2*cbuff.bufsize, &cbuff); goto top_of_loop; } snprintf(outbuf, 5, "<%02x>", (unsigned char)*inbuf); outbuf += 4; outb -= 4; inbuf++; inb--; goto next_char; /* if(!knownEnc) Riconv_close(obj); error(_("invalid input in wtransChar")); */ } if(!knownEnc) Riconv_close(obj); res = (top - outb); /* terminator is 2 or 4 null bytes */ p = (wchar_t *) R_alloc(res+4, 1); memset(p, 0, res+4); memcpy(p, cbuff.data, res); R_FreeStringBuffer(&cbuff); return p; }
/* Note that NA_STRING is not handled separately here. This is deliberate -- see ?paste -- and implicitly coerces it to "NA" */ SEXP attribute_hidden do_paste(SEXP call, SEXP op, SEXP args, SEXP env) { SEXP ans, collapse, sep, x; int sepw, u_sepw, ienc; R_xlen_t i, j, k, maxlen, nx, pwidth; const char *s, *cbuf, *csep=NULL, *u_csep=NULL; char *buf; Rboolean allKnown, anyKnown, use_UTF8, use_Bytes, sepASCII = TRUE, sepUTF8 = FALSE, sepBytes = FALSE, sepKnown = FALSE, use_sep = (PRIMVAL(op) == 0); const void *vmax; checkArity(op, args); /* We use formatting and so we must initialize printing. */ PrintDefaults(); /* Check the arguments */ x = CAR(args); if (!isVectorList(x)) error(_("invalid first argument")); nx = xlength(x); if(use_sep) { /* paste(..., sep, .) */ sep = CADR(args); if (!isString(sep) || LENGTH(sep) <= 0 || STRING_ELT(sep, 0) == NA_STRING) error(_("invalid separator")); sep = STRING_ELT(sep, 0); csep = translateChar(sep); u_sepw = sepw = (int) strlen(csep); // will be short sepASCII = strIsASCII(csep); sepKnown = ENC_KNOWN(sep) > 0; sepUTF8 = IS_UTF8(sep); sepBytes = IS_BYTES(sep); collapse = CADDR(args); } else { /* paste0(..., .) */ u_sepw = sepw = 0; sep = R_NilValue;/* -Wall */ collapse = CADR(args); } if (!isNull(collapse)) if(!isString(collapse) || LENGTH(collapse) <= 0 || STRING_ELT(collapse, 0) == NA_STRING) error(_("invalid '%s' argument"), "collapse"); if(nx == 0) return (!isNull(collapse)) ? mkString("") : allocVector(STRSXP, 0); /* Maximum argument length, coerce if needed */ maxlen = 0; for (j = 0; j < nx; j++) { if (!isString(VECTOR_ELT(x, j))) { /* formerly in R code: moved to C for speed */ SEXP call, xj = VECTOR_ELT(x, j); if(OBJECT(xj)) { /* method dispatch */ PROTECT(call = lang2(install("as.character"), xj)); SET_VECTOR_ELT(x, j, eval(call, env)); UNPROTECT(1); } else if (isSymbol(xj)) SET_VECTOR_ELT(x, j, ScalarString(PRINTNAME(xj))); else SET_VECTOR_ELT(x, j, coerceVector(xj, STRSXP)); if (!isString(VECTOR_ELT(x, j))) error(_("non-string argument to internal 'paste'")); } if(xlength(VECTOR_ELT(x, j)) > maxlen) maxlen = xlength(VECTOR_ELT(x, j)); } if(maxlen == 0) return (!isNull(collapse)) ? mkString("") : allocVector(STRSXP, 0); PROTECT(ans = allocVector(STRSXP, maxlen)); for (i = 0; i < maxlen; i++) { /* Strategy for marking the encoding: if all inputs (including * the separator) are ASCII, so is the output and we don't * need to mark. Otherwise if all non-ASCII inputs are of * declared encoding, we should mark. * Need to be careful only to include separator if it is used. */ anyKnown = FALSE; allKnown = TRUE; use_UTF8 = FALSE; use_Bytes = FALSE; if(nx > 1) { allKnown = sepKnown || sepASCII; anyKnown = sepKnown; use_UTF8 = sepUTF8; use_Bytes = sepBytes; } pwidth = 0; for (j = 0; j < nx; j++) { k = xlength(VECTOR_ELT(x, j)); if (k > 0) { SEXP cs = STRING_ELT(VECTOR_ELT(x, j), i % k); if(IS_UTF8(cs)) use_UTF8 = TRUE; if(IS_BYTES(cs)) use_Bytes = TRUE; } } if (use_Bytes) use_UTF8 = FALSE; vmax = vmaxget(); for (j = 0; j < nx; j++) { k = xlength(VECTOR_ELT(x, j)); if (k > 0) { if(use_Bytes) pwidth += strlen(CHAR(STRING_ELT(VECTOR_ELT(x, j), i % k))); else if(use_UTF8) pwidth += strlen(translateCharUTF8(STRING_ELT(VECTOR_ELT(x, j), i % k))); else pwidth += strlen(translateChar(STRING_ELT(VECTOR_ELT(x, j), i % k))); vmaxset(vmax); } } if(use_sep) { if (use_UTF8 && !u_csep) { u_csep = translateCharUTF8(sep); u_sepw = (int) strlen(u_csep); // will be short } pwidth += (nx - 1) * (use_UTF8 ? u_sepw : sepw); } if (pwidth > INT_MAX) error(_("result would exceed 2^31-1 bytes")); cbuf = buf = R_AllocStringBuffer(pwidth, &cbuff); vmax = vmaxget(); for (j = 0; j < nx; j++) { k = xlength(VECTOR_ELT(x, j)); if (k > 0) { SEXP cs = STRING_ELT(VECTOR_ELT(x, j), i % k); if (use_UTF8) { s = translateCharUTF8(cs); strcpy(buf, s); buf += strlen(s); } else { s = use_Bytes ? CHAR(cs) : translateChar(cs); strcpy(buf, s); buf += strlen(s); allKnown = allKnown && (strIsASCII(s) || (ENC_KNOWN(cs)> 0)); anyKnown = anyKnown || (ENC_KNOWN(cs)> 0); } } if (sepw != 0 && j != nx - 1) { if (use_UTF8) { strcpy(buf, u_csep); buf += u_sepw; } else { strcpy(buf, csep); buf += sepw; } } vmax = vmaxget(); } ienc = 0; if(use_UTF8) ienc = CE_UTF8; else if(use_Bytes) ienc = CE_BYTES; else if(anyKnown && allKnown) { if(known_to_be_latin1) ienc = CE_LATIN1; if(known_to_be_utf8) ienc = CE_UTF8; } SET_STRING_ELT(ans, i, mkCharCE(cbuf, ienc)); } /* Now collapse, if required. */ if(collapse != R_NilValue && (nx = XLENGTH(ans)) > 0) { sep = STRING_ELT(collapse, 0); use_UTF8 = IS_UTF8(sep); use_Bytes = IS_BYTES(sep); for (i = 0; i < nx; i++) { if(IS_UTF8(STRING_ELT(ans, i))) use_UTF8 = TRUE; if(IS_BYTES(STRING_ELT(ans, i))) use_Bytes = TRUE; } if(use_Bytes) { csep = CHAR(sep); use_UTF8 = FALSE; } else if(use_UTF8) csep = translateCharUTF8(sep); else csep = translateChar(sep); sepw = (int) strlen(csep); anyKnown = ENC_KNOWN(sep) > 0; allKnown = anyKnown || strIsASCII(csep); pwidth = 0; vmax = vmaxget(); for (i = 0; i < nx; i++) if(use_UTF8) { pwidth += strlen(translateCharUTF8(STRING_ELT(ans, i))); vmaxset(vmax); } else /* already translated */ pwidth += strlen(CHAR(STRING_ELT(ans, i))); pwidth += (nx - 1) * sepw; if (pwidth > INT_MAX) error(_("result would exceed 2^31-1 bytes")); cbuf = buf = R_AllocStringBuffer(pwidth, &cbuff); vmax = vmaxget(); for (i = 0; i < nx; i++) { if(i > 0) { strcpy(buf, csep); buf += sepw; } if(use_UTF8) s = translateCharUTF8(STRING_ELT(ans, i)); else /* already translated */ s = CHAR(STRING_ELT(ans, i)); strcpy(buf, s); while (*buf) buf++; allKnown = allKnown && (strIsASCII(s) || (ENC_KNOWN(STRING_ELT(ans, i)) > 0)); anyKnown = anyKnown || (ENC_KNOWN(STRING_ELT(ans, i)) > 0); if(use_UTF8) vmaxset(vmax); } UNPROTECT(1); ienc = CE_NATIVE; if(use_UTF8) ienc = CE_UTF8; else if(use_Bytes) ienc = CE_BYTES; else if(anyKnown && allKnown) { if(known_to_be_latin1) ienc = CE_LATIN1; if(known_to_be_utf8) ienc = CE_UTF8; } PROTECT(ans = allocVector(STRSXP, 1)); SET_STRING_ELT(ans, 0, mkCharCE(cbuf, ienc)); } R_FreeStringBufferL(&cbuff); UNPROTECT(1); return ans; }
/* format.default(x, trim, digits, nsmall, width, justify, na.encode, scientific) */ SEXP attribute_hidden do_format(SEXP call, SEXP op, SEXP args, SEXP env) { SEXP l, x, y, swd; int il, digits, trim = 0, nsmall = 0, wd = 0, adj = -1, na, sci = 0; int w, d, e; int wi, di, ei, scikeep; const char *strp; R_xlen_t i, n; checkArity(op, args); PrintDefaults(); scikeep = R_print.scipen; if (isEnvironment(x = CAR(args))) { return mkString(EncodeEnvironment(x)); } else if (!isVector(x)) error(_("first argument must be atomic")); args = CDR(args); trim = asLogical(CAR(args)); if (trim == NA_INTEGER) error(_("invalid '%s' argument"), "trim"); args = CDR(args); if (!isNull(CAR(args))) { digits = asInteger(CAR(args)); if (digits == NA_INTEGER || digits < R_MIN_DIGITS_OPT || digits > R_MAX_DIGITS_OPT) error(_("invalid '%s' argument"), "digits"); R_print.digits = digits; } args = CDR(args); nsmall = asInteger(CAR(args)); if (nsmall == NA_INTEGER || nsmall < 0 || nsmall > 20) error(_("invalid '%s' argument"), "nsmall"); args = CDR(args); if (isNull(swd = CAR(args))) wd = 0; else wd = asInteger(swd); if(wd == NA_INTEGER) error(_("invalid '%s' argument"), "width"); args = CDR(args); adj = asInteger(CAR(args)); if(adj == NA_INTEGER || adj < 0 || adj > 3) error(_("invalid '%s' argument"), "justify"); args = CDR(args); na = asLogical(CAR(args)); if(na == NA_LOGICAL) error(_("invalid '%s' argument"), "na.encode"); args = CDR(args); if(LENGTH(CAR(args)) != 1) error(_("invalid '%s' argument"), "scientific"); if(isLogical(CAR(args))) { int tmp = LOGICAL(CAR(args))[0]; if(tmp == NA_LOGICAL) sci = NA_INTEGER; else sci = tmp > 0 ?-100 : 100; } else if (isNumeric(CAR(args))) { sci = asInteger(CAR(args)); } else error(_("invalid '%s' argument"), "scientific"); if(sci != NA_INTEGER) R_print.scipen = sci; if ((n = XLENGTH(x)) <= 0) { PROTECT(y = allocVector(STRSXP, 0)); } else { switch (TYPEOF(x)) { case LGLSXP: PROTECT(y = allocVector(STRSXP, n)); if (trim) w = 0; else formatLogical(LOGICAL(x), n, &w); w = imax2(w, wd); for (i = 0; i < n; i++) { strp = EncodeLogical(LOGICAL(x)[i], w); SET_STRING_ELT(y, i, mkChar(strp)); } break; case INTSXP: PROTECT(y = allocVector(STRSXP, n)); if (trim) w = 0; else formatInteger(INTEGER(x), n, &w); w = imax2(w, wd); for (i = 0; i < n; i++) { strp = EncodeInteger(INTEGER(x)[i], w); SET_STRING_ELT(y, i, mkChar(strp)); } break; case REALSXP: formatReal(REAL(x), n, &w, &d, &e, nsmall); if (trim) w = 0; w = imax2(w, wd); PROTECT(y = allocVector(STRSXP, n)); for (i = 0; i < n; i++) { strp = EncodeReal0(REAL(x)[i], w, d, e, OutDec); SET_STRING_ELT(y, i, mkChar(strp)); } break; case CPLXSXP: formatComplex(COMPLEX(x), n, &w, &d, &e, &wi, &di, &ei, nsmall); if (trim) wi = w = 0; w = imax2(w, wd); wi = imax2(wi, wd); PROTECT(y = allocVector(STRSXP, n)); for (i = 0; i < n; i++) { strp = EncodeComplex(COMPLEX(x)[i], w, d, e, wi, di, ei, OutDec); SET_STRING_ELT(y, i, mkChar(strp)); } break; case STRSXP: { /* this has to be different from formatString/EncodeString as we don't actually want to encode here */ const char *s; char *q; int b, b0, cnt = 0, j; SEXP s0, xx; /* This is clumsy, but it saves rewriting and re-testing this complex code */ PROTECT(xx = duplicate(x)); for (i = 0; i < n; i++) { SEXP tmp = STRING_ELT(xx, i); if(IS_BYTES(tmp)) { const char *p = CHAR(tmp), *q; char *pp = R_alloc(4*strlen(p)+1, 1), *qq = pp, buf[5]; for (q = p; *q; q++) { unsigned char k = (unsigned char) *q; if (k >= 0x20 && k < 0x80) { *qq++ = *q; } else { snprintf(buf, 5, "\\x%02x", k); for(int j = 0; j < 4; j++) *qq++ = buf[j]; } } *qq = '\0'; s = pp; } else s = translateChar(tmp); if(s != CHAR(tmp)) SET_STRING_ELT(xx, i, mkChar(s)); } w = wd; if (adj != Rprt_adj_none) { for (i = 0; i < n; i++) if (STRING_ELT(xx, i) != NA_STRING) w = imax2(w, Rstrlen(STRING_ELT(xx, i), 0)); else if (na) w = imax2(w, R_print.na_width); } else w = 0; /* now calculate the buffer size needed, in bytes */ for (i = 0; i < n; i++) if (STRING_ELT(xx, i) != NA_STRING) { il = Rstrlen(STRING_ELT(xx, i), 0); cnt = imax2(cnt, LENGTH(STRING_ELT(xx, i)) + imax2(0, w-il)); } else if (na) cnt = imax2(cnt, R_print.na_width + imax2(0, w-R_print.na_width)); R_CheckStack2(cnt+1); char buff[cnt+1]; PROTECT(y = allocVector(STRSXP, n)); for (i = 0; i < n; i++) { if(!na && STRING_ELT(xx, i) == NA_STRING) { SET_STRING_ELT(y, i, NA_STRING); } else { q = buff; if(STRING_ELT(xx, i) == NA_STRING) s0 = R_print.na_string; else s0 = STRING_ELT(xx, i) ; s = CHAR(s0); il = Rstrlen(s0, 0); b = w - il; if(b > 0 && adj != Rprt_adj_left) { b0 = (adj == Rprt_adj_centre) ? b/2 : b; for(j = 0 ; j < b0 ; j++) *q++ = ' '; b -= b0; } for(j = 0; j < LENGTH(s0); j++) *q++ = *s++; if(b > 0 && adj != Rprt_adj_right) for(j = 0 ; j < b ; j++) *q++ = ' '; *q = '\0'; SET_STRING_ELT(y, i, mkChar(buff)); } } } UNPROTECT(2); /* xx , y */ PROTECT(y); break; default: error(_("Impossible mode ( x )")); y = R_NilValue;/* -Wall */ } } if((l = getAttrib(x, R_DimSymbol)) != R_NilValue) { setAttrib(y, R_DimSymbol, l); if((l = getAttrib(x, R_DimNamesSymbol)) != R_NilValue) setAttrib(y, R_DimNamesSymbol, l); } else if((l = getAttrib(x, R_NamesSymbol)) != R_NilValue) setAttrib(y, R_NamesSymbol, l); /* In case something else forgets to set PrintDefaults(), PR#14477 */ R_print.scipen = scikeep; UNPROTECT(1); /* y */ return y; }
/** * Count the number of characters in a string * * Note that ICU permits only strings of length < 2^31. * @param s R character vector * @return integer vector * @version 0.1 (Marcin Bujarski) * @version 0.2 (Marek Gagolewski) Multiple input encoding support * @version 0.3 (Marek Gagolewski, 2013-06-16) make StriException-friendly */ SEXP stri_length(SEXP str) { str = stri_prepare_arg_string(str, "str"); R_len_t ns = LENGTH(str); SEXP ret; UConverter* uconv = NULL; bool uconv_8bit = false; bool uconv_utf8 = false; STRI__ERROR_HANDLER_BEGIN /* Note: ICU50 permits only int-size strings in U8_NEXT and U8_FWD_1 */ #define STRI_LENGTH_CALCULATE_UTF8 \ const char* qc = CHAR(q); \ R_len_t j = 0; \ for (R_len_t i = 0; i < nq; j++) \ U8_FWD_1(qc, i, nq); \ retint[k] = j; PROTECT(ret = Rf_allocVector(INTSXP, ns)); int* retint = INTEGER(ret); for (R_len_t k = 0; k < ns; k++) { SEXP q = STRING_ELT(str, k); if (q == NA_STRING) retint[k] = NA_INTEGER; else { R_len_t nq = LENGTH(q); // O(1) - stored by R // We trust (is that a wise assumption?) // R encoding marks; However, it there is no mark, // the string may have any encoding (ascii, latin1, utf8, native) if (IS_ASCII(q) || IS_LATIN1(q)) retint[k] = nq; else if (IS_BYTES(q)) throw StriException(MSG__BYTESENC); else if (IS_UTF8(q)) { STRI_LENGTH_CALCULATE_UTF8 } else { // Any encoding - detection needed // UTF-8 strings can be fairly reliably recognized as such by a // simple algorithm, i.e., the probability that a string of // characters in any other encoding appears as valid UTF-8 is low, // diminishing with increasing string length. // We have two possibilities here: // 1. Auto detect encoding: Is this ASCII or UTF-8? If not => use Native // This won't work correctly in some cases. // e.g. (c4,85) represents ("Polish a with ogonek") in UTF-8 // and ("A umlaut", "Ellipsis") in WINDOWS-1250 // 2. Assume it's Native; this assumes the user working in an 8-bit environment // would convert strings to UTF-8 manually if needed - I think is's // a more reasonable approach (Native --> input via keyboard) if (!uconv) { // open ucnv on demand uconv = stri__ucnv_open((const char*)NULL); // native decoder if (!uconv) { retint[k] = NA_INTEGER; continue; } uconv_8bit = ((int)ucnv_getMaxCharSize(uconv) == 1); if (!uconv_8bit) { UErrorCode err = U_ZERO_ERROR; const char* name = ucnv_getName(uconv, &err); if (U_FAILURE(err)) throw StriException("could not query default converter"); uconv_utf8 = !strncmp("UTF-8", name, 5); } } if (uconv_8bit) { retint[k] = nq; // it's an 8-bit encoding :-) } else if (uconv_utf8) { // it's UTF-8 STRI_LENGTH_CALCULATE_UTF8 } else { // native encoding which is neither 8-bit, nor UTF-8 (e.g. 'Big5') UErrorCode err = U_ZERO_ERROR; const char* source = CHAR(q); const char* sourceLimit = source + nq; R_len_t j; for (j = 0; source != sourceLimit; j++) { if (U_FAILURE(err)) break; // error from previous iteration // iterate through each native-encoded character: ucnv_getNextUChar(uconv, &source, sourceLimit, &err); } if (U_FAILURE(err)) { // error from last iteration Rf_warning("error determining length for native, neither 8-bit- nor UTF-8-encoded string."); retint[k] = NA_INTEGER; } else retint[k] = j; // all right, we got it! } } }