wchar_t *filenameToWchar_wcc(const SEXP fn, const Rboolean expand){ static wchar_t filename[BSIZE + 1]; void *obj; const char *from = "", *inbuf; char *outbuf; size_t inb, outb, res; if(!strlen(CHAR(fn))){ wcscpy(filename, L""); return filename; } if(IS_LATIN1(fn)) from = "latin1"; if(IS_UTF8(fn)) from = "UTF-8"; if(IS_BYTES(fn)) REprintf("encoding of a filename cannot be 'bytes'"); obj = Riconv_open("UCS-2LE", from); if(obj == (void *)(-1)) REprintf("unsupported conversion from '%s' in shellexec_wcc.c", from); if(expand) inbuf = R_ExpandFileName(CHAR(fn)); else inbuf = CHAR(fn); inb = strlen(inbuf)+1; outb = 2*BSIZE; outbuf = (char *) filename; res = Riconv(obj, &inbuf , &inb, &outbuf, &outb); Riconv_close(obj); if(inb > 0) REprintf("file name conversion problem -- name too long?"); if(res == -1) REprintf("file name conversion problem"); return filename; } /* End of filenameToWchar_wcc(). */
/** Check R encoding marking *for testing only* * This function should not be exported * * @param s character vector * * Results are printed on STDERR * * @version 0.1 (Marek Gagolewski) */ SEXP stri_test_Rmark(SEXP s) { #ifndef NDEBUG s = stri_prepare_arg_string(s, "str"); int ns = LENGTH(s); for (int i=0; i < ns; ++i) { fprintf(stdout, "!NDEBUG: Element #%d:\n", i); SEXP curs = STRING_ELT(s, i); if (curs == NA_STRING){ fprintf(stdout, "!NDEBUG: \tNA\n"); continue; } //const char* string = CHAR(curs); fprintf(stdout, "!NDEBUG: \tMARK_ASCII = %d\n", (IS_ASCII(curs) > 0)); fprintf(stdout, "!NDEBUG: \tMARK_UTF8 = %d\n", (IS_UTF8(curs) > 0)); fprintf(stdout, "!NDEBUG: \tMARK_LATIN1= %d\n", (IS_LATIN1(curs) > 0)); fprintf(stdout, "!NDEBUG: \tMARK_BYTES = %d\n", (IS_BYTES(curs) > 0)); fprintf(stdout, "!NDEBUG: \n"); } return R_NilValue; #else Rf_error("This function is enabled only if NDEBUG is undef."); return s; // s here avoids compiler warning #endif }
cetype_t getCharCE(SEXP x) { if(TYPEOF(x) != CHARSXP) error(_("'%s' must be called on a CHARSXP"), "getCharCE"); if(IS_UTF8(x)) return CE_UTF8; else if(IS_LATIN1(x)) return CE_LATIN1; else if(IS_BYTES(x)) return CE_BYTES; else return CE_NATIVE; }
/* This may return a R_alloc-ed result, so the caller has to manage the R_alloc stack */ const char *translateCharUTF8(SEXP x) { void *obj; const char *inbuf, *ans = CHAR(x); char *outbuf, *p; size_t inb, outb, res; R_StringBuffer cbuff = {NULL, 0, MAXELTSIZE}; if(TYPEOF(x) != CHARSXP) error(_("'%s' must be called on a CHARSXP"), "translateCharUTF8"); if(x == NA_STRING) return ans; if(IS_UTF8(x)) return ans; if(IS_ASCII(x)) return ans; if(IS_BYTES(x)) error(_("translating strings with \"bytes\" encoding is not allowed")); obj = Riconv_open("UTF-8", IS_LATIN1(x) ? "latin1" : ""); if(obj == (void *)(-1)) #ifdef Win32 error(_("unsupported conversion from '%s' in codepage %d"), "latin1", localeCP); #else error(_("unsupported conversion from '%s' to '%s'"), "latin1", "UTF-8"); #endif R_AllocStringBuffer(0, &cbuff); top_of_loop: inbuf = ans; inb = strlen(inbuf); outbuf = cbuff.data; outb = cbuff.bufsize - 1; /* First initialize output */ Riconv (obj, NULL, NULL, &outbuf, &outb); next_char: /* Then convert input */ res = Riconv(obj, &inbuf , &inb, &outbuf, &outb); if(res == -1 && errno == E2BIG) { R_AllocStringBuffer(2*cbuff.bufsize, &cbuff); goto top_of_loop; } else if(res == -1 && (errno == EILSEQ || errno == EINVAL)) { if(outb < 5) { R_AllocStringBuffer(2*cbuff.bufsize, &cbuff); goto top_of_loop; } snprintf(outbuf, 5, "<%02x>", (unsigned char)*inbuf); outbuf += 4; outb -= 4; inbuf++; inb--; goto next_char; } *outbuf = '\0'; Riconv_close(obj); res = strlen(cbuff.data) + 1; p = R_alloc(res, 1); memcpy(p, cbuff.data, res); R_FreeStringBuffer(&cbuff); return p; }
/** Get Declared Encodings of Each String * * @param str a character vector or an object coercible to * @return a character vector * * @version 0.2-1 (Marek Gagolewski, 2014-03-25) * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_enc_mark(SEXP str) { PROTECT(str = stri_prepare_arg_string(str, "str")); // prepare string argument STRI__ERROR_HANDLER_BEGIN(1) R_len_t str_len = LENGTH(str); // some of them will not be used in this call, but we're lazy SEXP mark_ascii, mark_latin1, mark_utf8, mark_native, mark_bytes; STRI__PROTECT(mark_ascii = Rf_mkChar("ASCII")); STRI__PROTECT(mark_latin1 = Rf_mkChar("latin1")); STRI__PROTECT(mark_utf8 = Rf_mkChar("UTF-8")); STRI__PROTECT(mark_native = Rf_mkChar("native")); STRI__PROTECT(mark_bytes = Rf_mkChar("bytes")); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, str_len)); for (R_len_t i=0; i<str_len; ++i) { SEXP curs = STRING_ELT(str, i); if (curs == NA_STRING) { SET_STRING_ELT(ret, i, NA_STRING); continue; } if (IS_ASCII(curs)) SET_STRING_ELT(ret, i, mark_ascii); else if (IS_UTF8(curs)) SET_STRING_ELT(ret, i, mark_utf8); else if (IS_BYTES(curs)) SET_STRING_ELT(ret, i, mark_bytes); else if (IS_LATIN1(curs)) SET_STRING_ELT(ret, i, mark_latin1); else SET_STRING_ELT(ret, i, mark_native); } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) }
/** * Count the number of characters in a string * * Note that ICU permits only strings of length < 2^31. * @param s R character vector * @return integer vector * * @version 0.1-?? (Marcin Bujarski) * * @version 0.1-?? (Marek Gagolewski) * Multiple input encoding support * * @version 0.1-?? (Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 0.2-1 (Marek Gagolewski, 2014-03-27) * using StriUcnv; * warn on invalid utf-8 sequences * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_length(SEXP str) { PROTECT(str = stri_prepare_arg_string(str, "str")); STRI__ERROR_HANDLER_BEGIN(1) R_len_t str_n = LENGTH(str); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(INTSXP, str_n)); int* retint = INTEGER(ret); StriUcnv ucnvNative(NULL); for (R_len_t k = 0; k < str_n; k++) { SEXP curs = STRING_ELT(str, k); if (curs == NA_STRING) { retint[k] = NA_INTEGER; continue; } R_len_t curs_n = LENGTH(curs); // O(1) - stored by R if (IS_ASCII(curs) || IS_LATIN1(curs)) { retint[k] = curs_n; } else if (IS_BYTES(curs)) { throw StriException(MSG__BYTESENC); } else if (IS_UTF8(curs) || ucnvNative.isUTF8()) { // utf8 or native-utf8 UChar32 c = 0; const char* curs_s = CHAR(curs); R_len_t j = 0; R_len_t i = 0; while (c >= 0 && j < curs_n) { U8_NEXT(curs_s, j, curs_n, c); // faster that U8_FWD_1 & gives bad UChar32s i++; } if (c < 0) { // invalid utf-8 sequence Rf_warning(MSG__INVALID_UTF8); retint[k] = NA_INTEGER; } else retint[k] = i; } else if (ucnvNative.is8bit()) { // native-8bit retint[k] = curs_n; } else { // native encoding, not 8 bit UConverter* uconv = ucnvNative.getConverter(); // native encoding which is neither 8-bit, nor UTF-8 (e.g. 'Big5') // this is weird, but we'll face it UErrorCode status = U_ZERO_ERROR; const char* source = CHAR(curs); const char* sourceLimit = source + curs_n; R_len_t j; for (j = 0; source != sourceLimit; j++) { /*ignore_retval=*/ucnv_getNextUChar(uconv, &source, sourceLimit, &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } retint[k] = j; // all right, we got it! } } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({ /* no special action on error */ }) }
/** * Construct String Container from R character vector * @param rstr R character vector * @param nrecycle extend length [vectorization] * @param shallowrecycle will \code{this->str} be ever modified? */ StriContainerUTF16::StriContainerUTF16(SEXP rstr, R_len_t _nrecycle, bool _shallowrecycle) { this->str = NULL; #ifndef NDEBUG if (!isString(rstr)) throw StriException("DEBUG: !isString in StriContainerUTF16::StriContainerUTF16(SEXP rstr)"); #endif R_len_t nrstr = LENGTH(rstr); this->init_Base(nrstr, _nrecycle, _shallowrecycle); // calling LENGTH(rstr) fails on constructor call if (this->n > 0) { this->str = new UnicodeString*[this->n]; for (R_len_t i=0; i<this->n; ++i) this->str[i] = NULL; // in case it fails during conversion (this is NA) UConverter* ucnvASCII = NULL; // UConverter* ucnvUTF8 = NULL; UConverter* ucnvLatin1 = NULL; UConverter* ucnvNative = NULL; for (R_len_t i=0; i<nrstr; ++i) { SEXP curs = STRING_ELT(rstr, i); if (curs == NA_STRING) { continue; // keep NA } else { if (IS_ASCII(curs)) { if (!ucnvASCII) ucnvASCII = stri__ucnv_open("ASCII"); UErrorCode status = U_ZERO_ERROR; this->str[i] = new UnicodeString(CHAR(curs), LENGTH(curs), ucnvASCII, status); if (U_FAILURE(status)) throw StriException(status); // Performance improvement attempt #1: // this->str[i] = new UnicodeString(UnicodeString::fromUTF8(CHAR(curs))); // slower than the above // Performance improvement attempt #2: // Create UChar buf with LENGTH(curs) items, fill it with (CHAR(curs)[i], 0x00), i=1,... // This wasn't faster tham the ucnvASCII approach. } else if (IS_UTF8(curs)) { // the above ASCII-approach (but with ucnvUTF8) is slower for UTF-8 this->str[i] = new UnicodeString(UnicodeString::fromUTF8(CHAR(curs))); } else if (IS_LATIN1(curs)) { if (!ucnvLatin1) ucnvLatin1 = stri__ucnv_open("ISO-8859-1"); UErrorCode status = U_ZERO_ERROR; this->str[i] = new UnicodeString(CHAR(curs), LENGTH(curs), ucnvLatin1, status); if (U_FAILURE(status)) throw StriException(status); } else if (IS_BYTES(curs)) throw StriException(MSG__BYTESENC); else { // Any encoding - detection needed // Assume it's Native; this assumes the user working in an 8-bit environment // would convert strings to UTF-8 manually if needed - I think is's // a more reasonable approach (Native --> input via keyboard) if (!ucnvNative) ucnvNative = stri__ucnv_open((char*)NULL); UErrorCode status = U_ZERO_ERROR; this->str[i] = new UnicodeString(CHAR(curs), LENGTH(curs), ucnvNative, status); if (U_FAILURE(status)) throw StriException(status); } } } if (ucnvASCII) ucnv_close(ucnvASCII); // if (ucnvUTF8) ucnv_close(ucnvUTF8); if (ucnvLatin1) ucnv_close(ucnvLatin1); if (ucnvNative) ucnv_close(ucnvNative); if (!_shallowrecycle) { for (R_len_t i=nrstr; i<this->n; ++i) { if (this->str[i%nrstr] == NULL) this->str[i] = NULL; else this->str[i] = new UnicodeString(*this->str[i%nrstr]); } } } }
SEXP installTrChar(SEXP x) { void * obj; const char *inbuf, *ans = CHAR(x); char *outbuf; size_t inb, outb, res; cetype_t ienc = getCharCE(x); R_StringBuffer cbuff = {NULL, 0, MAXELTSIZE}; if(TYPEOF(x) != CHARSXP) error(_("'%s' must be called on a CHARSXP"), "installTrChar"); if(x == NA_STRING || !(ENC_KNOWN(x))) return install(ans); if(IS_BYTES(x)) error(_("translating strings with \"bytes\" encoding is not allowed")); if(utf8locale && IS_UTF8(x)) return install(ans); if(latin1locale && IS_LATIN1(x)) return install(ans); if(IS_ASCII(x)) return install(ans); if(IS_LATIN1(x)) { if(!latin1_obj) { obj = Riconv_open("", "latin1"); /* should never happen */ if(obj == (void *)(-1)) #ifdef Win32 error(_("unsupported conversion from '%s' in codepage %d"), "latin1", localeCP); #else error(_("unsupported conversion from '%s' to '%s'"), "latin1", ""); #endif latin1_obj = obj; } obj = latin1_obj; } else { if(!utf8_obj) { obj = Riconv_open("", "UTF-8"); /* should never happen */ if(obj == (void *)(-1)) #ifdef Win32 error(_("unsupported conversion from '%s' in codepage %d"), "latin1", localeCP); #else error(_("unsupported conversion from '%s' to '%s'"), "latin1", ""); #endif utf8_obj = obj; } obj = utf8_obj; } R_AllocStringBuffer(0, &cbuff); top_of_loop: inbuf = ans; inb = strlen(inbuf); outbuf = cbuff.data; outb = cbuff.bufsize - 1; /* First initialize output */ Riconv (obj, NULL, NULL, &outbuf, &outb); next_char: /* Then convert input */ res = Riconv(obj, &inbuf , &inb, &outbuf, &outb); if(res == -1 && errno == E2BIG) { R_AllocStringBuffer(2*cbuff.bufsize, &cbuff); goto top_of_loop; } else if(res == -1 && (errno == EILSEQ || errno == EINVAL)) { if(outb < 13) { R_AllocStringBuffer(2*cbuff.bufsize, &cbuff); goto top_of_loop; } if (ienc == CE_UTF8) { /* if starting in UTF-8, use \uxxxx */ /* This must be the first byte */ size_t clen; wchar_t wc; clen = utf8toucs(&wc, inbuf); if(clen > 0 && inb >= clen) { inbuf += clen; inb -= clen; # ifndef Win32 if((unsigned int) wc < 65536) { # endif snprintf(outbuf, 9, "<U+%04X>", (unsigned int) wc); outbuf += 8; outb -= 8; # ifndef Win32 } else { snprintf(outbuf, 13, "<U+%08X>", (unsigned int) wc); outbuf += 12; outb -= 12; } # endif } else { snprintf(outbuf, 5, "<%02x>", (unsigned char)*inbuf); outbuf += 4; outb -= 4; inbuf++; inb--; } } else { snprintf(outbuf, 5, "<%02x>", (unsigned char)*inbuf); outbuf += 4; outb -= 4; inbuf++; inb--; } goto next_char; } *outbuf = '\0'; SEXP Sans = install(cbuff.data); R_FreeStringBuffer(&cbuff); return Sans; }
/* This may return a R_alloc-ed result, so the caller has to manage the R_alloc stack */ attribute_hidden /* but not hidden on Windows, where it was used in tcltk.c */ const wchar_t *wtransChar(SEXP x) { void * obj; const char *inbuf, *ans = CHAR(x); char *outbuf; wchar_t *p; size_t inb, outb, res, top; Rboolean knownEnc = FALSE; R_StringBuffer cbuff = {NULL, 0, MAXELTSIZE}; if(TYPEOF(x) != CHARSXP) error(_("'%s' must be called on a CHARSXP"), "wtransChar"); if(IS_BYTES(x)) error(_("translating strings with \"bytes\" encoding is not allowed")); if(IS_LATIN1(x)) { if(!latin1_wobj) { obj = Riconv_open(TO_WCHAR, "latin1"); if(obj == (void *)(-1)) error(_("unsupported conversion from '%s' to '%s'"), "latin1", TO_WCHAR); latin1_wobj = obj; } else obj = latin1_wobj; knownEnc = TRUE; } else if(IS_UTF8(x)) { if(!utf8_wobj) { obj = Riconv_open(TO_WCHAR, "UTF-8"); if(obj == (void *)(-1)) error(_("unsupported conversion from '%s' to '%s'"), "latin1", TO_WCHAR); utf8_wobj = obj; } else obj = utf8_wobj; knownEnc = TRUE; } else { obj = Riconv_open(TO_WCHAR, ""); if(obj == (void *)(-1)) #ifdef Win32 error(_("unsupported conversion to '%s' from codepage %d"), TO_WCHAR, localeCP); #else error(_("unsupported conversion from '%s' to '%s'"), "", TO_WCHAR); #endif } R_AllocStringBuffer(0, &cbuff); top_of_loop: inbuf = ans; inb = strlen(inbuf); outbuf = cbuff.data; top = outb = cbuff.bufsize - 1; /* First initialize output */ Riconv (obj, NULL, NULL, &outbuf, &outb); next_char: /* Then convert input */ res = Riconv(obj, &inbuf , &inb, &outbuf, &outb); if(res == -1 && errno == E2BIG) { R_AllocStringBuffer(2*cbuff.bufsize, &cbuff); goto top_of_loop; } else if(res == -1 && (errno == EILSEQ || errno == EINVAL)) { if(outb < 5) { R_AllocStringBuffer(2*cbuff.bufsize, &cbuff); goto top_of_loop; } snprintf(outbuf, 5, "<%02x>", (unsigned char)*inbuf); outbuf += 4; outb -= 4; inbuf++; inb--; goto next_char; /* if(!knownEnc) Riconv_close(obj); error(_("invalid input in wtransChar")); */ } if(!knownEnc) Riconv_close(obj); res = (top - outb); /* terminator is 2 or 4 null bytes */ p = (wchar_t *) R_alloc(res+4, 1); memset(p, 0, res+4); memcpy(p, cbuff.data, res); R_FreeStringBuffer(&cbuff); return p; }
/** * Count the number of characters in a string * * Note that ICU permits only strings of length < 2^31. * @param s R character vector * @return integer vector * @version 0.1 (Marcin Bujarski) * @version 0.2 (Marek Gagolewski) Multiple input encoding support * @version 0.3 (Marek Gagolewski, 2013-06-16) make StriException-friendly */ SEXP stri_length(SEXP str) { str = stri_prepare_arg_string(str, "str"); R_len_t ns = LENGTH(str); SEXP ret; UConverter* uconv = NULL; bool uconv_8bit = false; bool uconv_utf8 = false; STRI__ERROR_HANDLER_BEGIN /* Note: ICU50 permits only int-size strings in U8_NEXT and U8_FWD_1 */ #define STRI_LENGTH_CALCULATE_UTF8 \ const char* qc = CHAR(q); \ R_len_t j = 0; \ for (R_len_t i = 0; i < nq; j++) \ U8_FWD_1(qc, i, nq); \ retint[k] = j; PROTECT(ret = Rf_allocVector(INTSXP, ns)); int* retint = INTEGER(ret); for (R_len_t k = 0; k < ns; k++) { SEXP q = STRING_ELT(str, k); if (q == NA_STRING) retint[k] = NA_INTEGER; else { R_len_t nq = LENGTH(q); // O(1) - stored by R // We trust (is that a wise assumption?) // R encoding marks; However, it there is no mark, // the string may have any encoding (ascii, latin1, utf8, native) if (IS_ASCII(q) || IS_LATIN1(q)) retint[k] = nq; else if (IS_BYTES(q)) throw StriException(MSG__BYTESENC); else if (IS_UTF8(q)) { STRI_LENGTH_CALCULATE_UTF8 } else { // Any encoding - detection needed // UTF-8 strings can be fairly reliably recognized as such by a // simple algorithm, i.e., the probability that a string of // characters in any other encoding appears as valid UTF-8 is low, // diminishing with increasing string length. // We have two possibilities here: // 1. Auto detect encoding: Is this ASCII or UTF-8? If not => use Native // This won't work correctly in some cases. // e.g. (c4,85) represents ("Polish a with ogonek") in UTF-8 // and ("A umlaut", "Ellipsis") in WINDOWS-1250 // 2. Assume it's Native; this assumes the user working in an 8-bit environment // would convert strings to UTF-8 manually if needed - I think is's // a more reasonable approach (Native --> input via keyboard) if (!uconv) { // open ucnv on demand uconv = stri__ucnv_open((const char*)NULL); // native decoder if (!uconv) { retint[k] = NA_INTEGER; continue; } uconv_8bit = ((int)ucnv_getMaxCharSize(uconv) == 1); if (!uconv_8bit) { UErrorCode err = U_ZERO_ERROR; const char* name = ucnv_getName(uconv, &err); if (U_FAILURE(err)) throw StriException("could not query default converter"); uconv_utf8 = !strncmp("UTF-8", name, 5); } } if (uconv_8bit) { retint[k] = nq; // it's an 8-bit encoding :-) } else if (uconv_utf8) { // it's UTF-8 STRI_LENGTH_CALCULATE_UTF8 } else { // native encoding which is neither 8-bit, nor UTF-8 (e.g. 'Big5') UErrorCode err = U_ZERO_ERROR; const char* source = CHAR(q); const char* sourceLimit = source + nq; R_len_t j; for (j = 0; source != sourceLimit; j++) { if (U_FAILURE(err)) break; // error from previous iteration // iterate through each native-encoded character: ucnv_getNextUChar(uconv, &source, sourceLimit, &err); } if (U_FAILURE(err)) { // error from last iteration Rf_warning("error determining length for native, neither 8-bit- nor UTF-8-encoded string."); retint[k] = NA_INTEGER; } else retint[k] = j; // all right, we got it! } } }