/**
 * Count the number of characters (code points) in each string
 *
 * For every element of the input the result is:
 * - NA_INTEGER if the element is NA_STRING,
 * - the byte count if the element is marked ASCII or Latin-1, or if it is
 *   in a native encoding reported as 8-bit by the native converter,
 * - the number of code points decoded with U8_NEXT if the element is marked
 *   UTF-8 or the native encoding is UTF-8 (NA_INTEGER plus a warning is
 *   produced when an invalid byte sequence is met),
 * - the number of successful ucnv_getNextUChar() calls otherwise.
 *
 * An element marked as "bytes" makes the function throw a StriException.
 *
 * Note that ICU permits only strings of length < 2^31.
 *
 * @param str R character vector
 * @return integer vector
 *
 * @version 0.1-?? (Marcin Bujarski)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          Multiple input encoding support
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-27)
 *          using StriUcnv;
 *          warn on invalid utf-8 sequences
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *          Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_length(SEXP str)
{
    PROTECT(str = stri_prepare_arg_string(str, "str"));

    STRI__ERROR_HANDLER_BEGIN(1)
    const R_len_t n_strings = LENGTH(str);

    SEXP ret;
    STRI__PROTECT(ret = Rf_allocVector(INTSXP, n_strings));
    int* counts = INTEGER(ret);

    StriUcnv ucnvNative(NULL);

    for (R_len_t idx = 0; idx < n_strings; ++idx) {
        SEXP elem = STRING_ELT(str, idx);

        // missing value in, missing value out
        if (elem == NA_STRING) {
            counts[idx] = NA_INTEGER;
            continue;
        }

        const R_len_t n_bytes = LENGTH(elem);  // O(1) - stored by R

        // one byte per character
        if (IS_ASCII(elem) || IS_LATIN1(elem)) {
            counts[idx] = n_bytes;
            continue;
        }

        // "bytes"-marked strings are rejected
        if (IS_BYTES(elem)) {
            throw StriException(MSG__BYTESENC);
        }

        // UTF-8, either marked explicitly or via the native encoding
        if (IS_UTF8(elem) || ucnvNative.isUTF8()) {
            const char* bytes = CHAR(elem);
            UChar32 cp = 0;
            R_len_t pos = 0;
            R_len_t n_codepoints = 0;

            // U8_NEXT is faster than U8_FWD_1 and yields a negative
            // UChar32 on a malformed sequence, which stops the scan
            for (; cp >= 0 && pos < n_bytes; ++n_codepoints) {
                U8_NEXT(bytes, pos, n_bytes, cp);
            }

            if (cp >= 0) {
                counts[idx] = n_codepoints;
            }
            else {
                // invalid utf-8 sequence
                Rf_warning(MSG__INVALID_UTF8);
                counts[idx] = NA_INTEGER;
            }
            continue;
        }

        // native 8-bit encoding: again one byte per character
        if (ucnvNative.is8bit()) {
            counts[idx] = n_bytes;
            continue;
        }

        // native encoding which is neither 8-bit, nor UTF-8 (e.g. 'Big5');
        // this is weird, but we'll face it: step through the string
        // with the native converter, one character per call
        UConverter* uconv = ucnvNative.getConverter();
        UErrorCode status = U_ZERO_ERROR;
        const char* cursor = CHAR(elem);
        const char* const cursorEnd = cursor + n_bytes;

        R_len_t n_chars = 0;
        while (cursor != cursorEnd) {
            /*ignore_retval=*/ucnv_getNextUChar(uconv, &cursor, cursorEnd, &status);
            STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
            ++n_chars;
        }
        counts[idx] = n_chars;  // all right, we got it!
    }

    STRI__UNPROTECT_ALL
    return ret;
    STRI__ERROR_HANDLER_END({ /* no special action on error */ })
}
/** * Construct String Container from an R character vector * * @param rstr R character vector * @param nrecycle extend length [vectorization] * @param shallowrecycle will \code{this->str} be ever modified? */ StriContainerUTF16::StriContainerUTF16(SEXP rstr, R_len_t _nrecycle, bool _shallowrecycle) { this->str = NULL; #ifndef NDEBUG if (!isString(rstr)) throw StriException("DEBUG: !isString in StriContainerUTF16::StriContainerUTF16(SEXP rstr)"); #endif R_len_t nrstr = LENGTH(rstr); this->init_Base(nrstr, _nrecycle, _shallowrecycle); // calling LENGTH(rstr) fails on constructor call if (this->n == 0) return; /* nothing more to do */ this->str = new UnicodeString[this->n]; if (!this->str) throw StriException(MSG__MEM_ALLOC_ERROR); for (R_len_t i=0; i<this->n; ++i) this->str[i].setToBogus(); // in case it fails during conversion (this is NA) /* Important: ICU provides full internationalization functionality without any conversion table data. The common library contains code to handle several important encodings algorithmically: US-ASCII, ISO-8859-1, UTF-7/8/16/32, SCSU, BOCU-1, CESU-8, and IMAP-mailbox-name */ StriUcnv ucnvASCII("US-ASCII"); StriUcnv ucnvLatin1("ISO-8859-1"); StriUcnv ucnvNative(NULL); for (R_len_t i=0; i<nrstr; ++i) { SEXP curs = STRING_ELT(rstr, i); if (curs == NA_STRING) { continue; // keep NA } if (IS_ASCII(curs)) { // Version 1: UConverter* ucnv = ucnvASCII.getConverter(); UErrorCode status = U_ZERO_ERROR; this->str[i].setTo( UnicodeString((const char*)CHAR(curs), (int32_t)LENGTH(curs), ucnv, status) ); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) // Performance improvement attempt #1: // this->str[i] = new UnicodeString(UnicodeString::fromUTF8(CHAR(curs))); // if (!this->str) throw StriException(MSG__MEM_ALLOC_ERROR); // slower than the above // Performance improvement attempt #2: // Create UChar buf with LENGTH(curs) items, fill it with (CHAR(curs)[i], 0x00), i=1,... // This wasn't faster than the ucnvASCII approach. 
// Performance improvement attempt #3: // slightly slower than ucnvASCII // R_len_t curs_n = LENGTH(curs); // const char* curs_s = CHAR(curs); // this->str[i].remove(); // unset bogus (NA) // UChar* buf = this->str[i].getBuffer(curs_n); // for (R_len_t k=0; k<curs_n; ++k) // buf[k] = (UChar)curs_s[k]; // well, this is ASCII :) // this->str[i].releaseBuffer(curs_n); } else if (IS_UTF8(curs)) { // using ucnvUTF8 is slower for UTF-8 // the same is done for native encoding && ucnvNative_isUTF8 this->str[i].setTo(UnicodeString::fromUTF8(CHAR(curs))); } else if (IS_LATIN1(curs)) { UConverter* ucnv = ucnvLatin1.getConverter(); UErrorCode status = U_ZERO_ERROR; this->str[i].setTo( UnicodeString((const char*)CHAR(curs), (int32_t)LENGTH(curs), ucnv, status) ); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } else if (IS_BYTES(curs)) {