Ejemplo n.º 1
0
/**
 * Determine the length of each string in a character vector
 *
 * How a single element is measured depends on its encoding mark:
 * - NA yields NA_INTEGER;
 * - ASCII- or Latin-1-marked strings, as well as strings in an 8-bit
 *   native encoding, yield their byte count;
 * - bytes-marked strings cause a StriException to be thrown;
 * - UTF-8 strings (marked as such, or native when the native encoding
 *   is UTF-8) yield the number of decoded code points; an invalid byte
 *   sequence triggers a warning and yields NA_INTEGER;
 * - strings in any other native encoding are counted by stepping
 *   through them with the native ICU converter.
 *
 * Note that ICU permits only strings of length < 2^31.
 * @param str R character vector
 * @return integer vector
 *
 * @version 0.1-?? (Marcin Bujarski)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          Multiple input encoding support
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-27)
 *          using StriUcnv;
 *          warn on invalid utf-8 sequences
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_length(SEXP str)
{
    PROTECT(str = stri_prepare_arg_string(str, "str"));

    STRI__ERROR_HANDLER_BEGIN(1)

    R_len_t n = LENGTH(str);
    SEXP result;
    STRI__PROTECT(result = Rf_allocVector(INTSXP, n));
    int* out = INTEGER(result);

    StriUcnv nativeConv(NULL);

    for (R_len_t idx = 0; idx < n; ++idx) {
        SEXP elem = STRING_ELT(str, idx);

        // missing value in, missing value out
        if (elem == NA_STRING) {
            out[idx] = NA_INTEGER;
            continue;
        }

        R_len_t nbytes = LENGTH(elem);  // byte count of this element

        // one byte per character: the byte count is the answer
        if (IS_ASCII(elem) || IS_LATIN1(elem)) {
            out[idx] = nbytes;
            continue;
        }

        // bytes-marked strings are rejected
        if (IS_BYTES(elem))
            throw StriException(MSG__BYTESENC);

        // explicitly UTF-8, or native encoding that happens to be UTF-8
        if (IS_UTF8(elem) || nativeConv.isUTF8()) {
            const char* bytes = CHAR(elem);
            UChar32 cp = 0;
            R_len_t pos = 0;
            R_len_t count = 0;

            // U8_NEXT sets cp to a negative value on a malformed sequence,
            // which terminates the scan early
            while (cp >= 0 && pos < nbytes) {
                U8_NEXT(bytes, pos, nbytes, cp);
                ++count;
            }

            if (cp >= 0) {
                out[idx] = count;
            }
            else { // invalid utf-8 sequence
                Rf_warning(MSG__INVALID_UTF8);
                out[idx] = NA_INTEGER;
            }
            continue;
        }

        // native 8-bit encoding: again, bytes == characters
        if (nativeConv.is8bit()) {
            out[idx] = nbytes;
            continue;
        }

        // native encoding which is neither 8-bit nor UTF-8 (e.g. 'Big5'):
        // step through the input one character at a time with the converter
        UConverter* conv = nativeConv.getConverter();
        UErrorCode status = U_ZERO_ERROR;
        const char* cursor = CHAR(elem);
        const char* limit = cursor + nbytes;
        R_len_t count = 0;
        while (cursor != limit) {
            /*ignore_retval=*/ucnv_getNextUChar(conv, &cursor, limit, &status);
            STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
            ++count;
        }
        out[idx] = count;
    }

    STRI__UNPROTECT_ALL
    return result;

    STRI__ERROR_HANDLER_END({ /* no special action on error */ })
}
Ejemplo n.º 2
0
/**
 * Construct String Container from an R character vector
 *
 * @param rstr R character vector
 * @param _nrecycle extend length [vectorization]
 * @param _shallowrecycle will \code{this->str} ever be modified?
 */
StriContainerUTF16::StriContainerUTF16(SEXP rstr, R_len_t _nrecycle, bool _shallowrecycle)
{
   this->str = NULL;
#ifndef NDEBUG
   if (!isString(rstr))
      throw StriException("DEBUG: !isString in StriContainerUTF16::StriContainerUTF16(SEXP rstr)");
#endif
   R_len_t nrstr = LENGTH(rstr);
   this->init_Base(nrstr, _nrecycle, _shallowrecycle); // calling LENGTH(rstr) fails on constructor call

   if (this->n == 0)
      return; /* nothing more to do */

   this->str = new UnicodeString[this->n];
   if (!this->str) throw StriException(MSG__MEM_ALLOC_ERROR);
   for (R_len_t i=0; i<this->n; ++i)
      this->str[i].setToBogus(); // in case it fails during conversion (this is NA)

   /* Important: ICU provides full internationalization functionality
   without any conversion table data. The common library contains
   code to handle several important encodings algorithmically: US-ASCII,
   ISO-8859-1, UTF-7/8/16/32, SCSU, BOCU-1, CESU-8, and IMAP-mailbox-name */
   StriUcnv ucnvASCII("US-ASCII");
   StriUcnv ucnvLatin1("ISO-8859-1");
   StriUcnv ucnvNative(NULL);

   for (R_len_t i=0; i<nrstr; ++i) {
      SEXP curs = STRING_ELT(rstr, i);
      if (curs == NA_STRING) {
         continue; // keep NA
      }

      if (IS_ASCII(curs)) {
         // Version 1:
         UConverter* ucnv = ucnvASCII.getConverter();
         UErrorCode status = U_ZERO_ERROR;
         this->str[i].setTo(
            UnicodeString((const char*)CHAR(curs), (int32_t)LENGTH(curs), ucnv, status)
         );
         STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

         // Performance improvement attempt #1:
         // this->str[i] = new UnicodeString(UnicodeString::fromUTF8(CHAR(curs)));
         // if (!this->str) throw StriException(MSG__MEM_ALLOC_ERROR);
         // slower than the above

         // Performance improvement attempt #2:
         // Create UChar buf with LENGTH(curs) items, fill it with (CHAR(curs)[i], 0x00), i=1,...
         // This wasn't faster than the ucnvASCII approach.

         // Performance improvement attempt #3:
         // slightly slower than ucnvASCII
         // R_len_t curs_n = LENGTH(curs);
         // const char* curs_s = CHAR(curs);
         // this->str[i].remove(); // unset bogus (NA)
         // UChar* buf = this->str[i].getBuffer(curs_n);
         // for (R_len_t k=0; k<curs_n; ++k)
         //   buf[k] = (UChar)curs_s[k]; // well, this is ASCII :)
         // this->str[i].releaseBuffer(curs_n);
      }
      else if (IS_UTF8(curs)) {
         // using ucnvUTF8 is slower for UTF-8
         // the same is done for native encoding && ucnvNative_isUTF8
         this->str[i].setTo(UnicodeString::fromUTF8(CHAR(curs)));
      }
      else if (IS_LATIN1(curs)) {
         UConverter* ucnv = ucnvLatin1.getConverter();
         UErrorCode status = U_ZERO_ERROR;
         this->str[i].setTo(
            UnicodeString((const char*)CHAR(curs), (int32_t)LENGTH(curs), ucnv, status)
         );
         STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
      }
      else if (IS_BYTES(curs)) {