/**
 * Convert character vector between marked encodings and the encoding provided
 *
 * @param str     input character vector or list of raw vectors
 * @param to    target encoding, \code{NULL} or \code{""} for default enc
 * @param to_raw single logical, should list of raw vectors be returned?
 * @return a converted character vector or list of raw vectors
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-11-12)
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-28)
 *          use StriUcnv
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-01)
 *          calc required buf size a priori
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_encode_from_marked(SEXP str, SEXP to, SEXP to_raw)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   const char* selected_to   = stri__prepare_arg_enc(to, "to", true); /* this is R_alloc'ed */
   bool to_raw_logical = stri__prepare_arg_logical_1_notNA(to_raw, "to_raw");

   STRI__ERROR_HANDLER_BEGIN(1)
   R_len_t str_n = LENGTH(str);
   StriContainerUTF16 str_cont(str, str_n);

   // get the number of strings to convert; if == 0, then you know what's the result
   if (str_n <= 0) return Rf_allocVector(to_raw_logical?VECSXP:STRSXP, 0);

   // Open converters
   StriUcnv ucnv(selected_to);
   UConverter* uconv_to = ucnv.getConverter(true /*register_callbacks*/);

   // Get target encoding mark
   cetype_t encmark_to = to_raw_logical?CE_BYTES:ucnv.getCE();

   // Prepare out val
   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(to_raw_logical?VECSXP:STRSXP, str_n));

   // calculate required buf size
   R_len_t bufsize = 0;
   for (R_len_t i=0; i<str_n; ++i) {
      if (!str_cont.isNA(i) && str_cont.get(i).length() > bufsize)
         bufsize = str_cont.get(i).length();
   }
   bufsize = UCNV_GET_MAX_BYTES_FOR_STRING(bufsize, ucnv_getMaxCharSize(uconv_to));
   // "The calculated size is guaranteed to be sufficient for this conversion."
   String8buf buf(bufsize);

   for (R_len_t i=0; i<str_n; ++i) {
      if (str_cont.isNA(i)) {
         if (to_raw_logical) SET_VECTOR_ELT(ret, i, R_NilValue);
         else                SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      R_len_t curn_tmp = str_cont.get(i).length();
      const UChar* curs_tmp = str_cont.get(i).getBuffer(); // The buffer content is (probably) not NUL-terminated.
      if (!curs_tmp)
         throw StriException(MSG__INTERNAL_ERROR);

      UErrorCode status = U_ZERO_ERROR;
      ucnv_resetFromUnicode(uconv_to);
      R_len_t bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(),
            curs_tmp, curn_tmp, &status);
      if (bufneed <= buf.size()) {
         STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
      }
      else {// larger buffer needed [this shouldn't happen?]
         buf.resize(bufneed, false/*destroy contents*/);
         status = U_ZERO_ERROR;
         bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(),
               curs_tmp, curn_tmp, &status);
         STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
      }

      if (to_raw_logical) {
         SEXP outobj;
         STRI__PROTECT(outobj = Rf_allocVector(RAWSXP, bufneed));
         memcpy(RAW(outobj), buf.data(), (size_t)bufneed);
         SET_VECTOR_ELT(ret, i, outobj);
         STRI__UNPROTECT(1);
      }
      else {
         SET_STRING_ELT(ret, i,
            Rf_mkCharLenCE(buf.data(), bufneed, encmark_to));
      }
   }

   STRI__UNPROTECT_ALL
   return ret;

   STRI__ERROR_HANDLER_END({/* nothing special on error */})
}
Ejemplo n.º 2
0
CF_PRIVATE CFIndex __CFStringEncodingICUToBytes(const char *icuName, uint32_t flags, const UniChar *characters, CFIndex numChars, CFIndex *usedCharLen, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
    UConverter *converter;
    UErrorCode errorCode = U_ZERO_ERROR;
    const UTF16Char *source = characters;
    const UTF16Char *sourceLimit = source + numChars;
    char *destination = (char *)bytes;
    const char *destinationLimit = destination + maxByteLen;
    bool flush = ((0 == (flags & kCFStringEncodingPartialInput)) ? true : false);
    CFIndex status;

    if (NULL == (converter = __CFStringEncodingConverterCreateICUConverter(icuName, flags, false))) return kCFStringEncodingConverterUnavailable;

    if (0 == maxByteLen) {
        char buffer[MAX_BUFFER_SIZE];
        CFIndex totalLength = 0;

        while ((source < sourceLimit) && (U_ZERO_ERROR == errorCode)) {
            destination = buffer;
            destinationLimit = destination + MAX_BUFFER_SIZE;

            ucnv_fromUnicode(converter, &destination, destinationLimit, (const UChar **)&source, (const UChar *)sourceLimit, NULL, flush, &errorCode);

            totalLength += (destination - buffer);

            if (U_BUFFER_OVERFLOW_ERROR == errorCode) errorCode = U_ZERO_ERROR;
        }

        if (NULL != usedByteLen) *usedByteLen = totalLength;
    } else {
        ucnv_fromUnicode(converter, &destination, destinationLimit, (const UChar **)&source, (const UChar *)sourceLimit, NULL, flush, &errorCode);

#if HAS_ICU_BUG_6024743
/* Another critical ICU design issue. Similar to conversion error, source pointer returned from U_BUFFER_OVERFLOW_ERROR is already beyond the last valid character position. It renders the returned value from source entirely unusable. We have to manually back up until succeeding <rdar://problem/7183045> Intrestingly, this issue doesn't apply to ucnv_toUnicode. The asynmmetric nature makes this more dangerous */
        if (U_BUFFER_OVERFLOW_ERROR == errorCode) {
            const uint8_t *bitmap = CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet, 0);
            const uint8_t *nonBase;
            UTF32Char character;

            do {
                // Since the output buffer is filled, we can assume no invalid chars (including stray surrogates)
                do {
                    sourceLimit = (source - 1);
                    character = *sourceLimit;
                    nonBase = bitmap;

                    if (CFUniCharIsSurrogateLowCharacter(character)) {
                        --sourceLimit;
                        character = CFUniCharGetLongCharacterForSurrogatePair(*sourceLimit, character);
                        nonBase = CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet, (character >> 16) & 0x000F);
                        character &= 0xFFFF;
                    }
                } while ((sourceLimit > characters) && CFUniCharIsMemberOfBitmap(character, nonBase));

                if (sourceLimit > characters) {
                    source = characters;
                    destination = (char *)bytes;
                    errorCode = U_ZERO_ERROR;

                    ucnv_resetFromUnicode(converter);

                    ucnv_fromUnicode(converter, &destination, destinationLimit, (const UChar **)&source, (const UChar *)sourceLimit, NULL, flush, &errorCode);
                }
            } while (U_BUFFER_OVERFLOW_ERROR == errorCode);

            errorCode = U_BUFFER_OVERFLOW_ERROR;
        }
#endif
        if (NULL != usedByteLen) *usedByteLen = destination - (const char *)bytes;
    }

    status = ((U_ZERO_ERROR == errorCode) ? kCFStringEncodingConversionSuccess : ((U_BUFFER_OVERFLOW_ERROR == errorCode) ? kCFStringEncodingInsufficientOutputBufferLength : kCFStringEncodingInvalidInputStream));

    if (NULL != usedCharLen) {
#if HAS_ICU_BUG_6024743
/* ICU has a serious behavioral inconsistency issue that the source pointer returned from ucnv_fromUnicode() is after illegal input. We have to keep track of any changes in this area in order to prevent future binary compatiibility issues */
	if (kCFStringEncodingInvalidInputStream == status) {
#define MAX_ERROR_BUFFER_LEN (32)
	    UTF16Char errorBuffer[MAX_ERROR_BUFFER_LEN];
	    int8_t errorLength = MAX_ERROR_BUFFER_LEN;
#undef MAX_ERROR_BUFFER_LEN

	    errorCode = U_ZERO_ERROR;

	    ucnv_getInvalidUChars(converter, (UChar *)errorBuffer, &errorLength, &errorCode);

	    if (U_ZERO_ERROR == errorCode) {
		source -= errorLength;
	    } else {
		// Gah, something is terribly wrong. Reset everything
		source = characters; // 0 length
		if (NULL != usedByteLen) *usedByteLen = 0;
	    }
	}
#endif
	*usedCharLen = source - characters;
    }

    status |= __CFStringEncodingConverterReleaseICUConverter(converter, flags, status);

    return status;
}
Ejemplo n.º 3
0
/**
 * Convert character vector between given encodings
 *
 * @param str     input character vector or list of raw vectors
 * @param from  source encoding, \code{NULL} or \code{""} for default enc
 * @param to    target encoding, \code{NULL} or \code{""} for default enc
 * @param to_raw single logical, should list of raw vectors be returned?
 * @return a converted character vector or list of raw vectors
 *
 * @version 0.1 (Marek Gagolewski)
 * @version 0.2 (Marek Gagolewski) arg to_raw_added, encoding marking
 * @version 0.3 (Marek Gagolewski, 2013-06-16) make StriException-friendly
 * @version 0.4 (Marek Gagolewski, 2013-08-08) use StriContainerListRaw
 * @version 0.5 (Marek Gagolewski, 2013-11-20) BUGFIX call stri_encode_from_marked if necessary
 */
SEXP stri_encode(SEXP str, SEXP from, SEXP to, SEXP to_raw)
{
   const char* selected_from = stri__prepare_arg_enc(from, "from", true);
   if (!selected_from && Rf_isVectorAtomic(str))
      return stri_encode_from_marked(str, to, to_raw);

   str = stri_prepare_arg_list_raw(str, "str");
   const char* selected_to   = stri__prepare_arg_enc(to, "to", true);
   bool to_raw_logical = stri__prepare_arg_logical_1_notNA(to_raw, "to_raw");

   UConverter* uconv_from = NULL;
   UConverter* uconv_to = NULL;

   STRI__ERROR_HANDLER_BEGIN
   StriContainerListRaw str_cont(str);
   R_len_t str_n = str_cont.get_n();

   // get the number of strings to convert; if == 0, then you know what's the result
   if (str_n <= 0) return Rf_allocVector(to_raw_logical?VECSXP:STRSXP, 0);

   // Open converters
   uconv_from = stri__ucnv_open(selected_from);
   uconv_to = stri__ucnv_open(selected_to);


   // Get target encoding mark
   UErrorCode err = U_ZERO_ERROR;
   const char* uconv_to_name = ucnv_getName(uconv_to, &err);
   if (U_FAILURE(err))
      throw StriException(err);
   cetype_t encmark_to = CE_BYTES; // all other cases than the below ones
      // - bytes enc (this is reasonable, isn't it?)
   if (!to_raw_logical) { // otherwise not needed
      if (!strcmp(uconv_to_name, "US-ASCII") || !strcmp(uconv_to_name, "UTF-8"))
         encmark_to = CE_UTF8; // no CE for ASCII, will be auto-detected by mkCharLenCE
      else if (!strcmp(uconv_to_name, "ISO-8859-1"))
         encmark_to = CE_LATIN1;
      else if (!strcmp(uconv_to_name, ucnv_getDefaultName()))
         encmark_to = CE_NATIVE;
   }

   // Prepare out val
   SEXP ret;
   PROTECT(ret = Rf_allocVector(to_raw_logical?VECSXP:STRSXP, str_n));

   String8 buf(0); // will be extended in a moment

   for (R_len_t i=0; i<str_n; ++i) {
      if (str_cont.isNA(i)) {
         if (to_raw_logical) SET_VECTOR_ELT(ret, i, R_NilValue);
         else                SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      const char* curd = str_cont.get(i).c_str();
      R_len_t curn     = str_cont.get(i).length();

      err = U_ZERO_ERROR;
      UnicodeString encs(curd, curn, uconv_from, err); // FROM -> UTF-16 [this is the slow part]
      if (U_FAILURE(err))
         throw StriException(err);

      R_len_t curn_tmp = encs.length();
      const UChar* curs_tmp = encs.getBuffer(); // The buffer contents is (probably) not NUL-terminated.
      if (!curs_tmp)
         throw StriException(MSG__INTERNAL_ERROR);

      R_len_t bufneed = UCNV_GET_MAX_BYTES_FOR_STRING(curn_tmp, ucnv_getMaxCharSize(uconv_to));
      // "The calculated size is guaranteed to be sufficient for this conversion."
      buf.resize(bufneed);

      err = U_ZERO_ERROR;
//      bufneed = encs.extract(buf.data(), buf.size(), uconv_to, err); // UTF-16 -> TO
      ucnv_resetFromUnicode(uconv_to);
      bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(), curs_tmp, curn_tmp, &err);
      if (bufneed <= buf.size()) {
         if (U_FAILURE(err))
            throw StriException(err);
      }
      else {// larger buffer needed [this shouldn't happen?]
//         warning("buf extending");
         buf.resize(bufneed);
         err = U_ZERO_ERROR;
         bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(), curs_tmp, curn_tmp, &err);
         if (U_FAILURE(err))
            throw StriException(err);
         if (bufneed > buf.size())
            throw StriException(MSG__INTERNAL_ERROR);
      }

      if (to_raw_logical) {
         SEXP outobj = Rf_allocVector(RAWSXP, bufneed);
         memcpy(RAW(outobj), buf.data(), (size_t)bufneed);
         SET_VECTOR_ELT(ret, i, outobj);
      }
      else {
         SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), bufneed, encmark_to));
      }
   }

   if (uconv_from) {
      ucnv_close(uconv_from);
      uconv_from = NULL;
   }
   if (uconv_to) {
      ucnv_close(uconv_to);
      uconv_to = NULL;
   }

   UNPROTECT(1);
   return ret;

   STRI__ERROR_HANDLER_END({
      if (uconv_from)
         ucnv_close(uconv_from);
      if (uconv_to)
         ucnv_close(uconv_to);
   })
}
static void NativeConverter_resetCharToByte(JNIEnv*, jclass, jlong address) {
    UConverter* cnv = toUConverter(address);
    if (cnv) {
        ucnv_resetFromUnicode(cnv);
    }
}