/**
 * Convert a UTF-16 string to the character set named by `output_char_set` using ICU.
 *
 * @param instance         not used by this function
 * @param utf16            input UTF-16 code units
 * @param utf16_len        number of code units in `utf16`
 * @param output_char_set  charset name; resolved through encoding_alias_get_canonical_name()
 * @param on_error         what to do with unconvertible characters: fail, skip or substitute '?'
 *                         (unknown values are treated as "fail")
 * @param output_length    receives the number of bytes stored in the returned buffer, or 0 on
 *                         failure
 * @return buffer obtained from ppb_memory_mem_alloc(), or NULL on failure
 *
 * A U_BUFFER_OVERFLOW_ERROR from ICU is tolerated: the truncated output is returned and
 * `*output_length` is limited to the buffer capacity.
 */
char *
ppb_char_set_utf16_to_char_set(PP_Instance instance, const uint16_t *utf16, uint32_t utf16_len,
                               const char *output_char_set,
                               enum PP_CharSet_ConversionError on_error, uint32_t *output_length)
{
    // largest utf16_len for which the buffer size computed below neither wraps around in
    // uint32_t nor exceeds the int32_t capacity argument of ucnv_fromUChars()
    const uint32_t max_utf16_len = (0x7fffffffu - 1) / 4 - 1;
    const UChar    subst = '?';

    // everything the `err` label touches is initialised before the first `goto err`
    uint32_t    output_buffer_length = 0;
    char       *output = NULL;
    const char *charset = NULL;
    UConverter *u = NULL;
    UErrorCode  st = U_ZERO_ERROR;
    int32_t     len = 0;

    if (utf16_len > max_utf16_len) {
        trace_error("%s, input is too long, %u code units\n", __func__, utf16_len);
        goto err;
    }

    // each character could take up to 4 bytes in UTF-8; with additional zero-terminator byte
    output_buffer_length = (utf16_len + 1) * 4 + 1;
    output = ppb_memory_mem_alloc(output_buffer_length);
    if (!output) {
        trace_error("%s, can't allocate memory, %u bytes\n", __func__, output_buffer_length);
        goto err;
    }

    charset = encoding_alias_get_canonical_name(output_char_set);
    u = ucnv_open(charset, &st);
    if (!U_SUCCESS(st)) {
        trace_error("%s, wrong charset %s\n", __func__, output_char_set);
        goto err;
    }

    switch (on_error) {
    default:
    case PP_CHARSET_CONVERSIONERROR_FAIL:
        st = U_ZERO_ERROR;
        ucnv_setFromUCallBack(u, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &st);
        break;

    case PP_CHARSET_CONVERSIONERROR_SKIP:
        st = U_ZERO_ERROR;
        ucnv_setFromUCallBack(u, UCNV_FROM_U_CALLBACK_SKIP, NULL, NULL, NULL, &st);
        break;

    case PP_CHARSET_CONVERSIONERROR_SUBSTITUTE:
        st = U_ZERO_ERROR;
        ucnv_setFromUCallBack(u, UCNV_FROM_U_CALLBACK_SUBSTITUTE, NULL, NULL, NULL, &st);
        st = U_ZERO_ERROR;
        ucnv_setSubstString(u, &subst, 1, &st);
        break;
    }

    // callback setup is best effort; a failure there must not abort the conversion itself,
    // and ICU functions return immediately when handed a failure code
    st = U_ZERO_ERROR;
    len = ucnv_fromUChars(u, output, (int32_t)output_buffer_length, utf16, (int32_t)utf16_len,
                          &st);

    if (st != U_BUFFER_OVERFLOW_ERROR && !U_SUCCESS(st))
        goto err;

    // on overflow ICU returns the length it would have needed; never report more bytes than
    // the buffer actually holds
    if (len < 0)
        len = 0;
    if ((uint32_t)len > output_buffer_length)
        len = (int32_t)output_buffer_length;
    *output_length = (uint32_t)len;

    ucnv_close(u);
    return output;

err:
    *output_length = 0;
    ppb_memory_mem_free(output);
    if (u)
        ucnv_close(u);
    return NULL;
}
/** Creates a filtering output stream that converts data from one charset
  * to another using ICU before forwarding it to another stream.
  *
  * Opens one ICU converter for the source charset (m_from) and one for the
  * destination charset (m_to), then configures how invalid sequences are
  * handled according to the options.
  *
  * @param source charset of the data written to this stream
  * @param dest charset of the data forwarded to the underlying stream
  * @param os underlying output stream (dereferenced, must not be NULL)
  * @param opts conversion options
  * @throws exceptions::charset_conv_error if a converter cannot be opened
  * or configured
  */
charsetFilteredOutputStream_icu::charsetFilteredOutputStream_icu
	(const charset& source, const charset& dest,
	 outputStream* os, const charsetConverterOptions& opts)
	: m_from(NULL), m_to(NULL), m_sourceCharset(source),
	  m_destCharset(dest), m_stream(*os), m_options(opts)
{
	try
	{
		UErrorCode err = U_ZERO_ERROR;

		m_from = ucnv_open(source.getName().c_str(), &err);

		if (!U_SUCCESS(err))
		{
			throw exceptions::charset_conv_error
				("Cannot initialize ICU converter for source charset '"
					+ source.getName() + "' (error code: " + u_errorName(err) + ").");
		}

		m_to = ucnv_open(dest.getName().c_str(), &err);

		if (!U_SUCCESS(err))
		{
			throw exceptions::charset_conv_error
				("Cannot initialize ICU converter for destination charset '"
					+ dest.getName() + "' (error code: " + u_errorName(err) + ").");
		}

		// Tell ICU what to do when encountering an illegal byte sequence
		if (m_options.silentlyReplaceInvalidSequences)
		{
			// Set replacement chars for when converting from Unicode to codepage
			icu::UnicodeString substString(m_options.invalidSequence.c_str());
			ucnv_setSubstString(m_to, substString.getTerminatedBuffer(), -1, &err);

			if (U_FAILURE(err))
				throw exceptions::charset_conv_error("[ICU] Error when setting substitution string.");
		}
		else
		{
			// Tell ICU to stop (and return an error) on illegal byte sequences.
			// Decoding to Unicode is done by the source converter, so the ToU
			// callback belongs on m_from; encoding is done by m_to.
			ucnv_setToUCallBack
				(m_from, UCNV_TO_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err);

			if (U_FAILURE(err))
				throw exceptions::charset_conv_error("[ICU] Error when setting ToU callback.");

			ucnv_setFromUCallBack
				(m_to, UCNV_FROM_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err);

			if (U_FAILURE(err))
				throw exceptions::charset_conv_error("[ICU] Error when setting FromU callback.");
		}
	}
	catch (...)
	{
		// The destructor does not run when a constructor throws, so release
		// whatever converters were already opened before propagating.
		if (m_to)
		{
			ucnv_close(m_to);
			m_to = NULL;
		}

		if (m_from)
		{
			ucnv_close(m_from);
			m_from = NULL;
		}

		throw;
	}
}
bool StringCharsetConverter::makeConverter() { if(cv != nullptr) ucnv_close(cv); UErrorCode err = U_ZERO_ERROR; cv = ucnv_open(charset.c_str(), &err); if(U_FAILURE(err)) { cv = nullptr; return false; } ucnv_setSubstString(cv, L"\x3F", -1, &err); // "?" U+003F QUESTION MARK return true; }
/**
 * Convert a string in the character set named by `input_char_set` to UTF-16 using ICU.
 *
 * @param instance        not used by this function
 * @param input           input bytes
 * @param input_len       number of bytes in `input`
 * @param input_char_set  charset name; resolved through encoding_alias_get_canonical_name()
 * @param on_error        what to do with invalid input sequences: fail, skip or substitute
 *                        (unknown values are treated as "fail")
 * @param output_length   receives the number of UTF-16 code units stored in the returned
 *                        buffer, or 0 on failure
 * @return buffer obtained from ppb_memory_mem_alloc(), or NULL on failure
 *
 * A U_BUFFER_OVERFLOW_ERROR from ICU is tolerated: the truncated output is returned and
 * `*output_length` is limited to the buffer capacity.
 */
uint16_t *
ppb_char_set_char_set_to_utf16(PP_Instance instance, const char *input, uint32_t input_len,
                               const char *input_char_set,
                               enum PP_CharSet_ConversionError on_error, uint32_t *output_length)
{
    // largest input_len for which the capacity computed below (in code units) still fits the
    // int32_t arguments of ucnv_toUChars(); the byte size then also fits in uint32_t
    const uint32_t max_input_len = 0x7fffffffu / 2 - 2;
    const UChar    subst = '?';

    // everything the `err` label touches is initialised before the first `goto err`
    uint32_t    output_buffer_length = 0;
    uint32_t    capacity = 0;
    uint16_t   *output = NULL;
    const char *charset = NULL;
    UConverter *u = NULL;
    UErrorCode  st = U_ZERO_ERROR;
    int32_t     len = 0;

    if (input_len > max_input_len) {
        trace_error("%s, input is too long, %u bytes\n", __func__, input_len);
        goto err;
    }

    // each character could be converted into a surrogate pair
    capacity = (input_len + 2) * 2;
    output_buffer_length = capacity * (uint32_t)sizeof(uint16_t);
    output = ppb_memory_mem_alloc(output_buffer_length);
    if (!output) {
        trace_error("%s, can't allocate memory, %u bytes\n", __func__, output_buffer_length);
        goto err;
    }

    charset = encoding_alias_get_canonical_name(input_char_set);
    u = ucnv_open(charset, &st);
    if (!U_SUCCESS(st)) {
        trace_error("%s, wrong charset %s\n", __func__, input_char_set);
        goto err;
    }

    switch (on_error) {
    default:
    case PP_CHARSET_CONVERSIONERROR_FAIL:
        st = U_ZERO_ERROR;
        ucnv_setToUCallBack(u, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &st);
        break;

    case PP_CHARSET_CONVERSIONERROR_SKIP:
        st = U_ZERO_ERROR;
        ucnv_setToUCallBack(u, UCNV_TO_U_CALLBACK_SKIP, NULL, NULL, NULL, &st);
        break;

    case PP_CHARSET_CONVERSIONERROR_SUBSTITUTE:
        st = U_ZERO_ERROR;
        ucnv_setToUCallBack(u, UCNV_TO_U_CALLBACK_SUBSTITUTE, NULL, NULL, NULL, &st);
        // NOTE(review): ICU documents ucnv_setSubstString() as affecting conversion *from*
        // Unicode; it may have no effect on this to-Unicode direction -- confirm
        st = U_ZERO_ERROR;
        ucnv_setSubstString(u, &subst, 1, &st);
        break;
    }

    // callback setup is best effort; a failure there must not abort the conversion itself
    st = U_ZERO_ERROR;
    len = ucnv_toUChars(u, output, (int32_t)capacity, input, (int32_t)input_len, &st);

    if (st != U_BUFFER_OVERFLOW_ERROR && !U_SUCCESS(st))
        goto err;

    // on overflow ICU returns the length it would have needed; never report more code units
    // than the buffer actually holds
    if (len < 0)
        len = 0;
    if ((uint32_t)len > capacity)
        len = (int32_t)capacity;
    *output_length = (uint32_t)len;

    ucnv_close(u);
    return output;

err:
    *output_length = 0;
    ppb_memory_mem_free(output);
    if (u)
        ucnv_close(u);
    return NULL;
}
void charsetConverter_icu::convert (utility::inputStream& in, utility::outputStream& out, status* st) { UErrorCode err = U_ZERO_ERROR; ucnv_reset(m_from); ucnv_reset(m_to); if (st) new (st) status(); // From buffers byte_t cpInBuffer[16]; // stream data put here const size_t outSize = ucnv_getMinCharSize(m_from) * sizeof(cpInBuffer) * sizeof(UChar); std::vector <UChar> uOutBuffer(outSize); // Unicode chars end up here // To buffers // converted (char) data end up here const size_t cpOutBufferSz = ucnv_getMaxCharSize(m_to) * outSize; std::vector <char> cpOutBuffer(cpOutBufferSz); // Tell ICU what to do when encountering an illegal byte sequence if (m_options.silentlyReplaceInvalidSequences) { // Set replacement chars for when converting from Unicode to codepage icu::UnicodeString substString(m_options.invalidSequence.c_str()); ucnv_setSubstString(m_to, substString.getTerminatedBuffer(), -1, &err); if (U_FAILURE(err)) throw exceptions::charset_conv_error("[ICU] Error when setting substitution string."); } else { // Tell ICU top stop (and return an error) on illegal byte sequences ucnv_setToUCallBack (m_from, UCNV_TO_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err); if (U_FAILURE(err)) throw exceptions::charset_conv_error("[ICU] Error when setting ToU callback."); ucnv_setFromUCallBack (m_to, UCNV_FROM_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err); if (U_FAILURE(err)) throw exceptions::charset_conv_error("[ICU] Error when setting FromU callback."); } // Input data available while (!in.eof()) { // Read input data into buffer size_t inLength = in.read(cpInBuffer, sizeof(cpInBuffer)); // Beginning of read data const char* source = reinterpret_cast <const char*>(&cpInBuffer[0]); const char* sourceLimit = source + inLength; // end + 1 UBool flush = in.eof(); // is this last run? 
UErrorCode toErr; // Loop until all source has been processed do { // Set up target pointers UChar* target = &uOutBuffer[0]; UChar* targetLimit = &target[0] + outSize; toErr = U_ZERO_ERROR; ucnv_toUnicode(m_from, &target, targetLimit, &source, sourceLimit, NULL, flush, &toErr); if (st) st->inputBytesRead += (source - reinterpret_cast <const char*>(&cpInBuffer[0])); if (toErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(toErr)) { if (toErr == U_INVALID_CHAR_FOUND || toErr == U_TRUNCATED_CHAR_FOUND || toErr == U_ILLEGAL_CHAR_FOUND) { // Error will be thrown later (*) } else { throw exceptions::charset_conv_error("[ICU] Error converting to Unicode from " + m_source.getName()); } } // The Unicode source is the buffer just written and the limit // is where the previous conversion stopped (target is moved in the conversion) const UChar* uSource = &uOutBuffer[0]; UChar* uSourceLimit = &target[0]; UErrorCode fromErr; // Loop until converted chars are fully written do { char* cpTarget = &cpOutBuffer[0]; const char* cpTargetLimit = &cpOutBuffer[0] + cpOutBufferSz; fromErr = U_ZERO_ERROR; // Write converted bytes (Unicode) to destination codepage ucnv_fromUnicode(m_to, &cpTarget, cpTargetLimit, &uSource, uSourceLimit, NULL, flush, &fromErr); if (st) { // Decrement input bytes count by the number of input bytes in error char errBytes[16]; int8_t errBytesLen = sizeof(errBytes); UErrorCode errBytesErr = U_ZERO_ERROR; ucnv_getInvalidChars(m_from, errBytes, &errBytesLen, &errBytesErr); st->inputBytesRead -= errBytesLen; st->outputBytesWritten += cpTarget - &cpOutBuffer[0]; } // (*) If an error occurred while converting from input charset, throw it now if (toErr == U_INVALID_CHAR_FOUND || toErr == U_TRUNCATED_CHAR_FOUND || toErr == U_ILLEGAL_CHAR_FOUND) { throw exceptions::illegal_byte_sequence_for_charset(); } if (fromErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(fromErr)) { if (fromErr == U_INVALID_CHAR_FOUND || fromErr == U_TRUNCATED_CHAR_FOUND || fromErr == U_ILLEGAL_CHAR_FOUND) { 
throw exceptions::illegal_byte_sequence_for_charset(); } else { throw exceptions::charset_conv_error("[ICU] Error converting from Unicode to " + m_dest.getName()); } } // Write to destination stream out.write(&cpOutBuffer[0], (cpTarget - &cpOutBuffer[0])); } while (fromErr == U_BUFFER_OVERFLOW_ERROR); } while (toErr == U_BUFFER_OVERFLOW_ERROR); } }