// Reports whether an ICU converter is available for this decoder.
// If no converter exists yet, createICUConverter() is called first; the
// result is the truth value of m_converterICU afterwards.
bool StreamingTextDecoderICU::textEncodingSupported()
{
    const bool converterMissing = !m_converterICU;
    if (converterMissing)
        createICUConverter();

    return m_converterICU;
}
// Decodes |length| bytes starting at |bytes| into a String using the ICU
// converter for m_encoding, creating the converter first if necessary.
//
//   bytes       - encoded input.
//   length      - number of input bytes.
//   flush       - any value other than DoNotFlush asks the converter to flush.
//   stopOnError - forwarded to ErrorCallbackSetter for the duration of the call.
//   sawError    - set to true when the conversion ends with an ICU failure;
//                 left untouched otherwise.
//
// Returns the decoded text (whatever was decoded before a failure is still
// returned), or a default-constructed String when no converter could be
// created.
String TextCodecICU::decode(const char* bytes, size_t length, FlushBehavior flush, bool stopOnError, bool& sawError)
{
    // Get a converter for the passed-in encoding.
    if (!m_converterICU) {
        createICUConverter();
        ASSERT(m_converterICU);
        if (!m_converterICU) {
            WTF_LOG_ERROR("error creating ICU encoder even though encoding was in table");
            return String();
        }
    }

    ErrorCallbackSetter callbackSetter(m_converterICU, stopOnError);

    StringBuilder result;

    UChar buffer[ConversionBufferSize];
    UChar* bufferLimit = buffer + ConversionBufferSize;
    const char* source = bytes;
    const char* sourceLimit = source + length;
    int32_t* offsets = nullptr;
    UErrorCode err = U_ZERO_ERROR;

    // Decode one buffer-full at a time until the converter stops reporting
    // that the output buffer overflowed.
    do {
        int ucharsDecoded = decodeToBuffer(buffer, bufferLimit, source, sourceLimit, offsets, flush != DoNotFlush, err);
        result.append(buffer, ucharsDecoded);
    } while (err == U_BUFFER_OVERFLOW_ERROR);

    if (U_FAILURE(err)) {
        // flush the converter so it can be reused, and not be bothered by this error.
        do {
            decodeToBuffer(buffer, bufferLimit, source, sourceLimit, offsets, true, err);
        } while (source < sourceLimit);
        sawError = true;
    }

#if !defined(USING_SYSTEM_ICU)
    // Chrome's copy of ICU does not have the issue described below.
    return result.toString();
#else
    String resultString = result.toString();

    // <http://bugs.webkit.org/show_bug.cgi?id=17014>
    // Simplified Chinese pages use the code A3A0 to mean "full-width space", but ICU decodes it as U+E5E5.
    // This check used to be nested inside the GBK-only branch below, where the
    // gb18030 comparison could never succeed, so the replacement never ran.
    if (!strcmp(m_encoding.name(), "GBK") || !strcasecmp(m_encoding.name(), "gb18030"))
        resultString.replace(0xE5E5, ideographicSpaceCharacter);

    if (!strcmp(m_encoding.name(), "GBK")) {
        // Make GBK compliant to the encoding spec and align with GB18030
        resultString.replace(0x01F9, 0xE7C8);
        // FIXME: Once https://www.w3.org/Bugs/Public/show_bug.cgi?id=28740#c3
        // is resolved, add U+1E3F => 0xE7C7.
    }

    return resultString;
#endif
}
// Encodes |length| UTF-16 code units starting at |characters| with the ICU
// converter, creating the converter first if necessary.
//
// |handling| selects the ICU from-Unicode callback used for characters the
// target encoding cannot represent; the GBK-specific callbacks are chosen
// when m_needsGBKFallbacks is set.
//
// Returns "" for empty input, a default-constructed CString when the
// converter cannot be created or the callback cannot be installed, and the
// encoded bytes otherwise.
CString TextCodecICU::encode(const UChar* characters, size_t length, UnencodableHandling handling)
{
    if (!length)
        return "";

    if (!m_converterICU) {
        createICUConverter();
        if (!m_converterICU)
            return CString();
    }

    // FIXME: We should see if there is "force ASCII range" mode in ICU;
    // until then, we change the backslash into a yen sign.
    // Encoding will change the yen sign back into a backslash.
    String text(characters, length);
    text.replace('\\', m_encoding.backslashAsCurrencySymbol());

    const UChar* input = text.characters();
    const UChar* inputEnd = input + text.length();

    UErrorCode status = U_ZERO_ERROR;

    // Install the callback that matches the requested unencodable handling.
    if (handling == QuestionMarksForUnencodables) {
        ucnv_setSubstChars(m_converterICU, "?", 1, &status);
        ucnv_setFromUCallBack(m_converterICU, m_needsGBKFallbacks ? gbkCallbackSubstitute : UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0, 0, 0, &status);
    } else if (handling == EntitiesForUnencodables) {
        ucnv_setFromUCallBack(m_converterICU, m_needsGBKFallbacks ? gbkCallbackEscape : UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, 0, 0, &status);
    } else if (handling == URLEncodedEntitiesForUnencodables) {
        ucnv_setFromUCallBack(m_converterICU, m_needsGBKFallbacks ? gbkUrlEscapedEntityCallack : urlEscapedEntityCallback, 0, 0, 0, &status);
    }

    ASSERT(U_SUCCESS(status));
    if (U_FAILURE(status))
        return CString();

    // Convert chunk by chunk, appending each chunk's bytes to |encoded|,
    // for as long as ICU reports that the chunk buffer overflowed.
    Vector<char> encoded;
    size_t total = 0;
    char chunk[ConversionBufferSize];
    do {
        char* out = chunk;
        status = U_ZERO_ERROR;
        ucnv_fromUnicode(m_converterICU, &out, chunk + ConversionBufferSize, &input, inputEnd, 0, true, &status);

        const size_t produced = out - chunk;
        encoded.grow(total + produced);
        memcpy(encoded.data() + total, chunk, produced);
        total += produced;
    } while (status == U_BUFFER_OVERFLOW_ERROR);

    return CString(encoded.data(), total);
}
// Shared entry point for encoding |length| characters of type CharType.
// Returns "" for empty input and a default-constructed CString when no ICU
// converter can be created; otherwise wraps the characters in a
// TextCodecInput and hands them to encodeInternal().
CString TextCodecICU::encodeCommon(const CharType* characters, size_t length, UnencodableHandling handling)
{
    if (!length)
        return "";

    if (!m_converterICU) {
        createICUConverter();
        if (!m_converterICU)
            return CString();
    }

    TextCodecInput codecInput(m_encoding, characters, length);
    return encodeInternal(codecInput, handling);
}
// Decodes |length| bytes starting at |bytes| into a String using the ICU
// converter, creating the converter first if necessary.
//
//   flush       - passed through to decodeToBuffer() for the main conversion.
//   stopOnError - forwarded to ErrorCallbackSetter for the duration of the call.
//   sawError    - set to true when the conversion ends with an ICU failure;
//                 left untouched otherwise.
//
// Returns the decoded text, or a default-constructed String when no
// converter could be created.
String TextCodecICU::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
{
    // Get a converter for the passed-in encoding.
    if (!m_converterICU) {
        createICUConverter();
        ASSERT(m_converterICU);
        if (!m_converterICU) {
            LOG_ERROR("error creating ICU encoder even though encoding was in table");
            return String();
        }
    }

    ErrorCallbackSetter callbackSetter(m_converterICU, stopOnError);

    StringBuilder decoded;

    UChar chunk[ConversionBufferSize];
    UChar* chunkEnd = chunk + ConversionBufferSize;
    const char* cursor = reinterpret_cast<const char*>(bytes);
    const char* inputEnd = cursor + length;
    int32_t* offsets = NULL;
    UErrorCode status = U_ZERO_ERROR;

    // Keep decoding chunk-sized pieces while the converter reports that the
    // chunk buffer overflowed.
    for (;;) {
        int decodedCount = decodeToBuffer(chunk, chunkEnd, cursor, inputEnd, offsets, flush, status);
        decoded.append(chunk, decodedCount);
        if (status != U_BUFFER_OVERFLOW_ERROR)
            break;
    }

    if (U_FAILURE(status)) {
        // flush the converter so it can be reused, and not be bothered by this error.
        do {
            decodeToBuffer(chunk, chunkEnd, cursor, inputEnd, offsets, true, status);
        } while (cursor < inputEnd);
        sawError = true;
    }

    String resultString = decoded.toString();

    // <http://bugs.webkit.org/show_bug.cgi?id=17014>
    // Simplified Chinese pages use the code A3A0 to mean "full-width space", but ICU decodes it as U+E5E5.
    // FIXME: strcasecmp is locale sensitive, we should not be using it.
    if (!strcmp(m_encodingName, "GBK") || !strcasecmp(m_encodingName, "gb18030"))
        resultString.replace(0xE5E5, ideographicSpace);

    return resultString;
}
// Decodes |len| bytes starting at |chs| with the ICU converter, creating the
// converter first if necessary.
//
// |flush| is passed to ucnv_toUnicode() for the main conversion. Each
// decoded chunk is added to the result through appendOmittingBOM().
//
// Returns the decoded string, or a default-constructed DeprecatedString when
// no converter could be created or ICU reported a conversion failure.
DeprecatedString StreamingTextDecoderICU::convertUsingICU(const unsigned char* chs, int len, bool flush)
{
    // Get a converter for the passed-in encoding.
    if (!m_converterICU)
        createICUConverter();
    if (!m_converterICU)
        return DeprecatedString();

    DeprecatedString decoded("");
    decoded.reserve(len);

    UChar chunk[ConversionBufferSize];
    const UChar* const chunkEnd = chunk + ConversionBufferSize;
    const char* cursor = reinterpret_cast<const char*>(chs);
    const char* const inputEnd = cursor + len;
    int32_t* offsets = NULL;
    UErrorCode status;

    // Convert chunk by chunk while ICU reports that the chunk buffer overflowed.
    for (;;) {
        UChar* out = chunk;
        status = U_ZERO_ERROR;
        ucnv_toUnicode(m_converterICU, &out, chunkEnd, &cursor, inputEnd, offsets, flush, &status);

        int produced = out - chunk;
        // The third argument is a byte count, hence the sizeof(UChar) scaling.
        appendOmittingBOM(decoded, reinterpret_cast<const UChar*>(chunk), produced * sizeof(UChar));

        if (status != U_BUFFER_OVERFLOW_ERROR)
            break;
    }

    if (U_SUCCESS(status))
        return decoded;

    // flush the converter so it can be reused, and not be bothered by this error.
    do {
        UChar* out = chunk;
        status = U_ZERO_ERROR;
        ucnv_toUnicode(m_converterICU, &out, chunkEnd, &cursor, inputEnd, offsets, true, &status);
    } while (cursor < inputEnd);

    LOG_ERROR("ICU conversion error");
    return DeprecatedString();
}
// Encodes |qcs| into the byte encoding described by m_encoding.
//
//   qcs           - the text to encode.
//   allowEntities - when true, unencodable characters are emitted through
//                   ICU's escape callback with UCNV_ESCAPE_XML_DEC; when
//                   false, they are substituted with "?".
//
// Returns the encoded bytes. A default-constructed DeprecatedCString is
// returned when the ICU converter cannot be created or when normalization /
// callback setup leaves an ICU failure code.
DeprecatedCString StreamingTextDecoderICU::fromUnicode(const DeprecatedString &qcs, bool allowEntities)
{
    TextEncodingID encoding = m_encoding.effectiveEncoding().encodingID();

    // Fast paths that need no ICU conversion at all.
    if (encoding == WinLatin1Encoding && qcs.isAllLatin1())
        return qcs.latin1();

    if ((encoding == WinLatin1Encoding || encoding == UTF8Encoding || encoding == ASCIIEncoding)
        && qcs.isAllASCII())
        return qcs.ascii();

    // FIXME: We should see if there is "force ASCII range" mode in ICU;
    // until then, we change the backslash into a yen sign.
    // Encoding will change the yen sign back into a backslash.
    DeprecatedString copy = qcs;
    copy.replace('\\', m_encoding.backslashAsCurrencySymbol());

    if (!m_converterICU)
        createICUConverter();
    if (!m_converterICU)
        return DeprecatedCString();

    // FIXME: when DeprecatedString buffer is latin1, it would be nice to
    // convert from that w/o having to allocate a unicode buffer

    char buffer[ConversionBufferSize];
    const UChar* source = reinterpret_cast<const UChar*>(copy.unicode());
    const UChar* sourceLimit = source + copy.length();

    UErrorCode err = U_ZERO_ERROR;
    DeprecatedString normalizedString;
    if (UNORM_YES != unorm_quickCheck(source, copy.length(), UNORM_NFC, &err)) {
        normalizedString.truncate(copy.length()); // normalization to NFC rarely increases the length, so this first attempt will usually succeed

        int32_t normalizedLength = unorm_normalize(source, copy.length(), UNORM_NFC, 0, reinterpret_cast<UChar*>(const_cast<DeprecatedChar*>(normalizedString.unicode())), copy.length(), &err);
        if (err == U_BUFFER_OVERFLOW_ERROR) {
            // The first attempt reported the required length; retry with a
            // destination of exactly that size.
            err = U_ZERO_ERROR;
            normalizedString.truncate(normalizedLength);
            normalizedLength = unorm_normalize(source, copy.length(), UNORM_NFC, 0, reinterpret_cast<UChar*>(const_cast<DeprecatedChar*>(normalizedString.unicode())), normalizedLength, &err);
        }

        source = reinterpret_cast<const UChar*>(normalizedString.unicode());
        sourceLimit = source + normalizedLength;
    }

    DeprecatedCString result(1); // for trailing zero

    if (allowEntities)
        ucnv_setFromUCallBack(m_converterICU, UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, 0, 0, &err);
    else {
        ucnv_setSubstChars(m_converterICU, "?", 1, &err);
        ucnv_setFromUCallBack(m_converterICU, UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0, 0, 0, &err);
    }

    ASSERT(U_SUCCESS(err));
    if (U_FAILURE(err))
        return DeprecatedCString();

    do {
        char* target = buffer;
        // Reserve the last byte of |buffer| for the terminator written below.
        // With the limit at the very end of the buffer, a completely filled
        // buffer (the U_BUFFER_OVERFLOW_ERROR case that keeps this loop
        // running) made buffer[count] land one byte past the array.
        char* targetLimit = buffer + ConversionBufferSize - 1;
        err = U_ZERO_ERROR;
        ucnv_fromUnicode(m_converterICU, &target, targetLimit, &source, sourceLimit, 0, true, &err);
        int count = target - buffer;
        buffer[count] = 0;
        // NOTE(review): append() receives only a pointer here, so it presumably
        // stops at the first zero byte; encoded output that itself contains
        // zero bytes would be cut short. Confirm against DeprecatedCString.
        result.append(buffer);
    } while (err == U_BUFFER_OVERFLOW_ERROR);

    return result;
}