// Decodes |length| bytes starting at |bytes| through the ICU converter for this
// codec and returns the accumulated UTF-16 text.
//
// flush       - any value other than DoNotFlush is forwarded to decodeToBuffer()
//               as a "flush" request.
// stopOnError - forwarded to ErrorCallbackSetter for the duration of this call.
// sawError    - set to true when the conversion ends with a failing ICU status.
//               It is never reset to false here, so callers must initialize it.
//
// Returns a null String if the ICU converter could not be created.
String TextCodecICU::decode(const char* bytes, size_t length, FlushBehavior flush, bool stopOnError, bool& sawError)
{
    // Get a converter for the passed-in encoding.
    if (!m_converterICU) {
        createICUConverter();
        ASSERT(m_converterICU);
        if (!m_converterICU) {
            WTF_LOG_ERROR("error creating ICU encoder even though encoding was in table");
            return String();
        }
    }

    ErrorCallbackSetter callbackSetter(m_converterICU, stopOnError);

    StringBuilder result;

    UChar buffer[ConversionBufferSize];
    UChar* bufferLimit = buffer + ConversionBufferSize;
    const char* source = bytes;
    const char* sourceLimit = source + length;
    int32_t* offsets = nullptr;
    UErrorCode err = U_ZERO_ERROR;

    // Convert one buffer-full at a time; a buffer-overflow status means there is
    // more output pending, so keep going until any other status is reported.
    do {
        int ucharsDecoded = decodeToBuffer(buffer, bufferLimit, source, sourceLimit, offsets, flush != DoNotFlush, err);
        result.append(buffer, ucharsDecoded);
    } while (err == U_BUFFER_OVERFLOW_ERROR);

    if (U_FAILURE(err)) {
        // Flush the converter so it can be reused, and not be bothered by this error.
        // The output of these calls is intentionally discarded.
        do {
            decodeToBuffer(buffer, bufferLimit, source, sourceLimit, offsets, true, err);
        } while (source < sourceLimit);
        sawError = true;
    }

#if !defined(USING_SYSTEM_ICU)
    // Chrome's copy of ICU does not have the issue described below.
    return result.toString();
#else
    String resultString = result.toString();

    // <http://bugs.webkit.org/show_bug.cgi?id=17014>
    // Simplified Chinese pages use the code A3A0 to mean "full-width space", but ICU decodes it as U+E5E5.
    // This check used to be nested inside the exact-match "GBK" test, where a
    // comparison against "gb18030" can never succeed, so the replacement was
    // dead code. Apply it to both encodings, as the comment above intends.
    const bool isGBK = !strcmp(m_encoding.name(), "GBK");
    if (isGBK || !strcasecmp(m_encoding.name(), "gb18030"))
        resultString.replace(0xE5E5, ideographicSpaceCharacter);

    if (isGBK) {
        // Make GBK compliant to the encoding spec and align with GB18030
        resultString.replace(0x01F9, 0xE7C8);
        // FIXME: Once https://www.w3.org/Bugs/Public/show_bug.cgi?id=28740#c3
        // is resolved, add U+1E3F => 0xE7C7.
    }

    return resultString;
#endif
}
// Runs |length| bytes from |bytes| through this codec's ICU converter and
// returns the decoded text.
//
// flush       - passed straight through to decodeToBuffer().
// stopOnError - handed to ErrorCallbackSetter, which stays alive until return.
// sawError    - assigned true if conversion finishes with a failing ICU status;
//               left untouched otherwise.
//
// A null String is returned when no ICU converter can be created.
String TextCodecICU::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
{
    // Create the converter on first use.
    if (!m_converterICU) {
        createICUConverter();
        ASSERT(m_converterICU);
        if (!m_converterICU) {
            LOG_ERROR("error creating ICU encoder even though encoding was in table");
            return String();
        }
    }

    ErrorCallbackSetter callbackSetter(m_converterICU, stopOnError);

    StringBuilder decoded;

    UChar chunk[ConversionBufferSize];
    UChar* chunkEnd = chunk + ConversionBufferSize;
    const char* cursor = bytes;
    const char* inputEnd = cursor + length;
    int32_t* offsets = NULL;
    UErrorCode status = U_ZERO_ERROR;

    // Drain the input one chunk at a time. An overflow status only means the
    // chunk filled up, so loop again; any other status ends the conversion.
    for (;;) {
        const int produced = decodeToBuffer(chunk, chunkEnd, cursor, inputEnd, offsets, flush, status);
        decoded.append(chunk, produced);
        if (status != U_BUFFER_OVERFLOW_ERROR)
            break;
    }

    if (U_FAILURE(status)) {
        // Force a flush so the converter can be reused without being affected
        // by this error. Whatever these calls produce is thrown away.
        do {
            decodeToBuffer(chunk, chunkEnd, cursor, inputEnd, offsets, true, status);
        } while (cursor < inputEnd);
        sawError = true;
    }

    String text = decoded.toString();

    // <http://bugs.webkit.org/show_bug.cgi?id=17014>
    // Simplified Chinese pages use the code A3A0 to mean "full-width space", but ICU decodes it as U+E5E5.
    // FIXME: strcasecmp is locale sensitive, we should not be using it.
    const bool needsFullWidthSpaceFix = !strcmp(m_encodingName, "GBK") || !strcasecmp(m_encodingName, "gb18030");
    if (needsFullWidthSpaceFix)
        text.replace(0xE5E5, ideographicSpace);

    return text;
}