CString TextEncoding::encode(const UChar* characters, size_t length, UnencodableHandling handling) const { if (!m_name) return CString(); if (!length) return ""; #if USE(ICU_UNICODE) // FIXME: What's the right place to do normalization? // It's a little strange to do it inside the encode function. // Perhaps normalization should be an explicit step done before calling encode. const UChar* source = characters; size_t sourceLength = length; Vector<UChar> normalizedCharacters; UErrorCode err = U_ZERO_ERROR; if (unorm_quickCheck(source, sourceLength, UNORM_NFC, &err) != UNORM_YES) { // First try using the length of the original string, since normalization to NFC rarely increases length. normalizedCharacters.grow(sourceLength); int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err); if (err == U_BUFFER_OVERFLOW_ERROR) { err = U_ZERO_ERROR; normalizedCharacters.resize(normalizedLength); normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err); } ASSERT(U_SUCCESS(err)); source = normalizedCharacters.data(); sourceLength = normalizedLength; } return newTextCodec(*this)->encode(source, sourceLength, handling); #elif USE(QT4_UNICODE) QString str(reinterpret_cast<const QChar*>(characters), length); str = str.normalized(QString::NormalizationForm_C); return newTextCodec(*this)->encode(reinterpret_cast<const UChar *>(str.utf16()), str.length(), handling); #elif USE(GLIB_UNICODE) GOwnPtr<char> UTF8Source; UTF8Source.set(g_utf16_to_utf8(characters, length, 0, 0, 0)); if (!UTF8Source) { // If conversion to UTF-8 failed, try with the string without normalization return newTextCodec(*this)->encode(characters, length, handling); } GOwnPtr<char> UTF8Normalized; UTF8Normalized.set(g_utf8_normalize(UTF8Source.get(), -1, G_NORMALIZE_NFC)); long UTF16Length; GOwnPtr<UChar> UTF16Normalized; UTF16Normalized.set(g_utf8_to_utf16(UTF8Normalized.get(), -1, 0, &UTF16Length, 
0)); return newTextCodec(*this)->encode(UTF16Normalized.get(), UTF16Length, handling); #elif OS(WINCE) // normalization will be done by Windows CE API OwnPtr<TextCodec> textCodec = newTextCodec(*this); return textCodec.get() ? textCodec->encode(characters, length, handling) : CString(); #endif }
CString TextEncoding::encode(const UChar* characters, size_t length, UnencodableHandling handling) const { if (!m_name) return CString(); if (!length) return ""; #if USE(ICU_UNICODE) // FIXME: What's the right place to do normalization? // It's a little strange to do it inside the encode function. // Perhaps normalization should be an explicit step done before calling encode. const UChar* source = characters; size_t sourceLength = length; Vector<UChar> normalizedCharacters; UErrorCode err = U_ZERO_ERROR; if (unorm_quickCheck(source, sourceLength, UNORM_NFC, &err) != UNORM_YES) { // First try using the length of the original string, since normalization to NFC rarely increases length. normalizedCharacters.grow(sourceLength); int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err); if (err == U_BUFFER_OVERFLOW_ERROR) { err = U_ZERO_ERROR; normalizedCharacters.resize(normalizedLength); normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err); } ASSERT(U_SUCCESS(err)); source = normalizedCharacters.data(); sourceLength = normalizedLength; } return newTextCodec(*this)->encode(source, sourceLength, handling); #elif USE(QT4_UNICODE) QString str(reinterpret_cast<const QChar*>(characters), length); str = str.normalized(QString::NormalizationForm_C); return newTextCodec(*this)->encode(reinterpret_cast<const UChar *>(str.utf16()), str.length(), handling); #elif USE(CASQT_UNICODE) // FIXME:CASQT #if 0 // 暂不执行normalized,该函数效率太低 QString str(reinterpret_cast<const ushort*>(characters), length); str = str.normalized(QString::NormalizationForm_C); return newTextCodec(*this)->encode(reinterpret_cast<const UChar *>(str.utf16()), str.length(), handling); #else return newTextCodec(*this)->encode(characters, length, handling); #endif #endif }
// Decodes |length| bytes at |data| with a freshly created codec for this
// encoding. The codec's third argument is always passed as true here;
// |stopOnError| and |sawError| are forwarded as given.
//
// Returns a default-constructed String when the encoding has no name.
String TextEncoding::decode(const char* data, size_t length, bool stopOnError, bool& sawError) const
{
    return !m_name
        ? String()
        : newTextCodec(*this)->decode(data, length, true, stopOnError, sawError);
}
// Checks how the windows-1252 codec substitutes question marks when asked to
// encode with WTF::QuestionMarksForUnencodables.
TEST(TextCodecLatin1Test, QuestionMarksAndSurrogates)
{
    WTF::TextEncoding encoding("windows-1252");
    std::unique_ptr<WTF::TextCodec> codec(newTextCodec(encoding));

    // 8-bit input: the first two code units pass through, 0x86 becomes '?'.
    {
        const LChar latin1Input[] = {0xd1, 0x16, 0x86};
        CString encoded = codec->encode(latin1Input, WTF_ARRAY_LENGTH(latin1Input), WTF::QuestionMarksForUnencodables);
        EXPECT_STREQ("\xd1\x16?", encoded.data());
    }

    // One surrogate pair is expected to yield exactly one '?'.
    {
        const UChar singlePair[] = {0xd9f0, 0xdcd9};
        CString encoded = codec->encode(singlePair, WTF_ARRAY_LENGTH(singlePair), WTF::QuestionMarksForUnencodables);
        EXPECT_STREQ("?", encoded.data());
    }

    // Two surrogate pairs are expected to yield two '?' characters.
    {
        const UChar twoPairs[] = {0xd9f0, 0xdcd9, 0xd9f0, 0xdcd9};
        CString encoded = codec->encode(twoPairs, WTF_ARRAY_LENGTH(twoPairs), WTF::QuestionMarksForUnencodables);
        EXPECT_STREQ("??", encoded.data());
    }
}
// Constructs a decoder for |encoding|. A codec for that encoding is created
// immediately via newTextCodec(); |fatal| and |ignoreBOM| are stored as given,
// and the BOM-seen flag starts out false.
TextDecoder::TextDecoder(const WTF::TextEncoding& encoding, bool fatal, bool ignoreBOM)
    : m_encoding(encoding)
    , m_codec(newTextCodec(encoding))
    , m_fatal(fatal)
    , m_ignoreBOM(ignoreBOM)
    , m_bomSeen(false)
{
}
CString TextEncoding::encode(const UChar* characters, size_t length, UnencodableHandling handling) const { if (!m_name) return CString(); if (!length) return ""; #if USE(ICU_UNICODE) // FIXME: What's the right place to do normalization? // It's a little strange to do it inside the encode function. // Perhaps normalization should be an explicit step done before calling encode. const UChar* source = characters; size_t sourceLength = length; Vector<UChar> normalizedCharacters; UErrorCode err = U_ZERO_ERROR; if (unorm_quickCheck(source, sourceLength, UNORM_NFC, &err) != UNORM_YES) { // First try using the length of the original string, since normalization to NFC rarely increases length. normalizedCharacters.grow(sourceLength); int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err); if (err == U_BUFFER_OVERFLOW_ERROR) { err = U_ZERO_ERROR; normalizedCharacters.resize(normalizedLength); normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err); } ASSERT(U_SUCCESS(err)); source = normalizedCharacters.data(); sourceLength = normalizedLength; } return newTextCodec(*this)->encode(source, sourceLength, handling); #elif USE(JAVA_UNICODE) String normalized = TextNormalizer::normalize( characters, length, TextNormalizer::NFC); return newTextCodec(*this)->encode( normalized.characters(), normalized.length(), handling); #elif OS(WINDOWS) && USE(WCHAR_UNICODE) // normalization will be done by Windows CE API OwnPtr<TextCodec> textCodec = newTextCodec(*this); return textCodec.get() ? textCodec->encode(characters, length, handling) : CString(); #endif }
// Finishes decoding: asks the codec to decode an empty input with
// WTF::FetchEOF, then releases the codec. A codec is created on demand if
// none exists yet. Errors are recorded in m_sawError.
String TextResourceDecoder::flush()
{
    if (!m_codec)
        m_codec = newTextCodec(encoding());

    // No new bytes are supplied; only the end-of-input flush is requested.
    String flushedText = m_codec->decode(0, 0, WTF::FetchEOF, false, m_sawError);

    m_codec.clear();
    return flushedText;
}
// Normalizes |string| to NFC (when it is not already) and encodes it with a
// codec for this encoding. |handling| is forwarded to the codec unchanged.
//
// Returns a default-constructed CString when the encoding has no name and ""
// for an empty string.
CString TextEncoding::normalizeAndEncode(const String& string, UnencodableHandling handling) const
{
    if (!m_name)
        return CString();

    if (string.isEmpty())
        return "";

    // Text exclusively containing Latin-1 characters (U+0000..U+00FF) is left
    // unaffected by NFC. This is effectively the same as saying that all
    // Latin-1 text is already normalized to NFC.
    // Source: http://unicode.org/reports/tr15/
    if (string.is8Bit())
        return newTextCodec(*this)->encode(string.characters8(), string.length(), handling);

    const UChar* text = string.characters16();
    size_t textLength = string.length();
    Vector<UChar> nfcStorage;

    UErrorCode status = U_ZERO_ERROR;
    const bool alreadyNFC = unorm_quickCheck(text, textLength, UNORM_NFC, &status) == UNORM_YES;
    if (!alreadyNFC) {
        // Try an output buffer the size of the input first, since
        // normalization to NFC rarely increases length; on overflow, resize
        // to the length ICU reported and normalize again.
        nfcStorage.grow(textLength);
        int32_t nfcLength = unorm_normalize(text, textLength, UNORM_NFC, 0, nfcStorage.data(), textLength, &status);
        if (status == U_BUFFER_OVERFLOW_ERROR) {
            status = U_ZERO_ERROR;
            nfcStorage.resize(nfcLength);
            nfcLength = unorm_normalize(text, textLength, UNORM_NFC, 0, nfcStorage.data(), nfcLength, &status);
        }
        ASSERT(U_SUCCESS(status));

        text = nfcStorage.data();
        textLength = nfcLength;
    }

    return newTextCodec(*this)->encode(text, textLength, handling);
}
// Encodes |string| with a codec for this encoding, without normalizing it.
// The 8-bit or 16-bit codec entry point is chosen to match how the string is
// stored. |handling| is forwarded to the codec unchanged.
//
// Returns a default-constructed CString when the encoding has no name and ""
// for an empty string.
CString TextEncoding::encode(const String& string, UnencodableHandling handling) const
{
    if (!m_name)
        return CString();

    if (string.isEmpty())
        return "";

    OwnPtr<TextCodec> codec = newTextCodec(*this);

    if (string.is8Bit())
        return codec->encode(string.characters8(), string.length(), handling);

    return codec->encode(string.characters16(), string.length(), handling);
}
// Looks for a byte order mark at the start of the stream and picks the codec
// accordingly, then decodes whatever input is available.
//
// The first bytes of the stream are taken from m_bufferedBytes (bytes held
// back by earlier calls) followed by |data|. A UTF-16LE, UTF-16BE or UTF-8
// BOM overrides m_encoding. If no BOM has been recognised, everything seen so
// far still fits in m_bufferedBytes and |flush| is false, the new bytes are
// buffered and "" is returned so a later call can decide.
//
// Returns the decoded text, or a default-constructed String if no codec could
// be created.
String TextDecoder::checkForBOM(const char* data, size_t length, bool flush)
{
    // Check to see if we found a BOM.
    size_t numBufferedBytes = m_numBufferedBytes;
    size_t buf1Len = numBufferedBytes;
    size_t buf2Len = length;
    const unsigned char* buf1 = m_bufferedBytes;
    const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data);

    // Each of c1..c3 is the next byte of the logical stream: buffered bytes
    // first, then the new data, or 0 when the stream is too short.
    unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
    unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
    // The third byte may also still be sitting in the buffer (three non-BOM
    // bytes can be buffered). Reading it only from |data| would compare the
    // fourth stream byte against the BOM and could falsely report UTF-8.
    unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;

    const TextEncoding* encodingConsideringBOM = &m_encoding;
    if (c1 == 0xFF && c2 == 0xFE)
        encodingConsideringBOM = &UTF16LittleEndianEncoding();
    else if (c1 == 0xFE && c2 == 0xFF)
        encodingConsideringBOM = &UTF16BigEndianEncoding();
    else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF)
        encodingConsideringBOM = &UTF8Encoding();
    else if (numBufferedBytes + length <= sizeof(m_bufferedBytes) && !flush) {
        // Continue to look for the BOM.
        memcpy(&m_bufferedBytes[numBufferedBytes], data, length);
        m_numBufferedBytes += length;
        return "";
    }

    // Done checking for BOM.
    m_codec.set(newTextCodec(*encodingConsideringBOM).release());
    if (!m_codec)
        return String();
    m_checkedForBOM = true;

    // Handle case where we have some buffered bytes to deal with.
    if (numBufferedBytes) {
        char bufferedBytes[sizeof(m_bufferedBytes)];
        memcpy(bufferedBytes, m_bufferedBytes, numBufferedBytes);
        m_numBufferedBytes = 0;

        // Decode the buffered bytes in a separate statement: the operands of
        // operator+ are evaluated in an unspecified order, and the codec must
        // see the buffered bytes before the new data.
        String bufferedResult = m_codec->decode(bufferedBytes, numBufferedBytes, false);
        return bufferedResult + m_codec->decode(data, length, flush);
    }

    return m_codec->decode(data, length, flush);
}
// Encodes |text| into this encoding after normalizing it to NFC (when it is
// not already). |handling| is forwarded to the codec unchanged.
//
// Returns a default-constructed CString when the encoding has no name and ""
// for empty text.
CString TextEncoding::encode(StringView text, UnencodableHandling handling) const
{
    if (!m_name)
        return CString();

    if (text.isEmpty())
        return "";

    // FIXME: What's the right place to do normalization?
    // It's a little strange to do it inside the encode function.
    // Perhaps normalization should be an explicit step done before calling encode.

    // Keep the upconverted characters in a named local for the rest of the
    // function; |input| points into them unless normalization replaces it.
    auto utf16Text = text.upconvertedCharacters();

    const UChar* input = utf16Text;
    size_t inputLength = text.length();
    Vector<UChar> nfcBuffer;

    UErrorCode status = U_ZERO_ERROR;
    const bool alreadyNFC = unorm_quickCheck(input, inputLength, UNORM_NFC, &status) == UNORM_YES;
    if (!alreadyNFC) {
        // Use the input length as the initial output capacity, since
        // normalization to NFC rarely increases length; retry with the
        // length ICU reports if that overflows.
        nfcBuffer.grow(inputLength);
        int32_t nfcLength = unorm_normalize(input, inputLength, UNORM_NFC, 0, nfcBuffer.data(), inputLength, &status);
        if (status == U_BUFFER_OVERFLOW_ERROR) {
            status = U_ZERO_ERROR;
            nfcBuffer.resize(nfcLength);
            nfcLength = unorm_normalize(input, inputLength, UNORM_NFC, 0, nfcBuffer.data(), nfcLength, &status);
        }
        ASSERT(U_SUCCESS(status));

        input = nfcBuffer.data();
        inputLength = nfcLength;
    }

    return newTextCodec(*this)->encode(input, inputLength, handling);
}
// Looks for a byte order mark at the start of the stream, picks the codec
// accordingly, skips the BOM, and decodes whatever input is available.
//
// The first bytes of the stream are taken from m_bufferedBytes (bytes held
// back by earlier calls) followed by |data|. UTF-8, UTF-16 (LE/BE) and UTF-32
// (LE/BE) BOMs override m_encoding. If no BOM has been recognised, everything
// seen so far still fits in m_bufferedBytes and |flush| is false, the new
// bytes are buffered and "" is returned so a later call can decide.
//
// |stopOnError| and |sawError| are forwarded to the codec; when decoding the
// buffered bytes already failed and |stopOnError| is set, the new data is not
// decoded. Returns the decoded text, or a default-constructed String if no
// codec could be created.
String TextDecoder::checkForBOM(const char* data, size_t length, bool flush, bool stopOnError, bool& sawError)
{
    ASSERT(!m_checkedForBOM);

    // Check to see if we found a BOM.
    size_t numBufferedBytes = m_numBufferedBytes;
    size_t buf1Len = numBufferedBytes;
    size_t buf2Len = length;
    const unsigned char* buf1 = m_bufferedBytes;
    const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data);

    // Each of c1..c4 is the next byte of the logical stream: buffered bytes
    // first, then the new data, or 0 when the stream is too short.
    unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
    unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
    unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
    unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0;

    const TextEncoding* encodingConsideringBOM = &m_encoding;
    bool foundBOM = true;
    size_t lengthOfBOM = 0;
    if (c1 == 0xFF && c2 == 0xFE) {
        // FF FE starts both the UTF-16LE and the UTF-32LE BOM; the next two
        // bytes tell them apart.
        if (c3 != 0 || c4 != 0) {
            encodingConsideringBOM = &UTF16LittleEndianEncoding();
            lengthOfBOM = 2;
        } else if (numBufferedBytes + length > sizeof(m_bufferedBytes)) {
            encodingConsideringBOM = &UTF32LittleEndianEncoding();
            lengthOfBOM = 4;
        } else
            foundBOM = false;
    } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
        encodingConsideringBOM = &UTF8Encoding();
        lengthOfBOM = 3;
    } else if (c1 == 0xFE && c2 == 0xFF) {
        encodingConsideringBOM = &UTF16BigEndianEncoding();
        lengthOfBOM = 2;
    } else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF) {
        encodingConsideringBOM = &UTF32BigEndianEncoding();
        lengthOfBOM = 4;
    } else
        foundBOM = false;

    if (!foundBOM && numBufferedBytes + length <= sizeof(m_bufferedBytes) && !flush) {
        // Continue to look for the BOM.
        memcpy(&m_bufferedBytes[numBufferedBytes], data, length);
        m_numBufferedBytes += length;
        return "";
    }

    // Done checking for BOM.
    m_codec.set(newTextCodec(*encodingConsideringBOM).release());
    if (!m_codec)
        return String();
    m_checkedForBOM = true;

    // Skip the BOM.
    size_t bufferedPayloadOffset = 0;
    if (foundBOM) {
        if (numBufferedBytes < lengthOfBOM) {
            // The BOM straddles the buffer and |data| (or lies entirely in
            // |data|): drop the buffered part and step over the rest.
            size_t numUnbufferedBOMBytes = lengthOfBOM - numBufferedBytes;
            ASSERT(numUnbufferedBOMBytes <= length);

            data += numUnbufferedBOMBytes;
            length -= numUnbufferedBOMBytes;
            numBufferedBytes = 0;
        } else {
            // The whole BOM was already buffered (e.g. FF FE or FF FE 00 was
            // held back until a later byte ruled out UTF-32LE). Subtracting
            // here would underflow and move |data| backwards, so keep |data|
            // untouched and decode only the buffered bytes after the BOM.
            bufferedPayloadOffset = lengthOfBOM;
            numBufferedBytes -= lengthOfBOM;
        }
        m_numBufferedBytes = 0;
    }

    // Handle case where we have some buffered bytes to deal with.
    if (numBufferedBytes) {
        char bufferedBytes[sizeof(m_bufferedBytes)];
        memcpy(bufferedBytes, m_bufferedBytes + bufferedPayloadOffset, numBufferedBytes);
        m_numBufferedBytes = 0;

        String bufferedResult = m_codec->decode(bufferedBytes, numBufferedBytes, false, stopOnError, sawError);
        if (stopOnError && sawError)
            return bufferedResult;

        return bufferedResult + m_codec->decode(data, length, flush, stopOnError, sawError);
    }

    return m_codec->decode(data, length, flush, stopOnError, sawError);
}
// Constructs a parser that reports to |client|. Both the current id and the
// last event id start out as |lastEventId|, and a UTF-8 codec is created up
// front via newTextCodec().
EventSourceParser::EventSourceParser(const AtomicString& lastEventId, Client* client)
    : m_id(lastEventId),
      m_lastEventId(lastEventId),
      m_client(client),
      m_codec(newTextCodec(UTF8Encoding())) {}
// Decodes |len| bytes at |data| with WTF::DoNotFlush. A codec for the current
// encoding is created on demand if none exists yet. Errors are recorded in
// m_sawError.
String TextResourceDecoder::decode(const char* data, size_t len)
{
    if (!m_codec)
        m_codec = newTextCodec(encoding());

    String decodedText = m_codec->decode(data, len, WTF::DoNotFlush, false, m_sawError);
    return decodedText;
}