CString TextEncoding::encode(const UChar* characters, size_t length, UnencodableHandling handling) const
{
    if (!m_name)
        return CString();

    if (!length)
        return "";

#if USE(ICU_UNICODE)
    // FIXME: What's the right place to do normalization?
    // It's a little strange to do it inside the encode function.
    // Perhaps normalization should be an explicit step done before calling encode.

    const UChar* source = characters;
    size_t sourceLength = length;

    Vector<UChar> normalizedCharacters;

    UErrorCode err = U_ZERO_ERROR;
    if (unorm_quickCheck(source, sourceLength, UNORM_NFC, &err) != UNORM_YES) {
        // First try using the length of the original string, since normalization to NFC rarely increases length.
        normalizedCharacters.grow(sourceLength);
        int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err);
        if (err == U_BUFFER_OVERFLOW_ERROR) {
            err = U_ZERO_ERROR;
            normalizedCharacters.resize(normalizedLength);
            normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err);
        }
        ASSERT(U_SUCCESS(err));

        source = normalizedCharacters.data();
        sourceLength = normalizedLength;
    }

    return newTextCodec(*this)->encode(source, sourceLength, handling);
#elif USE(QT4_UNICODE)
    QString str(reinterpret_cast<const QChar*>(characters), length);
    str = str.normalized(QString::NormalizationForm_C);
    return newTextCodec(*this)->encode(reinterpret_cast<const UChar*>(str.utf16()), str.length(), handling);
#elif USE(GLIB_UNICODE)
    GOwnPtr<char> UTF8Source;
    UTF8Source.set(g_utf16_to_utf8(characters, length, 0, 0, 0));
    if (!UTF8Source) {
        // If conversion to UTF-8 failed, try with the string without normalization.
        return newTextCodec(*this)->encode(characters, length, handling);
    }

    GOwnPtr<char> UTF8Normalized;
    UTF8Normalized.set(g_utf8_normalize(UTF8Source.get(), -1, G_NORMALIZE_NFC));

    long UTF16Length;
    GOwnPtr<UChar> UTF16Normalized;
    UTF16Normalized.set(g_utf8_to_utf16(UTF8Normalized.get(), -1, 0, &UTF16Length, 0));

    return newTextCodec(*this)->encode(UTF16Normalized.get(), UTF16Length, handling);
#elif OS(WINCE)
    // Normalization will be done by the Windows CE API.
    OwnPtr<TextCodec> textCodec = newTextCodec(*this);
    return textCodec.get() ? textCodec->encode(characters, length, handling) : CString();
#endif
}
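All of the ICU variants in this collection share the same quick-check-then-normalize shape: ask unorm_quickCheck whether the text is already NFC, and only run unorm_normalize (retrying once on buffer overflow) when it is not. Below is a minimal standalone sketch of that pattern, assuming ICU's deprecated but still shipped unorm.h C API; the helper name normalizeToNFC and the use of std::vector instead of WTF::Vector are illustrative choices, not taken from WebKit.

// Minimal standalone sketch of the quick-check-then-normalize pattern.
// normalizeToNFC is a hypothetical helper name, not a WebKit or ICU symbol.
#include <unicode/unorm.h>
#include <unicode/utypes.h>
#include <vector>

// Returns the NFC form of `source`; returns a copy of the input when the
// quick check already reports UNORM_YES. Returns an empty vector on error.
std::vector<UChar> normalizeToNFC(const UChar* source, int32_t length)
{
    UErrorCode err = U_ZERO_ERROR;
    if (unorm_quickCheck(source, length, UNORM_NFC, &err) == UNORM_YES)
        return std::vector<UChar>(source, source + length);

    // First pass: guess that the NFC form is no longer than the input,
    // which is almost always true.
    std::vector<UChar> normalized(length);
    err = U_ZERO_ERROR;
    int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0,
        normalized.data(), length, &err);
    if (err == U_BUFFER_OVERFLOW_ERROR) {
        // Rare case: NFC grew the string; retry with the exact size ICU reported.
        err = U_ZERO_ERROR;
        normalized.resize(normalizedLength);
        normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0,
            normalized.data(), normalizedLength, &err);
    }

    // Trim to the actual normalized length, or empty the buffer on failure.
    normalized.resize(U_SUCCESS(err) ? normalizedLength : 0);
    return normalized;
}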
UNormalizationCheckResult __hs_unorm_quickCheck(const UChar *source, int32_t sourcelength, UNormalizationMode mode, UErrorCode *status)
{
    return unorm_quickCheck(source, sourcelength, mode, status);
}
virtual void call(UErrorCode* pErrorCode)
{
    UErrorCode errorCode = U_ZERO_ERROR;
    qcResult = unorm_quickCheck(testcase.getBuffer(), testcase.getBufferLen(), UNORM_FCD_ALWAYS_GET, &errorCode);
    if (U_FAILURE(errorCode)) {
        fprintf(stderr, "error: unorm_quickCheck(UNORM_FCD) failed: %s\n", u_errorName(errorCode));
    }
}
CString TextEncoding::encode(const UChar* characters, size_t length, UnencodableHandling handling) const
{
    if (!m_name)
        return CString();

    if (!length)
        return "";

#if USE(ICU_UNICODE)
    // FIXME: What's the right place to do normalization?
    // It's a little strange to do it inside the encode function.
    // Perhaps normalization should be an explicit step done before calling encode.

    const UChar* source = characters;
    size_t sourceLength = length;

    Vector<UChar> normalizedCharacters;

    UErrorCode err = U_ZERO_ERROR;
    if (unorm_quickCheck(source, sourceLength, UNORM_NFC, &err) != UNORM_YES) {
        // First try using the length of the original string, since normalization to NFC rarely increases length.
        normalizedCharacters.grow(sourceLength);
        int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err);
        if (err == U_BUFFER_OVERFLOW_ERROR) {
            err = U_ZERO_ERROR;
            normalizedCharacters.resize(normalizedLength);
            normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err);
        }
        ASSERT(U_SUCCESS(err));

        source = normalizedCharacters.data();
        sourceLength = normalizedLength;
    }

    return newTextCodec(*this)->encode(source, sourceLength, handling);
#elif USE(QT4_UNICODE)
    QString str(reinterpret_cast<const QChar*>(characters), length);
    str = str.normalized(QString::NormalizationForm_C);
    return newTextCodec(*this)->encode(reinterpret_cast<const UChar*>(str.utf16()), str.length(), handling);
#elif USE(CASQT_UNICODE)
    // FIXME: CASQT
#if 0
    // Skip normalization for now; QString::normalized() is too inefficient.
    QString str(reinterpret_cast<const ushort*>(characters), length);
    str = str.normalized(QString::NormalizationForm_C);
    return newTextCodec(*this)->encode(reinterpret_cast<const UChar*>(str.utf16()), str.length(), handling);
#else
    return newTextCodec(*this)->encode(characters, length, handling);
#endif
#endif
}
CString TextEncoding::encode(const UChar* characters, size_t length, UnencodableHandling handling) const
{
    if (!m_name)
        return CString();

    if (!length)
        return "";

#if USE(ICU_UNICODE)
    // FIXME: What's the right place to do normalization?
    // It's a little strange to do it inside the encode function.
    // Perhaps normalization should be an explicit step done before calling encode.

    const UChar* source = characters;
    size_t sourceLength = length;

    Vector<UChar> normalizedCharacters;

    UErrorCode err = U_ZERO_ERROR;
    if (unorm_quickCheck(source, sourceLength, UNORM_NFC, &err) != UNORM_YES) {
        // First try using the length of the original string, since normalization to NFC rarely increases length.
        normalizedCharacters.grow(sourceLength);
        int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err);
        if (err == U_BUFFER_OVERFLOW_ERROR) {
            err = U_ZERO_ERROR;
            normalizedCharacters.resize(normalizedLength);
            normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err);
        }
        ASSERT(U_SUCCESS(err));

        source = normalizedCharacters.data();
        sourceLength = normalizedLength;
    }

    return newTextCodec(*this)->encode(source, sourceLength, handling);
#elif USE(JAVA_UNICODE)
    String normalized = TextNormalizer::normalize(characters, length, TextNormalizer::NFC);
    return newTextCodec(*this)->encode(normalized.characters(), normalized.length(), handling);
#elif OS(WINDOWS) && USE(WCHAR_UNICODE)
    // Normalization will be done by the Windows CE API.
    OwnPtr<TextCodec> textCodec = newTextCodec(*this);
    return textCodec.get() ? textCodec->encode(characters, length, handling) : CString();
#endif
}
CString TextEncoding::normalizeAndEncode(const String& string, UnencodableHandling handling) const
{
    if (!m_name)
        return CString();

    if (string.isEmpty())
        return "";

    // Text exclusively containing Latin-1 characters (U+0000..U+00FF) is left
    // unaffected by NFC. This is effectively the same as saying that all
    // Latin-1 text is already normalized to NFC.
    // Source: http://unicode.org/reports/tr15/
    if (string.is8Bit())
        return newTextCodec(*this)->encode(string.characters8(), string.length(), handling);

    const UChar* source = string.characters16();
    size_t length = string.length();

    Vector<UChar> normalizedCharacters;

    UErrorCode err = U_ZERO_ERROR;
    if (unorm_quickCheck(source, length, UNORM_NFC, &err) != UNORM_YES) {
        // First try using the length of the original string, since normalization to NFC rarely increases length.
        normalizedCharacters.grow(length);
        int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err);
        if (err == U_BUFFER_OVERFLOW_ERROR) {
            err = U_ZERO_ERROR;
            normalizedCharacters.resize(normalizedLength);
            normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err);
        }
        ASSERT(U_SUCCESS(err));

        source = normalizedCharacters.data();
        length = normalizedLength;
    }

    return newTextCodec(*this)->encode(source, length, handling);
}
CString TextEncoding::encode(StringView text, UnencodableHandling handling) const
{
    if (!m_name)
        return CString();

    if (text.isEmpty())
        return "";

    // FIXME: What's the right place to do normalization?
    // It's a little strange to do it inside the encode function.
    // Perhaps normalization should be an explicit step done before calling encode.

    auto upconvertedCharacters = text.upconvertedCharacters();

    const UChar* source = upconvertedCharacters;
    size_t sourceLength = text.length();

    Vector<UChar> normalizedCharacters;

    UErrorCode err = U_ZERO_ERROR;
    if (unorm_quickCheck(source, sourceLength, UNORM_NFC, &err) != UNORM_YES) {
        // First try using the length of the original string, since normalization to NFC rarely increases length.
        normalizedCharacters.grow(sourceLength);
        int32_t normalizedLength = unorm_normalize(source, sourceLength, UNORM_NFC, 0, normalizedCharacters.data(), sourceLength, &err);
        if (err == U_BUFFER_OVERFLOW_ERROR) {
            err = U_ZERO_ERROR;
            normalizedCharacters.resize(normalizedLength);
            normalizedLength = unorm_normalize(source, sourceLength, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err);
        }
        ASSERT(U_SUCCESS(err));

        source = normalizedCharacters.data();
        sourceLength = normalizedLength;
    }

    return newTextCodec(*this)->encode(source, sourceLength, handling);
}
DeprecatedCString StreamingTextDecoderICU::fromUnicode(const DeprecatedString& qcs, bool allowEntities)
{
    TextEncodingID encoding = m_encoding.effectiveEncoding().encodingID();
    if (encoding == WinLatin1Encoding && qcs.isAllLatin1())
        return qcs.latin1();

    if ((encoding == WinLatin1Encoding || encoding == UTF8Encoding || encoding == ASCIIEncoding) && qcs.isAllASCII())
        return qcs.ascii();

    // FIXME: We should see if there is "force ASCII range" mode in ICU;
    // until then, we change the backslash into a yen sign.
    // Encoding will change the yen sign back into a backslash.
    DeprecatedString copy = qcs;
    copy.replace('\\', m_encoding.backslashAsCurrencySymbol());

    if (!m_converterICU)
        createICUConverter();
    if (!m_converterICU)
        return DeprecatedCString();

    // FIXME: when DeprecatedString buffer is latin1, it would be nice to
    // convert from that w/o having to allocate a unicode buffer

    char buffer[ConversionBufferSize];
    const UChar* source = reinterpret_cast<const UChar*>(copy.unicode());
    const UChar* sourceLimit = source + copy.length();

    UErrorCode err = U_ZERO_ERROR;
    DeprecatedString normalizedString;
    if (UNORM_YES != unorm_quickCheck(source, copy.length(), UNORM_NFC, &err)) {
        // Normalization to NFC rarely increases the length, so this first attempt will usually succeed.
        normalizedString.truncate(copy.length());
        int32_t normalizedLength = unorm_normalize(source, copy.length(), UNORM_NFC, 0,
            reinterpret_cast<UChar*>(const_cast<DeprecatedChar*>(normalizedString.unicode())), copy.length(), &err);
        if (err == U_BUFFER_OVERFLOW_ERROR) {
            err = U_ZERO_ERROR;
            normalizedString.truncate(normalizedLength);
            normalizedLength = unorm_normalize(source, copy.length(), UNORM_NFC, 0,
                reinterpret_cast<UChar*>(const_cast<DeprecatedChar*>(normalizedString.unicode())), normalizedLength, &err);
        }

        source = reinterpret_cast<const UChar*>(normalizedString.unicode());
        sourceLimit = source + normalizedLength;
    }

    DeprecatedCString result(1); // for trailing zero

    if (allowEntities)
        ucnv_setFromUCallBack(m_converterICU, UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, 0, 0, &err);
    else {
        ucnv_setSubstChars(m_converterICU, "?", 1, &err);
        ucnv_setFromUCallBack(m_converterICU, UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0, 0, 0, &err);
    }
    ASSERT(U_SUCCESS(err));
    if (U_FAILURE(err))
        return DeprecatedCString();

    do {
        char* target = buffer;
        char* targetLimit = target + ConversionBufferSize;
        err = U_ZERO_ERROR;
        ucnv_fromUnicode(m_converterICU, &target, targetLimit, &source, sourceLimit, 0, true, &err);
        int count = target - buffer;
        buffer[count] = 0;
        result.append(buffer);
    } while (err == U_BUFFER_OVERFLOW_ERROR);

    return result;
}
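The final loop above drains ucnv_fromUnicode into a fixed-size buffer, appending each chunk and looping while the converter keeps reporting U_BUFFER_OVERFLOW_ERROR. Here is a minimal standalone sketch of that loop, assuming ICU's ucnv.h API; the function name convertFromUTF16 and the 16 KB chunk size are illustrative choices, not taken from the code above.

// Standalone sketch of the chunked ucnv_fromUnicode conversion loop.
// convertFromUTF16 is a hypothetical helper name, not a WebKit or ICU symbol.
#include <unicode/ucnv.h>
#include <string>

std::string convertFromUTF16(const char* converterName, const UChar* source, int32_t length)
{
    UErrorCode err = U_ZERO_ERROR;
    UConverter* converter = ucnv_open(converterName, &err);
    if (U_FAILURE(err))
        return std::string();

    // Substitute '?' for characters the target encoding cannot represent.
    ucnv_setSubstChars(converter, "?", 1, &err);
    ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0, 0, 0, &err);

    std::string result;
    char buffer[16384];
    const UChar* sourceLimit = source + length;
    do {
        char* target = buffer;
        char* targetLimit = buffer + sizeof(buffer);
        err = U_ZERO_ERROR;
        // Converts as much as fits into `buffer`; on overflow, sets
        // U_BUFFER_OVERFLOW_ERROR and leaves `source` at the unconverted remainder.
        ucnv_fromUnicode(converter, &target, targetLimit, &source, sourceLimit, 0, true, &err);
        result.append(buffer, target - buffer);
    } while (err == U_BUFFER_OVERFLOW_ERROR);

    ucnv_close(converter);
    return U_SUCCESS(err) ? result : std::string();
}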