Beispiel #1
0
CString TextEncoding::encode(const UChar* characters, size_t length, UnencodableHandling handling) const
{
    if (!m_name)
        return CString();

    if (!length)
        return "";

#if USE(ICU_UNICODE)
    // FIXME: What's the right place to do normalization?
    // It's a little strange to do it inside the encode function.
    // Perhaps normalization should be an explicit step done before calling encode.

    const UChar* source = characters;
    size_t sourceLength = length;

    Vector<UChar> normalizedCharacters;

    UErrorCode err = U_ZERO_ERROR;
    if (unorm_quickCheck(source, sourceLength, UNORM_NFC, &err) != UNORM_YES) {
        // First try using the length of the original string, since normalization to NFC rarely increases length.
        normalizedCharacters.grow(sourceLength);
        int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err);
        if (err == U_BUFFER_OVERFLOW_ERROR) {
            err = U_ZERO_ERROR;
            normalizedCharacters.resize(normalizedLength);
            normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err);
        }
        ASSERT(U_SUCCESS(err));

        source = normalizedCharacters.data();
        sourceLength = normalizedLength;
    }
    return newTextCodec(*this)->encode(source, sourceLength, handling);
#elif USE(QT4_UNICODE)
    QString str(reinterpret_cast<const QChar*>(characters), length);
    str = str.normalized(QString::NormalizationForm_C);
    return newTextCodec(*this)->encode(reinterpret_cast<const UChar *>(str.utf16()), str.length(), handling);
#elif USE(GLIB_UNICODE)
    GOwnPtr<char> UTF8Source;
    UTF8Source.set(g_utf16_to_utf8(characters, length, 0, 0, 0));
    if (!UTF8Source) {
        // If conversion to UTF-8 failed, try with the string without normalization
        return newTextCodec(*this)->encode(characters, length, handling);
    }

    GOwnPtr<char> UTF8Normalized;
    UTF8Normalized.set(g_utf8_normalize(UTF8Source.get(), -1, G_NORMALIZE_NFC));

    long UTF16Length;
    GOwnPtr<UChar> UTF16Normalized;
    UTF16Normalized.set(g_utf8_to_utf16(UTF8Normalized.get(), -1, 0, &UTF16Length, 0));

    return newTextCodec(*this)->encode(UTF16Normalized.get(), UTF16Length, handling);
#elif OS(WINCE)
    // normalization will be done by Windows CE API
    OwnPtr<TextCodec> textCodec = newTextCodec(*this);
    return textCodec.get() ? textCodec->encode(characters, length, handling) : CString();
#endif
}
Beispiel #2
0
UNormalizationCheckResult __hs_unorm_quickCheck(const UChar *source,
						int32_t sourcelength,
						UNormalizationMode mode,
						UErrorCode *status)
{
    return unorm_quickCheck(source, sourcelength, mode, status);
}
Beispiel #3
0
 virtual void call(UErrorCode* pErrorCode) {
     UErrorCode errorCode=U_ZERO_ERROR;
     qcResult=unorm_quickCheck(testcase.getBuffer(), testcase.getBufferLen(),
                               UNORM_FCD_ALWAYS_GET, &errorCode);
     if(U_FAILURE(errorCode)) {
         fprintf(stderr, "error: unorm_quickCheck(UNORM_FCD) failed: %s\n",
                 u_errorName(errorCode));
     }
 }
Beispiel #4
0
CString TextEncoding::encode(const UChar* characters, size_t length, UnencodableHandling handling) const
{
    if (!m_name)
        return CString();

    if (!length)
        return "";

#if USE(ICU_UNICODE)
    // FIXME: What's the right place to do normalization?
    // It's a little strange to do it inside the encode function.
    // Perhaps normalization should be an explicit step done before calling encode.

    const UChar* source = characters;
    size_t sourceLength = length;

    Vector<UChar> normalizedCharacters;

    UErrorCode err = U_ZERO_ERROR;
    if (unorm_quickCheck(source, sourceLength, UNORM_NFC, &err) != UNORM_YES) {
        // First try using the length of the original string, since normalization to NFC rarely increases length.
        normalizedCharacters.grow(sourceLength);
        int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err);
        if (err == U_BUFFER_OVERFLOW_ERROR) {
            err = U_ZERO_ERROR;
            normalizedCharacters.resize(normalizedLength);
            normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err);
        }
        ASSERT(U_SUCCESS(err));

        source = normalizedCharacters.data();
        sourceLength = normalizedLength;
    }
    return newTextCodec(*this)->encode(source, sourceLength, handling);
#elif USE(QT4_UNICODE)
    QString str(reinterpret_cast<const QChar*>(characters), length);
    str = str.normalized(QString::NormalizationForm_C);
    return newTextCodec(*this)->encode(reinterpret_cast<const UChar *>(str.utf16()), str.length(), handling);
#elif USE(CASQT_UNICODE) // FIXME:CASQT
#if 0 // 暂不执行normalized,该函数效率太低
    QString str(reinterpret_cast<const ushort*>(characters), length);
    str = str.normalized(QString::NormalizationForm_C);
    return newTextCodec(*this)->encode(reinterpret_cast<const UChar *>(str.utf16()), str.length(), handling);
#else
	return newTextCodec(*this)->encode(characters, length, handling);
#endif
#endif
}
CString TextEncoding::encode(const UChar* characters, size_t length, UnencodableHandling handling) const
{
    if (!m_name)
        return CString();

    if (!length)
        return "";

#if USE(ICU_UNICODE)
    // FIXME: What's the right place to do normalization?
    // It's a little strange to do it inside the encode function.
    // Perhaps normalization should be an explicit step done before calling encode.

    const UChar* source = characters;
    size_t sourceLength = length;

    Vector<UChar> normalizedCharacters;

    UErrorCode err = U_ZERO_ERROR;
    if (unorm_quickCheck(source, sourceLength, UNORM_NFC, &err) != UNORM_YES) {
        // First try using the length of the original string, since normalization to NFC rarely increases length.
        normalizedCharacters.grow(sourceLength);
        int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err);
        if (err == U_BUFFER_OVERFLOW_ERROR) {
            err = U_ZERO_ERROR;
            normalizedCharacters.resize(normalizedLength);
            normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err);
        }
        ASSERT(U_SUCCESS(err));

        source = normalizedCharacters.data();
        sourceLength = normalizedLength;
    }
    return newTextCodec(*this)->encode(source, sourceLength, handling);
#elif USE(JAVA_UNICODE)
    String normalized = TextNormalizer::normalize(
                                     characters, length, TextNormalizer::NFC);
    return newTextCodec(*this)->encode(
                 normalized.characters(), normalized.length(), handling);
#elif OS(WINDOWS) && USE(WCHAR_UNICODE)
    // normalization will be done by Windows CE API
    OwnPtr<TextCodec> textCodec = newTextCodec(*this);
    return textCodec.get() ? textCodec->encode(characters, length, handling) : CString();
#endif
}
CString TextEncoding::normalizeAndEncode(const String& string, UnencodableHandling handling) const
{
    if (!m_name)
        return CString();

    if (string.isEmpty())
        return "";

    // Text exclusively containing Latin-1 characters (U+0000..U+00FF) is left
    // unaffected by NFC. This is effectively the same as saying that all
    // Latin-1 text is already normalized to NFC.
    // Source: http://unicode.org/reports/tr15/
    if (string.is8Bit())
        return newTextCodec(*this)->encode(string.characters8(), string.length(), handling);

    const UChar* source = string.characters16();
    size_t length = string.length();

    Vector<UChar> normalizedCharacters;

    UErrorCode err = U_ZERO_ERROR;
    if (unorm_quickCheck(source, length, UNORM_NFC, &err) != UNORM_YES) {
        // First try using the length of the original string, since normalization to NFC rarely increases length.
        normalizedCharacters.grow(length);
        int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err);
        if (err == U_BUFFER_OVERFLOW_ERROR) {
            err = U_ZERO_ERROR;
            normalizedCharacters.resize(normalizedLength);
            normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err);
        }
        ASSERT(U_SUCCESS(err));

        source = normalizedCharacters.data();
        length = normalizedLength;
    }

    return newTextCodec(*this)->encode(source, length, handling);
}
CString TextEncoding::encode(StringView text, UnencodableHandling handling) const
{
    if (!m_name)
        return CString();

    if (text.isEmpty())
        return "";

    // FIXME: What's the right place to do normalization?
    // It's a little strange to do it inside the encode function.
    // Perhaps normalization should be an explicit step done before calling encode.

    auto upconvertedCharacters = text.upconvertedCharacters();

    const UChar* source = upconvertedCharacters;
    size_t sourceLength = text.length();

    Vector<UChar> normalizedCharacters;

    UErrorCode err = U_ZERO_ERROR;
    if (unorm_quickCheck(source, sourceLength, UNORM_NFC, &err) != UNORM_YES) {
        // First try using the length of the original string, since normalization to NFC rarely increases length.
        normalizedCharacters.grow(sourceLength);
        int32_t normalizedLength = unorm_normalize(source, sourceLength, UNORM_NFC, 0, normalizedCharacters.data(), sourceLength, &err);
        if (err == U_BUFFER_OVERFLOW_ERROR) {
            err = U_ZERO_ERROR;
            normalizedCharacters.resize(normalizedLength);
            normalizedLength = unorm_normalize(source, sourceLength, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err);
        }
        ASSERT(U_SUCCESS(err));

        source = normalizedCharacters.data();
        sourceLength = normalizedLength;
    }

    return newTextCodec(*this)->encode(source, sourceLength, handling);
}
DeprecatedCString StreamingTextDecoderICU::fromUnicode(const DeprecatedString &qcs, bool allowEntities)
{
    TextEncodingID encoding = m_encoding.effectiveEncoding().encodingID();

    if (encoding == WinLatin1Encoding && qcs.isAllLatin1())
        return qcs.latin1();

    if ((encoding == WinLatin1Encoding || encoding == UTF8Encoding || encoding == ASCIIEncoding) 
        && qcs.isAllASCII())
        return qcs.ascii();

    // FIXME: We should see if there is "force ASCII range" mode in ICU;
    // until then, we change the backslash into a yen sign.
    // Encoding will change the yen sign back into a backslash.
    DeprecatedString copy = qcs;
    copy.replace('\\', m_encoding.backslashAsCurrencySymbol());

    if (!m_converterICU)
        createICUConverter();
    if (!m_converterICU)
        return DeprecatedCString();

    // FIXME: when DeprecatedString buffer is latin1, it would be nice to
    // convert from that w/o having to allocate a unicode buffer

    char buffer[ConversionBufferSize];
    const UChar* source = reinterpret_cast<const UChar*>(copy.unicode());
    const UChar* sourceLimit = source + copy.length();

    UErrorCode err = U_ZERO_ERROR;
    DeprecatedString normalizedString;
    if (UNORM_YES != unorm_quickCheck(source, copy.length(), UNORM_NFC, &err)) {
        normalizedString.truncate(copy.length()); // normalization to NFC rarely increases the length, so this first attempt will usually succeed
        
        int32_t normalizedLength = unorm_normalize(source, copy.length(), UNORM_NFC, 0, reinterpret_cast<UChar*>(const_cast<DeprecatedChar*>(normalizedString.unicode())), copy.length(), &err);
        if (err == U_BUFFER_OVERFLOW_ERROR) {
            err = U_ZERO_ERROR;
            normalizedString.truncate(normalizedLength);
            normalizedLength = unorm_normalize(source, copy.length(), UNORM_NFC, 0, reinterpret_cast<UChar*>(const_cast<DeprecatedChar*>(normalizedString.unicode())), normalizedLength, &err);
        }
        
        source = reinterpret_cast<const UChar*>(normalizedString.unicode());
        sourceLimit = source + normalizedLength;
    }

    DeprecatedCString result(1); // for trailing zero

    if (allowEntities)
        ucnv_setFromUCallBack(m_converterICU, UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, 0, 0, &err);
    else {
        ucnv_setSubstChars(m_converterICU, "?", 1, &err);
        ucnv_setFromUCallBack(m_converterICU, UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0, 0, 0, &err);
    }

    ASSERT(U_SUCCESS(err));
    if (U_FAILURE(err))
        return DeprecatedCString();

    do {
        char* target = buffer;
        char* targetLimit = target + ConversionBufferSize;
        err = U_ZERO_ERROR;
        ucnv_fromUnicode(m_converterICU, &target, targetLimit, &source, sourceLimit, 0, true,  &err);
        int count = target - buffer;
        buffer[count] = 0;
        result.append(buffer);
    } while (err == U_BUFFER_OVERFLOW_ERROR);

    return result;
}