示例#1
0
// Encodes |length| UTF-16 code units starting at |characters| into bytes of
// this encoding. On the ICU, Qt4 and GLib ports the text is normalized to NFC
// before it is handed to the codec; the WinCE port passes it through as-is.
//
// Returns a default-constructed CString when the encoding has no name and an
// empty string when |length| is zero. |handling| is forwarded unchanged to the
// codec's encode().
CString TextEncoding::encode(const UChar* characters, size_t length, UnencodableHandling handling) const
{
    if (!m_name)
        return CString();

    if (!length)
        return "";

#if USE(ICU_UNICODE)
    // FIXME: What's the right place to do normalization?
    // It's a little strange to do it inside the encode function.
    // Perhaps normalization should be an explicit step done before calling encode.

    const UChar* source = characters;
    size_t sourceLength = length;

    Vector<UChar> normalizedCharacters;

    UErrorCode err = U_ZERO_ERROR;
    if (unorm_quickCheck(source, sourceLength, UNORM_NFC, &err) != UNORM_YES) {
        // First try using the length of the original string, since normalization to NFC rarely increases length.
        normalizedCharacters.grow(sourceLength);
        int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err);
        if (err == U_BUFFER_OVERFLOW_ERROR) {
            // The first pass reported the length it needs; retry with a buffer of exactly that size.
            err = U_ZERO_ERROR;
            normalizedCharacters.resize(normalizedLength);
            normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err);
        }
        ASSERT(U_SUCCESS(err));

        source = normalizedCharacters.data();
        sourceLength = normalizedLength;
    }
    return newTextCodec(*this)->encode(source, sourceLength, handling);
#elif USE(QT4_UNICODE)
    QString str(reinterpret_cast<const QChar*>(characters), length);
    str = str.normalized(QString::NormalizationForm_C);
    return newTextCodec(*this)->encode(reinterpret_cast<const UChar *>(str.utf16()), str.length(), handling);
#elif USE(GLIB_UNICODE)
    // GLib normalizes UTF-8 only, so round-trip UTF-16 -> UTF-8 -> NFC -> UTF-16.
    // Every step can return NULL; in that case encode the original,
    // un-normalized characters instead of handing the codec a NULL buffer.
    GOwnPtr<char> UTF8Source;
    UTF8Source.set(g_utf16_to_utf8(characters, length, 0, 0, 0));
    if (!UTF8Source) {
        // If conversion to UTF-8 failed, try with the string without normalization
        return newTextCodec(*this)->encode(characters, length, handling);
    }

    GOwnPtr<char> UTF8Normalized;
    UTF8Normalized.set(g_utf8_normalize(UTF8Source.get(), -1, G_NORMALIZE_NFC));
    if (!UTF8Normalized) {
        // Normalization failed; fall back to the un-normalized input.
        return newTextCodec(*this)->encode(characters, length, handling);
    }

    // Initialized so that it never holds an indeterminate value, even if the
    // conversion below fails without writing to it.
    long UTF16Length = 0;
    GOwnPtr<UChar> UTF16Normalized;
    UTF16Normalized.set(g_utf8_to_utf16(UTF8Normalized.get(), -1, 0, &UTF16Length, 0));
    if (!UTF16Normalized) {
        // Conversion back to UTF-16 failed; fall back to the un-normalized input.
        return newTextCodec(*this)->encode(characters, length, handling);
    }

    return newTextCodec(*this)->encode(UTF16Normalized.get(), UTF16Length, handling);
#elif OS(WINCE)
    // normalization will be done by Windows CE API
    OwnPtr<TextCodec> textCodec = newTextCodec(*this);
    return textCodec.get() ? textCodec->encode(characters, length, handling) : CString();
#endif
}
示例#2
0
// Converts |length| UTF-16 code units at |characters| to bytes in this
// encoding. The ICU and Qt4 ports normalize the text to NFC before it reaches
// the codec; the CASQT port currently encodes it without normalization.
//
// A nameless encoding produces a default-constructed CString and a zero
// |length| produces an empty string. |handling| is passed through to the codec.
CString TextEncoding::encode(const UChar* characters, size_t length, UnencodableHandling handling) const
{
    if (!m_name)
        return CString();

    if (!length)
        return "";

#if USE(ICU_UNICODE)
    // FIXME: What's the right place to do normalization?
    // It's a little strange to do it inside the encode function.
    // Perhaps normalization should be an explicit step done before calling encode.

    UErrorCode status = U_ZERO_ERROR;
    if (unorm_quickCheck(characters, length, UNORM_NFC, &status) == UNORM_YES) {
        // The quick check answered YES, so the caller's buffer is encoded directly.
        return newTextCodec(*this)->encode(characters, length, handling);
    }

    // Normalization to NFC rarely increases length, so the first attempt uses
    // a buffer exactly as long as the input.
    Vector<UChar> nfcBuffer;
    nfcBuffer.grow(length);
    int32_t nfcLength = unorm_normalize(characters, length, UNORM_NFC, 0, nfcBuffer.data(), length, &status);
    if (status == U_BUFFER_OVERFLOW_ERROR) {
        // Retry with a buffer sized to the length reported by the first attempt.
        status = U_ZERO_ERROR;
        nfcBuffer.resize(nfcLength);
        nfcLength = unorm_normalize(characters, length, UNORM_NFC, 0, nfcBuffer.data(), nfcLength, &status);
    }
    ASSERT(U_SUCCESS(status));

    const size_t nfcCodeUnits = nfcLength;
    return newTextCodec(*this)->encode(nfcBuffer.data(), nfcCodeUnits, handling);
#elif USE(QT4_UNICODE)
    QString nfcText(reinterpret_cast<const QChar*>(characters), length);
    nfcText = nfcText.normalized(QString::NormalizationForm_C);
    return newTextCodec(*this)->encode(reinterpret_cast<const UChar *>(nfcText.utf16()), nfcText.length(), handling);
#elif USE(CASQT_UNICODE) // FIXME:CASQT
#if 0 // Normalization is skipped for now: normalized() is too slow.
    QString nfcText(reinterpret_cast<const ushort*>(characters), length);
    nfcText = nfcText.normalized(QString::NormalizationForm_C);
    return newTextCodec(*this)->encode(reinterpret_cast<const UChar *>(nfcText.utf16()), nfcText.length(), handling);
#else
    return newTextCodec(*this)->encode(characters, length, handling);
#endif
#endif
}
示例#3
0
// Decodes |length| bytes at |data| using a codec freshly created for this
// encoding. The codec's decode() is called with `true` as its third argument,
// and |stopOnError| / |sawError| are passed straight through.
//
// A nameless encoding yields a default-constructed String.
String TextEncoding::decode(const char* data, size_t length, bool stopOnError, bool& sawError) const
{
    if (m_name)
        return newTextCodec(*this)->decode(data, length, true, stopOnError, sawError);

    return String();
}
示例#4
0
// Exercises the "windows-1252" codec with QuestionMarksForUnencodables:
// code units the codec cannot encode are expected to come back as '?', and
// each surrogate pair is expected to collapse into a single '?'.
TEST(TextCodecLatin1Test, QuestionMarksAndSurrogates) {
  WTF::TextEncoding encoding("windows-1252");
  std::unique_ptr<WTF::TextCodec> codec(newTextCodec(encoding));

  // 8-bit input: the first two code units are expected unchanged, the third
  // (0x86) is expected to be replaced by '?'.
  const LChar latin1Input[] = {0xd1, 0x16, 0x86};
  size_t latin1InputLength = WTF_ARRAY_LENGTH(latin1Input);
  CString latin1Encoded = codec->encode(latin1Input, latin1InputLength,
                                        WTF::QuestionMarksForUnencodables);
  EXPECT_STREQ("\xd1\x16?", latin1Encoded.data());

  // 16-bit input holding one surrogate pair: a single '?' is expected.
  const UChar onePair[] = {0xd9f0, 0xdcd9};
  size_t onePairLength = WTF_ARRAY_LENGTH(onePair);
  CString onePairEncoded = codec->encode(onePair, onePairLength,
                                         WTF::QuestionMarksForUnencodables);
  EXPECT_STREQ("?", onePairEncoded.data());

  // 16-bit input holding two surrogate pairs: one '?' per pair is expected.
  const UChar twoPairs[] = {0xd9f0, 0xdcd9, 0xd9f0, 0xdcd9};
  size_t twoPairsLength = WTF_ARRAY_LENGTH(twoPairs);
  CString twoPairsEncoded = codec->encode(twoPairs, twoPairsLength,
                                          WTF::QuestionMarksForUnencodables);
  EXPECT_STREQ("??", twoPairsEncoded.data());
}
示例#5
0
// Builds a decoder for |encoding|: stores the encoding together with the
// |fatal| and |ignoreBOM| flags, creates the codec through newTextCodec(),
// and starts with m_bomSeen set to false.
TextDecoder::TextDecoder(const WTF::TextEncoding& encoding, bool fatal, bool ignoreBOM)
    : m_encoding(encoding),
      m_codec(newTextCodec(encoding)),
      m_fatal(fatal),
      m_ignoreBOM(ignoreBOM),
      m_bomSeen(false)
{
}
// Converts |length| UTF-16 code units at |characters| to bytes in this
// encoding. The ICU and Java ports normalize the text to NFC first; the
// Windows/WCHAR port hands the characters to the codec unchanged.
//
// A nameless encoding produces a default-constructed CString and a zero
// |length| produces an empty string. |handling| is passed through to the codec.
CString TextEncoding::encode(const UChar* characters, size_t length, UnencodableHandling handling) const
{
    if (!m_name)
        return CString();

    if (!length)
        return "";

#if USE(ICU_UNICODE)
    // FIXME: What's the right place to do normalization?
    // It's a little strange to do it inside the encode function.
    // Perhaps normalization should be an explicit step done before calling encode.

    UErrorCode status = U_ZERO_ERROR;
    if (unorm_quickCheck(characters, length, UNORM_NFC, &status) == UNORM_YES) {
        // The quick check answered YES, so the caller's buffer is encoded directly.
        return newTextCodec(*this)->encode(characters, length, handling);
    }

    // Normalization to NFC rarely increases length, so the first attempt uses
    // a buffer exactly as long as the input.
    Vector<UChar> nfcBuffer;
    nfcBuffer.grow(length);
    int32_t nfcLength = unorm_normalize(characters, length, UNORM_NFC, 0, nfcBuffer.data(), length, &status);
    if (status == U_BUFFER_OVERFLOW_ERROR) {
        // Retry with a buffer sized to the length reported by the first attempt.
        status = U_ZERO_ERROR;
        nfcBuffer.resize(nfcLength);
        nfcLength = unorm_normalize(characters, length, UNORM_NFC, 0, nfcBuffer.data(), nfcLength, &status);
    }
    ASSERT(U_SUCCESS(status));

    const size_t nfcCodeUnits = nfcLength;
    return newTextCodec(*this)->encode(nfcBuffer.data(), nfcCodeUnits, handling);
#elif USE(JAVA_UNICODE)
    String nfcText = TextNormalizer::normalize(characters, length, TextNormalizer::NFC);
    return newTextCodec(*this)->encode(nfcText.characters(), nfcText.length(), handling);
#elif OS(WINDOWS) && USE(WCHAR_UNICODE)
    // normalization will be done by Windows CE API
    OwnPtr<TextCodec> codec = newTextCodec(*this);
    if (!codec.get())
        return CString();
    return codec->encode(characters, length, handling);
#endif
}
示例#7
0
// Asks the codec to decode with no additional input and WTF::FetchEOF, then
// drops the codec. A codec is created on demand from encoding() if none
// exists yet. Any error the codec reports is recorded in m_sawError.
String TextResourceDecoder::flush()
{
    if (!m_codec) {
        m_codec = newTextCodec(encoding());
    }

    String flushedText = m_codec->decode(0, 0, WTF::FetchEOF, false, m_sawError);

    // The codec is released once it has been flushed.
    m_codec.clear();
    return flushedText;
}
// Encodes |string| into bytes of this encoding after normalizing it to NFC.
// 8-bit strings skip the normalization step entirely.
//
// A nameless encoding produces a default-constructed CString and an empty
// |string| produces an empty string. |handling| is passed through to the codec.
CString TextEncoding::normalizeAndEncode(const String& string, UnencodableHandling handling) const
{
    if (!m_name)
        return CString();

    if (string.isEmpty())
        return "";

    // Text exclusively containing Latin-1 characters (U+0000..U+00FF) is left
    // unaffected by NFC. This is effectively the same as saying that all
    // Latin-1 text is already normalized to NFC.
    // Source: http://unicode.org/reports/tr15/
    if (string.is8Bit())
        return newTextCodec(*this)->encode(string.characters8(), string.length(), handling);

    const UChar* input = string.characters16();
    const size_t inputLength = string.length();

    UErrorCode status = U_ZERO_ERROR;
    if (unorm_quickCheck(input, inputLength, UNORM_NFC, &status) == UNORM_YES) {
        // The quick check answered YES, so the string's own buffer is encoded directly.
        return newTextCodec(*this)->encode(input, inputLength, handling);
    }

    // Normalization to NFC rarely increases length, so the first attempt uses
    // a buffer exactly as long as the input.
    Vector<UChar> nfcBuffer;
    nfcBuffer.grow(inputLength);
    int32_t nfcLength = unorm_normalize(input, inputLength, UNORM_NFC, 0, nfcBuffer.data(), inputLength, &status);
    if (status == U_BUFFER_OVERFLOW_ERROR) {
        // Retry with a buffer sized to the length reported by the first attempt.
        status = U_ZERO_ERROR;
        nfcBuffer.resize(nfcLength);
        nfcLength = unorm_normalize(input, inputLength, UNORM_NFC, 0, nfcBuffer.data(), nfcLength, &status);
    }
    ASSERT(U_SUCCESS(status));

    const size_t nfcCodeUnits = nfcLength;
    return newTextCodec(*this)->encode(nfcBuffer.data(), nfcCodeUnits, handling);
}
// Encodes |string| into bytes of this encoding without normalizing it first.
// The codec overload is picked by the string's storage width: 8-bit strings go
// through characters8(), all others through characters16().
//
// A nameless encoding produces a default-constructed CString and an empty
// |string| produces an empty string. |handling| is passed through to the codec.
CString TextEncoding::encode(const String& string, UnencodableHandling handling) const
{
    if (!m_name)
        return CString();

    if (string.isEmpty())
        return "";

    OwnPtr<TextCodec> codec = newTextCodec(*this);

    if (!string.is8Bit())
        return codec->encode(string.characters16(), string.length(), handling);

    return codec->encode(string.characters8(), string.length(), handling);
}
示例#10
0
// Looks for a byte-order mark at the start of the stream, picks the codec
// accordingly, and decodes the bytes seen so far.
//
// The start of the stream is the bytes already held in m_bufferedBytes
// followed by the |length| bytes at |data|. If no BOM is recognized and all
// bytes seen so far still fit in m_bufferedBytes (and |flush| is false), the
// new bytes are buffered and an empty string is returned so the check can be
// repeated on the next call. Otherwise a codec is created for the BOM's
// encoding (or m_encoding when no BOM matched), m_checkedForBOM is set, and
// the buffered bytes plus |data| are decoded. The BOM bytes are not removed
// here; they are passed to the codec with the rest of the input.
//
// Returns a default-constructed String if no codec could be created.
String TextDecoder::checkForBOM(const char* data, size_t length, bool flush)
{
    // Check to see if we found a BOM.
    size_t numBufferedBytes = m_numBufferedBytes;
    size_t buf1Len = numBufferedBytes;
    size_t buf2Len = length;
    const unsigned char* buf1 = m_bufferedBytes;
    const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data);
    // Peek at the leading bytes: c1 and c2 come from the buffered bytes first
    // and then from |data|; c3 is only ever taken from |data|. A byte that is
    // not available reads as 0.
    unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
    unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
    unsigned char c3 = buf2Len ? (--buf2Len, *buf2++) : 0;

    // FF FE -> UTF-16LE, FE FF -> UTF-16BE, EF BB BF -> UTF-8; otherwise keep
    // the encoding this decoder was configured with.
    const TextEncoding* encodingConsideringBOM = &m_encoding;
    if (c1 == 0xFF && c2 == 0xFE)
        encodingConsideringBOM = &UTF16LittleEndianEncoding();
    else if (c1 == 0xFE && c2 == 0xFF)
        encodingConsideringBOM = &UTF16BigEndianEncoding();
    else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF)
        encodingConsideringBOM = &UTF8Encoding();
    else if (numBufferedBytes + length <= sizeof(m_bufferedBytes) && !flush) {
        // Continue to look for the BOM.
        // Everything seen so far still fits in m_bufferedBytes, so stash the
        // new bytes and produce no output yet.
        memcpy(&m_bufferedBytes[numBufferedBytes], data, length);
        m_numBufferedBytes += length;
        return "";
    }

    // Done checking for BOM.
    m_codec.set(newTextCodec(*encodingConsideringBOM).release());
    if (!m_codec)
        return String();
    m_checkedForBOM = true;

    // Handle case where we have some buffered bytes to deal with.
    if (numBufferedBytes) {
        // Decode a local copy of the buffered bytes first (without flushing),
        // after resetting the buffered count, then decode the new data.
        char bufferedBytes[sizeof(m_bufferedBytes)];
        memcpy(bufferedBytes, m_bufferedBytes, numBufferedBytes);
        m_numBufferedBytes = 0;
        return m_codec->decode(bufferedBytes, numBufferedBytes, false)
            + m_codec->decode(data, length, flush);
    }

    return m_codec->decode(data, length, flush);
}
示例#11
0
// Encodes |text| into bytes of this encoding after normalizing it to NFC.
//
// A nameless encoding produces a default-constructed CString and an empty
// |text| produces an empty string. |handling| is passed through to the codec.
CString TextEncoding::encode(StringView text, UnencodableHandling handling) const
{
    if (!m_name)
        return CString();

    if (text.isEmpty())
        return "";

    // FIXME: What's the right place to do normalization?
    // It's a little strange to do it inside the encode function.
    // Perhaps normalization should be an explicit step done before calling encode.

    // NOTE(review): |upconverted| presumably owns the 16-bit characters that
    // |input| points at, so it has to stay alive until encoding is done — confirm.
    auto upconverted = text.upconvertedCharacters();
    const UChar* input = upconverted;
    const size_t inputLength = text.length();

    UErrorCode status = U_ZERO_ERROR;
    if (unorm_quickCheck(input, inputLength, UNORM_NFC, &status) == UNORM_YES) {
        // The quick check answered YES, so the upconverted characters are encoded directly.
        return newTextCodec(*this)->encode(input, inputLength, handling);
    }

    // Normalization to NFC rarely increases length, so the first attempt uses
    // a buffer exactly as long as the input.
    Vector<UChar> nfcBuffer;
    nfcBuffer.grow(inputLength);
    int32_t nfcLength = unorm_normalize(input, inputLength, UNORM_NFC, 0, nfcBuffer.data(), inputLength, &status);
    if (status == U_BUFFER_OVERFLOW_ERROR) {
        // Retry with a buffer sized to the length reported by the first attempt.
        status = U_ZERO_ERROR;
        nfcBuffer.resize(nfcLength);
        nfcLength = unorm_normalize(input, inputLength, UNORM_NFC, 0, nfcBuffer.data(), nfcLength, &status);
    }
    ASSERT(U_SUCCESS(status));

    const size_t nfcCodeUnits = nfcLength;
    return newTextCodec(*this)->encode(nfcBuffer.data(), nfcCodeUnits, handling);
}
示例#12
0
// Looks for a byte-order mark at the start of the stream, picks the codec
// accordingly, skips the BOM, and decodes the remaining bytes seen so far.
//
// The start of the stream is the bytes already held in m_bufferedBytes
// followed by the |length| bytes at |data|. If no BOM has been identified and
// all bytes seen so far still fit in m_bufferedBytes (and |flush| is false),
// the new bytes are buffered and an empty string is returned so the check can
// be repeated on the next call. Otherwise a codec is created for the BOM's
// encoding (or m_encoding when no BOM matched) and m_checkedForBOM is set.
//
// |stopOnError| and |sawError| are forwarded to the codec's decode(). When
// buffered bytes are decoded first and that pass reports an error with
// |stopOnError| set, |data| is not decoded.
//
// Returns a default-constructed String if no codec could be created.
String TextDecoder::checkForBOM(const char* data, size_t length, bool flush, bool stopOnError, bool& sawError)
{
    ASSERT(!m_checkedForBOM);

    // Check to see if we found a BOM.
    size_t numBufferedBytes = m_numBufferedBytes;
    size_t buf1Len = numBufferedBytes;
    size_t buf2Len = length;
    const unsigned char* buf1 = m_bufferedBytes;
    const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data);
    // Peek at the leading bytes: c1..c3 come from the buffered bytes first and
    // then from |data|; c4 is only ever taken from |data|. A byte that is not
    // available reads as 0.
    unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
    unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
    unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
    unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0;

    const TextEncoding* encodingConsideringBOM = &m_encoding;
    bool foundBOM = true;
    size_t lengthOfBOM = 0;
    if (c1 == 0xFF && c2 == 0xFE) {
        // FF FE followed by a non-zero byte is treated as the UTF-16LE BOM.
        // FF FE 00 00 is treated as the UTF-32LE BOM, but only once more bytes
        // have been seen than m_bufferedBytes can hold; until then no BOM is
        // reported (the missing bytes read as 0 above, so it looks like this
        // defers the decision until c3/c4 are real data — confirm).
        if (c3 != 0 || c4 != 0)  {
            encodingConsideringBOM = &UTF16LittleEndianEncoding();
            lengthOfBOM = 2;
        } else if (numBufferedBytes + length > sizeof(m_bufferedBytes)) {
            encodingConsideringBOM = &UTF32LittleEndianEncoding();
            lengthOfBOM = 4;
        } else
            foundBOM = false;
    } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
        // EF BB BF -> UTF-8.
        encodingConsideringBOM = &UTF8Encoding();
        lengthOfBOM = 3;
    } else if (c1 == 0xFE && c2 == 0xFF) {
        // FE FF -> UTF-16BE.
        encodingConsideringBOM = &UTF16BigEndianEncoding();
        lengthOfBOM = 2;
    } else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF) {
        // 00 00 FE FF -> UTF-32BE.
        encodingConsideringBOM = &UTF32BigEndianEncoding();
        lengthOfBOM = 4;
    } else
        foundBOM = false;

    if (!foundBOM && numBufferedBytes + length <= sizeof(m_bufferedBytes) && !flush) {
        // Continue to look for the BOM.
        // Everything seen so far still fits in m_bufferedBytes, so stash the
        // new bytes and produce no output yet.
        memcpy(&m_bufferedBytes[numBufferedBytes], data, length);
        m_numBufferedBytes += length;
        return "";
    }

    // Done checking for BOM.
    m_codec.set(newTextCodec(*encodingConsideringBOM).release());
    if (!m_codec)
        return String();
    m_checkedForBOM = true;

    // Skip the BOM.
    if (foundBOM) {
        // Part of the BOM may already sit in m_bufferedBytes; only the rest is
        // skipped in |data|, and the buffered bytes are discarded.
        ASSERT(numBufferedBytes < lengthOfBOM);
        size_t numUnbufferedBOMBytes = lengthOfBOM - numBufferedBytes;
        ASSERT(numUnbufferedBOMBytes <= length);

        data += numUnbufferedBOMBytes;
        length -= numUnbufferedBOMBytes;
        numBufferedBytes = 0;
        m_numBufferedBytes = 0;
    }

    // Handle case where we have some buffered bytes to deal with.
    if (numBufferedBytes) {
        // Decode a local copy of the buffered bytes first (without flushing),
        // after resetting the buffered count, then decode the new data.
        char bufferedBytes[sizeof(m_bufferedBytes)];
        memcpy(bufferedBytes, m_bufferedBytes, numBufferedBytes);
        m_numBufferedBytes = 0;

        String bufferedResult = m_codec->decode(bufferedBytes, numBufferedBytes, false, stopOnError, sawError);
        if (stopOnError && sawError)
            return bufferedResult;
        return bufferedResult + m_codec->decode(data, length, flush, stopOnError, sawError);
    }

    return m_codec->decode(data, length, flush, stopOnError, sawError);
}
示例#13
0
// Creates a parser whose m_id and m_lastEventId both start out as
// |lastEventId|, which stores |client|, and which holds a codec created for
// UTF8Encoding().
EventSourceParser::EventSourceParser(const AtomicString& lastEventId, Client* client)
    : m_id(lastEventId)
    , m_lastEventId(lastEventId)
    , m_client(client)
    , m_codec(newTextCodec(UTF8Encoding()))
{
}
示例#14
0
// Decodes |len| bytes at |data| with WTF::DoNotFlush, creating the codec from
// encoding() on first use. Any error the codec reports is recorded in
// m_sawError.
String TextResourceDecoder::decode(const char* data, size_t len)
{
    if (!m_codec) {
        m_codec = newTextCodec(encoding());
    }

    String decodedText = m_codec->decode(data, len, WTF::DoNotFlush, false, m_sawError);
    return decodedText;
}