// Reports whether this encoding is one of the UTF-16 encodings or, unless
// noExtendedTextEncodingNameUsed() holds, one of the UTF-32 encodings.
bool TextEncoding::isNonByteBasedEncoding() const
{
    // Queried first so the UTF-32 comparisons can be skipped entirely when
    // extended encoding names are not in use.
    const bool onlyUTF16Counts = noExtendedTextEncodingNameUsed();

    // UTF-16 (either byte order) qualifies in both modes.
    if (*this == UTF16LittleEndianEncoding())
        return true;
    if (*this == UTF16BigEndianEncoding())
        return true;

    if (onlyUTF16Counts)
        return false;

    // UTF-32 is only considered when extended encoding names are in use.
    if (*this == UTF32BigEndianEncoding())
        return true;
    return *this == UTF32LittleEndianEncoding();
}
// Looks for a byte-order mark at the start of the stream and, once a decision
// has been made, creates the codec and decodes every byte seen so far.
//
// The leading bytes are taken from m_bufferedBytes (bytes kept from earlier
// calls) followed by `data`. A UTF-16LE (FF FE), UTF-16BE (FE FF) or UTF-8
// (EF BB BF) mark overrides m_encoding. When no mark is recognized, everything
// seen so far still fits in m_bufferedBytes and `flush` is false, the new bytes
// are buffered and an empty string is returned so that a later call can decide.
//
// data   - newly received bytes.
// length - number of bytes in `data`.
// flush  - forwarded to the codec for `data`; when true, no further buffering
//          is done and the encoding decision is made immediately.
//
// Returns the decoded text, or a default-constructed String if no codec could
// be created. The BOM bytes are not stripped here; they are handed to the
// codec together with the rest of the input.
String TextDecoder::checkForBOM(const char* data, size_t length, bool flush)
{
    // Check to see if we found a BOM.
    size_t numBufferedBytes = m_numBufferedBytes;
    size_t buf1Len = numBufferedBytes;
    size_t buf2Len = length;
    const unsigned char* buf1 = m_bufferedBytes;
    const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data);
    // Pull the first bytes from the buffered bytes, then from the new data;
    // a position past the end of the input reads as 0.
    unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
    unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
    // The third byte is only ever read from `data`.
    // NOTE(review): this assumes at most two bytes are ever buffered - confirm
    // against the declaration of m_bufferedBytes.
    unsigned char c3 = buf2Len ? (--buf2Len, *buf2++) : 0;

    const TextEncoding* encodingConsideringBOM = &m_encoding;
    if (c1 == 0xFF && c2 == 0xFE)
        encodingConsideringBOM = &UTF16LittleEndianEncoding();
    else if (c1 == 0xFE && c2 == 0xFF)
        encodingConsideringBOM = &UTF16BigEndianEncoding();
    else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF)
        encodingConsideringBOM = &UTF8Encoding();
    else if (numBufferedBytes + length <= sizeof(m_bufferedBytes) && !flush) {
        // Continue to look for the BOM.
        memcpy(&m_bufferedBytes[numBufferedBytes], data, length);
        m_numBufferedBytes += length;
        return "";
    }

    // Done checking for BOM.
    m_codec.set(newTextCodec(*encodingConsideringBOM).release());
    if (!m_codec)
        return String();
    m_checkedForBOM = true;

    // Handle case where we have some buffered bytes to deal with.
    if (numBufferedBytes) {
        // Copy the buffered bytes into a plain char array for the codec.
        char bufferedBytes[sizeof(m_bufferedBytes)];
        memcpy(bufferedBytes, m_bufferedBytes, numBufferedBytes);
        m_numBufferedBytes = 0;

        // Decode the buffered bytes in their own statement. The operands of
        // operator+ are not sequenced relative to each other, so writing both
        // decode() calls in a single expression would allow the new data to be
        // fed to the codec before the older, buffered bytes.
        String bufferedResult = m_codec->decode(bufferedBytes, numBufferedBytes, false);
        return bufferedResult + m_codec->decode(data, length, flush);
    }

    return m_codec->decode(data, length, flush);
}
// Maps either UTF-16 encoding to UTF-8; every other encoding is returned
// unchanged.
const TextEncoding& TextEncoding::closest8BitEquivalent() const
{
    const bool isUTF16 = *this == UTF16BigEndianEncoding()
        || *this == UTF16LittleEndianEncoding();
    if (!isUTF16)
        return *this;

    return UTF8Encoding();
}
// Example #4
// 0
// Looks for a byte-order mark at the start of the stream, creates the codec
// for the resulting encoding, strips the BOM and decodes the remaining bytes.
//
// The leading bytes are taken from m_bufferedBytes (bytes kept from earlier
// calls) followed by `data`. Recognized marks are UTF-16LE (FF FE), UTF-32LE
// (FF FE 00 00), UTF-8 (EF BB BF), UTF-16BE (FE FF) and UTF-32BE (00 00 FE FF);
// a recognized mark overrides m_encoding. When no decision can be made yet,
// everything seen so far still fits in m_bufferedBytes and `flush` is false,
// the new bytes are buffered and an empty string is returned.
//
// data        - newly received bytes.
// length      - number of bytes in `data`.
// flush       - forwarded to the codec for `data`; when true, no further
//               buffering is done and the decision is made immediately.
// stopOnError - forwarded to the codec; if set and the codec reports an error
//               while decoding the buffered bytes, `data` is not decoded.
// sawError    - forwarded to the codec, which reports errors through it.
//
// Returns the decoded text (without the BOM), or a default-constructed String
// if no codec could be created.
String TextDecoder::checkForBOM(const char* data, size_t length, bool flush, bool stopOnError, bool& sawError)
{
    ASSERT(!m_checkedForBOM);

    // Check to see if we found a BOM.
    size_t numBufferedBytes = m_numBufferedBytes;
    size_t buf1Len = numBufferedBytes;
    size_t buf2Len = length;
    const unsigned char* buf1 = m_bufferedBytes;
    const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data);
    // Pull the first bytes from the buffered bytes, then from the new data;
    // a position past the end of the input reads as 0.
    unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
    unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
    unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
    // The fourth byte is only ever read from `data`.
    // NOTE(review): this assumes at most three bytes are ever buffered -
    // confirm against the declaration of m_bufferedBytes.
    unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0;

    const TextEncoding* encodingConsideringBOM = &m_encoding;
    bool foundBOM = true;
    size_t lengthOfBOM = 0;
    if (c1 == 0xFF && c2 == 0xFE) {
        // FF FE is both the UTF-16LE BOM and the start of the UTF-32LE BOM.
        if (c3 != 0 || c4 != 0)  {
            encodingConsideringBOM = &UTF16LittleEndianEncoding();
            lengthOfBOM = 2;
        } else if (numBufferedBytes + length > sizeof(m_bufferedBytes)) {
            // More bytes have been seen than the buffer can hold and the two
            // bytes after FF FE are both zero.
            encodingConsideringBOM = &UTF32LittleEndianEncoding();
            lengthOfBOM = 4;
        } else
            // Too few bytes so far to tell UTF-16LE and UTF-32LE apart.
            foundBOM = false;
    } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
        encodingConsideringBOM = &UTF8Encoding();
        lengthOfBOM = 3;
    } else if (c1 == 0xFE && c2 == 0xFF) {
        encodingConsideringBOM = &UTF16BigEndianEncoding();
        lengthOfBOM = 2;
    } else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF) {
        encodingConsideringBOM = &UTF32BigEndianEncoding();
        lengthOfBOM = 4;
    } else
        foundBOM = false;

    if (!foundBOM && numBufferedBytes + length <= sizeof(m_bufferedBytes) && !flush) {
        // Continue to look for the BOM.
        memcpy(&m_bufferedBytes[numBufferedBytes], data, length);
        m_numBufferedBytes += length;
        return "";
    }

    // Done checking for BOM.
    m_codec.set(newTextCodec(*encodingConsideringBOM).release());
    if (!m_codec)
        return String();
    m_checkedForBOM = true;

    // Skip the BOM.
    size_t numBufferedBOMBytes = 0;
    if (foundBOM) {
        // The BOM may lie partly or entirely in the buffered bytes. Because the
        // FF FE case can be deferred, the buffer may even hold payload bytes
        // after the BOM (for example FF FE 00 buffered, then a non-zero byte
        // arrives and the stream turns out to be UTF-16LE). Split the BOM into
        // its buffered and unbuffered parts instead of assuming the buffer is
        // shorter than the BOM, which would underflow the subtraction below.
        numBufferedBOMBytes = numBufferedBytes < lengthOfBOM ? numBufferedBytes : lengthOfBOM;
        size_t numUnbufferedBOMBytes = lengthOfBOM - numBufferedBOMBytes;
        ASSERT(numUnbufferedBOMBytes <= length);

        data += numUnbufferedBOMBytes;
        length -= numUnbufferedBOMBytes;
        // Whatever remains buffered after the BOM is payload and still has to
        // be decoded ahead of `data`.
        numBufferedBytes -= numBufferedBOMBytes;
        m_numBufferedBytes = 0;
    }

    // Handle case where we have some buffered bytes to deal with.
    if (numBufferedBytes) {
        // Copy the buffered payload (past any buffered BOM bytes) into a plain
        // char array for the codec.
        char bufferedBytes[sizeof(m_bufferedBytes)];
        memcpy(bufferedBytes, m_bufferedBytes + numBufferedBOMBytes, numBufferedBytes);
        m_numBufferedBytes = 0;

        String bufferedResult = m_codec->decode(bufferedBytes, numBufferedBytes, false, stopOnError, sawError);
        if (stopOnError && sawError)
            return bufferedResult;
        return bufferedResult + m_codec->decode(data, length, flush, stopOnError, sawError);
    }

    return m_codec->decode(data, length, flush, stopOnError, sawError);
}