Esempio n. 1
0
// Decodes |length| bytes starting at |bytes| into a String backed by 16-bit
// (UChar) storage of exactly |length| characters: every input byte yields one
// output character.
//
// ASCII bytes are copied through unchanged; every other byte is mapped through
// |table| (defined outside this function). The three bool parameters and the
// bool& out-parameter are unnamed and not used by this implementation.
String TextCodecLatin1::decode(const char* bytes, size_t length, bool, bool, bool&)
{
    UChar* characters;
    String result = String::createUninitialized(length, characters);

    const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
    const uint8_t* end = reinterpret_cast<const uint8_t*>(bytes + length);
    // NOTE(review): presumably |end| rounded down to a machine-word boundary, so
    // whole-word reads below never run past the input - confirm against the helper.
    const uint8_t* alignedEnd = alignToMachineWord(end);
    UChar* destination = characters;

    while (source < end) {
        if (isASCII(*source)) {
            // Fast path for ASCII. Most Latin-1 text will be ASCII.
            if (isAlignedToMachineWord(source)) {
                while (source < alignedEnd) {
                    MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);

                    // A word holding any non-ASCII byte is handed to the
                    // byte-at-a-time code, starting with the current byte.
                    if (!isAllASCII<LChar>(chunk))
                        goto useLookupTable;

                    copyASCIIMachineWord(destination, source);
                    source += sizeof(MachineWord);
                    destination += sizeof(MachineWord);
                }

                if (source == end)
                    break;

                // The word loop advanced |source|, so the byte it now points at
                // has not been classified yet. It must not be copied verbatim
                // if it is non-ASCII; send it through the lookup table instead.
                if (!isASCII(*source))
                    goto useLookupTable;
            }
            *destination = *source;
        } else {
useLookupTable:
            *destination = table[*source];
        }

        ++source;
        ++destination;
    }

    return result;
}
Esempio n. 2
0
// Decodes |length| bytes starting at |bytes| into a String of exactly |length|
// characters: every input byte yields one output character.
//
// Decoding starts into 8-bit (LChar) storage. ASCII bytes are copied through
// unchanged; every other byte is mapped through |table| (defined outside this
// function). The first time a mapped value does not fit in 8 bits (> 0xff),
// a 16-bit (UChar) String is allocated, the characters decoded so far are
// zero-extended into it, and decoding continues there.
//
// Returns emptyString() for zero-length input. The three bool parameters and
// the bool& out-parameter are unnamed and not used by this implementation.
String TextCodecLatin1::decode(const char* bytes, size_t length, bool, bool, bool&)
{
    LChar* characters;
    if (!length)
        return emptyString();
    String result = String::createUninitialized(length, characters);

    const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
    const uint8_t* end = reinterpret_cast<const uint8_t*>(bytes + length);
    // NOTE(review): presumably |end| rounded down to a machine-word boundary, so
    // whole-word reads below never run past the input - confirm against the helper.
    const uint8_t* alignedEnd = alignToMachineWord(end);
    LChar* destination = characters;

    while (source < end) {
        if (isASCII(*source)) {
            // Fast path for ASCII. Most Latin-1 text will be ASCII.
            if (isAlignedToMachineWord(source)) {
                while (source < alignedEnd) {
                    MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);

                    // A word holding any non-ASCII byte is handed to the
                    // byte-at-a-time code, starting with the current byte.
                    if (!isAllASCII<LChar>(chunk))
                        goto useLookupTable;

                    copyASCIIMachineWord(destination, source);
                    source += sizeof(MachineWord);
                    destination += sizeof(MachineWord);
                }

                if (source == end)
                    break;

                // The word loop advanced |source|, so the byte it now points at
                // has not been classified yet. Copying it verbatim would skip
                // both the table mapping and the 16-bit up-conversion check.
                if (!isASCII(*source))
                    goto useLookupTable;
            }
            *destination = *source;
        } else {
useLookupTable:
            // A mapped value that does not fit in LChar forces 16-bit output.
            if (table[*source] > 0xff)
                goto upConvertTo16Bit;

            *destination = table[*source];
        }

        ++source;
        ++destination;
    }

    return result;

upConvertTo16Bit:
    UChar* characters16;
    String result16 = String::createUninitialized(length, characters16);

    UChar* destination16 = characters16;

    // Zero extend and copy already processed 8 bit data
    LChar* ptr8 = characters;
    LChar* endPtr8 = destination;

    while (ptr8 < endPtr8)
        *destination16++ = *ptr8++;

    // Handle the character that triggered the 16 bit path
    *destination16 = table[*source];
    ++source;
    ++destination16;

    while (source < end) {
        if (isASCII(*source)) {
            // Fast path for ASCII. Most Latin-1 text will be ASCII.
            if (isAlignedToMachineWord(source)) {
                while (source < alignedEnd) {
                    MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);

                    if (!isAllASCII<LChar>(chunk))
                        goto useLookupTable16;

                    copyASCIIMachineWord(destination16, source);
                    source += sizeof(MachineWord);
                    destination16 += sizeof(MachineWord);
                }

                if (source == end)
                    break;

                // Same as in the 8-bit loop: the byte now at |source| has not
                // been classified, so a non-ASCII byte must go through |table|.
                if (!isASCII(*source))
                    goto useLookupTable16;
            }
            *destination16 = *source;
        } else {
useLookupTable16:
            *destination16 = table[*source];
        }

        ++source;
        ++destination16;
    }

    return result16;
}
Esempio n. 3
0
// Decodes |length| bytes of UTF-8 starting at |bytes| and returns the result
// as a String.
//
// Output is first written to an 8-bit (LChar) buffer. Decoding moves to a
// 16-bit (UChar) buffer (label upConvertTo16Bit) when a decoded character is
// greater than 0xff, when an invalid sequence is found and |stopOnError| is
// false, or when handlePartialSequence() returns true in the 8-bit pass.
//
// A multi-byte sequence that is cut off by the end of the input is saved in
// m_partialSequence / m_partialSequenceSize. When m_partialSequenceSize is
// non-zero on entry to either loop, handlePartialSequence() is called first.
//
// |flush|       - forwarded to handlePartialSequence(); while it is set and a
//                 partial sequence remains, the outer do/while runs again.
// |stopOnError| - when set, decoding stops at the first invalid sequence
//                 found by the loops below.
// |sawError|    - set to true when the loops below find an invalid sequence;
//                 also forwarded to handlePartialSequence().
String TextCodecUTF8::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
{
    // Each input byte might turn into a character.
    // That includes all bytes in the partial-sequence buffer because
    // each byte in an invalid sequence will turn into a replacement character.
    StringBuffer<LChar> buffer(m_partialSequenceSize + length);

    const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
    const uint8_t* end = source + length;
    // NOTE(review): presumably |end| rounded down to a machine-word boundary so
    // the whole-word reads below stay inside the input - confirm against the helper.
    const uint8_t* alignedEnd = alignToMachineWord(end);
    LChar* destination = buffer.characters();

    // 8-bit pass.
    do {
        if (m_partialSequenceSize) {
            // Explicitly copy destination and source pointers to avoid taking pointers to the
            // local variables, which may harm code generation by disabling some optimizations
            // in some compilers.
            LChar* destinationForHandlePartialSequence = destination;
            const uint8_t* sourceForHandlePartialSequence = source;
            // A true return sends us to the 16-bit pass; only |source| is
            // taken back in that case, |destination| is left as it was.
            if (handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError)) {
                source = sourceForHandlePartialSequence;
                goto upConvertTo16Bit;
            }
            destination = destinationForHandlePartialSequence;
            source = sourceForHandlePartialSequence;
            // Still an unfinished sequence: leave the main loop for this round.
            if (m_partialSequenceSize)
                break;
        }

        while (source < end) {
            if (isASCII(*source)) {
                // Fast path for ASCII. Most UTF-8 text will be ASCII.
                if (isAlignedToMachineWord(source)) {
                    while (source < alignedEnd) {
                        MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
                        if (!isAllASCII<LChar>(chunk))
                            break;
                        copyASCIIMachineWord(destination, source);
                        source += sizeof(MachineWord);
                        destination += sizeof(MachineWord);
                    }
                    if (source == end)
                        break;
                    // |source| moved in the word loop, so re-classify the byte
                    // it now points at before copying it as ASCII.
                    if (!isASCII(*source))
                        continue;
                }
                *destination++ = *source++;
                continue;
            }
            // A zero count from nonASCIISequenceLength() is treated as an
            // invalid sequence.
            int count = nonASCIISequenceLength(*source);
            int character;
            if (!count)
                character = nonCharacter;
            else {
                if (count > end - source) {
                    // The sequence runs past the end of the input: stash the
                    // bytes we have in m_partialSequence and consume them.
                    ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
                    ASSERT(!m_partialSequenceSize);
                    m_partialSequenceSize = end - source;
                    memcpy(m_partialSequence, source, m_partialSequenceSize);
                    source = end;
                    break;
                }
                character = decodeNonASCIISequence(source, count);
            }
            if (character == nonCharacter) {
                sawError = true;
                if (stopOnError)
                    break;

                // |source| has not been advanced, so the 16-bit pass sees the
                // same invalid sequence again and writes the replacement there.
                goto upConvertTo16Bit;
            }
            // Does not fit in LChar; |source| has not been advanced, so the
            // 16-bit pass decodes this sequence again.
            if (character > 0xff)
                goto upConvertTo16Bit;

            source += count;
            *destination++ = character;
        }
    } while (flush && m_partialSequenceSize);

    // Trim the buffer to the number of characters actually written.
    buffer.shrink(destination - buffer.characters());

    return String::adopt(buffer);

upConvertTo16Bit:
    // 16-bit pass: continues from the current |source| into a UChar buffer.
    StringBuffer<UChar> buffer16(m_partialSequenceSize + length);

    UChar* destination16 = buffer16.characters();

    // Copy the already converted characters
    for (LChar* converted8 = buffer.characters(); converted8 < destination;)
        *destination16++ = *converted8++;

    do {
        if (m_partialSequenceSize) {
            // Explicitly copy destination and source pointers to avoid taking pointers to the
            // local variables, which may harm code generation by disabling some optimizations
            // in some compilers.
            UChar* destinationForHandlePartialSequence = destination16;
            const uint8_t* sourceForHandlePartialSequence = source;
            // The return value is not used here, unlike in the 8-bit pass.
            handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError);
            destination16 = destinationForHandlePartialSequence;
            source = sourceForHandlePartialSequence;
            if (m_partialSequenceSize)
                break;
        }

        while (source < end) {
            if (isASCII(*source)) {
                // Fast path for ASCII. Most UTF-8 text will be ASCII.
                if (isAlignedToMachineWord(source)) {
                    while (source < alignedEnd) {
                        MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
                        if (!isAllASCII<LChar>(chunk))
                            break;
                        copyASCIIMachineWord(destination16, source);
                        source += sizeof(MachineWord);
                        destination16 += sizeof(MachineWord);
                    }
                    if (source == end)
                        break;
                    // Re-classify the byte |source| now points at before
                    // copying it as ASCII.
                    if (!isASCII(*source))
                        continue;
                }
                *destination16++ = *source++;
                continue;
            }
            int count = nonASCIISequenceLength(*source);
            int character;
            if (!count)
                character = nonCharacter;
            else {
                if (count > end - source) {
                    // Incomplete trailing sequence: stash it in
                    // m_partialSequence and consume the remaining input.
                    ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
                    ASSERT(!m_partialSequenceSize);
                    m_partialSequenceSize = end - source;
                    memcpy(m_partialSequence, source, m_partialSequenceSize);
                    source = end;
                    break;
                }
                character = decodeNonASCIISequence(source, count);
            }
            if (character == nonCharacter) {
                sawError = true;
                if (stopOnError)
                    break;
                // Each error generates a replacement character and consumes one byte.
                *destination16++ = replacementCharacter;
                ++source;
                continue;
            }
            source += count;
            // appendCharacter() returns the advanced write position.
            destination16 = appendCharacter(destination16, character);
        }
    } while (flush && m_partialSequenceSize);

    // Trim the buffer to the number of code units actually written.
    buffer16.shrink(destination16 - buffer16.characters());

    return String::adopt(buffer16);
}