Esempio n. 1
0
/*
 * Converts the UTF-8 bytes in [*sourceStart, sourceEnd) to UTF-32 code
 * points stored in [*targetStart, targetEnd).
 *
 * On return *sourceStart and *targetStart are advanced past the input that
 * was consumed and the output that was written, so a caller can resume
 * after supplying more input or a larger output buffer.
 *
 * flags: with strictConversion a decoded surrogate (UNI_SUR_HIGH_START ..
 *        UNI_SUR_LOW_END) aborts the conversion; with any other value it is
 *        replaced by UNI_REPLACEMENT_CHAR.
 *
 * Returns:
 *   conversionOK     - all input was converted without error.
 *   sourceExhausted  - the last sequence is truncated; *sourceStart is left
 *                      at its lead byte.
 *   targetExhausted  - no output slot is left; *sourceStart is left at the
 *                      sequence that did not fit.
 *   sourceIllegal    - isLegalUTF8() rejected a sequence or a surrogate was
 *                      found in strict mode (both stop at the offending
 *                      sequence), or a value above UNI_MAX_LEGAL_UTF32 was
 *                      replaced by UNI_REPLACEMENT_CHAR (the conversion
 *                      keeps going in that case).
 */
ConversionResult
ConvertUTF8toUTF32( const UTF8**    sourceStart,
                    const UTF8*     sourceEnd,
                    UTF32**         targetStart,
                    UTF32*          targetEnd,
                    ConversionFlags flags )
{
    ConversionResult result = conversionOK;
    const UTF8*      source = *sourceStart;
    UTF32*           target = *targetStart;

    while( source < sourceEnd )
    {
        UTF32          ch = 0;
        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];

        /*
         * Compare against the number of bytes that remain instead of forming
         * source + extraBytesToRead: that pointer could lie more than one
         * element past the end of the buffer, which is undefined behaviour.
         */
        if( extraBytesToRead >= sourceEnd - source )
        {
            result = sourceExhausted;
            break;
        }
        /* Do this check whether lenient or strict */
        if( !isLegalUTF8( source, extraBytesToRead + 1 ) )
        {
            result = sourceIllegal;
            break;
        }
        /*
         * Accumulate the sequence six bits at a time. Every case falls
         * through on purpose so that exactly extraBytesToRead + 1 bytes are
         * consumed.
         */
        switch( extraBytesToRead )
        {
            case 5:
                ch += *source++; ch <<= 6;
                /* fall through */

            case 4:
                ch += *source++; ch <<= 6;
                /* fall through */

            case 3:
                ch += *source++; ch <<= 6;
                /* fall through */

            case 2:
                ch += *source++; ch <<= 6;
                /* fall through */

            case 1:
                ch += *source++; ch <<= 6;
                /* fall through */

            case 0:
                ch += *source++;
        }
        /* Remove the lead/continuation marker bits summed in above. */
        ch -= offsetsFromUTF8[extraBytesToRead];

        if( target >= targetEnd )
        {
            /* Back up the source pointer so the caller can retry. */
            source -= ( extraBytesToRead + 1 );
            result = targetExhausted;
            break;
        }
        if( ch <= UNI_MAX_LEGAL_UTF32 )
        {
            /*
             * UTF-16 surrogate values are illegal in UTF-32, and anything
             * over Plane 17 (> 0x10FFFF) is illegal.
             */
            if( ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END )
            {
                if( flags == strictConversion )
                {
                    /* Return to the illegal value itself. */
                    source -= ( extraBytesToRead + 1 );
                    result = sourceIllegal;
                    break;
                }
                else
                {
                    *target++ = UNI_REPLACEMENT_CHAR;
                }
            }
            else
            {
                *target++ = ch;
            }
        }
        else /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
        {
            result = sourceIllegal;
            *target++ = UNI_REPLACEMENT_CHAR;
        }
    }

    *sourceStart = source;
    *targetStart = target;
    return result;
}
Esempio n. 2
0
/*
 * Converts the UTF-8 bytes in [*sourceStart, sourceEnd) to UTF-16 code
 * units stored in [*targetStart, targetEnd).
 *
 * On return *sourceStart and *targetStart are advanced past the input that
 * was consumed and the output that was written, so a caller can resume
 * after supplying more input or a larger output buffer.
 *
 * flags: with strictConversion a decoded surrogate or a value above
 *        UNI_MAX_UTF16 aborts the conversion with sourceIllegal; with any
 *        other value it is replaced by UNI_REPLACEMENT_CHAR.
 *
 * Returns:
 *   conversionOK     - all input was converted.
 *   sourceExhausted  - the last sequence is truncated; *sourceStart is left
 *                      at its lead byte.
 *   targetExhausted  - the output cannot hold the next character (one unit,
 *                      or two for a surrogate pair); *sourceStart is left at
 *                      the sequence that did not fit.
 *   sourceIllegal    - isLegalUTF8() rejected a sequence, or strict mode met
 *                      a surrogate or out-of-range value; *sourceStart is
 *                      left at the offending sequence.
 */
ConversionResult
ConvertUTF8toUTF16( const UTF8**    sourceStart,
                    const UTF8*     sourceEnd,
                    UTF16**         targetStart,
                    UTF16*          targetEnd,
                    ConversionFlags flags )
{
    ConversionResult result = conversionOK;
    const UTF8*      source = *sourceStart;
    UTF16*           target = *targetStart;

    while( source < sourceEnd )
    {
        UTF32          ch = 0;
        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];

        /*
         * Compare against the number of bytes that remain instead of forming
         * source + extraBytesToRead: that pointer could lie more than one
         * element past the end of the buffer, which is undefined behaviour.
         */
        if( extraBytesToRead >= sourceEnd - source )
        {
            result = sourceExhausted;
            break;
        }
        /* Do this check whether lenient or strict */
        if( !isLegalUTF8( source, extraBytesToRead + 1 ) )
        {
            result = sourceIllegal;
            break;
        }
        /*
         * Accumulate the sequence six bits at a time. Every case falls
         * through on purpose so that exactly extraBytesToRead + 1 bytes are
         * consumed.
         */
        switch( extraBytesToRead )
        {
            case 5:
                ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
                /* fall through */

            case 4:
                ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
                /* fall through */

            case 3:
                ch += *source++; ch <<= 6;
                /* fall through */

            case 2:
                ch += *source++; ch <<= 6;
                /* fall through */

            case 1:
                ch += *source++; ch <<= 6;
                /* fall through */

            case 0:
                ch += *source++;
        }
        /* Remove the lead/continuation marker bits summed in above. */
        ch -= offsetsFromUTF8[extraBytesToRead];

        if( target >= targetEnd )
        {
            /* Back up the source pointer so the caller can retry. */
            source -= ( extraBytesToRead + 1 );
            result = targetExhausted;
            break;
        }
        if( ch <= UNI_MAX_BMP ) /* Target is a character <= 0xFFFF */
        {
            /* A surrogate value is not a legal decoded character. */
            if( ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END )
            {
                if( flags == strictConversion )
                {
                    /* Return to the illegal value itself. */
                    source -= ( extraBytesToRead + 1 );
                    result = sourceIllegal;
                    break;
                }
                else
                {
                    *target++ = UNI_REPLACEMENT_CHAR;
                }
            }
            else
            {
                *target++ = (UTF16)ch; /* normal case */
            }
        }
        else if( ch > UNI_MAX_UTF16 )
        {
            if( flags == strictConversion )
            {
                result = sourceIllegal;
                source -= ( extraBytesToRead + 1 ); /* return to the start */
                break; /* Bail out; shouldn't continue */
            }
            else
            {
                *target++ = UNI_REPLACEMENT_CHAR;
            }
        }
        else
        {
            /*
             * Above UNI_MAX_BMP but within UNI_MAX_UTF16: the character is
             * written as a surrogate pair and therefore needs two slots.
             */
            if( target + 1 >= targetEnd )
            {
                /* Back up the source pointer so the caller can retry. */
                source -= ( extraBytesToRead + 1 );
                result = targetExhausted;
                break;
            }
            ch -= halfBase;
            *target++ = (UTF16)( ( ch >> halfShift ) + UNI_SUR_HIGH_START );
            *target++ = (UTF16)( ( ch & halfMask ) + UNI_SUR_LOW_START );
        }
    }

    *sourceStart = source;
    *targetStart = target;
    return result;
}
Esempio n. 3
0
// Converts the UTF-8 bytes in [*sourceStart, sourceEnd) into UTF-16 code
// units written to [*targetStart, targetEnd).
//
// *sourceStart and *targetStart are updated to the positions reached.
// When sourceAllASCII is non-null it receives true only if no code unit
// with a bit above 0x7f was written.
// With strict set, a surrogate or otherwise unencodable value stops the
// conversion with sourceIllegal; without it such values are written as
// replacementCharacter.
//
// Returns conversionOK, sourceExhausted (truncated final sequence),
// targetExhausted (no room for the next character) or sourceIllegal. In
// every failure case the source position is left at the start of the
// sequence that could not be converted.
ConversionResult convertUTF8ToUTF16(
    const char** sourceStart, const char* sourceEnd,
    UChar** targetStart, UChar* targetEnd, bool* sourceAllASCII, bool strict)
{
    ConversionResult status = conversionOK;
    const char* cursor = *sourceStart;
    UChar* out = *targetStart;
    // Bitwise OR of everything written so far; drives the ASCII-only report.
    UChar seenBits = 0;

    while (cursor < sourceEnd) {
        int sequenceLength = inlineUTF8SequenceLength(*cursor);

        if (sourceEnd - cursor < sequenceLength)  {
            status = sourceExhausted;
            break;
        }

        // Validation happens in both strict and lenient mode.
        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(cursor), sequenceLength)) {
            status = sourceIllegal;
            break;
        }

        // NOTE(review): readUTF8Sequence presumably advances the cursor past
        // the sequence; the back-up steps below depend on that - confirm.
        UChar32 codePoint = readUTF8Sequence(cursor, sequenceLength);

        if (out >= targetEnd) {
            cursor -= sequenceLength; // rewind to the unconverted sequence
            status = targetExhausted;
            break;
        }

        if (U_IS_BMP(codePoint)) {
            if (!U_IS_SURROGATE(codePoint)) {
                // Ordinary single-unit character.
                *out++ = static_cast<UChar>(codePoint);
                seenBits |= codePoint;
                continue;
            }

            // A surrogate value is not a valid decoded character.
            if (strict) {
                cursor -= sequenceLength; // rewind to the illegal value
                status = sourceIllegal;
                break;
            }
            *out++ = replacementCharacter;
            seenBits |= replacementCharacter;
            continue;
        }

        if (U_IS_SUPPLEMENTARY(codePoint)) {
            // Needs a lead/trail pair, hence two free slots.
            if (out + 1 >= targetEnd) {
                cursor -= sequenceLength; // rewind to the unconverted sequence
                status = targetExhausted;
                break;
            }
            *out++ = U16_LEAD(codePoint);
            *out++ = U16_TRAIL(codePoint);
            seenBits = 0xffff;
            continue;
        }

        // Neither BMP nor supplementary: not encodable.
        if (strict) {
            cursor -= sequenceLength; // rewind to the start of the sequence
            status = sourceIllegal;
            break; // give up; do not continue converting
        }
        *out++ = replacementCharacter;
        seenBits |= replacementCharacter;
    }

    *sourceStart = cursor;
    *targetStart = out;

    if (sourceAllASCII)
        *sourceAllASCII = !(seenBits & ~0x7f);

    return status;
}
Esempio n. 4
0
/*
 * Converts the UTF-8 bytes in [*sourceStart, sourceEnd) to UTF-32 code
 * points stored in [*targetStart, targetEnd).
 *
 * On return *sourceStart and *targetStart are advanced past the input that
 * was consumed and the output that was written.
 *
 * flags:          with strictConversion the first ill-formed sequence or
 *                 surrogate aborts the conversion; otherwise it is replaced
 *                 by UNI_REPLACEMENT_CHAR and conversion continues.
 * InputIsPartial: when set, a truncated sequence at the end of the input is
 *                 reported as sourceExhausted even in lenient mode instead
 *                 of being replaced.
 *
 * Returns conversionOK, sourceExhausted, targetExhausted or sourceIllegal.
 * In lenient mode sourceIllegal may be returned although the whole input
 * was consumed; it then records that at least one replacement was made.
 */
static ConversionResult ConvertUTF8toUTF32Impl(
        const UTF8** sourceStart, const UTF8* sourceEnd,
        UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags,
        Boolean InputIsPartial) {
    ConversionResult result = conversionOK;
    const UTF8* source = *sourceStart;
    UTF32* target = *targetStart;
    while (source < sourceEnd) {
        UTF32 ch = 0;
        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
        if (extraBytesToRead >= sourceEnd - source) {
            /* The sequence announced by the lead byte is cut off. */
            if (flags == strictConversion || InputIsPartial) {
                result = sourceExhausted;
                break;
            } else {
                /*
                 * This path writes to the target, so it needs its own room
                 * check: the general one further down has not run yet.
                 */
                if (target >= targetEnd) {
                    result = targetExhausted;
                    break;
                }
                result = sourceIllegal;

                /*
                 * Replace the maximal subpart of ill-formed sequence with
                 * replacement character.
                 */
                source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
                                                                    sourceEnd);
                *target++ = UNI_REPLACEMENT_CHAR;
                continue;
            }
        }
        if (target >= targetEnd) {
            result = targetExhausted;
            break;
        }

        /* Do this check whether lenient or strict */
        if (!isLegalUTF8(source, extraBytesToRead+1)) {
            result = sourceIllegal;
            if (flags == strictConversion) {
                /* Abort conversion. */
                break;
            } else {
                /*
                 * Replace the maximal subpart of ill-formed sequence with
                 * replacement character.
                 */
                source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
                                                                    sourceEnd);
                *target++ = UNI_REPLACEMENT_CHAR;
                continue;
            }
        }
        /*
         * Accumulate the sequence six bits at a time. Every case falls
         * through on purpose so that exactly extraBytesToRead + 1 bytes are
         * consumed.
         */
        switch (extraBytesToRead) {
            case 5: ch += *source++; ch <<= 6; /* fall through */
            case 4: ch += *source++; ch <<= 6; /* fall through */
            case 3: ch += *source++; ch <<= 6; /* fall through */
            case 2: ch += *source++; ch <<= 6; /* fall through */
            case 1: ch += *source++; ch <<= 6; /* fall through */
            case 0: ch += *source++;
        }
        /* Remove the lead/continuation marker bits summed in above. */
        ch -= offsetsFromUTF8[extraBytesToRead];

        if (ch <= UNI_MAX_LEGAL_UTF32) {
            /*
             * UTF-16 surrogate values are illegal in UTF-32, and anything
             * over Plane 17 (> 0x10FFFF) is illegal.
             */
            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
                if (flags == strictConversion) {
                    source -= (extraBytesToRead+1); /* return to the illegal value itself */
                    result = sourceIllegal;
                    break;
                } else {
                    *target++ = UNI_REPLACEMENT_CHAR;
                }
            } else {
                *target++ = ch;
            }
        } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
            result = sourceIllegal;
            *target++ = UNI_REPLACEMENT_CHAR;
        }
    }
    *sourceStart = source;
    *targetStart = target;
    return result;
}
Esempio n. 5
0
// Decodes UTF-8 bytes from [fromBegin, fromEnd) into Char values stored in
// [toBegin, toEnd).
//
// s        conversion state; s.value.mbytes buffers the bytes of a sequence
//          that is not complete yet and s.n counts them.
// fromNext set to the first input byte that was not consumed.
// toNext   set to one past the last Char that was written.
//
// A leading byte order mark (EF BB BF) is skipped. Decoded values above
// MaxLegalUtf32 or inside SurHighStart..SurLowEnd are stored as
// ReplacementChar.
//
// Returns ok, partial (output full, or the buffered sequence is still
// incomplete) or error (isLegalUTF8 rejected the buffered sequence).
Utf8Codec::result Utf8Codec::do_in(MBState& s, const char* fromBegin, const char* fromEnd, const char*& fromNext,
                                   Char* toBegin, Char* toEnd, Char*& toNext) const
{
    Utf8Codec::result status = ok;
    fromNext = fromBegin;
    toNext = toBegin;

    // Nothing to decode.
    if (fromBegin == fromEnd)
        return ok;

    const bool fewerThanThreeBytes = numBytes(s, fromBegin, fromEnd) < 3;
    const bool firstByteIsEF = getByte(s, fromBegin, fromEnd, 0) == '\xef';

    if (fewerThanThreeBytes)
    {
        if (firstByteIsEF)
        {
            // Could be the beginning of a byte order mark: stash all the
            // input in the state and wait for the next call.
            for (; fromNext < fromEnd; ++fromNext)
                s.value.mbytes[s.n++] = *fromNext;
            return ok;
        }
    }
    else if (firstByteIsEF
             && getByte(s, fromBegin, fromEnd, 1) == '\xbb'
             && getByte(s, fromBegin, fromEnd, 2) == '\xbf')
    {
        // Complete byte order mark: drop its three bytes, which may be
        // spread over the buffered state and the fresh input.
        if (s.n > 3)
        {
            std::memmove(s.value.mbytes, s.value.mbytes + 3, s.n - 3);
            s.n -= 3;
        }
        else
        {
            fromNext += 3 - s.n;
            s.n = 0;
        }
    }

    while (fromNext < fromEnd)
    {
        // No room for another character.
        if (toNext >= toEnd)
        {
            status = partial;
            break;
        }

        // Move one more input byte into the state while it has room.
        if (s.n < sizeof(s.value.mbytes))
        {
            s.value.mbytes[s.n++] = *fromNext++;
        }

        uint8_t* seq = reinterpret_cast<uint8_t *>(&s.value.mbytes[0]);
        uint8_t* seqEnd = seq + s.n;

        const size_t extraBytesToRead = trailingBytesForUTF8[*seq];

        // The buffered sequence is still incomplete: stop here and report
        // partial; the bytes stay in the state.
        if (seq + extraBytesToRead >= seqEnd)
        {
            status = partial;
            break;
        }

        if (!isLegalUTF8(seq, extraBytesToRead + 1))
        {
            status = error;
            break;
        }

        // Accumulate the sequence six bits at a time directly in the output
        // slot; every case falls through on purpose.
        *toNext = Char(0);
        switch (extraBytesToRead)
        {
            case 5: // not expected for legal UTF-8
                *toNext = Char((toNext->value() + *seq++) << 6);
                // fall through
            case 4: // not expected for legal UTF-8
                *toNext = Char((toNext->value() + *seq++) << 6);
                // fall through
            case 3:
                *toNext = Char((toNext->value() + *seq++) << 6);
                // fall through
            case 2:
                *toNext = Char((toNext->value() + *seq++) << 6);
                // fall through
            case 1:
                *toNext = Char((toNext->value() + *seq++) << 6);
                // fall through
            case 0:
                *toNext = Char((toNext->value() + *seq++));
        }

        // Remove the lead/continuation marker bits summed in above.
        *toNext = Char(toNext->value() - offsetsFromUTF8[extraBytesToRead]);

        // Values beyond the legal range and UTF-16 surrogate values are not
        // valid characters; substitute the replacement character.
        if (*toNext > MaxLegalUtf32
            || (*toNext >= SurHighStart && *toNext <= SurLowEnd))
        {
            *toNext = ReplacementChar;
        }

        // Character finished: advance the output and empty the state.
        ++toNext;
        s.n = 0;
    }

    return status;
}