/*
 * Converts the UTF-8 bytes in [*sourceStart, sourceEnd) to UTF-32 code units
 * stored in [*targetStart, targetEnd).
 *
 * On return *sourceStart and *targetStart are advanced past everything that
 * was consumed and produced.  The return value is:
 *   conversionOK    - the whole input was processed;
 *   sourceExhausted - the input ends in the middle of a sequence; *sourceStart
 *                     is left on the lead byte of that sequence;
 *   sourceIllegal   - isLegalUTF8() rejected a sequence (*sourceStart is left
 *                     on it), a surrogate was decoded while flags is
 *                     strictConversion (*sourceStart is moved back to it), or
 *                     a value above UNI_MAX_LEGAL_UTF32 was decoded (it is
 *                     replaced and conversion continues);
 *   targetExhausted - no room is left in the target; *sourceStart is moved
 *                     back to the sequence that did not fit.
 */
ConversionResult ConvertUTF8toUTF32(
    const UTF8** sourceStart, const UTF8* sourceEnd,
    UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags )
{
    ConversionResult result = conversionOK;
    const UTF8* source = *sourceStart;
    UTF32* target = *targetStart;

    while( source < sourceEnd )
    {
        UTF32 ch = 0;
        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];

        /*
         * Compare lengths rather than pointers: "source + extraBytesToRead"
         * could point more than one element past the end of the input, which
         * is undefined behaviour even if the pointer is never dereferenced.
         */
        if( extraBytesToRead >= sourceEnd - source )
        {
            result = sourceExhausted;
            break;
        }

        /* Do this check whether lenient or strict */
        if( !isLegalUTF8( source, extraBytesToRead + 1 ) )
        {
            result = sourceIllegal;
            break;
        }

        /*
         * The cases all fall through. See "Note A" below.
         */
        switch( extraBytesToRead )
        {
            case 5: ch += *source++; ch <<= 6; /* fall through */
            case 4: ch += *source++; ch <<= 6; /* fall through */
            case 3: ch += *source++; ch <<= 6; /* fall through */
            case 2: ch += *source++; ch <<= 6; /* fall through */
            case 1: ch += *source++; ch <<= 6; /* fall through */
            case 0: ch += *source++;
        }
        ch -= offsetsFromUTF8[extraBytesToRead];

        if( target >= targetEnd )
        {
            source -= ( extraBytesToRead + 1 ); /* Back up the source pointer! */
            result = targetExhausted;
            break;
        }

        if( ch <= UNI_MAX_LEGAL_UTF32 )
        {
            /*
             * UTF-16 surrogate values are illegal in UTF-32, and anything
             * over Plane 17 (> 0x10FFFF) is illegal.
             */
            if( ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END )
            {
                if( flags == strictConversion )
                {
                    source -= ( extraBytesToRead + 1 ); /* return to the illegal value itself */
                    result = sourceIllegal;
                    break;
                }
                else
                {
                    *target++ = UNI_REPLACEMENT_CHAR;
                }
            }
            else
            {
                *target++ = ch;
            }
        }
        else /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
        {
            /* Reported as illegal, but replaced and skipped in both modes. */
            result = sourceIllegal;
            *target++ = UNI_REPLACEMENT_CHAR;
        }
    }

    *sourceStart = source;
    *targetStart = target;
    return result;
}
/*
 * Converts the UTF-8 bytes in [*sourceStart, sourceEnd) to UTF-16 code units
 * stored in [*targetStart, targetEnd).  Values above UNI_MAX_BMP (and not
 * above UNI_MAX_UTF16) are written as a surrogate pair.
 *
 * On return *sourceStart and *targetStart are advanced past everything that
 * was consumed and produced.  The return value is:
 *   conversionOK    - the whole input was processed;
 *   sourceExhausted - the input ends in the middle of a sequence; *sourceStart
 *                     is left on the lead byte of that sequence;
 *   sourceIllegal   - isLegalUTF8() rejected a sequence (*sourceStart is left
 *                     on it), or flags is strictConversion and a surrogate or
 *                     a value above UNI_MAX_UTF16 was decoded (*sourceStart is
 *                     moved back to it);
 *   targetExhausted - the next one or two code units do not fit in the
 *                     target; *sourceStart is moved back to that sequence.
 *
 * When flags is not strictConversion, surrogates and values above
 * UNI_MAX_UTF16 are replaced by UNI_REPLACEMENT_CHAR without changing the
 * result code.
 */
ConversionResult ConvertUTF8toUTF16(
    const UTF8** sourceStart, const UTF8* sourceEnd,
    UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags )
{
    ConversionResult result = conversionOK;
    const UTF8* source = *sourceStart;
    UTF16* target = *targetStart;

    while( source < sourceEnd )
    {
        UTF32 ch = 0;
        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];

        /*
         * Compare lengths rather than pointers: "source + extraBytesToRead"
         * could point more than one element past the end of the input, which
         * is undefined behaviour even if the pointer is never dereferenced.
         */
        if( extraBytesToRead >= sourceEnd - source )
        {
            result = sourceExhausted;
            break;
        }

        /* Do this check whether lenient or strict */
        if( !isLegalUTF8( source, extraBytesToRead + 1 ) )
        {
            result = sourceIllegal;
            break;
        }

        /*
         * The cases all fall through. See "Note A" below.
         */
        switch( extraBytesToRead )
        {
            case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ /* fall through */
            case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ /* fall through */
            case 3: ch += *source++; ch <<= 6; /* fall through */
            case 2: ch += *source++; ch <<= 6; /* fall through */
            case 1: ch += *source++; ch <<= 6; /* fall through */
            case 0: ch += *source++;
        }
        ch -= offsetsFromUTF8[extraBytesToRead];

        if( target >= targetEnd )
        {
            source -= ( extraBytesToRead + 1 ); /* Back up source pointer! */
            result = targetExhausted;
            break;
        }

        if( ch <= UNI_MAX_BMP ) /* Target is a character <= 0xFFFF */
        {
            /* A surrogate value decoded from UTF-8 is not a legal character */
            if( ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END )
            {
                if( flags == strictConversion )
                {
                    source -= ( extraBytesToRead + 1 ); /* return to the illegal value itself */
                    result = sourceIllegal;
                    break;
                }
                else
                {
                    *target++ = UNI_REPLACEMENT_CHAR;
                }
            }
            else
            {
                *target++ = (UTF16)ch; /* normal case */
            }
        }
        else if( ch > UNI_MAX_UTF16 )
        {
            if( flags == strictConversion )
            {
                result = sourceIllegal;
                source -= ( extraBytesToRead + 1 ); /* return to the start */
                break; /* Bail out; shouldn't continue */
            }
            else
            {
                *target++ = UNI_REPLACEMENT_CHAR;
            }
        }
        else
        {
            /* target is a character in range 0xFFFF - 0x10FFFF. */
            if( target + 1 >= targetEnd )
            {
                /* A surrogate pair needs two free code units. */
                source -= ( extraBytesToRead + 1 ); /* Back up source pointer! */
                result = targetExhausted;
                break;
            }
            ch -= halfBase;
            *target++ = (UTF16)( ( ch >> halfShift ) + UNI_SUR_HIGH_START );
            *target++ = (UTF16)( ( ch & halfMask ) + UNI_SUR_LOW_START );
        }
    }

    *sourceStart = source;
    *targetStart = target;
    return result;
}
// Converts the UTF-8 bytes in [*sourceStart, sourceEnd) to UTF-16 code units
// stored in [*targetStart, targetEnd), advancing both start pointers past
// whatever was consumed and produced.
//
// Returns conversionOK when the whole input was processed, sourceExhausted
// when the input ends inside a sequence, sourceIllegal when a sequence fails
// isLegalUTF8() or (in strict mode) decodes to a surrogate or to a value that
// is neither BMP nor supplementary, and targetExhausted when the output does
// not fit. In every failure case *sourceStart ends up on the first byte of the
// sequence that could not be converted.
//
// If sourceAllASCII is non-null it receives true only when no code unit
// written had a bit set outside the low seven.
ConversionResult convertUTF8ToUTF16(
    const char** sourceStart, const char* sourceEnd,
    UChar** targetStart, UChar* targetEnd, bool* sourceAllASCII, bool strict)
{
    const char* in = *sourceStart;
    UChar* out = *targetStart;
    UChar seenBits = 0;
    ConversionResult status = conversionOK;

    while (in < sourceEnd) {
        int length = inlineUTF8SequenceLength(*in);

        if (sourceEnd - in < length) {
            status = sourceExhausted;
            break;
        }

        // Validation is performed in both lenient and strict mode.
        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(in), length)) {
            status = sourceIllegal;
            break;
        }

        // Decoding moves |in| past the sequence; every failure path below
        // therefore rewinds it by |length|.
        UChar32 codePoint = readUTF8Sequence(in, length);

        if (target_is_full: false) { }
    }

    *sourceStart = in;
    *targetStart = out;
    if (sourceAllASCII)
        *sourceAllASCII = (seenBits & ~0x7f) == 0;
    return status;
}
/*
 * Shared implementation for UTF-8 to UTF-32 conversion.
 *
 * Converts the bytes in [*sourceStart, sourceEnd) to UTF-32 code units stored
 * in [*targetStart, targetEnd) and advances both start pointers past whatever
 * was consumed and produced.
 *
 * A sequence that is cut off by sourceEnd stops the conversion with
 * sourceExhausted when flags is strictConversion or InputIsPartial is set.
 * Otherwise it is treated like any other ill-formed sequence.
 *
 * Ill-formed sequences stop the conversion with sourceIllegal in strict mode.
 * In lenient mode the result is set to sourceIllegal, the bytes reported by
 * findMaximalSubpartOfIllFormedUTF8Sequence() are skipped, one
 * UNI_REPLACEMENT_CHAR is written, and conversion continues.
 *
 * targetExhausted is returned as soon as a code unit has to be written and
 * the target has no room left; *sourceStart is then left on the sequence that
 * did not fit.
 */
static ConversionResult ConvertUTF8toUTF32Impl(
    const UTF8** sourceStart, const UTF8* sourceEnd,
    UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags,
    Boolean InputIsPartial)
{
    ConversionResult result = conversionOK;
    const UTF8* source = *sourceStart;
    UTF32* target = *targetStart;

    while (source < sourceEnd) {
        UTF32 ch = 0;
        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];

        if (extraBytesToRead >= sourceEnd - source) {
            if (flags == strictConversion || InputIsPartial) {
                result = sourceExhausted;
                break;
            }

            /*
             * The replacement character written below needs a free slot.
             * Without this check a full target buffer would be overrun,
             * because the general capacity check only happens further down.
             */
            if (target >= targetEnd) {
                result = targetExhausted;
                break;
            }

            result = sourceIllegal;

            /*
             * Replace the maximal subpart of ill-formed sequence with
             * replacement character.
             */
            source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
                                                                sourceEnd);
            *target++ = UNI_REPLACEMENT_CHAR;
            continue;
        }

        /* Every remaining path writes exactly one code unit. */
        if (target >= targetEnd) {
            result = targetExhausted;
            break;
        }

        /* Do this check whether lenient or strict */
        if (!isLegalUTF8(source, extraBytesToRead + 1)) {
            result = sourceIllegal;
            if (flags == strictConversion) {
                /* Abort conversion. */
                break;
            } else {
                /*
                 * Replace the maximal subpart of ill-formed sequence with
                 * replacement character.
                 */
                source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
                                                                    sourceEnd);
                *target++ = UNI_REPLACEMENT_CHAR;
                continue;
            }
        }

        /*
         * The cases all fall through. See "Note A" below.
         */
        switch (extraBytesToRead) {
            case 5: ch += *source++; ch <<= 6; /* fall through */
            case 4: ch += *source++; ch <<= 6; /* fall through */
            case 3: ch += *source++; ch <<= 6; /* fall through */
            case 2: ch += *source++; ch <<= 6; /* fall through */
            case 1: ch += *source++; ch <<= 6; /* fall through */
            case 0: ch += *source++;
        }
        ch -= offsetsFromUTF8[extraBytesToRead];

        if (ch <= UNI_MAX_LEGAL_UTF32) {
            /*
             * UTF-16 surrogate values are illegal in UTF-32, and anything
             * over Plane 17 (> 0x10FFFF) is illegal.
             */
            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
                if (flags == strictConversion) {
                    source -= (extraBytesToRead + 1); /* return to the illegal value itself */
                    result = sourceIllegal;
                    break;
                } else {
                    *target++ = UNI_REPLACEMENT_CHAR;
                }
            } else {
                *target++ = ch;
            }
        } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
            result = sourceIllegal;
            *target++ = UNI_REPLACEMENT_CHAR;
        }
    }

    *sourceStart = source;
    *targetStart = target;
    return result;
}
/**
 * Decodes UTF-8 bytes from [fromBegin, fromEnd) into Char values stored in
 * [toBegin, toEnd).
 *
 * Input bytes are first copied into the conversion state (s.value.mbytes,
 * with s.n counting the bytes held) and decoded from there, so a sequence may
 * be split across calls. fromNext and toNext report how far the input was
 * consumed and the output was filled.
 *
 * A leading byte order mark (EF BB BF) is skipped. If fewer than three bytes
 * are available and the first one is EF, all input is parked in the state and
 * ok is returned without producing output.
 * NOTE(review): numBytes() and getByte() appear to look at the bytes pending
 * in the state followed by the new input; confirm against their definitions.
 *
 * @return ok       when the loop consumed all input (or the input was empty);
 *         partial  when the output is full or the buffered sequence is not
 *                  complete yet;
 *         error    when isLegalUTF8() rejects the buffered sequence.
 */
Utf8Codec::result Utf8Codec::do_in(MBState& s, const char* fromBegin, const char* fromEnd, const char*& fromNext,
                                   Char* toBegin, Char* toEnd, Char*& toNext) const
{
    fromNext = fromBegin;
    toNext = toBegin;

    // Empty input: nothing to convert.
    if (fromEnd == fromBegin)
        return ok;

    if (numBytes(s, fromBegin, fromEnd) >= 3)
    {
        // Enough bytes to recognise a complete byte order mark; skip it.
        if (getByte(s, fromBegin, fromEnd, 0) == '\xef'
            && getByte(s, fromBegin, fromEnd, 1) == '\xbb'
            && getByte(s, fromBegin, fromEnd, 2) == '\xbf')
        {
            if (s.n > 3)
            {
                // The mark sits entirely in the state: drop it, keep the rest.
                std::memmove(s.value.mbytes, s.value.mbytes + 3, s.n - 3);
                s.n -= 3;
            }
            else
            {
                // Part of the mark (possibly none) is in the state, the
                // remainder is at the front of the input.
                fromNext += 3 - s.n;
                s.n = 0;
            }
        }
    }
    else if (getByte(s, fromBegin, fromEnd, 0) == '\xef')
    {
        // Possibly an incomplete byte order mark: park the input in the state.
        while (fromNext < fromEnd)
            s.value.mbytes[s.n++] = *fromNext++;
        return ok;
    }

    while (fromNext < fromEnd)
    {
        if (toNext >= toEnd)
            return partial;

        // Move one more input byte into the state, unless the state is full.
        if (s.n < sizeof(s.value.mbytes))
        {
            s.value.mbytes[s.n++] = *fromNext++;
        }

        uint8_t* seq = reinterpret_cast<uint8_t *>(&s.value.mbytes[0]);
        uint8_t* seqEnd = seq + s.n;

        const size_t extraBytesToRead = trailingBytesForUTF8[*seq];

        // The sequence announced by the lead byte is not complete yet.
        if (seq + extraBytesToRead >= seqEnd)
            return partial;

        if (!isLegalUTF8(seq, extraBytesToRead + 1))
            return error;

        // Accumulate the raw byte values in the output slot; every case
        // deliberately falls through to the next one.
        *toNext = Char(0);
        switch (extraBytesToRead)
        {
            case 5: // We should never get this for legal UTF-8
                *toNext = Char((toNext->value() + *seq++) << 6);
                /* fall through */
            case 4: // We should never get this for legal UTF-8
                *toNext = Char((toNext->value() + *seq++) << 6);
                /* fall through */
            case 3:
                *toNext = Char((toNext->value() + *seq++) << 6);
                /* fall through */
            case 2:
                *toNext = Char((toNext->value() + *seq++) << 6);
                /* fall through */
            case 1:
                *toNext = Char((toNext->value() + *seq++) << 6);
                /* fall through */
            case 0:
                *toNext = Char((toNext->value() + *seq++));
        }
        *toNext = Char(toNext->value() - offsetsFromUTF8[extraBytesToRead]);

        // UTF-16 surrogate values are illegal in UTF-32, and anything
        // over Plane 17 (> 0x10FFFF) is illegal.
        if (*toNext > MaxLegalUtf32
            || (*toNext >= SurHighStart && *toNext <= SurLowEnd))
        {
            *toNext = ReplacementChar;
        }

        // The buffered sequence has been consumed.
        s.n = 0;
        ++toNext;
    }

    return ok;
}