static ConversionResult ConvertUTF8toUTF32Impl( const UTF8** sourceStart, const UTF8* sourceEnd, UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags, Boolean InputIsPartial) { ConversionResult result = conversionOK; const UTF8* source = *sourceStart; UTF32* target = *targetStart; while (source < sourceEnd) { UTF32 ch = 0; unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; if (extraBytesToRead >= sourceEnd - source) { if (flags == strictConversion || InputIsPartial) { result = sourceExhausted; break; } else { result = sourceIllegal; /* * Replace the maximal subpart of ill-formed sequence with * replacement character. */ source += findMaximalSubpartOfIllFormedUTF8Sequence(source, sourceEnd); *target++ = UNI_REPLACEMENT_CHAR; continue; } } if (target >= targetEnd) { result = targetExhausted; break; } /* Do this check whether lenient or strict */ if (!isLegalUTF8(source, extraBytesToRead+1)) { result = sourceIllegal; if (flags == strictConversion) { /* Abort conversion. */ break; } else { /* * Replace the maximal subpart of ill-formed sequence with * replacement character. */ source += findMaximalSubpartOfIllFormedUTF8Sequence(source, sourceEnd); *target++ = UNI_REPLACEMENT_CHAR; continue; } } /* * The cases all fall through. See "Note A" below. */ switch (extraBytesToRead) { case 5: ch += *source++; ch <<= 6; case 4: ch += *source++; ch <<= 6; case 3: ch += *source++; ch <<= 6; case 2: ch += *source++; ch <<= 6; case 1: ch += *source++; ch <<= 6; case 0: ch += *source++; } ch -= offsetsFromUTF8[extraBytesToRead]; if (ch <= UNI_MAX_LEGAL_UTF32) { /* * UTF-16 surrogate values are illegal in UTF-32, and anything * over Plane 17 (> 0x10FFFF) is illegal. */ if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { if (flags == strictConversion) { source -= (extraBytesToRead+1); /* return to the illegal value itself */ result = sourceIllegal; break; } else { *target++ = UNI_REPLACEMENT_CHAR; } } else { *target++ = ch; } } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */ result = sourceIllegal; *target++ = UNI_REPLACEMENT_CHAR; } } *sourceStart = source; *targetStart = target; return result; }
ConversionResult ConvertUTF8toUTF16 ( const UTF8** sourceStart, const UTF8* sourceEnd, UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { ConversionResult result = conversionOK; const UTF8* source = *sourceStart; UTF16* target = *targetStart; while (source < sourceEnd) { UTF32 ch = 0; unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; if (extraBytesToRead >= sourceEnd - source) { result = sourceExhausted; break; } /* Do this check whether lenient or strict */ if (!isLegalUTF8(source, extraBytesToRead+1)) { result = sourceIllegal; if (flags == strictConversion) { /* Abort conversion. */ break; } else { /* * Replace the maximal subpart of ill-formed sequence with * replacement character. */ source += findMaximalSubpartOfIllFormedUTF8Sequence(source, sourceEnd); *target++ = UNI_REPLACEMENT_CHAR; continue; } } /* * The cases all fall through. See "Note A" below. */ switch (extraBytesToRead) { case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ case 3: ch += *source++; ch <<= 6; case 2: ch += *source++; ch <<= 6; case 1: ch += *source++; ch <<= 6; case 0: ch += *source++; } ch -= offsetsFromUTF8[extraBytesToRead]; if (target >= targetEnd) { source -= (extraBytesToRead+1); /* Back up source pointer! */ result = targetExhausted; break; } if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ /* UTF-16 surrogate values are illegal in UTF-32 */ if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { if (flags == strictConversion) { source -= (extraBytesToRead+1); /* return to the illegal value itself */ result = sourceIllegal; break; } else { *target++ = UNI_REPLACEMENT_CHAR; } } else { *target++ = (UTF16)ch; /* normal case */ } } else if (ch > UNI_MAX_UTF16) { if (flags == strictConversion) { result = sourceIllegal; source -= (extraBytesToRead+1); /* return to the start */ break; /* Bail out; shouldn't continue */ } else { *target++ = UNI_REPLACEMENT_CHAR; } } else { /* target is a character in range 0xFFFF - 0x10FFFF. */ if (target + 1 >= targetEnd) { source -= (extraBytesToRead+1); /* Back up source pointer! */ result = targetExhausted; break; } ch -= halfBase; *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); } } *sourceStart = source; *targetStart = target; return result; }