// Converts the UTF-8 bytes in [*sourceStart, sourceEnd) into UTF-16 code units
// stored in [*targetStart, targetEnd).
//
// On return *sourceStart and *targetStart hold the positions the conversion
// reached, and the result says why it stopped:
//   conversionOK    - the loop ran until the source position reached sourceEnd
//   sourceExhausted - fewer bytes remain than the lead byte's sequence length
//   sourceIllegal   - isLegalUTF8 rejected the sequence, or (strict only) the
//                     decoded value is a surrogate or is neither BMP nor
//                     supplementary
//   targetExhausted - no room is left for the next character
//
// When strict is false, surrogates and out-of-range values are written as
// replacementCharacter and conversion continues.
ConversionResult convertUTF8ToUTF16(
    const char** sourceStart, const char* sourceEnd,
    UChar** targetStart, UChar* targetEnd, bool strict)
{
    const char* in = *sourceStart;
    UChar* out = *targetStart;
    ConversionResult status = conversionOK;

    while (in < sourceEnd) {
        int sequenceLength = inlineUTF8SequenceLength(*in);

        // The input stops partway through this sequence.
        if (sourceEnd - in < sequenceLength) {
            status = sourceExhausted;
            break;
        }

        // Legality is checked in both lenient and strict mode.
        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(in), sequenceLength)) {
            status = sourceIllegal;
            break;
        }

        // NOTE(review): the rewinds below rely on readUTF8Sequence moving `in`
        // past the sequence it decodes - confirm against its definition.
        UChar32 codePoint = readUTF8Sequence(in, sequenceLength);

        // Every path below writes at least one code unit.
        if (out >= targetEnd) {
            in -= sequenceLength; // Rewind so the caller can resume at this sequence.
            status = targetExhausted;
            break;
        }

        // Supplementary characters are stored as a lead/trail surrogate pair.
        if (U_IS_SUPPLEMENTARY(codePoint)) {
            if (out + 1 >= targetEnd) {
                in -= sequenceLength; // Rewind: the pair does not fit.
                status = targetExhausted;
                break;
            }
            *out++ = U16_LEAD(codePoint);
            *out++ = U16_TRAIL(codePoint);
            continue;
        }

        // Ordinary BMP character: a single code unit (the common case).
        if (U_IS_BMP(codePoint) && !U_IS_SURROGATE(codePoint)) {
            *out++ = codePoint;
            continue;
        }

        // What remains is either a surrogate value in the BMP or a value that
        // is neither BMP nor supplementary; both are handled the same way.
        if (strict) {
            in -= sequenceLength; // Rewind to the offending sequence itself.
            status = sourceIllegal;
            break;
        }
        *out++ = replacementCharacter;
    }

    *sourceStart = in;
    *targetStart = out;
    return status;
}
// Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character. // Only allows Unicode characters (U-00000000 to U-0010FFFF). // Returns -1 if the sequence is not valid (including presence of extra bytes). int decodeUTF8Sequence(const char *sequence) { // Handle 0-byte sequences (never valid). const unsigned char b0 = sequence[0]; const int length = inlineUTF8SequenceLength(b0); if (length == 0) return -1; // Handle 1-byte sequences (plain ASCII). const unsigned char b1 = sequence[1]; if (length == 1) { if (b1) return -1; return b0; } // Handle 2-byte sequences. if ((b1 & 0xC0) != 0x80) return -1; const unsigned char b2 = sequence[2]; if (length == 2) { if (b2) return -1; const int c = ((b0 & 0x1F) << 6) | (b1 & 0x3F); if (c < 0x80) return -1; return c; } // Handle 3-byte sequences. if ((b2 & 0xC0) != 0x80) return -1; const unsigned char b3 = sequence[3]; if (length == 3) { if (b3) return -1; const int c = ((b0 & 0xF) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F); if (c < 0x800) return -1; // UTF-16 surrogates should never appear in UTF-8 data. if (c >= 0xD800 && c <= 0xDFFF) return -1; // Backwards BOM and U+FFFF should never appear in UTF-8 data. if (c == 0xFFFE || c == 0xFFFF) return -1; return c; } // Handle 4-byte sequences. if ((b3 & 0xC0) != 0x80) return -1; const unsigned char b4 = sequence[4]; if (length == 4) { if (b4) return -1; const int c = ((b0 & 0x7) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F); if (c < 0x10000 || c > 0x10FFFF) return -1; return c; } return -1; }