Exemple #1
0
ConversionResult convertUTF8ToUTF16(
    const char** sourceStart, const char* sourceEnd, 
    UChar** targetStart, UChar* targetEnd, bool strict)
{
    ConversionResult result = conversionOK;
    const char* source = *sourceStart;
    UChar* target = *targetStart;
    while (source < sourceEnd) {
        int utf8SequenceLength = inlineUTF8SequenceLength(*source);
        if (sourceEnd - source < utf8SequenceLength)  {
            result = sourceExhausted;
            break;
        }
        // Do this check whether lenient or strict
        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8SequenceLength)) {
            result = sourceIllegal;
            break;
        }

        UChar32 character = readUTF8Sequence(source, utf8SequenceLength);

        if (target >= targetEnd) {
            source -= utf8SequenceLength; // Back up source pointer!
            result = targetExhausted;
            break;
        }

        if (U_IS_BMP(character)) {
            // UTF-16 surrogate values are illegal in UTF-32
            if (U_IS_SURROGATE(character)) {
                if (strict) {
                    source -= utf8SequenceLength; // return to the illegal value itself
                    result = sourceIllegal;
                    break;
                } else
                    *target++ = replacementCharacter;
            } else
                *target++ = character; // normal case
        } else if (U_IS_SUPPLEMENTARY(character)) {
            // target is a character in range 0xFFFF - 0x10FFFF
            if (target + 1 >= targetEnd) {
                source -= utf8SequenceLength; // Back up source pointer!
                result = targetExhausted;
                break;
            }
            *target++ = U16_LEAD(character);
            *target++ = U16_TRAIL(character);
        } else {
            if (strict) {
                source -= utf8SequenceLength; // return to the start
                result = sourceIllegal;
                break; // Bail out; shouldn't continue
            } else
                *target++ = replacementCharacter;
        }
    }
    *sourceStart = source;
    *targetStart = target;
    return result;
}
// Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character.
// Only allows Unicode characters (U-00000000 to U-0010FFFF).
// Returns -1 if the sequence is not valid (including presence of extra bytes).
int decodeUTF8Sequence(const char *sequence)
{
  // Handle 0-byte sequences (never valid).
  const unsigned char b0 = sequence[0];
  const int length = inlineUTF8SequenceLength(b0);
  if (length == 0)
    return -1;

  // Handle 1-byte sequences (plain ASCII).
  const unsigned char b1 = sequence[1];
  if (length == 1) {
    if (b1)
      return -1;
    return b0;
  }

  // Handle 2-byte sequences.
  if ((b1 & 0xC0) != 0x80)
    return -1;
  const unsigned char b2 = sequence[2];
  if (length == 2) {
    if (b2)
      return -1;
    const int c = ((b0 & 0x1F) << 6) | (b1 & 0x3F);
    if (c < 0x80)
      return -1;
    return c;
  }

  // Handle 3-byte sequences.
  if ((b2 & 0xC0) != 0x80)
    return -1;
  const unsigned char b3 = sequence[3];
  if (length == 3) {
    if (b3)
      return -1;
    const int c = ((b0 & 0xF) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F);
    if (c < 0x800)
      return -1;
    // UTF-16 surrogates should never appear in UTF-8 data.
    if (c >= 0xD800 && c <= 0xDFFF)
      return -1;
    // Backwards BOM and U+FFFF should never appear in UTF-8 data.
    if (c == 0xFFFE || c == 0xFFFF)
      return -1;
    return c;
  }

  // Handle 4-byte sequences.
  if ((b3 & 0xC0) != 0x80)
    return -1;
  const unsigned char b4 = sequence[4];
  if (length == 4) {
    if (b4)
      return -1;
    const int c = ((b0 & 0x7) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
    if (c < 0x10000 || c > 0x10FFFF)
      return -1;
    return c;
  }

  return -1;
}