示例#1
0
// Tries to turn a percent-encoded UTF-8 sequence into UTF-16.
//
// \a decoded is the already-decoded first byte of the sequence and \a input
// is expected to point at the '%' that introduced it (the continuation bytes
// are looked up at input[3], input[6] and input[9]).
//
// Returns true if a code point was decoded: it is then written through
// \a output as one or two UTF-16 units and \a input is advanced by
// 3 * (sequence length) - 1. Returns false otherwise; on that path neither
// \a input nor \a output is modified and ensureDetached() is never called.
static bool encodedUtf8ToUtf16(QString &result, ushort *&output, const ushort *begin, const ushort *&input,
                               const ushort *end, ushort decoded)
{
    // Reject impossible first bytes up front:
    //  - anything up to 0xC1: a first byte must be at least 0xC0, and 0xC0 /
    //    0xC1 can only start overlong sequences;
    //  - anything above 0xF4: the last Unicode character, U+10FFFF, is
    //    encoded as "\xF4\x8F\xBF\xBF".
    if (decoded <= 0xC1 || decoded >= 0xf5)
        return false;

    int sequenceLength;     // number of bytes (%XX triplets) in the sequence
    uint smallestAllowed;   // values below this would be overlong encodings
    uint codePoint;         // payload bits collected so far
    if (decoded < 0xe0) {
        sequenceLength = 2;
        smallestAllowed = 0x80;
        codePoint = decoded & 0x1f;
    } else if (decoded < 0xf0) {
        sequenceLength = 3;
        smallestAllowed = 0x800;
        codePoint = decoded & 0x0f;
    } else {
        sequenceLength = 4;
        smallestAllowed = 0x10000;
        codePoint = decoded & 0x07;
    }

    // each byte takes three input units ("%XX"); bail out if the input is too short
    if (end - input < 3 * sequenceLength)
        return false;

    // walk the continuation bytes: each must be a '%' triplet decoding to 10xxxxxx
    for (int byteIndex = 1; byteIndex < sequenceLength; ++byteIndex) {
        const ushort *triplet = input + 3 * byteIndex;
        if (*triplet != '%')
            return false;

        const ushort continuation = decodePercentEncoding(triplet);
        if ((continuation & 0xc0) != 0x80)
            return false;
        codePoint = (codePoint << 6) | (continuation & 0x3f);
    }

    // sanity checks on the result: no overlong forms, no surrogates, no
    // noncharacters and nothing past the end of the Unicode range
    if (codePoint < smallestAllowed
            || isSurrogate(codePoint)
            || isNonCharacter(codePoint)
            || codePoint > 0x10ffff/*QChar::LastValidCodePoint*/)
        return false;

    // Detach (if necessary) before writing anything. The last argument differs
    // only in how many UTF-16 units are about to be written (1 or 2).
    // NOTE(review): presumably a size adjustment for ensureDetached() -- confirm there.
    const bool needsPair = QChar::requiresSurrogates(codePoint);
    ensureDetached(result, output, begin, input, end, -9 * sequenceLength + (needsPair ? 2 : 1));
    if (needsPair) {
        // outside the BMP: emit a surrogate pair
        *output++ = QChar::highSurrogate(codePoint);
        *output++ = QChar::lowSurrogate(codePoint);
    } else {
        *output++ = codePoint;
    }

    input += sequenceLength * 3 - 1;
    return true;
}
 // Tells whether the code point is a Unicode character, i.e. a Unicode
 // scalar value that is not a noncharacter.
 inline constexpr bool isUnicodeCharacter(code_point_t code_point) {
     // isNonCharacter() is consulted only for scalar values
     return isUnicodeScalarValue(code_point) ? !isNonCharacter(code_point) : false;
 }