Exemplo n.º 1
0
// Percent-decodes the string form of args[0]: each run of "%XX" escapes is
// interpreted as one UTF-8 encoded character and converted to UTF-16 code
// units. The decoded text is accumulated in `s`.
//
// Only the escape-recognition part of the function is documented here;
// `do_not_unescape` and `strict` are not referenced by the lines below.
static Value decode(ExecState *exec, const List &args, const char *do_not_unescape, bool strict)
{
  // s: output being built; str: the input, obtained from the first argument.
  UString s = "", str = args[0].toString(exec);
  // k: index of the current input code unit; len: number of input code units.
  int k = 0, len = str.size();
  const UChar *d = str.data();
  // u receives the (last) UTF-16 code unit of a successfully decoded escape run.
  // NOTE(review): presumably appended to s further down — confirm.
  UChar u;
  while (k < len) {
    const UChar *p = d + k;
    UChar c = *p;
    if (c == '%') {
      // Number of input code units covered by the escape run starting at p;
      // stays 0 (or is reset to 0) whenever the run cannot be decoded.
      int charLen = 0;
      // A first escape needs '%' plus two hex digits within the input.
      if (k <= len - 3 && isxdigit(p[1].uc) && isxdigit(p[2].uc)) {
        // First byte of the candidate UTF-8 sequence.
        const char b0 = Lexer::convertHex(p[1].uc, p[2].uc);
        const int sequenceLen = UTF8SequenceLength(b0);
        // A length of 0 is treated as "not a valid lead byte". Otherwise the
        // whole run of sequenceLen escapes (3 code units each) must fit.
        if (sequenceLen != 0 && k <= len - sequenceLen * 3) {
          charLen = sequenceLen * 3;
          // Holds the raw bytes plus a terminating 0.
          // assumes UTF8SequenceLength never returns more than 4 — TODO confirm
          char sequence[5];
          sequence[0] = b0;
          // Collect the remaining bytes; each must itself be a "%XX" escape.
          for (int i = 1; i < sequenceLen; ++i) {
            const UChar *q = p + i * 3;
            if (q[0] == '%' && isxdigit(q[1].uc) && isxdigit(q[2].uc))
              sequence[i] = Lexer::convertHex(q[1].uc, q[2].uc);
            else {
              // Malformed continuation escape: abandon the whole run.
              charLen = 0;
              break;
            }
          }
          if (charLen != 0) {
            sequence[sequenceLen] = 0;
            const int character = decodeUTF8Sequence(sequence);
            // Reject a negative result (presumably a decoding failure) and
            // anything at or above 0x110000.
            if (character < 0 || character >= 0x110000) {
              charLen = 0;
            } else if (character >= 0x10000) {
              // Convert to surrogate pair: the high surrogate is appended to s
              // right away, the low surrogate is kept in u.
              s.append(static_cast<unsigned short>(0xD800 | ((character - 0x10000) >> 10)));
              u = static_cast<unsigned short>(0xDC00 | ((character - 0x10000) & 0x3FF));
            } else {
              // Fits in a single UTF-16 code unit.
              u = static_cast<unsigned short>(character);
            }
          }
Exemplo n.º 2
0
// Converts the UTF-8 bytes in [*sourceStart, sourceEnd) into UTF-16 code units
// written to [*targetStart, targetEnd).
//
// Parameters:
//   sourceStart  in/out; on return points just past the last byte consumed.
//                When conversion stops early it points at the first byte of
//                the sequence that could not be converted.
//   sourceEnd    one past the last input byte.
//   targetStart  in/out; on return points just past the last code unit written.
//   targetEnd    one past the last writable output slot.
//   strict       when true, an encoded surrogate (0xD800-0xDFFF) or a value
//                above 0x10FFFF stops the conversion with sourceIllegal; when
//                false such values are replaced by U+FFFD and conversion
//                continues.
//
// Returns:
//   conversionOK     all input was converted.
//   sourceExhausted  the input ends in the middle of a multi-byte sequence.
//   sourceIllegal    the lead byte has no valid sequence length, isLegalUTF8
//                    rejected the sequence, or (strict only) the decoded value
//                    is a surrogate or above 0x10FFFF.
//   targetExhausted  the output buffer has no room for the next character.
ConversionResult convertUTF8ToUTF16(
    const char** sourceStart, const char* sourceEnd,
    UChar** targetStart, UChar* targetEnd, bool strict)
{
    ConversionResult result = conversionOK;
    const char* source = *sourceStart;
    UChar* target = *targetStart;
    while (source < sourceEnd) {
        const int sequenceLength = UTF8SequenceLength(*source);
        // A non-positive length means the byte cannot start a sequence. Reject
        // it here so that the offsetsFromUTF8 lookup below is never indexed
        // with -1, independent of how isLegalUTF8 treats a zero length.
        if (sequenceLength <= 0) {
            result = sourceIllegal;
            break;
        }
        // Compare lengths rather than computing source + n, which could form
        // a pointer beyond one-past-the-end of the input buffer.
        if (sourceEnd - source < sequenceLength) {
            result = sourceExhausted;
            break;
        }
        // Do this check whether lenient or strict
        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), sequenceLength)) {
            result = sourceIllegal;
            break;
        }

        // Accumulate the raw bytes, six bits apart; the lead-byte marker and
        // continuation-byte markers are removed in one step by subtracting
        // the per-length offset afterwards.
        const int extraBytesToRead = sequenceLength - 1;
        UChar32 ch = 0;
        for (int i = 0; i < extraBytesToRead; ++i) {
            ch += static_cast<unsigned char>(*source++);
            ch <<= 6;
        }
        ch += static_cast<unsigned char>(*source++);
        ch -= offsetsFromUTF8[extraBytesToRead];

        if (target >= targetEnd) {
            source -= sequenceLength; // Back up source pointer!
            result = targetExhausted;
            break;
        }
        if (ch <= 0xFFFF) {
            // UTF-16 surrogate values are illegal in UTF-32
            if (ch >= 0xD800 && ch <= 0xDFFF) {
                if (strict) {
                    source -= sequenceLength; // return to the illegal value itself
                    result = sourceIllegal;
                    break;
                }
                *target++ = 0xFFFD;
            } else
                *target++ = static_cast<UChar>(ch); // normal case
        } else if (ch > 0x10FFFF) {
            if (strict) {
                source -= sequenceLength; // return to the start
                result = sourceIllegal;
                break; // Bail out; shouldn't continue
            }
            *target++ = 0xFFFD;
        } else {
            // ch is in the range 0x10000 - 0x10FFFF and needs a surrogate
            // pair, i.e. two output slots.
            if (targetEnd - target < 2) {
                source -= sequenceLength; // Back up source pointer!
                result = targetExhausted;
                break;
            }
            ch -= 0x10000;
            *target++ = static_cast<UChar>((ch >> 10) + 0xD800);
            *target++ = static_cast<UChar>((ch & 0x03FF) + 0xDC00);
        }
    }
    *sourceStart = source;
    *targetStart = target;
    return result;
}