static Value decode(ExecState *exec, const List &args, const char *do_not_unescape, bool strict) { UString s = "", str = args[0].toString(exec); int k = 0, len = str.size(); const UChar *d = str.data(); UChar u; while (k < len) { const UChar *p = d + k; UChar c = *p; if (c == '%') { int charLen = 0; if (k <= len - 3 && isxdigit(p[1].uc) && isxdigit(p[2].uc)) { const char b0 = Lexer::convertHex(p[1].uc, p[2].uc); const int sequenceLen = UTF8SequenceLength(b0); if (sequenceLen != 0 && k <= len - sequenceLen * 3) { charLen = sequenceLen * 3; char sequence[5]; sequence[0] = b0; for (int i = 1; i < sequenceLen; ++i) { const UChar *q = p + i * 3; if (q[0] == '%' && isxdigit(q[1].uc) && isxdigit(q[2].uc)) sequence[i] = Lexer::convertHex(q[1].uc, q[2].uc); else { charLen = 0; break; } } if (charLen != 0) { sequence[sequenceLen] = 0; const int character = decodeUTF8Sequence(sequence); if (character < 0 || character >= 0x110000) { charLen = 0; } else if (character >= 0x10000) { // Convert to surrogate pair. s.append(static_cast<unsigned short>(0xD800 | ((character - 0x10000) >> 10))); u = static_cast<unsigned short>(0xDC00 | ((character - 0x10000) & 0x3FF)); } else { u = static_cast<unsigned short>(character); } }
// Converts the UTF-8 bytes in [*sourceStart, sourceEnd) to UTF-16 code units
// stored in [*targetStart, targetEnd).
//
// Parameters:
//   sourceStart - in: first byte to convert; out: first byte NOT consumed.
//   sourceEnd   - one past the last input byte.
//   targetStart - in: first output slot; out: one past the last slot written.
//   targetEnd   - one past the last available output slot.
//   strict      - when true, a decoded surrogate (0xD800-0xDFFF) or a value
//                 above 0x10FFFF stops the conversion with sourceIllegal;
//                 when false, U+FFFD is written in its place and conversion
//                 continues.
//
// Returns:
//   conversionOK    - all input was consumed.
//   sourceExhausted - the input ends in the middle of a multi-byte sequence.
//   sourceIllegal   - a malformed sequence (or, in strict mode, an illegal
//                     code point) was found.
//   targetExhausted - the output buffer is full.
// For every non-OK result *sourceStart is left at the first byte of the
// sequence that could not be converted.
ConversionResult convertUTF8ToUTF16(
    const char** sourceStart, const char* sourceEnd,
    UChar** targetStart, UChar* targetEnd, bool strict)
{
    ConversionResult result = conversionOK;
    const char* source = *sourceStart;
    UChar* target = *targetStart;

    while (source < sourceEnd) {
        UChar32 ch = 0;
        const int extraBytesToRead = UTF8SequenceLength(*source) - 1;

        // A reported sequence length of 0 makes this -1. Reject it here so a
        // negative value can never reach the switch below (which would not
        // advance source) or be used as an index into offsetsFromUTF8.
        if (extraBytesToRead < 0) {
            result = sourceIllegal;
            break;
        }

        // Compare against the number of bytes left instead of computing
        // source + extraBytesToRead, which could form a pointer beyond
        // one-past-the-end of the input buffer.
        if (extraBytesToRead >= sourceEnd - source) {
            result = sourceExhausted;
            break;
        }

        // Do this check whether lenient or strict.
        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), extraBytesToRead + 1)) {
            result = sourceIllegal;
            break;
        }

        // Accumulate the raw bytes of the sequence. The cases all fall
        // through on purpose: entry at case N consumes N + 1 bytes.
        switch (extraBytesToRead) {
        case 5:
            ch += static_cast<unsigned char>(*source++);
            ch <<= 6; // remember, illegal UTF-8
        case 4:
            ch += static_cast<unsigned char>(*source++);
            ch <<= 6; // remember, illegal UTF-8
        case 3:
            ch += static_cast<unsigned char>(*source++);
            ch <<= 6;
        case 2:
            ch += static_cast<unsigned char>(*source++);
            ch <<= 6;
        case 1:
            ch += static_cast<unsigned char>(*source++);
            ch <<= 6;
        case 0:
            ch += static_cast<unsigned char>(*source++);
        }
        // Remove the lead/continuation marker bits that were summed in above.
        ch -= offsetsFromUTF8[extraBytesToRead];

        if (target >= targetEnd) {
            source -= (extraBytesToRead + 1); // Back up source pointer!
            result = targetExhausted;
            break;
        }

        if (ch <= 0xFFFF) {
            // UTF-16 surrogate values are illegal in UTF-32.
            if (ch >= 0xD800 && ch <= 0xDFFF) {
                if (strict) {
                    source -= (extraBytesToRead + 1); // return to the illegal value itself
                    result = sourceIllegal;
                    break;
                }
                *target++ = 0xFFFD;
            } else {
                *target++ = static_cast<UChar>(ch); // normal case
            }
        } else if (ch > 0x10FFFF) {
            if (strict) {
                result = sourceIllegal;
                source -= (extraBytesToRead + 1); // return to the start
                break; // Bail out; shouldn't continue
            }
            *target++ = 0xFFFD;
        } else {
            // ch is in the supplementary range 0x10000 - 0x10FFFF and needs a
            // surrogate pair, i.e. two output slots.
            if (target + 1 >= targetEnd) {
                source -= (extraBytesToRead + 1); // Back up source pointer!
                result = targetExhausted;
                break;
            }
            ch -= 0x0010000UL;
            *target++ = static_cast<UChar>((ch >> 10) + 0xD800);
            *target++ = static_cast<UChar>((ch & 0x03FF) + 0xDC00);
        }
    }

    *sourceStart = source;
    *targetStart = target;
    return result;
}