static bool InflateUTF8StringToBuffer(JSContext *cx, const UTF8Chars src, jschar *dst, size_t *dstlenp, bool *isAsciip) { *isAsciip = true; // First, count how many jschars need to be in the inflated string. // |i| is the index into |src|, and |j| is the the index into |dst|. size_t srclen = src.length(); uint32_t j = 0; for (uint32_t i = 0; i < srclen; i++, j++) { uint32_t v = uint32_t(src[i]); if (!(v & 0x80)) { // ASCII code unit. Simple copy. if (action == Copy) dst[j] = jschar(v); } else { // Non-ASCII code unit. Determine its length in bytes (n). *isAsciip = false; uint32_t n = 1; while (v & (0x80 >> n)) n++; #define INVALID(report, arg, n2) \ do { \ if (action == CountAndReportInvalids) { \ report(cx, arg); \ return false; \ } else { \ if (action == Copy) \ dst[j] = jschar(REPLACE_UTF8); \ else \ JS_ASSERT(action == CountAndIgnoreInvalids); \ n = n2; \ goto invalidMultiByteCodeUnit; \ } \ } while (0) // Check the leading byte. if (n < 2 || n > 4) INVALID(ReportInvalidCharacter, i, 1); // Check that |src| is large enough to hold an n-byte code unit. if (i + n > srclen) INVALID(ReportBufferTooSmall, /* dummy = */ 0, 1); // Check the second byte. From Unicode Standard v6.2, Table 3-7 // Well-Formed UTF-8 Byte Sequences. if ((v == 0xE0 && ((uint8_t)src[i + 1] & 0xE0) != 0xA0) || // E0 A0~BF (v == 0xED && ((uint8_t)src[i + 1] & 0xE0) != 0x80) || // ED 80~9F (v == 0xF0 && ((uint8_t)src[i + 1] & 0xF0) == 0x80) || // F0 90~BF (v == 0xF4 && ((uint8_t)src[i + 1] & 0xF0) != 0x80)) // F4 80~8F { INVALID(ReportInvalidCharacter, i, 1); } // Check the continuation bytes. for (uint32_t m = 1; m < n; m++) if ((src[i + m] & 0xC0) != 0x80) INVALID(ReportInvalidCharacter, i, m); // Determine the code unit's length in jschars and act accordingly. v = Utf8ToOneUcs4Char((uint8_t *)&src[i], n); if (v < 0x10000) { // The n-byte UTF8 code unit will fit in a single jschar. if (action == Copy) dst[j] = jschar(v); } else { v -= 0x10000; if (v <= 0xFFFFF) { // The n-byte UTF8 code unit will fit in two jschars. if (action == Copy) dst[j] = jschar((v >> 10) + 0xD800); j++; if (action == Copy) dst[j] = jschar((v & 0x3FF) + 0xDC00); } else { // The n-byte UTF8 code unit won't fit in two jschars. INVALID(ReportTooBigCharacter, v, 1); } } invalidMultiByteCodeUnit: // Move i to the last byte of the multi-byte code unit; the loop // header will do the final i++ to move to the start of the next // code unit. i += n - 1; } }
static bool InflateUTF8StringToBuffer(ContextT* cx, const UTF8Chars src, CharT* dst, size_t* dstlenp, JS::SmallestEncoding *smallestEncoding) { if (Action != AssertNoInvalids) *smallestEncoding = JS::SmallestEncoding::ASCII; auto RequireLatin1 = [&smallestEncoding]{ *smallestEncoding = std::max(JS::SmallestEncoding::Latin1, *smallestEncoding); }; auto RequireUTF16 = [&smallestEncoding]{ *smallestEncoding = JS::SmallestEncoding::UTF16; }; // Count how many code units need to be in the inflated string. // |i| is the index into |src|, and |j| is the the index into |dst|. size_t srclen = src.length(); uint32_t j = 0; for (uint32_t i = 0; i < srclen; i++, j++) { uint32_t v = uint32_t(src[i]); if (!(v & 0x80)) { // ASCII code unit. Simple copy. if (Action == Copy) dst[j] = CharT(v); } else { // Non-ASCII code unit. Determine its length in bytes (n). uint32_t n = 1; while (v & (0x80 >> n)) n++; #define INVALID(report, arg, n2) \ do { \ if (Action == CountAndReportInvalids) { \ report(cx, arg); \ return false; \ } else if (Action == AssertNoInvalids) { \ MOZ_CRASH("invalid UTF-8 string: " # report); \ } else { \ if (Action == Copy) { \ if (std::is_same<decltype(dst[0]), Latin1Char>::value) \ dst[j] = CharT(REPLACE_UTF8_LATIN1); \ else \ dst[j] = CharT(REPLACE_UTF8); \ } else { \ MOZ_ASSERT(Action == CountAndIgnoreInvalids || \ Action == FindEncoding); \ } \ n = n2; \ goto invalidMultiByteCodeUnit; \ } \ } while (0) // Check the leading byte. if (n < 2 || n > 4) INVALID(ReportInvalidCharacter, i, 1); // Check that |src| is large enough to hold an n-byte code unit. if (i + n > srclen) INVALID(ReportBufferTooSmall, /* dummy = */ 0, 1); // Check the second byte. From Unicode Standard v6.2, Table 3-7 // Well-Formed UTF-8 Byte Sequences. if ((v == 0xE0 && ((uint8_t)src[i + 1] & 0xE0) != 0xA0) || // E0 A0~BF (v == 0xED && ((uint8_t)src[i + 1] & 0xE0) != 0x80) || // ED 80~9F (v == 0xF0 && ((uint8_t)src[i + 1] & 0xF0) == 0x80) || // F0 90~BF (v == 0xF4 && ((uint8_t)src[i + 1] & 0xF0) != 0x80)) // F4 80~8F { INVALID(ReportInvalidCharacter, i, 1); } // Check the continuation bytes. for (uint32_t m = 1; m < n; m++) { if ((src[i + m] & 0xC0) != 0x80) INVALID(ReportInvalidCharacter, i, m); } // Determine the code unit's length in CharT and act accordingly. v = JS::Utf8ToOneUcs4Char((uint8_t*)&src[i], n); if (Action != AssertNoInvalids) { if (v > 0xff) { RequireUTF16(); if (Action == FindEncoding) { MOZ_ASSERT(dst == nullptr); return true; } } else { RequireLatin1(); } } if (v < 0x10000) { // The n-byte UTF8 code unit will fit in a single CharT. if (Action == Copy) dst[j] = CharT(v); } else { v -= 0x10000; if (v <= 0xFFFFF) { // The n-byte UTF8 code unit will fit in two CharT units. if (Action == Copy) dst[j] = CharT((v >> 10) + 0xD800); j++; if (Action == Copy) dst[j] = CharT((v & 0x3FF) + 0xDC00); } else { // The n-byte UTF8 code unit won't fit in two CharT units. INVALID(ReportTooBigCharacter, v, 1); } } invalidMultiByteCodeUnit: // Move i to the last byte of the multi-byte code unit; the loop // header will do the final i++ to move to the start of the next // code unit. i += n - 1; if (Action != AssertNoInvalids) RequireUTF16(); } }