unsigned calculateStringHashAndLengthFromUTF8MaskingTop8Bits(const char* data, const char* dataEnd, unsigned& dataLength, unsigned& utf16Length) { if (!data) return 0; StringHasher stringHasher; dataLength = 0; utf16Length = 0; while (data < dataEnd || (!dataEnd && *data)) { if (isASCII(*data)) { stringHasher.addCharacter(*data++); dataLength++; utf16Length++; continue; } int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data); dataLength += utf8SequenceLength; if (!dataEnd) { for (int i = 1; i < utf8SequenceLength; ++i) { if (!data[i]) return 0; } } else if (dataEnd - data < utf8SequenceLength) { return 0; } if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), utf8SequenceLength)) return 0; UChar32 character = readUTF8Sequence(data, utf8SequenceLength); ASSERT(!isASCII(character)); if (U_IS_BMP(character)) { // UTF-16 surrogate values are illegal in UTF-32 if (U_IS_SURROGATE(character)) return 0; stringHasher.addCharacter(static_cast<UChar>(character)); // normal case utf16Length++; } else if (U_IS_SUPPLEMENTARY(character)) { stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)), static_cast<UChar>(U16_TRAIL(character))); utf16Length += 2; } else { return 0; } } return stringHasher.hashWithTop8BitsMasked(); }
unsigned calculateStringHashFromUTF8(const char* data, const char* dataEnd, unsigned& utf16Length) { if (!data) return 0; WTF::StringHasher stringHasher; utf16Length = 0; while (data < dataEnd) { if (isASCII(*data)) { stringHasher.addCharacter(*data++); utf16Length++; continue; } int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data); if (dataEnd - data < utf8SequenceLength) return false; if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), utf8SequenceLength)) return 0; UChar32 character = readUTF8Sequence(data, utf8SequenceLength); ASSERT(!isASCII(character)); if (U_IS_BMP(character)) { // UTF-16 surrogate values are illegal in UTF-32 if (U_IS_SURROGATE(character)) return 0; stringHasher.addCharacter(static_cast<UChar>(character)); // normal case utf16Length++; } else if (U_IS_SUPPLEMENTARY(character)) { stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)), static_cast<UChar>(U16_TRAIL(character))); utf16Length += 2; } else return 0; } return stringHasher.hash(); }
// Compares the code units in [a, aEnd) with the UTF-8 bytes in [b, bEnd).
//
// Each UTF-8 sequence in b is decoded and matched against one code unit of a
// (ASCII and BMP code points) or two code units (supplementary code points,
// matched as a lead/trail surrogate pair).
//
// Returns true only when every decoded value matches and both inputs are
// exhausted together. Returns false on any mismatch, when a UTF-8 sequence is
// truncated or rejected by isLegalUTF8(), when a decoded value is a surrogate
// or is neither BMP nor supplementary, or when a runs out before b does.
ALWAYS_INLINE bool equalWithUTF8Internal(const CharType* a, const CharType* aEnd, const char* b, const char* bEnd)
{
    while (b < bEnd) {
        if (isASCII(*b)) {
            // Never read past aEnd: a shorter left-hand side cannot be equal.
            if (a == aEnd || *a++ != *b++)
                return false;
            continue;
        }

        int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*b);

        if (bEnd - b < utf8SequenceLength)
            return false;

        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(b), utf8SequenceLength))
            return false;

        // NOTE(review): readUTF8Sequence is expected to advance b past the sequence;
        // nothing else on this path moves the cursor. Confirm against its definition.
        UChar32 character = readUTF8Sequence(b, utf8SequenceLength);
        ASSERT(!isASCII(character));

        if (U_IS_BMP(character)) {
            // UTF-16 surrogate values are illegal in UTF-32.
            if (U_IS_SURROGATE(character))
                return false;
            if (a == aEnd || *a++ != character)
                return false;
        } else if (U_IS_SUPPLEMENTARY(character)) {
            // A supplementary code point needs two code units on the left-hand side.
            if (aEnd - a < 2)
                return false;
            if (*a++ != U16_LEAD(character))
                return false;
            if (*a++ != U16_TRAIL(character))
                return false;
        } else {
            return false;
        }
    }

    return a == aEnd;
}
// Given a first byte, gives the length of the UTF-8 sequence it begins. // Returns 0 for bytes that are not legal starts of UTF-8 sequences. // Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF). int UTF8SequenceLength(char b0) { return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0); }
// Out-of-line entry point that forwards the lead byte b0 to
// inlineUTF8SequenceLengthNonASCII() and returns its result unchanged.
int UTF8SequenceLengthNonASCII(char b0)
{
    const int sequenceLength = inlineUTF8SequenceLengthNonASCII(b0);
    return sequenceLength;
}
// Length of the UTF-8 sequence that starts with b0: 1 when isASCII(b0) holds,
// otherwise whatever inlineUTF8SequenceLengthNonASCII() reports for b0.
inline int inlineUTF8SequenceLength(char b0)
{
    if (isASCII(b0))
        return 1;
    return inlineUTF8SequenceLengthNonASCII(b0);
}