/*
 * Decodes the single UTF-8 encoded character that begins at `start`.
 *
 * The lead byte selects the sequence length (1 to 4 bytes); the following
 * bytes are passed through extract_utf8() and combined with the payload
 * bits of the lead byte. A lead byte in the range 128..191 is not the start
 * of a sequence and is reported through throw_HlError.
 *
 * The caller must guarantee that all bytes of the sequence are readable;
 * no length is checked here.
 */
UnicodeChar utf8_read(BYTE const* start)
{
    BYTE const lead = *start;

    // One-byte sequence (plain ASCII) - by far the most common case.
    if(likely(lead < 128))
    {
        return UnicodeChar(static_cast<char>(lead));
    }

    // Four-byte sequence.
    if(lead >= 240)
    {
        uint32_t const top = static_cast<uint32_t>(lead) - 240;
        return UnicodeChar(
            top * 262144
            + extract_utf8(start[1]) * 4096
            + extract_utf8(start[2]) * 64
            + extract_utf8(start[3])
        );
    }

    // Three-byte sequence.
    if(lead >= 224)
    {
        uint32_t const top = static_cast<uint32_t>(lead) - 224;
        return UnicodeChar(
            top * 4096
            + extract_utf8(start[1]) * 64
            + extract_utf8(start[2])
        );
    }

    // Two-byte sequence.
    if(lead >= 192)
    {
        uint32_t const top = static_cast<uint32_t>(lead) - 192;
        return UnicodeChar(
            top * 64
            + extract_utf8(start[1])
        );
    }

    // 128..191: a continuation byte where a lead byte was expected.
    throw_HlError("utf8 string read error");
}
bool XMLReader::ParseText() { // CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) mValue.clear(); mNodeType = kWhitespace; mCurrentName.Clear(); while (true) { switch (PeekChar()) { case UnicodeChar(0xFFFF): case UnicodeChar('<'): return ! mValue.empty(); case UnicodeChar('&'): ReadChar(); if (! ParseReference(mValue)) return false; break; default: { UnicodeChar c = ReadChar(); if (! IsWhitespace(c)) mNodeType = kText; mValue += c; break; } } } }
// Tells whether `c` may appear after the first character of an XML name.
//
//   NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender
//
// CombiningChar and Extender from the grammar are not checked here.
bool XMLReader::IsNameChar(UnicodeChar c)
{
    if (IsLetter(c) || IsDigit(c))
        return true;

    // Remaining candidates are the four permitted punctuation characters.
    if (c == UnicodeChar('.'))
        return true;
    if (c == UnicodeChar('-'))
        return true;
    if (c == UnicodeChar('_'))
        return true;
    return c == UnicodeChar(':');
}
// Reads an XML name from the input and stores it in `name`.
//
//   Name ::= (Letter | '_' | ':') (NameChar)*
//
// One character is always consumed, even when it turns out not to be a
// valid name start; in that case false is returned and `name` is untouched.
bool XMLReader::ParseName(Name & name)
{
    const UnicodeChar first = ReadChar();

    const bool validStart = (first == UnicodeChar('_'))
        || (first == UnicodeChar(':'))
        || IsLetter(first);
    if (! validStart)
        return false;

    UnicodeString collected;
    collected += first;

    // Only consume characters that are known to belong to the name.
    while (IsNameChar(PeekChar()))
    {
        collected += ReadChar();
    }

    name.SetName(collected);
    return true;
}
bool XMLReader::IsName(const UnicodeString & name) { // Name ::= (Letter | '_' | ':') (NameChar)* size_t n = name.size(); if (n == 0) return false; UnicodeChar c = name[0]; if ((c == UnicodeChar('_')) || (c == UnicodeChar(':')) || IsLetter(c)) { for (size_t i = 1; i < n; ++i) { if (! IsNameChar(name[i])) return false; } } return true; }
// Compares the Unicode string `a` with the NUL-terminated narrow string `b`.
// Each char of `b` is converted with UnicodeChar(...) before comparison.
// Returns true only when both have the same length and every position matches.
bool XMLReader::Equals(const UnicodeString & a, const char * b)
{
    const size_t expected = std::strlen(b);
    if (a.size() != expected)
        return false;

    // Advance while the strings agree; stopping early means a mismatch.
    size_t pos = 0;
    while ((pos < expected) && (a[pos] == UnicodeChar(b[pos])))
    {
        ++pos;
    }
    return pos == expected;
}
// Reads a quoted attribute value and stores its text in `value`.
//
//   AttValue ::= '"' ([^<&"] | Reference)* '"'
//              | "'" ([^<&'] | Reference)* "'"
//
// `value` is cleared first. References are expanded through ParseReference.
// Returns true once the closing quote (matching the opening one) has been
// consumed. Returns false when there is no opening quote, when a '<' is
// found inside the value, when a reference is malformed, or when the input
// ends before the closing quote.
bool XMLReader::ParseAttValue(UnicodeString & value)
{
    value.clear();

    // Work out which quote character opens (and therefore closes) the value.
    UnicodeChar quote;
    if (ParseString("\""))
        quote = UnicodeChar('"');
    else if (ParseString("'"))
        quote = UnicodeChar('\'');
    else
        return false;

    while (true)
    {
        const UnicodeChar c = ReadChar();

        if (c == quote)
            return true;

        // '<' is never allowed inside an attribute value. ReadChar reports
        // end of input as 0xFFFF; without that check an unterminated value
        // would make this loop append 0xFFFF forever. The 0 check is kept
        // from the original code.
        if ((c == UnicodeChar('<')) || (c == UnicodeChar(0)) || (c == UnicodeChar(0xFFFF)))
            return false;

        if (c == UnicodeChar('&'))
        {
            // The '&' has been consumed, as ParseReference expects.
            if (! ParseReference(value))
                return false;
        }
        else
        {
            value += c;
        }
    }
}
// Returns the character at character index `offset`, or a default
// constructed UnicodeChar when `offset` is not less than charLength().
UnicodeChar StringCell::charAt(CharLengthType offset) const
{
    if (offset < charLength())
    {
        // charPointer() is called through a non-const `this`, so the
        // constness of this method is cast away for the lookup.
        StringCell *mutableSelf = const_cast<StringCell*>(this);
        const std::uint8_t *cursor = mutableSelf->charPointer(offset);

        return utf8::decodeChar(&cursor);
    }

    return UnicodeChar();
}
// Consumes and returns the next character of the decoded output buffer.
// When the buffer is empty it is refilled once via FillOutputBuffer(); if it
// is still empty afterwards, UnicodeChar(0xFFFF) is returned as the
// end-of-input marker.
UnicodeChar XMLReader::ReadChar()
{
    if (! (mOutputStart < mOutputEnd))
    {
        FillOutputBuffer();

        if (! (mOutputStart < mOutputEnd))
            return UnicodeChar(0xFFFF);
    }

    return *mOutputStart++;
}
void XMLReader::Name::DivideName() { mNamespaceURI.clear(); UnicodeString::size_type i = mName.find(UnicodeChar(':')); if (i == UnicodeString::npos) { mPrefix.clear(); mLocalName = mName; } else { mPrefix = UnicodeString(mName, 0, i); mLocalName = UnicodeString(mName, i + 1); } }
bool XMLReader::BufferStartsWith(const char * prefix) { ptrdiff_t prefixLen = std::strlen(prefix); if (mOutputEnd - mOutputStart < prefixLen) FillOutputBuffer(); if (mOutputEnd - mOutputStart >= prefixLen) { const char * prefixEnd = prefix + prefixLen; const UnicodeChar * output = mOutputStart; while (prefix < prefixEnd) { if (*output++ != UnicodeChar(*prefix++)) return false; } return true; } else return false; }
void ConvertToUnicode::Convert(const char * & sourceStart, const char * sourceEnd, UnicodeChar * & destStart, UnicodeChar * destEnd) { // Note that Microsoft's code page 65001 (UTF-8) does not recognize BOM, so we have to // handle that manually. if (mIsStartOfInput && (mSourceEncoding == TextEncoding::UTF8()) && (sourceStart + 3 <= sourceEnd) && (destStart < destEnd) && (sourceStart[0] == char(0xEF)) && (sourceStart[1] == char(0xBB)) && (sourceStart[2] == char(0xBF))) { *destStart++ = UnicodeChar(0xFEFF); sourceStart += 3; mIsStartOfInput = false; } while ((sourceStart < sourceEnd) && (destStart < destEnd)) { const char * charEnd = sourceStart; while (true) { if (charEnd >= sourceEnd) return; if (! ::IsDBCSLeadByteEx(mSourceEncoding, *charEnd++)) break; } int result = ::MultiByteToWideChar(mSourceEncoding, 0, sourceStart, int(charEnd - sourceStart), destStart, int(destEnd - destStart)); if (result > 0) { destStart += result; sourceStart = charEnd; mIsStartOfInput = false; } else { DWORD err = ::GetLastError(); if (err == ERROR_INSUFFICIENT_BUFFER) { return; } else { ThrowXMLError(err); } } } }
// Exercises byte-array sharing between bytevectors, strings and symbols:
// every copy/conversion is expected to share its backing array with the
// source, and every write is expected to break that sharing.
static void testAll(World &world)
{
	auto helloBytes = reinterpret_cast<const std::uint8_t*>(u8"Hello world everyone!");
	const size_t helloByteCount = 21;

	// Bytevector that all later values are derived from
	alloc::BytevectorRef baseBv(world, BytevectorCell::fromData(world, helloBytes, helloByteCount));

	// A plain copy starts out sharing the byte array...
	alloc::BytevectorRef clonedBv(world, baseBv->copy(world));
	ASSERT_TRUE(sharedByteArrayFor(baseBv) == sharedByteArrayFor(clonedBv));

	// ...until one of its bytes is written
	ASSERT_TRUE(clonedBv->setByteAt(0, 4));
	ASSERT_FALSE(sharedByteArrayFor(baseBv) == sharedByteArrayFor(clonedBv));

	// Appending a single bytevector also shares the array
	alloc::BytevectorRef joinedBv(world, BytevectorCell::fromAppended(world, {baseBv}));
	ASSERT_TRUE(sharedByteArrayFor(baseBv) == sharedByteArrayFor(joinedBv));

	// Replacing a range is a write and must unshare
	ASSERT_TRUE(joinedBv->replace(3, baseBv, 0, 1));
	ASSERT_FALSE(sharedByteArrayFor(baseBv) == sharedByteArrayFor(joinedBv));

	// Bytevector -> string conversion shares
	alloc::StringRef baseStr(world, baseBv->utf8ToString(world));
	ASSERT_TRUE(sharedByteArrayFor(baseBv) == sharedByteArrayFor(baseStr));

	// String copy shares
	alloc::StringRef clonedStr(world, baseStr->copy(world));
	ASSERT_TRUE(sharedByteArrayFor(baseStr) == sharedByteArrayFor(clonedStr));

	// Writing one character unshares
	ASSERT_TRUE(clonedStr->setCharAt(5, UnicodeChar('!')));
	ASSERT_FALSE(sharedByteArrayFor(baseStr) == sharedByteArrayFor(clonedStr));

	// Appending a single string shares
	alloc::StringRef joinedStr(world, StringCell::fromAppended(world, {baseStr}));
	ASSERT_TRUE(sharedByteArrayFor(baseStr) == sharedByteArrayFor(joinedStr));

	// Filling the string unshares
	ASSERT_TRUE(joinedStr->fill(UnicodeChar(4)));
	ASSERT_FALSE(sharedByteArrayFor(baseStr) == sharedByteArrayFor(joinedStr));

	// String -> symbol conversion shares
	alloc::SymbolRef joinedSym(world, SymbolCell::fromString(world, joinedStr));
	ASSERT_TRUE(sharedByteArrayFor(joinedStr) == sharedByteArrayFor(joinedSym));

	// Symbols are immutable, so sharing can only be broken by writing to
	// the string side
	joinedStr->replace(1, baseStr, 0, 1);
	ASSERT_FALSE(sharedByteArrayFor(joinedStr) == sharedByteArrayFor(joinedSym));

	//
	// Grand tour: string -> symbol -> string -> bytevector -> string.
	// With no writes along the way the last value still shares with the first.
	//

	alloc::StringRef tourStart(world, StringCell::fromUtf8StdString(world, u8"Hello world everyone!"));
	alloc::SymbolRef tourSymbol(world, SymbolCell::fromString(world, tourStart));
	alloc::StringRef tourMiddle(world, StringCell::fromSymbol(world, tourSymbol));
	alloc::BytevectorRef tourBv(world, tourMiddle->toUtf8Bytevector(world));
	alloc::StringRef tourEnd(world, tourBv->utf8ToString(world));

	ASSERT_TRUE(sharedByteArrayFor(tourStart) == sharedByteArrayFor(tourEnd));
}
// Tells whether the UnicodeChar sequence at `haystack` begins with the
// NUL-terminated narrow string `needle`. Only as many characters of
// `haystack` as `needle` is long are read (fewer on an early mismatch).
bool XMLReader::StartsWith(const UnicodeChar * haystack, const char * needle)
{
    for (; *needle; ++needle, ++haystack)
    {
        if (*haystack != UnicodeChar(*needle))
            return false;
    }
    return true;
}
// Returns true if a node was read successfully, false on EOF bool XMLReader::ReadInternal() { switch (mNodeType) { case kNone: FillInputBuffer(); if ((mInputEnd - mInputStart) >= 2) { uint16_t x = (uint16_t(mInputBuffer[0]) << 8) | mInputBuffer[1]; switch (x) { case 0xFEFF: case 0x003C: mConverter.Reset(TextEncoding::UTF16BE()); break; case 0xFFFE: case 0x3C00: mConverter.Reset(TextEncoding::UTF16LE()); break; } } mNodeType = kDocument; return true; case kDocument: // An XML document can start with: // document ::= prolog element Misc* // prolog ::= XMLDecl? Misc* (doctypedecl Misc*)? // XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>' // Misc ::= Comment | PI | S // doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>' // Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->' // PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>' // S ::= (#x20 | #x9 | #xD | #xA)+ // If the XML file starts with a byte order mark, throw it away. // The earlier code for the kNone case has already used it to // set the default encoding. ParseChar(UnicodeChar(0xFEFF)); if (BufferStartsWith("<?xml")) { if (! ParseXmlDeclaration()) return false; UnicodeString encodingName = GetAttribute("encoding"); if (encodingName.empty()) return true; TextEncoding newEncoding = TextEncoding::WebCharset(encodingName.c_str()); if (newEncoding == mConverter.GetSourceEncoding()) return true; // The encoding in the XML declaration is different from the one // we assumed, so we have to reset all the input buffering and // re-parse the XmlDeclaration. 
mConverter.Reset(newEncoding); mInput.Restart(); mInputStart = mInputEnd = mInputBuffer; mOutputStart = mOutputEnd = mOutputBuffer; ParseChar(UnicodeChar(0xFEFF)); return ParseXmlDeclaration(); } else if (StartsWithWhitespace()) return ParseRequiredWhitespace(); //else if (BufferStartsWith("<!--")) return ParseComment(); //else if (BufferStartsWith("<?")) return ParseProcessingInstruction(); //else if (BufferStartsWith("<!DOCTYPE")) return ParseDocumentType(); else if (BufferStartsWith("<")) return ParseElement(); else return false; case kXmlDeclaration: case kElement: case kEndElement: case kText: case kWhitespace: if (BufferStartsWith("</")) return ParseEndElement(); else if (BufferStartsWith("<")) return ParseElement(); else return ParseText(); } return false; }
/* Returns element `i` of `start`, narrowed to char and wrapped in a UnicodeChar. No bounds check is performed. */
UnicodeChar ref(size_t i) const
{
    char const narrow = (char) start[i];
    return UnicodeChar(narrow);
}
// Parses a character or entity reference and appends the character it
// stands for to `value`.
//
// Note that the '&' has already been read when this function is called.
//
//   Reference ::= EntityRef | CharRef
//   EntityRef ::= '&' Name ';'
//   CharRef   ::= '&#' [0-9]+ ';'
//               | '&#x' [0-9a-fA-F]+ ';'
//
// Returns true when a complete reference was consumed, false when the
// reference is malformed (including a character reference without any
// digits) or names an entity other than the five predefined ones.
bool XMLReader::ParseReference(UnicodeString & value)
{
    // "#x" has to be tried before "#", which is a prefix of it.
    int radix = 0;
    if (ParseString("#x"))
        radix = 16;
    else if (ParseString("#"))
        radix = 10;

    if (radix != 0)
    {
        UnicodeChar result = 0;
        bool sawDigit = false;

        while (true)
        {
            const UnicodeChar c = ReadChar();

            if (c == UnicodeChar(';'))
            {
                // The grammar requires at least one digit; "&#;" and "&#x;"
                // must not silently produce a NUL character.
                if (! sawDigit)
                    return false;
                value += result;
                return true;
            }

            int digit;
            if ((c >= UnicodeChar('0')) && (c <= UnicodeChar('9')))
                digit = c - UnicodeChar('0');
            else if ((radix == 16) && (c >= UnicodeChar('A')) && (c <= UnicodeChar('F')))
                digit = c - UnicodeChar('A') + 10;
            else if ((radix == 16) && (c >= UnicodeChar('a')) && (c <= UnicodeChar('f')))
                digit = c - UnicodeChar('a') + 10;
            else
                return false;   // includes the end-of-input marker

            // NOTE(review): as in the original code, a value too large for
            // UnicodeChar is not detected here.
            result = (result * radix) + digit;
            sawDigit = true;
        }
    }

    // Note that we're hardwiring the minimal set of entity names.
    // It would be more correct to call ParseName and then call
    // a function to get the entity value.
    if (ParseString("amp;"))
    {
        value += '&';
        return true;
    }
    if (ParseString("quot;"))
    {
        value += '"';
        return true;
    }
    if (ParseString("apos;"))
    {
        value += '\'';
        return true;
    }
    if (ParseString("lt;"))
    {
        value += '<';
        return true;
    }
    if (ParseString("gt;"))
    {
        value += '>';
        return true;
    }

    return false;
}