示例#1
0
文件: hlstrings.cpp 项目: AmkG/hl
/*
 * Reads the UTF-8 encoded character that begins at `start`.
 *
 * The lead byte selects the sequence length (1 to 4 bytes); every
 * following byte is passed through extract_utf8() and contributes six
 * bits to the result.  No end-of-buffer pointer is available here, so
 * the caller must guarantee that all bytes of the sequence are readable.
 *
 * A lead byte that cannot start a sequence - a stray continuation byte
 * (128..191) or one of the values 248..255 - is reported through
 * throw_HlError.  (Previously 248..255 were decoded as if they were a
 * 4-byte lead, which yields values above U+10FFFF.)
 */
UnicodeChar utf8_read(BYTE const* start) {
	BYTE const lead = *start;
	/*plain ASCII: one byte, by far the most common case*/
	if(likely(lead < 128)) {
		return UnicodeChar(static_cast<char>(lead));
	}
	/*110xxxxx: two-byte sequence*/
	if(lead >= 192 && lead < 224) {
		return UnicodeChar(
			(static_cast<uint32_t>(lead) - 192) * 64
			+ extract_utf8(*(start + 1))
		);
	}
	/*1110xxxx: three-byte sequence*/
	if(lead >= 224 && lead < 240) {
		return UnicodeChar(
			(static_cast<uint32_t>(lead) - 224) * 4096
			+ extract_utf8(*(start + 1)) * 64
			+ extract_utf8(*(start + 2))
		);
	}
	/*11110xxx: four-byte sequence*/
	if(lead >= 240 && lead < 248) {
		return UnicodeChar(
			(static_cast<uint32_t>(lead) - 240) * 262144
			+ extract_utf8(*(start + 1)) * 4096
			+ extract_utf8(*(start + 2)) * 64
			+ extract_utf8(*(start + 3))
		);
	}
	/*continuation byte in lead position, or an invalid lead byte*/
	throw_HlError("utf8 string read error");
}
	bool XMLReader::ParseText()
	{
		// CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)

		mValue.clear();
		mNodeType = kWhitespace;
		mCurrentName.Clear();
		while (true) {
			switch (PeekChar()) {
				case UnicodeChar(0xFFFF):
				case UnicodeChar('<'):
					return ! mValue.empty();
				case UnicodeChar('&'):
					ReadChar();
					if (! ParseReference(mValue)) return false;
					break;
				default: {
				    UnicodeChar c = ReadChar();
				    if (! IsWhitespace(c)) mNodeType = kText;
					mValue += c;
					break;
				}
			}
		}
	}
	bool XMLReader::IsNameChar(UnicodeChar c)
	{
		// NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender
		// Only letters, digits and the four punctuation characters are
		// tested explicitly here.
		if (IsLetter(c) || IsDigit(c)) return true;

		switch (c) {
			case UnicodeChar('.'):
			case UnicodeChar('-'):
			case UnicodeChar('_'):
			case UnicodeChar(':'):
				return true;
			default:
				return false;
		}
	}
	bool XMLReader::ParseName(Name & name)
	{
		// Name ::= (Letter | '_' | ':') (NameChar)*
		// Note that the first character is consumed even when it turns
		// out not to be a valid name start.
		const UnicodeChar first = ReadChar();
		const bool validStart =
			(first == UnicodeChar('_')) || (first == UnicodeChar(':')) || IsLetter(first);
		if (! validStart) return false;

		// Collect the remaining name characters, peeking so that the
		// character following the name stays in the input.
		UnicodeString collected;
		collected += first;
		while (IsNameChar(PeekChar())) {
			collected += ReadChar();
		}
		name.SetName(collected);
		return true;
	}
	bool XMLReader::IsName(const UnicodeString & name)
	{
		// Name ::= (Letter | '_' | ':') (NameChar)*
		size_t n = name.size();
		if (n == 0) return false;
		UnicodeChar c = name[0];
		if ((c == UnicodeChar('_')) || (c == UnicodeChar(':')) || IsLetter(c)) {
			for (size_t i = 1; i < n; ++i) {
				if (! IsNameChar(name[i])) return false;
			}
		}
		return true;
	}
	// Compares a UnicodeString with a NUL-terminated narrow string,
	// widening each char of `b` to UnicodeChar for the comparison.
	bool XMLReader::Equals(const UnicodeString & a, const char * b)
	{
		const size_t expected = std::strlen(b);
		if (a.size() != expected) return false;

		// Advance while the characters agree; reaching the end means
		// every position matched.
		size_t pos = 0;
		while ((pos < expected) && (a[pos] == UnicodeChar(b[pos]))) {
			++pos;
		}
		return pos == expected;
	}
	// Parses a quoted attribute value into `value`.
	//
	// `value` is cleared first.  The opening quote (either '"' or '\'')
	// selects the closing delimiter; references are expanded through
	// ParseReference.  Returns true once the matching closing quote has
	// been consumed, false if no opening quote is present, if a '<' or a
	// malformed reference is encountered, or if the input ends before
	// the closing quote.
	bool XMLReader::ParseAttValue(UnicodeString & value)
	{
		// AttValue ::= '"' ([^<&"] | Reference)* '"'
		//	|  "'" ([^<&'] | Reference)* "'"
		value.clear();

		// Both quoting styles share one loop; only the delimiter differs.
		UnicodeChar quote;
		if (ParseString("\"")) quote = UnicodeChar('"');
		else if (ParseString("'")) quote = UnicodeChar('\'');
		else return false;

		while (true) {
			UnicodeChar c = ReadChar();
			if (c == quote) return true;
			switch (c) {
				case UnicodeChar('<'):
				case UnicodeChar(0):
				// ReadChar reports end of input as 0xFFFF; without this
				// case an unterminated value would loop forever.
				case UnicodeChar(0xFFFF):
					return false;
				case UnicodeChar('&'):
					if (! ParseReference(value)) return false;
					break;
				default:
					// Includes the quote character of the other style.
					value += c;
					break;
			}
		}
	}
示例#8
0
// Returns the character at character index `offset`, or a
// default-constructed UnicodeChar when the index is out of range.
UnicodeChar StringCell::charAt(CharLengthType offset) const
{
	if (offset < charLength())
	{
		// charPointer() is called through a non-const pointer to this cell;
		// decodeChar() receives the address of the cursor.
		const std::uint8_t *cursor = const_cast<StringCell*>(this)->charPointer(offset);
		return utf8::decodeChar(&cursor);
	}

	return UnicodeChar();
}
	// Consumes and returns the next character from the output buffer,
	// refilling the buffer once if it is empty.  Returns 0xFFFF when no
	// character is available even after the refill.
	UnicodeChar XMLReader::ReadChar()
	{
		if (mOutputStart >= mOutputEnd) {
			FillOutputBuffer();
			if (mOutputStart >= mOutputEnd) return UnicodeChar(0xFFFF);
		}
		return *mOutputStart++;
	}
示例#10
0
	void XMLReader::Name::DivideName()
	{
		mNamespaceURI.clear();
		UnicodeString::size_type i = mName.find(UnicodeChar(':'));
		if (i == UnicodeString::npos) {
			mPrefix.clear();
			mLocalName = mName;
		}
		else {
			mPrefix = UnicodeString(mName, 0, i);
			mLocalName = UnicodeString(mName, i + 1);
		}
	}
示例#11
0
	// Returns true if the unread part of the output buffer begins with
	// the narrow string `prefix`.  Nothing is consumed.  The buffer is
	// refilled once if it currently holds fewer characters than the
	// prefix is long.
	bool XMLReader::BufferStartsWith(const char * prefix)
	{
		const ptrdiff_t needed = std::strlen(prefix);
		if (mOutputEnd - mOutputStart < needed) FillOutputBuffer();
		if (mOutputEnd - mOutputStart < needed) return false;

		for (ptrdiff_t i = 0; i < needed; ++i) {
			if (mOutputStart[i] != UnicodeChar(prefix[i])) return false;
		}
		return true;
	}
	// Converts bytes in [sourceStart, sourceEnd) from mSourceEncoding into
	// UnicodeChars written to [destStart, destEnd).
	//
	// Both sourceStart and destStart are passed by reference and are
	// advanced past whatever was consumed / produced, so the caller can
	// see how far the conversion got.  The function returns early when the
	// source ends in the middle of a character or when the destination
	// cannot hold the next converted character; any other conversion
	// failure is reported through ThrowXMLError.
	void ConvertToUnicode::Convert(const char * & sourceStart, const char * sourceEnd,
		UnicodeChar * & destStart, UnicodeChar * destEnd)
	{
		// Note that Microsoft's code page 65001 (UTF-8) does not recognize BOM, so we have to
		// handle that manually.
		// The BOM is only translated when all three bytes are available and
		// there is room for one output character.
		if (mIsStartOfInput && 
			(mSourceEncoding == TextEncoding::UTF8()) &&
			(sourceStart + 3 <= sourceEnd) &&
			(destStart < destEnd) &&
			(sourceStart[0] == char(0xEF)) &&
			(sourceStart[1] == char(0xBB)) &&
			(sourceStart[2] == char(0xBF))) {
			*destStart++ = UnicodeChar(0xFEFF);
			sourceStart += 3;
			mIsStartOfInput = false;
		}
		// Convert one source character per iteration.
		while ((sourceStart < sourceEnd) && (destStart < destEnd)) {
			// Find the end of the current character: skip every byte that
			// IsDBCSLeadByteEx reports as a lead byte, then include the
			// first byte that is not one.
			// NOTE(review): presumably this only detects multi-byte
			// characters for DBCS code pages - confirm how multi-byte
			// UTF-8 sequences are expected to be handled here.
			const char * charEnd = sourceStart;
			while (true) {
				// Source ends inside a character: leave sourceStart at the
				// start of that character and stop.
				if (charEnd >= sourceEnd) return;
				if (! ::IsDBCSLeadByteEx(mSourceEncoding, *charEnd++)) break;
			}
			int result = ::MultiByteToWideChar(mSourceEncoding, 0, sourceStart, int(charEnd - sourceStart),
				destStart, int(destEnd - destStart));
			if (result > 0) {
				// result is the number of UnicodeChars written.
				destStart += result;
				sourceStart = charEnd;
				mIsStartOfInput = false;
			}
			else {
				DWORD err = ::GetLastError();
				if (err == ERROR_INSUFFICIENT_BUFFER) {
					// Not enough room left in the destination; the
					// character stays unconsumed.
					return;
				}
				else {
					ThrowXMLError(err);
				}
			}
		}
	}
示例#13
0
	// Exercises byte-array sharing between bytevectors, strings and symbols.
	//
	// Each step creates a value derived from another one and asserts that
	// both report the same shared byte array (via sharedByteArrayFor), then
	// writes to the derived value and asserts that the byte arrays now
	// differ.  The final section checks that sharing survives a chain of
	// string -> symbol -> string -> bytevector -> string conversions.
	static void testAll(World &world)
	{
		auto sourceString = reinterpret_cast<const std::uint8_t*>(u8"Hello world everyone!");
		// Length of the literal above in bytes (excluding the terminator)
		const size_t sourceLength = 21;

		// Create a source bytevector
		alloc::BytevectorRef origBv(world, BytevectorCell::fromData(world, sourceString, sourceLength));

		// Create a direct copy
		alloc::BytevectorRef copyBv(world, origBv->copy(world));
		ASSERT_TRUE(sharedByteArrayFor(origBv) == sharedByteArrayFor(copyBv));

		// Set a byte of the copy
		ASSERT_TRUE(copyBv->setByteAt(0, 4));
		// The sharing should now be broken
		ASSERT_FALSE(sharedByteArrayFor(origBv) == sharedByteArrayFor(copyBv));

		// Create a copy from appending a single bytevector
		alloc::BytevectorRef appendedBv(world, BytevectorCell::fromAppended(world, {origBv}));
		ASSERT_TRUE(sharedByteArrayFor(origBv) == sharedByteArrayFor(appendedBv));

		// Replace part of the byte array
		ASSERT_TRUE(appendedBv->replace(3, origBv, 0, 1));
		// Sharing should now be broken
		ASSERT_FALSE(sharedByteArrayFor(origBv) == sharedByteArrayFor(appendedBv));

		// Create a string from the bytevector
		alloc::StringRef origString(world, origBv->utf8ToString(world));
		ASSERT_TRUE(sharedByteArrayFor(origBv) == sharedByteArrayFor(origString));

		// Create a string as a copy
		alloc::StringRef copyString(world, origString->copy(world));
		ASSERT_TRUE(sharedByteArrayFor(origString) == sharedByteArrayFor(copyString));

		// Set a character in the string
		ASSERT_TRUE(copyString->setCharAt(5, UnicodeChar('!')));
		// Sharing should now be broken
		ASSERT_FALSE(sharedByteArrayFor(origString) == sharedByteArrayFor(copyString));

		// Create a string from appending a single string
		alloc::StringRef appendedString(world, StringCell::fromAppended(world, {origString}));
		ASSERT_TRUE(sharedByteArrayFor(origString) == sharedByteArrayFor(appendedString));

		// Fill the string
		ASSERT_TRUE(appendedString->fill(UnicodeChar(4)));
		// Sharing should now be broken
		ASSERT_FALSE(sharedByteArrayFor(origString) == sharedByteArrayFor(appendedString));

		// Create a symbol from the appended string
		alloc::SymbolRef symbol(world, SymbolCell::fromString(world, appendedString));
		ASSERT_TRUE(sharedByteArrayFor(appendedString) == sharedByteArrayFor(symbol));

		// Writing to the string again should break sharing
		// Symbols are immutable so breaking cannot happen from the symbol side
		// NOTE(review): unlike the earlier writes, the result of this
		// replace() call is not asserted - confirm whether that is intended.
		appendedString->replace(1, origString, 0, 1);
		ASSERT_FALSE(sharedByteArrayFor(appendedString) == sharedByteArrayFor(symbol));

		//
		// Test a grand tour of string ->  symbol -> string -> bytevector -> string
		//
		
		alloc::StringRef firstString(world, StringCell::fromUtf8StdString(world, u8"Hello world everyone!"));
		alloc::SymbolRef firstSymbol(world, SymbolCell::fromString(world, firstString));
		alloc::StringRef secondString(world, StringCell::fromSymbol(world, firstSymbol));
		alloc::BytevectorRef firstBv(world, secondString->toUtf8Bytevector(world));
		alloc::StringRef thirdString(world, firstBv->utf8ToString(world));

		// The first and last values of the chain should still share storage
		ASSERT_TRUE(sharedByteArrayFor(firstString) == sharedByteArrayFor(thirdString));
	}
示例#14
0
	// Returns true if the characters at `haystack` match every character
	// of the NUL-terminated narrow string `needle`.  No length is given
	// for `haystack`; comparison stops at the first mismatch or at the
	// end of `needle`.
	bool XMLReader::StartsWith(const UnicodeChar * haystack, const char * needle) {
		for (; *needle != '\0'; ++needle, ++haystack) {
			if (*haystack != UnicodeChar(*needle)) return false;
		}
		return true;
	}
示例#15
0
	// Returns true if a node was read successfully, false on EOF
	//
	// The action depends on the type of the node read previously
	// (mNodeType):
	//  - kNone:     fills the input buffer, sniffs the first two bytes
	//               for a UTF-16 byte order mark or a UTF-16 encoded '<',
	//               and moves to kDocument.
	//  - kDocument: parses the first construct of the document,
	//               re-reading the input if the XML declaration names an
	//               encoding different from the one assumed so far.
	//  - otherwise: parses an end tag, a start tag or text, whichever
	//               comes next.
	bool XMLReader::ReadInternal()
	{
		switch (mNodeType) {
			case kNone:
				FillInputBuffer();
				if ((mInputEnd - mInputStart) >= 2) {
					// Combine the first two bytes big-endian.  Each byte
					// goes through unsigned char first so that values
					// >= 0x80 are not sign-extended if the buffer holds
					// plain (signed) char; otherwise FE FF would become
					// 0xFFFF and the UTF-16BE BOM would be missed.
					const unsigned int hi = static_cast<unsigned char>(mInputBuffer[0]);
					const unsigned int lo = static_cast<unsigned char>(mInputBuffer[1]);
					uint16_t x = static_cast<uint16_t>((hi << 8) | lo);
					switch (x) {
						case 0xFEFF:	// BOM, big-endian
						case 0x003C:	// '<', big-endian
							mConverter.Reset(TextEncoding::UTF16BE());
							break;
						case 0xFFFE:	// BOM, little-endian
						case 0x3C00:	// '<', little-endian
							mConverter.Reset(TextEncoding::UTF16LE());
							break;
					}
				}
				mNodeType = kDocument;
				return true;
			
			case kDocument:
				// An XML document can start with:
				// document	::= prolog element Misc*
				// prolog	::= XMLDecl? Misc* (doctypedecl Misc*)?
				// XMLDecl	::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
				// Misc ::= Comment | PI | S
				// doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
				// Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
				// PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
				// S ::= (#x20 | #x9 | #xD | #xA)+

				// If the XML file starts with a byte order mark, throw it away.
				// The earlier code for the kNone case has already used it to
				// set the default encoding.
				ParseChar(UnicodeChar(0xFEFF));

				if (BufferStartsWith("<?xml")) {
					if (! ParseXmlDeclaration()) return false;
					UnicodeString encodingName = GetAttribute("encoding");
					if (encodingName.empty()) return true;
					TextEncoding newEncoding = TextEncoding::WebCharset(encodingName.c_str());
					if (newEncoding == mConverter.GetSourceEncoding()) return true;

					// The encoding in the XML declaration is different from the one
					// we assumed, so we have to reset all the input buffering and
					// re-parse the XmlDeclaration.
					mConverter.Reset(newEncoding);
					mInput.Restart();
					mInputStart = mInputEnd = mInputBuffer;
					mOutputStart = mOutputEnd = mOutputBuffer;
					ParseChar(UnicodeChar(0xFEFF));
					return ParseXmlDeclaration();
				}
				else if (StartsWithWhitespace()) return ParseRequiredWhitespace();
				//else if (BufferStartsWith("<!--")) return ParseComment();
				//else if (BufferStartsWith("<?")) return ParseProcessingInstruction();
				//else if (BufferStartsWith("<!DOCTYPE")) return ParseDocumentType();
				else if (BufferStartsWith("<")) return ParseElement();
				else return false;

			case kXmlDeclaration:
			case kElement:
			case kEndElement:
			case kText:
			case kWhitespace:
				// "</" must be tested before "<" because the latter is a
				// prefix of the former.
				if (BufferStartsWith("</")) return ParseEndElement();
				else if (BufferStartsWith("<")) return ParseElement();
				else return ParseText();
		}
		return false;
	}
示例#16
0
文件: hlstrings.cpp 项目: AmkG/hl
	/*returns element i of `start` as a UnicodeChar, converting it
	through char first*/
	UnicodeChar ref(size_t i) const {
		char const raw = static_cast<char>(start[i]);
		return UnicodeChar(raw);
	}
示例#17
0
	// Parses a character or entity reference and appends its replacement
	// text to `value`.
	//
	// Returns true when a complete reference (including the terminating
	// ';') was recognized, false otherwise.  Numeric references must
	// contain at least one digit, as the grammar below requires; only
	// the five predefined entity names are recognized.
	bool XMLReader::ParseReference(UnicodeString & value)
	{
		// Note that the '&' has already been read when this
		// function is called.
		// Reference 	   ::=    	EntityRef | CharRef
		// EntityRef 	   ::=    	'&' Name ';'
		// CharRef 	   ::=    	'&#' [0-9]+ ';'
		//	| '&#x' [0-9a-fA-F]+ ';'
		if (ParseString("#x")) {
			// Hexadecimal character reference.
			UnicodeChar result = 0;
			bool sawDigit = false;
			while (true) {
				UnicodeChar c = ReadChar();
				if (c == UnicodeChar(';')) {
					// "&#x;" is not a valid reference.
					if (! sawDigit) return false;
					value += result;
					return true;
				}
				else if ((c >= UnicodeChar('0')) && (c <= UnicodeChar('9'))) {
					result = UnicodeChar((result * 16) + (c - UnicodeChar('0')));
				}
				else if ((c >= UnicodeChar('A')) && (c <= UnicodeChar('F'))) {
					result = UnicodeChar((result * 16) + (c - UnicodeChar('A') + 10));
				}
				else if ((c >= UnicodeChar('a')) && (c <= UnicodeChar('f'))) {
					result = UnicodeChar((result * 16) + (c - UnicodeChar('a') + 10));
				}
				// Anything else, including the 0xFFFF that ReadChar
				// returns at end of input, is a malformed reference.
				else return false;
				sawDigit = true;
			}
		}
		else if (ParseString("#")) {
			// Decimal character reference.
			UnicodeChar result = 0;
			bool sawDigit = false;
			while (true) {
				UnicodeChar c = ReadChar();
				if (c == UnicodeChar(';')) {
					// "&#;" is not a valid reference.
					if (! sawDigit) return false;
					value += result;
					return true;
				}
				else if ((c >= UnicodeChar('0')) && (c <= UnicodeChar('9'))) {
					result = UnicodeChar((result * 10) + (c - UnicodeChar('0')));
				}
				else return false;
				sawDigit = true;
			}
		}
		// Note that we're hardwiring the minimal set of entity names.
		// It would be more correct to call ParseName and then call
		// a function to get the entity value.
		else if (ParseString("amp;")) {
			value += UnicodeChar('&');
			return true;
		}
		else if (ParseString("quot;")) {
			value += UnicodeChar('"');
			return true;
		}
		else if (ParseString("apos;")) {
			value += UnicodeChar('\'');
			return true;
		}
		else if (ParseString("lt;")) {
			value += UnicodeChar('<');
			return true;
		}
		else if (ParseString("gt;")) {
			value += UnicodeChar('>');
			return true;
		}

		return false;
	}