const char* TiXmlElement::ReadValue( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding ) { TiXmlDocument* document = GetDocument(); // Read in text and elements in any order. const char* pWithWhiteSpace = p; p = SkipWhiteSpace( p, encoding ); while ( p && *p ) { if ( *p != '<' ) { // Take what we have, make a text element. TiXmlText* textNode = new TiXmlText( "" ); if ( !textNode ) { return 0; } if ( TiXmlBase::IsWhiteSpaceCondensed() ) { p = textNode->Parse( p, data, encoding ); } else { // Special case: we want to keep the white space // so that leading spaces aren't removed. p = textNode->Parse( pWithWhiteSpace, data, encoding ); } if ( !textNode->Blank() ) LinkEndChild( textNode ); else delete textNode; } else { // We hit a '<' // Have we hit a new element or an end tag? This could also be // a TiXmlText in the "CDATA" style. if ( StringEqual( p, "</", false, encoding ) ) { return p; } else { TiXmlNode* node = Identify( p, encoding ); if ( node ) { p = node->Parse( p, data, encoding ); LinkEndChild( node ); } else { return 0; } } } pWithWhiteSpace = p; p = SkipWhiteSpace( p, encoding ); } if ( !p ) { if ( document ) document->SetError( TIXML_ERROR_READING_ELEMENT_VALUE, 0, 0, encoding ); } return p; }
const char* TiXmlDocument::Parse( const char* p, TiXmlParsingData* prevData, TiXmlEncoding encoding ) { ClearError(); // Parse away, at the document level. Since a document // contains nothing but other tags, most of what happens // here is skipping white space. if ( !p || !*p ) { SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN ); return 0; } // Note that, for a document, this needs to come // before the while space skip, so that parsing // starts from the pointer we are given. location.Clear(); if ( prevData ) { location.row = prevData->cursor.row; location.col = prevData->cursor.col; } else { location.row = 0; location.col = 0; } TiXmlParsingData data( p, TabSize(), location.row, location.col ); location = data.Cursor(); if ( encoding == TIXML_ENCODING_UNKNOWN ) { // Check for the Microsoft UTF-8 lead bytes. const unsigned char* pU = (const unsigned char*)p; if ( *(pU+0) && *(pU+0) == TIXML_UTF_LEAD_0 && *(pU+1) && *(pU+1) == TIXML_UTF_LEAD_1 && *(pU+2) && *(pU+2) == TIXML_UTF_LEAD_2 ) { encoding = TIXML_ENCODING_UTF8; useMicrosoftBOM = true; } } p = SkipWhiteSpace( p, encoding ); if ( !p ) { SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN ); return 0; } while ( p && *p ) { TiXmlNode* node = Identify( p, encoding ); if ( node ) { p = node->Parse( p, &data, encoding ); LinkEndChild( node ); } else { break; } // Did we get encoding info? if ( encoding == TIXML_ENCODING_UNKNOWN && node->ToDeclaration() ) { TiXmlDeclaration* dec = node->ToDeclaration(); const char* enc = dec->Encoding(); assert( enc ); if ( *enc == 0 ) encoding = TIXML_ENCODING_UTF8; else if ( StringEqual( enc, "UTF-8", true, TIXML_ENCODING_UNKNOWN ) ) encoding = TIXML_ENCODING_UTF8; else if ( StringEqual( enc, "UTF8", true, TIXML_ENCODING_UNKNOWN ) ) encoding = TIXML_ENCODING_UTF8; // incorrect, but be nice else encoding = TIXML_ENCODING_LEGACY; } p = SkipWhiteSpace( p, encoding ); } // Was this empty? if ( !firstChild ) { SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, encoding ); return 0; } // All is well. return p; }
const char* TiXmlDocument::Parse ( const char* p, TiXmlParsingData* prevData, TiXmlEncoding encoding ) { ClearError(); // Parse away, at the document level. Since a document // contains nothing but other tags, most of what happens // here is skipping white space. // sherm 100319: I changed this so that untagged top-level text is // parsed as a Text node rather than a parsing error. CDATA text was // already allowed at the top level so this seems more consistent. if ( !p || !*p ) { SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN ); return 0; } // Note that, for a document, this needs to come // before the while space skip, so that parsing // starts from the pointer we are given. location.Clear(); if ( prevData ) { location.row = prevData->cursor.row; location.col = prevData->cursor.col; } else { location.row = 0; location.col = 0; } TiXmlParsingData data( p, TabSize(), location.row, location.col ); location = data.Cursor(); if ( encoding == TIXML_ENCODING_UNKNOWN ) { // Check for the Microsoft UTF-8 lead bytes. const unsigned char* pU = (const unsigned char*)p; if ( *(pU+0) && *(pU+0) == TIXML_UTF_LEAD_0 && *(pU+1) && *(pU+1) == TIXML_UTF_LEAD_1 && *(pU+2) && *(pU+2) == TIXML_UTF_LEAD_2 ) { encoding = TIXML_ENCODING_UTF8; useMicrosoftBOM = true; } } // Remember the start of white space in case we end up reading a text // element in a "keep white space" mode. const char* pWithWhiteSpace = p; p = SkipWhiteSpace( p, encoding ); if ( !p ) { SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN ); return 0; } // sherm 100319: ignore all but the first Declaration bool haveSeenDeclaration = false; while ( p && *p ) { TiXmlNode* node = 0; if ( *p != '<' ) { // sherm 100319: I added this case by stealing the code from // Element parsing; see above comment. // Take what we have, make a text element. TiXmlText* textNode = new TiXmlText( "" ); if ( !textNode ) { SetError( TIXML_ERROR_OUT_OF_MEMORY, 0, 0, encoding ); return 0; } if ( TiXmlBase::IsWhiteSpaceCondensed() ) { p = textNode->Parse( p, &data, encoding ); } else { // Special case: we want to keep the white space // so that leading spaces aren't removed. p = textNode->Parse( pWithWhiteSpace, &data, encoding ); } if ( !textNode->Blank() ) { LinkEndChild( textNode ); node = textNode; } else delete textNode; } else // We saw a '<', now identify what kind of tag it is. { TiXmlNode* node = Identify( p, encoding ); if ( node ) { p = node->Parse( p, &data, encoding ); if (node->ToDeclaration()) { if (haveSeenDeclaration) { delete node; node=0; // ignore duplicate Declaration } else haveSeenDeclaration = true; } if (node) LinkEndChild( node ); } else { // If Identify fails then no further parsing is possible. break; } } // Did we get encoding info? if ( encoding == TIXML_ENCODING_UNKNOWN && node && node->ToDeclaration() ) { TiXmlDeclaration* dec = node->ToDeclaration(); const char* enc = dec->Encoding(); assert( enc ); if ( *enc == 0 ) encoding = TIXML_ENCODING_UTF8; else if ( StringEqual( enc, "UTF-8", true, TIXML_ENCODING_UNKNOWN ) ) encoding = TIXML_ENCODING_UTF8; else if ( StringEqual( enc, "UTF8", true, TIXML_ENCODING_UNKNOWN ) ) encoding = TIXML_ENCODING_UTF8; // incorrect, but be nice else encoding = TIXML_ENCODING_LEGACY; } pWithWhiteSpace = p; p = SkipWhiteSpace( p, encoding ); } // Was this empty? if ( !firstChild ) { SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, encoding ); return 0; } // All is well. return p; }