/**
 * Parses a whole document and returns the node that represents it.
 *
 * A fresh Node is allocated with `new`, given an empty name and the
 * `Document` node type, and then handed to the sub-parsers in order:
 * prolog, root element, and any number of trailing "misc" items.
 *
 * A failed prolog or element parse is passed to reportError(); parsing
 * then continues with the next step (as far as this function is
 * concerned -- whether reportError() returns is not visible here).
 *
 * @return the newly allocated document node (allocated with `new`;
 *         presumably owned by the caller -- confirm against callers).
 */
Node *Parser::parseDocument()
{
    // Prime the reader before any sub-parser runs.
    readChar( false );

    Node *document = new Node;
    document->setName( "" );
    document->setNodeType( Document );

    if( !parseProlog( document ) )
    {
        reportError( "Missing prolog" );
    }

    if( !parseElement( document ) )
    {
        reportError( "Missing content" );
    }

    // Consume trailing misc items until parseMisc() reports there are none left.
    for( ;; )
    {
        if( !parseMisc( document ) )
        {
            break;
        }
    }

    return document;
}
void SGMLParser :: parse(const char * aSchemaFile, const char * aDocument) { printf("Loading schema\n"); clock_t start = clock(); loadSchema(aSchemaFile); clock_t end = clock(); printf("Time taken for loading the schema: %f\n", (double)(end - start)/CLOCKS_PER_SEC); start = clock(); printf("Starting to scan the HTML document\n"); mScanner->setDocument(aDocument); printf("Loaded the document\n"); // Assume the doctype is HTML. mDocTypeName = "HTML"; ElementParser elementParser(mScanner, mSchema, mDocTypeName); // See if we can scan a whole HTML document. try { mToken = mScanner->nextToken(); parseSStar(); printf("Got first token: %s\n", mScanner->getTokenText().c_str()); parseProlog(); while (mToken != EOF_SYM) { switch (mToken) { case ELEMENT_OPEN_SYM: { // Kickstart the element parser. TElementPtr element = elementParser.parseStartTag(); TDOMString name = element->getTagName(); ElementToken elmToken = ElementToken(START_TAG, name, element); TElementDeclarationPtr declaration = mSchema->getDeclaration(mDocTypeName); mToken = elementParser.parse(elmToken, declaration); break; } case DECLARATION_SYM: { mToken = mScanner->nextToken(); if (mToken == COMMENT_SYM) { if (mCommentDeclParser == NULL) mCommentDeclParser = new CommentDeclParser(mScanner, TSchemaPtr()); mToken = mCommentDeclParser->parse(mToken, ELEMENT_OPEN_SYM); } else throw ReadException(mScanner->getLineNr(), mScanner->getCharNr(), "Expected comment sym", GENERIC, true); break; } case DECLARATION_END_SYM: { mToken = mScanner->nextToken(ELEMENT_OPEN_SYM); break; } case TEXT_SYM: { mToken = mScanner->nextToken(); break; } case SPACE_SYM: { // Not doing anything with that right now. 
mToken = mScanner->nextToken(); break; } default: { printf("Found token: %s\n", mScanner->getTokenText().c_str()); mToken = mScanner->nextToken(); } } } } catch(ReadException r) { printf( "Found error: line: %i char %i message: %s\n", r.getLineNr(), r.getCharNr(), r.getErrorMessage().c_str()); } end = clock(); printf("Time taken: %f\n", (double)(end - start)/CLOCKS_PER_SEC); TDocumentPtr document = elementParser.getDocument(); showTree(document, 0); }
/*
 * Parses the construct whose name starts at the parser's current offset and
 * dispatches on the first character of that name:
 *
 *   '!'  -> "![CDATA[", "!--" or "!DOCTYPE": delegated to parseCDATA(),
 *           parseComment() or parseDocType(); anything else is an error.
 *   '?'  -> "?xml": delegated to parseProlog(); anything else is an error.
 *   else -> a regular element: created via parser->createElement(), then its
 *           attributes and (unless flagged empty) its content are parsed.
 *
 * On success *type is set to the matching UXML_NT_* constant and the node
 * produced by the delegate (or the new element) is returned.
 *
 * Returns NULL on failure. Failures detected in this function call
 * setError(); if the failure happens after the element was created, the
 * element is released through parser->destroyNode() first. *type may
 * already have been written when NULL is returned.
 */
static UXMLNODE parseElement(struct UXMLPARSER *parser, UXMLNODE parent, int *type)
{
	UXMLCHAR *tagName = parser->offset;
	size_t tagNameLength;
	UXMLNODE element;
	int selfClosing;

	if (!skipName(parser))
	{
		setError(parser, 0, "Unexpected end of input parsing stag name");
		return NULL;
	}

	/* skipName() advanced parser->offset; the distance covered is the name. */
	tagNameLength = (parser->offset - tagName);

	if (*tagName == '!')
	{
		if (stringCompare(tagName, "![CDATA[", 8))
		{
			*type = UXML_NT_CDATA;
			return parseCDATA(parser, parent);
		}

		if (stringCompare(tagName, "!--", 3))
		{
			*type = UXML_NT_COMMENT;
			return parseComment(parser, parent);
		}

		if (stringCompare(tagName, "!DOCTYPE", 8))
		{
			*type = UXML_NT_DOCTYPE;
			return parseDocType(parser, parent);
		}

		setError(parser, 0, "Unexpected ! tag");
		return NULL;
	}

	if (*tagName == '?')
	{
		if (!stringCompare(tagName, "?xml", 4))
		{
			setError(parser, 0, "Unexpected prolog tag");
			return NULL;
		}

		*type = UXML_NT_PROLOG;
		return parseProlog(parser, parent);
	}

	/* Anything else is an ordinary element. */
	*type = UXML_NT_ELEMENT;
	element = parser->createElement(parser, parent, tagName, tagNameLength);

	/* parseAttributes() is expected to fill in selfClosing on success. */
	if (!parseAttributes(parser, element, &selfClosing))
	{
		goto destroyElement;
	}

	/*
	 * Both forms step over one character here -- presumably the '/' of an
	 * empty tag or the '>' of a start tag (TODO confirm against
	 * parseAttributes).
	 */
	parser->offset ++;

	if (selfClosing)
	{
		/* An empty tag must be closed by '>' right after that character. */
		if (*(parser->offset) != '>')
		{
			setError(parser, 0, "Unexpected end of input parsing stag name");
			goto destroyElement;
		}

		parser->offset ++;
	}
	else if (!parseContent(parser, element, tagName, tagNameLength))
	{
		goto destroyElement;
	}

	return element;

destroyElement:
	parser->destroyNode(parser, element);
	return NULL;
}