static inline void add_sgml_attribute(struct dom_stack *stack, struct dom_scanner_token *token, struct dom_scanner_token *valtoken) { struct sgml_parser *parser = get_sgml_parser(stack); struct dom_node *parent = get_dom_stack_top(stack)->node; struct dom_string *value = valtoken ? &valtoken->string : NULL; struct sgml_node_info *info; struct dom_node *node; node = add_dom_attribute(parent, &token->string, value); info = get_sgml_node_info(parser->info->attributes, node); node->data.attribute.type = info->type; node->data.attribute.id = !!(info->flags & SGML_ATTRIBUTE_IDENTIFIER); node->data.attribute.reference = !!(info->flags & SGML_ATTRIBUTE_REFERENCE); if (valtoken && valtoken->type == SGML_TOKEN_STRING) node->data.attribute.quoted = 1; if (!node || !push_dom_node(stack, node)) return; pop_dom_node(stack); }
/* Create a node of the given @type from the string of @token below the node
 * currently on top of @stack.
 *
 * Tokens of type SGML_TOKEN_SPACE get the text node's only_space bit set.
 * When pushing the new node succeeds it is popped right away again. Nothing
 * happens if the node could not be created. */
static inline void
add_sgml_node(struct dom_stack *stack, enum dom_node_type type,
	      struct dom_scanner_token *token)
{
	struct dom_node *top = get_dom_stack_top(stack)->node;
	struct dom_node *added = add_dom_node(top, type, &token->string);

	if (added) {
		if (token->type == SGML_TOKEN_SPACE)
			added->data.text.only_space = 1;

		if (push_dom_node(stack, added))
			pop_dom_node(stack);
	}
}
static void sgml_parsing_pop(struct dom_stack *stack, struct dom_node *node, void *data) { struct sgml_parser *parser = get_sgml_parser(stack); struct sgml_parsing_state *parsing = data; /* Pop the stack back to the state it was in. This includes cleaning * away even immutable states left on the stack. */ while (parsing->depth < parser->stack.depth) { get_dom_stack_top(&parser->stack)->immutable = 0; pop_dom_node(&parser->stack); } assert(parsing->depth == parser->stack.depth); }
struct dom_node * parse_sgml(struct sgml_parser *parser, struct dom_string *buffer) { struct sgml_parsing_state *parsing; if (!parser->root) { parser->root = add_sgml_document(&parser->stack, &parser->uri); if (!parser->root) return NULL; get_dom_stack_top(&parser->stack)->immutable = 1; } parsing = init_sgml_parsing_state(parser, buffer); if (!parsing) return NULL; /* FIXME: Make parse_sgml_plain() return something (error code or if * can be guarenteed a root node). */ parse_sgml_plain(&parser->stack, &parsing->scanner); pop_dom_node(&parser->parsing); return parser->root; }
/* Pop states off @stack, walking from the top downwards, until @target has
 * been popped. Popping stops early at the first state flagged immutable.
 * A NULL @target or an empty stack makes this a no-op. */
void
pop_dom_state(struct dom_stack *stack, struct dom_stack_state *target)
{
	struct dom_stack_state *cur;
	unsigned int idx;

	assert(stack);

	if (!target || dom_stack_is_empty(stack))
		return;

	foreachback_dom_stack_state (stack, cur, idx) {
		int reached_target;

		/* Immutable states act as a barrier that is never crossed. */
		if (cur->immutable)
			break;

		/* Remember whether this is the wanted state, pop it, and stop
		 * once the wanted state is gone. */
		reached_target = (cur == target);
		pop_dom_node(stack);

		if (reached_target)
			break;
	}
}
/* Parse a CSS3 selector and add selector nodes to the @select struct. */ static enum dom_code parse_dom_select(struct dom_select *select, struct dom_stack *stack, struct dom_string *string) { struct dom_scanner scanner; struct dom_select_node sel; init_dom_scanner(&scanner, &dom_css_scanner_info, string, 0, 0, 1, 0, 0); memset(&sel, 0, sizeof(sel)); while (dom_scanner_has_tokens(&scanner)) { struct dom_scanner_token *token = get_dom_scanner_token(&scanner); enum dom_code code; struct dom_select_node *select_node; assert(token); if (token->type == '{' || token->type == '}' || token->type == ';' || token->type == ',') break; /* Examine the selector fragment */ switch (token->type) { case CSS_TOKEN_IDENT: sel.node.type = DOM_NODE_ELEMENT; copy_dom_string(&sel.node.string, &token->string); if (dom_scanner_token_contains(token, "*")) sel.match.element |= DOM_SELECT_ELEMENT_UNIVERSAL; break; case CSS_TOKEN_HASH: case CSS_TOKEN_HEX_COLOR: /* ID fragment */ sel.node.type = DOM_NODE_ATTRIBUTE; sel.match.attribute |= DOM_SELECT_ATTRIBUTE_ID; /* Skip the leading '#'. 
*/ skip_dom_scanner_token_char(token); break; case '[': sel.node.type = DOM_NODE_ATTRIBUTE; code = parse_dom_select_attribute(&sel, &scanner); if (code != DOM_CODE_OK) return code; break; case '.': token = get_next_dom_scanner_token(&scanner); if (!token || token->type != CSS_TOKEN_IDENT) return DOM_CODE_SYNTAX_ERR; sel.node.type = DOM_NODE_ATTRIBUTE; sel.match.attribute |= DOM_SELECT_ATTRIBUTE_SPACE_LIST; set_dom_string(&sel.node.string, "class", -1); copy_dom_string(&sel.node.data.attribute.value, &token->string); break; case ':': code = parse_dom_select_pseudo(select, &sel, &scanner); if (code != DOM_CODE_OK) return code; break; case '>': if (get_element_relation(&sel) != DOM_SELECT_RELATION_DESCENDANT) return DOM_CODE_SYNTAX_ERR; sel.match.element |= DOM_SELECT_RELATION_DIRECT_CHILD; break; case '+': if (get_element_relation(&sel) != DOM_SELECT_RELATION_DESCENDANT) return DOM_CODE_SYNTAX_ERR; sel.match.element |= DOM_SELECT_RELATION_DIRECT_ADJACENT; break; case '~': if (get_element_relation(&sel) != DOM_SELECT_RELATION_DESCENDANT) return DOM_CODE_SYNTAX_ERR; sel.match.element |= DOM_SELECT_RELATION_INDIRECT_ADJACENT; break; default: return DOM_CODE_SYNTAX_ERR; } skip_dom_scanner_token(&scanner); if (sel.node.type == DOM_NODE_UNKNOWN) continue; select_node = mem_calloc(1, sizeof(*select_node)); copy_struct(select_node, &sel); if (!dom_stack_is_empty(stack)) { struct dom_node *node = &select_node->node; struct dom_node *parent = get_dom_stack_top(stack)->node; struct dom_node_list **list = get_dom_node_list(parent, node); int sort = (node->type == DOM_NODE_ATTRIBUTE); int index; assertm(list != NULL, "Adding node to bad parent [%d -> %d]", node->type, parent->type); index = *list && (*list)->size > 0 && sort ? 
get_dom_node_map_index(*list, node) : -1; if (!add_to_dom_node_list(list, node, index)) { done_dom_node(node); return DOM_CODE_ALLOC_ERR; } node->parent = parent; } else { assert(!select->selector); select->selector = select_node; } code = push_dom_node(stack, &select_node->node); if (code != DOM_CODE_OK) return code; if (select_node->node.type != DOM_NODE_ELEMENT) pop_dom_node(stack); memset(&sel, 0, sizeof(sel)); } if (select->selector) return DOM_CODE_OK; return DOM_CODE_ERR; }
static void parse_sgml_plain(struct dom_stack *stack, struct dom_scanner *scanner) { struct dom_scanner_token target; while (dom_scanner_has_tokens(scanner)) { struct dom_scanner_token *token = get_dom_scanner_token(scanner); switch (token->type) { case SGML_TOKEN_ELEMENT: case SGML_TOKEN_ELEMENT_BEGIN: if (!add_sgml_element(stack, token)) { if (token->type == SGML_TOKEN_ELEMENT) { skip_dom_scanner_token(scanner); break; } skip_sgml_tokens(scanner, SGML_TOKEN_TAG_END); break; } if (token->type == SGML_TOKEN_ELEMENT_BEGIN) { parse_sgml_attributes(stack, scanner); } else { skip_dom_scanner_token(scanner); } break; case SGML_TOKEN_ELEMENT_EMPTY_END: pop_dom_node(stack); skip_dom_scanner_token(scanner); break; case SGML_TOKEN_ELEMENT_END: if (!token->string.length) { pop_dom_node(stack); } else { struct dom_string string; struct dom_stack_state *state; set_dom_string(&string, token->string.string, token->string.length); state = search_dom_stack(stack, DOM_NODE_ELEMENT, &string); if (state) { struct sgml_parser_state *pstate; pstate = get_sgml_parser_state(stack, state); copy_struct(&pstate->end_token, token); pop_dom_state(stack, state); } } skip_dom_scanner_token(scanner); break; case SGML_TOKEN_NOTATION_COMMENT: add_sgml_node(stack, DOM_NODE_COMMENT, token); skip_dom_scanner_token(scanner); break; case SGML_TOKEN_NOTATION_ATTLIST: case SGML_TOKEN_NOTATION_DOCTYPE: case SGML_TOKEN_NOTATION_ELEMENT: case SGML_TOKEN_NOTATION_ENTITY: case SGML_TOKEN_NOTATION: skip_dom_scanner_token(scanner); break; case SGML_TOKEN_CDATA_SECTION: add_sgml_node(stack, DOM_NODE_CDATA_SECTION, token); skip_dom_scanner_token(scanner); break; case SGML_TOKEN_PROCESS_XML_STYLESHEET: case SGML_TOKEN_PROCESS_XML: case SGML_TOKEN_PROCESS: copy_struct(&target, token); /* Skip the target token */ token = get_next_dom_scanner_token(scanner); if (!token) break; assert(token->type == SGML_TOKEN_PROCESS_DATA); if (add_sgml_proc_instruction(stack, &target, token) && (target.type == SGML_TOKEN_PROCESS_XML 
|| target.type == SGML_TOKEN_PROCESS_XML_STYLESHEET) && token->string.length > 0) { /* Parse the <?xml data="attributes"?>. */ struct dom_scanner attr_scanner; init_dom_scanner(&attr_scanner, &sgml_scanner_info, &token->string, SGML_STATE_ELEMENT, scanner->count_lines); if (dom_scanner_has_tokens(&attr_scanner)) parse_sgml_attributes(stack, &attr_scanner); } pop_dom_node(stack); skip_dom_scanner_token(scanner); break; case SGML_TOKEN_ENTITY: add_sgml_node(stack, DOM_NODE_ENTITY_REFERENCE, token); skip_dom_scanner_token(scanner); break; case SGML_TOKEN_SPACE: case SGML_TOKEN_TEXT: default: add_sgml_node(stack, DOM_NODE_TEXT, token); skip_dom_scanner_token(scanner); } } }
int main(int argc, char *argv[]) { struct sgml_parser *parser; enum sgml_document_type doctype = SGML_DOCTYPE_HTML; enum sgml_parser_flag flags = 0; enum sgml_parser_type type = SGML_PARSER_STREAM; enum dom_code code = 0; enum dom_config_flag normalize_flags = 0; struct dom_config config; int normalize = 0; int dump = 0; int complete = 1; size_t read_stdin = 0; struct dom_string uri = STATIC_DOM_STRING("dom://test"); struct dom_string source = STATIC_DOM_STRING("(no source)"); int i; for (i = 1; i < argc; i++) { char *arg = argv[i]; if (strncmp(arg, "--", 2)) break; arg += 2; if (get_test_opt(&arg, "uri", &i, argc, argv, "a URI")) { set_dom_string(&uri, arg, strlen((const char *)arg)); } else if (get_test_opt(&arg, "src", &i, argc, argv, "a string")) { set_dom_string(&source, arg, strlen((const char *)arg)); } else if (get_test_opt(&arg, "stdin", &i, argc, argv, "a number")) { read_stdin = atoi(arg); flags |= SGML_PARSER_INCREMENTAL; } else if (get_test_opt(&arg, "normalize", &i, argc, argv, "a string")) { normalize = 1; normalize_flags = parse_dom_config(arg, ','); type = SGML_PARSER_TREE; } else if (!strcmp(arg, "print-lines")) { flags |= SGML_PARSER_COUNT_LINES; } else if (!strcmp(arg, "incomplete")) { flags |= SGML_PARSER_INCREMENTAL; complete = 0; } else if (!strcmp(arg, "dump")) { type = SGML_PARSER_TREE; dump = 1; } else if (!strcmp(arg, "error")) { flags |= SGML_PARSER_DETECT_ERRORS; } else if (!strcmp(arg, "help")) { die(NULL); } else { die("Unknown argument '%s'", arg - 2); } } parser = init_sgml_parser(type, doctype, &uri, flags); if (!parser) return 1; parser->error_func = sgml_error_function; if (normalize) add_dom_config_normalizer(&parser->stack, &config, normalize_flags); else if (!dump) add_dom_stack_context(&parser->stack, NULL, &sgml_parser_test_context_info); if (read_stdin > 0) { unsigned char *buffer; buffer = mem_alloc(read_stdin); if (!buffer) die("Cannot allocate buffer"); complete = 0; while (!complete) { size_t size = fread(buffer, 1, 
read_stdin, stdin); if (ferror(stdin)) die("error reading from stdin"); complete = feof(stdin); code = parse_sgml(parser, buffer, size, complete); switch (code) { case DOM_CODE_OK: break; case DOM_CODE_INCOMPLETE: if (!complete) break; /* Error */ default: complete = 1; } } mem_free(buffer); } else { code = parse_sgml(parser, source.string, source.length, complete); } if (parser->root) { assert(!complete || parser->stack.depth > 0); while (!dom_stack_is_empty(&parser->stack)) { get_dom_stack_top(&parser->stack)->immutable = 0; pop_dom_node(&parser->stack); } if (normalize || dump) { struct dom_stack stack; /* Note, that we cannot free nodes when walking the DOM * tree since walk_dom_node() uses an index to traverse * the tree. */ init_dom_stack(&stack, DOM_STACK_FLAG_NONE); /* XXX: This context needs to be added first because it * assumes the parser can be accessed via * stack->contexts[0].data. */ if (normalize) add_dom_stack_context(&stack, parser, &sgml_parser_test_context_info); else if (dump) add_sgml_file_dumper(&stack, stdout); walk_dom_nodes(&stack, parser->root); done_dom_stack(&stack); done_dom_node(parser->root); } } done_sgml_parser(parser); #ifdef DEBUG_MEMLEAK check_memory_leaks(); #endif return code != DOM_CODE_OK ? 1 : 0; }
/* FIXME: Instead of walking all nodes in the tree only visit those which are * of actual interest to the contexts on the stack. */ void walk_dom_nodes(struct dom_stack *stack, struct dom_node *root) { struct dom_stack_context *context; assert(root && stack); context = add_dom_stack_context(stack, NULL, &dom_stack_walk_context_info); if (!context) return; if (push_dom_node(stack, root) != DOM_CODE_OK) return; while (!dom_stack_is_empty(stack)) { struct dom_stack_state *state = get_dom_stack_top(stack); struct dom_stack_walk_state *wstate = get_dom_stack_state_data(context, state); struct dom_node_list *list = wstate->list; struct dom_node *node = state->node; switch (node->type) { case DOM_NODE_DOCUMENT: if (!list) list = node->data.document.children; break; case DOM_NODE_ELEMENT: if (!list) list = node->data.element.map; if (list == node->data.element.children) break; if (is_dom_node_list_member(list, wstate->index) && list == node->data.element.map) break; list = node->data.element.children; break; case DOM_NODE_PROCESSING_INSTRUCTION: if (!list) list = node->data.proc_instruction.map; break; case DOM_NODE_DOCUMENT_TYPE: if (!list) list = node->data.document_type.entities; if (list == node->data.document_type.notations) break; if (is_dom_node_list_member(list, wstate->index) && list == node->data.document_type.entities) break; list = node->data.document_type.notations; break; case DOM_NODE_ATTRIBUTE: case DOM_NODE_TEXT: case DOM_NODE_CDATA_SECTION: case DOM_NODE_COMMENT: case DOM_NODE_NOTATION: case DOM_NODE_DOCUMENT_FRAGMENT: case DOM_NODE_ENTITY_REFERENCE: case DOM_NODE_ENTITY: default: break; } /* Reset list state if it is a new list */ if (list != wstate->list) { wstate->list = list; wstate->index = 0; } /* If we have next child node */ if (is_dom_node_list_member(list, wstate->index)) { struct dom_node *child = list->entries[wstate->index++]; if (push_dom_node(stack, child) == DOM_CODE_OK) continue; } pop_dom_node(stack); } done_dom_stack_context(stack, 
context); }