/* Fill @string with an "enhanced" copy of the value of @node.
 *
 * @string is zeroed first, so any previous content is discarded without
 * being released. For entity reference nodes the value is not expanded yet
 * (see the FIXME); for every other node type the node value is run through
 * compress_string(). If the node has no value @string is set to the empty
 * (NULL, 0) string. */
static void
set_enhanced_dom_node_value(struct dom_string *string, struct dom_node *node)
{
	assert(node);

	memset(string, 0, sizeof(*string));

	if (node->type == DOM_NODE_ENTITY_REFERENCE) {
		/* FIXME: Set to the entity value. */
		string->string = null_or_stracpy(string->string);

	} else {
		struct dom_string *node_value = get_dom_node_value(node);

		if (!node_value) {
			set_dom_string(string, NULL, 0);
			return;
		}

		string->string = compress_string(node_value->string,
						 node_value->length);
	}

	/* The helpers above only hand back a pointer, so the length has to be
	 * measured here. */
	if (string->string)
		string->length = strlen((const char *)string->string);
	else
		string->length = 0;
}
static enum dom_code normalize_text_node_whitespace(struct dom_node *node) { unsigned char buf[256]; struct dom_string string = INIT_DOM_STRING(NULL, 0); int count = 0, i = 0; unsigned char *text = node->string.string; assert(node->type == DOM_NODE_TEXT); while (i < node->string.length) { int j; for (j = 0; j < sizeof(buf) && i < node->string.length; i++) { unsigned char data = text[i]; if (isspace(data)) { if (count == 1) continue; data = ' '; count = 1; } else { count = 0; } buf[j++] = data; } if (!add_to_dom_string(&string, buf, j)) { done_dom_string(&string); return DOM_CODE_ALLOC_ERR; } } if (node->allocated) done_dom_string(&node->string); set_dom_string(&node->string, string.string, string.length); node->allocated = 1; return DOM_CODE_OK; }
/* Parse a CSS3 selector and add selector nodes to the @select struct. */ static enum dom_code parse_dom_select(struct dom_select *select, struct dom_stack *stack, struct dom_string *string) { struct dom_scanner scanner; struct dom_select_node sel; init_dom_scanner(&scanner, &dom_css_scanner_info, string, 0, 0, 1, 0, 0); memset(&sel, 0, sizeof(sel)); while (dom_scanner_has_tokens(&scanner)) { struct dom_scanner_token *token = get_dom_scanner_token(&scanner); enum dom_code code; struct dom_select_node *select_node; assert(token); if (token->type == '{' || token->type == '}' || token->type == ';' || token->type == ',') break; /* Examine the selector fragment */ switch (token->type) { case CSS_TOKEN_IDENT: sel.node.type = DOM_NODE_ELEMENT; copy_dom_string(&sel.node.string, &token->string); if (dom_scanner_token_contains(token, "*")) sel.match.element |= DOM_SELECT_ELEMENT_UNIVERSAL; break; case CSS_TOKEN_HASH: case CSS_TOKEN_HEX_COLOR: /* ID fragment */ sel.node.type = DOM_NODE_ATTRIBUTE; sel.match.attribute |= DOM_SELECT_ATTRIBUTE_ID; /* Skip the leading '#'. 
*/ skip_dom_scanner_token_char(token); break; case '[': sel.node.type = DOM_NODE_ATTRIBUTE; code = parse_dom_select_attribute(&sel, &scanner); if (code != DOM_CODE_OK) return code; break; case '.': token = get_next_dom_scanner_token(&scanner); if (!token || token->type != CSS_TOKEN_IDENT) return DOM_CODE_SYNTAX_ERR; sel.node.type = DOM_NODE_ATTRIBUTE; sel.match.attribute |= DOM_SELECT_ATTRIBUTE_SPACE_LIST; set_dom_string(&sel.node.string, "class", -1); copy_dom_string(&sel.node.data.attribute.value, &token->string); break; case ':': code = parse_dom_select_pseudo(select, &sel, &scanner); if (code != DOM_CODE_OK) return code; break; case '>': if (get_element_relation(&sel) != DOM_SELECT_RELATION_DESCENDANT) return DOM_CODE_SYNTAX_ERR; sel.match.element |= DOM_SELECT_RELATION_DIRECT_CHILD; break; case '+': if (get_element_relation(&sel) != DOM_SELECT_RELATION_DESCENDANT) return DOM_CODE_SYNTAX_ERR; sel.match.element |= DOM_SELECT_RELATION_DIRECT_ADJACENT; break; case '~': if (get_element_relation(&sel) != DOM_SELECT_RELATION_DESCENDANT) return DOM_CODE_SYNTAX_ERR; sel.match.element |= DOM_SELECT_RELATION_INDIRECT_ADJACENT; break; default: return DOM_CODE_SYNTAX_ERR; } skip_dom_scanner_token(&scanner); if (sel.node.type == DOM_NODE_UNKNOWN) continue; select_node = mem_calloc(1, sizeof(*select_node)); copy_struct(select_node, &sel); if (!dom_stack_is_empty(stack)) { struct dom_node *node = &select_node->node; struct dom_node *parent = get_dom_stack_top(stack)->node; struct dom_node_list **list = get_dom_node_list(parent, node); int sort = (node->type == DOM_NODE_ATTRIBUTE); int index; assertm(list != NULL, "Adding node to bad parent [%d -> %d]", node->type, parent->type); index = *list && (*list)->size > 0 && sort ? 
get_dom_node_map_index(*list, node) : -1; if (!add_to_dom_node_list(list, node, index)) { done_dom_node(node); return DOM_CODE_ALLOC_ERR; } node->parent = parent; } else { assert(!select->selector); select->selector = select_node; } code = push_dom_node(stack, &select_node->node); if (code != DOM_CODE_OK) return code; if (select_node->node.type != DOM_NODE_ELEMENT) pop_dom_node(stack); memset(&sel, 0, sizeof(sel)); } if (select->selector) return DOM_CODE_OK; return DOM_CODE_ERR; }
static void parse_sgml_plain(struct dom_stack *stack, struct dom_scanner *scanner) { struct dom_scanner_token target; while (dom_scanner_has_tokens(scanner)) { struct dom_scanner_token *token = get_dom_scanner_token(scanner); switch (token->type) { case SGML_TOKEN_ELEMENT: case SGML_TOKEN_ELEMENT_BEGIN: if (!add_sgml_element(stack, token)) { if (token->type == SGML_TOKEN_ELEMENT) { skip_dom_scanner_token(scanner); break; } skip_sgml_tokens(scanner, SGML_TOKEN_TAG_END); break; } if (token->type == SGML_TOKEN_ELEMENT_BEGIN) { parse_sgml_attributes(stack, scanner); } else { skip_dom_scanner_token(scanner); } break; case SGML_TOKEN_ELEMENT_EMPTY_END: pop_dom_node(stack); skip_dom_scanner_token(scanner); break; case SGML_TOKEN_ELEMENT_END: if (!token->string.length) { pop_dom_node(stack); } else { struct dom_string string; struct dom_stack_state *state; set_dom_string(&string, token->string.string, token->string.length); state = search_dom_stack(stack, DOM_NODE_ELEMENT, &string); if (state) { struct sgml_parser_state *pstate; pstate = get_sgml_parser_state(stack, state); copy_struct(&pstate->end_token, token); pop_dom_state(stack, state); } } skip_dom_scanner_token(scanner); break; case SGML_TOKEN_NOTATION_COMMENT: add_sgml_node(stack, DOM_NODE_COMMENT, token); skip_dom_scanner_token(scanner); break; case SGML_TOKEN_NOTATION_ATTLIST: case SGML_TOKEN_NOTATION_DOCTYPE: case SGML_TOKEN_NOTATION_ELEMENT: case SGML_TOKEN_NOTATION_ENTITY: case SGML_TOKEN_NOTATION: skip_dom_scanner_token(scanner); break; case SGML_TOKEN_CDATA_SECTION: add_sgml_node(stack, DOM_NODE_CDATA_SECTION, token); skip_dom_scanner_token(scanner); break; case SGML_TOKEN_PROCESS_XML_STYLESHEET: case SGML_TOKEN_PROCESS_XML: case SGML_TOKEN_PROCESS: copy_struct(&target, token); /* Skip the target token */ token = get_next_dom_scanner_token(scanner); if (!token) break; assert(token->type == SGML_TOKEN_PROCESS_DATA); if (add_sgml_proc_instruction(stack, &target, token) && (target.type == SGML_TOKEN_PROCESS_XML 
|| target.type == SGML_TOKEN_PROCESS_XML_STYLESHEET) && token->string.length > 0) { /* Parse the <?xml data="attributes"?>. */ struct dom_scanner attr_scanner; init_dom_scanner(&attr_scanner, &sgml_scanner_info, &token->string, SGML_STATE_ELEMENT, scanner->count_lines); if (dom_scanner_has_tokens(&attr_scanner)) parse_sgml_attributes(stack, &attr_scanner); } pop_dom_node(stack); skip_dom_scanner_token(scanner); break; case SGML_TOKEN_ENTITY: add_sgml_node(stack, DOM_NODE_ENTITY_REFERENCE, token); skip_dom_scanner_token(scanner); break; case SGML_TOKEN_SPACE: case SGML_TOKEN_TEXT: default: add_sgml_node(stack, DOM_NODE_TEXT, token); skip_dom_scanner_token(scanner); } } }
int main(int argc, char *argv[]) { struct sgml_parser *parser; enum sgml_document_type doctype = SGML_DOCTYPE_HTML; enum sgml_parser_flag flags = 0; enum sgml_parser_type type = SGML_PARSER_STREAM; enum dom_code code = 0; enum dom_config_flag normalize_flags = 0; struct dom_config config; int normalize = 0; int dump = 0; int complete = 1; size_t read_stdin = 0; struct dom_string uri = STATIC_DOM_STRING("dom://test"); struct dom_string source = STATIC_DOM_STRING("(no source)"); int i; for (i = 1; i < argc; i++) { char *arg = argv[i]; if (strncmp(arg, "--", 2)) break; arg += 2; if (get_test_opt(&arg, "uri", &i, argc, argv, "a URI")) { set_dom_string(&uri, arg, strlen((const char *)arg)); } else if (get_test_opt(&arg, "src", &i, argc, argv, "a string")) { set_dom_string(&source, arg, strlen((const char *)arg)); } else if (get_test_opt(&arg, "stdin", &i, argc, argv, "a number")) { read_stdin = atoi(arg); flags |= SGML_PARSER_INCREMENTAL; } else if (get_test_opt(&arg, "normalize", &i, argc, argv, "a string")) { normalize = 1; normalize_flags = parse_dom_config(arg, ','); type = SGML_PARSER_TREE; } else if (!strcmp(arg, "print-lines")) { flags |= SGML_PARSER_COUNT_LINES; } else if (!strcmp(arg, "incomplete")) { flags |= SGML_PARSER_INCREMENTAL; complete = 0; } else if (!strcmp(arg, "dump")) { type = SGML_PARSER_TREE; dump = 1; } else if (!strcmp(arg, "error")) { flags |= SGML_PARSER_DETECT_ERRORS; } else if (!strcmp(arg, "help")) { die(NULL); } else { die("Unknown argument '%s'", arg - 2); } } parser = init_sgml_parser(type, doctype, &uri, flags); if (!parser) return 1; parser->error_func = sgml_error_function; if (normalize) add_dom_config_normalizer(&parser->stack, &config, normalize_flags); else if (!dump) add_dom_stack_context(&parser->stack, NULL, &sgml_parser_test_context_info); if (read_stdin > 0) { unsigned char *buffer; buffer = mem_alloc(read_stdin); if (!buffer) die("Cannot allocate buffer"); complete = 0; while (!complete) { size_t size = fread(buffer, 1, 
read_stdin, stdin); if (ferror(stdin)) die("error reading from stdin"); complete = feof(stdin); code = parse_sgml(parser, buffer, size, complete); switch (code) { case DOM_CODE_OK: break; case DOM_CODE_INCOMPLETE: if (!complete) break; /* Error */ default: complete = 1; } } mem_free(buffer); } else { code = parse_sgml(parser, source.string, source.length, complete); } if (parser->root) { assert(!complete || parser->stack.depth > 0); while (!dom_stack_is_empty(&parser->stack)) { get_dom_stack_top(&parser->stack)->immutable = 0; pop_dom_node(&parser->stack); } if (normalize || dump) { struct dom_stack stack; /* Note, that we cannot free nodes when walking the DOM * tree since walk_dom_node() uses an index to traverse * the tree. */ init_dom_stack(&stack, DOM_STACK_FLAG_NONE); /* XXX: This context needs to be added first because it * assumes the parser can be accessed via * stack->contexts[0].data. */ if (normalize) add_dom_stack_context(&stack, parser, &sgml_parser_test_context_info); else if (dump) add_sgml_file_dumper(&stack, stdout); walk_dom_nodes(&stack, parser->root); done_dom_stack(&stack); done_dom_node(parser->root); } } done_sgml_parser(parser); #ifdef DEBUG_MEMLEAK check_memory_leaks(); #endif return code != DOM_CODE_OK ? 1 : 0; }
static enum dom_code append_node_text(struct dom_config *config, struct dom_node *node) { struct dom_node *prev = get_dom_node_prev(node); size_t length; struct dom_string dest; struct dom_string src; int error = 0; copy_struct(&src, &node->string); if (!prev || prev->type != DOM_NODE_TEXT) { /* Preserve text nodes with no one to append to. */ if (node->type == DOM_NODE_TEXT) return DOM_CODE_OK; prev = NULL; set_dom_string(&dest, NULL, 0); } else { if (prev->allocated) { copy_struct(&dest, &prev->string); } else { set_dom_string(&dest, NULL, 0); if (!add_to_dom_string(&dest, prev->string.string, prev->string.length)) return DOM_CODE_ALLOC_ERR; set_dom_string(&prev->string, dest.string, dest.length); prev->allocated = 1; } } length = dest.length; switch (node->type) { case DOM_NODE_CDATA_SECTION: case DOM_NODE_TEXT: if (!add_to_dom_string(&dest, src.string, src.length)) error = 1; break; case DOM_NODE_ENTITY_REFERENCE: /* FIXME: Until we will have uniform encoding at this point * (UTF-8) we just add the entity reference unexpanded assuming * that convert_string() will eventually do the work of * expanding it. */ if (!add_to_dom_string(&dest, "&", 1) || !add_to_dom_string(&dest, src.string, src.length) || !add_to_dom_string(&dest, ";", 1)) { error = 1; } break; default: INTERNAL("Cannot append from node %d", node->type); } if (error) { if (prev) prev->string.length = length; else done_dom_string(&dest); return DOM_CODE_ALLOC_ERR; } if (prev) { copy_struct(&prev->string, &dest); if ((config->flags & DOM_CONFIG_NORMALIZE_WHITESPACE) && node->type != DOM_NODE_ENTITY_REFERENCE) { /* XXX: Ignore errors since we want to always * free the appended node at this point. 
*/ normalize_text_node_whitespace(prev); } return DOM_CODE_FREE_NODE; } else { int was_cdata_section = node->type == DOM_NODE_CDATA_SECTION; node->type = DOM_NODE_TEXT; memset(&node->data, 0, sizeof(node->data)); node->allocated = 1; copy_struct(&node->string, &dest); if ((config->flags & DOM_CONFIG_NORMALIZE_WHITESPACE) && was_cdata_section) { /* XXX: Ignore errors since we want to always ok the * append. */ normalize_text_node_whitespace(node); } return DOM_CODE_OK; } }