Beispiel #1
0
/* Add an element node named by @token below the node on top of @stack and
 * push it on the stack.
 *
 * The element type is taken from the node info returned by
 * get_sgml_node_info() for the parser's element table; the same node info is
 * stored in the SGML parser state attached to the new stack state.
 *
 * Returns the new node, or NULL when the node could not be added or pushed. */
static inline struct dom_node *
add_sgml_element(struct dom_stack *stack, struct dom_scanner_token *token)
{
	struct sgml_parser *parser = get_sgml_parser(stack);
	struct dom_node *parent = get_dom_stack_top(stack)->node;
	struct dom_node *node = add_dom_element(parent, &token->string);
	struct sgml_node_info *info;
	struct dom_stack_state *state;
	struct sgml_parser_state *parser_state;

	if (!node)
		return NULL;

	info = get_sgml_node_info(parser->info->elements, node);
	node->data.element.type = info->type;

	if (!push_dom_node(stack, node))
		return NULL;

	/* The freshly pushed node must now be the top of the stack. */
	state = get_dom_stack_top(stack);
	assert(node == state->node);

	parser_state = get_sgml_parser_state(stack, state);
	parser_state->info = info;

	return node;
}
Beispiel #2
0
/* Add an attribute node named by @token, with the optional value in
 * @valtoken, to the node on top of @stack.
 *
 * The attribute's type and its id/reference flags are filled in from the node
 * info returned by get_sgml_node_info() for the parser's attribute table. The
 * node is marked as quoted when the value token is an SGML_TOKEN_STRING.
 * Finally the node is pushed on and immediately popped off the stack.
 *
 * @valtoken may be NULL, in which case the attribute is added with a NULL
 * value. Nothing is done if the attribute node cannot be created. */
static inline void
add_sgml_attribute(struct dom_stack *stack,
		   struct dom_scanner_token *token, struct dom_scanner_token *valtoken)
{
	struct sgml_parser *parser = get_sgml_parser(stack);
	struct dom_node *parent = get_dom_stack_top(stack)->node;
	struct dom_string *value = valtoken ? &valtoken->string : NULL;
	struct sgml_node_info *info;
	struct dom_node *node;

	node = add_dom_attribute(parent, &token->string, value);
	/* Check for failure before the node is looked up or written to; the
	 * check used to come after the node had already been dereferenced. */
	if (!node)
		return;

	info = get_sgml_node_info(parser->info->attributes, node);

	node->data.attribute.type      = info->type;
	node->data.attribute.id	       = !!(info->flags & SGML_ATTRIBUTE_IDENTIFIER);
	node->data.attribute.reference = !!(info->flags & SGML_ATTRIBUTE_REFERENCE);

	if (valtoken && valtoken->type == SGML_TOKEN_STRING)
		node->data.attribute.quoted = 1;

	if (!push_dom_node(stack, node))
		return;

	pop_dom_node(stack);
}
Beispiel #3
0
/* Pop the top state off @stack.
 *
 * Does nothing when the stack is empty or the top state is marked immutable.
 * Otherwise the DOM_STACK_POP callbacks are called; the node is released with
 * done_dom_node() when the callbacks return non-zero or when the stack has
 * the DOM_STACK_FLAG_FREE_NODES flag set. After lowering the depth, the
 * per-context state data and the state itself are zeroed. */
void
pop_dom_node(struct dom_stack *stack)
{
	struct dom_stack_state *top;
	int release_node;
	int ctx;

	assert(stack);

	if (dom_stack_is_empty(stack))
		return;

	top = get_dom_stack_top(stack);
	if (top->immutable)
		return;

	/* The callbacks always run; the stack flag is only consulted when
	 * they did not already ask for the node to be released. */
	release_node = !!call_dom_stack_callbacks(stack, top, DOM_STACK_POP);
	if (!release_node && (stack->flags & DOM_STACK_FLAG_FREE_NODES))
		release_node = 1;

	if (release_node)
		done_dom_node(top->node);

	stack->depth--;
	assert(stack->depth >= 0);

	for (ctx = 0; ctx < stack->contexts_size; ctx++) {
		struct dom_stack_context *context = stack->contexts[ctx];
		void *state_data;

		/* Contexts without per-state objects have nothing to wipe. */
		if (!context->info->object_size)
			continue;

		state_data = get_dom_stack_state_data(context, top);
		memset(state_data, 0, context->info->object_size);
	}

	memset(top, 0, sizeof(*top));
}
Beispiel #4
0
static inline void
parse_sgml_attributes(struct dom_stack *stack, struct dom_scanner *scanner)
{
	struct dom_scanner_token name;

	assert(dom_scanner_has_tokens(scanner)
	       && (get_dom_scanner_token(scanner)->type == SGML_TOKEN_ELEMENT_BEGIN
	           || (get_dom_stack_top(stack)->node->type == DOM_NODE_PROCESSING_INSTRUCTION)));

	if (get_dom_scanner_token(scanner)->type == SGML_TOKEN_ELEMENT_BEGIN)
		skip_dom_scanner_token(scanner);

	while (dom_scanner_has_tokens(scanner)) {
		struct dom_scanner_token *token = get_dom_scanner_token(scanner);

		assert(token);

		switch (token->type) {
		case SGML_TOKEN_TAG_END:
			skip_dom_scanner_token(scanner);
			/* and return */
		case SGML_TOKEN_ELEMENT:
		case SGML_TOKEN_ELEMENT_BEGIN:
		case SGML_TOKEN_ELEMENT_END:
		case SGML_TOKEN_ELEMENT_EMPTY_END:
			return;

		case SGML_TOKEN_IDENT:
			copy_struct(&name, token);

			/* Skip the attribute name token */
			token = get_next_dom_scanner_token(scanner);
			if (token && token->type == '=') {
				/* If the token is not a valid value token
				 * ignore it. */
				token = get_next_dom_scanner_token(scanner);
				if (token
				    && token->type != SGML_TOKEN_IDENT
				    && token->type != SGML_TOKEN_ATTRIBUTE
				    && token->type != SGML_TOKEN_STRING)
					token = NULL;
			} else {
				token = NULL;
			}

			add_sgml_attribute(stack, &name, token);

			/* Skip the value token */
			if (token)
				skip_dom_scanner_token(scanner);
			break;

		default:
			skip_dom_scanner_token(scanner);

		}
	}
}
Beispiel #5
0
static void
sgml_parsing_push(struct dom_stack *stack, struct dom_node *node, void *data)
{
	struct sgml_parser *parser = get_sgml_parser(stack);
	struct sgml_parsing_state *parsing = data;

	parsing->depth = parser->stack.depth;
	get_dom_stack_top(&parser->stack)->immutable = 1;
	init_dom_scanner(&parsing->scanner, &sgml_scanner_info, &node->string,
			 SGML_STATE_TEXT, 0);
}
Beispiel #6
0
/* Create a new parsing state by pushing a new text node containing the*/
static struct sgml_parsing_state *
init_sgml_parsing_state(struct sgml_parser *parser, struct dom_string *buffer)
{
	struct dom_stack_state *state;
	struct dom_node *node;

	node = init_dom_node(DOM_NODE_TEXT, buffer);
	if (!node || !push_dom_node(&parser->parsing, node))
		return NULL;

	state = get_dom_stack_top(&parser->parsing);

	return get_dom_stack_state_data(parser->parsing.contexts[0], state);
}
Beispiel #7
0
/* Add a node of the given @type, named by the string of @token, below the
 * node on top of @stack. Nodes created from SGML_TOKEN_SPACE tokens are
 * flagged as containing only space. The node is pushed on the stack and, if
 * that succeeds, popped again right away. */
static inline void
add_sgml_node(struct dom_stack *stack, enum dom_node_type type, struct dom_scanner_token *token)
{
	struct dom_node *parent = get_dom_stack_top(stack)->node;
	struct dom_node *added = add_dom_node(parent, type, &token->string);

	if (!added)
		return;

	if (token->type == SGML_TOKEN_SPACE)
		added->data.text.only_space = 1;

	if (!push_dom_node(stack, added))
		return;

	pop_dom_node(stack);
}
Beispiel #8
0
static void
sgml_parsing_pop(struct dom_stack *stack, struct dom_node *node, void *data)
{
	struct sgml_parser *parser = get_sgml_parser(stack);
	struct sgml_parsing_state *parsing = data;

	/* Pop the stack back to the state it was in. This includes cleaning
	 * away even immutable states left on the stack. */
	while (parsing->depth < parser->stack.depth) {
		get_dom_stack_top(&parser->stack)->immutable = 0;
		pop_dom_node(&parser->stack);
	}

	assert(parsing->depth == parser->stack.depth);
}
Beispiel #9
0
/* Parse the SGML source in @buffer with @parser.
 *
 * On the first call the document root is created and its stack state is
 * marked immutable. A parsing state is then set up for @buffer, the buffer is
 * parsed, and the parsing state is popped again.
 *
 * Returns the parser's root node, or NULL if the root or the parsing state
 * could not be created. */
struct dom_node *
parse_sgml(struct sgml_parser *parser, struct dom_string *buffer)
{
	struct sgml_parsing_state *parsing;

	if (parser->root == NULL) {
		parser->root = add_sgml_document(&parser->stack, &parser->uri);
		if (parser->root == NULL)
			return NULL;

		get_dom_stack_top(&parser->stack)->immutable = 1;
	}

	parsing = init_sgml_parsing_state(parser, buffer);
	if (parsing == NULL)
		return NULL;

	/* FIXME: Make parse_sgml_plain() return something (error code or if
	 * can be guaranteed a root node). */
	parse_sgml_plain(&parser->stack, &parsing->scanner);

	pop_dom_node(&parser->parsing);

	return parser->root;
}
Beispiel #10
0
/* Add a processing instruction node for @target, with optional @data, below
 * the node on top of @stack and push it on the stack.
 *
 * The instruction type is DOM_PROC_INSTRUCTION_XML for
 * SGML_TOKEN_PROCESS_XML targets and DOM_PROC_INSTRUCTION for everything
 * else. @data may be NULL.
 *
 * Returns the result of push_dom_node(), or NULL if the node could not be
 * added. */
static inline struct dom_node *
add_sgml_proc_instruction(struct dom_stack *stack, struct dom_scanner_token *target,
			  struct dom_scanner_token *data)
{
	struct dom_node *parent = get_dom_stack_top(stack)->node;
	struct dom_string *instruction_data = data ? &data->string : NULL;
	struct dom_node *instruction;

	instruction = add_dom_proc_instruction(parent, &target->string, instruction_data);
	if (!instruction)
		return NULL;

	if (target->type == SGML_TOKEN_PROCESS_XML)
		instruction->data.proc_instruction.type = DOM_PROC_INSTRUCTION_XML;
	else
		instruction->data.proc_instruction.type = DOM_PROC_INSTRUCTION;

	return push_dom_node(stack, instruction);
}
Beispiel #11
0
/* Parse a CSS3 selector and add selector nodes to the @select struct.
 *
 * Tokens are scanned from @string until the scanner runs out or one of
 * '{', '}', ';' or ',' is met. Each selector fragment is collected in a
 * temporary selector node; once the fragment has a node type, it is copied
 * to a newly allocated selector node. That node becomes @select's selector
 * when @stack is empty, otherwise it is added to the matching node list of
 * the node on top of @stack. The node is then pushed on @stack; nodes that
 * are not elements are popped again right away.
 *
 * Returns DOM_CODE_OK when @select has a selector after parsing,
 * DOM_CODE_SYNTAX_ERR on malformed input, DOM_CODE_ALLOC_ERR when memory
 * could not be allocated, an error code passed on from one of the helpers,
 * or DOM_CODE_ERR when no selector was produced. */
static enum dom_code
parse_dom_select(struct dom_select *select, struct dom_stack *stack,
		 struct dom_string *string)
{
	struct dom_scanner scanner;
	struct dom_select_node sel;

	init_dom_scanner(&scanner, &dom_css_scanner_info, string, 0, 0, 1, 0, 0);

	memset(&sel, 0, sizeof(sel));

	while (dom_scanner_has_tokens(&scanner)) {
		struct dom_scanner_token *token = get_dom_scanner_token(&scanner);
		enum dom_code code;
		struct dom_select_node *select_node;

		assert(token);

		/* These tokens end the selector. */
		if (token->type == '{'
		    || token->type == '}'
		    || token->type == ';'
		    || token->type == ',')
			break;

		/* Examine the selector fragment */

		switch (token->type) {
		case CSS_TOKEN_IDENT:
			sel.node.type = DOM_NODE_ELEMENT;
			copy_dom_string(&sel.node.string, &token->string);
			if (dom_scanner_token_contains(token, "*"))
				sel.match.element |= DOM_SELECT_ELEMENT_UNIVERSAL;
			break;

		case CSS_TOKEN_HASH:
		case CSS_TOKEN_HEX_COLOR:
			/* ID fragment */
			sel.node.type = DOM_NODE_ATTRIBUTE;
			sel.match.attribute |= DOM_SELECT_ATTRIBUTE_ID;
			/* Skip the leading '#'. */
			skip_dom_scanner_token_char(token);
			break;

		case '[':
			sel.node.type = DOM_NODE_ATTRIBUTE;
			code = parse_dom_select_attribute(&sel, &scanner);
			if (code != DOM_CODE_OK)
				return code;
			break;

		case '.':
			/* Class fragment: matched as a space separated list
			 * in the "class" attribute. */
			token = get_next_dom_scanner_token(&scanner);
			if (!token || token->type != CSS_TOKEN_IDENT)
				return DOM_CODE_SYNTAX_ERR;

			sel.node.type = DOM_NODE_ATTRIBUTE;
			sel.match.attribute |= DOM_SELECT_ATTRIBUTE_SPACE_LIST;
			set_dom_string(&sel.node.string, "class", -1);
			copy_dom_string(&sel.node.data.attribute.value, &token->string);
			break;

		case ':':
			code = parse_dom_select_pseudo(select, &sel, &scanner);
			if (code != DOM_CODE_OK)
				return code;
			break;

		/* Combinators: only accepted while the relation of the
		 * pending fragment is still the descendant relation. */
		case '>':
			if (get_element_relation(&sel) != DOM_SELECT_RELATION_DESCENDANT)
				return DOM_CODE_SYNTAX_ERR;
			sel.match.element |= DOM_SELECT_RELATION_DIRECT_CHILD;
			break;

		case '+':
			if (get_element_relation(&sel) != DOM_SELECT_RELATION_DESCENDANT)
				return DOM_CODE_SYNTAX_ERR;
			sel.match.element |= DOM_SELECT_RELATION_DIRECT_ADJACENT;
			break;

		case '~':
			if (get_element_relation(&sel) != DOM_SELECT_RELATION_DESCENDANT)
				return DOM_CODE_SYNTAX_ERR;
			sel.match.element |= DOM_SELECT_RELATION_INDIRECT_ADJACENT;
			break;

		default:
			return DOM_CODE_SYNTAX_ERR;
		}

		skip_dom_scanner_token(&scanner);

		/* No node type yet (e.g. only a combinator was seen): keep
		 * collecting into the same fragment. */
		if (sel.node.type == DOM_NODE_UNKNOWN)
			continue;

		select_node = mem_calloc(1, sizeof(*select_node));
		/* The allocation can fail; do not copy into a NULL node. */
		if (!select_node)
			return DOM_CODE_ALLOC_ERR;

		copy_struct(select_node, &sel);

		if (!dom_stack_is_empty(stack)) {
			struct dom_node *node = &select_node->node;
			struct dom_node *parent = get_dom_stack_top(stack)->node;
			struct dom_node_list **list = get_dom_node_list(parent, node);
			int sort = (node->type == DOM_NODE_ATTRIBUTE);
			int index;

			assertm(list != NULL, "Adding node to bad parent [%d -> %d]",
				node->type, parent->type);

			/* Attribute nodes get a map index when the list
			 * already has entries; everything else uses -1. */
			index = *list && (*list)->size > 0 && sort
				? get_dom_node_map_index(*list, node) : -1;

			if (!add_to_dom_node_list(list, node, index)) {
				done_dom_node(node);
				return DOM_CODE_ALLOC_ERR;
			}

			node->parent = parent;

		} else {
			assert(!select->selector);
			select->selector = select_node;
		}

		code = push_dom_node(stack, &select_node->node);
		if (code != DOM_CODE_OK)
			return code;

		/* Only element nodes stay on the stack. */
		if (select_node->node.type != DOM_NODE_ELEMENT)
			pop_dom_node(stack);

		memset(&sel, 0, sizeof(sel));
	}

	if (select->selector)
		return DOM_CODE_OK;

	return DOM_CODE_ERR;
}
Beispiel #12
0
/* Test driver for the SGML parser.
 *
 * Leading "--" options are parsed first; the first argument that does not
 * start with "--" ends option parsing:
 *   --uri <uri>        URI passed to the parser
 *   --src <string>     source string to parse
 *   --stdin <number>   read the source from stdin in chunks of <number> bytes
 *   --normalize <cfg>  build a tree and add a normalizer for the given config
 *   --print-lines      set SGML_PARSER_COUNT_LINES
 *   --incomplete       parse incrementally and mark the source as incomplete
 *   --dump             build a tree and dump it to stdout
 *   --error            set SGML_PARSER_DETECT_ERRORS
 *   --help             calls die(NULL)
 *
 * Returns 0 when the last parse call returned DOM_CODE_OK and 1 otherwise
 * (including when the parser could not be created). */
int
main(int argc, char *argv[])
{
	struct sgml_parser *parser;
	enum sgml_document_type doctype = SGML_DOCTYPE_HTML;
	enum sgml_parser_flag flags = 0;
	enum sgml_parser_type type = SGML_PARSER_STREAM;
	enum dom_code code = 0;
	enum dom_config_flag normalize_flags = 0;
	struct dom_config config;
	int normalize = 0;
	int dump = 0;
	int complete = 1;
	size_t read_stdin = 0;
	struct dom_string uri = STATIC_DOM_STRING("dom://test");
	struct dom_string source = STATIC_DOM_STRING("(no source)");
	int i;

	for (i = 1; i < argc; i++) {
		char *arg = argv[i];

		if (strncmp(arg, "--", 2))
			break;

		arg += 2;

		if (get_test_opt(&arg, "uri", &i, argc, argv, "a URI")) {
			set_dom_string(&uri, arg, strlen((const char *)arg));

		} else if (get_test_opt(&arg, "src", &i, argc, argv, "a string")) {
			set_dom_string(&source, arg, strlen((const char *)arg));

		} else if (get_test_opt(&arg, "stdin", &i, argc, argv, "a number")) {
			char *end = NULL;
			long chunk_size = strtol(arg, &end, 10);

			/* Reject non-numeric and negative values instead of
			 * letting a negative number wrap around to a huge
			 * size_t or silently treating garbage as zero. */
			if (end == arg || *end != '\0' || chunk_size < 0)
				die("Invalid chunk size '%s' for --stdin", arg);

			read_stdin = (size_t) chunk_size;
			flags |= SGML_PARSER_INCREMENTAL;

		} else if (get_test_opt(&arg, "normalize", &i, argc, argv, "a string")) {
			normalize = 1;
			normalize_flags = parse_dom_config(arg, ',');
			type = SGML_PARSER_TREE;

		} else if (!strcmp(arg, "print-lines")) {
			flags |= SGML_PARSER_COUNT_LINES;

		} else if (!strcmp(arg, "incomplete")) {
			flags |= SGML_PARSER_INCREMENTAL;
			complete = 0;

		} else if (!strcmp(arg, "dump")) {
			type = SGML_PARSER_TREE;
			dump = 1;

		} else if (!strcmp(arg, "error")) {
			flags |= SGML_PARSER_DETECT_ERRORS;

		} else if (!strcmp(arg, "help")) {
			die(NULL);

		} else {
			/* arg was advanced past the "--" prefix above. */
			die("Unknown argument '%s'", arg - 2);
		}
	}

	parser = init_sgml_parser(type, doctype, &uri, flags);
	if (!parser) return 1;

	parser->error_func = sgml_error_function;
	if (normalize)
		add_dom_config_normalizer(&parser->stack, &config, normalize_flags);
	else if (!dump)
		add_dom_stack_context(&parser->stack, NULL, &sgml_parser_test_context_info);

	if (read_stdin > 0) {
		unsigned char *buffer;

		buffer = mem_alloc(read_stdin);
		if (!buffer)
			die("Cannot allocate buffer");

		complete = 0;

		/* Feed stdin to the parser chunk by chunk until EOF or until
		 * the parser reports an error. */
		while (!complete) {
			size_t size = fread(buffer, 1, read_stdin, stdin);

			if (ferror(stdin))
				die("error reading from stdin");

			complete = feof(stdin);

			code = parse_sgml(parser, buffer, size, complete);
			switch (code) {
			case DOM_CODE_OK:
				break;

			case DOM_CODE_INCOMPLETE:
				/* Incomplete is only fine while more input
				 * is expected. */
				if (!complete) break;
				/* Error */
				/* fall through */
			default:
				complete = 1;
			}
		}

		mem_free(buffer);

	} else {
		code = parse_sgml(parser, source.string, source.length, complete);
	}

	if (parser->root) {
		assert(!complete || parser->stack.depth > 0);

		/* Unwind the whole parser stack, immutable states included. */
		while (!dom_stack_is_empty(&parser->stack)) {
			get_dom_stack_top(&parser->stack)->immutable = 0;
			pop_dom_node(&parser->stack);
		}

		if (normalize || dump) {
			struct dom_stack stack;

			/* Note, that we cannot free nodes when walking the DOM
			 * tree since walk_dom_node() uses an index to traverse
			 * the tree. */
			init_dom_stack(&stack, DOM_STACK_FLAG_NONE);
			/* XXX: This context needs to be added first because it
			 * assumes the parser can be accessed via
			 * stack->contexts[0].data. */
			if (normalize)
				add_dom_stack_context(&stack, parser, &sgml_parser_test_context_info);
			else if (dump)
				add_sgml_file_dumper(&stack, stdout);
			walk_dom_nodes(&stack, parser->root);
			done_dom_stack(&stack);
			done_dom_node(parser->root);
		}
	}

	done_sgml_parser(parser);
#ifdef DEBUG_MEMLEAK
	check_memory_leaks();
#endif

	return code != DOM_CODE_OK ? 1 : 0;
}
Beispiel #13
0
/* Walk the DOM tree below @root by pushing and popping its nodes on @stack.
 *
 * A temporary walk context is added to @stack to keep a per-state list and
 * index. For each state on top of the stack the next child node from the
 * current list is pushed; when the list is exhausted (or the push fails) the
 * state is popped. The lists visited depend on the node type:
 *  - document: the children list
 *  - element: first the attribute map, then the children list
 *  - processing instruction: the attribute map
 *  - document type: first the entities list, then the notations list
 *  - all other node types: no list
 *
 * The walk context is removed again before returning, also when the root
 * node cannot be pushed.
 *
 * FIXME: Instead of walking all nodes in the tree only visit those which are
 * of actual interest to the contexts on the stack. */
void
walk_dom_nodes(struct dom_stack *stack, struct dom_node *root)
{
	struct dom_stack_context *context;

	assert(root && stack);

	context = add_dom_stack_context(stack, NULL, &dom_stack_walk_context_info);
	if (!context)
		return;

	if (push_dom_node(stack, root) != DOM_CODE_OK) {
		/* Do not leave the walk context behind on the caller's
		 * stack when the walk never started. */
		done_dom_stack_context(stack, context);
		return;
	}

	while (!dom_stack_is_empty(stack)) {
		struct dom_stack_state *state = get_dom_stack_top(stack);
		struct dom_stack_walk_state *wstate = get_dom_stack_state_data(context, state);
		struct dom_node_list *list = wstate->list;
		struct dom_node *node = state->node;

		switch (node->type) {
		case DOM_NODE_DOCUMENT:
			if (!list) list = node->data.document.children;
			break;

		case DOM_NODE_ELEMENT:
			if (!list) list = node->data.element.map;

			/* Already walking the children. */
			if (list == node->data.element.children) break;
			/* Still entries left in the attribute map. */
			if (is_dom_node_list_member(list, wstate->index)
			    && list == node->data.element.map)
				break;

			/* Done with the map; move on to the children. */
			list = node->data.element.children;
			break;

		case DOM_NODE_PROCESSING_INSTRUCTION:
			if (!list) list = node->data.proc_instruction.map;
			break;

		case DOM_NODE_DOCUMENT_TYPE:
			if (!list) list = node->data.document_type.entities;

			/* Already walking the notations. */
			if (list == node->data.document_type.notations) break;
			/* Still entries left in the entities list. */
			if (is_dom_node_list_member(list, wstate->index)
			    && list == node->data.document_type.entities)
				break;

			/* Done with the entities; move on to the notations. */
			list = node->data.document_type.notations;
			break;

		case DOM_NODE_ATTRIBUTE:
		case DOM_NODE_TEXT:
		case DOM_NODE_CDATA_SECTION:
		case DOM_NODE_COMMENT:
		case DOM_NODE_NOTATION:
		case DOM_NODE_DOCUMENT_FRAGMENT:
		case DOM_NODE_ENTITY_REFERENCE:
		case DOM_NODE_ENTITY:
		default:
			break;
		}

		/* Reset list state if it is a new list */
		if (list != wstate->list) {
			wstate->list  = list;
			wstate->index = 0;
		}

		/* If we have next child node */
		if (is_dom_node_list_member(list, wstate->index)) {
			struct dom_node *child = list->entries[wstate->index++];

			if (push_dom_node(stack, child) == DOM_CODE_OK)
				continue;
		}

		pop_dom_node(stack);
	}

	done_dom_stack_context(stack, context);
}