static inline void parse_sgml_attributes(struct dom_stack *stack, struct dom_scanner *scanner) { struct dom_scanner_token name; assert(dom_scanner_has_tokens(scanner) && (get_dom_scanner_token(scanner)->type == SGML_TOKEN_ELEMENT_BEGIN || (get_dom_stack_top(stack)->node->type == DOM_NODE_PROCESSING_INSTRUCTION))); if (get_dom_scanner_token(scanner)->type == SGML_TOKEN_ELEMENT_BEGIN) skip_dom_scanner_token(scanner); while (dom_scanner_has_tokens(scanner)) { struct dom_scanner_token *token = get_dom_scanner_token(scanner); assert(token); switch (token->type) { case SGML_TOKEN_TAG_END: skip_dom_scanner_token(scanner); /* and return */ case SGML_TOKEN_ELEMENT: case SGML_TOKEN_ELEMENT_BEGIN: case SGML_TOKEN_ELEMENT_END: case SGML_TOKEN_ELEMENT_EMPTY_END: return; case SGML_TOKEN_IDENT: copy_struct(&name, token); /* Skip the attribute name token */ token = get_next_dom_scanner_token(scanner); if (token && token->type == '=') { /* If the token is not a valid value token * ignore it. */ token = get_next_dom_scanner_token(scanner); if (token && token->type != SGML_TOKEN_IDENT && token->type != SGML_TOKEN_ATTRIBUTE && token->type != SGML_TOKEN_STRING) token = NULL; } else { token = NULL; } add_sgml_attribute(stack, &name, token); /* Skip the value token */ if (token) skip_dom_scanner_token(scanner); break; default: skip_dom_scanner_token(scanner); } } }
/* Skip forward in @scanner until a token of type @skipto is found.
 *
 * Skipping is cut short at the first token whose precedence is greater than
 * @precedence, so that we do not skip too far past special characters.
 *
 * Returns the result of get_next_dom_scanner_token() after stepping past the
 * @skipto token, or NULL when the tokens ran out or a token with higher
 * precedence was hit first. A token of type @skipto always counts as found,
 * whatever its precedence is. */
struct dom_scanner_token *
skip_dom_scanner_tokens(struct dom_scanner *scanner, int skipto, int precedence)
{
	struct dom_scanner_token *current;

	for (current = get_dom_scanner_token(scanner);
	     current;
	     current = get_next_dom_scanner_token(scanner)) {
		if (current->type == skipto)
			return get_next_dom_scanner_token(scanner);

		if (current->precedence > precedence)
			return NULL;
	}

	return NULL;
}
/* Parses attribute selector. For example '[foo="bar"]' or '[foo|="boo"]'. */ static enum dom_code parse_dom_select_attribute(struct dom_select_node *sel, struct dom_scanner *scanner) { struct dom_scanner_token *token = get_dom_scanner_token(scanner); /* Get '['. */ if (token->type != '[') return DOM_CODE_SYNTAX_ERR; /* Get the attribute name. */ token = get_next_dom_scanner_token(scanner); if (!token || token->type != CSS_TOKEN_IDENT) return DOM_CODE_SYNTAX_ERR; copy_dom_string(&sel->node.string, &token->string); /* Get the optional '=' combo or ending ']'. */ token = get_next_dom_scanner_token(scanner); if (!token) return DOM_CODE_SYNTAX_ERR; switch (token->type) { case ']': sel->match.attribute |= DOM_SELECT_ATTRIBUTE_ANY; return DOM_CODE_OK; case CSS_TOKEN_SELECT_SPACE_LIST: sel->match.attribute |= DOM_SELECT_ATTRIBUTE_SPACE_LIST; break; case CSS_TOKEN_SELECT_HYPHEN_LIST: sel->match.attribute |= DOM_SELECT_ATTRIBUTE_HYPHEN_LIST; break; case CSS_TOKEN_SELECT_BEGIN: sel->match.attribute |= DOM_SELECT_ATTRIBUTE_BEGIN; break; case CSS_TOKEN_SELECT_END: sel->match.attribute |= DOM_SELECT_ATTRIBUTE_END; break; case CSS_TOKEN_SELECT_CONTAINS: sel->match.attribute |= DOM_SELECT_ATTRIBUTE_CONTAINS; break; default: return DOM_CODE_SYNTAX_ERR; } /* Get the required value. */ token = get_next_dom_scanner_token(scanner); if (!token) return DOM_CODE_SYNTAX_ERR; switch (token->type) { case CSS_TOKEN_IDENT: case CSS_TOKEN_STRING: copy_dom_string(&sel->node.data.attribute.value, &token->string); break; default: return DOM_CODE_SYNTAX_ERR; } /* Get the ending ']'. */ token = get_next_dom_scanner_token(scanner); if (token && token->type == ']') return DOM_CODE_OK; return DOM_CODE_SYNTAX_ERR; }
/* Parse a CSS3 selector and add selector nodes to the @select struct. */ static enum dom_code parse_dom_select(struct dom_select *select, struct dom_stack *stack, struct dom_string *string) { struct dom_scanner scanner; struct dom_select_node sel; init_dom_scanner(&scanner, &dom_css_scanner_info, string, 0, 0, 1, 0, 0); memset(&sel, 0, sizeof(sel)); while (dom_scanner_has_tokens(&scanner)) { struct dom_scanner_token *token = get_dom_scanner_token(&scanner); enum dom_code code; struct dom_select_node *select_node; assert(token); if (token->type == '{' || token->type == '}' || token->type == ';' || token->type == ',') break; /* Examine the selector fragment */ switch (token->type) { case CSS_TOKEN_IDENT: sel.node.type = DOM_NODE_ELEMENT; copy_dom_string(&sel.node.string, &token->string); if (dom_scanner_token_contains(token, "*")) sel.match.element |= DOM_SELECT_ELEMENT_UNIVERSAL; break; case CSS_TOKEN_HASH: case CSS_TOKEN_HEX_COLOR: /* ID fragment */ sel.node.type = DOM_NODE_ATTRIBUTE; sel.match.attribute |= DOM_SELECT_ATTRIBUTE_ID; /* Skip the leading '#'. 
*/ skip_dom_scanner_token_char(token); break; case '[': sel.node.type = DOM_NODE_ATTRIBUTE; code = parse_dom_select_attribute(&sel, &scanner); if (code != DOM_CODE_OK) return code; break; case '.': token = get_next_dom_scanner_token(&scanner); if (!token || token->type != CSS_TOKEN_IDENT) return DOM_CODE_SYNTAX_ERR; sel.node.type = DOM_NODE_ATTRIBUTE; sel.match.attribute |= DOM_SELECT_ATTRIBUTE_SPACE_LIST; set_dom_string(&sel.node.string, "class", -1); copy_dom_string(&sel.node.data.attribute.value, &token->string); break; case ':': code = parse_dom_select_pseudo(select, &sel, &scanner); if (code != DOM_CODE_OK) return code; break; case '>': if (get_element_relation(&sel) != DOM_SELECT_RELATION_DESCENDANT) return DOM_CODE_SYNTAX_ERR; sel.match.element |= DOM_SELECT_RELATION_DIRECT_CHILD; break; case '+': if (get_element_relation(&sel) != DOM_SELECT_RELATION_DESCENDANT) return DOM_CODE_SYNTAX_ERR; sel.match.element |= DOM_SELECT_RELATION_DIRECT_ADJACENT; break; case '~': if (get_element_relation(&sel) != DOM_SELECT_RELATION_DESCENDANT) return DOM_CODE_SYNTAX_ERR; sel.match.element |= DOM_SELECT_RELATION_INDIRECT_ADJACENT; break; default: return DOM_CODE_SYNTAX_ERR; } skip_dom_scanner_token(&scanner); if (sel.node.type == DOM_NODE_UNKNOWN) continue; select_node = mem_calloc(1, sizeof(*select_node)); copy_struct(select_node, &sel); if (!dom_stack_is_empty(stack)) { struct dom_node *node = &select_node->node; struct dom_node *parent = get_dom_stack_top(stack)->node; struct dom_node_list **list = get_dom_node_list(parent, node); int sort = (node->type == DOM_NODE_ATTRIBUTE); int index; assertm(list != NULL, "Adding node to bad parent [%d -> %d]", node->type, parent->type); index = *list && (*list)->size > 0 && sort ? 
get_dom_node_map_index(*list, node) : -1; if (!add_to_dom_node_list(list, node, index)) { done_dom_node(node); return DOM_CODE_ALLOC_ERR; } node->parent = parent; } else { assert(!select->selector); select->selector = select_node; } code = push_dom_node(stack, &select_node->node); if (code != DOM_CODE_OK) return code; if (select_node->node.type != DOM_NODE_ELEMENT) pop_dom_node(stack); memset(&sel, 0, sizeof(sel)); } if (select->selector) return DOM_CODE_OK; return DOM_CODE_ERR; }
/* Parse a pseudo-class or -element with the syntax: ':<ident>'. */ static enum dom_code parse_dom_select_pseudo(struct dom_select *select, struct dom_select_node *sel, struct dom_scanner *scanner) { struct dom_scanner_token *token = get_dom_scanner_token(scanner); enum dom_select_pseudo pseudo; enum dom_code code; /* Skip double :'s in front of some pseudo's (::first-line, etc.) */ do { token = get_next_dom_scanner_token(scanner); } while (token && token->type == ':'); if (!token || token->type != CSS_TOKEN_IDENT) return DOM_CODE_SYNTAX_ERR; pseudo = get_dom_select_pseudo(token); switch (pseudo) { case DOM_SELECT_PSEUDO_UNKNOWN: return DOM_CODE_ERR; case DOM_SELECT_PSEUDO_CONTAINS: /* FIXME: E:contains("text") */ break; case DOM_SELECT_PSEUDO_NTH_CHILD: case DOM_SELECT_PSEUDO_NTH_LAST_CHILD: code = parse_dom_select_nth_arg(&sel->nth_child, scanner); if (code != DOM_CODE_OK) return code; sel->match.element |= DOM_SELECT_ELEMENT_NTH_CHILD; break; case DOM_SELECT_PSEUDO_FIRST_CHILD: sel->match.element |= DOM_SELECT_ELEMENT_NTH_CHILD; set_dom_select_nth_match(&sel->nth_child, 0, 1); break; case DOM_SELECT_PSEUDO_LAST_CHILD: sel->match.element |= DOM_SELECT_ELEMENT_NTH_CHILD; set_dom_select_nth_match(&sel->nth_child, 0, -1); break; case DOM_SELECT_PSEUDO_ONLY_CHILD: sel->match.element |= DOM_SELECT_ELEMENT_NTH_CHILD; set_dom_select_nth_match(&sel->nth_child, 0, 0); break; case DOM_SELECT_PSEUDO_NTH_TYPE: case DOM_SELECT_PSEUDO_NTH_LAST_TYPE: code = parse_dom_select_nth_arg(&sel->nth_type, scanner); if (code != DOM_CODE_OK) return code; sel->match.element |= DOM_SELECT_ELEMENT_NTH_TYPE; break; case DOM_SELECT_PSEUDO_FIRST_TYPE: sel->match.element |= DOM_SELECT_ELEMENT_NTH_TYPE; set_dom_select_nth_match(&sel->nth_type, 0, 1); break; case DOM_SELECT_PSEUDO_LAST_TYPE: sel->match.element |= DOM_SELECT_ELEMENT_NTH_TYPE; set_dom_select_nth_match(&sel->nth_type, 0, -1); break; case DOM_SELECT_PSEUDO_ONLY_TYPE: sel->match.element |= DOM_SELECT_ELEMENT_NTH_TYPE; 
set_dom_select_nth_match(&sel->nth_type, 0, 0); break; case DOM_SELECT_PSEUDO_ROOT: sel->match.element |= DOM_SELECT_ELEMENT_ROOT; break; case DOM_SELECT_PSEUDO_EMPTY: sel->match.element |= DOM_SELECT_ELEMENT_EMPTY; break; default: /* It's a bitflag! */ select->pseudo |= pseudo; } return DOM_CODE_OK; }
/* Parses the '(...)' part of ':nth-of-type(...)' and ':nth-child(...)'. */ static enum dom_code parse_dom_select_nth_arg(struct dom_select_nth_match *nth, struct dom_scanner *scanner) { struct dom_scanner_token *token = get_next_dom_scanner_token(scanner); int sign = 1; int number = -1; if (!token || token->type != '(') return DOM_CODE_SYNTAX_ERR; token = get_next_dom_scanner_token(scanner); if (!token) return DOM_CODE_SYNTAX_ERR; switch (token->type) { case CSS_TOKEN_IDENT: if (dom_scanner_token_contains(token, "even")) { nth->step = 2; nth->index = 0; } else if (dom_scanner_token_contains(token, "odd")) { nth->step = 2; nth->index = 1; } else { /* Check for 'n' ident below. */ break; } if (skip_css_tokens(scanner, ')')) return DOM_CODE_OK; return DOM_CODE_SYNTAX_ERR; case '-': sign = -1; token = get_next_dom_scanner_token(scanner); if (!token) return DOM_CODE_SYNTAX_ERR; if (token->type != CSS_TOKEN_IDENT) break; if (token->type != CSS_TOKEN_NUMBER) return DOM_CODE_SYNTAX_ERR; /* Fall-through */ case CSS_TOKEN_NUMBER: number = get_scanner_token_number(token); if (number < 0) return DOM_CODE_VALUE_ERR; token = get_next_dom_scanner_token(scanner); if (!token) return DOM_CODE_SYNTAX_ERR; break; default: return DOM_CODE_SYNTAX_ERR; } /* The rest can contain n+ part */ switch (token->type) { case CSS_TOKEN_IDENT: if (!dom_scanner_token_contains(token, "n")) return DOM_CODE_SYNTAX_ERR; nth->step = sign * number; token = get_next_dom_scanner_token(scanner); if (!token) return DOM_CODE_SYNTAX_ERR; if (token->type != '+') break; token = get_next_dom_scanner_token(scanner); if (!token) return DOM_CODE_SYNTAX_ERR; if (token->type != CSS_TOKEN_NUMBER) break; number = get_scanner_token_number(token); if (number < 0) return DOM_CODE_VALUE_ERR; nth->index = sign * number; break; default: nth->step = 0; nth->index = sign * number; } if (skip_css_tokens(scanner, ')')) return DOM_CODE_OK; return DOM_CODE_SYNTAX_ERR; }
static void parse_sgml_plain(struct dom_stack *stack, struct dom_scanner *scanner) { struct dom_scanner_token target; while (dom_scanner_has_tokens(scanner)) { struct dom_scanner_token *token = get_dom_scanner_token(scanner); switch (token->type) { case SGML_TOKEN_ELEMENT: case SGML_TOKEN_ELEMENT_BEGIN: if (!add_sgml_element(stack, token)) { if (token->type == SGML_TOKEN_ELEMENT) { skip_dom_scanner_token(scanner); break; } skip_sgml_tokens(scanner, SGML_TOKEN_TAG_END); break; } if (token->type == SGML_TOKEN_ELEMENT_BEGIN) { parse_sgml_attributes(stack, scanner); } else { skip_dom_scanner_token(scanner); } break; case SGML_TOKEN_ELEMENT_EMPTY_END: pop_dom_node(stack); skip_dom_scanner_token(scanner); break; case SGML_TOKEN_ELEMENT_END: if (!token->string.length) { pop_dom_node(stack); } else { struct dom_string string; struct dom_stack_state *state; set_dom_string(&string, token->string.string, token->string.length); state = search_dom_stack(stack, DOM_NODE_ELEMENT, &string); if (state) { struct sgml_parser_state *pstate; pstate = get_sgml_parser_state(stack, state); copy_struct(&pstate->end_token, token); pop_dom_state(stack, state); } } skip_dom_scanner_token(scanner); break; case SGML_TOKEN_NOTATION_COMMENT: add_sgml_node(stack, DOM_NODE_COMMENT, token); skip_dom_scanner_token(scanner); break; case SGML_TOKEN_NOTATION_ATTLIST: case SGML_TOKEN_NOTATION_DOCTYPE: case SGML_TOKEN_NOTATION_ELEMENT: case SGML_TOKEN_NOTATION_ENTITY: case SGML_TOKEN_NOTATION: skip_dom_scanner_token(scanner); break; case SGML_TOKEN_CDATA_SECTION: add_sgml_node(stack, DOM_NODE_CDATA_SECTION, token); skip_dom_scanner_token(scanner); break; case SGML_TOKEN_PROCESS_XML_STYLESHEET: case SGML_TOKEN_PROCESS_XML: case SGML_TOKEN_PROCESS: copy_struct(&target, token); /* Skip the target token */ token = get_next_dom_scanner_token(scanner); if (!token) break; assert(token->type == SGML_TOKEN_PROCESS_DATA); if (add_sgml_proc_instruction(stack, &target, token) && (target.type == SGML_TOKEN_PROCESS_XML 
|| target.type == SGML_TOKEN_PROCESS_XML_STYLESHEET) && token->string.length > 0) { /* Parse the <?xml data="attributes"?>. */ struct dom_scanner attr_scanner; init_dom_scanner(&attr_scanner, &sgml_scanner_info, &token->string, SGML_STATE_ELEMENT, scanner->count_lines); if (dom_scanner_has_tokens(&attr_scanner)) parse_sgml_attributes(stack, &attr_scanner); } pop_dom_node(stack); skip_dom_scanner_token(scanner); break; case SGML_TOKEN_ENTITY: add_sgml_node(stack, DOM_NODE_ENTITY_REFERENCE, token); skip_dom_scanner_token(scanner); break; case SGML_TOKEN_SPACE: case SGML_TOKEN_TEXT: default: add_sgml_node(stack, DOM_NODE_TEXT, token); skip_dom_scanner_token(scanner); } } }