/*
 * Parse the attribute list of a tag and attach each attribute to @node.
 *
 * Tokens are read from @xr until one of RightAngle, RightAngleQ or
 * RightAngleSlash is seen; that closing token is returned to the caller.
 * An attribute name that is not followed by '=' is added with a NULL value.
 * On a malformed attribute list a parse error is reported and None is
 * returned.
 */
xml_token_type_t
xml_get_tag_attributes(xml_reader_t *xr, xml_node_t *node)
{
	ni_stringbuf_t tokenValue, attrName;
	xml_token_type_t token;

	ni_stringbuf_init(&tokenValue);
	ni_stringbuf_init(&attrName);

	token = xml_get_token(xr, &tokenValue);
	while (1) {
		if (token == RightAngle || token == RightAngleQ || token == RightAngleSlash)
			break;

		if (token != Identifier) {
			xml_parse_error(xr, "Unexpected token in tag attributes");
			token = None;
			break;
		}

		/* Keep the name; tokenValue is reused for the following tokens */
		ni_stringbuf_move(&attrName, &tokenValue);

		token = xml_get_token(xr, &tokenValue);
		if (token != Equals) {
			/* Valueless attribute; the token just read is examined
			 * by the next loop iteration. */
			xml_node_add_attr(node, attrName.string, NULL);
			continue;
		}

		token = xml_get_token(xr, &tokenValue);
		if (token != QuotedString) {
			xml_parse_error(xr, "Attribute value not a quoted string!");
			token = None;
			break;
		}

		xml_debug(" attr %s=%s\n", attrName.string, tokenValue.string);
		xml_node_add_attr(node, attrName.string, tokenValue.string);

		token = xml_get_token(xr, &tokenValue);
	}

	ni_stringbuf_destroy(&tokenValue);
	ni_stringbuf_destroy(&attrName);
	return token;
}
/*
 * Read one attribute as a name token followed by a value token.
 *
 * Returns 1 when *attr is an attribute name and *value an attribute value.
 * Otherwise returns 0 after storing *attr in the parser's push-back slot
 * (note: *attr is what gets pushed back in both failure cases, including
 * the one where the value token was the unexpected one).
 */
int xml_get_attr(XML_PARSER* parser, XML_TOKEN* attr, XML_TOKEN* value)
{
    xml_get_token(parser, attr);
    if (attr->type == XML_TOKEN_ATTR_NAME) {
        xml_get_token(parser, value);
        if (value->type == XML_TOKEN_ATTR_VALUE) {
            return 1;
        }
    }

    /* Not a complete name/value pair: hand the first token back. */
    parser->pushed_back_token = *attr;
    parser->has_token = 1;
    return 0;
}
/*
 * Consume attribute name and attribute value tokens until something else
 * is read (or xml_get_token() reports no token), then push the last token
 * read back into the parser so the next xml_get_token() call sees it.
 */
void xml_skip_attributes(XML_PARSER* parser)
{
    XML_TOKEN token;

    for (;;) {
        if (!xml_get_token(parser, &token)) {
            break;
        }
        if (token.type != XML_TOKEN_ATTR_NAME && token.type != XML_TOKEN_ATTR_VALUE) {
            break;
        }
    }

    parser->pushed_back_token = token;
    parser->has_token = 1;
}
/*
 * Advance to the next element boundary.
 *
 * Tokens are read into *child until a begin-element token (returns 1) or
 * an end-element token (returns 0) is found. Returns 0 as well when
 * xml_get_token() stops delivering tokens.
 */
int xml_get_child(XML_PARSER* parser, XML_TOKEN* child)
{
    while (xml_get_token(parser, child)) {
        switch (child->type) {
        case XML_TOKEN_END_ELEMENT:
            return 0;
        case XML_TOKEN_BEGIN_ELEMENT:
            return 1;
        default:
            /* anything else is skipped */
            break;
        }
    }
    return 0;
}
/*
 * Fetch the text content of the current element and skip the rest of it.
 *
 * Attributes are skipped first. Nested elements encountered before any
 * text are skipped entirely. When a text token is found it is left in
 * *text, the remainder of the element is skipped and 1 is returned.
 * Returns 0 when any other token is read or when xml_get_token() stops
 * delivering tokens.
 */
int xml_get_text_and_skip(XML_PARSER* parser, XML_TOKEN* text)
{
    xml_skip_attributes(parser);

    while (xml_get_token(parser, text)) {
        if (text->type == XML_TOKEN_TEXT) {
            xml_skip_element(parser);
            return 1;
        } else if (text->type == XML_TOKEN_BEGIN_ELEMENT) {
            xml_skip_element(parser);
        } else {
            return 0;
        }
    }

    /* No more tokens: previously control fell off the end of this
     * non-void function, leaving the return value undefined. */
    return 0;
}
/*
 * Skip the remainder of the element that is currently open.
 *
 * Tracks nesting depth starting at 1: every begin-element token increases
 * it, every end-element token decreases it. Stops as soon as the depth
 * reaches zero, or when xml_get_token() stops delivering tokens.
 */
void xml_skip_element(XML_PARSER* parser)
{
    XML_TOKEN token;
    int depth = 1;

    while (depth > 0 && xml_get_token(parser, &token)) {
        if (token.type == XML_TOKEN_BEGIN_ELEMENT) {
            depth++;
        } else if (token.type == XML_TOKEN_END_ELEMENT) {
            depth--;
        }
    }
}
int main(int argc, char** argv) { int n; for(n = 1; n < argc; ++n) { int fd = open(argv[n], O_RDONLY); struct stat stat; char* buf; int nbytes; fstat(fd, &stat); buf = (char*)malloc(stat.st_size+1); nbytes = read(fd, buf, stat.st_size); fprintf(stderr, "read %d/%d bytes\n", nbytes, stat.st_size); buf[stat.st_size] = 0; close(fd); { XML_PARSER parser; XML_TOKEN token; xml_init_parser(&parser, buf); while(xml_get_token(&parser, &token)) { /* const char* types[] = { "begin element", "end element", "attr name", "attr value", "text", }; fprintf(stderr, "parsed '%s': '", types[token.type]); write(2, token.str, token.length); fprintf(stderr, "'\n"); */ } if(token.type == XML_TOKEN_ERROR) { fprintf(stderr, "ERROR: %s\n", token.str); } } } }
/*
 * Main token loop of the parser.
 *
 * After skipping white space, each iteration either consumes a token
 * (xml_get_token() > 0, handled by xml_use_token()) or a punctuation
 * character (fetched with xml_getch(), handled by xml_use_punctuation()).
 * The loop ends when xml_remove_space() or xml_getch() returns 0, when a
 * handler returns something other than _XML_SUCCESS, or when
 * xml_get_token() returns -1, which yields _XML_INCORRECT_TOKEN.
 *
 * Returns the last status recorded.
 */
int xml_parse_tokens(struct xml_parser * sptr )
{
	int	status = _XML_SUCCESS;

	for (;;) {
		int	next = xml_remove_space( sptr );
		int	found;

		if ( next == 0 )
			break;

		found = xml_get_token( sptr );
		if ( found > 0 ) {
			status = xml_use_token( sptr );
			if ( status != _XML_SUCCESS )
				break;
			continue;
		}

		if ( found == -1 ) {
			status = _XML_INCORRECT_TOKEN;
			break;
		}

		/* Not a token: treat the next character as punctuation */
		sptr->punctuation = xml_getch( sptr );
		if ( !sptr->punctuation )
			break;

		status = xml_use_punctuation( sptr );
		if ( status != _XML_SUCCESS )
			break;
	}

	return(status);
}
/*
 * Produce the next token from the parser's input.
 *
 * Returns 1 when a token was stored in *token. Returns 0 once the parser
 * is in the error or end-of-document state; token->type is then
 * XML_TOKEN_ERROR or XML_TOKEN_END_DOCUMENT. A pushed-back token, if any,
 * is delivered before new input is examined.
 *
 * The scanner works on parser->state:
 *   FIND_ELEMENT   - scans up to the next '<'; yields text, a "<?...?>"
 *                    section (skipped), an end tag or a begin tag.
 *   FIND_ATTRIBUTE - inside a begin tag; yields an attribute name, the
 *                    end-element token of a self-closing tag, or leaves
 *                    the tag on '>'.
 *   FIND_VALUE     - after an attribute name; yields the double-quoted
 *                    attribute value.
 *
 * token->str points into the parser's input buffer and is NOT
 * NUL-terminated; use token->length.
 *
 * NOTE(review): ERROR_AND_RETURN / SET_ERROR are defined elsewhere;
 * presumably they record the message, put the parser into the error state
 * and (for ERROR_AND_RETURN) return 0 when the condition holds - confirm.
 */
int xml_get_token(XML_PARSER* parser, XML_TOKEN* token)
{
    if(parser->state == XML_STATE_ERROR) {
        token->str = "";
        token->length = 0;
        token->type = XML_TOKEN_ERROR;
        return 0;
    }

    if(parser->state == XML_STATE_END_DOCUMENT) {
        token->type = XML_TOKEN_END_DOCUMENT;
        return 0;
    }

    if(parser->has_token) {
        parser->has_token = 0;
        *token = parser->pushed_back_token;
        return token->type != XML_TOKEN_END_DOCUMENT && token->type != XML_TOKEN_ERROR;
    }

    if(parser->state == XML_STATE_FIND_ELEMENT) {
        int text_found = 0;
        const char* begin = parser->pos;

        /* Scan to the next tag, noting whether anything besides white
         * space was passed. ctype functions need unsigned char values. */
        while(*parser->pos && *parser->pos != '<') {
            if(isalnum((unsigned char)*parser->pos) || ispunct((unsigned char)*parser->pos)) {
                text_found = 1;
            }
            ++parser->pos;
        }

        if(!*parser->pos) {
            ERROR_AND_RETURN(parser->elements, "unexpected end of document");
            ERROR_AND_RETURN(text_found, "text found at top level");
            parser->state = XML_STATE_END_DOCUMENT;
            token->type = XML_TOKEN_END_DOCUMENT;
            return 0;
        }

        if(text_found) {
            /* pos stays on '<'; the tag is handled by the next call */
            token->type = XML_TOKEN_TEXT;
            token->str = begin;
            token->length = parser->pos - begin;
            return 1;
        } else {
            const char* name = parser->pos+1;

            if(*name == '?') {
                /* "<? ... ?>" section: skip it and fetch the next token */
                parser->pos = strstr(name,"?>");
                ERROR_AND_RETURN(parser->pos == NULL, "End of DTD not found");
                parser->pos += 2;
                return xml_get_token(parser, token);
            } else if(*name == '/') {
                /* "</name>" */
                --parser->elements;
                ++name;
                parser->pos = strchr(name,'>');
                ERROR_AND_RETURN(parser->pos == NULL, "unexpected end of document");
                token->type = XML_TOKEN_END_ELEMENT;
                token->str = name;
                token->length = parser->pos - token->str;
                ++parser->pos;
                return 1;
            } else {
                /* "<name" followed by ' ', '>' or '/' */
                ++parser->elements;
                token->str = parser->pos;
                while(*parser->pos && *parser->pos != ' ' && *parser->pos != '>' && *parser->pos != '/') {
                    ++parser->pos;
                }
                ERROR_AND_RETURN(!*parser->pos, "unexpected end of document");
                token->type = XML_TOKEN_BEGIN_ELEMENT;
                token->str = name;
                token->length = parser->pos - token->str;
                if(*parser->pos == '>') {
                    ++parser->pos;
                    parser->state = XML_STATE_FIND_ELEMENT;
                } else {
                    parser->state = XML_STATE_FIND_ATTRIBUTE;
                }
                return 1;
            }
        }
    } else if(parser->state == XML_STATE_FIND_ATTRIBUTE) {
        while(*parser->pos == ' ') {
            ++parser->pos;
        }
        ERROR_AND_RETURN(!*parser->pos, "unexpected end of document");

        if(*parser->pos == '/') {
            /* Self-closing tag: report it as an (unnamed) end element. */
            --parser->elements;
            token->type = XML_TOKEN_END_ELEMENT;
            token->str = "";
            token->length = 0;
            ++parser->pos;
            while(*parser->pos != '>') {
                ERROR_AND_RETURN(!*parser->pos, "unexpected end of document");
                ERROR_AND_RETURN(isalpha((unsigned char)*parser->pos) || ispunct((unsigned char)*parser->pos), "unexpected characters at end of element");
                ++parser->pos;
            }
            /* pos is left on '>' and the state stays FIND_ATTRIBUTE; the
             * next call consumes the '>' and switches to FIND_ELEMENT.
             * This return was missing, so the result was undefined. */
            return 1;
        } else if(isalpha((unsigned char)*parser->pos)) {
            token->type = XML_TOKEN_ATTR_NAME;
            token->str = parser->pos;
            parser->pos = strchr(token->str, '=');
            ERROR_AND_RETURN(parser->pos == NULL, "unexpected end of document");
            token->length = parser->pos - token->str;
            parser->state = XML_STATE_FIND_VALUE;
            return 1;
        } else if(*parser->pos == '>') {
            ++parser->pos;
            parser->state = XML_STATE_FIND_ELEMENT;
            return xml_get_token(parser, token);
        } else {
            SET_ERROR("unexpected characters when searching for attribute");
            return 0;
        }
    } else if(parser->state == XML_STATE_FIND_VALUE) {
        /* pos is on the '=' found by the attribute-name scan */
        ++parser->pos;
        while(*parser->pos && *parser->pos != '"') {
            ERROR_AND_RETURN(!isspace((unsigned char)*parser->pos), "unexpected character when searching for value");
            ++parser->pos;
        }
        ERROR_AND_RETURN(!*parser->pos, "unexpected end of document");
        token->str = parser->pos+1;
        parser->pos = strchr(token->str,'"');
        ERROR_AND_RETURN(parser->pos == NULL, "unexpected end of document");
        token->length = parser->pos - token->str;
        token->type = XML_TOKEN_ATTR_VALUE;
        ++parser->pos;
        parser->state = XML_STATE_FIND_ATTRIBUTE;
        return 1;
    } else {
        SET_ERROR("bad state");
        return 0;
    }
}
/*
 * Read the next token into @res and report whether it is an Identifier.
 */
ni_bool_t
xml_get_identifier(xml_reader_t *xr, ni_stringbuf_t *res)
{
	xml_token_type_t token = xml_get_token(xr, res);

	return token == Identifier;
}
/*
 * Parse the content of element @cur, recursing for every child element.
 *
 * Tokens are consumed from @xr until the closing tag matching @cur is
 * seen, or - for a node without parent - until the end of the document.
 * @nesting is only used to indent debug output.
 *
 * Returns TRUE on success and FALSE after a parse error.
 */
ni_bool_t
xml_process_element_nested(xml_reader_t *xr, xml_node_t *cur, unsigned int nesting)
{
	ni_stringbuf_t tokenValue, identifier;
	xml_token_type_t token;
	xml_node_t *child;
	ni_bool_t ok = FALSE;

	ni_stringbuf_init(&tokenValue);
	ni_stringbuf_init(&identifier);

	for (;;) {
		token = xml_get_token(xr, &tokenValue);

		switch (token) {
		case CData:
			/* Character data belonging to the current element */
			xml_node_set_cdata(cur, tokenValue.string);
			break;

		case LeftAngleExclam:
			/* "<!" - only <!DOCTYPE ...> is accepted */
			if (!xml_get_identifier(xr, &identifier)) {
				xml_parse_error(xr, "Bad element: tag open <! not followed by identifier");
				goto done;
			}

			if (strcmp(identifier.string, "DOCTYPE") != 0) {
				xml_parse_error(xr, "Unexpected element: <!%s ...> not supported", identifier.string);
				goto done;
			}

			/* Walk the DOCTYPE arguments; the first identifier is
			 * remembered as the document type. */
			while ((token = xml_get_token(xr, &identifier)) != RightAngle) {
				if (token == Identifier) {
					if (!xr->doctype)
						ni_string_dup(&xr->doctype, identifier.string);
				} else if (token != QuotedString) {
					xml_parse_error(xr, "Error parsing <!DOCTYPE ...> attributes");
					goto done;
				}
			}
			break;

		case LeftAngle:
			/* "<name" - a child element opens */
			if (!xml_get_identifier(xr, &identifier)) {
				xml_parse_error(xr, "Bad element: tag open < not followed by identifier");
				goto done;
			}

			child = xml_node_new(identifier.string, cur);
			if (xr->shared_location)
				child->location = xml_location_new(xr->shared_location, xr->lineCount);

			token = xml_get_tag_attributes(xr, child);
			if (token == None) {
				xml_parse_error(xr, "Error parsing <%s ...> tag attributes", child->name);
				goto done;
			}

			if (token == RightAngle) {
				/* <foo> ... </foo>: descend into the child */
				xml_debug("%*.*s<%s>\n", nesting, nesting, "", child->name);
				if (!xml_process_element_nested(xr, child, nesting + 2))
					goto done;
			} else if (token == RightAngleSlash) {
				/* <foo/>: complete, nothing to descend into */
				xml_debug("%*.*s<%s/>\n", nesting, nesting, "", child->name);
			} else {
				xml_parse_error(xr, "Unexpected token %s at end of <%s ...",
						xml_token_name(token), child->name);
				goto done;
			}
			break;

		case LeftAngleSlash:
			/* "</name>" - must close the current element */
			if (!xml_get_identifier(xr, &identifier)) {
				xml_parse_error(xr, "Bad element: end tag open </ not followed by identifier");
				goto done;
			}

			if (xml_get_token(xr, &tokenValue) != RightAngle) {
				xml_parse_error(xr, "Bad element: </%s - missing tag close", identifier.string);
				goto done;
			}

			if (cur->parent == NULL) {
				xml_parse_error(xr, "Unexpected </%s> tag", identifier.string);
				goto done;
			}

			if (strcmp(cur->name, identifier.string) != 0) {
				xml_parse_error(xr, "Closing tag </%s> does not match <%s>",
						identifier.string, cur->name);
				goto done;
			}

			xml_debug("%*.*s</%s>\n", nesting, nesting, "", cur->name);
			ok = TRUE;
			goto done;

		case LeftAngleQ:
			/* "<?name ... ?>" - processing instruction; parsed into
			 * a temporary, parentless node that is always freed here */
			if (!xml_get_identifier(xr, &identifier)) {
				xml_parse_error(xr, "Bad element: tag open <? not followed by identifier");
				goto done;
			}

			child = xml_node_new(identifier.string, NULL);
			if (xr->shared_location)
				child->location = xml_location_new(xr->shared_location, xr->lineCount);

			token = xml_get_tag_attributes(xr, child);
			if (token == RightAngleQ) {
				xml_debug("%*.*s<%s>\n", nesting, nesting, "", child->name);
				xml_process_pi_node(xr, child);
				xml_node_free(child);
				break;
			}

			if (token == None) {
				xml_parse_error(xr, "Error parsing <?%s ...?> tag attributes", child->name);
			} else {
				xml_parse_error(xr, "Unexpected token %s at end of <?%s ...",
						xml_token_name(token), child->name);
			}
			xml_node_free(child);
			goto done;

		case EndOfDocument:
			/* Only acceptable while no element is open */
			if (cur->parent) {
				xml_parse_error(xr, "End of document while processing element <%s>", cur->name);
				goto done;
			}
			ok = TRUE;
			goto done;

		case None:
			/* The tokenizer has already failed */
			goto done;

		default:
			xml_parse_error(xr, "Unexpected token %s", xml_token_name(token));
			goto done;
		}
	}

done:
	ni_stringbuf_destroy(&tokenValue);
	ni_stringbuf_destroy(&identifier);
	return ok;
}
/*
 * Read the next token into @res; true if and only if it is an Identifier.
 */
bool
xml_get_identifier(xml_reader_t *xr, string_t *res)
{
	if (xml_get_token(xr, res) != Identifier)
		return false;

	return true;
}