/*
 * Skip over an XML comment.
 * When we get here, "<!-" has already been consumed; the next character
 * must be the second '-' of the comment opener.
 *
 * Returns Comment once a '>' immediately preceded by at least two '-'
 * characters has been read. Returns None, after reporting a parse error,
 * if the opener is malformed or the input ends inside the comment.
 */
xml_token_type_t
xml_skip_comment(xml_reader_t *xr)
{
	int dashes = 0;
	int ch;

	if (xml_getc(xr) != '-') {
		xml_parse_error(xr, "Unexpected <!-...> element");
		return None;
	}

	for (ch = xml_getc(xr); ch != EOF; ch = xml_getc(xr)) {
		if (ch == '-') {
			dashes++;
			continue;
		}

		if (ch == '>' && dashes >= 2) {
#ifdef XMLDEBUG_PARSER
			xml_debug("Processed comment\n");
#endif
			return Comment;
		}

		/* Any other character interrupts the run of dashes */
		dashes = 0;
	}

	xml_parse_error(xr, "Unexpected end of file while parsing comment");
	return None;
}
/*
 * Expand an XML entity reference.
 * When we get here, the leading '&' has already been consumed. Characters
 * are read up to the terminating ';' (white space is dropped) and the
 * single character the reference stands for is appended to @res.
 *
 * Supported references:
 *   - the predefined XML entities lt, gt, amp, apos and quot
 *     (matched case-insensitively)
 *   - decimal character references      &#NNN;
 *   - hexadecimal character references  &#xHHH;
 *     (the non-standard spelling &#0xHHH; is accepted as well)
 *
 * Returns true on success. On failure a parse error is reported and false
 * is returned; nothing is appended to @res in that case.
 */
bool
xml_expand_entity(xml_reader_t *xr, string_t *res)
{
	char entity[128];
	unsigned int elen = 0;
	int cc, expanded;

	while ((cc = xml_getc(xr)) != ';') {
		if (cc == EOF) {
			xml_parse_error(xr, "Unexpenced EOF in entity");
			return false;
		}
		if (isspace(cc))
			continue;
		/* Always leave room for the terminating NUL */
		if (elen + 1 >= sizeof(entity)) {
			xml_parse_error(xr, "Entity string too long");
			return false;
		}
		entity[elen++] = cc;
	}
	entity[elen] = '\0';

	if (elen == 0) {
		xml_parse_error(xr, "Empty entity &;");
		return false;
	}

	if (!strcasecmp(entity, "lt")) {
		expanded = '<';
	} else if (!strcasecmp(entity, "gt")) {
		expanded = '>';
	} else if (!strcasecmp(entity, "amp")) {
		expanded = '&';
	} else if (!strcasecmp(entity, "apos")) {
		expanded = '\'';
	} else if (!strcasecmp(entity, "quot")) {
		expanded = '"';
	} else if (entity[0] == '#') {
		const char *digits = entity + 1;
		char *end = NULL;
		unsigned long value;
		int base = 10;

		/*
		 * XML character references are decimal unless introduced by
		 * 'x'. A leading zero must not switch to octal, so the base
		 * is chosen explicitly rather than auto-detected.
		 */
		if (*digits == 'x' || *digits == 'X') {
			base = 16;
			digits++;
		} else if (digits[0] == '0' && (digits[1] == 'x' || digits[1] == 'X')) {
			/* strtoul() in base 16 skips the "0x" prefix itself */
			base = 16;
		}

		/* Reject "&#;", "&#x;" and signed numbers up front */
		if (!isxdigit((unsigned char) *digits))
			goto unknown;

		value = strtoul(digits, &end, base);
		/* 0x10FFFF is the largest Unicode code point; this also rejects overflow */
		if (end == digits || *end != '\0' || value > 0x10FFFF)
			goto unknown;

		/*
		 * NOTE(review): string_putc() presumably appends a single byte,
		 * so code points above 0xFF would be truncated - confirm.
		 */
		expanded = (int) value;
	} else {
		goto unknown;
	}

	string_putc(res, expanded);
	return true;

unknown:
	xml_parse_error(xr, "Cannot expand unknown entity &%s;", entity);
	return false;
}
/*
 * Expand an XML entity reference.
 * When we get here, the leading '&' has already been consumed. Characters
 * are read up to the terminating ';' (white space is dropped) and the
 * single character the reference stands for is appended to @res.
 *
 * Supported references:
 *   - the predefined XML entities lt, gt, amp, apos and quot
 *     (matched case-insensitively)
 *   - decimal character references      &#NNN;
 *   - hexadecimal character references  &#xHHH;
 *     (the non-standard spelling &#0xHHH; is accepted as well)
 *
 * Returns TRUE on success. On failure a parse error is reported and FALSE
 * is returned; nothing is appended to @res in that case.
 */
ni_bool_t
xml_expand_entity(xml_reader_t *xr, ni_stringbuf_t *res)
{
	char temp[128];
	ni_stringbuf_t entity = NI_STRINGBUF_INIT_BUFFER(temp);
	int cc, expanded;

	while ((cc = xml_getc(xr)) != ';') {
		if (cc == EOF) {
			xml_parse_error(xr, "Unexpenced EOF in entity");
			return FALSE;
		}
		if (isspace(cc))
			continue;
		/* Always leave room for the terminating NUL */
		if (entity.len + sizeof(char) >= entity.size) {
			xml_parse_error(xr, "Entity is too long");
			return FALSE;
		}
		ni_stringbuf_putc(&entity, cc);
	}

	if (ni_string_empty(entity.string)) {
		xml_parse_error(xr, "Empty entity &;");
		return FALSE;
	}

	if (!strcasecmp(entity.string, "lt")) {
		expanded = '<';
	} else if (!strcasecmp(entity.string, "gt")) {
		expanded = '>';
	} else if (!strcasecmp(entity.string, "amp")) {
		expanded = '&';
	} else if (!strcasecmp(entity.string, "apos")) {
		expanded = '\'';
	} else if (!strcasecmp(entity.string, "quot")) {
		expanded = '"';
	} else if (entity.string[0] == '#') {
		const char *digits = entity.string + 1;
		char *end = NULL;
		unsigned long value;
		int base = 10;

		/*
		 * XML character references are decimal unless introduced by
		 * 'x'. A leading zero must not switch to octal, so the base
		 * is chosen explicitly rather than auto-detected.
		 */
		if (*digits == 'x' || *digits == 'X') {
			base = 16;
			digits++;
		} else if (digits[0] == '0' && (digits[1] == 'x' || digits[1] == 'X')) {
			/* strtoul() in base 16 skips the "0x" prefix itself */
			base = 16;
		}

		/* Reject "&#;", "&#x;" and signed numbers up front */
		if (!isxdigit((unsigned char) *digits))
			goto unknown;

		value = strtoul(digits, &end, base);
		/* 0x10FFFF is the largest Unicode code point; this also rejects overflow */
		if (end == digits || *end != '\0' || value > 0x10FFFF)
			goto unknown;

		/*
		 * NOTE(review): ni_stringbuf_putc() presumably appends a single
		 * byte, so code points above 0xFF would be truncated - confirm.
		 */
		expanded = (int) value;
	} else {
		goto unknown;
	}

	ni_stringbuf_putc(res, expanded);
	return TRUE;

unknown:
	xml_parse_error(xr, "Cannot expand unknown entity &%s;", entity.string);
	return FALSE;
}
/*
 * Fetch the next token from the XML stream.
 *
 * @res is cleared first and then receives the text of the token.
 * The scanner used depends on the reader state: Initial and Tag each
 * have their own tokenizer; in state Error, or in any unknown state,
 * None is returned (the latter after reporting a parse error).
 */
xml_token_type_t
xml_get_token(xml_reader_t *xr, ni_stringbuf_t *res)
{
#ifdef XMLDEBUG_PARSER
	xml_parser_state_t old_state = xr->state;
#endif
	xml_token_type_t token;

	ni_stringbuf_clear(res);

	/* A reader that already failed stays failed */
	if (xr->state == Error)
		return None;

	if (xr->state == Initial) {
		token = xml_get_token_initial(xr, res);
	} else if (xr->state == Tag) {
		token = xml_get_token_tag(xr, res);
	} else {
		xml_parse_error(xr, "Unexpected state %u in XML reader", xr->state);
		return None;
	}

	xml_debug("++ %3u %-7s %-10s (%s)\n",
			xr->lineCount,
			xml_parser_state_name(old_state),
			xml_token_name(token),
			res->string ? res->string : "");
	return token;
}
/*
 * Parse the attribute list of a tag and attach the attributes to @node.
 *
 * Tokens are consumed until the tag is closed. Each attribute is an
 * Identifier, optionally followed by '=' and a quoted string; an
 * attribute without '=' is added with a NULL value.
 *
 * Returns the token that closed the tag (RightAngle, RightAngleQ or
 * RightAngleSlash), or None after reporting a parse error.
 */
xml_token_type_t
xml_get_tag_attributes(xml_reader_t *xr, xml_node_t *node)
{
	ni_stringbuf_t tokenValue, attrName;
	xml_token_type_t token;

	ni_stringbuf_init(&tokenValue);
	ni_stringbuf_init(&attrName);

	token = xml_get_token(xr, &tokenValue);
	while (1) {
		if (token == RightAngle || token == RightAngleQ || token == RightAngleSlash)
			break;

		if (token != Identifier) {
			xml_parse_error(xr, "Unexpected token in tag attributes");
			token = None;
			break;
		}

		/* Keep the name; tokenValue is reused for the following tokens */
		ni_stringbuf_move(&attrName, &tokenValue);

		token = xml_get_token(xr, &tokenValue);
		if (token != Equals) {
			/*
			 * Attribute without a value. The token just read has
			 * not been handled yet, so re-examine it at the top
			 * of the loop instead of fetching a new one.
			 */
			xml_node_add_attr(node, attrName.string, NULL);
			continue;
		}

		token = xml_get_token(xr, &tokenValue);
		if (token != QuotedString) {
			xml_parse_error(xr, "Attribute value not a quoted string!");
			token = None;
			break;
		}

		xml_debug("  attr %s=%s\n", attrName.string, tokenValue.string);
		xml_node_add_attr(node, attrName.string, tokenValue.string);

		token = xml_get_token(xr, &tokenValue);
	}

	ni_stringbuf_destroy(&tokenValue);
	ni_stringbuf_destroy(&attrName);
	return token;
}
/*
 * Process a CDATA section.
 * When we get here, "<[CDATA" has already been consumed; the next
 * character must be the '[' that opens the section body.
 *
 * Everything up to the terminating "]]>" is appended to @res. On success
 * the reader state is reset to Initial and CData is returned. A missing
 * '[' or a premature end of input is reported as a parse error and None
 * is returned.
 */
xml_token_type_t
xml_process_cdata(xml_reader_t *xr, string_t *res)
{
	int pending = 0;	/* number of ']' seen but not yet emitted (0..2) */
	int ch;

	ch = xml_getc(xr);
	if (ch == EOF)
		goto premature_eof;

	if (ch != '[') {
		xml_parse_error(xr, "Unexpected '%c' after <[CDATA in XML stream", ch);
		return None;
	}

	for (;;) {
		ch = xml_getc(xr);
		if (ch == EOF)
			goto premature_eof;

		/* "]]>" terminates the section */
		if (ch == '>' && pending == 2)
			break;

		if (ch == ']') {
			/*
			 * Hold back at most two brackets; in a longer run the
			 * oldest one is literal data and can be emitted.
			 */
			if (pending < 2)
				pending++;
			else
				string_putc(res, ']');
			continue;
		}

		/* The held-back brackets turned out to be ordinary data */
		for (; pending > 0; pending--)
			string_putc(res, ']');
		string_putc(res, ch);
	}

	xr->state = Initial;
	return CData;

premature_eof:
	xml_parse_error(xr, "Unexpected EOF after <[CDATA in XML stream");
	return None;
}
/*
 * While in state Tag, obtain the next token.
 *
 * Leading white space is skipped. The token text is stored in @res; for
 * a quoted string, @res holds the contents without the surrounding
 * quotes, copied verbatim.
 *
 * Tokens produced:
 *   "?>"  RightAngleQ       (reader state returns to Initial)
 *   ">"   RightAngle        (reader state returns to Initial)
 *   "/>"  RightAngleSlash   (reader state returns to Initial)
 *   "="   Equals
 *   Identifier    starts with a letter, '_' or '!', continues with
 *                 alphanumerics, '_', '!', ':' or '-'
 *   QuotedString  delimited by matching ' or " characters
 *
 * Returns None after reporting a parse error on unexpected input or on
 * end of file.
 */
xml_token_type_t
xml_get_token_tag(xml_reader_t *xr, ni_stringbuf_t *res)
{
	int cc, oc;

	xml_skip_space(xr, NULL);

	cc = xml_getc(xr);
	if (cc == EOF)
		goto unexpected_eof;

	ni_stringbuf_putc(res, cc);

	switch (cc) {
	case '<':
		goto error;

	case '?':
		cc = xml_getc(xr);
		/* Report EOF as such rather than printing it via %c below */
		if (cc == EOF)
			goto unexpected_eof;
		if (cc != '>')
			goto error;
		ni_stringbuf_putc(res, cc);
		xr->state = Initial;
		return RightAngleQ;

	case '>':
		xr->state = Initial;
		return RightAngle;

	case '/':
		cc = xml_getc(xr);
		/* Report EOF as such rather than printing it via %c below */
		if (cc == EOF)
			goto unexpected_eof;
		if (cc != '>')
			goto error;
		ni_stringbuf_putc(res, cc);
		xr->state = Initial;
		return RightAngleSlash;

	case '=':
		return Equals;

	case 'a' ... 'z':
	case 'A' ... 'Z':
	case '_':
	case '!':
		while ((cc = xml_getc(xr)) != EOF) {
			if (!isalnum(cc) && cc != '_' && cc != '!' && cc != ':' && cc != '-') {
				/* Not part of the identifier; leave it for the next token */
				xml_ungetc(xr, cc);
				break;
			}
			ni_stringbuf_putc(res, cc);
		}
		return Identifier;

	case '\'':
	case '"':
		/* Drop the opening quote stored above; only the contents are returned */
		ni_stringbuf_clear(res);
		oc = cc;
		while (1) {
			cc = xml_getc(xr);
			if (cc == EOF) {
				xml_parse_error(xr, "Unexpected EOF while parsing quoted string");
				return None;
			}
			if (cc == oc)
				break;
			ni_stringbuf_putc(res, cc);
		}
		return QuotedString;

	default:
		break;
	}

error:
	xml_parse_error(xr, "Unexpected character %c in XML document", cc);
	return None;

unexpected_eof:
	xml_parse_error(xr, "Unexpected EOF while parsing tag");
	return None;
}
/*
 * While in state Initial, obtain the next token.
 *
 * Returns one of:
 *   EndOfDocument    the input is exhausted
 *   LeftAngle        "<"   (reader state becomes Tag)
 *   LeftAngleSlash   "</"  (reader state becomes Tag)
 *   LeftAngleQ       "<?"  (reader state becomes Tag)
 *   LeftAngleExclam  "<!" not followed by '-' (reader state becomes Tag)
 *   CData            character data up to the next '<' or end of input,
 *                    with entities expanded and empty lines trimmed
 *   None             a parse error occurred
 *
 * Comments are skipped transparently; scanning simply continues after them.
 */
xml_token_type_t
xml_get_token_initial(xml_reader_t *xr, ni_stringbuf_t *res)
{
	xml_token_type_t token;
	int ch, next;

	for (;;) {
		/* Eat initial white space and store it in @res */
		xml_skip_space(xr, res);

		ch = xml_getc(xr);
		if (ch == EOF) {
			ni_stringbuf_clear(res);
			return EndOfDocument;
		}

		/* Anything but '<' starts character data, handled below */
		if (ch != '<')
			break;

		/* The white space collected in @res is of no interest before a tag */
		ni_stringbuf_clear(res);
		ni_stringbuf_putc(res, ch);

		if (xr->state != Initial) {
			xml_parse_error(xr, "Unexpected < in XML stream (state %s)",
					xml_parser_state_name(xr->state));
			return None;
		}

		/* A tag is legal here */
		xr->state = Tag;

		next = xml_getc(xr);
		if (next == '/') {
			ni_stringbuf_putc(res, next);
			return LeftAngleSlash;
		}
		if (next == '?') {
			ni_stringbuf_putc(res, next);
			return LeftAngleQ;
		}
		if (next != '!') {
			xml_ungetc(xr, next);
			return LeftAngle;
		}

		ni_stringbuf_putc(res, next);

		/* "<!" followed by an identifier is reported as LeftAngleExclam */
		next = xml_getc(xr);
		if (next != '-') {
			xml_ungetc(xr, next);
			return LeftAngleExclam;
		}

		token = xml_skip_comment(xr);
		if (token != Comment)
			return token;

		/* Comment consumed; go back to the top and look for the next token */
		xr->state = Initial;
		ni_stringbuf_clear(res);
	}

	/*
	 * Looks like CDATA. Scan to the next '<'.
	 * The first character is known not to be EOF at this point.
	 */
	while (ch != EOF) {
		if (ch == '<') {
			/* Looks like we're done.
			 * FIXME: handle comments within CDATA?
			 */
			xml_ungetc(xr, ch);
			break;
		}

		if (ch != '&')
			ni_stringbuf_putc(res, ch);
		else if (!xml_expand_entity(xr, res))
			return None;

		ch = xml_getc(xr);
	}

	ni_stringbuf_trim_empty_lines(res);
	return CData;
}
/*
 * Parse the content of element @cur, recursing into child elements.
 *
 * Tokens are processed until the matching end tag of @cur is found or,
 * for a node without a parent, until the end of the document.
 * @nesting is only used to indent debug output.
 *
 * Returns TRUE on success, FALSE after a parse error has been reported.
 */
ni_bool_t
xml_process_element_nested(xml_reader_t *xr, xml_node_t *cur, unsigned int nesting)
{
	ni_stringbuf_t tokenValue, identifier;
	xml_token_type_t token;
	xml_node_t *child;
	ni_bool_t rv = FALSE;

	ni_stringbuf_init(&tokenValue);
	ni_stringbuf_init(&identifier);

	for (;;) {
		token = xml_get_token(xr, &tokenValue);

		switch (token) {
		case CData:
			/* Element content */
			xml_node_set_cdata(cur, tokenValue.string);
			break;

		case LeftAngleExclam:
			/* Most likely <!DOCTYPE ...> */
			if (!xml_get_identifier(xr, &identifier)) {
				xml_parse_error(xr, "Bad element: tag open <! not followed by identifier");
				goto out;
			}

			if (strcmp(identifier.string, "DOCTYPE") != 0) {
				xml_parse_error(xr, "Unexpected element: <!%s ...> not supported", identifier.string);
				goto out;
			}

			/* Only the first identifier is remembered as the doctype */
			for (token = xml_get_token(xr, &identifier);
			     token != RightAngle;
			     token = xml_get_token(xr, &identifier)) {
				if (token == Identifier) {
					if (!xr->doctype)
						ni_string_dup(&xr->doctype, identifier.string);
				} else if (token != QuotedString) {
					xml_parse_error(xr, "Error parsing <!DOCTYPE ...> attributes");
					goto out;
				}
			}
			break;

		case LeftAngle:
			/* A new child element starts here */
			if (!xml_get_identifier(xr, &identifier)) {
				xml_parse_error(xr, "Bad element: tag open < not followed by identifier");
				goto out;
			}

			child = xml_node_new(identifier.string, cur);
			if (xr->shared_location)
				child->location = xml_location_new(xr->shared_location, xr->lineCount);

			token = xml_get_tag_attributes(xr, child);
			if (token == RightAngle) {
				/* Handle <foo>...</foo> */
				xml_debug("%*.*s<%s>\n", nesting, nesting, "", child->name);
				if (!xml_process_element_nested(xr, child, nesting + 2))
					goto out;
			} else if (token == RightAngleSlash) {
				/* We parsed a "<foo/>" element - nothing left to do */
				xml_debug("%*.*s<%s/>\n", nesting, nesting, "", child->name);
			} else if (token == None) {
				xml_parse_error(xr, "Error parsing <%s ...> tag attributes", child->name);
				goto out;
			} else {
				xml_parse_error(xr, "Unexpected token %s at end of <%s ...",
						xml_token_name(token), child->name);
				goto out;
			}
			break;

		case LeftAngleSlash:
			/* End of an element */
			if (!xml_get_identifier(xr, &identifier)) {
				xml_parse_error(xr, "Bad element: end tag open </ not followed by identifier");
				goto out;
			}

			if (xml_get_token(xr, &tokenValue) != RightAngle) {
				xml_parse_error(xr, "Bad element: </%s - missing tag close", identifier.string);
				goto out;
			}

			/* A node without a parent has no open element to close */
			if (cur->parent == NULL) {
				xml_parse_error(xr, "Unexpected </%s> tag", identifier.string);
				goto out;
			}

			if (strcmp(cur->name, identifier.string) != 0) {
				xml_parse_error(xr, "Closing tag </%s> does not match <%s>",
						identifier.string, cur->name);
				goto out;
			}

			xml_debug("%*.*s</%s>\n", nesting, nesting, "", cur->name);
			rv = TRUE;
			goto out;

		case LeftAngleQ:
			/* A processing instruction node starts here */
			if (!xml_get_identifier(xr, &identifier)) {
				xml_parse_error(xr, "Bad element: tag open <? not followed by identifier");
				goto out;
			}

			/* The PI node is temporary and never attached to the tree */
			child = xml_node_new(identifier.string, NULL);
			if (xr->shared_location)
				child->location = xml_location_new(xr->shared_location, xr->lineCount);

			token = xml_get_tag_attributes(xr, child);
			if (token == RightAngleQ) {
				xml_debug("%*.*s<%s>\n", nesting, nesting, "", child->name);
				xml_process_pi_node(xr, child);
				xml_node_free(child);
				break;
			}

			if (token == None) {
				xml_parse_error(xr, "Error parsing <?%s ...?> tag attributes", child->name);
			} else {
				xml_parse_error(xr, "Unexpected token %s at end of <?%s ...",
						xml_token_name(token), child->name);
			}
			xml_node_free(child);
			goto out;

		case EndOfDocument:
			/* Only acceptable when no element is still open */
			if (cur->parent) {
				xml_parse_error(xr, "End of document while processing element <%s>", cur->name);
				goto out;
			}
			rv = TRUE;
			goto out;

		case None:
			/* Parser error, already reported */
			goto out;

		default:
			xml_parse_error(xr, "Unexpected token %s", xml_token_name(token));
			goto out;
		}
	}

out:
	ni_stringbuf_destroy(&tokenValue);
	ni_stringbuf_destroy(&identifier);
	return rv;
}