std::string parse(const std::string& xml, const std::string& name, const std::string& url, const std::string& html, const std::string& htmlheader, std::vector<std::pair<std::string, std::string> >& attaches) { std::string ret(""); std::string pp = getParserPath(url); xsltStylesheetPtr xslt = xsltParseStylesheetFile(BAD_CAST pp.c_str()); htmlDocPtr doc = NULL; static std::string encoding("gb18030"); std::string mimetype = getMIMEType(htmlheader, html); if (!mimetype.empty() && mimetype == "text/xml") { doc = html.empty() ? NULL : xmlReadDoc(BAD_CAST html.c_str(), NULL, encoding.c_str(), XML_PARSE_RECOVER); } else { doc = html.empty() ? NULL : htmlParseDoc(BAD_CAST html.c_str(), encoding.c_str()); } if (doc != NULL) { const char *params[7] = {0}; size_t n_param = 0; params[n_param] = NULL; xmlDocPtr res = xsltApplyStylesheet(xslt, doc, params); //free_xslt_params(params, n_param); if (res != NULL) { xmlChar *s = NULL; int len = 0; if (xsltSaveResultToString(&s, &len, res, xslt) >= 0) { ret.assign((const char *)s, len); xmlFree(s); } xmlFreeDoc(res); } xmlFreeDoc(doc); } return ret; }
int main(int argc, char **argv) { assert(argv[1]); std::ifstream t(argv[1]); std::string str((std::istreambuf_iterator<char>(t)), std::istreambuf_iterator<char>()); //tidy::tidy(str); //xmlDocPtr doc = htmlParseFile(f_doc, NULL); xmlDocPtr doc = htmlParseDoc(BAD_CAST str.c_str(), NULL); if (!doc) { printf("libxml failed to parse file <%s>\n", argv[1]); return 1; } xmlNode *doc_head = xmlDocGetRootElement(doc); if (argc < 3) { print_element_names(doc_head, 0); } else { run_xpath(doc, argv[2]); } //xmlFree(doc_head); return 0; }
/*
 * Parses `html` as a UTF-8 HTML document and feeds it through the text and
 * URI tokenizers, threading the `features` collection through both.
 *
 * html      NUL-terminated HTML markup.
 * features  Existing feature collection to extend.
 *
 * Returns the collection as returned by the tokenizers, or `features`
 * unchanged when the markup could not be parsed into a document.
 *
 * Note: xmlSubstituteEntitiesDefault(0) is called before every parse.
 * NOTE(review): that is a library-wide libxml2 setting, so it presumably
 * affects other parsers in the process as well - confirm this is intended.
 */
Pvoid_t html_tokenize_into_features(const char * html, Pvoid_t features)
{
    htmlDocPtr parsed;
    Buffer *text;

    xmlSubstituteEntitiesDefault(0);

    parsed = htmlParseDoc(BAD_CAST html, "UTF-8");
    if (!parsed) {
        return features;
    }

    /* Text tokens first, then URIs pulled from the document tree. */
    text = extractText(parsed);
    features = tokenize_text(text->buf, text->length, features);
    features = tokenize_uris(parsed, features);

    free_buffer(text);
    xmlFreeDoc(parsed);

    return features;
}