示例#1
0
    std::string parse(const std::string& xml, const std::string& name, const std::string& url, const std::string& html, const std::string& htmlheader, std::vector<std::pair<std::string, std::string> >& attaches) {
        std::string ret("");

        std::string pp = getParserPath(url);
        xsltStylesheetPtr xslt = xsltParseStylesheetFile(BAD_CAST pp.c_str());
        htmlDocPtr doc = NULL;
        static std::string encoding("gb18030");
        std::string mimetype = getMIMEType(htmlheader, html);
        if (!mimetype.empty() && mimetype == "text/xml") {
            doc = html.empty() ? NULL : xmlReadDoc(BAD_CAST html.c_str(), NULL, encoding.c_str(), XML_PARSE_RECOVER);
        } else {
            doc = html.empty() ? NULL : htmlParseDoc(BAD_CAST html.c_str(), encoding.c_str());
        }
        if (doc != NULL) {
            const char *params[7] = {0};
            size_t n_param = 0;
            params[n_param] = NULL;
            xmlDocPtr res = xsltApplyStylesheet(xslt, doc, params);
            //free_xslt_params(params, n_param);
            if (res != NULL) {
                xmlChar *s = NULL;
                int len = 0;
                if (xsltSaveResultToString(&s, &len, res, xslt) >= 0) {
                    ret.assign((const char *)s, len);
                    xmlFree(s);
                }
                xmlFreeDoc(res);
            }
            xmlFreeDoc(doc);
        }
        return ret;
    }
示例#2
0
int main(int argc, char **argv)
{
    assert(argv[1]); 
    std::ifstream t(argv[1]);
    std::string str((std::istreambuf_iterator<char>(t)),
        std::istreambuf_iterator<char>());

    //tidy::tidy(str);
	//xmlDocPtr doc = htmlParseFile(f_doc, NULL);
	xmlDocPtr doc = htmlParseDoc(BAD_CAST str.c_str(), NULL);
	if (!doc) {
		printf("libxml failed to parse file <%s>\n", argv[1]);
		return 1;
	}

	xmlNode *doc_head = xmlDocGetRootElement(doc);
    if (argc < 3) {
        print_element_names(doc_head, 0);
    } else {
        run_xpath(doc, argv[2]);
    } 

	//xmlFree(doc_head);
	return 0;
}
示例#3
0
/*
 * Parses `html` as a UTF-8 HTML document and feeds it through the text and URI
 * tokenizers, threading `features` through both calls.
 *
 * Returns the feature set produced by tokenize_uris(), or `features` unchanged
 * when the document cannot be parsed.
 */
Pvoid_t html_tokenize_into_features(const char * html, Pvoid_t features) {
	/* Set libxml2's global entity-substitution default to 0 before parsing. */
	xmlSubstituteEntitiesDefault(0);

	htmlDocPtr parsed = htmlParseDoc(BAD_CAST html, "UTF-8");
	if (!parsed) {
		return features;
	}

	/* Text tokens first, then URIs taken from the document tree. */
	Buffer *text = extractText(parsed);
	Pvoid_t collected = tokenize_text(text->buf, text->length, features);
	collected = tokenize_uris(parsed, collected);

	free_buffer(text);
	xmlFreeDoc(parsed);
	return collected;
}