Пример #1
0
/**
 * Builds a DocInfo from the HTML text of a document.
 *
 * The document text is passed through cleanText(), parsed with Gumbo, and the
 * resulting tree is handed to extractContent(), extractPageTitle() and
 * extractLinks(). The Gumbo tree is owned by a scope guard so that it is
 * released on every exit path, including when one of the extract* helpers
 * throws.
 *
 * @param document  Source document; its getText() and getURL() are read.
 * @return DocInfo carrying the extracted content, title and links, the
 *         document URL, and a canonical URL. The canonical URL is
 *         curl.CNormalize(url) when curl.Parse(url) succeeds, otherwise the
 *         URL is used unchanged.
 */
DocInfo HTMLParser::parse(RICPNS::Document &document) {

	// Owns a Gumbo parse tree and destroys it when the guard leaves scope.
	struct GumboOutputGuard {
		GumboOutput* output;

		explicit GumboOutputGuard(GumboOutput* parsed) : output(parsed) {}

		~GumboOutputGuard() {
			gumbo_destroy_output(&kGumboDefaultOptions, output);
		}

	private:
		// Non-copyable: a copy would destroy the same tree twice.
		GumboOutputGuard(const GumboOutputGuard&);
		GumboOutputGuard& operator=(const GumboOutputGuard&);
	};

	oneurl curl;
	string html;

	cleanText(document.getText(), html);

	string docUrl = document.getURL();
	string content, pageTitle;
	list<pair<string, string> > links;

	{
		// Keep the tree alive only for the duration of the extraction; the
		// guard frees it at the closing brace, before the DocInfo is built,
		// and also if any extract* call throws.
		GumboOutputGuard parsed(gumbo_parse(html.c_str()));
		GumboNode* node = parsed.output->root;

		extractContent(node, content);
		extractPageTitle(node, pageTitle);
		extractLinks(node, links, docUrl);
	}

	DocInfo docInfo;
	docInfo.setContent(content);

	// Fall back to the raw URL when it cannot be parsed for normalization.
	docInfo.setCanonicalUrl(
			curl.Parse(docUrl) ?
					curl.CNormalize(docUrl) : docUrl);

	docInfo.setUrl(docUrl);

	docInfo.setTitle(pageTitle);
	docInfo.setLinks(links);

	return docInfo;
}