/**
 * Parses one document's HTML with Gumbo and collects the results in a DocInfo.
 *
 * Steps, in order:
 *   1. The document text is passed through cleanText() into a local buffer.
 *   2. The buffer is parsed with gumbo_parse() using Gumbo's default options.
 *   3. extractContent(), extractPageTitle() and extractLinks() are run on the
 *      root node of the parse tree.
 *   4. The parse tree is released and the DocInfo is assembled.
 *
 * @param document  Source document; its getText() and getURL() are read.
 * @return DocInfo carrying the content, title, links, URL and canonical URL.
 *         The canonical URL is curl.CNormalize(docUrl) when curl.Parse(docUrl)
 *         succeeds, otherwise the URL itself.
 */
DocInfo HTMLParser::parse(RICPNS::Document &document) {
    // Releases the Gumbo parse tree when the guard leaves scope, so the tree
    // is freed even if one of the extraction steps throws.
    struct GumboOutputGuard {
        GumboOutput* output;

        explicit GumboOutputGuard(GumboOutput* parsed) : output(parsed) {}
        ~GumboOutputGuard() {
            gumbo_destroy_output(&kGumboDefaultOptions, output);
        }

        GumboOutputGuard(const GumboOutputGuard&) = delete;
        GumboOutputGuard& operator=(const GumboOutputGuard&) = delete;
    };

    string html;
    cleanText(document.getText(), html);

    string docUrl = document.getURL();
    string content, pageTitle;
    list<pair<string, string> > links;

    {
        GumboOutputGuard parsed(gumbo_parse(html.c_str()));
        GumboNode* node = parsed.output->root;

        // The three passes run sequentially on the same tree. A variant that
        // ran each pass on its own std::thread existed here but was disabled.
        extractContent(node, content);
        extractPageTitle(node, pageTitle);
        extractLinks(node, links, docUrl);
    }   // parse tree destroyed here; `node` must not be used past this point

    oneurl curl;

    DocInfo docInfo;
    docInfo.setContent(content);
    // Fall back to the URL as-is when it cannot be parsed for normalization.
    docInfo.setCanonicalUrl(
            curl.Parse(docUrl) ? curl.CNormalize(docUrl) : docUrl);
    docInfo.setUrl(docUrl);
    docInfo.setTitle(pageTitle);
    docInfo.setLinks(links);

    return docInfo;
}