/// Download worker: drains the `urls` queue, fetches each page over HTTP and
/// stores its body in `pages`, queueing every not-yet-downloaded link it finds.
///
/// Per URL:
///   - skipped when it is already a key of `pages` or is not a valid URI;
///   - skipped when the server answers with a 4xx/5xx status;
///   - on any std::exception, the URL is re-queued and the worker pauses for
///     five seconds before carrying on (so a failing URL is retried).
///
/// @param urls  Work queue; consumed from the front, discovered links are pushed.
/// @param pages Map from URL to downloaded page body; also used as the
///              "already processed" set.
///
/// NOTE(review): no locking is visible in this block although `cv.notify_all()`
/// suggests other threads share `urls`/`pages` — confirm the callers synchronise.
void web_crawler::job_downloader(web_crawler::urls_t& urls, web_crawler::pages_t& pages) {
    while (!urls.empty()) {
        uri::uri url = urls.front();
        urls.pop();
        std::cout << url.string() << std::endl;

        if (pages.find(url) != pages.end()) {
            std::cout << "url already processed " << std::endl;
            continue;
        }
        if (!url.is_valid()) {
            std::cout << "invalid url " << std::endl;
            continue;
        }

        try {
            // One client per worker thread, reused across requests.
            thread_local http::client client;
            auto request = http::client::request(url);
            http::client::response response = client.get(request);

            auto st = status(response);
            // 400 itself is a client error too, so the bound is inclusive.
            if (st >= 400) {
                std::cout << "error " << st << std::endl;
                continue;
            }

            const auto page = body(response);
            auto links = search_for_links(page);

            // Drop links whose page has already been downloaded. remove_if only
            // shuffles elements; the erase is what actually shrinks the container.
            links.erase(std::remove_if(links.begin(), links.end(),
                                       [&pages](const uri::uri& link) {
                                           return pages.find(link) != pages.end();
                                       }),
                        links.end());

            for (const auto& link : links) {
                // Relative links are resolved by plain concatenation onto the
                // current URL; absolute links are queued unchanged.
                auto url_to_push = is_relative(link) ? url.string() + link.string()
                                                     : link.string();
                urls.push(url_to_push);
            }

            pages[url] = page;
            cv.notify_all();
        } catch (const std::exception& e) {
            std::cout << "error " << e.what() << std::endl;
            std::cout << urls.size() << " " << pages.size() << std::endl;
            // Put the URL back so it is retried after the back-off below.
            urls.push(url);
            std::this_thread::sleep_for(std::chrono::milliseconds(5000));
        }
    }
}
/**
 * Recursively walks the Gumbo tree rooted at `node` and appends one entry to
 * `tag->tagElement` for every element whose tag equals `tag->htmlTag`.
 *
 * For each matching element:
 *   - if it carries the attribute named by `tag->attr`, only that attribute is
 *     stored in the entry's `attrList`;
 *   - otherwise every attribute of the element is stored;
 *   - `content` is set to `search_text(node, tag)` when `isObtainContentTag()`
 *     is true, and to an empty string otherwise.
 *
 * @param node Current node; null and non-element nodes are ignored.
 * @param tag  Supplies the search criteria (`htmlTag`, `attr`) and receives
 *             the results (`tagElement`).
 */
void HtmlParser::search_for_links(GumboNode* node, Tags *tag) {
    if (!node || node->type != GUMBO_NODE_ELEMENT) {
        return;
    }

    if (node->v.element.tag == tag->htmlTag) {
        attrValue attrElement;

        GumboAttribute* attribute =
            gumbo_get_attribute(&node->v.element.attributes, tag->attr.c_str());
        if (attribute) {
            attrElement.attrList.insert(make_pair(attribute->name, attribute->value));
        } else {
            // The requested attribute is absent: collect all remaining
            // attributes of the element and store them in the entry instead.
            for (unsigned int i = 0; i < node->v.element.attributes.length; i++) {
                attribute = static_cast<GumboAttribute*>(node->v.element.attributes.data[i]);
                attrElement.attrList.insert(make_pair(attribute->name, attribute->value));
            }
        }

        attrElement.content = isObtainContentTag() ? search_text(node, tag) : "";
        tag->tagElement.push_back(attrElement);
    }

    // Descend into the children regardless of whether this element matched.
    GumboVector* children = &node->v.element.children;
    for (unsigned int i = 0; i < children->length; ++i) {
        search_for_links(static_cast<GumboNode*>(children->data[i]), tag);
    }
}
void HtmlParser::buscarElementos(char *html, Tags *tag) { GumboOutput* output = gumbo_parse(html); search_for_links(output->root, tag); gumbo_destroy_output(&kGumboDefaultOptions, output); }