Beispiel #1
0
/**
 * Download loop: drains `urls`, fetches each page over HTTP and stores its
 * body in `pages`, queueing every link found in the page for a later visit.
 *
 * For each URL popped from the queue:
 *   - it is skipped if it is already a key of `pages` or is not a valid URI;
 *   - it is skipped if the HTTP status is an error (>= 400);
 *   - otherwise the body is scanned with search_for_links(), links that are
 *     already keys of `pages` are dropped, and the rest are pushed to `urls`.
 *
 * If anything in the fetch/parse step throws a std::exception, the URL is
 * pushed back on the queue and the thread sleeps for 5 seconds before going on.
 *
 * @param urls   queue of URLs still to visit; consumed and extended here.
 * @param pages  map from URL to downloaded body; extended here.
 *
 * NOTE(review): there is no retry limit, so a URL that always throws is
 * re-queued forever. No locking is done here on `urls` / `pages`; if several
 * threads run this function on the same containers, confirm the caller
 * synchronizes access.
 */
void web_crawler::job_downloader(web_crawler::urls_t& urls, web_crawler::pages_t& pages)
{
    while (!urls.empty())
    {
        uri::uri url = urls.front();
        urls.pop();
        std::cout << url.string() <<std::endl;

        if (pages.find(url) != pages.end())
        {
            std::cout << "url already processed " <<std::endl;
            continue;
        }

        if (!url.is_valid())
        {
            std::cout << "invalid url " <<std::endl;
            continue;
        }

        try
        {
            // One client per thread, reused across iterations.
            thread_local http::client client;

            auto request = http::client::request(url);
            http::client::response response = client.get(request);

            // Every 4xx/5xx status is an error, including 400 itself.
            auto st = status(response);
            if (st >= 400)
            {
                std::cout << "error " << st <<std::endl;
                continue;
            }

            const auto page = body(response);
            auto links = search_for_links(page);

            // Erase-remove: std::remove_if only shifts the kept elements to the
            // front, so the tail must be erased explicitly. Links that are
            // already present in `pages` are the ones to drop.
            links.erase(
                std::remove_if(links.begin(), links.end(), [&pages](const uri::uri& link) {
                    return pages.find(link) != pages.end();
                }),
                links.end());

            for (const auto& link : links)
            {
                // A relative link is resolved against the current page's URL
                // (plain concatenation, as before); an absolute link is queued
                // unchanged.
                auto url_to_push = is_relative(link) ? url.string() + link.string() : link.string();
                urls.push(url_to_push);
            }
            pages[url] = page;

            cv.notify_all();
        }
        catch (const std::exception& e)
        {
            std::cout<<"error " << e.what()<<std::endl;
            std::cout <<urls.size() << " " <<pages.size() <<std::endl;
            // Put the URL back for another attempt and back off before continuing.
            urls.push(url);
            std::this_thread::sleep_for(std::chrono::milliseconds(5000));
        }
    }
}
Beispiel #2
0
/**
 * Recursively walks the Gumbo tree rooted at `node` and appends one entry to
 * `tag->tagElement` for every element whose tag equals `tag->htmlTag`.
 *
 * For a matching element:
 *   - if it carries the attribute named `tag->attr`, only that attribute is
 *     recorded;
 *   - otherwise all of the element's attributes are recorded;
 *   - the entry's content is the result of search_text() when
 *     isObtainContentTag() is true, and an empty string otherwise.
 *
 * Non-element nodes are ignored. Children of every element are visited,
 * whether or not the element itself matched.
 *
 * @param node  current node of the parsed document; must not be null.
 * @param tag   search criteria (htmlTag, attr) and result list (tagElement).
 */
void HtmlParser::search_for_links(GumboNode* node, Tags *tag) {
    if (node->type != GUMBO_NODE_ELEMENT) {
        return;
    }

    if (node->v.element.tag == tag->htmlTag) {
        attrValue attrElement;
        const GumboVector& attributes = node->v.element.attributes;

        if (const GumboAttribute* wanted = gumbo_get_attribute(&attributes, tag->attr.c_str())) {
            attrElement.attrList.insert(make_pair(wanted->name, wanted->value));
        } else {
            // Requested attribute is absent: collect every attribute of the
            // element and store them in the entry instead.
            for (unsigned int i = 0; i < attributes.length; ++i) {
                const GumboAttribute* current = static_cast<const GumboAttribute*>(attributes.data[i]);
                attrElement.attrList.insert(make_pair(current->name, current->value));
            }
        }
        attrElement.content = isObtainContentTag() ? search_text(node, tag) : "";
        tag->tagElement.push_back(attrElement);
    }

    // Descend into the children so nested matches are found as well.
    const GumboVector* children = &node->v.element.children;
    for (unsigned int i = 0; i < children->length; ++i) {
        search_for_links(static_cast<GumboNode*>(children->data[i]), tag);
    }
}
Beispiel #3
0
void HtmlParser::buscarElementos(char *html, Tags *tag) {
    GumboOutput* output = gumbo_parse(html);
    search_for_links(output->root, tag);
    gumbo_destroy_output(&kGumboDefaultOptions, output);
}