Esempio n. 1
0
/// Drains the URL queue: fetches each pending URL, stores the response body in
/// `pages`, and enqueues the links discovered on that page.
///
/// @param urls   Queue of URLs still to fetch. Entries are popped from the front;
///               newly discovered links, and URLs whose download threw, are pushed back.
/// @param pages  Map from URL to downloaded body. Also serves as the
///               "already processed" set.
///
/// URLs that are invalid, already present in `pages`, or answered with an HTTP
/// status of 400 or above are skipped. A download that throws is logged, the URL
/// is re-queued and the loop sleeps for five seconds before carrying on.
///
/// NOTE(review): no lock is taken around `urls` / `pages` in this function even
/// though `cv.notify_all()` hints at other threads — confirm the caller
/// serialises access.
/// NOTE(review): a URL that always throws is re-queued indefinitely; there is no
/// retry limit.
void web_crawler::job_downloader(web_crawler::urls_t& urls, web_crawler::pages_t& pages)
{
    while (!urls.empty())
    {
        const uri::uri url = urls.front();
        urls.pop();
        std::cout << url.string() <<std::endl;

        if (pages.find(url) != pages.end())
        {
            std::cout << "url already processed " <<std::endl;
            continue;
        }

        if (!url.is_valid())
        {
            std::cout << "invalid url " <<std::endl;
            continue;
        }

        try
        {
            // One client per thread, constructed on first use and reused afterwards.
            thread_local http::client client;

            auto request = http::client::request(url);
            http::client::response response = client.get(request);

            // Every 4xx/5xx status is an error, including 400 itself.
            const auto st = status(response);
            if (st >= 400)
            {
                std::cout << "error " << st <<std::endl;
                continue;
            }

            const auto page = body(response);
            auto links = search_for_links(page);

            // Drop links that were already downloaded. std::remove_if only shifts the
            // kept elements forward; the erase() call is what actually shrinks the
            // container (assumes `links` is a sequence container — TODO confirm).
            links.erase(
                std::remove_if(links.begin(), links.end(),
                               [&pages](const uri::uri& link) {
                                   return pages.find(link) != pages.end();
                               }),
                links.end());

            for (const auto& link : links)
            {
                // Relative links are resolved against the current page's URL by plain
                // concatenation; absolute links are queued unchanged.
                auto url_to_push = is_relative(link) ? url.string() + link.string() : link.string();
                urls.push(url_to_push);
            }
            pages[url] = page;

            cv.notify_all();
        }
        catch (const std::exception& e)
        {
            std::cout<<"error " << e.what()<<std::endl;
            std::cout <<urls.size() << " " <<pages.size() <<std::endl;
            // Put the URL back so it is retried after a short back-off.
            urls.push(url);
            std::this_thread::sleep_for(std::chrono::milliseconds(5000));
        }

    }
}
        /// Builds the IClient implementation on top of a boost::network HTTP client.
        ///
        /// Initialises `client_` from `c`, then assigns the Get, Post, Head, Put and
        /// Delete slots of the IClient implementation. Each slot is a lambda that:
        ///   1. queries the incoming request for IGetNative and reads the `request_`
        ///      member of the underlying ImplementClientRequest,
        ///   2. calls the matching verb on `client_`,
        ///   3. wraps the result with ImplementClientResponse::create and returns it
        ///      as an IClientResponse.
        ///
        /// The lambdas capture `this` in order to reach `client_`.
        ImplementClient(const boost::network::http::client& c)
            : client_(c)
        {
            auto& iface = *get_implementation<IClient>();

            // HTTP GET
            iface.Get = [this](use_unknown<IClientRequest> request) -> use_unknown<IClientResponse>
            {
                auto native = request.QueryInterface<IGetNative>();
                auto reply = client_.get(native.GetAs<ImplementClientRequest>().request_);
                return ImplementClientResponse::create(reply).QueryInterface<IClientResponse>();
            };

            // HTTP POST
            iface.Post = [this](use_unknown<IClientRequest> request) -> use_unknown<IClientResponse>
            {
                auto native = request.QueryInterface<IGetNative>();
                auto reply = client_.post(native.GetAs<ImplementClientRequest>().request_);
                return ImplementClientResponse::create(reply).QueryInterface<IClientResponse>();
            };

            // HTTP HEAD
            iface.Head = [this](use_unknown<IClientRequest> request) -> use_unknown<IClientResponse>
            {
                auto native = request.QueryInterface<IGetNative>();
                auto reply = client_.head(native.GetAs<ImplementClientRequest>().request_);
                return ImplementClientResponse::create(reply).QueryInterface<IClientResponse>();
            };

            // HTTP PUT
            iface.Put = [this](use_unknown<IClientRequest> request) -> use_unknown<IClientResponse>
            {
                auto native = request.QueryInterface<IGetNative>();
                auto reply = client_.put(native.GetAs<ImplementClientRequest>().request_);
                return ImplementClientResponse::create(reply).QueryInterface<IClientResponse>();
            };

            // HTTP DELETE (the client method is spelled delete_ because `delete` is a keyword)
            iface.Delete = [this](use_unknown<IClientRequest> request) -> use_unknown<IClientResponse>
            {
                auto native = request.QueryInterface<IGetNative>();
                auto reply = client_.delete_(native.GetAs<ImplementClientRequest>().request_);
                return ImplementClientResponse::create(reply).QueryInterface<IClientResponse>();
            };
        }