/**
 * Download loop: drains `urls`, fetches each page and records its body in `pages`.
 *
 * For every URL popped from the queue the function
 *   1. skips it when it is already a key of `pages` or when it is not valid,
 *   2. performs an HTTP GET and skips the URL on any 4xx/5xx status,
 *   3. extracts the links from the body, drops those already present in
 *      `pages`, and enqueues the remainder (relative links are prefixed
 *      with the URL of the page they were found on),
 *   4. stores the body under `pages[url]` and signals `cv`.
 *
 * When the download throws a std::exception the URL is re-queued and the
 * loop sleeps for five seconds before continuing.
 *
 * NOTE(review): `urls` and `pages` are accessed without any locking in this
 * function, while `thread_local` and `cv.notify_all()` suggest it runs on
 * several threads -- confirm that callers synchronise access.
 * NOTE(review): a URL that always throws is retried forever.
 *
 * @param urls  queue of URLs still to be downloaded; consumed and extended here.
 * @param pages map from URL to downloaded page body; extended here.
 */
void web_crawler::job_downloader(web_crawler::urls_t& urls, web_crawler::pages_t& pages) {
    while (!urls.empty()) {
        uri::uri url = urls.front();
        urls.pop();
        std::cout << url.string() << std::endl;

        if (pages.find(url) != pages.end()) {
            std::cout << "url already processed " << std::endl;
            continue;
        }
        if (!url.is_valid()) {
            std::cout << "invalid url " << std::endl;
            continue;
        }

        try {
            // One client per thread, constructed on first use.
            thread_local http::client client;
            auto request = http::client::request(url);
            http::client::response response = client.get(request);

            // Every status from 400 upwards is an error (400 itself included).
            auto st = status(response);
            if (st >= 400) {
                std::cout << "error " << st << std::endl;
                continue;
            }

            const auto page = body(response);
            auto links = search_for_links(page);

            // Erase-remove: actually drop the links that were already downloaded.
            // std::remove_if alone only reorders the range and leaves the tail
            // in an unspecified state.
            links.erase(std::remove_if(links.begin(), links.end(),
                                       [&pages](const uri::uri& link) {
                                           return pages.find(link) != pages.end();
                                       }),
                        links.end());

            for (const auto& link : links) {
                // Relative links are resolved against the current page URL by
                // plain concatenation; absolute links are queued unchanged.
                auto url_to_push = is_relative(link) ? url.string() + link.string()
                                                     : link.string();
                urls.push(url_to_push);
            }

            pages[url] = page;
            cv.notify_all();
        } catch (const std::exception& e) {
            std::cout << "error " << e.what() << std::endl;
            std::cout << urls.size() << " " << pages.size() << std::endl;
            // Put the URL back so it is retried after a pause.
            urls.push(url);
            std::this_thread::sleep_for(std::chrono::milliseconds(5000));
            continue;
        }
    }
}
/**
 * Builds the component around a copy of the given cpp-netlib HTTP client and
 * installs the handlers for the five verbs exposed through IClient.
 *
 * Every handler follows the same steps: obtain the ImplementClientRequest
 * behind the incoming IClientRequest (via IGetNative), pass its `request_`
 * to the matching method of `client_`, and return the result wrapped in an
 * ImplementClientResponse queried for IClientResponse.
 *
 * The handlers capture `this`, so they are only usable while this object
 * is alive.
 */
ImplementClient(const boost::network::http::client& c) : client_(c) {
    typedef use_unknown<IClientRequest> request_handle;
    typedef use_unknown<IClientResponse> response_handle;

    auto& iface = *get_implementation<IClient>();

    // GET
    iface.Get = [this](request_handle incoming) -> response_handle {
        auto reply = client_.get(
            incoming.QueryInterface<IGetNative>().GetAs<ImplementClientRequest>().request_);
        auto wrapped = ImplementClientResponse::create(reply);
        return wrapped.QueryInterface<IClientResponse>();
    };

    // POST
    iface.Post = [this](request_handle incoming) -> response_handle {
        auto reply = client_.post(
            incoming.QueryInterface<IGetNative>().GetAs<ImplementClientRequest>().request_);
        auto wrapped = ImplementClientResponse::create(reply);
        return wrapped.QueryInterface<IClientResponse>();
    };

    // HEAD
    iface.Head = [this](request_handle incoming) -> response_handle {
        auto reply = client_.head(
            incoming.QueryInterface<IGetNative>().GetAs<ImplementClientRequest>().request_);
        auto wrapped = ImplementClientResponse::create(reply);
        return wrapped.QueryInterface<IClientResponse>();
    };

    // PUT
    iface.Put = [this](request_handle incoming) -> response_handle {
        auto reply = client_.put(
            incoming.QueryInterface<IGetNative>().GetAs<ImplementClientRequest>().request_);
        auto wrapped = ImplementClientResponse::create(reply);
        return wrapped.QueryInterface<IClientResponse>();
    };

    // DELETE
    iface.Delete = [this](request_handle incoming) -> response_handle {
        auto reply = client_.delete_(
            incoming.QueryInterface<IGetNative>().GetAs<ImplementClientRequest>().request_);
        auto wrapped = ImplementClientResponse::create(reply);
        return wrapped.QueryInterface<IClientResponse>();
    };
}