void TWebPgFetchPersist::OnError(const int& FId, const TStr& MsgStr) { ErrorCount++; ReportError(MsgStr); Report(); // in case of bad request don't make the request again /*if (!HttpResp.Empty() && HttpResp->GetStatusCd() == 400) { TStr Url = ""; if (IsConn(FId)) Url = GetConnUrl(FId)->GetUrlStr(); if (!Notify.Empty()) { TStr Error = "TWebPgFetchPersist.OnError: Received http response 400 (Bad request). Skipping the request. Request: " + Url; Notify->OnStatus(Error.CStr()); } return; } else if (!HttpResp.Empty() && !Notify.Empty()) { Notify->OnStatusFmt("TWebPgFetchPersist.OnError: Received http response %d.", HttpResp->GetStatusCd()); }*/ if (IsConn(FId) && RepeatFailedRequests) { PUrl Url = GetConnUrl(FId); FetchUrl(Url, false); // enqueue request at the beginning of the queue } }
void TCordisEuProjWebFetch::OnError(const int&, const TStr& MsgStr){ printf("*** Error: '%s'\n", MsgStr.CStr()); if (FetchUrlStr.Empty()||(FetchRetries>10)){ TSysMsg::Quit(); // send quit message } else { FetchRetries++; printf("*** Retry (#%d): '%s'\n", FetchRetries, FetchUrlStr.CStr()); FetchUrl(FetchUrlStr); } }
/// Convenience overload: wraps `url` and `timeout` into a TRequest, runs it
/// through the request-based FetchUrl overload and unwraps the result.
///
/// @param url      address to fetch
/// @param timeout  timeout stored in the request
/// @return the response data when the response reports success,
///         otherwise an empty optional
optional<string> FetchUrl(const string& url, chrono::milliseconds timeout)
{
    TRequest request;
    request.Url = url;
    request.Timeout = timeout;

    TResponse response = FetchUrl(request);

    // Failure is reported to the caller as "no value".
    if (!response.Success)
    {
        return optional<string>();
    }
    return response.Data;
}
// Processes the data received for one request.
//
// Text pages: the page is scanned for urls in batches; every extracted url
// is rebuilt against the page url, checked with HaveAcess(), run through
// the configured url filters, optionally modified, and then recorded via
// AddHashMap()/AddTempUrlList(). Afterwards the optional page processor is
// invoked, the temporary url list is sorted when m_UrlCmp is set, and the
// collected urls are handed to AddAllUrlToUrlList().
//
// Non-text content: passed to the optional file processor.
//
// spiderHttp - the finished request (page url, parent url, received data).
void SpiderThread::AnalysisData(SpiderHttp* spiderHttp)
{
    CMyString url;
    CMyString host;
    bool haveUrl=true;

    if(spiderHttp->IsTxtPage())
    {
        if(!InitalFetchEngine(spiderHttp->GetReceiveData(),spiderHttp->GetReceiveDataLen()))
            haveUrl=false;

        while(haveUrl)
        {
            int urlCount=m_InterfaceConfig.m_FetchUrl->FetchUrl(m_CurrentP,m_Regex.GetMatchStrLen(),m_SameRegex);
            for(int i=0;i<urlCount;i++)
            {
                url =*(m_InterfaceConfig.m_FetchUrl->GetUrl(i));
                if(url=="./")continue;
                ReBuildUrlIfNeed(spiderHttp->m_Url,url,host);
                // NOTE(review): presumably true when the url was already seen - confirm.
                if(HaveAcess(host,url))continue;

                // Run the url through every filter; the first failing filter rejects it.
                // A dedicated index and flag are used here: the previous code reused the
                // name 'i' for the inner loop and then tested 'i' after the loop, which
                // under standard for-scope rules refers to the OUTER url index, so the
                // accept/reject decision did not depend on the filter result at all.
                bool rejected=false;
                const int filterCount=static_cast<int>(m_InterfaceConfig.m_UrlFilterList.size());
                for(int f=0;f<filterCount;f++)
                {
                    if(!m_InterfaceConfig.m_UrlFilterList[f]->FilterCheck(spiderHttp->m_Url.GetBuffer(),url.GetBuffer()))
                    {
                        rejected=true;
                        break;
                    }
                }
                if(rejected)continue;

                if(m_InterfaceConfig.m_UrlModify)
                {
                    m_InterfaceConfig.m_UrlModify->ModifyUrl(spiderHttp->m_Url.GetBuffer(),url);
                }
                AddHashMap(host,url);
                AddTempUrlList(url);
            }
            // Advance to the next batch; stop when FetchUrl() reports there is none.
            if(!FetchUrl(url))break;
        }

        if(m_InterfaceConfig.m_PageProcess)
        {
            m_InterfaceConfig.m_PageProcess->PageProcess(spiderHttp->m_ParentUrl.GetBuffer(),spiderHttp->m_Url.GetBuffer(),spiderHttp->GetReceiveData(),spiderHttp->GetReceiveDataLen());
        }
        if(m_UrlCmp)
        {
            SortTempUrlList();
        }
        // Queue the urls for crawling according to depth-first or breadth-first order.
        AddAllUrlToUrlList(spiderHttp->m_Url);
    }
    else if(m_InterfaceConfig.m_FileProcess)
    {
        m_InterfaceConfig.m_FileProcess->FileProcess(spiderHttp->m_ParentUrl.GetBuffer(),spiderHttp->m_Url.GetBuffer(),spiderHttp->GetReceiveData(),spiderHttp->GetReceiveDataLen());
    }
}
///////////////////////////////////////////////// // Web-Net-Proxy void TWebNetProxy::OnHttpRq(const int& SockId, const PHttpRq& HttpRq) { TStr RqFetchIdStr=HttpRq->GetFldVal(THttp::FetchIdFldNm); int RqFetchId=RqFetchIdStr.GetInt(-1); TStr Host;//=HttpRq->GetFldVal(THttp::HostFldNm); if (Host.Empty() || (Host=="Tralala")) Host = DfSrv; if (Host.Empty()) {OnHttpRqError(SockId,"Missing proxy target."); return;} if (RqFetchId==-1) {OnHttpRqError(SockId,"Missing fetch-id."); return;} // build new url TStr RelUrlStr=HttpRq->GetUrl()->GetRelUrlStr(); PUrl RedirUrl=TUrl::New(Host+RelUrlStr); //SaveToErrLog((TStr("ProxyTo: ")+RedirUrl->GetUrlStr()).CStr()); int RespFetchId = FetchUrl(RedirUrl); PProxyConn Conn = new TProxyConn(SockId,RqFetchId,RespFetchId); RqSockIdToProxyH.AddDat(SockId,Conn); RespFetchIdToProxyH.AddDat(RespFetchId,Conn); };
// Restores pending requests from a stream: reads urls from SIn until
// end-of-stream and passes each one to FetchUrl().
// A url that fails to load is reported through Notify (when a notifier is
// set) and skipped; loading continues with the rest of the stream.
//   SIn - input stream containing serialized urls
void TWebPgFetchPersist::Load(TSIn& SIn) {
	while (!SIn.Eof()) {
		try {
			PUrl Url = TUrl::Load(SIn);
			FetchUrl(Url);
		}
		// caught by const reference to avoid copying the exception handle
		catch (const PExcept& Except) {
			// Notify may be unset; dereferencing an empty handle must be avoided
			if (!Notify.Empty()) {
				Notify->OnStatusFmt("TWebPgFetchPersist.Load. Exception while loading url: %s", Except->GetMsgStr().CStr());
			}
		}
		catch (...) {
			if (!Notify.Empty()) {
				Notify->OnStatus("TWebPgFetchPersist.Load. Unrecognized exception while loading a url.");
			}
		}
	}
}
int TWebPgFetch::FetchHttpRq(const PHttpRq& HttpRq){ PUrl Url=HttpRq->GetUrl(); Url->PutHttpRqStr(HttpRq->GetStr()); //** return FetchUrl(Url); }
int TWebPgFetch::FetchUrl(const TStr& RelUrlStr, const TStr& BaseUrlStr){ PUrl Url=TUrl::New(RelUrlStr, BaseUrlStr); return FetchUrl(Url); }
///////////////////////////////////////////////// // EuProjects-Web-Fetch void TCordisEuProjWebFetch::OnFetch(const int&, const PWebPg& WebPg){ // print url of downloaded page printf("%s\n", WebPg->GetUrlStr().CStr()); printf("-----------------------\n"); //printf("%s", WebPg->GetHttpHdStr().CStr()); //printf("%s", WebPg->GetHttpBodyAsStr().CStr()); // get current page url & set of outgoing urls with descriptions TStr WebPgUrlStr=WebPg->GetUrlStr(); TStrKdV DescUrlStrKdV; WebPg->GetOutDescUrlStrKdV(DescUrlStrKdV); // check current page type and react accordingly if (WebPgUrlStr.IsPrefix("http://dbs.cordis.lu/fep-cgi/srchidadb?ACTION=D&")){ // if the page represents project description (ACTION=D) // save the document TStr HtmlStr=WebPg->GetHttpBodyAsStr(); WebPgUrlStr.Save(*EuProjSOut); HtmlStr.Save(*EuProjSOut); EuProjSOut->Flush(); int DescUrlStrKdN=DescUrlStrKdV.SearchForw(TStrKd("NEXT RECORD")); if (DescUrlStrKdN!=-1){ // fetch next document FetchDocN++; FetchUrlStr= TStr("http://dbs.cordis.lu/fep-cgi/srchidadb?ACTION=D&SESSION=")+ FetchSesIdStr+"&DOC="+TInt::GetStr(FetchDocN); FetchRetries=0; FetchUrl(FetchUrlStr); } else { printf("*** No forward pointer.\n"); TSysMsg::Quit(); } // search for forward pointer (to the next project description) /*int DescUrlStrKdN=DescUrlStrKdV.SearchForw(TStrKd("NEXT RECORD")); if (DescUrlStrKdN!=-1){ // fetch next project description (usual, most frequent case) FetchUrlStr=DescUrlStrKdV[DescUrlStrKdN].Dat; FetchRetries=0; FetchUrl(FetchUrlStr); } else { // last project description doesn't include forward pointer printf("*** No forward pointer.\n"); TSysMsg::Quit(); }*/ } else if (WebPgUrlStr.IsPrefix("http://dbs.cordis.lu/fep-cgi/srchidadb?ACTION=R&")){ // if the page represents project record-set (ACTION=R) // take session id FetchSesIdStr=WebPgUrlStr.GetWcMatch("*SESSION=*&*", 1); FetchDocN=1; FetchUrlStr= TStr("http://dbs.cordis.lu/fep-cgi/srchidadb?ACTION=D&SESSION=")+ FetchSesIdStr+"&DOC="+TInt::GetStr(FetchDocN); 
FetchRetries=0; FetchUrl(FetchUrlStr); // move to the first project-description-url (first record-set only) /*int DescUrlStrKdN=0; while (DescUrlStrKdN<DescUrlStrKdV.Len()){ TStr UrlStr=DescUrlStrKdV[DescUrlStrKdN].Dat; DescUrlStrKdN++; if (UrlStr.IsPrefix("http://dbs.cordis.lu/fep-cgi/srchidadb?ACTION=D&")){ // fetch first project-description only FetchUrl(UrlStr); } } if (DescUrlStrKdN>=DescUrlStrKdV.Len()){ // quit downloading if no project descriptions printf("*** No project descriptions.\n"); TSysMsg::Quit(); }*/ // fetch next index page /*int DescUrlStrKdN=DescUrlStrKdV.SearchForw(TStrKd("NEXT 20 RECORDS")); if (DescUrlStrKdN!=-1){ FetchUrlStr=DescUrlStrKdV[DescUrlStrKdN].Dat; FetchRetries=0; FetchUrl(FetchUrlStr); } else { printf("*** No next 20 records.\n"); }*/ } else { // get forward pointer to the first project record-set (start only) int DescUrlStrKdN=DescUrlStrKdV.SearchForw(TStrKd("NEXT 20 RECORDS")); if (DescUrlStrKdN!=-1){ FetchUrl(DescUrlStrKdV[DescUrlStrKdN].Dat);} } }