Code Example #1
File: webpgfetch.cpp Project: Zala/qminer
void TWebPgFetchPersist::OnError(const int& FId, const TStr& MsgStr) 
{
	ErrorCount++;
	ReportError(MsgStr);
	Report();
	// in case of bad request don't make the request again
	/*if (!HttpResp.Empty() && HttpResp->GetStatusCd() == 400) {
		TStr Url = "";
		if (IsConn(FId))
			Url = GetConnUrl(FId)->GetUrlStr();
		if (!Notify.Empty()) {
			TStr Error = "TWebPgFetchPersist.OnError: Received http response 400 (Bad request). Skipping the request. Request: " + Url;
			Notify->OnStatus(Error.CStr());
		}
		return;
	}
	else if (!HttpResp.Empty() && !Notify.Empty()) {
		Notify->OnStatusFmt("TWebPgFetchPersist.OnError: Received http response %d.", HttpResp->GetStatusCd());
	}*/

	if (IsConn(FId) && RepeatFailedRequests) {
		PUrl Url = GetConnUrl(FId);
		FetchUrl(Url, false);	// enqueue request at the beginning of the queue		
	}
}
Code Example #2
void TCordisEuProjWebFetch::OnError(const int&, const TStr& MsgStr){
  printf("*** Error: '%s'\n", MsgStr.CStr());
  if (FetchUrlStr.Empty()||(FetchRetries>10)){
    TSysMsg::Quit(); // send quit message
  } else {
    FetchRetries++;
    printf("*** Retry (#%d): '%s'\n", FetchRetries, FetchUrlStr.CStr());
    FetchUrl(FetchUrlStr);
  }
}
Code Example #3
File: fetcher.cpp Project: bakwc/Exhauster
optional<string> FetchUrl(const string& url, chrono::milliseconds timeout) {
    TRequest request;
    request.Url = url;
    request.Timeout = timeout;
    TResponse response = FetchUrl(request);
    if (response.Success) {
        return response.Data;
    }
    return optional<string>();
}
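A minimal call-site sketch for the timeout wrapper above; the URL, timeout, and main() scaffolding are illustrative and assume the TRequest/TResponse-based FetchUrl overload from the same project is linked in.

#include <chrono>
#include <cstdio>
#include <optional>
#include <string>
using namespace std;

// declaration of the wrapper shown above (defined in fetcher.cpp)
optional<string> FetchUrl(const string& url, chrono::milliseconds timeout);

int main() {
    // url and timeout values are illustrative
    optional<string> body = FetchUrl("http://example.com/", chrono::milliseconds(5000));
    if (body) {
        printf("fetched %zu bytes\n", body->size());
    } else {
        printf("fetch failed or timed out\n");
    }
    return 0;
}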
Code Example #4
void SpiderThread::AnalysisData(SpiderHttp* spiderHttp)
{
	CMyString url;
	CMyString host;

	bool	  haveUrl=true;
	if(spiderHttp->IsTxtPage())
	{
		if(!InitalFetchEngine(spiderHttp->GetReceiveData(),spiderHttp->GetReceiveDataLen()))
			haveUrl=false;
		while(haveUrl)
		{
			int urlCount=m_InterfaceConfig.m_FetchUrl->FetchUrl(m_CurrentP,m_Regex.GetMatchStrLen(),m_SameRegex);
			for(int i=0;i<urlCount;i++)
			{
				url	=*(m_InterfaceConfig.m_FetchUrl->GetUrl(i));
				if(url=="./")continue;
				ReBuildUrlIfNeed(spiderHttp->m_Url,url,host);
				if(HaveAcess(host,url))continue;
				// run the url through every configured filter; stop at the first rejection
				int j=0;
				for(;j<m_InterfaceConfig.m_UrlFilterList.size();j++)
				{
					if(!m_InterfaceConfig.m_UrlFilterList[j]->FilterCheck(spiderHttp->m_Url.GetBuffer(),url.GetBuffer()))break;
				}
				if(j<m_InterfaceConfig.m_UrlFilterList.size())continue;	// some filter rejected this url
				if(m_InterfaceConfig.m_UrlModify)
				{
					m_InterfaceConfig.m_UrlModify->ModifyUrl(spiderHttp->m_Url.GetBuffer(),url);
				}
				AddHashMap(host,url);
				AddTempUrlList(url);
			}
			if(!FetchUrl(url))break;
		}
		
		if(m_InterfaceConfig.m_PageProcess)
		{
			m_InterfaceConfig.m_PageProcess->PageProcess(spiderHttp->m_ParentUrl.GetBuffer(),spiderHttp->m_Url.GetBuffer(),spiderHttp->GetReceiveData(),spiderHttp->GetReceiveDataLen());
		}
		if(m_UrlCmp)
		{
			SortTempUrlList();
		}
		
		// crawl either depth-first or breadth-first, depending on the configured strategy
		AddAllUrlToUrlList(spiderHttp->m_Url);
	}
	else if(m_InterfaceConfig.m_FileProcess)
	{
		m_InterfaceConfig.m_FileProcess->FileProcess(spiderHttp->m_ParentUrl.GetBuffer(),spiderHttp->m_Url.GetBuffer(),spiderHttp->GetReceiveData(),spiderHttp->GetReceiveDataLen());
	}	
}
Code Example #5
/////////////////////////////////////////////////
// Web-Net-Proxy
void TWebNetProxy::OnHttpRq(const int& SockId, const PHttpRq& HttpRq) {
  TStr RqFetchIdStr=HttpRq->GetFldVal(THttp::FetchIdFldNm);
  int RqFetchId=RqFetchIdStr.GetInt(-1);
  TStr Host;//=HttpRq->GetFldVal(THttp::HostFldNm);
  if (Host.Empty() || (Host=="Tralala")) Host = DfSrv;
  if (Host.Empty()) {OnHttpRqError(SockId,"Missing proxy target."); return;}
  if (RqFetchId==-1) {OnHttpRqError(SockId,"Missing fetch-id."); return;}
  // build new url
  TStr RelUrlStr=HttpRq->GetUrl()->GetRelUrlStr();
  PUrl RedirUrl=TUrl::New(Host+RelUrlStr);
  //SaveToErrLog((TStr("ProxyTo: ")+RedirUrl->GetUrlStr()).CStr());
  int RespFetchId = FetchUrl(RedirUrl);

  PProxyConn Conn = new TProxyConn(SockId,RqFetchId,RespFetchId);
  RqSockIdToProxyH.AddDat(SockId,Conn);
  RespFetchIdToProxyH.AddDat(RespFetchId,Conn);
};
Code Example #6
File: webpgfetch.cpp Project: Zala/qminer
void TWebPgFetchPersist::Load(TSIn& SIn)
{
	// load PUrls and call FetchUrl on each of them
	int Count = 0;
	while (!SIn.Eof()) {
		try {
			PUrl Url = TUrl::Load(SIn);
			FetchUrl(Url);
			Count++;
		}
		catch (PExcept ex) {
			Notify->OnStatusFmt("TWebPgFetchPersist.Load. Exception while loading url: %s", ex->GetMsgStr().CStr());
		}
		catch (...) {
			Notify->OnStatus("TWebPgFetchPersist.Load. Unrecognized exception while loading a url.");
		}
	}
}
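A sketch of how the Load routine above might be driven from disk; it assumes GLib's TFIn file stream (a TSIn subclass) and an already constructed TWebPgFetchPersist instance named Fetch, and the file name is illustrative.

// Hypothetical call site: re-submit the URLs persisted in an earlier run.
{
  TFIn SIn("fetch-queue.bin");  // illustrative file name
  Fetch.Load(SIn);              // each stored PUrl is re-enqueued via FetchUrl
}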
Code Example #7
File: webpgfetch.cpp Project: Zala/qminer
int TWebPgFetch::FetchHttpRq(const PHttpRq& HttpRq){
  PUrl Url=HttpRq->GetUrl();
  Url->PutHttpRqStr(HttpRq->GetStr()); //**
  return FetchUrl(Url);
}
Code Example #8
File: webpgfetch.cpp Project: Zala/qminer
int TWebPgFetch::FetchUrl(const TStr& RelUrlStr, const TStr& BaseUrlStr){
  PUrl Url=TUrl::New(RelUrlStr, BaseUrlStr);
  return FetchUrl(Url);
}
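A sketch of how the relative/base overload above might be driven from client code; the subclass, its instance, and the URLs are hypothetical, while the OnFetch/OnError callback signatures are the ones seen in the other examples.

// Hypothetical TWebPgFetch subclass: results arrive through OnFetch/OnError.
class TMyFetch : public TWebPgFetch {
public:
  void OnFetch(const int& FId, const PWebPg& WebPg) {
    printf("fetched: %s\n", WebPg->GetUrlStr().CStr());
  }
  void OnError(const int& FId, const TStr& MsgStr) {
    printf("error: %s\n", MsgStr.CStr());
  }
};

// Illustrative call site: resolve a relative link against the page it was found on.
TMyFetch Fetch;
int FId = Fetch.FetchUrl("/projects.html", "http://example.com/index.html");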
Code Example #9
/////////////////////////////////////////////////
// EuProjects-Web-Fetch
void TCordisEuProjWebFetch::OnFetch(const int&, const PWebPg& WebPg){
  // print url of downloaded page
  printf("%s\n", WebPg->GetUrlStr().CStr());
  printf("-----------------------\n");
  //printf("%s", WebPg->GetHttpHdStr().CStr());
  //printf("%s", WebPg->GetHttpBodyAsStr().CStr());
  // get current page url & set of outgoing urls with descriptions
  TStr WebPgUrlStr=WebPg->GetUrlStr();
  TStrKdV DescUrlStrKdV; WebPg->GetOutDescUrlStrKdV(DescUrlStrKdV);
  // check current page type and react accordingly
  if (WebPgUrlStr.IsPrefix("http://dbs.cordis.lu/fep-cgi/srchidadb?ACTION=D&")){
    // if the page represents project description (ACTION=D)
    // save the document
    TStr HtmlStr=WebPg->GetHttpBodyAsStr();
    WebPgUrlStr.Save(*EuProjSOut);
    HtmlStr.Save(*EuProjSOut);
    EuProjSOut->Flush();
    int DescUrlStrKdN=DescUrlStrKdV.SearchForw(TStrKd("NEXT RECORD"));
    if (DescUrlStrKdN!=-1){
      // fetch next document
      FetchDocN++;
      FetchUrlStr=
       TStr("http://dbs.cordis.lu/fep-cgi/srchidadb?ACTION=D&SESSION=")+
       FetchSesIdStr+"&DOC="+TInt::GetStr(FetchDocN);
      FetchRetries=0;
      FetchUrl(FetchUrlStr);
    } else {
      printf("*** No forward pointer.\n");
      TSysMsg::Quit();
    }
    // search for forward pointer (to the next project description)
    /*int DescUrlStrKdN=DescUrlStrKdV.SearchForw(TStrKd("NEXT RECORD"));
    if (DescUrlStrKdN!=-1){
      // fetch next project description (usual, most frequent case)
      FetchUrlStr=DescUrlStrKdV[DescUrlStrKdN].Dat; FetchRetries=0;
      FetchUrl(FetchUrlStr);
    } else {
      // last project description doesn't include forward pointer
      printf("*** No forward pointer.\n");
      TSysMsg::Quit();
    }*/
  } else
  if (WebPgUrlStr.IsPrefix("http://dbs.cordis.lu/fep-cgi/srchidadb?ACTION=R&")){
    // if the page represents project record-set (ACTION=R)
    // take session id
    FetchSesIdStr=WebPgUrlStr.GetWcMatch("*SESSION=*&*", 1);
    FetchDocN=1;
    FetchUrlStr=
     TStr("http://dbs.cordis.lu/fep-cgi/srchidadb?ACTION=D&SESSION=")+
     FetchSesIdStr+"&DOC="+TInt::GetStr(FetchDocN);
    FetchRetries=0;
    FetchUrl(FetchUrlStr);
    // move to the first project-description-url (first record-set only)
    /*int DescUrlStrKdN=0;
    while (DescUrlStrKdN<DescUrlStrKdV.Len()){
      TStr UrlStr=DescUrlStrKdV[DescUrlStrKdN].Dat; DescUrlStrKdN++;
      if (UrlStr.IsPrefix("http://dbs.cordis.lu/fep-cgi/srchidadb?ACTION=D&")){
        // fetch first project-description only
        FetchUrl(UrlStr);
      }
    }
    if (DescUrlStrKdN>=DescUrlStrKdV.Len()){
      // quit downloading if no project descriptions
      printf("*** No project descriptions.\n");
      TSysMsg::Quit();
    }*/
    // fetch next index page
    /*int DescUrlStrKdN=DescUrlStrKdV.SearchForw(TStrKd("NEXT 20 RECORDS"));
    if (DescUrlStrKdN!=-1){
      FetchUrlStr=DescUrlStrKdV[DescUrlStrKdN].Dat;
      FetchRetries=0;
      FetchUrl(FetchUrlStr);
    } else {
      printf("*** No next 20 records.\n");
    }*/
  } else {
    // get forward pointer to the first project record-set (start only)
    int DescUrlStrKdN=DescUrlStrKdV.SearchForw(TStrKd("NEXT 20 RECORDS"));
    if (DescUrlStrKdN!=-1){
      FetchUrl(DescUrlStrKdV[DescUrlStrKdN].Dat);}
  }
}