void TWebTxtBs::OnFetch(const int& /*FId*/, const PWebPg& WebPg){ if (!IsUrlStr(WebPg->GetUrlStr())){ TMem HttpRespMem; WebPg->GetHttpResp()->GetAsMem(HttpRespMem); TxtBs->AddDocMem(WebPg->GetUrlStr(), HttpRespMem, WebBsFetch->IsIndexTxtBs()); if (WebPg->IsTxt()){ PWebFilter WebFilter=WebBsFetch->GetWebFilter(); PSIn HtmlDocSIn=TStrIn::New(WebPg->GetHttpBodyAsStr()); PHtmlDoc HtmlDoc=THtmlDoc::New(HtmlDocSIn, hdtHRef); TStr BaseUrlStr=WebPg->GetUrlStr(); TStr RelUrlStr; for (int TokN=0; TokN<HtmlDoc->GetToks(); TokN++){ if (WebFilter->IsUrlTok(HtmlDoc->GetTok(TokN), RelUrlStr)){ PUrl Url=TUrl::New(RelUrlStr, BaseUrlStr); if (Url->IsOk(usHttp)){ WebBsFetch->SetUrlAsFinal(Url); if (WebFilter->IsUrlOk(Url)&& (!WebBsFetch->IsUrlEnqueued(Url))&& (!IsUrlStr(Url->GetAsFinalUrlStr()))){ WebBsFetch->AddFetchUrl(Url); } } } } WebBsFetch->GoFetch(); } } }
///////////////////////////////////////////////// // Web-Page-To-File-Fetch void TWebPgToFileFetch::OnFetch(const int&, const PWebPg& WebPg){ // save web-page as http if (!OutHttpFNm.Empty()){ WebPg->SaveAsHttp(OutHttpFNm);} // save http-body if (!OutHttpBodyFNm.Empty()){ WebPg->SaveAsHttpBody(OutHttpBodyFNm);} // save web-page as xml if (!OutXmlFNm.Empty()){ TStr HtmlStr=WebPg->GetHttpBodyAsStr(); THtmlDoc::SaveHtmlToXml(HtmlStr, OutXmlFNm, WebPg->GetUrlStr(), OutXmlTextP, OutXmlUrlP, OutXmlToksP, OutXmlTagsP, OutXmlArgsP); } // save web-page as text if (!OutTxtFNm.Empty()){ TStr HtmlStr=WebPg->GetHttpBodyAsStr(); THtmlDoc::SaveHtmlToTxt(HtmlStr, OutTxtFNm, WebPg->GetUrlStr(), OutTxtUrlP, OutXmlTagsP); } // output to screen if (OutScrP){ printf("%s\n", WebPg->GetUrlStr().CStr()); printf("-----------------------\n"); printf("%s", WebPg->GetHttpHdStr().CStr()); printf("%s", WebPg->GetHttpBodyAsStr().CStr()); } }
///////////////////////////////////////////////// // Google-Web-Fetch-Saver void TGgWebFetchSaver::OnFetch(const int&, const PWebPg& WebPg){ printf("Fetched [Wait:%d Conn.:%d]: %s\n", GetWaitUrls(), GetConnUrls(), WebPg->GetUrlStr().CStr()); WebPgV.Add(WebPg); if (Empty()){ TSysMsg::Quit();} }
///////////////////////////////////////////////// // EuProjects-Web-Fetch void TCordisEuProjWebFetch::OnFetch(const int&, const PWebPg& WebPg){ // print url of downloaded page printf("%s\n", WebPg->GetUrlStr().CStr()); printf("-----------------------\n"); //printf("%s", WebPg->GetHttpHdStr().CStr()); //printf("%s", WebPg->GetHttpBodyAsStr().CStr()); // get current page url & set of outgoing urls with descriptions TStr WebPgUrlStr=WebPg->GetUrlStr(); TStrKdV DescUrlStrKdV; WebPg->GetOutDescUrlStrKdV(DescUrlStrKdV); // check current page type and react accordingly if (WebPgUrlStr.IsPrefix("http://dbs.cordis.lu/fep-cgi/srchidadb?ACTION=D&")){ // if the page represents project description (ACTION=D) // save the document TStr HtmlStr=WebPg->GetHttpBodyAsStr(); WebPgUrlStr.Save(*EuProjSOut); HtmlStr.Save(*EuProjSOut); EuProjSOut->Flush(); int DescUrlStrKdN=DescUrlStrKdV.SearchForw(TStrKd("NEXT RECORD")); if (DescUrlStrKdN!=-1){ // fetch next document FetchDocN++; FetchUrlStr= TStr("http://dbs.cordis.lu/fep-cgi/srchidadb?ACTION=D&SESSION=")+ FetchSesIdStr+"&DOC="+TInt::GetStr(FetchDocN); FetchRetries=0; FetchUrl(FetchUrlStr); } else { printf("*** No forward pointer.\n"); TSysMsg::Quit(); } // search for forward pointer (to the next project description) /*int DescUrlStrKdN=DescUrlStrKdV.SearchForw(TStrKd("NEXT RECORD")); if (DescUrlStrKdN!=-1){ // fetch next project description (usual, most frequent case) FetchUrlStr=DescUrlStrKdV[DescUrlStrKdN].Dat; FetchRetries=0; FetchUrl(FetchUrlStr); } else { // last project description doesn't include forward pointer printf("*** No forward pointer.\n"); TSysMsg::Quit(); }*/ } else if (WebPgUrlStr.IsPrefix("http://dbs.cordis.lu/fep-cgi/srchidadb?ACTION=R&")){ // if the page represents project record-set (ACTION=R) // take session id FetchSesIdStr=WebPgUrlStr.GetWcMatch("*SESSION=*&*", 1); FetchDocN=1; FetchUrlStr= TStr("http://dbs.cordis.lu/fep-cgi/srchidadb?ACTION=D&SESSION=")+ FetchSesIdStr+"&DOC="+TInt::GetStr(FetchDocN); 
FetchRetries=0; FetchUrl(FetchUrlStr); // move to the first project-description-url (first record-set only) /*int DescUrlStrKdN=0; while (DescUrlStrKdN<DescUrlStrKdV.Len()){ TStr UrlStr=DescUrlStrKdV[DescUrlStrKdN].Dat; DescUrlStrKdN++; if (UrlStr.IsPrefix("http://dbs.cordis.lu/fep-cgi/srchidadb?ACTION=D&")){ // fetch first project-description only FetchUrl(UrlStr); } } if (DescUrlStrKdN>=DescUrlStrKdV.Len()){ // quit downloading if no project descriptions printf("*** No project descriptions.\n"); TSysMsg::Quit(); }*/ // fetch next index page /*int DescUrlStrKdN=DescUrlStrKdV.SearchForw(TStrKd("NEXT 20 RECORDS")); if (DescUrlStrKdN!=-1){ FetchUrlStr=DescUrlStrKdV[DescUrlStrKdN].Dat; FetchRetries=0; FetchUrl(FetchUrlStr); } else { printf("*** No next 20 records.\n"); }*/ } else { // get forward pointer to the first project record-set (start only) int DescUrlStrKdN=DescUrlStrKdV.SearchForw(TStrKd("NEXT 20 RECORDS")); if (DescUrlStrKdN!=-1){ FetchUrl(DescUrlStrKdV[DescUrlStrKdN].Dat);} } }
// Builds a TAmazonItem from a fetched Amazon item page.
// The html body is scanned once, front to back, with THtmlLx:
//  1. skip to the first <FONT> begin tag with FACE="verdana,arial,helvetica"
//     and no SIZE argument;
//  2. collect everything up to the closing FONT tag as the title;
//  3. collect the text of each following anchor as an author name, up to the
//     next closing FONT tag;
//  4. skip to the three consecutive strings "Customers", "who", "bought";
//  5. collect the item id of every following anchor, up to the closing UL tag.
// Every stage also stops at end of input, so a page lacking a section simply
// yields empty values for it and for all later sections.
// Returns a new item holding the page's item id, title, author names and
// cross-sell item ids.
PAmazonItem TAmazonItem::GetFromWebPg(const PWebPg& WebPg){
  TStr UrlStr=WebPg->GetUrlStr();
  TStr ItemId=TAmazonItem::GetItemId(WebPg->GetUrl());
  TStr HtmlStr=WebPg->GetHttpBodyAsStr();
  PSIn HtmlSIn=TStrIn::New(HtmlStr);
  THtmlLx HtmlLx(HtmlSIn);
  THtmlLxSym Sym; TChA ChA;

  // move to title
  while (HtmlLx.GetSym()!=hsyEof){
    Sym=HtmlLx.Sym; ChA=HtmlLx.ChA;
    if ((Sym==hsyBTag)&&(ChA=="<FONT>")){
      TStr FaceArg=HtmlLx.GetArg("FACE", "");
      TStr SizeArg=HtmlLx.GetArg("SIZE", "");
      if ((FaceArg=="verdana,arial,helvetica")&&(SizeArg.Empty())){break;}
    }
  }

  // extract title (symbols are joined with their original preceding space)
  TChA TitleChA;
  while (HtmlLx.GetSym()!=hsyEof){
    Sym=HtmlLx.Sym; ChA=HtmlLx.ChA;
    if ((Sym==hsyETag)&&(ChA=="<FONT>")){break;}
    if (!TitleChA.Empty()){TitleChA+=HtmlLx.GetPreSpaceStr();}
    TitleChA+=ChA;
  }
  TStr TitleStr=TitleChA;

  // extract authors (one per anchor)
  TStrV AuthorNmV; TChA AuthorNmChA;
  while (HtmlLx.GetSym()!=hsyEof){
    Sym=HtmlLx.Sym; ChA=HtmlLx.ChA;
    if ((Sym==hsyBTag)&&(ChA=="<A>")){
      // Gather the anchor text. The loop must also stop at end of input:
      // without the hsyEof test a page ending inside an unterminated anchor
      // would never satisfy the end-tag condition and the loop would not
      // terminate.
      do {
        HtmlLx.GetSym(); Sym=HtmlLx.Sym; ChA=HtmlLx.ChA;
        if (Sym==hsyStr){
          if (!AuthorNmChA.Empty()){AuthorNmChA+=HtmlLx.GetPreSpaceStr();}
          AuthorNmChA+=ChA;
        }
      } while ((Sym!=hsyEof)&&(!((Sym==hsyETag)&&(ChA=="<A>"))));
      AuthorNmV.Add(AuthorNmChA);
      AuthorNmChA.Clr();
    }
    if ((Sym==hsyETag)&&(ChA=="<FONT>")){break;}
  }

  // move to x-sell: wait for the strings "Customers who bought" in sequence;
  // any non-string symbol in between restarts the match
  TStrQ PrevStrQ(3);
  while (HtmlLx.GetSym()!=hsyEof){
    Sym=HtmlLx.Sym; ChA=HtmlLx.ChA;
    if (Sym==hsyStr){
      PrevStrQ.Push(ChA);
      if ((PrevStrQ.Len()==3)&&(PrevStrQ[0]=="Customers")
       &&(PrevStrQ[1]=="who")&&(PrevStrQ[2]=="bought")){break;}
    } else {
      PrevStrQ.Clr();
    }
  }

  // extract x-sell pointers (item ids of the anchors in the list)
  TStrV NextItemIdV;
  while (HtmlLx.GetSym()!=hsyEof){
    Sym=HtmlLx.Sym; ChA=HtmlLx.ChA;
    if ((Sym==hsyBTag)&&(ChA=="<A>")){
      TStr RelUrlStr=HtmlLx.GetArg("HREF");
      // resolve the link against the url of this page
      PUrl Url=TUrl::New(RelUrlStr, UrlStr);
      TStr NextItemId=TAmazonItem::GetItemId(Url);
      NextItemIdV.Add(NextItemId);
    }
    if ((Sym==hsyETag)&&(ChA=="<UL>")){break;}
  }

  // construct item object
  PAmazonItem AmazonItem=PAmazonItem(new
   TAmazonItem(ItemId, TitleStr, AuthorNmV, NextItemIdV));
  return AmazonItem;
}
// Convenience overload: creates the news result-set from a fetched page by
// forwarding the page url and its http body to NewNews(url, html).
PRSet TRSet::NewNews(const PWebPg& WebPg){
  // source url of the page
  TStr PgUrlStr=WebPg->GetUrlStr();
  // html content of the page
  TStr PgHtmlStr=WebPg->GetHttpBodyAsStr();
  return TRSet::NewNews(PgUrlStr, PgHtmlStr);
}
// Convenience overload: creates the scholar result-set from a fetched page by
// forwarding the page url and its http body to NewScholar(url, html).
PGgSchRSet TGgSchRSet::NewScholar(const PWebPg& WebPg){
  // source url of the page
  TStr PgUrlStr=WebPg->GetUrlStr();
  // html content of the page
  TStr PgHtmlStr=WebPg->GetHttpBodyAsStr();
  return TGgSchRSet::NewScholar(PgUrlStr, PgHtmlStr);
}