示例#1
0
/////////////////////////////////////////////////
// Web-Page-To-File-Fetch
void TWebPgToFileFetch::OnFetch(const int&, const PWebPg& WebPg){
  // save web-page as http
  if (!OutHttpFNm.Empty()){
    WebPg->SaveAsHttp(OutHttpFNm);}
  // save http-body
  if (!OutHttpBodyFNm.Empty()){
    WebPg->SaveAsHttpBody(OutHttpBodyFNm);}
  // save web-page as xml
  if (!OutXmlFNm.Empty()){
    TStr HtmlStr=WebPg->GetHttpBodyAsStr();
    THtmlDoc::SaveHtmlToXml(HtmlStr, OutXmlFNm, WebPg->GetUrlStr(),
     OutXmlTextP, OutXmlUrlP, OutXmlToksP, OutXmlTagsP, OutXmlArgsP);
  }
  // save web-page as text
  if (!OutTxtFNm.Empty()){
    TStr HtmlStr=WebPg->GetHttpBodyAsStr();
    THtmlDoc::SaveHtmlToTxt(HtmlStr, OutTxtFNm, WebPg->GetUrlStr(),
     OutTxtUrlP, OutXmlTagsP);
  }
  // output to screen
  if (OutScrP){
    printf("%s\n", WebPg->GetUrlStr().CStr());
    printf("-----------------------\n");
    printf("%s", WebPg->GetHttpHdStr().CStr());
    printf("%s", WebPg->GetHttpBodyAsStr().CStr());
  }
}
示例#2
0
文件: google.cpp 项目: Accio/snap
/////////////////////////////////////////////////
// Google-Web-Fetch-Saver
void TGgWebFetchSaver::OnFetch(const int&, const PWebPg& WebPg){
  printf("Fetched [Wait:%d Conn.:%d]: %s\n",
   GetWaitUrls(), GetConnUrls(), WebPg->GetUrlStr().CStr());
  WebPgV.Add(WebPg);
  if (Empty()){
    TSysMsg::Quit();}
}
示例#3
0
void TWebTxtBs::OnFetch(const int& /*FId*/, const PWebPg& WebPg){
  if (!IsUrlStr(WebPg->GetUrlStr())){
    TMem HttpRespMem; WebPg->GetHttpResp()->GetAsMem(HttpRespMem);
    TxtBs->AddDocMem(WebPg->GetUrlStr(), HttpRespMem, WebBsFetch->IsIndexTxtBs());
    if (WebPg->IsTxt()){
      PWebFilter WebFilter=WebBsFetch->GetWebFilter();
      PSIn HtmlDocSIn=TStrIn::New(WebPg->GetHttpBodyAsStr());
      PHtmlDoc HtmlDoc=THtmlDoc::New(HtmlDocSIn, hdtHRef);
      TStr BaseUrlStr=WebPg->GetUrlStr(); TStr RelUrlStr;
      for (int TokN=0; TokN<HtmlDoc->GetToks(); TokN++){
        if (WebFilter->IsUrlTok(HtmlDoc->GetTok(TokN), RelUrlStr)){
          PUrl Url=TUrl::New(RelUrlStr, BaseUrlStr);
          if (Url->IsOk(usHttp)){
            WebBsFetch->SetUrlAsFinal(Url);
            if (WebFilter->IsUrlOk(Url)&&
             (!WebBsFetch->IsUrlEnqueued(Url))&&
             (!IsUrlStr(Url->GetAsFinalUrlStr()))){
              WebBsFetch->AddFetchUrl(Url);
            }
          }
        }
      }
      WebBsFetch->GoFetch();
    }
  }
}
/////////////////////////////////////////////////
// EuProjects-Web-Fetch
void TCordisEuProjWebFetch::OnFetch(const int&, const PWebPg& WebPg){
  // print url of downloaded page
  printf("%s\n", WebPg->GetUrlStr().CStr());
  printf("-----------------------\n");
  //printf("%s", WebPg->GetHttpHdStr().CStr());
  //printf("%s", WebPg->GetHttpBodyAsStr().CStr());
  // get current page url & set of outgoing urls with descriptions
  TStr WebPgUrlStr=WebPg->GetUrlStr();
  TStrKdV DescUrlStrKdV; WebPg->GetOutDescUrlStrKdV(DescUrlStrKdV);
  // check current page type and react accordingly
  if (WebPgUrlStr.IsPrefix("http://dbs.cordis.lu/fep-cgi/srchidadb?ACTION=D&")){
    // if the page represents project description (ACTION=D)
    // save the document
    TStr HtmlStr=WebPg->GetHttpBodyAsStr();
    WebPgUrlStr.Save(*EuProjSOut);
    HtmlStr.Save(*EuProjSOut);
    EuProjSOut->Flush();
    int DescUrlStrKdN=DescUrlStrKdV.SearchForw(TStrKd("NEXT RECORD"));
    if (DescUrlStrKdN!=-1){
      // fetch next document
      FetchDocN++;
      FetchUrlStr=
       TStr("http://dbs.cordis.lu/fep-cgi/srchidadb?ACTION=D&SESSION=")+
       FetchSesIdStr+"&DOC="+TInt::GetStr(FetchDocN);
      FetchRetries=0;
      FetchUrl(FetchUrlStr);
    } else {
      printf("*** No forward pointer.\n");
      TSysMsg::Quit();
    }
    // search for forward pointer (to the next project description)
    /*int DescUrlStrKdN=DescUrlStrKdV.SearchForw(TStrKd("NEXT RECORD"));
    if (DescUrlStrKdN!=-1){
      // fetch next project description (usual, most frequent case)
      FetchUrlStr=DescUrlStrKdV[DescUrlStrKdN].Dat; FetchRetries=0;
      FetchUrl(FetchUrlStr);
    } else {
      // last project description doesn't include forward pointer
      printf("*** No forward pointer.\n");
      TSysMsg::Quit();
    }*/
  } else
  if (WebPgUrlStr.IsPrefix("http://dbs.cordis.lu/fep-cgi/srchidadb?ACTION=R&")){
    // if the page represents project record-set (ACTION=R)
    // take session id
    FetchSesIdStr=WebPgUrlStr.GetWcMatch("*SESSION=*&*", 1);
    FetchDocN=1;
    FetchUrlStr=
     TStr("http://dbs.cordis.lu/fep-cgi/srchidadb?ACTION=D&SESSION=")+
     FetchSesIdStr+"&DOC="+TInt::GetStr(FetchDocN);
    FetchRetries=0;
    FetchUrl(FetchUrlStr);
    // move to the first project-description-url (first record-set only)
    /*int DescUrlStrKdN=0;
    while (DescUrlStrKdN<DescUrlStrKdV.Len()){
      TStr UrlStr=DescUrlStrKdV[DescUrlStrKdN].Dat; DescUrlStrKdN++;
      if (UrlStr.IsPrefix("http://dbs.cordis.lu/fep-cgi/srchidadb?ACTION=D&")){
        // fetch first project-description only
        FetchUrl(UrlStr);
      }
    }
    if (DescUrlStrKdN>=DescUrlStrKdV.Len()){
      // quit downloading if no project descriptions
      printf("*** No project descriptions.\n");
      TSysMsg::Quit();
    }*/
    // fetch next index page
    /*int DescUrlStrKdN=DescUrlStrKdV.SearchForw(TStrKd("NEXT 20 RECORDS"));
    if (DescUrlStrKdN!=-1){
      FetchUrlStr=DescUrlStrKdV[DescUrlStrKdN].Dat;
      FetchRetries=0;
      FetchUrl(FetchUrlStr);
    } else {
      printf("*** No next 20 records.\n");
    }*/
  } else {
    // get forward pointer to the first project record-set (start only)
    int DescUrlStrKdN=DescUrlStrKdV.SearchForw(TStrKd("NEXT 20 RECORDS"));
    if (DescUrlStrKdN!=-1){
      FetchUrl(DescUrlStrKdV[DescUrlStrKdN].Dat);}
  }
}
示例#5
0
// Builds a TAmazonItem from a fetched Amazon item page.
// The item id is derived from the page url; title, authors and cross-sell
// item ids are extracted in a single forward pass of the html lexer:
//  1. skip to the begin-tag <FONT> whose FACE is "verdana,arial,helvetica"
//     and whose SIZE is empty;
//  2. title = text of every token up to the next FONT end-tag;
//  3. authors = string tokens of each <A>...</A> element up to the next
//     FONT end-tag (one author name per element);
//  4. skip to the consecutive string tokens "Customers", "who", "bought";
//  5. cross-sell ids = item id of the HREF of each <A> begin-tag up to the
//     next UL end-tag (HREF is resolved against the page url).
// Every phase stops at end of input, so a page lacking a marker yields empty
// values for the parts that follow it (assumes the lexer keeps returning
// hsyEof once the input is exhausted -- the original code relies on this too).
PAmazonItem TAmazonItem::GetFromWebPg(const PWebPg& WebPg){
  TStr UrlStr=WebPg->GetUrlStr();
  TStr ItemId=TAmazonItem::GetItemId(WebPg->GetUrl());
  TStr HtmlStr=WebPg->GetHttpBodyAsStr();
  PSIn HtmlSIn=TStrIn::New(HtmlStr);
  THtmlLx HtmlLx(HtmlSIn);
  THtmlLxSym Sym; TChA ChA;

  // move to title
  while (HtmlLx.GetSym()!=hsyEof){
    Sym=HtmlLx.Sym; ChA=HtmlLx.ChA;
    if ((Sym==hsyBTag)&&(ChA=="<FONT>")){
      TStr FaceArg=HtmlLx.GetArg("FACE", "");
      TStr SizeArg=HtmlLx.GetArg("SIZE", "");
      if ((FaceArg=="verdana,arial,helvetica")&&(SizeArg.Empty())){break;}
    }
  }
  // extract title (tokens are joined with the white-space preceding them)
  TChA TitleChA;
  while (HtmlLx.GetSym()!=hsyEof){
    Sym=HtmlLx.Sym; ChA=HtmlLx.ChA;
    if ((Sym==hsyETag)&&(ChA=="<FONT>")){break;}
    if (!TitleChA.Empty()){TitleChA+=HtmlLx.GetPreSpaceStr();}
    TitleChA+=ChA;
  }
  TStr TitleStr=TitleChA;
  // extract authors
  TStrV AuthorNmV;
  TChA AuthorNmChA;
  while (HtmlLx.GetSym()!=hsyEof){
    Sym=HtmlLx.Sym; ChA=HtmlLx.ChA;
    if ((Sym==hsyBTag)&&(ChA=="<A>")){
      // collect the string tokens of this anchor; stop at its end-tag or at
      // end of input (without the end-of-input test a page ending inside an
      // anchor would keep this loop running forever)
      do {
        HtmlLx.GetSym();
        Sym=HtmlLx.Sym; ChA=HtmlLx.ChA;
        if (Sym==hsyStr){
          if (!AuthorNmChA.Empty()){AuthorNmChA+=HtmlLx.GetPreSpaceStr();}
          AuthorNmChA+=ChA;
        }
      } while ((Sym!=hsyEof)&&(!((Sym==hsyETag)&&(ChA=="<A>"))));
      AuthorNmV.Add(AuthorNmChA); AuthorNmChA.Clr();
      // input exhausted inside the anchor: nothing more to scan
      if (Sym==hsyEof){break;}
    }
    if ((Sym==hsyETag)&&(ChA=="<FONT>")){break;}
  }
  // move to x-sell: wait for three consecutive string tokens
  // "Customers" "who" "bought"; any non-string token resets the window
  TStrQ PrevStrQ(3);
  while (HtmlLx.GetSym()!=hsyEof){
    Sym=HtmlLx.Sym; ChA=HtmlLx.ChA;
    if (Sym==hsyStr){
      PrevStrQ.Push(ChA);
      if ((PrevStrQ.Len()==3)&&(PrevStrQ[0]=="Customers")
       &&(PrevStrQ[1]=="who")&&(PrevStrQ[2]=="bought")){break;}
    } else {
      PrevStrQ.Clr();
    }
  }
  // extract x-sell pointers
  TStrV NextItemIdV;
  while (HtmlLx.GetSym()!=hsyEof){
    Sym=HtmlLx.Sym; ChA=HtmlLx.ChA;
    if ((Sym==hsyBTag)&&(ChA=="<A>")){
      TStr RelUrlStr=HtmlLx.GetArg("HREF");
      PUrl Url=TUrl::New(RelUrlStr, UrlStr);
      TStr NextItemId=TAmazonItem::GetItemId(Url);
      NextItemIdV.Add(NextItemId);
    }
    if ((Sym==hsyETag)&&(ChA=="<UL>")){break;}
  }

  // construct item object
  PAmazonItem AmazonItem=PAmazonItem(new
   TAmazonItem(ItemId, TitleStr, AuthorNmV, NextItemIdV));
  return AmazonItem;
}
示例#6
0
文件: google.cpp 项目: Accio/snap
// Convenience overload: takes the url and the http body from the fetched
// page and forwards both to NewNews(url, html).
PRSet TRSet::NewNews(const PWebPg& WebPg){
  TStr PgUrlStr(WebPg->GetUrlStr());
  TStr PgBodyStr(WebPg->GetHttpBodyAsStr());
  PRSet RSet=TRSet::NewNews(PgUrlStr, PgBodyStr);
  return RSet;
}
示例#7
0
文件: google.cpp 项目: Accio/snap
// Convenience overload: takes the url and the http body from the fetched
// page and forwards both to NewScholar(url, html).
PGgSchRSet TGgSchRSet::NewScholar(const PWebPg& WebPg){
  TStr PgUrlStr(WebPg->GetUrlStr());
  TStr PgBodyStr(WebPg->GetHttpBodyAsStr());
  PGgSchRSet SchRSet=TGgSchRSet::NewScholar(PgUrlStr, PgBodyStr);
  return SchRSet;
}