///////////////////////////////////////////////// // Google-Scholar-Reference TMd5Sig TGgSchRef::GetMd5Sig() const { TChA ChA; ChA+=TitleStr; for (int AuthN=0; AuthN<AuthNmV.Len(); AuthN++){ ChA+=';'; ChA+=AuthNmV[AuthN];} ChA+=';'; ChA+=PubNm; ChA+=';'; ChA+=YearStr; return TMd5Sig(ChA); }
int main(int argc, char* argv[]){ Try; // create environment Env=TEnv(argc, argv, TNotify::StdNotify); // get command line parameters Env.PrepArgs("Crawl-Base to Text", 0); TStr InCrawlBsFNm=Env.GetIfArgPrefixStr("-i:", "", "Crawl-Base-FileName"); TStr OutTxtFNm=Env.GetIfArgPrefixStr("-ot:", "Crawl.Txt", "Output-Text-Filename"); TStr OutStatFNm=Env.GetIfArgPrefixStr("-os:", "Crawl.Stat.Txt", "Output-Statistics-Text-Filename"); bool SaveContP=Env.GetIfArgPrefixBool("-sc:", false, "Save-Content"); bool SaveContOutUrlP=Env.GetIfArgPrefixBool("-scou:", true, "Save-Content-Outgoing-Urls"); bool SaveContTagP=Env.GetIfArgPrefixBool("-sct:", true, "Save-Content-Tags"); bool SaveOutUrlP=Env.GetIfArgPrefixBool("-sou:", false, "Save-Outgoing-Urls"); bool SaveCTxtP=Env.GetIfArgPrefixBool("-sctc:", false, "Save-Continuos-Text-Content"); int MnCTxtToks=Env.GetIfArgPrefixInt("-mctt:", 100, "Minimal-Continuos-Text-Tokens"); TStrV BlockedDmNmV=Env.GetIfArgPrefixStrV("-bd:", "Blocked-Domain-Names (multiple)"); if (Env.IsEndOfRun()){return 0;} // -i:si -sc:t -scou:n -sct:n -sctc:t -bd:.delo.si -bd:.dnevnik.si -bd:.vecer.si TStr BlobBsFMid=InCrawlBsFNm.GetFMid(); // output file TFOut TxtFOut(OutTxtFNm); FILE* fTxt=TxtFOut.GetFileId(); fprintf(fTxt, "Comment:input=%s\n", InCrawlBsFNm.CStr()); fprintf(fTxt, "Comment:output=%s\n", OutTxtFNm.CStr()); fprintf(fTxt, "BlobBaseName:%s\n", BlobBsFMid.CStr()); // statistics TStrIntH HostNmToFqH; TStrIntH StatusCdToFqH; TStrIntH ContTypeToFqH; PMom HttpContLenMom=TMom::New(); PBlobBs CrawlBBs=TMBlobBs::New(InCrawlBsFNm); TBlobPt TrvCrawlBPt=CrawlBBs->FFirstBlobPt(); TBlobPt CrawlBPt; PSIn CrawlBlobSIn; int CrawlBlobN=0; while (CrawlBBs->FNextBlobPt(TrvCrawlBPt, CrawlBPt, CrawlBlobSIn)){ CrawlBlobN++; printf("%d\r", CrawlBlobN); TStr DateTimeStr(*CrawlBlobSIn); //TStr DateTimeStr; TStr UrlStr(*CrawlBlobSIn); PUrl Url=TUrl::New(UrlStr); IAssert(Url->IsOk(usHttp)); TMem HttpRespMem(*CrawlBlobSIn); PSIn HttpRespSIn=HttpRespMem.GetSIn(); PHttpResp HttpResp=THttpResp::New(HttpRespSIn); // statistics HostNmToFqH.AddDat(Url->GetHostNm())++; StatusCdToFqH.AddDat(TInt::GetStr(HttpResp->GetStatusCd()))++; ContTypeToFqH.AddDat(HttpResp->GetFldVal(THttp::ContTypeFldNm))++; int ContLen=HttpResp->GetFldVal(THttp::ContLenFldNm).GetInt(-1); if (ContLen!=-1){ HttpContLenMom->Add(ContLen);} // check blocked domain-names if (!BlockedDmNmV.Empty()){ TStr DmNm=Url->GetDmNm(); int BlockedDmP=false; for (int BDmNmN=0; BDmNmN<BlockedDmNmV.Len(); BDmNmN++){ if (DmNm.IsSuffix(BlockedDmNmV[BDmNmN])){ BlockedDmP=true; break; } } if (BlockedDmP){ continue; } } // check continuos-text if (SaveCTxtP&&IsCTxtHttpResp(Url, HttpResp, MnCTxtToks)){continue;} if (HttpResp->IsStatusCd_Ok()){ PWebPg WebPg=TWebPg::New(UrlStr, HttpResp); fprintf(fTxt, "Start:HttpOk\n"); fprintf(fTxt, "BlobBaseAddress:bb://%s/%d/%d\n", BlobBsFMid.CStr(), CrawlBPt.GetSeg(), CrawlBPt.GetAddr()); fprintf(fTxt, "DateTime:%s\n", DateTimeStr.CStr()); fprintf(fTxt, "Url:%s\n", UrlStr.CStr()); fprintf(fTxt, "UrlMd5:%s\n", TMd5Sig(UrlStr).GetStr().CStr()); fprintf(fTxt, "AtomName:%s\n", TUrl::GetTopDownDocNm(UrlStr).CStr()); for (int FldN=0; FldN<HttpResp->GetFlds(); FldN++){ TStr FldNm; TStr FldVal; HttpResp->GetFldNmVal(FldN, FldNm, FldVal); fprintf(fTxt, "HttpField:%s=%s\n", FldNm.CStr(), FldVal.CStr()); } TMem BodyMem=HttpResp->GetBodyAsMem(); fprintf(fTxt, "BodyMd5:%s\n", TMd5Sig(BodyMem).GetStr().CStr()); // text if (SaveContP){ if (HttpResp->IsContType(THttp::TextHtmlFldVal)){ TStr HtmlStr=BodyMem.GetAsStr(); TStr TxtStr=THtmlDoc::GetTxtLnDoc(HtmlStr, UrlStr, SaveContOutUrlP, SaveContTagP); fprintf(fTxt, "Content:%s\n", TxtStr.CStr()); } } // outgoing-urls if (SaveOutUrlP){ TUrlV OutUrlV; WebPg->GetOutUrlV(OutUrlV); for (int OutUrlN=0; OutUrlN<OutUrlV.Len(); OutUrlN++){ TStr OutUrlStr=OutUrlV[OutUrlN]->GetUrlStr(); fprintf(fTxt, "OutUrl:%s\n", OutUrlStr.CStr()); } } fprintf(fTxt, "End:HttpOk\n"); } else if (HttpResp->IsStatusCd_Redir()){ TStr RedirUrlStr=HttpResp->GetFldVal(THttp::LocFldNm); PUrl RedirUrl=TUrl::New(RedirUrlStr, UrlStr); if (RedirUrl->IsOk(usHttp)){ TStr RedirUrlStr=RedirUrl->GetUrlStr(); fprintf(fTxt, "Start:HttpRedirection\n"); fprintf(fTxt, "BlobBaseAddress:bb://%s/%d/%d\n", BlobBsFMid.CStr(), CrawlBPt.GetSeg(), CrawlBPt.GetAddr()); fprintf(fTxt, "DateTime:%s\n", DateTimeStr.CStr()); fprintf(fTxt, "Url:%s\n", UrlStr.CStr()); fprintf(fTxt, "UrlMd5:%s\n", TMd5Sig(UrlStr).GetStr().CStr()); fprintf(fTxt, "AtomName:%s\n", TUrl::GetTopDownDocNm(UrlStr).CStr()); fprintf(fTxt, "RedirectionUrl:%s\n", RedirUrlStr.CStr()); for (int FldN=0; FldN<HttpResp->GetFlds(); FldN++){ TStr FldNm; TStr FldVal; HttpResp->GetFldNmVal(FldN, FldNm, FldVal); fprintf(fTxt, "HttpField:%s=%s\n", FldNm.CStr(), FldVal.CStr()); } fprintf(fTxt, "End:HttpRedirection\n"); } } } // statistics HttpContLenMom->Def(); if (!OutStatFNm.Empty()){ TFOut StatFOut(OutStatFNm); FILE* fStat=StatFOut.GetFileId(); TIntStrPrV FqStatusCdPrV; StatusCdToFqH.GetDatKeyPrV(FqStatusCdPrV); TIntStrPrV FqContTypePrV; ContTypeToFqH.GetDatKeyPrV(FqContTypePrV); // hosts {fprintf(fStat, "================================================\n"); TIntStrPrV FqHostNmPrV; HostNmToFqH.GetDatKeyPrV(FqHostNmPrV); FqHostNmPrV.Sort(false); int HostNmsSum=0; fprintf(fStat, "Hosts (%d):\n", FqHostNmPrV.Len()); for (int HostNmN=0; HostNmN<FqHostNmPrV.Len(); HostNmN++){ fprintf(fStat, "%7d '%s'\n", FqHostNmPrV[HostNmN].Val1, FqHostNmPrV[HostNmN].Val2.CStr()); HostNmsSum+=FqHostNmPrV[HostNmN].Val1; } fprintf(fStat, "----------\n"); fprintf(fStat, "%7d %s\n", HostNmsSum, "Sum"); fprintf(fStat, "================================================\n");} // status-code {fprintf(fStat, "================================================\n"); TIntStrPrV FqStatusCdPrV; StatusCdToFqH.GetDatKeyPrV(FqStatusCdPrV); FqStatusCdPrV.Sort(false); int StatusCdsSum=0; fprintf(fStat, "Status-Codes (%d):\n", FqStatusCdPrV.Len()); for (int StatusCdN=0; StatusCdN<FqStatusCdPrV.Len(); StatusCdN++){ fprintf(fStat, "%7d '%s'\n", FqStatusCdPrV[StatusCdN].Val1, FqStatusCdPrV[StatusCdN].Val2.CStr()); StatusCdsSum+=FqStatusCdPrV[StatusCdN].Val1; } fprintf(fStat, "----------\n"); fprintf(fStat, "%7d %s\n", StatusCdsSum, "Sum"); fprintf(fStat, "================================================\n");} // content-type {fprintf(fStat, "================================================\n"); TIntStrPrV FqContTypePrV; ContTypeToFqH.GetDatKeyPrV(FqContTypePrV); FqContTypePrV.Sort(false); int ContTypesSum=0; fprintf(fStat, "Content-Types (%d):\n", FqContTypePrV.Len()); for (int ContTypeN=0; ContTypeN<FqContTypePrV.Len(); ContTypeN++){ fprintf(fStat, "%7d '%s'\n", FqContTypePrV[ContTypeN].Val1, FqContTypePrV[ContTypeN].Val2.CStr()); ContTypesSum+=FqContTypePrV[ContTypeN].Val1; } fprintf(fStat, "----------\n"); fprintf(fStat, "%7d %s\n", ContTypesSum, "Sum"); fprintf(fStat, "================================================\n");} // content-length {fprintf(fStat, "================================================\n"); fprintf(fStat, "Content-length:\n"); if (HttpContLenMom->IsUsable()){ TStr MomStr=HttpContLenMom->GetStr('\n', ':', true, false, "%g"); fprintf(fStat, "%s\n", MomStr.CStr()); } else { fprintf(fStat, "Statistics not usable.\n"); } fprintf(fStat, "================================================\n");} } return 0; Catch; return 1; }