///////////////////////////////////////////////// // SkyGrid-Document void TSkyGridBinDoc::SaveBinDocV( const TStr& InXmlFPath, const TStr& OutBinFNm, const int& MxDocs){ printf("Processing SkyGrid-News-Xml files from '%s'...\n", InXmlFPath.CStr()); TFOut SOut(OutBinFNm); TFFile FFile(InXmlFPath, true); TStr FNm; int Docs=0; int DateDocs=0; uint64 PrevTm=0; while (FFile.Next(FNm)){ if ((MxDocs!=-1)&&(Docs>=MxDocs)){break;} //printf(" Processing '%s' ...", FNm.CStr()); PXmlDoc XmlDoc=TXmlDoc::LoadTxt(FNm); PXmlTok ContentTok=XmlDoc->GetTagTok("item|content"); TStr SwIdStr=ContentTok->GetTagTok("swid")->GetArgVal("value"); TStr UrlStr=ContentTok->GetTagTok("url")->GetTokStr(false); TStr TitleStr=ContentTok->GetTagTok("title")->GetTokStr(false); TStr FetchedValStr=ContentTok->GetTagTok("fetched")->GetArgVal("value"); TXmlTokV EntityTokV; ContentTok->GetTagTokV("annotations|entity", EntityTokV); TStr BodyStr=ContentTok->GetTagTok("body")->GetTokStr(false); // extract date TStr DateStr=SwIdStr.GetSubStr(0, 7); TStr YearStr=DateStr.GetSubStr(0, 3); TStr MonthStr=DateStr.GetSubStr(4, 5); TStr DayStr=DateStr.GetSubStr(6, 7); TTm DateTm(YearStr.GetInt(), MonthStr.GetInt(), DayStr.GetInt()); uint64 Tm=TTm::GetMSecsFromTm(DateTm); // extract entities TStrIntH EntNmToFqH; for (int EntityTokN=0; EntityTokN<EntityTokV.Len(); EntityTokN++){ PXmlTok EntityTok=EntityTokV[EntityTokN]; if (!EntityTok->IsTag("entity")){continue;} TStr CanonicalNm=EntityTok->GetArgVal("canonical", ""); TStr TextStr=EntityTok->GetArgVal("text", ""); TStr TypeNm=EntityTok->GetArgVal("type", ""); TStr EntNm=CanonicalNm.Empty() ? 
TextStr : CanonicalNm; EntNmToFqH.AddDat(EntNm)++; } TIntStrPrV FqEntNmPrV; EntNmToFqH.GetDatKeyPrV(FqEntNmPrV); FqEntNmPrV.Sort(false); // extract headline TChA HeadlineChA=BodyStr.GetSubStr(0, 250); while ((HeadlineChA.Len()>0)&&(HeadlineChA.LastCh()!=' ')){ HeadlineChA.Trunc(HeadlineChA.Len()-1);} HeadlineChA+="..."; // create document TSkyGridBinDoc Doc(SwIdStr, Tm, TitleStr, HeadlineChA, FqEntNmPrV); // save document Doc.Save(SOut); // screen log if (PrevTm!=Tm){ if (PrevTm!=0){printf("\n");} PrevTm=Tm; DateDocs=0; } Docs++; DateDocs++; printf(" %s [Day:%d / All:%d]\r", DateStr.CStr(), DateDocs, Docs); } printf("\nDone.\n"); }
// Click handler shared by the context/source/destination sort radio-groups.
// Refills the list-box that belongs to the clicked radio-group with the named
// objects of State->NmObjBs, ordered either by name (item 0) or by frequency
// (item 1). Any other item index is reset to 1. Senders other than the three
// known radio-groups are ignored.
void __fastcall TContexterF::NmObjSortRgClick(TObject *Sender){
  // pair every radio-group with the list-box it controls
  TRadioGroup* const SortRgV[]={CtxNmObjSortRg, SrcNmObjSortRg, DstNmObjSortRg};
  TListBox* const ItemLbV[]={CtxNmObjLb, SrcNmObjLb, DstNmObjLb};
  const int Pairs=3;
  int PairN=0;
  while ((PairN<Pairs)&&(Sender!=SortRgV[PairN])){PairN++;}
  if (PairN==Pairs){return;}
  TRadioGroup* ActiveRg=SortRgV[PairN];
  TListBox* ActiveLb=ItemLbV[PairN];

  // item 0 sorts by name; everything else means frequency (and is forced to item 1)
  const bool ByNameP=(ActiveRg->ItemIndex==0);
  if ((!ByNameP)&&(ActiveRg->ItemIndex!=1)){ActiveRg->ItemIndex=1;}

  // reset the target list-box together with the two dependent list-boxes
  ActiveLb->Clear();
  EnConceptWordLb->Clear();
  EnCoNmObjLb->Clear();

  if (!ByNameP){
    // frequency order
    TIntStrPrV FqStrPrV;
    State->NmObjBs->GetNmObjFqStrPrV(FqStrPrV, 3);
    FqStrPrV.Sort(false);
    const int Items=FqStrPrV.Len();
    for (int ItemN=0; ItemN<Items; ItemN++){
      TStr FqStr=TInt::GetStr(FqStrPrV[ItemN].Val1, " (%d)");
      TStr ItemStr=FqStrPrV[ItemN].Val2+FqStr;
      ActiveLb->Items->Add(ItemStr.CStr());
    }
    return;
  }

  // name order
  TStrIntPrV StrFqPrV;
  State->NmObjBs->GetNmObjStrFqPrV(StrFqPrV, 3);
  StrFqPrV.Sort();
  const int Items=StrFqPrV.Len();
  for (int ItemN=0; ItemN<Items; ItemN++){
    TStr FqStr=TInt::GetStr(StrFqPrV[ItemN].Val2, " (%d)");
    TStr ItemStr=StrFqPrV[ItemN].Val1+FqStr;
    ActiveLb->Items->Add(ItemStr.CStr());
  }
}
int main(int argc, char* argv[]){ Try; // create environment Env=TEnv(argc, argv, TNotify::StdNotify); // get command line parameters Env.PrepArgs("Crawl-Base to Text", 0); TStr InCrawlBsFNm=Env.GetIfArgPrefixStr("-i:", "", "Crawl-Base-FileName"); TStr OutTxtFNm=Env.GetIfArgPrefixStr("-ot:", "Crawl.Txt", "Output-Text-Filename"); TStr OutStatFNm=Env.GetIfArgPrefixStr("-os:", "Crawl.Stat.Txt", "Output-Statistics-Text-Filename"); bool SaveContP=Env.GetIfArgPrefixBool("-sc:", false, "Save-Content"); bool SaveContOutUrlP=Env.GetIfArgPrefixBool("-scou:", true, "Save-Content-Outgoing-Urls"); bool SaveContTagP=Env.GetIfArgPrefixBool("-sct:", true, "Save-Content-Tags"); bool SaveOutUrlP=Env.GetIfArgPrefixBool("-sou:", false, "Save-Outgoing-Urls"); bool SaveCTxtP=Env.GetIfArgPrefixBool("-sctc:", false, "Save-Continuos-Text-Content"); int MnCTxtToks=Env.GetIfArgPrefixInt("-mctt:", 100, "Minimal-Continuos-Text-Tokens"); TStrV BlockedDmNmV=Env.GetIfArgPrefixStrV("-bd:", "Blocked-Domain-Names (multiple)"); if (Env.IsEndOfRun()){return 0;} // -i:si -sc:t -scou:n -sct:n -sctc:t -bd:.delo.si -bd:.dnevnik.si -bd:.vecer.si TStr BlobBsFMid=InCrawlBsFNm.GetFMid(); // output file TFOut TxtFOut(OutTxtFNm); FILE* fTxt=TxtFOut.GetFileId(); fprintf(fTxt, "Comment:input=%s\n", InCrawlBsFNm.CStr()); fprintf(fTxt, "Comment:output=%s\n", OutTxtFNm.CStr()); fprintf(fTxt, "BlobBaseName:%s\n", BlobBsFMid.CStr()); // statistics TStrIntH HostNmToFqH; TStrIntH StatusCdToFqH; TStrIntH ContTypeToFqH; PMom HttpContLenMom=TMom::New(); PBlobBs CrawlBBs=TMBlobBs::New(InCrawlBsFNm); TBlobPt TrvCrawlBPt=CrawlBBs->FFirstBlobPt(); TBlobPt CrawlBPt; PSIn CrawlBlobSIn; int CrawlBlobN=0; while (CrawlBBs->FNextBlobPt(TrvCrawlBPt, CrawlBPt, CrawlBlobSIn)){ CrawlBlobN++; printf("%d\r", CrawlBlobN); TStr DateTimeStr(*CrawlBlobSIn); //TStr DateTimeStr; TStr UrlStr(*CrawlBlobSIn); PUrl Url=TUrl::New(UrlStr); IAssert(Url->IsOk(usHttp)); TMem HttpRespMem(*CrawlBlobSIn); PSIn HttpRespSIn=HttpRespMem.GetSIn(); PHttpResp 
HttpResp=THttpResp::New(HttpRespSIn); // statistics HostNmToFqH.AddDat(Url->GetHostNm())++; StatusCdToFqH.AddDat(TInt::GetStr(HttpResp->GetStatusCd()))++; ContTypeToFqH.AddDat(HttpResp->GetFldVal(THttp::ContTypeFldNm))++; int ContLen=HttpResp->GetFldVal(THttp::ContLenFldNm).GetInt(-1); if (ContLen!=-1){ HttpContLenMom->Add(ContLen);} // check blocked domain-names if (!BlockedDmNmV.Empty()){ TStr DmNm=Url->GetDmNm(); int BlockedDmP=false; for (int BDmNmN=0; BDmNmN<BlockedDmNmV.Len(); BDmNmN++){ if (DmNm.IsSuffix(BlockedDmNmV[BDmNmN])){ BlockedDmP=true; break; } } if (BlockedDmP){ continue; } } // check continuos-text if (SaveCTxtP&&IsCTxtHttpResp(Url, HttpResp, MnCTxtToks)){continue;} if (HttpResp->IsStatusCd_Ok()){ PWebPg WebPg=TWebPg::New(UrlStr, HttpResp); fprintf(fTxt, "Start:HttpOk\n"); fprintf(fTxt, "BlobBaseAddress:bb://%s/%d/%d\n", BlobBsFMid.CStr(), CrawlBPt.GetSeg(), CrawlBPt.GetAddr()); fprintf(fTxt, "DateTime:%s\n", DateTimeStr.CStr()); fprintf(fTxt, "Url:%s\n", UrlStr.CStr()); fprintf(fTxt, "UrlMd5:%s\n", TMd5Sig(UrlStr).GetStr().CStr()); fprintf(fTxt, "AtomName:%s\n", TUrl::GetTopDownDocNm(UrlStr).CStr()); for (int FldN=0; FldN<HttpResp->GetFlds(); FldN++){ TStr FldNm; TStr FldVal; HttpResp->GetFldNmVal(FldN, FldNm, FldVal); fprintf(fTxt, "HttpField:%s=%s\n", FldNm.CStr(), FldVal.CStr()); } TMem BodyMem=HttpResp->GetBodyAsMem(); fprintf(fTxt, "BodyMd5:%s\n", TMd5Sig(BodyMem).GetStr().CStr()); // text if (SaveContP){ if (HttpResp->IsContType(THttp::TextHtmlFldVal)){ TStr HtmlStr=BodyMem.GetAsStr(); TStr TxtStr=THtmlDoc::GetTxtLnDoc(HtmlStr, UrlStr, SaveContOutUrlP, SaveContTagP); fprintf(fTxt, "Content:%s\n", TxtStr.CStr()); } } // outgoing-urls if (SaveOutUrlP){ TUrlV OutUrlV; WebPg->GetOutUrlV(OutUrlV); for (int OutUrlN=0; OutUrlN<OutUrlV.Len(); OutUrlN++){ TStr OutUrlStr=OutUrlV[OutUrlN]->GetUrlStr(); fprintf(fTxt, "OutUrl:%s\n", OutUrlStr.CStr()); } } fprintf(fTxt, "End:HttpOk\n"); } else if (HttpResp->IsStatusCd_Redir()){ TStr 
RedirUrlStr=HttpResp->GetFldVal(THttp::LocFldNm); PUrl RedirUrl=TUrl::New(RedirUrlStr, UrlStr); if (RedirUrl->IsOk(usHttp)){ TStr RedirUrlStr=RedirUrl->GetUrlStr(); fprintf(fTxt, "Start:HttpRedirection\n"); fprintf(fTxt, "BlobBaseAddress:bb://%s/%d/%d\n", BlobBsFMid.CStr(), CrawlBPt.GetSeg(), CrawlBPt.GetAddr()); fprintf(fTxt, "DateTime:%s\n", DateTimeStr.CStr()); fprintf(fTxt, "Url:%s\n", UrlStr.CStr()); fprintf(fTxt, "UrlMd5:%s\n", TMd5Sig(UrlStr).GetStr().CStr()); fprintf(fTxt, "AtomName:%s\n", TUrl::GetTopDownDocNm(UrlStr).CStr()); fprintf(fTxt, "RedirectionUrl:%s\n", RedirUrlStr.CStr()); for (int FldN=0; FldN<HttpResp->GetFlds(); FldN++){ TStr FldNm; TStr FldVal; HttpResp->GetFldNmVal(FldN, FldNm, FldVal); fprintf(fTxt, "HttpField:%s=%s\n", FldNm.CStr(), FldVal.CStr()); } fprintf(fTxt, "End:HttpRedirection\n"); } } } // statistics HttpContLenMom->Def(); if (!OutStatFNm.Empty()){ TFOut StatFOut(OutStatFNm); FILE* fStat=StatFOut.GetFileId(); TIntStrPrV FqStatusCdPrV; StatusCdToFqH.GetDatKeyPrV(FqStatusCdPrV); TIntStrPrV FqContTypePrV; ContTypeToFqH.GetDatKeyPrV(FqContTypePrV); // hosts {fprintf(fStat, "================================================\n"); TIntStrPrV FqHostNmPrV; HostNmToFqH.GetDatKeyPrV(FqHostNmPrV); FqHostNmPrV.Sort(false); int HostNmsSum=0; fprintf(fStat, "Hosts (%d):\n", FqHostNmPrV.Len()); for (int HostNmN=0; HostNmN<FqHostNmPrV.Len(); HostNmN++){ fprintf(fStat, "%7d '%s'\n", FqHostNmPrV[HostNmN].Val1, FqHostNmPrV[HostNmN].Val2.CStr()); HostNmsSum+=FqHostNmPrV[HostNmN].Val1; } fprintf(fStat, "----------\n"); fprintf(fStat, "%7d %s\n", HostNmsSum, "Sum"); fprintf(fStat, "================================================\n");} // status-code {fprintf(fStat, "================================================\n"); TIntStrPrV FqStatusCdPrV; StatusCdToFqH.GetDatKeyPrV(FqStatusCdPrV); FqStatusCdPrV.Sort(false); int StatusCdsSum=0; fprintf(fStat, "Status-Codes (%d):\n", FqStatusCdPrV.Len()); for (int StatusCdN=0; StatusCdN<FqStatusCdPrV.Len(); 
StatusCdN++){ fprintf(fStat, "%7d '%s'\n", FqStatusCdPrV[StatusCdN].Val1, FqStatusCdPrV[StatusCdN].Val2.CStr()); StatusCdsSum+=FqStatusCdPrV[StatusCdN].Val1; } fprintf(fStat, "----------\n"); fprintf(fStat, "%7d %s\n", StatusCdsSum, "Sum"); fprintf(fStat, "================================================\n");} // content-type {fprintf(fStat, "================================================\n"); TIntStrPrV FqContTypePrV; ContTypeToFqH.GetDatKeyPrV(FqContTypePrV); FqContTypePrV.Sort(false); int ContTypesSum=0; fprintf(fStat, "Content-Types (%d):\n", FqContTypePrV.Len()); for (int ContTypeN=0; ContTypeN<FqContTypePrV.Len(); ContTypeN++){ fprintf(fStat, "%7d '%s'\n", FqContTypePrV[ContTypeN].Val1, FqContTypePrV[ContTypeN].Val2.CStr()); ContTypesSum+=FqContTypePrV[ContTypeN].Val1; } fprintf(fStat, "----------\n"); fprintf(fStat, "%7d %s\n", ContTypesSum, "Sum"); fprintf(fStat, "================================================\n");} // content-length {fprintf(fStat, "================================================\n"); fprintf(fStat, "Content-length:\n"); if (HttpContLenMom->IsUsable()){ TStr MomStr=HttpContLenMom->GetStr('\n', ':', true, false, "%g"); fprintf(fStat, "%s\n", MomStr.CStr()); } else { fprintf(fStat, "Statistics not usable.\n"); } fprintf(fStat, "================================================\n");} } return 0; Catch; return 1; }
// Builds a cyc-base from the four XML dump files found in directory FPath:
//   lexicon-dump.xml   - <word string=.. cycl=..>: adds #$nameString edges
//   taxonomy-dump.xml  - <term cycl=..> with <isa>/<genl> children: adds #$isa / #$genls edges
//   relevance-dump.xml - <term cyc=.. thcl=.. irrel=.. clarifying=..>: sets vertex flags
//   kb-dump.xml        - <sentence cycl=..> with <arg> children: adds binary-predicate edges
// Every file is read as a sequence of XML documents terminated by an <end> tag.
// For each added edge the reverse edge is added too, its predicate prefixed with '~'.
// A malformed document prints the last successfully read item and calls Fail.
// Side effect: writes the file "CycKB-CycLFq.Stat.Txt" into the working directory.
// Returns the populated cyc-base.
PCycBs TCycBs::LoadCycXmlDump(const TStr& FPath){
  // file-names
  TStr NrFPath=TStr::GetNrFPath(FPath);
  TStr CycLexiconFNm=NrFPath+"lexicon-dump.xml";
  TStr CycTaxonomyFNm=NrFPath+"taxonomy-dump.xml";
  TStr CycRelevanceFNm=NrFPath+"relevance-dump.xml";
  TStr CycKBaseFNm=NrFPath+"kb-dump.xml";

  // create cyc-base
  PCycBs CycBs=TCycBs::New();

  // lexicon
  {printf("Processing Lexicon %s ...\n", CycLexiconFNm.CStr());
  PSIn CycLexiconSIn=TFIn::New(CycLexiconFNm);
  PXmlDoc XmlDoc; int XmlDocs=0;
  TStr PrevCycWStr; TStr PrevCycLStr;
  forever{
    // statistics (the counter is bumped before the <end> check, so the
    // printed total includes the terminating document)
    XmlDocs++;
    if (XmlDocs%1000==0){printf("%d Docs\r", XmlDocs);}
    // load xml-tree
    XmlDoc=TXmlDoc::LoadTxt(CycLexiconSIn);
    if (!XmlDoc->IsOk()){
      // report the last good word/cycl pair to help locate the broken record
      printf("%s - %s\n", PrevCycWStr.CStr(), PrevCycLStr.CStr());
      Fail;
    }
    // extract fields from xml-tree
    PXmlTok TopTok=XmlDoc->GetTok();
    if (TopTok->IsTag("end")){break;}
    IAssert(TopTok->IsTag("word"));
    TStr CycWStr=TopTok->GetArgVal("string");
    TStr CycLStr=TopTok->GetArgVal("cycl");
    // remember both fields for the diagnostic above (the cycl string was
    // previously never stored, so the diagnostic always printed it empty)
    PrevCycWStr=CycWStr; PrevCycLStr=CycLStr;
    // insert data
    CycBs->AddEdge(CycLStr, "#$nameString", CycWStr);
    CycBs->AddEdge(CycWStr, "~#$nameString", CycLStr);
  }
  printf("%d Docs\nDone.\n", XmlDocs);}

  // taxonomy
  {printf("Processing Taxonomy %s ...\n", CycTaxonomyFNm.CStr());
  PSIn CycTaxonomySIn=TFIn::New(CycTaxonomyFNm);
  PXmlDoc XmlDoc; int XmlDocs=0;
  TStr PrevSrcCycLStr;
  forever{
    // statistics
    XmlDocs++;
    if (XmlDocs%1000==0){
      printf("%d Docs\r", XmlDocs);}
    // load xml-tree
    XmlDoc=TXmlDoc::LoadTxt(CycTaxonomySIn);
    if (!XmlDoc->IsOk()){
      printf("%s\n", PrevSrcCycLStr.CStr());
      Fail;
    }
    // extract fields from xml-tree
    PXmlTok TopTok=XmlDoc->GetTok();
    if (TopTok->IsTag("end")){break;}
    IAssert(TopTok->IsTag("term"));
    TStr SrcCycLStr=TopTok->GetArgVal("cycl");
    PrevSrcCycLStr=SrcCycLStr;
    // only <isa> and <genl> children are accepted; anything else is fatal
    for (int SubTokN=0; SubTokN<TopTok->GetSubToks(); SubTokN++){
      PXmlTok SubTok=TopTok->GetSubTok(SubTokN);
      TStr DstCycLStr=SubTok->GetTagNm();
      if (SubTok->IsTag("isa")){
        DstCycLStr=SubTok->GetArgVal("value");
        CycBs->AddEdge(SrcCycLStr, "#$isa", DstCycLStr);
        CycBs->AddEdge(DstCycLStr, "~#$isa", SrcCycLStr);
      } else
      if (SubTok->IsTag("genl")){
        DstCycLStr=SubTok->GetArgVal("value");
        CycBs->AddEdge(SrcCycLStr, "#$genls", DstCycLStr);
        CycBs->AddEdge(DstCycLStr, "~#$genls", SrcCycLStr);
      } else {
        Fail;
      }
    }
  }
  printf("%d Docs\nDone.\n", XmlDocs);}

  // relevance
  {printf("Processing Relevance %s ...\n", CycRelevanceFNm.CStr());
  PSIn CycRelevanceSIn=TFIn::New(CycRelevanceFNm);
  PXmlDoc XmlDoc; int XmlDocs=0;
  TStr PrevCycStr;
  forever{
    // statistics
    XmlDocs++;
    if (XmlDocs%1000==0){
      printf("%d Docs\r", XmlDocs);}
    // load xml-tree
    XmlDoc=TXmlDoc::LoadTxt(CycRelevanceSIn);
    if (!XmlDoc->IsOk()){
      printf("%s\n", PrevCycStr.CStr());
      Fail;
    }
    // extract fields from xml-tree
    PXmlTok TopTok=XmlDoc->GetTok();
    if (TopTok->IsTag("end")){break;}
    IAssert(TopTok->IsTag("term"));
    TStr CycStr=TopTok->GetArgVal("cyc");
    PrevCycStr=CycStr;
    // terms that are not vertices of the cyc-base are silently skipped
    if (CycBs->IsVNm(CycStr)){
      const bool ThclP=(TopTok->GetArgVal("thcl")=="T");
      const bool IrrelP=(TopTok->GetArgVal("irrel")=="T");
      const bool ClarifyingP=(TopTok->GetArgVal("clarifying")=="T");
      if (ThclP){
        CycBs->GetVrtx(CycStr).SetFlag(cvfHumanRelevant, true);}
      if (IrrelP){
        CycBs->GetVrtx(CycStr).SetFlag(cfvHumanIrrelevant, true);}
      if (ClarifyingP){
        CycBs->GetVrtx(CycStr).SetFlag(cfvHumanClarifying, true);}
      if (ThclP||ClarifyingP){
        CycBs->GetVrtx(CycStr).SetFlag(cvfHumanOk, true);}
    }
  }
  printf("%d Docs\nDone.\n", XmlDocs);}

  // knowledge-base
  {printf("Processing KBase %s ...\n", CycKBaseFNm.CStr());
  PSIn CycKBaseSIn=TFIn::New(CycKBaseFNm);
  PXmlDoc XmlDoc; int XmlDocs=0;
  TStr PrevCycLStr; TStrV PrevArgCycLStrV;
  TStrIntH HdCycLToFq;
  forever{
    // statistics
    XmlDocs++;
    if (XmlDocs%1000==0){
      printf("%d Docs\r", XmlDocs);}
    //if (XmlDocs>10000){break;}
    // load xml-tree
    XmlDoc=TXmlDoc::LoadTxt(CycKBaseSIn);
    if (!XmlDoc->IsOk()){
      printf("%s\n", PrevCycLStr.CStr());
      for (int ArgN=0; ArgN<PrevArgCycLStrV.Len(); ArgN++){
        printf(" [%s]", PrevArgCycLStrV[ArgN].CStr());}
      printf("\n");
      Fail;
    }
    // extract fields from xml-tree
    PXmlTok TopTok=XmlDoc->GetTok();
    if (TopTok->IsTag("end")){break;}
    IAssert(TopTok->IsTag("sentence"));
    TStr CycLStr=TopTok->GetArgVal("cycl");
    TXmlTokV ArgXmlTokV; XmlDoc->GetTagTokV("sentence|arg", ArgXmlTokV);
    // arguments without a cycl attribute are recorded as the literal "Empty"
    TStrV ArgCycLStrV;
    for (int ArgN=0; ArgN<ArgXmlTokV.Len(); ArgN++){
      PXmlTok Tok=ArgXmlTokV[ArgN];
      IAssert(Tok->IsTag("arg"));
      if (Tok->IsArg("cycl")){
        TStr ArgCycLStr=Tok->GetArgVal("cycl");
        ArgCycLStrV.Add(ArgCycLStr);
      } else {
        ArgCycLStrV.Add("Empty");
      }
    }
    PrevCycLStr=CycLStr;
    PrevArgCycLStrV=ArgCycLStrV;
    // count sentences per "head - arity" key (arity excludes the head itself)
    if (ArgCycLStrV.Len()>0){
      HdCycLToFq.AddDat(ArgCycLStrV[0]+" - "+TInt::GetStr(ArgCycLStrV.Len()-1))++;}
    // insert binary predicates; #$isa, #$termOfUnit and #$genls are left out here
    if (ArgCycLStrV.Len()==3){
      TStr PredNm=ArgCycLStrV[0];
      if ((PredNm!="#$isa")&&(PredNm!="#$termOfUnit")&&(PredNm!="#$genls")){
        TStr BackLinkPredNm=TStr("~")+PredNm;
        TStr Arg1=ArgCycLStrV[1];
        TStr Arg2=ArgCycLStrV[2];
        CycBs->AddEdge(Arg1, PredNm, Arg2);
        CycBs->AddEdge(Arg2, BackLinkPredNm, Arg1);
      }
    }
  }
  // output top cycl relations
  // NOTE(review): the first column is 1+frequency, although the "%6d." format
  // reads like a rank (1+CycLN) - confirm which one is intended.
  {TFOut CycLSOut("CycKB-CycLFq.Stat.Txt"); FILE* fCycL=CycLSOut.GetFileId();
  TIntStrPrV FqCycLStrPrV; HdCycLToFq.GetDatKeyPrV(FqCycLStrPrV);
  FqCycLStrPrV.Sort(false);
  for (int CycLN=0; CycLN<FqCycLStrPrV.Len(); CycLN++){
    fprintf(fCycL, "%6d. %s\n",
     1+FqCycLStrPrV[CycLN].Val1, FqCycLStrPrV[CycLN].Val2.CStr());
  }}
  printf("%d Docs\nDone.\n", XmlDocs);}

  // return cyc-base
  return CycBs;
}