Beispiel #1
0
/////////////////////////////////////////////////
// SkyGrid-Document
void TSkyGridBinDoc::SaveBinDocV(
 const TStr& InXmlFPath, const TStr& OutBinFNm, const int& MxDocs){
  printf("Processing SkyGrid-News-Xml files from '%s'...\n", InXmlFPath.CStr());
  TFOut SOut(OutBinFNm);
  TFFile FFile(InXmlFPath, true); TStr FNm;
  int Docs=0; int DateDocs=0; uint64 PrevTm=0;
  while (FFile.Next(FNm)){
    if ((MxDocs!=-1)&&(Docs>=MxDocs)){break;}
    //printf("  Processing '%s' ...", FNm.CStr());
    PXmlDoc XmlDoc=TXmlDoc::LoadTxt(FNm);
    PXmlTok ContentTok=XmlDoc->GetTagTok("item|content");
    TStr SwIdStr=ContentTok->GetTagTok("swid")->GetArgVal("value");
    TStr UrlStr=ContentTok->GetTagTok("url")->GetTokStr(false);
    TStr TitleStr=ContentTok->GetTagTok("title")->GetTokStr(false);
    TStr FetchedValStr=ContentTok->GetTagTok("fetched")->GetArgVal("value");
    TXmlTokV EntityTokV; ContentTok->GetTagTokV("annotations|entity", EntityTokV);
    TStr BodyStr=ContentTok->GetTagTok("body")->GetTokStr(false);
    // extract date
    TStr DateStr=SwIdStr.GetSubStr(0, 7);
    TStr YearStr=DateStr.GetSubStr(0, 3);
    TStr MonthStr=DateStr.GetSubStr(4, 5);
    TStr DayStr=DateStr.GetSubStr(6, 7);
    TTm DateTm(YearStr.GetInt(), MonthStr.GetInt(), DayStr.GetInt());
    uint64 Tm=TTm::GetMSecsFromTm(DateTm);
    // extract entities
    TStrIntH EntNmToFqH;
    for (int EntityTokN=0; EntityTokN<EntityTokV.Len(); EntityTokN++){
      PXmlTok EntityTok=EntityTokV[EntityTokN];
      if (!EntityTok->IsTag("entity")){continue;}
      TStr CanonicalNm=EntityTok->GetArgVal("canonical", "");
      TStr TextStr=EntityTok->GetArgVal("text", "");
      TStr TypeNm=EntityTok->GetArgVal("type", "");
      TStr EntNm=CanonicalNm.Empty() ? TextStr : CanonicalNm;
      EntNmToFqH.AddDat(EntNm)++;
    }
    TIntStrPrV FqEntNmPrV; EntNmToFqH.GetDatKeyPrV(FqEntNmPrV); FqEntNmPrV.Sort(false);
    // extract headline
    TChA HeadlineChA=BodyStr.GetSubStr(0, 250);
    while ((HeadlineChA.Len()>0)&&(HeadlineChA.LastCh()!=' ')){
      HeadlineChA.Trunc(HeadlineChA.Len()-1);}
    HeadlineChA+="...";
    // create document
    TSkyGridBinDoc Doc(SwIdStr, Tm, TitleStr, HeadlineChA, FqEntNmPrV);
    // save document
    Doc.Save(SOut);
    // screen log
    if (PrevTm!=Tm){
      if (PrevTm!=0){printf("\n");}
      PrevTm=Tm; DateDocs=0;
    }
    Docs++; DateDocs++;
    printf("  %s [Day:%d / All:%d]\r", DateStr.CStr(), DateDocs, Docs);
  }
  printf("\nDone.\n");
}
/// Collects (document-count, name-string) pairs for all named objects whose
/// document count is at least MnFq.
///
/// @param NmObjFqStrPrV  output vector; re-generated here, previous content is discarded
/// @param MnFq           minimal document count a named object needs to be included
void TNmObjBs::GetNmObjFqStrPrV(TIntStrPrV& NmObjFqStrPrV, const int& MnFq) const {
  const int AllNmObjs=GetNmObjs();
  // start with zero elements, sized for the case that every object qualifies
  NmObjFqStrPrV.Gen(AllNmObjs, 0);
  int NmObjId=0;
  while (NmObjId<AllNmObjs){
    TIntStrPr FqStrPr;
    FqStrPr.Val1=GetNmObjDocs(NmObjId);
    FqStrPr.Val2=GetNmObjStr(NmObjId);
    const bool FrequentP=(FqStrPr.Val1>=MnFq);
    if (FrequentP){
      NmObjFqStrPrV.Add(FqStrPr);
    }
    NmObjId++;
  }
}
Beispiel #3
0
/// Adds one document to the base and links its entities to each other.
///
/// Entities whose per-document frequency is below GetMnEntFqPerDoc() are
/// dropped. If fewer than GetMnEntsPerDoc() entities remain, the document is
/// rejected. Note that AddEntNm() has already been called for the surviving
/// entity names at that point.
///
/// For an accepted document every ordered pair of distinct entities (E1, E2)
/// gets a link context on E1 pointing to E2, weighted by the product of the
/// two entity frequencies in this document.
///
/// @param SwIdStr      document identifier string
/// @param Tm           document time
/// @param TitleStr     document title
/// @param HeadlineStr  document headline
/// @param FqEntNmPrV   (frequency, entity-name) pairs of the document; taken by
///                     value because the signature has to match its declaration
/// @return the new document id, or -1 if the document was rejected
int TSkyGridBs::AddDoc(
 const TStr& SwIdStr, const uint64& Tm,
 const TStr& TitleStr, const TStr& HeadlineStr,
 const TIntStrPrV FqEntNmPrV){
  // create entity-id vector
  TIntPrV EntIdFqPrV(FqEntNmPrV.Len(), 0);
  for (int EntN=0; EntN<FqEntNmPrV.Len(); EntN++){
    TStr EntNm=FqEntNmPrV[EntN].Val2;
    int EntFq=FqEntNmPrV[EntN].Val1;
    if (EntFq>=GetMnEntFqPerDoc()){
      int EntId=AddEntNm(EntNm);
      EntIdFqPrV.Add(TIntPr(EntId, EntFq));
    }
  }
  // check entity-id vector length
  if (EntIdFqPrV.Len()<GetMnEntsPerDoc()){
    return -1;}
  // create document
  PSkyGridDoc Doc=
   TSkyGridDoc::New(SwIdStr, Tm, TitleStr, HeadlineStr, EntIdFqPrV);
  // add document to base
  int DocId=GetNewDocId();
  IdToDocH.AddDat(DocId, Doc);
  // record the document on each of its entities
  for (int EntN=0; EntN<EntIdFqPrV.Len(); EntN++){
    int EntId=EntIdFqPrV[EntN].Val1;
    TSkyGridEnt& Ent=GetEnt(EntId);
    Ent.PushDocId(DocId);
  }
  // create links between all ordered pairs of distinct entities
  for (int EntN1=0; EntN1<EntIdFqPrV.Len(); EntN1++){
    int EntId1=EntIdFqPrV[EntN1].Val1;
    TSkyGridEnt& Ent1=GetEnt(EntId1);
    int EntWgt1=EntIdFqPrV[EntN1].Val2;
    for (int EntN2=0; EntN2<EntIdFqPrV.Len(); EntN2++){
      if (EntN1==EntN2){continue;}
      int EntId2=EntIdFqPrV[EntN2].Val1;
      int EntWgt2=EntIdFqPrV[EntN2].Val2;
      // entity-link-weight
      int EntLinkWgt=EntWgt1*EntWgt2;
      // create entity-link-context
      TSkyGridEntLinkCtx LinkCtx(EntLinkWgt, DocId, Tm);
      Ent1.AddLink(EntId2, LinkCtx);
    }
  }

  // return doc-id
  return DocId;
}
Beispiel #4
0
/// Builds a multi-model holding one model per category returned by
/// BowDocBs->GetTopCatV(TopCats, ...).
///
/// @param BowDocBs  document base supplying the categories
/// @param TopCats   passed to GetTopCatV to select the categories
/// @param Beta      passed unchanged to New() for every per-category model
/// @return the multi-model containing all per-category models
PBowMd TBowWinnowMd::NewMulti(
 const PBowDocBs& BowDocBs, const int& TopCats, const double& Beta){
  // the raw pointer is kept for AddBowMd(); the smart pointer is what is returned
  TBowMultiMd* MultiMd=new TBowMultiMd(BowDocBs);
  PBowMd MultiBowMd(MultiMd);
  // fetch the categories to train on
  TIntStrPrV FqCatNmPrV;
  BowDocBs->GetTopCatV(TopCats, FqCatNmPrV);
  const int SelCats=FqCatNmPrV.Len();
  int CatN=0;
  while (CatN<SelCats){
    TStr CatNm=FqCatNmPrV[CatN].Val2;
    int CId=BowDocBs->GetCId(CatNm);
    // progress header
    printf("*** Generating model for category: '%s' %d Docs (%d/%d Cats)\n",
     CatNm.CStr(), BowDocBs->GetCatFq(CId), 1+CId, BowDocBs->GetCats());
    // train the model for this category and hand it to the multi-model
    PBowMd CatBowMd=New(BowDocBs, CatNm, Beta);
    MultiMd->AddBowMd(CatBowMd);
    CatN++;
  }
  return MultiBowMd;
}
Beispiel #5
0
/// Click handler shared by the three named-object sort radio-groups
/// (context, source, destination).
///
/// Refills the list-box paired with the clicked radio-group with the named
/// objects obtained with a minimal frequency of 3, ordered either by name
/// (item 0) or by frequency (item 1; any other index is reset to 1). The
/// concept-word and co-named-object list-boxes are cleared as well.
///
/// @param Sender  the control that raised the event; calls from any other
///                control are ignored
void __fastcall TContexterF::NmObjSortRgClick(TObject *Sender){
  // pick the radio-group / list-box pair that belongs to the sender
  TRadioGroup* SortRg=NULL;
  TListBox* TargetLb=NULL;
  if (Sender==CtxNmObjSortRg){
    SortRg=CtxNmObjSortRg; TargetLb=CtxNmObjLb;
  } else if (Sender==SrcNmObjSortRg){
    SortRg=SrcNmObjSortRg; TargetLb=SrcNmObjLb;
  } else if (Sender==DstNmObjSortRg){
    SortRg=DstNmObjSortRg; TargetLb=DstNmObjLb;
  } else {
    return;
  }

  // item 0 sorts by name; item 1 sorts by frequency; anything else is
  // normalised to item 1
  bool ByNameP;
  if (SortRg->ItemIndex==0){
    ByNameP=true;
  } else {
    if (SortRg->ItemIndex!=1){SortRg->ItemIndex=1;}
    ByNameP=false;
  }

  // reset the list-boxes
  TargetLb->Clear();
  EnConceptWordLb->Clear();
  EnCoNmObjLb->Clear();

  if (!ByNameP){
    // (frequency, name) pairs, ordered with Sort(false)
    TIntStrPrV FqStrPrV;
    State->NmObjBs->GetNmObjFqStrPrV(FqStrPrV, 3);
    FqStrPrV.Sort(false);
    const int Items=FqStrPrV.Len();
    for (int ItemN=0; ItemN<Items; ItemN++){
      TStr ItemStr=FqStrPrV[ItemN].Val2+
       TInt::GetStr(FqStrPrV[ItemN].Val1, " (%d)");
      TargetLb->Items->Add(ItemStr.CStr());
    }
  } else {
    // (name, frequency) pairs, ordered with the default Sort()
    TStrIntPrV StrFqPrV;
    State->NmObjBs->GetNmObjStrFqPrV(StrFqPrV, 3);
    StrFqPrV.Sort();
    const int Items=StrFqPrV.Len();
    for (int ItemN=0; ItemN<Items; ItemN++){
      TStr ItemStr=StrFqPrV[ItemN].Val1+
       TInt::GetStr(StrFqPrV[ItemN].Val2, " (%d)");
      TargetLb->Items->Add(ItemStr.CStr());
    }
  }
}
Beispiel #6
0
int main(int argc, char* argv[]){
  Try;
  // create environment
  Env=TEnv(argc, argv, TNotify::StdNotify);
  // get command line parameters
  Env.PrepArgs("Crawl-Base to Text", 0);
  TStr InCrawlBsFNm=Env.GetIfArgPrefixStr("-i:", "", "Crawl-Base-FileName");
  TStr OutTxtFNm=Env.GetIfArgPrefixStr("-ot:", "Crawl.Txt", "Output-Text-Filename");
  TStr OutStatFNm=Env.GetIfArgPrefixStr("-os:", "Crawl.Stat.Txt", "Output-Statistics-Text-Filename");
  bool SaveContP=Env.GetIfArgPrefixBool("-sc:", false, "Save-Content");
  bool SaveContOutUrlP=Env.GetIfArgPrefixBool("-scou:", true, "Save-Content-Outgoing-Urls");
  bool SaveContTagP=Env.GetIfArgPrefixBool("-sct:", true, "Save-Content-Tags");
  bool SaveOutUrlP=Env.GetIfArgPrefixBool("-sou:", false, "Save-Outgoing-Urls");
  bool SaveCTxtP=Env.GetIfArgPrefixBool("-sctc:", false, "Save-Continuos-Text-Content");
  int MnCTxtToks=Env.GetIfArgPrefixInt("-mctt:", 100, "Minimal-Continuos-Text-Tokens");
  TStrV BlockedDmNmV=Env.GetIfArgPrefixStrV("-bd:", "Blocked-Domain-Names (multiple)");
  if (Env.IsEndOfRun()){return 0;}
  // -i:si -sc:t -scou:n -sct:n -sctc:t -bd:.delo.si -bd:.dnevnik.si -bd:.vecer.si

  TStr BlobBsFMid=InCrawlBsFNm.GetFMid();
  // output file
  TFOut TxtFOut(OutTxtFNm); FILE* fTxt=TxtFOut.GetFileId();
  fprintf(fTxt, "Comment:input=%s\n", InCrawlBsFNm.CStr());
  fprintf(fTxt, "Comment:output=%s\n", OutTxtFNm.CStr());
  fprintf(fTxt, "BlobBaseName:%s\n", BlobBsFMid.CStr());
  // statistics
  TStrIntH HostNmToFqH;
  TStrIntH StatusCdToFqH;
  TStrIntH ContTypeToFqH;
  PMom HttpContLenMom=TMom::New();

  PBlobBs CrawlBBs=TMBlobBs::New(InCrawlBsFNm);
  TBlobPt TrvCrawlBPt=CrawlBBs->FFirstBlobPt();
  TBlobPt CrawlBPt; PSIn CrawlBlobSIn; int CrawlBlobN=0;
  while (CrawlBBs->FNextBlobPt(TrvCrawlBPt, CrawlBPt, CrawlBlobSIn)){
    CrawlBlobN++; printf("%d\r", CrawlBlobN);
    TStr DateTimeStr(*CrawlBlobSIn); //TStr DateTimeStr;
    TStr UrlStr(*CrawlBlobSIn);
    PUrl Url=TUrl::New(UrlStr); IAssert(Url->IsOk(usHttp));
    TMem HttpRespMem(*CrawlBlobSIn);
    PSIn HttpRespSIn=HttpRespMem.GetSIn();
    PHttpResp HttpResp=THttpResp::New(HttpRespSIn);
    // statistics
    HostNmToFqH.AddDat(Url->GetHostNm())++;
    StatusCdToFqH.AddDat(TInt::GetStr(HttpResp->GetStatusCd()))++;
    ContTypeToFqH.AddDat(HttpResp->GetFldVal(THttp::ContTypeFldNm))++;
    int ContLen=HttpResp->GetFldVal(THttp::ContLenFldNm).GetInt(-1);
    if (ContLen!=-1){
      HttpContLenMom->Add(ContLen);}
    // check blocked domain-names
    if (!BlockedDmNmV.Empty()){
      TStr DmNm=Url->GetDmNm(); int BlockedDmP=false;
      for (int BDmNmN=0; BDmNmN<BlockedDmNmV.Len(); BDmNmN++){
        if (DmNm.IsSuffix(BlockedDmNmV[BDmNmN])){
          BlockedDmP=true; break;
        }
      }
      if (BlockedDmP){
        continue;
      }
    }
    // check continuos-text
    if (SaveCTxtP&&IsCTxtHttpResp(Url, HttpResp, MnCTxtToks)){continue;}
    if (HttpResp->IsStatusCd_Ok()){
      PWebPg WebPg=TWebPg::New(UrlStr, HttpResp);
      fprintf(fTxt, "Start:HttpOk\n");
      fprintf(fTxt, "BlobBaseAddress:bb://%s/%d/%d\n",
       BlobBsFMid.CStr(), CrawlBPt.GetSeg(), CrawlBPt.GetAddr());
      fprintf(fTxt, "DateTime:%s\n", DateTimeStr.CStr());
      fprintf(fTxt, "Url:%s\n", UrlStr.CStr());
      fprintf(fTxt, "UrlMd5:%s\n", TMd5Sig(UrlStr).GetStr().CStr());
      fprintf(fTxt, "AtomName:%s\n", TUrl::GetTopDownDocNm(UrlStr).CStr());
      for (int FldN=0; FldN<HttpResp->GetFlds(); FldN++){
        TStr FldNm; TStr FldVal; HttpResp->GetFldNmVal(FldN, FldNm, FldVal);
        fprintf(fTxt, "HttpField:%s=%s\n", FldNm.CStr(), FldVal.CStr());
      }
      TMem BodyMem=HttpResp->GetBodyAsMem();
      fprintf(fTxt, "BodyMd5:%s\n", TMd5Sig(BodyMem).GetStr().CStr());
      // text
      if (SaveContP){
        if (HttpResp->IsContType(THttp::TextHtmlFldVal)){
          TStr HtmlStr=BodyMem.GetAsStr();
          TStr TxtStr=THtmlDoc::GetTxtLnDoc(HtmlStr, UrlStr, SaveContOutUrlP, SaveContTagP);
          fprintf(fTxt, "Content:%s\n", TxtStr.CStr());
        }
      }
      // outgoing-urls
      if (SaveOutUrlP){
        TUrlV OutUrlV; WebPg->GetOutUrlV(OutUrlV);
        for (int OutUrlN=0; OutUrlN<OutUrlV.Len(); OutUrlN++){
          TStr OutUrlStr=OutUrlV[OutUrlN]->GetUrlStr();
          fprintf(fTxt, "OutUrl:%s\n", OutUrlStr.CStr());
        }
      }
      fprintf(fTxt, "End:HttpOk\n");
    } else
    if (HttpResp->IsStatusCd_Redir()){
      TStr RedirUrlStr=HttpResp->GetFldVal(THttp::LocFldNm);
      PUrl RedirUrl=TUrl::New(RedirUrlStr, UrlStr);
      if (RedirUrl->IsOk(usHttp)){
        TStr RedirUrlStr=RedirUrl->GetUrlStr();
        fprintf(fTxt, "Start:HttpRedirection\n");
        fprintf(fTxt, "BlobBaseAddress:bb://%s/%d/%d\n",
         BlobBsFMid.CStr(), CrawlBPt.GetSeg(), CrawlBPt.GetAddr());
        fprintf(fTxt, "DateTime:%s\n", DateTimeStr.CStr());
        fprintf(fTxt, "Url:%s\n", UrlStr.CStr());
        fprintf(fTxt, "UrlMd5:%s\n", TMd5Sig(UrlStr).GetStr().CStr());
        fprintf(fTxt, "AtomName:%s\n", TUrl::GetTopDownDocNm(UrlStr).CStr());
        fprintf(fTxt, "RedirectionUrl:%s\n", RedirUrlStr.CStr());
        for (int FldN=0; FldN<HttpResp->GetFlds(); FldN++){
          TStr FldNm; TStr FldVal; HttpResp->GetFldNmVal(FldN, FldNm, FldVal);
          fprintf(fTxt, "HttpField:%s=%s\n", FldNm.CStr(), FldVal.CStr());
        }
        fprintf(fTxt, "End:HttpRedirection\n");
      }
    }
  }

  // statistics
  HttpContLenMom->Def();
  if (!OutStatFNm.Empty()){
    TFOut StatFOut(OutStatFNm); FILE* fStat=StatFOut.GetFileId();
    TIntStrPrV FqStatusCdPrV; StatusCdToFqH.GetDatKeyPrV(FqStatusCdPrV);
    TIntStrPrV FqContTypePrV; ContTypeToFqH.GetDatKeyPrV(FqContTypePrV);
    // hosts
    {fprintf(fStat, "================================================\n");
    TIntStrPrV FqHostNmPrV; HostNmToFqH.GetDatKeyPrV(FqHostNmPrV);
    FqHostNmPrV.Sort(false); int HostNmsSum=0;
    fprintf(fStat, "Hosts (%d):\n", FqHostNmPrV.Len());
    for (int HostNmN=0; HostNmN<FqHostNmPrV.Len(); HostNmN++){
      fprintf(fStat, "%7d   '%s'\n",
       FqHostNmPrV[HostNmN].Val1, FqHostNmPrV[HostNmN].Val2.CStr());
      HostNmsSum+=FqHostNmPrV[HostNmN].Val1;
    }
    fprintf(fStat, "----------\n");
    fprintf(fStat, "%7d   %s\n", HostNmsSum, "Sum");
    fprintf(fStat, "================================================\n");}
    // status-code
    {fprintf(fStat, "================================================\n");
    TIntStrPrV FqStatusCdPrV; StatusCdToFqH.GetDatKeyPrV(FqStatusCdPrV);
    FqStatusCdPrV.Sort(false); int StatusCdsSum=0;
    fprintf(fStat, "Status-Codes (%d):\n", FqStatusCdPrV.Len());
    for (int StatusCdN=0; StatusCdN<FqStatusCdPrV.Len(); StatusCdN++){
      fprintf(fStat, "%7d   '%s'\n",
       FqStatusCdPrV[StatusCdN].Val1, FqStatusCdPrV[StatusCdN].Val2.CStr());
      StatusCdsSum+=FqStatusCdPrV[StatusCdN].Val1;
    }
    fprintf(fStat, "----------\n");
    fprintf(fStat, "%7d   %s\n", StatusCdsSum, "Sum");
    fprintf(fStat, "================================================\n");}
    // content-type
    {fprintf(fStat, "================================================\n");
    TIntStrPrV FqContTypePrV; ContTypeToFqH.GetDatKeyPrV(FqContTypePrV);
    FqContTypePrV.Sort(false); int ContTypesSum=0;
    fprintf(fStat, "Content-Types (%d):\n", FqContTypePrV.Len());
    for (int ContTypeN=0; ContTypeN<FqContTypePrV.Len(); ContTypeN++){
      fprintf(fStat, "%7d   '%s'\n",
       FqContTypePrV[ContTypeN].Val1, FqContTypePrV[ContTypeN].Val2.CStr());
      ContTypesSum+=FqContTypePrV[ContTypeN].Val1;
    }
    fprintf(fStat, "----------\n");
    fprintf(fStat, "%7d   %s\n", ContTypesSum, "Sum");
    fprintf(fStat, "================================================\n");}
    // content-length
    {fprintf(fStat, "================================================\n");
    fprintf(fStat, "Content-length:\n");
    if (HttpContLenMom->IsUsable()){
      TStr MomStr=HttpContLenMom->GetStr('\n', ':', true, false, "%g");
      fprintf(fStat, "%s\n", MomStr.CStr());
    } else {
      fprintf(fStat, "Statistics not usable.\n");
    }
    fprintf(fStat, "================================================\n");}
  }

  return 0;
  Catch;
  return 1;
}
Beispiel #7
0
/// Builds a Cyc base from the four XML dump files found in a directory.
///
/// The files are read in this order, each as a sequence of XML documents
/// terminated by a document whose top tag is "end":
///   - lexicon-dump.xml   : <word string=... cycl=...>; adds "#$nameString"
///                          edges between the CycL term and the word
///   - taxonomy-dump.xml  : <term cycl=...> with <isa>/<genl> sub-tokens; adds
///                          "#$isa" / "#$genls" edges
///   - relevance-dump.xml : <term cyc=... thcl=... irrel=... clarifying=...>;
///                          sets flags on vertices that already exist
///   - kb-dump.xml        : <sentence> with <arg> tokens; three-argument
///                          sentences add an edge for predicates other than
///                          #$isa, #$termOfUnit and #$genls
/// Every edge is added together with a back-link whose predicate name is
/// prefixed by "~". A frequency table of sentence heads is written to
/// "CycKB-CycLFq.Stat.Txt" in the current directory.
///
/// A document that fails to parse prints the last successfully read record
/// and then triggers Fail.
///
/// @param FPath  directory containing the dump files
/// @return the populated Cyc base
PCycBs TCycBs::LoadCycXmlDump(const TStr& FPath){
  // file-names
  TStr NrFPath=TStr::GetNrFPath(FPath);
  TStr CycLexiconFNm=NrFPath+"lexicon-dump.xml";
  TStr CycTaxonomyFNm=NrFPath+"taxonomy-dump.xml";
  TStr CycRelevanceFNm=NrFPath+"relevance-dump.xml";
  TStr CycKBaseFNm=NrFPath+"kb-dump.xml";

  // create cyc-base
  PCycBs CycBs=TCycBs::New();

  // lexicon
  {printf("Processing Lexicon %s ...\n", CycLexiconFNm.CStr());
  PSIn CycLexiconSIn=TFIn::New(CycLexiconFNm);
  PXmlDoc XmlDoc; int XmlDocs=0;
  TStr PrevCycWStr; TStr PrevCycLStr;
  forever{
    // statistics
    XmlDocs++; if (XmlDocs%1000==0){printf("%d Docs\r", XmlDocs);}
    // load xml-tree
    XmlDoc=TXmlDoc::LoadTxt(CycLexiconSIn);
    if (!XmlDoc->IsOk()){
      // report the last good record to help locate the broken one
      printf("%s - %s\n", PrevCycWStr.CStr(), PrevCycLStr.CStr());
      Fail;
    }
    // extract fields from xml-tree
    PXmlTok TopTok=XmlDoc->GetTok();
    if (TopTok->IsTag("end")){break;}
    IAssert(TopTok->IsTag("word"));
    TStr CycWStr=TopTok->GetArgVal("string");
    TStr CycLStr=TopTok->GetArgVal("cycl");
    // remember both fields for the diagnostic above (the CycL string was
    // previously never stored, so the diagnostic printed it as empty)
    PrevCycWStr=CycWStr; PrevCycLStr=CycLStr;
    // insert data
    CycBs->AddEdge(CycLStr, "#$nameString", CycWStr);
    CycBs->AddEdge(CycWStr, "~#$nameString", CycLStr);
  }
  printf("%d Docs\nDone.\n", XmlDocs);}

  // taxonomy
  {printf("Processing Taxonomy %s ...\n", CycTaxonomyFNm.CStr());
  PSIn CycTaxonomySIn=TFIn::New(CycTaxonomyFNm);
  PXmlDoc XmlDoc; int XmlDocs=0;
  TStr PrevSrcCycLStr;
  forever{
    // statistics
    XmlDocs++;
    if (XmlDocs%1000==0){
      printf("%d Docs\r", XmlDocs);}
    // load xml-tree
    XmlDoc=TXmlDoc::LoadTxt(CycTaxonomySIn);
    if (!XmlDoc->IsOk()){
      printf("%s\n", PrevSrcCycLStr.CStr());
      Fail;
    }
    // extract fields from xml-tree
    PXmlTok TopTok=XmlDoc->GetTok();
    if (TopTok->IsTag("end")){break;}
    IAssert(TopTok->IsTag("term"));
    TStr SrcCycLStr=TopTok->GetArgVal("cycl");
    PrevSrcCycLStr=SrcCycLStr;
    // every sub-token must be <isa> or <genl>; anything else is fatal
    for (int SubTokN=0; SubTokN<TopTok->GetSubToks(); SubTokN++){
      PXmlTok SubTok=TopTok->GetSubTok(SubTokN);
      TStr DstCycLStr=SubTok->GetTagNm();
      if (SubTok->IsTag("isa")){
        DstCycLStr=SubTok->GetArgVal("value");
        CycBs->AddEdge(SrcCycLStr, "#$isa", DstCycLStr);
        CycBs->AddEdge(DstCycLStr, "~#$isa", SrcCycLStr);
      } else
      if (SubTok->IsTag("genl")){
        DstCycLStr=SubTok->GetArgVal("value");
        CycBs->AddEdge(SrcCycLStr, "#$genls", DstCycLStr);
        CycBs->AddEdge(DstCycLStr, "~#$genls", SrcCycLStr);
      } else {
        Fail;
      }
    }
  }
  printf("%d Docs\nDone.\n", XmlDocs);}

  // relevance
  {printf("Processing Relevance %s ...\n", CycRelevanceFNm.CStr());
  PSIn CycRelevanceSIn=TFIn::New(CycRelevanceFNm);
  PXmlDoc XmlDoc; int XmlDocs=0;
  TStr PrevCycStr;
  forever{
    // statistics
    XmlDocs++;
    if (XmlDocs%1000==0){
      printf("%d Docs\r", XmlDocs);}
    // load xml-tree
    XmlDoc=TXmlDoc::LoadTxt(CycRelevanceSIn);
    if (!XmlDoc->IsOk()){
      printf("%s\n", PrevCycStr.CStr());
      Fail;
    }
    // extract fields from xml-tree
    PXmlTok TopTok=XmlDoc->GetTok();
    if (TopTok->IsTag("end")){break;}
    IAssert(TopTok->IsTag("term"));
    TStr CycStr=TopTok->GetArgVal("cyc");
    PrevCycStr=CycStr;
    // terms without a vertex in the base are silently skipped
    //IAssert(CycBs->IsVNm(CycStr));
    if (CycBs->IsVNm(CycStr)){
      if (TopTok->GetArgVal("thcl")=="T"){
        CycBs->GetVrtx(CycStr).SetFlag(cvfHumanRelevant, true);}
      if (TopTok->GetArgVal("irrel")=="T"){
        CycBs->GetVrtx(CycStr).SetFlag(cfvHumanIrrelevant, true);}
      if (TopTok->GetArgVal("clarifying")=="T"){
        CycBs->GetVrtx(CycStr).SetFlag(cfvHumanClarifying, true);}
      // "ok" = relevant or clarifying
      if ((TopTok->GetArgVal("thcl")=="T")||(TopTok->GetArgVal("clarifying")=="T")){
        CycBs->GetVrtx(CycStr).SetFlag(cvfHumanOk, true);}
    } else {
      //printf("%s\n", CycStr.CStr());
    }
  }
  printf("%d Docs\nDone.\n", XmlDocs);}

  // knowledge-base
  {printf("Processing KBase %s ...\n", CycKBaseFNm.CStr());
  PSIn CycKBaseSIn=TFIn::New(CycKBaseFNm);
  PXmlDoc XmlDoc; int XmlDocs=0;
  TStr PrevCycLStr; TStrV PrevArgCycLStrV;
  TStrIntH HdCycLToFq;
  forever{
    // statistics
    XmlDocs++;
    if (XmlDocs%1000==0){
      printf("%d Docs\r", XmlDocs);}
    //if (XmlDocs>10000){break;}
    // load xml-tree
    XmlDoc=TXmlDoc::LoadTxt(CycKBaseSIn);
    if (!XmlDoc->IsOk()){
      printf("%s\n", PrevCycLStr.CStr());
      for (int ArgN=0; ArgN<PrevArgCycLStrV.Len(); ArgN++){
        printf(" [%s]", PrevArgCycLStrV[ArgN].CStr());}
      printf("\n");
      Fail;
    }
    // extract fields from xml-tree
    PXmlTok TopTok=XmlDoc->GetTok();
    if (TopTok->IsTag("end")){break;}
    IAssert(TopTok->IsTag("sentence"));
    TStr CycLStr=TopTok->GetArgVal("cycl");
    TXmlTokV ArgXmlTokV; XmlDoc->GetTagTokV("sentence|arg", ArgXmlTokV);
    // collect the CycL string of every argument; arguments without a "cycl"
    // attribute are represented by the placeholder "Empty"
    TStrV ArgCycLStrV;
    for (int ArgN=0; ArgN<ArgXmlTokV.Len(); ArgN++){
      PXmlTok Tok=ArgXmlTokV[ArgN];
      IAssert(Tok->IsTag("arg"));
      if (Tok->IsArg("cycl")){
        TStr ArgCycLStr=Tok->GetArgVal("cycl");
        ArgCycLStrV.Add(ArgCycLStr);
      } else {
        ArgCycLStrV.Add("Empty");
      }
    }
    PrevCycLStr=CycLStr;
    PrevArgCycLStrV=ArgCycLStrV;
    // count sentence heads, keyed by "<first argument> - <number of further arguments>"
    if (ArgCycLStrV.Len()>0){
      HdCycLToFq.AddDat(ArgCycLStrV[0]+" - "+TInt::GetStr(ArgCycLStrV.Len()-1))++;}
    // insert: only (predicate, arg1, arg2) sentences become edges; the three
    // excluded predicates are not taken from this file
    if (ArgCycLStrV.Len()==3){
      TStr PredNm=ArgCycLStrV[0];
      if ((PredNm!="#$isa")&&(PredNm!="#$termOfUnit")&&(PredNm!="#$genls")){
        TStr BackLinkPredNm=TStr("~")+PredNm;
        TStr Arg1=ArgCycLStrV[1];
        TStr Arg2=ArgCycLStrV[2];
        CycBs->AddEdge(Arg1, PredNm, Arg2);
        CycBs->AddEdge(Arg2, BackLinkPredNm, Arg1);
      }
    }
  }
  // output top cycl relations
  // NOTE(review): the first column is 1+frequency, not a rank or the raw
  // frequency; kept as is, confirm whether 1+CycLN was intended.
  {TFOut CycLSOut("CycKB-CycLFq.Stat.Txt"); FILE* fCycL=CycLSOut.GetFileId();
  TIntStrPrV FqCycLStrPrV; HdCycLToFq.GetDatKeyPrV(FqCycLStrPrV);
  FqCycLStrPrV.Sort(false);
  for (int CycLN=0; CycLN<FqCycLStrPrV.Len(); CycLN++){
    fprintf(fCycL, "%6d. %s\n", 1+FqCycLStrPrV[CycLN].Val1, FqCycLStrPrV[CycLN].Val2.CStr());
  }}
  printf("%d Docs\nDone.\n", XmlDocs);}

  // return cyc-base
  return CycBs;
}