Ejemplo n.º 1
0
void TSkyGridBs::GetWordStrWgtPrVChA(
 const TStrFltPrV& WordStrWgtPrV, TChA& WordStrWgtPrVChA){
  WordStrWgtPrVChA.Clr();
  for (int WordN=0; WordN<WordStrWgtPrV.Len(); WordN++){
    TStr WStr=WordStrWgtPrV[WordN].Val1;
    double WWgt=WordStrWgtPrV[WordN].Val2;
    if (WordN>0){WordStrWgtPrVChA+=' ';}
    WordStrWgtPrVChA+=TStr::Fmt("['%s':%.3f]", WStr.CStr(), WWgt);
  }
}
Ejemplo n.º 2
0
// Click handler of the named-object list box.
// Takes the selected item, stores its leading part (up to the first blank)
// in State->EnRootNmObjStr and then refreshes three views for that root:
// the concept-word list box, the co-referenced named-object list box and
// the context tree (followed by a repaint through EnPbPaint).
void __fastcall TContexterF::CtxNmObjLbClick(TObject *Sender){
  TListBox* NmObjLb=CtxNmObjLb;
  // leave unless the selection index addresses an existing item
  const int SelItemN=NmObjLb->ItemIndex;
  const bool IsSelOk=(0<=SelItemN)&&(SelItemN<NmObjLb->Items->Count);
  if (!IsSelOk){return;}
  // split the item text on a blank; the left part becomes the root
  // named-object, the right part (FqStr) is not used any further
  TStr SelItemStr=NmObjLb->Items->Strings[SelItemN].c_str();
  TStr FqStr;
  SelItemStr.SplitOnCh(State->EnRootNmObjStr, ' ', FqStr);

  // concept vector of the root named-object -> weighted words
  PBowSpV ConceptSpV=State->NmObjBs->GetNmObjConcept(
   State->BowDocBs, State->BowDocWgtBs, State->EnRootNmObjStr);
  TStrFltPrV WordStrWgtPrV;
  ConceptSpV->GetWordStrWgtPrV(State->BowDocBs, 100, 0.66, WordStrWgtPrV);
  // one list-box line per word: "<word> (<weight>)"
  EnConceptWordLb->Clear();
  const int Words=WordStrWgtPrV.Len();
  for (int WordN=0; WordN<Words; WordN++){
    TStr WgtStr=TFlt::GetStr(WordStrWgtPrV[WordN].Val2, " (%.3f)");
    TStr WordItemStr=WordStrWgtPrV[WordN].Val1+WgtStr;
    EnConceptWordLb->Items->Add(WordItemStr.CStr());
  }

  // co-referenced named-objects: keep the 100 largest (frequency, id) pairs
  TIntPrV FqNmObjIdPrV;
  State->NmObjBs->GetFqNmObjIdPrV(State->EnRootNmObjStr, FqNmObjIdPrV);
  FqNmObjIdPrV.Sort(false);
  FqNmObjIdPrV.Trunc(100);
  // one list-box line per object: "<name> (<frequency>)", root itself skipped
  EnCoNmObjLb->Clear();
  const int CoNmObjs=FqNmObjIdPrV.Len();
  for (int CoNmObjN=0; CoNmObjN<CoNmObjs; CoNmObjN++){
    TStr CoNmObjStr=State->NmObjBs->GetNmObjStr(FqNmObjIdPrV[CoNmObjN].Val2);
    if (!(State->EnRootNmObjStr!=CoNmObjStr)){continue;}
    TStr CoItemStr=CoNmObjStr+TInt::GetStr(FqNmObjIdPrV[CoNmObjN].Val1, " (%d)");
    EnCoNmObjLb->Items->Add(CoItemStr.CStr());
  }

  // rebuild the context tree around the new root and repaint it
  State->EnCtxTree=GetCtxTreeGraph(
   State->NmObjBs, State->EnRootNmObjStr, State->EnDrawLevels-1);
  EnPbPaint(Sender);
}
Ejemplo n.º 3
0
// Computes the per-word weight difference (new minus old) between two
// (word, weight) vectors and splits it by sign.
//   OldWordStrWgtPrV ....... weights that are subtracted
//   NewWordStrWgtPrV ....... weights that are taken as the starting value
//   NegDiffWordStrWgtPrV ... output: words whose difference is below zero,
//                            in ascending order of the difference
//   PosDiffWordStrWgtPrV ... output: words whose difference is above zero,
//                            in descending order of the difference
// A word present in only one input is treated as having no weight in the
// other. Words with a difference of exactly zero appear in neither output.
void TSkyGridBs::GetWordStrWgtPrVDiff(
 const TStrFltPrV& OldWordStrWgtPrV, const TStrFltPrV& NewWordStrWgtPrV,
 TStrFltPrV& NegDiffWordStrWgtPrV, TStrFltPrV& PosDiffWordStrWgtPrV){
  TStrFltH WordStrToWgtH;
  // start from the new vector
  for (int WordN=0; WordN<NewWordStrWgtPrV.Len(); WordN++){
    TStr WStr=NewWordStrWgtPrV[WordN].Val1;
    double WWgt=NewWordStrWgtPrV[WordN].Val2;
    WordStrToWgtH.AddDat(WStr, WWgt);
  }
  // subtract the old vector
  for (int WordN=0; WordN<OldWordStrWgtPrV.Len(); WordN++){
    TStr WStr=OldWordStrWgtPrV[WordN].Val1;
    double WWgt=OldWordStrWgtPrV[WordN].Val2;
    // AddDat(key) also creates the entry for words missing from the new vector
    double CurWWgt=WordStrToWgtH.AddDat(WStr);
    WordStrToWgtH.AddDat(WStr, CurWWgt-WWgt);
  }
  // extract (difference, word) pairs
  TFltStrPrV DiffWordWgtStrPrV; WordStrToWgtH.GetDatKeyPrV(DiffWordWgtStrPrV);
  // negative-vector: ascending order, strictly negative differences only
  DiffWordWgtStrPrV.Sort(true);
  NegDiffWordStrWgtPrV.Gen(DiffWordWgtStrPrV.Len(), 0);
  for (int WordN=0; WordN<DiffWordWgtStrPrV.Len(); WordN++){
    TStr WStr=DiffWordWgtStrPrV[WordN].Val2;
    double WWgt=DiffWordWgtStrPrV[WordN].Val1;
    if (WWgt<0){NegDiffWordStrWgtPrV.Add(TStrFltPr(WStr, WWgt));}
  }
  // positive-vector: descending order, strictly positive differences only
  DiffWordWgtStrPrV.Sort(false);
  PosDiffWordStrWgtPrV.Gen(DiffWordWgtStrPrV.Len(), 0);
  for (int WordN=0; WordN<DiffWordWgtStrPrV.Len(); WordN++){
    TStr WStr=DiffWordWgtStrPrV[WordN].Val2;
    double WWgt=DiffWordWgtStrPrV[WordN].Val1;
    if (WWgt>0){PosDiffWordStrWgtPrV.Add(TStrFltPr(WStr, WWgt));}
  }
}
Ejemplo n.º 4
0
// Click handler of the "Vizualize" button.
// Does nothing when State->BowDocBs is empty. Otherwise it
//   1. reads the cluster count and the similarity percentage from the edit
//      boxes (fallback values 10 and 0.3; the percentage is divided by 100),
//   2. partitions all documents with k-means and stores the partition in
//      State->TmBowDocPart,
//   3. builds a graph with one vertex per cluster (labelled with the document
//      count and up to 15 top words) and one edge per entry returned by
//      GetTopClustSimV (labelled with the similarity),
//   4. stores and lays out the graph in State->TmClustGraph, refreshes the
//      cluster rectangles and repaints through TmPbPaint.
void __fastcall TContexterF::VizualizeBtClick(TObject *Sender){
  if (!State->BowDocBs.Empty()){
    // parameters
    int Clusts=TStr(VizClustsEd->Text.c_str()).GetInt(10);
    double ClustSimSumPrc=TStr(VizClustSimSumPrcEd->Text.c_str()).GetFlt(0.3)/100;

    // get doc-ids
    TIntV AllDIdV; State->BowDocBs->GetAllDIdV(AllDIdV);

    // get document partition
    PBowSim BowSim=TBowSim::New(bstCos); // similarity object
    TBowWordWgtType WordWgtType=bwwtNrmTFIDF; // define weighting
    State->TmBowDocPart=TBowClust::GetKMeansPart(
     TNotify::StdNotify, // log output
     State->BowDocBs, // document data
     BowSim, // similarity function
     TRnd(1), // random generator
     Clusts, // number of clusters
     1, // trials per k-means
     10, // convergence epsilon for k-means
     1, // min. documents per cluster
     WordWgtType, // word weighting
     0.5, // cut-word-weights percentage
     5, // minimal word frequency
     AllDIdV); // training documents

    // create graph
    PGraph Graph=TGGraph::New();

    // create vertices; VrtxV[ClustN] is the vertex of cluster ClustN
    TVrtxV VrtxV;
    for (int ClustN=0; ClustN<State->TmBowDocPart->GetClusts(); ClustN++){
      // get cluster
      PBowDocPartClust Clust=State->TmBowDocPart->GetClust(ClustN);
      // get best words string; first label line is the document count
      TStrFltPrV WordStrWgtPrV;
      Clust->GetTopWordStrWgtPrV(State->BowDocBs, -1, 1.0, WordStrWgtPrV);
      TChA BestWordVChA;
      BestWordVChA+=TInt::GetStr(Clust->GetDocs())+" Docs\n";
      TStrV UcWordStrSfV; // words accepted so far for this cluster
      for (int WordN=0; WordN<WordStrWgtPrV.Len(); WordN++){
        // get word
        TStr UcWordStr=WordStrWgtPrV[WordN].Val1;
        // remove duplicates: reject a word when IsStrIn matches against an
        // already accepted word in either direction
        bool Ok=true;
        for (int WordSfN=0; WordSfN<UcWordStrSfV.Len(); WordSfN++){
          if (UcWordStrSfV[WordSfN].IsStrIn(UcWordStr)){Ok=false; break;}
          if (UcWordStr.IsStrIn(UcWordStrSfV[WordSfN])){Ok=false; break;}
        }
        if (!Ok){continue;}
        // add word
        UcWordStrSfV.Add(UcWordStr);
        BestWordVChA+=WordStrWgtPrV[WordN].Val1;
        BestWordVChA+="\n";
        // finish if limit reached
        if (UcWordStrSfV.Len()>=15){break;}
      }
      // create vertex
      TStr ClustNm=BestWordVChA;
      PVrtx Vrtx=new TGVrtx(ClustNm);
      Graph->AddVrtx(Vrtx);
      VrtxV.Add(Vrtx);
    }

    // create edges
    TFltIntIntTrV ClustSimN1N2TrV;
    State->TmBowDocPart->GetTopClustSimV(ClustSimSumPrc, ClustSimN1N2TrV);
    for (int ClustSimN=0; ClustSimN<ClustSimN1N2TrV.Len(); ClustSimN++){
      double Sim=ClustSimN1N2TrV[ClustSimN].Val1;
      // cluster numbers are vector indices, so they are kept as integers
      int ClustN1=ClustSimN1N2TrV[ClustSimN].Val2;
      int ClustN2=ClustSimN1N2TrV[ClustSimN].Val3;
      TStr EdgeNm=TFlt::GetStr(Sim, "%.2f");
      PEdge Edge=new TGEdge(VrtxV[ClustN1], VrtxV[ClustN2], EdgeNm, false);
      Graph->AddEdge(Edge);
      Edge->PutWgt(TMath::Sqr(Sim));
    }

    // place graph; fixed seed keeps the layout repeatable between clicks
    State->TmClustGraph=Graph;
    TRnd Rnd(1);
    State->TmClustGraph->PlaceSimAnnXY(Rnd, State->TmGks);

    // get area-partition
    UpdateClustRectPrV();

    // draw graph
    State->TmGks->Clr();
    TmPbPaint(Sender);
  }
}
Ejemplo n.º 5
0
/////////////////////////////////////////////////
// Context-Tree
// Builds the context tree around a root named-object.
//   NmObjBs ........ named-object base that supplies ids, names and the
//                    (frequency, id) lists of co-occurring named-objects
//   RootNmObjStr ... name of the root named-object
//   MxDist ......... named-objects farther than this from the root are not
//                    expanded
// Returns a graph with one vertex per reached named-object and one edge from
// each expanded object to every child created for it; the graph is laid out
// as a star and rescaled around the root vertex before it is returned.
// The traversal is queue-driven starting at the root. Every object is
// entered at most once (NmObjDistV holds -1 until a distance is assigned),
// and the number of children created per object depends on the object's own
// distance from the root (see the switch below).
// Besides the arguments, the function reads settings from the global
// ContexterF form (EnCtxLen, EnSubNodes, EnInterNmObjContextCb) and prints
// a trace line to stdout for every object taken from the queue.
PGraph GetCtxTreeGraph(
 const PNmObjBs& NmObjBs, const TStr& RootNmObjStr, const int& MxDist){
  // create distance graph
  PGraph Graph=TGraph::New();
  // create root note
  int RootNmObjId=NmObjBs->GetNmObjId(RootNmObjStr);
  PVrtx RootVrtx=TGVrtx::New(RootNmObjId, RootNmObjStr);
  Graph->AddVrtx(RootVrtx);
  // create distance vector; -1 marks a named-object not reached yet
  TIntV NmObjDistV(NmObjBs->GetNmObjs()); NmObjDistV.PutAll(-1);
  NmObjDistV[RootNmObjId]=0;
  // create queue
  TIntPrQ OpenNmObjIdDistPrQ; OpenNmObjIdDistPrQ.Push(TIntPr(RootNmObjId, 0));
  while (!OpenNmObjIdDistPrQ.Empty()){
    // get name-object-id from queue
    int NmObjId=OpenNmObjIdDistPrQ.Top().Val1;
    int NmObjDist=OpenNmObjIdDistPrQ.Top().Val2;
    OpenNmObjIdDistPrQ.Pop();
    IAssert(NmObjDistV[NmObjId]==NmObjDist);
    // get named-object string
    TStr NmObjStr=NmObjBs->GetNmObjStr(NmObjId);
    TStr UcNmObjStr=NmObjStr.GetUc();
    printf("[%s:%d] ", NmObjStr.CStr(), NmObjDist);
    // check distance
    if (NmObjDist>MxDist){continue;}
    // get named-object vertex
    PVrtx SrcVrtx=Graph->GetVrtx(NmObjId);
    // get named-object children, most frequent first; the loop below stops
    // after a fixed number of created children, so the order decides which
    // children are kept
    TIntPrV FqNmObjIdPrV; NmObjBs->GetFqNmObjIdPrV(NmObjStr, FqNmObjIdPrV);
    FqNmObjIdPrV.Sort(false);
    int SubNmObjs=FqNmObjIdPrV.Len();
    // traverse named-object children
    int CreatedSubNmObjs=0;
    for (int SubNmObjN=0; SubNmObjN<SubNmObjs; SubNmObjN++){
      // get child data
      int SubNmObjFq=FqNmObjIdPrV[SubNmObjN].Val1;
      int SubNmObjId=FqNmObjIdPrV[SubNmObjN].Val2;
      TStr SubNmObjStr=NmObjBs->GetNmObjStr(SubNmObjId);
      TStr UcSubNmObjStr=SubNmObjStr.GetUc();
      TStr SubNmObjVNm=SubNmObjStr;
      // calculate and add context string formed from coref-named-objects
      // NOTE(review): the Clr() right after Trunc() empties the list, so the
      // loop below never runs and SubNmObjVNm stays the plain name; this
      // looks like a deliberate switch-off of the feature - confirm
      {TChA CtxChA; TIntPrV FqNmObjIdPrV;
      ContexterF->State->NmObjBs->GetFqNmObjIdPrV(SubNmObjStr, FqNmObjIdPrV);
      FqNmObjIdPrV.Sort(false);
      FqNmObjIdPrV.Trunc(ContexterF->State->EnCtxLen); FqNmObjIdPrV.Clr();
      for (int NmObjN=0; NmObjN<FqNmObjIdPrV.Len(); NmObjN++){
        TStr CoNmObjStr=ContexterF->State->NmObjBs->GetNmObjStr(FqNmObjIdPrV[NmObjN].Val2);
        if (SubNmObjStr!=CoNmObjStr){
          CtxChA+='['; CtxChA+=CoNmObjStr; CtxChA+=']'; CtxChA+='\\';}
      }
      if (!CtxChA.Empty()){
        SubNmObjVNm=SubNmObjStr+"\\"+CtxChA;}}
      // push child named-object-id if necessary
      if (NmObjDistV[SubNmObjId]==-1){
        // check number of subnodes; the limit depends on the distance of the
        // object being expanded
        int MxCreatedSubNmObjs=0;
        switch (NmObjDist){
          case 0: MxCreatedSubNmObjs=/*20;*/ContexterF->State->EnSubNodes; break;
          case 1: MxCreatedSubNmObjs=4; break;
          case 2: MxCreatedSubNmObjs=2; break;
          case 3: MxCreatedSubNmObjs=1; break;
          case 4: MxCreatedSubNmObjs=1; break;
          default: MxCreatedSubNmObjs=0; break;
        }
        // check if stop creating branches
        CreatedSubNmObjs++;
        if (CreatedSubNmObjs>MxCreatedSubNmObjs){break;}
        // push edge
        OpenNmObjIdDistPrQ.Push(TIntPr(SubNmObjId, NmObjDist+1));
        NmObjDistV[SubNmObjId]=NmObjDist+1;
        // create vertex
        TStr VNm=SubNmObjVNm;
        PVrtx DstVrtx=TGVrtx::New(SubNmObjId, VNm);
        Graph->AddVrtx(DstVrtx);
        // create edge
        //TStr ENm=TStr("_")+TInt::GetStr(NmObjId)+"-"+TInt::GetStr(SubNmObjId);
        TStr ENm=TInt::GetStr(SubNmObjFq);
        // calculate and add context string formed from coref-named-objects
        if (ContexterF->EnInterNmObjContextCb->Checked){
          TChA CtxChA;
          TStr SrcNmObjStr=NmObjStr;
          // the vertex name may carry a context suffix after the first
          // backslash; cut it off to recover the plain named-object name
          TChA DstNmObjChA=DstVrtx->GetVNm();
          if (DstNmObjChA.IsChIn('\\')){
            DstNmObjChA.Trunc(DstNmObjChA.SearchCh('\\'));}
          TStr DstNmObjStr=DstNmObjChA;
          PBowSpV ConceptSpV=ContexterF->State->NmObjBs->GetNmObjConcept(
           ContexterF->State->BowDocBs, ContexterF->State->BowDocWgtBs,
           SrcNmObjStr, DstNmObjStr);
          TStrFltPrV WordStrWgtPrV;
          ConceptSpV->GetWordStrWgtPrV(
           ContexterF->State->BowDocBs, -1, 1, WordStrWgtPrV);
          TStrV UcWordStrSfV; // words accepted so far for this edge
          for (int WordN=0; WordN<WordStrWgtPrV.Len(); WordN++){
            // get word
            TStr UcWordStr=WordStrWgtPrV[WordN].Val1;
            // remove duplicates: skip words that match either end-point name
            // or an already accepted word (IsStrIn, checked both ways)
            if (UcWordStr.IsStrIn(UcNmObjStr)){continue;}
            if (UcWordStr.IsStrIn(UcSubNmObjStr)){continue;}
            if (UcNmObjStr.IsStrIn(UcWordStr)){continue;}
            if (UcSubNmObjStr.IsStrIn(UcWordStr)){continue;}
            bool Ok=true;
            for (int WordSfN=0; WordSfN<UcWordStrSfV.Len(); WordSfN++){
              if (UcWordStrSfV[WordSfN].IsStrIn(UcWordStr)){Ok=false; break;}
              if (UcWordStr.IsStrIn(UcWordStrSfV[WordSfN])){Ok=false; break;}
            }
            if (!Ok){continue;}
            // add word
            UcWordStrSfV.Add(UcWordStr);
            CtxChA+='['; CtxChA+=UcWordStr; CtxChA+=']'; CtxChA+='\n';
            // finish if limit reached
            if (UcWordStrSfV.Len()>=ContexterF->State->EnCtxLen){break;}
          }
          ENm=ENm+"\n"+CtxChA;
        }
        // create and add edge to the graph; weight grows with the logarithm
        // of the co-occurrence frequency
        PEdge Edge=TGEdge::New(SrcVrtx, DstVrtx, ENm);
        Edge->PutWgt(1+log(SubNmObjFq));
        Graph->AddEdge(Edge);
      }
    }
  }
  Graph->SetEdgeWidth(5);
  Graph->PlaceTreeAsStar();
  Graph->RescaleXY(0.1, RootVrtx);
  // return graph
  return Graph;
}
Ejemplo n.º 6
0
void TSkyGridBs::SaveTxt(const TStr& FNm, const uint64& CurTm){
  // time-limit
  TStr CurTmStr=TTm::GetTmFromMSecs(CurTm).GetWebLogDateTimeStr();
  uint64 CurDateTm=TTm::GetMSecsFromTm(TTm::GetTmFromWebLogDateTimeStr(TTm::GetTmFromMSecs(CurTm).GetWebLogDateStr()));
  TStr CurDateTmStr=TTm::GetTmFromMSecs(CurDateTm).GetWebLogDateTimeStr();
  TUInt64V MnTmV;
  MnTmV.Add(CurDateTm-0*TTmInfo::GetDayMSecs());
  MnTmV.Add(CurDateTm-1*TTmInfo::GetDayMSecs());
  MnTmV.Add(CurDateTm-2*TTmInfo::GetDayMSecs());
  MnTmV.Add(CurDateTm-4*TTmInfo::GetDayMSecs());
  MnTmV.Add(CurDateTm-8*TTmInfo::GetDayMSecs());
  MnTmV.Add(CurDateTm-16*TTmInfo::GetDayMSecs());
  MnTmV.Add(CurDateTm-32*TTmInfo::GetDayMSecs());

  // get bow
  //PBowDocBs BowDocBs=GetBowDocBs(3, 5);
  PBowDocBs BowDocBs=GetBowDocBs();
  PBowDocWgtBs BowDocWgtBs=GetBowDocWgtBs(BowDocBs);

  // open file
  TFOut FOut(FNm); FILE* fOut=FOut.GetFileId();
  // get docs-entities sorted vector
  TIntPrV DocsEntIdPrV; GetSorted_DocsEntIdPrV(DocsEntIdPrV);
  // traverse entities
  for (int EntN=0; EntN<DocsEntIdPrV.Len(); EntN++){
    int EntId=DocsEntIdPrV[EntN].Val2;
    TStr EntNm=GetEntNm(EntId);
    int EntDocs=DocsEntIdPrV[EntN].Val1;
    TSkyGridEnt& Ent=GetEnt(EntId);
    int LinkEnts=Ent.GetLinkEnts();
    fprintf(fOut, "'%s' [%d docs] [%d ents]\n", EntNm.CStr(), EntDocs, LinkEnts);

    // output docs over dates
    {TStrIntPrV DateStrDocsPrV; int _EntDocs;
    Ent.GetDocsPerDateV(this, DateStrDocsPrV, _EntDocs);
    fprintf(fOut, "   Docs per Date (%d docs):", _EntDocs);
    for (int DateN=0; DateN<DateStrDocsPrV.Len(); DateN++){
      TStr DateStr=DateStrDocsPrV[DateN].Val1;
      int Docs=DateStrDocsPrV[DateN].Val2;
      fprintf(fOut, " [%s:%d]", DateStr.CStr(), Docs);
    }
    fprintf(fOut, "\n");}

    fprintf(fOut, "   [Now: %s]\n", CurTmStr.CStr());
    TIntPrV PrevLinkWgtDstEntIdPrV;
    TStrFltPrV PrevWordStrWgtPrV;
    for (int MnTmN=0; MnTmN<MnTmV.Len(); MnTmN++){
      uint64 MnTm=MnTmV[MnTmN];
      double PastDays=(CurDateTm-MnTm)/double(TTmInfo::GetDayMSecs());
      TStr MnTmStr=TTm::GetTmFromMSecs(MnTm).GetWebLogDateTimeStr();
      // get linked entities
      TIntPrV LinkWgtDstEntIdPrV;
      Ent.GetSorted_LinkWgtDstEntIdPrV(MnTm, 0.9, LinkWgtDstEntIdPrV);
      // output difference between previous and current centroid
      if (MnTmN>0){
        TIntPrV NegDiffLinkWgtDstEntIdPrV; TIntPrV PosDiffLinkWgtDstEntIdPrV;
        GetLinkWgtDstEntIdPrVDiff(LinkWgtDstEntIdPrV, PrevLinkWgtDstEntIdPrV,
         NegDiffLinkWgtDstEntIdPrV, PosDiffLinkWgtDstEntIdPrV);
        // output positive change
        TChA PosDiffLinkWgtDstEntIdPrVChA;
        GetLinkWgtDstEntIdPrVChA(PosDiffLinkWgtDstEntIdPrV, PosDiffLinkWgtDstEntIdPrVChA);
        fprintf(fOut, "         Pos-Diff: %s\n", PosDiffLinkWgtDstEntIdPrVChA.CStr());
        // output negative change
        TChA NegDiffLinkWgtDstEntIdPrVChA;
        GetLinkWgtDstEntIdPrVChA(NegDiffLinkWgtDstEntIdPrV, NegDiffLinkWgtDstEntIdPrVChA);
        fprintf(fOut, "         Neg-Diff: %s\n", NegDiffLinkWgtDstEntIdPrVChA.CStr());
      }
      PrevLinkWgtDstEntIdPrV=LinkWgtDstEntIdPrV;
      // output linked entities
      int TopLinkEnts=LinkWgtDstEntIdPrV.Len();
      TChA LinkWgtDstEntIdPrVChA;
      GetLinkWgtDstEntIdPrVChA(LinkWgtDstEntIdPrV, LinkWgtDstEntIdPrVChA);
      fprintf(fOut, "      Entities (%d ents): %s\n",
       TopLinkEnts, LinkWgtDstEntIdPrVChA.CStr());
      // get text centroid
      int CtrDocs; TStrFltPrV WordStrWgtPrV;
      Ent.GetDocCentroid(this, BowDocBs, BowDocWgtBs, MnTm, 150, 0.9, CtrDocs, WordStrWgtPrV);
      // output difference between previous and current centroid
      if (MnTmN>0){
        TStrFltPrV NegDiffWordStrWgtPrV; TStrFltPrV PosDiffWordStrWgtPrV;
        GetWordStrWgtPrVDiff(WordStrWgtPrV, PrevWordStrWgtPrV,
         NegDiffWordStrWgtPrV, PosDiffWordStrWgtPrV);
        // output positive change
        TChA PosDiffWordStrWgtPrVChA; GetWordStrWgtPrVChA(PosDiffWordStrWgtPrV, PosDiffWordStrWgtPrVChA);
        fprintf(fOut, "         Pos-Diff: %s\n", PosDiffWordStrWgtPrVChA.CStr());
        // output negative change
        TChA NegDiffWordStrWgtPrVChA; GetWordStrWgtPrVChA(NegDiffWordStrWgtPrV, NegDiffWordStrWgtPrVChA);
        fprintf(fOut, "         Neg-Diff: %s\n", NegDiffWordStrWgtPrVChA.CStr());
      }
      PrevWordStrWgtPrV=WordStrWgtPrV;
      // output centroid
      TChA WordStrWgtPrVChA; GetWordStrWgtPrVChA(WordStrWgtPrV, WordStrWgtPrVChA);
      fprintf(fOut, "      Centroid (%d docs, %d words): %s\n",
       CtrDocs, WordStrWgtPrV.Len(), WordStrWgtPrVChA.CStr());
      // output time
      fprintf(fOut, "   [-%.1f days: %s]\n", PastDays, MnTmStr.CStr());
    }
    // entity clustering
    /*TVec<TStrFltPrV> EntNmWgtPrVV;
    Ent.GetEntClustV(this, MnTmV.Last(), 100, 1000, 10, EntNmWgtPrVV);
    for (int ClustN=0; ClustN<EntNmWgtPrVV.Len(); ClustN++){
      TStrFltPrV& EntNmWgtPrV=EntNmWgtPrVV[ClustN];
      fprintf(fOut, "   Clust-%d:", ClustN);
      for (int EntN=0; EntN<EntNmWgtPrV.Len(); EntN++){
        TStr EntNm=EntNmWgtPrV[EntN].Val1;
        double Wgt=EntNmWgtPrV[EntN].Val2;
        fprintf(fOut, " ['%s':%.3f]", EntNm.CStr(), Wgt);
      }
      fprintf(fOut, "\n");
    }*/
    fprintf(fOut, "\n");
  }
}