void TSkyGridBs::GetWordStrWgtPrVChA( const TStrFltPrV& WordStrWgtPrV, TChA& WordStrWgtPrVChA){ WordStrWgtPrVChA.Clr(); for (int WordN=0; WordN<WordStrWgtPrV.Len(); WordN++){ TStr WStr=WordStrWgtPrV[WordN].Val1; double WWgt=WordStrWgtPrV[WordN].Val2; if (WordN>0){WordStrWgtPrVChA+=' ';} WordStrWgtPrVChA+=TStr::Fmt("['%s':%.3f]", WStr.CStr(), WWgt); } }
// Click handler of the named-object list box (CtxNmObjLb).
// Splits the selected item text on ' ' into State->EnRootNmObjStr and a
// scratch string (presumably "name" and a trailing frequency part - confirm
// against TStr::SplitOnCh), then for that root named-object:
//  - refills EnConceptWordLb with its concept words and weights,
//  - refills EnCoNmObjLb with up to 100 related named-objects and counts,
//  - rebuilds State->EnCtxTree and repaints via EnPbPaint.
// Returns without doing anything when the selection index is out of range.
void __fastcall TContexterF::CtxNmObjLbClick(TObject *Sender){
  TListBox* SrcLb=CtxNmObjLb;
  // ignore clicks that leave no valid selection
  const bool SelOk=
   (SrcLb->ItemIndex>=0)&&(SrcLb->ItemIndex<SrcLb->Items->Count);
  if (!SelOk){
    return;
  }

  // take the root named-object string from the selected item
  TStr SelItemStr=SrcLb->Items->Strings[SrcLb->ItemIndex].c_str();
  TStr TailStr;
  SelItemStr.SplitOnCh(State->EnRootNmObjStr, ' ', TailStr);

  // concept vector of the root named-object
  PBowSpV RootConceptSpV=State->NmObjBs->GetNmObjConcept(
   State->BowDocBs, State->BowDocWgtBs, State->EnRootNmObjStr);
  TStrFltPrV ConceptWordWgtPrV;
  RootConceptSpV->GetWordStrWgtPrV(
   State->BowDocBs, 100, 0.66, ConceptWordWgtPrV);

  // concept list box: one "word (weight)" line per concept word
  EnConceptWordLb->Clear();
  const int ConceptWords=ConceptWordWgtPrV.Len();
  for (int ConceptWordN=0; ConceptWordN<ConceptWords; ConceptWordN++){
    TStr ConceptItemStr=ConceptWordWgtPrV[ConceptWordN].Val1+
     TFlt::GetStr(ConceptWordWgtPrV[ConceptWordN].Val2, " (%.3f)");
    EnConceptWordLb->Items->Add(ConceptItemStr.CStr());
  }

  // related named-objects: sort with Sort(false) and keep at most 100
  TIntPrV CoFqNmObjIdPrV;
  State->NmObjBs->GetFqNmObjIdPrV(State->EnRootNmObjStr, CoFqNmObjIdPrV);
  CoFqNmObjIdPrV.Sort(false);
  CoFqNmObjIdPrV.Trunc(100);

  // co-named-object list box: "name (count)" lines, skipping the root itself
  EnCoNmObjLb->Clear();
  const int CoNmObjs=CoFqNmObjIdPrV.Len();
  for (int CoNmObjN=0; CoNmObjN<CoNmObjs; CoNmObjN++){
    TStr CoNmObjStr=
     State->NmObjBs->GetNmObjStr(CoFqNmObjIdPrV[CoNmObjN].Val2);
    if (State->EnRootNmObjStr==CoNmObjStr){
      continue;
    }
    TStr CoItemStr=
     CoNmObjStr+TInt::GetStr(CoFqNmObjIdPrV[CoNmObjN].Val1, " (%d)");
    EnCoNmObjLb->Items->Add(CoItemStr.CStr());
  }

  // context tree for the new root, then repaint
  State->EnCtxTree=GetCtxTreeGraph(
   State->NmObjBs, State->EnRootNmObjStr, State->EnDrawLevels-1);
  EnPbPaint(Sender);
}
void TSkyGridBs::GetWordStrWgtPrVDiff( const TStrFltPrV& OldWordStrWgtPrV, const TStrFltPrV& NewWordStrWgtPrV, TStrFltPrV& NegDiffWordStrWgtPrV, TStrFltPrV& PosDiffWordStrWgtPrV){ TStrFltH WordStrToWgtH; // set previous-vector for (int WordN=0; WordN<NewWordStrWgtPrV.Len(); WordN++){ TStr WStr=NewWordStrWgtPrV[WordN].Val1; double WWgt=NewWordStrWgtPrV[WordN].Val2; WordStrToWgtH.AddDat(WStr, WWgt); } // diff current-vector for (int WordN=0; WordN<OldWordStrWgtPrV.Len(); WordN++){ TStr WStr=OldWordStrWgtPrV[WordN].Val1; double WWgt=OldWordStrWgtPrV[WordN].Val2; double CurWWgt=WordStrToWgtH.AddDat(WStr); WordStrToWgtH.AddDat(WStr, CurWWgt-WWgt); } // extract vector TFltStrPrV DiffWordWgtStrPrV; WordStrToWgtH.GetDatKeyPrV(DiffWordWgtStrPrV); // positive-vector DiffWordWgtStrPrV.Sort(true); NegDiffWordStrWgtPrV.Gen(DiffWordWgtStrPrV.Len(), 0); for (int WordN=0; WordN<DiffWordWgtStrPrV.Len(); WordN++){ TStr WStr=DiffWordWgtStrPrV[WordN].Val2; double WWgt=DiffWordWgtStrPrV[WordN].Val1; if (WWgt!=0){NegDiffWordStrWgtPrV.Add(TStrFltPr(WStr, WWgt));} } // negative-vector DiffWordWgtStrPrV.Sort(false); PosDiffWordStrWgtPrV.Gen(DiffWordWgtStrPrV.Len(), 0); for (int WordN=0; WordN<DiffWordWgtStrPrV.Len(); WordN++){ TStr WStr=DiffWordWgtStrPrV[WordN].Val2; double WWgt=DiffWordWgtStrPrV[WordN].Val1; if (WWgt!=0){PosDiffWordStrWgtPrV.Add(TStrFltPr(WStr, WWgt));} } }
// Button handler: partitions all documents of State->BowDocBs into clusters
// with TBowClust::GetKMeansPart and shows the result as a graph.
//  - one vertex per cluster, labelled "<docs> Docs" followed by up to 15 top
//    words, skipping any word that contains or is contained in a word
//    already listed;
//  - one edge per cluster pair returned by GetTopClustSimV, labelled with the
//    similarity (two decimals) and weighted by TMath::Sqr of the similarity.
// The graph is stored in State->TmClustGraph, laid out with PlaceSimAnnXY
// and painted via TmPbPaint. Does nothing when State->BowDocBs is empty.
void __fastcall TContexterF::VizualizeBtClick(TObject *Sender){
  // nothing to visualize without a document base
  if (State->BowDocBs.Empty()){return;}
  // parameters (edit-box text, with fall-back values 10 and 0.3)
  int Clusts=TStr(VizClustsEd->Text.c_str()).GetInt(10);
  double ClustSimSumPrc=TStr(VizClustSimSumPrcEd->Text.c_str()).GetFlt(0.3)/100;
  // get doc-ids
  TIntV AllDIdV; State->BowDocBs->GetAllDIdV(AllDIdV);
  // get document partition
  PBowSim BowSim=TBowSim::New(bstCos); // similarity object
  TBowWordWgtType WordWgtType=bwwtNrmTFIDF; // define weighting
  State->TmBowDocPart=TBowClust::GetKMeansPart(
   TNotify::StdNotify, // log output
   State->BowDocBs, // document data
   BowSim, // similarity function
   TRnd(1), // random generator
   Clusts, // number of clusters
   1, // trials per k-means
   10, // convergence epsilon for k-means
   1, // min. documents per cluster
   WordWgtType, // word weighting
   0.5, // cut-word-weights percentage
   5, // minimal word frequency
   AllDIdV); // training documents
  // create graph
  PGraph Graph=TGGraph::New();
  // create vertices
  TVrtxV VrtxV;
  for (int ClustN=0; ClustN<State->TmBowDocPart->GetClusts(); ClustN++){
    // get cluster
    PBowDocPartClust Clust=State->TmBowDocPart->GetClust(ClustN);
    // get best words string
    TStrFltPrV WordStrWgtPrV;
    Clust->GetTopWordStrWgtPrV(State->BowDocBs, -1, 1.0, WordStrWgtPrV);
    TChA BestWordVChA;
    BestWordVChA+=TInt::GetStr(Clust->GetDocs())+" Docs\n";
    TStrV UcWordStrSfV; // words accepted so far for this cluster
    for (int WordN=0; WordN<WordStrWgtPrV.Len(); WordN++){
      // get word
      TStr UcWordStr=WordStrWgtPrV[WordN].Val1;
      // remove duplicates: skip words overlapping an accepted word
      bool Ok=true;
      for (int WordSfN=0; WordSfN<UcWordStrSfV.Len(); WordSfN++){
        if (UcWordStrSfV[WordSfN].IsStrIn(UcWordStr)){Ok=false; break;}
        if (UcWordStr.IsStrIn(UcWordStrSfV[WordSfN])){Ok=false; break;}
      }
      if (!Ok){continue;}
      // add word
      UcWordStrSfV.Add(UcWordStr);
      BestWordVChA+=WordStrWgtPrV[WordN].Val1; BestWordVChA+="\n";
      // finish if limit reached
      if (UcWordStrSfV.Len()>=15){break;}
    }
    // create vertex
    TStr ClustNm=BestWordVChA;
    PVrtx Vrtx=new TGVrtx(ClustNm);
    Graph->AddVrtx(Vrtx);
    VrtxV.Add(Vrtx);
  }
  // create edges
  TFltIntIntTrV ClustSimN1N2TrV;
  State->TmBowDocPart->GetTopClustSimV(ClustSimSumPrc, ClustSimN1N2TrV);
  for (int ClustSimN=0; ClustSimN<ClustSimN1N2TrV.Len(); ClustSimN++){
    double Sim=ClustSimN1N2TrV[ClustSimN].Val1;
    // cluster numbers are vertex-vector subscripts, so keep them integral
    // (they used to be held in doubles and converted back implicitly)
    int ClustN1=ClustSimN1N2TrV[ClustSimN].Val2;
    int ClustN2=ClustSimN1N2TrV[ClustSimN].Val3;
    TStr EdgeNm=TFlt::GetStr(Sim, "%.2f");
    PEdge Edge=new TGEdge(VrtxV[ClustN1], VrtxV[ClustN2], EdgeNm, false);
    Graph->AddEdge(Edge);
    Edge->PutWgt(TMath::Sqr(Sim));
  }
  // place graph
  State->TmClustGraph=Graph;
  TRnd Rnd(1);
  State->TmClustGraph->PlaceSimAnnXY(Rnd, State->TmGks);
  // get area-partition
  UpdateClustRectPrV();
  // draw graph
  State->TmGks->Clr();
  TmPbPaint(Sender);
}
///////////////////////////////////////////////// // Context-Tree PGraph GetCtxTreeGraph( const PNmObjBs& NmObjBs, const TStr& RootNmObjStr, const int& MxDist){ // create distance graph PGraph Graph=TGraph::New(); // create root note int RootNmObjId=NmObjBs->GetNmObjId(RootNmObjStr); PVrtx RootVrtx=TGVrtx::New(RootNmObjId, RootNmObjStr); Graph->AddVrtx(RootVrtx); // create distance vector TIntV NmObjDistV(NmObjBs->GetNmObjs()); NmObjDistV.PutAll(-1); NmObjDistV[RootNmObjId]=0; // create queue TIntPrQ OpenNmObjIdDistPrQ; OpenNmObjIdDistPrQ.Push(TIntPr(RootNmObjId, 0)); while (!OpenNmObjIdDistPrQ.Empty()){ // get name-object-id from queue int NmObjId=OpenNmObjIdDistPrQ.Top().Val1; int NmObjDist=OpenNmObjIdDistPrQ.Top().Val2; OpenNmObjIdDistPrQ.Pop(); IAssert(NmObjDistV[NmObjId]==NmObjDist); // get named-object string TStr NmObjStr=NmObjBs->GetNmObjStr(NmObjId); TStr UcNmObjStr=NmObjStr.GetUc(); printf("[%s:%d] ", NmObjStr.CStr(), NmObjDist); // check distance if (NmObjDist>MxDist){continue;} // get named-object vertex PVrtx SrcVrtx=Graph->GetVrtx(NmObjId); // get named-object children TIntPrV FqNmObjIdPrV; NmObjBs->GetFqNmObjIdPrV(NmObjStr, FqNmObjIdPrV); int SubNmObjs=FqNmObjIdPrV.Len(); // traverse named-object children int CreatedSubNmObjs=0; for (int SubNmObjN=0; SubNmObjN<SubNmObjs; SubNmObjN++){ // get child data int SubNmObjFq=FqNmObjIdPrV[SubNmObjN].Val1; int SubNmObjId=FqNmObjIdPrV[SubNmObjN].Val2; TStr SubNmObjStr=NmObjBs->GetNmObjStr(SubNmObjId); TStr UcSubNmObjStr=SubNmObjStr.GetUc(); TStr SubNmObjVNm=SubNmObjStr; // calculate and add context string formed from coref-named-objects {TChA CtxChA; TIntPrV FqNmObjIdPrV; ContexterF->State->NmObjBs->GetFqNmObjIdPrV(SubNmObjStr, FqNmObjIdPrV); FqNmObjIdPrV.Sort(false); FqNmObjIdPrV.Trunc(ContexterF->State->EnCtxLen); FqNmObjIdPrV.Clr(); for (int NmObjN=0; NmObjN<FqNmObjIdPrV.Len(); NmObjN++){ TStr CoNmObjStr=ContexterF->State->NmObjBs->GetNmObjStr(FqNmObjIdPrV[NmObjN].Val2); if (SubNmObjStr!=CoNmObjStr){ 
CtxChA+='['; CtxChA+=CoNmObjStr; CtxChA+=']'; CtxChA+='\\';} } if (!CtxChA.Empty()){ SubNmObjVNm=SubNmObjStr+"\\"+CtxChA;}} // push child named-object-id if necessary if (NmObjDistV[SubNmObjId]==-1){ // check number of subnodes int MxCreatedSubNmObjs=0; switch (NmObjDist){ case 0: MxCreatedSubNmObjs=/*20;*/ContexterF->State->EnSubNodes; break; case 1: MxCreatedSubNmObjs=4; break; case 2: MxCreatedSubNmObjs=2; break; case 3: MxCreatedSubNmObjs=1; break; case 4: MxCreatedSubNmObjs=1; break; default: MxCreatedSubNmObjs=0; break; } // check if stop creating branches CreatedSubNmObjs++; if (CreatedSubNmObjs>MxCreatedSubNmObjs){break;} // push edge OpenNmObjIdDistPrQ.Push(TIntPr(SubNmObjId, NmObjDist+1)); NmObjDistV[SubNmObjId]=NmObjDist+1; // create vertex TStr VNm=SubNmObjVNm; PVrtx DstVrtx=TGVrtx::New(SubNmObjId, VNm); Graph->AddVrtx(DstVrtx); // create edge //TStr ENm=TStr("_")+TInt::GetStr(NmObjId)+"-"+TInt::GetStr(SubNmObjId); TStr ENm=TInt::GetStr(SubNmObjFq); // calculate and add context string formed from coref-named-objects if (ContexterF->EnInterNmObjContextCb->Checked){ TChA CtxChA; TStr SrcNmObjStr=NmObjStr; TChA DstNmObjChA=DstVrtx->GetVNm(); if (DstNmObjChA.IsChIn('\\')){ DstNmObjChA.Trunc(DstNmObjChA.SearchCh('\\'));} TStr DstNmObjStr=DstNmObjChA; PBowSpV ConceptSpV=ContexterF->State->NmObjBs->GetNmObjConcept( ContexterF->State->BowDocBs, ContexterF->State->BowDocWgtBs, SrcNmObjStr, DstNmObjStr); TStrFltPrV WordStrWgtPrV; ConceptSpV->GetWordStrWgtPrV( ContexterF->State->BowDocBs, -1, 1, WordStrWgtPrV); TStrV UcWordStrSfV; for (int WordN=0; WordN<WordStrWgtPrV.Len(); WordN++){ // get word TStr UcWordStr=WordStrWgtPrV[WordN].Val1; // remove duplicates if (UcWordStr.IsStrIn(UcNmObjStr)){continue;} if (UcWordStr.IsStrIn(UcSubNmObjStr)){continue;} if (UcNmObjStr.IsStrIn(UcWordStr)){continue;} if (UcSubNmObjStr.IsStrIn(UcWordStr)){continue;} bool Ok=true; for (int WordSfN=0; WordSfN<UcWordStrSfV.Len(); WordSfN++){ if 
(UcWordStrSfV[WordSfN].IsStrIn(UcWordStr)){Ok=false; break;} if (UcWordStr.IsStrIn(UcWordStrSfV[WordSfN])){Ok=false; break;} } if (!Ok){continue;} // add word UcWordStrSfV.Add(UcWordStr); CtxChA+='['; CtxChA+=UcWordStr; CtxChA+=']'; CtxChA+='\n'; // finish if limit reached if (UcWordStrSfV.Len()>=ContexterF->State->EnCtxLen){break;} } ENm=ENm+"\n"+CtxChA; } // create and add edge to the graph PEdge Edge=TGEdge::New(SrcVrtx, DstVrtx, ENm); Edge->PutWgt(1+log(SubNmObjFq)); Graph->AddEdge(Edge); } } } Graph->SetEdgeWidth(5); Graph->PlaceTreeAsStar(); Graph->RescaleXY(0.1, RootVrtx); // return graph return Graph; }
void TSkyGridBs::SaveTxt(const TStr& FNm, const uint64& CurTm){ // time-limit TStr CurTmStr=TTm::GetTmFromMSecs(CurTm).GetWebLogDateTimeStr(); uint64 CurDateTm=TTm::GetMSecsFromTm(TTm::GetTmFromWebLogDateTimeStr(TTm::GetTmFromMSecs(CurTm).GetWebLogDateStr())); TStr CurDateTmStr=TTm::GetTmFromMSecs(CurDateTm).GetWebLogDateTimeStr(); TUInt64V MnTmV; MnTmV.Add(CurDateTm-0*TTmInfo::GetDayMSecs()); MnTmV.Add(CurDateTm-1*TTmInfo::GetDayMSecs()); MnTmV.Add(CurDateTm-2*TTmInfo::GetDayMSecs()); MnTmV.Add(CurDateTm-4*TTmInfo::GetDayMSecs()); MnTmV.Add(CurDateTm-8*TTmInfo::GetDayMSecs()); MnTmV.Add(CurDateTm-16*TTmInfo::GetDayMSecs()); MnTmV.Add(CurDateTm-32*TTmInfo::GetDayMSecs()); // get bow //PBowDocBs BowDocBs=GetBowDocBs(3, 5); PBowDocBs BowDocBs=GetBowDocBs(); PBowDocWgtBs BowDocWgtBs=GetBowDocWgtBs(BowDocBs); // open file TFOut FOut(FNm); FILE* fOut=FOut.GetFileId(); // get docs-entities sorted vector TIntPrV DocsEntIdPrV; GetSorted_DocsEntIdPrV(DocsEntIdPrV); // traverse entities for (int EntN=0; EntN<DocsEntIdPrV.Len(); EntN++){ int EntId=DocsEntIdPrV[EntN].Val2; TStr EntNm=GetEntNm(EntId); int EntDocs=DocsEntIdPrV[EntN].Val1; TSkyGridEnt& Ent=GetEnt(EntId); int LinkEnts=Ent.GetLinkEnts(); fprintf(fOut, "'%s' [%d docs] [%d ents]\n", EntNm.CStr(), EntDocs, LinkEnts); // output docs over dates {TStrIntPrV DateStrDocsPrV; int _EntDocs; Ent.GetDocsPerDateV(this, DateStrDocsPrV, _EntDocs); fprintf(fOut, " Docs per Date (%d docs):", _EntDocs); for (int DateN=0; DateN<DateStrDocsPrV.Len(); DateN++){ TStr DateStr=DateStrDocsPrV[DateN].Val1; int Docs=DateStrDocsPrV[DateN].Val2; fprintf(fOut, " [%s:%d]", DateStr.CStr(), Docs); } fprintf(fOut, "\n");} fprintf(fOut, " [Now: %s]\n", CurTmStr.CStr()); TIntPrV PrevLinkWgtDstEntIdPrV; TStrFltPrV PrevWordStrWgtPrV; for (int MnTmN=0; MnTmN<MnTmV.Len(); MnTmN++){ uint64 MnTm=MnTmV[MnTmN]; double PastDays=(CurDateTm-MnTm)/double(TTmInfo::GetDayMSecs()); TStr MnTmStr=TTm::GetTmFromMSecs(MnTm).GetWebLogDateTimeStr(); // get linked 
entities TIntPrV LinkWgtDstEntIdPrV; Ent.GetSorted_LinkWgtDstEntIdPrV(MnTm, 0.9, LinkWgtDstEntIdPrV); // output difference between previous and current centroid if (MnTmN>0){ TIntPrV NegDiffLinkWgtDstEntIdPrV; TIntPrV PosDiffLinkWgtDstEntIdPrV; GetLinkWgtDstEntIdPrVDiff(LinkWgtDstEntIdPrV, PrevLinkWgtDstEntIdPrV, NegDiffLinkWgtDstEntIdPrV, PosDiffLinkWgtDstEntIdPrV); // output positive change TChA PosDiffLinkWgtDstEntIdPrVChA; GetLinkWgtDstEntIdPrVChA(PosDiffLinkWgtDstEntIdPrV, PosDiffLinkWgtDstEntIdPrVChA); fprintf(fOut, " Pos-Diff: %s\n", PosDiffLinkWgtDstEntIdPrVChA.CStr()); // output negative change TChA NegDiffLinkWgtDstEntIdPrVChA; GetLinkWgtDstEntIdPrVChA(NegDiffLinkWgtDstEntIdPrV, NegDiffLinkWgtDstEntIdPrVChA); fprintf(fOut, " Neg-Diff: %s\n", NegDiffLinkWgtDstEntIdPrVChA.CStr()); } PrevLinkWgtDstEntIdPrV=LinkWgtDstEntIdPrV; // output linked entities int TopLinkEnts=LinkWgtDstEntIdPrV.Len(); TChA LinkWgtDstEntIdPrVChA; GetLinkWgtDstEntIdPrVChA(LinkWgtDstEntIdPrV, LinkWgtDstEntIdPrVChA); fprintf(fOut, " Entities (%d ents): %s\n", TopLinkEnts, LinkWgtDstEntIdPrVChA.CStr()); // get text centroid int CtrDocs; TStrFltPrV WordStrWgtPrV; Ent.GetDocCentroid(this, BowDocBs, BowDocWgtBs, MnTm, 150, 0.9, CtrDocs, WordStrWgtPrV); // output difference between previous and current centroid if (MnTmN>0){ TStrFltPrV NegDiffWordStrWgtPrV; TStrFltPrV PosDiffWordStrWgtPrV; GetWordStrWgtPrVDiff(WordStrWgtPrV, PrevWordStrWgtPrV, NegDiffWordStrWgtPrV, PosDiffWordStrWgtPrV); // output positive change TChA PosDiffWordStrWgtPrVChA; GetWordStrWgtPrVChA(PosDiffWordStrWgtPrV, PosDiffWordStrWgtPrVChA); fprintf(fOut, " Pos-Diff: %s\n", PosDiffWordStrWgtPrVChA.CStr()); // output negative change TChA NegDiffWordStrWgtPrVChA; GetWordStrWgtPrVChA(NegDiffWordStrWgtPrV, NegDiffWordStrWgtPrVChA); fprintf(fOut, " Neg-Diff: %s\n", NegDiffWordStrWgtPrVChA.CStr()); } PrevWordStrWgtPrV=WordStrWgtPrV; // output centroid TChA WordStrWgtPrVChA; GetWordStrWgtPrVChA(WordStrWgtPrV, WordStrWgtPrVChA); 
fprintf(fOut, " Centroid (%d docs, %d words): %s\n", CtrDocs, WordStrWgtPrV.Len(), WordStrWgtPrVChA.CStr()); // output time fprintf(fOut, " [-%.1f days: %s]\n", PastDays, MnTmStr.CStr()); } // entity clustering /*TVec<TStrFltPrV> EntNmWgtPrVV; Ent.GetEntClustV(this, MnTmV.Last(), 100, 1000, 10, EntNmWgtPrVV); for (int ClustN=0; ClustN<EntNmWgtPrVV.Len(); ClustN++){ TStrFltPrV& EntNmWgtPrV=EntNmWgtPrVV[ClustN]; fprintf(fOut, " Clust-%d:", ClustN); for (int EntN=0; EntN<EntNmWgtPrV.Len(); EntN++){ TStr EntNm=EntNmWgtPrV[EntN].Val1; double Wgt=EntNmWgtPrV[EntN].Val2; fprintf(fOut, " ['%s':%.3f]", EntNm.CStr(), Wgt); } fprintf(fOut, "\n"); }*/ fprintf(fOut, "\n"); } }