void TBowFl::SaveLnDocTxt(const PBowDocBs& BowDocBs, const TStr& FNm, const bool& UseDocStrP){ TFOut SOut(FNm); int Docs=BowDocBs->GetDocs(); for (int DId=0; DId<Docs; DId++){ printf("%d/%d\r", DId+1, Docs); // output document-name TStr DocNm=TStr::GetFNmStr(BowDocBs->GetDocNm(DId)); SOut.PutStr(DocNm); // output categories for (int CIdN=0; CIdN<BowDocBs->GetDocCIds(DId); CIdN++){ int CId=BowDocBs->GetDocCId(DId, CIdN); TStr CatNm=TStr::GetFNmStr(BowDocBs->GetCatNm(CId)); SOut.PutCh(' '); SOut.PutCh('!'); SOut.PutStr(CatNm); } // output words if (UseDocStrP){ TStr DocStr=BowDocBs->GetDocStr(DId); // DocStr.DelChAll('\n'); DocStr.DelChAll('\r'); SOut.PutCh(' '); SOut.PutStr(DocStr); } else { int DocWIds=BowDocBs->GetDocWIds(DId); int WId; double WordFq; for (int DocWIdN=0; DocWIdN<DocWIds; DocWIdN++){ BowDocBs->GetDocWIdFq(DId, DocWIdN, WId, WordFq); TStr WordStr=BowDocBs->GetWordStr(WId); for (int WordFqN=0; WordFqN<WordFq; WordFqN++){ SOut.PutCh(' '); SOut.PutStr(WordStr); } } } SOut.PutLn(); } printf("\n"); }
void TBowFl::SaveSparseMatlabTxt(const PBowDocBs& BowDocBs, const PBowDocWgtBs& BowDocWgtBs, const TStr& FNm, const TStr& CatFNm, const TIntV& _DIdV) { TIntV DIdV; if (_DIdV.Empty()) { BowDocBs->GetAllDIdV(DIdV); } else { DIdV = _DIdV; } // generate map of row-ids to words TFOut WdMapSOut(TStr::PutFExt(FNm, ".row-to-word-map.dat")); for (int WId = 0; WId < BowDocWgtBs->GetWords(); WId++) { TStr WdStr = BowDocBs->GetWordStr(WId); WdMapSOut.PutStrLn(TStr::Fmt("%d %s", WId+1, WdStr.CStr())); } WdMapSOut.Flush(); // generate map of col-ids to document names TFOut DocMapSOut(TStr::PutFExt(FNm, ".col-to-docName-map.dat")); for (int DocN = 0; DocN < DIdV.Len(); DocN++) { const int DId = DIdV[DocN]; TStr DocNm = BowDocBs->GetDocNm(DId); DocMapSOut.PutStrLn(TStr::Fmt("%d %d %s", DocN, DId, DocNm.CStr())); } DocMapSOut.Flush(); // save documents' sparse vectors TFOut SOut(FNm); for (int DocN = 0; DocN < DIdV.Len(); DocN++){ const int DId = DIdV[DocN]; PBowSpV DocSpV = BowDocWgtBs->GetSpV(DId); const int DocWIds = DocSpV->GetWIds(); for (int DocWIdN=0; DocWIdN<DocWIds; DocWIdN++){ const int WId = DocSpV->GetWId(DocWIdN); const double WordWgt = DocSpV->GetWgt(DocWIdN); SOut.PutStrLn(TStr::Fmt("%d %d %.16f", WId+1, DocN+1, WordWgt)); } } SOut.Flush(); // save documents' category sparse vectors if (!CatFNm.Empty()) { TFOut CatSOut(CatFNm); for (int DocN = 0; DocN < DIdV.Len(); DocN++){ const int DId = DIdV[DocN]; const int DocCIds = BowDocBs->GetDocCIds(DId); for (int DocCIdN=0; DocCIdN<DocCIds; DocCIdN++){ const int CId = BowDocBs->GetDocCId(DId, DocCIdN); const double CatWgt = 1.0; CatSOut.PutStrLn(TStr::Fmt("%d %d %.16f", CId+1, DocN+1, CatWgt)); } } CatSOut.Flush(); } }
// Builds an ontology grounding: a concept vector for ontology terms, computed
// from the documents attached to each term and to its sub-terms.
//
// Steps:
//   1. weight the documents (bwwtNrmTFIDF) and create a cosine similarity;
//   2. map every document category to an ontology term and collect the
//      document ids per term;
//   3. build sub-term / up-term maps from the links whose type name is "NT";
//   4. repeatedly process terms that have no unprocessed sub-terms left:
//      create the term's concept vector from its own documents plus the
//      documents passed up from its sub-terms, then pass the combined
//      documents on to its up-terms.
//
// Parameters:
//   LwOnto           - ontology providing terms, links, link types, languages
//   BowDocBs         - documents; their categories identify ontology terms
//   LangNm           - language used to look up terms by category name
//   DocCatIsTermIdP  - true: the category name is parsed as a term id;
//                      false: the category name is looked up as a term name
//   CutWordWgtSumPrc - forwarded unchanged to TBowClust::GetConceptSpV
// Returns the grounding created by TLwOntoGround::New from the collected
// concept vectors. Progress is printed to stdout.
PLwOntoGround TLwOntoGround::GetOntoGround(
 const PLwOnto& LwOnto, const PBowDocBs& BowDocBs,
 const TStr& LangNm, const bool& DocCatIsTermIdP,
 const double& CutWordWgtSumPrc){
  printf("Generating Ontology-Classifier...\n");
  // shortcuts
  PLwTermBs TermBs=LwOnto->GetTermBs();
  int Terms=TermBs->GetTerms();
  PLwLinkBs LinkBs=LwOnto->GetLinkBs();
  PLwLinkTypeBs LinkTypeBs=LwOnto->GetLinkTypeBs();
  int LangId=LwOnto->GetLangBs()->GetLangId(LangNm);
  int Docs=BowDocBs->GetDocs();

  // create tfidf
  printf(" Creating BowDocWgtBs ...");
  PBowDocWgtBs BowDocWgtBs=TBowDocWgtBs::New(BowDocBs, bwwtNrmTFIDF);
  PBowSim BowSim=TBowSim::New(bstCos);
  printf(" Done.\n");

  // collect documents per ontology-term
  // PosCats/NegCats count the categories that did / did not match a term
  printf(" Collecting documents per ontology-term ...\n");
  TIntIntVH TermIdToDIdVH; int PosCats=0; int NegCats=0;
  for (int DId=0; DId<Docs; DId++){
    printf(" Docs:%d/%d Pos:%d Neg:%d\r", 1+DId, Docs, PosCats, NegCats);
    const int DocCIds=BowDocBs->GetDocCIds(DId);
    for (int DocCIdN=0; DocCIdN<DocCIds; DocCIdN++){
      // get document-category
      int CId=BowDocBs->GetDocCId(DId, DocCIdN);
      TStr CatNm=BowDocBs->GetCatNm(CId);
      // get term-id
      if (DocCatIsTermIdP){
        // NOTE(review): GetInt() presumably requires a numeric string;
        // confirm that non-numeric category names cannot reach this branch.
        int TermId=CatNm.GetInt();
        if (TermBs->IsTermId(TermId)){
          TermIdToDIdVH.AddDat(TermId).Add(DId); PosCats++;
        } else {NegCats++;}
      } else {
        if (TermBs->IsTermId(CatNm, LangId)){
          int TermId=TermBs->GetTermId(CatNm, LangId);
          TermIdToDIdVH.AddDat(TermId).Add(DId); PosCats++;
        } else {NegCats++;}
      }
    }
  }
  printf(" Docs:%d/%d Pos:%d Neg:%d\n", Docs, Docs, PosCats, NegCats);
  printf(" Done.\n");

  // create sub-terms & up-terms vectors
  // (only links of type "NT" are used; presumably "narrower term" - confirm)
  printf(" Creating sub-terms & up-terms vectors ...");
  TIntIntVH TermIdToSubTermIdVH; // term -> sub-terms not yet processed
  TIntIntVH TermIdToUpTermIdVH;  // term -> terms it is a sub-term of
  for (int TermN=0; TermN<Terms; TermN++){
    int TermId=TermBs->GetTermId(TermN);
    for (int LinkN=0; LinkN<LinkBs->GetFromLinks(TermId); LinkN++){
      int LinkTypeId; int DstTermId;
      LinkBs->GetFromLink(TermId, LinkN, LinkTypeId, DstTermId);
      TStr LinkTypeNm=LinkTypeBs->GetLinkType(LinkTypeId)->GetLinkTypeNm();
      if (LinkTypeNm=="NT"){
        TermIdToSubTermIdVH.AddDat(TermId).Add(DstTermId);
        TermIdToUpTermIdVH.AddDat(DstTermId).Add(TermId);
      }
    }
  }
  printf(" Done.\n");

  // create centroids
  printf(" Creating centroids ...\n");
  THash<TInt, PBowSpV> TermIdToConceptSpVH; // result: term -> concept vector
  TIntIntVH TermIdToSubTermDIdVH; // term -> documents passed up from sub-terms
  TIntH ProcTermIdH;              // terms that have already been processed
  int PrevActiveTerms=-1;
  forever{
    // count active nodes for processing
    // (a term is active while it still has unprocessed sub-terms)
    int ActiveTerms=0;
    for (int TermN=0; TermN<Terms; TermN++){
      int TermId=TermBs->GetTermId(TermN);
      if ((TermIdToSubTermIdVH.IsKey(TermId))&&
       (TermIdToSubTermIdVH.GetDat(TermId).Len()>0)){
        ActiveTerms++;
      }
    }
    // stop if no change from previous round
    // (terms that never lose all sub-terms stay without a concept vector)
    printf(" Active-Terms:%d\n", ActiveTerms);
    if (ActiveTerms==PrevActiveTerms){break;}
    PrevActiveTerms=ActiveTerms;
    // reduce active-nodes with zero-ancestors
    for (int TermN=0; TermN<Terms; TermN++){
      int TermId=TermBs->GetTermId(TermN);
      if (ProcTermIdH.IsKey(TermId)){continue;}
      if ((!TermIdToSubTermIdVH.IsKey(TermId))||
       (TermIdToSubTermIdVH.GetDat(TermId).Len()==0)){
        printf(" %d/%d\r", 1+TermN, Terms);
        ProcTermIdH.AddKey(TermId);
        // collect document-ids (own documents, then those of sub-terms)
        TIntV TermDIdV;
        if (TermIdToDIdVH.IsKey(TermId)){
          TermDIdV.AddV(TermIdToDIdVH.GetDat(TermId));}
        if (TermIdToSubTermDIdVH.IsKey(TermId)){
          TermDIdV.AddV(TermIdToSubTermDIdVH.GetDat(TermId));}
        // create concept-vector if any documents
        if (TermDIdV.Len()>0){
          PBowSpV ConceptSpV=
           TBowClust::GetConceptSpV(BowDocWgtBs, BowSim, TermDIdV, CutWordWgtSumPrc);
          TermIdToConceptSpVH.AddDat(TermId, ConceptSpV);
        }
        // correct upper-term: this term is done, so remove it from each
        // up-term's pending list and hand its documents upwards
        if (TermIdToUpTermIdVH.IsKey(TermId)){
          TIntV& UpTermIdV=TermIdToUpTermIdVH.GetDat(TermId);
          for (int UpTermIdN=0; UpTermIdN<UpTermIdV.Len(); UpTermIdN++){
            int UpTermId=UpTermIdV[UpTermIdN];
            TermIdToSubTermIdVH.GetDat(UpTermId).DelIfIn(TermId);
            if (TermDIdV.Len()>0){
              TermIdToSubTermDIdVH.AddDat(UpTermId).AddV(TermDIdV);}
          }
        }
      }
    }
  }
  printf(" Done.\n");

  // create & return classifier
  PLwOntoGround OntoGround=
   TLwOntoGround::New(LwOnto, BowDocBs, BowDocWgtBs, TermIdToConceptSpVH);
  printf("Done.\n");
  return OntoGround;
}
void TVizMapContext::PaintCatNms(PGks Gks, const int& KeyWdFontSize, TVec<TFltRect>& PointNmRectV) { // calculate frequency of categories TIntH CatH; TIntFltPrH CatPosH; PBowDocBs BowDocBs = VizMapFrame->GetKeyWdBow(); const int Points = VizMapFrame->GetPoints(); for (int PointN = 0; PointN < Points; PointN++) { PVizMapPoint Point = VizMapFrame->GetPoint(PointN); const int DId = Point->GetDocId(); const int CIds = BowDocBs->GetDocCIds(DId); for (int CIdN = 0; CIdN < CIds; CIdN++) { const int CId = BowDocBs->GetDocCId(DId, CIdN); CatH.AddDat(CId)++; CatPosH.AddDat(CId).Val1 += Point->GetPointX(); CatPosH.AddDat(CId).Val2 += Point->GetPointY(); } } CatH.SortByDat(false); // draw the top cats const int TopCats = Points > 100 ? 6 : 4; TFltRect ZoomRect = GetZoomRect(); Gks->SetFont(TGksFont::New("ARIAL", KeyWdFontSize + 3, ColorCatNmFont)); TVec<TFltRect> CatNmRectV; TVec<TFltV> CatNmPosV; const int MnSize = TInt::GetMn(Gks->GetWidth(), Gks->GetHeight()); const int MnDist = TFlt::Round(0.3 * double(MnSize)); int Cats = 0, CatKeyId = CatH.FFirstKeyId(); while (CatH.FNextKeyId(CatKeyId)) { if (Cats == TopCats) { break; } if (double(CatH[CatKeyId]) / double(Points) < 0.05) { break; } const int CId = CatH.GetKey(CatKeyId); // get name TStr CatNm = BowDocBs->GetCatNm(CId); if (CatFullNmH.IsKey(CatNm)) { CatNm = CatFullNmH.GetDat(CatNm); } else { continue; } // get position TFltPr CatPos = CatPosH.GetDat(CId); const int CatCount = CatH.GetDat(CId); IAssert(CatCount > 0); const double CatX = CatPos.Val1 / double(CatCount); const double CatY = CatPos.Val2 / double(CatCount); // is it within the zoom? 
if (!ZoomRect.IsXYIn(CatX, CatY)) { continue; } // calculate string size on the screen const int HalfTxtWidth = Gks->GetTxtWidth(CatNm) / 2; const int HalfTxtHeight = Gks->GetTxtHeight(CatNm) / 2; // get coordinates in pixels const int X = GetScreenCoord(CatX , ZoomRect.GetMnX(), ZoomRect.GetXLen(), Gks->GetWidth()); const int Y = GetScreenCoord(CatY, ZoomRect.GetMnY(), ZoomRect.GetYLen(), Gks->GetHeight()); // is it to close to any of the most prominent categories int CatNmDist = MnSize; TFltV CatNmPos = TFltV::GetV(double(X), double(Y)); for (int CatNmPosN = 0; CatNmPosN < CatNmPosV.Len(); CatNmPosN++) { const double Dist = TLinAlg::EuclDist(CatNmPosV[CatNmPosN], CatNmPos); CatNmDist = TInt::GetMn(TFlt::Round(Dist), CatNmDist); } if (CatNmDist < MnDist) { continue; } // does it overlap with any of hte most prominent categories TFltRect CatNmRect(X - HalfTxtWidth, Y - HalfTxtHeight, X + HalfTxtWidth, Y + HalfTxtHeight); bool DoDraw = true; const int Rects = CatNmRectV.Len(); for (int RectN = 0; (RectN < Rects) && DoDraw; RectN++) { DoDraw = !TFltRect::Intersection(CatNmRect, CatNmRectV[RectN]); } if (!DoDraw) { continue; } // draw it! Gks->PutTxt(CatNm, X - HalfTxtWidth, Y - HalfTxtHeight); // remember string area CatNmRectV.Add(CatNmRect); Cats++; // remember string position CatNmPosV.Add(CatNmPos); } PointNmRectV.AddV(CatNmRectV); }