void TBowFl::SaveLnDocTxt(const PBowDocBs& BowDocBs, const TStr& FNm, const bool& UseDocStrP){ TFOut SOut(FNm); int Docs=BowDocBs->GetDocs(); for (int DId=0; DId<Docs; DId++){ printf("%d/%d\r", DId+1, Docs); // output document-name TStr DocNm=TStr::GetFNmStr(BowDocBs->GetDocNm(DId)); SOut.PutStr(DocNm); // output categories for (int CIdN=0; CIdN<BowDocBs->GetDocCIds(DId); CIdN++){ int CId=BowDocBs->GetDocCId(DId, CIdN); TStr CatNm=TStr::GetFNmStr(BowDocBs->GetCatNm(CId)); SOut.PutCh(' '); SOut.PutCh('!'); SOut.PutStr(CatNm); } // output words if (UseDocStrP){ TStr DocStr=BowDocBs->GetDocStr(DId); // DocStr.DelChAll('\n'); DocStr.DelChAll('\r'); SOut.PutCh(' '); SOut.PutStr(DocStr); } else { int DocWIds=BowDocBs->GetDocWIds(DId); int WId; double WordFq; for (int DocWIdN=0; DocWIdN<DocWIds; DocWIdN++){ BowDocBs->GetDocWIdFq(DId, DocWIdN, WId, WordFq); TStr WordStr=BowDocBs->GetWordStr(WId); for (int WordFqN=0; WordFqN<WordFq; WordFqN++){ SOut.PutCh(' '); SOut.PutStr(WordStr); } } } SOut.PutLn(); } printf("\n"); }
// Builds an ontology classifier (grounding) from a light-weight ontology and
// a categorized bag-of-words document base.
//
// Parameters:
//   LwOnto           - ontology supplying the term, link, link-type and
//                      language bases.
//   BowDocBs         - document base; the categories of its documents are
//                      matched against ontology terms.
//   LangNm           - language name, resolved to a language-id that is used
//                      when category names are looked up as term names.
//   DocCatIsTermIdP  - true: a category name is read as an integer term-id
//                      (CatNm.GetInt()); false: a category name is looked up
//                      as a term name in language LangNm.
//   CutWordWgtSumPrc - forwarded unchanged to TBowClust::GetConceptSpV.
//
// Returns the object created by TLwOntoGround::New from the ontology, the
// document base, the document weights and the term-id -> concept-vector hash.
//
// Steps:
//   1. create document weights (bwwtNrmTFIDF) and a similarity object
//      (bstCos);
//   2. map every matched term-id to the ids of documents carrying it as a
//      category;
//   3. from links whose type name is "NT" build term -> sub-terms and
//      term -> up-terms maps (the link destination is treated as the
//      sub-term; "NT" presumably stands for "narrower term" - confirm);
//   4. compute concept vectors bottom-up: a term is processed once it has no
//      pending sub-terms, using its own documents plus the documents passed
//      up from its already processed sub-terms.
// Progress is reported on stdout.
PLwOntoGround TLwOntoGround::GetOntoGround(
 const PLwOnto& LwOnto, const PBowDocBs& BowDocBs,
 const TStr& LangNm, const bool& DocCatIsTermIdP,
 const double& CutWordWgtSumPrc){
  printf("Generating Ontology-Classifier...\n");
  // shortcuts
  PLwTermBs TermBs=LwOnto->GetTermBs(); int Terms=TermBs->GetTerms();
  PLwLinkBs LinkBs=LwOnto->GetLinkBs();
  PLwLinkTypeBs LinkTypeBs=LwOnto->GetLinkTypeBs();
  int LangId=LwOnto->GetLangBs()->GetLangId(LangNm);
  int Docs=BowDocBs->GetDocs();

  // create tfidf
  printf(" Creating BowDocWgtBs ...");
  PBowDocWgtBs BowDocWgtBs=TBowDocWgtBs::New(BowDocBs, bwwtNrmTFIDF);
  PBowSim BowSim=TBowSim::New(bstCos);
  printf(" Done.\n");

  // collect documents per ontology-term
  // PosCats/NegCats count document categories that did / did not resolve to
  // a term of the ontology.
  printf(" Collecting documents per ontology-term ...\n");
  TIntIntVH TermIdToDIdVH; int PosCats=0; int NegCats=0;
  for (int DId=0; DId<Docs; DId++){
    printf(" Docs:%d/%d Pos:%d Neg:%d\r", 1+DId, Docs, PosCats, NegCats);
    const int DocCIds=BowDocBs->GetDocCIds(DId);
    for (int DocCIdN=0; DocCIdN<DocCIds; DocCIdN++){
      // get document-category
      int CId=BowDocBs->GetDocCId(DId, DocCIdN);
      TStr CatNm=BowDocBs->GetCatNm(CId);
      // resolve the category to a term-id
      bool IsTermP=false; int TermId=-1;
      if (DocCatIsTermIdP){
        // NOTE(review): GetInt() is called without a preceding numeric
        // check; presumably it fails on a non-numeric category name -
        // confirm that such names cannot occur in this mode.
        TermId=CatNm.GetInt();
        IsTermP=TermBs->IsTermId(TermId);
      } else
      if (TermBs->IsTermId(CatNm, LangId)){
        TermId=TermBs->GetTermId(CatNm, LangId);
        IsTermP=true;
      }
      // register the document under the term or count the miss
      if (IsTermP){
        TermIdToDIdVH.AddDat(TermId).Add(DId); PosCats++;
      } else {
        NegCats++;
      }
    }
  }
  printf(" Docs:%d/%d Pos:%d Neg:%d\n", Docs, Docs, PosCats, NegCats);
  printf(" Done.\n");

  // create sub-terms & up-terms vectors
  // TermIdToSubTermIdVH is consumed by the centroid step below: entries are
  // deleted from it as sub-terms get processed, so it holds the sub-terms
  // still pending for each term.
  printf(" Creating sub-terms & up-terms vectors ...");
  TIntIntVH TermIdToSubTermIdVH;
  TIntIntVH TermIdToUpTermIdVH;
  for (int TermN=0; TermN<Terms; TermN++){
    int TermId=TermBs->GetTermId(TermN);
    const int FromLinks=LinkBs->GetFromLinks(TermId);
    for (int LinkN=0; LinkN<FromLinks; LinkN++){
      int LinkTypeId; int DstTermId;
      LinkBs->GetFromLink(TermId, LinkN, LinkTypeId, DstTermId);
      TStr LinkTypeNm=LinkTypeBs->GetLinkType(LinkTypeId)->GetLinkTypeNm();
      if (LinkTypeNm=="NT"){
        TermIdToSubTermIdVH.AddDat(TermId).Add(DstTermId);
        TermIdToUpTermIdVH.AddDat(DstTermId).Add(TermId);
      }
    }
  }
  printf(" Done.\n");

  // create centroids
  printf(" Creating centroids ...\n");
  THash<TInt, PBowSpV> TermIdToConceptSpVH; // term-id -> concept vector
  TIntIntVH TermIdToSubTermDIdVH; // term-id -> doc-ids passed up by sub-terms
  TIntH ProcTermIdH; // keys are the term-ids already processed
  int PrevActiveTerms=-1;
  forever{
    // count active nodes for processing (terms with pending sub-terms)
    int ActiveTerms=0;
    for (int TermN=0; TermN<Terms; TermN++){
      int TermId=TermBs->GetTermId(TermN);
      if ((TermIdToSubTermIdVH.IsKey(TermId))&&
       (TermIdToSubTermIdVH.GetDat(TermId).Len()>0)){
        ActiveTerms++;
      }
    }
    // stop if no change from previous round; terms that still have pending
    // sub-terms at that point are never processed and get no concept vector
    printf(" Active-Terms:%d\n", ActiveTerms);
    if (ActiveTerms==PrevActiveTerms){break;}
    PrevActiveTerms=ActiveTerms;
    // reduce active-nodes with zero-ancestors
    for (int TermN=0; TermN<Terms; TermN++){
      int TermId=TermBs->GetTermId(TermN);
      // skip terms already done and terms still waiting for sub-terms
      if (ProcTermIdH.IsKey(TermId)){continue;}
      if ((TermIdToSubTermIdVH.IsKey(TermId))&&
       (TermIdToSubTermIdVH.GetDat(TermId).Len()>0)){continue;}
      printf(" %d/%d\r", 1+TermN, Terms);
      ProcTermIdH.AddKey(TermId);
      // collect document-ids: own documents plus those from sub-terms
      TIntV TermDIdV;
      if (TermIdToDIdVH.IsKey(TermId)){
        TermDIdV.AddV(TermIdToDIdVH.GetDat(TermId));}
      if (TermIdToSubTermDIdVH.IsKey(TermId)){
        TermDIdV.AddV(TermIdToSubTermDIdVH.GetDat(TermId));}
      // create concept-vector if any documents
      if (TermDIdV.Len()>0){
        PBowSpV ConceptSpV=
         TBowClust::GetConceptSpV(BowDocWgtBs, BowSim, TermDIdV, CutWordWgtSumPrc);
        TermIdToConceptSpVH.AddDat(TermId, ConceptSpV);
      }
      // correct upper-term: drop this term from the pending list of each
      // up-term and hand the collected documents over to it
      if (TermIdToUpTermIdVH.IsKey(TermId)){
        TIntV& UpTermIdV=TermIdToUpTermIdVH.GetDat(TermId);
        for (int UpTermIdN=0; UpTermIdN<UpTermIdV.Len(); UpTermIdN++){
          int UpTermId=UpTermIdV[UpTermIdN];
          TermIdToSubTermIdVH.GetDat(UpTermId).DelIfIn(TermId);
          if (TermDIdV.Len()>0){
            TermIdToSubTermDIdVH.AddDat(UpTermId).AddV(TermDIdV);}
        }
      }
    }
  }
  printf(" Done.\n");

  // create & return classifier
  PLwOntoGround OntoGround=
   TLwOntoGround::New(LwOnto, BowDocBs, BowDocWgtBs, TermIdToConceptSpVH);
  printf("Done.\n");
  return OntoGround;
}