void TBowFl::SaveLnDocTxt(const PBowDocBs& BowDocBs, const TStr& FNm, const bool& UseDocStrP){ TFOut SOut(FNm); int Docs=BowDocBs->GetDocs(); for (int DId=0; DId<Docs; DId++){ printf("%d/%d\r", DId+1, Docs); // output document-name TStr DocNm=TStr::GetFNmStr(BowDocBs->GetDocNm(DId)); SOut.PutStr(DocNm); // output categories for (int CIdN=0; CIdN<BowDocBs->GetDocCIds(DId); CIdN++){ int CId=BowDocBs->GetDocCId(DId, CIdN); TStr CatNm=TStr::GetFNmStr(BowDocBs->GetCatNm(CId)); SOut.PutCh(' '); SOut.PutCh('!'); SOut.PutStr(CatNm); } // output words if (UseDocStrP){ TStr DocStr=BowDocBs->GetDocStr(DId); // DocStr.DelChAll('\n'); DocStr.DelChAll('\r'); SOut.PutCh(' '); SOut.PutStr(DocStr); } else { int DocWIds=BowDocBs->GetDocWIds(DId); int WId; double WordFq; for (int DocWIdN=0; DocWIdN<DocWIds; DocWIdN++){ BowDocBs->GetDocWIdFq(DId, DocWIdN, WId, WordFq); TStr WordStr=BowDocBs->GetWordStr(WId); for (int WordFqN=0; WordFqN<WordFq; WordFqN++){ SOut.PutCh(' '); SOut.PutStr(WordStr); } } } SOut.PutLn(); } printf("\n"); }
void TBowFl::SaveSparseMatlabTxt(const PBowDocBs& BowDocBs, const PBowDocWgtBs& BowDocWgtBs, const TStr& FNm, const TStr& CatFNm, const TIntV& _DIdV) { TIntV DIdV; if (_DIdV.Empty()) { BowDocBs->GetAllDIdV(DIdV); } else { DIdV = _DIdV; } // generate map of row-ids to words TFOut WdMapSOut(TStr::PutFExt(FNm, ".row-to-word-map.dat")); for (int WId = 0; WId < BowDocWgtBs->GetWords(); WId++) { TStr WdStr = BowDocBs->GetWordStr(WId); WdMapSOut.PutStrLn(TStr::Fmt("%d %s", WId+1, WdStr.CStr())); } WdMapSOut.Flush(); // generate map of col-ids to document names TFOut DocMapSOut(TStr::PutFExt(FNm, ".col-to-docName-map.dat")); for (int DocN = 0; DocN < DIdV.Len(); DocN++) { const int DId = DIdV[DocN]; TStr DocNm = BowDocBs->GetDocNm(DId); DocMapSOut.PutStrLn(TStr::Fmt("%d %d %s", DocN, DId, DocNm.CStr())); } DocMapSOut.Flush(); // save documents' sparse vectors TFOut SOut(FNm); for (int DocN = 0; DocN < DIdV.Len(); DocN++){ const int DId = DIdV[DocN]; PBowSpV DocSpV = BowDocWgtBs->GetSpV(DId); const int DocWIds = DocSpV->GetWIds(); for (int DocWIdN=0; DocWIdN<DocWIds; DocWIdN++){ const int WId = DocSpV->GetWId(DocWIdN); const double WordWgt = DocSpV->GetWgt(DocWIdN); SOut.PutStrLn(TStr::Fmt("%d %d %.16f", WId+1, DocN+1, WordWgt)); } } SOut.Flush(); // save documents' category sparse vectors if (!CatFNm.Empty()) { TFOut CatSOut(CatFNm); for (int DocN = 0; DocN < DIdV.Len(); DocN++){ const int DId = DIdV[DocN]; const int DocCIds = BowDocBs->GetDocCIds(DId); for (int DocCIdN=0; DocCIdN<DocCIds; DocCIdN++){ const int CId = BowDocBs->GetDocCId(DId, DocCIdN); const double CatWgt = 1.0; CatSOut.PutStrLn(TStr::Fmt("%d %d %.16f", CId+1, DocN+1, CatWgt)); } } CatSOut.Flush(); } }
// Builds the matrix for the documents listed in DIdV: one sparse column per
// document (taken from BowDocWgtBs) and as many rows as BowDocBs has words.
// ClsV is overwritten with one label per document: 0.99 when the document
// belongs to category CatNm and -0.99 otherwise.
// CatNm must be a category known to BowDocBs (asserted).
TBowMatrix::TBowMatrix(PBowDocBs BowDocBs, PBowDocWgtBs BowDocWgtBs,
    const TStr& CatNm, const TIntV& DIdV, TFltV& ClsV): TMatrix() {

  const int Docs = DIdV.Len();
  RowN = BowDocBs->GetWords();
  // reserve space for labels and columns
  ClsV.Gen(Docs, 0);
  ColSpVV.Gen(Docs, 0);
  // resolve the target category
  IAssert(BowDocBs->IsCatNm(CatNm));
  const int CatId = BowDocBs->GetCId(CatNm);
  // one column and one label per document
  for (int DocN = 0; DocN < Docs; DocN++) {
    const int DId = DIdV[DocN];
    ColSpVV.Add(BowDocWgtBs->GetSpV(DId));
    const bool InCatP = BowDocBs->IsCatInDoc(DId, CatId);
    ClsV.Add(InCatP ? 0.99 : -0.99);
  }
}
void TNmObjBs::GetNmObjDIdV( const PBowDocBs& BowDocBs, TIntV& BowDIdV, const TStr& NmObjStr1, const TStr& NmObjStr2) const { // get first named-object-id int NmObjId1=GetNmObjId(NmObjStr1); TIntV NmObjDocIdV1; GetNmObjDocIdV(NmObjId1, NmObjDocIdV1); NmObjDocIdV1.Sort(); // get second named-object-id TIntV NmObjDocIdV2; if (!NmObjStr2.Empty()){ int NmObjId2=GetNmObjId(NmObjStr2); GetNmObjDocIdV(NmObjId2, NmObjDocIdV2); NmObjDocIdV2.Sort(); } // create joint doc-id-vector TIntV NmObjDocIdV; if (NmObjDocIdV2.Empty()){ NmObjDocIdV=NmObjDocIdV1; } else { NmObjDocIdV1.Intrs(NmObjDocIdV2, NmObjDocIdV); } // traverse named-object-documents to collect bow-document-ids BowDIdV.Gen(NmObjDocIdV.Len(), 0); for (int NmObjDocIdN=0; NmObjDocIdN<NmObjDocIdV.Len(); NmObjDocIdN++){ TStr DocNm=GetDocNm(NmObjDocIdV[NmObjDocIdN]); int DId=BowDocBs->GetDId(DocNm); if (DId!=-1){ BowDIdV.Add(DId); } } }
// Loads documents from a line-documents file (one document per line) and adds
// them to BowDocBs.
// Line layout: `[<name>] [!<category> ...] <text>`.
//   LnDocFNm - input file
//   NewDIdV  - output; cleared, then filled with the ids of the added documents
//   NamedP   - true: the first token of a line is the document name;
//              false: the running document counter is used as the name
//   MxDocs   - maximal number of documents to load; -1 means no limit
//   SaveDocP - forwarded to AddHtmlDoc
// Progress is reported on stdout.
void TBowFl::LoadLnDocTxt(PBowDocBs BowDocBs, const TStr& LnDocFNm,
    TIntV& NewDIdV, const bool& NamedP, const int& MxDocs, const bool& SaveDocP) {
  // open line-doc file
  NewDIdV.Clr();
  TFIn FIn(LnDocFNm); char Ch=' '; int Docs=0;
  while (!FIn.Eof()){
    // stop only after MxDocs documents have been started, the same limit
    // test as in LoadHtmlTxt (`>=` stopped one document too early)
    Docs++; if ((MxDocs!=-1)&&(Docs>MxDocs)){break;}
    printf("%d\r", Docs);
    // document name
    TChA DocNm; Ch=FIn.GetCh();
    if (NamedP){
      while ((!FIn.Eof())&&(Ch!='\r')&&(Ch!='\n')&&(Ch!=' ')){
        DocNm+=Ch; Ch=FIn.GetCh();}
      DocNm.Trunc();
      // a line without a name does not count as a document
      if (DocNm.Empty()){Docs--; continue;}
    } else {
      DocNm = TInt::GetStr(Docs);
    }
    // categories: any number of `!<name>` tokens separated by spaces
    TStrV CatNmV;
    forever {
      while ((!FIn.Eof())&&(Ch==' ')){Ch=FIn.GetCh();}
      if (Ch=='!'){
        if (!FIn.Eof()){Ch=FIn.GetCh();}
        TChA CatNm;
        while ((!FIn.Eof())&&(Ch!='\r')&&(Ch!='\n')&&(Ch!=' ')){
          CatNm+=Ch; Ch=FIn.GetCh();}
        if (!CatNm.Empty()){CatNmV.Add(CatNm);}
      } else {
        break;
      }
    }
    // document text: the rest of the line
    TChA DocChA;
    while ((!FIn.Eof())&&(Ch!='\r')&&(Ch!='\n')){
      DocChA+=Ch; Ch=FIn.GetCh();}
    // skip empty documents (empty lines)
    if (DocNm.Empty()&&DocChA.Empty()){
      continue;}
    // add document to document-base
    NewDIdV.Add(BowDocBs->AddHtmlDoc(DocNm, CatNmV, DocChA, SaveDocP));
  }
  // check the document-base
  BowDocBs->AssertOk();
  printf("\n");
}
void TFtrGen::AddWds(const TStr& Prefix, const PBowDocBs& BowDocBs, int& Offset) const { const int Vals = GetVals(); for (int ValN = 0; ValN < Vals; ValN++) { const int WId = BowDocBs->AddWordStr( TStr::Fmt("%s-%s", Prefix.CStr(), GetVal(ValN).CStr())); IAssert(Offset == WId); Offset++; } }
// Creates a centroid model for category CatNm: the centroid is the concept
// vector (cosine similarity) of those training documents that belong to the
// category, expanded to a dense vector with one entry per word.
//   BowDocWgtBs - weighted document vectors
//   TrainDIdV   - ids of the training documents
PBowMd TBowCentroidMd::New(const PBowDocBs& BowDocBs,
    const PBowDocWgtBs& BowDocWgtBs, const TStr& CatNm,
    const TIntV& TrainDIdV) {

  // create model
  TBowCentroidMd* CentroidMd = new TBowCentroidMd(BowDocBs);
  PBowMd BowMd(CentroidMd);
  CentroidMd->CatNm = CatNm;
  // pick training documents from the category
  const int CId = BowDocBs->GetCId(CatNm);
  const int TrainDocs = TrainDIdV.Len();
  TIntV CatDIdV;
  for (int TrainDIdN = 0; TrainDIdN < TrainDocs; TrainDIdN++) {
    const int DId = TrainDIdV[TrainDIdN];
    if (!BowDocBs->IsCatInDoc(DId, CId)) { continue; }
    CatDIdV.Add(DId);
  }
  // sparse centroid of the selected documents
  PBowSim BowSim = TBowSim::New(bstCos);
  PBowSpV CentroidSpV =
    TBowClust::GetConceptSpV(BowDocWgtBs, BowSim, CatDIdV);
  // copy the sparse centroid into a zeroed dense vector
  CentroidMd->CentroidV.Gen(BowDocBs->GetWords());
  CentroidMd->CentroidV.PutAll(0.0);
  TBowLinAlg::AddVec(1.0, CentroidSpV, CentroidMd->CentroidV);
  // return model
  return BowMd;
}
// Grounds an ontology with a nearest-neighbour approach: for every term of
// the language LangNm the term name is turned into a sparse vector, compared
// with the documents, and the term's concept vector becomes the
// similarity-weighted sum of the returned document vectors.
// Progress is reported on stdout.
PLwOntoGround TLwOntoGround::GetOntoGroundNN(const PLwOnto& LwOnto,
    const PBowDocBs& BowDocBs, const TStr& LangNm) {

  printf("Generating Ontology-Classifier...\n");
  // shortcuts
  PLwTermBs TermBs=LwOnto->GetTermBs();
  const int Terms = TermBs->GetTerms();
  const int LangId = LwOnto->GetLangBs()->GetLangId(LangNm);
  const int Words = BowDocBs->GetWords();
  // create tfidf
  printf(" Creating BowDocWgtBs ...");
  PBowDocWgtBs BowDocWgtBs=TBowDocWgtBs::New(BowDocBs, bwwtNrmTFIDF);
  PBowSim BowSim=TBowSim::New(bstCos);
  printf(" Done.\n");
  // build one concept vector per ontology-term
  printf(" Collecting documents per ontology-term ... ");
  THash<TInt, PBowSpV> TermIdToConceptSpVH;
  for (int TermN = 0; TermN < Terms; TermN++){
    int TermId = TermBs->GetTermId(TermN);
    PLwTerm Term = TermBs->GetTerm(TermId);
    // only terms of the requested language are grounded
    if (Term->GetLangId() == LangId) {
      // do nearest neighbour search with the term name as the query
      PBowSpV TermSpV = BowDocBs->GetSpVFromHtmlStr(
        Term->GetTermNm(), BowDocWgtBs);
      TFltIntKdV SimDIdKdV;
      BowDocWgtBs->GetSimDIdV(TermSpV, BowSim, SimDIdKdV, false);
      // accumulate the returned documents weighted by their similarity
      TFltV TermV(Words); TermV.PutAll(0.0);
      const int SimDocs = SimDIdKdV.Len();
      for (int SimDIdKdN = 0; SimDIdKdN < SimDocs; SimDIdKdN++) {
        const double Sim = SimDIdKdV[SimDIdKdN].Key;
        PBowSpV DocSpV = BowDocWgtBs->GetSpV(SimDIdKdV[SimDIdKdN].Dat);
        TBowLinAlg::AddVec(Sim, DocSpV, TermV);
      }
      // store the dense sum as a sparse concept vector
      PBowSpV ConceptSpV = TBowSpV::New(-1, TermV, TFlt::Eps);
      TermIdToConceptSpVH.AddDat(TermId, ConceptSpV);
    }
  }
  printf(" Done.\n");
  // create & return classifier
  PLwOntoGround OntoGround = TLwOntoGround::New(LwOnto,
    BowDocBs, BowDocWgtBs, TermIdToConceptSpVH);
  printf("Done.\n");
  return OntoGround;
}
void TFtrGenBs::AddBowDoc(const PBowDocBs& BowDocBs, const TStr& DocNm, const TStrV& FtrValV) const { TIntFltKdV FtrSpV; GenFtrV(FtrValV, FtrSpV); // make KdV to PrV const int WIds = FtrSpV.Len(); TIntFltPrV WIdWgtPrV(WIds, 0); for (int WIdN = 0; WIdN < WIds; WIdN++) { WIdWgtPrV.Add(TIntFltPr(FtrSpV[WIdN].Key, FtrSpV[WIdN].Dat)); } // add the feature vector to trainsets BowDocBs->AddDoc(DocNm, TStrV(), WIdWgtPrV); }
// Trains one Winnow model for each of the TopCats categories returned by
// GetTopCatV and bundles them into a single multi-model.
//   Beta - forwarded to the per-category training (see New)
// A header line is printed on stdout for every category.
PBowMd TBowWinnowMd::NewMulti(
 const PBowDocBs& BowDocBs, const int& TopCats, const double& Beta){
  // create the container model
  TBowMultiMd* MultiMd=new TBowMultiMd(BowDocBs);
  PBowMd BowMd(MultiMd);
  // select categories
  TIntStrPrV FqCatNmPrV; BowDocBs->GetTopCatV(TopCats, FqCatNmPrV);
  const int SelCats=FqCatNmPrV.Len();
  // train a model per category
  for (int CatN=0; CatN<SelCats; CatN++){
    TStr CatNm=FqCatNmPrV[CatN].Val2;
    int CId=BowDocBs->GetCId(CatNm);
    // output header
    printf("*** Generating model for category: '%s' %d Docs (%d/%d Cats)\n",
     CatNm.CStr(), BowDocBs->GetCatFq(CId), 1+CId, BowDocBs->GetCats());
    // create the category model and add it to the model-set
    PBowMd CatBowMd=New(BowDocBs, CatNm, Beta);
    MultiMd->AddBowMd(CatBowMd);
  }
  // return the container model
  return BowMd;
}
///////////////////////////////////////////////// // BagOfWords-Files void TBowFl::LoadHtmlTxt( PBowDocBs BowDocBs, const TStr& FPath, TIntV& NewDIdV, const bool& RecurseDirP, const int& MxDocs, const bool& SaveDocP, const PNotify& Notify) { // prepare file-directory traversal TStr LcNrFPath=TStr::GetNrFPath(FPath).GetLc(); Notify->OnStatus("Creating Bow from file-path " + FPath + " ..."); TFFile FFile(FPath, "", RecurseDirP); // traverse files TStr FNm; int Docs=0; NewDIdV.Clr(); while (FFile.Next(FNm)){ Docs++; if ((MxDocs!=-1)&&(Docs>MxDocs)){break;} Notify->OnStatus(TStr::Fmt("%d\r", Docs)); // prepare document-name if (TFile::Exists(FNm)) { //B: TStr DocNm=FNm.GetLc(); if (DocNm.IsPrefix(LcNrFPath)){ DocNm=DocNm.GetSubStr(LcNrFPath.Len(), DocNm.Len()-1);} // categories TStrV CatNmV; TStr CatNm; if (DocNm.IsChIn('/')){ TStr Str; DocNm.SplitOnCh(CatNm, '/', Str); } else if (DocNm.IsChIn('\\')){ TStr Str; DocNm.SplitOnCh(CatNm, '\\', Str); } if (!CatNm.Empty()){ CatNmV.Add(CatNm);} // load document-content TStr DocStr=TStr::LoadTxt(FNm); // add document to bow NewDIdV.Add(BowDocBs->AddHtmlDoc(DocNm, CatNmV, DocStr, SaveDocP)); } } Notify->OnStatus(TStr::Fmt("%d", Docs)); // return results Notify->OnStatus("Done."); BowDocBs->AssertOk(); }
void TSkyGridEnt::GetDocCentroid(const TSkyGridBs* SkyGridBs, const PBowDocBs& BowDocBs, const PBowDocWgtBs& BowDocWgtBs, const uint64& MnTm, const int& TopWords, const double& TopWordsWgtSumPrc, int& Docs, TStrFltPrV& WordStrWgtPrV) const { // get doc-ids TIntV DocIdV; GetDocIdV(SkyGridBs, MnTm, 0, DocIdV); TIntV BowDIdV(DocIdV.Len(), 0); for (int DocN=0; DocN<DocIdV.Len(); DocN++){ int DocId=DocIdV[DocN]; TStr BowDocNm=TInt::GetStr(DocId); int BowDId=BowDocBs->GetDId(BowDocNm); BowDIdV.Add(BowDId); } // create concept vector PBowSim BowSim=TBowSim::New(bstCos); // similarity object PBowSpV ConceptSpV=TBowClust::GetConceptSpV(BowDocWgtBs, BowSim, BowDIdV); // get docs & word-vector Docs=DocIdV.Len(); ConceptSpV->GetWordStrWgtPrV(BowDocBs, TopWords, TopWordsWgtSumPrc, WordStrWgtPrV); }
// Trains a Winnow model for category CatNm on all documents of BowDocBs.
// Every word has a positive and a negative expert weight (both start at 1).
// For each document the experts of its words that vote for the wrong class
// are multiplied by Beta and the touched experts are rescaled so that their
// weight sum stays unchanged. Training runs for at most 50 iterations and
// stops early after 3 consecutive iterations judged worse than the previous.
// Throws an exception when CatNm is not a known category.
// Per-iteration statistics are printed on stdout.
PBowMd TBowWinnowMd::New(
 const PBowDocBs& BowDocBs, const TStr& CatNm, const double& Beta){
  // create model
  TBowWinnowMd* WinnowMd=new TBowWinnowMd(BowDocBs);
  PBowMd BowMd(WinnowMd);
  WinnowMd->CatNm=CatNm;
  WinnowMd->Beta=Beta;
  WinnowMd->VoteTsh=0.5;
  // prepare Winnow parameters
  const double MnExpertWgtSum=1e-15;
  // get cat-id
  int CId=BowDocBs->GetCId(CatNm);
  if (CId==-1){
    TExcept::Throw(TStr::GetStr(CatNm, "Invalid Category Name ('%s')!"));}
  // get training documents
  TIntV TrainDIdV; BowDocBs->GetAllDIdV(TrainDIdV);
  int TrainDocs=TrainDIdV.Len();
  // prepare mini-experts
  int Words=BowDocBs->GetWords();
  WinnowMd->PosExpertWgtV.Gen(Words); WinnowMd->PosExpertWgtV.PutAll(1);
  WinnowMd->NegExpertWgtV.Gen(Words); WinnowMd->NegExpertWgtV.PutAll(1);
  // winnow loop
  double PrevAcc=0; double PrevPrec=0; double PrevRec=0; double PrevF1=0;
  const double MxDiff=-0.005; const int MxWorseIters=3; int WorseIters=0;
  const int MxIters=50; int IterN=0;
  while ((IterN<MxIters)&&(WorseIters<MxWorseIters)){
    IterN++;
    int FalsePos=0; int FalseNeg=0; int TruePos=0; int TrueNeg=0;
    for (int DIdN=0; DIdN<TrainDocs; DIdN++){
      int DId=TrainDIdV[DIdN];
      bool ClassVal=BowDocBs->IsCatInDoc(DId, CId);
      double PosWgt=0; double NegWgt=0;
      double OldSum=0; double NewSum=0;
      int WIds=BowDocBs->GetDocWIds(DId);
      // change only experts of words that occur in the document
      for (int WIdN=0; WIdN<WIds; WIdN++){
        int WId=BowDocBs->GetDocWId(DId, WIdN);
        OldSum+=WinnowMd->PosExpertWgtV[WId]+WinnowMd->NegExpertWgtV[WId];
        // penalize expert giving wrong class prediction
        if (ClassVal){
          WinnowMd->NegExpertWgtV[WId]*=Beta;
        } else {
          WinnowMd->PosExpertWgtV[WId]*=Beta;
        }
        NewSum+=WinnowMd->PosExpertWgtV[WId]+WinnowMd->NegExpertWgtV[WId];
        PosWgt+=WinnowMd->PosExpertWgtV[WId];
        NegWgt+=WinnowMd->NegExpertWgtV[WId];
      }
      // normalize all experts of the document back to the old weight sum
      if (NewSum>MnExpertWgtSum){
        for (int WIdN=0; WIdN<WIds; WIdN++){
          int WId=BowDocBs->GetDocWId(DId, WIdN);
          WinnowMd->PosExpertWgtV[WId]*=OldSum/NewSum;
          WinnowMd->NegExpertWgtV[WId]*=OldSum/NewSum;
        }
      }
      // predict from the (pre-normalization) weights and update the counts
      bool PredClassVal;
      if (PosWgt+NegWgt==0){PredClassVal=TBool::GetRnd();}
      else {PredClassVal=(PosWgt/(PosWgt+NegWgt))>WinnowMd->VoteTsh;}
      if (PredClassVal==ClassVal){
        if (PredClassVal){TruePos++;} else {TrueNeg++;}
      } else {
        if (PredClassVal){FalsePos++;} else {FalseNeg++;}
      }
    }
    // calculate temporary results
    if (TrainDocs==0){break;}
    double Acc=100*(TruePos+TrueNeg)/double(TrainDocs);
    double Prec=0; double Rec=0; double F1=0;
    if (TruePos+FalsePos>0){
      Prec=(TruePos/double(TruePos+FalsePos));}
    // recall has its own guard: with no positive documents but some false
    // positives the division would be 0/0 and NaN would spread into the
    // printed statistics and the stopping test
    if (TruePos+FalseNeg>0){
      Rec=(TruePos/double(TruePos+FalseNeg));}
    if (Prec+Rec>0){
      F1=(2*Prec*Rec/(Prec+Rec));}
    // check if the current iteration gave worse results than the previous
    if (((Acc-PrevAcc)<MxDiff)||((F1-PrevF1)<MxDiff)||(((Prec-PrevPrec)<MxDiff)&&
     ((Rec-PrevRec)<MxDiff))){WorseIters++;}
    else {WorseIters=0;}
    PrevAcc=Acc; PrevPrec=Prec; PrevRec=Rec; PrevF1=F1;
    printf("%d. Precision:%0.3f Recall:%0.3f F1:%0.3f Accuracy:%0.3f%%\n",
     IterN, Prec, Rec, F1, Acc);
  }
  // return model
  return BowMd;
}
PBowDocBs TFtrGenBs::LoadCsv(TStr& FNm, const int& ClassId, const TIntV& IgnoreIdV, const int& TrainLen) { // feature generators PFtrGenBs FtrGenBs = TFtrGenBs::New(); // CSV parsing stuff PSIn SIn = TFIn::New(FNm); char SsCh = ' '; TStrV FldValV; // read the headers and initialise the feature generators TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false); for (int FldValN = 0; FldValN < FldValV.Len(); FldValN++) { const TStr& FldVal = FldValV[FldValN]; if (FldValN == ClassId) { if (FldVal == "NOM") { FtrGenBs->PutClsFtrGen(TFtrGenNominal::New()); } else if (FldVal == "MULTI-NOM") { FtrGenBs->PutClsFtrGen(TFtrGenMultiNom::New()); } else { TExcept::Throw("Wrong class type '" + FldVal + "', should be NOM or MULTI-NOM!"); } } else if (!IgnoreIdV.IsIn(FldValN)) { if (FldVal == TFtrGenNumeric::GetType()) { FtrGenBs->AddFtrGen(TFtrGenNumeric::New()); } else if (FldVal == TFtrGenNominal::GetType()) { FtrGenBs->AddFtrGen(TFtrGenNominal::New()); } else if (FldVal == TFtrGenToken::GetType()) { FtrGenBs->AddFtrGen(TFtrGenToken::New( TSwSet::New(swstNone), TStemmer::New(stmtNone))); } else if (FldVal == TFtrGenSparseNumeric::GetType()) { FtrGenBs->AddFtrGen(TFtrGenSparseNumeric::New()); } else if (FldVal == TFtrGenMultiNom::GetType()) { FtrGenBs->AddFtrGen(TFtrGenMultiNom::New()); } else { TExcept::Throw("Wrong type '" + FldVal + "'!"); } } } const int Flds = FldValV.Len(); // read the lines and feed them to the feature generators int Recs = 0; while (!SIn->Eof()) { if (Recs == TrainLen) { break; } Recs++; printf("%7d\r", Recs); TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false); // make sure line still has the same number of fields as the header EAssertR(FldValV.Len() == Flds, TStr::Fmt("Wrong number of fields in line %d! 
Found %d and expected %d!", Recs + 1, FldValV.Len(), Flds)); // go over lines try { TStrV FtrValV; for (int FldValN = 0; FldValN < FldValV.Len(); FldValN++) { const TStr& FldVal = FldValV[FldValN]; if (FldValN == ClassId) { FtrGenBs->UpdateCls(FldVal); } else if (!IgnoreIdV.IsIn(FldValN)) { FtrValV.Add(FldVal); } } FtrGenBs->Update(FtrValV); } catch (PExcept Ex) { TExcept::Throw(TStr::Fmt("Error in line %d: '%s'!", Recs+1, Ex->GetMsgStr().CStr())); } } // read the file again and feed it to the training set PBowDocBs BowDocBs = FtrGenBs->MakeBowDocBs(); // we read and ignore the headers since we parsed them already SIn = TFIn::New(FNm); SsCh = ' '; TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false); // read the lines and feed them to the training set Recs = 0; while (!SIn->Eof()){ Recs++; printf("%7d\r", Recs); TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false); // make sure line still has the same number of fields as the header EAssertR(FldValV.Len() == Flds, TStr::Fmt("Wrong number of fields in line %s! Found %d and expected %d!", Recs + 1, FldValV.Len(), Flds)); // go over lines and construct the sparse vector TStrV FtrValV; TStr ClsFtrVal; try { for (int FldValN = 0; FldValN < FldValV.Len(); FldValN++) { const TStr& FldVal = FldValV[FldValN]; if (FldValN == ClassId) { ClsFtrVal = FldVal; } else if (!IgnoreIdV.IsIn(FldValN)) { FtrValV.Add(FldVal); } } } catch (PExcept Ex) { TExcept::Throw(TStr::Fmt("Error in line %d: '%s'!", Recs+1, Ex->GetMsgStr().CStr())); } // add the feature vector to trainsets FtrGenBs->AddBowDoc(BowDocBs, TStr::Fmt("Line-%d", Recs), FtrValV, ClsFtrVal); } // prepare training and testing doc ids TIntV AllDIdV; BowDocBs->GetAllDIdV(AllDIdV); IAssert(AllDIdV.IsSorted()); TIntV TrainDIdV = AllDIdV; TrainDIdV.Trunc(TrainLen); BowDocBs->PutTrainDIdV(TrainDIdV); TIntV TestDIdV = AllDIdV; TestDIdV.Minus(TrainDIdV); BowDocBs->PutTestDIdV(TestDIdV); return BowDocBs; }
int main(int argc, char* argv[]){ Try; // create environment Env=TEnv(argc, argv, TNotify::StdNotify); // command line parameters Env.PrepArgs("Text To Bag-Of-Words"); TStr InFPath=Env.GetIfArgPrefixStr("-ihtml:", "", "Input-Html-Path"); TStr InMtxFNm=Env.GetIfArgPrefixStr("-imtx:", "", "Input-Matrix-File"); TStr InTabFNm=Env.GetIfArgPrefixStr("-itab:", "", "Input-Tab-File"); TStr InTsactFNm=Env.GetIfArgPrefixStr("-itsc:", "", "Input-Transaction-File"); TStr InSparseFNm=Env.GetIfArgPrefixStr("-ispr:", "", "Input-Sparse-File"); TStr InSvmLightFNm=Env.GetIfArgPrefixStr("-isvml:", "", "Input-SvmLight-File"); TStr InCpdFNm=Env.GetIfArgPrefixStr("-icpd:", "", "Input-CompactDocuments-File"); TStr InTBsFNm=Env.GetIfArgPrefixStr("-itbs:", "", "Input-TextBase-File"); TStr InLnDocFNm=Env.GetIfArgPrefixStr("-ilndoc:", "", "Input-LineDocuments-File"); TStr InNmLnDocFNm=Env.GetIfArgPrefixStr("-inlndoc:", "", "Input-Named-LineDocuments-File"); TStr InReuters21578FPath=Env.GetIfArgPrefixStr("-ir21578:", "", "Input-Reuters21578-Path"); TStr InCiaWFBFPath=Env.GetIfArgPrefixStr("-iciawfb:", "", "Input-CIA-World-Fact-Book-Path"); TStr InDaxFNm=Env.GetIfArgPrefixStr("-idax:", "", "Input-DocumentAtlasXML-File"); TStr OutBowFNm=Env.GetIfArgPrefixStr("-o:", "Out.Bow", "Bow-Output-File (.Bow)"); bool OutStatP=Env.GetIfArgPrefixBool("-ostat:", true, "Output-Statistics (*.Txt)"); int Recs=Env.GetIfArgPrefixInt("-docs:", -1, "Documents-To-Process"); bool RecurseDirP=Env.GetIfArgPrefixBool("-recurse:", false, "Recurse-Directories"); TStr SwSetTypeNm=Env.GetIfArgPrefixStr("-stopword:", "en523", "Stop-Word-Set "+TSwSet::GetSwSetTypeNmVStr()); TStr SwSetFNm=Env.GetIfArgPrefixStr("-istopword:", "", "External-Stop-Word-Set-File"); TStr StemmerTypeNm=Env.GetIfArgPrefixStr("-stemmer:", "porter", "Stemmer "+TStemmer::GetStemmerTypeNmVStr()); int MxNGramLen=Env.GetIfArgPrefixInt("-ngramlen:", 3, "Max-NGram-Length"); int MnNGramFq=Env.GetIfArgPrefixInt("-ngramfq:", 5, "Min-NGram-Frequency"); bool 
SaveDocP=Env.GetIfArgPrefixBool("-savedoc:", false, "Save-Document-Text"); if (Env.IsEndOfRun()){return 0;} // -idir:f:\data\ciawfb\print -o:CiaWfb.Bow -docs:50 // -isvml:SvmLightTrain.Dat -o:SvmLight.Bow // -ir21578:f:\data\Reuters21578 -o:Reuters21578.Bow // -inlndoc:c:\data\yahoocompanies\CompProfilesSymbols.txt // -ihtml:c:\data\cordis\fp6 // bag-of-words to create PBowDocBs BowDocBs; // load input data if (!InFPath.Empty()){ // directory-files // prepare stop-words PSwSet SwSet=TSwSet::GetSwSet(SwSetTypeNm); if (!SwSetFNm.Empty()) { SwSet->LoadFromFile(SwSetFNm); } // prepare stemmer PStemmer Stemmer=TStemmer::GetStemmer(StemmerTypeNm); // load bow BowDocBs=TBowFl::LoadHtmlTxt(InFPath, RecurseDirP, Recs, SwSet, Stemmer, MxNGramLen, MnNGramFq, SaveDocP); } else if (!InMtxFNm.Empty()){ // matrix-file PBowSimMtx BowSimMtx=TBowSimMtx::LoadTxt(InMtxFNm); BowDocBs=TBowFl::LoadFromSimMtx(BowSimMtx); } else if (!InTabFNm.Empty()){ // tab-file BowDocBs=TBowFl::LoadTabTxt(InTabFNm, Recs); } else if (!InTsactFNm.Empty()){ // transaction-file BowDocBs=TBowFl::LoadTsactTxt(InTsactFNm, Recs); } else if (!InSparseFNm.Empty()){ // sparse-file TStr DocDefFNm=TStr::PutFExt(InSparseFNm, ".tup"); TStr WordDefFNm=TStr::PutFExt(InSparseFNm, ".var"); TStr TrainDataFNm=InSparseFNm; BowDocBs=TBowFl::LoadSparseTxt(DocDefFNm, WordDefFNm, TrainDataFNm, Recs); } else if (!InSvmLightFNm.Empty()){ // SvmLight-file TStr DocDefFNm=TStr::PutFExt(InSparseFNm, ".tup"); TStr WordDefFNm=TStr::PutFExt(InSparseFNm, ".var"); TStr TrainDataFNm=InSvmLightFNm; BowDocBs=TBowFl::LoadSvmLightTxt(DocDefFNm, WordDefFNm, TrainDataFNm, "", Recs); } else if (!InTBsFNm.Empty()){ // Text-Base-file //BowDocBs=TBowFl::LoadTBsTxt(InTBsFNm, Recs); } else if (!InCpdFNm.Empty()){ // Compact-Doc-file BowDocBs=TBowFl::LoadCpdTxt(InCpdFNm, Recs, SwSetTypeNm, StemmerTypeNm, MxNGramLen, MnNGramFq); } else if (!InLnDocFNm.Empty()){ // Line-Documents-file BowDocBs=TBowFl::LoadLnDocTxt(InLnDocFNm, false, Recs, SwSetTypeNm, 
StemmerTypeNm, MxNGramLen, MnNGramFq, SaveDocP); } else if (!InNmLnDocFNm.Empty()){ // Named-Line-Documents-file BowDocBs=TBowFl::LoadLnDocTxt(InNmLnDocFNm, true, Recs, SwSetTypeNm, StemmerTypeNm, MxNGramLen, MnNGramFq, SaveDocP); } else if (!InReuters21578FPath.Empty()){ // Reuters-21578-file BowDocBs=TBowFl::LoadReuters21578Txt(InReuters21578FPath, Recs, SwSetTypeNm, StemmerTypeNm, MxNGramLen, MnNGramFq); } else if (!InCiaWFBFPath.Empty()){ // CIA-World-Fact-Book BowDocBs=TBowFl::LoadCiaWFBTxt(InCiaWFBFPath, Recs, SwSetTypeNm, StemmerTypeNm, MxNGramLen, MnNGramFq); } else if (!InDaxFNm.Empty()) { // DocumentAtlasXml-File PSwSet SwSet=TSwSet::GetSwSet(SwSetTypeNm); PStemmer Stemmer=TStemmer::GetStemmer(StemmerTypeNm); BowDocBs=TVizMapXmlDocBs::LoadBowDocBs(InDaxFNm, SwSet, Stemmer, MxNGramLen, MnNGramFq); } else { TExcept::Throw("No Input-File specified!"); } // save bow-file if (!OutBowFNm.Empty()){ TStr::PutFExtIfEmpty(OutBowFNm, ".Bow"); printf("Saving Bow to '%s' ...", OutBowFNm.CStr()); BowDocBs->SaveBin(OutBowFNm); printf(" Done.\n"); } // save statistics if ((OutStatP)&&(!OutBowFNm.Empty())){ // save bow-statistics TStr OutBowStatFNm=TStr::PutFExt(OutBowFNm, ".Bow.Txt"); TStr::PutFExt(OutBowStatFNm, ".Bow.Txt"); if (!OutBowStatFNm.Empty()){ printf("Saving Bow-Statistics to '%s' ...", OutBowStatFNm.CStr()); BowDocBs->SaveTxtStat(OutBowStatFNm); printf(" Done.\n"); } // save ngram-statistics PNGramBs NGramBs=BowDocBs->GetNGramBs(); TStr OutNGramStatFNm=TStr::PutFExt(OutBowFNm, ".NGram.Txt"); if (!NGramBs.Empty()){ printf("Saving NGram-Statistics to '%s' ...", OutNGramStatFNm.CStr()); NGramBs->SaveTxt(OutNGramStatFNm, true); printf(" Done.\n"); } } return 0; Catch; return 1; }
// Grounds an ontology with the documents of BowDocBs: document categories are
// matched to ontology terms, and every term gets a concept vector computed
// from its own documents plus the documents handed up from its "NT"-linked
// sub-terms. Terms are processed bottom-up: a term is handled once all of its
// sub-terms have been handled.
//   LangNm           - language of the category names / terms
//   DocCatIsTermIdP  - true: a category name is the integer term-id;
//                      false: a category name is looked up as a term name
//   CutWordWgtSumPrc - forwarded to GetConceptSpV
// Progress is reported on stdout.
PLwOntoGround TLwOntoGround::GetOntoGround(
 const PLwOnto& LwOnto, const PBowDocBs& BowDocBs,
 const TStr& LangNm, const bool& DocCatIsTermIdP,
 const double& CutWordWgtSumPrc){
  printf("Generating Ontology-Classifier...\n");
  // shortcuts
  PLwTermBs TermBs=LwOnto->GetTermBs();
  int Terms=TermBs->GetTerms();
  PLwLinkBs LinkBs=LwOnto->GetLinkBs();
  PLwLinkTypeBs LinkTypeBs=LwOnto->GetLinkTypeBs();
  int LangId=LwOnto->GetLangBs()->GetLangId(LangNm);
  int Docs=BowDocBs->GetDocs();
  // create tfidf
  printf(" Creating BowDocWgtBs ...");
  PBowDocWgtBs BowDocWgtBs=TBowDocWgtBs::New(BowDocBs, bwwtNrmTFIDF);
  PBowSim BowSim=TBowSim::New(bstCos);
  printf(" Done.\n");
  // collect documents per ontology-term
  printf(" Collecting documents per ontology-term ...\n");
  TIntIntVH TermIdToDIdVH; int PosCats=0; int NegCats=0;
  for (int DId=0; DId<Docs; DId++){
    printf(" Docs:%d/%d Pos:%d Neg:%d\r", 1+DId, Docs, PosCats, NegCats);
    for (int DocCIdN=0; DocCIdN<BowDocBs->GetDocCIds(DId); DocCIdN++){
      // get document-category
      int CId=BowDocBs->GetDocCId(DId, DocCIdN);
      TStr CatNm=BowDocBs->GetCatNm(CId);
      // get term-id
      if (DocCatIsTermIdP){
        int TermId=CatNm.GetInt();
        if (TermBs->IsTermId(TermId)){
          TermIdToDIdVH.AddDat(TermId).Add(DId); PosCats++;
        } else {NegCats++;}
      } else {
        if (TermBs->IsTermId(CatNm, LangId)){
          int TermId=TermBs->GetTermId(CatNm, LangId);
          TermIdToDIdVH.AddDat(TermId).Add(DId); PosCats++;
        } else {NegCats++;}
      }
    }
  }
  printf(" Docs:%d/%d Pos:%d Neg:%d\n", Docs, Docs, PosCats, NegCats);
  printf(" Done.\n");
  // create sub-terms & up-terms vectors from the "NT" links
  printf(" Creating sub-terms & up-terms vectors ...");
  TIntIntVH TermIdToSubTermIdVH;
  TIntIntVH TermIdToUpTermIdVH;
  for (int TermN=0; TermN<Terms; TermN++){
    int TermId=TermBs->GetTermId(TermN);
    for (int LinkN=0; LinkN<LinkBs->GetFromLinks(TermId); LinkN++){
      int LinkTypeId; int DstTermId;
      LinkBs->GetFromLink(TermId, LinkN, LinkTypeId, DstTermId);
      TStr LinkTypeNm=LinkTypeBs->GetLinkType(LinkTypeId)->GetLinkTypeNm();
      if (LinkTypeNm=="NT"){
        TermIdToSubTermIdVH.AddDat(TermId).Add(DstTermId);
        TermIdToUpTermIdVH.AddDat(DstTermId).Add(TermId);
      }
    }
  }
  printf(" Done.\n");
  // create centroids
  printf(" Creating centroids ...\n");
  THash<TInt, PBowSpV> TermIdToConceptSpVH;
  TIntIntVH TermIdToSubTermDIdVH;
  TIntH ProcTermIdH;
  int PrevActiveTerms=-1;
  forever{
    // count active nodes for processing (terms with unprocessed sub-terms)
    int ActiveTerms=0;
    for (int TermN=0; TermN<Terms; TermN++){
      int TermId=TermBs->GetTermId(TermN);
      if ((TermIdToSubTermIdVH.IsKey(TermId))&&
       (TermIdToSubTermIdVH.GetDat(TermId).Len()>0)){
        ActiveTerms++;
      }
    }
    // stop if no change from previous round
    printf(" Active-Terms:%d\n", ActiveTerms);
    if (ActiveTerms==PrevActiveTerms){break;}
    PrevActiveTerms=ActiveTerms;
    // process the terms that have no unprocessed sub-terms left
    for (int TermN=0; TermN<Terms; TermN++){
      int TermId=TermBs->GetTermId(TermN);
      if (ProcTermIdH.IsKey(TermId)){continue;}
      if ((!TermIdToSubTermIdVH.IsKey(TermId))||
       (TermIdToSubTermIdVH.GetDat(TermId).Len()==0)){
        printf(" %d/%d\r", 1+TermN, Terms);
        ProcTermIdH.AddKey(TermId);
        // collect document-ids: own documents and those of the sub-terms
        TIntV TermDIdV;
        if (TermIdToDIdVH.IsKey(TermId)){
          TermDIdV.AddV(TermIdToDIdVH.GetDat(TermId));}
        if (TermIdToSubTermDIdVH.IsKey(TermId)){
          TermDIdV.AddV(TermIdToSubTermDIdVH.GetDat(TermId));}
        // create concept-vector if any documents
        if (TermDIdV.Len()>0){
          PBowSpV ConceptSpV=
           TBowClust::GetConceptSpV(BowDocWgtBs, BowSim, TermDIdV, CutWordWgtSumPrc);
          TermIdToConceptSpVH.AddDat(TermId, ConceptSpV);
        }
        // correct upper-terms: remove this term from their pending
        // sub-terms and hand the collected documents up
        if (TermIdToUpTermIdVH.IsKey(TermId)){
          TIntV& UpTermIdV=TermIdToUpTermIdVH.GetDat(TermId);
          for (int UpTermIdN=0; UpTermIdN<UpTermIdV.Len(); UpTermIdN++){
            int UpTermId=UpTermIdV[UpTermIdN];
            TermIdToSubTermIdVH.GetDat(UpTermId).DelIfIn(TermId);
            if (TermDIdV.Len()>0){
              TermIdToSubTermDIdVH.AddDat(UpTermId).AddV(TermDIdV);}
          }
        }
      }
    }
  }
  printf(" Done.\n");

  // create & return classifier
  PLwOntoGround OntoGround=
   TLwOntoGround::New(LwOnto, BowDocBs, BowDocWgtBs, TermIdToConceptSpVH);
  printf("Done.\n");
  return OntoGround;
}
void TVizMapContext::PaintCatNms(PGks Gks, const int& KeyWdFontSize, TVec<TFltRect>& PointNmRectV) { // calculate frequency of categories TIntH CatH; TIntFltPrH CatPosH; PBowDocBs BowDocBs = VizMapFrame->GetKeyWdBow(); const int Points = VizMapFrame->GetPoints(); for (int PointN = 0; PointN < Points; PointN++) { PVizMapPoint Point = VizMapFrame->GetPoint(PointN); const int DId = Point->GetDocId(); const int CIds = BowDocBs->GetDocCIds(DId); for (int CIdN = 0; CIdN < CIds; CIdN++) { const int CId = BowDocBs->GetDocCId(DId, CIdN); CatH.AddDat(CId)++; CatPosH.AddDat(CId).Val1 += Point->GetPointX(); CatPosH.AddDat(CId).Val2 += Point->GetPointY(); } } CatH.SortByDat(false); // draw the top cats const int TopCats = Points > 100 ? 6 : 4; TFltRect ZoomRect = GetZoomRect(); Gks->SetFont(TGksFont::New("ARIAL", KeyWdFontSize + 3, ColorCatNmFont)); TVec<TFltRect> CatNmRectV; TVec<TFltV> CatNmPosV; const int MnSize = TInt::GetMn(Gks->GetWidth(), Gks->GetHeight()); const int MnDist = TFlt::Round(0.3 * double(MnSize)); int Cats = 0, CatKeyId = CatH.FFirstKeyId(); while (CatH.FNextKeyId(CatKeyId)) { if (Cats == TopCats) { break; } if (double(CatH[CatKeyId]) / double(Points) < 0.05) { break; } const int CId = CatH.GetKey(CatKeyId); // get name TStr CatNm = BowDocBs->GetCatNm(CId); if (CatFullNmH.IsKey(CatNm)) { CatNm = CatFullNmH.GetDat(CatNm); } else { continue; } // get position TFltPr CatPos = CatPosH.GetDat(CId); const int CatCount = CatH.GetDat(CId); IAssert(CatCount > 0); const double CatX = CatPos.Val1 / double(CatCount); const double CatY = CatPos.Val2 / double(CatCount); // is it within the zoom? 
if (!ZoomRect.IsXYIn(CatX, CatY)) { continue; } // calculate string size on the screen const int HalfTxtWidth = Gks->GetTxtWidth(CatNm) / 2; const int HalfTxtHeight = Gks->GetTxtHeight(CatNm) / 2; // get coordinates in pixels const int X = GetScreenCoord(CatX , ZoomRect.GetMnX(), ZoomRect.GetXLen(), Gks->GetWidth()); const int Y = GetScreenCoord(CatY, ZoomRect.GetMnY(), ZoomRect.GetYLen(), Gks->GetHeight()); // is it to close to any of the most prominent categories int CatNmDist = MnSize; TFltV CatNmPos = TFltV::GetV(double(X), double(Y)); for (int CatNmPosN = 0; CatNmPosN < CatNmPosV.Len(); CatNmPosN++) { const double Dist = TLinAlg::EuclDist(CatNmPosV[CatNmPosN], CatNmPos); CatNmDist = TInt::GetMn(TFlt::Round(Dist), CatNmDist); } if (CatNmDist < MnDist) { continue; } // does it overlap with any of hte most prominent categories TFltRect CatNmRect(X - HalfTxtWidth, Y - HalfTxtHeight, X + HalfTxtWidth, Y + HalfTxtHeight); bool DoDraw = true; const int Rects = CatNmRectV.Len(); for (int RectN = 0; (RectN < Rects) && DoDraw; RectN++) { DoDraw = !TFltRect::Intersection(CatNmRect, CatNmRectV[RectN]); } if (!DoDraw) { continue; } // draw it! Gks->PutTxt(CatNm, X - HalfTxtWidth, Y - HalfTxtHeight); // remember string area CatNmRectV.Add(CatNmRect); Cats++; // remember string position CatNmPosV.Add(CatNmPos); } PointNmRectV.AddV(CatNmRectV); }
// Loads a model from the binary file FNm.
// When a document base is supplied, the signature stored with the model must
// equal the signature of the document base; otherwise an exception is thrown.
PBowMd TBowMd::LoadBin(const TStr& FNm, const PBowDocBs& BowDocBs){
  PBowMd BowMd;
  {
    // the input stream lives only while the model is being read
    TFIn SIn(FNm);
    BowMd=Load(SIn);
  }
  // nothing to verify without a document base
  if (BowDocBs.Empty()){
    return BowMd;
  }
  if (BowMd->GetBowDocBsSig()!=BowDocBs->GetSig()){
    TExcept::Throw("Bow-Model and Bow-Data signatures don't match!");
  }
  return BowMd;
}