void TBowFl::SaveSparseMatlabTxt(const PBowDocBs& BowDocBs, const PBowDocWgtBs& BowDocWgtBs, const TStr& FNm, const TStr& CatFNm, const TIntV& _DIdV) { TIntV DIdV; if (_DIdV.Empty()) { BowDocBs->GetAllDIdV(DIdV); } else { DIdV = _DIdV; } // generate map of row-ids to words TFOut WdMapSOut(TStr::PutFExt(FNm, ".row-to-word-map.dat")); for (int WId = 0; WId < BowDocWgtBs->GetWords(); WId++) { TStr WdStr = BowDocBs->GetWordStr(WId); WdMapSOut.PutStrLn(TStr::Fmt("%d %s", WId+1, WdStr.CStr())); } WdMapSOut.Flush(); // generate map of col-ids to document names TFOut DocMapSOut(TStr::PutFExt(FNm, ".col-to-docName-map.dat")); for (int DocN = 0; DocN < DIdV.Len(); DocN++) { const int DId = DIdV[DocN]; TStr DocNm = BowDocBs->GetDocNm(DId); DocMapSOut.PutStrLn(TStr::Fmt("%d %d %s", DocN, DId, DocNm.CStr())); } DocMapSOut.Flush(); // save documents' sparse vectors TFOut SOut(FNm); for (int DocN = 0; DocN < DIdV.Len(); DocN++){ const int DId = DIdV[DocN]; PBowSpV DocSpV = BowDocWgtBs->GetSpV(DId); const int DocWIds = DocSpV->GetWIds(); for (int DocWIdN=0; DocWIdN<DocWIds; DocWIdN++){ const int WId = DocSpV->GetWId(DocWIdN); const double WordWgt = DocSpV->GetWgt(DocWIdN); SOut.PutStrLn(TStr::Fmt("%d %d %.16f", WId+1, DocN+1, WordWgt)); } } SOut.Flush(); // save documents' category sparse vectors if (!CatFNm.Empty()) { TFOut CatSOut(CatFNm); for (int DocN = 0; DocN < DIdV.Len(); DocN++){ const int DId = DIdV[DocN]; const int DocCIds = BowDocBs->GetDocCIds(DId); for (int DocCIdN=0; DocCIdN<DocCIds; DocCIdN++){ const int CId = BowDocBs->GetDocCId(DId, DocCIdN); const double CatWgt = 1.0; CatSOut.PutStrLn(TStr::Fmt("%d %d %.16f", CId+1, DocN+1, CatWgt)); } } CatSOut.Flush(); } }
PBowDocBs TFtrGenBs::LoadCsv(TStr& FNm, const int& ClassId, const TIntV& IgnoreIdV, const int& TrainLen) { // feature generators PFtrGenBs FtrGenBs = TFtrGenBs::New(); // CSV parsing stuff PSIn SIn = TFIn::New(FNm); char SsCh = ' '; TStrV FldValV; // read the headers and initialise the feature generators TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false); for (int FldValN = 0; FldValN < FldValV.Len(); FldValN++) { const TStr& FldVal = FldValV[FldValN]; if (FldValN == ClassId) { if (FldVal == "NOM") { FtrGenBs->PutClsFtrGen(TFtrGenNominal::New()); } else if (FldVal == "MULTI-NOM") { FtrGenBs->PutClsFtrGen(TFtrGenMultiNom::New()); } else { TExcept::Throw("Wrong class type '" + FldVal + "', should be NOM or MULTI-NOM!"); } } else if (!IgnoreIdV.IsIn(FldValN)) { if (FldVal == TFtrGenNumeric::GetType()) { FtrGenBs->AddFtrGen(TFtrGenNumeric::New()); } else if (FldVal == TFtrGenNominal::GetType()) { FtrGenBs->AddFtrGen(TFtrGenNominal::New()); } else if (FldVal == TFtrGenToken::GetType()) { FtrGenBs->AddFtrGen(TFtrGenToken::New( TSwSet::New(swstNone), TStemmer::New(stmtNone))); } else if (FldVal == TFtrGenSparseNumeric::GetType()) { FtrGenBs->AddFtrGen(TFtrGenSparseNumeric::New()); } else if (FldVal == TFtrGenMultiNom::GetType()) { FtrGenBs->AddFtrGen(TFtrGenMultiNom::New()); } else { TExcept::Throw("Wrong type '" + FldVal + "'!"); } } } const int Flds = FldValV.Len(); // read the lines and feed them to the feature generators int Recs = 0; while (!SIn->Eof()) { if (Recs == TrainLen) { break; } Recs++; printf("%7d\r", Recs); TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false); // make sure line still has the same number of fields as the header EAssertR(FldValV.Len() == Flds, TStr::Fmt("Wrong number of fields in line %d! 
Found %d and expected %d!", Recs + 1, FldValV.Len(), Flds)); // go over lines try { TStrV FtrValV; for (int FldValN = 0; FldValN < FldValV.Len(); FldValN++) { const TStr& FldVal = FldValV[FldValN]; if (FldValN == ClassId) { FtrGenBs->UpdateCls(FldVal); } else if (!IgnoreIdV.IsIn(FldValN)) { FtrValV.Add(FldVal); } } FtrGenBs->Update(FtrValV); } catch (PExcept Ex) { TExcept::Throw(TStr::Fmt("Error in line %d: '%s'!", Recs+1, Ex->GetMsgStr().CStr())); } } // read the file again and feed it to the training set PBowDocBs BowDocBs = FtrGenBs->MakeBowDocBs(); // we read and ignore the headers since we parsed them already SIn = TFIn::New(FNm); SsCh = ' '; TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false); // read the lines and feed them to the training set Recs = 0; while (!SIn->Eof()){ Recs++; printf("%7d\r", Recs); TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false); // make sure line still has the same number of fields as the header EAssertR(FldValV.Len() == Flds, TStr::Fmt("Wrong number of fields in line %s! Found %d and expected %d!", Recs + 1, FldValV.Len(), Flds)); // go over lines and construct the sparse vector TStrV FtrValV; TStr ClsFtrVal; try { for (int FldValN = 0; FldValN < FldValV.Len(); FldValN++) { const TStr& FldVal = FldValV[FldValN]; if (FldValN == ClassId) { ClsFtrVal = FldVal; } else if (!IgnoreIdV.IsIn(FldValN)) { FtrValV.Add(FldVal); } } } catch (PExcept Ex) { TExcept::Throw(TStr::Fmt("Error in line %d: '%s'!", Recs+1, Ex->GetMsgStr().CStr())); } // add the feature vector to trainsets FtrGenBs->AddBowDoc(BowDocBs, TStr::Fmt("Line-%d", Recs), FtrValV, ClsFtrVal); } // prepare training and testing doc ids TIntV AllDIdV; BowDocBs->GetAllDIdV(AllDIdV); IAssert(AllDIdV.IsSorted()); TIntV TrainDIdV = AllDIdV; TrainDIdV.Trunc(TrainLen); BowDocBs->PutTrainDIdV(TrainDIdV); TIntV TestDIdV = AllDIdV; TestDIdV.Minus(TrainDIdV); BowDocBs->PutTestDIdV(TestDIdV); return BowDocBs; }
// Trains a Winnow-style binary model for one category over every document in
// BowDocBs and returns it.
//
// Each word has a positive and a negative "expert" weight, both starting at 1.
// For every document, the expert voting for the wrong class of each word in
// the document is multiplied by Beta; the touched weights are then rescaled so
// their total equals what it was before the penalty. The document is predicted
// positive when the positive share of its word weights exceeds VoteTsh (0.5).
//
// Training stops after 50 iterations, or after 3 consecutive iterations judged
// worse than the previous one (accuracy or F1 dropped by more than 0.005, or
// both precision and recall did). Per-iteration results are printed to stdout.
//
// Parameters:
//   BowDocBs - documents, words and category assignments to train on
//   CatNm    - name of the target category
//   Beta     - multiplicative penalty applied to wrong-voting experts
//
// Errors: TExcept::Throw is called when CatNm is not a known category.
PBowMd TBowWinnowMd::New(
        const PBowDocBs& BowDocBs, const TStr& CatNm, const double& Beta) {
    // create model; ownership is handed to the smart pointer immediately
    TBowWinnowMd* WinnowMd = new TBowWinnowMd(BowDocBs);
    PBowMd BowMd(WinnowMd);
    WinnowMd->CatNm = CatNm;
    WinnowMd->Beta = Beta;
    WinnowMd->VoteTsh = 0.5;
    // prepare Winnow parameters
    const double MnExpertWgtSum = 1e-15;
    // get cat-id
    int CId = BowDocBs->GetCId(CatNm);
    if (CId == -1) {
        TExcept::Throw(TStr::GetStr(CatNm, "Invalid Category Name ('%s')!"));
    }
    // get training documents
    TIntV TrainDIdV;
    BowDocBs->GetAllDIdV(TrainDIdV);
    int TrainDocs = TrainDIdV.Len();
    // prepare mini-experts
    int Words = BowDocBs->GetWords();
    WinnowMd->PosExpertWgtV.Gen(Words);
    WinnowMd->PosExpertWgtV.PutAll(1);
    WinnowMd->NegExpertWgtV.Gen(Words);
    WinnowMd->NegExpertWgtV.PutAll(1);
    // winnow loop
    double PrevAcc = 0; double PrevPrec = 0; double PrevRec = 0; double PrevF1 = 0;
    const double MxDiff = -0.005;
    const int MxWorseIters = 3;
    int WorseIters = 0;
    const int MxIters = 50;
    int IterN = 0;
    while ((IterN < MxIters) && (WorseIters < MxWorseIters)) {
        IterN++;
        int FalsePos = 0; int FalseNeg = 0; int TruePos = 0; int TrueNeg = 0;
        for (int DIdN = 0; DIdN < TrainDocs; DIdN++) {
            int DId = TrainDIdV[DIdN];
            bool ClassVal = BowDocBs->IsCatInDoc(DId, CId);
            double PosWgt = 0; double NegWgt = 0;
            double OldSum = 0; double NewSum = 0;
            int WIds = BowDocBs->GetDocWIds(DId);
            // change only experts of words that occur in the document
            for (int WIdN = 0; WIdN < WIds; WIdN++) {
                int WId = BowDocBs->GetDocWId(DId, WIdN);
                OldSum += WinnowMd->PosExpertWgtV[WId] + WinnowMd->NegExpertWgtV[WId];
                // penalize expert giving wrong class prediction
                if (ClassVal) {
                    WinnowMd->NegExpertWgtV[WId] *= Beta;
                } else {
                    WinnowMd->PosExpertWgtV[WId] *= Beta;
                }
                NewSum += WinnowMd->PosExpertWgtV[WId] + WinnowMd->NegExpertWgtV[WId];
                PosWgt += WinnowMd->PosExpertWgtV[WId];
                NegWgt += WinnowMd->NegExpertWgtV[WId];
            }
            // normalize all experts touched by this document; skipped when the
            // penalized sum is too small to divide by safely
            if (NewSum > MnExpertWgtSum) {
                for (int WIdN = 0; WIdN < WIds; WIdN++) {
                    int WId = BowDocBs->GetDocWId(DId, WIdN);
                    WinnowMd->PosExpertWgtV[WId] *= OldSum/NewSum;
                    WinnowMd->NegExpertWgtV[WId] *= OldSum/NewSum;
                }
            }
            // vote; with no weight at all (e.g. a document without words)
            // the prediction is random
            bool PredClassVal;
            if (PosWgt + NegWgt == 0) {
                PredClassVal = TBool::GetRnd();
            } else {
                PredClassVal = (PosWgt/(PosWgt + NegWgt)) > WinnowMd->VoteTsh;
            }
            if (PredClassVal == ClassVal) {
                if (PredClassVal) { TruePos++; } else { TrueNeg++; }
            } else {
                if (PredClassVal) { FalsePos++; } else { FalseNeg++; }
            }
        }
        // calculate temporary results
        if (TrainDocs == 0) { break; }
        double Acc = 100*(TruePos + TrueNeg)/double(TrainDocs);
        double Prec = 0; double Rec = 0; double F1 = 0;
        // precision and recall have different denominators, so each one is
        // guarded separately; otherwise recall becomes 0/0 (NaN) when there
        // are false positives but no positive documents
        if (TruePos + FalsePos > 0) {
            Prec = (TruePos/double(TruePos + FalsePos));
        }
        if (TruePos + FalseNeg > 0) {
            Rec = (TruePos/double(TruePos + FalseNeg));
        }
        if (Prec + Rec > 0) {
            F1 = (2*Prec*Rec/(Prec + Rec));
        }
        // check if the current iteration gave worse results then the previous
        if (((Acc - PrevAcc) < MxDiff) || ((F1 - PrevF1) < MxDiff) ||
            (((Prec - PrevPrec) < MxDiff) && ((Rec - PrevRec) < MxDiff))) {
            WorseIters++;
        } else {
            WorseIters = 0;
        }
        PrevAcc = Acc; PrevPrec = Prec; PrevRec = Rec; PrevF1 = F1;
        printf("%d. Precision:%0.3f Recall:%0.3f F1:%0.3f Accuracy:%0.3f%%\n",
            IterN, Prec, Rec, F1, Acc);
    }
    // return model
    return BowMd;
}