// Builds the matrix from the documents listed in DIdV: one sparse column per
// document (taken from BowDocWgtBs) and one class label per document in ClsV.
//
//   BowDocBs    - document base; supplies the word count (number of rows)
//                 and the category membership of each document
//   BowDocWgtBs - source of the sparse vector for each document id
//   CatNm       - category name; asserted to exist in BowDocBs
//   DIdV        - ids of the documents to include, in column order
//   ClsV        - output; reset here, then filled with 0.99 for documents in
//                 the category and -0.99 for all others
TBowMatrix::TBowMatrix(PBowDocBs BowDocBs, PBowDocWgtBs BowDocWgtBs,
        const TStr& CatNm, const TIntV& DIdV, TFltV& ClsV): TMatrix() {
    RowN = BowDocBs->GetWords();

    // Both containers are (re)generated before the category check, exactly
    // as many entries are appended below as there are documents.
    // NOTE(review): Gen(n, 0) presumably reserves n slots and starts empty.
    const int Docs = DIdV.Len();
    ClsV.Gen(Docs, 0);
    ColSpVV.Gen(Docs, 0);

    IAssert(BowDocBs->IsCatNm(CatNm));
    const int CatId = BowDocBs->GetCId(CatNm);

    for (int DocN = 0; DocN < Docs; DocN++) {
        const int DId = DIdV[DocN];
        ColSpVV.Add(BowDocWgtBs->GetSpV(DId));
        // labels are +/-0.99 rather than +/-1
        if (BowDocBs->IsCatInDoc(DId, CatId)) {
            ClsV.Add(0.99);
        } else {
            ClsV.Add(-0.99);
        }
    }
}
// Creates a centroid model for category CatNm.
//
// The training documents that belong to the category are collected, their
// concept vector is obtained from TBowClust::GetConceptSpV (using a bstCos
// similarity object), and that sparse vector is expanded into the model's
// dense CentroidV, which has one entry per word of BowDocBs.
//
//   BowDocBs    - document base (category membership, word count)
//   BowDocWgtBs - document weights handed to GetConceptSpV
//   CatNm       - name of the category to model
//   TrainDIdV   - ids of the training documents
//
// Returns the smart-pointer handle that owns the new model.
PBowMd TBowCentroidMd::New(const PBowDocBs& BowDocBs,
        const PBowDocWgtBs& BowDocWgtBs, const TStr& CatNm,
        const TIntV& TrainDIdV) {
    // create model; BowMd is the owning handle, CentroidMd a typed alias
    TBowCentroidMd* CentroidMd = new TBowCentroidMd(BowDocBs);
    PBowMd BowMd(CentroidMd);
    CentroidMd->CatNm = CatNm;

    // NOTE(review): the category id is not validated here, whereas
    // TBowWinnowMd::New throws when GetCId yields -1 - confirm whether an
    // unknown CatNm should be rejected here as well.
    const int CId = BowDocBs->GetCId(CatNm);

    // keep only the training documents assigned to the category
    TIntV CatDIdV;
    const int TrainDocs = TrainDIdV.Len();
    for (int TrainDIdN = 0; TrainDIdN < TrainDocs; TrainDIdN++) {
        const int DId = TrainDIdV[TrainDIdN];
        if (BowDocBs->IsCatInDoc(DId, CId)) { CatDIdV.Add(DId); }
    }

    // compute the centroid as a sparse vector
    PBowSim BowSim = TBowSim::New(bstCos);
    PBowSpV CentroidSpV =
        TBowClust::GetConceptSpV(BowDocWgtBs, BowSim, CatDIdV);

    // expand it into a zero-initialised dense vector
    CentroidMd->CentroidV.Gen(BowDocBs->GetWords());
    CentroidMd->CentroidV.PutAll(0.0);
    TBowLinAlg::AddVec(1.0, CentroidSpV, CentroidMd->CentroidV);

    // return the owning handle (as the sibling factories do) instead of
    // building a second smart pointer from the raw pointer
    return BowMd;
}
// Builds a multi-category model: one Winnow model is trained (via New) for
// each category returned by BowDocBs->GetTopCatV(TopCats, ...) and added to
// a TBowMultiMd model-set.
//
//   BowDocBs - document base used for training
//   TopCats  - passed to GetTopCatV to select the categories
//   Beta     - forwarded unchanged to New for every category
//
// Returns the handle owning the model-set.
PBowMd TBowWinnowMd::NewMulti(
 const PBowDocBs& BowDocBs, const int& TopCats, const double& Beta){
  // model-set and its owning handle
  TBowMultiMd* MultiMd=new TBowMultiMd(BowDocBs);
  PBowMd MultiBowMd(MultiMd);

  // categories to train on, as (frequency, name) pairs
  TIntStrPrV FqCatNmPrV;
  BowDocBs->GetTopCatV(TopCats, FqCatNmPrV);

  const int SelCats=FqCatNmPrV.Len();
  for (int CatN=0; CatN<SelCats; CatN++){
    TStr CatNm=FqCatNmPrV[CatN].Val2;
    int CId=BowDocBs->GetCId(CatNm);
    // progress header; note it reports the category id, not the loop index
    printf("*** Generating model for category: '%s' %d Docs (%d/%d Cats)\n",
     CatNm.CStr(), BowDocBs->GetCatFq(CId), 1+CId, BowDocBs->GetCats());
    // train the per-category model and register it in the set
    PBowMd CatBowMd=New(BowDocBs, CatNm, Beta);
    MultiMd->AddBowMd(CatBowMd);
  }

  return MultiBowMd;
}
// Trains a Winnow-style model for category CatNm on every document of
// BowDocBs and returns it as a generic model handle.
//
// Each word has a "positive" and a "negative" expert weight, both starting
// at 1. For every training document, the expert that votes against the
// document's true class is multiplied by Beta for each word in the document;
// the touched weights are then rescaled so their sum is unchanged. The
// document is classified as positive when the share of positive weight among
// its words exceeds VoteTsh (0.5). Passes over the data repeat until 50
// iterations are done or 3 consecutive iterations are worse than their
// predecessor. Per-iteration statistics are printed to stdout.
//
//   BowDocBs - document base (documents, words, category membership)
//   CatNm    - name of the category to learn
//   Beta     - multiplicative penalty applied to the wrong-voting expert
//
// Returns the handle owning the trained TBowWinnowMd.
// Throws (via TExcept::Throw) when GetCId reports -1 for CatNm.
PBowMd TBowWinnowMd::New(
 const PBowDocBs& BowDocBs, const TStr& CatNm, const double& Beta){
  // create model; BowMd is the owning handle, WinnowMd a typed alias
  TBowWinnowMd* WinnowMd=new TBowWinnowMd(BowDocBs);
  PBowMd BowMd(WinnowMd);
  WinnowMd->CatNm=CatNm;
  WinnowMd->Beta=Beta;
  WinnowMd->VoteTsh=0.5;

  // below this weight sum the experts of a document are not renormalized
  const double MnExpertWgtSum=1e-15;

  // get cat-id
  const int CId=BowDocBs->GetCId(CatNm);
  if (CId==-1){
    TExcept::Throw(TStr::GetStr(CatNm, "Invalid Category Name ('%s')!"));
  }

  // get training documents (all documents of the base)
  TIntV TrainDIdV;
  BowDocBs->GetAllDIdV(TrainDIdV);
  const int TrainDocs=TrainDIdV.Len();

  // prepare mini-experts: one positive and one negative weight per word
  const int Words=BowDocBs->GetWords();
  WinnowMd->PosExpertWgtV.Gen(Words); WinnowMd->PosExpertWgtV.PutAll(1);
  WinnowMd->NegExpertWgtV.Gen(Words); WinnowMd->NegExpertWgtV.PutAll(1);

  // winnow loop
  double PrevAcc=0; double PrevPrec=0; double PrevRec=0; double PrevF1=0;
  const double MxDiff=-0.005;  // a drop larger than this counts as "worse"
  const int MxWorseIters=3; int WorseIters=0;
  const int MxIters=50; int IterN=0;
  while ((IterN<MxIters)&&(WorseIters<MxWorseIters)){
    IterN++;
    int FalsePos=0; int FalseNeg=0; int TruePos=0; int TrueNeg=0;
    for (int DIdN=0; DIdN<TrainDocs; DIdN++){
      const int DId=TrainDIdV[DIdN];
      const bool ClassVal=BowDocBs->IsCatInDoc(DId, CId);
      double PosWgt=0; double NegWgt=0;
      double OldSum=0; double NewSum=0;
      const int WIds=BowDocBs->GetDocWIds(DId);
      // change only experts of words that occur in the document
      for (int WIdN=0; WIdN<WIds; WIdN++){
        const int WId=BowDocBs->GetDocWId(DId, WIdN);
        OldSum+=WinnowMd->PosExpertWgtV[WId]+WinnowMd->NegExpertWgtV[WId];
        // penalize expert giving wrong class prediction
        if (ClassVal){
          WinnowMd->NegExpertWgtV[WId]*=Beta;
        } else {
          WinnowMd->PosExpertWgtV[WId]*=Beta;
        }
        NewSum+=WinnowMd->PosExpertWgtV[WId]+WinnowMd->NegExpertWgtV[WId];
        // vote totals are taken after the penalty and before rescaling;
        // rescaling multiplies both by the same factor, so the ratio used
        // for the prediction below is not affected by it
        PosWgt+=WinnowMd->PosExpertWgtV[WId];
        NegWgt+=WinnowMd->NegExpertWgtV[WId];
      }
      // normalize the touched experts so that their weight sum is preserved
      if (NewSum>MnExpertWgtSum){
        const double Scale=OldSum/NewSum;
        for (int WIdN=0; WIdN<WIds; WIdN++){
          const int WId=BowDocBs->GetDocWId(DId, WIdN);
          WinnowMd->PosExpertWgtV[WId]*=Scale;
          WinnowMd->NegExpertWgtV[WId]*=Scale;
        }
      }
      // predict; with no weight at all the class is picked at random
      bool PredClassVal;
      if (PosWgt+NegWgt==0){
        PredClassVal=TBool::GetRnd();
      } else {
        PredClassVal=(PosWgt/(PosWgt+NegWgt))>WinnowMd->VoteTsh;
      }
      if (PredClassVal==ClassVal){
        if (PredClassVal){TruePos++;} else {TrueNeg++;}
      } else {
        if (PredClassVal){FalsePos++;} else {FalseNeg++;}
      }
    }
    // nothing to evaluate without training documents
    if (TrainDocs==0){break;}
    // calculate temporary results; 100.0 keeps the product out of int range
    const double Acc=100.0*(TruePos+TrueNeg)/double(TrainDocs);
    double Prec=0; double Rec=0; double F1=0;
    // each ratio is guarded by its own denominator: recall used to be
    // computed under the precision guard and became 0/0 (NaN) when there
    // were false positives but no positive documents
    if (TruePos+FalsePos>0){Prec=TruePos/double(TruePos+FalsePos);}
    if (TruePos+FalseNeg>0){Rec=TruePos/double(TruePos+FalseNeg);}
    if (Prec+Rec>0){F1=2*Prec*Rec/(Prec+Rec);}
    // check if the current iteration gave worse results than the previous
    if (((Acc-PrevAcc)<MxDiff)||((F1-PrevF1)<MxDiff)||
     (((Prec-PrevPrec)<MxDiff)&&((Rec-PrevRec)<MxDiff))){
      WorseIters++;
    } else {
      WorseIters=0;
    }
    PrevAcc=Acc; PrevPrec=Prec; PrevRec=Rec; PrevF1=F1;
    printf("%d. Precision:%0.3f Recall:%0.3f F1:%0.3f Accuracy:%0.3f%%\n",
     IterN, Prec, Rec, F1, Acc);
  }
  // return model
  return BowMd;
}