TBowMatrix::TBowMatrix(PBowDocBs BowDocBs, PBowDocWgtBs BowDocWgtBs,
        const TStr& CatNm, const TIntV& DIdV, TFltV& ClsV): TMatrix() {

    // row dimension equals the vocabulary size of the document base
    RowN = BowDocBs->GetWords();
    // the requested category must exist
    IAssert(BowDocBs->IsCatNm(CatNm));
    const int CatId = BowDocBs->GetCId(CatNm);
    // pre-allocate one slot per selected document
    const int Docs = DIdV.Len();
    ClsV.Gen(Docs, 0);
    ColSpVV.Gen(Docs, 0);
    // one sparse column vector plus a +/-0.99 class label per document
    for (int DIdN = 0; DIdN < Docs; DIdN++) {
        const int DId = DIdV[DIdN];
        ColSpVV.Add(BowDocWgtBs->GetSpV(DId));
        ClsV.Add(BowDocBs->IsCatInDoc(DId, CatId) ? 0.99 : -0.99);
    }
}
PBowMd TBowCentroidMd::New(const PBowDocBs& BowDocBs,
        const PBowDocWgtBs& BowDocWgtBs, const TStr& CatNm,
        const TIntV& TrainDIdV) {

    // create the (still empty) model, immediately owned by a smart pointer
    TBowCentroidMd* CentroidMd = new TBowCentroidMd(BowDocBs);
    PBowMd BowMd(CentroidMd);
    CentroidMd->CatNm = CatNm;
    // pick out the training documents labeled with the category
    const int CId = BowDocBs->GetCId(CatNm);
    TIntV CatDIdV;
    for (int TrainDIdN = 0; TrainDIdN < TrainDIdV.Len(); TrainDIdN++) {
        const int DId = TrainDIdV[TrainDIdN];
        if (BowDocBs->IsCatInDoc(DId, CId)) { CatDIdV.Add(DId); }
    }
    // centroid = cosine-similarity concept vector of the positive documents
    PBowSim BowSim = TBowSim::New(bstCos);
    PBowSpV CentroidSpV = TBowClust::GetConceptSpV(BowDocWgtBs, BowSim, CatDIdV);
    // unfold the sparse centroid into a dense word-weight vector
    CentroidMd->CentroidV.Gen(BowDocBs->GetWords());
    CentroidMd->CentroidV.PutAll(0.0);
    TBowLinAlg::AddVec(1.0, CentroidSpV, CentroidMd->CentroidV);
    return BowMd;
}
PLwOntoGround TLwOntoGround::GetOntoGroundNN(const PLwOnto& LwOnto,
        const PBowDocBs& BowDocBs, const TStr& LangNm) {

    printf("Generating Ontology-Classifier...\n");
    // shortcuts
    PLwTermBs TermBs = LwOnto->GetTermBs();
    const int Terms = TermBs->GetTerms();
    const int LangId = LwOnto->GetLangBs()->GetLangId(LangNm);
    const int Words = BowDocBs->GetWords();
    // weight the corpus with normalized TFIDF; cosine for similarity search
    printf(" Creating BowDocWgtBs ...");
    PBowDocWgtBs BowDocWgtBs = TBowDocWgtBs::New(BowDocBs, bwwtNrmTFIDF);
    PBowSim BowSim = TBowSim::New(bstCos);
    printf(" Done.\n");
    // build one concept vector per ontology term of the requested language
    printf(" Collecting documents per ontology-term ... ");
    THash<TInt, PBowSpV> TermIdToConceptSpVH;
    for (int TermN = 0; TermN < Terms; TermN++) {
        const int TermId = TermBs->GetTermId(TermN);
        PLwTerm Term = TermBs->GetTerm(TermId);
        if (Term->GetLangId() != LangId) { continue; }
        // nearest-neighbour search: rank every document against the term name
        PBowSpV TermSpV = BowDocBs->GetSpVFromHtmlStr(
            Term->GetTermNm(), BowDocWgtBs);
        TFltIntKdV SimDIdKdV;
        BowDocWgtBs->GetSimDIdV(TermSpV, BowSim, SimDIdKdV, false);
        // concept vector = similarity-weighted sum of the document vectors
        TFltV ConceptV(Words);
        ConceptV.PutAll(0.0);
        for (int SimN = 0; SimN < SimDIdKdV.Len(); SimN++) {
            PBowSpV DocSpV = BowDocWgtBs->GetSpV(SimDIdKdV[SimN].Dat);
            TBowLinAlg::AddVec(SimDIdKdV[SimN].Key, DocSpV, ConceptV);
        }
        // sparsify, dropping near-zero weights
        TermIdToConceptSpVH.AddDat(TermId, TBowSpV::New(-1, ConceptV, TFlt::Eps));
    }
    printf(" Done.\n");
    // assemble & return the classifier
    PLwOntoGround OntoGround = TLwOntoGround::New(
        LwOnto, BowDocBs, BowDocWgtBs, TermIdToConceptSpVH);
    printf("Done.\n");
    return OntoGround;
}
// Trains a balanced-Winnow classifier for category CatNm over all documents
// in BowDocBs. Each word gets a positive and a negative mini-expert; experts
// voting for the wrong class on a document are demoted by factor Beta, then
// the touched experts are re-normalized so their total weight is preserved.
// Training stops after MxIters iterations or once results degrade for
// MxWorseIters consecutive iterations. Throws if CatNm is not a category.
PBowMd TBowWinnowMd::New(
        const PBowDocBs& BowDocBs, const TStr& CatNm, const double& Beta) {

    // create model
    TBowWinnowMd* WinnowMd = new TBowWinnowMd(BowDocBs);
    PBowMd BowMd(WinnowMd);
    WinnowMd->CatNm = CatNm;
    WinnowMd->Beta = Beta;
    WinnowMd->VoteTsh = 0.5;
    // prepare Winnow parameters: floor below which we skip re-normalization
    const double MnExpertWgtSum = 1e-15;
    // get cat-id
    const int CId = BowDocBs->GetCId(CatNm);
    if (CId == -1) {
        TExcept::Throw(TStr::GetStr(CatNm, "Invalid Category Name ('%s')!"));
    }
    // training documents: all documents in the base
    TIntV TrainDIdV; BowDocBs->GetAllDIdV(TrainDIdV);
    const int TrainDocs = TrainDIdV.Len();
    // prepare mini-experts: one positive + one negative per word, weight 1
    const int Words = BowDocBs->GetWords();
    WinnowMd->PosExpertWgtV.Gen(Words); WinnowMd->PosExpertWgtV.PutAll(1);
    WinnowMd->NegExpertWgtV.Gen(Words); WinnowMd->NegExpertWgtV.PutAll(1);
    // winnow loop
    double PrevAcc = 0; double PrevPrec = 0; double PrevRec = 0; double PrevF1 = 0;
    const double MxDiff = -0.005; // tolerate this much degradation per metric
    const int MxWorseIters = 3; int WorseIters = 0;
    const int MxIters = 50; int IterN = 0;
    while ((IterN < MxIters) && (WorseIters < MxWorseIters)) {
        IterN++;
        int FalsePos = 0; int FalseNeg = 0; int TruePos = 0; int TrueNeg = 0;
        for (int DIdN = 0; DIdN < TrainDocs; DIdN++) {
            const int DId = TrainDIdV[DIdN];
            const bool ClassVal = BowDocBs->IsCatInDoc(DId, CId);
            double PosWgt = 0; double NegWgt = 0;
            double OldSum = 0; double NewSum = 0;
            const int WIds = BowDocBs->GetDocWIds(DId);
            // change only experts of words that occur in the document
            for (int WIdN = 0; WIdN < WIds; WIdN++) {
                const int WId = BowDocBs->GetDocWId(DId, WIdN);
                OldSum += WinnowMd->PosExpertWgtV[WId] + WinnowMd->NegExpertWgtV[WId];
                // penalize the expert giving the wrong class prediction
                if (ClassVal) { WinnowMd->NegExpertWgtV[WId] *= Beta; }
                else { WinnowMd->PosExpertWgtV[WId] *= Beta; }
                NewSum += WinnowMd->PosExpertWgtV[WId] + WinnowMd->NegExpertWgtV[WId];
                PosWgt += WinnowMd->PosExpertWgtV[WId];
                NegWgt += WinnowMd->NegExpertWgtV[WId];
            }
            // re-normalize the touched experts so their total weight is kept
            if (NewSum > MnExpertWgtSum) {
                for (int WIdN = 0; WIdN < WIds; WIdN++) {
                    const int WId = BowDocBs->GetDocWId(DId, WIdN);
                    WinnowMd->PosExpertWgtV[WId] *= OldSum / NewSum;
                    WinnowMd->NegExpertWgtV[WId] *= OldSum / NewSum;
                }
            }
            // weighted-majority vote (random tie-break on empty documents)
            bool PredClassVal;
            if (PosWgt + NegWgt == 0) { PredClassVal = TBool::GetRnd(); }
            else { PredClassVal = (PosWgt / (PosWgt + NegWgt)) > WinnowMd->VoteTsh; }
            // update the confusion counts
            if (PredClassVal == ClassVal) {
                if (PredClassVal) { TruePos++; } else { TrueNeg++; }
            } else {
                if (PredClassVal) { FalsePos++; } else { FalseNeg++; }
            }
        }
        // calculate temporary results (nothing to evaluate without documents)
        if (TrainDocs == 0) { break; }
        double Acc = 0; double Prec = 0; double Rec = 0; double F1 = 0;
        Acc = 100 * (TruePos + TrueNeg) / double(TrainDocs);
        if (TruePos + FalsePos > 0) {
            Prec = TruePos / double(TruePos + FalsePos);
        }
        // BUG FIX: recall used to be computed under the precision guard, so
        // TruePos==0 && FalseNeg==0 with FalsePos>0 yielded 0/0 (NaN), which
        // then poisoned F1 and the degradation test below. Guard recall by
        // its own denominator instead; results are unchanged whenever the
        // original produced a finite value.
        if (TruePos + FalseNeg > 0) {
            Rec = TruePos / double(TruePos + FalseNeg);
        }
        if (Prec + Rec > 0) {
            F1 = 2 * Prec * Rec / (Prec + Rec);
        }
        // check if the current iteration gave worse results than the previous
        if (((Acc - PrevAcc) < MxDiff) || ((F1 - PrevF1) < MxDiff) ||
                (((Prec - PrevPrec) < MxDiff) && ((Rec - PrevRec) < MxDiff))) {
            WorseIters++;
        } else {
            WorseIters = 0;
        }
        PrevAcc = Acc; PrevPrec = Prec; PrevRec = Rec; PrevF1 = F1;
        printf("%d. Precision:%0.3f Recall:%0.3f F1:%0.3f Accuracy:%0.3f%%\n",
            IterN, Prec, Rec, F1, Acc);
    }
    // return model
    return BowMd;
}