예제 #1
0
TBowMatrix::TBowMatrix(PBowDocBs BowDocBs, PBowDocWgtBs BowDocWgtBs,
        const TStr& CatNm,  const TIntV& DIdV, TFltV& ClsV): TMatrix() {

    RowN = BowDocBs->GetWords();
    ClsV.Gen(DIdV.Len(), 0);
    ColSpVV.Gen(DIdV.Len(), 0);
    IAssert(BowDocBs->IsCatNm(CatNm));
    int CatId = BowDocBs->GetCId(CatNm);
    for (int i = 0; i < DIdV.Len(); i++) {
        ColSpVV.Add(BowDocWgtBs->GetSpV(DIdV[i]));
        ClsV.Add(BowDocBs->IsCatInDoc(DIdV[i], CatId) ? 0.99 : -0.99);
    }
}
예제 #2
0
PBowMd TBowCentroidMd::New(const PBowDocBs& BowDocBs, 
		const PBowDocWgtBs& BowDocWgtBs, const TStr& CatNm, 
		const TIntV& TrainDIdV) {

	// create model
	TBowCentroidMd* CentroidMd = new TBowCentroidMd(BowDocBs); 
	PBowMd BowMd(CentroidMd); CentroidMd->CatNm = CatNm;
	// compute centroid
	TIntV CatDIdV; const int CId = BowDocBs->GetCId(CatNm);
	for (int TrainDIdN = 0; TrainDIdN < TrainDIdV.Len(); TrainDIdN++) {
		const int DId = TrainDIdV[TrainDIdN];
		if (BowDocBs->IsCatInDoc(DId, CId)) { CatDIdV.Add(DId); }
	}
	PBowSim BowSim = TBowSim::New(bstCos);
	PBowSpV CentroidSpV = TBowClust::GetConceptSpV(BowDocWgtBs, BowSim, CatDIdV);	
	CentroidMd->CentroidV.Gen(BowDocBs->GetWords());
	CentroidMd->CentroidV.PutAll(0.0);
	TBowLinAlg::AddVec(1.0, CentroidSpV, CentroidMd->CentroidV);
	return CentroidMd;
}
예제 #3
0
PBowMd TBowWinnowMd::NewMulti(
 const PBowDocBs& BowDocBs, const int& TopCats, const double& Beta){
  // create model
  TBowMultiMd* MultiMd=new TBowMultiMd(BowDocBs); PBowMd BowMd(MultiMd);
  // traverse categories
  TIntStrPrV FqCatNmPrV; BowDocBs->GetTopCatV(TopCats, FqCatNmPrV);
  for (int CatN=0; CatN<FqCatNmPrV.Len(); CatN++){
    // get category data
    TStr CatNm=FqCatNmPrV[CatN].Val2;
    int CId=BowDocBs->GetCId(CatNm);
    // output header
    printf("*** Generating model for category: '%s' %d Docs (%d/%d Cats)\n",
     CatNm.CStr(), BowDocBs->GetCatFq(CId), 1+CId, BowDocBs->GetCats());
    // create model
    PBowMd BowMd=New(BowDocBs, CatNm, Beta);
    // add model to model-set
    MultiMd->AddBowMd(BowMd);
  }
  // return model
  return BowMd;
}
예제 #4
0
PBowMd TBowWinnowMd::New(
 const PBowDocBs& BowDocBs, const TStr& CatNm, const double& Beta){
  // create model
  TBowWinnowMd* WinnowMd=new TBowWinnowMd(BowDocBs); PBowMd BowMd(WinnowMd);
  WinnowMd->CatNm=CatNm;
  WinnowMd->Beta=Beta;
  WinnowMd->VoteTsh=0.5;
  // prepare Winnow parameters
  const double MnExpertWgtSum=1e-15;
  // get cat-id
  int CId=BowDocBs->GetCId(CatNm);
  if (CId==-1){
    TExcept::Throw(TStr::GetStr(CatNm, "Invalid Category Name ('%s')!"));}
  // get training documents
  TIntV TrainDIdV; BowDocBs->GetAllDIdV(TrainDIdV);
  int TrainDocs=TrainDIdV.Len();
  // prepare mini-experts
  int Words=BowDocBs->GetWords();
  WinnowMd->PosExpertWgtV.Gen(Words); WinnowMd->PosExpertWgtV.PutAll(1);
  WinnowMd->NegExpertWgtV.Gen(Words); WinnowMd->NegExpertWgtV.PutAll(1);
  // winnow loop
  double PrevAcc=0; double PrevPrec=0; double PrevRec=0; double PrevF1=0;
  const double MxDiff=-0.005; const int MxWorseIters=3; int WorseIters=0;
  const int MxIters=50; int IterN=0;
  while ((IterN<MxIters)&&(WorseIters<MxWorseIters)){
    IterN++;
    int FalsePos=0; int FalseNeg=0; int TruePos=0; int TrueNeg=0;
    for (int DIdN=0; DIdN<TrainDocs; DIdN++){
      int DId=TrainDIdV[DIdN];
      bool ClassVal=BowDocBs->IsCatInDoc(DId, CId);
      double PosWgt=0; double NegWgt=0;
      double OldSum=0; double NewSum=0;
      int WIds=BowDocBs->GetDocWIds(DId);
      // change only experts of words that occur in the document
      for (int WIdN=0; WIdN<WIds; WIdN++){
        int WId=BowDocBs->GetDocWId(DId, WIdN);
        OldSum+=WinnowMd->PosExpertWgtV[WId]+WinnowMd->NegExpertWgtV[WId];
        // penalize expert giving wrong class prediction
        if (ClassVal){
          WinnowMd->NegExpertWgtV[WId]*=Beta;
        } else {
          WinnowMd->PosExpertWgtV[WId]*=Beta;
        }
        NewSum+=WinnowMd->PosExpertWgtV[WId]+WinnowMd->NegExpertWgtV[WId];
        PosWgt+=WinnowMd->PosExpertWgtV[WId];
        NegWgt+=WinnowMd->NegExpertWgtV[WId];
      }
      // normalize all experts
      if (NewSum>MnExpertWgtSum){
        for (int WIdN=0; WIdN<WIds; WIdN++){
          int WId=BowDocBs->GetDocWId(DId, WIdN);
          WinnowMd->PosExpertWgtV[WId]*=OldSum/NewSum;
          WinnowMd->NegExpertWgtV[WId]*=OldSum/NewSum;
        }
      }
      bool PredClassVal;
      if (PosWgt+NegWgt==0){PredClassVal=TBool::GetRnd();}
      else {PredClassVal=(PosWgt/(PosWgt+NegWgt))>WinnowMd->VoteTsh;}
      if (PredClassVal==ClassVal){
        if (PredClassVal){TruePos++;} else {TrueNeg++;}
      } else {
        if (PredClassVal){FalsePos++;} else {FalseNeg++;}
      }
    }
    // calculate temporary results
    if (TrainDocs==0){break;}
    double Acc=0; double Prec=0; double Rec=0; double F1=0;
    if (TrainDocs>0){
      Acc=100*(TruePos+TrueNeg)/double(TrainDocs);
      if (TruePos+FalsePos>0){
        Prec=(TruePos/double(TruePos+FalsePos));
        Rec=(TruePos/double(TruePos+FalseNeg));
        if (Prec+Rec>0){
          F1=(2*Prec*Rec/(Prec+Rec));
        }
      }
    }
    // check if the current iteration gave worse results then the previous
    if (((Acc-PrevAcc)<MxDiff)||((F1-PrevF1)<MxDiff)||(((Prec-PrevPrec)<MxDiff)&&
     ((Rec-PrevRec)<MxDiff))){WorseIters++;}
    else {WorseIters=0;}
    PrevAcc=Acc; PrevPrec=Prec; PrevRec=Rec; PrevF1=F1;
    printf("%d. Precision:%0.3f   Recall:%0.3f   F1:%0.3f   Accuracy:%0.3f%%\n",
     IterN, Prec, Rec, F1, Acc);
  }
  // return model
  return BowMd;
}