示例#1
0
TBowMatrix::TBowMatrix(PBowDocBs BowDocBs, PBowDocWgtBs BowDocWgtBs,
        const TStr& CatNm,  const TIntV& DIdV, TFltV& ClsV): TMatrix() {

    RowN = BowDocBs->GetWords();
    ClsV.Gen(DIdV.Len(), 0);
    ColSpVV.Gen(DIdV.Len(), 0);
    IAssert(BowDocBs->IsCatNm(CatNm));
    int CatId = BowDocBs->GetCId(CatNm);
    for (int i = 0; i < DIdV.Len(); i++) {
        ColSpVV.Add(BowDocWgtBs->GetSpV(DIdV[i]));
        ClsV.Add(BowDocBs->IsCatInDoc(DIdV[i], CatId) ? 0.99 : -0.99);
    }
}
示例#2
0
PBowMd TBowCentroidMd::New(const PBowDocBs& BowDocBs, 
		const PBowDocWgtBs& BowDocWgtBs, const TStr& CatNm, 
		const TIntV& TrainDIdV) {

	// create model
	TBowCentroidMd* CentroidMd = new TBowCentroidMd(BowDocBs); 
	PBowMd BowMd(CentroidMd); CentroidMd->CatNm = CatNm;
	// compute centroid
	TIntV CatDIdV; const int CId = BowDocBs->GetCId(CatNm);
	for (int TrainDIdN = 0; TrainDIdN < TrainDIdV.Len(); TrainDIdN++) {
		const int DId = TrainDIdV[TrainDIdN];
		if (BowDocBs->IsCatInDoc(DId, CId)) { CatDIdV.Add(DId); }
	}
	PBowSim BowSim = TBowSim::New(bstCos);
	PBowSpV CentroidSpV = TBowClust::GetConceptSpV(BowDocWgtBs, BowSim, CatDIdV);	
	CentroidMd->CentroidV.Gen(BowDocBs->GetWords());
	CentroidMd->CentroidV.PutAll(0.0);
	TBowLinAlg::AddVec(1.0, CentroidSpV, CentroidMd->CentroidV);
	return CentroidMd;
}
示例#3
0
文件: ontolight.cpp 项目: Accio/snap
PLwOntoGround TLwOntoGround::GetOntoGroundNN(const PLwOnto& LwOnto, 
        const PBowDocBs& BowDocBs, const TStr& LangNm) {

    printf("Generating Ontology-Classifier...\n");
    // shortcuts
    PLwTermBs TermBs=LwOnto->GetTermBs();
    const int Terms = TermBs->GetTerms();
    const int LangId = LwOnto->GetLangBs()->GetLangId(LangNm);
    const int Words = BowDocBs->GetWords();
    // create tfidf
    printf("  Creating BowDocWgtBs ...");
    PBowDocWgtBs BowDocWgtBs=TBowDocWgtBs::New(BowDocBs, bwwtNrmTFIDF);
    PBowSim BowSim=TBowSim::New(bstCos);
    printf(" Done.\n");
    // collect documents per ontology-term
    printf("  Collecting documents per ontology-term ... ");
    THash<TInt, PBowSpV> TermIdToConceptSpVH;
    for (int TermN = 0; TermN < Terms; TermN++){
        int TermId = TermBs->GetTermId(TermN);
        PLwTerm Term = TermBs->GetTerm(TermId);
        if (Term->GetLangId() != LangId) { continue; }
        // do nearest neighbour search
        PBowSpV TermSpV = BowDocBs->GetSpVFromHtmlStr(
            Term->GetTermNm(), BowDocWgtBs);
        TFltIntKdV SimDIdKdV;
        BowDocWgtBs->GetSimDIdV(TermSpV, BowSim, SimDIdKdV, false);
        TFltV TermV(Words); TermV.PutAll(0.0);
        for (int SimDIdKdN = 0; SimDIdKdN < SimDIdKdV.Len(); SimDIdKdN++) {
            PBowSpV DocSpV = BowDocWgtBs->GetSpV(SimDIdKdV[SimDIdKdN].Dat);
            const double Sim = SimDIdKdV[SimDIdKdN].Key;
            TBowLinAlg::AddVec(Sim, DocSpV, TermV);
        }
        TermIdToConceptSpVH.AddDat(TermId, TBowSpV::New(-1, TermV, TFlt::Eps));
    }
    printf("  Done.\n");
    // create & return classifier
    PLwOntoGround OntoGround = TLwOntoGround::New(LwOnto, 
        BowDocBs, BowDocWgtBs, TermIdToConceptSpVH);
    printf("Done.\n");
    return OntoGround;
}
示例#4
0
PBowMd TBowWinnowMd::New(
 const PBowDocBs& BowDocBs, const TStr& CatNm, const double& Beta){
  // create model
  TBowWinnowMd* WinnowMd=new TBowWinnowMd(BowDocBs); PBowMd BowMd(WinnowMd);
  WinnowMd->CatNm=CatNm;
  WinnowMd->Beta=Beta;
  WinnowMd->VoteTsh=0.5;
  // prepare Winnow parameters
  const double MnExpertWgtSum=1e-15;
  // get cat-id
  int CId=BowDocBs->GetCId(CatNm);
  if (CId==-1){
    TExcept::Throw(TStr::GetStr(CatNm, "Invalid Category Name ('%s')!"));}
  // get training documents
  TIntV TrainDIdV; BowDocBs->GetAllDIdV(TrainDIdV);
  int TrainDocs=TrainDIdV.Len();
  // prepare mini-experts
  int Words=BowDocBs->GetWords();
  WinnowMd->PosExpertWgtV.Gen(Words); WinnowMd->PosExpertWgtV.PutAll(1);
  WinnowMd->NegExpertWgtV.Gen(Words); WinnowMd->NegExpertWgtV.PutAll(1);
  // winnow loop
  double PrevAcc=0; double PrevPrec=0; double PrevRec=0; double PrevF1=0;
  const double MxDiff=-0.005; const int MxWorseIters=3; int WorseIters=0;
  const int MxIters=50; int IterN=0;
  while ((IterN<MxIters)&&(WorseIters<MxWorseIters)){
    IterN++;
    int FalsePos=0; int FalseNeg=0; int TruePos=0; int TrueNeg=0;
    for (int DIdN=0; DIdN<TrainDocs; DIdN++){
      int DId=TrainDIdV[DIdN];
      bool ClassVal=BowDocBs->IsCatInDoc(DId, CId);
      double PosWgt=0; double NegWgt=0;
      double OldSum=0; double NewSum=0;
      int WIds=BowDocBs->GetDocWIds(DId);
      // change only experts of words that occur in the document
      for (int WIdN=0; WIdN<WIds; WIdN++){
        int WId=BowDocBs->GetDocWId(DId, WIdN);
        OldSum+=WinnowMd->PosExpertWgtV[WId]+WinnowMd->NegExpertWgtV[WId];
        // penalize expert giving wrong class prediction
        if (ClassVal){
          WinnowMd->NegExpertWgtV[WId]*=Beta;
        } else {
          WinnowMd->PosExpertWgtV[WId]*=Beta;
        }
        NewSum+=WinnowMd->PosExpertWgtV[WId]+WinnowMd->NegExpertWgtV[WId];
        PosWgt+=WinnowMd->PosExpertWgtV[WId];
        NegWgt+=WinnowMd->NegExpertWgtV[WId];
      }
      // normalize all experts
      if (NewSum>MnExpertWgtSum){
        for (int WIdN=0; WIdN<WIds; WIdN++){
          int WId=BowDocBs->GetDocWId(DId, WIdN);
          WinnowMd->PosExpertWgtV[WId]*=OldSum/NewSum;
          WinnowMd->NegExpertWgtV[WId]*=OldSum/NewSum;
        }
      }
      bool PredClassVal;
      if (PosWgt+NegWgt==0){PredClassVal=TBool::GetRnd();}
      else {PredClassVal=(PosWgt/(PosWgt+NegWgt))>WinnowMd->VoteTsh;}
      if (PredClassVal==ClassVal){
        if (PredClassVal){TruePos++;} else {TrueNeg++;}
      } else {
        if (PredClassVal){FalsePos++;} else {FalseNeg++;}
      }
    }
    // calculate temporary results
    if (TrainDocs==0){break;}
    double Acc=0; double Prec=0; double Rec=0; double F1=0;
    if (TrainDocs>0){
      Acc=100*(TruePos+TrueNeg)/double(TrainDocs);
      if (TruePos+FalsePos>0){
        Prec=(TruePos/double(TruePos+FalsePos));
        Rec=(TruePos/double(TruePos+FalseNeg));
        if (Prec+Rec>0){
          F1=(2*Prec*Rec/(Prec+Rec));
        }
      }
    }
    // check if the current iteration gave worse results then the previous
    if (((Acc-PrevAcc)<MxDiff)||((F1-PrevF1)<MxDiff)||(((Prec-PrevPrec)<MxDiff)&&
     ((Rec-PrevRec)<MxDiff))){WorseIters++;}
    else {WorseIters=0;}
    PrevAcc=Acc; PrevPrec=Prec; PrevRec=Rec; PrevF1=F1;
    printf("%d. Precision:%0.3f   Recall:%0.3f   F1:%0.3f   Accuracy:%0.3f%%\n",
     IterN, Prec, Rec, F1, Acc);
  }
  // return model
  return BowMd;
}