// Splits this string into words and stores them, in order, in WordUStrV.
// Segmentation is driven by the boundary flags from GetWordBoundPV(): flag
// [i+1] set means a segment ends after character i. A segment is kept as a
// word when it has more than one character, or when it is a single
// alphabetic character; other single-character segments are dropped.
// Any previous content of WordUStrV is discarded.
void TUStr::GetWordUStrV(TUStrV& WordUStrV){
  WordUStrV.Clr();
  // one boundary flag per inter-character position, plus both string ends
  TBoolV BoundPV; GetWordBoundPV(BoundPV);
  IAssert(Len()==BoundPV.Len()-1);
  IAssert((BoundPV.Len()>0)&&(BoundPV.Last()));
  const int Chs=Len();
  TIntV CurWordChV; // characters of the segment being collected
  for (int ChN=0; ChN<=Chs; ChN++){
    const bool AtEnd=(ChN==Chs);
    if (!AtEnd && !BoundPV[ChN+1]){
      // inside a segment: just collect the character
      CurWordChV.Add(UniChV[ChN]);
      continue;
    }
    if (!AtEnd){
      // this character closes a segment: keep it if it extends a word that
      // is already under way, or if it is alphabetic on its own
      if ((!CurWordChV.Empty())||(IsAlphabetic(UniChV[ChN]))){
        CurWordChV.Add(UniChV[ChN]);
      }
    }
    if (CurWordChV.Empty()){continue;} // nothing collected for this segment
    WordUStrV.Add(TUStr(CurWordChV)); // emit the finished word
    CurWordChV.Clr(false); // reuse the buffer for the next segment
  }
}
示例#2
0
/////////////////////////////////////////////////
// Trawling the web for emerging communities
// graph, left points to right
// Builds the trawling index from a directed graph whose edges go only from
// "left" nodes (sources) to "right" nodes (items). Fills:
//   NIdSetH : left node id -> its out-neighbours that reach MinSup support
//   SetNIdH : right node id -> positions (key indices in NIdSetH) of the
//             sets that contain it
// Support of an item is the number of left nodes pointing at it.
TTrawling::TTrawling(const PNGraph& Graph, const int& MinSupport) : MinSup(MinSupport) {
  // pass 1: count, for every item, how many left nodes link to it
  TIntH SupportH;
  for (TNGraph::TNodeI NodeI = Graph->BegNI(); NodeI < Graph->EndNI(); NodeI++) {
    const int OutDeg = NodeI.GetOutDeg();
    IAssert(OutDeg==0 || NodeI.GetInDeg()==0); // a node is a source or a sink, never both
    for (int EdgeN = 0; EdgeN < OutDeg; EdgeN++) {
      SupportH.AddDat(NodeI.GetOutNId(EdgeN)) += 1;
    }
  }

  // pass 2: for every left node keep only the sufficiently supported items
  TIntV FreqItemV;
  for (TNGraph::TNodeI NodeI = Graph->BegNI(); NodeI < Graph->EndNI(); NodeI++) {
    const int OutDeg = NodeI.GetOutDeg();
    IAssert(OutDeg==0 || NodeI.GetInDeg()==0); // a node is a source or a sink, never both
    if (OutDeg == 0) { continue; } // right-hand node, owns no item set
    FreqItemV.Clr(false);
    for (int EdgeN = 0; EdgeN < OutDeg; EdgeN++) {
      const int ItemId = NodeI.GetOutNId(EdgeN);
      if (SupportH.GetDat(ItemId) >= MinSup) {
        FreqItemV.Add(ItemId);
      }
    }
    if (FreqItemV.Empty()) { continue; } // no frequent items: node is not indexed
    NIdSetH.AddDat(NodeI.GetId(), FreqItemV);
  }

  // pass 3: inverted index; note that the stored value is the position of
  // the set inside NIdSetH, not the left node's id
  for (int SetN = 0; SetN < NIdSetH.Len(); SetN++) {
    const TIntV& ItemV = NIdSetH[SetN];
    for (int ItemN = 0; ItemN < ItemV.Len(); ItemN++) {
      SetNIdH.AddDat(ItemV[ItemN]).Add(SetN);
    }
  }
}
示例#3
0
void TDecisionTree::TNode::Fit(const TFltVV& FtrVV, const TFltV& ClassV, const TIntV& InstNV) {
	EAssert(!InstNV.Empty());

	const int Dim = FtrVV.GetRows();

	NExamples = InstNV.Len();

	ClassHist.Gen(2);
	FtrHist.Gen(Dim);

	{
		int TotalPos = 0;
		double BestScore = TFlt::NInf, CutVal = TFlt::NInf, Score = TFlt::NInf;

		for (int i = 0; i < NExamples; i++) {
			AssertR(0 <= InstNV[i] && InstNV[i] < FtrVV.GetCols(), "Invalid instance index: " + TInt::GetStr(InstNV[i]) + "!");
			TotalPos += (int) ClassV[InstNV[i]];
		}

		ClassHist[0] = 1 - double(TotalPos) / NExamples;
		ClassHist[1] = 1 - ClassHist[0];

		TFltIntPrV ValClassPrV(NExamples);

		// get the best score and cut value
		int InstN;
		for (int FtrN = 0; FtrN < Dim; FtrN++) {
			double FtrSum = 0;

			for (int i = 0; i < NExamples; i++) {
				InstN = InstNV[i];

				AssertR(0 <= InstN && InstN < FtrVV.GetCols(), "Invalid instance index: " + TInt::GetStr(InstN) + "!");

				ValClassPrV[i].Val1 = FtrVV(FtrN, InstN);
				ValClassPrV[i].Val2 = (int) ClassV[InstN];
				FtrSum += FtrVV(FtrN, InstN);
			}

			ValClassPrV.Sort(true);	// have to sort to speed up the calculation

			if (CanSplitNumFtr(ValClassPrV, TotalPos, CutVal, Score) && Score > BestScore) {
				BestScore = Score;
				CutFtrN = FtrN;
				CutFtrVal = CutVal;
			}

			FtrHist[FtrN] = FtrSum / NExamples;
		}
	}

	// cut the dataset into left and right and build the tree recursively
	if (ShouldGrow() && CutFtrN >= 0) {
		EAssert(CutFtrN < Dim);
		// the best attribute is now selected, calculate the correlation between the
		// selected attribute and other attributes, then split the node
		CalcCorrFtrV(FtrVV, InstNV);
		Split(FtrVV, ClassV, InstNV);
	}
}
示例#4
0
文件: ff.cpp 项目: hdravna/CommDet
// Burn each link independently: every unburned out-neighbour of a burning
// node catches fire with probability FwdBurnProb, every unburned in-neighbour
// with probability BckBurnProb (in-links are skipped when BckBurnProb <= 0).
// The fire starts at InfectNIdV; a node burns for exactly one time step, and
// both probabilities are multiplied by ProbDecay after each step. They are
// restored to their original values before returning.
// Outputs:
//   BurnedNIdV  : ids of all burned nodes (including the initial ones)
//   NBurnedTmV  : cumulative number of burned nodes after each step
//   NBurningTmV : per step, burning nodes that still had an unburned neighbour
//   NewBurnedTmV: number of nodes newly set on fire in each step
void TForestFire::BurnExpFire() {
  const double OldFwdBurnProb = FwdBurnProb;
  const double OldBckBurnProb = BckBurnProb;
  const int NInfect = InfectNIdV.Len();
  const TNGraph& G = *Graph;
  TIntH BurnedNIdH;               // burned nodes
  TIntV BurningNIdV = InfectNIdV; // currently burning nodes
  TIntV NewBurnedNIdV;            // nodes newly burned in current step
  bool HasAliveNbrs;              // has unburned neighbors
  int NBurned = NInfect, NDiedFire=0;
  for (int i = 0; i < InfectNIdV.Len(); i++) {
    BurnedNIdH.AddDat(InfectNIdV[i]); }
  NBurnedTmV.Clr(false);  NBurningTmV.Clr(false);  NewBurnedTmV.Clr(false);
  for (int time = 0; ; time++) {
    NewBurnedNIdV.Clr(false);
    // Count died-out fires per time step. Resetting this inside the node
    // loop (as before) discarded the count of every node but the last one.
    NDiedFire = 0;
    // for each burning node
    for (int node = 0; node < BurningNIdV.Len(); node++) {
      const int& BurningNId = BurningNIdV[node];
      const TNGraph::TNodeI Node = G.GetNI(BurningNId);
      HasAliveNbrs = false;
      // burn forward links  (out-links)
      for (int e = 0; e < Node.GetOutDeg(); e++) {
        const int OutNId = Node.GetOutNId(e);
        if (! BurnedNIdH.IsKey(OutNId)) { // not yet burned
          HasAliveNbrs = true;
          if (Rnd.GetUniDev() < FwdBurnProb) {
            BurnedNIdH.AddDat(OutNId);  NewBurnedNIdV.Add(OutNId);  NBurned++; }
        }
      }
      // burn backward links (in-links)
      if (BckBurnProb > 0.0) {
        for (int e = 0; e < Node.GetInDeg(); e++) {
          const int InNId = Node.GetInNId(e);
          if (! BurnedNIdH.IsKey(InNId)) { // not yet burned
            HasAliveNbrs = true;
            if (Rnd.GetUniDev() < BckBurnProb) {
              BurnedNIdH.AddDat(InNId);  NewBurnedNIdV.Add(InNId);  NBurned++; }
          }
        }
      }
      // a node with no unburned neighbour had nothing left to ignite
      if (! HasAliveNbrs) { NDiedFire++; }
    }
    NBurnedTmV.Add(NBurned);
    NBurningTmV.Add(BurningNIdV.Len() - NDiedFire);
    NewBurnedTmV.Add(NewBurnedNIdV.Len());
    //BurningNIdV.AddV(NewBurnedNIdV);   // node is burning eternally
    BurningNIdV.Swap(NewBurnedNIdV);    // node is burning just 1 time step
    if (BurningNIdV.Empty()) break;
    FwdBurnProb = FwdBurnProb * ProbDecay;
    BckBurnProb = BckBurnProb * ProbDecay;
  }
  // export the burned set as a vector
  BurnedNIdV.Gen(BurnedNIdH.Len(), 0);
  for (int i = 0; i < BurnedNIdH.Len(); i++) {
    BurnedNIdV.Add(BurnedNIdH.GetKey(i)); }
  // undo the decay so the next simulation starts from the configured values
  FwdBurnProb = OldFwdBurnProb;
  BckBurnProb = OldBckBurnProb;
}
示例#5
0
文件: bowflx.cpp 项目: Accio/snap
void TBowFl::SaveSparseMatlabTxt(const PBowDocBs& BowDocBs,
    const PBowDocWgtBs& BowDocWgtBs, const TStr& FNm,
    const TStr& CatFNm, const TIntV& _DIdV) {

  TIntV DIdV;
  if (_DIdV.Empty()) {
      BowDocBs->GetAllDIdV(DIdV);
  } else {
      DIdV = _DIdV;
  }
  // generate map of row-ids to words
  TFOut WdMapSOut(TStr::PutFExt(FNm, ".row-to-word-map.dat"));
  for (int WId = 0; WId < BowDocWgtBs->GetWords(); WId++) {
    TStr WdStr = BowDocBs->GetWordStr(WId);
    WdMapSOut.PutStrLn(TStr::Fmt("%d %s", WId+1,  WdStr.CStr()));
  }
  WdMapSOut.Flush();
  // generate map of col-ids to document names
  TFOut DocMapSOut(TStr::PutFExt(FNm, ".col-to-docName-map.dat"));
  for (int DocN = 0; DocN < DIdV.Len(); DocN++) {
    const int DId = DIdV[DocN];
    TStr DocNm = BowDocBs->GetDocNm(DId);
    DocMapSOut.PutStrLn(TStr::Fmt("%d %d %s", DocN, DId,  DocNm.CStr()));
  }
  DocMapSOut.Flush();
  // save documents' sparse vectors
  TFOut SOut(FNm);
  for (int DocN = 0; DocN < DIdV.Len(); DocN++){
    const int DId = DIdV[DocN];
    PBowSpV DocSpV = BowDocWgtBs->GetSpV(DId);
    const int DocWIds = DocSpV->GetWIds();
    for (int DocWIdN=0; DocWIdN<DocWIds; DocWIdN++){
      const int WId = DocSpV->GetWId(DocWIdN);
      const double WordWgt = DocSpV->GetWgt(DocWIdN);
      SOut.PutStrLn(TStr::Fmt("%d %d %.16f", WId+1, DocN+1, WordWgt));
    }
  }
  SOut.Flush();
  // save documents' category sparse vectors
  if (!CatFNm.Empty()) {
    TFOut CatSOut(CatFNm);
    for (int DocN = 0; DocN < DIdV.Len(); DocN++){
      const int DId = DIdV[DocN];
      const int DocCIds = BowDocBs->GetDocCIds(DId);
      for (int DocCIdN=0; DocCIdN<DocCIds; DocCIdN++){
        const int CId = BowDocBs->GetDocCId(DId, DocCIdN);
        const double CatWgt = 1.0;
        CatSOut.PutStrLn(TStr::Fmt("%d %d %.16f", CId+1, DocN+1, CatWgt));
      }
    }
    CatSOut.Flush();
  }
}
示例#6
0
// Splits this string into words (same segmentation rule as the vector
// variant: multi-character segments and single alphabetic characters are
// words) and returns them as a list, together with a parallel list of flags.
//   WordUStrV : receives the words in order; previous content is discarded
//   TerminalV : receives one flag per word; previous content is discarded.
//               A flag is true when, after at least one word had already been
//               emitted, a dropped single-character segment for which
//               IsTerminal() holds was seen before this word.
// Both lists always come back with the same length.
void TUStr::GetWordUStrLst(TLst<TUStr>& WordUStrV, TLst<TBool> &TerminalV){
  // clear both outputs; leaving TerminalV untouched would misalign it with
  // the freshly cleared word list whenever the caller reuses the lists
  WordUStrV.Clr();
  TerminalV.Clr();
  // create boundaries
  TBoolV WordBoundPV; GetWordBoundPV(WordBoundPV);
  IAssert(Len()==WordBoundPV.Len()-1);
  IAssert((WordBoundPV.Len()>0)&&(WordBoundPV.Last()));
  // traverse characters and bounds
  int UniChs=Len(); TIntV WordUniChV;
  bool terminal = false; // pending flag, attached to the next emitted word

  for (int UniChN=0; UniChN<=UniChs; UniChN++){
    if ((UniChN==UniChs)||(WordBoundPV[UniChN+1])){ // finish or word-boundary
      if (UniChN<UniChs){ // if not finish
        // if last-word-char or single-alphabetic-char
        if ((!WordUniChV.Empty())||(IsAlphabetic(UniChV[UniChN]))){
          WordUniChV.Add(UniChV[UniChN]); // add char
        } else {
          // dropped single-character segment: remember it if it is a
          // terminal character and some word precedes it
          if (WordUStrV.Len() > 0){
            if (IsTerminal(UniChV[UniChN])){ terminal = true; }
          }
        }
      }
      if (!WordUniChV.Empty()){ // add current word to list
        TUStr WordUStr(WordUniChV); // construct word from char-vector
        WordUStrV.AddBack(WordUStr); // add word to word-list
        WordUniChV.Clr(false); // clear char-vector
        TerminalV.AddBack(terminal); // flag collected since the previous word
        terminal = false;
      }
    } else {
      // add character to char-vector
      WordUniChV.Add(UniChV[UniChN]);
    }
  }
}
示例#7
0
void TGraphCascade::Print(const TIntV& SortV) {
    printf("graph start:\n");
    if (SortV.Empty()) {
        for (TNGraph::TNodeI NI = Graph.BegNI(); NI < Graph.EndNI(); NI++) {
            printf("%s %d %d\n", NodeIdNmH.GetDat(NI.GetId()).CStr(), NI.GetId(), NodeNmIdH.GetDat(NodeIdNmH.GetDat(NI.GetId())).Val);
        }
    } else {
        for (int NodeN = 0; NodeN < SortV.Len(); NodeN++) {
            printf("%s %d\n", NodeIdNmH.GetDat(SortV[NodeN]).CStr(), SortV[NodeN].Val);
        }
    }
    printf("graph end\n");
}
void TWnBs::SaveTxtRel(const TWnRelType& RelType, const int& SynSetP,
 const bool& Recurse, FILE* fOut, const int& LevelN, TIntS& SynSetPS){
  PWnSynSet SynSet=GetSynSetFromP(SynSetP);
  TIntV SubSynSetPV; SynSet->GetDstSynSetPV(RelType, SubSynSetPV);
  if ((LevelN==0)&&(!SubSynSetPV.Empty())){
    TStr RelTypeNm=GetRelTypeNm(RelType);
    fprintf(fOut, "  ---%s--------------------------\n", RelTypeNm.CStr());
  }
  for (int SubSynSetPN=0; SubSynSetPN<SubSynSetPV.Len(); SubSynSetPN++){
    int SubSynSetP=SubSynSetPV[SubSynSetPN];
    SaveTxtSynSet(SubSynSetP, fOut, LevelN+1);
    if (Recurse){
      if (!SynSetPS.IsIn(SubSynSetP)){
        SynSetPS.Push(SubSynSetP);
        SaveTxtRel(RelType, SubSynSetP, Recurse, fOut, LevelN+1, SynSetPS);
        SynSetPS.Pop();
      } else {
        fprintf(fOut, "  ***Cycling\n");
      }
    }
  }
}
示例#9
0
// Node selects N~geometric(1.0-FwdBurnProb)-1 out-links and burns them. Then same for in-links.
// geometirc(p) has mean 1/(p), so for given FwdBurnProb, we burn 1/(1-FwdBurnProb)
void TForestFire::BurnGeoFire() {
	const double OldFwdBurnProb = FwdBurnProb;
	const double OldBckBurnProb = BckBurnProb;
	const int& NInfect = InfectNIdV.Len();
	const TNGraph& G = *Graph;
	TIntH BurnedNIdH;               // burned nodes
	TIntV BurningNIdV = InfectNIdV; // currently burning nodes
	TIntV NewBurnedNIdV;            // nodes newly burned in current step
	bool HasAliveInNbrs, HasAliveOutNbrs; // has unburned neighbors
	TIntV AliveNIdV;                // NIds of alive neighbors
	int NBurned = NInfect, time;
	for (int i = 0; i < InfectNIdV.Len(); i++) {
		BurnedNIdH.AddDat(InfectNIdV[i]);
	}
	NBurnedTmV.Clr(false);  NBurningTmV.Clr(false);  NewBurnedTmV.Clr(false);
	for (time = 0;; time++) {
		NewBurnedNIdV.Clr(false);
		for (int node = 0; node < BurningNIdV.Len(); node++) {
			const int& BurningNId = BurningNIdV[node];
			const TNGraph::TNodeI Node = G.GetNI(BurningNId);
			// find unburned links
			HasAliveOutNbrs = false;
			AliveNIdV.Clr(false); // unburned links
			for (int e = 0; e < Node.GetOutDeg(); e++) {
				const int OutNId = Node.GetOutNId(e);
				if (!BurnedNIdH.IsKey(OutNId)) {
					HasAliveOutNbrs = true;  AliveNIdV.Add(OutNId);
				}
			}
			// number of links to burn (geometric coin). Can also burn 0 links
			const int BurnNFwdLinks = Rnd.GetGeoDev(1.0 - FwdBurnProb) - 1;
			if (HasAliveOutNbrs && BurnNFwdLinks > 0) {
				AliveNIdV.Shuffle(Rnd);
				for (int i = 0; i < TMath::Mn(BurnNFwdLinks, AliveNIdV.Len()); i++) {
					BurnedNIdH.AddDat(AliveNIdV[i]);
					NewBurnedNIdV.Add(AliveNIdV[i]);  NBurned++;
				}
			}
			// backward links
			if (BckBurnProb > 0.0) {
				// find unburned links
				HasAliveInNbrs = false;
				AliveNIdV.Clr(false);
				for (int e = 0; e < Node.GetInDeg(); e++) {
					const int InNId = Node.GetInNId(e);
					if (!BurnedNIdH.IsKey(InNId)) {
						HasAliveInNbrs = true;  AliveNIdV.Add(InNId);
					}
				}
				// number of links to burn (geometric coin). Can also burn 0 links
				const int BurnNBckLinks = Rnd.GetGeoDev(1.0 - BckBurnProb) - 1;
				if (HasAliveInNbrs && BurnNBckLinks > 0) {
					AliveNIdV.Shuffle(Rnd);
					for (int i = 0; i < TMath::Mn(BurnNBckLinks, AliveNIdV.Len()); i++) {
						BurnedNIdH.AddDat(AliveNIdV[i]);
						NewBurnedNIdV.Add(AliveNIdV[i]);  NBurned++;
					}
				}
			}
		}
		NBurnedTmV.Add(NBurned);  NBurningTmV.Add(BurningNIdV.Len());  NewBurnedTmV.Add(NewBurnedNIdV.Len());
		// BurningNIdV.AddV(NewBurnedNIdV);   // node is burning eternally
		BurningNIdV.Swap(NewBurnedNIdV);   // node is burning just 1 time step
		if (BurningNIdV.Empty()) break;
		FwdBurnProb = FwdBurnProb * ProbDecay;
		BckBurnProb = BckBurnProb * ProbDecay;
	}
	BurnedNIdV.Gen(BurnedNIdH.Len(), 0);
	for (int i = 0; i < BurnedNIdH.Len(); i++) {
		BurnedNIdV.Add(BurnedNIdH.GetKey(i));
	}
	FwdBurnProb = OldFwdBurnProb;
	BckBurnProb = OldBckBurnProb;
}
/////////////////////////////////////////////////
// BLEU-score
// Computes a corpus-level BLEU score for the translations in TransCorpus.
//   TransCorpus : corpus holding candidate translations and their reference
//                 translations (asserted to contain translations)
//   _SentIdV    : sentences to score; empty means all sentences of the corpus
// Returns BP * prod_n Prec_n^(1/MxNGramLen), where Prec_n is the clipped
// n-gram precision over all scored sentences and BP is the brevity penalty
// exp(min(0, 1 - RefLen/TransLen)); RefLen uses, per sentence, the reference
// whose length is closest to the candidate. The score is also printed to
// stdout. Returns 0 when the candidate contains no words at all.
double TEvalScoreBleu::Eval(const PTransCorpus& TransCorpus, const TIntV& _SentIdV) {
    // check if the corpus has translations
    IAssert(TransCorpus->IsTrans());

    // corpus-wide n-gram counts (clipped and full), keyed by n-gram id
    TIntH ClipCountNGramH, CountNGramH;
    // candidate and effective reference length
    int FullTransLen = 0, FullRefLen = 0;

    // iterate over sentences
    TIntV SentIdV = _SentIdV;
    if (SentIdV.Empty()) { TransCorpus->GetSentIdV(SentIdV); }
    const int Sents = SentIdV.Len();
    for (int SentIdN = 0; SentIdN < Sents; SentIdN++) {
        const int SentId = SentIdV[SentIdN];
        // tokenize translation
        TIntV TransWIdV; Parse(TransCorpus->GetTransStr(SentId), TransWIdV);
        TIntH TransNGramH; GetNGramH(TransWIdV, MxNGramLen, TransNGramH);
        TIntH FreeTransNGramH = TransNGramH; // number of non-matched ngrams
        // counters for getting the closest length of reference sentences
        const int TransLen = TransWIdV.Len();
        int BestLen = 0, BestLenDiff = TInt::Mx;
        // go over reference translations and count ngram matches
        TStrV RefTransStrV = TransCorpus->GetRefTransStrV(SentId);
        for (int RefN = 0; RefN < RefTransStrV.Len(); RefN++) {
            // parse reference translation sentence
            TIntV RefWIdV; Parse(RefTransStrV[RefN], RefWIdV);
            TIntH RefNGramH; GetNGramH(RefWIdV, MxNGramLen, RefNGramH);
            // check for matches
            // NOTE(review): matches are consumed cumulatively over all
            // references, so an n-gram is effectively clipped to the SUM of
            // its reference counts; the usual BLEU definition clips to the
            // MAX over references -- confirm this is intended.
            int TransNGramKeyId = TransNGramH.FFirstKeyId();
            while(TransNGramH.FNextKeyId(TransNGramKeyId)) {
                const int NGramId = TransNGramH.GetKey(TransNGramKeyId);
                const int FreeTransNGrams = FreeTransNGramH(NGramId);
                if (RefNGramH.IsKey(NGramId) && (FreeTransNGrams>0)) {
                    // ngram match and still some free ngrams left to clip
                    const int RefNGrams = RefNGramH(NGramId);
                    FreeTransNGramH(NGramId) = TInt::GetMx(0, FreeTransNGrams - RefNGrams);
                }
            }
            // keep the reference length closest to the candidate length
            // (the first such reference wins on ties)
            const int RefLen = RefWIdV.Len();
            const int LenDiff = TInt::Abs(TransLen - RefLen);
            if (LenDiff < BestLenDiff) {
                BestLen = RefLen; BestLenDiff = LenDiff;
            }
        }
        // count ngrams
        int TransNGramKeyId = TransNGramH.FFirstKeyId();
        while(TransNGramH.FNextKeyId(TransNGramKeyId)) {
            // get ngram
            const int NGramId = TransNGramH.GetKey(TransNGramKeyId);
            IAssert(NGramId != -1);
            // FreeTransNGramH is a copy of TransNGramH, so the same key id
            // must address the same n-gram in both tables
            const int FreeNGramId = FreeTransNGramH.GetKey(TransNGramKeyId);
            IAssert(NGramId == FreeNGramId);
            // get ngram count and clip-count (count minus unmatched remainder)
            const int Count = TransNGramH[TransNGramKeyId];
            const int ClipCount = Count - FreeTransNGramH[TransNGramKeyId];
            // add ngram to the corpus ngram counts
            CountNGramH.AddDat(NGramId) += Count;
            ClipCountNGramH.AddDat(NGramId) += ClipCount;
        }
        // count length
        FullTransLen += TransLen;
        FullRefLen += BestLen;
    }

    // calculate ngram precisions: aggregate counts per n-gram length
    TIntV ClipCountV(MxNGramLen); ClipCountV.PutAll(0);
    int ClipCountKeyId = ClipCountNGramH.FFirstKeyId();
    while (ClipCountNGramH.FNextKeyId(ClipCountKeyId)) {
        const int NGramId = ClipCountNGramH.GetKey(ClipCountKeyId);
        const int NGramLen = GetNGramLen(NGramId);
        IAssert(0 < NGramLen && NGramLen <= MxNGramLen);
        ClipCountV[NGramLen-1] += ClipCountNGramH[ClipCountKeyId];
    }
    TIntV CountV(MxNGramLen); CountV.PutAll(0);
    int CountKeyId = CountNGramH.FFirstKeyId();
    while (CountNGramH.FNextKeyId(CountKeyId)) {
        const int NGramId = CountNGramH.GetKey(CountKeyId);
        const int NGramLen = GetNGramLen(NGramId);
        IAssert(0 < NGramLen && NGramLen <= MxNGramLen);
        CountV[NGramLen-1] += CountNGramH[CountKeyId];
    }
    TFltV PrecV(MxNGramLen, 0);
    for (int NGramLen = 0; NGramLen < MxNGramLen; NGramLen++) {
        const int ClipCount = ClipCountV[NGramLen];
        const int Count = CountV[NGramLen];
        // a length with no candidate n-grams contributes precision 0
        const double Prec = (Count > 0) ? double(ClipCount)/double(Count) : 0.0;
        PrecV.Add(Prec);
        //printf("%d-gram Match:%d Total:%d Prec:%.5f\n", NGramLen+1, ClipCount, Count, Prec);
    }

    // calculate brevity penalty
    // With an empty candidate the ratio below is x/0; when the references are
    // empty too it is 0/0 = NaN, which used to propagate into the returned
    // score. An empty candidate gets the maximal penalty (BP = 0) instead.
    double BP = 0.0;
    if (FullTransLen > 0) {
        const double LogBP = TFlt::GetMn(0.0, 1.0 - double(FullRefLen)/double(FullTransLen));
        BP = exp(LogBP);
    }

    // calculate full BLEU score: BP times the geometric mean of precisions
    double BleuScore = BP;
    const double Wgt = 1.0 / double(MxNGramLen);
    for (int NGramLen = 0; NGramLen < MxNGramLen; NGramLen++) {
        BleuScore *= pow(PrecV[NGramLen], Wgt);
    }
    printf("BLEU Score: %.5f\n", BleuScore);

    // done!
    return BleuScore;
}
/////////////////////////////////////////////////
// NIST-score
double TEvalScoreNist::Eval(const PTransCorpus& TransCorpus, const TIntV& _SentIdV) {
    // check if the corpus has translations
    IAssert(TransCorpus->IsTrans());

    // ngram counts (cliped and full)
    TIntH ClipCountNGramH, CountNGramH;
    // ngram info score
    TIntFltH NGramInfoH;
    // candidate and effective reference length
    double FullTransLen = 0.0, FullRefLen = 0.0;

    // iterate over sentences
    TIntV SentIdV = _SentIdV;
    if (SentIdV.Empty()) { TransCorpus->GetSentIdV(SentIdV); }
    const int Sents = SentIdV.Len();
    for (int SentIdN = 0; SentIdN < Sents; SentIdN++) {
        const int SentId = SentIdV[SentIdN];
        // tokenize translation
        TIntV TransWIdV; Parse(TransCorpus->GetTransStr(SentId), TransWIdV);
        TIntH TransNGramH; GetNGramH(TransWIdV, MxNGramLen, TransNGramH);
        TIntH FreeTransNGramH = TransNGramH; // number of non-matched ngrams
        // counters for getting the closest length of reference sentences
        const int TransLen = TransWIdV.Len(); int RefLenSum = 0;
        // go over reference translations and count ngram matches
        TStrV RefTransStrV = TransCorpus->GetRefTransStrV(SentId);
        // we assume that there is at least one reference translation
        IAssert(!RefTransStrV.Empty());
        for (int RefN = 0; RefN < RefTransStrV.Len(); RefN++) {
            // parse reference translation sentence
            TIntV RefWIdV; Parse(RefTransStrV[RefN], RefWIdV);
            TIntH RefNGramH; GetNGramH(RefWIdV, MxNGramLen, RefNGramH);
            // check for matches
            int TransNGramKeyId = TransNGramH.FFirstKeyId();
            while(TransNGramH.FNextKeyId(TransNGramKeyId)) {
                const int NGramId = TransNGramH.GetKey(TransNGramKeyId);
                const int FreeTransNGrams = FreeTransNGramH(NGramId);
                if (RefNGramH.IsKey(NGramId) && (FreeTransNGrams>0)) {
                    // ngram match and still some free ngrams left to clip
                    const int RefNGrams = RefNGramH(NGramId);
                    FreeTransNGramH(NGramId) = TInt::GetMx(0, FreeTransNGrams - RefNGrams);
                }
            }
            // check the length difference
            const int RefLen = RefWIdV.Len();
            RefLenSum += RefLen;
        }
        // count ngrams
        int TransNGramKeyId = TransNGramH.FFirstKeyId();
        while(TransNGramH.FNextKeyId(TransNGramKeyId)) {
            // get ngram
            const int NGramId = TransNGramH.GetKey(TransNGramKeyId);
            IAssert(NGramId != -1);
            // check if two hash tables are aligned (should be...)
            const int FreeNGramId = FreeTransNGramH.GetKey(TransNGramKeyId);
            IAssert(NGramId == FreeNGramId);
            // get ngram count and clip-count
            const int Count = TransNGramH[TransNGramKeyId];
            const int ClipCount = Count - FreeTransNGramH[TransNGramKeyId];
            // add ngram to the coprus ngram counts
            CountNGramH.AddDat(NGramId) += Count;
            ClipCountNGramH.AddDat(NGramId) += ClipCount;
        }
        // count length
        FullTransLen += double(TransLen);
        FullRefLen += double(RefLenSum) / double(RefTransStrV.Len());
    }

    // calculate ngram info scores
    int CountKeyId = CountNGramH.FFirstKeyId();
    while (CountNGramH.FNextKeyId(CountKeyId)) {
        // get the n-gram
        const int NGramId = CountNGramH.GetKey(CountKeyId);
        TIntV NGram = GetNGram(NGramId);
        // prepare counts
        if (NGram.Len() == 1) {
            // n-gram is a word
            const int WordCount = CountNGramH[CountKeyId];
            const double NGramInfoScore = TMath::Log2(FullTransLen / double(WordCount));
            NGramInfoH.AddDat(NGramId, NGramInfoScore);
        } else {
            // more then one word in the n-gram
            // get a n-gram with removed last element
            TIntV N1Gram = NGram; N1Gram.DelLast();
            const int N1GramId = NGramH.GetKeyId(N1Gram);
            // get the counts
            const int NGramCount = CountNGramH(NGramId);
            const int N1GramCount = CountNGramH(N1GramId);
            // get the score
            const double NGramInfoScore = TMath::Log2(double(N1GramCount) / double(NGramCount));
            NGramInfoH.AddDat(NGramId, NGramInfoScore);
        }
    }

    // calcualte ngram precisions
    TFltV ClipCountV(MxNGramLen); ClipCountV.PutAll(0);
    int ClipCountKeyId = ClipCountNGramH.FFirstKeyId();
    while (ClipCountNGramH.FNextKeyId(ClipCountKeyId)) {
        const int NGramId = ClipCountNGramH.GetKey(ClipCountKeyId);
        const int NGramLen = GetNGramLen(NGramId);
        const double NGramInfo = NGramInfoH(NGramId);
        IAssert(0 < NGramLen && NGramLen <= MxNGramLen);
        const int ClipCountNGram = ClipCountNGramH[ClipCountKeyId];
        ClipCountV[NGramLen-1] += double(ClipCountNGram) * NGramInfo;
    }
    TIntV CountV(MxNGramLen); CountV.PutAll(0);
    CountKeyId = CountNGramH.FFirstKeyId();
    while (CountNGramH.FNextKeyId(CountKeyId)) {
        const int NGramId = CountNGramH.GetKey(CountKeyId);
        const int NGramLen = GetNGramLen(NGramId);
        IAssert(0 < NGramLen && NGramLen <= MxNGramLen);
        CountV[NGramLen-1] += CountNGramH[CountKeyId];
    }
    TFltV PrecV(MxNGramLen, 0);
    for (int NGramLen = 0; NGramLen < MxNGramLen; NGramLen++) {
        const double ClipCount = ClipCountV[NGramLen];
        const int Count = CountV[NGramLen];
        const double Prec = (Count > 0) ? ClipCount / double(Count) : 0.0;
        PrecV.Add(Prec);
    }

    // calcualte brevity penalty
    const double LenFrac = double(FullTransLen)/double(FullRefLen);
    double BP = 0.0;
    if (LenFrac >= 1.0) { BP = 1.0; }
    else if (LenFrac <= 0.0) { BP = 0.0; }
    else {
        // calculate beta
        const double LenFracX = 1.5, BPX = 0.5;
        const double Beta = log(BPX) / TMath::Sqr(log(LenFracX));
        // calculate BP score
        BP = exp(Beta * TMath::Sqr(log(LenFrac)));
    }

    // calculate full NIST score
    double NistScore = 0.0; 
    for (int NGramLen = 0; NGramLen < MxNGramLen; NGramLen++) {
        NistScore += PrecV[NGramLen];
    }    
    NistScore *= BP;
    printf("NIST Score: %.5f\n", NistScore);
    
    // done!
    return NistScore;
}
示例#12
0
/////////////////////////////////////////////////
// Best-Paths
// Finds the shortest linkage paths between two named objects and shows them.
// Two named objects count as linked when their document-id vectors intersect.
// The search runs level by level from the source until the destination is
// reached or no new link is found. Each shortest path is then written as
// indented text into ContexterF->NmObjLinkageREd, and the objects on those
// paths are turned into a graph that is laid out and painted through
// ContexterF->State.
//   SrcNmObjStr : name of the start object
//   DstNmObjStr : name of the target object
//   NmObjBs     : named-object base used for ids, names and document lists
void GetBestPaths(
 const TStr& SrcNmObjStr, const TStr& DstNmObjStr, const PNmObjBs& NmObjBs){
  int SrcNmObjId=NmObjBs->GetNmObjId(SrcNmObjStr);
  int DstNmObjId=NmObjBs->GetNmObjId(DstNmObjStr);
  int NmObjs=NmObjBs->GetNmObjs();
  // ParLevPrV[id] = (parent id, level) of each reached object, (-1,-1) if
  // not reached yet; DstParLevPrV collects every parent found for the target
  TIntPrV ParLevPrV(NmObjs); TIntPrV DstParLevPrV;
  ParLevPrV.PutAll(TIntPr(-1, -1));
  int CurLev=0;
  ParLevPrV[SrcNmObjId]=TIntPr(SrcNmObjId, CurLev);
  // level-by-level expansion
  forever{
    CurLev++; int NewEdges=0;
    for (int NmObjId1=0; NmObjId1<NmObjs; NmObjId1++){
      // expand only the objects reached in the previous level
      if (ParLevPrV[NmObjId1].Val2==CurLev-1){
        TIntV DocIdV1; NmObjBs->GetNmObjDocIdV(NmObjId1, DocIdV1);
        for (int NmObjId2=0; NmObjId2<NmObjs; NmObjId2++){
          // candidates are unreached objects; the target is always retried
          // so that all of its parents on this level are recorded
          if ((NmObjId2==DstNmObjId)||(ParLevPrV[NmObjId2].Val2==-1)){
            TIntV DocIdV2; NmObjBs->GetNmObjDocIdV(NmObjId2, DocIdV2);
            TIntV IntrsDocIdV; DocIdV1.Intrs(DocIdV2, IntrsDocIdV);
            // a shared document links the two objects
            if (!IntrsDocIdV.Empty()){
              ParLevPrV[NmObjId2]=TIntPr(NmObjId1, CurLev); NewEdges++;
              if (NmObjId2==DstNmObjId){
                DstParLevPrV.Add(TIntPr(NmObjId1, CurLev));
              }
            }
          }
        }
      }
    }
    // stop when nothing new was reached or the target got a level
    if ((NewEdges==0)||(ParLevPrV[DstNmObjId].Val2!=-1)){
      break;
    }
  }
  // prepare graph
  THash<TStr, PVrtx> VrtxNmToVrtxH; TStrPrV VrtxNmPrV;
  VrtxNmToVrtxH.AddKey(SrcNmObjStr);
  VrtxNmToVrtxH.AddKey(DstNmObjStr);
  // write path
  ContexterF->NmObjLinkageREd->Clear();
  // one pass per recorded parent of the target, i.e. per shortest path
  for (int DstParLevPrN=0; DstParLevPrN<DstParLevPrV.Len(); DstParLevPrN++){
    // select this parent for the target before walking back to the source
    ParLevPrV[DstNmObjId]=DstParLevPrV[DstParLevPrN];
    int DstParLev=ParLevPrV[DstNmObjId].Val2;
    // note: this local shadows the DstNmObjStr parameter
    TStr DstNmObjStr=NmObjBs->GetNmObjStr(DstNmObjId);
    ContexterF->NmObjLinkageREd->Lines->Add(DstNmObjStr.CStr());
    int ParNmObjId=DstNmObjId;
    TStr PrevNmObjStr=DstNmObjStr;
    // follow parent links until the source is reached
    forever {
      if (ParNmObjId==SrcNmObjId){break;}
      ParNmObjId=ParLevPrV[ParNmObjId].Val1;
      int ParLev=ParLevPrV[ParNmObjId].Val2;
      TStr CurNmObjStr=NmObjBs->GetNmObjStr(ParNmObjId);
      // indent by four spaces per level of distance from the target
      TStr ParNmObjStr=TStr::GetSpaceStr((DstParLev-ParLev)*4)+CurNmObjStr;
      ContexterF->NmObjLinkageREd->Lines->Add(ParNmObjStr.CStr());
      // create vertex & edge
      VrtxNmToVrtxH.AddKey(CurNmObjStr);
      if (!PrevNmObjStr.Empty()){
        // store each edge once with its names in ascending order; equal
        // names produce no edge
        if (PrevNmObjStr<CurNmObjStr){
          VrtxNmPrV.AddUnique(TStrPr(PrevNmObjStr, CurNmObjStr));
        } else
        if (PrevNmObjStr>CurNmObjStr){
          VrtxNmPrV.AddUnique(TStrPr(CurNmObjStr, PrevNmObjStr));
        }
      }
      // save current named-object
      PrevNmObjStr=CurNmObjStr;
    }
  }
  // generate graph
  // create graph
  PGraph Graph=TGGraph::New();
  // create vertices
  for (int VrtxN=0; VrtxN<VrtxNmToVrtxH.Len(); VrtxN++){
    TStr VrtxNm=VrtxNmToVrtxH.GetKey(VrtxN);
    PVrtx Vrtx=TGVrtx::New(VrtxNm);
    VrtxNmToVrtxH.GetDat(VrtxNm)=Vrtx;
    Graph->AddVrtx(Vrtx);
  }
  // create edges
  for (int EdgeN=0; EdgeN<VrtxNmPrV.Len(); EdgeN++){
    PVrtx Vrtx1=VrtxNmToVrtxH.GetDat(VrtxNmPrV[EdgeN].Val1);
    PVrtx Vrtx2=VrtxNmToVrtxH.GetDat(VrtxNmPrV[EdgeN].Val2);
    // edges are named "_<index>"
    PEdge Edge=new TGEdge(Vrtx1, Vrtx2, TStr::Fmt("_%d", EdgeN), false);
    Graph->AddEdge(Edge);
  }
  // place graph
  ContexterF->State->ElGraph=Graph;
  // fixed seed, so the random generator starts from the same state each call
  TRnd Rnd(1);
  ContexterF->State->ElGraph->PlaceSimAnnXY(Rnd, ContexterF->State->ElGks);
  // draw graph
  ContexterF->State->ElGks->Clr();
  ContexterF->ElPbPaint(NULL);
}