void TUStr::GetWordUStrV(TUStrV& WordUStrV){
  // clear word vector
  WordUStrV.Clr();
  // create boundaries
  TBoolV WordBoundPV; GetWordBoundPV(WordBoundPV);
  IAssert(Len()==WordBoundPV.Len()-1);
  IAssert((WordBoundPV.Len()>0)&&(WordBoundPV.Last()));
  // traverse characters and bounds
  int UniChs=Len(); TIntV WordUniChV;
  for (int UniChN=0; UniChN<=UniChs; UniChN++){
    if ((UniChN==UniChs)||(WordBoundPV[UniChN+1])){ // finish or word-boundary
      if (UniChN<UniChs){ // if not finish
        // if last-word-char or single-alphabetic-char
        if ((!WordUniChV.Empty())||(IsAlphabetic(UniChV[UniChN]))){
          WordUniChV.Add(UniChV[UniChN]); // add char
        }
      }
      if (!WordUniChV.Empty()){ // add current word to vector
        TUStr WordUStr(WordUniChV); // construct word from char-vector
        WordUStrV.Add(WordUStr);    // add word to word-vector
        WordUniChV.Clr(false);      // clear char-vector
      }
    } else {
      // add character to char-vector
      WordUniChV.Add(UniChV[UniChN]);
    }
  }
}

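// Illustrative usage sketch, not part of the library: tokenizing a string into
// words with GetWordUStrV. It assumes TUStr can be constructed directly from a
// TStr and that TUStr::GetStr() converts a word back to an 8-bit string; verify
// both signatures in the unicode-string header before relying on them.
void PrintWords(const TStr& Str) {
  TUStr UStr(Str);               // unicode view of the input string (assumed ctor)
  TUStrV WordUStrV;
  UStr.GetWordUStrV(WordUStrV);  // split on word boundaries
  for (int WordN = 0; WordN < WordUStrV.Len(); WordN++) {
    printf("%s\n", WordUStrV[WordN].GetStr().CStr()); // GetStr() assumed
  }
}
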
/////////////////////////////////////////////////
// Trawling the web for emerging communities
// graph, left points to right
TTrawling::TTrawling(const PNGraph& Graph, const int& MinSupport) : MinSup(MinSupport) {
  TIntH ItemCntH;
  for (TNGraph::TNodeI NI = Graph->BegNI(); NI < Graph->EndNI(); NI++) {
    IAssert(NI.GetOutDeg()==0 || NI.GetInDeg()==0); // edges only point from left to right
    if (NI.GetOutDeg()==0) { continue; }
    for (int e = 0; e < NI.GetOutDeg(); e++) {
      ItemCntH.AddDat(NI.GetOutNId(e)) += 1;
    }
  }
  TIntV RightV;
  for (TNGraph::TNodeI NI = Graph->BegNI(); NI < Graph->EndNI(); NI++) {
    IAssert(NI.GetOutDeg()==0 || NI.GetInDeg()==0); // edges only point from left to right
    if (NI.GetOutDeg()==0) { continue; }
    RightV.Clr(false);
    for (int e = 0; e < NI.GetOutDeg(); e++) {
      const int itm = NI.GetOutNId(e);
      // only include items that already are above minimum support
      if (ItemCntH.GetDat(itm) >= MinSup) { RightV.Add(itm); }
    }
    if (! RightV.Empty()) {
      NIdSetH.AddDat(NI.GetId(), RightV);
    }
  }
  //
  for (int n = 0; n < NIdSetH.Len(); n++) {
    const TIntV& Set = NIdSetH[n];
    for (int s = 0; s < Set.Len(); s++) {
      SetNIdH.AddDat(Set[s]).Add(n);
    }
  }
}

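// Illustrative sketch, not part of the library: the constructor above expects a
// directed bipartite graph in which every edge points from a "left" node to a
// "right" node, so each node has either only out-edges or only in-edges. A
// minimal setup might look like this:
void TrawlingExample() {
  PNGraph Graph = TNGraph::New();
  for (int NId = 0; NId < 6; NId++) { Graph->AddNode(NId); }
  // left nodes 0..2, right nodes 3..5; edges only point left -> right
  Graph->AddEdge(0, 3); Graph->AddEdge(0, 4);
  Graph->AddEdge(1, 3); Graph->AddEdge(1, 4);
  Graph->AddEdge(2, 4); Graph->AddEdge(2, 5);
  TTrawling Trawling(Graph, 2); // drop right-side items with support < 2
}
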
void TDecisionTree::TNode::Fit(const TFltVV& FtrVV, const TFltV& ClassV, const TIntV& InstNV) {
  EAssert(!InstNV.Empty());

  const int Dim = FtrVV.GetRows();
  NExamples = InstNV.Len();

  ClassHist.Gen(2);
  FtrHist.Gen(Dim);

  {
    int TotalPos = 0;
    double BestScore = TFlt::NInf, CutVal = TFlt::NInf, Score = TFlt::NInf;

    for (int i = 0; i < NExamples; i++) {
      AssertR(0 <= InstNV[i] && InstNV[i] < FtrVV.GetCols(),
        "Invalid instance index: " + TInt::GetStr(InstNV[i]) + "!");
      TotalPos += (int) ClassV[InstNV[i]];
    }

    ClassHist[0] = 1 - double(TotalPos) / NExamples;
    ClassHist[1] = 1 - ClassHist[0];

    TFltIntPrV ValClassPrV(NExamples);

    // get the best score and cut value
    int InstN;
    for (int FtrN = 0; FtrN < Dim; FtrN++) {
      double FtrSum = 0;
      for (int i = 0; i < NExamples; i++) {
        InstN = InstNV[i];
        AssertR(0 <= InstN && InstN < FtrVV.GetCols(),
          "Invalid instance index: " + TInt::GetStr(InstN) + "!");

        ValClassPrV[i].Val1 = FtrVV(FtrN, InstN);
        ValClassPrV[i].Val2 = (int) ClassV[InstN];
        FtrSum += FtrVV(FtrN, InstN);
      }

      ValClassPrV.Sort(true); // have to sort to speed up the calculation

      if (CanSplitNumFtr(ValClassPrV, TotalPos, CutVal, Score) && Score > BestScore) {
        BestScore = Score;
        CutFtrN = FtrN;
        CutFtrVal = CutVal;
      }

      FtrHist[FtrN] = FtrSum / NExamples;
    }
  }

  // cut the dataset into left and right and build the tree recursively
  if (ShouldGrow() && CutFtrN >= 0) {
    EAssert(CutFtrN < Dim);
    // the best attribute is now selected, calculate the correlation between the
    // selected attribute and other attributes, then split the node
    CalcCorrFtrV(FtrVV, InstNV);
    Split(FtrVV, ClassV, InstNV);
  }
}

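// Note, not part of the library: the loop above only prepares the sorted
// (feature value, class) pairs; the split score itself is computed inside
// CanSplitNumFtr, which is not shown here. A common choice for such a score on a
// binary target (an assumption, not necessarily what CanSplitNumFtr implements)
// is the information gain of a cut at threshold t:
//
//   H(S)       = -p*log2(p) - (1-p)*log2(1-p),  with  p = TotalPos / |S|
//   Gain(S, t) = H(S) - |S_left|/|S| * H(S_left) - |S_right|/|S| * H(S_right)
//
// Sorting ValClassPrV lets every candidate cut be scored in one pass by keeping
// running positive counts for the left and right partitions.
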
// burn each link independently (forward with FwdBurnProb, backward with BckBurnProb)
void TForestFire::BurnExpFire() {
  const double OldFwdBurnProb = FwdBurnProb;
  const double OldBckBurnProb = BckBurnProb;
  const int NInfect = InfectNIdV.Len();
  const TNGraph& G = *Graph;
  TIntH BurnedNIdH;               // burned nodes
  TIntV BurningNIdV = InfectNIdV; // currently burning nodes
  TIntV NewBurnedNIdV;            // nodes newly burned in current step
  bool HasAliveNbrs;              // has unburned neighbors
  int NBurned = NInfect, NDiedFire=0;
  for (int i = 0; i < InfectNIdV.Len(); i++) {
    BurnedNIdH.AddDat(InfectNIdV[i]);
  }
  NBurnedTmV.Clr(false);  NBurningTmV.Clr(false);  NewBurnedTmV.Clr(false);
  for (int time = 0; ; time++) {
    NewBurnedNIdV.Clr(false);
    // for each burning node
    for (int node = 0; node < BurningNIdV.Len(); node++) {
      const int& BurningNId = BurningNIdV[node];
      const TNGraph::TNodeI Node = G.GetNI(BurningNId);
      HasAliveNbrs = false;
      NDiedFire = 0;
      // burn forward links (out-links)
      for (int e = 0; e < Node.GetOutDeg(); e++) {
        const int OutNId = Node.GetOutNId(e);
        if (! BurnedNIdH.IsKey(OutNId)) { // not yet burned
          HasAliveNbrs = true;
          if (Rnd.GetUniDev() < FwdBurnProb) {
            BurnedNIdH.AddDat(OutNId);  NewBurnedNIdV.Add(OutNId);  NBurned++;
          }
        }
      }
      // burn backward links (in-links)
      if (BckBurnProb > 0.0) {
        for (int e = 0; e < Node.GetInDeg(); e++) {
          const int InNId = Node.GetInNId(e);
          if (! BurnedNIdH.IsKey(InNId)) { // not yet burned
            HasAliveNbrs = true;
            if (Rnd.GetUniDev() < BckBurnProb) {
              BurnedNIdH.AddDat(InNId);  NewBurnedNIdV.Add(InNId);  NBurned++;
            }
          }
        }
      }
      if (! HasAliveNbrs) { NDiedFire++; }
    }
    NBurnedTmV.Add(NBurned);
    NBurningTmV.Add(BurningNIdV.Len() - NDiedFire);
    NewBurnedTmV.Add(NewBurnedNIdV.Len());
    //BurningNIdV.AddV(NewBurnedNIdV); // node is burning eternally
    BurningNIdV.Swap(NewBurnedNIdV);   // node is burning just 1 time step
    if (BurningNIdV.Empty()) break;
    FwdBurnProb = FwdBurnProb * ProbDecay;
    BckBurnProb = BckBurnProb * ProbDecay;
  }
  BurnedNIdV.Gen(BurnedNIdH.Len(), 0);
  for (int i = 0; i < BurnedNIdH.Len(); i++) {
    BurnedNIdV.Add(BurnedNIdH.GetKey(i));
  }
  FwdBurnProb = OldFwdBurnProb;
  BckBurnProb = OldBckBurnProb;
}

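// Worked out from the code above, as a reading aid: every still-unburned
// out-neighbor of a burning node catches fire independently with probability
// FwdBurnProb (in-neighbors with BckBurnProb), so a node with d alive
// out-neighbors burns a Binomial(d, FwdBurnProb) number of them in one step.
// Both probabilities decay by ProbDecay after each step, i.e. at step t the
// forward probability is FwdBurnProb * ProbDecay^t, which is what eventually
// extinguishes the fire.
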
void TBowFl::SaveSparseMatlabTxt(const PBowDocBs& BowDocBs,
    const PBowDocWgtBs& BowDocWgtBs, const TStr& FNm,
    const TStr& CatFNm, const TIntV& _DIdV) {

  TIntV DIdV;
  if (_DIdV.Empty()) {
    BowDocBs->GetAllDIdV(DIdV);
  } else {
    DIdV = _DIdV;
  }
  // generate map of row-ids to words
  TFOut WdMapSOut(TStr::PutFExt(FNm, ".row-to-word-map.dat"));
  for (int WId = 0; WId < BowDocWgtBs->GetWords(); WId++) {
    TStr WdStr = BowDocBs->GetWordStr(WId);
    WdMapSOut.PutStrLn(TStr::Fmt("%d %s", WId+1, WdStr.CStr()));
  }
  WdMapSOut.Flush();
  // generate map of col-ids to document names
  TFOut DocMapSOut(TStr::PutFExt(FNm, ".col-to-docName-map.dat"));
  for (int DocN = 0; DocN < DIdV.Len(); DocN++) {
    const int DId = DIdV[DocN];
    TStr DocNm = BowDocBs->GetDocNm(DId);
    DocMapSOut.PutStrLn(TStr::Fmt("%d %d %s", DocN, DId, DocNm.CStr()));
  }
  DocMapSOut.Flush();
  // save documents' sparse vectors
  TFOut SOut(FNm);
  for (int DocN = 0; DocN < DIdV.Len(); DocN++){
    const int DId = DIdV[DocN];
    PBowSpV DocSpV = BowDocWgtBs->GetSpV(DId);
    const int DocWIds = DocSpV->GetWIds();
    for (int DocWIdN=0; DocWIdN<DocWIds; DocWIdN++){
      const int WId = DocSpV->GetWId(DocWIdN);
      const double WordWgt = DocSpV->GetWgt(DocWIdN);
      SOut.PutStrLn(TStr::Fmt("%d %d %.16f", WId+1, DocN+1, WordWgt));
    }
  }
  SOut.Flush();
  // save documents' category sparse vectors
  if (!CatFNm.Empty()) {
    TFOut CatSOut(CatFNm);
    for (int DocN = 0; DocN < DIdV.Len(); DocN++){
      const int DId = DIdV[DocN];
      const int DocCIds = BowDocBs->GetDocCIds(DId);
      for (int DocCIdN=0; DocCIdN<DocCIds; DocCIdN++){
        const int CId = BowDocBs->GetDocCId(DId, DocCIdN);
        const double CatWgt = 1.0;
        CatSOut.PutStrLn(TStr::Fmt("%d %d %.16f", CId+1, DocN+1, CatWgt));
      }
    }
    CatSOut.Flush();
  }
}

void TUStr::GetWordUStrLst(TLst<TUStr>& WordUStrV, TLst<TBool>& TerminalV){ //TBoolV& TerminalV){
  // clear word vector
  WordUStrV.Clr();
  // create boundaries
  TBoolV WordBoundPV; GetWordBoundPV(WordBoundPV);
  //TerminalV.Reserve(WordBoundPV.Len());
  IAssert(Len()==WordBoundPV.Len()-1);
  IAssert((WordBoundPV.Len()>0)&&(WordBoundPV.Last()));
  // traverse characters and bounds
  int UniChs=Len(); TIntV WordUniChV;
  bool terminal = false;
  for (int UniChN=0; UniChN<=UniChs; UniChN++){
    if ((UniChN==UniChs)||(WordBoundPV[UniChN+1])){ // finish or word-boundary
      if (UniChN<UniChs){ // if not finish
        // if last-word-char or single-alphabetic-char
        if ((!WordUniChV.Empty())||(IsAlphabetic(UniChV[UniChN]))){
          WordUniChV.Add(UniChV[UniChN]); // add char
        } else {
          if (WordUStrV.Len() > 0){
            if (IsTerminal(UniChV[UniChN])) { terminal = true; }
          }
        }
      }
      if (!WordUniChV.Empty()){ // add current word to vector
        TUStr WordUStr(WordUniChV);  // construct word from char-vector
        WordUStrV.AddBack(WordUStr); // add word to word-vector
        WordUniChV.Clr(false);       // clear char-vector
        if (terminal) { TerminalV.AddBack(true); }
        else { TerminalV.AddBack(false); }
        terminal = false;
      }
    } else {
      // add character to char-vector
      WordUniChV.Add(UniChV[UniChN]);
    }
  }
}

void TGraphCascade::Print(const TIntV& SortV) {
  printf("graph start:\n");
  if (SortV.Empty()) {
    for (TNGraph::TNodeI NI = Graph.BegNI(); NI < Graph.EndNI(); NI++) {
      printf("%s %d %d\n", NodeIdNmH.GetDat(NI.GetId()).CStr(), NI.GetId(),
        NodeNmIdH.GetDat(NodeIdNmH.GetDat(NI.GetId())).Val);
    }
  } else {
    for (int NodeN = 0; NodeN < SortV.Len(); NodeN++) {
      printf("%s %d\n", NodeIdNmH.GetDat(SortV[NodeN]).CStr(), SortV[NodeN].Val);
    }
  }
  printf("graph end\n");
}

void TWnBs::SaveTxtRel(const TWnRelType& RelType, const int& SynSetP, const bool& Recurse,
 FILE* fOut, const int& LevelN, TIntS& SynSetPS){
  PWnSynSet SynSet=GetSynSetFromP(SynSetP);
  TIntV SubSynSetPV; SynSet->GetDstSynSetPV(RelType, SubSynSetPV);
  if ((LevelN==0)&&(!SubSynSetPV.Empty())){
    TStr RelTypeNm=GetRelTypeNm(RelType);
    fprintf(fOut, " ---%s--------------------------\n", RelTypeNm.CStr());
  }
  for (int SubSynSetPN=0; SubSynSetPN<SubSynSetPV.Len(); SubSynSetPN++){
    int SubSynSetP=SubSynSetPV[SubSynSetPN];
    SaveTxtSynSet(SubSynSetP, fOut, LevelN+1);
    if (Recurse){
      if (!SynSetPS.IsIn(SubSynSetP)){
        SynSetPS.Push(SubSynSetP);
        SaveTxtRel(RelType, SubSynSetP, Recurse, fOut, LevelN+1, SynSetPS);
        SynSetPS.Pop();
      } else {
        fprintf(fOut, " ***Cycling\n");
      }
    }
  }
}

// Node selects N~geometric(1.0-FwdBurnProb)-1 out-links and burns them. Then same for in-links.
// geometric(p) has mean 1/p, so for a given FwdBurnProb the draw has mean 1/(1-FwdBurnProb)
void TForestFire::BurnGeoFire() {
  const double OldFwdBurnProb = FwdBurnProb;
  const double OldBckBurnProb = BckBurnProb;
  const int& NInfect = InfectNIdV.Len();
  const TNGraph& G = *Graph;
  TIntH BurnedNIdH;                     // burned nodes
  TIntV BurningNIdV = InfectNIdV;       // currently burning nodes
  TIntV NewBurnedNIdV;                  // nodes newly burned in current step
  bool HasAliveInNbrs, HasAliveOutNbrs; // has unburned neighbors
  TIntV AliveNIdV;                      // NIds of alive neighbors
  int NBurned = NInfect, time;
  for (int i = 0; i < InfectNIdV.Len(); i++) {
    BurnedNIdH.AddDat(InfectNIdV[i]);
  }
  NBurnedTmV.Clr(false);  NBurningTmV.Clr(false);  NewBurnedTmV.Clr(false);
  for (time = 0; ; time++) {
    NewBurnedNIdV.Clr(false);
    for (int node = 0; node < BurningNIdV.Len(); node++) {
      const int& BurningNId = BurningNIdV[node];
      const TNGraph::TNodeI Node = G.GetNI(BurningNId);
      // find unburned links
      HasAliveOutNbrs = false;
      AliveNIdV.Clr(false); // unburned links
      for (int e = 0; e < Node.GetOutDeg(); e++) {
        const int OutNId = Node.GetOutNId(e);
        if (!BurnedNIdH.IsKey(OutNId)) {
          HasAliveOutNbrs = true;  AliveNIdV.Add(OutNId);
        }
      }
      // number of links to burn (geometric coin). Can also burn 0 links
      const int BurnNFwdLinks = Rnd.GetGeoDev(1.0 - FwdBurnProb) - 1;
      if (HasAliveOutNbrs && BurnNFwdLinks > 0) {
        AliveNIdV.Shuffle(Rnd);
        for (int i = 0; i < TMath::Mn(BurnNFwdLinks, AliveNIdV.Len()); i++) {
          BurnedNIdH.AddDat(AliveNIdV[i]);
          NewBurnedNIdV.Add(AliveNIdV[i]);
          NBurned++;
        }
      }
      // backward links
      if (BckBurnProb > 0.0) {
        // find unburned links
        HasAliveInNbrs = false;
        AliveNIdV.Clr(false);
        for (int e = 0; e < Node.GetInDeg(); e++) {
          const int InNId = Node.GetInNId(e);
          if (!BurnedNIdH.IsKey(InNId)) {
            HasAliveInNbrs = true;  AliveNIdV.Add(InNId);
          }
        }
        // number of links to burn (geometric coin). Can also burn 0 links
        const int BurnNBckLinks = Rnd.GetGeoDev(1.0 - BckBurnProb) - 1;
        if (HasAliveInNbrs && BurnNBckLinks > 0) {
          AliveNIdV.Shuffle(Rnd);
          for (int i = 0; i < TMath::Mn(BurnNBckLinks, AliveNIdV.Len()); i++) {
            BurnedNIdH.AddDat(AliveNIdV[i]);
            NewBurnedNIdV.Add(AliveNIdV[i]);
            NBurned++;
          }
        }
      }
    }
    NBurnedTmV.Add(NBurned);
    NBurningTmV.Add(BurningNIdV.Len());
    NewBurnedTmV.Add(NewBurnedNIdV.Len());
    // BurningNIdV.AddV(NewBurnedNIdV); // node is burning eternally
    BurningNIdV.Swap(NewBurnedNIdV);    // node is burning just 1 time step
    if (BurningNIdV.Empty()) break;
    FwdBurnProb = FwdBurnProb * ProbDecay;
    BckBurnProb = BckBurnProb * ProbDecay;
  }
  BurnedNIdV.Gen(BurnedNIdH.Len(), 0);
  for (int i = 0; i < BurnedNIdH.Len(); i++) {
    BurnedNIdV.Add(BurnedNIdH.GetKey(i));
  }
  FwdBurnProb = OldFwdBurnProb;
  BckBurnProb = OldBckBurnProb;
}

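// Worked out from the code above, as a reading aid: Rnd.GetGeoDev(1 - FwdBurnProb)
// draws a geometric variable with mean 1/(1 - FwdBurnProb), and one is subtracted
// before burning, so the expected number of forward links a node tries to burn is
//
//   E[BurnNFwdLinks] = 1/(1 - FwdBurnProb) - 1 = FwdBurnProb/(1 - FwdBurnProb)
//
// e.g. FwdBurnProb = 0.5 tries to burn 1 out-link on average and 0.75 tries 3;
// the draw is always capped at the number of still-unburned out-neighbors.
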
/////////////////////////////////////////////////
// BLEU-score
double TEvalScoreBleu::Eval(const PTransCorpus& TransCorpus, const TIntV& _SentIdV) {
  // check if the corpus has translations
  IAssert(TransCorpus->IsTrans());

  // ngram counts (clipped and full)
  TIntH ClipCountNGramH, CountNGramH;
  // candidate and effective reference length
  int FullTransLen = 0, FullRefLen = 0;

  // iterate over sentences
  TIntV SentIdV = _SentIdV;
  if (SentIdV.Empty()) { TransCorpus->GetSentIdV(SentIdV); }
  const int Sents = SentIdV.Len();
  for (int SentIdN = 0; SentIdN < Sents; SentIdN++) {
    const int SentId = SentIdV[SentIdN];
    // tokenize translation
    TIntV TransWIdV; Parse(TransCorpus->GetTransStr(SentId), TransWIdV);
    TIntH TransNGramH; GetNGramH(TransWIdV, MxNGramLen, TransNGramH);
    TIntH FreeTransNGramH = TransNGramH; // number of non-matched ngrams
    // counters for getting the closest length of reference sentences
    const int TransLen = TransWIdV.Len();
    int BestLen = 0, BestLenDiff = TInt::Mx;
    // go over reference translations and count ngram matches
    TStrV RefTransStrV = TransCorpus->GetRefTransStrV(SentId);
    for (int RefN = 0; RefN < RefTransStrV.Len(); RefN++) {
      // parse reference translation sentence
      TIntV RefWIdV; Parse(RefTransStrV[RefN], RefWIdV);
      TIntH RefNGramH; GetNGramH(RefWIdV, MxNGramLen, RefNGramH);
      // check for matches
      int TransNGramKeyId = TransNGramH.FFirstKeyId();
      while (TransNGramH.FNextKeyId(TransNGramKeyId)) {
        const int NGramId = TransNGramH.GetKey(TransNGramKeyId);
        const int FreeTransNGrams = FreeTransNGramH(NGramId);
        if (RefNGramH.IsKey(NGramId) && (FreeTransNGrams>0)) {
          // ngram match and still some free ngrams left to clip
          const int RefNGrams = RefNGramH(NGramId);
          FreeTransNGramH(NGramId) = TInt::GetMx(0, FreeTransNGrams - RefNGrams);
        }
      }
      // check the length difference
      const int RefLen = RefWIdV.Len();
      const int LenDiff = TInt::Abs(TransLen - RefLen);
      if (LenDiff < BestLenDiff) { BestLen = RefLen; BestLenDiff = LenDiff; }
    }
    // count ngrams
    int TransNGramKeyId = TransNGramH.FFirstKeyId();
    while (TransNGramH.FNextKeyId(TransNGramKeyId)) {
      // get ngram
      const int NGramId = TransNGramH.GetKey(TransNGramKeyId);
      IAssert(NGramId != -1);
      // check if the two hash tables are aligned (they should be...)
      const int FreeNGramId = FreeTransNGramH.GetKey(TransNGramKeyId);
      IAssert(NGramId == FreeNGramId);
      // get ngram count and clip-count
      const int Count = TransNGramH[TransNGramKeyId];
      const int ClipCount = Count - FreeTransNGramH[TransNGramKeyId];
      // add ngram to the corpus ngram counts
      CountNGramH.AddDat(NGramId) += Count;
      ClipCountNGramH.AddDat(NGramId) += ClipCount;
    }
    // count length
    FullTransLen += TransLen;
    FullRefLen += BestLen;
  }

  // calculate ngram precisions
  TIntV ClipCountV(MxNGramLen); ClipCountV.PutAll(0);
  int ClipCountKeyId = ClipCountNGramH.FFirstKeyId();
  while (ClipCountNGramH.FNextKeyId(ClipCountKeyId)) {
    const int NGramId = ClipCountNGramH.GetKey(ClipCountKeyId);
    const int NGramLen = GetNGramLen(NGramId);
    IAssert(0 < NGramLen && NGramLen <= MxNGramLen);
    ClipCountV[NGramLen-1] += ClipCountNGramH[ClipCountKeyId];
  }
  TIntV CountV(MxNGramLen); CountV.PutAll(0);
  int CountKeyId = CountNGramH.FFirstKeyId();
  while (CountNGramH.FNextKeyId(CountKeyId)) {
    const int NGramId = CountNGramH.GetKey(CountKeyId);
    const int NGramLen = GetNGramLen(NGramId);
    IAssert(0 < NGramLen && NGramLen <= MxNGramLen);
    CountV[NGramLen-1] += CountNGramH[CountKeyId];
  }
  TFltV PrecV(MxNGramLen, 0);
  for (int NGramLen = 0; NGramLen < MxNGramLen; NGramLen++) {
    const int ClipCount = ClipCountV[NGramLen];
    const int Count = CountV[NGramLen];
    const double Prec = (Count > 0) ? double(ClipCount)/double(Count) : 0.0;
    PrecV.Add(Prec);
    //printf("%d-gram Match:%d Total:%d Prec:%.5f\n", NGramLen+1, ClipCount, Count, Prec);
  }

  // calculate brevity penalty
  double LogBP = TFlt::GetMn(0.0, 1.0 - double(FullRefLen)/double(FullTransLen));
  double BP = exp(LogBP);

  // calculate full BLEU score
  double BleuScore = BP;
  const double Wgt = 1.0 / double(MxNGramLen);
  for (int NGramLen = 0; NGramLen < MxNGramLen; NGramLen++) {
    BleuScore *= pow(PrecV[NGramLen], Wgt);
  }
  printf("BLEU Score: %.5f\n", BleuScore);

  // done!
  return BleuScore;
}

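// Summary of the computation above, as a reading aid: with uniform weights
// w_n = 1/MxNGramLen (the Wgt used in the final loop), the returned value is the
// standard corpus-level BLEU score
//
//   p_n  = (clipped n-gram matches) / (candidate n-gram count)   per n-gram length n
//   BP   = min(1, exp(1 - FullRefLen/FullTransLen))               brevity penalty
//   BLEU = BP * prod_{n=1..MxNGramLen} p_n^{w_n}
//
// Note that a single zero precision p_n drives the whole product, and hence the
// score, to zero.
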
/////////////////////////////////////////////////
// NIST-score
double TEvalScoreNist::Eval(const PTransCorpus& TransCorpus, const TIntV& _SentIdV) {
  // check if the corpus has translations
  IAssert(TransCorpus->IsTrans());

  // ngram counts (clipped and full)
  TIntH ClipCountNGramH, CountNGramH;
  // ngram info score
  TIntFltH NGramInfoH;
  // candidate and effective reference length
  double FullTransLen = 0.0, FullRefLen = 0.0;

  // iterate over sentences
  TIntV SentIdV = _SentIdV;
  if (SentIdV.Empty()) { TransCorpus->GetSentIdV(SentIdV); }
  const int Sents = SentIdV.Len();
  for (int SentIdN = 0; SentIdN < Sents; SentIdN++) {
    const int SentId = SentIdV[SentIdN];
    // tokenize translation
    TIntV TransWIdV; Parse(TransCorpus->GetTransStr(SentId), TransWIdV);
    TIntH TransNGramH; GetNGramH(TransWIdV, MxNGramLen, TransNGramH);
    TIntH FreeTransNGramH = TransNGramH; // number of non-matched ngrams
    // counters for getting the average length of reference sentences
    const int TransLen = TransWIdV.Len();
    int RefLenSum = 0;
    // go over reference translations and count ngram matches
    TStrV RefTransStrV = TransCorpus->GetRefTransStrV(SentId);
    // we assume that there is at least one reference translation
    IAssert(!RefTransStrV.Empty());
    for (int RefN = 0; RefN < RefTransStrV.Len(); RefN++) {
      // parse reference translation sentence
      TIntV RefWIdV; Parse(RefTransStrV[RefN], RefWIdV);
      TIntH RefNGramH; GetNGramH(RefWIdV, MxNGramLen, RefNGramH);
      // check for matches
      int TransNGramKeyId = TransNGramH.FFirstKeyId();
      while (TransNGramH.FNextKeyId(TransNGramKeyId)) {
        const int NGramId = TransNGramH.GetKey(TransNGramKeyId);
        const int FreeTransNGrams = FreeTransNGramH(NGramId);
        if (RefNGramH.IsKey(NGramId) && (FreeTransNGrams>0)) {
          // ngram match and still some free ngrams left to clip
          const int RefNGrams = RefNGramH(NGramId);
          FreeTransNGramH(NGramId) = TInt::GetMx(0, FreeTransNGrams - RefNGrams);
        }
      }
      // add up the reference lengths
      const int RefLen = RefWIdV.Len();
      RefLenSum += RefLen;
    }
    // count ngrams
    int TransNGramKeyId = TransNGramH.FFirstKeyId();
    while (TransNGramH.FNextKeyId(TransNGramKeyId)) {
      // get ngram
      const int NGramId = TransNGramH.GetKey(TransNGramKeyId);
      IAssert(NGramId != -1);
      // check if the two hash tables are aligned (they should be...)
      const int FreeNGramId = FreeTransNGramH.GetKey(TransNGramKeyId);
      IAssert(NGramId == FreeNGramId);
      // get ngram count and clip-count
      const int Count = TransNGramH[TransNGramKeyId];
      const int ClipCount = Count - FreeTransNGramH[TransNGramKeyId];
      // add ngram to the corpus ngram counts
      CountNGramH.AddDat(NGramId) += Count;
      ClipCountNGramH.AddDat(NGramId) += ClipCount;
    }
    // count length
    FullTransLen += double(TransLen);
    FullRefLen += double(RefLenSum) / double(RefTransStrV.Len());
  }

  // calculate ngram info scores
  int CountKeyId = CountNGramH.FFirstKeyId();
  while (CountNGramH.FNextKeyId(CountKeyId)) {
    // get the n-gram
    const int NGramId = CountNGramH.GetKey(CountKeyId);
    TIntV NGram = GetNGram(NGramId);
    // prepare counts
    if (NGram.Len() == 1) {
      // n-gram is a word
      const int WordCount = CountNGramH[CountKeyId];
      const double NGramInfoScore = TMath::Log2(FullTransLen / double(WordCount));
      NGramInfoH.AddDat(NGramId, NGramInfoScore);
    } else {
      // more than one word in the n-gram
      // get the n-gram with the last element removed
      TIntV N1Gram = NGram; N1Gram.DelLast();
      const int N1GramId = NGramH.GetKeyId(N1Gram);
      // get the counts
      const int NGramCount = CountNGramH(NGramId);
      const int N1GramCount = CountNGramH(N1GramId);
      // get the score
      const double NGramInfoScore = TMath::Log2(double(N1GramCount) / double(NGramCount));
      NGramInfoH.AddDat(NGramId, NGramInfoScore);
    }
  }

  // calculate ngram precisions
  TFltV ClipCountV(MxNGramLen); ClipCountV.PutAll(0);
  int ClipCountKeyId = ClipCountNGramH.FFirstKeyId();
  while (ClipCountNGramH.FNextKeyId(ClipCountKeyId)) {
    const int NGramId = ClipCountNGramH.GetKey(ClipCountKeyId);
    const int NGramLen = GetNGramLen(NGramId);
    const double NGramInfo = NGramInfoH(NGramId);
    IAssert(0 < NGramLen && NGramLen <= MxNGramLen);
    const int ClipCountNGram = ClipCountNGramH[ClipCountKeyId];
    ClipCountV[NGramLen-1] += double(ClipCountNGram) * NGramInfo;
  }
  TIntV CountV(MxNGramLen); CountV.PutAll(0);
  CountKeyId = CountNGramH.FFirstKeyId();
  while (CountNGramH.FNextKeyId(CountKeyId)) {
    const int NGramId = CountNGramH.GetKey(CountKeyId);
    const int NGramLen = GetNGramLen(NGramId);
    IAssert(0 < NGramLen && NGramLen <= MxNGramLen);
    CountV[NGramLen-1] += CountNGramH[CountKeyId];
  }
  TFltV PrecV(MxNGramLen, 0);
  for (int NGramLen = 0; NGramLen < MxNGramLen; NGramLen++) {
    const double ClipCount = ClipCountV[NGramLen];
    const int Count = CountV[NGramLen];
    const double Prec = (Count > 0) ? ClipCount / double(Count) : 0.0;
    PrecV.Add(Prec);
  }

  // calculate brevity penalty
  const double LenFrac = double(FullTransLen)/double(FullRefLen);
  double BP = 0.0;
  if (LenFrac >= 1.0) { BP = 1.0; }
  else if (LenFrac <= 0.0) { BP = 0.0; }
  else {
    // calculate beta
    const double LenFracX = 1.5, BPX = 0.5;
    const double Beta = log(BPX) / TMath::Sqr(log(LenFracX));
    // calculate BP score
    BP = exp(Beta * TMath::Sqr(log(LenFrac)));
  }

  // calculate full NIST score
  double NistScore = 0.0;
  for (int NGramLen = 0; NGramLen < MxNGramLen; NGramLen++) {
    NistScore += PrecV[NGramLen];
  }
  NistScore *= BP;
  printf("NIST Score: %.5f\n", NistScore);

  // done!
  return NistScore;
}

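// Summary of the computation above, as a reading aid: each matched n-gram
// w_1..w_n contributes its information weight
//
//   Info(w_1..w_n) = log2( count(w_1..w_{n-1}) / count(w_1..w_n) )
//
// (for single words the numerator is the total candidate length FullTransLen),
// the per-length precision is the info-weighted clipped match count divided by
// the raw candidate count, and the brevity penalty for LenFrac < 1 is
//
//   BP = exp( Beta * log(LenFrac)^2 ),   Beta = log(0.5) / log(1.5)^2
//
// which makes BP = 0.5 when the candidate is 2/3 of the average reference length.
// The NIST score is the sum of the per-length precisions multiplied by BP.
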
/////////////////////////////////////////////////
// Best-Paths
void GetBestPaths(
 const TStr& SrcNmObjStr, const TStr& DstNmObjStr, const PNmObjBs& NmObjBs){
  int SrcNmObjId=NmObjBs->GetNmObjId(SrcNmObjStr);
  int DstNmObjId=NmObjBs->GetNmObjId(DstNmObjStr);
  int NmObjs=NmObjBs->GetNmObjs();
  TIntPrV ParLevPrV(NmObjs); TIntPrV DstParLevPrV;
  ParLevPrV.PutAll(TIntPr(-1, -1));
  int CurLev=0;
  ParLevPrV[SrcNmObjId]=TIntPr(SrcNmObjId, CurLev);
  forever{
    CurLev++; int NewEdges=0;
    for (int NmObjId1=0; NmObjId1<NmObjs; NmObjId1++){
      if (ParLevPrV[NmObjId1].Val2==CurLev-1){
        TIntV DocIdV1; NmObjBs->GetNmObjDocIdV(NmObjId1, DocIdV1);
        for (int NmObjId2=0; NmObjId2<NmObjs; NmObjId2++){
          if ((NmObjId2==DstNmObjId)||(ParLevPrV[NmObjId2].Val2==-1)){
            TIntV DocIdV2; NmObjBs->GetNmObjDocIdV(NmObjId2, DocIdV2);
            TIntV IntrsDocIdV; DocIdV1.Intrs(DocIdV2, IntrsDocIdV);
            if (!IntrsDocIdV.Empty()){
              ParLevPrV[NmObjId2]=TIntPr(NmObjId1, CurLev);
              NewEdges++;
              if (NmObjId2==DstNmObjId){
                DstParLevPrV.Add(TIntPr(NmObjId1, CurLev));
              }
            }
          }
        }
      }
    }
    if ((NewEdges==0)||(ParLevPrV[DstNmObjId].Val2!=-1)){break;}
  }

  // prepare graph
  THash<TStr, PVrtx> VrtxNmToVrtxH; TStrPrV VrtxNmPrV;
  VrtxNmToVrtxH.AddKey(SrcNmObjStr);
  VrtxNmToVrtxH.AddKey(DstNmObjStr);

  // write path
  ContexterF->NmObjLinkageREd->Clear();
  for (int DstParLevPrN=0; DstParLevPrN<DstParLevPrV.Len(); DstParLevPrN++){
    ParLevPrV[DstNmObjId]=DstParLevPrV[DstParLevPrN];
    int DstParLev=ParLevPrV[DstNmObjId].Val2;
    TStr DstNmObjStr=NmObjBs->GetNmObjStr(DstNmObjId);
    ContexterF->NmObjLinkageREd->Lines->Add(DstNmObjStr.CStr());
    int ParNmObjId=DstNmObjId;
    TStr PrevNmObjStr=DstNmObjStr;
    forever {
      if (ParNmObjId==SrcNmObjId){break;}
      ParNmObjId=ParLevPrV[ParNmObjId].Val1;
      int ParLev=ParLevPrV[ParNmObjId].Val2;
      TStr CurNmObjStr=NmObjBs->GetNmObjStr(ParNmObjId);
      TStr ParNmObjStr=TStr::GetSpaceStr((DstParLev-ParLev)*4)+CurNmObjStr;
      ContexterF->NmObjLinkageREd->Lines->Add(ParNmObjStr.CStr());
      // create vertex & edge
      VrtxNmToVrtxH.AddKey(CurNmObjStr);
      if (!PrevNmObjStr.Empty()){
        if (PrevNmObjStr<CurNmObjStr){
          VrtxNmPrV.AddUnique(TStrPr(PrevNmObjStr, CurNmObjStr));
        } else if (PrevNmObjStr>CurNmObjStr){
          VrtxNmPrV.AddUnique(TStrPr(CurNmObjStr, PrevNmObjStr));
        }
      }
      // save current named-object
      PrevNmObjStr=CurNmObjStr;
    }
  }

  // generate graph
  // create graph
  PGraph Graph=TGGraph::New();
  // create vertices
  for (int VrtxN=0; VrtxN<VrtxNmToVrtxH.Len(); VrtxN++){
    TStr VrtxNm=VrtxNmToVrtxH.GetKey(VrtxN);
    PVrtx Vrtx=TGVrtx::New(VrtxNm);
    VrtxNmToVrtxH.GetDat(VrtxNm)=Vrtx;
    Graph->AddVrtx(Vrtx);
  }
  // create edges
  for (int EdgeN=0; EdgeN<VrtxNmPrV.Len(); EdgeN++){
    PVrtx Vrtx1=VrtxNmToVrtxH.GetDat(VrtxNmPrV[EdgeN].Val1);
    PVrtx Vrtx2=VrtxNmToVrtxH.GetDat(VrtxNmPrV[EdgeN].Val2);
    PEdge Edge=new TGEdge(Vrtx1, Vrtx2, TStr::Fmt("_%d", EdgeN), false);
    Graph->AddEdge(Edge);
  }
  // place graph
  ContexterF->State->ElGraph=Graph;
  TRnd Rnd(1); ContexterF->State->ElGraph->PlaceSimAnnXY(Rnd, ContexterF->State->ElGks);
  // draw graph
  ContexterF->State->ElGks->Clr();
  ContexterF->ElPbPaint(NULL);
}