// Compare the test corpus against the trained model: count the test-set
// co-occurrences that survived training (seen) vs. those absent from the
// final t-table (unseen), dump unknown source/target words to files, and
// write a summary statistics file.
void printOverlapReport(const tmodel<COUNT, PROB>& tTable,
                        sentenceHandler& testHandler, vcbList& trainEList,
                        vcbList& trainFList, vcbList& testEList, vcbList& testFList)
{
  set<pair<WordIndex, WordIndex> > testCoocur;
  sentPair s;
  /*
  string unseenCoocurFile = Prefix + ".tst.unseen.cooc";
  ofstream of_unseenCoocur(unseenCoocurFile.c_str());
  string seenCoocurFile = Prefix + ".tst.seen.cooc";
  ofstream of_seenCoocur(seenCoocurFile.c_str());
  */
  testHandler.rewind();
  int seen_coocur = 0, unseen_coocur = 0, srcUnk = 0, trgUnk = 0;
  // collect every (e, f) word-ID pair that co-occurs in a test sentence pair
  while (testHandler.getNextSentence(s)) {
    for (WordIndex i = 1; i < s.eSent.size(); i++)
      for (WordIndex j = 1; j < s.fSent.size(); j++)
        testCoocur.insert(pair<WordIndex, WordIndex>(s.eSent[i], s.fSent[j]));
  }
  set<pair<WordIndex, WordIndex> >::const_iterator i;
  for (i = testCoocur.begin(); i != testCoocur.end(); ++i) {
    if (tTable.getProb((*i).first, (*i).second) > PROB_SMOOTH) {
      seen_coocur++;
      // of_seenCoocur << (*i).first << ' ' << (*i).second << '\n';
    } else {
      unseen_coocur++;
      // of_unseenCoocur << (*i).first << ' ' << (*i).second << '\n';
    }
  }
  // target-side words that occur in the test data but not in training
  string trgUnkFile = Prefix + ".tst.trg.unk";
  ofstream of_trgUnk(trgUnkFile.c_str());
  for (WordIndex i = 0; i < testFList.getVocabList().size() && i < testFList.uniqTokens(); i++)
    if (testFList.getVocabList()[i].freq > 0 && trainFList.getVocabList()[i].freq <= 0) {
      of_trgUnk << i << ' ' << testFList.getVocabList()[i].word << ' '
                << testFList.getVocabList()[i].freq << '\n';
      trgUnk++;
    }
  // source-side words that occur in the test data but not in training
  string srcUnkFile = Prefix + ".tst.src.unk";
  ofstream of_srcUnk(srcUnkFile.c_str());
  for (WordIndex j = 0; j < testEList.getVocabList().size() && j < testEList.uniqTokens(); j++)
    if (testEList.getVocabList()[j].freq > 0 && trainEList.getVocabList()[j].freq <= 0) {
      srcUnk++;
      of_srcUnk << j << ' ' << testEList.getVocabList()[j].word << ' '
                << testEList.getVocabList()[j].freq << '\n';
    }
  string summaryFile = Prefix + ".tst.stats";
  ofstream of_summary(summaryFile.c_str());
  of_summary << "\t\t STATISTICS ABOUT TEST CORPUS\n\n";
  of_summary << "source unique tokens: " << testEList.uniqTokens() << '\n';
  of_summary << "target unique tokens: " << testFList.uniqTokens() << '\n';
  of_summary << "unique unseen source tokens: " << srcUnk << '\n';
  of_summary << "unique unseen target tokens: " << trgUnk << '\n';
  of_summary << "cooccurrences not found in the final t table: " << unseen_coocur << '\n';
  of_summary << "cooccurrences found in the final t table: " << seen_coocur << '\n';
}
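
// A minimal standalone sketch (not part of the original code) of the
// seen/unseen split that printOverlapReport performs over the test
// co-occurrences: each (e, f) word-ID pair is classified by whether the
// trained t-table still assigns it more than the smoothing floor.
// The lookupProb callback is a hypothetical stand-in for tTable.getProb.
static void countCoocOverlapSketch(const set<pair<WordIndex, WordIndex> >& testCooc,
                                   double (*lookupProb)(WordIndex, WordIndex),
                                   int& seen, int& unseen)
{
  seen = unseen = 0;
  set<pair<WordIndex, WordIndex> >::const_iterator it;
  for (it = testCooc.begin(); it != testCooc.end(); ++it) {
    if (lookupProb(it->first, it->second) > PROB_SMOOTH)
      seen++;    // pair kept in the final t-table with a non-trivial probability
    else
      unseen++;  // pair pruned during training or never observed
  }
}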
void tmodel<COUNT, PROB>::normalizeTable(const vcbList& engl, const vcbList& french, int iter)
// Normalize the conditional probability P(fj/ei), i.e. make sure that the
// sum over all j of P(fj/ei) equals 1. This method reads the counts portion
// of the table, normalizes it into the probability portion, and then clears
// (i.e. zeroes) the counts. If the resulting probability of an entry falls
// below a threshold, the entry is removed.
{
  if (iter == 2) {
    total2.resize(engl.uniqTokens());
    for (unsigned int i = 0; i < total2.size(); i++)
      total2[i] = 0.0;
  }
  nFrench.resize(engl.uniqTokens());
  for (unsigned int i = 0; i < nFrench.size(); i++)
    nFrench[i] = 0;
  nEng.resize(french.uniqTokens());
  for (unsigned int i = 0; i < nEng.size(); i++)
    nEng[i] = 0;
  Vector<double> total(engl.uniqTokens(), 0.0);
  //Vector<int> nFrench(engl.uniqTokens(), 0);
  //Vector<int> nEng(french.uniqTokens(), 0);
  // accumulate, for each source word e, the total count mass and the number
  // of distinct translations; nEng counts distinct sources per target word
  typename hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::const_iterator i;
  for (i = ef.begin(); i != ef.end(); i++) {
    if (iter == 2)
      total2[((*i).first).first] += (*i).second.count;
    total[((*i).first).first] += (*i).second.count;
    nFrench[((*i).first).first]++;
    nEng[((*i).first).second]++;
  }
  // inflate each denominator so that probability mass is reserved for the
  // target words never seen with this source word (smoothing)
  for (unsigned int k = 0; k < engl.uniqTokens(); ++k)
    if (nFrench[k]) {
      double probMass = (french.uniqTokensInCorpus() - nFrench[k]) * PROB_SMOOTH;
      if (probMass < 0.0)
        cout << k << " french.uniqTokensInCorpus(): " << french.uniqTokensInCorpus()
             << " nFrench[k]: " << nFrench[k] << '\n';
      total[k] += total[k] * probMass / (1 - probMass);
    }
  typename hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::iterator j, k;
  PROB p;
  int nParams = 0;
  int nor = 0;
  for (j = ef.begin(); j != ef.end(); ) {
    k = j;
    k++;  // keep the successor, since the current entry may be erased
    if ((total[((*j).first).first]) > 0.0)
      p = ((((*j).second).count) / (total[((*j).first).first]));
    else
      p = 0.0;
    if (p > PROB_CUTOFF) {
      if (iter > 0) {
        if (useWord2Vec && word2vec.Method == 2) {
          // interpolate the EM estimate with the word2vec-derived probability
          p = word2vec.L * word2vec.getW2VProb(((*j).first).first, ((*j).first).second)
              + (1. - word2vec.L) * p;
          nor = 1;
        }
        // intermediate pass: store the normalized value back into count so
        // the recursive call below re-normalizes it
        ((*j).second).prob = 0;
        ((*j).second).count = p;
      } else {
        // final pass: commit the probability and clear the count
        ((*j).second).prob = p;
        ((*j).second).count = 0;
      }
      nParams++;
    } else {
      erase(((*j).first).first, ((*j).first).second);
    }
    j = k;
  }
  if (nor)
    cout << "probabilities normalized in iteration = " << iter << endl;
  if (iter > 0)
    return normalizeTable(engl, french, iter - 1);
}
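
// Sketch of the smoothing arithmetic used in normalizeTable above (a
// hypothetical helper for illustration, not called by the trainer). For a
// source word e with nDistinct distinct translations observed out of a
// target vocabulary of vocabSize words,
//   probMass = (vocabSize - nDistinct) * PROB_SMOOTH
// is the mass reserved for the unseen translations, and the denominator is
// inflated so that the kept entries sum to 1 - probMass:
//   total' = total + total * probMass / (1 - probMass) = total / (1 - probMass)
//   p(f/e) = count(e, f) / total'
// For example, with vocabSize = 10000, nDistinct = 100 and PROB_SMOOTH = 1e-7,
// probMass = 9.9e-4, i.e. roughly 0.1% of the mass is set aside.
static double smoothedTotalSketch(double total, unsigned int vocabSize, unsigned int nDistinct)
{
  double probMass = (vocabSize - nDistinct) * PROB_SMOOTH;
  return total + total * probMass / (1 - probMass);
}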