void printAllTables(vcbList& eTrainVcbList, vcbList& eTestVcbList,
                    vcbList& fTrainVcbList, vcbList& fTestVcbList, model1& m1)
{
  cerr << "Writing final tables to disk\n";

  // Dump the inverse translation table: first with internal word ids,
  // then ("actual") with the surface forms from the training vocabularies.
  string t_inv_file = Prefix + ".ti.final";
  if (!FEWDUMPS)
    m1.getTTable().printProbTableInverse(t_inv_file.c_str(),
                                         m1.getEnglishVocabList(),
                                         m1.getFrenchVocabList(),
                                         m1.getETotalWCount(),
                                         m1.getFTotalWCount());
  t_inv_file = Prefix + ".actual.ti.final";
  if (!FEWDUMPS)
    m1.getTTable().printProbTableInverse(t_inv_file.c_str(),
                                         eTrainVcbList.getVocabList(),
                                         fTrainVcbList.getVocabList(),
                                         m1.getETotalWCount(),
                                         m1.getFTotalWCount(), true);

  // Perplexity report for the training corpus and, if present, the test corpus.
  string perp_filename = Prefix + ".perp";
  ofstream of_perp(perp_filename.c_str());
  cout << "Writing PERPLEXITY report to: " << perp_filename << '\n';
  if (!of_perp) {
    cerr << "\nERROR: Cannot write to " << perp_filename << '\n';
    exit(1);
  }
  if (testCorpus)
    generatePerplexityReport(trainPerp, testPerp, trainViterbiPerp, testViterbiPerp,
                             of_perp, (*corpus).getTotalNoPairs1(),
                             (*testCorpus).getTotalNoPairs1(), true);
  else
    generatePerplexityReport(trainPerp, testPerp, trainViterbiPerp, testViterbiPerp,
                             of_perp, (*corpus).getTotalNoPairs1(), 0, true);

  // Print training vocabulary lists.
  string eTrainVcbFile = Prefix + ".trn.src.vcb";
  ofstream of_eTrainVcb(eTrainVcbFile.c_str());
  cout << "Writing source vocabulary list to: " << eTrainVcbFile << '\n';
  if (!of_eTrainVcb) {
    cerr << "\nERROR: Cannot write to " << eTrainVcbFile << '\n';
    exit(1);
  }
  eTrainVcbList.printVocabList(of_eTrainVcb);

  string fTrainVcbFile = Prefix + ".trn.trg.vcb";
  ofstream of_fTrainVcb(fTrainVcbFile.c_str());
  cout << "Writing target vocabulary list to: " << fTrainVcbFile << '\n';
  if (!of_fTrainVcb) {
    cerr << "\nERROR: Cannot write to " << fTrainVcbFile << '\n';
    exit(1);
  }
  fTrainVcbList.printVocabList(of_fTrainVcb);

  // Print test vocabulary lists.
  string eTestVcbFile = Prefix + ".tst.src.vcb";
  ofstream of_eTestVcb(eTestVcbFile.c_str());
  cout << "Writing source vocabulary list to: " << eTestVcbFile << '\n';
  if (!of_eTestVcb) {
    cerr << "\nERROR: Cannot write to " << eTestVcbFile << '\n';
    exit(1);
  }
  eTestVcbList.printVocabList(of_eTestVcb);

  string fTestVcbFile = Prefix + ".tst.trg.vcb";
  ofstream of_fTestVcb(fTestVcbFile.c_str());
  cout << "Writing target vocabulary list to: " << fTestVcbFile << '\n';
  if (!of_fTestVcb) {
    cerr << "\nERROR: Cannot write to " << fTestVcbFile << '\n';
    exit(1);
  }
  fTestVcbList.printVocabList(of_fTestVcb);

  printDecoderConfigFile();
  if (testCorpus)
    printOverlapReport(m1.getTTable(), *testCorpus, eTrainVcbList,
                       fTrainVcbList, eTestVcbList, fTestVcbList);
}
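/*
  Illustrative sketch, not part of the original source: the figures written
  to Prefix.perp are derived from an accumulated corpus log-likelihood,
  assuming the usual definition perp = exp(-(sum of log P) / N). The base of
  the logarithm used internally by the Perplexity class is not shown in this
  file; all values below are hypothetical.

    #include <cmath>
    #include <cstdio>

    int main() {
      double sumLogProb = -3456.78;  // hypothetical total ln P over the corpus
      double nWords     = 1000.0;    // hypothetical number of target tokens
      double perp = std::exp(-sumLogProb / nWords);
      std::printf("PERPLEXITY = %f\n", perp);
      return 0;
    }
*/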
void printOverlapReport(const tmodel<COUNT, PROB>& tTable, sentenceHandler& testHandler,
                        vcbList& trainEList, vcbList& trainFList,
                        vcbList& testEList, vcbList& testFList)
{
  set<pair<WordIndex, WordIndex> > testCoocur;
  sentPair s;
  /* Optional dumps of the individual cooccurrence pairs:
     string unseenCoocurFile = Prefix + ".tst.unseen.cooc";
     ofstream of_unseenCoocur(unseenCoocurFile.c_str());
     string seenCoocurFile = Prefix + ".tst.seen.cooc";
     ofstream of_seenCoocur(seenCoocurFile.c_str());
  */
  testHandler.rewind();
  int seen_coocur = 0, unseen_coocur = 0, srcUnk = 0, trgUnk = 0;

  // Collect all distinct source/target word-id pairs that cooccur in the
  // test corpus (position 0, the reserved NULL-word slot, is skipped).
  while (testHandler.getNextSentence(s)) {
    for (WordIndex i = 1; i < s.eSent.size(); i++)
      for (WordIndex j = 1; j < s.fSent.size(); j++)
        testCoocur.insert(pair<WordIndex, WordIndex>(s.eSent[i], s.fSent[j]));
  }

  // A pair counts as "seen" only if its t-table probability is strictly
  // above the PROB_SMOOTH floor returned for untrained or pruned pairs.
  set<pair<WordIndex, WordIndex> >::const_iterator i;
  for (i = testCoocur.begin(); i != testCoocur.end(); ++i) {
    if (tTable.getProb((*i).first, (*i).second) > PROB_SMOOTH) {
      seen_coocur++;
      // of_seenCoocur << (*i).first << ' ' << (*i).second << '\n';
    } else {
      unseen_coocur++;
      // of_unseenCoocur << (*i).first << ' ' << (*i).second << '\n';
    }
  }

  // Target-side test tokens that never occur in the training data.
  string trgUnkFile = Prefix + ".tst.trg.unk";
  ofstream of_trgUnk(trgUnkFile.c_str());
  for (WordIndex i = 0; i < testFList.getVocabList().size() && i < testFList.uniqTokens(); i++)
    if (testFList.getVocabList()[i].freq > 0 && trainFList.getVocabList()[i].freq <= 0) {
      of_trgUnk << i << ' ' << testFList.getVocabList()[i].word << ' '
                << testFList.getVocabList()[i].freq << '\n';
      trgUnk++;
    }

  // Source-side test tokens that never occur in the training data.
  string srcUnkFile = Prefix + ".tst.src.unk";
  ofstream of_srcUnk(srcUnkFile.c_str());
  for (WordIndex j = 0; j < testEList.getVocabList().size() && j < testEList.uniqTokens(); j++)
    if (testEList.getVocabList()[j].freq > 0 && trainEList.getVocabList()[j].freq <= 0) {
      srcUnk++;
      of_srcUnk << j << ' ' << testEList.getVocabList()[j].word << ' '
                << testEList.getVocabList()[j].freq << '\n';
    }

  string summaryFile = Prefix + ".tst.stats";
  ofstream of_summary(summaryFile.c_str());
  of_summary << "\t\t STATISTICS ABOUT TEST CORPUS\n\n";
  of_summary << "source unique tokens: " << testEList.uniqTokens() << '\n';
  of_summary << "target unique tokens: " << testFList.uniqTokens() << '\n';
  of_summary << "unique unseen source tokens: " << srcUnk << '\n';
  of_summary << "unique unseen target tokens: " << trgUnk << '\n';
  of_summary << "cooccurrences not found in the final t table: " << unseen_coocur << '\n';
  of_summary << "cooccurrences found in the final t table: " << seen_coocur << '\n';
}
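/*
  Minimal sketch of the "seen vs. unseen" test above, with hypothetical data
  and a plain std::map standing in for the real tmodel t-table. The point:
  looking up an untrained or pruned pair yields the PROB_SMOOTH floor, so
  only a strictly greater probability counts as seen.

    #include <iostream>
    #include <map>
    #include <set>
    #include <utility>

    int main() {
      const double PROB_SMOOTH = 1e-7;
      std::map<std::pair<int, int>, double> tTable;
      tTable[std::make_pair(1, 2)] = 0.25;       // trained pair
      std::set<std::pair<int, int> > testPairs;
      testPairs.insert(std::make_pair(1, 2));    // counts as seen
      testPairs.insert(std::make_pair(1, 3));    // counts as unseen
      int seen = 0, unseen = 0;
      std::set<std::pair<int, int> >::const_iterator it;
      for (it = testPairs.begin(); it != testPairs.end(); ++it) {
        std::map<std::pair<int, int>, double>::const_iterator t = tTable.find(*it);
        double p = (t == tTable.end()) ? PROB_SMOOTH : t->second;
        if (p > PROB_SMOOTH) seen++; else unseen++;
      }
      std::cout << "seen: " << seen << " unseen: " << unseen << '\n';  // seen: 1 unseen: 1
      return 0;
    }
*/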
void tmodel<COUNT, PROB>::normalizeTable(const vcbList& engl, const vcbList& french, int iter)
// Normalize the conditional probabilities P(f_j | e_i), i.e. make sure that,
// for every source word e, the sum over all j of P(f_j | e) equals 1.
// This method reads the counts portion of the table, normalizes it into the
// probability portion, and then clears (zeroes) the counts. If the resulting
// probability of an entry falls below a threshold, the entry is removed.
{
  if (iter == 2) {
    total2.resize(engl.uniqTokens());
    for (unsigned int i = 0; i < total2.size(); i++)
      total2[i] = 0.0;
  }
  nFrench.resize(engl.uniqTokens());
  for (unsigned int i = 0; i < nFrench.size(); i++)
    nFrench[i] = 0;
  nEng.resize(french.uniqTokens());
  for (unsigned int i = 0; i < nEng.size(); i++)
    nEng[i] = 0;
  Vector<double> total(engl.uniqTokens(), 0.0);
  //Vector<int> nFrench(engl.uniqTokens(), 0);
  //Vector<int> nEng(french.uniqTokens(), 0);

  // Accumulate, for every source word e, the total count mass and the
  // number of distinct translation candidates in each direction.
  typename hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::const_iterator i;
  for (i = ef.begin(); i != ef.end(); i++) {
    if (iter == 2)
      total2[((*i).first).first] += (*i).second.count;
    total[((*i).first).first] += (*i).second.count;
    nFrench[((*i).first).first]++;
    nEng[((*i).first).second]++;
  }

  // Reserve smoothing mass PROB_SMOOTH for every target word that never
  // cooccurred with e, and inflate the observed totals accordingly.
  for (unsigned int k = 0; k < engl.uniqTokens(); ++k)
    if (nFrench[k]) {
      double probMass = (french.uniqTokensInCorpus() - nFrench[k]) * PROB_SMOOTH;
      if (probMass < 0.0)
        cout << k << " french.uniqTokensInCorpus(): " << french.uniqTokensInCorpus()
             << " nFrench[k]: " << nFrench[k] << '\n';
      total[k] += total[k] * probMass / (1 - probMass);
    }

  typename hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::iterator j, k;
  PROB p;
  int nParams = 0;
  int nor = 0;
  for (j = ef.begin(); j != ef.end(); ) {
    k = j;
    k++;
    if (total[((*j).first).first] > 0.0)
      p = ((*j).second).count / total[((*j).first).first];
    else
      p = 0.0;
    if (p > PROB_CUTOFF) {
      if (iter > 0) {
        // Optionally interpolate with the word2vec-based probability.
        if (useWord2Vec && word2vec.Method == 2) {
          p = word2vec.L * word2vec.getW2VProb(((*j).first).first, ((*j).first).second)
              + (1. - word2vec.L) * p;
          nor = 1;
        }
        // Intermediate pass: stash the normalized value in the count
        // field so the next (recursive) pass can re-normalize it.
        ((*j).second).prob = 0;
        ((*j).second).count = p;
      } else {
        // Final pass: commit the probability and clear the count.
        ((*j).second).prob = p;
        ((*j).second).count = 0;
      }
      nParams++;
    } else {
      erase(((*j).first).first, ((*j).first).second);  // prune low-probability entries
    }
    j = k;
  }
  if (nor)
    cout << "Probabilities normalized in iteration = " << iter << endl;
  if (iter > 0)
    return normalizeTable(engl, french, iter - 1);
}
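/*
  Numeric sanity check, illustrative only, of the smoothing step above:
  growing the denominator by total * probMass / (1 - probMass) makes it
  total / (1 - probMass), so every observed probability is scaled by
  (1 - probMass) and exactly probMass (= nUnseen * PROB_SMOOTH) is left over
  for the target words never seen with this source word. All values below
  are hypothetical.

    #include <cstdio>

    int main() {
      const double PROB_SMOOTH = 1e-7;
      double counts[3] = { 4.0, 3.0, 1.0 };        // observed counts for one e
      double total = 8.0;                          // their sum
      double nUnseen = 100000.0;                   // hypothetical unseen target words
      double probMass = nUnseen * PROB_SMOOTH;     // mass reserved for them
      total += total * probMass / (1 - probMass);  // same adjustment as in normalizeTable
      double sumObserved = 0.0;
      for (int i = 0; i < 3; ++i)
        sumObserved += counts[i] / total;
      std::printf("%.9f\n", sumObserved + probMass);  // ~1.0, up to rounding
      return 0;
    }
*/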