void printOverlapReport(const tmodel<COUNT, PROB>& tTable, sentenceHandler& testHandler,
                        vcbList& trainEList, vcbList& trainFList,
                        vcbList& testEList, vcbList& testFList)
{
  set<pair<WordIndex, WordIndex> > testCoocur;
  sentPair s;
  /*
  string unseenCoocurFile = Prefix + ".tst.unseen.cooc";
  ofstream of_unseenCoocur(unseenCoocurFile.c_str());
  string seenCoocurFile = Prefix + ".tst.seen.cooc";
  ofstream of_seenCoocur(seenCoocurFile.c_str());
  */
  testHandler.rewind();
  int seen_coocur = 0, unseen_coocur = 0, srcUnk = 0, trgUnk = 0;
  while (testHandler.getNextSentence(s)) {
    for (WordIndex i = 1; i < s.eSent.size(); i++)
      for (WordIndex j = 1; j < s.fSent.size(); j++)
        testCoocur.insert(pair<WordIndex, WordIndex>(s.eSent[i], s.fSent[j]));
  }
  set<pair<WordIndex, WordIndex> >::const_iterator i;
  for (i = testCoocur.begin(); i != testCoocur.end(); ++i) {
    if (tTable.getProb((*i).first, (*i).second) > PROB_SMOOTH) {
      seen_coocur++;
      // of_seenCoocur << (*i).first << ' ' << (*i).second << '\n';
    } else {
      unseen_coocur++;
      // of_unseenCoocur << (*i).first << ' ' << (*i).second << '\n';
    }
  }
  string trgUnkFile = Prefix + ".tst.trg.unk";
  ofstream of_trgUnk(trgUnkFile.c_str());
  for (WordIndex i = 0; i < testFList.getVocabList().size() && i < testFList.uniqTokens(); i++)
    if (testFList.getVocabList()[i].freq > 0 && trainFList.getVocabList()[i].freq <= 0) {
      of_trgUnk << i << ' ' << testFList.getVocabList()[i].word << ' '
                << testFList.getVocabList()[i].freq << '\n';
      trgUnk++;
    }
  string srcUnkFile = Prefix + ".tst.src.unk";
  ofstream of_srcUnk(srcUnkFile.c_str());
  for (WordIndex j = 0; j < testEList.getVocabList().size() && j < testEList.uniqTokens(); j++)
    if (testEList.getVocabList()[j].freq > 0 && trainEList.getVocabList()[j].freq <= 0) {
      srcUnk++;
      of_srcUnk << j << ' ' << testEList.getVocabList()[j].word << ' '
                << testEList.getVocabList()[j].freq << '\n';
    }
  string summaryFile = Prefix + ".tst.stats";
  ofstream of_summary(summaryFile.c_str());
  of_summary << "\t\t STATISTICS ABOUT TEST CORPUS\n\n";
  of_summary << "source unique tokens: " << testEList.uniqTokens() << '\n';
  of_summary << "target unique tokens: " << testFList.uniqTokens() << '\n';
  of_summary << "unique unseen source tokens: " << srcUnk << '\n';
  of_summary << "unique unseen target tokens: " << trgUnk << '\n';
  of_summary << "cooccurrences not found in the final t table: " << unseen_coocur << '\n';
  of_summary << "cooccurrences found in the final t table: " << seen_coocur << '\n';
}
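/* The report classifies every distinct test-corpus pair (e, f) as "seen" iff
   its trained translation probability exceeds PROB_SMOOTH, the floor value
   returned for pairs never observed in training. Output goes to three files
   derived from the global Prefix: <Prefix>.tst.trg.unk and <Prefix>.tst.src.unk
   list vocabulary entries that occur in the test data but not in training,
   and <Prefix>.tst.stats holds the summary written above. */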
void model1::initialize_table_uniformly(sentenceHandler& sHandler1)
{
  WordIndex i, j;
  cout << "Initialize tTable\n";
  sentPair sent;
  sHandler1.rewind();
  while (sHandler1.getNextSentence(sent)) {
    Vector<WordIndex>& es = sent.eSent;
    Vector<WordIndex>& fs = sent.fSent;
    PROB uniform = 1.0 / es.size();
    for (i = 0; i < es.size(); i++)
      for (j = 1; j < fs.size(); j++)
        tTable.insert(es[i], fs[j], 0, uniform);
  }
}
void model1::initialize_table_uniformly(sentenceHandler& sHandler1)
{
  WordIndex i, j;
  cout << "Initialize tTable\n";
  sentPair sent;
  sHandler1.rewind();
  while (sHandler1.getNextSentence(sent)) {
    Vector<WordIndex>& es = sent.eSent; // source
    Vector<WordIndex>& fs = sent.fSent; // target
    PROB uniform = 1.0 / es.size();
    for (i = 0; i < es.size(); i++)
      for (j = 1; j < fs.size(); j++) {
        //cout << "es[i]=" << es[i] << "\tfs[j]=" << fs[j] << endl;
        tTable.insert(es[i], fs[j], 0, uniform);
      }
  }
  tTable.printProbTable("./testfile", Elist.getVocabList(), Flist.getVocabList(), true);
}
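/* Both variants above seed every cooccurring pair (es[i], fs[j]) with the
   per-sentence uniform value t(f_j | e_i) = 1/|es|; since es includes the
   empty (NULL) word at position 0, this is 1/(l+1) for a source sentence of
   length l. Pairs that never cooccur in the corpus get no entry and later
   fall back to PROB_SMOOTH. The second variant only adds per-pair debug
   tracing and dumps the freshly initialized table to ./testfile. */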
void model2::initialize_table_uniformly(sentenceHandler& sHandler1)
{
  // initialize the aTable uniformly (run this before running em_with_tricks)
  int n = 0;
  sentPair sent;
  sHandler1.rewind();
  while (sHandler1.getNextSentence(sent)) {
    Vector<WordIndex>& es = sent.eSent;
    Vector<WordIndex>& fs = sent.fSent;
    WordIndex l = es.size() - 1;
    WordIndex m = fs.size() - 1;
    n++;
    if (1 <= m && aTable.getValue(l, m, l, m) <= PROB_SMOOTH) {
      PROB uniform_val = 1.0 / (l + 1);
      for (WordIndex j = 1; j <= m; j++)
        for (WordIndex i = 0; i <= l; i++)
          aTable.setValue(i, j, l, m, uniform_val);
    }
  }
}
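/* Model 2 adds an explicit alignment distribution a(i | j, l, m). The check
   against PROB_SMOOTH makes the pass idempotent: a given length pair (l, m)
   is initialized only the first time it is encountered, with
   a(i | j, l, m) = 1/(l+1) for every source position i in 0..l
   (position 0 being the empty word). */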
void model1::em_loop(int it, Perplexity& perp, sentenceHandler& sHandler1,
                     bool seedModel1, bool dump_alignment, const char* alignfile,
                     Dictionary& dict, bool useDict, Word2Vec& word2vec,
                     bool useWord2Vec, Perplexity& viterbi_perp, bool test)
{
  WordIndex i, j, l, m;
  double cross_entropy;
  int pair_no = 0;
  perp.clear();
  viterbi_perp.clear();
  ofstream of2;
  // for each sentence pair in the corpus
  if (dump_alignment || FEWDUMPS)
    of2.open(alignfile);
  PROB uniform = 1.0 / noFrenchWords;
  sentPair sent;
  sHandler1.rewind();
  while (sHandler1.getNextSentence(sent)) {
    Vector<WordIndex>& es = sent.eSent;
    Vector<WordIndex>& fs = sent.fSent;
    const float so = sent.getCount();
    l = es.size() - 1;
    m = fs.size() - 1;
    cross_entropy = log(1.0);
    Vector<WordIndex> viterbi_alignment(fs.size());
    double viterbi_score = 1;
    bool eindict[l + 1];
    bool findict[m + 1];
    bool indict[m + 1][l + 1];
    bool isSimilar[m + 1][l + 1];
    if (useWord2Vec && word2vec.Method == 1) {
      for (unsigned int dummy = 0; dummy <= m; dummy++) {
        for (unsigned int dummy2 = 0; dummy2 <= l; dummy2++)
          isSimilar[dummy][dummy2] = false;
      }
      for (i = 1; i <= l; i++) {
        map<WordIndex, bool> simWords = word2vec.getVectorMap(es[i]);
        for (j = 1; j <= m; j++) {
          if (simWords.find(fs[j]) != simWords.end()) {
            isSimilar[j][i] = true;
          }
        }
      }
    }
    // cout << sent.sentenceNo << endl;
    if (it == 1 && useDict) {
      for (unsigned int dummy = 0; dummy <= l; dummy++) {
        eindict[dummy] = false;
      }
      for (unsigned int dummy = 0; dummy <= m; dummy++) {
        findict[dummy] = false;
        for (unsigned int dummy2 = 0; dummy2 <= l; dummy2++)
          indict[dummy][dummy2] = false;
      }
      for (j = 0; j <= m; j++)
        for (i = 0; i <= l; i++)
          if (dict.indict(fs[j], es[i])) {
            eindict[i] = findict[j] = indict[j][i] = true;
          }
    }
    for (j = 1; j <= m; j++) {
      // entries that map fs to all possible ei in this sentence.
      Vector<LpPair<COUNT, PROB>*> sPtrCache(es.size(), 0); // cache pointers to table
      LpPair<COUNT, PROB>** sPtrCachePtr;
      PROB denom = 0.0;
      WordIndex best_i = 0;     // i for which fj is best mapped to ei
      PROB word_best_score = 0; // score for the best mapping of fj
      if (it == 1 && !seedModel1) {
        denom = uniform * es.size();
        word_best_score = uniform;
      } else
        for ((i = 0), (sPtrCachePtr = &sPtrCache[0]); i <= l; i++, sPtrCachePtr++) {
          PROB e(0.0);
          (*sPtrCachePtr) = tTable.getPtr(es[i], fs[j]);
          if (i && useWord2Vec && !isSimilar[j][i] && word2vec.Method == 1)
            continue;
          if ((*sPtrCachePtr) != 0 && (*((*sPtrCachePtr))).prob > PROB_SMOOTH)
            e = (*((*sPtrCachePtr))).prob;
          else
            e = PROB_SMOOTH;
          denom += e;
          if (e > word_best_score) {
            word_best_score = e;
            best_i = i;
          }
        }
      viterbi_alignment[j] = best_i;
      viterbi_score *= word_best_score; /// denom ;
      if (denom == 0) {
        if (test)
          cerr << "WARNING: denom is zero (TEST)\n";
        else
          cerr << "WARNING: denom is zero (TRAIN)\n";
      }
      cross_entropy += log(denom);
      if (!test) {
        if (denom > 0) {
          COUNT val = COUNT(so) / (COUNT) double(denom);
          /* this block implements a constraint on counting:
             count(es[i], fs[j]) is incremented if and only if es[i] and fs[j]
             occur together in the dictionary, OR es[i] does not occur in the
             dictionary with any fs[x] and fs[j] does not occur in the
             dictionary with any es[y] */
          if (it == 1 && useDict) {
            for ((i = 0), (sPtrCachePtr = &sPtrCache[0]); i <= l; i++, sPtrCachePtr++) {
              if (i && useWord2Vec && !isSimilar[j][i] && word2vec.Method == 1)
                continue;
              if (indict[j][i] || (!findict[j] && !eindict[i])) {
                PROB e(0.0);
                if (it == 1 && !seedModel1)
                  e = uniform;
                else if ((*sPtrCachePtr) != 0 && (*((*sPtrCachePtr))).prob > PROB_SMOOTH)
                  e = (*((*sPtrCachePtr))).prob;
                else
                  e = PROB_SMOOTH;
                COUNT x = e * val;
                if (it == 1 || x > MINCOUNTINCREASE)
                  if ((*sPtrCachePtr) != 0)
                    (*((*sPtrCachePtr))).count += x;
                  else
                    tTable.incCount(es[i], fs[j], x);
              } /* end of if */
            } /* end of for i */
          } /* end of it == 1 */
          // Old code:
          else {
            for ((i = 0), (sPtrCachePtr = &sPtrCache[0]); i <= l; i++, sPtrCachePtr++) {
              if (i && useWord2Vec && !isSimilar[j][i] && word2vec.Method == 1)
                continue;
              PROB e(0.0);
              if (it == 1 && !seedModel1) {
                e = uniform;
              } else if ((*sPtrCachePtr) != 0 && (*((*sPtrCachePtr))).prob > PROB_SMOOTH)
                e = (*((*sPtrCachePtr))).prob;
              else
                e = PROB_SMOOTH;
              //if( !(i==0) )
              //cout << "COUNT(e): " << e << " " << MINCOUNTINCREASE << endl;
              // if (useWord2Vec && word2vec.Method == 2)
              //   e = e * (1. - word2vec.L) + word2vec.getW2VProb(es[i], fs[j]) * word2vec.L;
              COUNT x = e * val;
              if (pair_no == VerboseSentence)
                cout << i << "(" << evlist[es[i]].word << ","
                     << j << "(" << fvlist[fs[j]].word << ")=" << x << endl;
              if (it == 1 || x > MINCOUNTINCREASE)
                if (NoEmptyWord == 0 || i != 0)
                  if ((*sPtrCachePtr) != 0)
                    (*((*sPtrCachePtr))).count += x;
                  else
                    tTable.incCount(es[i], fs[j], x);
            } /* end of for i */
          } // end of else
        } // end of if (denom > 0)
      } // if (!test)
    } // end of for (j)
    sHandler1.setProbOfSentence(sent, cross_entropy);
    //cerr << sent << "CE: " << cross_entropy << " " << so << endl;
    perp.addFactor(cross_entropy - m * log(l + 1.0), so, l, m, 1);
    viterbi_perp.addFactor(log(viterbi_score) - m * log(l + 1.0), so, l, m, 1);
    if (dump_alignment || (FEWDUMPS && sent.sentenceNo < 1000))
      printAlignToFile(es, fs, evlist, fvlist, of2, viterbi_alignment,
                       sent.sentenceNo, viterbi_score);
    addAL(viterbi_alignment, sent.sentenceNo, l);
    pair_no++;
  } /* of while */
  sHandler1.rewind();
  perp.record("Model1");
  viterbi_perp.record("Model1");
  errorReportAL(cout, "IBM-1");
}
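/* A sketch of the update this loop implements (the standard IBM Model 1
   E-step): for target position j, denom = sum_{i=0..l} t(f_j | e_i), the
   posterior responsibility of source position i is t(f_j | e_i) / denom,
   and the expected count added to the pair (e_i, f_j) is
       x = t(f_j | e_i) * so / denom    (t = uniform on iteration 1),
   with so the sentence's corpus count. Normalization of counts back into
   probabilities happens outside this loop. Subtracting m*log(l+1) from
   cross_entropy removes the uniform alignment prior, giving log p(f | e)
   for the perplexity bookkeeping. The word2vec Method == 1 hook simply
   skips (i, j) pairs whose words are not distributionally similar,
   restricting both the denominator and the count updates to that set. */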
void model2::em_loop(Perplexity& perp, sentenceHandler& sHandler1,
                     bool dump_alignment, const char* alignfile,
                     Perplexity& viterbi_perp, bool test)
{
  massert(aTable.is_distortion == 0);
  massert(aCountTable.is_distortion == 0);
  WordIndex i, j, l, m;
  double cross_entropy;
  int pair_no = 0;
  perp.clear();
  viterbi_perp.clear();
  ofstream of2;
  // for each sentence pair in the corpus
  if (dump_alignment || FEWDUMPS)
    of2.open(alignfile);
  sentPair sent;
  vector<double> ferts(evlist.size());
  sHandler1.rewind();
  while (sHandler1.getNextSentence(sent)) {
    Vector<WordIndex>& es = sent.eSent;
    Vector<WordIndex>& fs = sent.fSent;
    const float so = sent.getCount();
    l = es.size() - 1;
    m = fs.size() - 1;
    cross_entropy = log(1.0);
    Vector<WordIndex> viterbi_alignment(fs.size());
    double viterbi_score = 1;
    for (j = 1; j <= m; j++) {
      Vector<LpPair<COUNT, PROB>*> sPtrCache(es.size(), 0); // cache pointers to table
      // entries that map fs to all possible ei in this sentence.
      PROB denom = 0.0;
      PROB e = 0.0, word_best_score = 0;
      WordIndex best_i = 0; // i for which fj is best mapped to ei
      for (i = 0; i <= l; i++) {
        sPtrCache[i] = tTable.getPtr(es[i], fs[j]);
        if (sPtrCache[i] != 0 && (*(sPtrCache[i])).prob > PROB_SMOOTH)
          e = (*(sPtrCache[i])).prob * aTable.getValue(i, j, l, m);
        else
          e = PROB_SMOOTH * aTable.getValue(i, j, l, m);
        denom += e;
        if (e > word_best_score) {
          word_best_score = e;
          best_i = i;
        }
      }
      viterbi_alignment[j] = best_i;
      viterbi_score *= word_best_score; ///denom ;
      cross_entropy += log(denom);
      if (denom == 0) {
        if (test)
          cerr << "WARNING: denom is zero (TEST)\n";
        else
          cerr << "WARNING: denom is zero (TRAIN)\n";
      }
      if (!test) {
        if (denom > 0) {
          COUNT val = COUNT(so) / (COUNT) double(denom);
          for (i = 0; i <= l; i++) {
            PROB e(0.0);
            if (sPtrCache[i] != 0 && (*(sPtrCache[i])).prob > PROB_SMOOTH)
              e = (*(sPtrCache[i])).prob;
            else
              e = PROB_SMOOTH;
            e *= aTable.getValue(i, j, l, m);
            COUNT temp = COUNT(e) * val;
            if (NoEmptyWord == 0 || i != 0)
              if (sPtrCache[i] != 0)
                (*(sPtrCache[i])).count += temp;
              else
                tTable.incCount(es[i], fs[j], temp);
            aCountTable.getRef(i, j, l, m) += temp;
          } /* end of for i */
        } // end of if (denom > 0)
      } // if (!test)
    } // end of for (j)
    sHandler1.setProbOfSentence(sent, cross_entropy);
    perp.addFactor(cross_entropy, so, l, m, 1);
    viterbi_perp.addFactor(log(viterbi_score), so, l, m, 1);
    if (dump_alignment || (FEWDUMPS && sent.sentenceNo < 1000))
      printAlignToFile(es, fs, Elist.getVocabList(), Flist.getVocabList(), of2,
                       viterbi_alignment, sent.sentenceNo, viterbi_score);
    addAL(viterbi_alignment, sent.sentenceNo, l);
    pair_no++;
  } /* of while */
  sHandler1.rewind();
  perp.record("Model2");
  viterbi_perp.record("Model2");
  errorReportAL(cout, "IBM-2");
}
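/* Model 2 scores each candidate link with e = t(f_j | e_i) * a(i | j, l, m),
   so the posterior is p(a_j = i | f, e) = t(f_j | e_i) a(i | j, l, m) / denom,
   and the same expected count temp = e * so / denom feeds both the lexicon
   counts (tTable) and the alignment counts (aCountTable). Unlike Model 1,
   no m*log(l+1) correction is applied to the perplexity factor: the
   alignment probabilities are already included in denom. */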
void model1::em_loop(int it, Perplexity& perp, sentenceHandler& sHandler1,
                     bool seedModel1, bool dump_alignment, const char* alignfile,
                     Dictionary& dict, bool useDict, Perplexity& viterbi_perp, bool test)
{
  WordIndex i, j, l, m;
  double cross_entropy;
  int pair_no = 0;
  perp.clear();
  viterbi_perp.clear();
  ofstream of2;
  // for each sentence pair in the corpus
  if (dump_alignment || FEWDUMPS)
    of2.open(alignfile);
  cerr << " number of French (target) words = " << noFrenchWords << endl;
  PROB uniform = 1.0 / noFrenchWords;
  cerr << "initial uniform prob = " << uniform << endl;
  sentPair sent;
  sHandler1.rewind();
  while (sHandler1.getNextSentence(sent)) {
    Vector<WordIndex>& es = sent.eSent;
    Vector<WordIndex>& fs = sent.fSent;
    const float so = sent.getCount(); // number of times sentence occurs in corpus
    //std::cerr << "\n\nNEW sentence (#" << (pair_no + 1) << ") with count = " << so << endl;
    l = es.size() - 1; // source length
    m = fs.size() - 1; // target length
    cross_entropy = log(1.0);
    Vector<WordIndex> viterbi_alignment(fs.size());
    double viterbi_score = 1;
    /*mebool eindict[l + 1];
    bool findict[m + 1];
    bool indict[m + 1][l + 1];
    if(it == 1 && useDict){
      for(unsigned int dummy = 0; dummy <= l; dummy++)
        eindict[dummy] = false;
      for(unsigned int dummy = 0; dummy <= m; dummy++){
        findict[dummy] = false;
        for(unsigned int dummy2 = 0; dummy2 <= l; dummy2++)
          indict[dummy][dummy2] = false;
      }
      for(j = 0; j <= m; j++)
        for(i = 0; i <= l; i++)
          if(dict.indict(fs[j], es[i])){
            eindict[i] = findict[j] = indict[j][i] = true;
          }
    }me*/
    for (j = 1; j <= m; j++) {
      //cerr << "Current french (TARGET) word = " << fs[j] << endl;
      // entries that map fs to all possible ei in this sentence.
      Vector<LpPair<COUNT, PROB>*> sPtrCache(es.size(), 0); // cache pointers to table
      LpPair<COUNT, PROB>** sPtrCachePtr;
      PROB denom = 0.0;
      WordIndex best_i = 0;     // i for which fj is best mapped to ei
      PROB word_best_score = 0; // score for the best mapping of fj
      if (it == 1 && !seedModel1) {
        //cerr << "Using uniform denominator\n";
        denom = uniform * es.size();
        word_best_score = uniform;
      } else
        for ((i = 0), (sPtrCachePtr = &sPtrCache[0]); i <= l; i++, sPtrCachePtr++) {
          //cerr << "current english (SOURCE) word = " << es[i] << endl;
          PROB e(0.0);
          srcHits_.insert(es[i]);
          (*sPtrCachePtr) = tTable.getPtr(es[i], fs[j]);
          if ((*sPtrCachePtr) != 0 && (*((*sPtrCachePtr))).prob > PROB_SMOOTH)
            e = (*((*sPtrCachePtr))).prob;
          else
            e = PROB_SMOOTH;
          denom += e;
          if (e > word_best_score) {
            word_best_score = e;
            best_i = i;
          }
        }
      viterbi_alignment[j] = best_i;
      viterbi_score *= word_best_score; /// denom ;
      if (denom == 0) {
        if (test)
          cerr << "WARNING: denom is zero (TEST)\n";
        else
          cerr << "WARNING: denom is zero (TRAIN)\n";
      }
      cross_entropy += log(denom);
      if (!test) {
        if (denom > 0) {
          COUNT val = COUNT(so) / (COUNT) double(denom);
          /* this block implements a constraint on counting:
             count(es[i], fs[j]) is incremented if and only if es[i] and fs[j]
             occur together in the dictionary, OR es[i] does not occur in the
             dictionary with any fs[x] and fs[j] does not occur in the
             dictionary with any es[y] */
          /*meif(it == 1 && useDict){
            for((i=0),(sPtrCachePtr=&sPtrCache[0]); i <= l; i++,sPtrCachePtr++){
              if(indict[j][i] || (!findict[j] && !eindict[i])){
                PROB e(0.0);
                if (it == 1 && !seedModel1)
                  e = uniform;
                else if ((*sPtrCachePtr) != 0 && (*((*sPtrCachePtr))).prob > PROB_SMOOTH)
                  e = (*((*sPtrCachePtr))).prob;
                else
                  e = PROB_SMOOTH;
                COUNT x = e*val;
                if( it==1||x>MINCOUNTINCREASE )
                  if ((*sPtrCachePtr) != 0)
                    (*((*sPtrCachePtr))).count += x;
                  else
                    tTable.incCount(es[i], fs[j], x);
              }
            }
          }
          // Old code:
          else{me*/
          for ((i = 0), (sPtrCachePtr = &sPtrCache[0]); i <= l; i++, sPtrCachePtr++) {
            //for(i=0; i <= l; i++) {
            PROB e(0.0);
            if (it == 1 && !seedModel1)
              e = uniform;
            else if ((*sPtrCachePtr) != 0 && (*((*sPtrCachePtr))).prob > PROB_SMOOTH)
              e = (*((*sPtrCachePtr))).prob;
            else
              e = PROB_SMOOTH;
            //if( !(i==0) )
            //cout << "COUNT(e): " << e << " " << MINCOUNTINCREASE << endl;
            COUNT x = e * val; // new count
            if (pair_no == VerboseSentence)
              cout << i << "(" << evlist[es[i]].word << ","
                   << j << "(" << fvlist[fs[j]].word << ")=" << x << endl;
            if (it == 1 || x > MINCOUNTINCREASE) {
              if (step_k != 0)
                tTable.stepCounts_[wordPairIds(es[i], fs[j])] += x;
              else if (NoEmptyWord == 0 || i != 0)
                if ((*sPtrCachePtr) != 0) { // handles single sentence updates
                  //x = getInterpolatedCount(x, (*((*sPtrCachePtr))).count); // get interpolated count here
                  (*((*sPtrCachePtr))).count += x;
                } else {
                  //x = getInterpolatedCount(x, (*((*sPtrCachePtr))).count); // get interpolated count here
                  tTable.incCount(es[i], fs[j], x);
                } // increment temp table instead
            }
          } /* end of for i */
          //me} // end of else
        } // end of if (denom > 0)
      } // if (!test)
    } // end of for all (j) target words
    sHandler1.setProbOfSentence(sent, cross_entropy);
    //cerr << sent << "CE: " << cross_entropy << " " << so << endl;
    perp.addFactor(cross_entropy - m * log(l + 1.0), so, l, m, 1);
    viterbi_perp.addFactor(log(viterbi_score) - m * log(l + 1.0), so, l, m, 1);
    if (dump_alignment || (FEWDUMPS && sent.sentenceNo < 1000))
      printAlignToFile(es, fs, evlist, fvlist, of2, viterbi_alignment,
                       sent.sentenceNo, viterbi_score);
    addAL(viterbi_alignment, sent.sentenceNo, l);
    pair_no++;
  } /* of while */
  sHandler1.rewind();
  perp.record("Model1");
  viterbi_perp.record("Model1");
  errorReportAL(cout, "IBM-1");
}
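/* This variant differs from the em_loop above in two ways: srcHits_ records
   every source token considered (bookkeeping for incremental updates), and
   when step_k != 0 the expected counts are diverted into tTable.stepCounts_,
   keyed by wordPairIds(es[i], fs[j]), instead of the main count field,
   presumably so a stepwise/online EM update can fold them into the model
   later (the commented-out getInterpolatedCount calls point the same way).
   The dictionary-constrained counting branch is disabled here via the
   /*me ... me*/ comment markers. */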
void model3::viterbi_loop(Perplexity& perp, Perplexity& viterbiPerp,
                          sentenceHandler& sHandler1, bool dump_files,
                          const char* alignfile, bool collect_counts, string model)
{
  WordIndex i, j, l, m;
  ofstream of2;
  int pair_no;
  LogProb temp;
  if (dump_files)
    of2.open(alignfile);
  pair_no = 0; // sentence pair number
  // for each sentence pair in the corpus
  perp.clear(); // clears cross_entropy & perplexity
  viterbiPerp.clear();
  sentPair sent;
  while (sHandler1.getNextSentence(sent)) {
    Vector<WordIndex>& es = sent.eSent;
    Vector<WordIndex>& fs = sent.fSent;
    const float count = sent.getCount();
    if ((sent.sentenceNo % 1000) == 0)
      cerr << sent.sentenceNo << '\n';
    time_t sent_s = time(NULL);
    pair_no++;
    l = es.size() - 1;
    m = fs.size() - 1;
    if (Log) {
      logmsg << "Processing sentence pair:\n\t";
      printSentencePair(es, fs, logmsg);
      for (i = 0; i <= l; i++)
        logmsg << Elist.getVocabList()[es[i]].word << " ";
      logmsg << "\n\t";
      for (j = 1; j <= m; j++)
        logmsg << Flist.getVocabList()[fs[j]].word << " ";
      logmsg << "\n";
    }
    LogProb align_total_count = 0;
    // LogProb best_score;
    Vector<WordIndex> viterbi_alignment;
    LogProb viterbi_score;
    alignmodel neighborhood;
    neighborhood.clear();
    align_total_count = 0;
    findAlignmentsNeighborhood(/*tTable, aTable,*/ /*p1_count, p0_count,*/
                               es, fs, align_total_count, neighborhood);
    if (Peg) {
      for (i = 0; i <= l; i++)
        for (j = 1; j <= m; j++) {
          if ((tTable.getProb(es[i], fs[j]) > PROB_SMOOTH) &&
              (aTable.getValue(i, j, l, m) > PROB_SMOOTH) &&
              (dTable.getValue(j, i, l, m) > PROB_SMOOTH))
            findAlignmentsNeighborhood(/*tTable, aTable,*/ /*p1_count, p0_count,*/
                                       es, fs, align_total_count, neighborhood, i, j);
        }
    }
    // Now collect counts over saved neighborhoods
    viterbi_score = 0;
    if (Verbose)
      cerr << "\nCollecting counts over found alignments, total prob: "
           << align_total_count << "\n";
    if (Log)
      logmsg << "\nCollecting counts over found alignments, total prob: "
             << align_total_count << "\n";
    hash_map<Vector<WordIndex>, LogProb, hashmyalignment, equal_to_myalignment>::iterator align;
    int acount = 0;
    if (align_total_count == 0) {
      cerr << " WARNING: For the following sentence pair:\n";
      printSentencePair(es, fs, cerr);
      cerr << "The collection of alignments found has 0 probability!!\n";
      cerr << "No counts will be collected from it\n";
      if (Log) {
        logmsg << "The collection of alignments found has 0 probability!!\n";
        logmsg << "No counts will be collected from it\n";
      }
    } else {
      if (collect_counts) {
        for (align = neighborhood.begin(); align != neighborhood.end(); align++) {
          temp = (*align).second / align_total_count;
          collectCountsOverAlignement(/*tTable, aCountTable,*/ es, fs,
                                      /*p1_count, p0_count,*/ ((*align).first), temp, count);
          acount++;
          if (viterbi_score < temp) {
            viterbi_alignment = ((*align).first);
            viterbi_score = temp;
          }
        }
      } // end of if (collect_counts)
      perp.addFactor(log(double(align_total_count)), count, l, m, 0);
      viterbiPerp.addFactor(log(double(viterbi_score)), count, l, m, 0);
      if (Verbose) {
        cerr << "Collected counts over " << acount << " (of "
             << pow(double(l + 1), double(m)) << ") different alignments\n";
        cerr << "Bucket count of alignments hash: "
             << neighborhood.getHash().bucket_count() << ", size "
             << neighborhood.getHash().size() << "\n";
      }
      if (Log) {
        logmsg << "Collected counts over " << acount << " (of "
               << pow(double(l + 1), double(m)) << ") different alignments\n";
        logmsg << "Bucket count of alignments hash: "
               << neighborhood.getHash().bucket_count() << "\n";
      }
    } // end of else
    // write best alignment (viterbi) for this sentence pair to alignment file
    if (collect_counts) {
      if (viterbi_score <= 0) {
        cerr << "Viterbi alignment for this pair has score zero!!\n";
        of2 << "\n\n";
      } else {
        if (dump_files)
          printAlignToFile(es, fs, Elist.getVocabList(), Flist.getVocabList(), of2,
                           viterbi_alignment, pair_no, viterbi_score);
        addAL(viterbi_alignment, sent.sentenceNo, l);
      }
    } // end of if (collect_counts)
    double period = difftime(time(NULL), sent_s);
    if (Log)
      logmsg << "processing this sentence pair (" << l + 1 << "x" << m << ") : "
             << (l + 1) * m << " took : " << period << " seconds\n";
    if (Verbose)
      cerr << "processing this sentence pair took : " << period << " seconds\n";
  } /* of sentence pair E, F */
  sHandler1.rewind();
  errorReportAL(cerr, model);
  perp.record(model);
  viterbiPerp.record(model);
  if (dump_files)
    of2.close();
}
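/* Rather than summing over all (l+1)^m alignments as exact Model 3 EM would
   require, viterbi_loop restricts the E-step to a neighborhood of
   high-probability alignments produced by findAlignmentsNeighborhood,
   optionally re-seeded from every plausible link (i, j) when pegging (Peg)
   is enabled. Each saved alignment a then contributes counts weighted by
   its normalized score p(a | e, f) ~ score(a) / align_total_count, and the
   highest-weight member of the neighborhood is written out as the Viterbi
   alignment. (The reported total above was fixed from pow(m, l+1) to
   pow(l+1, m), matching the enumeration used in model3::em below.) */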
void model3::em(int noIterations, sentenceHandler& sHandler1)
{
  LogProb all_prob, aprob, temp;
  WordIndex i, j, l, m;
  time_t it_st, st, it_fn, fn;
  string tfile, dfile, nfile, p0file, afile, number;
  st = time(NULL);
  cout << "\n" << "Starting Model3: Training";
  // sentenceHandler sHandler1(efFilename.c_str());
  sHandler1.rewind();
  for (int it = 1; it <= noIterations; it++) {
    it_st = time(NULL);
    cout << "\n" << "Model3: Iteration " << it;
    // set up the names of the files where the tables will be printed
    int n = it;
    number = "";
    do {
      //mj changed next line
      number.insert((size_t)0, 1, (char)(n % 10 + '0'));
    } while ((n /= 10) > 0);
    tfile = Prefix + ".t3." + number;
    afile = Prefix + ".a3." + number;
    nfile = Prefix + ".n3." + number;
    dfile = Prefix + ".d3." + number;
    p0file = Prefix + ".p0_3." + number;
    // tCountTable.clear();
    dCountTable.clear();
    nCountTable.clear();
    p0_count = 0.0;
    p1_count = 0.0;
    all_prob = 0;
    sentPair sent;
    while (sHandler1.getNextSentence(sent)) {
      Vector<WordIndex>& es = sent.eSent;
      Vector<WordIndex>& fs = sent.fSent;
      const float count = sent.getCount();
      if ((sent.sentenceNo % 1000) == 0)
        cout << sent.sentenceNo << '\n';
      Vector<WordIndex> A(fs.size(), /*-1*/ 0);
      Vector<WordIndex> Fert(es.size(), 0);
      LogProb lcount = (LogProb)count;
      l = es.size() - 1;
      m = fs.size() - 1;
      WordIndex x, y;
      all_prob = prob_of_target_given_source(tTable, fs, es);
      if (all_prob == 0)
        cout << "\n" << "all_prob = 0";
      for (x = 0; x < pow(l + 1.0, double(m)); x++) { // for all possible alignments A
        y = x;
        for (j = 1; j <= m; j++) {
          A[j] = y % (l + 1);
          y /= (l + 1);
        }
        for (i = 0; i <= l; i++)
          Fert[i] = 0;
        for (j = 1; j <= m; j++)
          Fert[A[j]]++;
        if (2 * Fert[0] <= m) {
          /* consider only alignments where Fert[0] is at most half the
             number of words in the French sentence */
          aprob = prob_of_target_and_alignment_given_source(A, Fert, tTable, fs, es);
          temp = aprob / all_prob;
          LogProb templcount = temp * lcount;
          for (j = 1; j <= m; j++) {
            tTable.incCount(es[A[j]], fs[j], templcount);
            if (0 != A[j])
              dCountTable.addValue(j, A[j], l, m, templcount);
          }
          for (i = 0; i <= l; i++) {
            nCountTable.addValue(es[i], Fert[i], templcount);
            //cout << "AFTER INC2: " << templcount << " " << nCountTable.getRef(es[i], Fert[i]) << '\n';
          }
          p1_count += double(temp) * (Fert[0] * count);
          p0_count += double(temp) * ((m - 2 * Fert[0]) * count);
        }
      } /* of looping over all alignments */
    } /* of sentence pair E, F */
    sHandler1.rewind();
    // normalize tables
    if (OutputInAachenFormat == 1)
      tTable.printCountTable(tfile.c_str(), Elist.getVocabList(), Flist.getVocabList(), 1);
    tTable.normalizeTable(Elist, Flist);
    aCountTable.normalize(aTable);
    dCountTable.normalize(dTable);
    nCountTable.normalize(nTable, &Elist.getVocabList());
    // normalize p1 & p0
    if (p1_count + p0_count != 0) {
      p1 = p1_count / (p1_count + p0_count);
      p0 = 1 - p1;
    } else {
      p1 = p0 = 0;
    }
    // print tables
    if (OutputInAachenFormat == 0)
      tTable.printProbTable(tfile.c_str(), Elist.getVocabList(), Flist.getVocabList(),
                            OutputInAachenFormat);
    dTable.printTable(dfile.c_str());
    nTable.printNTable(Elist.uniqTokens(), nfile.c_str(), Elist.getVocabList(),
                       OutputInAachenFormat);
    ofstream of(p0file.c_str());
    of << p0;
    of.close();
    it_fn = time(NULL);
    cout << "\n" << "Model3 Iteration " << it << " took: "
         << difftime(it_fn, it_st) << " seconds\n";
  } /* of iterations */
  fn = time(NULL);
  cout << "\n" << "Entire Model3 Training took: " << difftime(fn, st) << " seconds\n";
}
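/* The loop over x enumerates every alignment A of the m French words to the
   l+1 English positions by treating x as an m-digit number in base (l+1):
   digit j (least significant first) is A[j]. Each alignment then contributes
   counts weighted by its posterior temp = p(f, A | e) / p(f | e). A minimal
   standalone sketch of just the decoding step, with toy lengths l = m = 2
   (not part of the training code; compile separately):

   #include <iostream>
   #include <vector>
   int main() {
     const unsigned l = 2, m = 2;          // toy sentence lengths
     std::vector<unsigned> A(m + 1, 0);    // A[1..m]; index 0 unused
     for (unsigned x = 0; x < 9; ++x) {    // (l+1)^m = 9 alignments
       unsigned y = x;
       for (unsigned j = 1; j <= m; ++j) { A[j] = y % (l + 1); y /= (l + 1); }
       std::cout << x << ": A[1]=" << A[1] << " A[2]=" << A[2] << '\n';
     }
     return 0;
   }

   This exhaustive E-step is exponential in m, which is why viterbi_loop
   above approximates it with a hillclimbing neighborhood instead. */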
void hmm::em_loop(Perplexity& perp, sentenceHandler& sHandler1, bool dump_alignment,
                  const char* alignfile, Perplexity& viterbi_perp, bool test,
                  bool doInit, int)
{
  WordIndex i, j, l, m;
  double cross_entropy;
  int pair_no = 0;
  perp.clear();
  viterbi_perp.clear();
  ofstream of2;
  // for each sentence pair in the corpus
  if (dump_alignment || FEWDUMPS)
    of2.open(alignfile);
  sentPair sent;
  sHandler1.rewind();
  while (sHandler1.getNextSentence(sent)) {
    const Vector<WordIndex>& es = sent.get_eSent();
    const Vector<WordIndex>& fs = sent.get_fSent();
    const float so = sent.getCount();
    l = es.size() - 1;
    m = fs.size() - 1;
    cross_entropy = log(1.0);
    Vector<WordIndex> viterbi_alignment(fs.size());
    unsigned int I = 2 * l, J = m;
    bool DependencyOfJ = (CompareAlDeps & (16 | 8)) || (PredictionInAlignments == 2);
    bool DependencyOfPrevAJ = (CompareAlDeps & (2 | 4)) || (PredictionInAlignments == 0);
    HMMNetwork* net = makeHMMNetwork(es, fs, doInit);
    Array<double> gamma;
    Array<Array2<double> > epsilon(DependencyOfJ ? (m - 1) : 1);
    double trainProb;
    trainProb = ForwardBackwardTraining(*net, gamma, epsilon);
    if (!test) {
      /*
      bool eIsPunct[l + 1];
      bool fIsPunct[m + 1];
      for(unsigned int dummy = 0; dummy <= l; dummy++) {
        eIsPunct[dummy] = isPunctuation(evlist[es[dummy]].word);
      }
      for(unsigned int dummy = 0; dummy <= m; dummy++){
        fIsPunct[dummy] = isPunctuation(fvlist[fs[dummy]].word);
      }
      */
      double* gp = conv<double>(gamma.begin());
      for (unsigned int i2 = 0; i2 < J; i2++)
        for (unsigned int i1 = 0; i1 < I; ++i1, ++gp)
          if (*gp > MINCOUNTINCREASE) {
            COUNT add = *gp * so;
            if (i1 >= l) {
              tTable.incCount(es[0], fs[1 + i2], add);
              aCountTable.getRef(0, i2 + 1, l, m) += add;
            } else
            //if((eIsPunct[i1 + 1] && fIsPunct[i2 + 1] && evlist[es[i1 + 1]].word == fvlist[fs[i2 + 1]].word) || (!eIsPunct[i1 + 1] && !fIsPunct[i2 + 1]) )
            {
              tTable.incCount(es[1 + i1], fs[1 + i2], add);
              aCountTable.getRef(1 + i1, 1 + i2, l, m) += add;
            }
          }
      double p0c = 0.0, np0c = 0.0;
      for (unsigned int jj = 0; jj < epsilon.size(); jj++) {
        int frenchClass = fwordclasses.getClass(fs[1 + min(int(m) - 1, int(jj) + 1)]);
        double* ep = epsilon[jj].begin();
        if (ep) {
          //for(i=0;i<I;i++)
          //  normalize_if_possible_with_increment(ep+i,ep+i+I*I,I);
          //for(i=0;i<I*I;++i)
          //  ep[i] *= I;
          //if( DependencyOfJ )
          //  if( J-1 )
          //    for(i=0;i<I*I;++i)
          //      ep[i] /= (J-1);
          double mult = 1.0;
          mult *= l;
          //if( DependencyOfJ && J-1)
          //  mult /= (J-1);
          for (i = 0; i < I; i++) {
            for (unsigned int i_bef = 0; i_bef < I; i_bef++, ep++) {
              CLASSIFY(i, i_empty, ireal);
              CLASSIFY2(i_bef, i_befreal);
              if (i_empty)
                p0c += *ep * mult;
              else {
                counts.addAlCount(i_befreal, ireal, l, m,
                                  ewordclasses.getClass(es[1 + i_befreal]),
                                  frenchClass, jj + 1, *ep * mult, 0.0);
                np0c += *ep * mult;
              }
              massert(&epsilon[jj](i, i_bef) == ep);
            }
          }
        }
      }
      double* gp1 = conv<double>(gamma.begin()), *gp2 = conv<double>(gamma.end()) - I;
      Array<double>& ai = counts.doGetAlphaInit(I);
      Array<double>& bi = counts.doGetBetaInit(I);
      int firstFrenchClass = (fs.size() > 1) ? (fwordclasses.getClass(fs[1 + 0])) : 0;
      for (i = 0; i < I; i++, gp1++, gp2++) {
        CLASSIFY(i, i_empty, ireal);
        ai[i] += *gp1;
        bi[i] += *gp2;
        if (DependencyOfPrevAJ == 0) {
          if (i_empty)
            p0c += *gp1;
          else {
            counts.addAlCount(-1, ireal, l, m, 0, firstFrenchClass, 0, *gp1, 0.0);
            np0c += *gp1;
          }
        }
      }
      if (Verbose)
        cout << "l: " << l << " m: " << m << " p0c: " << p0c << " np0c: " << np0c << endl;
    }
    cross_entropy += log(max(trainProb, 1e-100)) + log(max(net->finalMultiply, 1e-100));
    Array<int> vit;
    double viterbi_score = 1.0;
    if ((HMMTrainingSpecialFlags & 1))
      HMMViterbi(*net, gamma, vit);
    else
      viterbi_score = HMMRealViterbi(*net, vit);
    for (j = 1; j <= m; j++) {
      viterbi_alignment[j] = vit[j - 1] + 1;
      if (viterbi_alignment[j] > l)
        viterbi_alignment[j] = 0;
    }
    sHandler1.setProbOfSentence(sent, cross_entropy);
    perp.addFactor(cross_entropy, so, l, m, 1);
    viterbi_perp.addFactor(log(viterbi_score) + log(max(net->finalMultiply, 1e-100)),
                           so, l, m, 1);
    if (Verbose)
      cout << "Viterbi-perp: " << log(viterbi_score) << ' '
           << log(max(net->finalMultiply, 1e-100)) << ' ' << viterbi_score << ' '
           << net->finalMultiply << ' ' << *net << "gamma: " << gamma << endl;
    delete net;
    net = 0;
    if (dump_alignment || (FEWDUMPS && sent.getSentenceNo() < 1000))
      printAlignToFile(es, fs, Elist.getVocabList(), Flist.getVocabList(), of2,
                       viterbi_alignment, sent.getSentenceNo(), viterbi_score);
    addAL(viterbi_alignment, sent.getSentenceNo(), l);
    pair_no++;
  } /* of while */
  sHandler1.rewind();
  perp.record("HMM");
  viterbi_perp.record("HMM");
  errorReportAL(cout, "HMM");
}
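/* The HMM alignment model (Vogel et al. 1996) replaces Model 2's absolute
   positions with first-order transitions between source positions.
   makeHMMNetwork builds a lattice with I = 2l states per target position:
   states 0..l-1 are the real source positions and states l..2l-1 are their
   empty-word copies, which is why the i1 >= l branch routes lexicon counts
   to es[0]. ForwardBackwardTraining returns the sentence likelihood and
   fills gamma (posterior state occupancies, consumed by the tTable and
   aCountTable updates) and epsilon (posterior transition counts, consumed
   by the word-class-conditioned jump counts via counts.addAlCount);
   HMMRealViterbi then recovers the best state sequence for the dumped
   alignment, with states above l mapped back to the empty word (0). */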