/**
 * Sums prob_of_target_and_alignment_given_source() over every alignment of
 * the target sentence `fs` to the source sentence `es`, by brute-force
 * enumeration.
 *
 * Target positions 1..m (m = fs.size()-1) are each assigned a source
 * position in 0..l (l = es.size()-1).  An alignment is decoded from the
 * counter `x` written in base (l+1), one digit per target position.
 * Only alignments satisfying 2 * Fert[0] <= m contribute to the sum.
 *
 * @param tTable  translation table forwarded to
 *                prob_of_target_and_alignment_given_source().
 * @param fs      target sentence; positions 1..m are aligned.
 * @param es      source sentence; positions 0..l are alignment points.
 * @return        the accumulated probability over all admissible alignments.
 *
 * NOTE(review): the loop runs (l+1)^m times and the counter has type
 * WordIndex; if that count exceeds the range of WordIndex the counter would
 * wrap before reaching the bound.  Presumably this is only called on very
 * short sentence pairs -- confirm against callers.
 */
LogProb model3::prob_of_target_given_source(tmodel<COUNT, PROB>& tTable,
                                            Vector<WordIndex>& fs,
                                            Vector<WordIndex>& es)
{
  LogProb total = 0;
  const WordIndex l = es.size() - 1;
  const WordIndex m = fs.size() - 1;
  Vector<WordIndex> A(fs.size(), /*-1*/0);
  Vector<WordIndex> Fert(es.size(), 0);

  // Number of distinct alignments; computed once because neither l nor m
  // changes inside the loop.
  const double numAlignments = pow(l + 1.0, double(m));

  for (WordIndex x = 0; x < numAlignments; x++) {  // for all possible alignments A
    // Decode x into one base-(l+1) digit per target position.
    WordIndex y = x;
    for (WordIndex j = 1; j <= m; j++) {
      A[j] = y % (l + 1);
      y /= (l + 1);
    }

    // Recompute the fertility of every source position for this alignment.
    for (WordIndex i = 0; i <= l; i++)
      Fert[i] = 0;
    for (WordIndex j = 1; j <= m; j++)
      Fert[A[j]]++;

    // Consider only alignments whose Fert[0] is at most half the target
    // sentence length.
    if (2 * Fert[0] <= m) {
      total += prob_of_target_and_alignment_given_source(A, Fert, tTable, fs, es);
    }
  }
  return total;
}
void model3::collectCountsOverAlignement(const Vector<WordIndex>& es, const Vector<WordIndex>& fs, const Vector<WordIndex>& A, LogProb score, float count) { WordIndex j,i,l,m ; Vector<WordIndex> Fert(es.size(),0); l = es.size() - 1 ; m = fs.size() - 1 ; score *= LogProb(count); COUNT temp = COUNT(score) ; for (i=0 ; i <= l ; i++) Fert[i] = 0 ; for (j = 1 ; j <= m ; j++){ Fert[A[j]]++; tTable.incCount(es[A[j]], fs[j], temp); // tCountTable.getRef(es[A[j]], fs[j])+=score; if (A[j]) dCountTable.getRef(j, A[j], l, m)+= temp ; aCountTable.getRef(A[j], j, l, m)+= temp ; } for(i = 0 ; i <= l ; i++) nCountTable.getRef(es[i], Fert[i])+= temp ; // p1_count += score * (LogProb) (Fert[0]) ; // p0_count += score * (LogProb) ((m - 2 * Fert[0])) ; p1_count += temp * (Fert[0]) ; p0_count += temp * ((m - 2 * Fert[0])) ; }
void model3::findAlignmentsNeighborhood(Vector<WordIndex>& es, Vector<WordIndex>& fs, LogProb&align_total_count, alignmodel&neighborhood, int i_peg = -1, int j_peg = -1 ) // Finding the Neigborhood of a best viterbi alignment after hill climbing // if (i_peg == -1 and j_peg == -1, then No Pegging is done. { LogProb best_score,score; WordIndex i,j,l,m,old_i,j1; Vector<WordIndex> A(fs.size(),0); Vector<WordIndex> Fert(es.size(),0); time_t it_st; best_score = 0 ; l = es.size() - 1; m = fs.size() - 1; findBestAlignment(es, fs, A, Fert, best_score, /*tTable, aTable,*/ i_peg, j_peg); if (best_score == 0){ cerr << "WARNING: viterbi alignment score is zero for the following pair\n"; printSentencePair(es, fs, cerr); } hillClimb(es, fs, A, Fert, best_score, tTable, i_peg, j_peg); if (best_score <= 0){ cerr << "WARNING: Hill Climbing yielded a zero score viterbi alignment for the following pair:\n"; printSentencePair(es, fs, cerr); if(Log){ logmsg << "WARNING: Hill Climbing yielded a zero score viterbi alignment for the following pair:\n"; printSentencePair(es, fs, logmsg); } } else { // best_score > 0 // if (2 * Fert[0] < m ){ if (2*Fert[0] <= m ){ /* consider alignments that has Fert[0] less than half the number of words in French sentence */ if (neighborhood.insert(A, best_score)){ align_total_count += best_score ; } } else { // else part is added for debugging / Yaser cerr << "WARNING:Best Alignment found violates Fertility requiremnets !!\n" ; for (i = 0 ; i <= l ; i++) cerr << "Fert["<<i<<"] = "<< Fert[i] << "\n"; for (j = 1 ; j <= m ; j++){ cerr << "A["<<j<<"] = "<< A[j] <<"\n"; } cerr << "Condition violated : 2 * Fert[0] <= m " << 2*Fert[0] <<"?"<< m << "\n"; } // end of added code for debugging // Yaser it_st = time(NULL) ; // Now find add all neighbors of the best alignmet to the collection for (j = 1 ; j <= m ; j++){ for (j1 = j + 1 ; j1 <= m; j1++){ // all possible swaps if (A[j] != A[j1]){// make sure you are not swapping at same position // score = best_score * 
scoreOfSwap(es, fs, A, best_score, tTable, j, j1); score = best_score * scoreOfSwap(es, fs, A, tTable, j, j1); // ADD A and its score to list of alig. to collect counts over if (2 * Fert[0] <= m && score > 0){ /* consider alignments that has Fert[0] less than half the number of words in French sentence */ old_i = A[j] ; A[j] = A[j1] ; A[j1] = old_i ; if (neighborhood.insert(A, score)){ align_total_count += score ; } // restore original alignment old_i = A[j] ; A[j] = A[j1] ; A[j1] = old_i ; } } } for (i = 0 ; i <= l ; i++){ // all possible moves if (i != A[j]){ // make sure not to move to same position if ((Fert[i]+1 < MAX_FERTILITY) && ((i == 0 && (m >= 2*(Fert[0]+1))) || (i != 0))){ // consider legal alignments only score = best_score * scoreOfMove(es, fs, A, Fert, tTable, j, i); // ADD A and its score to list of alig. to collect counts over if (score > 0){ old_i = A[j] ; A[j] = i ; Fert[old_i]-- ; Fert[i]++ ; // add to list of alignemts here ****************** if (neighborhood.insert(A, score)){ align_total_count += score ; } // now resotre alignment and fertilities to previoud values A[j] = old_i ; Fert[old_i]++ ; Fert[i]-- ; } // end of if (score > 0) } // end of if (i == 0 ...) } // end of if (i != A[j]) }// end of for(i = 0 ; ...) }// end of for (j = 1 ; ...) } // of else best_score <= 0 }
void model3::em(int noIterations, sentenceHandler& sHandler1) { LogProb all_prob, aprob, temp; WordIndex i, j, l, m; time_t it_st, st, it_fn, fn; string tfile, dfile, nfile, p0file, afile, number; st = time(NULL) ; cout << "\n" << "Starting Model3: Training"; // sentenceHandler sHandler1(efFilename.c_str()); sHandler1.rewind(); for (int it=1; it <= noIterations; it++) { it_st = time(NULL) ; cout << "\n" << "Model3: Iteration " << it; // set up the names of the files where the tables will be printed int n = it; number = ""; do { //mj changed next line number.insert((size_t) 0, 1, (char)(n % 10 + '0')); } while ((n /= 10) > 0); tfile = Prefix + ".t3." + number; afile = Prefix + ".a3." + number; nfile = Prefix + ".n3." + number; dfile = Prefix + ".d3." + number; p0file = Prefix + ".p0_3." + number; // tCountTable.clear(); dCountTable.clear(); nCountTable.clear(); p0_count = 0.0; p1_count = 0.0; all_prob = 0; sentPair sent; while (sHandler1.getNextSentence(sent)) { Vector<WordIndex>& es = sent.eSent; Vector<WordIndex>& fs = sent.fSent; const float count = sent.getCount(); if ((sent.sentenceNo % 1000) == 0) cout <<sent.sentenceNo << '\n'; Vector<WordIndex> A(fs.size(),/*-1*/0); Vector<WordIndex> Fert(es.size(),0); LogProb lcount=(LogProb)count; l = es.size()-1; m = fs.size()-1; WordIndex x, y; all_prob = prob_of_target_given_source(tTable, fs, es); if (all_prob == 0) cout << "\n" <<"all_prob = 0"; for (x = 0; x < pow(l+1.0, double(m)) ; x++) { // For all possible alignmets A y = x; for (j = 1; j <= m; j++) { A[j] = y % (l+1); y /= (l+1); } for (i = 0; i <= l; i++) Fert[i] = 0; for (j = 1; j <= m; j++) Fert[A[j]]++; if (2 * Fert[0] <= m) { /* consider alignments that has Fert[0] less than half the number of words in French sentence */ aprob = prob_of_target_and_alignment_given_source(A, Fert, tTable, fs, es); temp = aprob/all_prob; LogProb templcount = temp*lcount; for (j = 1; j <= m; j++) { tTable.incCount(es[A[j]], fs[j], templcount); if (0 != A[j]) 
dCountTable.addValue(j, A[j], l, m, templcount); } for (i = 0; i <= l; i++) { nCountTable.addValue(es[i], Fert[i], templcount); //cout << "AFTER INC2: " << templcount << " " << nCountTable.getRef(es[i], Fert[i]) << '\n'; } p1_count += double(temp) * (Fert[0] * count); p0_count += double(temp) * ((m - 2 * Fert[0]) * count); } } /* of looping over all alignments */ } /* of sentence pair E, F */ sHandler1.rewind(); // normalize tables if (OutputInAachenFormat==1) tTable.printCountTable(tfile.c_str(), Elist.getVocabList(), Flist.getVocabList(), 1); tTable.normalizeTable(Elist, Flist); aCountTable.normalize(aTable); dCountTable.normalize(dTable); nCountTable.normalize(nTable, &Elist.getVocabList()); // normalize p1 & p0 if (p1_count + p0_count != 0) { p1 = p1_count / (p1_count + p0_count ); p0 = 1 - p1; } else { p1 = p0 = 0; } // print tables if (OutputInAachenFormat==0) tTable.printProbTable(tfile.c_str(), Elist.getVocabList(), Flist.getVocabList(), OutputInAachenFormat); dTable.printTable(dfile.c_str()); nTable.printNTable(Elist.uniqTokens(), nfile.c_str(), Elist.getVocabList(), OutputInAachenFormat); ofstream of(p0file.c_str()); of << p0; of.close(); it_fn = time(NULL) ; cout << "\n" << "Model3 Iteration "<<it<<" took: " << difftime(it_fn, it_st) << " seconds\n"; } /* of iterations */ fn = time(NULL) ; cout << "\n" << "Entire Model3 Training took: " << difftime(fn, st) << " seconds\n"; }