Beispiel #1
0
/*
 * Sums prob_of_target_and_alignment_given_source() over every alignment of
 * positions 1..m of fs to positions 0..l of es (l = es.size()-1,
 * m = fs.size()-1), restricted to alignments in which at most half of the
 * m positions are aligned to position 0 (2 * Fert[0] <= m).
 *
 * tTable : forwarded unchanged to the per-alignment scorer.
 * fs, es : word-index vectors; entry 0 of fs is never assigned an alignment.
 * Returns the accumulated total.
 *
 * The enumeration visits (l+1)^m alignments, so this is only practical for
 * very short sentence pairs.
 * NOTE(review): the counter x is a WordIndex; if (l+1)^m does not fit in
 * that type the loop bound can never be reached -- confirm callers only use
 * this on tiny inputs.
 */
LogProb model3::prob_of_target_given_source(tmodel<COUNT, PROB>& tTable, 
					  Vector<WordIndex>& fs, 
					  Vector<WordIndex>& es)
{
  const WordIndex l = es.size()-1, m = fs.size()-1;
  const WordIndex base = l + 1 ;          // number of choices per position of fs
  Vector<WordIndex> A(fs.size(),/*-1*/0);
  Vector<WordIndex> Fert(es.size(),0);
  LogProb total = 0 ;
  WordIndex x, y, i, j ;

  // The number of alignments does not change inside the loop; compute it
  // once instead of calling pow() in the loop condition on every pass.
  const double numAlignments = pow(l+1.0, double(m)) ;

  for ( x = 0 ; x < numAlignments ; x++){ // for all possible alignments A
    // Decode x as an m-digit number in base (l+1): digit j-1 becomes A[j].
    y = x ;
    for (j = 1 ; j <= m ; j++){
      A[j] = y % base ;
      y /= base ;
    }
    // Recount how many positions of fs point at each position of es.
    for(i = 0 ; i <= l ; i++)
      Fert[i] = 0 ;
    for (j = 1 ; j <= m ; j++)
      Fert[A[j]]++;
    // Only consider alignments whose Fert[0] is at most half the length
    // of the French sentence.
    if (2 * Fert[0] <= m){
      total += prob_of_target_and_alignment_given_source(A, Fert, tTable, fs, es);
    }
  }
  return(total);
}
Beispiel #2
0
void model3::collectCountsOverAlignement(const Vector<WordIndex>& es, 
					 const Vector<WordIndex>& fs, 
					 const Vector<WordIndex>& A, 
					 LogProb score, 
					 float count)
{
  WordIndex j,i,l,m ;
  Vector<WordIndex> Fert(es.size(),0);
  l = es.size() - 1 ;
  m = fs.size() - 1 ;
  score *= LogProb(count);
  COUNT temp = COUNT(score) ;
  for (i=0 ; i <= l ; i++)
    Fert[i] = 0 ;
  for (j = 1 ; j <= m ; j++){
    Fert[A[j]]++;
    tTable.incCount(es[A[j]], fs[j], temp);
    //    tCountTable.getRef(es[A[j]], fs[j])+=score;
    if (A[j])
      dCountTable.getRef(j, A[j], l, m)+= temp ;
    aCountTable.getRef(A[j], j, l, m)+= temp ;
  }
  for(i = 0 ; i <= l ; i++)
    nCountTable.getRef(es[i], Fert[i])+= temp ;
  //  p1_count += score * (LogProb) (Fert[0]) ;
  //  p0_count += score * (LogProb) ((m - 2 * Fert[0])) ;
  p1_count += temp * (Fert[0]) ;
  p0_count += temp *  ((m - 2 * Fert[0])) ;
}
Beispiel #3
0
/*
 * Finds a best alignment for (es, fs) via findBestAlignment() followed by
 * hillClimb(), then inserts that alignment and all of its one-step
 * neighbors (swaps and moves) into `neighborhood`.
 *
 * es, fs            : word-index vectors of the sentence pair.
 * align_total_count : incremented by the score of every alignment for which
 *                     neighborhood.insert() returns true.
 * neighborhood      : collection receiving (alignment, score) pairs.
 * i_peg, j_peg      : forwarded to findBestAlignment() and hillClimb();
 *                     if both are -1, no pegging is done.
 *
 * If the score after hill climbing is <= 0, a warning is printed and nothing
 * is inserted.
 */
void model3::findAlignmentsNeighborhood(Vector<WordIndex>& es, 
					Vector<WordIndex>& fs, 
					LogProb&align_total_count, 
					alignmodel&neighborhood, 
					int i_peg = -1, 
					int j_peg = -1
					)
{
    const WordIndex l = es.size() - 1;
    const WordIndex m = fs.size() - 1;
    Vector<WordIndex> A(fs.size(),0);
    Vector<WordIndex> Fert(es.size(),0);
    LogProb best_score = 0 ;
    LogProb score;
    WordIndex i,j,j1,old_i;

    findBestAlignment(es, fs, A, Fert, best_score, /*tTable, aTable,*/ i_peg, j_peg);
    if (best_score == 0){
      cerr << "WARNING: viterbi alignment score is zero for the following pair\n";
      printSentencePair(es, fs, cerr);
    }
    hillClimb(es, fs, A, Fert, best_score, tTable, i_peg, j_peg);
    if (best_score <= 0){
      cerr << "WARNING: Hill Climbing yielded a zero score viterbi alignment for the following pair:\n";
      printSentencePair(es, fs, cerr);      
      if(Log){
	logmsg << "WARNING: Hill Climbing yielded a zero score viterbi alignment for the following pair:\n";
	printSentencePair(es, fs, logmsg);
      }
      return;   // nothing to collect for a non-positive score
    }

    // From here on best_score > 0.
    if (2*Fert[0] <= m ){ 
      // Consider only alignments whose Fert[0] is at most half the number
      // of words in the French sentence.
      if (neighborhood.insert(A, best_score)){
	align_total_count += best_score ;
      }
    }
    else { // diagnostic dump, added for debugging / Yaser
      cerr << "WARNING:Best Alignment found violates Fertility requiremnets !!\n" ;
      for (i = 0 ; i <= l ; i++)
	cerr << "Fert["<<i<<"] = "<< Fert[i] << "\n";
      for (j = 1 ; j <= m ; j++){
	cerr << "A["<<j<<"] = "<< A[j] <<"\n";
      }
      cerr << "Condition violated : 2 * Fert[0] <= m " << 2*Fert[0] <<"?"<<
	m << "\n";
    }

    // Now add all neighbors of the best alignment to the collection.
    for (j = 1 ; j <= m ; j++){

      // All possible swaps of position j with a later position j1.
      for (j1 = j + 1 ; j1 <= m; j1++){
	if (A[j] == A[j1])
	  continue;   // both positions point to the same place: nothing to swap
	score = best_score * scoreOfSwap(es, fs, A, tTable, j, j1);
	// A swap leaves every fertility unchanged, so the Fert[0] limit is
	// tested on the current Fert.
	if (2 * Fert[0] <= m && score > 0){ 
	  old_i = A[j] ;
	  A[j] = A[j1] ;
	  A[j1] = old_i ;
	  if (neighborhood.insert(A, score)){
	    align_total_count += score ;      
	  }	    
	  // restore original alignment 
	  old_i = A[j] ;
	  A[j] = A[j1] ;
	  A[j1] = old_i ;
	}
      }

      // All possible moves of position j to a different position i.
      for (i = 0 ; i <= l ; i++){
	if (i == A[j])
	  continue;   // already aligned there
	// Consider legal alignments only: the target fertility must stay
	// below MAX_FERTILITY, and a move to position 0 must keep
	// 2 * Fert[0] <= m after the increment.
	if (!(Fert[i]+1 < MAX_FERTILITY))
	  continue;
	if (i == 0 && !(m >= 2*(Fert[0]+1)))
	  continue;
	score = best_score * scoreOfMove(es, fs, A, Fert, tTable, j, i);
	if (score > 0){
	  old_i = A[j] ;
	  A[j] = i ;
	  Fert[old_i]-- ;
	  Fert[i]++ ;
	  if (neighborhood.insert(A, score)){
	    align_total_count += score ;	      
	  }
	  // now restore alignment and fertilities to their previous values
	  A[j] = old_i ;
	  Fert[old_i]++ ;
	  Fert[i]-- ;
	}
      }
    }
}
Beispiel #4
0
/*
 * Runs noIterations training passes over the sentence pairs delivered by
 * sHandler1, enumerating every alignment of each pair.
 *
 * Per iteration:
 *   - dCountTable, nCountTable, p0_count and p1_count are reset;
 *   - for each sentence pair, every alignment with 2 * Fert[0] <= m adds
 *     (alignment probability / total probability) * sentence count to
 *     tTable, dCountTable, nCountTable, p1_count and p0_count;
 *   - the tables are normalized, p1 and p0 are recomputed, and the tables
 *     are written to files named Prefix + ".t3."/".n3."/".d3."/".p0_3." +
 *     iteration number.
 *
 * noIterations : number of passes.
 * sHandler1    : sentence source; rewound before the first pass and after
 *                every pass.
 *
 * Each sentence pair costs (l+1)^m alignment evaluations (twice: once inside
 * prob_of_target_given_source() and once here).
 */
void model3::em(int noIterations, sentenceHandler& sHandler1) {

	LogProb all_prob, aprob, temp;
	WordIndex i, j, l, m;
	time_t it_st, st, it_fn, fn;
	string tfile, dfile, nfile, p0file, afile, number;

	st = time(NULL) ;
	cout << "\n" << "Starting Model3:  Training";
	sHandler1.rewind();
	for (int it=1; it <= noIterations; it++) {
		it_st = time(NULL) ;
		cout << "\n" << "Model3: Iteration " << it;

		// Build the decimal string of the iteration number, most
		// significant digit first, for use as file-name suffix.
		int n = it;
		number = "";
		do {
			number.insert((size_t) 0, 1, (char)(n % 10 + '0'));
		} while ((n /= 10) > 0);
		tfile = Prefix + ".t3." + number;
		// NOTE(review): afile is built but never used in this function.
		afile = Prefix + ".a3." + number;
		nfile = Prefix + ".n3." + number;
		dfile = Prefix + ".d3." + number;
		p0file = Prefix + ".p0_3." + number;

		// Reset the accumulators of this iteration.
		// NOTE(review): aCountTable is neither cleared nor incremented in
		// this function although it is normalized below -- confirm intended.
		dCountTable.clear();
		nCountTable.clear();
		p0_count = 0.0;
		p1_count = 0.0;
		all_prob = 0;
		sentPair sent;
		while (sHandler1.getNextSentence(sent)) {
			Vector<WordIndex>& es = sent.eSent;
			Vector<WordIndex>& fs = sent.fSent;
			const float count = sent.getCount();
			if ((sent.sentenceNo % 1000) == 0)
				cout <<sent.sentenceNo << '\n';
			Vector<WordIndex> A(fs.size(),/*-1*/0);
			Vector<WordIndex> Fert(es.size(),0);
			LogProb lcount=(LogProb)count;
			l = es.size()-1;
			m = fs.size()-1;
			WordIndex x, y;
			all_prob = prob_of_target_given_source(tTable, fs, es);
			if (all_prob == 0)
				cout << "\n" <<"all_prob = 0";

			// The number of alignments is fixed for this sentence pair;
			// compute it once rather than in every loop test.
			const double numAlignments = pow(l+1.0, double(m));

			for (x = 0; x < numAlignments ; x++) { // for all possible alignments A
				// Decode x as an m-digit number in base (l+1).
				y = x;
				for (j = 1; j <= m; j++) {
					A[j] = y % (l+1);
					y /= (l+1);
				}
				for (i = 0; i <= l; i++)
					Fert[i] = 0;
				for (j = 1; j <= m; j++)
					Fert[A[j]]++;
				// Only consider alignments whose Fert[0] is at most half
				// the number of words in the French sentence.
				if (2 * Fert[0] <= m) {
					aprob = prob_of_target_and_alignment_given_source(A, Fert,
							tTable, fs, es);
					// Share of this alignment in the total, then scaled
					// by the sentence count.
					temp = aprob/all_prob;
					LogProb templcount = temp*lcount;

					for (j = 1; j <= m; j++) {
						tTable.incCount(es[A[j]], fs[j], templcount);
						if (0 != A[j])
							dCountTable.addValue(j, A[j], l, m, templcount);
					}
					for (i = 0; i <= l; i++) {
						nCountTable.addValue(es[i], Fert[i], templcount);
					}
					p1_count += double(temp) * (Fert[0] * count);
					p0_count += double(temp) * ((m - 2 * Fert[0]) * count);
				}
			} /* of looping over all alignments */
		} /* of sentence pair E, F */
		sHandler1.rewind();

		// The count table is printed before normalization when
		// OutputInAachenFormat is 1.
		if (OutputInAachenFormat==1)
			tTable.printCountTable(tfile.c_str(), Elist.getVocabList(),
					Flist.getVocabList(), 1);

		// normalize tables
		tTable.normalizeTable(Elist, Flist);
		aCountTable.normalize(aTable);
		dCountTable.normalize(dTable);
		nCountTable.normalize(nTable, &Elist.getVocabList());

		// normalize p1 & p0; both become 0 when no counts were collected
		if (p1_count + p0_count != 0) {
			p1 = p1_count / (p1_count + p0_count );
			p0 = 1 - p1;
		} else {
			p1 = p0 = 0;
		}

		// print tables 
		if (OutputInAachenFormat==0)
			tTable.printProbTable(tfile.c_str(), Elist.getVocabList(),
					Flist.getVocabList(), OutputInAachenFormat);
		dTable.printTable(dfile.c_str());
		nTable.printNTable(Elist.uniqTokens(), nfile.c_str(),
				Elist.getVocabList(), OutputInAachenFormat);
		ofstream of(p0file.c_str());
		of << p0;
		of.close();
		it_fn = time(NULL) ;
		cout << "\n" << "Model3 Iteration "<<it<<" took: " << difftime(it_fn,
				it_st) << " seconds\n";

	} /* of iterations */
	fn = time(NULL) ;
	cout << "\n" << "Entire Model3 Training took: " << difftime(fn, st)
			<< " seconds\n";
}