Example #1
void printOverlapReport(const tmodel<COUNT, PROB>& tTable, 
			sentenceHandler& testHandler,  vcbList& trainEList, 
			vcbList& trainFList, vcbList& testEList, vcbList& testFList)
{
  set<pair<WordIndex, WordIndex> > testCoocur ;
  sentPair s ;
  /*  string unseenCoocurFile = Prefix + ".tst.unseen.cooc" ;
      ofstream of_unseenCoocur(unseenCoocurFile.c_str());
      
      string seenCoocurFile = Prefix + ".tst.seen.cooc" ;
      ofstream of_seenCoocur(seenCoocurFile.c_str());
  */  
  testHandler.rewind();
  int seen_coocur = 0, unseen_coocur = 0, srcUnk = 0, trgUnk = 0 ;
  while(testHandler.getNextSentence(s)){    
    for (WordIndex i = 1 ; i < s.eSent.size() ; i++)
      for (WordIndex j = 1 ; j < s.fSent.size() ; j++)	
	testCoocur.insert(pair<WordIndex, WordIndex> (s.eSent[i], s.fSent[j])) ;
  }
  set<pair<WordIndex, WordIndex> >::const_iterator i ;
  for (i = testCoocur.begin() ; i != testCoocur.end() ; ++i){
    if (tTable.getProb((*i).first, (*i).second) > PROB_SMOOTH){
      seen_coocur ++ ;
      //      of_seenCoocur << (*i).first << ' ' << (*i).second << '\n';
    }
    else {
      unseen_coocur++;
      //      of_unseenCoocur << (*i).first << ' ' << (*i).second << '\n';
    }
  }
  
  string trgUnkFile = Prefix + ".tst.trg.unk" ;
  ofstream of_trgUnk(trgUnkFile.c_str());

  for (WordIndex i = 0 ; i <  testFList.getVocabList().size() && i < testFList.uniqTokens();i++)
    if (testFList.getVocabList()[i].freq > 0 && trainFList.getVocabList()[i].freq <= 0){
      of_trgUnk << i << ' ' << testFList.getVocabList()[i].word << ' ' << testFList.getVocabList()[i].freq
		<< '\n';
      trgUnk++ ;
    }
  string srcUnkFile = Prefix + ".tst.src.unk" ;
  ofstream of_srcUnk(srcUnkFile.c_str());

  for (WordIndex j = 0 ; j <  testEList.getVocabList().size() && j < testEList.uniqTokens();j++)
    if (testEList.getVocabList()[j].freq > 0 && trainEList.getVocabList()[j].freq <= 0){
      srcUnk++ ;
      of_srcUnk << j << ' ' << testEList.getVocabList()[j].word << ' ' << testEList.getVocabList()[j].freq
		<< '\n';
    }
  string summaryFile = Prefix + ".tst.stats" ;  
  ofstream of_summary(summaryFile.c_str());
  of_summary << "\t\t STATISTICS ABOUT TEST CORPUS\n\n";
  of_summary << "source unique tokens: " <<  testEList.uniqTokens() << '\n';
  of_summary << "target unique tokens: " <<  testFList.uniqTokens() << '\n';
  of_summary << "unique unseen source tokens: " << srcUnk << '\n';
  of_summary << "unique unseen target tokens: " << trgUnk << '\n';
  of_summary << "cooccurrences not found in the final t table: " << unseen_coocur << '\n';
  of_summary << "cooccurrences found in the final t table: " << seen_coocur << '\n';
  
}
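This example measures how well the trained t-table covers a held-out test corpus: every test-set cooccurrence (source word, target word) is classified as seen or unseen depending on whether its translation probability exceeds PROB_SMOOTH, and test vocabulary entries absent from the training vocabulary are written to the *.tst.src.unk and *.tst.trg.unk files. If a single coverage figure is wanted, it can be derived from the two counters; a minimal sketch of an extra summary line (hypothetical, using the function's own variables):

  // Hypothetical addition: fraction of test cooccurrences covered by the t-table.
  double coverage = double(seen_coocur) / double(seen_coocur + unseen_coocur);
  of_summary << "cooccurrence coverage: " << 100.0 * coverage << "%\n";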
Example #2
void model1::initialize_table_uniformly(sentenceHandler& sHandler1){
  WordIndex i, j;

  cout << "Initialize tTable\n";

  sentPair sent ;
  sHandler1.rewind();
  while(sHandler1.getNextSentence(sent)){
    Vector<WordIndex>& es = sent.eSent;
    Vector<WordIndex>& fs = sent.fSent;
    PROB uniform = 1.0/es.size() ;
    for( i=0; i < es.size(); i++)
      for(j=1; j < fs.size(); j++)
	tTable.insert(es[i],fs[j],0,uniform);
  }
}
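Since es.size() = l + 1 (the source words plus the empty word at position 0), every cooccurring pair in a sentence receives the same initial probability. In IBM Model 1 notation:

  t(f_j \mid e_i) = \frac{1}{l+1}, \qquad 0 \le i \le l, \; 1 \le j \le m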
Example #3
void model1::initialize_table_uniformly(sentenceHandler& sHandler1){
  WordIndex i, j;

  cout << "Initialize tTable\n";
  sentPair sent ;
  sHandler1.rewind();
  while(sHandler1.getNextSentence(sent)){
    Vector<WordIndex>& es = sent.eSent;  // source
    Vector<WordIndex>& fs = sent.fSent; // target
    PROB uniform = 1.0/es.size() ;
    for( i=0; i < es.size(); i++)
      for(j=1; j < fs.size(); j++) {
        //cout << "es[i]=" << es[i] << "\tfs[j]=" << fs[j] << endl;
	tTable.insert(es[i],fs[j],0,uniform);
      }
  }
  tTable.printProbTable("./testfile",Elist.getVocabList(),Flist.getVocabList(),true);
}
Example #4
void model2::initialize_table_uniformly(sentenceHandler& sHandler1){
  // initialize the aTable uniformly (run this before running em_with_tricks)
  int n=0;
  sentPair sent ;
  sHandler1.rewind();
  while(sHandler1.getNextSentence(sent)){
    Vector<WordIndex>& es = sent.eSent;
    Vector<WordIndex>& fs = sent.fSent;
    WordIndex l = es.size() - 1;
    WordIndex m = fs.size() - 1;
    n++;
    if(1<=m&&aTable.getValue(l,m,l,m)<=PROB_SMOOTH)
      {
	PROB uniform_val = 1.0 / (l+1) ;
	for(WordIndex j=1; j <= m; j++)
	  for(WordIndex i=0; i <= l; i++)
	    aTable.setValue(i,j, l, m, uniform_val);
      }
  }
}
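Here it is the alignment table, not the t-table, that is initialized. For each sentence-length pair (l, m) not yet initialized (detected by probing getValue against PROB_SMOOTH), every source position is made equally likely for every target position:

  a(i \mid j, l, m) = \frac{1}{l+1}, \qquad 0 \le i \le l, \; 1 \le j \le m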
Example #5
void model1::em_loop(int it,Perplexity& perp, sentenceHandler& sHandler1, bool seedModel1, 
             bool dump_alignment, const char* alignfile, Dictionary& dict, bool useDict, Word2Vec& word2vec, bool useWord2Vec, Perplexity& viterbi_perp, bool test)
{
  WordIndex i, j, l, m ;
  double cross_entropy;
  int pair_no=0 ;
  perp.clear();
  viterbi_perp.clear();
  ofstream of2;
  // for each sentence pair in the corpus
  if (dump_alignment||FEWDUMPS)
    of2.open(alignfile);
  PROB uniform = 1.0/noFrenchWords ;
  sentPair sent ;
  sHandler1.rewind();
  while(sHandler1.getNextSentence(sent)){
    Vector<WordIndex>& es = sent.eSent;
    Vector<WordIndex>& fs = sent.fSent;
    const float so  = sent.getCount();
    l = es.size() - 1;
    m = fs.size() - 1;
    cross_entropy = log(1.0);
    Vector<WordIndex> viterbi_alignment(fs.size());
    double viterbi_score = 1 ;
    bool eindict[l + 1];
    bool findict[m + 1];
    bool indict[m + 1][l + 1];
    bool isSimilar[m + 1][l + 1];
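    // Word2vec filtering (Method 1): precompute which (f_j, e_i) pairs
    // word2vec.getVectorMap reports as similar; dissimilar pairs are
    // skipped in the count loops below.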
    if(useWord2Vec && word2vec.Method == 1){
        for(unsigned int dummy = 0; dummy <= m; dummy++){
          for(unsigned int dummy2 = 0; dummy2 <= l; dummy2++)
            isSimilar[dummy][dummy2] = false;
        }
        for(i = 1; i <= l; i++){
            map<WordIndex, bool> simWords = word2vec.getVectorMap(es[i]);
            for(j = 1; j <= m; j++){
                if(simWords.find(fs[j]) != simWords.end()){
                    isSimilar[j][i] = true;
                }
            }
        }

    }
    // cout << sent.sentenceNo << endl;
    if(it == 1 && useDict){
      for(unsigned int dummy = 0; dummy <= l; dummy++) {
          eindict[dummy] = false;
      }
      for(unsigned int dummy = 0; dummy <= m; dummy++){
        findict[dummy] = false;
        for(unsigned int dummy2 = 0; dummy2 <= l; dummy2++)
          indict[dummy][dummy2] = false;
      }
      for(j = 0; j <= m; j++)
        for(i = 0; i <= l; i++)
          if(dict.indict(fs[j], es[i])){
            eindict[i] = findict[j] = indict[j][i] = true;
          }
    }

    for(j=1; j <= m; j++){
      // Cache pointers to the t-table entries that map fs[j] to every
      // possible es[i] in this sentence.
      Vector<LpPair<COUNT,PROB> *> sPtrCache(es.size(),0);
      LpPair<COUNT,PROB> **sPtrCachePtr;

      PROB denom = 0.0;
      WordIndex best_i = 0 ; // i for which fj is best mapped to ei
      PROB word_best_score = 0 ;  // score for the best mapping of fj
      if (it == 1 && !seedModel1){
             denom = uniform * es.size();
             word_best_score = uniform ;
      }
      else
	for((i=0),(sPtrCachePtr=&sPtrCache[0]); i <= l; i++,sPtrCachePtr++){
	  PROB e(0.0) ;
      (*sPtrCachePtr) = tTable.getPtr(es[i], fs[j]) ;
      if(i && useWord2Vec && !isSimilar[j][i] && word2vec.Method == 1)
          continue;
      if ((*sPtrCachePtr) != 0 && (*((*sPtrCachePtr))).prob > PROB_SMOOTH)
	    e = (*((*sPtrCachePtr))).prob;
	  else e = PROB_SMOOTH ;
	  denom += e  ;
	  if (e > word_best_score){
	    word_best_score = e ;
	    best_i = i ;
	  }
	}
      viterbi_alignment[j] = best_i ;
      viterbi_score *= word_best_score ; /// denom ;
      if (denom == 0){
	if (test)
	  cerr << "WARNING: denom is zero (TEST)\n";
	else 
	  cerr << "WARNING: denom is zero (TRAIN)\n";
      }
      cross_entropy += log(denom) ;
      if (!test){
	if(denom > 0){	  
	  COUNT val = COUNT(so) / (COUNT) double(denom) ;
	  /* This if block implements a constraint on counting:
	     count(es[i], fs[j]) is incremented if and only if
	     es[i] and fs[j] occur together in the dictionary,
	     OR
	     es[i] does not occur in the dictionary with any fs[x] and
	     fs[j] does not occur in the dictionary with any es[y]
	  */
      if(it == 1 && useDict){
	    for((i=0),(sPtrCachePtr=&sPtrCache[0]); i <= l; i++,sPtrCachePtr++){
            if(i && useWord2Vec && !isSimilar[j][i] && word2vec.Method == 1)
                continue;
            if(indict[j][i] || (!findict[j] && !eindict[i])){
		PROB e(0.0) ;

        if (it == 1 && !seedModel1)
		  e =  uniform  ;
		else if ((*sPtrCachePtr) != 0 &&  (*((*sPtrCachePtr))).prob > PROB_SMOOTH) 
		  e = (*((*sPtrCachePtr))).prob;
		else e = PROB_SMOOTH ;
        COUNT x=e*val;
		if( it==1||x>MINCOUNTINCREASE )
		  if ((*sPtrCachePtr) != 0)
		    (*((*sPtrCachePtr))).count += x;
		  else 	      
		    tTable.incCount(es[i], fs[j], x);
	      } /* end of if */
	    } /* end of for i */
	  } /* end of it == 1 */
	  // Old code:
	  else{
	    for((i=0),(sPtrCachePtr=&sPtrCache[0]); i <= l; i++,sPtrCachePtr++){
          if(i && useWord2Vec && !isSimilar[j][i] && word2vec.Method == 1)
              continue;
           PROB e(0.0) ;
          if (it == 1 && !seedModel1){
            e =  uniform  ;
          }
	      else if ((*sPtrCachePtr) != 0 &&  (*((*sPtrCachePtr))).prob > PROB_SMOOTH) 
            e = (*((*sPtrCachePtr))).prob;
	      else e = PROB_SMOOTH ;
	      //if( !(i==0) )
	      //cout << "COUNT(e): " << e << " " << MINCOUNTINCREASE << endl;
        //  if (useWord2Vec && word2vec.Method == 2)
       //       e = e * (1. - word2vec.L) + word2vec.getW2VProb(es[i], fs[j]) * word2vec.L;
          COUNT x=e*val;

	      if( pair_no==VerboseSentence )
            cout << i << "(" << evlist[es[i]].word << ")," << j << "(" << fvlist[fs[j]].word << ")=" << x << endl;
	      if( it==1||x>MINCOUNTINCREASE )
		if( NoEmptyWord==0 || i!=0 ){
		  if ((*sPtrCachePtr) != 0)
		    (*((*sPtrCachePtr))).count += x;
		  else
		    tTable.incCount(es[i], fs[j], x);
		}
	    } /* end of for i */
	  } // end of else
	} // end of if (denom > 0)
      }// if (!test)
    } // end of for (j) ;
    sHandler1.setProbOfSentence(sent,cross_entropy);
    //cerr << sent << "CE: " << cross_entropy << " " << so << endl;
    perp.addFactor(cross_entropy-m*log(l+1.0), so, l, m,1);
    viterbi_perp.addFactor(log(viterbi_score)-m*log(l+1.0), so, l, m,1);
    if (dump_alignment||(FEWDUMPS&&sent.sentenceNo<1000))
      printAlignToFile(es, fs, evlist, fvlist, of2, viterbi_alignment, sent.sentenceNo, viterbi_score);
    addAL(viterbi_alignment,sent.sentenceNo,l);
    pair_no++;
  } /* of while */
  sHandler1.rewind();
  perp.record("Model1");
  viterbi_perp.record("Model1");
  errorReportAL(cout, "IBM-1");
}
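The count updates in this loop are the standard Model 1 E-step: with val = so / denom, the quantity x = e * val added for a pair (es[i], fs[j]) is the posterior probability of that alignment link, weighted by the sentence count so:

  \Delta c(f_j \mid e_i) = so \cdot \frac{t(f_j \mid e_i)}{\sum_{i'=0}^{l} t(f_j \mid e_{i'})}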
Example #6
void model2::em_loop(Perplexity& perp, sentenceHandler& sHandler1, 
		     bool dump_alignment, const char* alignfile, Perplexity& viterbi_perp, 
		     bool test)
{
  massert( aTable.is_distortion==0 );
  massert( aCountTable.is_distortion==0 );
  WordIndex i, j, l, m ;
  double cross_entropy;
  int pair_no=0 ;
  perp.clear();
  viterbi_perp.clear();
  ofstream of2;
  // for each sentence pair in the corpus
  if (dump_alignment||FEWDUMPS )
    of2.open(alignfile);
  sentPair sent ;

  vector<double> ferts(evlist.size());
  
  sHandler1.rewind();
  while(sHandler1.getNextSentence(sent)){
    Vector<WordIndex>& es = sent.eSent;
    Vector<WordIndex>& fs = sent.fSent;
    const float so  = sent.getCount();
    l = es.size() - 1;
    m = fs.size() - 1;
    cross_entropy = log(1.0);
    Vector<WordIndex> viterbi_alignment(fs.size());
    double viterbi_score = 1;



    for(j=1; j <= m; j++){
      Vector<LpPair<COUNT,PROB> *> sPtrCache(es.size(),0); // cache pointers to table 
      // entries  that map fs to all possible ei in this sentence.
      PROB denom = 0.0;
      PROB e = 0.0, word_best_score = 0;
      WordIndex best_i = 0 ; // i for which fj is best mapped to ei
      for(i=0; i <= l; i++){
	sPtrCache[i] = tTable.getPtr(es[i], fs[j]) ;
    if (sPtrCache[i] != 0 &&(*(sPtrCache[i])).prob > PROB_SMOOTH )
	  e = (*(sPtrCache[i])).prob * aTable.getValue(i,j, l, m) ;
	else e = PROB_SMOOTH * aTable.getValue(i,j, l, m);
	denom += e ;
	if (e > word_best_score){
	  word_best_score = e ;
	  best_i = i ;
	}
      }
      viterbi_alignment[j] = best_i ;
      viterbi_score *= word_best_score; ///denom ;
      cross_entropy += log(denom) ;
      if (denom == 0){
	if (test)
	  cerr << "WARNING: denom is zero (TEST)\n";
	else 
	  cerr << "WARNING: denom is zero (TRAIN)\n";
      }      
      if (!test){
	if(denom > 0){	  
	  COUNT val = COUNT(so) / (COUNT) double(denom) ;
	  for( i=0; i <= l; i++){
        PROB e(0.0);
	    if (sPtrCache[i] != 0 &&  (*(sPtrCache[i])).prob > PROB_SMOOTH)
	      e = (*(sPtrCache[i])).prob ;
	    else e = PROB_SMOOTH  ;
	    e *= aTable.getValue(i,j, l, m);
	    COUNT temp = COUNT(e) * val ;
	    if( NoEmptyWord==0 || i!=0 )
	      if (sPtrCache[i] != 0) 
		(*(sPtrCache[i])).count += temp ;
	      else 	      
		tTable.incCount(es[i], fs[j], temp);	    
	    aCountTable.getRef(i,j, l, m)+= temp ; 
	  } /* end of for i */
	} // end of if (denom > 0)
      }// if (!test)
    } // end of for (j) ;
    sHandler1.setProbOfSentence(sent,cross_entropy);
    perp.addFactor(cross_entropy, so, l, m,1);
    viterbi_perp.addFactor(log(viterbi_score), so, l, m,1);
    if (dump_alignment||(FEWDUMPS&&sent.sentenceNo<1000) )
      printAlignToFile(es, fs, Elist.getVocabList(), Flist.getVocabList(), of2, viterbi_alignment, sent.sentenceNo, viterbi_score);
    addAL(viterbi_alignment,sent.sentenceNo,l);
    pair_no++;
  } /* of while */
  sHandler1.rewind();
  perp.record("Model2");
  viterbi_perp.record("Model2");
  errorReportAL(cout,"IBM-2");
}
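Model 2 differs from Model 1 only in the per-link score: the lexical probability is multiplied by the position-dependent alignment probability, and the resulting posterior is added to both the lexical counts (tTable) and the alignment counts (aCountTable):

  \Delta c = so \cdot \frac{t(f_j \mid e_i)\, a(i \mid j, l, m)}{\sum_{i'=0}^{l} t(f_j \mid e_{i'})\, a(i' \mid j, l, m)}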
Example #7
void model1::em_loop(int it,Perplexity& perp, sentenceHandler& sHandler1, bool seedModel1, 
		     bool dump_alignment, const char* alignfile, Dictionary& dict, bool useDict, Perplexity& viterbi_perp, bool test)
{
  WordIndex i, j, l, m ;
  double cross_entropy;
  int pair_no=0 ;
  perp.clear();
  viterbi_perp.clear();
  ofstream of2;
  // for each sentence pair in the corpus
  if (dump_alignment||FEWDUMPS)
    of2.open(alignfile);
  cerr <<" number of French (target) words = " << noFrenchWords << endl;
  PROB uniform = 1.0/noFrenchWords ;
  cerr << "initial unifrom prob = " << uniform << endl;
  sentPair sent ;
  sHandler1.rewind();
  while(sHandler1.getNextSentence(sent)){
    Vector<WordIndex>& es = sent.eSent;
    Vector<WordIndex>& fs = sent.fSent;
    const float so  = sent.getCount(); // number of times sentence occurs in corpus
    //std::cerr << "\n\nNEW sentence (#" << (pair_no + 1) << ") with count = " << so << endl;
    l = es.size() - 1;  // source length
    m = fs.size() - 1;  // target length
    cross_entropy = log(1.0);
    Vector<WordIndex> viterbi_alignment(fs.size());
    double viterbi_score = 1 ;

    /*mebool eindict[l + 1];
    bool findict[m + 1];
    bool indict[m + 1][l + 1];
    if(it == 1 && useDict){
      for(unsigned int dummy = 0; dummy <= l; dummy++) eindict[dummy] = false;
      for(unsigned int dummy = 0; dummy <= m; dummy++){
	findict[dummy] = false;
	for(unsigned int dummy2 = 0; dummy2 <= l; dummy2++) 
	  indict[dummy][dummy2] = false;
      }
      for(j = 0; j <= m; j++)
	for(i = 0; i <= l; i++)
	  if(dict.indict(fs[j], es[i])){
	    eindict[i] = findict[j] = indict[j][i] = true;
	  }
    }me*/

    for(j=1; j <= m; j++){
      //cerr << "Current french (TARGET) word = " << fs[j] << endl;
      // Cache pointers to the t-table entries that map fs[j] to every
      // possible es[i] in this sentence.
      Vector<LpPair<COUNT,PROB> *> sPtrCache(es.size(),0);
      LpPair<COUNT,PROB> **sPtrCachePtr;

      PROB denom = 0.0;
      WordIndex best_i = 0 ; // i for which fj is best mapped to ei
      PROB word_best_score = 0 ;  // score for the best mapping of fj
      if (it == 1 && !seedModel1){
        //cerr << "Using uniform denominator\n";
	denom = uniform  * es.size() ;
	word_best_score = uniform ;
      }
      else 
	for((i=0),(sPtrCachePtr=&sPtrCache[0]); i <= l; i++,sPtrCachePtr++){
          //cerr << "current english (SOURCE) word = " << es[i] << endl;
	  PROB e(0.0) ;
          srcHits_.insert(es[i]);
	  (*sPtrCachePtr) = tTable.getPtr(es[i], fs[j]) ;
	  if ((*sPtrCachePtr) != 0 && (*((*sPtrCachePtr))).prob > PROB_SMOOTH) 
	    e = (*((*sPtrCachePtr))).prob;
	  else e = PROB_SMOOTH ;
	  denom += e  ;
	  if (e > word_best_score){
	    word_best_score = e ;
	    best_i = i ;
	  }	
        }
      viterbi_alignment[j] = best_i ;
      viterbi_score *= word_best_score ; /// denom ;
      if (denom == 0){
	if (test)
	  cerr << "WARNING: denom is zero (TEST)\n";
	else 
	  cerr << "WARNING: denom is zero (TRAIN)\n";
      }
      cross_entropy += log(denom) ;
      if (!test){
	if(denom > 0){	  
	  COUNT val = COUNT(so) / (COUNT) double(denom) ;
	  /* This if block implements a constraint on counting:
	     count(es[i], fs[j]) is incremented if and only if
	     es[i] and fs[j] occur together in the dictionary,
	     OR
	     es[i] does not occur in the dictionary with any fs[x] and
	     fs[j] does not occur in the dictionary with any es[y]
	  */
	  /*meif(it == 1 && useDict){
	    for((i=0),(sPtrCachePtr=&sPtrCache[0]); i <= l; i++,sPtrCachePtr++){
	      if(indict[j][i] || (!findict[j] && !eindict[i])){
		PROB e(0.0) ;
		if (it == 1 && !seedModel1)
		  e =  uniform  ;
		else if ((*sPtrCachePtr) != 0 &&  (*((*sPtrCachePtr))).prob > PROB_SMOOTH) 
		  e = (*((*sPtrCachePtr))).prob;
		else e = PROB_SMOOTH ;
		COUNT x=e*val;
		if( it==1||x>MINCOUNTINCREASE )
		  if ((*sPtrCachePtr) != 0)
		    (*((*sPtrCachePtr))).count += x;
		  else 	      
		    tTable.incCount(es[i], fs[j], x);
	      } 
	    } 	  } 
	  // Old code:
	  else{me*/
	    for((i=0),(sPtrCachePtr=&sPtrCache[0]); i <= l; i++,sPtrCachePtr++){
	      //for(i=0; i <= l; i++) {	    
	      PROB e(0.0) ;
	      if (it == 1 && !seedModel1)
		e =  uniform  ;
	      else if ((*sPtrCachePtr) != 0 &&  (*((*sPtrCachePtr))).prob > PROB_SMOOTH) 
		e = (*((*sPtrCachePtr))).prob;
	      else e = PROB_SMOOTH ;
	      //if( !(i==0) )
	      //cout << "COUNT(e): " << e << " " << MINCOUNTINCREASE << endl;
	      COUNT x=e*val;  // new count
	      if( pair_no==VerboseSentence )
		cout << i << "(" << evlist[es[i]].word << ")," << j << "(" << fvlist[fs[j]].word << ")=" << x << endl;
	      if( it==1||x>MINCOUNTINCREASE ) {
                if(step_k != 0) tTable.stepCounts_[wordPairIds(es[i], fs[j])] += x;
                else if( NoEmptyWord==0 || i!=0 )
		  if ((*sPtrCachePtr) != 0){ 
                    // handles single sentence updates
                    //x = getInterpolatedCount(x, (*((*sPtrCachePtr))).count); // get interpolated count here
		    (*((*sPtrCachePtr))).count += x;
                  }
		  else {	      
                    //x = getInterpolatedCount(x, (*((*sPtrCachePtr))).count); // get interpolated count here
		    tTable.incCount(es[i], fs[j], x);
                  }
                // increment temp table instead
              }
	    } /* end of for i */
	  //me} // end of else
	} // end of if (denom > 0)
      }// if (!test)
    } // end of for all (j) target words;
    sHandler1.setProbOfSentence(sent,cross_entropy);
    //cerr << sent << "CE: " << cross_entropy << " " << so << endl;
    perp.addFactor(cross_entropy-m*log(l+1.0), so, l, m,1);
    viterbi_perp.addFactor(log(viterbi_score)-m*log(l+1.0), so, l, m,1);
    if (dump_alignment||(FEWDUMPS&&sent.sentenceNo<1000))
      printAlignToFile(es, fs, evlist, fvlist, of2, viterbi_alignment, sent.sentenceNo, viterbi_score);
    addAL(viterbi_alignment,sent.sentenceNo,l);
    pair_no++;
  } /* of while */
  sHandler1.rewind();
  perp.record("Model1");
  viterbi_perp.record("Model1");
  errorReportAL(cout, "IBM-1");
}
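Note the -m*log(l+1.0) correction applied when the accumulated cross_entropy is handed to the perplexity objects: the loop only sums the log(denom) terms, and the uniform alignment factor of IBM Model 1 is restored at the end, so that (up to the constant sentence-length factor) the recorded quantity is

  \log P(f_1^m \mid e_0^l) = \sum_{j=1}^{m} \log \sum_{i=0}^{l} t(f_j \mid e_i) - m \log(l+1)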
Example #8
void model3::viterbi_loop(Perplexity& perp, Perplexity& viterbiPerp, sentenceHandler& sHandler1, 
			   bool dump_files, const char* alignfile, 
			   bool collect_counts, string model )
{
  WordIndex i, j, l, m ;
  ofstream of2 ;
  int pair_no;
  LogProb temp;

  if (dump_files)
    of2.open(alignfile);
  pair_no = 0 ; // sentence pair number 
  // for each sentence pair in the corpus
  perp.clear() ; // clears cross_entropy & perplexity
  viterbiPerp.clear();
  sentPair sent ;
  while(sHandler1.getNextSentence(sent)){
    Vector<WordIndex>& es = sent.eSent;
    Vector<WordIndex>& fs = sent.fSent;
    const float count  = sent.getCount();
    if ((sent.sentenceNo % 1000) == 0)
      cerr <<sent.sentenceNo << '\n'; 
    time_t sent_s = time(NULL) ;
    pair_no++ ;
    l = es.size() - 1 ;
    m = fs.size() - 1 ;
    if (Log){
      logmsg << "Processing sentence pair:\n\t";
      printSentencePair(es, fs, logmsg);
      for (i = 0 ; i <= l ; i++)
	logmsg << Elist.getVocabList()[es[i]].word << " ";
      logmsg << "\n\t";
      for (j = 1 ; j <= m ; j++)
	logmsg << Flist.getVocabList()[fs[j]].word << " ";
      logmsg << "\n";
    }

      LogProb align_total_count=0;
      //      LogProb best_score;

      Vector<WordIndex> viterbi_alignment;
      LogProb  viterbi_score ;
      alignmodel neighborhood;
      neighborhood.clear();
      align_total_count = 0;
      findAlignmentsNeighborhood(/*tTable, aTable,*/ /*p1_count, p0_count,*/ es, fs, align_total_count, neighborhood) ;
      if (Peg){
	for (i = 0 ; i <= l ; i++)
	  for (j = 1 ; j <= m ; j++){
	    if ( (tTable.getProb(es[i], fs[j]) > PROB_SMOOTH) &&
		 (aTable.getValue(i, j, l, m) > PROB_SMOOTH) &&
		 (dTable.getValue(j, i, l, m) > PROB_SMOOTH))
	      findAlignmentsNeighborhood(/*tTable, aTable,*/ /*p1_count, p0_count,*/ es, fs, align_total_count, neighborhood, i, j);
	  }
      }
      //  Now Collect counts over saved neighborhoods
      viterbi_score = 0 ;
      if (Verbose)
	cerr << "\nCollecting counts over found alignments, total prob: " 
	     << align_total_count <<  "\n";
      if (Log)
	logmsg << "\nCollecting counts over found alignments, total prob: " 
	       << align_total_count <<  "\n";
      hash_map<Vector<WordIndex>, LogProb, hashmyalignment, equal_to_myalignment >::iterator align ;
      int acount = 0 ;
      if (align_total_count == 0 ){
	cerr << " WARNINIG: For the following sentence pair : \n";
	printSentencePair(es, fs, cerr);
	cerr << "The collection of alignments found have 0 probability!!\n";
	cerr << "No counts will be collected of it \n";
	if (Log){
	  logmsg << "The collection of alignments found have 0 probability!!\n";
	  logmsg << "No counts will be collected of it \n";
	}
      }
      else {
	if (collect_counts) {
	  for(align = neighborhood.begin(); align != neighborhood.end(); align++){
	    temp = (*align).second/align_total_count ;	  
	    collectCountsOverAlignement(/*tTable, aCountTable, */es, fs, /*p1_count, 
					  p0_count ,*/ ((*align).first), temp , count);
	    acount++;
	    if (viterbi_score < temp){
	      viterbi_alignment = ((*align).first);
	      viterbi_score = temp;
	    }
	  }
	} // end of if (collect_counts)
	perp.addFactor(log(double(align_total_count)), count, l, m,0);
	viterbiPerp.addFactor(log(double(viterbi_score)), count, l, m,0);
	
      if (Verbose){
	cerr << "Collected counts over "<<acount <<" (of "
	     << pow(double(l+1), double(m)) <<") different alignments\n";
	cerr << "Bucket count of alignments hash: "<<
	  neighborhood.getHash().bucket_count()<< ", size " <<
	  neighborhood.getHash().size() << "\n";
      }
	if (Log){
	  logmsg << "Collected counts over "<<acount <<" (of "
		 << pow(double(l+1), double(m)) <<") different alignments\n";
	  logmsg << "Bucket count of alignments hash: "<<
	    neighborhood.getHash().bucket_count()<< "\n";
	}
      } // end of else 
      // write best alignment (viterbi) for this sentence pair to alignment file 
      if (collect_counts){
	if (viterbi_score <= 0){
	  cerr << "Viterbi Alignment for this pair have score zero!!\n";
	  of2 << "\n\n";
	}
	else {
	  if (dump_files)
	    printAlignToFile(es, fs, Elist.getVocabList(), Flist.getVocabList(), of2, viterbi_alignment, pair_no, viterbi_score);
	  addAL(viterbi_alignment,sent.sentenceNo,l);
	}
      } // end of if (collect_counts) 
      double period = difftime(time(NULL), sent_s);
      if (Log)
      	logmsg << "processing this sentence pair ("<<l+1<<"x"<<m<<") : "<<
      	  (l+1)*m << " took : " << period << " seconds\n";
      if (Verbose)
	cerr << "processing this sentence pair took : " << period
	     << " seconds\n";
      
    } /* of sentence pair E, F */
    sHandler1.rewind();
    errorReportAL(cerr,model);
    perp.record(model);
    viterbiPerp.record(model);
    if (dump_files)
      of2.close();

}
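Unlike the exhaustive enumeration in Example #9 below, viterbi_loop collects counts only over the neighborhood of alignments produced by findAlignmentsNeighborhood (plus the pegged alignments when Peg is set). Each alignment a in the neighborhood N contributes with the normalized weight

  w(a) = \frac{p(f, a \mid e)}{\sum_{a' \in N} p(f, a' \mid e)}

which is the code's temp = (*align).second / align_total_count; the alignment with the largest weight is kept as the Viterbi alignment for dumping.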
Example #9
void model3::em(int noIterations, sentenceHandler& sHandler1) {

	LogProb all_prob, aprob, temp;
	WordIndex i, j, l, m;
	time_t it_st, st, it_fn, fn;
	string tfile, dfile, nfile, p0file, afile, number;

	st = time(NULL) ;
	cout << "\n" << "Starting Model3:  Training";
	//  sentenceHandler sHandler1(efFilename.c_str());
	sHandler1.rewind();
	for (int it=1; it <= noIterations; it++) {
		it_st = time(NULL) ;
		cout << "\n" << "Model3: Iteration " << it;

		// set up the names of the files where the tables will be printed 
		int n = it;
		number = "";
		do {
			//mj changed next line
			number.insert((size_t) 0, 1, (char)(n % 10 + '0'));
		} while ((n /= 10) > 0);
		tfile = Prefix + ".t3." + number;
		afile = Prefix + ".a3." + number;
		nfile = Prefix + ".n3." + number;
		dfile = Prefix + ".d3." + number;
		p0file = Prefix + ".p0_3." + number;
		//    tCountTable.clear();
		dCountTable.clear();
		nCountTable.clear();
		p0_count = 0.0;
		p1_count = 0.0;
		all_prob = 0;
		sentPair sent;
		while (sHandler1.getNextSentence(sent)) {
			Vector<WordIndex>& es = sent.eSent;
			Vector<WordIndex>& fs = sent.fSent;
			const float count = sent.getCount();
			if ((sent.sentenceNo % 1000) == 0)
				cout <<sent.sentenceNo << '\n';
			Vector<WordIndex> A(fs.size(),/*-1*/0);
			Vector<WordIndex> Fert(es.size(),0);
			LogProb lcount=(LogProb)count;
			l = es.size()-1;
			m = fs.size()-1;
			WordIndex x, y;
			all_prob = prob_of_target_given_source(tTable, fs, es);
			if (all_prob == 0)
				cout << "\n" <<"all_prob = 0";

			for (x = 0; x < pow(l+1.0, double(m)) ; x++) { // For all possible alignments A
				y = x;
				for (j = 1; j <= m; j++) {
					A[j] = y % (l+1);
					y /= (l+1);
				}
				for (i = 0; i <= l; i++)
					Fert[i] = 0;
				for (j = 1; j <= m; j++)
					Fert[A[j]]++;
				if (2 * Fert[0] <= m) { /* consider alignments that have Fert[0] less than
				 half the number of words in the French sentence */
					aprob = prob_of_target_and_alignment_given_source(A, Fert,
							tTable, fs, es);
					temp = aprob/all_prob;
					LogProb templcount = temp*lcount;

					for (j = 1; j <= m; j++) {
						tTable.incCount(es[A[j]], fs[j], templcount);
						if (0 != A[j])
							dCountTable.addValue(j, A[j], l, m, templcount);
					}
					for (i = 0; i <= l; i++) {
						nCountTable.addValue(es[i], Fert[i], templcount);
						//cout << "AFTER INC2: " << templcount << " " << nCountTable.getRef(es[i], Fert[i]) << '\n';
					}
					p1_count += double(temp) * (Fert[0] * count);
					p0_count += double(temp) * ((m - 2 * Fert[0]) * count);
				}
			} /* of looping over all alignments */
		} /* of sentence pair E, F */
		sHandler1.rewind();

		// normalize tables
		if (OutputInAachenFormat==1)
			tTable.printCountTable(tfile.c_str(), Elist.getVocabList(),
					Flist.getVocabList(), 1);
		tTable.normalizeTable(Elist, Flist);
		aCountTable.normalize(aTable);
		dCountTable.normalize(dTable);
		nCountTable.normalize(nTable, &Elist.getVocabList());

		// normalize p1 & p0 

		if (p1_count + p0_count != 0) {
			p1 = p1_count / (p1_count + p0_count );
			p0 = 1 - p1;
		} else {
			p1 = p0 = 0;
		}
		// print tables 
		if (OutputInAachenFormat==0)
			tTable.printProbTable(tfile.c_str(), Elist.getVocabList(),
					Flist.getVocabList(), OutputInAachenFormat);
		dTable.printTable(dfile.c_str());
		nTable.printNTable(Elist.uniqTokens(), nfile.c_str(),
				Elist.getVocabList(), OutputInAachenFormat);
		ofstream of(p0file.c_str());
		of << p0;
		of.close();
		it_fn = time(NULL) ;
		cout << "\n" << "Model3 Iteration "<<it<<" took: " << difftime(it_fn,
				it_st) << " seconds\n";

	} /* of iterations */
	fn = time(NULL) ;
	cout << "\n" << "Entire Model3 Training took: " << difftime(fn, st)
			<< " seconds\n";
}
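This E-step is exact but exponential: each of the m target words can align to any of the l + 1 source positions, so the x-loop enumerates all (l+1)^m alignments, decoding x into the alignment vector A as base-(l+1) digits. Each alignment contributes its posterior weight times the sentence count, the code's templcount:

  \text{templcount} = \frac{p(f, A \mid e)}{p(f \mid e)} \cdot count

which is why this exhaustive variant is practical only for very short sentences.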
Example #10
void hmm::em_loop(Perplexity& perp, sentenceHandler& sHandler1, 
		  bool dump_alignment, const char* alignfile, Perplexity& viterbi_perp, 
		  bool test, bool doInit, int /*unused*/)
{
  WordIndex i, j, l, m ;
  double cross_entropy;
  int pair_no=0 ;
  perp.clear();
  viterbi_perp.clear();
  ofstream of2;
  // for each sentence pair in the corpus
  if (dump_alignment||FEWDUMPS )
    of2.open(alignfile);
  sentPair sent ;
  sHandler1.rewind();
  while(sHandler1.getNextSentence(sent)){
    const Vector<WordIndex>& es = sent.get_eSent();
    const Vector<WordIndex>& fs = sent.get_fSent();
    const float so  = sent.getCount();
    l = es.size() - 1;
    m = fs.size() - 1;
    cross_entropy = log(1.0);
    Vector<WordIndex> viterbi_alignment(fs.size());

    unsigned int I=2*l,J=m;
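    // I = 2*l HMM states: source positions 1..l plus l "empty word" copies
    // (below, any state index i1 >= l emits from es[0]).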
    bool DependencyOfJ=(CompareAlDeps&(16|8))||(PredictionInAlignments==2);
    bool DependencyOfPrevAJ=(CompareAlDeps&(2|4))||(PredictionInAlignments==0);
    HMMNetwork *net=makeHMMNetwork(es,fs,doInit);
    Array<double> gamma;
    Array<Array2<double> > epsilon(DependencyOfJ?(m-1):1);
  double trainProb = ForwardBackwardTraining(*net,gamma,epsilon);
    if( !test )
      {
      /*  bool eIsPunct[l + 1];
        bool fIsPunct[m + 1];

        for(unsigned int dummy = 0; dummy <= l; dummy++) {
           eIsPunct[dummy] = isPunctuation(evlist[es[dummy]].word);
        }
        for(unsigned int dummy = 0; dummy <= m; dummy++){
           fIsPunct[dummy] = isPunctuation(fvlist[fs[dummy]].word);
        }*/
	double *gp=conv<double>(gamma.begin());
	for(unsigned int i2=0;i2<J;i2++)for(unsigned int i1=0;i1<I;++i1,++gp)
	  if( *gp>MINCOUNTINCREASE )
	    {
	      COUNT add= *gp*so;
	      if( i1>=l )
		{
		  tTable.incCount(es[0],fs[1+i2],add);
		  aCountTable.getRef(0,i2+1,l,m)+=add;
		}
          else //if((eIsPunct[i1 + 1] && fIsPunct[i2 + 1] && evlist[es[i1 + 1]].word == fvlist[fs[i2 + 1]].word) || (!eIsPunct[i1 + 1] && !fIsPunct[i2 + 1]) )

		{
		  tTable.incCount(es[1+i1],fs[1+i2],add);
		  aCountTable.getRef(1+i1,1+i2,l,m)+=add;
		}
	    }
	double p0c=0.0,np0c=0.0;
	for(unsigned int jj=0;jj<epsilon.size();jj++)
	  {
	    int frenchClass=fwordclasses.getClass(fs[1+min(int(m)-1,int(jj)+1)]);
	    double *ep=epsilon[jj].begin();
	    if( ep )
	      {
		//for(i=0;i<I;i++)
		//  normalize_if_possible_with_increment(ep+i,ep+i+I*I,I);
		//		for(i=0;i<I*I;++i)
		//  ep[i] *= I;
		//if( DependencyOfJ )
		//  if( J-1 )
		//    for(i=0;i<I*I;++i)
		//      ep[i] /= (J-1);
		double mult=1.0;
		mult*=l;
		//if( DependencyOfJ && J-1)
		//  mult/=(J-1);
		for(i=0;i<I;i++)
		  {
		    for(unsigned int i_bef=0;i_bef<I;i_bef++,ep++)
		      {
			CLASSIFY(i,i_empty,ireal);
			CLASSIFY2(i_bef,i_befreal);
			if( i_empty )
			  p0c+=*ep * mult;
			else
			  {
			    counts.addAlCount(i_befreal,ireal,l,m,ewordclasses.getClass(es[1+i_befreal]),
					      frenchClass ,jj+1,*ep * mult,0.0);
			    np0c+=*ep * mult; 
			  }
			massert( &epsilon[jj](i,i_bef)== ep);
		      }
		  }
	      }
	  }
	double *gp1=conv<double>(gamma.begin()),*gp2=conv<double>(gamma.end())-I;
	Array<double>&ai=counts.doGetAlphaInit(I);
	Array<double>&bi=counts.doGetBetaInit(I);
	int firstFrenchClass=(fs.size()>1)?(fwordclasses.getClass(fs[1+0])):0;
	for(i=0;i<I;i++,gp1++,gp2++)
	  {
	    CLASSIFY(i,i_empty,ireal);
	    ai[i]+= *gp1;
	    bi[i]+= *gp2;
	    if( DependencyOfPrevAJ==0 )
	      {
		if( i_empty )
		  p0c+=*gp1;
		else
		  {
		    counts.addAlCount(-1,ireal,l,m,0,firstFrenchClass,0,*gp1,0.0);
		    np0c+=*gp1;
		  }
	      }
	  }
    if( Verbose )
      cout << "l: " << l << " m: " << m << " p0c: " << p0c << " np0c: " << np0c << endl;
      }
    cross_entropy+=log(max(trainProb,1e-100))+log(max(net->finalMultiply,1e-100));
    Array<int>vit;
    double viterbi_score=1.0;
    if( (HMMTrainingSpecialFlags&1) )
      HMMViterbi(*net,gamma,vit);
    else
      viterbi_score=HMMRealViterbi(*net,vit);
    for(j=1;j<=m;j++)
      {
	viterbi_alignment[j]=vit[j-1]+1;
	if( viterbi_alignment[j]>l)
	  viterbi_alignment[j]=0;
      }
    sHandler1.setProbOfSentence(sent,cross_entropy);
    perp.addFactor(cross_entropy, so, l, m,1);
    viterbi_perp.addFactor(log(viterbi_score)+log(max(net->finalMultiply,1e-100)), so, l, m,1);
    if( Verbose )
      cout << "Viterbi-perp: " << log(viterbi_score) << ' ' << log(max(net->finalMultiply,1e-100)) << ' ' << viterbi_score << ' ' << net->finalMultiply << ' ' << *net << "gamma: " << gamma << endl;
    delete net;net=0;
    if (dump_alignment||(FEWDUMPS&&sent.getSentenceNo()<1000) )
      printAlignToFile(es, fs, Elist.getVocabList(), Flist.getVocabList(), of2, viterbi_alignment, sent.getSentenceNo(), viterbi_score);
    addAL(viterbi_alignment,sent.getSentenceNo(),l);    
    pair_no++;
  } /* of while */
  sHandler1.rewind();
  perp.record("HMM");
  viterbi_perp.record("HMM");
  errorReportAL(cout,"HMM");
}
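A brief reading of the E-step above: gamma holds the state posteriors \gamma_j(i) = p(a_j = i \mid f, e) computed by ForwardBackwardTraining, which are turned into lexical (tTable) and alignment (aCountTable) counts; epsilon holds the transition posteriors p(a_{j-1} = i', a_j = i \mid f, e), which are turned into the class-conditioned jump counts via counts.addAlCount. Posterior mass landing on the empty-word states is accumulated in p0c and the remainder in np0c (reported when Verbose is set).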