Example #1
void model1::em_loop(int it,Perplexity& perp, sentenceHandler& sHandler1, bool seedModel1, 
             bool dump_alignment, const char* alignfile, Dictionary& dict, bool useDict, Word2Vec& word2vec, bool useWord2Vec, Perplexity& viterbi_perp, bool test)
{
  WordIndex i, j, l, m ;
  double cross_entropy;
  int pair_no=0 ;
  perp.clear();
  viterbi_perp.clear();
  ofstream of2;
  // for each sentence pair in the corpus
  if (dump_alignment||FEWDUMPS)
    of2.open(alignfile);
  PROB uniform = 1.0/noFrenchWords ;
  sentPair sent ;
  sHandler1.rewind();
  while(sHandler1.getNextSentence(sent)){
    Vector<WordIndex>& es = sent.eSent;
    Vector<WordIndex>& fs = sent.fSent;
    const float so  = sent.getCount();
    l = es.size() - 1;
    m = fs.size() - 1;
    cross_entropy = log(1.0);
    Vector<WordIndex> viterbi_alignment(fs.size());
    double viterbi_score = 1 ;
    // per-sentence flag arrays (vectors instead of non-standard variable-length arrays)
    vector<bool> eindict(l + 1);
    vector<bool> findict(m + 1);
    vector<vector<bool> > indict(m + 1, vector<bool>(l + 1));
    vector<vector<bool> > isSimilar(m + 1, vector<bool>(l + 1));
    if(useWord2Vec && word2vec.Method == 1){
      for(unsigned int dummy = 0; dummy <= m; dummy++){
        for(unsigned int dummy2 = 0; dummy2 <= l; dummy2++)
          isSimilar[dummy][dummy2] = false;
      }
      for(i = 1; i <= l; i++){
        map<WordIndex, bool> simWords = word2vec.getVectorMap(es[i]);
        for(j = 1; j <= m; j++){
          if(simWords.find(fs[j]) != simWords.end()){
            isSimilar[j][i] = true;
          }
        }
      }
    }
    // cout << sent.sentenceNo << endl;
    if(it == 1 && useDict){
      for(unsigned int dummy = 0; dummy <= l; dummy++) {
          eindict[dummy] = false;
      }
      for(unsigned int dummy = 0; dummy <= m; dummy++){
        findict[dummy] = false;
        for(unsigned int dummy2 = 0; dummy2 <= l; dummy2++)
          indict[dummy][dummy2] = false;
      }
      for(j = 0; j <= m; j++)
        for(i = 0; i <= l; i++)
          if(dict.indict(fs[j], es[i])){
            eindict[i] = findict[j] = indict[j][i] = true;
          }
    }

    for(j=1; j <= m; j++){
      // entries that map fs to all possible ei in this sentence.
      Vector<LpPair<COUNT,PROB> *> sPtrCache(es.size(),0); // cache pointers to table
      LpPair<COUNT,PROB> **sPtrCachePtr;

      PROB denom = 0.0;
      WordIndex best_i = 0 ;      // i for which fj is best mapped to ei
      PROB word_best_score = 0 ;  // score for the best mapping of fj
      if (it == 1 && !seedModel1){
        denom = uniform * es.size();
        word_best_score = uniform ;
      }
      else
        for((i=0),(sPtrCachePtr=&sPtrCache[0]); i <= l; i++,sPtrCachePtr++){
          PROB e(0.0) ;
          (*sPtrCachePtr) = tTable.getPtr(es[i], fs[j]) ;
          if(i && useWord2Vec && !isSimilar[j][i] && word2vec.Method == 1)
            continue;
          if ((*sPtrCachePtr) != 0 && (*((*sPtrCachePtr))).prob > PROB_SMOOTH)
            e = (*((*sPtrCachePtr))).prob;
          else
            e = PROB_SMOOTH ;
          denom += e ;
          if (e > word_best_score){
            word_best_score = e ;
            best_i = i ;
          }
        }
      viterbi_alignment[j] = best_i ;
      viterbi_score *= word_best_score ; /// denom ;
      if (denom == 0){
        if (test)
          cerr << "WARNING: denom is zero (TEST)\n";
        else
          cerr << "WARNING: denom is zero (TRAIN)\n";
      }
      cross_entropy += log(denom) ;
      if (!test){
        if(denom > 0){
          COUNT val = COUNT(so) / (COUNT) double(denom) ;
          /* This block implements a constraint on counting:
             count(es[i], fs[j]) is incremented if and only if
             es[i] and fs[j] occur together in the dictionary,
             OR
             es[i] does not occur in the dictionary with any fs[x] and
             fs[j] does not occur in the dictionary with any es[y]. */
          if(it == 1 && useDict){
            for((i=0),(sPtrCachePtr=&sPtrCache[0]); i <= l; i++,sPtrCachePtr++){
              if(i && useWord2Vec && !isSimilar[j][i] && word2vec.Method == 1)
                continue;
              if(indict[j][i] || (!findict[j] && !eindict[i])){
                PROB e(0.0) ;
                if (it == 1 && !seedModel1)
                  e = uniform ;
                else if ((*sPtrCachePtr) != 0 && (*((*sPtrCachePtr))).prob > PROB_SMOOTH)
                  e = (*((*sPtrCachePtr))).prob;
                else
                  e = PROB_SMOOTH ;
                COUNT x = e*val;
                if( it==1 || x>MINCOUNTINCREASE ){
                  if ((*sPtrCachePtr) != 0)
                    (*((*sPtrCachePtr))).count += x;
                  else
                    tTable.incCount(es[i], fs[j], x);
                }
              } /* end of if */
            } /* end of for i */
          } /* end of it == 1 && useDict */
          // Old code:
          else{
            for((i=0),(sPtrCachePtr=&sPtrCache[0]); i <= l; i++,sPtrCachePtr++){
              if(i && useWord2Vec && !isSimilar[j][i] && word2vec.Method == 1)
                continue;
              PROB e(0.0) ;
              if (it == 1 && !seedModel1){
                e = uniform ;
              }
              else if ((*sPtrCachePtr) != 0 && (*((*sPtrCachePtr))).prob > PROB_SMOOTH)
                e = (*((*sPtrCachePtr))).prob;
              else
                e = PROB_SMOOTH ;
              //if( !(i==0) )
              //  cout << "COUNT(e): " << e << " " << MINCOUNTINCREASE << endl;
              //if (useWord2Vec && word2vec.Method == 2)
              //  e = e * (1. - word2vec.L) + word2vec.getW2VProb(es[i], fs[j]) * word2vec.L;
              COUNT x = e*val;

              if( pair_no==VerboseSentence )
                cout << i << "(" << evlist[es[i]].word << ")," << j << "(" << fvlist[fs[j]].word << ")=" << x << endl;
              if( it==1 || x>MINCOUNTINCREASE ){
                if( NoEmptyWord==0 || i!=0 ){
                  if ((*sPtrCachePtr) != 0)
                    (*((*sPtrCachePtr))).count += x;
                  else
                    tTable.incCount(es[i], fs[j], x);
                }
              }
            } /* end of for i */
          } // end of else
        } // end of if (denom > 0)
      } // if (!test)
    } // end of for (j)
    sHandler1.setProbOfSentence(sent,cross_entropy);
    //cerr << sent << "CE: " << cross_entropy << " " << so << endl;
    perp.addFactor(cross_entropy-m*log(l+1.0), so, l, m,1);
    viterbi_perp.addFactor(log(viterbi_score)-m*log(l+1.0), so, l, m,1);
    if (dump_alignment||(FEWDUMPS&&sent.sentenceNo<1000))
      printAlignToFile(es, fs, evlist, fvlist, of2, viterbi_alignment, sent.sentenceNo, viterbi_score);
    addAL(viterbi_alignment,sent.sentenceNo,l);
    pair_no++;
  } /* of while */
  sHandler1.rewind();
  perp.record("Model1");
  viterbi_perp.record("Model1");
  errorReportAL(cout, "IBM-1");
}
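
The loop above interleaves count collection, Viterbi bookkeeping, dictionary and word2vec constraints, and logging, which obscures the underlying E-step. Below is a minimal, self-contained sketch of just the fractional-count update it performs per sentence pair: each target word distributes one unit of (smoothed, normalized) count over all source positions, including the empty word es[0]. The map-based table, kSmooth, and collectModel1Counts are illustrative stand-ins for GIZA++'s tTable, PROB_SMOOTH, and the loop body, not part of its API.

#include <cstddef>
#include <map>
#include <utility>
#include <vector>

using WordId = unsigned int;
using TTable = std::map<std::pair<WordId, WordId>, double>;  // (e,f) -> t(f|e)

const double kSmooth = 1e-7;  // illustrative stand-in for PROB_SMOOTH

// One Model 1 E-step over a sentence pair; es[0] is the empty word, and
// target positions run from 1, as in the loop above.
void collectModel1Counts(const std::vector<WordId>& es,
                         const std::vector<WordId>& fs,
                         const TTable& t, TTable& counts) {
  for (std::size_t j = 1; j < fs.size(); ++j) {
    // denom = sum_i t(f_j | e_i), smoothing unseen pairs
    double denom = 0.0;
    for (std::size_t i = 0; i < es.size(); ++i) {
      auto it = t.find({es[i], fs[j]});
      denom += (it != t.end() && it->second > kSmooth) ? it->second : kSmooth;
    }
    if (denom <= 0.0) continue;  // mirrors the "denom is zero" guard
    // distribute one unit of count over all source positions
    for (std::size_t i = 0; i < es.size(); ++i) {
      auto it = t.find({es[i], fs[j]});
      double e = (it != t.end() && it->second > kSmooth) ? it->second : kSmooth;
      counts[{es[i], fs[j]}] += e / denom;
    }
  }
}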
Example #2
void model2::em_loop(Perplexity& perp, sentenceHandler& sHandler1, 
		     bool dump_alignment, const char* alignfile, Perplexity& viterbi_perp, 
		     bool test)
{
  massert( aTable.is_distortion==0 );
  massert( aCountTable.is_distortion==0 );
  WordIndex i, j, l, m ;
  double cross_entropy;
  int pair_no=0 ;
  perp.clear();
  viterbi_perp.clear();
  ofstream of2;
  // for each sentence pair in the corpus
  if (dump_alignment||FEWDUMPS )
    of2.open(alignfile);
  sentPair sent ;

  vector<double> ferts(evlist.size());
  
  sHandler1.rewind();
  while(sHandler1.getNextSentence(sent)){
    Vector<WordIndex>& es = sent.eSent;
    Vector<WordIndex>& fs = sent.fSent;
    const float so  = sent.getCount();
    l = es.size() - 1;
    m = fs.size() - 1;
    cross_entropy = log(1.0);
    Vector<WordIndex> viterbi_alignment(fs.size());
    double viterbi_score = 1;



    for(j=1; j <= m; j++){
      Vector<LpPair<COUNT,PROB> *> sPtrCache(es.size(),0); // cache pointers to table
      // entries that map fs to all possible ei in this sentence.
      PROB denom = 0.0;
      PROB e = 0.0, word_best_score = 0;
      WordIndex best_i = 0 ; // i for which fj is best mapped to ei
      for(i=0; i <= l; i++){
        sPtrCache[i] = tTable.getPtr(es[i], fs[j]) ;
        if (sPtrCache[i] != 0 && (*(sPtrCache[i])).prob > PROB_SMOOTH )
          e = (*(sPtrCache[i])).prob * aTable.getValue(i,j, l, m) ;
        else
          e = PROB_SMOOTH * aTable.getValue(i,j, l, m);
        denom += e ;
        if (e > word_best_score){
          word_best_score = e ;
          best_i = i ;
        }
      }
      viterbi_alignment[j] = best_i ;
      viterbi_score *= word_best_score; ///denom ;
      cross_entropy += log(denom) ;
      if (denom == 0){
        if (test)
          cerr << "WARNING: denom is zero (TEST)\n";
        else
          cerr << "WARNING: denom is zero (TRAIN)\n";
      }
      if (!test){
        if(denom > 0){
          COUNT val = COUNT(so) / (COUNT) double(denom) ;
          for( i=0; i <= l; i++){
            PROB e(0.0);
            if (sPtrCache[i] != 0 && (*(sPtrCache[i])).prob > PROB_SMOOTH)
              e = (*(sPtrCache[i])).prob ;
            else
              e = PROB_SMOOTH ;
            e *= aTable.getValue(i,j, l, m);
            COUNT temp = COUNT(e) * val ;
            if( NoEmptyWord==0 || i!=0 ){
              if (sPtrCache[i] != 0)
                (*(sPtrCache[i])).count += temp ;
              else
                tTable.incCount(es[i], fs[j], temp);
            }
            aCountTable.getRef(i,j, l, m) += temp ;
          } /* end of for i */
        } // end of if (denom > 0)
      } // if (!test)
    } // end of for (j)
    sHandler1.setProbOfSentence(sent,cross_entropy);
    perp.addFactor(cross_entropy, so, l, m,1);
    viterbi_perp.addFactor(log(viterbi_score), so, l, m,1);
    if (dump_alignment||(FEWDUMPS&&sent.sentenceNo<1000) )
      printAlignToFile(es, fs, Elist.getVocabList(), Flist.getVocabList(), of2, viterbi_alignment, sent.sentenceNo, viterbi_score);
    addAL(viterbi_alignment,sent.sentenceNo,l);
    pair_no++;
  } /* of while */
  sHandler1.rewind();
  perp.record("Model2");
  viterbi_perp.record("Model2");
  errorReportAL(cout,"IBM-2");
}
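
Model 2 differs from Model 1 only in weighting each candidate link by an alignment probability a(i|j,l,m), and the same fractional count then updates both the lexical table and the alignment count table. Below is a hedged sketch of that per-position update; the plain vectors tRow/aRow and the function name are assumptions standing in for the tTable/aTable lookups above.

#include <cstddef>
#include <vector>

// Per-position E-step for Model 2: the posterior of a link (i, j) is
// proportional to t(f_j|e_i) * a(i|j,l,m); one fractional count feeds both
// tables, as in the loop above.
void collectModel2Counts(const std::vector<double>& tRow,   // t(f_j | e_i)
                         const std::vector<double>& aRow,   // a(i | j, l, m)
                         std::vector<double>& tCounts,      // -> tTable counts
                         std::vector<double>& aCounts) {    // -> aCountTable
  double denom = 0.0;
  for (std::size_t i = 0; i < tRow.size(); ++i)
    denom += tRow[i] * aRow[i];
  if (denom <= 0.0) return;                    // "denom is zero" guard
  for (std::size_t i = 0; i < tRow.size(); ++i) {
    double c = tRow[i] * aRow[i] / denom;      // posterior P(a_j = i | f, e)
    tCounts[i] += c;
    aCounts[i] += c;
  }
}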
Example #3
void model1::em_loop(int it,Perplexity& perp, sentenceHandler& sHandler1, bool seedModel1, 
		     bool dump_alignment, const char* alignfile, Dictionary& dict, bool useDict, Perplexity& viterbi_perp, bool test)
{
  WordIndex i, j, l, m ;
  double cross_entropy;
  int pair_no=0 ;
  perp.clear();
  viterbi_perp.clear();
  ofstream of2;
  // for each sentence pair in the corpus
  if (dump_alignment||FEWDUMPS)
    of2.open(alignfile);
  cerr <<" number of French (target) words = " << noFrenchWords << endl;
  PROB uniform = 1.0/noFrenchWords ;
  cerr << "initial unifrom prob = " << uniform << endl;
  sentPair sent ;
  sHandler1.rewind();
  while(sHandler1.getNextSentence(sent)){
    Vector<WordIndex>& es = sent.eSent;
    Vector<WordIndex>& fs = sent.fSent;
    const float so  = sent.getCount(); // number of times sentence occurs in corpus
    //std::cerr << "\n\nNEW sentence (#" << (pair_no + 1) << ") with count = " << so << endl;
    l = es.size() - 1;  // source length
    m = fs.size() - 1;  // target length
    cross_entropy = log(1.0);
    Vector<WordIndex> viterbi_alignment(fs.size());
    double viterbi_score = 1 ;

    /* Disabled dictionary-constraint setup:
    bool eindict[l + 1];
    bool findict[m + 1];
    bool indict[m + 1][l + 1];
    if(it == 1 && useDict){
      for(unsigned int dummy = 0; dummy <= l; dummy++) eindict[dummy] = false;
      for(unsigned int dummy = 0; dummy <= m; dummy++){
        findict[dummy] = false;
        for(unsigned int dummy2 = 0; dummy2 <= l; dummy2++)
          indict[dummy][dummy2] = false;
      }
      for(j = 0; j <= m; j++)
        for(i = 0; i <= l; i++)
          if(dict.indict(fs[j], es[i])){
            eindict[i] = findict[j] = indict[j][i] = true;
          }
    }
    */

    for(j=1; j <= m; j++){
      //cerr << "Current french (TARGET) word = " << fs[j] << endl;
      // entries that map fs to all possible ei in this sentence.
      Vector<LpPair<COUNT,PROB> *> sPtrCache(es.size(),0); // cache pointers to table
      LpPair<COUNT,PROB> **sPtrCachePtr;

      PROB denom = 0.0;
      WordIndex best_i = 0 ;      // i for which fj is best mapped to ei
      PROB word_best_score = 0 ;  // score for the best mapping of fj
      if (it == 1 && !seedModel1){
        //cerr << "Using uniform denominator\n";
        denom = uniform * es.size() ;
        word_best_score = uniform ;
      }
      else
        for((i=0),(sPtrCachePtr=&sPtrCache[0]); i <= l; i++,sPtrCachePtr++){
          //cerr << "current english (SOURCE) word = " << es[i] << endl;
          PROB e(0.0) ;
          srcHits_.insert(es[i]);
          (*sPtrCachePtr) = tTable.getPtr(es[i], fs[j]) ;
          if ((*sPtrCachePtr) != 0 && (*((*sPtrCachePtr))).prob > PROB_SMOOTH)
            e = (*((*sPtrCachePtr))).prob;
          else
            e = PROB_SMOOTH ;
          denom += e ;
          if (e > word_best_score){
            word_best_score = e ;
            best_i = i ;
          }
        }
      viterbi_alignment[j] = best_i ;
      viterbi_score *= word_best_score ; /// denom ;
      if (denom == 0){
        if (test)
          cerr << "WARNING: denom is zero (TEST)\n";
        else
          cerr << "WARNING: denom is zero (TRAIN)\n";
      }
      cross_entropy += log(denom) ;
      if (!test){
        if(denom > 0){
          COUNT val = COUNT(so) / (COUNT) double(denom) ;
          /* This block implements a constraint on counting:
             count(es[i], fs[j]) is incremented if and only if
             es[i] and fs[j] occur together in the dictionary,
             OR
             es[i] does not occur in the dictionary with any fs[x] and
             fs[j] does not occur in the dictionary with any es[y]. */
          /* Disabled dictionary branch:
          if(it == 1 && useDict){
            for((i=0),(sPtrCachePtr=&sPtrCache[0]); i <= l; i++,sPtrCachePtr++){
              if(indict[j][i] || (!findict[j] && !eindict[i])){
                PROB e(0.0) ;
                if (it == 1 && !seedModel1)
                  e = uniform ;
                else if ((*sPtrCachePtr) != 0 && (*((*sPtrCachePtr))).prob > PROB_SMOOTH)
                  e = (*((*sPtrCachePtr))).prob;
                else e = PROB_SMOOTH ;
                COUNT x=e*val;
                if( it==1||x>MINCOUNTINCREASE )
                  if ((*sPtrCachePtr) != 0)
                    (*((*sPtrCachePtr))).count += x;
                  else
                    tTable.incCount(es[i], fs[j], x);
              }
            }
          }
          // Old code:
          else{ */
            for((i=0),(sPtrCachePtr=&sPtrCache[0]); i <= l; i++,sPtrCachePtr++){
              //for(i=0; i <= l; i++) {
              PROB e(0.0) ;
              if (it == 1 && !seedModel1)
                e = uniform ;
              else if ((*sPtrCachePtr) != 0 && (*((*sPtrCachePtr))).prob > PROB_SMOOTH)
                e = (*((*sPtrCachePtr))).prob;
              else
                e = PROB_SMOOTH ;
              //if( !(i==0) )
              //  cout << "COUNT(e): " << e << " " << MINCOUNTINCREASE << endl;
              COUNT x = e*val;  // new fractional count
              if( pair_no==VerboseSentence )
                cout << i << "(" << evlist[es[i]].word << ")," << j << "(" << fvlist[fs[j]].word << ")=" << x << endl;
              if( it==1 || x>MINCOUNTINCREASE ){
                if(step_k != 0)
                  tTable.stepCounts_[wordPairIds(es[i], fs[j])] += x;
                else if( NoEmptyWord==0 || i!=0 ){
                  if ((*sPtrCachePtr) != 0){
                    // handles single sentence updates
                    //x = getInterpolatedCount(x, (*((*sPtrCachePtr))).count); // get interpolated count here
                    (*((*sPtrCachePtr))).count += x;
                  }
                  else {
                    //x = getInterpolatedCount(x, (*((*sPtrCachePtr))).count); // get interpolated count here
                    tTable.incCount(es[i], fs[j], x);
                  }
                }
                // increment temp table instead
              }
            } /* end of for i */
          //} // end of (disabled) dictionary else
        } // end of if (denom > 0)
      } // if (!test)
    } // end of for all (j) target words
    sHandler1.setProbOfSentence(sent,cross_entropy);
    //cerr << sent << "CE: " << cross_entropy << " " << so << endl;
    perp.addFactor(cross_entropy-m*log(l+1.0), so, l, m,1);
    viterbi_perp.addFactor(log(viterbi_score)-m*log(l+1.0), so, l, m,1);
    if (dump_alignment||(FEWDUMPS&&sent.sentenceNo<1000))
      printAlignToFile(es, fs, evlist, fvlist, of2, viterbi_alignment, sent.sentenceNo, viterbi_score);
    addAL(viterbi_alignment,sent.sentenceNo,l);
    pair_no++;
  } /* of while */
  sHandler1.rewind();
  perp.record("Model1");
  viterbi_perp.record("Model1");
  errorReportAL(cout, "IBM-1");
}
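
This variant routes counts into tTable.stepCounts_ whenever step_k is nonzero, which suggests a stepwise (mini-batch, online) EM scheme: batch counts are buffered separately and later interpolated into the running totals. The sketch below shows the usual stepwise-EM interpolation such a buffer enables; the decay schedule, the alpha parameter, and the name stepwiseInterpolate are assumptions about the general technique, not this fork's confirmed update rule.

#include <cmath>
#include <map>
#include <utility>

using WordId = unsigned int;
using CountTable = std::map<std::pair<WordId, WordId>, double>;

// Generic stepwise-EM step: decay the running totals, then fold in the counts
// gathered for the current mini-batch k.
void stepwiseInterpolate(CountTable& totals, CountTable& stepCounts,
                         int k, double alpha) {
  const double eta = std::pow(k + 2.0, -alpha);  // common schedule, alpha in (0.5, 1]
  for (auto& kv : totals)
    kv.second *= (1.0 - eta);                    // decay old evidence
  for (const auto& kv : stepCounts)
    totals[kv.first] += eta * kv.second;         // mix in the new batch
  stepCounts.clear();                            // ready for the next batch
}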
Example #4
void HMM::em_loop(Perplexity& perp, SentenceHandler& sHandler1,
                  bool dump_alignment, const char* alignfile, Perplexity& viterbi_perp,
                  bool test,bool doInit,int) {
  WordIndex i, j, l, m;
  double cross_entropy;
  int pair_no=0;
  perp.clear();
  viterbi_perp.clear();
  ofstream of2;
  // for each sentence pair in the corpus
  if (dump_alignment||FEWDUMPS)
    of2.open(alignfile);
  SentencePair sent;
  sHandler1.rewind();
  while (sHandler1.getNextSentence(sent)) {
    const Vector<WordIndex>& es = sent.get_eSent();
    const Vector<WordIndex>& fs = sent.get_fSent();
    const float so  = sent.getCount();
    l = es.size() - 1;
    m = fs.size() - 1;
    cross_entropy = log(1.0);
    Vector<WordIndex> viterbi_alignment(fs.size());

    unsigned int I=2*l,J=m;
    bool DependencyOfJ=(CompareAlDeps&(16|8))||(g_prediction_in_alignments==2);
    bool DependencyOfPrevAJ=(CompareAlDeps&(2|4))||(g_prediction_in_alignments==0);
    HMMNetwork *net= makeHMMNetwork(es,fs,doInit);
    Array<double> gamma;
    Array<Array2<double> > epsilon(DependencyOfJ?(m-1):1);
    double trainProb;
    trainProb=ForwardBackwardTraining(*net,gamma,epsilon);
    if (!test)
    {
      double *gp=conv<double>(gamma.begin());
      for (unsigned int i2=0;i2<J;i2++)
        for (unsigned int i1=0;i1<I;++i1,++gp)
          if (*gp>MINCOUNTINCREASE)
          {
            COUNT add= *gp*so;
            if (i1>=l)
            {
              tTable.incCount(es[0],fs[1+i2],add);
              aCountTable.getRef(0,i2+1,l,m)+=add;
            }
            else
            {
              tTable.incCount(es[1+i1],fs[1+i2],add);
              aCountTable.getRef(1+i1,1+i2,l,m)+=add;
            }
          }
      double p0c=0.0,np0c=0.0;
      for (unsigned int jj=0;jj<epsilon.size();jj++)
      {
        int frenchClass=fwordclasses.getClass(fs[1+min(int(m)-1,int(jj)+1)]);
        double *ep=epsilon[jj].begin();
        if (ep)
        {
          //for (i=0;i<I;i++)
          //  normalize_if_possible_with_increment(ep+i,ep+i+I*I,I);
          //    for (i=0;i<I*I;++i)
          //  ep[i] *= I;
          //if (DependencyOfJ)
          //  if (J-1)
          //    for (i=0;i<I*I;++i)
          //      ep[i] /= (J-1);
          double mult=1.0;
          mult*=l;
          //if (DependencyOfJ && J-1)
          //  mult/=(J-1);
          for (i=0;i<I;i++)
          {
            for (unsigned int i_bef=0;i_bef<I;i_bef++,ep++)
            {
              CLASSIFY(i,i_empty,ireal);
              CLASSIFY2(i_bef,i_befreal);
              if (i_empty)
                p0c+=*ep * mult;
              else
              {
                counts.addAlCount(i_befreal,ireal,l,m,ewordclasses.getClass(es[1+i_befreal]),
                                  frenchClass ,jj+1,*ep * mult,0.0);
                np0c+=*ep * mult;
              }
              MASSERT( &epsilon[jj](i,i_bef)== ep);
            }
          }
        }
      }
      double *gp1=conv<double>(gamma.begin()),*gp2=conv<double>(gamma.end())-I;
      Array<double>&ai=counts.doGetAlphaInit(I);
      Array<double>&bi=counts.doGetBetaInit(I);
      int firstFrenchClass=(fs.size()>1)?(fwordclasses.getClass(fs[1+0])):0;
      for (i=0;i<I;i++,gp1++,gp2++)
      {
        CLASSIFY(i,i_empty,ireal);
        ai[i]+= *gp1;
        bi[i]+= *gp2;
        if (DependencyOfPrevAJ==0)
        {
          if (i_empty)
            p0c+=*gp1;
          else
          {
            counts.addAlCount(-1,ireal,l,m,0,firstFrenchClass,0,*gp1,0.0);
            np0c+=*gp1;
          }
        }
      }
      if (g_is_verbose)
        cout << "l: " << l << "m: " << m << " p0c: " << p0c << " np0c: " << np0c << endl;
    }
    cross_entropy+=log(max(trainProb,1e-100))+log(max(net->finalMultiply,1e-100));
    Array<int>vit;
    double viterbi_score=1.0;
    if ((g_hmm_training_special_flags&1))
      HMMViterbi(*net,gamma,vit);
    else
      viterbi_score=HMMRealViterbi(*net,vit);
    for (j=1;j<=m;j++)
    {
      viterbi_alignment[j]=vit[j-1]+1;
      if (viterbi_alignment[j]>l)
        viterbi_alignment[j]=0;
    }
    sHandler1.setProbOfSentence(sent,cross_entropy);
    perp.addFactor(cross_entropy, so, l, m,1);
    viterbi_perp.addFactor(log(viterbi_score)+log(max(net->finalMultiply,1e-100)), so, l, m,1);

    if (g_is_verbose) {
      cout << "Viterbi-perp: " << log(viterbi_score) << ' '
           << log(max(net->finalMultiply,1e-100)) << ' '
           << viterbi_score << ' ' << net->finalMultiply
           << ' ' << *net << "gamma: " << gamma << endl;
    }

    // TODO: Use more safe resource management like RAII.
    delete net;
    net = 0;

    if (dump_alignment||(FEWDUMPS&&sent.getSentenceNo()<1000))
      printAlignToFile(es, fs, Elist.getVocabList(), Flist.getVocabList(), of2, viterbi_alignment, sent.getSentenceNo(), viterbi_score);
    addAL(viterbi_alignment,sent.getSentenceNo(),l);
    pair_no++;
  } /* of while */
  sHandler1.rewind();
  perp.record("HMM");
  viterbi_perp.record("HMM");
  errorReportAL(cout,"HMM");
}
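
In the HMM loop the heavy lifting is done by ForwardBackwardTraining; the gamma walk then turns per-state posteriors into lexical and alignment counts, treating states i1 >= l as empty-word states that credit es[0]. Below is a simplified sketch of that walk, assuming the flat j-major layout the pointer arithmetic above implies; the nested-vector count table, indexed by positions rather than vocabulary ids, is a stand-in for tTable/aCountTable.

#include <cstddef>
#include <vector>

// gamma is assumed to hold, in j-major order, the posterior that target word
// f_{j+1} is emitted by HMM state i (i < l: source word i+1; i >= l: an
// empty-word copy). tCounts must be sized (l+1) x (m+1).
void collectHMMLexicalCounts(const std::vector<double>& gamma,
                             std::size_t l, std::size_t m, double so,
                             std::vector<std::vector<double> >& tCounts,
                             double minIncrease) {
  const std::size_t I = 2 * l;           // real states plus empty-word states
  const double* gp = gamma.data();       // gamma.size() must be >= m * I
  for (std::size_t j = 0; j < m; ++j)
    for (std::size_t i = 0; i < I; ++i, ++gp)
      if (*gp > minIncrease) {
        std::size_t src = (i >= l) ? 0 : i + 1;  // slot 0 is the empty word
        tCounts[src][j + 1] += *gp * so;         // weighted by sentence count
      }
}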
Example #5
void model3::viterbi_loop(Perplexity& perp, Perplexity& viterbiPerp, sentenceHandler& sHandler1, 
			   bool dump_files, const char* alignfile, 
			   bool collect_counts, string model )
{
  WordIndex i, j, l, m ;
  ofstream of2 ;
  int pair_no;
  LogProb temp;

  if (dump_files)
    of2.open(alignfile);
  pair_no = 0 ; // sentence pair number 
  // for each sentence pair in the corpus
  perp.clear() ; // clears cross_entropy & perplexity 
  viterbiPerp.clear();
  sentPair sent ;
  while(sHandler1.getNextSentence(sent)){
    Vector<WordIndex>& es = sent.eSent;
    Vector<WordIndex>& fs = sent.fSent;
    const float count  = sent.getCount();
    if ((sent.sentenceNo % 1000) == 0)
      cerr <<sent.sentenceNo << '\n'; 
    time_t sent_s = time(NULL) ;
    pair_no++ ;
    l = es.size() - 1 ;
    m = fs.size() - 1 ;
    if (Log){
      logmsg << "Processing sentence pair:\n\t";
      printSentencePair(es, fs, logmsg);
      for (i = 0 ; i <= l ; i++)
        logmsg << Elist.getVocabList()[es[i]].word << " ";
      logmsg << "\n\t";
      for (j = 1 ; j <= m ; j++)
        logmsg << Flist.getVocabList()[fs[j]].word << " ";
      logmsg << "\n";
    }

    LogProb align_total_count=0;
    //      LogProb best_score;

    Vector<WordIndex> viterbi_alignment;
    LogProb viterbi_score ;
    alignmodel neighborhood;
    neighborhood.clear();
    align_total_count = 0;
    findAlignmentsNeighborhood(/*tTable, aTable,*/ /*p1_count, p0_count,*/ es, fs, align_total_count, neighborhood) ;
    if (Peg){
      for (i = 0 ; i <= l ; i++)
        for (j = 1 ; j <= m ; j++){
          if ( (tTable.getProb(es[i], fs[j]) > PROB_SMOOTH) &&
               (aTable.getValue(i, j, l, m) > PROB_SMOOTH) &&
               (dTable.getValue(j, i, l, m) > PROB_SMOOTH))
            findAlignmentsNeighborhood(/*tTable, aTable,*/ /*p1_count,
                                         p0_count, */ es, fs, align_total_count, neighborhood, i, j);
        }
    }
    // Now collect counts over saved neighborhoods
    viterbi_score = 0 ;
    if (Verbose)
      cerr << "\nCollecting counts over found alignments, total prob: "
           << align_total_count << "\n";
    if (Log)
      logmsg << "\nCollecting counts over found alignments, total prob: "
             << align_total_count << "\n";
    hash_map<Vector<WordIndex>, LogProb, hashmyalignment, equal_to_myalignment >::iterator align ;
    int acount = 0 ;
    if (align_total_count == 0 ){
      cerr << "WARNING: For the following sentence pair:\n";
      printSentencePair(es, fs, cerr);
      cerr << "The collection of alignments found has 0 probability!\n";
      cerr << "No counts will be collected for it.\n";
      if (Log){
        logmsg << "The collection of alignments found has 0 probability!\n";
        logmsg << "No counts will be collected for it.\n";
      }
    }
    else {
      if (collect_counts) {
        for(align = neighborhood.begin(); align != neighborhood.end(); align++){
          temp = (*align).second/align_total_count ;
          collectCountsOverAlignement(/*tTable, aCountTable, */es, fs, /*p1_count,
                                        p0_count ,*/ ((*align).first), temp , count);
          acount++;
          if (viterbi_score < temp){
            viterbi_alignment = ((*align).first);
            viterbi_score = temp;
          }
        }
      } // end of if (collect_counts)
      perp.addFactor(log(double(align_total_count)), count, l, m,0);
      viterbiPerp.addFactor(log(double(viterbi_score)), count, l, m,0);

      if (Verbose){
        cerr << "Collected counts over "<<acount<<" (of "
             << pow(double(l+1), double(m)) <<") different alignments\n";
        cerr << "Bucket count of alignments hash: "<<
          neighborhood.getHash().bucket_count()<< ", size " <<
          neighborhood.getHash().size() << "\n";
      }
      if (Log){
        logmsg << "Collected counts over "<<acount<<" (of "
               << pow(double(l+1), double(m)) <<") different alignments\n";
        logmsg << "Bucket count of alignments hash: "<<
          neighborhood.getHash().bucket_count()<< "\n";
      }
    } // end of else
    // write best alignment (viterbi) for this sentence pair to alignment file
    if (collect_counts){
      if (viterbi_score <= 0){
        cerr << "Viterbi alignment for this pair has score zero!\n";
        of2 << "\n\n";
      }
      else {
        if (dump_files)
          printAlignToFile(es, fs, Elist.getVocabList(), Flist.getVocabList(), of2, viterbi_alignment, pair_no, viterbi_score);
        addAL(viterbi_alignment,sent.sentenceNo,l);
      }
    } // end of if (collect_counts)
    double period = difftime(time(NULL), sent_s);
    if (Log)
      logmsg << "processing this sentence pair ("<<l+1<<"x"<<m<<") : "<<
        (l+1)*m << " took : " << period << " seconds\n";
    if (Verbose)
      cerr << "processing this sentence pair took : " << period
           << " seconds\n";

  } /* of sentence pair E, F */
  sHandler1.rewind();
  errorReportAL(cerr,model);
  perp.record(model);
  viterbiPerp.record(model);
  if (dump_files)
    of2.close();

}
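
The count-collection step above normalizes each neighborhood alignment's score by align_total_count and remembers the highest-weighted one as the Viterbi alignment. Below is a compact sketch of that normalization; the std::map keyed by the alignment vector and the function name are assumptions standing in for GIZA++'s alignmodel hash and the loop body.

#include <map>
#include <utility>
#include <vector>

typedef std::vector<unsigned int> Alignment;   // a[j] = i for j = 1..m

// Returns the best alignment and its normalized weight; the per-alignment
// weight w is what feeds collectCountsOverAlignement in the loop above.
std::pair<Alignment, double>
bestNormalizedAlignment(const std::map<Alignment, double>& neighborhood,
                        double totalScore) {
  Alignment best;
  double bestWeight = 0.0;
  for (const auto& kv : neighborhood) {
    double w = kv.second / totalScore;  // counting weight, as in temp above
    // collectCountsOverAlignement(es, fs, kv.first, w, count) would go here
    if (w > bestWeight) { bestWeight = w; best = kv.first; }
  }
  return {best, bestWeight};
}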
Example #6
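// Convenience wrapper: insert element at the head (index 0) of the list.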
void addFirstAL(ArrayList* liste, void* element) {
    addAL(liste, element, 0);
}