Exemplo n.º 1
0
void printOverlapReport(const tmodel<COUNT, PROB>& tTable, 
			sentenceHandler& testHandler,  vcbList& trainEList, 
			vcbList& trainFList, vcbList& testEList, vcbList& testFList)
{
  set<pair<WordIndex, WordIndex> > testCoocur ;
  sentPair s ;
  /*  string unseenCoocurFile = Prefix + ".tst.unseen.cooc" ;
      ofstream of_unseenCoocur(unseenCoocurFile.c_str());
      
      string seenCoocurFile = Prefix + ".tst.seen.cooc" ;
      ofstream of_seenCoocur(seenCoocurFile.c_str());
  */  
  testHandler.rewind();
  int seen_coocur = 0, unseen_coocur = 0, srcUnk = 0, trgUnk = 0 ;
  while(testHandler.getNextSentence(s)){    
    for (WordIndex i = 1 ; i < s.eSent.size() ; i++)
      for (WordIndex j = 1 ; j < s.fSent.size() ; j++)	
	testCoocur.insert(pair<WordIndex, WordIndex> (s.eSent[i], s.fSent[j])) ;
  }
  set<pair<WordIndex, WordIndex> >::const_iterator i ;
  for (i = testCoocur.begin() ; i != testCoocur.end() ; ++i){
    if (tTable.getProb((*i).first, (*i).second) > PROB_SMOOTH){
      seen_coocur ++ ;
      //      of_seenCoocur << (*i).first << ' ' << (*i).second << '\n';
    }
    else {
      unseen_coocur++;
      //      of_unseenCoocur << (*i).first << ' ' << (*i).second << '\n';
    }
  }
  
  string trgUnkFile = Prefix + ".tst.trg.unk" ;
  ofstream of_trgUnk(trgUnkFile.c_str());

  for (WordIndex i = 0 ; i <  testFList.getVocabList().size() && i < testFList.uniqTokens();i++)
    if (testFList.getVocabList()[i].freq > 0 && trainFList.getVocabList()[i].freq <= 0){
      of_trgUnk << i << ' ' << testFList.getVocabList()[i].word << ' ' << testFList.getVocabList()[i].freq
		<< '\n';
      trgUnk++ ;
    }
  string srcUnkFile = Prefix + ".tst.src.unk" ;
  ofstream of_srcUnk(srcUnkFile.c_str());

  for (WordIndex j = 0 ; j <  testEList.getVocabList().size() && j < testEList.uniqTokens();j++)
    if (testEList.getVocabList()[j].freq > 0 && trainEList.getVocabList()[j].freq <= 0){
      srcUnk++ ;
      of_srcUnk << j << ' ' << testEList.getVocabList()[j].word << ' ' << testEList.getVocabList()[j].freq
		<< '\n';
    }
  string summaryFile = Prefix + ".tst.stats" ;  
  ofstream of_summary(summaryFile.c_str());
  of_summary << "\t\t STATISTICS ABOUT TEST CORPUS\n\n";
  of_summary << "source unique tokens: " <<  testEList.uniqTokens() << '\n';
  of_summary << "target unique tokens: " <<  testFList.uniqTokens() << '\n';
  of_summary << "unique unseen source tokens: " << srcUnk << '\n';
  of_summary << "unique unseen target tokens: " << trgUnk << '\n';
  of_summary << "cooccurrences not found in the final t table: " << unseen_coocur << '\n';
  of_summary << "cooccurrences found in the final t table: " << seen_coocur << '\n';
  
}
Exemplo n.º 2
0
void tmodel<COUNT, PROB>::normalizeTable(const vcbList&engl, const vcbList&french, int iter)
  // Normalizes the conditional probabilities P(fj|ei) held in the table so
  // that, for each e, the values over all f sum to 1 (after reserving some
  // mass for smoothing).
  //
  // Each entry's count is divided by the adjusted total of its first word id.
  // Entries whose result is not above PROB_CUTOFF are erased.
  // While iter > 0 the result is written back into the count field (prob is
  // zeroed) and the method calls itself with iter-1; once iter is not
  // positive, the result is stored in prob and count is zeroed.
  //
  // Side effects on members: nFrench and nEng are resized and refilled with
  // the number of table entries per first / second word id; total2 is
  // resized and refilled only on the pass where iter == 2.
{
  typedef hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> > PairTable ;

  if( iter==2 )
    {
      total2.resize(engl.uniqTokens());
      for(unsigned int n=0;n<total2.size();n++)
        total2[n]=0.0;
    }
  nFrench.resize(engl.uniqTokens());
  for(unsigned int n=0;n<nFrench.size();n++)
    nFrench[n]=0;
  nEng.resize(french.uniqTokens());
  for(unsigned int n=0;n<nEng.size();n++)
    nEng[n]=0;
  Vector<double> total(engl.uniqTokens(),0.0);

  // Pass 1: accumulate the count mass per first word id and the entry
  // counts per first / second word id.
  typename PairTable::const_iterator scan;
  for(scan = ef.begin(); scan != ef.end(); ++scan){
    const wordPairIds& ids = (*scan).first;
    const CPPair& entry = (*scan).second;
    if( iter==2 )
      total2[ids.first] += entry.count;
    total[ids.first] += entry.count;
    nFrench[ids.first]++;
    nEng[ids.second]++;
  }

  // Inflate each total so that the pairs absent from the table keep
  // PROB_SMOOTH probability each.
  for(unsigned int e=0;e<engl.uniqTokens();++e)
    {
      if( !nFrench[e] )
        continue;
      double probMass=(french.uniqTokensInCorpus()-nFrench[e])*PROB_SMOOTH;
      if( probMass<0.0 )
        cout << e << " french.uniqTokensInCorpus(): " << french.uniqTokensInCorpus() << "  nFrench[k]:"<< nFrench[e] << '\n';
      total[e]+= total[e]*probMass/(1-probMass);
    }

  // Pass 2: normalize every entry, erasing those at or below the cutoff.
  typename PairTable::iterator cur, next;
  PROB p ;
  int keptEntries=0;
  bool blended = false;
  cur = ef.begin();
  while( cur != ef.end() ){
    // Take the successor first: the current entry may be erased below.
    next = cur;
    next++ ;

    const wordPairIds& ids = (*cur).first;
    CPPair& entry = (*cur).second;

    if( total[ids.first]>0.0 )
      p = entry.count / total[ids.first] ;
    else
      p= 0.0;

    if( !(p > PROB_CUTOFF) )
      {
        erase(ids.first, ids.second);
        cur = next ;
        continue;
      }

    if( iter>0 )
      {
        // Optional interpolation with the word2vec probability, weighted by
        // word2vec.L; only applied on passes that will be renormalized.
        if(useWord2Vec && word2vec.Method == 2){
          p = word2vec.L * word2vec.getW2VProb(ids.first, ids.second) + (1. - word2vec.L) * p;
          blended = true;
        }
        entry.prob = 0 ;
        entry.count = p ;
      }
    else
      {
        entry.prob = p ;
        entry.count = 0 ;
      }
    keptEntries++;
    cur = next ;
  }

  if( blended )
    cout << "probabilities Normalized in iteration = " << iter << endl;

  // Renormalize the values just written into the count fields.
  if( iter>0 )
    normalizeTable(engl, french, iter-1);
}