예제 #1
0
void printOverlapReport(const tmodel<COUNT, PROB>& tTable, 
			sentenceHandler& testHandler,  vcbList& trainEList, 
			vcbList& trainFList, vcbList& testEList, vcbList& testFList)
{
  set<pair<WordIndex, WordIndex> > testCoocur ;
  sentPair s ;
  /*  string unseenCoocurFile = Prefix + ".tst.unseen.cooc" ;
      ofstream of_unseenCoocur(unseenCoocurFile.c_str());
      
      string seenCoocurFile = Prefix + ".tst.seen.cooc" ;
      ofstream of_seenCoocur(seenCoocurFile.c_str());
  */  
  testHandler.rewind();
  int seen_coocur = 0, unseen_coocur = 0, srcUnk = 0, trgUnk = 0 ;
  while(testHandler.getNextSentence(s)){    
    for (WordIndex i = 1 ; i < s.eSent.size() ; i++)
      for (WordIndex j = 1 ; j < s.fSent.size() ; j++)	
	testCoocur.insert(pair<WordIndex, WordIndex> (s.eSent[i], s.fSent[j])) ;
  }
  set<pair<WordIndex, WordIndex> >::const_iterator i ;
  for (i = testCoocur.begin() ; i != testCoocur.end() ; ++i){
    if (tTable.getProb((*i).first, (*i).second) > PROB_SMOOTH){
      seen_coocur ++ ;
      //      of_seenCoocur << (*i).first << ' ' << (*i).second << '\n';
    }
    else {
      unseen_coocur++;
      //      of_unseenCoocur << (*i).first << ' ' << (*i).second << '\n';
    }
  }
  
  string trgUnkFile = Prefix + ".tst.trg.unk" ;
  ofstream of_trgUnk(trgUnkFile.c_str());

  for (WordIndex i = 0 ; i <  testFList.getVocabList().size() && i < testFList.uniqTokens();i++)
    if (testFList.getVocabList()[i].freq > 0 && trainFList.getVocabList()[i].freq <= 0){
      of_trgUnk << i << ' ' << testFList.getVocabList()[i].word << ' ' << testFList.getVocabList()[i].freq
		<< '\n';
      trgUnk++ ;
    }
  string srcUnkFile = Prefix + ".tst.src.unk" ;
  ofstream of_srcUnk(srcUnkFile.c_str());

  for (WordIndex j = 0 ; j <  testEList.getVocabList().size() && j < testEList.uniqTokens();j++)
    if (testEList.getVocabList()[j].freq > 0 && trainEList.getVocabList()[j].freq <= 0){
      srcUnk++ ;
      of_srcUnk << j << ' ' << testEList.getVocabList()[j].word << ' ' << testEList.getVocabList()[j].freq
		<< '\n';
    }
  string summaryFile = Prefix + ".tst.stats" ;  
  ofstream of_summary(summaryFile.c_str());
  of_summary << "\t\t STATISTICS ABOUT TEST CORPUS\n\n";
  of_summary << "source unique tokens: " <<  testEList.uniqTokens() << '\n';
  of_summary << "target unique tokens: " <<  testFList.uniqTokens() << '\n';
  of_summary << "unique unseen source tokens: " << srcUnk << '\n';
  of_summary << "unique unseen target tokens: " << trgUnk << '\n';
  of_summary << "cooccurrences not found in the final t table: " << unseen_coocur << '\n';
  of_summary << "cooccurrences found in the final t table: " << seen_coocur << '\n';
  
}
예제 #2
0
파일: main.cpp 프로젝트: hznlp/giza-kn
void printAllTables(vcbList& eTrainVcbList, vcbList& eTestVcbList,
		    vcbList& fTrainVcbList, vcbList& fTestVcbList, model1& m1)
{
  cerr << "writing Final tables to Disk \n";
  string t_inv_file = Prefix + ".ti.final" ;
  if( !FEWDUMPS)
    m1.getTTable().printProbTableInverse(t_inv_file.c_str(), m1.getEnglishVocabList(), 
					 m1.getFrenchVocabList(), 
					 m1.getETotalWCount(), 
					 m1.getFTotalWCount());
  t_inv_file = Prefix + ".actual.ti.final" ;
  if( !FEWDUMPS )
    m1.getTTable().printProbTableInverse(t_inv_file.c_str(), 
					 eTrainVcbList.getVocabList(), 
					 fTrainVcbList.getVocabList(), 
					 m1.getETotalWCount(), 
					 m1.getFTotalWCount(), true);
  
  string perp_filename = Prefix + ".perp" ;
  ofstream of_perp(perp_filename.c_str());
  
  cout << "Writing PERPLEXITY report to: " << perp_filename << '\n';
  if(!of_perp){
    cerr << "\nERROR: Cannot write to " << perp_filename <<'\n';
    exit(1);
  }
  
  if (testCorpus)
    generatePerplexityReport(trainPerp, testPerp, trainViterbiPerp, 
			     testViterbiPerp, of_perp, (*corpus).getTotalNoPairs1(), 
			     (*testCorpus).getTotalNoPairs1(),
			     true);
  else 
    generatePerplexityReport(trainPerp, testPerp, trainViterbiPerp, testViterbiPerp, 
			     of_perp, (*corpus).getTotalNoPairs1(), 0, true);
  
  string eTrainVcbFile = Prefix + ".trn.src.vcb" ;
  ofstream of_eTrainVcb(eTrainVcbFile.c_str());
  cout << "Writing source vocabulary list to : " << eTrainVcbFile << '\n';
  if(!of_eTrainVcb){
    cerr << "\nERROR: Cannot write to " << eTrainVcbFile <<'\n';
    exit(1);
  }
  eTrainVcbList.printVocabList(of_eTrainVcb) ;
  
  string fTrainVcbFile = Prefix + ".trn.trg.vcb" ;
  ofstream of_fTrainVcb(fTrainVcbFile.c_str());
  cout << "Writing source vocabulary list to : " << fTrainVcbFile << '\n';
  if(!of_fTrainVcb){
    cerr << "\nERROR: Cannot write to " << fTrainVcbFile <<'\n';
    exit(1);
  }
  fTrainVcbList.printVocabList(of_fTrainVcb) ;
  
  //print test vocabulary list 
  
  string eTestVcbFile = Prefix + ".tst.src.vcb" ;
  ofstream of_eTestVcb(eTestVcbFile.c_str());
  cout << "Writing source vocabulary list to : " << eTestVcbFile << '\n';
  if(!of_eTestVcb){
    cerr << "\nERROR: Cannot write to " << eTestVcbFile <<'\n';
    exit(1);
  }
  eTestVcbList.printVocabList(of_eTestVcb) ;
  
  string fTestVcbFile = Prefix + ".tst.trg.vcb" ;
  ofstream of_fTestVcb(fTestVcbFile.c_str());
  cout << "Writing source vocabulary list to : " << fTestVcbFile << '\n';
  if(!of_fTestVcb){
    cerr << "\nERROR: Cannot write to " << fTestVcbFile <<'\n';
    exit(1);
  }
  fTestVcbList.printVocabList(of_fTestVcb) ;
  printDecoderConfigFile();
  if (testCorpus)
    printOverlapReport(m1.getTTable(), *testCorpus, eTrainVcbList, 
		       fTrainVcbList, eTestVcbList, fTestVcbList);
  
}