コード例 #1
0
ファイル: main.cpp プロジェクト: hznlp/giza-kn
// Writes the global parameter set to `out` as a sequence of titled sections.
// Each section consists of a heading (title line plus dashed underline), one
// printPars() call on getGlobalParSet() with that section's level argument,
// and a terminating newline. Sections are emitted in the order of the table.
void printGIZAPars(ostream&out)
{
  // Heading text and the third argument passed to printPars() for a section.
  struct ParSection
  {
    const char *heading;
    int level;
  };

  static const ParSection sections[] = {
    { "general parameters:\n"
      "-------------------\n",
      0 },
    { "No. of iterations:\n-"
      "------------------\n",
      PARLEV_ITER },
    { "parameter for various heuristics in GIZA++ for efficient training:\n"
      "------------------------------------------------------------------\n",
      PARLEV_OPTHEUR },
    { "parameters for describing the type and amount of output:\n"
      "-----------------------------------------------------------\n",
      PARLEV_OUTPUT },
    { "parameters describing input files:\n"
      "----------------------------------\n",
      PARLEV_INPUT },
    { "smoothing parameters:\n"
      "---------------------\n",
      PARLEV_SMOOTH },
    { "parameters modifying the models:\n"
      "--------------------------------\n",
      PARLEV_MODELS },
    { "parameters modifying the EM-algorithm:\n"
      "--------------------------------------\n",
      PARLEV_EM }
  };
  const unsigned sectionCount = sizeof(sections) / sizeof(sections[0]);

  for (unsigned idx = 0; idx < sectionCount; ++idx)
    {
      const ParSection &section = sections[idx];
      out << section.heading;
      printPars(out, getGlobalParSet(), section.level);
      out << '\n';
    }
}
コード例 #2
0
ファイル: main.cpp プロジェクト: hznlp/giza-kn
// Sets up a training run. In order, the code below:
//   1. publishes the local vocabulary lists through the globals
//      globeTrainVcbList / globfTrainVcbList,
//   2. writes the global parameter set to "<Prefix>.gizacfg",
//   3. reads the source/target vocabularies and opens the training corpus
//      (and the test corpus when TestCorpusFilename is non-empty and not
//      "NONE"), printing size statistics for each,
//   4. computes LAMBDA from the training vocabulary totals,
//   5. constructs the dictionary, the t-table and the model1/model2/hmm/model3
//      objects,
//   6. when ReadTablePrefix is non-empty, loads previously written ".final"
//      tables and prints an IBM-3 and an IBM-4 probability for every sentence
//      of the corpus.
// `result` is not referenced anywhere in the code below.
double StartTraining(int&result)
{ 
  double errors=0.0;
  vcbList eTrainVcbList, fTrainVcbList;
  // NOTE(review): these globals now hold the addresses of function locals, so
  // they must not be dereferenced once this function has returned.
  globeTrainVcbList=&eTrainVcbList;
  globfTrainVcbList=&fTrainVcbList;


  // Dump the effective configuration next to the other output files.
  // NOTE(review): of2 is never checked, so a failure to open the file goes
  // unnoticed here.
  string repFilename = Prefix + ".gizacfg" ;
  ofstream of2(repFilename.c_str());
  writeParameters(of2,getGlobalParSet(),-1) ;

  cout << "reading vocabulary files \n";
  eTrainVcbList.setName(SourceVocabFilename.c_str());
  fTrainVcbList.setName(TargetVocabFilename.c_str());
  eTrainVcbList.readVocabList();
  fTrainVcbList.readVocabList();
  cout << "Source vocabulary list has " << eTrainVcbList.uniqTokens() << " unique tokens \n";
  cout << "Target vocabulary list has " << fTrainVcbList.uniqTokens() << " unique tokens \n";
  
  // The test lists start as copies of the training lists, taken before the
  // training corpus handler is constructed with pointers to the originals.
  vcbList eTestVcbList(eTrainVcbList) ;
  vcbList fTestVcbList(fTrainVcbList) ;
  
  corpus = new sentenceHandler(CorpusFilename.c_str(), &eTrainVcbList, &fTrainVcbList);

  // "NONE" is accepted as an explicit way of saying "no test corpus".
  if (TestCorpusFilename == "NONE")
    TestCorpusFilename = "";

  if (TestCorpusFilename != ""){
    cout << "Test corpus will be read from: " << TestCorpusFilename << '\n';
      testCorpus= new sentenceHandler(TestCorpusFilename.c_str(), 
						       &eTestVcbList, &fTestVcbList);
      cout << " Test total # sentence pairs : " <<(*testCorpus).getTotalNoPairs1()<<" weighted:"<<(*testCorpus).getTotalNoPairs2() <<'\n';

      cout << "Size of the source portion of test corpus: " << eTestVcbList.totalVocab() << " tokens\n";
      cout << "Size of the target portion of test corpus: " << fTestVcbList.totalVocab() << " tokens \n";
      cout << "In source portion of the test corpus, only " << eTestVcbList.uniqTokensInCorpus() << " unique tokens appeared\n";
      cout << "In target portion of the test corpus, only " << fTestVcbList.uniqTokensInCorpus() << " unique tokens appeared\n";
      // Floating-point division (left operand is cast to double); the divisor
      // is not checked for zero.
      cout << "ratio (target/source) : " << double(fTestVcbList.totalVocab()) /
	eTestVcbList.totalVocab() << '\n';
  }
  
  cout << " Train total # sentence pairs (weighted): " << corpus->getTotalNoPairs2() << '\n';
  // The source size is reported as totalVocab() minus the weighted pair count;
  // the same difference is the denominator of LAMBDA below.
  // NOTE(review): presumably this discounts one extra source token per
  // sentence pair - confirm against sentenceHandler.
  cout << "Size of source portion of the training corpus: " << eTrainVcbList.totalVocab()-corpus->getTotalNoPairs2() << " tokens\n";
  cout << "Size of the target portion of the training corpus: " << fTrainVcbList.totalVocab() << " tokens \n";
  cout << "In source portion of the training corpus, only " << eTrainVcbList.uniqTokensInCorpus() << " unique tokens appeared\n";
  cout << "In target portion of the training corpus, only " << fTrainVcbList.uniqTokensInCorpus() << " unique tokens appeared\n";
  // LAMBDA = target tokens / (source tokens - weighted sentence pairs).
  // The denominator is not checked for zero.
  cout << "lambda for PP calculation in IBM-1,IBM-2,HMM:= " << double(fTrainVcbList.totalVocab()) << "/(" << eTrainVcbList.totalVocab() << "-" << corpus->getTotalNoPairs2() << ")=";
  LAMBDA = double(fTrainVcbList.totalVocab()) / (eTrainVcbList.totalVocab()-corpus->getTotalNoPairs2());
  cout << "= " << LAMBDA << '\n';
  // Load the dictionary; an empty file name is passed when none was given.
  // NOTE(review): the object is allocated with new and neither used nor
  // deleted in the code below.
  Dictionary *dictionary;  
  useDict = !dictionary_Filename.empty();
  if (useDict) dictionary = new Dictionary(dictionary_Filename.c_str());
  else dictionary = new Dictionary("");
  int minIter=0;
#ifdef BINARY_SEARCH_FOR_TTABLE
  // In this build configuration the t-table is constructed from the
  // cooccurrence file, so a missing file name is fatal.
  if( CoocurrenceFile.length()==0 )
    {
      cerr << "ERROR: NO COOCURRENCE FILE GIVEN!\n";
      abort();
    }
  //ifstream coocs(CoocurrenceFile.c_str());
  tmodel<COUNT, PROB> tTable(CoocurrenceFile);
#else
  tmodel<COUNT, PROB> tTable;
#endif

  // Build the model objects; each later model is constructed from an earlier
  // one (m2 from m1, h and m3 from m2).
  model1 m1(CorpusFilename.c_str(), eTrainVcbList, fTrainVcbList,tTable,trainPerp, 
	    *corpus,&testPerp, testCorpus, 
	    trainViterbiPerp, &testViterbiPerp);
   amodel<PROB>  aTable(false);
   amodel<COUNT> aCountTable(false);
   model2 m2(m1,aTable,aCountTable);
   hmm h(m2);
   model3 m3(m2); 
   // Optional path: load tables named "<ReadTablePrefix>.<kind>.final" and
   // score the corpus with them.
   if(ReadTablePrefix.length() )
     {
       string number = "final";
       // NOTE(review): `afilennfile` is never used below; afile and nfile are
       // declared separately later in the same list.
       string tfile,afilennfile,dfile,d4file,p0file,afile,nfile; //d5file
       tfile = ReadTablePrefix + ".t3." + number ;
       afile = ReadTablePrefix + ".a3." + number ;
       nfile = ReadTablePrefix + ".n3." + number ;
       dfile = ReadTablePrefix + ".d3." + number ;
       d4file = ReadTablePrefix + ".d4." + number ;
       //d5file = ReadTablePrefix + ".d5." + number ;
       p0file = ReadTablePrefix + ".p0_3." + number ;
       tTable.readProbTable(tfile.c_str());
       aTable.readTable(afile.c_str());
       m3.dTable.readTable(dfile.c_str());
       m3.nTable.readNTable(nfile.c_str());
       sentPair sent ;
       // NOTE(review): p0 has no initializer and the stream state is not
       // checked, so p0 is indeterminate if the file is missing or malformed.
       double p0;
       ifstream p0f(p0file.c_str());
       p0f >> p0;
       d4model d4m(MAX_SENTENCE_LENGTH);
       d4m.makeWordClasses(m1.Elist,m1.Flist,SourceVocabFilename+".classes",TargetVocabFilename+".classes");
       d4m.readProbTable(d4file.c_str());
       //d5model d5m(d4m);
       //d5m.makeWordClasses(m1.Elist,m1.Flist,SourceVocabFilename+".classes",TargetVocabFilename+".classes");
       //d5m.readProbTable(d5file.c_str());
       // Overrides the "model4smoothfactor" parameter with "0.0" in the global
       // parameter set before scoring.
       makeSetCommand("model4smoothfactor","0.0",getGlobalParSet(),2);
       //makeSetCommand("model5smoothfactor","0.0",getGlobalParSet(),2);
       if( corpus||testCorpus )
	 {
	   // Prefer the training corpus; fall back to the test corpus.
	   sentenceHandler *x=corpus;
	   if(x==0)
	     x=testCorpus;
	   cout << "Text corpus exists.\n";
	   x->rewind();
	   // NOTE(review): the `x&&` test is redundant - x was already
	   // dereferenced by rewind() above and is not reassigned in the loop.
	   while(x&&x->getNextSentence(sent)){
	     Vector<WordIndex>& es = sent.eSent;
	     Vector<WordIndex>& fs = sent.fSent;
	     // l and m are the sentence vector sizes minus one.
	     int l=es.size()-1;
	     int m=fs.size()-1;
	     transpair_model4 tm4(es,fs,m1.tTable,m2.aTable,m3.dTable,m3.nTable,1-p0,p0,&d4m);
	     // NOTE(review): with the convert() call below commented out, `al`
	     // is scored exactly as alignment(l,m) constructs it, despite the
	     // "I use the alignment" message.
	     alignment al(l,m);
	     cout << "I use the alignment " << sent.sentenceNo-1 << '\n';
	     //convert(ReferenceAlignment[sent.sentenceNo-1],al);
	     transpair_model3 tm3(es,fs,m1.tTable,m2.aTable,m3.dTable,m3.nTable,1-p0,p0,0);
	     double p=tm3.prob_of_target_and_alignment_given_source(al,1);
	     cout << "Sentence " << sent.sentenceNo << " has IBM-3 prob " << p << '\n';
	     p=tm4.prob_of_target_and_alignment_given_source(al,3,1);
	     cout << "Sentence " << sent.sentenceNo << " has IBM-4 prob " << p << '\n';
	     //transpair_model5 tm5(es,fs,m1.tTable,m2.aTable,m3.dTable,m3.nTable,1-p0,p0,&d5m);
	     //p=tm5.prob_of_target_and_alignment_given_source(al,3,1);
	     //cout << "Sentence " << sent.sentenceNo << " has IBM-5 prob " << p << '\n';
	   }
	 }
       else
	 {
	   cout << "No corpus exists.\n";
	 }
    }
コード例 #3
0
ファイル: parse.cpp プロジェクト: A30041839/joshua
void parseConfigFile (char * fname )
  // This function reads in the configuration file named by `fname` to set up
  // some run-time parameters. The parameters are global variables that are
  // defined in main.cc and used all over the place in the program.
  // If the file cannot be opened, an error is printed and the program exits
  // with status 1.
  //
  // FORMAT:
  // The character '\n' separates lines.
  // Lines whose first whitespace-delimited token is exactly "//" are
  // considered comments and are ignored.
  // Any other line is considered an attribute setting instruction and is
  // divided into halves (separated by a colon ":" that stands alone as a
  // token). The first half is the attribute name, which consists of the
  // concatenation of all non-whitespace tokens before the colon, joined by
  // single spaces. The attribute value is the first token after the colon
  // (anything after it is ignored).
  // For example:
  // if the configuration file has the following entry:
  //
  // NO.   ITERATIONS   MODEL 2 :	10
  //
  // then the attribute is "NO. ITERATIONS MODEL 2", and the attribute value
  // is "10" (these do not include the quotation marks).

{

  string line, word, attrib, attribval ;
  ifstream Config_File(fname);
  if(!Config_File){
    cerr << "ERROR:  Cannot open configuration file " << fname << "!\n" ;
    exit(1);
  }

  cout << "The following options are from the config file and will be overwritten by any command line options.\n";
  
  while(getline(Config_File, line)){

    // NOTE(review): istrstream comes from the deprecated <strstream> header.
    istrstream buffer(line.c_str());
    word = attrib = attribval = "" ;
    buffer >> word  ;
    // Only a stand-alone "//" token marks a comment; a line starting with
    // "//text" (no space) is not skipped by this test.
    if (word != "//"){ // if line does not start with "//" (i.e. not a comment)
      // Collect the attribute name: every token up to a stand-alone ":".
      attrib = word ;
      while((buffer >> word) && (word != ":")){
	attrib += " " + word ;
      }      
      // No token could be read after the colon (or no stand-alone colon was
      // found): re-read the line as "<attribute> <value>", taking the first
      // two tokens.
      if(!(buffer >> attribval))
	{
	  istrstream buffer2(line.c_str());
	  buffer2>>attrib;
	  buffer2>>attribval;
	}

      // This is where (1) the configuration file is defined and
      //               (2) parsing of its attributes occurs.
      
      // The table file names are handled here directly; every other attribute
      // is handed to makeSetCommand() on the global parameter set.
      if(attrib == "t FILE"){
	t_Filename = attribval;
	cout << "\tt file:  " << t_Filename << '\n';
      }
      else if(attrib ==  "a FILE"){
	a_Filename = attribval;
	cout << "\ta file:  " << a_Filename << '\n';
      }
      else if(attrib == "d FILE"){
	d_Filename = attribval;
	cout << "\td file:  " << d_Filename << '\n';
      }
      else if(attrib == "n FILE"){
	n_Filename = attribval;
	cout << "\tn file:  " << n_Filename << '\n';
      }
      else if(attrib == "p0 FILE"){
	p0_Filename = attribval;
	cout << "\tp0 file:  " << p0_Filename << '\n';
      }
      // Completely empty lines are skipped silently. A line containing only
      // whitespace is not caught here and reaches makeSetCommand() with an
      // empty attribute.
      else if ( line == ""){}
      else if(  !makeSetCommand(attrib,attribval,getGlobalParSet(),2) )
	cerr << "ERROR: Unrecognized attribute :" << attrib << '\n';
    }
  }