void printGIZAPars(ostream&out) { out << "general parameters:\n" "-------------------\n"; printPars(out,getGlobalParSet(),0); out << '\n'; out << "No. of iterations:\n-" "------------------\n"; printPars(out,getGlobalParSet(),PARLEV_ITER); out << '\n'; out << "parameter for various heuristics in GIZA++ for efficient training:\n" "------------------------------------------------------------------\n"; printPars(out,getGlobalParSet(),PARLEV_OPTHEUR); out << '\n'; out << "parameters for describing the type and amount of output:\n" "-----------------------------------------------------------\n"; printPars(out,getGlobalParSet(),PARLEV_OUTPUT); out << '\n'; out << "parameters describing input files:\n" "----------------------------------\n"; printPars(out,getGlobalParSet(),PARLEV_INPUT); out << '\n'; out << "smoothing parameters:\n" "---------------------\n"; printPars(out,getGlobalParSet(),PARLEV_SMOOTH); out << '\n'; out << "parameters modifying the models:\n" "--------------------------------\n"; printPars(out,getGlobalParSet(),PARLEV_MODELS); out << '\n'; out << "parameters modifying the EM-algorithm:\n" "--------------------------------------\n"; printPars(out,getGlobalParSet(),PARLEV_EM); out << '\n'; }
double StartTraining(int&result) { double errors=0.0; vcbList eTrainVcbList, fTrainVcbList; globeTrainVcbList=&eTrainVcbList; globfTrainVcbList=&fTrainVcbList; string repFilename = Prefix + ".gizacfg" ; ofstream of2(repFilename.c_str()); writeParameters(of2,getGlobalParSet(),-1) ; cout << "reading vocabulary files \n"; eTrainVcbList.setName(SourceVocabFilename.c_str()); fTrainVcbList.setName(TargetVocabFilename.c_str()); eTrainVcbList.readVocabList(); fTrainVcbList.readVocabList(); cout << "Source vocabulary list has " << eTrainVcbList.uniqTokens() << " unique tokens \n"; cout << "Target vocabulary list has " << fTrainVcbList.uniqTokens() << " unique tokens \n"; vcbList eTestVcbList(eTrainVcbList) ; vcbList fTestVcbList(fTrainVcbList) ; corpus = new sentenceHandler(CorpusFilename.c_str(), &eTrainVcbList, &fTrainVcbList); if (TestCorpusFilename == "NONE") TestCorpusFilename = ""; if (TestCorpusFilename != ""){ cout << "Test corpus will be read from: " << TestCorpusFilename << '\n'; testCorpus= new sentenceHandler(TestCorpusFilename.c_str(), &eTestVcbList, &fTestVcbList); cout << " Test total # sentence pairs : " <<(*testCorpus).getTotalNoPairs1()<<" weighted:"<<(*testCorpus).getTotalNoPairs2() <<'\n'; cout << "Size of the source portion of test corpus: " << eTestVcbList.totalVocab() << " tokens\n"; cout << "Size of the target portion of test corpus: " << fTestVcbList.totalVocab() << " tokens \n"; cout << "In source portion of the test corpus, only " << eTestVcbList.uniqTokensInCorpus() << " unique tokens appeared\n"; cout << "In target portion of the test corpus, only " << fTestVcbList.uniqTokensInCorpus() << " unique tokens appeared\n"; cout << "ratio (target/source) : " << double(fTestVcbList.totalVocab()) / eTestVcbList.totalVocab() << '\n'; } cout << " Train total # sentence pairs (weighted): " << corpus->getTotalNoPairs2() << '\n'; cout << "Size of source portion of the training corpus: " << eTrainVcbList.totalVocab()-corpus->getTotalNoPairs2() << " tokens\n"; cout << "Size of the target portion of the training corpus: " << fTrainVcbList.totalVocab() << " tokens \n"; cout << "In source portion of the training corpus, only " << eTrainVcbList.uniqTokensInCorpus() << " unique tokens appeared\n"; cout << "In target portion of the training corpus, only " << fTrainVcbList.uniqTokensInCorpus() << " unique tokens appeared\n"; cout << "lambda for PP calculation in IBM-1,IBM-2,HMM:= " << double(fTrainVcbList.totalVocab()) << "/(" << eTrainVcbList.totalVocab() << "-" << corpus->getTotalNoPairs2() << ")="; LAMBDA = double(fTrainVcbList.totalVocab()) / (eTrainVcbList.totalVocab()-corpus->getTotalNoPairs2()); cout << "= " << LAMBDA << '\n'; // load dictionary Dictionary *dictionary; useDict = !dictionary_Filename.empty(); if (useDict) dictionary = new Dictionary(dictionary_Filename.c_str()); else dictionary = new Dictionary(""); int minIter=0; #ifdef BINARY_SEARCH_FOR_TTABLE if( CoocurrenceFile.length()==0 ) { cerr << "ERROR: NO COOCURRENCE FILE GIVEN!\n"; abort(); } //ifstream coocs(CoocurrenceFile.c_str()); tmodel<COUNT, PROB> tTable(CoocurrenceFile); #else tmodel<COUNT, PROB> tTable; #endif model1 m1(CorpusFilename.c_str(), eTrainVcbList, fTrainVcbList,tTable,trainPerp, *corpus,&testPerp, testCorpus, trainViterbiPerp, &testViterbiPerp); amodel<PROB> aTable(false); amodel<COUNT> aCountTable(false); model2 m2(m1,aTable,aCountTable); hmm h(m2); model3 m3(m2); if(ReadTablePrefix.length() ) { string number = "final"; string tfile,afilennfile,dfile,d4file,p0file,afile,nfile; //d5file tfile = ReadTablePrefix + ".t3." + number ; afile = ReadTablePrefix + ".a3." + number ; nfile = ReadTablePrefix + ".n3." + number ; dfile = ReadTablePrefix + ".d3." + number ; d4file = ReadTablePrefix + ".d4." + number ; //d5file = ReadTablePrefix + ".d5." + number ; p0file = ReadTablePrefix + ".p0_3." + number ; tTable.readProbTable(tfile.c_str()); aTable.readTable(afile.c_str()); m3.dTable.readTable(dfile.c_str()); m3.nTable.readNTable(nfile.c_str()); sentPair sent ; double p0; ifstream p0f(p0file.c_str()); p0f >> p0; d4model d4m(MAX_SENTENCE_LENGTH); d4m.makeWordClasses(m1.Elist,m1.Flist,SourceVocabFilename+".classes",TargetVocabFilename+".classes"); d4m.readProbTable(d4file.c_str()); //d5model d5m(d4m); //d5m.makeWordClasses(m1.Elist,m1.Flist,SourceVocabFilename+".classes",TargetVocabFilename+".classes"); //d5m.readProbTable(d5file.c_str()); makeSetCommand("model4smoothfactor","0.0",getGlobalParSet(),2); //makeSetCommand("model5smoothfactor","0.0",getGlobalParSet(),2); if( corpus||testCorpus ) { sentenceHandler *x=corpus; if(x==0) x=testCorpus; cout << "Text corpus exists.\n"; x->rewind(); while(x&&x->getNextSentence(sent)){ Vector<WordIndex>& es = sent.eSent; Vector<WordIndex>& fs = sent.fSent; int l=es.size()-1; int m=fs.size()-1; transpair_model4 tm4(es,fs,m1.tTable,m2.aTable,m3.dTable,m3.nTable,1-p0,p0,&d4m); alignment al(l,m); cout << "I use the alignment " << sent.sentenceNo-1 << '\n'; //convert(ReferenceAlignment[sent.sentenceNo-1],al); transpair_model3 tm3(es,fs,m1.tTable,m2.aTable,m3.dTable,m3.nTable,1-p0,p0,0); double p=tm3.prob_of_target_and_alignment_given_source(al,1); cout << "Sentence " << sent.sentenceNo << " has IBM-3 prob " << p << '\n'; p=tm4.prob_of_target_and_alignment_given_source(al,3,1); cout << "Sentence " << sent.sentenceNo << " has IBM-4 prob " << p << '\n'; //transpair_model5 tm5(es,fs,m1.tTable,m2.aTable,m3.dTable,m3.nTable,1-p0,p0,&d5m); //p=tm5.prob_of_target_and_alignment_given_source(al,3,1); //cout << "Sentence " << sent.sentenceNo << " has IBM-5 prob " << p << '\n'; } } else { cout << "No corpus exists.\n"; } }
void parseConfigFile (char * fname ) // This functions reads in the configuration file to set up some run-time // parameters. The parameters are global variables that are defined in // main.cc and used all over the place in the program // The format of the configuration file can be explained in the following way // FORMAT: // the character '\n' separates lines .. // lines that start with "//" (skipping over white spaces are considered // as comments and will be ignored. // Any other line is considered as an attribute setting instruction and it // is divided into haves (separated by a colon ":"). The first half is the // attribute value which consists of the concatenation of all non-white space // tokens before the colon. These tokens will have spaces eseparating them. // The attribute vlue is the first token after the colon (any thing after // it will be ignored ; // For example : // if the configuration file has the following entry: // // NO. ITERATIONS MODEL 2 : 10 // // then the attribute is "NO. ITERATIONS MODEL 2" , and the attribute value // is "10" (these do not include the quotation marks). { string line, word, attrib, attribval ; ifstream Config_File(fname); if(!Config_File){ cerr << "ERROR: Cannot open configuration file " << fname << "!\n" ; exit(1); } cout << "The following options are from the config file and will be overwritten by any command line options.\n"; while(getline(Config_File, line)){ istrstream buffer(line.c_str()); word = attrib = attribval = "" ; buffer >> word ; if (word != "//"){ // if line does not start with "//" (i.e. not a comment) attrib = word ; while((buffer >> word) && (word != ":")){ attrib += " " + word ; } if(!(buffer >> attribval)) { istrstream buffer2(line.c_str()); buffer2>>attrib; buffer2>>attribval; } // This# is where (1) the configuration file is defined and // (2) parsing of its attributes occurs. if(attrib == "t FILE"){ t_Filename = attribval; cout << "\tt file: " << t_Filename << '\n'; } else if(attrib == "a FILE"){ a_Filename = attribval; cout << "\ta file: " << a_Filename << '\n'; } else if(attrib == "d FILE"){ d_Filename = attribval; cout << "\td file: " << d_Filename << '\n'; } else if(attrib == "n FILE"){ n_Filename = attribval; cout << "\tn file: " << n_Filename << '\n'; } else if(attrib == "p0 FILE"){ p0_Filename = attribval; cout << "\tp0 file: " << p0_Filename << '\n'; } else if ( line == ""){} else if( !makeSetCommand(attrib,attribval,getGlobalParSet(),2) ) cerr << "ERROR: Unrecognized attribute :" << attrib << '\n'; } }