/*
 * Computes the function value at a given weight vector.
 * This should be parallelized by dividing the training examples
 * into subsets and doing each on a different thread.
 *
 * @param weights - the value of the weights
 * @return the value of the function
 */
double WFST_Trainer_Local::value2(const column_vector &weights) {
  cout << "LIKELIHOOD\n";
  double likelihood = 0.0;
  update_arc_weights(weights);

  // ``filler'' FSTs to replace the results from the composition;
  // declared outside the for loop for efficiency
  VectorFst<LogArc> medial;
  VectorFst<LogArc> final;
  medial.SetInputSymbols(fst->InputSymbols());
  medial.SetOutputSymbols(fst->OutputSymbols());
  final.SetInputSymbols(fst->InputSymbols());
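/*
 * Sketch (illustrative, not part of the original source) of the parallelization
 * the comment above asks for: split the training examples into contiguous
 * chunks, score each chunk on its own thread, and sum the partial likelihoods.
 * `num_examples` and partial_value(weights, begin, end) are hypothetical
 * helpers, and partial_value is assumed to only read shared state.
 */
#include <algorithm>
#include <thread>
#include <vector>

double WFST_Trainer_Local::value2_parallel(const column_vector &weights,
                                           int num_threads) {
  std::vector<double> partial(num_threads, 0.0);
  std::vector<std::thread> workers;
  int chunk = (num_examples + num_threads - 1) / num_threads;
  for (int t = 0; t < num_threads; ++t) {
    int begin = t * chunk;
    if (begin >= num_examples) break;
    int end = std::min(num_examples, begin + chunk);
    workers.emplace_back([&, t, begin, end]() {
      // hypothetical helper: likelihood over examples [begin, end)
      partial[t] = partial_value(weights, begin, end);
    });
  }
  for (size_t i = 0; i < workers.size(); ++i) workers[i].join();
  double likelihood = 0.0;
  for (int t = 0; t < num_threads; ++t) likelihood += partial[t];
  return likelihood;
}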
/// Functor: runs the composition itself, kept separate from load times
/// (which may include, e.g., an fst expansion).
VectorFst<Arc> *operator() () {
  if ( !fst_.NumStates() ) {
    LWARN ( "Empty lattice. ... Skipping LM application!" );
    return NULL;
  }
  while ( qc_.size() ) {
    StateId s = qc_.front();
    qc_.pop();
    pair<StateId, const typename KenLMModelT::State> p = get ( s );
    StateId& s1 = p.first;
    const typename KenLMModelT::State s2 = p.second;
    for ( ArcIterator< VectorFst<Arc> > arc1 ( fst_, s1 ); !arc1.Done(); arc1.Next() ) {
      const Arc& a1 = arc1.Value();
      float w = 0;
      float wp = wp_;
      typename KenLMModelT::State nextlmstate;
      if ( epsilons_.find ( a1.olabel ) == epsilons_.end() ) {
        w = lmmodel_.Score ( s2, idbridge_.map ( a1.olabel ), nextlmstate ) * natlog10_;
        // silly hack
        if ( a1.olabel <= 2 ) {
          wp = 0;
          if ( a1.olabel == 1 ) w = 0;  // get same result as srilm
        }
      } else {
        nextlmstate = s2;
        wp = 0;  // We don't count epsilon labels
      }
      pair<StateId, bool> nextp = add ( nextlmstate, a1.nextstate,
                                        fst_.Final ( a1.nextstate ) );
      StateId& newstate = nextp.first;
      bool visited = nextp.second;
      composed_->AddArc ( s,
                          Arc ( a1.ilabel, a1.olabel,
                                Times ( a1.weight, Times ( mw_ ( w ), mw_ ( wp ) ) ),
                                newstate ) );
      // Finally, only add newstate to the queue if it hasn't been visited previously
      if ( !visited ) {
        qc_.push ( newstate );
      }
    }
  }
  LINFO ( "Done! Number of states=" << composed_->NumStates() );
  return composed_;
};
/**
 * Constructor. Initializes on-the-fly composition with a language model.
 * \param fst       Machine you want to apply the language model to. Pass a delayed
 *                  machine if you can, as it will be expanded in the constructor.
 * \param model     A KenLM language model
 * \param epsilons  List of words to treat as epsilons
 * \param natlog    Whether to use natural logs
 * \param lmscale   Language model scale
 */
ApplyLanguageModelOnTheFly ( const Fst<Arc>& fst,
                             KenLMModelT& model,
#ifndef USE_GOOGLE_SPARSE_HASH
                             unordered_set<Label>& epsilons,
#else
                             google::dense_hash_set<Label>& epsilons,
#endif
                             bool natlog,
                             float lmscale,
                             float lmwp,
                             const IdBridgeT& idbridge )
    : composed_ ( NULL ),
      natlog10_ ( natlog ? -lmscale * ::log ( 10.0 ) : -lmscale ),
      fst_ ( fst ),
      lmmodel_ ( model ),
      vocab_ ( model.GetVocabulary() ),
      wp_ ( lmwp ),
      epsilons_ ( epsilons ),
      history ( model.Order(), 0 ),
      idbridge_ ( idbridge ) {
#ifdef USE_GOOGLE_SPARSE_HASH
  stateexistence_.set_empty_key ( numeric_limits<ull>::max() );
  statemap_.set_empty_key ( numeric_limits<uint64_t>::max() );
  basic_string<unsigned> aux ( KENLM_MAX_ORDER, numeric_limits<unsigned>::max() );
  seenlmstates_.set_empty_key ( aux );
#endif
  buffersize = ( model.Order() - 1 ) * sizeof ( unsigned int );
  buffer = const_cast<unsigned *> ( history.c_str() );
  if ( !fst_.NumStates() ) {
    LWARN ( "Empty lattice" );
    return;
  }
  composed_ = new VectorFst<Arc>;
  typename KenLMModelT::State bs = model.NullContextState();
  /// Initialize with the first state
  pair<StateId, bool> nextp = add ( bs, fst_.Start(), fst_.Final ( fst_.Start() ) );
  qc_.push ( nextp.first );
  composed_->SetStart ( nextp.first );
};
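/*
 * Usage sketch (illustrative, not part of the original source): the exact
 * template arguments of ApplyLanguageModelOnTheFly are an assumption here (an
 * arc type, a KenLM model type and an id bridge), and `lat`, `kenlm`,
 * `epsilons` and `idbridge` are assumed to be built elsewhere.
 */
ApplyLanguageModelOnTheFly<LogArc, lm::ngram::Model, IdBridge>
    applylm ( lat, kenlm, epsilons,
              true,   // natlog
              1.0f,   // lmscale
              0.0f,   // lmwp (word penalty)
              idbridge );
VectorFst<LogArc> *rescored = applylm();  // run the composition functor
if ( rescored != NULL ) rescored->Write ( "rescored.fst" );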
void process ( const FstClass& _fst, const char *output,
               const string& separator, const string* space ) {
  const Fst<Arc>& fst = *_fst.GetFst<Arc>();
  Verify ( fst );

  fst::SymbolTable *const_syms = new fst::SymbolTable ( "const syms" );
  const_syms->AddSymbol ( "<s>" );
  const_syms->AddSymbol ( "</s>" );
  const_syms->AddSymbol ( "<space>" );
  const_syms->AddSymbol ( "<phrase>" );
  const_syms->AddSymbol ( "<epsilon>" );
  const_syms->AddSymbol ( "!NULL" );

  VectorFst<Arc> ofst;
  SplitSymbols<Arc> ( fst, &ofst, separator, space, const_syms );
  delete const_syms;

  FstWriteOptions opts ( output );
  ofilter os ( output );
  ofst.Write ( os, opts );
}
M2MFstAligner::M2MFstAligner ( string _model_file, bool _penalize,
                               bool _penalize_em, bool _restrict ) {
  /*
    Initialize the aligner with a previously trained model.
    The model requires that the first several symbols in the
    symbols table contain the separator and other bookkeeping info.
  */
  restrict    = _restrict;
  penalize    = _penalize;
  penalize_em = _penalize_em;
  penalties.set_empty_key ( 0 );

  VectorFst<LogArc> *model = VectorFst<LogArc>::Read ( _model_file );
  for ( StateIterator<VectorFst<LogArc> > siter ( *model ); !siter.Done(); siter.Next() ) {
    LogArc::StateId q = siter.Value();
    for ( ArcIterator<VectorFst<LogArc> > aiter ( *model, q ); !aiter.Done(); aiter.Next() ) {
      const LogArc& arc = aiter.Value();
      alignment_model.insert ( pair<LogArc::Label, LogWeight> ( arc.ilabel, arc.weight ) );
    }
  }

  isyms = (SymbolTable*) model->InputSymbols();
  int i = 0;
  eps  = isyms->Find ( i );  // Can't write '0' here for some reason...
  skip = isyms->Find ( 1 );

  string tie = "_";  // tie used to pack parameters
  string sseps = isyms->Find ( 2 );
  vector<string> seps = tokenize_utf8_string ( &sseps, &tie );
  seq1_sep = seps[0];
  seq2_sep = seps[1];
  s1s2_sep = isyms->Find ( 3 );

  string sparams = isyms->Find ( 4 );
  vector<string> params = tokenize_utf8_string ( &sparams, &tie );
  seq1_del = params[0].compare ( "true" ) ? false : true;
  seq2_del = params[1].compare ( "true" ) ? false : true;
  seq1_max = atoi ( params[2].c_str() );
  seq2_max = atoi ( params[3].c_str() );
}
void M2MFstAligner::write_lattice ( string lattice ) {
  // Write out the entire training set in lattice format, performing the
  // union first. This output can then be plugged directly into a counter
  // to obtain expected alignment counts for the EM-trained corpus. Yields
  // far higher-quality joint n-gram models, which are also more robust
  // for smaller training corpora.
  // Make sure you call this BEFORE any call to write_all_alignments,
  // as the latter function will override some of the weights.

  // Chaining the standard Union operation, even via a rational FST,
  // still performs very poorly in the log semiring. Presumably it is
  // running push or something similar at each step; it should be fine
  // to do that just once at the end. Rolling our own union turns out
  // to be MUCH faster.
  VectorFst<LogArc> ufst;
  ufst.AddState();
  ufst.SetStart ( 0 );
  int total_states = 0;

  for ( int i = 0; i < fsas.size(); i++ ) {
    TopSort ( &fsas[i] );
    for ( StateIterator<VectorFst<LogArc> > siter ( fsas[i] ); !siter.Done(); siter.Next() ) {
      LogArc::StateId q = siter.Value();
      LogArc::StateId r;
      if ( q == 0 )
        r = 0;
      else
        r = ufst.AddState();
      for ( ArcIterator<VectorFst<LogArc> > aiter ( fsas[i], q ); !aiter.Done(); aiter.Next() ) {
        const LogArc& arc = aiter.Value();
        ufst.AddArc ( r, LogArc ( arc.ilabel, arc.ilabel, arc.weight,
                                  arc.nextstate + total_states ) );
      }
      if ( fsas[i].Final ( q ) != LogWeight::Zero() )
        ufst.SetFinal ( r, LogWeight::One() );
    }
    total_states += fsas[i].NumStates() - 1;
  }

  // Normalize weights
  Push ( &ufst, REWEIGHT_TO_INITIAL );
  // Write the resulting lattice to disk
  ufst.Write ( lattice );
  // Write the syms table too.
  isyms->WriteText ( "lattice.syms" );
  return;
}
void M2MFstAligner::write_model ( string _model_file ) {
  VectorFst<LogArc> model;
  model.AddState();
  model.SetStart ( 0 );
  model.SetFinal ( 0, LogWeight::One() );

  map<LogArc::Label, LogWeight>::iterator it;
  for ( it = alignment_model.begin(); it != alignment_model.end(); it++ )
    model.AddArc ( 0, LogArc ( (*it).first, (*it).first, (*it).second, 0 ) );

  model.SetInputSymbols ( isyms );
  model.Write ( _model_file );
  return;
}
/**
 * \brief Adds a state.
 * \return a pair (state id, visited): the bool is true if the requested
 *         state has already been visited, false otherwise.
 */
inline pair<StateId, bool> add ( typename KenLMModelT::State& m2nextstate,
                                 StateId m1nextstate,
                                 Weight m1stateweight ) {
  static StateId lm = 0;
  getIdx ( m2nextstate );
  /// New history:
  if ( seenlmstates_.find ( history ) == seenlmstates_.end() ) {
    seenlmstates_[history] = ++lm;
  }
  uint64_t compound = m1nextstate * sid + seenlmstates_[history];
  LDEBUG ( "compound id=" << compound );
  if ( stateexistence_.find ( compound ) == stateexistence_.end() ) {
    LDEBUG ( "New State!" );
    statemap_[composed_->NumStates()] =
        pair<StateId, const typename KenLMModelT::State> ( m1nextstate, m2nextstate );
    composed_->AddState();
    if ( m1stateweight != mw_ ( ZPosInfinity() ) )
      composed_->SetFinal ( composed_->NumStates() - 1, m1stateweight );
    stateexistence_[compound] = composed_->NumStates() - 1;
    return pair<StateId, bool> ( composed_->NumStates() - 1, false );
  }
  return pair<StateId, bool> ( stateexistence_[compound], true );
};
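/*
 * Note (illustrative, not from the original source): the compound key packs
 * the pair (lattice state, LM-history index) into a single integer. Assuming
 * `sid` is an upper bound on the number of distinct LM-history indices ever
 * assigned, the packing is injective and the pair can be recovered by
 * division and modulo:
 *
 *   compound     = m1nextstate * sid + historyIndex,   0 <= historyIndex < sid
 *   m1nextstate  = compound / sid
 *   historyIndex = compound % sid
 */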
void M2MFstAligner::_conditional_max ( bool y_given_x ) {
  /*
     Compute the conditional distribution, P(Y|X), using the WFST paradigm.
     This is based on the approach from Shu and Hetherington 2002.
     It is assumed that all WFSTs and operations use the Log semiring.

     Given:
           FST1 = P(X,Y)
     Compute:
           FST2 = P(X)   := Map_inv(Det(RmEps(Proj_i(FST1))))
           FST3 = P(Y|X) := Compose(FST2,FST1)

     Proj_i:  project on input labels
     RmEps:   epsilon removal
     Det:     determinize
     Map_inv: invert weights

     Notes: An analogous process may be used to compute P(X|Y). In this case
      one would project on OUTPUT labels - Proj_o, and reverse the composition
      order to Compose(FST1,FST2).

     Future work: What we are doing here in terms of *generating* the JOINT
      fst each time is really just a dumb hack. We *should* encode the model
      in an FST and encode the individual lattices, rather than doing the
      hacky manual label encoding that we currently rely on.
  */

  // Joint distribution that we start with
  VectorFst<LogArc> *joint = new VectorFst<LogArc>();
  SymbolTable *misyms = new SymbolTable ( "misyms" );
  SymbolTable *mosyms = new SymbolTable ( "mosyms" );
  joint->AddState();
  joint->AddState();
  joint->SetStart ( 0 );
  joint->SetFinal ( 1, LogArc::Weight::One() );

  map<LogArc::Label, LogWeight>::iterator it;
  for ( it = prev_alignment_model.begin(); it != prev_alignment_model.end(); it++ ) {
    string isym = isyms->Find ( (*it).first );
    vector<string> io = tokenize_utf8_string ( &isym, &s1s2_sep );
    LogArc arc ( misyms->AddSymbol ( io[0] ), mosyms->AddSymbol ( io[1] ),
                 (*it).second, 1 );
    joint->AddArc ( 0, arc );
  }
  //VectorFst<LogArc>* joint = new VectorFst<LogArc>();
  //Push<LogArc,REWEIGHT_TO_FINAL>(*_joint, joint, kPushWeights);
  //joint->SetFinal(1,LogWeight::One());
  joint->Write ( "m2mjoint.fst" );

  // BEGIN COMPUTE MARGINAL P(X)
  VectorFst<LogArc> *dmarg;
  if ( y_given_x )
    dmarg = new VectorFst<LogArc> ( ProjectFst<LogArc> ( *joint, PROJECT_INPUT ) );
  else
    dmarg = new VectorFst<LogArc> ( ProjectFst<LogArc> ( *joint, PROJECT_OUTPUT ) );
  RmEpsilon ( dmarg );
  VectorFst<LogArc> *marg = new VectorFst<LogArc>();
  Determinize ( *dmarg, marg );
  ArcMap ( marg, InvertWeightMapper<LogArc>() );
  if ( y_given_x )
    ArcSort ( marg, OLabelCompare<LogArc>() );
  else
    ArcSort ( marg, ILabelCompare<LogArc>() );
  // END COMPUTE MARGINAL P(X)
  marg->Write ( "marg.fst" );

  // CONDITIONAL P(Y|X)
  VectorFst<LogArc> *cond = new VectorFst<LogArc>();
  if ( y_given_x )
    Compose ( *marg, *joint, cond );
  else
    Compose ( *joint, *marg, cond );
  // cond now contains the conditional distribution P(Y|X)
  cond->Write ( "cond.fst" );

  // Now update the model with the new values
  for ( MutableArcIterator<VectorFst<LogArc> > aiter ( cond, 0 ); !aiter.Done(); aiter.Next() ) {
    LogArc arc = aiter.Value();
    string lab = misyms->Find ( arc.ilabel ) + "}" + mosyms->Find ( arc.olabel );
    int labi = isyms->Find ( lab );
    alignment_model[labi]      = arc.weight;
    prev_alignment_model[labi] = LogWeight::Zero();
  }

  // Note: the original `delete joint, marg, cond, dmarg;` only freed `joint`
  // (comma operator); delete each allocation explicitly instead.
  delete joint;
  delete dmarg;
  delete marg;
  delete cond;
  delete misyms;
  delete mosyms;
  return;
}
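/*
 * Why the Map_inv + Compose step yields the conditional (illustrative note,
 * assuming the projected marginal has been epsilon-removed and determinized
 * so each input string x carries a single path): in the log semiring a weight
 * is a negative natural log and composition adds weights along matching
 * paths, so
 *
 *     w_joint = -ln P(x,y)
 *     w_marg  = -ln P(x)              (after Proj_i, RmEps, Det)
 *     Map_inv: w -> -w, giving +ln P(x)
 *
 *     w_joint (+) Map_inv(w_marg) = -ln P(x,y) + ln P(x)
 *                                 = -ln( P(x,y) / P(x) )
 *                                 = -ln P(y|x)
 */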
/**
 * Create an FST based on an RNN
 */
void FlatBOFstBuilder::convertRNN(CRnnLM &rnnlm, VectorFst<LogArc> &fst) {
  queue<NeuronFstHistory> q;
  VectorFst<LogArc> new_fst;

  NeuronFstHistory fsth(rnnlm.getHiddenLayerSize(), getNumBins());
  FstIndex id = 0;

  NeuronFstHistory new_fsth(rnnlm.getHiddenLayerSize(), getNumBins());
  FstIndex new_id;

  NeuronFstHistory min_backoff(rnnlm.getHiddenLayerSize(), getNumBins());
  set<NeuronFstHistory> set_min_backoff;

  NeuronFstHistory bo_fsth(rnnlm.getHiddenLayerSize(), getNumBins());
  bool backoff = false;
  vector<FstIndex> deleted;

  real p = 0.00;
  real p_joint = 0.00;
  real entropy = 0.0;
  real delta = 0.0;
  vector<real> all_prob(rnnlm.getVocabSize());
  vector<real> posterior(10);

  map< FstIndex, set<FstIndex> > pred;
  vector<bool> non_bo_pred(rnnlm.getVocabSize());
  vector<int> to_be_added;
  vector<int> to_be_removed;
  for (int i = 0; i < rnnlm.getVocabSize(); i++) {
    to_be_removed.push_back(i);
  }
  vector<real> to_be_added_prob;

  FstIndex n_added = 0;
  FstIndex n_processed = 0;
  FstIndex next_n_added = 0;
  FstIndex next_n_processed = 0;
  FstIndex n_backoff = 0;
  FstIndex n_only_backoff = 0;

  int v = rnnlm.getVocabSize();
  int w = 0;

  // Initialize
  rnnlm.copyHiddenLayerToInput();
  // printNeurons(rnnlm.getInputLayer(),0,10);

  // Initial state ( 0 | hidden layer after </s> )
  printNeurons(rnnlm.getHiddenLayer(), 0, 10);
  fsth.setFstHistory(rnnlm, *dzer);
  fsth.setLastWord(0);
  q.push(fsth);
  addFstState(id, new NeuronFstHistory(fsth), fst);
  fst.SetStart(INIT_STATE);

  // Final state (don't care about the associated discrete representation)
  fst.AddState();
  fst.SetFinal(FINAL_STATE, LogWeight::One());

  /*posterior.at(INIT_STATE) = MY_LOG_ONE;*/

  min_backoff.setLastWord(-1);
  computeEntropyAndConditionals(entropy, all_prob, rnnlm, min_backoff);
  min_backoff = getBackoff(rnnlm, min_backoff, set_min_backoff, all_prob, to_be_removed);
  cout << "MIN BACKOFF " << min_backoff.toString() << endl;
  set_min_backoff.insert(min_backoff);

  //addFstState(id, min_backoff, fst);
  //q.push(min_backoff);

  // Estimate the number of backoff loops to bound the backoff path length
  //float ratioa = 0.0;
  //float ratiob = 0.0;
  float ratio = 0.0;
  //for (int i=0; i < min_backoff.getNumDims(); i++) {
  //  if (min_backoff.getDim(i) == 1) {
  //    ratioa++;
  //  }
  //  if (fsth.getDim(i) == 1) {
  //    ratiob++;
  //  }
  //}
  //ratioa /= min_backoff.getNumDims();
  //ratiob /= min_backoff.getNumDims();
  //ratio = (ratioa*(1.0-ratiob))+(ratiob*(1.0-ratioa));
  ratio = 1.0;
  //printf("ratio=%f\t%i BO loops\n", ratio, n_bo_loops);

  // foreach state in the queue
  while (!q.empty()) {
    fsth = q.front();
    q.pop();
    id = h2state[&fsth];
    state2h.push_back(new NeuronFstHistory(fsth));
    if (id == FINAL_STATE) { continue; }

    dprintf(1, "-- STUDY STATE %li = %s\n", id, fsth.toString().c_str());

    /*try { posterior.at(id) = MY_LOG_ONE; }
    catch (exception e) {
      posterior.resize((int) (posterior.size()*1.5)+1);
      posterior.at(id) = MY_LOG_ONE;
    }*/

    computeEntropyAndConditionals(entropy, all_prob, rnnlm, fsth);

    // compute BO in advance and check if it is a min BO node
    bo_fsth = getBackoff(rnnlm, fsth, set_min_backoff, all_prob, to_be_removed);
    if (bo_fsth == fsth) { bo_fsth = min_backoff; }

    // foreach w (ie, foreach word of each class c):
    // test if the edge has to be kept or removed
    backoff = false; // no backoff yet since no edge has been removed
    for (w = 0; w < rnnlm.getVocabSize(); w++) {
      p = all_prob[w];

      /*p_joint = exp(-posterior[id]-p);*/
      p_joint = exp(-p);
      delta = -1.0 * p_joint * log2(p_joint);

      // accept the edge if this leads to a minimum relative gain of the entropy
      dprintf(2, "P = %e \tP_joint = %e \tH = %e \tDelta =%e \tDelta H = %.6f %%\n",
              exp(-p), p_joint, entropy, delta, 100.0*delta/entropy);

      if (set_min_backoff.find(fsth) != set_min_backoff.end() || (delta > pruning_threshold*entropy)) {
      //if ((fsth == min_backoff) || (delta > pruning_threshold*entropy)) {
        next_n_added++;
        to_be_added.push_back(w);
        to_be_added_prob.push_back(p);
        dprintf(2, "\tACCEPT [%li] -- %i (%s) / %f --> ...\t(%e > %e)\n",
                id, w, rnnlm.getWordString(w), p, delta, pruning_threshold*entropy);
        //to_be_removed.push_back(w);
      }
      // backoff
      else {
        //to_be_removed.push_back(w);
        backoff = true;
        dprintf(2, "\tPRUNE [%li] -- %i / %f --> ...\n", id, w, p);
      }

      // print
      if (next_n_processed % 100000 == 0) {
        fprintf(stderr, "\rH=%.5f / N proc'd=%li / N added=%li (%.5f %%) / N bo=%li (%.5f %%) / %li/%li Nodes (%2.1f %%) / N min BO=%i",
                entropy, n_processed, n_added,
                ((float) n_added / (float) n_processed)*100.0,
                n_backoff, ((float) n_backoff / (float) n_added)*100.0,
                id, id+q.size(),
                100.0 - (float) (100.0*id/(id+q.size())),
                (int) set_min_backoff.size());
      }
      next_n_processed++;
    //  }
    }

    // Set a part of the new FST history
    new_fsth.setFstHistory(rnnlm, *dzer);

    // if at least one word is backing off
    if (backoff) {
      n_backoff++;
      if (to_be_added.size() == 0) {
        n_only_backoff++;
      }

      if (addFstState(new_id, new NeuronFstHistory(bo_fsth), fst)) {
        q.push(bo_fsth);
        try { non_bo_pred.at(new_id) = false; }
        catch (exception e) {
          non_bo_pred.resize(new_id + (int) (non_bo_pred.size()*0.5) + 1);
          non_bo_pred.at(new_id) = false;
        }
      }
      dprintf(1, "BACKOFF\t[%li]\t(%s)\n-------\t[%li]\t(%s)\n",
              id, fsth.toString().c_str(), new_id, bo_fsth.toString().c_str());

      fst.AddArc(id, LogArc(EPSILON, EPSILON, LogWeight::Zero(), new_id));
      addPred(pred, new_id, id);
    }

    vector<real>::iterator it_p = to_be_added_prob.begin();
    for (vector<int>::iterator it = to_be_added.begin(); it != to_be_added.end(); ++it) {
      w = *it;
      p = *it_p;

      if (w == 0) {
        fst.AddArc(id, LogArc(FstWord(w), FstWord(w), p, FINAL_STATE));
        dprintf(1, "EDGE [%li] (%s)\n---- %i (%s) / %f -->\n---- [%li] FINAL STATE)\n\n",
                id, fsth.toString().c_str(), FstWord(w), rnnlm.getWordString(w), p, FINAL_STATE);
      }
      // accept edge
      else {
        new_fsth.setLastWord(w);

        // if sw is not in the memory, then add a new state for sw in the FST
        // and push sw in the queue
        if (addFstState(new_id, new NeuronFstHistory(new_fsth), fst)) {
          q.push(new_fsth);
          try { non_bo_pred.at(new_id) = true; }
          catch (exception e) {
            non_bo_pred.resize(new_id + (int) (non_bo_pred.size()*0.5) + 1);
            non_bo_pred.at(new_id) = true;
          }
        }
        else { /* already exists */ }

        // add the edge in the FST
        non_bo_pred.at(new_id) = true;
        fst.AddArc(id, LogArc(FstWord(w), FstWord(w), p, new_id));
        dprintf(1, "EDGE [%li] (%s)\n---- %i (%s) / %f -->\n---- [%li] (%s)\n\n",
                id, fsth.toString().c_str(), FstWord(w), rnnlm.getWordString(w), p, new_id, new_fsth.toString().c_str());

        //posterior.at(new_id) += posterior[id]*p;
      }

      /*if (posterior[id]+p < LogWeight::Zero().Value()) {
        p_joint = exp(-posterior[id]-p);
        entropy -= p_joint*log2(p_joint);
      }*/

      ++it_p;
    }

    n_added = next_n_added;
    n_processed = next_n_processed;

    // reset queues
    to_be_added.clear();
    to_be_added_prob.clear();
    //to_be_removed.clear();
  }

  cout << endl;

  // compute backoff weights
  deleted = compactBackoffNodes(fst, pred, non_bo_pred);
  computeAllBackoff(fst, pred);

  // remove useless nodes
  removeStates(fst, new_fst, deleted);
  fst.DeleteStates();
  fst = new_fst;

  // Fill the table of symbols
  SymbolTable dic("dictionnary");
  dic.AddSymbol("*", 0);
  for (int i = 0; i < rnnlm.getVocabSize(); i++) {
    dic.AddSymbol(string(rnnlm.getWordString(i)), i+1);
  }
  fst.SetInputSymbols(&dic);
  fst.SetOutputSymbols(&dic);

  //printf("H=%.5f / N proc'd=%li / N added=%li (%.5f %%) %li/%li Nodes (%2.1f %%)\n",
  //       entropy, n_processed, n_added, ((float) n_added/ (float)n_processed)*100.0,
  //       id, id+q.size(), 100.0 - (float) (100.0*id/(id+q.size())));

  cout << "END" << endl;
}
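/*
 * Usage sketch (illustrative, not part of the original source): only
 * convertRNN() and the OpenFst Write() call are taken from the code above;
 * the builder's construction and the way `rnnlm` gets loaded are assumptions
 * and will differ in a real build.
 */
void example_convert(CRnnLM &rnnlm, FlatBOFstBuilder &builder) {
  VectorFst<LogArc> fst;
  builder.convertRNN(rnnlm, fst);  // walks the discretized hidden states and emits arcs
  fst.Write("rnn.fst");            // binary WFST with the vocabulary as symbol table
}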
vector<PathData> M2MFstAligner::write_alignment ( const VectorFst<LogArc>& ifst, int nbest ) {
  // Generic alignment generator
  VectorFst<StdArc> fst;
  Map ( ifst, &fst, LogToStdMapper() );

  for ( StateIterator<VectorFst<StdArc> > siter ( fst ); !siter.Done(); siter.Next() ) {
    StdArc::StateId q = siter.Value();
    for ( MutableArcIterator<VectorFst<StdArc> > aiter ( &fst, q ); !aiter.Done(); aiter.Next() ) {
      // Prior to decoding we make several 'heuristic' modifications to the weights:
      //  1. A multiplier is applied to any multi-token substrings.
      //  2. Any LogWeight::Zero() arc weights are reset to 999.
      //     We are basically resetting 'Infinity' values to a 'smallest non-Infinity'
      //     value so that the ShortestPath algorithm actually produces something
      //     no matter what.
      //  3. Any arcs where subseq1 and subseq2 have the same length and len(subseq1)>1
      //     are also set to 999. This forces ShortestPath to choose arcs where one of
      //     the following conditions holds:
      //       * len(subseq1)>1 && len(subseq2)!=len(subseq1)
      //       * len(subseq2)>1 && len(subseq1)!=len(subseq2)
      //       * len(subseq1)==len(subseq2)==1
      // I suspect these heuristics can be eliminated with a better choice of the
      // initialization and maximization functions, but this is the way that
      // m2m-aligner works, so it makes sense for our first-cut implementation.
      // In any case, this guarantees that M2MFstAligner produces results identical
      // to those produced by m2m-aligner - but with a bit more reliability.
      // UPDATE: this now produces a better alignment than m2m-aligner.
      //  The maxl heuristic is still in place. The aligner will produce *better*
      //  1-best alignments *without* the maxl heuristic below, BUT this comes at
      //  the cost of producing a less flexible corpus. That is, for a small
      //  training corpus like nettalk, if we use the best alignment we wind up
      //  with more 'chunks' and thus get worse coverage for unseen data. Using
      //  the alignment lattices to train the joint n-gram model solves this
      //  problem. Oh baby. Can't wait for everyone to see the paper!
      // NOTE: this is going to fail if we encounter any alignments in a new test
      //  item that never occurred in the original model.
      StdArc arc = aiter.Value();
      int maxl = get_max_length ( isyms->Find ( arc.ilabel ) );
      if ( maxl == -1 ) {
        arc.weight = 999;
      } else {
        // Optionally penalize m-to-1 / 1-to-m links. This produces WORSE 1-best
        // alignments, but results in better joint n-gram models for small training
        // corpora when using only the 1-best alignment. By further favoring 1-to-1
        // alignments, the 1-best alignment corpus results in a more flexible joint
        // n-gram model with regard to previously unseen data.
        //if( penalize==true ){
        arc.weight = alignment_model[arc.ilabel].Value() * maxl;
        //}else{
        //For larger corpora this is probably unnecessary.
        //arc.weight = alignment_model[arc.ilabel].Value();
        //}
      }
      if ( arc.weight == LogWeight::Zero() )
        arc.weight = 999;
      if ( arc.weight != arc.weight )  // NaN check
        arc.weight = 999;
      aiter.SetValue ( arc );
    }
  }

  VectorFst<StdArc> shortest;
  ShortestPath ( fst, &shortest, nbest );
  RmEpsilon ( &shortest );
  // Skip empty results. This should only happen in the following situations:
  //  1. seq1_del=false && len(seq1)<len(seq2)
  //  2. seq2_del=false && len(seq1)>len(seq2)
  // In both 1. and 2. the issue is that we need to insert a 'skip' in order to
  // guarantee at least one valid alignment path through seq1*seq2, but the user
  // params didn't allow us to. Probably better to insert these where necessary
  // during initialization, regardless of user prefs.
  if ( shortest.NumStates() == 0 ) {
    vector<PathData> dummy;
    return dummy;
  }

  FstPathFinder pathfinder ( skipSeqs );
  pathfinder.isyms = isyms;
  pathfinder.findAllStrings ( shortest );
  return pathfinder.paths;
}
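/*
 * Usage sketch (illustrative, not part of the original source): `aligner` is an
 * M2MFstAligner constructed from a trained model as above, and `lattice` is the
 * per-entry alignment lattice built elsewhere (that helper is not shown here).
 */
void example_align ( M2MFstAligner& aligner, const VectorFst<LogArc>& lattice ) {
  vector<PathData> best = aligner.write_alignment ( lattice, 1 );  // 1-best alignment
  if ( best.empty() ) {
    // No valid path: typically seq1_del/seq2_del forbade a required skip.
  }
}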
void train_model(string eps, string s1s2_sep, string skip, int order,
                 string smooth, string prefix, string seq_sep, string prune,
                 double theta, string count_pattern) {
  namespace s = fst::script;
  using fst::script::FstClass;
  using fst::script::MutableFstClass;
  using fst::script::VectorFstClass;
  using fst::script::WeightClass;

  // create symbols file
  cout << "Generating symbols..." << endl;
  NGramInput *ingram = new NGramInput(prefix + ".corpus.aligned",
                                      prefix + ".corpus.syms",
                                      "", eps, unknown_symbol, "", "");
  ingram->ReadInput(0, 1);

  // compile strings into a far archive
  cout << "Compiling symbols into FAR archive..." << endl;
  fst::FarEntryType fet = fst::StringToFarEntryType(entry_type);
  fst::FarTokenType ftt = fst::StringToFarTokenType(token_type);
  fst::FarType fartype = fst::FarTypeFromString(far_type);
  delete ingram;

  vector<string> in_fname;
  in_fname.push_back(prefix + ".corpus.aligned");
  fst::script::FarCompileStrings(in_fname, prefix + ".corpus.far", arc_type,
                                 fst_type, fartype, generate_keys, fet, ftt,
                                 prefix + ".corpus.syms", unknown_symbol,
                                 keep_symbols, initial_symbols,
                                 allow_negative_labels, file_list_input,
                                 key_prefix, key_suffix);

  // count n-grams
  cout << "Counting n-grams..." << endl;
  NGramCounter<Log64Weight> ngram_counter(order, epsilon_as_backoff);
  FstReadOptions opts;
  FarReader<StdArc> *far_reader;
  far_reader = FarReader<StdArc>::Open(prefix + ".corpus.far");
  int fstnumber = 1;
  const Fst<StdArc> *ifst = 0, *lfst = 0;
  while (!far_reader->Done()) {
    if (ifst)
      delete ifst;
    ifst = far_reader->GetFst().Copy();
    if (!ifst) {
      E_FATAL("ngramcount: unable to read fst #%d\n", fstnumber);
      //exit(1);
    }
    bool counted = false;
    if (ifst->Properties(kString | kUnweighted, true)) {
      counted = ngram_counter.Count(*ifst);
    } else {
      VectorFst<Log64Arc> log_ifst;
      Map(*ifst, &log_ifst, ToLog64Mapper<StdArc>());
      counted = ngram_counter.Count(&log_ifst);
    }
    if (!counted)
      cout << "ngramcount: fst #" << fstnumber << endl;
    if (ifst->InputSymbols() != 0) {
      // retain for the symbol table
      if (lfst)
        delete lfst;  // delete the previously observed symbol table
      lfst = ifst;
      ifst = 0;
    }
    far_reader->Next();
    ++fstnumber;
  }
  delete far_reader;

  if (!lfst) {
    E_FATAL("None of the input FSTs had a symbol table\n");
    //exit(1);
  }

  VectorFst<StdArc> vfst;
  ngram_counter.GetFst(&vfst);
  ArcSort(&vfst, StdILabelCompare());
  vfst.SetInputSymbols(lfst->InputSymbols());
  vfst.SetOutputSymbols(lfst->InputSymbols());
  vfst.Write(prefix + ".corpus.cnts");

  StdMutableFst *fst = StdMutableFst::Read(prefix + ".corpus.cnts", true);
  if (smooth != "no") {
    cout << "Smoothing model..." << endl;
    bool prefix_norm = 0;
    if (smooth == "presmoothed") {
      // only for use with randgen counts
      prefix_norm = 1;
      smooth = "unsmoothed";  // normalizes only based on prefix count
    }
    if (smooth == "kneser_ney") {
      NGramKneserNey ngram(fst, backoff, backoff_label, norm_eps,
                           check_consistency, discount_D, bins);
      ngram.MakeNGramModel();
      fst = ngram.GetMutableFst();
    } else if (smooth == "absolute") {
      NGramAbsolute ngram(fst, backoff, backoff_label, norm_eps,
                          check_consistency, discount_D, bins);
      ngram.MakeNGramModel();
      fst = ngram.GetMutableFst();
    } else if (smooth == "katz") {
      NGramKatz ngram(fst, backoff, backoff_label, norm_eps,
                      check_consistency, bins);
      ngram.MakeNGramModel();
      fst = ngram.GetMutableFst();
    } else if (smooth == "witten_bell") {
      NGramWittenBell ngram(fst, backoff, backoff_label, norm_eps,
                            check_consistency, witten_bell_k);
      ngram.MakeNGramModel();
      fst = ngram.GetMutableFst();
    } else if (smooth == "unsmoothed") {
      NGramUnsmoothed ngram(fst, 1, prefix_norm, backoff_label, norm_eps,
                            check_consistency);
      ngram.MakeNGramModel();
      fst = ngram.GetMutableFst();
    } else {
      E_FATAL("Bad smoothing method: %s\n", smooth.c_str());
    }
  }

  if (prune != "no") {
    cout << "Pruning model..." << endl;
    if (prune == "count_prune") {
      NGramCountPrune ngramsh(fst, count_pattern, shrink_opt,
                              total_unigram_count, backoff_label, norm_eps,
                              check_consistency);
      ngramsh.ShrinkNGramModel();
    } else if (prune == "relative_entropy") {
      NGramRelEntropy ngramsh(fst, theta, shrink_opt, total_unigram_count,
                              backoff_label, norm_eps, check_consistency);
      ngramsh.ShrinkNGramModel();
    } else if (prune == "seymore") {
      NGramSeymoreShrink ngramsh(fst, theta, shrink_opt, total_unigram_count,
                                 backoff_label, norm_eps, check_consistency);
      ngramsh.ShrinkNGramModel();
    } else {
      E_FATAL("Bad shrink method: %s\n", prune.c_str());
    }
  }

  cout << "Minimizing model..." << endl;
  MutableFstClass *minimized = new s::MutableFstClass(*fst);
  Minimize(minimized, 0, fst::kDelta);
  fst = minimized->GetMutableFst<StdArc>();

  cout << "Correcting final model..." << endl;
  StdMutableFst *out = new StdVectorFst();
  relabel(fst, out, prefix, eps, skip, s1s2_sep, seq_sep);

  cout << "Writing binary model to disk..." << endl;
  out->Write(prefix + ".fst");
}
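/*
 * Usage sketch (illustrative, not part of the original source): a plausible
 * invocation of train_model(). The literal values are illustrative defaults,
 * not taken from this code; globals such as entry_type, token_type and the
 * smoothing parameters are assumed to be configured elsewhere.
 */
train_model("<eps>",       // eps
            "}",           // s1s2_sep
            "_",           // skip
            7,             // order
            "kneser_ney",  // smooth
            "g2p",         // prefix: reads g2p.corpus.aligned, writes g2p.fst
            "|",           // seq_sep
            "no",          // prune
            0.0,           // theta (unused when prune == "no")
            "");           // count_pattern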