///functor: Run the composition itself, separate from load times (which may include, e.g., an fst expansion).
VectorFst<Arc> *operator() () {
  if ( !fst_.NumStates() ) {
    LWARN ( "Empty lattice. ... Skipping LM application!" );
    return NULL;
  }
  while ( qc_.size() ) {
    StateId s = qc_.front();
    qc_.pop();
    pair<StateId, const typename KenLMModelT::State> p = get ( s );
    StateId& s1 = p.first;
    const typename KenLMModelT::State s2 = p.second;
    for ( ArcIterator< VectorFst<Arc> > arc1 ( fst_, s1 ); !arc1.Done(); arc1.Next() ) {
      const Arc& a1 = arc1.Value();
      float w = 0;
      float wp = wp_;
      typename KenLMModelT::State nextlmstate;
      if ( epsilons_.find ( a1.olabel ) == epsilons_.end() ) {
        w = lmmodel_.Score ( s2, idbridge_.map ( a1.olabel ), nextlmstate ) * natlog10_;
        //silly hack: special-case the reserved low labels
        if ( a1.olabel <= 2 ) {
          wp = 0;
          if ( a1.olabel == 1 ) w = 0; //get same result as srilm
        }
      } else {
        nextlmstate = s2;
        wp = 0; //We don't count epsilon labels
      }
      pair<StateId, bool> nextp = add ( nextlmstate, a1.nextstate, fst_.Final ( a1.nextstate ) );
      StateId& newstate = nextp.first;
      bool visited = nextp.second;
      composed_->AddArc ( s, Arc ( a1.ilabel, a1.olabel,
                                   Times ( a1.weight, Times ( mw_ ( w ), mw_ ( wp ) ) ),
                                   newstate ) );
      //Finally, only add newstate to the queue if it hasn't been visited previously
      if ( !visited ) {
        qc_.push ( newstate );
      }
    }
  }
  LINFO ( "Done! Number of states=" << composed_->NumStates() );
  return composed_;
};
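// A minimal standalone sketch (not part of the class) of the weight conversion
// used in operator() above. It assumes, as KenLM documents, that Score() returns
// a log10 probability; multiplying by natlog10_ = -lmscale * ln(10) turns it into
// a scaled negative natural-log cost, the form the semiring weights expect.
// All names below are illustrative:
//
//   float log10prob = -0.30103f;                  // e.g. p(word | context) = 0.5
//   float lmscale   = 1.0f;
//   float natlog10  = -lmscale * ::log ( 10.0 );  // ~ -2.302585
//   float cost      = log10prob * natlog10;       // ~ 0.693147 = -ln(0.5)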
/**
 * \brief Adds a state.
 * \return a pair containing the id of the state in the composed machine, plus a
 * flag that is true if the requested state has already been visited, false otherwise.
 */
inline pair<StateId, bool> add ( typename KenLMModelT::State& m2nextstate,
                                 StateId m1nextstate, Weight m1stateweight ) {
  static StateId lm = 0;
  getIdx ( m2nextstate );
  ///New history:
  if ( seenlmstates_.find ( history ) == seenlmstates_.end() ) {
    seenlmstates_[history] = ++lm;
  }
  uint64_t compound = m1nextstate * sid + seenlmstates_[history];
  LDEBUG ( "compound id=" << compound );
  if ( stateexistence_.find ( compound ) == stateexistence_.end() ) {
    LDEBUG ( "New State!" );
    statemap_[composed_->NumStates()] =
      pair<StateId, const typename KenLMModelT::State> ( m1nextstate, m2nextstate );
    composed_->AddState();
    if ( m1stateweight != mw_ ( ZPosInfinity() ) )
      composed_->SetFinal ( composed_->NumStates() - 1, m1stateweight );
    stateexistence_[compound] = composed_->NumStates() - 1;
    return pair<StateId, bool> ( composed_->NumStates() - 1, false );
  }
  return pair<StateId, bool> ( stateexistence_[compound], true );
};
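// A minimal sketch of the compound-key scheme used by add() above: the pair
// (lattice state, LM history id) is packed into one 64-bit key, so a single
// hash lookup decides whether the composed state already exists. This assumes
// sid is strictly larger than any history id ever assigned, so distinct pairs
// cannot collide. makeKey is a hypothetical helper, not part of the class:
//
//   uint64_t makeKey ( uint64_t latticeState, uint64_t historyId, uint64_t sid ) {
//     return latticeState * sid + historyId;  // unique while historyId < sid
//   }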
/**
 * Constructor. Initializes on-the-fly composition with a language model.
 * \param fst      Machine you want to apply the language model to. Pass a delayed
 *                 machine if you can, as it will be expanded in the constructor.
 * \param model    A KenLM language model.
 * \param epsilons List of words to treat as epsilons.
 * \param natlog   Whether or not to use natural logs.
 * \param lmscale  Language model scale.
 * \param lmwp     Language model word penalty (not applied to epsilon labels).
 * \param idbridge Maps output labels to language model vocabulary ids.
 */
ApplyLanguageModelOnTheFly ( const Fst<Arc>& fst,
                             KenLMModelT& model,
#ifndef USE_GOOGLE_SPARSE_HASH
                             unordered_set<Label>& epsilons,
#else
                             google::dense_hash_set<Label>& epsilons,
#endif
                             bool natlog, float lmscale, float lmwp,
                             const IdBridgeT& idbridge )
  : composed_ ( NULL ),
    natlog10_ ( natlog ? -lmscale * ::log ( 10.0 ) : -lmscale ),
    fst_ ( fst ),
    lmmodel_ ( model ),
    vocab_ ( model.GetVocabulary() ),
    wp_ ( lmwp ),
    epsilons_ ( epsilons ),
    history ( model.Order(), 0 ),
    idbridge_ ( idbridge ) {
#ifdef USE_GOOGLE_SPARSE_HASH
  stateexistence_.set_empty_key ( numeric_limits<ull>::max() );
  statemap_.set_empty_key ( numeric_limits<uint64_t>::max() );
  basic_string<unsigned> aux ( KENLM_MAX_ORDER, numeric_limits<unsigned>::max() );
  seenlmstates_.set_empty_key ( aux );
#endif
  buffersize = ( model.Order() - 1 ) * sizeof ( unsigned int );
  buffer = const_cast<unsigned *> ( history.c_str() );
  if ( !fst_.NumStates() ) {
    LWARN ( "Empty lattice" );
    return;
  }
  composed_ = new VectorFst<Arc>;
  typename KenLMModelT::State bs = model.NullContextState();
  ///Initialize with first state
  pair<StateId, bool> nextp = add ( bs, fst_.Start(), fst_.Final ( fst_.Start() ) );
  qc_.push ( nextp.first );
  composed_->SetStart ( nextp.first );
};
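// A minimal usage sketch, not taken from the original sources: lattice, model,
// epsilons and idbridge are placeholder variables, and the template arguments
// are only an assumption about how the class is typically instantiated.
// Ownership of the returned pointer is not settled by this excerpt.
//
//   ApplyLanguageModelOnTheFly<Arc, KenLMModelT, IdBridgeT>
//       applylm ( lattice, model, epsilons,
//                 true /*natlog*/, 1.0f /*lmscale*/, 0.0f /*lmwp*/, idbridge );
//   VectorFst<Arc> *scored = applylm();  // runs the composition (operator() above)
//   if ( scored != NULL ) {
//     // ... use the rescored lattice ...
//   }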
vector<PathData> M2MFstAligner::write_alignment(const VectorFst<LogArc> &ifst,
                                                int nbest) {
  //Generic alignment generator
  VectorFst<StdArc> fst;
  Map(ifst, &fst, LogToStdMapper());

  for (StateIterator<VectorFst<StdArc> > siter(fst); !siter.Done();
       siter.Next()) {
    StdArc::StateId q = siter.Value();
    for (MutableArcIterator<VectorFst<StdArc> > aiter(&fst, q); !aiter.Done();
         aiter.Next()) {
      //Prior to decoding we make several 'heuristic' modifications to the weights:
      // 1. A multiplier is applied to any multi-token substrings
      // 2. Any LogWeight::Zero() arc weights are reset to '999'.
      //    We are basically resetting 'Infinity' values to a 'smallest non-Infinity'
      //    value so that the ShortestPath algorithm produces something no matter what.
      // 3. Any arcs of the form subseq1:subseq2 where both subsequences have the
      //    same length and len(subseq1)>1 are set to '999'. This forces ShortestPath
      //    to choose arcs where one of the following conditions holds:
      //     * len(subseq1)>1 && len(subseq2)!=len(subseq1)
      //     * len(subseq2)>1 && len(subseq1)!=len(subseq2)
      //     * len(subseq1)==len(subseq2)==1
      //I suspect these heuristics can be eliminated with a better choice of
      // initialization and maximization functions, but this is the way that
      // m2m-aligner works, so it makes sense for our first-cut implementation.
      //In any case, this guarantees that M2MFstAligner produces results identical
      // to those produced by m2m-aligner - but with a bit more reliability.
      //UPDATE: this now produces a better alignment than m2m-aligner.
      // The maxl heuristic is still in place. The aligner will produce *better*
      // 1-best alignments *without* the maxl heuristic below, BUT this comes at
      // the cost of producing a less flexible corpus. That is, for a small
      // training corpus like nettalk, if we use the best alignment we wind up
      // with more 'chunks' and thus get worse coverage for unseen data. Using
      // the alignment lattices to train the joint ngram model solves this
      // problem. Oh baby. Can't wait for everyone to see the paper!
      //NOTE: this is going to fail if we encounter any alignments in a new test
      // item that never occurred in the original model.
      StdArc arc = aiter.Value();
      int maxl = get_max_length(isyms->Find(arc.ilabel));
      if (maxl == -1) {
        arc.weight = 999;
      } else {
        //Optionally penalize m-to-1 / 1-to-m links. This produces WORSE 1-best
        // alignments, but results in better joint n-gram models for small
        // training corpora when using only the 1-best alignment. By further
        // favoring 1-to-1 alignments the 1-best alignment corpus results in a
        // more flexible joint n-gram model with regard to previously unseen data.
        //if( penalize==true ){
        arc.weight = alignment_model[arc.ilabel].Value() * maxl;
        //}else{
        //For larger corpora this is probably unnecessary.
        //arc.weight = alignment_model[arc.ilabel].Value();
        //}
      }
      //Reset 'Infinity' and NaN weights so ShortestPath always has a path to pick.
      if (arc.weight == LogWeight::Zero())
        arc.weight = 999;
      if (arc.weight != arc.weight)  //NaN never compares equal to itself
        arc.weight = 999;
      aiter.SetValue(arc);
    }
  }

  VectorFst<StdArc> shortest;
  ShortestPath(fst, &shortest, nbest);
  RmEpsilon(&shortest);
  //Skip empty results. This should only happen in the following situations:
  // 1. seq1_del==false && len(seq1)<len(seq2)
  // 2. seq2_del==false && len(seq1)>len(seq2)
  //In both 1. and 2. the issue is that we would need to insert a 'skip' in
  // order to guarantee at least one valid alignment path through seq1*seq2,
  // but the user params didn't allow us to.
  //Probably better to insert these where necessary during initialization,
  // regardless of user prefs.
  if (shortest.NumStates() == 0) {
    vector<PathData> dummy;
    return dummy;
  }
  FstPathFinder pathfinder(skipSeqs);
  pathfinder.isyms = isyms;
  pathfinder.findAllStrings(shortest);
  return pathfinder.paths;
}
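//A minimal usage sketch, not taken from the original sources: 'aligner' is a
// placeholder for an already-trained M2MFstAligner and 'lattice' for the
// LogArc alignment lattice of a single entry; how PathData exposes the decoded
// strings is not shown in this excerpt.
//
//  vector<PathData> paths = aligner.write_alignment(lattice, 1 /*nbest*/);
//  if (paths.empty()) {
//    //No valid path: see the seq1_del/seq2_del caveat in the comments above.
//  }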