///functor: Run composition itself, separate from load times (e.g. which may include an fst expansion). VectorFst<Arc> * operator() () { if (!fst_.NumStates() ) { LWARN ("Empty lattice. ... Skipping LM application!"); return NULL; } while ( qc_.size() ) { StateId s = qc_.front(); qc_.pop(); pair<StateId, const typename KenLMModelT::State> p = get ( s ); StateId& s1 = p.first; const typename KenLMModelT::State s2 = p.second; for ( ArcIterator< VectorFst<Arc> > arc1 ( fst_, s1 ); !arc1.Done(); arc1.Next() ) { const Arc& a1 = arc1.Value(); float w = 0; float wp = wp_; typename KenLMModelT::State nextlmstate; if ( epsilons_.find ( a1.olabel ) == epsilons_.end() ) { w = lmmodel_.Score ( s2, idbridge_.map (a1.olabel), nextlmstate ) * natlog10_; //silly hack if ( a1.olabel <= 2 ) { wp = 0; if (a1.olabel == 1 ) w = 0; //get same result as srilm } } else { nextlmstate = s2; wp = 0; //We don't count epsilon labels } pair<StateId, bool> nextp = add ( nextlmstate , a1.nextstate , fst_.Final ( a1.nextstate ) ); StateId& newstate = nextp.first; bool visited = nextp.second; composed_->AddArc ( s , Arc ( a1.ilabel, a1.olabel , Times ( a1.weight, Times (mw_ ( w ) , mw_ (wp) ) ) , newstate ) ); //Finally, only add newstate to the queue if it hasn't been visited previously if ( !visited ) { qc_.push ( newstate ); } } } LINFO ( "Done! Number of states=" << composed_->NumStates() ); return composed_; };
/** * Constructor. Initializes on-the-fly composition with a language model. * \param fst Machine you want to apply the language to. Pass a delayed machine if you can, as it will expand it in constructor. * \param model A KenLM language model * \param epsilons List of words to work as epsilons * \param natlog Use or not natural logs * \param lmscale Language model scale */ ApplyLanguageModelOnTheFly ( const Fst<Arc>& fst, KenLMModelT& model, #ifndef USE_GOOGLE_SPARSE_HASH unordered_set<Label>& epsilons, #else google::dense_hash_set<Label>& epsilons, #endif bool natlog, float lmscale , float lmwp, const IdBridgeT& idbridge ) : composed_ ( NULL ) , natlog10_ ( natlog ? -lmscale* ::log ( 10.0 ) : -lmscale ), fst_ ( fst ), lmmodel_ ( model ), vocab_ ( model.GetVocabulary() ), wp_ ( lmwp ) , epsilons_ ( epsilons ) , history ( model.Order(), 0), idbridge_ (idbridge) { #ifdef USE_GOOGLE_SPARSE_HASH stateexistence_.set_empty_key ( numeric_limits<ull>::max() ); statemap_.set_empty_key ( numeric_limits<uint64_t>::max() ); basic_string<unsigned> aux (KENLM_MAX_ORDER, numeric_limits<unsigned>::max() ); seenlmstates_.set_empty_key ( aux ); #endif buffersize = ( model.Order() - 1 ) * sizeof ( unsigned int ); buffer = const_cast<unsigned *> ( history.c_str() ); if (!fst_.NumStates() ) { LWARN ("Empty lattice"); return; } composed_ = new VectorFst<Arc>; typename KenLMModelT::State bs = model.NullContextState(); ///Initialize with first state pair<StateId, bool> nextp = add ( bs, fst_.Start(), fst_.Final ( fst_.Start() ) ); qc_.push ( nextp.first ); composed_->SetStart ( nextp.first ); };