///functor:  Run composition itself, separate from load times (e.g. which may include an fst expansion).
 VectorFst<Arc> * operator() () {
   if (!fst_.NumStates() ) {
     LWARN ("Empty lattice. ... Skipping LM application!");
     return NULL;
   }
   while ( qc_.size() ) {
     StateId s = qc_.front();
     qc_.pop();
     pair<StateId, const typename KenLMModelT::State> p = get ( s );
     StateId& s1 = p.first;
     const typename KenLMModelT::State s2 = p.second;
     for ( ArcIterator< VectorFst<Arc> > arc1 ( fst_, s1 ); !arc1.Done();
           arc1.Next() ) {
       const Arc& a1 = arc1.Value();
       float w = 0;
       float wp = wp_;
       typename KenLMModelT::State nextlmstate;
       if ( epsilons_.find ( a1.olabel ) == epsilons_.end() ) {
         w = lmmodel_.Score ( s2, idbridge_.map (a1.olabel), nextlmstate ) * natlog10_;
         //silly hack
         if ( a1.olabel <= 2  )  {
           wp = 0;
           if (a1.olabel == 1 ) w = 0; //get same result as srilm
         }
       } else {
         nextlmstate = s2;
         wp = 0; //We don't count epsilon labels
       }
       pair<StateId, bool> nextp = add ( nextlmstate
                                         , a1.nextstate
                                         , fst_.Final ( a1.nextstate ) );
       StateId& newstate = nextp.first;
       bool visited = nextp.second;
       composed_->AddArc ( s
                           , Arc ( a1.ilabel, a1.olabel
                                   , Times ( a1.weight, Times (mw_ ( w ) , mw_ (wp) ) )
                                   , newstate ) );
       //Finally, only add newstate to the queue if it hasn't been visited previously
       if ( !visited ) {
         qc_.push ( newstate );
       }
     }
   }
   LINFO ( "Done! Number of states=" << composed_->NumStates() );
   return composed_;
 };
 /**
  * \brief Adds a state.
  * \return true if the state requested has already been visited, false otherwise.
  */
 inline pair <StateId, bool> add ( typename KenLMModelT::State& m2nextstate,
                                   StateId m1nextstate, Weight m1stateweight ) {
   static StateId lm = 0;
   getIdx ( m2nextstate );
   ///New history:
   if ( seenlmstates_.find ( history ) == seenlmstates_.end() ) {
     seenlmstates_[history] = ++lm;
   }
   uint64_t compound = m1nextstate * sid + seenlmstates_[history];
   LDEBUG ( "compound id=" << compound );
   if ( stateexistence_.find ( compound ) == stateexistence_.end() ) {
     LDEBUG ( "New State!" );
     statemap_[composed_->NumStates()] =
       pair<StateId, const typename KenLMModelT::State > ( m1nextstate, m2nextstate );
     composed_->AddState();
     if ( m1stateweight != mw_ ( ZPosInfinity() ) ) composed_->SetFinal (
         composed_->NumStates() - 1, m1stateweight );
     stateexistence_[compound] = composed_->NumStates() - 1;
     return pair<StateId, bool> ( composed_->NumStates() - 1, false );
   }
   return pair<StateId, bool> ( stateexistence_[compound], true );
 };
  /**
   * Constructor. Initializes on-the-fly composition with a language model.
   * \param fst         Machine you want to apply the language to. Pass a delayed machine if you can, as it will expand it in constructor.
   * \param model       A KenLM language model
   * \param epsilons    List of words to work as epsilons
   * \param natlog      Use or not natural logs
   * \param lmscale      Language model scale
   */
  ApplyLanguageModelOnTheFly ( const Fst<Arc>& fst,
                               KenLMModelT& model,
#ifndef USE_GOOGLE_SPARSE_HASH
                               unordered_set<Label>& epsilons,
#else
                               google::dense_hash_set<Label>& epsilons,
#endif
                               bool natlog,
                               float lmscale ,
                               float lmwp,
                               const IdBridgeT& idbridge
                             ) :
    composed_ ( NULL ) ,
    natlog10_ ( natlog ? -lmscale* ::log ( 10.0 ) : -lmscale ),
    fst_ ( fst ),
    lmmodel_ ( model ),
    vocab_ ( model.GetVocabulary() ),
    wp_ ( lmwp ) ,
    epsilons_ ( epsilons ) ,
    history ( model.Order(), 0),
    idbridge_ (idbridge) {
#ifdef USE_GOOGLE_SPARSE_HASH
    stateexistence_.set_empty_key ( numeric_limits<ull>::max() );
    statemap_.set_empty_key ( numeric_limits<uint64_t>::max() );
    basic_string<unsigned> aux (KENLM_MAX_ORDER, numeric_limits<unsigned>::max() );
    seenlmstates_.set_empty_key ( aux );
#endif
    buffersize = ( model.Order() - 1 ) * sizeof ( unsigned int );
    buffer = const_cast<unsigned *> ( history.c_str() );
    if (!fst_.NumStates() )  {
      LWARN ("Empty lattice");
      return;
    }
    composed_ = new VectorFst<Arc>;
    typename KenLMModelT::State bs = model.NullContextState();
    ///Initialize with first state
    pair<StateId, bool> nextp = add ( bs, fst_.Start(),
                                      fst_.Final ( fst_.Start() ) );
    qc_.push ( nextp.first );
    composed_->SetStart ( nextp.first );
  };
Example #4
0
vector<PathData> M2MFstAligner::write_alignment(const VectorFst<LogArc> &ifst,
        int nbest)
{
    //Generic alignment generator
    VectorFst<StdArc> fst;
    Map(ifst, &fst, LogToStdMapper());

    for (StateIterator<VectorFst<StdArc> > siter(fst); !siter.Done();
            siter.Next()) {
        StdArc::StateId q = siter.Value();
        for (MutableArcIterator<VectorFst<StdArc> > aiter(&fst, q);
                !aiter.Done(); aiter.Next()) {
            //Prior to decoding we make several 'heuristic' modifications to the weights:
            // 1. A multiplier is applied to any multi-token substrings
            // 2. Any LogWeight::Zero() arc weights are reset to '99'.
            //    We are basically resetting 'Infinity' values to a 'smallest non-Infinity'
            //     so that the ShortestPath algorithm actually produces something no matter what.
            // 3. Any arcs that consist of subseq1:subseq2 being the same length and subseq1>1
            //       are set to '99' this forces shortestpath to choose arcs where one of the
            //       following conditions holds true
            //      * len(subseq1)>1 && len(subseq2)!=len(subseq1)
            //      * len(subseq2)>1 && len(subseq1)!=len(subseq2)
            //      * len(subseq1)==len(subseq2)==1
            //I suspect these heuristics can be eliminated with a better choice of the initialization
            // function and maximization function, but this is the way that m2m-aligner works, so
            // it makes sense for our first cut implementation.
            //In any case, this guarantees that M2MFstAligner produces results identical to those
            // produced by m2m-aligner - but with a bit more reliability.
            //UPDATE: this now produces a better alignment than m2m-aligner.
            //  The maxl heuristic is still in place.  The aligner will produce *better* 1-best alignments
            //  *without* the maxl heuristic below, BUT this comes at the cost of producing a less
            //  flexible corpus.  That is, for a small training corpus like nettalk, if we use the
            //  best alignment we wind up with more 'chunks' and thus get a worse coverage for unseen
            //  data.  Using the aignment lattices to train the joint ngram model solves this problem.
            //  Oh baby.  Can't wait to for everyone to see the paper!
            //NOTE: this is going to fail if we encounter any alignments in a new test item that never
            // occurred in the original model.
            StdArc
            arc = aiter.Value();
            int
            maxl = get_max_length(isyms->Find(arc.ilabel));
            if (maxl == -1) {
                arc.weight = 999;
            }
            else {
                //Optionally penalize m-to-1 / 1-to-m links.  This produces
                // WORSE 1-best alignments, but results in better joint n-gram
                // models for small training corpora when using only the 1-best
                // alignment.  By further favoring 1-to-1 alignments the 1-best
                // alignment corpus results in a more flexible joint n-gram model
                // with regard to previously unseen data.
                //if( penalize==true ){
                arc.weight = alignment_model[arc.ilabel].Value() * maxl;
                //}else{
                //For larger corpora this is probably unnecessary.
                //arc.weight = alignment_model[arc.ilabel].Value();
                //}
            }
            if (arc.weight == LogWeight::Zero())
                arc.weight = 999;
            if (arc.weight != arc.weight)
                arc.weight = 999;
            aiter.SetValue(arc);
        }
    }

    VectorFst<StdArc> shortest;
    ShortestPath(fst, &shortest, nbest);
    RmEpsilon(&shortest);
    //Skip empty results.  This should only happen
    // in the following situations:
    //  1. seq1_del=false && len(seq1)<len(seq2)
    //  2. seq2_del=false && len(seq1)>len(seq2)
    //In both 1.and 2. the issue is that we need to
    // insert a 'skip' in order to guarantee at least
    // one valid alignment path through seq1*seq2, but
    // user params didn't allow us to.
    //Probably better to insert these where necessary
    // during initialization, regardless of user prefs.
    if (shortest.NumStates() == 0) {
        vector<PathData> dummy;
        return dummy;
    }
    FstPathFinder
    pathfinder(skipSeqs);
    pathfinder.isyms = isyms;
    pathfinder.findAllStrings(shortest);
    return pathfinder.paths;
}