/*
 * This computes the function value at a given weight vector.
 * This should be parallelized by dividing the training examples
 * into subsets and doing each on a different thread.
 *
 * @param weights - the value of the weights
 * @return the value of the function
 *
 */
double WFST_Trainer_Local::value2(const column_vector &weights) {
  
  cout << "LIKELIHOOD\n";
  double likelihood = 0.0;
  update_arc_weights(weights);

  // "filler" FSTs to replace the results
  // from the composition; declared outside
  // the for loop for efficiency
  VectorFst<LogArc> medial;
  VectorFst<LogArc> final;
  
  medial.SetInputSymbols(fst->InputSymbols());
  medial.SetOutputSymbols(fst->OutputSymbols());
  final.SetInputSymbols(fst->InputSymbols());
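The header comment above says this computation should be parallelized by dividing the training examples into subsets and running each on its own thread. Below is a minimal sketch of that pattern, assuming a hypothetical example_likelihood() helper that stands in for the per-example work value2 performs; it is illustrative only, not the trainer's actual code.

#include <thread>
#include <vector>

double example_likelihood(int i);  // hypothetical per-example term

double parallel_value(int num_examples, int num_threads) {
  std::vector<double> partial(num_threads, 0.0);
  std::vector<std::thread> pool;
  for (int t = 0; t < num_threads; ++t) {
    pool.emplace_back([&partial, t, num_examples, num_threads]() {
      // strided partition: thread t handles examples t, t+T, t+2T, ...
      for (int i = t; i < num_examples; i += num_threads)
        partial[t] += example_likelihood(i);
    });
  }
  for (std::thread &th : pool) th.join();
  // combine the per-thread partial sums into the total likelihood
  double likelihood = 0.0;
  for (double p : partial) likelihood += p;
  return likelihood;
}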
 ///functor: Runs the composition itself, separately from load time (which may, e.g., include an fst expansion).
 VectorFst<Arc> * operator() () {
   if (!fst_.NumStates() ) {
     LWARN ("Empty lattice. ... Skipping LM application!");
     return NULL;
   }
   while ( qc_.size() ) {
     StateId s = qc_.front();
     qc_.pop();
     pair<StateId, const typename KenLMModelT::State> p = get ( s );
     StateId& s1 = p.first;
     const typename KenLMModelT::State s2 = p.second;
     for ( ArcIterator< VectorFst<Arc> > arc1 ( fst_, s1 ); !arc1.Done();
           arc1.Next() ) {
       const Arc& a1 = arc1.Value();
       float w = 0;
       float wp = wp_;
       typename KenLMModelT::State nextlmstate;
       if ( epsilons_.find ( a1.olabel ) == epsilons_.end() ) {
         w = lmmodel_.Score ( s2, idbridge_.map (a1.olabel), nextlmstate ) * natlog10_;
         //silly hack
         if ( a1.olabel <= 2  )  {
           wp = 0;
           if (a1.olabel == 1 ) w = 0; //get same result as srilm
         }
       } else {
         nextlmstate = s2;
         wp = 0; //We don't count epsilon labels
       }
       pair<StateId, bool> nextp = add ( nextlmstate
                                         , a1.nextstate
                                         , fst_.Final ( a1.nextstate ) );
       StateId& newstate = nextp.first;
       bool visited = nextp.second;
       composed_->AddArc ( s
                           , Arc ( a1.ilabel, a1.olabel
                                   , Times ( a1.weight, Times (mw_ ( w ) , mw_ (wp) ) )
                                   , newstate ) );
       //Finally, only add newstate to the queue if it hasn't been visited previously
       if ( !visited ) {
         qc_.push ( newstate );
       }
     }
   }
   LINFO ( "Done! Number of states=" << composed_->NumStates() );
   return composed_;
 };
  /**
   * Constructor. Initializes on-the-fly composition with a language model.
   * \param fst         Machine you want to apply the language model to. Pass a delayed machine if you can, as it will be expanded in the constructor.
   * \param model       A KenLM language model
   * \param epsilons    List of words to treat as epsilons
   * \param natlog      Whether to use natural logs
   * \param lmscale     Language model scale
   * \param lmwp        Language model word penalty
   * \param idbridge    Maps output labels to language model ids
   */
  ApplyLanguageModelOnTheFly ( const Fst<Arc>& fst,
                               KenLMModelT& model,
#ifndef USE_GOOGLE_SPARSE_HASH
                               unordered_set<Label>& epsilons,
#else
                               google::dense_hash_set<Label>& epsilons,
#endif
                               bool natlog,
                               float lmscale ,
                               float lmwp,
                               const IdBridgeT& idbridge
                             ) :
    composed_ ( NULL ) ,
    natlog10_ ( natlog ? -lmscale* ::log ( 10.0 ) : -lmscale ),
    fst_ ( fst ),
    lmmodel_ ( model ),
    vocab_ ( model.GetVocabulary() ),
    wp_ ( lmwp ) ,
    epsilons_ ( epsilons ) ,
    history ( model.Order(), 0),
    idbridge_ (idbridge) {
#ifdef USE_GOOGLE_SPARSE_HASH
    stateexistence_.set_empty_key ( numeric_limits<ull>::max() );
    statemap_.set_empty_key ( numeric_limits<uint64_t>::max() );
    basic_string<unsigned> aux (KENLM_MAX_ORDER, numeric_limits<unsigned>::max() );
    seenlmstates_.set_empty_key ( aux );
#endif
    buffersize = ( model.Order() - 1 ) * sizeof ( unsigned int );
    buffer = const_cast<unsigned *> ( history.c_str() );
    if (!fst_.NumStates() )  {
      LWARN ("Empty lattice");
      return;
    }
    composed_ = new VectorFst<Arc>;
    typename KenLMModelT::State bs = model.NullContextState();
    ///Initialize with first state
    pair<StateId, bool> nextp = add ( bs, fst_.Start(),
                                      fst_.Final ( fst_.Start() ) );
    qc_.push ( nextp.first );
    composed_->SetStart ( nextp.first );
  };
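A usage sketch for the functor above: build it over an input machine and a KenLM model, then invoke operator() to run the composition. The exact template instantiation and the IdBridge setup are assumptions here, mirroring the KenLMModelT and IdBridgeT parameters; lm::ngram::Model is the standard KenLM type.

// Sketch only; the template arguments shown are schematic and IdBridge
// is an assumed label<->LM-id mapper matching IdBridgeT above.
VectorFst<LogArc>* lattice = VectorFst<LogArc>::Read("input.fst");
lm::ngram::Model lm("lm.kenlm");             // a KenLM language model
unordered_set<LogArc::Label> epsilons;
epsilons.insert(0);                          // treat label 0 as epsilon
IdBridge idbridge;                           // assumed id mapper

ApplyLanguageModelOnTheFly<LogArc, lm::ngram::Model, IdBridge>
    apply(*lattice, lm, epsilons, true /*natlog*/, 1.0f /*lmscale*/,
          0.0f /*word penalty*/, idbridge);
VectorFst<LogArc>* composed = apply();       // NULL on an empty lattice
if (composed) {
  composed->Write("composed.fst");
  delete composed;
}
delete lattice;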
Example #4
void process(const FstClass& _fst, const char *output, const string& separator, const string* space) {
  const Fst<Arc>& fst = *_fst.GetFst<Arc>();
  Verify(fst);

  fst::SymbolTable * const_syms = new fst::SymbolTable("const syms");
  const_syms->AddSymbol("<s>");
  const_syms->AddSymbol("</s>");
  const_syms->AddSymbol("<space>");
  const_syms->AddSymbol("<phrase>");
  const_syms->AddSymbol("<epsilon>");
  const_syms->AddSymbol("!NULL");

  VectorFst<Arc> ofst;
  SplitSymbols<Arc>(fst, &ofst, separator, space, const_syms);

  delete const_syms;

  FstWriteOptions opts(output);
  ofilter os(output);
  ofst.Write(os, opts);
}
Example #5
M2MFstAligner::M2MFstAligner( string _model_file, bool _penalize, bool _penalize_em, bool _restrict  ){
  /*
    Initialize the aligner with a previously trained model.
    The model requires that the first several symbols in the 
    symbol table contain the separator and other bookkeeping info.
  */

  restrict    = _restrict;
  penalize    = _penalize;
  penalize_em = _penalize_em;
  penalties.set_empty_key(0);
  VectorFst<LogArc>* model = VectorFst<LogArc>::Read( _model_file );
  for( StateIterator<VectorFst<LogArc> > siter(*model); !siter.Done(); siter.Next() ){
    LogArc::StateId q = siter.Value();
    for( ArcIterator<VectorFst<LogArc> > aiter(*model, q); !aiter.Done(); aiter.Next() ){
      const LogArc& arc = aiter.Value();
      alignment_model.insert( pair<LogArc::Label,LogWeight>(arc.ilabel,arc.weight) );
    }
  }      
  isyms = const_cast<SymbolTable*>(model->InputSymbols());
  int i = 0;
  eps      = isyms->Find(i); //a literal 0 would be ambiguous between the int64 and const char* overloads of Find
  skip     = isyms->Find(1);
  string tie = "_"; //tie to pack parameters

  string sseps = isyms->Find(2);
  vector<string> seps = tokenize_utf8_string( &sseps, &tie );
  seq1_sep = seps[0];
  seq2_sep = seps[1];
  s1s2_sep = isyms->Find(3);

  string sparams = isyms->Find(4);
  vector<string> params = tokenize_utf8_string( &sparams, &tie );
  seq1_del = (params[0] == "true");
  seq2_del = (params[1] == "true");
  seq1_max = atoi(params[2].c_str());
  seq2_max = atoi(params[3].c_str());

}
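The constructor reads fixed bookkeeping entries at symbol ids 0 through 4. Here is a sketch of the layout it expects, reconstructed from how each entry is parsed above; the concrete strings are illustrative, not taken from a real model file.

// Reconstructed bookkeeping layout, ids 0-4 (strings are illustrative):
fst::SymbolTable syms("model syms");
syms.AddSymbol("<eps>", 0);           // eps:  the epsilon symbol
syms.AddSymbol("_", 1);               // skip: deletion/insertion marker
syms.AddSymbol("|_|", 2);             // seq1_sep "_" seq2_sep, split on the "_" tie
syms.AddSymbol("}", 3);               // s1s2_sep: joins input/output subsequences
syms.AddSymbol("true_true_2_2", 4);   // seq1_del, seq2_del, seq1_max, seq2_max, tied by "_"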
Example #6
void
M2MFstAligner::write_lattice(string lattice)
{
    //Write out the entire training set in lattice format
    //Perform the union first.  This output can then
    // be plugged directly in to a counter to obtain expected
    // alignment counts for the EM-trained corpus.  Yields
    // far higher-quality joint n-gram models, which are also
    // more robust for smaller training corpora.
    //Make sure you call this BEFORE any call to
    // write_all_alignments, as the latter function
    // will overwrite some of the weights

    //Chaining the standard Union operation, even using a
    // rational FST, still performs very poorly in the log semiring.
    //Presumably it's running push or something at each step.  It
    // should be fine to do that just once at the end.
    //Rolling our own union turns out to be MUCH faster.
    VectorFst<LogArc> ufst;
    ufst.AddState();
    ufst.SetStart(0);
    int total_states = 0;
    for (size_t i = 0; i < fsas.size(); i++) {
        TopSort(&fsas[i]);
        for (StateIterator<VectorFst<LogArc> > siter(fsas[i]);
                !siter.Done(); siter.Next()) {
            LogArc::StateId q = siter.Value();
            LogArc::StateId r;
            if (q == 0)
                r = 0;
            else
                r = ufst.AddState();

            for (ArcIterator <VectorFst<LogArc> > aiter(fsas[i], q);
                    !aiter.Done(); aiter.Next()) {
                const LogArc & arc = aiter.Value();
                ufst.AddArc(r,
                            LogArc(arc.ilabel, arc.ilabel, arc.weight,
                                   arc.nextstate + total_states));
            }
            if (fsas[i].Final(q) != LogWeight::Zero())
                ufst.SetFinal(r, LogWeight::One());
        }
        total_states += fsas[i].NumStates() - 1;
    }
    //Normalize weights
    Push(&ufst, REWEIGHT_TO_INITIAL);
    //Write the resulting lattice to disk
    ufst.Write(lattice);
    //Write the syms table too.
    isyms->WriteText("lattice.syms");
    return;
}
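For contrast, here is a sketch of the chained form the comments above warn against; fst::Union is the standard OpenFst call, shown only to illustrate the slow alternative, and it assumes fsas is non-empty.

// Slow alternative (sketch): repeated Union in the log semiring.
VectorFst<LogArc> slow_union(fsas[0]);
for (size_t i = 1; i < fsas.size(); i++)
  Union(&slow_union, fsas[i]);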
Example #7
void
M2MFstAligner::write_model(string _model_file)
{
    VectorFst<LogArc> model;
    model.AddState();
    model.SetStart(0);
    model.SetFinal(0, LogWeight::One());
    map<LogArc::Label,LogWeight>::iterator it;
    for (it = alignment_model.begin(); it != alignment_model.end(); it++)
        model.AddArc(0, LogArc((*it).first, (*it).first, (*it).second, 0));
    model.SetInputSymbols(isyms);
    model.Write(_model_file);
    return;
}
 /**
  * \brief Adds a state.
  * \return a pair of the new or existing composed-state id and a flag that is true if that state had already been visited, false otherwise.
  */
 inline pair <StateId, bool> add ( typename KenLMModelT::State& m2nextstate,
                                   StateId m1nextstate, Weight m1stateweight ) {
   static StateId lm = 0;
   getIdx ( m2nextstate );
   ///New history:
   if ( seenlmstates_.find ( history ) == seenlmstates_.end() ) {
     seenlmstates_[history] = ++lm;
   }
   uint64_t compound = m1nextstate * sid + seenlmstates_[history];
   LDEBUG ( "compound id=" << compound );
   if ( stateexistence_.find ( compound ) == stateexistence_.end() ) {
     LDEBUG ( "New State!" );
     statemap_[composed_->NumStates()] =
       pair<StateId, const typename KenLMModelT::State > ( m1nextstate, m2nextstate );
     composed_->AddState();
     if ( m1stateweight != mw_ ( ZPosInfinity() ) ) composed_->SetFinal (
         composed_->NumStates() - 1, m1stateweight );
     stateexistence_[compound] = composed_->NumStates() - 1;
     return pair<StateId, bool> ( composed_->NumStates() - 1, false );
   }
   return pair<StateId, bool> ( stateexistence_[compound], true );
 };
Example #9
void M2MFstAligner::_conditional_max( bool y_given_x ){
  /*
    Compute the conditional distribution, P(Y|X), using the WFST paradigm.
    This is based on the approach from Shu and Hetherington 2002.
    It is assumed that all WFSTs and operations use the Log semiring.

    Given: 
           FST1 = P(X,Y)
    Compute:
           FST2 = P(X) 
             := Map_inv(Det(RmEps(Proj_i(FST1))))
           FST3 = P(Y|X)
             := Compose(FST2,FST1)

    Proj_i:  project on input labels
    RmEps:   epsilon removal
    Det:     determinize
    Map_inv: invert weights

    Notes: An analogous process may be used to compute P(X|Y).  In this
      case one would project on OUTPUT labels - Proj_o, and reverse the
      composition order to Compose(FST1,FST2).

    Future work:
      What we are doing here in terms of *generating* the JOINT fst each
      time is really just a dumb hack.  We *should* encode the model in an
      FST and encode the individual lattices, rather than doing the hacky
      manual label encoding that we currently rely on.
  */

  //Joint distribution that we start with
  VectorFst<LogArc>* joint  = new VectorFst<LogArc>();
  SymbolTable* misyms = new SymbolTable("misyms");
  SymbolTable* mosyms = new SymbolTable("mosyms");
  joint->AddState();
  joint->AddState();
  joint->SetStart(0);
  joint->SetFinal(1,LogArc::Weight::One());
  map<LogArc::Label,LogWeight>::iterator it;
  for( it=prev_alignment_model.begin(); it != prev_alignment_model.end(); it++ ){
    string isym = isyms->Find((*it).first); 
    vector<string> io = tokenize_utf8_string( &isym, &s1s2_sep );
    LogArc arc( misyms->AddSymbol(io[0]), mosyms->AddSymbol(io[1]), (*it).second, 1 );
    joint->AddArc( 0, arc );
  }
  //VectorFst<LogArc>* joint  = new VectorFst<LogArc>();
  //Push<LogArc,REWEIGHT_TO_FINAL>(*_joint, joint, kPushWeights);
  //joint->SetFinal(1,LogWeight::One());
  joint->Write("m2mjoint.fst");
  //BEGIN COMPUTE MARGINAL P(X)  
  VectorFst<LogArc>* dmarg;
  if( y_given_x )
    dmarg = new VectorFst<LogArc>(ProjectFst<LogArc>(*joint, PROJECT_INPUT));
  else
    dmarg = new VectorFst<LogArc>(ProjectFst<LogArc>(*joint, PROJECT_OUTPUT));

  RmEpsilon(dmarg);
  VectorFst<LogArc>* marg = new VectorFst<LogArc>();
  Determinize(*dmarg, marg);
  ArcMap(marg, InvertWeightMapper<LogArc>());

  if( y_given_x )
    ArcSort(marg, OLabelCompare<LogArc>());
  else
    ArcSort(marg, ILabelCompare<LogArc>());
  //END COMPUTE MARGINAL P(X)
  marg->Write("marg.fst");

  //CONDITIONAL P(Y|X)
  VectorFst<LogArc>* cond = new VectorFst<LogArc>();
  if( y_given_x )
    Compose(*marg, *joint, cond);
  else
    Compose(*joint, *marg, cond);
  //cond now contains the conditional distribution P(Y|X)
  cond->Write("cond.fst");
  //Now update the model with the new values
  for( MutableArcIterator<VectorFst<LogArc> > aiter(cond, 0); !aiter.Done(); aiter.Next() ){
    LogArc arc = aiter.Value();
    string lab = misyms->Find(arc.ilabel)+"}"+mosyms->Find(arc.olabel);
    int   labi = isyms->Find(lab);
    alignment_model[labi]      = arc.weight;
    prev_alignment_model[labi] = LogWeight::Zero();
  }
  delete joint;
  delete dmarg;
  delete marg;
  delete cond;
  delete misyms;
  delete mosyms;
  return;
}
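A note on why inverting the marginal's weights yields the conditional: in the Log semiring arc weights are negative log probabilities and composition multiplies probabilities by adding weights, so

  FST1:                  w = -log P(X,Y)
  Det(RmEps(Proj_i)):    w = -log P(X)
  Map_inv:               w = +log P(X)
  Compose:               w = -log P(X,Y) + log P(X) = -log P(Y|X)

which is exactly the conditional distribution the code writes to cond.fst.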
/**
 * Create an FST based on an RNN
 */
void FlatBOFstBuilder::convertRNN(CRnnLM & rnnlm, VectorFst<LogArc> &fst) {
	queue<NeuronFstHistory> q;
	VectorFst<LogArc> new_fst;
	
	NeuronFstHistory fsth(rnnlm.getHiddenLayerSize(),getNumBins());
	FstIndex id = 0;
	
	NeuronFstHistory new_fsth(rnnlm.getHiddenLayerSize(),getNumBins());
	FstIndex new_id;

	NeuronFstHistory min_backoff(rnnlm.getHiddenLayerSize(),getNumBins());
	set<NeuronFstHistory>set_min_backoff;
	
	NeuronFstHistory bo_fsth(rnnlm.getHiddenLayerSize(),getNumBins());
	bool backoff = false;
	vector<FstIndex> deleted;


	real p = 0.00;
	real p_joint = 0.00;
	real entropy = 0.0;
	real delta = 0.0;
	vector<real> all_prob(rnnlm.getVocabSize());
 	vector<real> posterior(10);
	
	map< FstIndex,set<FstIndex> > pred;
	vector<bool> non_bo_pred(rnnlm.getVocabSize());
	vector<int> to_be_added;
	vector<int> to_be_removed;
	for (int i = 0; i < rnnlm.getVocabSize(); i++) {
		to_be_removed.push_back(i);
	}
	vector<real> to_be_added_prob;


 	FstIndex n_added = 0;
 	FstIndex n_processed = 0;
 	FstIndex next_n_added = 0;
 	FstIndex next_n_processed = 0;
 	FstIndex n_backoff = 0;
 	FstIndex n_only_backoff = 0;
 	
	int v = rnnlm.getVocabSize();
	int w = 0;


	// Initialize
	rnnlm.copyHiddenLayerToInput();
//	printNeurons(rnnlm.getInputLayer(),0,10);

	// Initial state ( 0 | hidden layer after </s>)
	printNeurons(rnnlm.getHiddenLayer(),0,10);
	fsth.setFstHistory(rnnlm, *dzer);
	fsth.setLastWord(0);
	q.push(fsth);
	addFstState(id, new NeuronFstHistory(fsth), fst);
	fst.SetStart(INIT_STATE);
	
	// Final state (don't care about the associated discrete representation)
	fst.AddState();
	fst.SetFinal(FINAL_STATE, LogWeight::One());
	
 	/*posterior.at(INIT_STATE) = MY_LOG_ONE;*/
	min_backoff.setLastWord(-1);
	computeEntropyAndConditionals(entropy, all_prob, rnnlm, min_backoff);
	min_backoff = getBackoff(rnnlm, min_backoff, set_min_backoff, all_prob, to_be_removed);
	cout << "MIN BACKOFF " << min_backoff.toString() << endl;
	set_min_backoff.insert(min_backoff);
	
//	addFstState(id, min_backoff, fst);
//	q.push(min_backoff);
	

	
	// Estimate the number of backoff loops to bound the backoff path length
// 	float ratioa = 0.0;
// 	float ratiob = 0.0;
	float ratio = 0.0;
// 	for (int i=0; i < min_backoff.getNumDims(); i++) {
// 		if (min_backoff.getDim(i) == 1) {
// 			ratioa++;
// 		}
// 		if (fsth.getDim(i) == 1) {
// 			ratiob++;
// 		}
// 	}
// 	ratioa /= min_backoff.getNumDims();
// 	ratiob /= min_backoff.getNumDims();
// 	ratio = (ratioa*(1.0-ratiob))+(ratiob*(1.0-ratioa));
	ratio=1.0;

//	printf("ratio=%f\t%i BO loops\n", ratio, n_bo_loops);
	
	
	
	//foreach state in the queue
	while (!q.empty()) {
		fsth = q.front();
		q.pop();
		id = h2state[&fsth];
		state2h.push_back(new NeuronFstHistory(fsth));
		if (id == FINAL_STATE) { continue; }


		
		
	dprintf(1,"-- STUDY STATE %li = %s\n", id, fsth.toString().c_str());
	

/*		try { posterior.at(id) = MY_LOG_ONE; }
		catch (exception e) {
			posterior.resize((int) (posterior.size()*1.5)+1);
			posterior.at(id) = MY_LOG_ONE;
		}*/
		
		computeEntropyAndConditionals(entropy, all_prob, rnnlm, fsth);
		
		//compute BO in advance and check if it is a min BO node
		bo_fsth = getBackoff(rnnlm, fsth, set_min_backoff, all_prob, to_be_removed);
		if (bo_fsth == fsth) { bo_fsth = min_backoff; }
			
		//foreach w (i.e., foreach word of each class c)
		//test if the edge has to be kept or removed
		backoff = false; //no backoff yet since no edge has been removed
		for (w=0; w < rnnlm.getVocabSize(); w++) {
				p = all_prob[w];
				
				/*p_joint = exp(-posterior[id]-p);*/
				p_joint = exp(-p);
				delta = -1.0*p_joint*log2(p_joint);
				
				//accept edge if this leads to a minimum
				//relative gain of the entropy

				dprintf(2,"P = %e \tP_joint = %e \tH = %e \tDelta =%e \tDelta H = %.6f %%\n",exp(-p), p_joint, entropy, delta, 100.0*delta/entropy);

				if (set_min_backoff.find(fsth) != set_min_backoff.end() || (delta > pruning_threshold*entropy)) {
//				if ((fsth == min_backoff) || (delta > pruning_threshold*entropy)) {
					next_n_added++;
					to_be_added.push_back(w);
					to_be_added_prob.push_back(p);
					dprintf(2,"\tACCEPT [%li] -- %i (%s) / %f --> ...\t(%e > %e)\n", id, w, rnnlm.getWordString(w), p, delta, pruning_threshold*entropy);
//					to_be_removed.push_back(w);
 				}
 				//backoff
				else {
//					to_be_removed.push_back(w);
					backoff = true;
					dprintf(2,"\tPRUNE [%li] -- %i / %f --> ...\n", id, w, p);
 				}
 				
 				//print
				if (next_n_processed % 100000 == 0) {
						fprintf(stderr, "\rH=%.5f / N proc'd=%li / N added=%li (%.5f %%) / N bo=%li (%.5f %%) / %li/%li Nodes (%2.1f %%) / N min BO=%i", entropy, n_processed, n_added, ((float) n_added/ (float)n_processed)*100.0, n_backoff, ((float) n_backoff/ (float)n_added)*100.0, id, id+q.size(), 100.0 - (float) (100.0*id/(id+q.size())), (int) set_min_backoff.size());
				}
				next_n_processed++;
 				
//			}
		}


		//Set a part of the new FST history
		new_fsth.setFstHistory(rnnlm, *dzer);

		//if at least one word is backing off
		if (backoff) {
			
			n_backoff++;
			if (to_be_added.size() == 0) {
				n_only_backoff++;
			}
			
			
			if (addFstState(new_id, new NeuronFstHistory(bo_fsth), fst)) {
				q.push(bo_fsth);
				try { non_bo_pred.at(new_id) = false; }
				catch (exception& e) {
					non_bo_pred.resize(new_id+(int) (non_bo_pred.size()*0.5)+1);
					non_bo_pred.at(new_id) = false;
				}
				
			}
			dprintf(1,"BACKOFF\t[%li]\t(%s)\n-------\t[%li]\t(%s)\n", id, fsth.toString().c_str(), new_id, bo_fsth.toString().c_str());

			fst.AddArc(id, LogArc(EPSILON, EPSILON, LogWeight::Zero(), new_id));
			
			addPred(pred, new_id, id);
			
		}
		
		
		vector<real>::iterator it_p = to_be_added_prob.begin();
		for (vector<int>::iterator it = to_be_added.begin(); it != to_be_added.end(); ++it) {
			w = *it;
			p = *it_p;

			if (w == 0) {
				fst.AddArc(id, LogArc(FstWord(w),FstWord(w),p,FINAL_STATE));
				dprintf(1,"EDGE [%li] (%s)\n---- %i (%s) / %f -->\n---- [%li] FINAL STATE)\n\n", id, fsth.toString().c_str(), FstWord(w), rnnlm.getWordString(w), p, FINAL_STATE);				
			}
		
			//accept edge
			else {
				new_fsth.setLastWord(w);
	
				//if sw not in the memory
				//then add a new state for sw in the FST and push sw in the queue
				if (addFstState(new_id, new NeuronFstHistory(new_fsth), fst)) {
					q.push(new_fsth);
					try { non_bo_pred.at(new_id) = true; }
					catch (exception& e) {
						non_bo_pred.resize(new_id+(int) (non_bo_pred.size()*0.5)+1);
						non_bo_pred.at(new_id) = true;
					}
				}
				else { /* already exists */ }
			
				//add the edge in the FST
				non_bo_pred.at(new_id) = true;
				fst.AddArc(id, LogArc(FstWord(w),FstWord(w),p,new_id));
				dprintf(1,"EDGE [%li] (%s)\n---- %i (%s) / %f -->\n---- [%li] (%s)\n\n", id, fsth.toString().c_str(), FstWord(w), rnnlm.getWordString(w), p, new_id, new_fsth.toString().c_str());				

//				posterior.at(new_id) += posterior[id]*p;

			}
			
			/*if (posterior[id]+p < LogWeight::Zero().Value()) {
				p_joint = exp(-posterior[id]-p);
				entropy -= p_joint*log2(p_joint);
			}*/
			
			++it_p;
		}
		
		n_added = next_n_added;
		n_processed = next_n_processed;
		
		//reset queues
		to_be_added.clear();
		to_be_added_prob.clear();
//		to_be_removed.clear();
		
	}

	cout << endl;
	
	//compute backoff weights
	deleted = compactBackoffNodes(fst, pred, non_bo_pred);
	computeAllBackoff(fst, pred);


	//remove useless nodes
	removeStates(fst, new_fst, deleted);
	fst.DeleteStates();
	fst = new_fst;
	
	//Fill the table of symbols
	SymbolTable dic("dictionary");
	dic.AddSymbol("*", 0);
	for (int i=0; i<rnnlm.getVocabSize(); i++) {
		dic.AddSymbol(string(rnnlm.getWordString(i)), i+1);
	}
	fst.SetInputSymbols(&dic);
	fst.SetOutputSymbols(&dic);

						//printf("H=%.5f / N proc'd=%li / N added=%li (%.5f %%) %li/%li Nodes (%2.1f %%)\n", entropy, n_processed, n_added, ((float) n_added/ (float)n_processed)*100.0, id, id+q.size(), 100.0 - (float) (100.0*id/(id+q.size())));
	cout << "END" << endl;
	
}
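The accept/prune test inside the main loop above reduces to a small predicate. Here it is distilled as a standalone sketch; this ignores the min-backoff bypass, which also forces acceptance.

#include <cmath>

// An arc with negative-log probability p is kept when its contribution
// to the state's entropy exceeds a fixed fraction of the total entropy.
bool keep_arc(double p, double entropy, double pruning_threshold) {
  double p_joint = std::exp(-p);                 // back to a probability
  double delta = -p_joint * std::log2(p_joint);  // entropy contribution
  return delta > pruning_threshold * entropy;    // accept or back off
}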
Example #11
vector<PathData> M2MFstAligner::write_alignment(const VectorFst<LogArc> &ifst,
        int nbest)
{
    //Generic alignment generator
    VectorFst<StdArc> fst;
    Map(ifst, &fst, LogToStdMapper());

    for (StateIterator<VectorFst<StdArc> > siter(fst); !siter.Done();
            siter.Next()) {
        StdArc::StateId q = siter.Value();
        for (MutableArcIterator<VectorFst<StdArc> > aiter(&fst, q);
                !aiter.Done(); aiter.Next()) {
            //Prior to decoding we make several 'heuristic' modifications to the weights:
            // 1. A multiplier is applied to any multi-token substrings
            // 2. Any LogWeight::Zero() arc weights are reset to '999'.
            //    We are basically resetting 'Infinity' values to a 'smallest non-Infinity'
            //     so that the ShortestPath algorithm actually produces something no matter what.
            // 3. Any arcs where len(subseq1)==len(subseq2) and len(subseq1)>1
            //       are set to '999'; this forces shortestpath to choose arcs where one of the
            //       following conditions holds true
            //      * len(subseq1)>1 && len(subseq2)!=len(subseq1)
            //      * len(subseq2)>1 && len(subseq1)!=len(subseq2)
            //      * len(subseq1)==len(subseq2)==1
            //I suspect these heuristics can be eliminated with a better choice of the initialization
            // function and maximization function, but this is the way that m2m-aligner works, so
            // it makes sense for our first cut implementation.
            //In any case, this guarantees that M2MFstAligner produces results identical to those
            // produced by m2m-aligner - but with a bit more reliability.
            //UPDATE: this now produces a better alignment than m2m-aligner.
            //  The maxl heuristic is still in place.  The aligner will produce *better* 1-best alignments
            //  *without* the maxl heuristic below, BUT this comes at the cost of producing a less
            //  flexible corpus.  That is, for a small training corpus like nettalk, if we use the
            //  best alignment we wind up with more 'chunks' and thus get a worse coverage for unseen
            //  data.  Using the alignment lattices to train the joint ngram model solves this problem.
            //  Oh baby.  Can't wait for everyone to see the paper!
            //NOTE: this is going to fail if we encounter any alignments in a new test item that never
            // occurred in the original model.
            StdArc arc = aiter.Value();
            int maxl = get_max_length(isyms->Find(arc.ilabel));
            if (maxl == -1) {
                arc.weight = 999;
            }
            else {
                //Optionally penalize m-to-1 / 1-to-m links.  This produces
                // WORSE 1-best alignments, but results in better joint n-gram
                // models for small training corpora when using only the 1-best
                // alignment.  By further favoring 1-to-1 alignments the 1-best
                // alignment corpus results in a more flexible joint n-gram model
                // with regard to previously unseen data.
                //if( penalize==true ){
                arc.weight = alignment_model[arc.ilabel].Value() * maxl;
                //}else{
                //For larger corpora this is probably unnecessary.
                //arc.weight = alignment_model[arc.ilabel].Value();
                //}
            }
            if (arc.weight == LogWeight::Zero())
                arc.weight = 999;
            if (arc.weight != arc.weight)   //NaN check
                arc.weight = 999;
            aiter.SetValue(arc);
        }
    }

    VectorFst<StdArc> shortest;
    ShortestPath(fst, &shortest, nbest);
    RmEpsilon(&shortest);
    //Skip empty results.  This should only happen
    // in the following situations:
    //  1. seq1_del=false && len(seq1)<len(seq2)
    //  2. seq2_del=false && len(seq1)>len(seq2)
    //In both 1.and 2. the issue is that we need to
    // insert a 'skip' in order to guarantee at least
    // one valid alignment path through seq1*seq2, but
    // user params didn't allow us to.
    //Probably better to insert these where necessary
    // during initialization, regardless of user prefs.
    if (shortest.NumStates() == 0) {
        vector<PathData> dummy;
        return dummy;
    }
    FstPathFinder pathfinder(skipSeqs);
    pathfinder.isyms = isyms;
    pathfinder.findAllStrings(shortest);
    return pathfinder.paths;
}
Example #12
void
train_model(string eps, string s1s2_sep, string skip, int order,
            string smooth, string prefix, string seq_sep, string prune,
            double theta, string count_pattern)
{
    namespace s = fst::script;
    using fst::script::FstClass;
    using fst::script::MutableFstClass;
    using fst::script::VectorFstClass;
    using fst::script::WeightClass;

    // create symbols file
    cout << "Generating symbols..." << endl;
    NGramInput *ingram =
        new NGramInput(prefix + ".corpus.aligned", prefix + ".corpus.syms",
                       "", eps, unknown_symbol, "", "");
    ingram->ReadInput(0, 1);

    // compile strings into a far archive
    cout << "Compiling symbols into FAR archive..." << endl;
    fst::FarEntryType fet = fst::StringToFarEntryType(entry_type);
    fst::FarTokenType ftt = fst::StringToFarTokenType(token_type);
    fst::FarType fartype = fst::FarTypeFromString(far_type);

    delete ingram;

    vector<string> in_fname;
    in_fname.push_back(prefix + ".corpus.aligned");

    fst::script::FarCompileStrings(in_fname, prefix + ".corpus.far",
                                   arc_type, fst_type, fartype,
                                   generate_keys, fet, ftt,
                                   prefix + ".corpus.syms", unknown_symbol,
                                   keep_symbols, initial_symbols,
                                   allow_negative_labels, file_list_input,
                                   key_prefix, key_suffix);

    //count n-grams
    cout << "Counting n-grams..." << endl;
    NGramCounter<Log64Weight> ngram_counter(order, epsilon_as_backoff);

    FstReadOptions opts;
    FarReader<StdArc> *far_reader;
    far_reader = FarReader<StdArc>::Open(prefix + ".corpus.far");
    int fstnumber = 1;
    const Fst<StdArc> *ifst = 0, *lfst = 0;
    while (!far_reader->Done()) {
        if (ifst)
            delete ifst;
        ifst = far_reader->GetFst().Copy();

        if (!ifst) {
            E_FATAL("ngramcount: unable to read fst #%d\n", fstnumber);
            //exit(1);
        }

        bool counted = false;
        if (ifst->Properties(kString | kUnweighted, true)) {
            counted = ngram_counter.Count(*ifst);
        }
        else {
            VectorFst<Log64Arc> log_ifst;
            Map(*ifst, &log_ifst, ToLog64Mapper<StdArc> ());
            counted = ngram_counter.Count(&log_ifst);
        }
        if (!counted)
            cout << "ngramcount: fst #" << fstnumber << endl;

        if (ifst->InputSymbols() != 0) {        // retain for symbol table
            if (lfst)
                delete lfst;    // delete previously observed symbol table
            lfst = ifst;
            ifst = 0;
        }
        far_reader->Next();
        ++fstnumber;
    }
    delete far_reader;

    if (!lfst) {
        E_FATAL("None of the input FSTs had a symbol table\n");
        //exit(1);
    }

    VectorFst<StdArc> vfst;
    ngram_counter.GetFst(&vfst);
    ArcSort(&vfst, StdILabelCompare());
    vfst.SetInputSymbols(lfst->InputSymbols());
    vfst.SetOutputSymbols(lfst->InputSymbols());
    vfst.Write(prefix + ".corpus.cnts");
    StdMutableFst *fst =
        StdMutableFst::Read(prefix + ".corpus.cnts", true);
    if (smooth != "no") {
        cout << "Smoothing model..." << endl;

        bool prefix_norm = 0;
        if (smooth == "presmoothed") {  // only for use with randgen counts
            prefix_norm = 1;
            smooth = "unsmoothed";      // normalizes only based on prefix count
        }
        if (smooth == "kneser_ney") {
            NGramKneserNey ngram(fst, backoff, backoff_label,
                                 norm_eps, check_consistency,
                                 discount_D, bins);
            ngram.MakeNGramModel();
            fst = ngram.GetMutableFst();
        }
        else if (smooth == "absolute") {
            NGramAbsolute ngram(fst, backoff, backoff_label,
                                norm_eps, check_consistency,
                                discount_D, bins);
            ngram.MakeNGramModel();
            fst = ngram.GetMutableFst();
        }
        else if (smooth == "katz") {
            NGramKatz ngram(fst, backoff, backoff_label,
                            norm_eps, check_consistency, bins);
            ngram.MakeNGramModel();
            fst = ngram.GetMutableFst();
        }
        else if (smooth == "witten_bell") {
            NGramWittenBell ngram(fst, backoff, backoff_label,
                                  norm_eps, check_consistency,
                                  witten_bell_k);
            ngram.MakeNGramModel();
            fst = ngram.GetMutableFst();
        }
        else if (smooth == "unsmoothed") {
            NGramUnsmoothed ngram(fst, 1, prefix_norm, backoff_label,
                                  norm_eps, check_consistency);
            ngram.MakeNGramModel();
            fst = ngram.GetMutableFst();
        }
        else {
            E_FATAL("Bad smoothing method: %s\n", smooth.c_str());
        }
    }
    if (prune != "no") {
        cout << "Pruning model..." << endl;

        if (prune == "count_prune") {
            NGramCountPrune ngramsh(fst, count_pattern,
                                    shrink_opt, total_unigram_count,
                                    backoff_label, norm_eps,
                                    check_consistency);
            ngramsh.ShrinkNGramModel();
        }
        else if (prune == "relative_entropy") {
            NGramRelEntropy ngramsh(fst, theta, shrink_opt,
                                    total_unigram_count, backoff_label,
                                    norm_eps, check_consistency);
            ngramsh.ShrinkNGramModel();
        }
        else if (prune == "seymore") {
            NGramSeymoreShrink ngramsh(fst, theta, shrink_opt,
                                       total_unigram_count, backoff_label,
                                       norm_eps, check_consistency);
            ngramsh.ShrinkNGramModel();
        }
        else {
            E_FATAL("Bad shrink method:  %s\n", prune.c_str());
        }
    }

    cout << "Minimizing model..." << endl;
    MutableFstClass *minimized = new s::MutableFstClass(*fst);
    Minimize(minimized, 0, fst::kDelta);
    fst = minimized->GetMutableFst<StdArc>();

    cout << "Correcting final model..." << endl;
    StdMutableFst *out = new StdVectorFst();
    relabel(fst, out, prefix, eps, skip, s1s2_sep, seq_sep);

    cout << "Writing binary model to disk..." << endl;
    out->Write(prefix + ".fst");
}