Example #1
0
void
M2MFstAligner::write_lattice(string lattice)
{
    // Write the entire training set to disk as one lattice: the union of
    // every per-entry alignment lattice stored in `fsas`.  The output can be
    // plugged directly in to a counter to obtain expected alignment counts
    // for the EM-trained corpus.  Yields far higher-quality joint n-gram
    // models, which are also more robust for smaller training corpora.
    //
    // Make sure you call this BEFORE any call to write_all_alignments, as
    // the latter function will override some of the weights.
    //
    // lattice: path the unioned FST is written to.  The input symbol table
    //          is always written to "lattice.syms" in the working directory.
    //
    // Chaining the standard Union operation, including using a rational
    // FST, still performs very poorly in the log semiring.  Presumably it's
    // running push or something at each step.  It should be fine to do that
    // just once at the end.  Rolling our own union turns out to be MUCH
    // faster.
    VectorFst<LogArc> ufst;
    ufst.AddState();
    ufst.SetStart(0);

    // Offset that maps a non-zero state id of the current lattice onto its
    // id in the union.  State 0 of every lattice is merged into the shared
    // start state, hence the "- 1" when the offset is advanced.
    int total_states = 0;
    for (size_t i = 0; i < fsas.size(); i++) {
        // An empty lattice contributes nothing to the union.  Without this
        // guard NumStates() - 1 would be -1 and every later lattice would
        // have its arcs wired to the wrong destination states.
        if (fsas[i].NumStates() == 0)
            continue;

        // The id arithmetic below relies on state 0 being the start state
        // once the lattice has been topologically sorted.
        TopSort(&fsas[i]);
        for (StateIterator<VectorFst<LogArc> > siter(fsas[i]);
                !siter.Done(); siter.Next()) {
            const LogArc::StateId q = siter.Value();
            const LogArc::StateId r = (q == 0) ? 0 : ufst.AddState();

            for (ArcIterator <VectorFst<LogArc> > aiter(fsas[i], q);
                    !aiter.Done(); aiter.Next()) {
                const LogArc & arc = aiter.Value();
                // The input label is written on both sides of the arc, so
                // the union is an acceptor over the input labels.
                ufst.AddArc(r,
                            LogArc(arc.ilabel, arc.ilabel, arc.weight,
                                   arc.nextstate + total_states));
            }
            // Final weights are replaced by One(); only arc weights are
            // carried over.
            if (fsas[i].Final(q) != LogWeight::Zero())
                ufst.SetFinal(r, LogWeight::One());
        }
        total_states += fsas[i].NumStates() - 1;
    }
    //Normalize weights
    Push(&ufst, REWEIGHT_TO_INITIAL);
    //Write the resulting lattice to disk
    ufst.Write(lattice);
    //Write the syms table too.
    isyms->WriteText("lattice.syms");
}
Example #2
0
void
M2MFstAligner::write_model(string _model_file)
{
    // Serialise the alignment model as a single-state FST: the only state is
    // both start and final, and every model entry becomes a self-loop whose
    // input and output label are the entry's key and whose weight is the
    // entry's value.  The result is written to _model_file.
    const LogArc::StateId only_state = 0;

    VectorFst<LogArc> model;
    model.AddState();
    model.SetStart(only_state);
    model.SetFinal(only_state, LogWeight::One());

    for (map<LogArc::Label,LogWeight>::iterator entry = alignment_model.begin();
            entry != alignment_model.end(); ++entry) {
        const LogArc::Label label = entry->first;
        model.AddArc(only_state,
                     LogArc(label, label, entry->second, only_state));
    }

    // Only the input symbol table is attached before writing.
    model.SetInputSymbols(isyms);
    model.Write(_model_file);
}
/**
 * Create an FST based on an RNN.
 *
 * Starting from an initial state built from the RNN hidden layer, states are
 * taken one at a time from a FIFO queue. For each state, every vocabulary word
 * is either kept as an explicit arc (when its entropy contribution `delta`
 * exceeds pruning_threshold * entropy, or when the state belongs to
 * set_min_backoff) or pruned. A state with at least one pruned word gets an
 * epsilon arc to its backoff state. An accepted word 0 leads to FINAL_STATE;
 * any other accepted word leads to the state of the updated history, which is
 * queued when it is new. After the traversal, backoff nodes are compacted,
 * backoff weights are computed, deleted states are removed and a symbol table
 * is attached to the result.
 *
 * @param rnnlm  RNN language model; queried here for layer/vocabulary sizes
 *               and word strings, and handed to the helper functions.
 * @param fst    FST that is filled in; at the end it is replaced by the copy
 *               produced by removeStates().
 */
void FlatBOFstBuilder::convertRNN(CRnnLM & rnnlm, VectorFst<LogArc> &fst) {
	queue<NeuronFstHistory> q;
	VectorFst<LogArc> new_fst;
	
	// History / id of the state currently being expanded.
	NeuronFstHistory fsth(rnnlm.getHiddenLayerSize(),getNumBins());
	FstIndex id = 0;
	
	// History / id of the destination state of the arc being created.
	NeuronFstHistory new_fsth(rnnlm.getHiddenLayerSize(),getNumBins());
	FstIndex new_id;

	NeuronFstHistory min_backoff(rnnlm.getHiddenLayerSize(),getNumBins());
	set<NeuronFstHistory>set_min_backoff;
	
	NeuronFstHistory bo_fsth(rnnlm.getHiddenLayerSize(),getNumBins());
	bool backoff = false;
	vector<FstIndex> deleted;


	real p = 0.00;
	real p_joint = 0.00;
	real entropy = 0.0;
	real delta = 0.0;
	vector<real> all_prob(rnnlm.getVocabSize());
	// Only referenced from commented-out code below.
 	vector<real> posterior(10);
	
	map< FstIndex,set<FstIndex> > pred;
	vector<bool> non_bo_pred(rnnlm.getVocabSize());
	// Words (and their all_prob values) accepted for the current state;
	// both vectors are cleared at the end of every queue iteration.
	vector<int> to_be_added;
	vector<int> to_be_removed;
	// to_be_removed is filled once with every word index and never cleared.
	for (int i = 0; i < rnnlm.getVocabSize(); i++) {
		to_be_removed.push_back(i);
	}
	vector<real> to_be_added_prob;


	// Progress counters; the next_* variants are updated inside the word
	// loop and copied into n_added / n_processed after each state.
 	FstIndex n_added = 0;
 	FstIndex n_processed = 0;
 	FstIndex next_n_added = 0;
 	FstIndex next_n_processed = 0;
 	FstIndex n_backoff = 0;
 	FstIndex n_only_backoff = 0;
 	
	// v is not read anywhere below.
	int v = rnnlm.getVocabSize();
	int w = 0;


	// Initialize
	rnnlm.copyHiddenLayerToInput();
//	printNeurons(rnnlm.getInputLayer(),0,10);

	// Initial state ( 0 | hidden layer after </s>)
	printNeurons(rnnlm.getHiddenLayer(),0,10);
	fsth.setFstHistory(rnnlm, *dzer);
	fsth.setLastWord(0);
	q.push(fsth);
	// NOTE(review): a heap-allocated copy is handed to addFstState();
	// presumably addFstState takes ownership of it - confirm.
	addFstState(id, new NeuronFstHistory(fsth), fst);
	fst.SetStart(INIT_STATE);
	
	// Final state (don't care about the associated discrete representation)
	fst.AddState();
	fst.SetFinal(FINAL_STATE, LogWeight::One());
	
 	/*posterior.at(INIT_STATE) = MY_LOG_ONE;*/
	// Compute the backoff of a history whose last word is -1 and register it
	// in set_min_backoff; states in that set keep all their word arcs.
	min_backoff.setLastWord(-1);
	computeEntropyAndConditionals(entropy, all_prob, rnnlm, min_backoff);
	min_backoff = getBackoff(rnnlm, min_backoff, set_min_backoff, all_prob, to_be_removed);
	cout << "MIN BACKOFF " << min_backoff.toString() << endl;
	set_min_backoff.insert(min_backoff);
	
//	addFstState(id, min_backoff, fst);
//	q.push(min_backoff);
	

	
	// Estimate number of backoff loop to bound the backoff path length
// 	float ratioa = 0.0;
// 	float ratiob = 0.0;
	float ratio = 0.0;
// 	for (int i=0; i < min_backoff.getNumDims(); i++) {
// 		if (min_backoff.getDim(i) == 1) {
// 			ratioa++;
// 		}
// 		if (fsth.getDim(i) == 1) {
// 			ratiob++;
// 		}
// 	}
// 	ratioa /= min_backoff.getNumDims();
// 	ratiob /= min_backoff.getNumDims();
// 	ratio = (ratioa*(1.0-ratiob))+(ratiob*(1.0-ratioa));
	// The estimate above is disabled; ratio is fixed and not read afterwards.
	ratio=1.0;

//	printf("ratio=%f\t%i BO loops\n", ratio, n_bo_loops);
	
	
	
	//foreach state in the queue
	while (!q.empty()) {
		fsth = q.front();
		q.pop();
		// NOTE(review): h2state is indexed with the address of the local
		// fsth; presumably its comparator compares the pointed-to histories
		// rather than the pointers - confirm.
		id = h2state[&fsth];
		state2h.push_back(new NeuronFstHistory(fsth));
		if (id == FINAL_STATE) { continue; }


		
		
	dprintf(1,"-- STUDY STATE %li = %s\n", id, fsth.toString().c_str());
	

/*		try { posterior.at(id) = MY_LOG_ONE; }
		catch (exception e) {
			posterior.resize((int) (posterior.size()*1.5)+1);
			posterior.at(id) = MY_LOG_ONE;
		}*/
		
		// Fills entropy and all_prob for the current state.
		computeEntropyAndConditionals(entropy, all_prob, rnnlm, fsth);
		
		//compute BO in advance and check if it is a min BO node
		bo_fsth = getBackoff(rnnlm, fsth, set_min_backoff, all_prob, to_be_removed);
		// A state that would back off to itself backs off to min_backoff instead.
		if (bo_fsth == fsth) { bo_fsth = min_backoff; }
			
		//foreach w (ie, foreach word of each class c)
		//test if the edge has to kept or removed
		backoff = false; //no backoff yet since no edge has been removed
		for (w=0; w < rnnlm.getVocabSize(); w++) {
				p = all_prob[w];
				
				/*p_joint = exp(-posterior[id]-p);*/
				// all_prob[w] is used as a negated natural-log value here
				// (exp(-p)) - NOTE(review): confirm against
				// computeEntropyAndConditionals.
				p_joint = exp(-p);
				// Entropy term of this word, in bits.
				delta = -1.0*p_joint*log2(p_joint);
				
				//accept edge if this leads to a minimum
				//relative gain of the entropy

				dprintf(2,"P = %e \tP_joint = %e \tH = %e \tDelta =%e \tDelta H = %.6f %%\n",exp(-p), p_joint, entropy, delta, 100.0*delta/entropy);

				// Keep the arc when the state is a registered min-backoff
				// state, or when the word's entropy term exceeds the
				// pruning_threshold fraction of the state's entropy.
				if (set_min_backoff.find(fsth) != set_min_backoff.end() || (delta > pruning_threshold*entropy)) {
//				if ((fsth == min_backoff) || (delta > pruning_threshold*entropy)) {
					next_n_added++;
					to_be_added.push_back(w);
					to_be_added_prob.push_back(p);
					dprintf(2,"\tACCEPT [%li] -- %i (%s) / %f --> ...\t(%e > %e)\n", id, w, rnnlm.getWordString(w), p, delta, pruning_threshold*entropy);
//					to_be_removed.push_back(w);
 				}
 				//backoff
				else {
//					to_be_removed.push_back(w);
					backoff = true;
					dprintf(2,"\tPRUNE [%li] -- %i / %f --> ...\n", id, w, p);
 				}
 				
 				//print
				// NOTE(review): n_processed and n_added are still 0 the first
				// time this fires, so the percentages are computed as a
				// floating-point 0/0 on that first line.
				if (next_n_processed % 100000 == 0) {
						fprintf(stderr, "\rH=%.5f / N proc'd=%li / N added=%li (%.5f %%) / N bo=%li (%.5f %%) / %li/%li Nodes (%2.1f %%) / N min BO=%i", entropy, n_processed, n_added, ((float) n_added/ (float)n_processed)*100.0, n_backoff, ((float) n_backoff/ (float)n_added)*100.0, id, id+q.size(), 100.0 - (float) (100.0*id/(id+q.size())), (int) set_min_backoff.size());
				}
				next_n_processed++;
 				
//			}
		}


		//Set a part of the new FST history
		new_fsth.setFstHistory(rnnlm, *dzer);

		//if at least one word is backing off
		if (backoff) {
			
			n_backoff++;
			if (to_be_added.size() == 0) {
				n_only_backoff++;
			}
			
			
			// addFstState() returning true is treated as "state newly
			// created": the backoff history is queued and flagged as having
			// no non-backoff predecessor so far.
			if (addFstState(new_id, new NeuronFstHistory(bo_fsth), fst)) {
				q.push(bo_fsth);
				// Grow non_bo_pred on demand when new_id is out of range.
				// NOTE(review): the exception is caught by value.
				try { non_bo_pred.at(new_id) = false; }
				catch (exception e) {
					non_bo_pred.resize(new_id+(int) (non_bo_pred.size()*0.5)+1);
					non_bo_pred.at(new_id) = false;
				}
				
			}
			dprintf(1,"BACKOFF\t[%li]\t(%s)\n-------\t[%li]\t(%s)\n", id, fsth.toString().c_str(), new_id, bo_fsth.toString().c_str());

			// The backoff arc is created with weight LogWeight::Zero();
			// NOTE(review): presumably the real weight is filled in by
			// computeAllBackoff() after the traversal - confirm.
			fst.AddArc(id, LogArc(EPSILON, EPSILON, LogWeight::Zero(), new_id));
			
			addPred(pred, new_id, id);
			
		}
		
		
		// Walk the accepted words and their values in lockstep.
		vector<real>::iterator it_p = to_be_added_prob.begin();
		for (vector<int>::iterator it = to_be_added.begin(); it != to_be_added.end(); ++it) {
			w = *it;
			p = *it_p;

			// Word 0 goes straight to the final state.
			if (w == 0) {
				fst.AddArc(id, LogArc(FstWord(w),FstWord(w),p,FINAL_STATE));
				dprintf(1,"EDGE [%li] (%s)\n---- %i (%s) / %f -->\n---- [%li] FINAL STATE)\n\n", id, fsth.toString().c_str(), FstWord(w), rnnlm.getWordString(w), p, FINAL_STATE);				
			}
		
			//accept edge
			else {
				new_fsth.setLastWord(w);
	
				//if sw not in the memory
				//then add a new state for sw in the FST and push sw in the queue
				if (addFstState(new_id, new NeuronFstHistory(new_fsth), fst)) {
					q.push(new_fsth);
					try { non_bo_pred.at(new_id) = true; }
					catch (exception e) {
						non_bo_pred.resize(new_id+(int) (non_bo_pred.size()*0.5)+1);
						non_bo_pred.at(new_id) = true;
					}
				}
				else { /* already exists */ }
			
				//add the edge in the FST
				// Unlike the writes above, this at() is not guarded by a
				// try/catch.
				non_bo_pred.at(new_id) = true;
				fst.AddArc(id, LogArc(FstWord(w),FstWord(w),p,new_id));
				dprintf(1,"EDGE [%li] (%s)\n---- %i (%s) / %f -->\n---- [%li] (%s)\n\n", id, fsth.toString().c_str(), FstWord(w), rnnlm.getWordString(w), p, new_id, new_fsth.toString().c_str());				

//				posterior.at(new_id) += posterior[id]*p;

			}
			
			/*if (posterior[id]+p < LogWeight::Zero().Value()) {
				p_joint = exp(-posterior[id]-p);
				entropy -= p_joint*log2(p_joint);
			}*/
			
			++it_p;
		}
		
		n_added = next_n_added;
		n_processed = next_n_processed;
		
		//reset queues
		to_be_added.clear();
		to_be_added_prob.clear();
//		to_be_removed.clear();
		
	}

	cout << endl;
	
	//compute backoff weights
	deleted = compactBackoffNodes(fst, pred, non_bo_pred);
	computeAllBackoff(fst, pred);


	//remove useless nodes
	removeStates(fst, new_fst, deleted);
	fst.DeleteStates();
	fst = new_fst;
	
	//Fill the table of symbols
	// Symbol 0 is "*"; vocabulary word i is stored under id i+1.
	SymbolTable dic("dictionnary");
	dic.AddSymbol("*", 0);
	for (int i=0; i<rnnlm.getVocabSize(); i++) {
		dic.AddSymbol(string(rnnlm.getWordString(i)), i+1);
	}
	// NOTE(review): dic is a local object; presumably the FST copies the
	// table when it is attached - confirm.
	fst.SetInputSymbols(&dic);
	fst.SetOutputSymbols(&dic);

						//printf("H=%.5f / N proc'd=%li / N added=%li (%.5f %%) %li/%li Nodes (%2.1f %%)\n", entropy, n_processed, n_added, ((float) n_added/ (float)n_processed)*100.0, id, id+q.size(), 100.0 - (float) (100.0*id/(id+q.size())));
	cout << "END" << endl;
	
}
 // Arc mapper: rewrites the input label through InputArcInfos_ (indexed by
 // the arc's current input label). When the table is empty the arc is
 // returned with its fields unchanged.
 LogArc operator()(const LogArc &arc) const {
   if (InputArcInfos_.empty())
     return LogArc(arc.ilabel, arc.olabel, arc.weight, arc.nextstate);

   const int mapped_ilabel = InputArcInfos_[arc.ilabel].label;
   return LogArc(mapped_ilabel, arc.olabel, arc.weight, arc.nextstate);
 }
 // Arc mapper: sets the input label from the output label. Output labels
 // less than or equal to eps_label_ produce input label 0; any other output
 // label is copied to the input side. The output label itself is kept.
 LogArc operator()(const LogArc &arc) const {
   if (arc.olabel <= eps_label_)
     return LogArc(0, arc.olabel, arc.weight, arc.nextstate);

   return LogArc(arc.olabel, arc.olabel, arc.weight, arc.nextstate);
 }
 // Arc mapper: multiplies the arc weight's value by weight_. A weight equal
 // to positive infinity is passed through as is instead of being scaled.
 LogArc operator()(const LogArc &arc) const {
   const float cost = arc.weight.Value();
   if (cost == FloatLimits<float>::PosInfinity())
     return LogArc(arc.ilabel, arc.olabel, LogWeight(cost), arc.nextstate);

   return LogArc(arc.ilabel, arc.olabel, LogWeight(cost * weight_), arc.nextstate);
 }