void PhonetisaurusE2F::entry_to_fsa_w( vector<string>* tokens ){
  /*
    Convert an input grapheme sequence to an equivalent Finite-State Machine.
  */
  //Build a linear FSA representing the input
  _entry_to_skip_fsa( tokens );

  //Add any multi-grapheme arcs
  word = VectorFst<StdArc>(ComposeFst<StdArc>(word, ifilter));
  Project(&word, PROJECT_OUTPUT);

  //Now optimize the result
  RmEpsilon(&word);
  ArcSort(&word, OLabelCompare<StdArc>());

  return;
}
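//A minimal sketch of the linear-FSA construction that _entry_to_skip_fsa is
// assumed to perform above (skip/deletion arcs omitted): one state per token
// boundary and one identity arc per grapheme.  The function name and the
// free-standing signature are illustrative only - this is not the actual
// helper from this class; the symbol table argument stands in for isyms.
static VectorFst<StdArc> _sketch_linear_fsa( const vector<string>& tokens,
                                             SymbolTable* syms ){
  VectorFst<StdArc> fsa;
  fsa.AddState();
  fsa.SetStart(0);
  for( int i = 0; i < (int)tokens.size(); i++ ){
    fsa.AddState();
    int64 lab = syms->AddSymbol( tokens[i] );
    //Each grapheme becomes a single identity arc i -> i+1 with unit weight.
    fsa.AddArc( i, StdArc( lab, lab, StdArc::Weight::One(), i+1 ) );
  }
  fsa.SetFinal( (int)tokens.size(), StdArc::Weight::One() );
  return fsa;
}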
void M2MFstAligner::_conditional_max( bool y_given_x ){
  /*
    Compute the conditional distribution, P(Y|X), using the WFST paradigm.
    This is based on the approach from Shu and Hetherington 2002.
    It is assumed that all WFSTs and operations use the Log semiring.

    Given:
       FST1 = P(X,Y)
    Compute:
       FST2 = P(X)   := Map_inv(Det(RmEps(Proj_i(FST1))))
       FST3 = P(Y|X) := Compose(FST2,FST1)
    Where:
       Proj_i:  project on input labels
       RmEps:   epsilon removal
       Det:     determinize
       Map_inv: invert weights

    Notes: An analogous process may be used to compute P(X|Y).  In this case
      one would project on OUTPUT labels - Proj_o - and reverse the composition
      order to Compose(FST1,FST2).

    Future work: What we are doing here in terms of *generating* the JOINT fst
      each time is really just a dumb hack.  We *should* encode the model in an
      FST and encode the individual lattices, rather than doing the hacky
      manual label encoding that we currently rely on.
  */

  //Joint distribution that we start with
  VectorFst<LogArc>* joint = new VectorFst<LogArc>();
  SymbolTable* misyms = new SymbolTable("misyms");
  SymbolTable* mosyms = new SymbolTable("mosyms");
  joint->AddState();
  joint->AddState();
  joint->SetStart(0);
  joint->SetFinal(1, LogArc::Weight::One());

  map<LogArc::Label,LogWeight>::iterator it;
  for( it=prev_alignment_model.begin(); it != prev_alignment_model.end(); it++ ){
    string isym = isyms->Find((*it).first);
    vector<string> io = tokenize_utf8_string( &isym, &s1s2_sep );
    LogArc arc( misyms->AddSymbol(io[0]), mosyms->AddSymbol(io[1]), (*it).second, 1 );
    joint->AddArc( 0, arc );
  }
  //VectorFst<LogArc>* joint = new VectorFst<LogArc>();
  //Push<LogArc,REWEIGHT_TO_FINAL>(*_joint, joint, kPushWeights);
  //joint->SetFinal(1,LogWeight::One());
  joint->Write("m2mjoint.fst");

  //BEGIN COMPUTE MARGINAL P(X)
  VectorFst<LogArc>* dmarg;
  if( y_given_x )
    dmarg = new VectorFst<LogArc>(ProjectFst<LogArc>(*joint, PROJECT_INPUT));
  else
    dmarg = new VectorFst<LogArc>(ProjectFst<LogArc>(*joint, PROJECT_OUTPUT));
  RmEpsilon(dmarg);
  VectorFst<LogArc>* marg = new VectorFst<LogArc>();
  Determinize(*dmarg, marg);
  ArcMap(marg, InvertWeightMapper<LogArc>());
  if( y_given_x )
    ArcSort(marg, OLabelCompare<LogArc>());
  else
    ArcSort(marg, ILabelCompare<LogArc>());
  //END COMPUTE MARGINAL P(X)
  marg->Write("marg.fst");

  //CONDITIONAL P(Y|X)
  VectorFst<LogArc>* cond = new VectorFst<LogArc>();
  if( y_given_x )
    Compose(*marg, *joint, cond);
  else
    Compose(*joint, *marg, cond);
  //cond now contains the conditional distribution P(Y|X)
  cond->Write("cond.fst");

  //Now update the model with the new values
  for( MutableArcIterator<VectorFst<LogArc> > aiter(cond, 0); !aiter.Done(); aiter.Next() ){
    LogArc arc = aiter.Value();
    string lab = misyms->Find(arc.ilabel) + "}" + mosyms->Find(arc.olabel);
    int labi = isyms->Find(lab);
    alignment_model[labi]      = arc.weight;
    prev_alignment_model[labi] = LogWeight::Zero();
  }

  //The comma operator in 'delete a, b, c;' only frees the first object,
  // so delete each one explicitly.
  delete joint;
  delete marg;
  delete cond;
  delete dmarg;
  delete misyms;
  delete mosyms;

  return;
}
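//A compact restatement of the conditionalization recipe documented above,
// applied to an arbitrary Log-semiring joint FST for the P(Y|X) case.  This is
// an illustrative sketch only: the function name and free-standing signature
// are not part of the aligner's API, and the debug writes, symbol handling and
// model update from _conditional_max are intentionally omitted.
static VectorFst<LogArc> _sketch_conditionalize( const VectorFst<LogArc>& joint ){
  //P(X): project onto the input side, remove epsilons, determinize ...
  VectorFst<LogArc> proj( ProjectFst<LogArc>( joint, PROJECT_INPUT ) );
  RmEpsilon( &proj );
  VectorFst<LogArc> marg;
  Determinize( proj, &marg );
  //... then invert the weights, giving 1/P(X) in the Log semiring.
  ArcMap( &marg, InvertWeightMapper<LogArc>() );
  ArcSort( &marg, OLabelCompare<LogArc>() );
  //P(Y|X) = Compose( 1/P(X), P(X,Y) )
  VectorFst<LogArc> cond;
  Compose( marg, joint, &cond );
  return cond;
}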
vector<PathData> M2MFstAligner::write_alignment(const VectorFst<LogArc> &ifst, int nbest) {
  //Generic alignment generator
  VectorFst<StdArc> fst;
  Map(ifst, &fst, LogToStdMapper());

  for (StateIterator<VectorFst<StdArc> > siter(fst); !siter.Done(); siter.Next()) {
    StdArc::StateId q = siter.Value();
    for (MutableArcIterator<VectorFst<StdArc> > aiter(&fst, q); !aiter.Done(); aiter.Next()) {
      //Prior to decoding we make several 'heuristic' modifications to the weights:
      // 1. A multiplier is applied to any multi-token substrings.
      // 2. Any LogWeight::Zero() arc weights are reset to '999'.
      //    We are basically resetting 'Infinity' values to a 'smallest non-Infinity'
      //    value so that the ShortestPath algorithm actually produces something
      //    no matter what.
      // 3. Any arcs subseq1:subseq2 where both subsequences have the same length
      //    and len(subseq1)>1 are set to '999'.  This forces ShortestPath to choose
      //    arcs where one of the following conditions holds:
      //     * len(subseq1)>1 && len(subseq2)!=len(subseq1)
      //     * len(subseq2)>1 && len(subseq1)!=len(subseq2)
      //     * len(subseq1)==len(subseq2)==1
      //I suspect these heuristics can be eliminated with a better choice of the
      // initialization function and maximization function, but this is the way
      // that m2m-aligner works, so it makes sense for our first-cut implementation.
      //In any case, this guarantees that M2MFstAligner produces results identical
      // to those produced by m2m-aligner - but with a bit more reliability.
      //UPDATE: this now produces a better alignment than m2m-aligner.
      // The maxl heuristic is still in place.  The aligner will produce *better*
      // 1-best alignments *without* the maxl heuristic below, BUT this comes at
      // the cost of producing a less flexible corpus.  That is, for a small
      // training corpus like nettalk, if we use the best alignment we wind up
      // with more 'chunks' and thus get worse coverage for unseen data.  Using
      // the alignment lattices to train the joint ngram model solves this
      // problem.  Oh baby.  Can't wait for everyone to see the paper!
      //NOTE: this is going to fail if we encounter any alignments in a new test
      // item that never occurred in the original model.
      StdArc arc = aiter.Value();
      int maxl = get_max_length(isyms->Find(arc.ilabel));
      if (maxl == -1) {
        arc.weight = 999;
      } else {
        //Optionally penalize m-to-1 / 1-to-m links.  This produces WORSE 1-best
        // alignments, but results in better joint n-gram models for small
        // training corpora when using only the 1-best alignment.  By further
        // favoring 1-to-1 alignments the 1-best alignment corpus results in a
        // more flexible joint n-gram model with regard to previously unseen data.
        //if( penalize==true ){
        arc.weight = alignment_model[arc.ilabel].Value() * maxl;
        //}else{
        //For larger corpora this is probably unnecessary.
        //arc.weight = alignment_model[arc.ilabel].Value();
        //}
      }
      //Sanitize 'Infinity' and NaN weights so ShortestPath always has a usable arc.
      if (arc.weight == LogWeight::Zero())
        arc.weight = 999;
      if (arc.weight != arc.weight)  //self-inequality is true only for NaN
        arc.weight = 999;
      aiter.SetValue(arc);
    }
  }

  VectorFst<StdArc> shortest;
  ShortestPath(fst, &shortest, nbest);
  RmEpsilon(&shortest);
  //Skip empty results.  This should only happen in the following situations:
  // 1. seq1_del=false && len(seq1)<len(seq2)
  // 2. seq2_del=false && len(seq1)>len(seq2)
  //In both 1. and 2. the issue is that we need to insert a 'skip' in order to
  // guarantee at least one valid alignment path through seq1*seq2, but the user
  // params didn't allow us to.
  //Probably better to insert these where necessary during initialization,
  // regardless of user prefs.
  if (shortest.NumStates() == 0) {
    vector<PathData> dummy;
    return dummy;
  }

  FstPathFinder pathfinder(skipSeqs);
  pathfinder.isyms = isyms;
  pathfinder.findAllStrings(shortest);

  return pathfinder.paths;
}
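//A minimal usage sketch for the decoder above.  The names 'aligner' and
// 'lattice' are placeholders: the lattice would be one of the per-entry
// Log-semiring alignment FSTs the aligner builds during training.  PathData is
// defined in FstPathFinder, so its fields are deliberately not touched here.
// An empty result corresponds to cases 1. and 2. in the comment above, e.g.
// aligning a 2-grapheme word to 3 phonemes with seq1_del=false, where every
// path would need a disallowed 'skip' arc on the grapheme side.
//
//   vector<PathData> best = aligner.write_alignment( lattice, 1 );
//   if (best.empty())
//     cerr << "No alignment path found - check seq1_del/seq2_del settings." << endl;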