void PhonetisaurusE2F::_make_loop_and_iomap( const EncodeTable<StdArc>& table ){ loop = new VectorFst<StdArc>(); loop->AddState(); loop->SetStart(0); if( verbose==true ){ for( size_t i=1; i<=table.Size(); i++ ){ const EncodeTable<StdArc>::Tuple *t = table.Decode(i); cout << "i=" << i << " in: " << isyms->Find(t->ilabel) << " out: " << osyms->Find(t->olabel) << endl; } } for( size_t i=2; i<=table.Size(); i++ ){ const EncodeTable<StdArc>::Tuple *t = table.Decode(i); if( i2omap->find(t->ilabel)==i2omap->end() ){ vector<size_t> m; m.push_back(t->olabel); i2omap->insert(pair<size_t, vector<size_t> >(t->ilabel, m)); loop->AddArc( 0, StdArc( t->ilabel, t->olabel, StdArc::Weight::One(), 0 ) ); }else{ (*i2omap)[t->ilabel].push_back(t->olabel); loop->AddArc( 0, StdArc( t->ilabel, t->olabel, StdArc::Weight::One(), 0 ) ); } } loop->SetFinal(0, StdArc::Weight::One()); ArcSort(loop, ILabelCompare<StdArc>()); return; }
//STEP 2: Create a filter, which adds multi-token links and skip support void PhonetisaurusE2F::_make_ifilter( ){ /* Create a filter FST. This will map arcs in the linear input FSA to longer clusters wherever appropriate. A more advanced version can be used to also place restrictions on how many phoneme insertions to allow, or how to penalize them. */ ifilter.AddState(); ifilter.SetStart(0); for( size_t j=2; j<isyms->NumSymbols(); j++ ){ ifilter.AddArc( 0, StdArc( j, j, StdArc::Weight::One(), 0 ) ); } typedef map<vector<string>, size_t>::iterator cl_iter; size_t k = 1; for( cl_iter it=iclusters->begin(); it != iclusters->end(); it++){ ifilter.AddState(); ifilter.AddArc( 0, StdArc( isyms->Find(it->first.at(0)), it->second, StdArc::Weight::One(), k ) ); ifilter.AddArc( k, StdArc( isyms->Find(it->first.at(1)), 0, StdArc::Weight::One(), 0 ) ); k++; } ifilter.SetFinal( 0, StdArc::Weight::One() ); return; }
//STEP 1: Create a linear FSA with skip loops void PhonetisaurusE2F::_entry_to_skip_fsa( vector<string>* tokens ){ word = VectorFst<StdArc>(); word.AddState(); word.SetStart(0); size_t i=0; for( i=0; i<tokens->size(); i++){ word.AddState(); string ch = tokens->at(i); word.AddArc( i, StdArc( isyms->Find(ch), isyms->Find(ch), StdArc::Weight::One(), i+1 ) ); //If phoneme insertions are to be allowed if( allow_ins==true ) word.AddArc( i, StdArc( 2, 2, StdArc::Weight::One(), i ) ); } if( allow_ins==true ) word.AddArc( i, StdArc( 2, 2, StdArc::Weight::One(), i ) ); word.SetFinal( i, StdArc::Weight::One() ); ArcSort(&word,OLabelCompare<StdArc>()); return; }
void PhonetisaurusE2F::entry_to_fst_m( vector<string>* tokens ){
    /*
      Convert an input word into an equivalent FST.  In this case the
      entire process is achieved via a 'mechanical' algorithm rather
      than a series of atomic WFST-based operations.

      tokens: the input token sequence; token k spans state k -> k+1.

      For each token, one arc per output label registered in `i2omap`
      for the token's input label is added.  When `allow_ins` is set,
      each state (including the last) also gets one self-loop per
      output label registered for input label 2.  Finally, for every
      cluster in `iclusters`, arcs spanning each non-overlapping
      occurrence of the cluster in `tokens` are added, one per output
      label registered for the cluster's id.

      Lookups in `i2omap` use operator[], so labels with no entry get
      an empty list inserted (and contribute no arcs).

      NOTE(review): `word` is not reset here; the caller is presumably
      expected to hand over an empty FST - confirm.
    */
    word.AddState();
    word.SetStart(0);

    //Build the basic FST
    const size_t num_tokens = tokens->size();
    for( size_t i = 0; i < num_tokens; i++ ){
        word.AddState();
        size_t il = isyms->Find( tokens->at(i) );

        vector<size_t>& outs = (*i2omap)[il];
        for( size_t j = 0; j < outs.size(); j++ )
            word.AddArc( i, StdArc( il, outs[j], StdArc::Weight::One(), i+1 ) );

        if( allow_ins == true ){
            vector<size_t>& ins = (*i2omap)[2];
            for( size_t j = 0; j < ins.size(); j++ )
                word.AddArc( i, StdArc( 2, ins[j], StdArc::Weight::One(), i ) );
        }
    }
    if( allow_ins == true ){
        vector<size_t>& ins = (*i2omap)[2];
        for( size_t j = 0; j < ins.size(); j++ )
            word.AddArc( num_tokens,
                         StdArc( 2, ins[j], StdArc::Weight::One(), num_tokens ) );
    }

    //Add any cluster arcs
    map<vector<string>,size_t>::iterator it_i;
    for( it_i = iclusters->begin(); it_i != iclusters->end(); it_i++ ){
        const vector<string>& cluster = it_i->first;

        //An empty pattern matches at `start` without ever advancing it,
        // which would spin forever; there is nothing to add for it anyway.
        if( cluster.empty() )
            continue;

        vector<size_t>& outs = (*i2omap)[it_i->second];

        //The scan position is always assigned by search() before it is
        // compared; the previous version tested an uninitialised
        // iterator in the loop condition.
        vector<string>::iterator start = tokens->begin();
        while( true ){
            vector<string>::iterator hit =
                search( start, tokens->end(), cluster.begin(), cluster.end() );
            if( hit == tokens->end() )
                break;

            const size_t from = hit - tokens->begin();
            for( size_t j = 0; j < outs.size(); j++ )
                word.AddArc( from,
                             StdArc( it_i->second,            //input symbol
                                     outs[j],                 //output symbol
                                     StdArc::Weight::One(),   //weight
                                     from + cluster.size()    //destination state
                                   ) );

            //Resume after the match: occurrences do not overlap
            start = hit + cluster.size();
        }
    }

    word.SetFinal( num_tokens, StdArc::Weight::One() );
    return;
}
// Convert a LexStdArc into a StdArc, keeping labels and destination state.
// The member i_ selects how the two-component weight is reduced:
//   0 -> Times( Value1, Value2 )
//   1 -> Value1
//   2 -> Value2
// For any other i_ the weight is left as default-constructed.
StdArc operator() (const LexStdArc& arc) const {
  W reduced;
  if (i_ == 0) {
    reduced = Times ( arc.weight.Value1(), arc.weight.Value2() );
  } else if (i_ == 1) {
    reduced = arc.weight.Value1();
  } else if (i_ == 2) {
    reduced = arc.weight.Value2();
  }
  return StdArc (arc.ilabel, arc.olabel, reduced, arc.nextstate);
}
/*
  Add one arc to `arpafst`, creating the endpoint states on demand.

  istate / ostate : names of the source and destination states.  A name
                    missing from `ssyms` gets a fresh state in
                    `arpafst`, and the name -> state id mapping is
                    recorded in `ssyms`.
  isym            : arc symbol.  It is split with tokenize_utf8_string()
                    using `delim`; when that yields exactly two pieces
                    they become the input and output labels (a first
                    piece equal to `null_sep` is replaced by `eps`) and
                    `osym` is ignored.
  osym            : output label used only when `isym` does not split
                    into exactly two pieces.
  weight          : passed through log10_2tropical() before being
                    stored on the arc.
*/
void Arpa2OpenFST::make_arc( string istate, string ostate, string isym, string osym, double weight ){
    //Make sure both endpoints exist, source first
    if( ssyms->Find(istate) == -1 ){
        int src_state = arpafst.AddState();
        ssyms->AddSymbol( istate, src_state );
    }
    if( ssyms->Find(ostate) == -1 ){
        int dst_state = arpafst.AddState();
        ssyms->AddSymbol( ostate, dst_state );
    }

    weight = log10_2tropical(weight);

    //Decide which strings label the arc
    vector<string> io = tokenize_utf8_string( &isym, &delim );
    string in_label  = isym;
    string out_label = osym;
    if( io.size() == 2 ){
        in_label = io[0];
        if( io[0].compare(null_sep) == 0 )
            in_label = eps;
        out_label = io[1];
    }

    arpafst.AddArc(
        ssyms->Find(istate),
        StdArc( isyms->AddSymbol(in_label),
                osyms->AddSymbol(out_label),
                weight,
                ssyms->Find(ostate) )
    );
    return;
}
/*
  Add one acceptor arc (same label on both sides) to `arpafst`,
  creating the endpoint states on demand.

  istate / ostate : names of the source and destination states.  A name
                    missing from `ssyms` gets a fresh state in
                    `arpafst`, and the name -> state id mapping is
                    recorded in `ssyms`.
  isym            : arc symbol; added to `isyms` if necessary and used
                    as both input and output label.
  weight          : passed through log10_2tropical() before being
                    stored on the StdArc.
*/
void ARPA2WFST::_make_arc( string istate, string ostate, string isym, double weight ){
    int is_id = ssyms->Find(istate);
    if( is_id == -1 ){
        is_id = arpafst.AddState();
        ssyms->AddSymbol( istate, is_id );
    }

    //Resolve the destination only after the source has been registered.
    // Looking both up front meant that an unseen name used for BOTH ends
    // (a self-loop) allocated two states and pointed the arc at an
    // orphan state instead of back at the source.
    int os_id = ssyms->Find(ostate);
    if( os_id == -1 ){
        os_id = arpafst.AddState();
        ssyms->AddSymbol( ostate, os_id );
    }

    weight = log10_2tropical(weight);

    int sid = isyms->AddSymbol(isym);
    arpafst.AddArc( is_id, StdArc( sid, sid, weight, os_id ) );
    return;
}