示例#1
0
/*
  Build a WFSA representing every alignment between seq1 and seq2 that is
  permitted by the aligner's settings, and fold its arcs into the initial
  alignment model.

  Input and output labels are encoded as a single joint symbol (the same id is
  used for ilabel and olabel).  This simplifies training but means that only a
  joint maximization can be computed easily.

  Lattice layout:
    - one state per grid point (i,j), 0<=i<=|seq1|, 0<=j<=|seq2|,
      with state id i*(|seq2|+1)+j;
    - state 0 is the start state, the last grid point is the single final state.

  Arcs leaving (i,j):
    - if seq1_del: skip + s1s2_sep + <l tokens of seq2>, 1<=l<=seq2_max, weight 99;
    - if seq2_del: <k tokens of seq1> + s1s2_sep + skip, 1<=k<=seq1_max, weight 99;
    - <k tokens of seq1> + s1s2_sep + <l tokens of seq2>, weight
      LogWeight::One().Value()*(k+l); when `restrict` is set, arcs with both
      k>1 and l>1 (true M-N links) are not created.

  After the lattice is built (and trimmed with Connect() unless both deletion
  types are enabled) every arc that survived is accumulated into
  prev_alignment_model and `total`.  The first time a label is seen,
  _compute_penalties() is called for it.

  Author's note: this doubles as the initialization routine; the intent is a
  UNIFORM initialization in the style of m2m-aligner.

  TODO: Add an FST version and support for conditional maximization.  May be
   useful for languages like Japanese where there is a distinct imbalance in
   the seq1->seq2 length correspondences.

  fst  : receives the lattice; states are appended to whatever it already holds.
  seq1 : tokens of the first sequence (not modified).
  seq2 : tokens of the second sequence (not modified).
*/
void M2MFstAligner::Sequences2FST( VectorFst<LogArc>* fst, vector<string>* seq1, vector<string>* seq2 ){
  int istate=0; int ostate=0;
  for( unsigned int i=0; i<=seq1->size(); i++ ){
    for( unsigned int j=0; j<=seq2->size(); j++ ){
      fst->AddState();
      istate = i*(seq2->size()+1)+j;

      //Epsilon arcs for seq1: consume l tokens of seq2 only
      if( seq1_del==true )
        for( unsigned int l=1; l<=seq2_max; l++ ){
          if( j+l<=seq2->size() ){
            vector<string> subseq2( seq2->begin()+j, seq2->begin()+j+l );
            int is = isyms->AddSymbol(skip+s1s2_sep+vec2str(subseq2, seq2_sep));
            ostate = i*(seq2->size()+1) + (j+l);
            LogArc arc( is, is, 99, ostate );
            fst->AddArc( istate, arc );
          }
        }

      //Epsilon arcs for seq2: consume k tokens of seq1 only
      if( seq2_del==true )
        for( unsigned int k=1; k<=seq1_max; k++ ){
          if( i+k<=seq1->size() ){
            vector<string> subseq1( seq1->begin()+i, seq1->begin()+i+k );
            int is = isyms->AddSymbol(vec2str(subseq1, seq1_sep)+s1s2_sep+skip);
            ostate = (i+k)*(seq2->size()+1) + j;
            LogArc arc( is, is, 99, ostate );
            fst->AddArc( istate, arc );
          }
        }

      //All the other arcs: consume k tokens of seq1 and l tokens of seq2
      for( unsigned int k=1; k<=seq1_max; k++ ){
        for( unsigned int l=1; l<=seq2_max; l++ ){
          //This says only 1-M and N-1 allowed, no M-N links!
          // Checked first so that no substrings are built for a rejected pair.
          if( restrict==true && l>1 && k>1 )
            continue;
          if( i+k<=seq1->size() && j+l<=seq2->size() ){
            vector<string> subseq1( seq1->begin()+i, seq1->begin()+i+k );
            string s1 = vec2str(subseq1, seq1_sep);
            vector<string> subseq2( seq2->begin()+j, seq2->begin()+j+l );
            string s2 = vec2str(subseq2, seq2_sep);
            int is = isyms->AddSymbol(s1+s1s2_sep+s2);
            ostate = (i+k)*(seq2->size()+1) + (j+l);
            LogArc arc( is, is, LogWeight::One().Value()*(k+l), ostate );
            fst->AddArc( istate, arc );
            //Nothing is tallied here; the model is updated below from the
            // arcs that survive Connect().
          }
        }
      }

    }
  }

  fst->SetStart(0);
  fst->SetFinal( ((seq1->size()+1)*(seq2->size()+1))-1, LogWeight::One() );
  //Unless seq1_del==true && seq2_del==true we will have unconnected states
  // thus we need to run connect to clean out these states
  if( seq1_del==false || seq2_del==false )
    Connect(fst);

  //Only add arcs that are in the FINAL fst to the model
  for( StateIterator<VectorFst<LogArc> > siter(*fst); !siter.Done(); siter.Next() ){
    LogArc::StateId q = siter.Value();
    for( ArcIterator<VectorFst<LogArc> > aiter(*fst, q); !aiter.Done(); aiter.Next() ){
      const LogArc& arc = aiter.Value();
      if( prev_alignment_model.find(arc.ilabel)==prev_alignment_model.end() ){
        prev_alignment_model.insert( pair<LogArc::Label,LogWeight>(arc.ilabel, arc.weight) );
        //Recover the shape of the alignment unit from its symbol text.
        // NOTE(review): the markers are hard-coded here although the symbols
        // above are built from s1s2_sep / skip / seq1_sep / seq2_sep; this
        // only decodes correctly when those are "}", "_" and "|" - confirm.
        // NOTE(review): k and l are at most 2 here regardless of the real
        // chunk length, and only one of them is ever incremented - confirm
        // that this is what _compute_penalties expects.
        string sym = isyms->Find(arc.ilabel);
        size_t del = sym.find("}");
        size_t ski = sym.find("_");
        size_t chu = sym.find("|");
        int k=1; int l=1;
        bool xd = false; bool yd = false;
        if( chu!=string::npos ){
          //A chunk separator left of "}" means the seq1 side is multi-token
          if( chu<del )
            k += 1;
          else
            l += 1;
        }
        if( ski!=string::npos ){
          //A skip marker left of "}" means the seq1 side is the deleted one
          if( ski<del )
            xd = true;
          else
            yd = true;
        }
        //Pass the decoded deletion flags.  They used to be computed and then
        // discarded in favour of a hard-coded false/false, which registered
        // every deletion arc as an ordinary one.
        _compute_penalties( arc.ilabel, k, l, xd, yd );
      }else{
        prev_alignment_model[arc.ilabel] = Plus(prev_alignment_model[arc.ilabel],arc.weight);
      }
      total = Plus( total, arc.weight );
    }
  }

  return;
}
示例#2
0
/*
  Build a WFSA representing every alignment between seq1 and seq2 that is
  permitted by the aligner's settings, weighting each arc with the value
  stored in alignment_model for its symbol.  Unlike Sequences2FST this does
  not add symbols to the table and does not touch prev_alignment_model/total.

  Input and output labels are encoded as a single joint symbol (the same id is
  used for ilabel and olabel), so only a joint maximization is easy to compute.

  Lattice layout:
    - one state per grid point (i,j), 0<=i<=|seq1|, 0<=j<=|seq2|,
      with state id i*(|seq2|+1)+j;
    - state 0 is the start state, the last grid point is the single final state.

  Symbols are looked up with isyms->Find().  An alignment unit whose symbol is
  not in the table gets NO arc: Find() reports a missing symbol with a
  negative id (OpenFst's kNoSymbol, -1), which must not be used as an arc
  label, as an alignment_model key, or as a _compute_penalties label.  As a
  consequence the resulting lattice may have no complete path when the
  sequences contain units the model has never seen.

  _compute_penalties() is called for every arc that is created.

  fst  : receives the lattice; states are appended to whatever it already holds.
  seq1 : tokens of the first sequence (not modified).
  seq2 : tokens of the second sequence (not modified).
*/
void M2MFstAligner::Sequences2FSTNoInit( VectorFst<LogArc>* fst, vector<string>* seq1, vector<string>* seq2 ){
  int istate=0; int ostate=0;
  for( unsigned int i=0; i<=seq1->size(); i++ ){
    for( unsigned int j=0; j<=seq2->size(); j++ ){
      fst->AddState();
      istate = i*(seq2->size()+1)+j;

      //Epsilon arcs for seq1: consume l tokens of seq2 only
      if( seq1_del==true )
        for( unsigned int l=1; l<=seq2_max; l++ ){
          if( j+l<=seq2->size() ){
            vector<string> subseq2( seq2->begin()+j, seq2->begin()+j+l );
            int is = isyms->Find(skip+s1s2_sep+vec2str(subseq2, seq2_sep));
            //Unknown alignment unit: no arc (see header comment)
            if( is<0 )
              continue;
            ostate = i*(seq2->size()+1) + (j+l);
            LogArc arc( is, is, alignment_model[is], ostate );
            _compute_penalties( arc.ilabel, 1, l, true, false );
            fst->AddArc( istate, arc );
          }
        }

      //Epsilon arcs for seq2: consume k tokens of seq1 only
      if( seq2_del==true )
        for( unsigned int k=1; k<=seq1_max; k++ ){
          if( i+k<=seq1->size() ){
            vector<string> subseq1( seq1->begin()+i, seq1->begin()+i+k );
            int is = isyms->Find(vec2str(subseq1, seq1_sep)+s1s2_sep+skip);
            //Unknown alignment unit: no arc (see header comment)
            if( is<0 )
              continue;
            ostate = (i+k)*(seq2->size()+1) + j;
            LogArc arc( is, is, alignment_model[is], ostate );
            _compute_penalties( arc.ilabel, k, 1, false, true );
            fst->AddArc( istate, arc );
          }
        }

      //All the other arcs: consume k tokens of seq1 and l tokens of seq2
      for( unsigned int k=1; k<=seq1_max; k++ ){
        for( unsigned int l=1; l<=seq2_max; l++ ){
          if( i+k<=seq1->size() && j+l<=seq2->size() ){
            vector<string> subseq1( seq1->begin()+i, seq1->begin()+i+k );
            string s1 = vec2str(subseq1, seq1_sep);
            vector<string> subseq2( seq2->begin()+j, seq2->begin()+j+l );
            string s2 = vec2str(subseq2, seq2_sep);
            //With `restrict` set only 1-M and N-1 links are allowed
            if( restrict==true && l>1 && k>1)
              continue;
            int is = isyms->Find(s1+s1s2_sep+s2);
            //Unknown alignment unit: no arc (see header comment)
            if( is<0 )
              continue;
            ostate = (i+k)*(seq2->size()+1) + (j+l);
            LogArc arc( is, is, alignment_model[is], ostate );
            _compute_penalties( arc.ilabel, k, l, false, false );
            fst->AddArc( istate, arc );
          }
        }
      }

    }
  }

  fst->SetStart(0);
  fst->SetFinal( ((seq1->size()+1)*(seq2->size()+1))-1, LogWeight::One() );
  //Unless seq1_del==true && seq2_del==true we will have unconnected states
  // thus we need to run connect to clean out these states
  if( seq1_del==false || seq2_del==false )
    Connect(fst);
  return;
}
示例#3
0
/*
   Build a WFSA representing every alignment between seq1 and seq2 that is
   permitted by the aligner's settings, and fold its arcs into the initial
   alignment model.

   Input and output labels are encoded as a single joint symbol (the same id
   is used for ilabel and olabel).  This simplifies training but means that
   only a joint maximization can be computed easily.

   Lattice layout:
     - one state per grid point (i,j), 0<=i<=|seq1|, 0<=j<=|seq2|,
       with state id i*(|seq2|+1)+j;
     - state 0 is the start state, the last grid point is the single final
       state.

   Arcs leaving (i,j):
     - if seq1_del: skip + s1s2_sep + <l tokens of seq2>, 1<=l<=seq2_max,
       weight 99;
     - if seq2_del: <k tokens of seq1> + s1s2_sep + skip, 1<=k<=seq1_max,
       weight 99;
     - <k tokens of seq1> + s1s2_sep + <l tokens of seq2>, weight
       LogWeight::One().Value()*(k+l); arcs with both k>1 and l>1 (true M-N
       links) are never created.

   The model (prev_alignment_model, total) is updated from the arcs of the
   FINAL lattice, i.e. after Connect() has removed states that cannot lie on
   a start-to-final path.  Tallying while the arcs are being created would
   also count arcs that Connect() subsequently discards.

   Author's note: this doubles as the initialization routine; the intent is a
   UNIFORM initialization in the style of m2m-aligner.

   TODO: Add an FST version and support for conditional maximization.  May be
   useful for languages like Japanese where there is a distinct imbalance in
   the seq1->seq2 length correspondences.

   fst  : receives the lattice; states are appended to whatever it holds.
   seq1 : tokens of the first sequence (not modified).
   seq2 : tokens of the second sequence (not modified).
 */
void
M2MFstAligner::Sequences2FST(VectorFst<LogArc> *fst,
                             vector<string> *seq1,
                             vector<string> *seq2)
{
    // Neither sequence is modified below, so the grid dimensions are fixed.
    const size_t len1 = seq1->size();
    const size_t len2 = seq2->size();
    const size_t stride = len2 + 1;     // states per seq1 position

    for (size_t i = 0; i <= len1; i++) {
        for (size_t j = 0; j <= len2; j++) {
            fst->AddState();
            const int istate = static_cast<int>(i * stride + j);

            //Epsilon arcs for seq1: consume l tokens of seq2 only
            if (seq1_del == true)
                for (int l = 1; l <= seq2_max; l++) {
                    if (j + l <= len2) {
                        vector<string> subseq2(seq2->begin() + j,
                                               seq2->begin() + j + l);
                        int is =
                            isyms->AddSymbol(skip + s1s2_sep +
                                             vec2str(subseq2, seq2_sep));
                        const int ostate =
                            static_cast<int>(i * stride + (j + l));
                        LogArc arc(is, is, 99, ostate);
                        fst->AddArc(istate, arc);
                    }
                }

            //Epsilon arcs for seq2: consume k tokens of seq1 only
            if (seq2_del == true)
                for (int k = 1; k <= seq1_max; k++) {
                    if (i + k <= len1) {
                        vector<string> subseq1(seq1->begin() + i,
                                               seq1->begin() + i + k);
                        int is =
                            isyms->AddSymbol(vec2str(subseq1, seq1_sep) +
                                             s1s2_sep + skip);
                        const int ostate =
                            static_cast<int>((i + k) * stride + j);
                        LogArc arc(is, is, 99, ostate);
                        fst->AddArc(istate, arc);
                    }
                }

            //All the other arcs: consume k tokens of seq1 and l of seq2
            for (int k = 1; k <= seq1_max; k++) {
                for (int l = 1; l <= seq2_max; l++) {
                    //Only 1-M and N-1 links are allowed, no M-N links.
                    // Checked first so no substrings are built for a
                    // rejected pair.
                    if (l > 1 && k > 1)
                        continue;
                    if (i + k <= len1 && j + l <= len2) {
                        vector<string> subseq1(seq1->begin() + i,
                                               seq1->begin() + i + k);
                        string s1 = vec2str(subseq1, seq1_sep);
                        vector<string> subseq2(seq2->begin() + j,
                                               seq2->begin() + j + l);
                        string s2 = vec2str(subseq2, seq2_sep);
                        int is = isyms->AddSymbol(s1 + s1s2_sep + s2);
                        const int ostate =
                            static_cast<int>((i + k) * stride + (j + l));
                        LogArc arc(is, is,
                                   LogWeight::One().Value() * (k + l),
                                   ostate);
                        fst->AddArc(istate, arc);
                    }
                }
            }

        }
    }

    fst->SetStart(0);
    fst->SetFinal(((len1 + 1) * (len2 + 1)) - 1, LogWeight::One());
    //Unless seq1_del==true && seq2_del==true we will have unconnected states
    // thus we need to run connect to clean out these states
    if (seq1_del == false or seq2_del == false)
        Connect(fst);

    //Only add arcs that are in the FINAL fst to the model
    for (StateIterator<VectorFst<LogArc> > siter(*fst); !siter.Done();
         siter.Next()) {
        for (ArcIterator<VectorFst<LogArc> > aiter(*fst, siter.Value());
             !aiter.Done(); aiter.Next()) {
            const LogArc &arc = aiter.Value();
            if (prev_alignment_model.find(arc.ilabel) ==
                    prev_alignment_model.end())
                prev_alignment_model.insert(pair<LogArc::Label, LogWeight>
                                            (arc.ilabel, arc.weight));
            else
                prev_alignment_model[arc.ilabel] =
                    Plus(prev_alignment_model[arc.ilabel], arc.weight);
            total = Plus(total, arc.weight);
        }
    }
    return;
}