Example #1
0
/// Correction factor associated with the node nodes[0].
///
/// When the alignment in P is variable, this is the square of
/// P.sequence_length_pr() evaluated at the sequence length of nodes[0];
/// otherwise it is 1.
log_double_t correction(const data_partition& P,const vector<int>& nodes)
{
    // Nothing to correct for when the alignment is fixed.
    if (not P.variable_alignment())
        return 1;

    // Length of the sequence at the internal node.
    int internal_length = P.seqlength(nodes[0]);

    return pow(P.sequence_length_pr(internal_length), 2);
}
Example #2
0
/// Build the glued bit-path for the three branches around nodes[0].
///
/// The pairwise alignments on the branches (nodes[1],nodes[0]),
/// (nodes[0],nodes[2]) and (nodes[0],nodes[3]) are each converted to
/// bitmasks and then combined with Glue_A.
vector<HMM::bitmask_t> get_bitpath(const data_partition& P, const vector<int>& nodes)
{
    auto tree = P.t();

    // Note the direction: the first branch runs *into* nodes[0], the other two run out of it.
    int branch1 = tree.find_branch(nodes[1],nodes[0]);
    int branch2 = tree.find_branch(nodes[0],nodes[2]);
    int branch3 = tree.find_branch(nodes[0],nodes[3]);

    // The two integer arguments are presumably the bit positions assigned to
    // the two sequences of each pairwise alignment (3 being shared by all
    // three, i.e. nodes[0]) -- TODO confirm against convert_to_bits.
    vector<HMM::bitmask_t> bits1 = convert_to_bits(P.get_pairwise_alignment(branch1),0,3);
    vector<HMM::bitmask_t> bits2 = convert_to_bits(P.get_pairwise_alignment(branch2),3,1);
    vector<HMM::bitmask_t> bits3 = convert_to_bits(P.get_pairwise_alignment(branch3),3,2);

    // Glue branches 2 and 3 together first, then attach branch 1.
    return Glue_A(bits1, Glue_A(bits2, bits3));
}
Example #3
0
/// Scale factor: the product, over every node index in [T.n_leaves(), T.n_nodes()),
/// of 1 / lengthp(L)^2, where L is the sequence length at that node and
/// lengthp comes from the partition's indel model.
efloat_t prior_HMM_rootless_scale(const data_partition& P)
{
  const Tree& T = *P.T;

#ifndef NDEBUG
  assert(P.has_IModel());
  check_internal_nodes_connected(*P.A,T);
#endif

  efloat_t scale = 1;

  // Nodes numbered from n_leaves() upward (presumably the internal nodes).
  for(int node = T.n_leaves(); node < T.n_nodes(); node++)
  {
    int len = P.seqlength(node);
    efloat_t p_len = P.IModel().lengthp(len);

    // Divide by the squared length probability.
    scale /= (p_len*p_len);
  }

  return scale;
}
Example #4
0
/// Probability of a multiple alignment if the branch alignments are independent:
/// the product of prior_branch() over every branch of the tree.
efloat_t prior_HMM_nogiven(const data_partition& P) 
{
  const alignment& A = *P.A;
  const Tree& T = *P.T;

#ifndef NDEBUG
  assert(P.has_IModel());
  check_internal_nodes_connected(A,T);
#endif

  efloat_t total = 1;

  for(int branch = 0; branch < T.n_branches(); branch++)
  {
    // prior_branch() receives the target node first, then the source node.
    int to   = T.branch(branch).target();
    int from = T.branch(branch).source();

    total *= prior_branch(A, P.branch_HMMs[branch], to, from);
  }

  return total;
}
Example #5
0
/// Sample a new alignment path around nodes[0] with a constrained DP array
/// and rebuild *P.A from the sampled path.
///
/// nodes[0] is the central node; nodes[1], nodes[2] and nodes[3] are the
/// nodes whose branches to nodes[0] are used to build the transition matrix.
/// The columns of the current alignment are ordered with getorder(); the DP
/// array is indexed by the columns in which at least one of nodes[1..3] has
/// a character.
///
/// @param P      partition whose alignment is replaced in place; requires
///               P.variable_alignment() (asserted).
/// @param nodes  {n0, n1, n2, n3}; at least four entries are read.
/// @return       the DP array that was filled in and sampled from.
boost::shared_ptr<DParrayConstrained> sample_node_base(data_partition& P,const vector<int>& nodes)
{
    // Timed section; popped just before the return at the bottom.
    default_timer_stack.push_timer("alignment::DP1/3-way");
    const Tree& T = *P.T;

    assert(P.variable_alignment());

    // Work from a copy: *P.A is overwritten below.
    alignment old = *P.A;

    //  std::cerr<<"old = "<<old<<endl;

    /*------------- Compute sequence properties --------------*/
    int n0 = nodes[0];
    int n1 = nodes[1];
    int n2 = nodes[2];
    int n3 = nodes[3];
    vector<int> columns = getorder(old,n0,n1,n2,n3);

    //  std::cerr<<"n0 = "<<n0<<"   n1 = "<<n1<<"    n2 = "<<n2<<"    n3 = "<<n3<<std::endl;
    //  std::cerr<<"old (reordered) = "<<project(old,n0,n1,n2,n3)<<endl;

    // Find sub-alignments and sequences:
    //   seq1/seq2/seq3 - columns (in 'columns' order) where n1/n2/n3 has a character
    //   seq123         - columns where at least one of n1, n2, n3 has a character
    vector<int> seq1;
    vector<int> seq2;
    vector<int> seq3;
    vector<int> seq123;
    for(int i=0; i<columns.size(); i++) {
        int column = columns[i];
        if (not old.gap(column,n1))
            seq1.push_back(column);
        if (not old.gap(column,n2))
            seq2.push_back(column);
        if (not old.gap(column,n3))
            seq3.push_back(column);
        if (not old.gap(column,n1) or not old.gap(column,n2) or not old.gap(column,n3))
            seq123.push_back(column);
    }

    // Map columns with n1, n2 or n3 to a single index 'c':
    // icol[c]/jcol[c]/kcol[c] = number of characters of n1/n2/n3 in the
    // first c columns of seq123 (so entry 0 is always 0).
    vector<int> icol(seq123.size()+1);
    vector<int> jcol(seq123.size()+1);
    vector<int> kcol(seq123.size()+1);

    icol[0] = 0;
    jcol[0] = 0;
    kcol[0] = 0;
    for(int c=1,i=0,j=0,k=0; c<seq123.size()+1; c++) {
        if (not old.gap(seq123[c-1],n1))
            i++;
        if (not old.gap(seq123[c-1],n2))
            j++;
        if (not old.gap(seq123[c-1],n3))
            k++;
        icol[c] = i;
        jcol[c] = j;
        kcol[c] = k;
    }


    /*-------------- Create alignment matrices ---------------*/

    // Cache which states emit which sequences.
    // Bit 0 is set for every state in which di, dj or dk is true, i.e. the
    // state advances at least one of the three sequences.
    vector<int> state_emit(nstates+1);
    for(int S2=0; S2<state_emit.size(); S2++) {
        state_emit[S2] = 0;

        if (di(S2) or dj(S2) or dk(S2))
            state_emit[S2] |= (1<<0);
    }


    // Branches connecting nodes[0] to each of the other nodes.
    vector<int> branches;
    for(int i=1; i<nodes.size(); i++)
        branches.push_back(T.branch(nodes[0],nodes[i]) );

    const Matrix Q = createQ(P.branch_HMMs,branches);
    vector<double> start_P = get_start_P(P.branch_HMMs,branches);


    // Actually create the Matrices & Chain
    boost::shared_ptr<DParrayConstrained>
    Matrices( new DParrayConstrained(seq123.size(),state_emit,start_P,Q, P.get_beta())
            );

    // Determine which states are allowed to match (c2)
    for(int c2=0; c2<Matrices->size(); c2++) {
        int i2 = icol[c2];
        int j2 = jcol[c2];
        int k2 = kcol[c2];
        Matrices->states(c2).reserve(Matrices->nstates());
        for(int i=0; i<Matrices->nstates(); i++) {
            int S2 = Matrices->order(i);

            //---------- Get (i1,j1,k1) ----------
            // Counts that state S2 would require at the previous index.
            int i1 = i2;
            if (di(S2)) i1--;

            int j1 = j2;
            if (dj(S2)) j1--;

            int k1 = k2;
            if (dk(S2)) k1--;

            //------ Get c1, check if valid ------
            // Allowed if: first index; or the state advances none of the
            // sequences; or it advances exactly the sequences that differ
            // between index c2-1 and c2.  The c2==0 test short-circuits the
            // c2-1 look-ups.
            if (c2==0
                    or (i1 == i2 and j1 == j2 and k1 == k2)
                    or (i1 == icol[c2-1] and j1 == jcol[c2-1] and k1 == kcol[c2-1]) )
                Matrices->states(c2).push_back(S2);
            else
            { } // this state not allowed here
        }
    }


    /*------------------ Compute the DP matrix ---------------------*/
    // Matrices.prune();  prune is broken!
    Matrices->forward();

    //------------- Sample a path from the matrix -------------------//

    vector<int> path_g = Matrices->sample_path();
    vector<int> path = Matrices->ungeneralize(path_g);

    // Replace the alignment and flag each of the three branches around n0 as changed.
    *P.A = construct(old,path,n0,n1,n2,n3,T,seq1,seq2,seq3);
    for(int i=1; i<4; i++) {
        int b = T.branch(nodes[0],nodes[i]);
        P.note_alignment_changed_on_branch(b);
    }

#ifndef NDEBUG
    // Debug check: the path recovered from the new alignment must
    // generalize back to the path that was sampled.
    vector<int> path_new = get_path_3way(project(*P.A,n0,n1,n2,n3),0,1,2,3);
    vector<int> path_new2 = get_path_3way(*P.A,n0,n1,n2,n3);
    assert(path_new == path_new2); // <- current implementation probably guarantees this
    //    but its not a NECESSARY effect of the routine.

    // get the generalized paths - no sequential silent states that can loop
    vector<int> path_new_g = Matrices->generalize(path_new);
    assert(path_new_g == path_g);
    assert(valid(*P.A));


#endif

    default_timer_stack.pop_timer();
    return Matrices;
}
Example #6
0
/// Resample the pairwise alignment along branch b with a two-sequence DP matrix.
///
/// The columns of the current alignment are classified by which end of the
/// branch has a character in them, the emission distributions for both ends
/// are precomputed from a copy of the partition, and the DP matrix is filled
/// subject to the alignment constraints.  The path returned by forward() is
/// used to rebuild *P.A; the likelihood cache and the partition are then told
/// that the alignment on branch b changed.
///
/// @param P  partition whose alignment is resampled in place; requires
///           P.has_IModel() (asserted).
/// @param b  index of the branch to resample.
/// @return   the DP matrix that was used, or a null pointer when one end of
///           the branch has no characters in the current alignment (in which
///           case P is left untouched).
boost::shared_ptr<DPmatrixSimple> sample_alignment_base(data_partition& P,int b) 
{
  assert(P.has_IModel());

  // Constraint status before resampling; compared against the result in debug builds.
  dynamic_bitset<> s1 = constraint_satisfied(P.alignment_constraint, *P.A);

  const Tree& T = *P.T;
  //FIXME - partitions
  data_partition P0 = P;  // We COULD make this conditional... perhaps we should
  //FIXME - partitions
  alignment& old = *P0.A;

  const Matrix frequency = substitution::frequency_matrix(P.SModel());

  int node1 = T.branch(b).target();
  int node2 = T.branch(b).source();

  dynamic_bitset<> group1 = T.partition(node2,node1);

  // Find sub-alignments and sequences:
  //   seq1  - columns where node1 has a character
  //   seq2  - columns where node2 has a character
  //   seq12 - columns where at least one of node1, node2 has a character
  vector<int> seq1;
  vector<int> seq2;
  vector<int> seq12;

  for(int column=0;column<old.length();column++)
  {
    const bool present1 = not old.gap(column,node1);
    const bool present2 = not old.gap(column,node2);

    if (present1)
      seq1.push_back(column);
    if (present2)
      seq2.push_back(column);

    // A column belongs to seq12 when EITHER end of the branch has a
    // character in it.  (The previous test `not gap(node1) or gap(node2)`
    // dropped node2-only columns and kept columns that are gaps at both ends.)
    if (present1 or present2)
      seq12.push_back(column);
  }

  //FIXME - this makes the debug routines crash
  if (not seq1.size() or not seq2.size()) 
    return boost::shared_ptr<DPmatrixSimple>(); //NULL;

  /******** Precompute distributions at node2 from the 2 subtrees **********/
  distributions_t_local distributions = distributions_tree;
  if (not P.smodel_full_tree)
    distributions = distributions_star;

  vector< Matrix > dists1 = distributions(P0,seq1,b,true);
  vector< Matrix > dists2 = distributions(P0,seq2,b,false);

  // Emission bitmasks for the four states:
  //   state 0 sets bits 1 and 0, state 1 sets bit 1, state 2 sets bit 0,
  //   state 3 sets no bits.
  vector<int> state_emit(4,0);
  state_emit[0] |= (1<<1)|(1<<0);
  state_emit[1] |= (1<<1);
  state_emit[2] |= (1<<0);
  state_emit[3] |= 0;

  boost::shared_ptr<DPmatrixSimple> 
    Matrices( new DPmatrixSimple(state_emit, P.branch_HMMs[b].start_pi(),
                                 P.branch_HMMs[b], P.beta[0], 
                                 P.SModel().distribution(), dists1, dists2, frequency)
              );

  //------------------ Compute the DP matrix ---------------------//
  // NOTE(review): path_old is not read below; the call is kept so that any
  // checks performed inside get_path() still run.
  vector<int> path_old = get_path(old,node1,node2);
  vector<vector<int> > pins = get_pins(P.alignment_constraint,old,group1,~group1,seq1,seq2,seq12);

  vector<int> path = Matrices->forward(pins);

  // Drop the last state of the path before constructing the alignment; the
  // debug check below appends state 3 again before comparing.
  path.pop_back();

  *P.A = construct(old,path,node1,node2,T,seq1,seq2);
  P.LC.set_length(P.A->length());
  P.LC.invalidate_branch_alignment(T,b);
  P.note_alignment_changed_on_branch(b);

#ifndef NDEBUG_DP
  assert(valid(*P.A));
  dynamic_bitset<> s2 = constraint_satisfied(P.alignment_constraint, *P.A);
  report_constraints(s1,s2);

  // The path read back from the new alignment must equal the sampled path.
  vector<int> path_new = get_path(*P.A, node1, node2);
  path.push_back(3);
  assert(path_new == path);
#endif

  return Matrices;
}
Example #7
0
 /// Convenience overload: evaluates Pr for partition P using the caller-supplied
 /// cache LC, forwarding P's alignment, MC member, tree and substitution model.
 efloat_t Pr(const data_partition& P,Likelihood_Cache& LC)
 {
   return Pr(*P.A,
             P.MC,
             *P.T,
             LC,
             P.SModel());
 }
Example #8
0
  /// Compute, for every row of the sub-alignment index built over the
  /// branches in b, a (model x state) matrix of likelihoods at the common
  /// target node of those branches (the 'root').
  ///
  /// All branches in b must point at the same node (asserted).  The returned
  /// vector starts with `delta` all-zero padding matrices, followed by one
  /// matrix per index row.  If the root is a leaf and its letter in that row
  /// is a letter class, states whose letter does not match are zeroed.
  vector<Matrix>
  get_column_likelihoods(const data_partition& P, const vector<int>& b,
                         const vector<int>& req,const vector<int>& seq,int delta)
  {
    const alphabet& a = P.get_alphabet();

    const alignment& A = *P.A;
    const Tree& T = *P.T;
    Likelihood_Cache& LC = P.LC;

#ifndef NDEBUG
    subA_index_check_footprint(A,T);
    subA_index_check_regenerate(A,T);
#endif

    //------ Check that all branches point to a 'root' node -----------//
    assert(b.size());
    int root = T.directed_branch(b[0]).target();
    for(int k=1; k<b.size(); k++)
      assert(T.directed_branch(b[k]).target() == root);
    LC.root = root;

    ublas::matrix<int> index = subA_index_any(b,A,T,req,seq);

    IF_DEBUG(int n_br =) calculate_caches(P);
#ifndef NDEBUG
    std::clog<<"get_column_likelihoods: Peeled on "<<n_br<<" branches.\n";
#endif

    vector<Matrix> L;
    L.reserve(A.length()+2);

    // S is reused as the working matrix; each push_back stores a copy of it.
    Matrix& S = LC.scratch(0);
    const int n_models = S.size1();
    const int n_states = S.size2();

    //------ Leading padding: delta all-zero matrices -----------//
    for(int m=0; m<n_models; m++)
      for(int s=0; s<n_states; s++)
        S(m,s) = 0;

    for(int pad=0; pad<delta; pad++)
      L.push_back(S);

    const vector<unsigned>& smap = P.SModel().state_letters();

    for(int row=0; row<index.size1(); row++)
    {
      for(int m=0; m<n_models; m++)
      {
        // Start from 1 so that the per-branch factors can be multiplied in.
        for(int s=0; s<n_states; s++)
          S(m,s) = 1;

        //-------------- Propagate and collect information at 'root' -----------//
        for(int k=0; k<b.size(); k++)
        {
          int cache_row = index(row,k);

          // This branch contributes nothing to this row.
          if (cache_row == alphabet::gap) continue;

          for(int s=0; s<n_states; s++)
            S(m,s) *= LC(cache_row,b[k])(m,s);
        }

        // Only a leaf root has observed data of its own to condition on.
        if (not (root < T.n_leaves())) continue;

        int root_letter = A.seq(root)[row];
        if (not a.is_letter_class(root_letter)) continue;

        for(int s=0; s<n_states; s++)
          if (not a.matches(smap[s],root_letter))
            S(m,s) = 0;
      }

      L.push_back(S);
    }

    return L;
  }
Example #9
0
 /// Convenience overload: forwards P's alignment, MC member, tree, likelihood
 /// cache and substitution model to the five-argument calculate_caches and
 /// returns its result.
 int calculate_caches(const data_partition& P)
 {
   return calculate_caches(*P.A,
                           P.MC,
                           *P.T,
                           P.LC,
                           P.SModel());
 }
Example #10
0
  /// Convenience overload: forwards P's alignment, tree, likelihood cache and
  /// substitution model, together with rb and index, to the six-argument
  /// calc_root_probability and returns its result.
  efloat_t calc_root_probability(const data_partition& P,const vector<int>& rb,
                                 const ublas::matrix<int>& index)
  {
    return calc_root_probability(*P.A,
                                 *P.T,
                                 P.LC,
                                 P.SModel(),
                                 rb,
                                 index);
  }
Example #11
0
/// Sample a new alignment around nodes[0] with a constrained 2-D DP matrix
/// and rebuild P's alignment from the sampled path.
///
/// nodes[0] must be connected to nodes[1], nodes[2] and nodes[3] (asserted).
/// One DP dimension runs over the columns in which nodes[1] has a character
/// (seq1); the other over the columns in which nodes[2] or nodes[3] has a
/// character (seq23).
///
/// @param P      partition whose alignment is replaced in place; requires
///               P.variable_alignment() (asserted).
/// @param nodes  {n0, n1, n2, n3}.
/// @return       the DP matrix.  If Pr_sum_all_paths() <= 0 the function
///               returns early and the alignment is left unchanged.
boost::shared_ptr<DPmatrixConstrained> tri_sample_alignment_base(data_partition& P,const vector<int>& nodes)
{
  const Tree& T = *P.T;
  // Reference, not a copy: assigning to A below updates *P.A.
  alignment& A = *P.A;

  assert(P.variable_alignment());

  assert(T.is_connected(nodes[0],nodes[1]));
  assert(T.is_connected(nodes[0],nodes[2]));
  assert(T.is_connected(nodes[0],nodes[3]));

  const Matrix frequency = substitution::frequency_matrix(P.SModel());

  // std::cerr<<"A = "<<A<<endl;

  //------------- Compute sequence properties --------------//
  dynamic_bitset<> group1 = T.partition(nodes[0],nodes[1]);
  dynamic_bitset<> group2 = T.partition(nodes[0],nodes[2]);
  dynamic_bitset<> group3 = T.partition(nodes[0],nodes[3]);


  //  std::clog<<"n0 = "<<nodes[0]<<"   n1 = "<<nodes[1]<<"    n2 = "<<nodes[2]<<"    n3 = "<<nodes[3]<<std::endl;
  //  std::clog<<"A (reordered) = "<<project(A,nodes[0],nodes[1],nodes[2],nodes[3])<<endl;
  vector<int> columns = getorder(A,nodes[0],nodes[1],nodes[2],nodes[3]);

#ifndef NDEBUG

  // getorder(project(A,...)...) is not the same as getorder(A,...) because columns that are
  // in both project(A,...) and A have different columns numbers in each alignment, and
  // project(A,...) is shorter.

  // However, the NUMBER of columns should be the same. 
  vector<int> columns2 = getorder(project(A,nodes[0],nodes[1],nodes[2],nodes[3]),0,1,2,3);
  assert(columns.size() == columns2.size());
#endif

  // Find sub-alignments and sequences:
  //   seq1/seq2/seq3 - columns (in 'columns' order) where nodes[1]/[2]/[3] has a character
  //   seq23          - columns where nodes[2] or nodes[3] has a character
  vector<int> seq1; seq1.reserve(A.length());
  vector<int> seq2; seq2.reserve(A.length());
  vector<int> seq3; seq3.reserve(A.length());
  vector<int> seq23; seq23.reserve(A.length());
  for(int i=0;i<columns.size();i++) {
    int column = columns[i];
    if (not A.gap(column,nodes[1]))
      seq1.push_back(column);
    if (not A.gap(column,nodes[2]))
      seq2.push_back(column);
    if (not A.gap(column,nodes[3]))
      seq3.push_back(column);

    if (not A.gap(column,nodes[2]) or not A.gap(column,nodes[3]))
      seq23.push_back(column);
  }

  // Map columns with n2 or n3 to single index 'c':
  // jcol[c]/kcol[c] = number of characters of nodes[2]/nodes[3] in the first
  // c columns of seq23 (so entry 0 is always 0).
  vector<int> jcol(seq23.size()+1);
  vector<int> kcol(seq23.size()+1);

  jcol[0] = 0;
  kcol[0] = 0;
  for(int c=1,j=0,k=0;c<seq23.size()+1;c++) {
    if (not A.gap(seq23[c-1],nodes[2]))
      j++;    
    if (not A.gap(seq23[c-1],nodes[3]))
      k++;
    jcol[c] = j;
    kcol[c] = k;
  }

  // Precompute distributions at nodes[0]
  // (distributions_star is used instead when P.smodel_full_tree is false).
  distributions_t distributions = distributions_tree;
  if (not P.smodel_full_tree)
    distributions = distributions_star;

  vector< Matrix > dists1 = distributions(P,seq1,nodes[0],group1);
  vector< Matrix > dists23 = distributions(P,seq23,nodes[0],group2|group3);


  //-------------- Create alignment matrices ---------------//

  // The three branches from nodes[0] to nodes[1], nodes[2], nodes[3].
  vector<int> branches(3);
  for(int i=0;i<3;i++)
    branches[i] = T.branch(nodes[0],nodes[i+1]);

  const Matrix Q = createQ(P.branch_HMMs, branches);
  vector<double> start_P = get_start_P(P.branch_HMMs,branches);

  // Actually create the Matrices & Chain
  boost::shared_ptr<DPmatrixConstrained> 
    Matrices(new DPmatrixConstrained(get_state_emit(), start_P, Q, P.beta[0],
				     P.SModel().distribution(), dists1, dists23, frequency)
	     );

  // Determine which states are allowed to match (,c2)
  // NOTE(review): reserve() is applied to states(c2) while the push_back
  // below targets states(c2+1); this looks like an index mismatch that only
  // affects preallocation -- confirm which index was intended.
  for(int c2=0;c2<dists23.size()-1;c2++) 
  {
    int j2 = jcol[c2];
    int k2 = kcol[c2];
    Matrices->states(c2).reserve(Matrices->nstates());
    for(int i=0;i<Matrices->nstates();i++) {
      int S2 = Matrices->order(i);

      //---------- Get (,j1,k1) ----------
      // Counts that state S2 would require at the previous index.
      int j1 = j2;
      if (dj(S2)) 
	j1--;

      int k1 = k2;
      if (dk(S2)) 
	k1--;
      
      //------ Get c1, check if valid ------
      // Allowed if: first index; or the state advances neither j nor k; or
      // it advances exactly what differs between index c2-1 and c2.  The
      // c2==0 test short-circuits the c2-1 look-ups.
      if (c2==0 or (j1 == j2 and k1 == k2) or (j1 == jcol[c2-1] and k1 == kcol[c2-1]) )
	Matrices->states(c2+1).push_back(S2);
      else
	{ } // this state not allowed here
    }
  }


  //------------------ Compute the DP matrix ---------------------//

  //   Matrices.prune(); prune is broken!
  
  //  vector<int> path_old = get_path_3way(project(A,nodes[0],nodes[1],nodes[2],nodes[3]),0,1,2,3);
  //  vector<int> path_old_g = Matrices.generalize(path_old);

  //  vector<int> path_g = Matrices.forward(P.features,(int)P.constants[0],path_old_g);
  vector<vector<int> > pins = get_pins(P.alignment_constraint,A,group1,group2 | group3,seq1,seq23,columns);

  // if the constraints are currently met but cannot be met
  // (get_pins signals this with a single entry whose first element is -1;
  //  in that case forward_constrained() is not run at all)
  if (pins.size() == 1 and pins[0][0] == -1)
    ; //std::cerr<<"Constraints cannot be expressed in terms of DP matrix paths!"<<std::endl;
  else {
    Matrices->forward_constrained(pins);
    if (Matrices->Pr_sum_all_paths() <= 0.0) 
      std::cerr<<"Constraints give this choice probability 0"<<std::endl;
  }

  // Nothing to sample from: return without touching the alignment.
  if (Matrices->Pr_sum_all_paths() <= 0.0) 
    return Matrices;

  vector<int> path_g = Matrices->sample_path();

  vector<int> path = Matrices->ungeneralize(path_g);

  // Replace the alignment and flag each of the three branches around nodes[0] as changed.
  A = construct(A,path,nodes[0],nodes[1],nodes[2],nodes[3],T,seq1,seq2,seq3);
  for(int i=1;i<4;i++) {
    int b = T.branch(nodes[0],nodes[i]);
    P.note_alignment_changed_on_branch(b);
  }

#ifndef NDEBUG_DP
  //--------------- Check alignment construction ------------------//
  vector<int> path_new = get_path_3way(project(A,nodes),0,1,2,3);

  vector<int> path_new2 = get_path_3way(A,nodes);
  assert(path_new == path_new2); // <- current implementation probably guarantees this
                                 //    but its not a NECESSARY effect of the routine.
                                 //    due to ordering stuff required in the path but
                                 //    not store in the alignment A.
  // The path read back from the new alignment must generalize to the sampled
  // path; otherwise dump the alignment and abort.
  vector<int> path_new_g = Matrices->generalize(path_new);
  if (path_new_g != path_g) {
    std::clog<<"A' (reordered) = "<<project(A,nodes)<<endl;
    std::clog<<"A' = "<<A<<endl;
    std::abort();
  }

  assert(valid(A));
#endif

  //  std::cerr<<"[tri]bandwidth = "<<bandwidth(Matrices,path_g)<<std::endl;

  //  std::cerr<<"[tri]bandwidth2 = "<<bandwidth2(Matrices,path_g)<<std::endl;

  // With DP debugging enabled the alignment is checked; otherwise the
  // matrices are cleared before being returned.
#ifndef NDEBUG_DP
  check_alignment(A,T,"sample_tri_base:out");
#else
  Matrices->clear();
#endif

  // Resize the likelihood cache to the new alignment length and invalidate
  // the branch between nodes[0] and nodes[1].
  P.LC.set_length(A.length());
  int b = T.branch(nodes[0],nodes[1]);
  P.LC.invalidate_branch_alignment(T, b);

  return Matrices;
}