Ejemplo n.º 1
0
boost::shared_ptr<DPmatrixSimple> sample_alignment_base(data_partition& P,int b) 
{
  assert(P.has_IModel());

  dynamic_bitset<> s1 = constraint_satisfied(P.alignment_constraint, *P.A);

  const Tree& T = *P.T;
  //FIXME - partitions
  data_partition P0 = P;  // We COULD make this conditional... perhaps we should
  //FIXME - partitions
  alignment& old = *P0.A;

  const Matrix frequency = substitution::frequency_matrix(P.SModel());

  int node1 = T.branch(b).target();
  int node2 = T.branch(b).source();

  dynamic_bitset<> group1 = T.partition(node2,node1);

  // Find sub-alignments and sequences
  vector<int> seq1;
  vector<int> seq2;
  vector<int> seq12;

  for(int column=0;column<old.length();column++)
  {
    if (not old.gap(column,node1))
      seq1.push_back(column);
    if (not old.gap(column,node2))
      seq2.push_back(column);

    if (not old.gap(column,node1) or old.gap(column,node2))
      seq12.push_back(column);
  }

  //FIXME - this makes the debug routines crash
  if (not seq1.size() or not seq2.size()) 
    return boost::shared_ptr<DPmatrixSimple>(); //NULL;

  /******** Precompute distributions at node2 from the 2 subtrees **********/
  distributions_t_local distributions = distributions_tree;
  if (not P.smodel_full_tree)
    distributions = distributions_star;

  vector< Matrix > dists1 = distributions(P0,seq1,b,true);
  vector< Matrix > dists2 = distributions(P0,seq2,b,false);

  vector<int> state_emit(4,0);
  state_emit[0] |= (1<<1)|(1<<0);
  state_emit[1] |= (1<<1);
  state_emit[2] |= (1<<0);
  state_emit[3] |= 0;

  boost::shared_ptr<DPmatrixSimple> 
    Matrices( new DPmatrixSimple(state_emit, P.branch_HMMs[b].start_pi(),
				 P.branch_HMMs[b], P.beta[0], 
				 P.SModel().distribution(), dists1, dists2, frequency)
	      );

  //------------------ Compute the DP matrix ---------------------//
  vector<int> path_old = get_path(old,node1,node2);
  vector<vector<int> > pins = get_pins(P.alignment_constraint,old,group1,~group1,seq1,seq2,seq12);

  vector<int> path = Matrices->forward(pins);

  path.erase(path.begin()+path.size()-1);

  *P.A = construct(old,path,node1,node2,T,seq1,seq2);
  P.LC.set_length(P.A->length());
  P.LC.invalidate_branch_alignment(T,b);
  P.note_alignment_changed_on_branch(b);

#ifndef NDEBUG_DP
  assert(valid(*P.A));
  dynamic_bitset<> s2 = constraint_satisfied(P.alignment_constraint, *P.A);
  report_constraints(s1,s2);

  vector<int> path_new = get_path(*P.A, node1, node2);
  path.push_back(3);
  assert(path_new == path);
#endif

  return Matrices;
}
Ejemplo n.º 2
0
 efloat_t Pr(const data_partition& P,Likelihood_Cache& LC) {
   return Pr(*P.A, P.MC, *P.T, LC, P.SModel());
 }
Ejemplo n.º 3
0
  /// Find the probabilities of each letter at the root, given the data at the nodes in 'group'
  vector<Matrix>
  get_column_likelihoods(const data_partition& P, const vector<int>& b,
			 const vector<int>& req,const vector<int>& seq,int delta)
  {
    const alphabet& a = P.get_alphabet();

    const alignment& A = *P.A;
    const Tree& T = *P.T;
    Likelihood_Cache& LC = P.LC;

#ifndef NDEBUG
    subA_index_check_footprint(A,T);
    subA_index_check_regenerate(A,T);
#endif

    //------ Check that all branches point to a 'root' node -----------//
    assert(b.size());
    int root = T.directed_branch(b[0]).target();
    for(int i=1;i<b.size();i++)
      assert(T.directed_branch(b[i]).target() == root);
    LC.root = root;

    ublas::matrix<int> index = subA_index_any(b,A,T,req,seq);

    IF_DEBUG(int n_br =) calculate_caches(P);
#ifndef NDEBUG
    std::clog<<"get_column_likelihoods: Peeled on "<<n_br<<" branches.\n";
#endif

    vector<Matrix> L;
    L.reserve(A.length()+2);

    Matrix& S = LC.scratch(0);
    const int n_models = S.size1();
    const int n_states = S.size2();

    //Add the padding matrices
    {
      for(int i=0;i<S.size1();i++)
	for(int j=0;j<S.size2();j++)
	  S(i,j) = 0;

      for(int i=0;i<delta;i++)
	L.push_back(S);
    }

    const vector<unsigned>& smap = P.SModel().state_letters();

    for(int i=0;i<index.size1();i++) {

      for(int m=0;m<n_models;m++) {
	for(int s=0;s<n_states;s++) 
	  S(m,s) = 1;

	//-------------- Propagate and collect information at 'root' -----------//
	for(int j=0;j<b.size();j++) {
	  int i0 = index(i,j);
	  if (i0 != alphabet::gap)
	    for(int s=0;s<n_states;s++) 
	      S(m,s) *= LC(i0,b[j])(m,s);
	}

	if (root < T.n_leaves()) {
	  int rl = A.seq(root)[i];
	  if (a.is_letter_class(rl))
	    for(int s=0;s<n_states;s++)
	      if (not a.matches(smap[s],rl))
		S(m,s) = 0;
	}
      }
      L.push_back(S);
    }
    return L;
  }
Ejemplo n.º 4
0
 int calculate_caches(const data_partition& P) {
   return calculate_caches(*P.A, P.MC, *P.T, P.LC, P.SModel());
 }
Ejemplo n.º 5
0
  efloat_t calc_root_probability(const data_partition& P,const vector<int>& rb,
			       const ublas::matrix<int>& index) 
  {
    return calc_root_probability(*P.A, *P.T, P.LC, P.SModel(), rb, index);
  }
Ejemplo n.º 6
0
boost::shared_ptr<DPmatrixConstrained> tri_sample_alignment_base(data_partition& P,const vector<int>& nodes)
{
  const Tree& T = *P.T;
  alignment& A = *P.A;

  assert(P.variable_alignment());

  assert(T.is_connected(nodes[0],nodes[1]));
  assert(T.is_connected(nodes[0],nodes[2]));
  assert(T.is_connected(nodes[0],nodes[3]));

  const Matrix frequency = substitution::frequency_matrix(P.SModel());

  // std::cerr<<"A = "<<A<<endl;

  //------------- Compute sequence properties --------------//
  dynamic_bitset<> group1 = T.partition(nodes[0],nodes[1]);
  dynamic_bitset<> group2 = T.partition(nodes[0],nodes[2]);
  dynamic_bitset<> group3 = T.partition(nodes[0],nodes[3]);


  //  std::clog<<"n0 = "<<nodes[0]<<"   n1 = "<<nodes[1]<<"    n2 = "<<nodes[2]<<"    n3 = "<<nodes[3]<<std::endl;
  //  std::clog<<"A (reordered) = "<<project(A,nodes[0],nodes[1],nodes[2],nodes[3])<<endl;
  vector<int> columns = getorder(A,nodes[0],nodes[1],nodes[2],nodes[3]);

#ifndef NDEBUG

  // getorder(project(A,...)...) is not the same as getorder(A,...) because columns that are
  // in both project(A,...) and A have different columns numbers in each alignment, and
  // project(A,...) is shorter.

  // However, the NUMBER of columns should be the same. 
  vector<int> columns2 = getorder(project(A,nodes[0],nodes[1],nodes[2],nodes[3]),0,1,2,3);
  assert(columns.size() == columns2.size());
#endif

  // Find sub-alignments and sequences
  vector<int> seq1; seq1.reserve(A.length());
  vector<int> seq2; seq2.reserve(A.length());
  vector<int> seq3; seq3.reserve(A.length());
  vector<int> seq23; seq23.reserve(A.length());
  for(int i=0;i<columns.size();i++) {
    int column = columns[i];
    if (not A.gap(column,nodes[1]))
      seq1.push_back(column);
    if (not A.gap(column,nodes[2]))
      seq2.push_back(column);
    if (not A.gap(column,nodes[3]))
      seq3.push_back(column);

    if (not A.gap(column,nodes[2]) or not A.gap(column,nodes[3]))
      seq23.push_back(column);
  }

  // Map columns with n2 or n3 to single index 'c'
  vector<int> jcol(seq23.size()+1);
  vector<int> kcol(seq23.size()+1);

  jcol[0] = 0;
  kcol[0] = 0;
  for(int c=1,j=0,k=0;c<seq23.size()+1;c++) {
    if (not A.gap(seq23[c-1],nodes[2]))
      j++;    
    if (not A.gap(seq23[c-1],nodes[3]))
      k++;
    jcol[c] = j;
    kcol[c] = k;
  }

  // Precompute distributions at nodes[0]
  distributions_t distributions = distributions_tree;
  if (not P.smodel_full_tree)
    distributions = distributions_star;

  vector< Matrix > dists1 = distributions(P,seq1,nodes[0],group1);
  vector< Matrix > dists23 = distributions(P,seq23,nodes[0],group2|group3);


  //-------------- Create alignment matrices ---------------//

  vector<int> branches(3);
  for(int i=0;i<3;i++)
    branches[i] = T.branch(nodes[0],nodes[i+1]);

  const Matrix Q = createQ(P.branch_HMMs, branches);
  vector<double> start_P = get_start_P(P.branch_HMMs,branches);

  // Actually create the Matrices & Chain
  boost::shared_ptr<DPmatrixConstrained> 
    Matrices(new DPmatrixConstrained(get_state_emit(), start_P, Q, P.beta[0],
				     P.SModel().distribution(), dists1, dists23, frequency)
	     );

  // Determine which states are allowed to match (,c2)
  for(int c2=0;c2<dists23.size()-1;c2++) 
  {
    int j2 = jcol[c2];
    int k2 = kcol[c2];
    Matrices->states(c2).reserve(Matrices->nstates());
    for(int i=0;i<Matrices->nstates();i++) {
      int S2 = Matrices->order(i);

      //---------- Get (,j1,k1) ----------
      int j1 = j2;
      if (dj(S2)) 
	j1--;

      int k1 = k2;
      if (dk(S2)) 
	k1--;
      
      //------ Get c1, check if valid ------
      if (c2==0 or (j1 == j2 and k1 == k2) or (j1 == jcol[c2-1] and k1 == kcol[c2-1]) )
	Matrices->states(c2+1).push_back(S2);
      else
	{ } // this state not allowed here
    }
  }


  //------------------ Compute the DP matrix ---------------------//

  //   Matrices.prune(); prune is broken!
  
  //  vector<int> path_old = get_path_3way(project(A,nodes[0],nodes[1],nodes[2],nodes[3]),0,1,2,3);
  //  vector<int> path_old_g = Matrices.generalize(path_old);

  //  vector<int> path_g = Matrices.forward(P.features,(int)P.constants[0],path_old_g);
  vector<vector<int> > pins = get_pins(P.alignment_constraint,A,group1,group2 | group3,seq1,seq23,columns);

  // if the constraints are currently met but cannot be met
  if (pins.size() == 1 and pins[0][0] == -1)
    ; //std::cerr<<"Constraints cannot be expressed in terms of DP matrix paths!"<<std::endl;
  else {
    Matrices->forward_constrained(pins);
    if (Matrices->Pr_sum_all_paths() <= 0.0) 
      std::cerr<<"Constraints give this choice probability 0"<<std::endl;
  }

  if (Matrices->Pr_sum_all_paths() <= 0.0) 
    return Matrices;

  vector<int> path_g = Matrices->sample_path();

  vector<int> path = Matrices->ungeneralize(path_g);

  A = construct(A,path,nodes[0],nodes[1],nodes[2],nodes[3],T,seq1,seq2,seq3);
  for(int i=1;i<4;i++) {
    int b = T.branch(nodes[0],nodes[i]);
    P.note_alignment_changed_on_branch(b);
  }

#ifndef NDEBUG_DP
  //--------------- Check alignment construction ------------------//
  vector<int> path_new = get_path_3way(project(A,nodes),0,1,2,3);

  vector<int> path_new2 = get_path_3way(A,nodes);
  assert(path_new == path_new2); // <- current implementation probably guarantees this
                                 //    but its not a NECESSARY effect of the routine.
                                 //    due to ordering stuff required in the path but
                                 //    not store in the alignment A.
  vector<int> path_new_g = Matrices->generalize(path_new);
  if (path_new_g != path_g) {
    std::clog<<"A' (reordered) = "<<project(A,nodes)<<endl;
    std::clog<<"A' = "<<A<<endl;
    std::abort();
  }

  assert(valid(A));
#endif

  //  std::cerr<<"[tri]bandwidth = "<<bandwidth(Matrices,path_g)<<std::endl;

  //  std::cerr<<"[tri]bandwidth2 = "<<bandwidth2(Matrices,path_g)<<std::endl;

#ifndef NDEBUG_DP
  check_alignment(A,T,"sample_tri_base:out");
#else
  Matrices->clear();
#endif

  P.LC.set_length(A.length());
  int b = T.branch(nodes[0],nodes[1]);
  P.LC.invalidate_branch_alignment(T, b);

  return Matrices;
}