Esempio n. 1
0
/// \brief  Remap the leaf indices of tree \a T to match the alignment \a A: check the result
///
/// \param A The alignment.
/// \param T The tree.
/// \param internal_sequences Should the resulting alignment have sequences for internal nodes on the tree?
///
void link(alignment& A,SequenceTree& T,bool internal_sequences) 
{
  check_names_unique(A);

  // Later, might we WANT sub-branches???
  if (has_sub_branches(T))
    remove_sub_branches(T);

  if (internal_sequences and not is_Cayley(T) and T.n_leaves() > 1) {
    assert(has_polytomy(T));
    throw myexception()<<"Cannot link a multifurcating tree to an alignment with internal sequences.";
  }

  //------ IF sequences < leaf nodes THEN complain ---------//
  if (A.n_sequences() < T.n_leaves())
    throw myexception()<<"Tree has "<<T.n_leaves()<<" leaves but Alignment only has "
		       <<A.n_sequences()<<" sequences.";

  //----- IF sequences = leaf nodes THEN maybe add internal sequences.
  else if (A.n_sequences() == T.n_leaves()) 
  {
    A = remap_A_indices(A,T);

    if (internal_sequences)
    {
      add_internal_labels(T);
      A = add_internal(A,T);
      connect_leaf_characters(A,T);
    }
  }
  //----- IF sequences > leaf nodes THEN maybe complain -------//
  else if (A.n_sequences() > T.n_nodes())
    throw myexception()<<"More alignment sequences ("<<A.n_sequences()<<") than tree nodes ("<<T.n_nodes()<<")!";
  else if (A.n_sequences() < T.n_nodes())
    throw myexception()<<"Fewer alignment sequences ("<<A.n_sequences()<<") than tree nodes ("<<T.n_nodes()<<")!";
  else
  {
    A = remap_A_indices(A,T);
  
    if (not internal_sequences) 
      A = chop_internal(A);
  }
  
  //---------- double-check that we have the right number of sequences ---------//
  if (internal_sequences)
    assert(A.n_sequences() == T.n_nodes());
  else
    assert(A.n_sequences() == T.n_leaves());

  //----- Check that each alignment sequence maps to a corresponding name in the tree -----//
  for(int i=0;i<A.n_sequences();i++)
    assert(T.get_label(i) == A.seq(i).name);

  //---- Check to see that internal nodes satisfy constraints ----//
  check_alignment(A,T,internal_sequences);
}
Esempio n. 2
0
vector<vector<int> > get_all_parsimony_letters(const alphabet& a, const vector<int>& letters, const SequenceTree& T,
					       const ublas::matrix<int>& cost)
{
  int root = T.directed_branch(0).target();

  ublas::matrix<int> n_muts(T.n_nodes(), a.size());
  peel_n_mutations(a,letters,T,cost,n_muts, branches_toward_node(T,root) );

  // get an order list of branches point away from the root;
  vector<const_branchview> branches = branches_from_node(T,root);
  std::reverse(branches.begin(),branches.end());
  
  // Allocate space to store the letters for each node
  vector<vector<int> > node_letters(T.n_nodes());

  const unsigned A = a.size();

  // choose the cheapest letters at the root
  {
    double m = row_min(n_muts,root);
    for(int l=0;l<A;l++)
      if (n_muts(root,l) <= m)
	node_letters[root].push_back(l);
  }

  vector<double> temp(A);

  for(int i=0;i<branches.size();i++) 
  {
    int s = branches[i].source();
    int t = branches[i].target();

    vector<double> best(node_letters[s].size());

    for(int j=0;j<node_letters[s].size();j++) 
    {
      for(int l=0;l<A;l++)
	temp[l] = n_muts(t,l)+cost(l,node_letters[s][j]);
      best[j] = min(temp);
    }
    
    for(int l=0;l<A;l++) 
    {
      bool is_best = false;
      for(int j=0;j<node_letters[s].size() and not is_best;j++) 
	if (n_muts(t,l)+cost(l,node_letters[s][j]) <= best[j])
	  is_best=true;
      if (is_best)
	node_letters[t].push_back(l);
    }

  }

  return node_letters;
}
Esempio n. 3
0
/// \brief  Remap the leaf indices of tree \a T to match the alignment \a A: check the result
///
/// \param A The alignment.
/// \param T The tree.
/// \param internal_sequences Should the resulting alignment have sequences for internal nodes on the tree?
///
void link(alignment& A,SequenceTree& T,bool internal_sequences) 
{
  check_names_unique(A);

  // Later, might we WANT sub-branches???
  if (has_sub_branches(T))
    remove_sub_branches(T);

  if (internal_sequences and not is_Cayley(T)) {
    assert(has_polytomy(T));
    throw myexception()<<"Cannot link a multifurcating tree to an alignment with internal sequences.";
  }

  //------ IF sequences < leaf nodes THEN complain ---------//
  if (A.n_sequences() < T.n_leaves())
    throw myexception()<<"Tree has "<<T.n_leaves()<<" leaves but Alignment only has "
		       <<A.n_sequences()<<" sequences.";

  //----- IF sequences = leaf nodes THEN maybe add internal sequences.
  else if (A.n_sequences() == T.n_leaves()) {
    if (internal_sequences)
      A = add_internal(A,T);
  }
  //----- IF sequences > leaf nodes THEN maybe complain -------//
  else {
    if (not internal_sequences) {
      alignment A2 = chop_internal(A);
      if (A2.n_sequences() == T.n_leaves()) {
	A = A2;
      }
      else
	throw myexception()<<"More alignment sequences than leaf nodes!";
    } 
    else if (A.n_sequences() > T.n_nodes())
      throw myexception()<<"More alignment sequences than tree nodes!";
    else if (A.n_sequences() < T.n_nodes())
      throw myexception()<<"Fewer alignment sequences than tree nodes!";
  }
  
  //---------- double-check that we have the right number of sequences ---------//
  if (internal_sequences)
    assert(A.n_sequences() == T.n_nodes());
  else
    assert(A.n_sequences() == T.n_leaves());

  //----- Remap leaf indices for T onto A's leaf sequence indices -----//
  remap_T_indices(T,A);

  if (internal_sequences)
    connect_leaf_characters(A,T);

  //---- Check to see that internal nodes satisfy constraints ----//
  check_alignment(A,T,internal_sequences);
}
Esempio n. 4
0
alignment add_internal(alignment A,const SequenceTree& T) 
{
  // Complain if A and T don't correspond
  if (A.n_sequences() != T.n_leaves())
    throw myexception()<<"Number of sequence in alignment doesn't match number of leaves in tree"
		       <<"- can't add internal sequences";

  // Add empty sequences
  vector<sequence> S;
  for(int i=T.n_leaves();i<T.n_nodes();i++) 
  {
    sequence s;

    if (T.label(i) == "")
      throw myexception()<<"Adding internal sequences: Tree has missing internal node name!";

    s.name = T.label(i);

    S.push_back(s);
  }

  A.add_sequences(S);

  return A;
}
Esempio n. 5
0
void peel_n_mutations(const alphabet& a, const vector<int>& letters, const SequenceTree& T,
		      const ublas::matrix<B>& cost,ublas::matrix<B>& n_muts,
		      const vector<const_branchview>& branches)
{
  const int A = a.size();

  assert(letters.size() == T.n_leaves());
  assert(cost.size1() == A);
  assert(cost.size2() == A);

  // we need a scratch row in the matrix
  assert(n_muts.size1() == T.n_nodes());
  assert(n_muts.size2() == A);

  // compute the max cost -- is this approach a good idea?
  // Well... this apparently doesn't work.
  B max_cost = 0;
  for(int i=0;i<A;i++)
    for(int j=0;j<A;j++)
      max_cost = std::max(cost(i,j)+1, max_cost);
    
  // clear the length matrix.
  for(int i=0;i<n_muts.size1();i++)
    for(int j=0;j<n_muts.size2();j++)
      n_muts(i,j)=0;
  
  // set the leaf costs
  for(int s=0;s<T.n_leaves();s++)
  {
    int L = letters[s];

    if (a.is_letter_class(L))
      for(int l=0;l<A;l++)
	if (a.matches(l,L))
	  n_muts(s,l) = 0;
	else
	  n_muts(s,l) = max_cost;
  }


  // compute the costs for letters at each node
  for(int i=0;i<branches.size();i++)
  {
    int s = branches[i].source();
    int t = branches[i].target();

    // for each letter l of node target...
    for(int l=0;l<A;l++)
    {
      // compute minimum treelength for data behind source.
      B temp = n_muts(s,0)+cost(0,l);
      for(int k=1;k<A;k++)
	temp = min(temp, n_muts(s,k)+cost(k,l) );

      // add it to treelengths for data behind target
      n_muts(t,l) += temp;
    }
  }
}
Esempio n. 6
0
 accum_branch_lengths_same_topology(const SequenceTree& T)
   :
   n_samples(0),
   n_matches(0),
   Q(T),
   m1(0.0, Q.n_branches()),
   m2(0.0, Q.n_branches()),
   n1(0.0, Q.n_nodes())
 {}
Esempio n. 7
0
void add_internal_labels(SequenceTree& T)
{
  for(int i=0;i<T.n_nodes();i++)
    if (T.node(i).is_internal_node())
    {
      if (T.get_label(i) == "")
	T.set_label(i, string("A") + convertToString(i));
    }
}
Esempio n. 8
0
B n_mutations(const alphabet& a, const vector<int>& letters, const SequenceTree& T,const ublas::matrix<B>& cost)
{
  int root = T.directed_branch(0).target();

  vector<const_branchview> branches = branches_toward_node(T,root);

  ublas::matrix<B> n_muts(T.n_nodes(), a.size());

  return n_mutations(a,letters,T,cost,n_muts,branches);
}
Esempio n. 9
0
vector<int> get_parsimony_letters(const alphabet& a, const vector<int>& letters, const SequenceTree& T,
				  const ublas::matrix<int>& cost)
{
  int root = T.directed_branch(0).target();
  ublas::matrix<int> n_muts(T.n_nodes(),a.size());

  peel_n_mutations(a,letters,T,cost,n_muts, branches_toward_node(T,root) );

  // get an order list of branches point away from the root;
  vector<const_branchview> branches = branches_from_node(T,root);
  std::reverse(branches.begin(),branches.end());
  
  // Allocate space to store the letter for each node
  vector<int> node_letters(T.n_nodes(),-1);

  // choose the cheapest letter at the root
  node_letters[root] = row_min(n_muts,root);

  const unsigned A = a.size();
  vector<double> temp(A);

  for(int i=0;i<branches.size();i++) 
  {
    int s = branches[i].source();
    int t = branches[i].target();

    int k = node_letters[s];
    assert(k != -1);

    for(int l=0;l<A;l++)
      temp[l] = n_muts(t,l)+cost(l,k);

    node_letters[t] = argmin(temp);
  }

  return node_letters;
}
Esempio n. 10
0
/// \brief Re-index the leaves of tree \a T so that the labels have the same ordering as in \a A.
///
/// \param T The leaf-labelled tree.
/// \param A A multiple sequence alignment.
///
alignment remap_A_indices(alignment& A, const SequenceTree& T)
{
  vector<string> labels = T.get_labels();

  if (A.n_sequences() == T.n_leaves())
  {
    labels.resize(T.n_leaves());

  }
  else if (A.n_sequences() != T.n_nodes())
    throw myexception()<<"Cannot map alignment onto tree:\n  Alignment has "<<A.n_sequences()<<" sequences.\n  Tree has "<<T.n_leaves()<<" leaves and "<<T.n_nodes()<<" nodes.";
      

  for(int i=0;i<labels.size();i++)
    if (labels[i] == "")
    {
      if (i<T.n_leaves())
	throw myexception()<<"Tree has empty label for a leaf node: not allowed!";
      else
	throw myexception()<<"Alignment has internal node information, but tree has empty label for an internal node: not allowed!";
    }

  assert(A.n_sequences() == labels.size());

  //----- Remap leaf indices for T onto A's leaf sequence indices -----//
  try {
    vector<int> mapping = compute_mapping(labels, sequence_names(A));

    return reorder_sequences(A,mapping);
  }
  catch(const bad_mapping<string>& b)
  {
    bad_mapping<string> b2 = b;
    b2.clear();
    if (b.from == 0)
      b2<<"Couldn't find sequence \""<<b2.missing<<"\" in alignment.";
    else
      b2<<"Alignment sequence '"<<b2.missing<<"' not found in the tree.";
    throw b2;
  }
}
Esempio n. 11
0
B n_mutations(const alignment& A, const SequenceTree& T,const ublas::matrix<B>& cost)
{
  const alphabet& a = A.get_alphabet();

  vector<int> letters(T.n_leaves());

  int root = T.directed_branch(0).target();

  vector<const_branchview> branches = branches_toward_node(T,root);

  ublas::matrix<B> n_muts(T.n_nodes(), a.size());

  double tree_length = 0;
  for(int c=0;c<A.length();c++) {
    for(int i=0;i<T.n_leaves();i++)
      letters[i] = A(c,i);
    double length = n_mutations<B>(a,letters,T,cost,n_muts,branches);
    tree_length += length;
  }

  return tree_length;
}
Esempio n. 12
0
// mark nodes in T according to what node of Q they map to
vector<int> get_nodes_map(const SequenceTree& Q,const SequenceTree& T,
			  const vector<int>& branches_map)
{
  assert(branches_map.size() == Q.n_branches() * 2);

  vector<int> nodes_map(T.n_nodes(),-1);

  // map nodes from T -> Q that are in both trees
  for(int b=0;b<Q.n_branches();b++)
  {
    int Q_source = Q.branch(b).source();
    int Q_target = Q.branch(b).target();

    int b2 = branches_map[b];

    int T_source = T.directed_branch(b2).source();
    int T_target = T.directed_branch(b2).target();

    if (nodes_map[T_source] == -1)
      nodes_map[T_source] = Q_source;
    else
      assert(nodes_map[T_source] == Q_source);

    if (nodes_map[T_target] == -1)
      nodes_map[T_target] = Q_target;
    else
      assert(nodes_map[T_target] == Q_target);
  }

  // map the rest of the nodes from T -> Q
  for(int i=Q.n_leaves();i<Q.n_nodes();i++) 
  {
    unsigned D = Q[i].degree();
    if (D <= 3) continue;

    // get a branch of Q pointing into the node
    const_branchview outside = *(Q[i].branches_in());
    // get a branch of T pointing into the node
    outside = T.directed_branch(branches_map[outside.name()]);

    list<const_branchview> branches;
    typedef list<const_branchview>::iterator list_iterator;
    append(outside.branches_after(),branches);
    for(list_iterator b = branches.begin() ; b != branches.end();)
    {
      int node = (*b).target();
      if (nodes_map[node] == -1)
	nodes_map[node] = i;

      if (nodes_map[node] == i) {
	append((*b).branches_after(),branches);
	b++;
      }
      else {
	list_iterator prev = b;
	b++;
	branches.erase(prev);
      }
    }
    assert(branches.size() == D-3);
  }

  for(int i=0;i<nodes_map.size();i++)
    assert(nodes_map[i] != -1);

  return nodes_map;
}
Esempio n. 13
0
int main(int argc,char* argv[]) 
{ 
  try {
    //----------- Parse command line  ----------//
    variables_map args = parse_cmd_line(argc,argv);

    int skip = args["skip"].as<int>();

    int max = -1;
    if (args.count("max"))
      max = args["max"].as<int>();

    int subsample = args["sub-sample"].as<int>();

    vector<string> prune;
    if (args.count("prune")) {
      string p = args["prune"].as<string>();
      prune = split(p,',');
    }
      

    //----------- Read the topology -----------//
    SequenceTree Q = load_T(args);
    standardize(Q);
    const int B = Q.n_branches();
    const int N = Q.n_nodes();
    vector<double> bf(B);
    for(int b=0;b<bf.size();b++)
      bf[b] = Q.branch(b).length();

    //-------- Read in the tree samples --------//
    if ( args.count("simple") ) {
      accum_branch_lengths_ignore_topology A(Q);
      scan_trees(std::cin,skip,subsample,max,prune,A);
      for(int b=0;b<B;b++)
	Q.branch(b).set_length(A.m1[b]);
      cout<<Q.write_with_bootstrap_fraction(bf,true)<<endl;
      exit(0);
    }

    accum_branch_lengths_same_topology A(Q);

    try {
      scan_trees(std::cin,skip,subsample,max,prune,A);
    }
    catch (std::exception& e) 
    {
      if (args.count("safe"))
	cout<<Q.write(false)<<endl;
      std::cerr<<"tree-mean-lengths: Error! "<<e.what()<<endl;
      exit(0);
    }

    if (log_verbose) std::cerr<<A.n_matches<<" out of "<<A.n_samples<<" trees matched the topology";
    if (log_verbose) std::cerr<<" ("<<double(A.n_matches)/A.n_samples*100<<"%)"<<std::endl;

    //------- Merge lengths and topology -------//
    if (args.count("var")) {
      for(int b=0;b<B;b++)
	Q.branch(b).set_length(A.m2[b]);
      cout<<Q;
      exit(0);
    }
    else {
      for(int b=0;b<B;b++)
	Q.branch(b).set_length(A.m1[b]);

      if (not args.count("no-node-lengths") and 
	  not args.count("show-node-lengths")) {
	for(int n=0;n<N;n++) {
	  int degree = Q[n].neighbors().size();
	  for(out_edges_iterator b = Q[n].branches_out();b;b++)
	    (*b).set_length((*b).length() + A.n1[n]/degree);
	}
      }

      //------- Print Tree and branch lengths -------//
      cout<<Q.write_with_bootstrap_fraction(bf,true)<<endl;

      //------------ Print node lengths -------------//
      if (args.count("show-node-lengths"))
	for(int n=0;n<Q.n_nodes();n++) {
	  if (A.n1[n] > 0) {
	    cout<<"node "<<A.n1[n]<<endl;
	    int b = (*Q[n].branches_in()).name();
	    cout<<partition_from_branch(Q,b)<<endl;
	  }
	}

    }
  }
  catch (std::exception& e) {
    std::cerr<<"tree-mean-lengths: Error! "<<e.what()<<endl;
    exit(1);
  }
  return 0;
}