/// \brief Remap the leaf indices of tree \a T to match the alignment \a A: check the result /// /// \param A The alignment. /// \param T The tree. /// \param internal_sequences Should the resulting alignment have sequences for internal nodes on the tree? /// void link(alignment& A,SequenceTree& T,bool internal_sequences) { check_names_unique(A); // Later, might we WANT sub-branches??? if (has_sub_branches(T)) remove_sub_branches(T); if (internal_sequences and not is_Cayley(T) and T.n_leaves() > 1) { assert(has_polytomy(T)); throw myexception()<<"Cannot link a multifurcating tree to an alignment with internal sequences."; } //------ IF sequences < leaf nodes THEN complain ---------// if (A.n_sequences() < T.n_leaves()) throw myexception()<<"Tree has "<<T.n_leaves()<<" leaves but Alignment only has " <<A.n_sequences()<<" sequences."; //----- IF sequences = leaf nodes THEN maybe add internal sequences. else if (A.n_sequences() == T.n_leaves()) { A = remap_A_indices(A,T); if (internal_sequences) { add_internal_labels(T); A = add_internal(A,T); connect_leaf_characters(A,T); } } //----- IF sequences > leaf nodes THEN maybe complain -------// else if (A.n_sequences() > T.n_nodes()) throw myexception()<<"More alignment sequences ("<<A.n_sequences()<<") than tree nodes ("<<T.n_nodes()<<")!"; else if (A.n_sequences() < T.n_nodes()) throw myexception()<<"Fewer alignment sequences ("<<A.n_sequences()<<") than tree nodes ("<<T.n_nodes()<<")!"; else { A = remap_A_indices(A,T); if (not internal_sequences) A = chop_internal(A); } //---------- double-check that we have the right number of sequences ---------// if (internal_sequences) assert(A.n_sequences() == T.n_nodes()); else assert(A.n_sequences() == T.n_leaves()); //----- Check that each alignment sequence maps to a corresponding name in the tree -----// for(int i=0;i<A.n_sequences();i++) assert(T.get_label(i) == A.seq(i).name); //---- Check to see that internal nodes satisfy constraints ----// 
check_alignment(A,T,internal_sequences); }
/// \brief For each node of \a T, list every letter that is optimal for at least one retained letter of its parent.
///
/// The root is the target of directed branch 0.  n_muts is filled by
/// peel_n_mutations() over the branches pointing toward the root.  At the
/// root every letter whose entry equals the row minimum is kept.  Going away
/// from the root, a letter l is kept at a child if, for some letter k kept at
/// the parent, n_muts(child,l)+cost(l,k) attains the minimum over l.
///
/// \param a The alphabet.
/// \param letters The observed letters, forwarded to peel_n_mutations().
/// \param T The tree.
/// \param cost The letter-to-letter cost matrix.
/// \return One vector of retained letters per node of \a T.
///
vector<vector<int> > get_all_parsimony_letters(const alphabet& a, const vector<int>& letters, const SequenceTree& T,
                                               const ublas::matrix<int>& cost)
{
  const int root = T.directed_branch(0).target();

  ublas::matrix<int> n_muts(T.n_nodes(), a.size());
  peel_n_mutations(a,letters,T,cost,n_muts, branches_toward_node(T,root) );

  // Branch order for the downward pass: the reverse of branches_from_node().
  vector<const_branchview> branches = branches_from_node(T,root);
  std::reverse(branches.begin(),branches.end());

  // One (initially empty) letter set per node.
  vector<vector<int> > node_letters(T.n_nodes());

  const unsigned A = a.size();

  // Root: keep every letter that ties for the smallest entry in the root row.
  {
    const double root_best = row_min(n_muts,root);
    for(int l=0;l<A;l++)
    {
      if (n_muts(root,l) > root_best) continue;
      node_letters[root].push_back(l);
    }
  }

  vector<double> temp(A);
  for(int i=0;i<branches.size();i++)
  {
    const int s = branches[i].source();
    const int t = branches[i].target();

    // Letters already retained at the source node of this branch.
    const vector<int>& parent_letters = node_letters[s];

    // best[j]: cheapest value reachable at t when the source holds parent_letters[j].
    vector<double> best(parent_letters.size());
    for(int j=0;j<parent_letters.size();j++)
    {
      const int k = parent_letters[j];
      for(int l=0;l<A;l++)
        temp[l] = n_muts(t,l)+cost(l,k);
      best[j] = min(temp);
    }

    // Keep l at t as soon as it attains the optimum for any one source letter.
    for(int l=0;l<A;l++)
    {
      for(int j=0;j<parent_letters.size();j++)
        if (n_muts(t,l)+cost(l,parent_letters[j]) <= best[j])
        {
          node_letters[t].push_back(l);
          break;
        }
    }
  }

  return node_letters;
}
/// \brief Remap the leaf indices of tree \a T to match the alignment \a A: check the result /// /// \param A The alignment. /// \param T The tree. /// \param internal_sequences Should the resulting alignment have sequences for internal nodes on the tree? /// void link(alignment& A,SequenceTree& T,bool internal_sequences) { check_names_unique(A); // Later, might we WANT sub-branches??? if (has_sub_branches(T)) remove_sub_branches(T); if (internal_sequences and not is_Cayley(T)) { assert(has_polytomy(T)); throw myexception()<<"Cannot link a multifurcating tree to an alignment with internal sequences."; } //------ IF sequences < leaf nodes THEN complain ---------// if (A.n_sequences() < T.n_leaves()) throw myexception()<<"Tree has "<<T.n_leaves()<<" leaves but Alignment only has " <<A.n_sequences()<<" sequences."; //----- IF sequences = leaf nodes THEN maybe add internal sequences. else if (A.n_sequences() == T.n_leaves()) { if (internal_sequences) A = add_internal(A,T); } //----- IF sequences > leaf nodes THEN maybe complain -------// else { if (not internal_sequences) { alignment A2 = chop_internal(A); if (A2.n_sequences() == T.n_leaves()) { A = A2; } else throw myexception()<<"More alignment sequences than leaf nodes!"; } else if (A.n_sequences() > T.n_nodes()) throw myexception()<<"More alignment sequences than tree nodes!"; else if (A.n_sequences() < T.n_nodes()) throw myexception()<<"Fewer alignment sequences than tree nodes!"; } //---------- double-check that we have the right number of sequences ---------// if (internal_sequences) assert(A.n_sequences() == T.n_nodes()); else assert(A.n_sequences() == T.n_leaves()); //----- Remap leaf indices for T onto A's leaf sequence indices -----// remap_T_indices(T,A); if (internal_sequences) connect_leaf_characters(A,T); //---- Check to see that internal nodes satisfy constraints ----// check_alignment(A,T,internal_sequences); }
/// \brief Return a copy of \a A with one default-constructed, named sequence appended for each node index from T.n_leaves() up to T.n_nodes()-1.
///
/// \param A The alignment (taken by value); must have exactly T.n_leaves() sequences.
/// \param T The tree supplying the names of the added sequences.
/// \return The extended alignment.
///
/// Throws myexception if the sequence count differs from the leaf count, or
/// if any of the visited nodes has an empty label.
///
alignment add_internal(alignment A,const SequenceTree& T)
{
  // A and T must correspond leaf-for-leaf before anything is added.
  if (A.n_sequences() != T.n_leaves())
    throw myexception()<<"Number of sequence in alignment doesn't match number of leaves in tree"
                       <<"- can't add internal sequences";

  // Build one named sequence per node beyond the leaves.
  vector<sequence> internal;
  int node = T.n_leaves();
  while (node < T.n_nodes())
  {
    if (T.label(node) == "")
      throw myexception()<<"Adding internal sequences: Tree has missing internal node name!";

    sequence s;
    s.name = T.label(node);
    internal.push_back(s);

    node++;
  }

  A.add_sequences(internal);

  return A;
}
void peel_n_mutations(const alphabet& a, const vector<int>& letters, const SequenceTree& T, const ublas::matrix<B>& cost,ublas::matrix<B>& n_muts, const vector<const_branchview>& branches) { const int A = a.size(); assert(letters.size() == T.n_leaves()); assert(cost.size1() == A); assert(cost.size2() == A); // we need a scratch row in the matrix assert(n_muts.size1() == T.n_nodes()); assert(n_muts.size2() == A); // compute the max cost -- is this approach a good idea? // Well... this apparently doesn't work. B max_cost = 0; for(int i=0;i<A;i++) for(int j=0;j<A;j++) max_cost = std::max(cost(i,j)+1, max_cost); // clear the length matrix. for(int i=0;i<n_muts.size1();i++) for(int j=0;j<n_muts.size2();j++) n_muts(i,j)=0; // set the leaf costs for(int s=0;s<T.n_leaves();s++) { int L = letters[s]; if (a.is_letter_class(L)) for(int l=0;l<A;l++) if (a.matches(l,L)) n_muts(s,l) = 0; else n_muts(s,l) = max_cost; } // compute the costs for letters at each node for(int i=0;i<branches.size();i++) { int s = branches[i].source(); int t = branches[i].target(); // for each letter l of node target... for(int l=0;l<A;l++) { // compute minimum treelength for data behind source. B temp = n_muts(s,0)+cost(0,l); for(int k=1;k<A;k++) temp = min(temp, n_muts(s,k)+cost(k,l) ); // add it to treelengths for data behind target n_muts(t,l) += temp; } } }
/// \brief Construct an accumulator around a copy of the topology \a T.
///
/// Both counters (n_samples, n_matches) start at zero and Q is initialised
/// from \a T.  m1 and m2 are constructed from (0.0, Q.n_branches()) and n1
/// from (0.0, Q.n_nodes()) -- presumably "value, count" containers holding
/// one zero per branch / per node; confirm against the member declarations.
/// NOTE(review): m1, m2 and n1 read Q, so Q must be declared before them in
/// the class for this initialisation to be valid.
accum_branch_lengths_same_topology(const SequenceTree& T)
  : n_samples(0),
    n_matches(0),
    Q(T),
    m1(0.0, Q.n_branches()),
    m2(0.0, Q.n_branches()),
    n1(0.0, Q.n_nodes())
{}
/// \brief Label every internal node of \a T that currently has an empty label.
///
/// The new label is "A" followed by convertToString() of the node index.
/// Nodes that are not internal, or that already have a label, are left alone.
///
/// \param T The tree whose labels are updated in place.
///
void add_internal_labels(SequenceTree& T)
{
  for(int node=0; node<T.n_nodes(); node++)
  {
    // Only internal nodes are candidates.
    if (not T.node(node).is_internal_node()) continue;

    // Existing labels are never overwritten.
    if (not (T.get_label(node) == "")) continue;

    T.set_label(node, string("A") + convertToString(node));
  }
}
/// \brief Convenience overload: allocate the work matrix and branch order, then delegate to the full n_mutations().
///
/// The root is the target of directed branch 0; the branches are those
/// returned by branches_toward_node() for that root.  B is the cost type; its
/// declaration is outside this block.
///
/// \param a The alphabet.
/// \param letters The observed letters, forwarded unchanged.
/// \param T The tree.
/// \param cost The letter-to-letter cost matrix.
/// \return Whatever the six-argument n_mutations() returns.
///
B n_mutations(const alphabet& a, const vector<int>& letters, const SequenceTree& T,const ublas::matrix<B>& cost)
{
  const int root_node = T.directed_branch(0).target();

  // Scratch matrix: one row per node, one column per letter.
  ublas::matrix<B> scratch(T.n_nodes(), a.size());

  vector<const_branchview> toward_root = branches_toward_node(T,root_node);

  return n_mutations(a,letters,T,cost,scratch,toward_root);
}
/// \brief Choose one letter for every node of \a T by a bottom-up cost pass followed by a top-down traceback.
///
/// The root is the target of directed branch 0.  n_muts is filled by
/// peel_n_mutations() over the branches pointing toward the root.  The root
/// receives the letter with the smallest entry in its row (lowest index on
/// ties); every other node receives argmin over l of
/// n_muts(node,l)+cost(l,k), where k is the letter already chosen for the
/// node on the root side of the branch.
///
/// \param a The alphabet.
/// \param letters The observed letters, forwarded to peel_n_mutations().
/// \param T The tree.
/// \param cost The letter-to-letter cost matrix.
/// \return One letter index per node of \a T.
///
vector<int> get_parsimony_letters(const alphabet& a, const vector<int>& letters, const SequenceTree& T,
                                  const ublas::matrix<int>& cost)
{
  int root = T.directed_branch(0).target();
  ublas::matrix<int> n_muts(T.n_nodes(),a.size());

  peel_n_mutations(a,letters,T,cost,n_muts, branches_toward_node(T,root) );

  // get an ordered list of branches pointing away from the root
  vector<const_branchview> branches = branches_from_node(T,root);
  std::reverse(branches.begin(),branches.end());

  // Allocate space to store the letter for each node (-1 = not chosen yet)
  vector<int> node_letters(T.n_nodes(),-1);

  const int A = a.size();

  // Choose the cheapest letter at the root.  This must be the *index* of the
  // smallest entry in the root row: row_min() is used elsewhere as the
  // minimum *value*, and storing a cost here would later be used as a letter
  // index into cost(l,k).
  int root_letter = 0;
  for(int l=1;l<A;l++)
    if (n_muts(root,l) < n_muts(root,root_letter))
      root_letter = l;
  node_letters[root] = root_letter;

  vector<double> temp(A);
  for(int i=0;i<branches.size();i++)
  {
    int s = branches[i].source();
    int t = branches[i].target();

    // The source must already have a letter, or the branch order is wrong.
    int k = node_letters[s];
    assert(k != -1);

    for(int l=0;l<A;l++)
      temp[l] = n_muts(t,l)+cost(l,k);

    node_letters[t] = argmin(temp);
  }

  return node_letters;
}
/// \brief Re-index the leaves of tree \a T so that the labels have the same ordering as in \a A. /// /// \param T The leaf-labelled tree. /// \param A A multiple sequence alignment. /// alignment remap_A_indices(alignment& A, const SequenceTree& T) { vector<string> labels = T.get_labels(); if (A.n_sequences() == T.n_leaves()) { labels.resize(T.n_leaves()); } else if (A.n_sequences() != T.n_nodes()) throw myexception()<<"Cannot map alignment onto tree:\n Alignment has "<<A.n_sequences()<<" sequences.\n Tree has "<<T.n_leaves()<<" leaves and "<<T.n_nodes()<<" nodes."; for(int i=0;i<labels.size();i++) if (labels[i] == "") { if (i<T.n_leaves()) throw myexception()<<"Tree has empty label for a leaf node: not allowed!"; else throw myexception()<<"Alignment has internal node information, but tree has empty label for an internal node: not allowed!"; } assert(A.n_sequences() == labels.size()); //----- Remap leaf indices for T onto A's leaf sequence indices -----// try { vector<int> mapping = compute_mapping(labels, sequence_names(A)); return reorder_sequences(A,mapping); } catch(const bad_mapping<string>& b) { bad_mapping<string> b2 = b; b2.clear(); if (b.from == 0) b2<<"Couldn't find sequence \""<<b2.missing<<"\" in alignment."; else b2<<"Alignment sequence '"<<b2.missing<<"' not found in the tree."; throw b2; } }
/// \brief Sum the per-column result of n_mutations() over every column of alignment \a A.
///
/// For each column c the leaf letters A(c,i), i < T.n_leaves(), are copied
/// into a work vector and handed to the six-argument n_mutations<B>().  The
/// per-column results are accumulated in a double and converted to B on
/// return.  B is the cost type; its declaration is outside this block.
///
/// \param A The alignment.
/// \param T The tree.
/// \param cost The letter-to-letter cost matrix.
/// \return The accumulated total over all columns.
///
B n_mutations(const alignment& A, const SequenceTree& T,const ublas::matrix<B>& cost)
{
  const alphabet& a = A.get_alphabet();

  // Work that does not depend on the column is set up once.
  const int root = T.directed_branch(0).target();
  vector<const_branchview> branches = branches_toward_node(T,root);
  ublas::matrix<B> n_muts(T.n_nodes(), a.size());
  vector<int> letters(T.n_leaves());

  double tree_length = 0;
  for(int column=0; column<A.length(); column++)
  {
    // Gather this column's letter for every leaf.
    for(int leaf=0; leaf<T.n_leaves(); leaf++)
      letters[leaf] = A(column,leaf);

    const double column_length = n_mutations<B>(a,letters,T,cost,n_muts,branches);
    tree_length += column_length;
  }

  return tree_length;
}
// mark nodes in T according to what node of Q they map to vector<int> get_nodes_map(const SequenceTree& Q,const SequenceTree& T, const vector<int>& branches_map) { assert(branches_map.size() == Q.n_branches() * 2); vector<int> nodes_map(T.n_nodes(),-1); // map nodes from T -> Q that are in both trees for(int b=0;b<Q.n_branches();b++) { int Q_source = Q.branch(b).source(); int Q_target = Q.branch(b).target(); int b2 = branches_map[b]; int T_source = T.directed_branch(b2).source(); int T_target = T.directed_branch(b2).target(); if (nodes_map[T_source] == -1) nodes_map[T_source] = Q_source; else assert(nodes_map[T_source] == Q_source); if (nodes_map[T_target] == -1) nodes_map[T_target] = Q_target; else assert(nodes_map[T_target] == Q_target); } // map the rest of the nodes from T -> Q for(int i=Q.n_leaves();i<Q.n_nodes();i++) { unsigned D = Q[i].degree(); if (D <= 3) continue; // get a branch of Q pointing into the node const_branchview outside = *(Q[i].branches_in()); // get a branch of T pointing into the node outside = T.directed_branch(branches_map[outside.name()]); list<const_branchview> branches; typedef list<const_branchview>::iterator list_iterator; append(outside.branches_after(),branches); for(list_iterator b = branches.begin() ; b != branches.end();) { int node = (*b).target(); if (nodes_map[node] == -1) nodes_map[node] = i; if (nodes_map[node] == i) { append((*b).branches_after(),branches); b++; } else { list_iterator prev = b; b++; branches.erase(prev); } } assert(branches.size() == D-3); } for(int i=0;i<nodes_map.size();i++) assert(nodes_map[i] != -1); return nodes_map; }
int main(int argc,char* argv[]) { try { //----------- Parse command line ----------// variables_map args = parse_cmd_line(argc,argv); int skip = args["skip"].as<int>(); int max = -1; if (args.count("max")) max = args["max"].as<int>(); int subsample = args["sub-sample"].as<int>(); vector<string> prune; if (args.count("prune")) { string p = args["prune"].as<string>(); prune = split(p,','); } //----------- Read the topology -----------// SequenceTree Q = load_T(args); standardize(Q); const int B = Q.n_branches(); const int N = Q.n_nodes(); vector<double> bf(B); for(int b=0;b<bf.size();b++) bf[b] = Q.branch(b).length(); //-------- Read in the tree samples --------// if ( args.count("simple") ) { accum_branch_lengths_ignore_topology A(Q); scan_trees(std::cin,skip,subsample,max,prune,A); for(int b=0;b<B;b++) Q.branch(b).set_length(A.m1[b]); cout<<Q.write_with_bootstrap_fraction(bf,true)<<endl; exit(0); } accum_branch_lengths_same_topology A(Q); try { scan_trees(std::cin,skip,subsample,max,prune,A); } catch (std::exception& e) { if (args.count("safe")) cout<<Q.write(false)<<endl; std::cerr<<"tree-mean-lengths: Error! 
"<<e.what()<<endl; exit(0); } if (log_verbose) std::cerr<<A.n_matches<<" out of "<<A.n_samples<<" trees matched the topology"; if (log_verbose) std::cerr<<" ("<<double(A.n_matches)/A.n_samples*100<<"%)"<<std::endl; //------- Merge lengths and topology -------// if (args.count("var")) { for(int b=0;b<B;b++) Q.branch(b).set_length(A.m2[b]); cout<<Q; exit(0); } else { for(int b=0;b<B;b++) Q.branch(b).set_length(A.m1[b]); if (not args.count("no-node-lengths") and not args.count("show-node-lengths")) { for(int n=0;n<N;n++) { int degree = Q[n].neighbors().size(); for(out_edges_iterator b = Q[n].branches_out();b;b++) (*b).set_length((*b).length() + A.n1[n]/degree); } } //------- Print Tree and branch lengths -------// cout<<Q.write_with_bootstrap_fraction(bf,true)<<endl; //------------ Print node lengths -------------// if (args.count("show-node-lengths")) for(int n=0;n<Q.n_nodes();n++) { if (A.n1[n] > 0) { cout<<"node "<<A.n1[n]<<endl; int b = (*Q[n].branches_in()).name(); cout<<partition_from_branch(Q,b)<<endl; } } } } catch (std::exception& e) { std::cerr<<"tree-mean-lengths: Error! "<<e.what()<<endl; exit(1); } return 0; }