/// Do a SPR move on T1, moving the subtree behind b1_ to branch b2 double do_SPR(SequenceTree& T1, int b1_,int b2) { const_branchview b1 = T1.directed_branch(b1_); SequenceTree T2 = T1; //------ Generate the new topology ------// if (T2.directed_branch(b2).target() == b1.target() or T2.directed_branch(b2).source() == b1.target()) ; else SPR(T2,b1.reverse(),b2); //------ Find the two new branches ------// vector<const_branchview> connected1; append(T1.directed_branch(b1.source(),b1.target()).branches_after(),connected1); vector<const_branchview> connected2; append(T2.directed_branch(b1.source(),b1.target()).branches_after(),connected2); assert(connected1.size() == 2); assert(connected2.size() == 2); //------- Place the split randomly -------// double L1 = connected1[0].length() + connected1[1].length(); double L2 = connected2[0].length() + connected2[1].length(); T2.directed_branch(connected2[0]).set_length( myrandomf() * L2 ); T2.directed_branch(connected2[1]).set_length( L2 - T2.directed_branch(connected2[0]).length() ); T1 = T2; return L2/L1; }
int choose_SPR_target(SequenceTree& T1, int b1_) { const_branchview b1 = T1.directed_branch(b1_); //----- Select the branch to move to ------// dynamic_bitset<> subtree_nodes = T1.partition(b1.reverse()); subtree_nodes[b1.target()] = true; vector<int> branches; vector<double> lengths; for(int i=0; i<T1.n_branches(); i++) { const_branchview bi = T1.branch(i); // skip branch if its contained in the subtree if (subtree_nodes[bi.target()] and subtree_nodes[bi.source()]) continue; double L = 1.0; // down-weight branch if it is one of the subtree's 2 neighbors if (subtree_nodes[bi.target()] or subtree_nodes[bi.source()]) L = 0.5; branches.push_back(i); lengths.push_back(L); } int b2 = branches[ choose(lengths) ]; return b2; }
bool update_lengths(const SequenceTree& Q,const SequenceTree& T, valarray<double>& branch_lengths, valarray<double>& branch_lengths_squared, valarray<double>& node_lengths) { // map branches from Q -> T vector<int> branches_map = extends_map(T,Q); if (not branches_map.size()) return false; // incorporate lengths of branches that map to Q for(int b=0;b<Q.n_branches();b++) { int b2 = branches_map[b]; double L = T.directed_branch(b2).length(); branch_lengths[b] += L; branch_lengths_squared[b] += L*L; } // map nodes from T -> Q vector<int> nodes_map = get_nodes_map(Q,T,branches_map); // incorprate lengths of branches that map to nodes in Q for(int i=T.n_leafbranches();i<T.n_branches();i++) { const_branchview b = T.branch(i); int n1 = nodes_map[b.source()]; int n2 = nodes_map[b.target()]; if (n1 == n2) node_lengths[n1] += T.branch(i).length(); } return true; }
B n_mutations(const alphabet& a, const vector<int>& letters, const SequenceTree& T,const ublas::matrix<B>& cost, ublas::matrix<B>& n_muts, const vector<const_branchview>& branches) { int root = T.directed_branch(0).target(); peel_n_mutations(a,letters,T,cost,n_muts,branches); return row_min(n_muts,root); }
B n_mutations(const alphabet& a, const vector<int>& letters, const SequenceTree& T,const ublas::matrix<B>& cost) { int root = T.directed_branch(0).target(); vector<const_branchview> branches = branches_toward_node(T,root); ublas::matrix<B> n_muts(T.n_nodes(), a.size()); return n_mutations(a,letters,T,cost,n_muts,branches); }
vector<vector<int> > get_all_parsimony_letters(const alphabet& a, const vector<int>& letters, const SequenceTree& T, const ublas::matrix<int>& cost) { int root = T.directed_branch(0).target(); ublas::matrix<int> n_muts(T.n_nodes(), a.size()); peel_n_mutations(a,letters,T,cost,n_muts, branches_toward_node(T,root) ); // get an order list of branches point away from the root; vector<const_branchview> branches = branches_from_node(T,root); std::reverse(branches.begin(),branches.end()); // Allocate space to store the letters for each node vector<vector<int> > node_letters(T.n_nodes()); const unsigned A = a.size(); // choose the cheapest letters at the root { double m = row_min(n_muts,root); for(int l=0;l<A;l++) if (n_muts(root,l) <= m) node_letters[root].push_back(l); } vector<double> temp(A); for(int i=0;i<branches.size();i++) { int s = branches[i].source(); int t = branches[i].target(); vector<double> best(node_letters[s].size()); for(int j=0;j<node_letters[s].size();j++) { for(int l=0;l<A;l++) temp[l] = n_muts(t,l)+cost(l,node_letters[s][j]); best[j] = min(temp); } for(int l=0;l<A;l++) { bool is_best = false; for(int j=0;j<node_letters[s].size() and not is_best;j++) if (n_muts(t,l)+cost(l,node_letters[s][j]) <= best[j]) is_best=true; if (is_best) node_letters[t].push_back(l); } } return node_letters; }
B n_mutations(const alignment& A, const SequenceTree& T,const ublas::matrix<B>& cost) { const alphabet& a = A.get_alphabet(); vector<int> letters(T.n_leaves()); int root = T.directed_branch(0).target(); vector<const_branchview> branches = branches_toward_node(T,root); ublas::matrix<B> n_muts(T.n_nodes(), a.size()); double tree_length = 0; for(int c=0;c<A.length();c++) { for(int i=0;i<T.n_leaves();i++) letters[i] = A(c,i); double length = n_mutations<B>(a,letters,T,cost,n_muts,branches); tree_length += length; } return tree_length; }
vector<int> get_parsimony_letters(const alphabet& a, const vector<int>& letters, const SequenceTree& T, const ublas::matrix<int>& cost) { int root = T.directed_branch(0).target(); ublas::matrix<int> n_muts(T.n_nodes(),a.size()); peel_n_mutations(a,letters,T,cost,n_muts, branches_toward_node(T,root) ); // get an order list of branches point away from the root; vector<const_branchview> branches = branches_from_node(T,root); std::reverse(branches.begin(),branches.end()); // Allocate space to store the letter for each node vector<int> node_letters(T.n_nodes(),-1); // choose the cheapest letter at the root node_letters[root] = row_min(n_muts,root); const unsigned A = a.size(); vector<double> temp(A); for(int i=0;i<branches.size();i++) { int s = branches[i].source(); int t = branches[i].target(); int k = node_letters[s]; assert(k != -1); for(int l=0;l<A;l++) temp[l] = n_muts(t,l)+cost(l,k); node_letters[t] = argmin(temp); } return node_letters; }
// mark nodes in T according to what node of Q they map to vector<int> get_nodes_map(const SequenceTree& Q,const SequenceTree& T, const vector<int>& branches_map) { assert(branches_map.size() == Q.n_branches() * 2); vector<int> nodes_map(T.n_nodes(),-1); // map nodes from T -> Q that are in both trees for(int b=0;b<Q.n_branches();b++) { int Q_source = Q.branch(b).source(); int Q_target = Q.branch(b).target(); int b2 = branches_map[b]; int T_source = T.directed_branch(b2).source(); int T_target = T.directed_branch(b2).target(); if (nodes_map[T_source] == -1) nodes_map[T_source] = Q_source; else assert(nodes_map[T_source] == Q_source); if (nodes_map[T_target] == -1) nodes_map[T_target] = Q_target; else assert(nodes_map[T_target] == Q_target); } // map the rest of the nodes from T -> Q for(int i=Q.n_leaves();i<Q.n_nodes();i++) { unsigned D = Q[i].degree(); if (D <= 3) continue; // get a branch of Q pointing into the node const_branchview outside = *(Q[i].branches_in()); // get a branch of T pointing into the node outside = T.directed_branch(branches_map[outside.name()]); list<const_branchview> branches; typedef list<const_branchview>::iterator list_iterator; append(outside.branches_after(),branches); for(list_iterator b = branches.begin() ; b != branches.end();) { int node = (*b).target(); if (nodes_map[node] == -1) nodes_map[node] = i; if (nodes_map[node] == i) { append((*b).branches_after(),branches); b++; } else { list_iterator prev = b; b++; branches.erase(prev); } } assert(branches.size() == D-3); } for(int i=0;i<nodes_map.size();i++) assert(nodes_map[i] != -1); return nodes_map; }
int main(int argc,char* argv[]) { try { //---------- Parse command line -------// variables_map args = parse_cmd_line(argc,argv); //----------- Load alignment and tree ---------// alignment A; SequenceTree T; if (args.count("tree")) load_A_and_T(args,A,T,false); else A = load_A(args,false); const alphabet& a = A.get_alphabet(); //------- Load groups and find branches -------// vector<sequence_group> groups; if (args.count("groups")) groups = load_groups(A,args["groups"].as<string>()); for(int i=0;i<groups.size();i++) { cerr<<groups[i].name<<": "; for(int j=0;j<groups[i].taxa.size();j++) cerr<<A.seq(groups[i].taxa[j]).name<<" "; cerr<<endl; } vector<int> group_branches; if (args.count("tree")) { for(int i=0;i<groups.size();i++) { dynamic_bitset<> p(T.n_leaves()); for(int j=0;j<groups[i].taxa.size();j++) p[groups[i].taxa[j]] = true; int found = -1; for(int b=0;b<2*T.n_branches() and found == -1;b++) if (p == branch_partition(T,b)) found = b; if (found == -1) throw myexception()<<"I can't find group "<<i+1<<" on the tree!"; group_branches.push_back(found); } } vector<string> group_names; for(int i=0;i<groups.size();i++) group_names.push_back(groups[i].name); vector<Partition> splits; if (args.count("split")) { vector<string> split = args["split"].as<vector<string> >(); for(int i=0;i<split.size();i++) splits.push_back(Partition(group_names,split[i])); } //-------------------------------------------// Matrix C(A.length(),A.n_sequences()+1); for(int i=0;i<C.size1();i++) for(int j=0;j<C.size2();j++) C(i,j) = 0; // yes but, how much more conservation THAN EXPECTED do we see? for(int c=0;c<C.size1();c++) { vector<bool> interesting(groups.size(), true); //-------------------------------------------------------// vector<int> leaf_letters( T.n_leaves() ); for(int j=0;j<leaf_letters.size();j++) leaf_letters[j] = A(c,j); vector<vector<int> > node_letters = get_all_parsimony_letters(a,leaf_letters,T,unit_cost_matrix(a)); vector<vector<int> > initial_value(groups.size()); for(int g=0;g<groups.size();g++) { int n = T.directed_branch(group_branches[g]).target(); initial_value[g] = node_letters[n]; } //------------ find 'group conserved at' values ----------// vector<int> value(groups.size(),alphabet::gap); for(int g=0;g<groups.size();g++) { vector<int> temp; for(int i=0;i<groups[g].taxa.size();i++) temp.push_back(A(c,groups[g].taxa[i])); int best = most_common(temp); int count = number_of(temp,best); if (count >= groups[g].taxa.size()-1 and count >=3 and count> groups[g].taxa.size()/2) value[g] = best; } //-------- Determine whether column is interesting --------// if (args.count("require-all-different")) { if (args.count("split")) { vector<bool> in_changed_split(groups.size(),false); for(int i=0;i<splits.size();i++) { bool some_different = false; for(int g1=0;g1<groups.size();g1++) for(int g2=0;g2<groups.size();g2++) if (splits[i].group1[g1] and splits[i].group2[g2]) { if (all_different(A,c,groups,g1,g2)) some_different = true; } bool no_same = true; for(int g1=0;g1<groups.size();g1++) for(int g2=0;g2<groups.size();g2++) if (splits[i].group1[g1] and splits[i].group2[g2]) if (not all_different(A,c,groups,g1,g2)) no_same = false; if (some_different and no_same) for(int g=0;g<groups.size();g++) if (splits[i].group1[g] or splits[i].group2[g]) in_changed_split[g] = true; } for(int g=0;g<groups.size();g++) interesting[g] = interesting[g] and in_changed_split[g]; } else { bool different = false; for(int g1=0;g1<groups.size();g1++) for(int g2=0;g2<groups.size();g2++) if (all_different(A,c,groups,g1,g2)) different = true; if (not different) for(int g=0;g<groups.size();g++) interesting[g] = false; } } if (args.count("require-change")) { if (args.count("split")) { vector<bool> in_changed_split(groups.size(),false); for(int i=0;i<splits.size();i++) { bool some_different = false; for(int g1=0;g1<groups.size();g1++) for(int g2=0;g2<groups.size();g2++) if (splits[i].group1[g1] and splits[i].group2[g2]) { if (value[g1] != value[g2]) if (not args.count("ignore-rate-change") or (value[g1] != alphabet::gap and value[g2] != alphabet::gap)) some_different = true; } bool no_same = true; for(int g1=0;g1<groups.size();g1++) for(int g2=0;g2<groups.size();g2++) if (splits[i].group1[g1] and splits[i].group2[g2]) if (value[g1] == value[g2]) no_same = false; // This is Option #1 // - some conserved differences but no conserved similarities // Also consider Option #2 // - a change in both LETTER and CONSERVATION on at least one of the // two branches leading from the duplication. if (some_different and no_same) for(int g=0;g<groups.size();g++) if (splits[i].group1[g] or splits[i].group2[g]) in_changed_split[g] = true; } for(int g=0;g<groups.size();g++) interesting[g] = interesting[g] and in_changed_split[g]; } else { if (args.count("ignore-rate-change")) { for(int g=0;g<groups.size();g++) interesting[g] = interesting[g] and not all_same_or(value,alphabet::gap); } else { for(int g=0;g<groups.size();g++) interesting[g] = interesting[g] and not all_same(value); } } } // A group is only interesting if its conserved if (args.count("require-conservation")) for(int g=0;g<groups.size();g++) interesting[g] = interesting[g] and (value[g] != alphabet::gap); // A group is only interesting if it is in one of the splits if (args.count("split")) for(int g=0;g<groups.size();g++) { bool found = false; for(int i=0;i<splits.size() and not found;i++) if (splits[i].group1[g] or splits[i].group2[g]) found = true; interesting[g] = interesting[g] and found; } //------------ print 'group conserved at' values ----------// cerr<<c+1<<" "; for(int i=0;i<value.size();i++) cerr<<a.lookup(value[i])<<" "; cerr<<" "; for(int g=0;g<groups.size();g++) if (interesting[g]) cerr<<"1 "; else cerr<<"0 "; cerr<<endl; //------------- print parsimony initial values --------------// cerr<<" "; for(int i=0;i<initial_value.size();i++) { for(int j=0;j<initial_value[i].size();j++) cerr<<a.lookup(initial_value[i][j]); cerr<<" "; } cerr<<" "<<n_mutations(a,leaf_letters,T,unit_cost_matrix(a))<<endl; cerr<<endl; //--------------------- Set highlighting ---------------------// // Interesting groups -> 1.0 for(int g=0;g<groups.size();g++) if (interesting[g]) for(int i=0;i<groups[g].taxa.size();i++) C(c,groups[g].taxa[i]) = 1.0; // Set conserved groups -> 0.5 for(int g=0;g<groups.size();g++) if (value[g] != alphabet::gap) for(int i=0;i<groups[g].taxa.size();i++) C(c,groups[g].taxa[i]) = std::max(0.5, C(c,groups[g].taxa[i])); } cout<<join(sequence_names(A),' ')<<endl; for(int i=0;i<C.size1();i++) { vector<double> temp; for(int j=0;j<C.size2();j++) temp.push_back(C(i,j)); cout<<join(temp,' ')<<endl; } } catch (std::exception& e) { std::cerr<<"alignment-find-conserved: Error! "<<e.what()<<endl; exit(1); } return 0; }