bool update_lengths(const SequenceTree& Q,const SequenceTree& T, valarray<double>& branch_lengths, valarray<double>& branch_lengths_squared, valarray<double>& node_lengths) { // map branches from Q -> T vector<int> branches_map = extends_map(T,Q); if (not branches_map.size()) return false; // incorporate lengths of branches that map to Q for(int b=0;b<Q.n_branches();b++) { int b2 = branches_map[b]; double L = T.directed_branch(b2).length(); branch_lengths[b] += L; branch_lengths_squared[b] += L*L; } // map nodes from T -> Q vector<int> nodes_map = get_nodes_map(Q,T,branches_map); // incorprate lengths of branches that map to nodes in Q for(int i=T.n_leafbranches();i<T.n_branches();i++) { const_branchview b = T.branch(i); int n1 = nodes_map[b.source()]; int n2 = nodes_map[b.target()]; if (n1 == n2) node_lengths[n1] += T.branch(i).length(); } return true; }
void estimate_tree(const alignment& A, SequenceTree& T, substitution::MultiModel& smodel, const vector<int>& parameters) { //------- Estimate branch lengths -------------// log_branch_likelihood score2(A,smodel,T,parameters); // Initialize starting point optimize::Vector start(0.1, T.n_branches() + parameters.size()); for(int b=0;b<T.n_branches();b++) start[b] = log(T.branch(b).length()); for(int i=0;i<parameters.size();i++) start[i+T.n_branches()] = smodel.get_parameter_value_as<Double>( parameters[i] ); // optimize::Vector end = search_gradient(start,score); // optimize::Vector end = search_basis(start,score); optimize::Vector end = search_gradient(start,score2,1e-3); for(int b=0;b<T.n_branches();b++) T.branch(b).set_length(exp(end[b])); for(int i=0;i<parameters.size();i++) smodel.set_parameter_value(parameters[i], end[i+T.n_branches()]); smodel.set_rate(1); }
int choose_SPR_target(SequenceTree& T1, int b1_) { const_branchview b1 = T1.directed_branch(b1_); //----- Select the branch to move to ------// dynamic_bitset<> subtree_nodes = T1.partition(b1.reverse()); subtree_nodes[b1.target()] = true; vector<int> branches; vector<double> lengths; for(int i=0; i<T1.n_branches(); i++) { const_branchview bi = T1.branch(i); // skip branch if its contained in the subtree if (subtree_nodes[bi.target()] and subtree_nodes[bi.source()]) continue; double L = 1.0; // down-weight branch if it is one of the subtree's 2 neighbors if (subtree_nodes[bi.target()] or subtree_nodes[bi.source()]) L = 0.5; branches.push_back(i); lengths.push_back(L); } int b2 = branches[ choose(lengths) ]; return b2; }
void accum_branch_lengths_ignore_topology::operator()(const SequenceTree& T) { n_samples++; for(int b1=0;b1<Q.n_branches();b1++) { // this is a complete waste of CPU time. dynamic_bitset<> bp1 = branch_partition(Q,b1); if (not bp1[0]) bp1.flip(); // search for Q.branch(b1) in tree T int b2 = -1; for(int i=0;i<T.n_branches();i++) { dynamic_bitset<> bp2 = branch_partition(T,i); if (not bp2[0]) bp2.flip(); if (bp1 == bp2) { b2 = i; break; } } if (b2 != -1) { double L = T.branch(b2).length(); m1[b1] += L; m2[b1] += L*L; n_matches[b1]++; } } }
SequenceTree get_mf_tree(const std::vector<std::string>& names, const std::vector<dynamic_bitset<> >& partitions) { SequenceTree T = star_tree(names); for(int i=0;i<partitions.size();i++) T.induce_partition(partitions[i]); for(int i=0;i<T.n_branches();i++) T.branch(i).set_length(1.0); return T; }
/// Tree prior: topology & branch lengths (exponential) efloat_t prior_exponential(const SequenceTree& T,double branch_mean) { efloat_t p = 1; // --------- uniform prior on topologies --------// if (T.n_leaves()>3) p /= num_topologies(T.n_leaves()); // ---- Exponential prior on branch lengths ---- // for(int i=0;i<T.n_branches();i++) p *= exponential_pdf(T.branch(i).length(), branch_mean); return p; }
/// Tree prior: topology & branch lengths (gamma) efloat_t prior_gamma(const SequenceTree& T,double branch_mean) { efloat_t p = 1; // --------- uniform prior on topologies --------// if (T.n_leaves()>3) p /= num_topologies(T.n_leaves()); // ---- Exponential prior on branch lengths ---- // double a = 0.5; double b = branch_mean*2; for(int i=0;i<T.n_branches();i++) p *= gamma_pdf(T.branch(i).length(), a, b); return p; }
SequenceTree get_mf_tree(const std::vector<std::string>& names, const std::vector<Partition>& partitions) { SequenceTree T = star_tree(names); int i=0; try { for(;i<partitions.size();i++) T.induce_partition(partitions[i].group1); } catch(...) { throw myexception()<<"Partition ("<<partitions[i]<<") conflicts with tree "<<T; } for(int i=0;i<T.n_branches();i++) T.branch(i).set_length(1.0); return T; }
// mark nodes in T according to what node of Q they map to vector<int> get_nodes_map(const SequenceTree& Q,const SequenceTree& T, const vector<int>& branches_map) { assert(branches_map.size() == Q.n_branches() * 2); vector<int> nodes_map(T.n_nodes(),-1); // map nodes from T -> Q that are in both trees for(int b=0;b<Q.n_branches();b++) { int Q_source = Q.branch(b).source(); int Q_target = Q.branch(b).target(); int b2 = branches_map[b]; int T_source = T.directed_branch(b2).source(); int T_target = T.directed_branch(b2).target(); if (nodes_map[T_source] == -1) nodes_map[T_source] = Q_source; else assert(nodes_map[T_source] == Q_source); if (nodes_map[T_target] == -1) nodes_map[T_target] = Q_target; else assert(nodes_map[T_target] == Q_target); } // map the rest of the nodes from T -> Q for(int i=Q.n_leaves();i<Q.n_nodes();i++) { unsigned D = Q[i].degree(); if (D <= 3) continue; // get a branch of Q pointing into the node const_branchview outside = *(Q[i].branches_in()); // get a branch of T pointing into the node outside = T.directed_branch(branches_map[outside.name()]); list<const_branchview> branches; typedef list<const_branchview>::iterator list_iterator; append(outside.branches_after(),branches); for(list_iterator b = branches.begin() ; b != branches.end();) { int node = (*b).target(); if (nodes_map[node] == -1) nodes_map[node] = i; if (nodes_map[node] == i) { append((*b).branches_after(),branches); b++; } else { list_iterator prev = b; b++; branches.erase(prev); } } assert(branches.size() == D-3); } for(int i=0;i<nodes_map.size();i++) assert(nodes_map[i] != -1); return nodes_map; }
int main(int argc,char* argv[]) { try { //----------- Parse command line ----------// variables_map args = parse_cmd_line(argc,argv); int skip = args["skip"].as<int>(); int max = -1; if (args.count("max")) max = args["max"].as<int>(); int subsample = args["sub-sample"].as<int>(); vector<string> prune; if (args.count("prune")) { string p = args["prune"].as<string>(); prune = split(p,','); } //----------- Read the topology -----------// SequenceTree Q = load_T(args); standardize(Q); const int B = Q.n_branches(); const int N = Q.n_nodes(); vector<double> bf(B); for(int b=0;b<bf.size();b++) bf[b] = Q.branch(b).length(); //-------- Read in the tree samples --------// if ( args.count("simple") ) { accum_branch_lengths_ignore_topology A(Q); scan_trees(std::cin,skip,subsample,max,prune,A); for(int b=0;b<B;b++) Q.branch(b).set_length(A.m1[b]); cout<<Q.write_with_bootstrap_fraction(bf,true)<<endl; exit(0); } accum_branch_lengths_same_topology A(Q); try { scan_trees(std::cin,skip,subsample,max,prune,A); } catch (std::exception& e) { if (args.count("safe")) cout<<Q.write(false)<<endl; std::cerr<<"tree-mean-lengths: Error! "<<e.what()<<endl; exit(0); } if (log_verbose) std::cerr<<A.n_matches<<" out of "<<A.n_samples<<" trees matched the topology"; if (log_verbose) std::cerr<<" ("<<double(A.n_matches)/A.n_samples*100<<"%)"<<std::endl; //------- Merge lengths and topology -------// if (args.count("var")) { for(int b=0;b<B;b++) Q.branch(b).set_length(A.m2[b]); cout<<Q; exit(0); } else { for(int b=0;b<B;b++) Q.branch(b).set_length(A.m1[b]); if (not args.count("no-node-lengths") and not args.count("show-node-lengths")) { for(int n=0;n<N;n++) { int degree = Q[n].neighbors().size(); for(out_edges_iterator b = Q[n].branches_out();b;b++) (*b).set_length((*b).length() + A.n1[n]/degree); } } //------- Print Tree and branch lengths -------// cout<<Q.write_with_bootstrap_fraction(bf,true)<<endl; //------------ Print node lengths -------------// if (args.count("show-node-lengths")) for(int n=0;n<Q.n_nodes();n++) { if (A.n1[n] > 0) { cout<<"node "<<A.n1[n]<<endl; int b = (*Q[n].branches_in()).name(); cout<<partition_from_branch(Q,b)<<endl; } } } } catch (std::exception& e) { std::cerr<<"tree-mean-lengths: Error! "<<e.what()<<endl; exit(1); } return 0; }
//FIXME T.seq(i) -> T.leafname(i) //FIXME T.get_sequences -> T.leafnames() void delete_node(SequenceTree& T,const std::string& name) { int index = find_index(T.get_sequences(),name); nodeview n = T.prune_subtree(T.branch(index).reverse()); T.remove_node_from_branch(n); }
void print_stats(std::ostream& o,std::ostream& trees, const Parameters& P, bool print_alignment) { efloat_t Pr_prior = P.prior(); efloat_t Pr_likelihood = P.likelihood(); efloat_t Pr = Pr_prior * Pr_likelihood; o<<" prior = "<<Pr_prior; for(int i=0;i<P.n_data_partitions();i++) o<<" prior_A"<<i+1<<" = "<<P[i].prior_alignment(); o<<" likelihood = "<<Pr_likelihood<<" logp = "<<Pr <<" beta = " <<P.beta[0] <<"\n"; if (print_alignment) for(int i=0;i<P.n_data_partitions();i++) o<<standardize(*P[i].A, *P.T)<<"\n"; { SequenceTree T = *P.T; valarray<double> weights(P.n_data_partitions()); for(int i=0;i<weights.size();i++) weights[i] = max(sequence_lengths(*P[i].A, P.T->n_leaves())); weights /= weights.sum(); double mu_scale=0; for(int i=0;i<P.n_data_partitions();i++) mu_scale += P[i].branch_mean()*weights[i]; for(int b=0;b<T.n_branches();b++) T.branch(b).set_length(mu_scale*T.branch(b).length()); trees<<T<<std::endl; trees.flush(); } o<<"\n"; show_parameters(o,P); o.flush(); for(int m=0;m<P.n_smodels();m++) { o<<"smodel"<<m+1<<endl; for(int i=0;i<P.SModel(m).n_base_models();i++) o<<" rate"<<i<<" = "<<P.SModel(m).base_model(i).rate(); o<<"\n\n"; for(int i=0;i<P.SModel(m).n_base_models();i++) o<<" fraction"<<i<<" = "<<P.SModel(m).distribution()[i]; o<<"\n\n"; o<<"frequencies = "<<"\n"; show_frequencies(o,P.SModel(m)); o<<"\n\n"; o.flush(); } // The leaf sequences should NOT change during alignment #ifndef NDEBUG for(int i=0;i<P.n_data_partitions();i++) check_alignment(*P[i].A, *P.T,"print_stats:end"); #endif }
bool update_lengths(const MC_tree& Q,const SequenceTree& T, const vector<dynamic_bitset<> >& node_masks, const vector<Partition>& partitions2, valarray<double>& branch_lengths, valarray<double>& node_lengths) { // check that this tree is consistent with the MC Tree Q for(int i=0;i<Q.branch_order.size();i++) { int b = Q.branch_order[i]; if (not implies(T,Q.partitions[b])) return false; } // map branches of the input tree for(int b=0;b<T.n_branches();b++) { Partition P = partition_from_branch(T,b); // Find mc tree branches implied by branch b vector<int> branches; for(int i=0;i<Q.branch_order.size();i++) { if (implies(P,partitions2[i])) branches.push_back(i); } // Find out which nodes this branch is inside of vector<int> nodes; for(int n=0;n<Q.n_nodes();n++) { if (Q.degree(n) == 0) continue; Partition P2 = P; P2.group1 = P2.group1 & node_masks[n]; P2.group2 = P2.group2 & node_masks[n]; if (P2.group1.none() or P2.group2.none()) continue; bool ok = true; for(int b=0;b<2*Q.n_branches() and ok;b++) { if (Q.mapping[b] != n) continue; Partition P3 = Q.partitions[b]; P3.group1 = P3.group1 & node_masks[n]; P3.group2 = P3.group2 & node_masks[n]; if (partition_less_than(P3,P2) or partition_less_than(P3,P2.reverse())) ; else ok=false; } if (ok) { nodes.push_back(n); assert(Q.degree(n) > 3); } } /* cerr<<"Branch: "<<P<<endl; cerr<<" - maps to "<<branches.size()<<" branches."<<endl; if (nodes.size()) { cerr<<" - inside node(s):"<<endl; cerr<<P<<endl; for(int i=0;i<nodes.size();i++) cerr<<" "<<Q.partitions[Q.branch_to_node(nodes[i])]<<endl; } */ // This branch should be inside only one node, if any. assert(nodes.size() < 2); // This branch should not be inside a node, if it implies an mc tree branch. if (branches.size()) assert(not nodes.size()); // But this branch should be inside a node if it doesn't imply a branch. assert(branches.size() + nodes.size() > 0); const double L = T.branch(b).length(); // Divide the branch length evenly between the branches it implies. for(int i=0;i<branches.size();i++) branch_lengths[branches[i]] += L/branches.size(); // Divide the branch length evenly between the nodes (node?) it implies. for(int i=0;i<nodes.size();i++) node_lengths[nodes[i]] += L/nodes.size(); } return true; }
int main(int argc,char* argv[]) { try { std::cout.precision(3); std::cout.setf(ios::fixed); //---------- Parse command line -------// variables_map args = parse_cmd_line(argc,argv); //--------------------- Initialize ---------------------// if (args.count("seed")) { unsigned long seed = args["seed"].as<unsigned long>(); myrand_init(seed); } else myrand_init(); int skip = args["skip"].as<int>(); int subsample=args["sub-sample"].as<int>(); int max = -1; if (args.count("max")) max = args["max"].as<int>(); double min_support = args["min-support"].as<double>(); // leaf taxa to ignore vector<string> ignore; if (args.count("ignore") and args["ignore"].as<string>().size() > 0) ignore = split(args["ignore"].as<string>(),','); // consensus levels string c_levels = args.count("consensus") ? args["consensus"].as<string>() : ""; vector<double> consensus_levels = get_consensus_levels(c_levels); double report_ratio = args["odds-ratio"].as<double>(); bool show_sub = args.count("sub-partitions"); //-------------- Read in tree distributions --------------// string filename = args["file"].as<string>(); ifstream file(filename.c_str()); if (not file) throw myexception()<<"Couldn't open file "<<filename; tree_sample tree_dist(file,skip,subsample,max,ignore); const unsigned N = tree_dist.size(); dynamic_bitset<> ignore_mask = group_from_names(tree_dist.names(),vector<string>()); //------ Compute Ml partitions or sub-partitions --------// vector< pair<Partition,unsigned> > all_partitions; if (show_sub) { int depth = args["depth"].as<int>(); double min_rooting = args["rooting"].as<double>(); all_partitions = get_Ml_sub_partitions_and_counts(tree_dist,min_support, ~ignore_mask,min_rooting,depth); // std::cerr<<"n_sub_partitions = "<<all_partitions.size()<<"\n"; } else all_partitions = get_Ml_partitions_and_counts(tree_dist,min_support, ~ignore_mask); vector<int> which_topology; vector<int> topology_counts; std::map<tree_record,int> topologies_index; for(int i=0;i<tree_dist.size();i++) { std::map<tree_record,int>::iterator record = topologies_index.find(tree_dist[i]); if (record == topologies_index.end()) { which_topology.push_back(i); topology_counts.push_back(0); topologies_index[tree_dist[i]] = which_topology.size()-1; record = topologies_index.find(tree_dist[i]); } topology_counts[record->second]++; } vector<int> order = iota<int>(topology_counts.size()); std::sort(order.begin(),order.end(),sequence_order<int>(topology_counts)); std::reverse(order.begin(), order.end()); //------ Topologies to analyze -----// vector<string> topologies; cout<<"# n_trees = "<<tree_dist.size()<<" n_topologies = "<<topology_counts.size()<<endl; cout<<"\nTopology support: \n\n"; for(int i=0;i < args["map-trees"].as<int>() ;i++) { if (i >= order.size()) continue; string t = tree_dist.T(which_topology[order[i]]).write(false); unsigned n = topology_counts[order[i]]; double PP = double(n)/N; double o = odds(n,N,1); cout<<"MAP-"<<i<<" = "<<t<<endl; cout<<" PP = "<<PP<<" LOD = "<<log10(o)<<endl; cout<<"\n"; } for(int i=0,n=0;i<topology_counts.size();i++) { n += topology_counts[i]; double PP = double(n)/N; if (PP >= 0.95) { cout<<"95% credible set contains "<<i+1<<" topologies."<<endl; break; } } cout<<"\n\n"; //------- Print out support for each partition --------// cout<<"Partition support: \n\n"; vector<pair<Partition,unsigned> > good_partitions = thin(all_partitions, N, report_ratio); sort(good_partitions.begin(),good_partitions.end(), count_more()); for(int i=0;i<good_partitions.size();i++) { if (not informative(good_partitions[i].first)) continue; unsigned n = good_partitions[i].second; double PP = double(n)/N; double o = odds(n,N,1); cout<<" PP = "<<PP<<" LOD = "<<log10(o); if (not good_partitions[i].first.full()) { double ratio = odds_ratio(good_partitions,i,N,1); cout<<" ratio = "<<log10(ratio); } cout<<" pi = "<<good_partitions[i].first<<endl; cout<<endl<<endl; } //----------- display M[l] consensus levels ----------// std::cout.precision(4); cout<<"\n\nConsensus levels:\n\n"; vector<Partition> c50_sub_partitions = get_Ml_partitions(all_partitions, 0.5, N); vector<Partition> c50_full_partitions = select(c50_sub_partitions,&Partition::full); c50_sub_partitions = get_moveable_tree(c50_sub_partitions); vector<unsigned> levels = get_Ml_levels(all_partitions,N,min_support); levels.push_back(N+1); for(int j=0,k=0;j<levels.size() and k < consensus_levels.size();j++) { unsigned clevel = (unsigned)(consensus_levels[k]*N); while (k<consensus_levels.size() and clevel < levels[j]) { clevel = (unsigned)(consensus_levels[k]*N); vector<Partition> all = get_Ml_partitions(all_partitions,consensus_levels[k],N); vector<Partition> sub; vector<Partition> full; for(int i=0;i<all.size();i++) if (all[i].full()) full.push_back(all[i]); else sub.push_back(all[i]); SequenceTree consensus = get_mf_tree(tree_dist.names(),full); SequenceTree consensus2 = consensus; double L = consensus_levels[k]*100; cout.unsetf(ios::fixed | ios::showpoint); vector<double> bf(consensus.n_branches(),1.0); for(int i=0;i<bf.size();i++) { if (consensus.branch(i).is_leaf_branch()) bf[i] = -1.0; else { dynamic_bitset<> mask = branch_partition(consensus,i); Partition p(tree_dist.names(),mask); unsigned count = get_partition_count(all_partitions,p); bf[i] = double(count)/N; } consensus2.branch(i).set_length(bf[i]); } cout<<" "<<L<<"-consensus-PP = "<<consensus2.write(true)<<std::endl; //cout<<" "<<L<<"-consensus-PP2 = "<<consensus.write_with_bootstrap_fraction(bf,false)<<std::endl; cout<<" "<<L<<"-consensus = "<<consensus.write(false)<<std::endl; if (show_sub) { for(int i=0;i<sub.size();i++) cout<<sub[i]<<endl; } cout<<endl<<endl; k++; } if (levels[j] <=N) { show_level(tree_dist,levels[j],c50_sub_partitions,all_partitions,show_sub,false); cout<<endl; } } } catch (std::exception& e) { std::cerr<<"trees-consensus: Error! "<<e.what()<<endl; exit(1); } return 0; }