/// \brief Score one column of leaf letters on tree \a T.
///
/// \param a        The alphabet the letters belong to.
/// \param letters  One letter per leaf.
/// \param T        The tree.
/// \param cost     Letter-to-letter substitution costs.
/// \param n_muts   Work table (one row per node) that is filled in by peel_n_mutations().
/// \param branches The branches handed on to peel_n_mutations().
/// \return row_min() of the \a n_muts row for the target node of directed branch 0.
///
/// NOTE(review): \c B is presumably a template parameter declared outside this block.
B n_mutations(const alphabet& a, const vector<int>& letters, const SequenceTree& T,
              const ublas::matrix<B>& cost, ublas::matrix<B>& n_muts,
              const vector<const_branchview>& branches)
{
  // Fill in the per-node cost table first ...
  peel_n_mutations(a, letters, T, cost, n_muts, branches);

  // ... then read the score off the row that belongs to the root node.
  const int root_node = T.directed_branch(0).target();
  return row_min(n_muts, root_node);
}
/// Does any branch in T imply the partition p? bool implies(const SequenceTree& T,const Partition& p) { bool result = false; for(int b=0;b<T.n_branches() and not result;b++) { dynamic_bitset<> bp = branch_partition(T,b); if (implies(bp,p)) return true; } return false; }
/// \brief Collect the partition induced by every branch of \a T.
///
/// \param T The tree.
/// \return One partition per branch, in branch-index order.
vector<Partition> all_partitions_from_tree(const SequenceTree& T)
{
  vector<Partition> result;

  int branch = 0;
  while (branch < T.n_branches())
  {
    result.push_back(partition_from_branch(T,branch));
    ++branch;
  }

  return result;
}
/// \brief Re-index the leaves of tree \a T so that they follow the ordering of \a names.
///
/// \param T     The leaf-labelled tree; it is standardized in place.
/// \param names The ordered names to match.
///
/// A bad_mapping thrown while computing the mapping is re-thrown as a fresh
/// bad_mapping (same \c missing / \c from fields) carrying a readable message.
void remap_T_indices(SequenceTree& T,const vector<string>& names)
{
  try
  {
    vector<int> leaf_order = compute_mapping(T.get_sequences(),names);
    T.standardize(leaf_order);
  }
  catch(const bad_mapping<string>& err)
  {
    bad_mapping<string> explained(err.missing,err.from);

    // The message depends on which side of the mapping lacked the name.
    if (err.from != 0)
      explained<<"Sequence '"<<explained.missing<<"' not found in the tree.";
    else
      explained<<"Couldn't find leaf sequence \""<<explained.missing<<"\" in names.";

    throw explained;
  }
}
/// \brief Find the first directed branch of \a T whose partition directed-implies \a p.
///
/// \param T The tree; both orientations of each branch are tried (indices 0 .. 2*n_branches-1).
/// \param p The partition to look for.
/// \return The index of the first matching directed branch, or -1 if there is none.
int which_branch(const SequenceTree& T, const Partition& p)
{
  const int n_directed = 2*T.n_branches();

  int found = -1;
  for(int b=0; found == -1 && b<n_directed; b++)
  {
    dynamic_bitset<> bp = branch_partition(T,b);

    if (directed_implies(bp,p))
      found = b;
  }

  return found;
}
void write_partitions(std::ostream& o,const vector<Partition>& partitions) { vector<Partition> full; vector<Partition> sub; for(int i=0;i<partitions.size();i++) if (partitions[i].full()) full.push_back(partitions[i]); else sub.push_back(partitions[i]); if (full.size()) { SequenceTree consensus = get_mf_tree(partitions[0].names,full); o<<consensus.write(false)<<endl; } for(int i=0;i<sub.size();i++) o<<sub[i]<<endl; }
/// \brief Remap the leaf indices of tree \a T to match the alignment \a A: check the result /// /// \param A The alignment. /// \param T The tree. /// \param internal_sequences Should the resulting alignment have sequences for internal nodes on the tree? /// void link(alignment& A,SequenceTree& T,bool internal_sequences) { check_names_unique(A); // Later, might we WANT sub-branches??? if (has_sub_branches(T)) remove_sub_branches(T); if (internal_sequences and not is_Cayley(T)) { assert(has_polytomy(T)); throw myexception()<<"Cannot link a multifurcating tree to an alignment with internal sequences."; } //------ IF sequences < leaf nodes THEN complain ---------// if (A.n_sequences() < T.n_leaves()) throw myexception()<<"Tree has "<<T.n_leaves()<<" leaves but Alignment only has " <<A.n_sequences()<<" sequences."; //----- IF sequences = leaf nodes THEN maybe add internal sequences. else if (A.n_sequences() == T.n_leaves()) { if (internal_sequences) A = add_internal(A,T); } //----- IF sequences > leaf nodes THEN maybe complain -------// else { if (not internal_sequences) { alignment A2 = chop_internal(A); if (A2.n_sequences() == T.n_leaves()) { A = A2; } else throw myexception()<<"More alignment sequences than leaf nodes!"; } else if (A.n_sequences() > T.n_nodes()) throw myexception()<<"More alignment sequences than tree nodes!"; else if (A.n_sequences() < T.n_nodes()) throw myexception()<<"Fewer alignment sequences than tree nodes!"; } //---------- double-check that we have the right number of sequences ---------// if (internal_sequences) assert(A.n_sequences() == T.n_nodes()); else assert(A.n_sequences() == T.n_leaves()); //----- Remap leaf indices for T onto A's leaf sequence indices -----// remap_T_indices(T,A); if (internal_sequences) connect_leaf_characters(A,T); //---- Check to see that internal nodes satisfy constraints ----// check_alignment(A,T,internal_sequences); }
/// \brief Re-index the leaves of tree \a T1 so that the labels have the same ordering as in \a T2. /// /// \param T1 The leaf-labelled tree to re-index. /// \param T2 The leaf-labelled tree to match. /// void remap_T_indices(SequenceTree& T1,const SequenceTree& T2) { if (T1.n_leaves() != T2.n_leaves()) throw myexception()<<"Trees do not correspond: different numbers of leaves."; //----- Remap leaf indices for T onto A's leaf sequence indices -----// try { remap_T_indices(T1,T2.get_sequences()); } catch(const bad_mapping<string>& b) { bad_mapping<string> b2(b.missing,b.from); if (b.from == 0) b2<<"Couldn't find leaf sequence \""<<b2.missing<<"\" in second tree."; else b2<<"Couldn't find leaf sequence \""<<b2.missing<<"\" in first tree."; throw b2; } }
/// \brief Build a (possibly multifurcating) tree on \a names that contains every partition.
///
/// Starts from a star tree and induces the partitions one after another; every
/// branch of the finished tree gets length 1.0.
///
/// \param names      The leaf names.
/// \param partitions The partitions to induce, in order.
/// \return The resulting tree.
///
/// Throws myexception naming the offending partition if inducing it fails.
SequenceTree get_mf_tree(const std::vector<std::string>& names, const std::vector<Partition>& partitions)
{
  SequenceTree T = star_tree(names);

  // Kept outside the try block so the handler can report which partition failed.
  int current = 0;
  try
  {
    while (current < partitions.size())
    {
      T.induce_partition(partitions[current].group1);
      current++;
    }
  }
  catch(...)
  {
    throw myexception()<<"Partition ("<<partitions[current]<<") conflicts with tree "<<T;
  }

  for(int b=0;b<T.n_branches();b++)
    T.branch(b).set_length(1.0);

  return T;
}
/// \brief Randomly pick a branch of \a T1 outside the subtree selected by directed branch \a b1_.
///
/// Branches with both endpoints inside the marked node set are not candidates.
/// Branches with exactly one endpoint in the set get weight 0.5, all other
/// candidates weight 1.0; choose() then draws from these weights.
///
/// \param T1  The tree.
/// \param b1_ Index of the directed branch that selects the subtree.
/// \return The index of the chosen branch.
int choose_SPR_target(SequenceTree& T1, int b1_)
{
  const_branchview b1 = T1.directed_branch(b1_);

  //----- Mark the nodes that belong to the subtree ------//
  dynamic_bitset<> subtree_nodes = T1.partition(b1.reverse());
  subtree_nodes[b1.target()] = true;

  //----- Collect the candidate branches and their weights ------//
  vector<int> branches;
  vector<double> lengths;

  for(int i=0;i<T1.n_branches();i++)
  {
    const_branchview bi = T1.branch(i);

    const bool target_inside = subtree_nodes[bi.target()];
    const bool source_inside = subtree_nodes[bi.source()];

    // skip branch if its contained in the subtree
    if (target_inside && source_inside)
      continue;

    // down-weight branch if it is one of the subtree's 2 neighbors
    const double weight = (target_inside || source_inside) ? 0.5 : 1.0;

    branches.push_back(i);
    lengths.push_back(weight);
  }

  try
  {
    return branches[ choose(lengths) ];
  }
  catch (choose_exception<efloat_t>& c)
  {
    // Tag the exception with this function's name before passing it on.
    c.prepend(__PRETTY_FUNCTION__);
    throw c;
  }
}
/// \brief Re-index the leaves of tree \a T so that the labels have the same ordering as in \a names. /// /// \param T The leaf-labelled tree. /// \param names The ordered leaf labels. /// void remap_T_leaf_indices(SequenceTree& T,const vector<string>& names) { assert(names.size() == T.n_leaves()); //----- Remap leaf indices for T onto A's leaf sequence indices -----// try { vector<int> mapping = compute_mapping(T.get_leaf_labels(), names); T.standardize(mapping); } catch(const bad_mapping<string>& b) { bad_mapping<string> b2 = b; b2.clear(); if (b2.from == 0) b2<<"Couldn't find leaf sequence \""<<b2.missing<<"\" in names."; else b2<<"Sequence '"<<b2.missing<<"' not found in the tree."; throw b2; } }
/// \brief Construct a data partition.
///
/// \param n  The name stored as partition_name.
/// \param a  The alignment stored as A; its sequence count sizes cached_sequence_lengths.
/// \param t  The tree stored as T; its branch count sizes the per-branch caches and HMM tables.
/// \param SM The substitution model stored as SModel_ and passed on to MC.
///
/// Each per-branch alignment-count cache starts as a 5x5 integer matrix, and
/// the constructor body calls invalidate() on every one of them.
data_partition::data_partition(const string& n, const alignment& a,const SequenceTree& t,
                               const substitution::MultiModel& SM)
  :SModel_(SM),
   partition_name(n),
   cached_alignment_prior_for_branch(t.n_branches()),
   cached_alignment_counts_for_branch(t.n_branches(),ublas::matrix<int>(5,5)),
   cached_sequence_lengths(a.n_sequences()),
   branch_mean_(1.0),
   smodel_full_tree(true),
   A(a),
   T(t),
   MC(t,SM),
   // NOTE(review): LC is built from SModel(), which presumably reads SModel_ -- this
   // relies on SModel_ being declared (and therefore initialized) before LC; confirm.
   LC(t,SModel()),
   branch_HMMs(t.n_branches()),
   branch_HMM_type(t.n_branches(),0),
   beta(2, 1.0)
{
  // Nothing has been computed yet: flag every cached count matrix as invalid.
  for(int b=0;b<cached_alignment_counts_for_branch.size();b++)
    cached_alignment_counts_for_branch[b].invalidate();
}
/// \brief Re-index the leaves of tree \a T so that the labels have the same ordering as in \a A. /// /// \param T The leaf-labelled tree. /// \param A A multiple sequence alignment. /// alignment remap_A_indices(alignment& A, const SequenceTree& T) { vector<string> labels = T.get_labels(); if (A.n_sequences() == T.n_leaves()) { labels.resize(T.n_leaves()); } else if (A.n_sequences() != T.n_nodes()) throw myexception()<<"Cannot map alignment onto tree:\n Alignment has "<<A.n_sequences()<<" sequences.\n Tree has "<<T.n_leaves()<<" leaves and "<<T.n_nodes()<<" nodes."; for(int i=0;i<labels.size();i++) if (labels[i] == "") { if (i<T.n_leaves()) throw myexception()<<"Tree has empty label for a leaf node: not allowed!"; else throw myexception()<<"Alignment has internal node information, but tree has empty label for an internal node: not allowed!"; } assert(A.n_sequences() == labels.size()); //----- Remap leaf indices for T onto A's leaf sequence indices -----// try { vector<int> mapping = compute_mapping(labels, sequence_names(A)); return reorder_sequences(A,mapping); } catch(const bad_mapping<string>& b) { bad_mapping<string> b2 = b; b2.clear(); if (b.from == 0) b2<<"Couldn't find sequence \""<<b2.missing<<"\" in alignment."; else b2<<"Alignment sequence '"<<b2.missing<<"' not found in the tree."; throw b2; } }
/// \brief Re-index the leaves of tree \a T so that the labels have the same ordering as in \a A. /// /// \param T The leaf-labelled tree. /// \param A A multiple sequence alignment. /// void remap_T_indices(SequenceTree& T,const alignment& A) { if (A.n_sequences() < T.n_leaves()) throw myexception()<<"Tree has "<<T.n_leaves()<<" leaves, but alignment has only "<<A.n_sequences()<<" sequences."; //----- Remap leaf indices for T onto A's leaf sequence indices -----// try { vector<string> names = sequence_names(A,T.n_leaves()); remap_T_indices(T,names); } catch(const bad_mapping<string>& b) { bad_mapping<string> b2(b.missing,b.from); if (b.from == 0) b2<<"Couldn't find leaf sequence \""<<b2.missing<<"\" in alignment."; else b2<<"Alignment sequence '"<<b2.missing<<"' not found in the tree."; throw b2; } }
bool update_lengths(const SequenceTree& Q,const SequenceTree& T, valarray<double>& branch_lengths, valarray<double>& branch_lengths_squared, valarray<double>& node_lengths) { // map branches from Q -> T vector<int> branches_map = extends_map(T,Q); if (not branches_map.size()) return false; // incorporate lengths of branches that map to Q for(int b=0;b<Q.n_branches();b++) { int b2 = branches_map[b]; double L = T.directed_branch(b2).length(); branch_lengths[b] += L; branch_lengths_squared[b] += L*L; } // map nodes from T -> Q vector<int> nodes_map = get_nodes_map(Q,T,branches_map); // incorprate lengths of branches that map to nodes in Q for(int i=T.n_leafbranches();i<T.n_branches();i++) { const_branchview b = T.branch(i); int n1 = nodes_map[b.source()]; int n2 = nodes_map[b.target()]; if (n1 == n2) node_lengths[n1] += T.branch(i).length(); } return true; }
/// \brief Sum the per-column n_mutations() scores of alignment \a A on tree \a T.
///
/// \param A    The alignment; only its first T.n_leaves() sequences are read.
/// \param T    The tree.
/// \param cost Letter-to-letter substitution costs.
/// \return The sum over all columns, accumulated in a double and converted to \c B on return.
///
/// NOTE(review): \c B is presumably a template parameter declared outside this block.
B n_mutations(const alignment& A, const SequenceTree& T,const ublas::matrix<B>& cost)
{
  const alphabet& a = A.get_alphabet();

  // Work space that is re-used for every column.
  vector<int> letters(T.n_leaves());

  int root = T.directed_branch(0).target();
  vector<const_branchview> branches = branches_toward_node(T,root);

  ublas::matrix<B> n_muts(T.n_nodes(), a.size());

  double total = 0;
  for(int column=0;column<A.length();column++)
  {
    // Gather the leaf letters of this column ...
    for(int leaf=0;leaf<T.n_leaves();leaf++)
      letters[leaf] = A(column,leaf);

    // ... and add the column's score to the running total.
    double column_score = n_mutations<B>(a,letters,T,cost,n_muts,branches);
    total += column_score;
  }

  return total;
}
void load_As_and_random_T(const variables_map& args,vector<alignment>& alignments,SequenceTree& T,const vector<bool>& internal_sequences) { //align - filenames vector<string> filenames = args["align"].as<vector<string> >(); // load the alignments alignments = load_alignments(filenames,load_alphabets(args)); //------------- Load random tree ------------------------// SequenceTree TC = star_tree(sequence_names(alignments[0])); if (args.count("t-constraint")) TC = load_constraint_tree(args["t-constraint"].as<string>(),sequence_names(alignments[0])); T = TC; RandomTree(T,1.0); //-------------- Link --------------------------------// link(alignments,T,internal_sequences); //---------------process----------------// for(int i=0;i<alignments.size();i++) { //---------------- Randomize alignment? -----------------// if (args.count("randomize-alignment")) alignments[i] = randomize(alignments[i],T.n_leaves()); //------------------ Analyze 'internal'------------------// if ((args.count("internal") and args["internal"].as<string>() == "+") or args.count("randomize-alignment")) for(int column=0;column< alignments[i].length();column++) { for(int j=T.n_leaves();j<alignments[i].n_sequences();j++) alignments[i](column,j) = alphabet::not_gap; } //---- Check that internal sequence satisfy constraints ----// check_alignment(alignments[i],T,internal_sequences[i]); } }
/// Reorder internal sequences of \a A to correspond to standardized node names for \a T alignment standardize(const alignment& A, const SequenceTree& T) { SequenceTree T2 = T; // if we don't have any internal node sequences, then we are already standardized if (A.n_sequences() == T.n_leaves()) return A; // standardize NON-LEAF node and branch names in T vector<int> mapping = T2.standardize(); vector<int> new_order = invert(mapping); return reorder_sequences(A,new_order); }
int main(int argc,char* argv[]) { Arguments args; args.read(argc,argv); unsigned long seed =0; if (args.set("seed")) { seed = convertTo<unsigned long>(args["seed"]); myrand_init(seed); } else seed = myrand_init(); assert(args.set("names")); vector<string> names = split(args["names"],':'); double branch_mean = 0.1; if (args.set("mean")) branch_mean = convertTo<double>(args["mean"]); SequenceTree T = RandomTree(names,branch_mean); std::cout<<T.write()<<std::endl; }
/// \brief Assign a letter to every node of \a T from the cost table built by peel_n_mutations().
///
/// \param a       The alphabet.
/// \param letters One letter per leaf.
/// \param T       The tree.
/// \param cost    Letter-to-letter substitution costs.
/// \return One entry per node of \a T.
///
/// The root is the target of directed branch 0. Branches from
/// branches_from_node() are then visited last-to-first, and the target of each
/// branch receives the argmin over letters l of n_muts(target,l) + cost(l, letter at source).
vector<int> get_parsimony_letters(const alphabet& a, const vector<int>& letters, const SequenceTree& T,
                                  const ublas::matrix<int>& cost)
{
  int root = T.directed_branch(0).target();

  // Fill in the per-node cost table.
  ublas::matrix<int> n_muts(T.n_nodes(),a.size());
  peel_n_mutations(a,letters,T,cost,n_muts, branches_toward_node(T,root) );

  // The branches leaving the root; they are walked in reverse order below.
  vector<const_branchview> branches = branches_from_node(T,root);

  // Allocate space to store the letter for each node
  vector<int> node_letters(T.n_nodes(),-1);

  // choose the cheapest letter at the root
  // NOTE(review): this stores the result of row_min(); if row_min() returns the
  // minimum *cost* rather than the index of the cheapest letter, this is not a
  // letter -- confirm against row_min()'s definition.
  node_letters[root] = row_min(n_muts,root);

  const unsigned n_letters = a.size();
  vector<double> scores(n_letters);

  for(int i=int(branches.size())-1;i>=0;i--)
  {
    const int from = branches[i].source();
    const int to   = branches[i].target();

    // The source must already have been assigned a letter.
    const int from_letter = node_letters[from];
    assert(from_letter != -1);

    for(unsigned l=0;l<n_letters;l++)
      scores[l] = n_muts(to,l)+cost(l,from_letter);

    node_letters[to] = argmin(scores);
  }

  return node_letters;
}
/// \brief Reduce a tree string to the text of its standardized form.
///
/// \param t The tree, as a string.
/// \return The result of write(false) on the tree returned by standardized(t).
string topology(const string& t)
{
  SequenceTree canonical = standardized(t);

  return canonical.write(false);
}
/// \brief Construct the model parameters for a set of data partitions (no indel model).
///
/// \param A             One alignment per data partition.
/// \param t             The tree, stored as T.
/// \param SMs           The substitution models, registered as sub-models "S1", "S2", ...
/// \param s_mapping     For each data partition, the index of the substitution model it uses.
/// \param scale_mapping Stored as scale_for_partition.
///
/// Throws myexception if \a s_mapping does not have one entry per alignment, or
/// if it refers to a substitution model index that does not exist.
Parameters::Parameters(const vector<alignment>& A, const SequenceTree& t,
                       const vector<polymorphic_cow_ptr<substitution::MultiModel> >& SMs,
                       const vector<int>& s_mapping,
                       const vector<int>& scale_mapping)
  :SModels(SMs),
   smodel_for_partition(s_mapping),
   scale_for_partition(scale_mapping),
   branch_prior_type(0),
   smodel_full_tree(true),
   T(t),
   // The constraint tree starts out as a star tree on the same sequences.
   TC(star_tree(t.get_sequences())),
   branch_HMM_type(t.n_branches(),0),
   beta(2, 1.0),
   updown(-1),
   features(0)
{
  constants.push_back(-1);

  // One super-parameter "mu<k>" (initial value 1.0) per scale.
  // NOTE(review): n_scales is not set in the initializer list above -- presumably
  // it is a constant or is initialized elsewhere; confirm.
  for(int i=0;i<n_scales;i++)
    add_super_parameter("mu"+convertToString(i+1),1.0);

  // check that smodel mapping has correct size.
  if (smodel_for_partition.size() != A.size())
    throw myexception()<<"There are "<<A.size()
                       <<" data partitions, but you mapped smodels onto "
                       <<smodel_for_partition.size();

  // register the substitution models as sub-models
  for(int i=0;i<SModels.size();i++)
  {
    string name = "S" + convertToString(i+1);
    add_submodel(name, *SModels[i]);
  }

  // NO indel model (in this constructor)

  // check that we only mapping existing smodels to data partitions
  // NOTE(review): only the upper bound is checked; a negative index is not rejected here.
  for(int i=0;i<smodel_for_partition.size();i++)
  {
    int m = smodel_for_partition[i];
    if (m >= SModels.size())
      throw myexception()<<"You can't use smodel "<<m+1<<" for data partition "<<i+1
                         <<" because there are only "<<SModels.size()<<" smodels.";
  }

  // load values from sub-models (smodels/imodel)
  read();

  // don't constrain any branch lengths
  for(int b=0;b<TC->n_branches();b++)
    TC->branch(b).set_length(-1);

  // create data partitions and register as sub-models
  for(int i=0;i<A.size();i++)
  {
    // compute name for data-partition
    string name = string("part") + convertToString(i+1);

    // get reference to smodel for data-partition
    const substitution::MultiModel& SM = SModel(smodel_for_partition[i]);

    // create data partition
    data_partitions.push_back(cow_ptr<data_partition>(data_partition(name,A[i],*T,SM)));

    // register data partition as sub-model
    add_submodel(name,*data_partitions[i]);
  }
}
/// \brief Assign edge lengths to \a tree from the distance matrix \a orig_dm and score the fit.
///
/// \param orig_dm Distance matrix whose row identifiers are the leaf names of \a tree.
/// \param tree    The tree; its node ids are recalculated and its edge lengths overwritten.
/// \return computeL2() between the distance matrix derived from the finished tree and \a orig_dm.
///
/// Works on a private copy of the matrix. Internal nodes are visited in the
/// order produced by recalcNodeIdsPostfixOrderAndAddInOrder(); at each one the
/// two children get their edge lengths, the parent takes over the matrix row of
/// child 2, and child 1's row is swapped to the end and removed. The last node
/// is handled separately as a root with three children.
///
/// Raises USER_ERROR if a matrix name is not a leaf of the tree, if a node has
/// the wrong number of children, or if a non-finite sum is encountered.
double computeLeastSquaresEdgeLengths(const StrDblMatrix &orig_dm, SequenceTree &tree){

  StrDblMatrix dm(orig_dm);
  const int numOriginalLeafs = dm.getSize();
  SequenceTree::NodeVector nodes;
  tree.recalcNodeIdsPostfixOrderAndAddInOrder(nodes);

  // Two-way lookup between node ids and rows of the (shrinking) working matrix.
  // NOTE(review): arrays sized by a runtime value are a compiler extension, not standard C++.
  size_t nodeIdToRowIndex[nodes.size()];
  size_t rowIndexToNodeId[nodes.size()];

  // Map every leaf name to its node id.
  str2int_hashmap name2Id((int)(nodes.size()*1.7));
  for(size_t i=0 ; i<nodes.size() ; i++)
    if(nodes[i]->isLeaf()){
      //PRINT(NAME(nodes[i]));PRINT(ID(nodes[i]));
      name2Id[NAME(nodes[i])] = ID(nodes[i]);
    }

  // Tie each matrix row to the leaf that carries the row's identifier.
  for(size_t row=0 ; row<dm.getSize() ; row++){
    str2int_hashmap::iterator f = name2Id.find(dm.getIdentifier(row));
    if(f==name2Id.end())
      USER_ERROR("name doesn't exist in tree: " << dm.getIdentifier(row));
    nodeIdToRowIndex[(*f).second] = row;
    rowIndexToNodeId[row] = (*f).second;
  }

  //the number of leafs below each node
  int numNodesBelow[nodes.size()];
  for(size_t i=0;i<nodes.size();i++)
    numNodesBelow[i]=1;

  //--------------------------------
  //BOTTOM UP TRAVERSAL IN TREE
  // The last entry of `nodes` is skipped here; it is handled as the root below.
  for(size_t i=0;i<nodes.size()-1;i++){
    if(nodes[i]->isLeaf())
      continue;

    //get the children and do the UNJ calculation to get the edge lengths
    SequenceTree::Node *parent = nodes[i];
    SequenceTree::Node *child1 = parent->getRightMostChild();
    SequenceTree::Node *child2 = child1->getLeftSibling();
    if(child2->getLeftSibling()!=NULL ){
      USER_ERROR("Have to be unrooted binary tree. Parent has " << parent->getNumChildren() << " children");
    }
    numNodesBelow[ID(parent)] = numNodesBelow[ID(child1)] + numNodesBelow[ID(child2)];
    //SEPARATOR();PRINT(NAME(child1));PRINT(NAME(child2));

    // Weighted sum, over all other rows, of the difference between the
    // distances to child 1 and to child 2.
    double sum = 0;
    for(size_t row=0;row<dm.getSize();row++){
      if(row==nodeIdToRowIndex[ID(child1)] || row==nodeIdToRowIndex[ID(child2)] )
        continue;
      sum += numNodesBelow[rowIndexToNodeId[row]]*(dm.getDistance(nodeIdToRowIndex[ID(child1)],row)-
                                                   dm.getDistance(nodeIdToRowIndex[ID(child2)],row));
    }
    if(!isfinite(sum)){
      USER_ERROR("Distance Matrix contains a non finite number: " << sum);
    }

    // Split the child1-child2 distance between the two edges, shifted by the weighted sum.
    EDGE(child1) = 0.5*dm.getDistance(nodeIdToRowIndex[ID(child1)], nodeIdToRowIndex[ID(child2)]) +
      1.0/(2*(numOriginalLeafs-numNodesBelow[ID(parent)]))*sum;
    EDGE(child2) = 0.5*dm.getDistance(nodeIdToRowIndex[ID(child1)], nodeIdToRowIndex[ID(child2)]) -
      1.0/(2*(numOriginalLeafs-numNodesBelow[ID(parent)]))*sum;
    // PRINT(dm.getDistance(nodeIdToRowIndex[ID(child1)],nodeIdToRowIndex[ID(child2)]));
    // PRINT((numOriginalLeafs-numNodesBelow[ID(parent)]));
    // PRINT(sum);PRINT(EDGE(child1));PRINT( EDGE(child2));
    //PRINT(1/(2*(numOriginalLeafs-numNodesBelow[ID(parent)]))*sum);

    //swap child1 to last row
    int idOnLastRow = rowIndexToNodeId[dm.getSize()-1];
    if(idOnLastRow!=ID(child1)){
      int rowChild1 = nodeIdToRowIndex[ID(child1)];
      //PRINT(nodeIdToRowIndex[ID(child1)]);PRINT(dm.getSize());
      dm.swapRowToLast(nodeIdToRowIndex[ID(child1)]);
      // Keep both lookup tables in step with the swap.
      nodeIdToRowIndex[idOnLastRow] = rowChild1;
      rowIndexToNodeId[rowChild1] = idOnLastRow;
      rowIndexToNodeId[dm.getSize()-1] = ID(child1);
      nodeIdToRowIndex[ID(child1)] = dm.getSize()-1;
    }

    //update distances to parent
    double w1 = (1.0*numNodesBelow[ID(child1)])/numNodesBelow[ID(parent)];
    double w2 = (1.0*numNodesBelow[ID(child2)])/numNodesBelow[ID(parent)];
    double distChild1Child2 = w1*EDGE(child1)+w2*EDGE(child2);

    //put parent on the row of child 2
    nodeIdToRowIndex[ID(parent)] = nodeIdToRowIndex[ID(child2)];
    rowIndexToNodeId[nodeIdToRowIndex[ID(parent)]] = ID(parent);
    int parentRow = nodeIdToRowIndex[ID(parent)];
    int child1Row = nodeIdToRowIndex[ID(child1)];
    int child2Row = nodeIdToRowIndex[ID(child2)];
    // The last row (child 1) is about to be removed, so it is not updated.
    for(size_t row=0 ; row<dm.getSize()-1 ; row++){
      dm.setDistance(parentRow,row, w1*dm.getDistance(child1Row,row)+
                     w2*dm.getDistance(child2Row,row)-
                     distChild1Child2);
    }
    dm.setDistance(nodeIdToRowIndex[ID(parent)],nodeIdToRowIndex[ID(parent)],0.0);

    //remove last row
    dm.removeLastRow();
  }

  //Take care of root
  SequenceTree::Node *root = nodes[nodes.size()-1];
  if(!root->isRoot() || root->getNumChildren()!=3){
    USER_ERROR("Have to be unrooted binary tree. Root has " << root->getNumChildren() << " children");
  }
  // cout << dm << endl;
  SequenceTree::Node *c1 = root->getRightMostChild();
  SequenceTree::Node *c2 = c1->getLeftSibling();
  SequenceTree::Node *c3 = c2->getLeftSibling();
  int c1row = nodeIdToRowIndex[ID(c1)];
  int c2row = nodeIdToRowIndex[ID(c2)];
  int c3row = nodeIdToRowIndex[ID(c3)];
  // Three-point formula: each edge is half of (sum of the two distances that
  // involve the child) minus (the distance between the other two children).
  EDGE(c1) = 0.5*(dm.getDistance(c1row,c2row) + dm.getDistance(c1row,c3row)-dm.getDistance(c2row,c3row));
  EDGE(c2) = 0.5*(dm.getDistance(c2row,c1row) + dm.getDistance(c2row,c3row)-dm.getDistance(c1row,c3row));
  EDGE(c3) = 0.5*(dm.getDistance(c3row,c2row) + dm.getDistance(c3row,c1row)-dm.getDistance(c2row,c1row));
  EDGE(root) = 0;

  //COMPUTE THE L2 SCORE
  StrDblMatrix treeM(tree.getNumLeafs());
  tree.tree2distanceMatrix(treeM);
  return computeL2(treeM, orig_dm);
}
//--------------------------------------------------
// THE SEQUENCE BASED NJ ALGO
//
// Builds a tree for `seqs` by neighbour joining in which, after every join,
// the new parent gets a sequence of its own (create_weighted_parsimonious of
// the two children) and its distances to the remaining rows are recomputed
// from that sequence with computeK2PDistance().
//
// seqs       - the input sequences; the leaves of the result refer back to them.
// resultTree - receives the finished tree.
//
// NOTE(review): seqs[0] is read unconditionally, so `seqs` is assumed to be
// non-empty -- confirm that callers guarantee this.
void computeSequenceBasedNJ(std::vector<Sequence> &seqs, SequenceTree &resultTree){

  // 1. Create a star tree with the leafs being the input sequences in b128 format.
  DNA_b128_String defaultString(seqs[0].seq.size());
  b128Tree tree(defaultString);
  b128Tree::Node *root = tree.getRoot();
  // Remembers which input sequence each leaf node was created from.
  obj_ptr2obj_ptr_hashmap node2seqs((size_t)(seqs.size()*1.5));
  b128Tree::NodeVector leafs;
  for ( size_t i = 0 ; i < seqs.size() ; i++ ){
    b128Tree::Node *leaf = root->addChild(defaultString);
    node2seqs[leaf] = &(seqs[i]);
    leafs.push_back(leaf);
    (leaf->data).append(seqs[i].seq);
  }

  // 2. Compute the DistanceMatrix for the seqs.
  b128Matrix dm(seqs.size());
  for ( size_t i = 0 ; i < leafs.size() ; i++ ){
    dm.setIdentifier(i,leafs[i]);
  }
  fillMatrix(dm);
  //std::cout << dm << std::endl;

  // 3. COMPUTE ROW SUMS
  // NOTE(review): an array sized by a runtime value is a compiler extension, not standard C++.
  double rowSums[dm.getSize()];
  for ( size_t row = 0 ; row < dm.getSize() ; row++ ){
    double sum = 0;
    size_t i =0;
    for ( ; i < dm.getSize() ; i++ )
      sum += dm.getDistance(row,i);
    rowSums[row] = sum;
  }

  //----------------
  // 4. NJ ITERATION
  // Join pairs until only three rows are left.
  while ( dm.getSize() > 3 ) {

    //FIND MIN PAIR
    //find the minimal value
    double minVal = FLT_MAX;
    size_t mini = 1000000;
    size_t minj = 1000000;
    for ( size_t i = 0 ; i < dm.getSize() ; i++ ){
      for ( size_t j = i+1 ; j < dm.getSize() ; j++ ){
        double newVal = (dm.getSize() - 2.0)*dm.getDistance(i,j) - rowSums[i] - rowSums[j];
        //std::cout << newVal << " , ";
        if ( newVal < minVal ){
          minVal = newVal;
          mini = i;
          minj = j;
        }
      }
    }
    // std::cout << std::endl;
    //PRINT(minVal);

    //make sure that minj is the last row in the matrix
    if ( mini == dm.getSize() -1 ){
      // mini already sits on the last row: just swap the roles of the two indices.
      mini = minj;
    }
    else {
      dm.swapRowToLast(minj);
      // Keep the row sums in step with the row swap.
      double tmp = rowSums[dm.getSize()-1];
      rowSums[dm.getSize()-1] = rowSums[minj];
      rowSums[minj] = tmp;
    }
    minj = dm.getSize()-1;

    //CLUSTER THE LEAFS
    DNA_b128_String &child1str = dm.getIdentifier(mini)->data;
    DNA_b128_String &child2str = dm.getIdentifier(minj)->data;
    b128Tree::Node *parent = dm.getIdentifier(mini)->getTree()->detachFromParentAndAddAsSiblings(dm.getIdentifier(mini),dm.getIdentifier(minj), defaultString);
    // The parent takes over row mini.
    dm.setIdentifier(mini, parent);

    //COMPUTE PARSIMONY AND SET IN PARENT
    DNA_b128_String &parentstr = parent->data;
    DNA_b128_String::create_weighted_parsimonious(parentstr,child1str,child2str);

    //COMPUTE DISTANCES FROM PARENT TO ALL OTHER NODES
    //PRINT(mini);PRINT(minj);
    for ( size_t i = 0 ; i < dm.getSize()-1 ; i++ ){//skip last row
      // Old contribution of the two joined rows to row i's sum.
      double dist2iandj = dm.getDistance(mini,i) + dm.getDistance(minj,i);
      DNA_b128_String &leafstr = dm.getIdentifier(i)->data;
      double dist = computeK2PDistance(parentstr,leafstr);
      // regular nj update function:
      //double regnj = dist2iandj * 0.5;
      //double studier = (dist2iandj-dm.getDistance(mini,minj))*0.5;
      //PRINT(dist); PRINT(regnj);PRINT(dist2iandj);PRINT(dist-regnj);PRINT(dist-studier);
      //PRINT(dist - dm.getDistance(mini,i) );PRINT(dist - dm.getDistance(minj,i) );
      dm.setDistance(mini,i, dist);
      //update rowsums
      rowSums[i] = rowSums[i] - dist2iandj + dm.getDistance(mini,i);
      //PRINT(rowSums[i]);
    }
    dm.setDistance(mini,mini,0);

    //remove the last row of the matrix
    dm.removeLastRow();

    //recompute the row sum for the parent
    double sum = 0;
    for ( size_t i = 0 ; i < dm.getSize() ; i++ )
      sum += dm.getDistance(mini,i);
    rowSums[mini] = sum;
  }
  //END ITERATION
  //----------------------------------

  //CONVERT THE TREE TO A SEQUENCE TREE
  tree.recalcNodeStructure();
  // tree.drawTree(std::cout);
  b128Tree::NodeVector leafnodes;
  tree.addLeafs(leafnodes);
  Sequence_double dummy;
  dummy.dbl = -1;
  resultTree = SequenceTree(tree,dummy);
  SequenceTree::NodeVector seqnodes;
  resultTree.addLeafs(seqnodes);
  // Copy the original input sequence into each leaf of the result.
  // NOTE(review): this pairs seqnodes[i] with leafnodes[i], i.e. it assumes that
  // addLeafs() lists the leaves of both trees in the same order -- confirm.
  for ( size_t i = 0 ; i < seqnodes.size() ; i++ ){
    seqnodes[i]->data.s = *((Sequence *) node2seqs[leafnodes[i]]);
  }
  //resultTree.drawTree(std::cout);
}
/// \brief Accumulate the branch lengths of sampled tree \a T onto the branches and nodes of MC tree \a Q.
///
/// \param Q              The MC tree.
/// \param T              The sampled tree.
/// \param node_masks     One bit mask per node of \a Q, applied to partitions before comparing them.
/// \param partitions2    The partitions that branches of \a T are tested against.
/// \param branch_lengths Accumulator indexed like \a partitions2.
/// \param node_lengths   Accumulator indexed by node of \a Q.
/// \return false (without touching the accumulators) if some partition of \a Q listed in
///         Q.branch_order is not implied by \a T; true otherwise.
///
/// The length of each branch of \a T is divided evenly among the entries of
/// \a partitions2 it implies and among the nodes it is found to lie inside.
bool update_lengths(const MC_tree& Q,const SequenceTree& T,
                    const vector<dynamic_bitset<> >& node_masks,
                    const vector<Partition>& partitions2,
                    valarray<double>& branch_lengths,
                    valarray<double>& node_lengths)
{
  // check that this tree is consistent with the MC Tree Q
  for(int i=0;i<Q.branch_order.size();i++)
  {
    int b = Q.branch_order[i];
    if (not implies(T,Q.partitions[b]))
      return false;
  }

  // map branches of the input tree
  for(int b=0;b<T.n_branches();b++)
  {
    Partition P = partition_from_branch(T,b);

    // Find mc tree branches implied by branch b
    // NOTE(review): partitions2 is indexed by position i here, whereas Q.partitions
    // above is indexed by Q.branch_order[i] -- presumably partitions2 is already
    // stored in branch_order order; confirm against the caller.
    vector<int> branches;
    for(int i=0;i<Q.branch_order.size();i++)
    {
      if (implies(P,partitions2[i]))
        branches.push_back(i);
    }

    // Find out which nodes this branch is inside of
    vector<int> nodes;
    for(int n=0;n<Q.n_nodes();n++)
    {
      if (Q.degree(n) == 0) continue;

      // Restrict the branch's partition to this node's mask; it must still
      // have taxa on both sides to be relevant for the node.
      Partition P2 = P;
      P2.group1 = P2.group1 & node_masks[n];
      P2.group2 = P2.group2 & node_masks[n];
      if (P2.group1.none() or P2.group2.none())
        continue;

      // Every directed branch of Q mapped to node n must compare "less than"
      // P2 in one of its two orientations.
      // (This inner `b` shadows the outer loop variable.)
      bool ok = true;
      for(int b=0;b<2*Q.n_branches() and ok;b++)
      {
        if (Q.mapping[b] != n) continue;

        Partition P3 = Q.partitions[b];
        P3.group1 = P3.group1 & node_masks[n];
        P3.group2 = P3.group2 & node_masks[n];

        if (partition_less_than(P3,P2) or partition_less_than(P3,P2.reverse()))
          ;
        else
          ok=false;
      }

      if (ok) {
        nodes.push_back(n);
        assert(Q.degree(n) > 3);
      }
    }

    /*
    cerr<<"Branch: "<<P<<endl;
    cerr<<" - maps to "<<branches.size()<<" branches."<<endl;
    if (nodes.size()) {
      cerr<<" - inside node(s):"<<endl;
      cerr<<P<<endl;
      for(int i=0;i<nodes.size();i++)
        cerr<<" "<<Q.partitions[Q.branch_to_node(nodes[i])]<<endl;
    }
    */

    // This branch should be inside only one node, if any.
    assert(nodes.size() < 2);

    // This branch should not be inside a node, if it implies an mc tree branch.
    if (branches.size()) assert(not nodes.size());

    // But this branch should be inside a node if it doesn't imply a branch.
    assert(branches.size() + nodes.size() > 0);

    const double L = T.branch(b).length();

    // Divide the branch length evenly between the branches it implies.
    for(int i=0;i<branches.size();i++)
      branch_lengths[branches[i]] += L/branches.size();

    // Divide the branch length evenly between the nodes (node?) it implies.
    for(int i=0;i<nodes.size();i++)
      node_lengths[nodes[i]] += L/nodes.size();
  }

  return true;
}
void print_stats(std::ostream& o,std::ostream& trees, const Parameters& P, bool print_alignment) { efloat_t Pr_prior = P.prior(); efloat_t Pr_likelihood = P.likelihood(); efloat_t Pr = Pr_prior * Pr_likelihood; o<<" prior = "<<Pr_prior; for(int i=0;i<P.n_data_partitions();i++) o<<" prior_A"<<i+1<<" = "<<P[i].prior_alignment(); o<<" likelihood = "<<Pr_likelihood<<" logp = "<<Pr <<" beta = " <<P.beta[0] <<"\n"; if (print_alignment) for(int i=0;i<P.n_data_partitions();i++) o<<standardize(*P[i].A, *P.T)<<"\n"; { SequenceTree T = *P.T; valarray<double> weights(P.n_data_partitions()); for(int i=0;i<weights.size();i++) weights[i] = max(sequence_lengths(*P[i].A, P.T->n_leaves())); weights /= weights.sum(); double mu_scale=0; for(int i=0;i<P.n_data_partitions();i++) mu_scale += P[i].branch_mean()*weights[i]; for(int b=0;b<T.n_branches();b++) T.branch(b).set_length(mu_scale*T.branch(b).length()); trees<<T<<std::endl; trees.flush(); } o<<"\n"; show_parameters(o,P); o.flush(); for(int m=0;m<P.n_smodels();m++) { o<<"smodel"<<m+1<<endl; for(int i=0;i<P.SModel(m).n_base_models();i++) o<<" rate"<<i<<" = "<<P.SModel(m).base_model(i).rate(); o<<"\n\n"; for(int i=0;i<P.SModel(m).n_base_models();i++) o<<" fraction"<<i<<" = "<<P.SModel(m).distribution()[i]; o<<"\n\n"; o<<"frequencies = "<<"\n"; show_frequencies(o,P.SModel(m)); o<<"\n\n"; o.flush(); } // The leaf sequences should NOT change during alignment #ifndef NDEBUG for(int i=0;i<P.n_data_partitions();i++) check_alignment(*P[i].A, *P.T,"print_stats:end"); #endif }
// mark nodes in T according to what node of Q they map to vector<int> get_nodes_map(const SequenceTree& Q,const SequenceTree& T, const vector<int>& branches_map) { assert(branches_map.size() == Q.n_branches() * 2); vector<int> nodes_map(T.n_nodes(),-1); // map nodes from T -> Q that are in both trees for(int b=0;b<Q.n_branches();b++) { int Q_source = Q.branch(b).source(); int Q_target = Q.branch(b).target(); int b2 = branches_map[b]; int T_source = T.directed_branch(b2).source(); int T_target = T.directed_branch(b2).target(); if (nodes_map[T_source] == -1) nodes_map[T_source] = Q_source; else assert(nodes_map[T_source] == Q_source); if (nodes_map[T_target] == -1) nodes_map[T_target] = Q_target; else assert(nodes_map[T_target] == Q_target); } // map the rest of the nodes from T -> Q for(int i=Q.n_leaves();i<Q.n_nodes();i++) { unsigned D = Q[i].degree(); if (D <= 3) continue; // get a branch of Q pointing into the node const_branchview outside = *(Q[i].branches_in()); // get a branch of T pointing into the node outside = T.directed_branch(branches_map[outside.name()]); list<const_branchview> branches; typedef list<const_branchview>::iterator list_iterator; append(outside.branches_after(),branches); for(list_iterator b = branches.begin() ; b != branches.end();) { int node = (*b).target(); if (nodes_map[node] == -1) nodes_map[node] = i; if (nodes_map[node] == i) { append((*b).branches_after(),branches); b++; } else { list_iterator prev = b; b++; branches.erase(prev); } } assert(branches.size() == D-3); } for(int i=0;i<nodes_map.size();i++) assert(nodes_map[i] != -1); return nodes_map; }
/// \brief tree-mean-lengths: put mean (or, with "var", the A.m2 values) branch lengths from
/// sampled trees onto a given topology.
///
/// The topology comes from load_T(); the tree samples are read from standard
/// input. Options read here: "skip", "max", "sub-sample", "prune", "simple",
/// "safe", "var", "no-node-lengths" and "show-node-lengths".
///
/// \return 0 on normal completion; the process exits with status 1 if an
///         exception reaches the outer handler.
int main(int argc,char* argv[])
{
  try {
    //----------- Parse command line ----------//
    variables_map args = parse_cmd_line(argc,argv);

    int skip = args["skip"].as<int>();

    // -1 is the value used when "max" was not given.
    int max = -1;
    if (args.count("max"))
      max = args["max"].as<int>();

    int subsample = args["sub-sample"].as<int>();

    // Comma-separated list of names, handed on to scan_trees().
    vector<string> prune;
    if (args.count("prune")) {
      string p = args["prune"].as<string>();
      prune = split(p,',');
    }

    //----------- Read the topology -----------//
    SequenceTree Q = load_T(args);
    standardize(Q);
    const int B = Q.n_branches();
    const int N = Q.n_nodes();
    // Remember the branch lengths that came with the topology; they are passed
    // to write_with_bootstrap_fraction() below.
    vector<double> bf(B);
    for(int b=0;b<bf.size();b++)
      bf[b] = Q.branch(b).length();

    //-------- Read in the tree samples --------//
    if ( args.count("simple") ) {
      accum_branch_lengths_ignore_topology A(Q);
      scan_trees(std::cin,skip,subsample,max,prune,A);
      for(int b=0;b<B;b++)
        Q.branch(b).set_length(A.m1[b]);
      cout<<Q.write_with_bootstrap_fraction(bf,true)<<endl;
      exit(0);
    }

    accum_branch_lengths_same_topology A(Q);

    try {
      scan_trees(std::cin,skip,subsample,max,prune,A);
    }
    catch (std::exception& e)
    {
      // With "safe", still emit the bare topology before reporting the error.
      if (args.count("safe"))
        cout<<Q.write(false)<<endl;
      std::cerr<<"tree-mean-lengths: Error! "<<e.what()<<endl;
      // NOTE(review): this exits with status 0 even though an error was reported --
      // confirm that this is intended.
      exit(0);
    }

    if (log_verbose) std::cerr<<A.n_matches<<" out of "<<A.n_samples<<" trees matched the topology";
    if (log_verbose) std::cerr<<" ("<<double(A.n_matches)/A.n_samples*100<<"%)"<<std::endl;

    //------- Merge lengths and topology -------//
    if (args.count("var")) {
      for(int b=0;b<B;b++)
        Q.branch(b).set_length(A.m2[b]);
      cout<<Q;
      exit(0);
    }
    else {
      for(int b=0;b<B;b++)
        Q.branch(b).set_length(A.m1[b]);

      // Unless node lengths are suppressed or shown separately, spread each
      // node's length evenly over the branches leaving that node.
      if (not args.count("no-node-lengths") and
          not args.count("show-node-lengths")) {
        for(int n=0;n<N;n++) {
          int degree = Q[n].neighbors().size();
          for(out_edges_iterator b = Q[n].branches_out();b;b++)
            (*b).set_length((*b).length() + A.n1[n]/degree);
        }
      }

      //------- Print Tree and branch lengths -------//
      cout<<Q.write_with_bootstrap_fraction(bf,true)<<endl;

      //------------ Print node lengths -------------//
      if (args.count("show-node-lengths"))
        for(int n=0;n<Q.n_nodes();n++) {
          if (A.n1[n] > 0) {
            cout<<"node "<<A.n1[n]<<endl;
            // Identify the node by the partition of a branch pointing into it.
            int b = (*Q[n].branches_in()).name();
            cout<<partition_from_branch(Q,b)<<endl;
          }
        }
    }
  }
  catch (std::exception& e) {
    std::cerr<<"tree-mean-lengths: Error! "<<e.what()<<endl;
    exit(1);
  }
  return 0;
}
//FIXME T.seq(i) -> T.leafname(i) //FIXME T.get_sequences -> T.leafnames() void delete_node(SequenceTree& T,const std::string& name) { int index = find_index(T.get_sequences(),name); nodeview n = T.prune_subtree(T.branch(index).reverse()); T.remove_node_from_branch(n); }
/// \brief Root a tree by inserting a new node on branch \a b.
///
/// \param T The tree; taken by value, so the caller's tree is left untouched.
/// \param b The branch that receives the new node.
/// \return A RootedSequenceTree built from the modified copy and the new node.
RootedSequenceTree add_root(SequenceTree T,int b)
{
  const int root_node = T.create_node_on_branch(b);

  return RootedSequenceTree(T,root_node);
}