Exemple #1
0
/// Run the peeling step for one set of leaf letters and report the smallest entry in the root's row.
///
/// The root node is the target of directed branch 0 of \a T.
/// \a n_muts is passed on to peel_n_mutations() by non-const reference and read back afterwards.
B n_mutations(const alphabet& a, const vector<int>& letters, const SequenceTree& T,const ublas::matrix<B>& cost,
	      ublas::matrix<B>& n_muts, const vector<const_branchview>& branches)
{
  peel_n_mutations(a,letters,T,cost,n_muts,branches);

  const int root_node = T.directed_branch(0).target();

  return row_min(n_muts,root_node);
}
Exemple #2
0
/// Does any branch in T imply the partition p?
///
/// \param T The tree whose branch partitions are examined.
/// \param p The partition to look for.
/// \return true as soon as one branch partition of \a T implies \a p, false if none does.
bool implies(const SequenceTree& T,const Partition& p) {
  for(int b=0;b<T.n_branches();b++) {
    dynamic_bitset<> bp = branch_partition(T,b);

    // The first match settles the answer; no flag variable is needed.
    if (implies(bp,p)) return true;
  }
  return false;
}
Exemple #3
0
/// Collect partition_from_branch(T,b) for every branch index b in [0, T.n_branches()).
///
/// \param T The tree to read branches from.
/// \return The partitions, in branch-index order.
vector<Partition> all_partitions_from_tree(const SequenceTree& T) 
{
  vector<Partition> partitions;

  // The final size is known up front, so allocate once instead of growing repeatedly.
  partitions.reserve(T.n_branches());

  for(int b=0;b<T.n_branches();b++)
    partitions.push_back(partition_from_branch(T,b));

  return partitions;
}
Exemple #4
0
/// Standardize \a T using the mapping computed between T.get_sequences() and \a names.
///
/// A bad_mapping<string> raised while doing so is re-thrown as a fresh bad_mapping
/// carrying the same `missing` / `from` fields and a message chosen by `from`.
void remap_T_indices(SequenceTree& T,const vector<string>& names)
{
  try {
    vector<int> leaf_order = compute_mapping(T.get_sequences(),names);

    T.standardize(leaf_order);
  }
  catch(const bad_mapping<string>& failure)
  {
    bad_mapping<string> relabeled(failure.missing,failure.from);

    if (failure.from != 0)
      relabeled<<"Sequence '"<<relabeled.missing<<"' not found in the tree.";
    else
      relabeled<<"Couldn't find leaf sequence \""<<relabeled.missing<<"\" in names.";

    throw relabeled;
  }
}
Exemple #5
0
int which_branch(const SequenceTree& T, const Partition& p) 
{
  for(int b=0; b<2*T.n_branches(); b++) {
    dynamic_bitset<> bp = branch_partition(T,b);
    if( directed_implies(bp,p) )
      return b;
  }
  return -1;
}
Exemple #6
0
void write_partitions(std::ostream& o,const vector<Partition>& partitions)
{
  vector<Partition> full;
  vector<Partition> sub;
  for(int i=0;i<partitions.size();i++)
    if (partitions[i].full())
      full.push_back(partitions[i]);
    else
      sub.push_back(partitions[i]);

  if (full.size()) {
    SequenceTree consensus = get_mf_tree(partitions[0].names,full);
    o<<consensus.write(false)<<endl;
  }

  for(int i=0;i<sub.size();i++)
    o<<sub[i]<<endl;
}
Exemple #7
0
/// \brief  Remap the leaf indices of tree \a T to match the alignment \a A: check the result
///
/// \param A The alignment.
/// \param T The tree.
/// \param internal_sequences Should the resulting alignment have sequences for internal nodes on the tree?
///
/// Both arguments may be modified: sub-branches are removed from \a T and its leaves are
/// re-indexed, and \a A may be replaced by add_internal(A,T) or chop_internal(A).
///
/// Throws myexception when \a internal_sequences is requested for a tree that is not
/// Cayley (per is_Cayley), or when the number of sequences in \a A does not fit the
/// number of leaves / nodes of \a T (see the individual messages below).
///
void link(alignment& A,SequenceTree& T,bool internal_sequences) 
{
  check_names_unique(A);

  // Later, might we WANT sub-branches???
  if (has_sub_branches(T))
    remove_sub_branches(T);

  if (internal_sequences and not is_Cayley(T)) {
    // Debug-only sanity check; the throw below happens either way.
    assert(has_polytomy(T));
    throw myexception()<<"Cannot link a multifurcating tree to an alignment with internal sequences.";
  }

  //------ IF sequences < leaf nodes THEN complain ---------//
  if (A.n_sequences() < T.n_leaves())
    throw myexception()<<"Tree has "<<T.n_leaves()<<" leaves but Alignment only has "
		       <<A.n_sequences()<<" sequences.";

  //----- IF sequences = leaf nodes THEN maybe add internal sequences.
  else if (A.n_sequences() == T.n_leaves()) {
    if (internal_sequences)
      A = add_internal(A,T);
  }
  //----- IF sequences > leaf nodes THEN maybe complain -------//
  else {
    if (not internal_sequences) {
      // Try dropping the internal sequences; accept only if exactly the leaves remain.
      alignment A2 = chop_internal(A);
      if (A2.n_sequences() == T.n_leaves()) {
	A = A2;
      }
      else
	throw myexception()<<"More alignment sequences than leaf nodes!";
    } 
    // With internal sequences requested, the count must equal the number of tree nodes.
    else if (A.n_sequences() > T.n_nodes())
      throw myexception()<<"More alignment sequences than tree nodes!";
    else if (A.n_sequences() < T.n_nodes())
      throw myexception()<<"Fewer alignment sequences than tree nodes!";
  }
  
  //---------- double-check that we have the right number of sequences ---------//
  // These asserts are compiled out when NDEBUG is defined.
  if (internal_sequences)
    assert(A.n_sequences() == T.n_nodes());
  else
    assert(A.n_sequences() == T.n_leaves());

  //----- Remap leaf indices for T onto A's leaf sequence indices -----//
  remap_T_indices(T,A);

  if (internal_sequences)
    connect_leaf_characters(A,T);

  //---- Check to see that internal nodes satisfy constraints ----//
  check_alignment(A,T,internal_sequences);
}
Exemple #8
0
/// \brief Renumber the leaves of \a T1 so that its labels follow the ordering used by \a T2.
///
/// \param T1 The leaf-labelled tree that gets re-indexed.
/// \param T2 The leaf-labelled tree that supplies the target ordering.
///
/// Throws myexception if the two trees have different numbers of leaves, and re-throws
/// a bad_mapping<string> with a message phrased in terms of the "first"/"second" tree.
///
void remap_T_indices(SequenceTree& T1,const SequenceTree& T2)
{
  const bool same_leaf_count = (T1.n_leaves() == T2.n_leaves());
  if (not same_leaf_count)
    throw myexception()<<"Trees do not correspond: different numbers of leaves.";

  try {
    remap_T_indices(T1,T2.get_sequences());
  }
  catch(const bad_mapping<string>& failure)
  {
    bad_mapping<string> relabeled(failure.missing,failure.from);

    if (failure.from != 0)
      relabeled<<"Couldn't find leaf sequence \""<<relabeled.missing<<"\" in first tree.";
    else
      relabeled<<"Couldn't find leaf sequence \""<<relabeled.missing<<"\" in second tree.";

    throw relabeled;
  }
}
Exemple #9
0
/// Build a tree from a star tree on \a names by inducing each partition's group1 in turn.
///
/// If inducing a partition throws anything, a myexception naming that partition and the
/// tree built so far is thrown instead. All branch lengths of the result are set to 1.0.
SequenceTree get_mf_tree(const std::vector<std::string>& names,
			 const std::vector<Partition>& partitions) 
{
  SequenceTree T = star_tree(names);

  // Kept outside the try block so the handler can report the offending partition.
  int current = 0;
  try {
    while (current < partitions.size()) {
      T.induce_partition(partitions[current].group1);
      current++;
    }
  }
  catch(...) {
    throw myexception()<<"Partition ("<<partitions[current]<<") conflicts with tree "<<T;
  }

  for(int b=0;b<T.n_branches();b++)
    T.branch(b).set_length(1.0);

  return T;
}
Exemple #10
0
/// Randomly pick a branch of \a T1 as the regraft target for the subtree behind branch \a b1_.
///
/// Branches with both endpoints marked as subtree nodes are excluded. Branches with
/// exactly one marked endpoint get weight 0.5; all others get weight 1.0. The pick is
/// made with choose() over those weights.
///
/// \return The index of the chosen branch.
int choose_SPR_target(SequenceTree& T1, int b1_) 
{
  const_branchview pruned = T1.directed_branch(b1_);

  //----- Mark the nodes belonging to the subtree, plus the attachment node ------//
  dynamic_bitset<> in_subtree = T1.partition(pruned.reverse());
  in_subtree[pruned.target()] = true;

  vector<int> candidates;
  vector<double> weights;

  for(int k=0;k<T1.n_branches();k++) 
  {
    const_branchview edge = T1.branch(k);

    const bool target_marked = in_subtree[edge.target()];
    const bool source_marked = in_subtree[edge.source()];

    // skip branch if it is contained in the subtree
    if (target_marked and source_marked)
      continue;

    // down-weight branch if it touches the subtree
    const double weight = (target_marked or source_marked) ? 0.5 : 1.0;

    candidates.push_back(k);
    weights.push_back(weight);
  }

  try {
    return candidates[ choose(weights) ];
  }
  catch (choose_exception<efloat_t>& c)
  {
    c.prepend(__PRETTY_FUNCTION__);
    throw c;
  }
}
Exemple #11
0
/// \brief Renumber the leaves of \a T so that its leaf labels follow the ordering of \a names.
///
/// \param T The leaf-labelled tree.
/// \param names The ordered leaf labels.
///
/// A bad_mapping<string> raised while computing the mapping is copied, cleared,
/// given a new message chosen by its `from` field, and thrown.
///
void remap_T_leaf_indices(SequenceTree& T,const vector<string>& names)
{
  assert(names.size() == T.n_leaves());

  try {
    vector<int> leaf_order = compute_mapping(T.get_leaf_labels(), names);

    T.standardize(leaf_order);
  }
  catch(const bad_mapping<string>& failure)
  {
    bad_mapping<string> relabeled = failure;
    relabeled.clear();

    if (relabeled.from != 0)
      relabeled<<"Sequence '"<<relabeled.missing<<"' not found in the tree.";
    else
      relabeled<<"Couldn't find leaf sequence \""<<relabeled.missing<<"\" in names.";

    throw relabeled;
  }
}
Exemple #12
0
/// Construct a data partition named \a n from alignment \a a, tree \a t and substitution model \a SM.
///
/// The per-branch containers are sized with t.n_branches(), the sequence-length cache
/// with a.n_sequences(), and every entry of the per-branch alignment-count cache is
/// invalidated in the body.
data_partition::data_partition(const string& n, const alignment& a,const SequenceTree& t,
			       const substitution::MultiModel& SM)
  :SModel_(SM),
   partition_name(n),
   cached_alignment_prior_for_branch(t.n_branches()),
   // one entry per branch, each initialised from a 5x5 integer matrix
   cached_alignment_counts_for_branch(t.n_branches(),ublas::matrix<int>(5,5)),
   cached_sequence_lengths(a.n_sequences()),
   branch_mean_(1.0),
   smodel_full_tree(true),
   A(a),
   T(t),
   MC(t,SM),
   LC(t,SModel()),
   branch_HMMs(t.n_branches()),
   branch_HMM_type(t.n_branches(),0),
   beta(2, 1.0)
{
  // Mark every cached counts entry as invalid.
  for(int b=0;b<cached_alignment_counts_for_branch.size();b++)
    cached_alignment_counts_for_branch[b].invalidate();
}
Exemple #13
0
/// \brief Return the sequences of \a A reordered to follow the node labels of \a T.
///
/// \param A A multiple sequence alignment; it must have either T.n_leaves() or T.n_nodes() sequences.
/// \param T The labelled tree. It is not modified.
///
/// The result is reorder_sequences(A,mapping), where mapping is computed between the
/// tree labels and sequence_names(A).
///
/// Throws myexception if the sequence count matches neither the leaf count nor the node
/// count of \a T, or if a label that is needed is empty. A bad_mapping<string> from
/// compute_mapping is re-thrown with a message chosen by its `from` field.
///
alignment remap_A_indices(alignment& A, const SequenceTree& T)
{
  vector<string> labels = T.get_labels();

  // Leaf-only alignment: keep only the first T.n_leaves() labels.
  if (A.n_sequences() == T.n_leaves())
  {
    labels.resize(T.n_leaves());

  }
  else if (A.n_sequences() != T.n_nodes())
    throw myexception()<<"Cannot map alignment onto tree:\n  Alignment has "<<A.n_sequences()<<" sequences.\n  Tree has "<<T.n_leaves()<<" leaves and "<<T.n_nodes()<<" nodes.";
      

  // Every label that will be matched must be non-empty.
  for(int i=0;i<labels.size();i++)
    if (labels[i] == "")
    {
      if (i<T.n_leaves())
	throw myexception()<<"Tree has empty label for a leaf node: not allowed!";
      else
	throw myexception()<<"Alignment has internal node information, but tree has empty label for an internal node: not allowed!";
    }

  assert(A.n_sequences() == labels.size());

  //----- Compute the order of A's sequences that matches the tree labels -----//
  try {
    vector<int> mapping = compute_mapping(labels, sequence_names(A));

    return reorder_sequences(A,mapping);
  }
  catch(const bad_mapping<string>& b)
  {
    // Copy the exception, clear it, and attach an alignment-specific message.
    bad_mapping<string> b2 = b;
    b2.clear();
    if (b.from == 0)
      b2<<"Couldn't find sequence \""<<b2.missing<<"\" in alignment.";
    else
      b2<<"Alignment sequence '"<<b2.missing<<"' not found in the tree.";
    throw b2;
  }
}
Exemple #14
0
/// \brief Renumber the leaves of \a T so that its labels follow the sequence order of \a A.
///
/// \param T The leaf-labelled tree.
/// \param A A multiple sequence alignment.
///
/// Only the names returned by sequence_names(A,T.n_leaves()) are used. Throws myexception
/// if \a A has fewer sequences than \a T has leaves; a bad_mapping<string> is re-thrown
/// with a message phrased in terms of the alignment.
///
void remap_T_indices(SequenceTree& T,const alignment& A)
{
  if (T.n_leaves() > A.n_sequences())
    throw myexception()<<"Tree has "<<T.n_leaves()<<" leaves, but alignment has only "<<A.n_sequences()<<" sequences.";

  try {
    vector<string> leaf_names = sequence_names(A,T.n_leaves());

    remap_T_indices(T,leaf_names);
  }
  catch(const bad_mapping<string>& failure)
  {
    bad_mapping<string> relabeled(failure.missing,failure.from);

    if (failure.from != 0)
      relabeled<<"Alignment sequence '"<<relabeled.missing<<"' not found in the tree.";
    else
      relabeled<<"Couldn't find leaf sequence \""<<relabeled.missing<<"\" in alignment.";

    throw relabeled;
  }
}
Exemple #15
0
bool update_lengths(const SequenceTree& Q,const SequenceTree& T,
		    valarray<double>& branch_lengths, 
		    valarray<double>& branch_lengths_squared, 
		    valarray<double>& node_lengths)
{
  // map branches from Q -> T
  vector<int> branches_map = extends_map(T,Q);
  if (not branches_map.size())
    return false;

  // incorporate lengths of branches that map to Q
  for(int b=0;b<Q.n_branches();b++)
  {
    int b2 = branches_map[b];
    double L = T.directed_branch(b2).length();
    branch_lengths[b] += L;
    branch_lengths_squared[b] += L*L;
  }

  // map nodes from T -> Q
  vector<int> nodes_map = get_nodes_map(Q,T,branches_map);

  // incorprate lengths of branches that map to nodes in Q
  for(int i=T.n_leafbranches();i<T.n_branches();i++) 
  {
    const_branchview b = T.branch(i);
    int n1 = nodes_map[b.source()];
    int n2 = nodes_map[b.target()];

    if (n1 == n2)
      node_lengths[n1] += T.branch(i).length();
  }

  return true;
}
Exemple #16
0
/// Sum, over every column of \a A, the value returned by the per-column n_mutations<B>() overload.
///
/// The root is the target of directed branch 0 of \a T. The branch list toward the root and
/// the n_muts work matrix (T.n_nodes() x alphabet size) are built once and reused for every
/// column. The sum is accumulated in a double and converted to B on return.
B n_mutations(const alignment& A, const SequenceTree& T,const ublas::matrix<B>& cost)
{
  const alphabet& a = A.get_alphabet();

  const int root = T.directed_branch(0).target();
  vector<const_branchview> branches = branches_toward_node(T,root);

  ublas::matrix<B> n_muts(T.n_nodes(), a.size());
  vector<int> column_letters(T.n_leaves());

  double total = 0;
  for(int column=0;column<A.length();column++) {
    // gather the leaf letters of this column
    for(int leaf=0;leaf<T.n_leaves();leaf++)
      column_letters[leaf] = A(column,leaf);

    const double column_cost = n_mutations<B>(a,column_letters,T,cost,n_muts,branches);
    total += column_cost;
  }

  return total;
}
Exemple #17
0
/// Load the alignments named by the "align" argument and build a random tree \a T for them.
///
/// \param args               Parsed arguments; reads "align", and optionally
///                           "t-constraint", "randomize-alignment" and "internal".
/// \param alignments         Output: the loaded (and possibly modified) alignments.
/// \param T                  Output: the random tree, after linking to the alignments.
/// \param internal_sequences One flag per alignment, passed to link() and check_alignment().
///
/// NOTE(review): alignments[0] is read unconditionally, so at least one alignment
/// is assumed to have been loaded.
void load_As_and_random_T(const variables_map& args,vector<alignment>& alignments,SequenceTree& T,const vector<bool>& internal_sequences)
{
  //align - filenames
  vector<string> filenames = args["align"].as<vector<string> >();

  // load the alignments
  alignments = load_alignments(filenames,load_alphabets(args));

  //------------- Load random tree ------------------------//
  // Start from a star tree on the first alignment's names, or from the constraint tree if given.
  SequenceTree TC = star_tree(sequence_names(alignments[0]));
  if (args.count("t-constraint"))
    TC = load_constraint_tree(args["t-constraint"].as<string>(),sequence_names(alignments[0]));

  T = TC;
  RandomTree(T,1.0);

  //-------------- Link --------------------------------//
  link(alignments,T,internal_sequences);

  //---------------process----------------//
  for(int i=0;i<alignments.size();i++) 
  {
    
    //---------------- Randomize alignment? -----------------//
    if (args.count("randomize-alignment"))
      alignments[i] = randomize(alignments[i],T.n_leaves());
  
    //------------------ Analyze 'internal'------------------//
    // Overwrite every cell of the sequences with index >= T.n_leaves() with alphabet::not_gap.
    if ((args.count("internal") and args["internal"].as<string>() == "+")
	or args.count("randomize-alignment"))
      for(int column=0;column< alignments[i].length();column++) {
	for(int j=T.n_leaves();j<alignments[i].n_sequences();j++) 
	  alignments[i](column,j) = alphabet::not_gap;
      }

    //---- Check that internal sequence satisfy constraints ----//
    check_alignment(alignments[i],T,internal_sequences[i]);
  }
}
Exemple #18
0
/// Reorder internal sequences of \a A to correspond to standardized node names for \a T
///
/// \param A The alignment to reorder.
/// \param T The tree; it is not modified (a private copy is standardized).
/// \return \a A unchanged if it has exactly T.n_leaves() sequences, otherwise
///         reorder_sequences() applied with the inverse of the standardizing mapping.
alignment standardize(const alignment& A, const SequenceTree& T) 
{
  // if we don't have any internal node sequences, then we are already standardized
  if (A.n_sequences() == T.n_leaves())
    return A;

  // Copy the tree only when it is needed, i.e. after the early return above.
  SequenceTree T2 = T;

  // standardize NON-LEAF node and branch names in T
  vector<int> mapping = T2.standardize();
  vector<int> new_order = invert(mapping);

  return reorder_sequences(A,new_order);
}
Exemple #19
0
/// Print a random tree over the given leaf names to standard output.
///
/// Arguments read through Arguments:
///  - "seed"  (optional): seed for myrand_init(); without it myrand_init() is called with no argument.
///  - "names" (required): ':'-separated leaf names.
///  - "mean"  (optional): passed to RandomTree() as branch_mean; 0.1 when absent.
///
/// Returns 1 with a message on standard error if "names" is missing.
int main(int argc,char* argv[]) { 
  Arguments args;
  args.read(argc,argv);

  if (args.set("seed"))
    myrand_init(convertTo<unsigned long>(args["seed"]));
  else
    myrand_init();

  // "names" is user input, so validate it at run time: an assert() would be
  // compiled out when NDEBUG is defined and the missing argument would go unnoticed.
  if (not args.set("names")) {
    std::cerr<<"Error: required argument 'names' was not given."<<std::endl;
    return 1;
  }
  vector<string> names = split(args["names"],':');

  double branch_mean = 0.1;
  if (args.set("mean"))
    branch_mean = convertTo<double>(args["mean"]);

  SequenceTree T = RandomTree(names,branch_mean);

  std::cout<<T.write()<<std::endl;
}
Exemple #20
0
/// Choose one letter per tree node by peeling toward the root and then walking back out.
///
/// \param a       The alphabet; a.size() is the number of candidate letters per node.
/// \param letters The observed letters, passed on to peel_n_mutations().
/// \param T       The tree. The root is the target of directed branch 0.
/// \param cost    Letter-to-letter cost matrix, indexed as cost(child_letter,parent_letter).
/// \return A vector with one letter index per node of \a T.
vector<int> get_parsimony_letters(const alphabet& a, const vector<int>& letters, const SequenceTree& T,
				  const ublas::matrix<int>& cost)
{
  int root = T.directed_branch(0).target();
  ublas::matrix<int> n_muts(T.n_nodes(),a.size());

  peel_n_mutations(a,letters,T,cost,n_muts, branches_toward_node(T,root) );

  // get an order list of branches point away from the root;
  vector<const_branchview> branches = branches_from_node(T,root);
  std::reverse(branches.begin(),branches.end());
  
  // Allocate space to store the letter for each node
  vector<int> node_letters(T.n_nodes(),-1);

  const unsigned A = a.size();
  vector<double> temp(A);

  // choose the cheapest letter at the root.
  // This must be the INDEX of the smallest entry in the root's row: row_min() is used
  // elsewhere in this file as the minimum cost value, which is not a letter index.
  for(unsigned l=0;l<A;l++)
    temp[l] = n_muts(root,l);
  node_letters[root] = argmin(temp);

  for(int i=0;i<branches.size();i++) 
  {
    int s = branches[i].source();
    int t = branches[i].target();

    // the letter at the source end must already have been chosen
    int k = node_letters[s];
    assert(k != -1);

    // cost of each letter l at the target, given letter k at the source
    for(unsigned l=0;l<A;l++)
      temp[l] = n_muts(t,l)+cost(l,k);

    node_letters[t] = argmin(temp);
  }

  return node_letters;
}
Exemple #21
0
/// Return the tree described by \a t, passed through standardized() and written with write(false).
string topology(const string& t)
{
    SequenceTree parsed = standardized(t);

    return parsed.write(false);
}
Exemple #22
0
/// Construct a Parameters object without an indel model.
///
/// \param A             One alignment per data partition.
/// \param t             The tree.
/// \param SMs           The substitution models.
/// \param s_mapping     For each data partition, the index into \a SMs of its substitution model.
/// \param scale_mapping Stored as scale_for_partition; not otherwise used in this constructor.
///
/// Throws myexception if \a s_mapping does not have one entry per alignment, or if an
/// entry is >= the number of substitution models.
/// NOTE(review): negative entries of \a s_mapping are not rejected here.
Parameters::Parameters(const vector<alignment>& A, const SequenceTree& t,
		       const vector<polymorphic_cow_ptr<substitution::MultiModel> >& SMs,
		       const vector<int>& s_mapping,
		       const vector<int>& scale_mapping)
  :SModels(SMs),
   smodel_for_partition(s_mapping),
   scale_for_partition(scale_mapping),
   branch_prior_type(0),
   smodel_full_tree(true),
   T(t),
   TC(star_tree(t.get_sequences())),
   branch_HMM_type(t.n_branches(),0),
   beta(2, 1.0),
   updown(-1),
   features(0)
{
  constants.push_back(-1);

  // n_scales is not declared in this block (presumably a member or constant defined elsewhere).
  for(int i=0;i<n_scales;i++)
    add_super_parameter("mu"+convertToString(i+1),1.0);

  // check that smodel mapping has correct size.
  if (smodel_for_partition.size() != A.size())
    throw myexception()<<"There are "<<A.size()
		       <<" data partitions, but you mapped smodels onto "
		       <<smodel_for_partition.size();

  // register the substitution models as sub-models
  for(int i=0;i<SModels.size();i++) {
    string name = "S" + convertToString(i+1);
    add_submodel(name, *SModels[i]);
  }

  // NO indel model (in this constructor)

  // check that we only mapping existing smodels to data partitions
  for(int i=0;i<smodel_for_partition.size();i++) {
    int m = smodel_for_partition[i];
    if (m >= SModels.size())
      throw myexception()<<"You can't use smodel "<<m+1<<" for data partition "<<i+1
			 <<" because there are only "<<SModels.size()<<" smodels.";
  }

  // load values from sub-models (smodels/imodel)
  read();

  // don't constrain any branch lengths
  for(int b=0;b<TC->n_branches();b++)
    TC->branch(b).set_length(-1);

  // create data partitions and register as sub-models
  for(int i=0;i<A.size();i++) 
  {
    // compute name for data-partition
    string name = string("part") + convertToString(i+1);

    // get reference to smodel for data-partition
    const substitution::MultiModel& SM = SModel(smodel_for_partition[i]);

    // create data partition
    data_partitions.push_back(cow_ptr<data_partition>(data_partition(name,A[i],*T,SM)));

    // register data partition as sub-model
    add_submodel(name,*data_partitions[i]);
  }
}
// Assign edge lengths to `tree` from the distances in `orig_dm` and return
// computeL2() between the resulting tree distance matrix and `orig_dm`.
//
// Works on a private copy of the matrix. Nodes are visited in the order produced by
// recalcNodeIdsPostfixOrderAndAddInOrder(); for every internal non-root node the
// two children get their EDGE values, the parent takes over child2's matrix row,
// and child1's row is moved to the end and removed. The last node is handled as
// the root and must have exactly three children.
//
// Calls USER_ERROR when a matrix identifier is not a leaf name of the tree, when
// an internal node has more than two children, when a non-finite sum appears, or
// when the last node is not a root with three children.
//
// NOTE(review): the arrays sized by nodes.size() below are variable-length
// arrays, which are a compiler extension rather than standard C++.
// NOTE(review): nodes.size()-1 is computed on an unsigned type, so an empty node
// list is assumed not to occur.
double
computeLeastSquaresEdgeLengths(const StrDblMatrix &orig_dm,  SequenceTree &tree){
  

  StrDblMatrix dm(orig_dm);
  const int numOriginalLeafs = dm.getSize();
  SequenceTree::NodeVector nodes;
  tree.recalcNodeIdsPostfixOrderAndAddInOrder(nodes);
  // Two-way lookup between node ids and rows of the working matrix.
  size_t nodeIdToRowIndex[nodes.size()];
  size_t rowIndexToNodeId[nodes.size()];
  str2int_hashmap name2Id((int)(nodes.size()*1.7));

  // Record the id of every leaf under its name.
  for(size_t i=0 ; i<nodes.size() ; i++)
    if(nodes[i]->isLeaf()){
      //PRINT(NAME(nodes[i]));PRINT(ID(nodes[i]));
      name2Id[NAME(nodes[i])] = ID(nodes[i]);
    }

  // Tie each matrix row to the leaf carrying the same identifier.
  for(size_t row=0 ; row<dm.getSize() ; row++){
    str2int_hashmap::iterator f = name2Id.find(dm.getIdentifier(row));
    if(f==name2Id.end())
      USER_ERROR("name doesn't exist in tree: " << dm.getIdentifier(row));
    
    nodeIdToRowIndex[(*f).second] = row;
    rowIndexToNodeId[row] = (*f).second;
  }
  
  //the number of leafs below each node
  int numNodesBelow[nodes.size()];
  for(size_t i=0;i<nodes.size();i++)
    numNodesBelow[i]=1;

  //--------------------------------
  //BOTTOM UP TRAVERSAL IN TREE
  // The last entry of `nodes` is skipped here; it is treated as the root below.
  for(size_t i=0;i<nodes.size()-1;i++){
    if(nodes[i]->isLeaf())
      continue;

    //get the children and do the UNJ calculation to get the edge lengths
    SequenceTree::Node *parent = nodes[i];
    SequenceTree::Node *child1 = parent->getRightMostChild();
    SequenceTree::Node *child2 = child1->getLeftSibling();
    if(child2->getLeftSibling()!=NULL ){
      USER_ERROR("Have to be unrooted binary tree. Parent has " << parent->getNumChildren() << " children");
    }
    numNodesBelow[ID(parent)] = numNodesBelow[ID(child1)] + numNodesBelow[ID(child2)];
    //SEPARATOR();PRINT(NAME(child1));PRINT(NAME(child2));

    
    // Weighted sum, over all other rows, of d(child1,row) - d(child2,row).
    double sum = 0;
    for(size_t row=0;row<dm.getSize();row++){
      if(row==nodeIdToRowIndex[ID(child1)] || 
	 row==nodeIdToRowIndex[ID(child2)] )
	continue;
      sum += numNodesBelow[rowIndexToNodeId[row]]*(dm.getDistance(nodeIdToRowIndex[ID(child1)],row)-
						   dm.getDistance(nodeIdToRowIndex[ID(child2)],row));				
    }

    if(!isfinite(sum)){
      USER_ERROR("Distance Matrix contains a non finite number: " << sum);
    }

    // Half the child-to-child distance, plus/minus the scaled sum.
    EDGE(child1) = 0.5*dm.getDistance(nodeIdToRowIndex[ID(child1)],
				      nodeIdToRowIndex[ID(child2)])
      + 1.0/(2*(numOriginalLeafs-numNodesBelow[ID(parent)]))*sum;
    EDGE(child2) = 0.5*dm.getDistance(nodeIdToRowIndex[ID(child1)],
				      nodeIdToRowIndex[ID(child2)])
      - 1.0/(2*(numOriginalLeafs-numNodesBelow[ID(parent)]))*sum;
    // PRINT(dm.getDistance(nodeIdToRowIndex[ID(child1)],nodeIdToRowIndex[ID(child2)]));
    //     PRINT((numOriginalLeafs-numNodesBelow[ID(parent)]));
    //     PRINT(sum);PRINT(EDGE(child1));PRINT( EDGE(child2));
    //PRINT(1/(2*(numOriginalLeafs-numNodesBelow[ID(parent)]))*sum);
    
    //swap child1 to last row 
    int idOnLastRow = rowIndexToNodeId[dm.getSize()-1];
    if(idOnLastRow!=ID(child1)){
      int rowChild1 = nodeIdToRowIndex[ID(child1)];
      //PRINT(nodeIdToRowIndex[ID(child1)]);PRINT(dm.getSize());
      dm.swapRowToLast(nodeIdToRowIndex[ID(child1)]);
      // keep both lookup tables consistent with the swap
      nodeIdToRowIndex[idOnLastRow] = rowChild1;
      rowIndexToNodeId[rowChild1] = idOnLastRow;
      rowIndexToNodeId[dm.getSize()-1] = ID(child1);
      nodeIdToRowIndex[ID(child1)] = dm.getSize()-1;
    }
    //update distances to parent
    double w1 = (1.0*numNodesBelow[ID(child1)])/numNodesBelow[ID(parent)];
    double w2 = (1.0*numNodesBelow[ID(child2)])/numNodesBelow[ID(parent)];
    double distChild1Child2 = w1*EDGE(child1)+w2*EDGE(child2);
  
    //put parent on the row of child 2
    nodeIdToRowIndex[ID(parent)] = nodeIdToRowIndex[ID(child2)];
    rowIndexToNodeId[nodeIdToRowIndex[ID(parent)]] = ID(parent);
    int parentRow = nodeIdToRowIndex[ID(parent)]; 
    int child1Row = nodeIdToRowIndex[ID(child1)];
    int child2Row = nodeIdToRowIndex[ID(child2)];

    // The last row (child1) is excluded; it is removed right after this loop.
    for(size_t row=0 ; row<dm.getSize()-1 ; row++){
      dm.setDistance(parentRow,row,
		     w1*dm.getDistance(child1Row,row)+
		     w2*dm.getDistance(child2Row,row)-
		     distChild1Child2);
    }    
    
    dm.setDistance(nodeIdToRowIndex[ID(parent)],nodeIdToRowIndex[ID(parent)],0.0);
    //remove last row
    dm.removeLastRow();
  }

  //Take care of root
  SequenceTree::Node *root = nodes[nodes.size()-1];
  if(!root->isRoot() || root->getNumChildren()!=3){
    USER_ERROR("Have to be unrooted binary tree. Root has " << root->getNumChildren() << " children");
  }
  
  //  cout << dm << endl;
  SequenceTree::Node *c1 = root->getRightMostChild();
  SequenceTree::Node *c2 = c1->getLeftSibling();
  SequenceTree::Node *c3 = c2->getLeftSibling();
  
  int c1row = nodeIdToRowIndex[ID(c1)];
  int c2row = nodeIdToRowIndex[ID(c2)];
  int c3row = nodeIdToRowIndex[ID(c3)];

  // Three-point formula: each root edge is half of (the two distances touching that
  // child minus the distance between the other two children).
  EDGE(c1) = 0.5*(dm.getDistance(c1row,c2row) + dm.getDistance(c1row,c3row)-dm.getDistance(c2row,c3row));
  EDGE(c2) = 0.5*(dm.getDistance(c2row,c1row) + dm.getDistance(c2row,c3row)-dm.getDistance(c1row,c3row));
  EDGE(c3) = 0.5*(dm.getDistance(c3row,c2row) + dm.getDistance(c3row,c1row)-dm.getDistance(c2row,c1row));
  EDGE(root) = 0;  

  //COMPUTE THE L2 SCORE
  StrDblMatrix treeM(tree.getNumLeafs());
  tree.tree2distanceMatrix(treeM);
  return computeL2(treeM, orig_dm);
}
//--------------------------------------------------
// THE SEQUENCE BASED NJ ALGO
//
//
// Build `resultTree` from `seqs` with a neighbour-joining loop in which the
// sequence of each new parent is produced by
// DNA_b128_String::create_weighted_parsimonious() and its distances to the other
// rows are recomputed with computeK2PDistance().
//
// NOTE(review): seqs[0] is read unconditionally, so `seqs` is assumed to be
// non-empty.
// NOTE(review): `rowSums` is a variable-length array, which is a compiler
// extension rather than standard C++.
void
computeSequenceBasedNJ(std::vector<Sequence> &seqs, SequenceTree &resultTree){

  // 1. Create a star tree with the leafs being the input sequences in b128 format.
  DNA_b128_String defaultString(seqs[0].seq.size());
  b128Tree tree(defaultString);
  b128Tree::Node *root = tree.getRoot();
  // Remembers which input Sequence each leaf node was created from.
  obj_ptr2obj_ptr_hashmap node2seqs((size_t)(seqs.size()*1.5));
  b128Tree::NodeVector leafs;
  for ( size_t i = 0 ; i < seqs.size()  ; i++ ){
    b128Tree::Node *leaf = root->addChild(defaultString);
    node2seqs[leaf] = &(seqs[i]);
    leafs.push_back(leaf);
    (leaf->data).append(seqs[i].seq);
  }
  
  // 2. Compute the DistanceMatrix for the seqs.
  b128Matrix dm(seqs.size());
  for ( size_t i = 0 ; i < leafs.size() ; i++ ){
    dm.setIdentifier(i,leafs[i]);
  }

  fillMatrix(dm);

  //std::cout << dm << std::endl;
  // 3. COMPUTE ROW SUMS
  double rowSums[dm.getSize()];
  for ( size_t row = 0 ; row < dm.getSize() ; row++ ){
    double sum = 0;
    size_t i =0;
    for ( ; i < dm.getSize() ; i++ )
      sum += dm.getDistance(row,i);

    rowSums[row] = sum;
  }

  //----------------
  // 4. 
  // NJ ITERATION
  //compute the row sums

  // Each pass joins one pair and removes one row, until three rows remain.
  while ( dm.getSize() > 3 ) {

    //FIND MIN PAIR
    //find the minimal value
    double minVal = FLT_MAX;
    size_t mini = 1000000;
    size_t minj = 1000000;
    for ( size_t i = 0 ; i < dm.getSize() ; i++ ){
      for ( size_t j = i+1 ; j < dm.getSize() ; j++ ){
        double newVal = (dm.getSize() - 2.0)*dm.getDistance(i,j) - rowSums[i] - rowSums[j];
        //std::cout << newVal << " , ";
        if ( newVal < minVal ){
          minVal = newVal;
          mini = i;
          minj = j;
        }
      }
    }
    //    std::cout << std::endl;
    //PRINT(minVal);
    //make sure that minj is the last row in the matrix
    // NOTE(review): the search above only yields pairs with mini < minj, so the
    // first branch looks unreachable whenever a pair was found - confirm.
    if ( mini == dm.getSize() -1 ){
      mini = minj;
    }
    else {
      dm.swapRowToLast(minj);
      // swap the row sums along with the rows
      double tmp = rowSums[dm.getSize()-1];
      rowSums[dm.getSize()-1] = rowSums[minj];
      rowSums[minj] = tmp;
    }
    minj = dm.getSize()-1;

    //CLUSTER THE LEAFS
    DNA_b128_String &child1str = dm.getIdentifier(mini)->data;
    DNA_b128_String &child2str = dm.getIdentifier(minj)->data;
    b128Tree::Node *parent = dm.getIdentifier(mini)->getTree()->detachFromParentAndAddAsSiblings(dm.getIdentifier(mini),dm.getIdentifier(minj), defaultString);
    // The new parent takes over row mini.
    dm.setIdentifier(mini, parent);

    //COMPUTE PARSIMONY AND SET IN PARENT
    DNA_b128_String &parentstr = parent->data;
    DNA_b128_String::create_weighted_parsimonious(parentstr,child1str,child2str);
    //COMPUTE DISTANCES FROM PARENT TO ALL OTHER NODES
    //PRINT(mini);PRINT(minj);
    for ( size_t i = 0 ; i < dm.getSize()-1 ; i++ ){//skip last row
      // Old contribution of the two joined rows to rowSums[i]; read before row mini is overwritten.
      double dist2iandj = dm.getDistance(mini,i) + dm.getDistance(minj,i);
      DNA_b128_String &leafstr = dm.getIdentifier(i)->data;
      double dist = computeK2PDistance(parentstr,leafstr);
      // regular nj update function:
      //double regnj = dist2iandj * 0.5; 
      //double studier = (dist2iandj-dm.getDistance(mini,minj))*0.5;
      //PRINT(dist); PRINT(regnj);PRINT(dist2iandj);PRINT(dist-regnj);PRINT(dist-studier);
      //PRINT(dist - dm.getDistance(mini,i) );PRINT(dist - dm.getDistance(minj,i) );
      
      dm.setDistance(mini,i, dist); 
    
      //update rowsums
      rowSums[i] = rowSums[i] - dist2iandj + dm.getDistance(mini,i);
      //PRINT(rowSums[i]);
    }

    dm.setDistance(mini,mini,0);
    
    //remove the last row of the matrix
    dm.removeLastRow();
    
    //recompute the row sum for the parent
    double sum = 0;
    for ( size_t i = 0 ; i < dm.getSize() ; i++ )
      sum += dm.getDistance(mini,i);
    rowSums[mini] = sum;

  }
  //END ITERATION
  //----------------------------------

  
  //CONVERT THE TREE TO A SEQUENCE TREE
  tree.recalcNodeStructure();
  //  tree.drawTree(std::cout);
  b128Tree::NodeVector leafnodes;
  tree.addLeafs(leafnodes);

  Sequence_double dummy;
  dummy.dbl = -1;
  resultTree = SequenceTree(tree,dummy);

  SequenceTree::NodeVector seqnodes;
  resultTree.addLeafs(seqnodes);
  
  // Copy the original Sequence into each leaf of the result.
  // NOTE(review): this pairs seqnodes[i] with leafnodes[i], i.e. it assumes addLeafs()
  // lists the leaves of both trees in the same order - confirm.
  for ( size_t i = 0 ; i < seqnodes.size() ; i++ ){
    seqnodes[i]->data.s = *((Sequence *) node2seqs[leafnodes[i]]);
  }
  //resultTree.drawTree(std::cout);
}
Exemple #25
0
/// Accumulate the branch lengths of \a T onto the branches and nodes of the MC tree \a Q.
///
/// \return false if some partition Q.partitions[Q.branch_order[i]] is not implied by \a T
///         (nothing is accumulated in that case); true otherwise.
///
/// For each branch of \a T, its length is split evenly among the entries of
/// \a partitions2 that it implies (added to \a branch_lengths) and among the nodes of
/// \a Q it is found to lie inside (added to \a node_lengths).
///
/// NOTE(review): \a partitions2 is indexed by i in [0, Q.branch_order.size()), so it is
/// assumed to have at least that many entries.
bool update_lengths(const MC_tree& Q,const SequenceTree& T,
		    const vector<dynamic_bitset<> >& node_masks,
		    const vector<Partition>& partitions2,
		    valarray<double>& branch_lengths, 
		    valarray<double>& node_lengths)
{
  // check that this tree is consistent with the MC Tree Q
  for(int i=0;i<Q.branch_order.size();i++) 
  {
    int b = Q.branch_order[i];
    if (not implies(T,Q.partitions[b]))
      return false;
  }

  // map branches of the input tree
  for(int b=0;b<T.n_branches();b++)
  {
    Partition P = partition_from_branch(T,b);

    // Find mc tree branches implied by branch b
    vector<int> branches;
    for(int i=0;i<Q.branch_order.size();i++) 
    {
      if (implies(P,partitions2[i]))
	branches.push_back(i);
    }

    // Find out which nodes this branch is inside of 
    vector<int> nodes;
    for(int n=0;n<Q.n_nodes();n++) 
    {
      if (Q.degree(n) == 0) continue;

      // Restrict the branch partition to the taxa in this node's mask;
      // skip the node if one side becomes empty.
      Partition P2 = P;
      P2.group1 = P2.group1 & node_masks[n];
      P2.group2 = P2.group2 & node_masks[n];
      if (P2.group1.none() or P2.group2.none())
	continue;

      // Every directed branch of Q mapped to node n must pass the partition_less_than
      // test against P2 or its reverse (after the same masking).
      // Note: this inner `b` shadows the outer loop variable `b`.
      bool ok = true;
      for(int b=0;b<2*Q.n_branches() and ok;b++)
      {
	if (Q.mapping[b] != n)  continue;
	Partition P3 = Q.partitions[b];
	P3.group1 = P3.group1 & node_masks[n];
	P3.group2 = P3.group2 & node_masks[n];
	if (partition_less_than(P3,P2) or partition_less_than(P3,P2.reverse()))
	  ;
	else
	  ok=false;
      }

      if (ok) {
	nodes.push_back(n);
	assert(Q.degree(n) > 3);
      }
    }

    /*
    cerr<<"Branch: "<<P<<endl;
    cerr<<"  - maps to "<<branches.size()<<" branches."<<endl;
    if (nodes.size()) {
      cerr<<"  - inside node(s):"<<endl;
      cerr<<P<<endl;
      for(int i=0;i<nodes.size();i++)
	cerr<<"    "<<Q.partitions[Q.branch_to_node(nodes[i])]<<endl;
    }
    */

    // This branch should be inside only one node, if any.
    assert(nodes.size() < 2);

    // This branch should not be inside a node, if it implies an mc tree branch.
    if (branches.size()) assert(not nodes.size());

    // But this branch should be inside a node if it doesn't imply a branch.
    assert(branches.size() + nodes.size() > 0);

    const double L = T.branch(b).length();

    // Divide the branch length evenly between the branches it implies.
    for(int i=0;i<branches.size();i++)
      branch_lengths[branches[i]] += L/branches.size();

    // Divide the branch length evenly between the nodes (node?) it implies.
    for(int i=0;i<nodes.size();i++)
      node_lengths[nodes[i]] += L/nodes.size();
  }

  return true;
}
Exemple #26
0
/// Write a summary of the current state \a P to \a o and the current tree to \a trees.
///
/// \param o               Receives the prior, likelihood, beta, parameters and per-smodel details.
/// \param trees           Receives the tree, with branch lengths multiplied by a weighted
///                        average of the partitions' branch means.
/// \param P               The state to report.
/// \param print_alignment If true, the standardized alignment of every data partition is
///                        also written to \a o.
void print_stats(std::ostream& o,std::ostream& trees,
		 const Parameters& P,
		 bool print_alignment) 
{
  efloat_t Pr_prior = P.prior();
  efloat_t Pr_likelihood = P.likelihood();
  // The value reported as "logp" is the product prior * likelihood.
  efloat_t Pr = Pr_prior * Pr_likelihood;

  o<<"    prior = "<<Pr_prior;
  for(int i=0;i<P.n_data_partitions();i++) 
    o<<"   prior_A"<<i+1<<" = "<<P[i].prior_alignment();

  o<<"    likelihood = "<<Pr_likelihood<<"    logp = "<<Pr
   <<"    beta = " <<P.beta[0]  <<"\n";

  if (print_alignment)
    for(int i=0;i<P.n_data_partitions();i++)
      o<<standardize(*P[i].A, *P.T)<<"\n";
  
  {
    // Work on a copy so that the tree held by P keeps its branch lengths.
    SequenceTree T = *P.T;
    
    // Weight each partition by its largest sequence length, normalised to sum to 1.
    valarray<double> weights(P.n_data_partitions());
    for(int i=0;i<weights.size();i++)
      weights[i] = max(sequence_lengths(*P[i].A, P.T->n_leaves()));
    weights /= weights.sum();

    // Weighted average of the per-partition branch means.
    double mu_scale=0;
    for(int i=0;i<P.n_data_partitions();i++)
      mu_scale += P[i].branch_mean()*weights[i];

    for(int b=0;b<T.n_branches();b++)
      T.branch(b).set_length(mu_scale*T.branch(b).length());
    trees<<T<<std::endl;
    trees.flush();
  }
  
  o<<"\n";
  show_parameters(o,P);
  o.flush();

  // Per substitution model: rates, fractions and frequencies.
  for(int m=0;m<P.n_smodels();m++) {
    o<<"smodel"<<m+1<<endl;
    for(int i=0;i<P.SModel(m).n_base_models();i++)
      o<<"    rate"<<i<<" = "<<P.SModel(m).base_model(i).rate();
    o<<"\n\n";

    for(int i=0;i<P.SModel(m).n_base_models();i++)
      o<<"    fraction"<<i<<" = "<<P.SModel(m).distribution()[i];
    o<<"\n\n";

    o<<"frequencies = "<<"\n";
    show_frequencies(o,P.SModel(m));
    o<<"\n\n";

    o.flush();
  }

  // The leaf sequences should NOT change during alignment
#ifndef NDEBUG
  for(int i=0;i<P.n_data_partitions();i++)
    check_alignment(*P[i].A, *P.T,"print_stats:end");
#endif
}
Exemple #27
0
// Mark nodes in T according to what node of Q they map to.
//
// Returns a vector with one entry per node of T (size T.n_nodes()); entry k
// holds the index of the node of Q that node k of T was assigned to.
//
// branches_map must have 2*Q.n_branches() entries (asserted below).  Entry b
// is used as a directed-branch index into T: it names the branch of T that
// corresponds to branch b of Q.
//
// NOTE(review): this presumably expects every branch of Q to also be present
// in T (T refines Q) -- confirm against the callers.
//
// All consistency checks here are assert()s, so they vanish under NDEBUG.
vector<int> get_nodes_map(const SequenceTree& Q,const SequenceTree& T,
			  const vector<int>& branches_map)
{
  assert(branches_map.size() == Q.n_branches() * 2);

  // -1 means "not assigned to a node of Q yet"
  vector<int> nodes_map(T.n_nodes(),-1);

  // map nodes from T -> Q that are in both trees
  for(int b=0;b<Q.n_branches();b++)
  {
    // endpoints of branch b in Q
    int Q_source = Q.branch(b).source();
    int Q_target = Q.branch(b).target();

    // the corresponding directed branch in T
    int b2 = branches_map[b];

    int T_source = T.directed_branch(b2).source();
    int T_target = T.directed_branch(b2).target();

    // Record the endpoint correspondence; an endpoint reached again through
    // another branch must agree with what was recorded earlier.
    if (nodes_map[T_source] == -1)
      nodes_map[T_source] = Q_source;
    else
      assert(nodes_map[T_source] == Q_source);

    if (nodes_map[T_target] == -1)
      nodes_map[T_target] = Q_target;
    else
      assert(nodes_map[T_target] == Q_target);
  }

  // map the rest of the nodes from T -> Q
  for(int i=Q.n_leaves();i<Q.n_nodes();i++) 
  {
    // only nodes of Q with degree > 3 are handled by this pass
    unsigned D = Q[i].degree();
    if (D <= 3) continue;

    // get a branch of Q pointing into the node
    const_branchview outside = *(Q[i].branches_in());
    // get a branch of T pointing into the node
    outside = T.directed_branch(branches_map[outside.name()]);

    // Walk T starting from the branches after `outside`.
    // NOTE(review): append() presumably adds to the end of `branches`, so
    // branches added inside the loop are visited later by this same loop --
    // confirm.
    list<const_branchview> branches;
    typedef list<const_branchview>::iterator list_iterator;
    append(outside.branches_after(),branches);
    for(list_iterator b = branches.begin() ; b != branches.end();)
    {
      // a node of T that is still unassigned gets assigned to Q node i
      int node = (*b).target();
      if (nodes_map[node] == -1)
	nodes_map[node] = i;

      // Branch leads to a node assigned to i: keep it and continue the walk
      // through the branches after it.
      if (nodes_map[node] == i) {
	append((*b).branches_after(),branches);
	b++;
      }
      // Branch leads to a node assigned to some other node of Q: drop it and
      // do not walk past it.  The iterator is advanced before the erase so
      // that it stays usable.
      else {
	list_iterator prev = b;
	b++;
	branches.erase(prev);
      }
    }
    // the branches left in the list are those whose target was assigned to i
    assert(branches.size() == D-3);
  }

  // every node of T must have been assigned by now
  for(int i=0;i<nodes_map.size();i++)
    assert(nodes_map[i] != -1);

  return nodes_map;
}
Exemple #28
0
int main(int argc,char* argv[]) 
{ 
  try {
    //----------- Parse command line  ----------//
    variables_map args = parse_cmd_line(argc,argv);

    int skip = args["skip"].as<int>();

    int max = -1;
    if (args.count("max"))
      max = args["max"].as<int>();

    int subsample = args["sub-sample"].as<int>();

    vector<string> prune;
    if (args.count("prune")) {
      string p = args["prune"].as<string>();
      prune = split(p,',');
    }
      

    //----------- Read the topology -----------//
    SequenceTree Q = load_T(args);
    standardize(Q);
    const int B = Q.n_branches();
    const int N = Q.n_nodes();
    vector<double> bf(B);
    for(int b=0;b<bf.size();b++)
      bf[b] = Q.branch(b).length();

    //-------- Read in the tree samples --------//
    if ( args.count("simple") ) {
      accum_branch_lengths_ignore_topology A(Q);
      scan_trees(std::cin,skip,subsample,max,prune,A);
      for(int b=0;b<B;b++)
	Q.branch(b).set_length(A.m1[b]);
      cout<<Q.write_with_bootstrap_fraction(bf,true)<<endl;
      exit(0);
    }

    accum_branch_lengths_same_topology A(Q);

    try {
      scan_trees(std::cin,skip,subsample,max,prune,A);
    }
    catch (std::exception& e) 
    {
      if (args.count("safe"))
	cout<<Q.write(false)<<endl;
      std::cerr<<"tree-mean-lengths: Error! "<<e.what()<<endl;
      exit(0);
    }

    if (log_verbose) std::cerr<<A.n_matches<<" out of "<<A.n_samples<<" trees matched the topology";
    if (log_verbose) std::cerr<<" ("<<double(A.n_matches)/A.n_samples*100<<"%)"<<std::endl;

    //------- Merge lengths and topology -------//
    if (args.count("var")) {
      for(int b=0;b<B;b++)
	Q.branch(b).set_length(A.m2[b]);
      cout<<Q;
      exit(0);
    }
    else {
      for(int b=0;b<B;b++)
	Q.branch(b).set_length(A.m1[b]);

      if (not args.count("no-node-lengths") and 
	  not args.count("show-node-lengths")) {
	for(int n=0;n<N;n++) {
	  int degree = Q[n].neighbors().size();
	  for(out_edges_iterator b = Q[n].branches_out();b;b++)
	    (*b).set_length((*b).length() + A.n1[n]/degree);
	}
      }

      //------- Print Tree and branch lengths -------//
      cout<<Q.write_with_bootstrap_fraction(bf,true)<<endl;

      //------------ Print node lengths -------------//
      if (args.count("show-node-lengths"))
	for(int n=0;n<Q.n_nodes();n++) {
	  if (A.n1[n] > 0) {
	    cout<<"node "<<A.n1[n]<<endl;
	    int b = (*Q[n].branches_in()).name();
	    cout<<partition_from_branch(Q,b)<<endl;
	  }
	}

    }
  }
  catch (std::exception& e) {
    std::cerr<<"tree-mean-lengths: Error! "<<e.what()<<endl;
    exit(1);
  }
  return 0;
}
Exemple #29
0
//FIXME T.seq(i) -> T.leafname(i)
//FIXME T.get_sequences -> T.leafnames()

/// Remove from T the leaf whose sequence name is `name`.
///
/// The name is looked up in T.get_sequences(); the branch with that index is
/// reversed and the subtree behind it is pruned.  The node returned by the
/// prune is then removed from the branch it sits on.
///
/// NOTE(review): what find_index() yields for a name that is not present is
/// not checked here -- callers presumably guarantee the name exists; confirm.
void delete_node(SequenceTree& T,const std::string& name) 
{
  // index of the leaf (and of its branch) carrying this name
  int leaf = find_index(T.get_sequences(),name);

  // cut the leaf off; `attachment` is the node that prune_subtree() hands back
  nodeview attachment = T.prune_subtree(T.branch(leaf).reverse());

  T.remove_node_from_branch(attachment);
}
Exemple #30
0
/// Build a rooted tree from T by creating a new node on branch b and using
/// that node as the root.
///
/// T is taken by value, so the caller's tree is not modified.
RootedSequenceTree add_root(SequenceTree T,int b) 
{
  // The node must be created before T is handed to the constructor.
  int root_node = T.create_node_on_branch(b);

  return RootedSequenceTree(T,root_node);
}