Exemplo n.º 1
0
bool update_lengths(const SequenceTree& Q,const SequenceTree& T,
		    valarray<double>& branch_lengths, 
		    valarray<double>& branch_lengths_squared, 
		    valarray<double>& node_lengths)
{
  // map branches from Q -> T
  vector<int> branches_map = extends_map(T,Q);
  if (not branches_map.size())
    return false;

  // incorporate lengths of branches that map to Q
  for(int b=0;b<Q.n_branches();b++)
  {
    int b2 = branches_map[b];
    double L = T.directed_branch(b2).length();
    branch_lengths[b] += L;
    branch_lengths_squared[b] += L*L;
  }

  // map nodes from T -> Q
  vector<int> nodes_map = get_nodes_map(Q,T,branches_map);

  // incorprate lengths of branches that map to nodes in Q
  for(int i=T.n_leafbranches();i<T.n_branches();i++) 
  {
    const_branchview b = T.branch(i);
    int n1 = nodes_map[b.source()];
    int n2 = nodes_map[b.target()];

    if (n1 == n2)
      node_lengths[n1] += T.branch(i).length();
  }

  return true;
}
Exemplo n.º 2
0
void estimate_tree(const alignment& A,
		   SequenceTree& T,
		   substitution::MultiModel& smodel,
		   const vector<int>& parameters)
{
  //------- Estimate branch lengths -------------//
  log_branch_likelihood score2(A,smodel,T,parameters);
  
  // Initialize starting point
  optimize::Vector start(0.1, T.n_branches() + parameters.size());
  for(int b=0;b<T.n_branches();b++)
    start[b] = log(T.branch(b).length());
  for(int i=0;i<parameters.size();i++)
    start[i+T.n_branches()] = smodel.get_parameter_value_as<Double>( parameters[i] );

  //    optimize::Vector end = search_gradient(start,score);
  //    optimize::Vector end = search_basis(start,score);
  optimize::Vector end = search_gradient(start,score2,1e-3);

  for(int b=0;b<T.n_branches();b++)
    T.branch(b).set_length(exp(end[b]));

  for(int i=0;i<parameters.size();i++)
    smodel.set_parameter_value(parameters[i], end[i+T.n_branches()]);

  smodel.set_rate(1);
}
Exemplo n.º 3
0
/// Record one sample tree T.
///
/// For every branch of Q, look for a branch of T with the same bipartition
/// (regardless of the rest of T's topology).  When one is found, its length
/// and squared length are added to m1/m2 for that branch of Q and the match
/// counter is incremented.  n_samples is incremented once per call.
void accum_branch_lengths_ignore_topology::operator()(const SequenceTree& T)
{
  n_samples++;

  for(int q=0;q<Q.n_branches();q++)
  {
    // Normalize the bipartition so that bit 0 is set, making the two
    // orientations of a branch compare equal.
    // (This is recomputed on every call, which wastes CPU time.)
    dynamic_bitset<> wanted = branch_partition(Q,q);
    if (not wanted[0]) wanted.flip();

    // Linear scan of T for the first branch with the same normalized bipartition.
    int match = -1;
    for(int t=0;t<T.n_branches() and match == -1;t++)
    {
      dynamic_bitset<> candidate = branch_partition(T,t);
      if (not candidate[0]) candidate.flip();

      if (wanted == candidate)
        match = t;
    }

    // Branch q of Q does not occur in this sample.
    if (match == -1) continue;

    const double length = T.branch(match).length();
    m1[q] += length;
    m2[q] += length*length;
    n_matches[q]++;
  }
}
Exemplo n.º 4
0
 /// Start accumulating against the query tree T.
 ///
 /// Q is a copy of T; the per-branch accumulators (n_matches, m1, m2) get one
 /// zero-initialized entry per branch.  All sizes are taken from the parameter
 /// T rather than from the member Q, so the initializers do not depend on the
 /// order in which the members are declared in the class.
 accum_branch_lengths_ignore_topology(const SequenceTree& T)
   :
   n_samples(0),
   n_matches(0, T.n_branches()),
   Q(T),
   m1(0.0, T.n_branches()),
   m2(0.0, T.n_branches())
 {}
Exemplo n.º 5
0
/// Randomly pick the branch of T1 that the subtree attached by directed
/// branch b1_ should be moved to.
///
/// Branches lying entirely inside the marked node set (T1.partition of the
/// reversed branch, plus the branch's target node) are excluded.  Branches
/// touching that set at exactly one end get weight 0.5, all others weight 1.0;
/// the result is drawn with choose() from those weights.
int choose_SPR_target(SequenceTree& T1, int b1_)
{
    const_branchview pruned = T1.directed_branch(b1_);

    // Nodes belonging to the subtree being moved, including the attachment node.
    dynamic_bitset<> in_subtree = T1.partition(pruned.reverse());
    in_subtree[pruned.target()] = true;

    vector<int> candidates;
    vector<double> weights;

    for(int i=0; i<T1.n_branches(); i++)
    {
        const_branchview bi = T1.branch(i);

        const bool target_inside = in_subtree[bi.target()];
        const bool source_inside = in_subtree[bi.source()];

        // A branch contained in the subtree cannot be a target.
        if (target_inside and source_inside)
            continue;

        // Down-weight a branch that is one of the subtree's neighbors.
        const double weight = (target_inside or source_inside) ? 0.5 : 1.0;

        candidates.push_back(i);
        weights.push_back(weight);
    }

    return candidates[ choose(weights) ];
}
Exemplo n.º 6
0
int which_partition(const SequenceTree& T, const Partition& p) {
  for(int b=0; b<T.n_branches(); b++) {
    dynamic_bitset<> bp = branch_partition(T,b);
    if( implies(bp,p) )
      return b;
  }
  throw myexception(string("Partition not found in tree!"));
}
Exemplo n.º 7
0
/// Return the partition induced by each branch of T, in branch-index order
/// (entry b is partition_from_branch(T,b)).
vector<Partition> all_partitions_from_tree(const SequenceTree& T) 
{
  vector<Partition> partitions;

  // The final size is known up front, so allocate once instead of growing.
  partitions.reserve(T.n_branches());

  for(int b=0;b<T.n_branches();b++)
    partitions.push_back(partition_from_branch(T,b));

  return partitions;
}
Exemplo n.º 8
0
/// Does any branch in T imply the partition p?
///
/// Checks the bipartition of each branch of T in index order and returns true
/// at the first one that implies p; returns false if none does (including
/// when T has no branches).
bool implies(const SequenceTree& T,const Partition& p) {
  for(int b=0;b<T.n_branches();b++) {
    dynamic_bitset<> bp = branch_partition(T,b);

    if (implies(bp,p)) return true;
  }
  return false;
}
Exemplo n.º 9
0
int which_branch(const SequenceTree& T, const Partition& p) 
{
  for(int b=0; b<2*T.n_branches(); b++) {
    dynamic_bitset<> bp = branch_partition(T,b);
    if( directed_implies(bp,p) )
      return b;
  }
  return -1;
}
Exemplo n.º 10
0
/// Build a data partition named n from alignment a, tree t and substitution
/// model SM.  The per-branch containers are sized from t.n_branches(), the
/// sequence-length cache from a.n_sequences(), and every cached
/// alignment-count matrix is invalidated before the constructor returns.
data_partition::data_partition(const string& n, const alignment& a,const SequenceTree& t,
			       const substitution::MultiModel& SM)
  :SModel_(SM),
   partition_name(n),
   // one cache slot per branch
   cached_alignment_prior_for_branch(t.n_branches()),
   // one 5x5 integer count matrix per branch
   cached_alignment_counts_for_branch(t.n_branches(),ublas::matrix<int>(5,5)),
   // one cache slot per sequence
   cached_sequence_lengths(a.n_sequences()),
   branch_mean_(1.0),
   smodel_full_tree(true),
   A(a),
   T(t),
   MC(t,SM),
   LC(t,SModel()),
   branch_HMMs(t.n_branches()),
   branch_HMM_type(t.n_branches(),0),
   beta(2, 1.0)
{
  // The count matrices were filled with placeholder values above; mark each
  // one as not yet computed.
  const int n_cached = cached_alignment_counts_for_branch.size();
  for(int branch=0;branch<n_cached;branch++)
    cached_alignment_counts_for_branch[branch].invalidate();
}
Exemplo n.º 11
0
/// Build a tree on the given leaf names: start from star_tree(names), induce
/// each of the given partitions in order, then set every branch length to 1.0.
SequenceTree get_mf_tree(const std::vector<std::string>& names,
			 const std::vector<dynamic_bitset<> >& partitions) 
{
  SequenceTree tree = star_tree(names);

  // Refine the star tree one partition at a time.
  typedef std::vector<dynamic_bitset<> >::const_iterator partition_iterator;
  for(partition_iterator p = partitions.begin(); p != partitions.end(); ++p)
    tree.induce_partition(*p);

  // Give all branches unit length.
  for(int b=0;b<tree.n_branches();b++)
    tree.branch(b).set_length(1.0);

  return tree;
}
Exemplo n.º 12
0
/// Tree prior: topology & branch lengths (exponential)
///
/// The topology term divides by num_topologies(n_leaves) when the tree has
/// more than 3 leaves; the branch-length term multiplies in
/// exponential_pdf(length, branch_mean) for every branch.
efloat_t prior_exponential(const SequenceTree& T,double branch_mean) 
{
  efloat_t prior = 1;

  // Uniform prior over topologies (skipped for trees with at most 3 leaves).
  if (T.n_leaves() > 3)
    prior /= num_topologies(T.n_leaves());

  // Exponential prior on each branch length.
  for(int b=0;b<T.n_branches();b++)
  {
    const double length = T.branch(b).length();
    prior *= exponential_pdf(length, branch_mean);
  }

  return prior;
}
Exemplo n.º 13
0
/// Tree prior: topology & branch lengths (gamma)
///
/// The topology term divides by num_topologies(n_leaves) when the tree has
/// more than 3 leaves; the branch-length term multiplies in
/// gamma_pdf(length, 0.5, 2*branch_mean) for every branch.
efloat_t prior_gamma(const SequenceTree& T,double branch_mean) 
{
  efloat_t prior = 1;

  // Uniform prior over topologies (skipped for trees with at most 3 leaves).
  if (T.n_leaves() > 3)
    prior /= num_topologies(T.n_leaves());

  // Gamma prior on each branch length.
  // NOTE(review): presumably (a,b) are shape and scale, so that the mean a*b
  // equals branch_mean -- confirm against gamma_pdf's parameterization.
  const double gamma_a = 0.5;
  const double gamma_b = branch_mean*2;

  for(int b=0;b<T.n_branches();b++)
  {
    const double length = T.branch(b).length();
    prior *= gamma_pdf(length, gamma_a, gamma_b);
  }

  return prior;
}
Exemplo n.º 14
0
/// Build a tree on the given leaf names: start from star_tree(names), induce
/// group1 of each partition in order, then set every branch length to 1.0.
///
/// If inducing a partition throws anything, a myexception naming that
/// partition and the tree built so far is thrown instead.
SequenceTree get_mf_tree(const std::vector<std::string>& names,
			 const std::vector<Partition>& partitions) 
{
  SequenceTree tree = star_tree(names);

  // Declared outside the try block so the handler knows which partition failed.
  int current = 0;
  try {
    while (current < partitions.size()) {
      tree.induce_partition(partitions[current].group1);
      current++;
    }
  }
  catch(...) {
    throw myexception()<<"Partition ("<<partitions[current]<<") conflicts with tree "<<tree;
  }

  // Give all branches unit length.
  for(int b=0;b<tree.n_branches();b++)
    tree.branch(b).set_length(1.0);

  return tree;
}
Exemplo n.º 15
0
/// Entry point of the "alignment-find-conserved" tool (name taken from the
/// error message in the catch block).
///
/// Loads an alignment (plus a tree when the "tree" option is given) and an
/// optional list of sequence groups.  For every alignment column it works out,
/// per group, a "conserved at" letter and an "interesting" flag (filtered by
/// the require-all-different / require-change / require-conservation / split
/// options), writes diagnostics to stderr, and finally writes a per-column,
/// per-sequence highlight matrix to stdout (1.0 = interesting group,
/// 0.5 = conserved group, 0 otherwise).
///
/// Returns 0 on success; on any std::exception prints the message and exits 1.
///
/// NOTE(review): group_branches is only filled when "tree" is given, but it is
/// indexed for every group in the per-column loop, and T is also used there
/// unconditionally -- running with groups but without a tree looks unsupported.
int main(int argc,char* argv[])
{ 
  try {
    //---------- Parse command line  -------//
    variables_map args = parse_cmd_line(argc,argv);

    //----------- Load alignment and tree ---------//
    alignment A;
    SequenceTree T;
    if (args.count("tree"))
      load_A_and_T(args,A,T,false);
    else
      A = load_A(args,false);

    const alphabet& a = A.get_alphabet();
    
    //------- Load groups and find branches -------//
    vector<sequence_group> groups;
    if (args.count("groups")) 
      groups = load_groups(A,args["groups"].as<string>());

    // Echo each group and the names of its member sequences.
    for(int i=0;i<groups.size();i++) {
      cerr<<groups[i].name<<": ";
      for(int j=0;j<groups[i].taxa.size();j++)
	cerr<<A.seq(groups[i].taxa[j]).name<<" ";
      cerr<<endl;
    }

    // For each group, the directed branch of T whose leaf partition is exactly
    // the group's taxon set.
    vector<int> group_branches;
    if (args.count("tree"))
    {
      for(int i=0;i<groups.size();i++)
      {
	dynamic_bitset<> p(T.n_leaves());
	for(int j=0;j<groups[i].taxa.size();j++)
	  p[groups[i].taxa[j]] = true;

	// Search all 2*n_branches directed branches; stop at the first match.
	int found = -1;
	for(int b=0;b<2*T.n_branches() and found == -1;b++)
	  if (p == branch_partition(T,b))
	    found = b;
	if (found == -1)
	  throw myexception()<<"I can't find group "<<i+1<<" on the tree!";
	
	group_branches.push_back(found);
      }
    }

    vector<string> group_names;
    for(int i=0;i<groups.size();i++)
      group_names.push_back(groups[i].name);

    // Each "split" option string becomes a Partition over the group names.
    vector<Partition> splits;
    if (args.count("split")) 
    {
      vector<string> split = args["split"].as<vector<string> >();
      for(int i=0;i<split.size();i++) 
	splits.push_back(Partition(group_names,split[i]));
    }

    //-------------------------------------------//

    // Highlight matrix: one row per column of A, n_sequences+1 columns, all zero.
    Matrix C(A.length(),A.n_sequences()+1);
    for(int i=0;i<C.size1();i++)
      for(int j=0;j<C.size2();j++)
	C(i,j) = 0;

    // yes but, how much more conservation THAN EXPECTED do we see?

    for(int c=0;c<C.size1();c++) 
    {
      // Every group starts out interesting; the filters below can only clear flags.
      vector<bool> interesting(groups.size(), true);

      //-------------------------------------------------------//
      vector<int> leaf_letters( T.n_leaves() );
      for(int j=0;j<leaf_letters.size();j++)
	leaf_letters[j] = A(c,j);
      vector<vector<int> > node_letters = get_all_parsimony_letters(a,leaf_letters,T,unit_cost_matrix(a));

      // Parsimony letters at the target node of each group's branch.
      vector<vector<int> > initial_value(groups.size());
      for(int g=0;g<groups.size();g++) 
      {
	int n = T.directed_branch(group_branches[g]).target();
	initial_value[g] = node_letters[n];
      }

      //------------ find 'group conserved at' values ----------//
      // alphabet::gap is used as the "not conserved" marker.
      vector<int> value(groups.size(),alphabet::gap);

      for(int g=0;g<groups.size();g++) 
      {
	vector<int> temp;
	for(int i=0;i<groups[g].taxa.size();i++)
	  temp.push_back(A(c,groups[g].taxa[i]));

	int best = most_common(temp);
	int count = number_of(temp,best);
	
	// Conserved: at most one member differs, at least 3 agree, and they form a strict majority.
	if (count >= groups[g].taxa.size()-1 and count >=3 and count> groups[g].taxa.size()/2)
	  value[g] = best;
      }

      //-------- Determine whether column is interesting --------//
      if (args.count("require-all-different"))
      {
	if (args.count("split"))
	{
	  // A group passes if it belongs to at least one split whose two sides
	  // are all_different for every cross pair of groups.
	  vector<bool> in_changed_split(groups.size(),false);
	  for(int i=0;i<splits.size();i++) 
	  {
	    bool some_different = false;
	    for(int g1=0;g1<groups.size();g1++)
	      for(int g2=0;g2<groups.size();g2++)
		if (splits[i].group1[g1] and splits[i].group2[g2]) {
		  if (all_different(A,c,groups,g1,g2))
		    some_different = true;
		}
	  
	    bool no_same = true;
	    for(int g1=0;g1<groups.size();g1++)
	      for(int g2=0;g2<groups.size();g2++)
		if (splits[i].group1[g1] and splits[i].group2[g2])
		  if (not all_different(A,c,groups,g1,g2))
		    no_same = false;

	    if (some_different and no_same) 
	      for(int g=0;g<groups.size();g++)
		if (splits[i].group1[g] or splits[i].group2[g])
		  in_changed_split[g] = true;
	  }

	  for(int g=0;g<groups.size();g++)
	    interesting[g] = interesting[g] and in_changed_split[g];
	}

	else {
	  // Without splits: the column passes if any pair of groups is all_different.
	  bool different = false;
	  for(int g1=0;g1<groups.size();g1++)
	    for(int g2=0;g2<groups.size();g2++)
	      if (all_different(A,c,groups,g1,g2))
		different = true;

	  if (not different)
	    for(int g=0;g<groups.size();g++) 
	      interesting[g] = false;
	}
      }
    
      if (args.count("require-change"))
      {
	if (args.count("split"))
	{
	  // Same scheme as above, but comparing the groups' conserved values.
	  vector<bool> in_changed_split(groups.size(),false);
	  for(int i=0;i<splits.size();i++) 
	  {
	    bool some_different = false;
	    for(int g1=0;g1<groups.size();g1++)
	      for(int g2=0;g2<groups.size();g2++)
		if (splits[i].group1[g1] and splits[i].group2[g2]) {
		  if (value[g1] != value[g2])
		    // With ignore-rate-change, a difference only counts when both sides are conserved.
		    if (not args.count("ignore-rate-change") or 
			(value[g1] != alphabet::gap and value[g2] != alphabet::gap))
		      some_different = true;
		}
	  
	    bool no_same = true;
	    for(int g1=0;g1<groups.size();g1++)
	      for(int g2=0;g2<groups.size();g2++)
		if (splits[i].group1[g1] and splits[i].group2[g2])
		  if (value[g1] == value[g2])
		    no_same = false;

	    // This is Option #1 
	    //  - some conserved differences but no conserved similarities
	    // Also consider Option #2
	    //  - a change in both LETTER and CONSERVATION on at least one of the
	    //    two branches leading from the duplication.
	    if (some_different and no_same) 
	      for(int g=0;g<groups.size();g++)
		if (splits[i].group1[g] or splits[i].group2[g])
		  in_changed_split[g] = true;
	  }

	  for(int g=0;g<groups.size();g++)
	    interesting[g] = interesting[g] and in_changed_split[g];


	}

	else {
	  // Without splits the test is on the whole value vector, so it is the
	  // same for every group g.
	  if (args.count("ignore-rate-change")) 
	  {
	    for(int g=0;g<groups.size();g++) 
	      interesting[g] = interesting[g] and not all_same_or(value,alphabet::gap);
	  }
	  else 
	  {
	    for(int g=0;g<groups.size();g++)
		interesting[g] = interesting[g] and not all_same(value);
	  }
	}
      }
    
      // A group is only interesting if its conserved
      if (args.count("require-conservation"))
	for(int g=0;g<groups.size();g++)
	  interesting[g] = interesting[g] and (value[g] != alphabet::gap);

      // A group is only interesting if it is in one of the splits
      if (args.count("split"))
	for(int g=0;g<groups.size();g++) {
	  bool found = false;
	  for(int i=0;i<splits.size() and not found;i++) 
	    if (splits[i].group1[g] or splits[i].group2[g])
	      found = true;
	  interesting[g] = interesting[g] and found;
	}

      //------------ print 'group conserved at' values ----------//
      // Line format: 1-based column, conserved letter per group, then a 1/0 flag per group.
      cerr<<c+1<<"   ";
      for(int i=0;i<value.size();i++) 
	cerr<<a.lookup(value[i])<<" ";
      cerr<<"     ";

      for(int g=0;g<groups.size();g++) 
	if (interesting[g])
	  cerr<<"1 ";
	else
	  cerr<<"0 ";
      cerr<<endl;

      //------------- print parsimony initial values --------------//
      cerr<<"     ";
      for(int i=0;i<initial_value.size();i++) {
	for(int j=0;j<initial_value[i].size();j++)
	  cerr<<a.lookup(initial_value[i][j]);
	cerr<<" "; 
      }     
      cerr<<"    "<<n_mutations(a,leaf_letters,T,unit_cost_matrix(a))<<endl;
      cerr<<endl;



      //--------------------- Set highlighting ---------------------//
      // Interesting groups -> 1.0
      for(int g=0;g<groups.size();g++)
	if (interesting[g])
	  for(int i=0;i<groups[g].taxa.size();i++)
	    C(c,groups[g].taxa[i]) = 1.0;

      // Set conserved groups -> 0.5
      // (max() keeps the 1.0 already assigned to interesting groups)
      for(int g=0;g<groups.size();g++)
	if (value[g] != alphabet::gap)
	  for(int i=0;i<groups[g].taxa.size();i++)
	    C(c,groups[g].taxa[i]) = std::max(0.5, C(c,groups[g].taxa[i]));
    }


    // Output: a header line of sequence names, then one row of C per alignment column.
    cout<<join(sequence_names(A),' ')<<endl;
    for(int i=0;i<C.size1();i++) {
      vector<double> temp;
      for(int j=0;j<C.size2();j++)
	temp.push_back(C(i,j));
      cout<<join(temp,' ')<<endl;
    }
    
  }
  catch (std::exception& e) {
    std::cerr<<"alignment-find-conserved: Error! "<<e.what()<<endl;
    exit(1);
  }

  return 0;
}
Exemplo n.º 16
0
// mark nodes in T according to what node of Q they map to
//
// Q            - the (possibly multifurcating) reference tree
// T            - the tree whose nodes are being labelled
// branches_map - for each directed branch of Q (2 * Q.n_branches() entries,
//                as asserted below), the corresponding directed branch of T
//
// Returns a vector with one entry per node of T holding the index of the node
// of Q it maps to.  In debug builds, asserts that every node of T got mapped.
vector<int> get_nodes_map(const SequenceTree& Q,const SequenceTree& T,
			  const vector<int>& branches_map)
{
  assert(branches_map.size() == Q.n_branches() * 2);

  // -1 marks a node of T that has not been assigned yet.
  vector<int> nodes_map(T.n_nodes(),-1);

  // map nodes from T -> Q that are in both trees
  for(int b=0;b<Q.n_branches();b++)
  {
    int Q_source = Q.branch(b).source();
    int Q_target = Q.branch(b).target();

    int b2 = branches_map[b];

    int T_source = T.directed_branch(b2).source();
    int T_target = T.directed_branch(b2).target();

    // The end points of corresponding branches correspond; a node reached via
    // several branches must always get the same answer.
    if (nodes_map[T_source] == -1)
      nodes_map[T_source] = Q_source;
    else
      assert(nodes_map[T_source] == Q_source);

    if (nodes_map[T_target] == -1)
      nodes_map[T_target] = Q_target;
    else
      assert(nodes_map[T_target] == Q_target);
  }

  // map the rest of the nodes from T -> Q
  for(int i=Q.n_leaves();i<Q.n_nodes();i++) 
  {
    // Only nodes of Q with degree > 3 can absorb extra nodes of T.
    unsigned D = Q[i].degree();
    if (D <= 3) continue;

    // get a branch of Q pointing into the node
    const_branchview outside = *(Q[i].branches_in());
    // get a branch of T pointing into the node
    outside = T.directed_branch(branches_map[outside.name()]);

    // Work list of branches of T leading away from that entry branch.
    // NOTE(review): this relies on append() adding to the end of the list so
    // that branches added during the walk are visited too -- confirm against
    // append()'s definition.
    list<const_branchview> branches;
    typedef list<const_branchview>::iterator list_iterator;
    append(outside.branches_after(),branches);
    for(list_iterator b = branches.begin() ; b != branches.end();)
    {
      int node = (*b).target();
      // A still-unassigned node reached from here belongs to node i of Q.
      if (nodes_map[node] == -1)
	nodes_map[node] = i;

      if (nodes_map[node] == i) {
	// Still inside node i: keep this branch and continue past its target.
	append((*b).branches_after(),branches);
	b++;
      }
      else {
	// The target maps to a different node of Q: drop this branch.
	// Advance first, because erase() invalidates the erased iterator.
	list_iterator prev = b;
	b++;
	branches.erase(prev);
      }
    }
    // Only branches whose target maps to node i remain in the list;
    // D-3 of them are expected for a node of degree D.
    assert(branches.size() == D-3);
  }

  // Every node of T must have been assigned by now.
  for(int i=0;i<nodes_map.size();i++)
    assert(nodes_map[i] != -1);

  return nodes_map;
}
Exemplo n.º 17
0
/// Entry point of the "tree-mean-lengths" tool (name taken from the error
/// messages below).
///
/// Loads a query topology Q, scans tree samples from stdin, accumulates
/// branch-length statistics for Q over those samples and writes Q with the
/// accumulated lengths to stdout.
///
/// Options read here: skip, max, sub-sample, prune, simple, safe, var,
/// no-node-lengths, show-node-lengths.
///
/// Returns 0 on success; a std::exception outside the sample scan prints the
/// message and exits with status 1.
int main(int argc,char* argv[]) 
{ 
  try {
    //----------- Parse command line  ----------//
    variables_map args = parse_cmd_line(argc,argv);

    int skip = args["skip"].as<int>();

    // -1 is passed on to scan_trees when no "max" option was given.
    int max = -1;
    if (args.count("max"))
      max = args["max"].as<int>();

    int subsample = args["sub-sample"].as<int>();

    // Comma-separated list of names, forwarded to scan_trees.
    vector<string> prune;
    if (args.count("prune")) {
      string p = args["prune"].as<string>();
      prune = split(p,',');
    }
      

    //----------- Read the topology -----------//
    SequenceTree Q = load_T(args);
    standardize(Q);
    const int B = Q.n_branches();
    const int N = Q.n_nodes();
    // Remember the branch lengths Q was loaded with; they are passed to
    // write_with_bootstrap_fraction() below.
    vector<double> bf(B);
    for(int b=0;b<bf.size();b++)
      bf[b] = Q.branch(b).length();

    //-------- Read in the tree samples --------//
    // "simple" mode: match branches individually, ignoring the overall topology.
    if ( args.count("simple") ) {
      accum_branch_lengths_ignore_topology A(Q);
      scan_trees(std::cin,skip,subsample,max,prune,A);
      for(int b=0;b<B;b++)
	Q.branch(b).set_length(A.m1[b]);
      cout<<Q.write_with_bootstrap_fraction(bf,true)<<endl;
      exit(0);
    }

    accum_branch_lengths_same_topology A(Q);

    try {
      scan_trees(std::cin,skip,subsample,max,prune,A);
    }
    catch (std::exception& e) 
    {
      // With "safe", still emit the bare topology so downstream consumers get a tree.
      if (args.count("safe"))
	cout<<Q.write(false)<<endl;
      std::cerr<<"tree-mean-lengths: Error! "<<e.what()<<endl;
      // NOTE(review): exits with status 0 even though an error was reported,
      // with or without "safe" -- confirm this is intended.
      exit(0);
    }

    if (log_verbose) std::cerr<<A.n_matches<<" out of "<<A.n_samples<<" trees matched the topology";
    if (log_verbose) std::cerr<<" ("<<double(A.n_matches)/A.n_samples*100<<"%)"<<std::endl;

    //------- Merge lengths and topology -------//
    // "var": write the second accumulator (A.m2) as branch lengths instead.
    if (args.count("var")) {
      for(int b=0;b<B;b++)
	Q.branch(b).set_length(A.m2[b]);
      cout<<Q;
      exit(0);
    }
    else {
      for(int b=0;b<B;b++)
	Q.branch(b).set_length(A.m1[b]);

      // Unless disabled (or reported separately), spread each node's
      // accumulated length evenly over the branches leaving it.
      if (not args.count("no-node-lengths") and 
	  not args.count("show-node-lengths")) {
	for(int n=0;n<N;n++) {
	  int degree = Q[n].neighbors().size();
	  for(out_edges_iterator b = Q[n].branches_out();b;b++)
	    (*b).set_length((*b).length() + A.n1[n]/degree);
	}
      }

      //------- Print Tree and branch lengths -------//
      cout<<Q.write_with_bootstrap_fraction(bf,true)<<endl;

      //------------ Print node lengths -------------//
      // Each node with a positive accumulated length is reported together with
      // the partition of one branch pointing into it.
      if (args.count("show-node-lengths"))
	for(int n=0;n<Q.n_nodes();n++) {
	  if (A.n1[n] > 0) {
	    cout<<"node "<<A.n1[n]<<endl;
	    int b = (*Q[n].branches_in()).name();
	    cout<<partition_from_branch(Q,b)<<endl;
	  }
	}

    }
  }
  catch (std::exception& e) {
    std::cerr<<"tree-mean-lengths: Error! "<<e.what()<<endl;
    exit(1);
  }
  return 0;
}
Exemplo n.º 18
0
/// Construct a Parameters object with substitution models only (no indel
/// model) for the alignments A on tree t.
///
/// s_mapping[i] selects which entry of SMs data partition i uses.  Throws
/// myexception when s_mapping does not have exactly one entry per alignment,
/// or when an entry refers to a substitution model that does not exist.
/// Registers, in this order: the "mu<k>" super-parameters, the substitution
/// models as sub-models "S<k>", and one data partition per alignment as
/// sub-model "part<k>".
Parameters::Parameters(const vector<alignment>& A, const SequenceTree& t,
		       const vector<polymorphic_cow_ptr<substitution::MultiModel> >& SMs,
		       const vector<int>& s_mapping,
		       const vector<int>& scale_mapping)
  :SModels(SMs),
   smodel_for_partition(s_mapping),
   scale_for_partition(scale_mapping),
   branch_prior_type(0),
   smodel_full_tree(true),
   T(t),
   TC(star_tree(t.get_sequences())),
   branch_HMM_type(t.n_branches(),0),
   beta(2, 1.0),
   updown(-1),
   features(0)
{
  constants.push_back(-1);

  // One scale parameter "mu1", "mu2", ... per scale, each starting at 1.0.
  for(int s=0;s<n_scales;s++)
    add_super_parameter("mu"+convertToString(s+1),1.0);

  // The smodel mapping must cover every data partition exactly once.
  if (smodel_for_partition.size() != A.size())
    throw myexception()<<"There are "<<A.size()
		       <<" data partitions, but you mapped smodels onto "
		       <<smodel_for_partition.size();

  // Substitution models become sub-models "S1", "S2", ...
  for(int m=0;m<SModels.size();m++) {
    const string label = "S" + convertToString(m+1);
    add_submodel(label, *SModels[m]);
  }

  // NO indel model (in this constructor)

  // Every partition must refer to a substitution model that exists.
  for(int part=0;part<smodel_for_partition.size();part++) {
    const int m = smodel_for_partition[part];
    if (m >= SModels.size())
      throw myexception()<<"You can't use smodel "<<m+1<<" for data partition "<<part+1
			 <<" because there are only "<<SModels.size()<<" smodels.";
  }

  // Pull the current values out of the sub-models (smodels/imodel).
  read();

  // A length of -1 on a branch of TC means "unconstrained".
  for(int b=0;b<TC->n_branches();b++)
    TC->branch(b).set_length(-1);

  // Data partitions become sub-models "part1", "part2", ...
  for(int part=0;part<A.size();part++) 
  {
    const string label = string("part") + convertToString(part+1);

    // the substitution model assigned to this partition
    const substitution::MultiModel& SM = SModel(smodel_for_partition[part]);

    data_partitions.push_back(cow_ptr<data_partition>(data_partition(label,A[part],*T,SM)));

    add_submodel(label,*data_partitions[part]);
  }
}
Exemplo n.º 19
0
/// Entry point of the "trees-consensus" tool (name taken from the error
/// message in the catch block).
///
/// Reads a sample of trees from the file named by the "file" option and
/// writes to stdout:
///   1. the most frequent topologies ("map-trees" of them) with PP and LOD,
///   2. the number of topologies in the 95% credible set,
///   3. support (PP, LOD, optional ratio) for each informative partition,
///   4. consensus trees at the requested consensus levels, interleaved with
///      the per-level listings produced by show_level().
///
/// Other options read here: seed, skip, sub-sample, max, min-support, ignore,
/// consensus, odds-ratio, sub-partitions, depth, rooting.
///
/// Returns 0 on success; on any std::exception prints the message and exits 1.
int main(int argc,char* argv[]) 
{ 
  try {

    std::cout.precision(3);
    std::cout.setf(ios::fixed);

    //---------- Parse command line  -------//
    variables_map args = parse_cmd_line(argc,argv);

    //--------------------- Initialize ---------------------//
    if (args.count("seed")) {
      unsigned long seed = args["seed"].as<unsigned long>();
      myrand_init(seed);
    }
    else
      myrand_init();

    int skip = args["skip"].as<int>();

    int subsample=args["sub-sample"].as<int>();

    // -1 is passed on to tree_sample when no "max" option was given.
    int max = -1;
    if (args.count("max"))
      max = args["max"].as<int>();

    double min_support = args["min-support"].as<double>();

    // leaf taxa to ignore
    vector<string> ignore;
    if (args.count("ignore") and args["ignore"].as<string>().size() > 0)
      ignore = split(args["ignore"].as<string>(),',');

    // consensus levels 
    string c_levels = args.count("consensus") ? args["consensus"].as<string>() : "";
    vector<double> consensus_levels = get_consensus_levels(c_levels);

    double report_ratio = args["odds-ratio"].as<double>();

    bool show_sub = args.count("sub-partitions");

    //-------------- Read in tree distributions --------------//
    string filename = args["file"].as<string>();
    ifstream file(filename.c_str());
    if (not file)
      throw myexception()<<"Couldn't open file "<<filename;
      
    tree_sample tree_dist(file,skip,subsample,max,ignore);
    const unsigned N = tree_dist.size();

    // Built from an empty name list; its complement is what gets passed below.
    dynamic_bitset<> ignore_mask = group_from_names(tree_dist.names(),vector<string>());

    //------ Compute Ml partitions or sub-partitions --------//
    vector< pair<Partition,unsigned> > all_partitions;

    if (show_sub)
    {
      int depth = args["depth"].as<int>();

      double min_rooting = args["rooting"].as<double>();

      all_partitions = get_Ml_sub_partitions_and_counts(tree_dist,min_support, ~ignore_mask,min_rooting,depth);
      //      std::cerr<<"n_sub_partitions = "<<all_partitions.size()<<"\n";
    }
    else
      all_partitions = get_Ml_partitions_and_counts(tree_dist,min_support, ~ignore_mask);

    //------ Count how often each distinct topology occurs ------//
    // which_topology[k]  : index in tree_dist of the first tree with topology k
    // topology_counts[k] : number of sampled trees with topology k
    vector<int> which_topology;
    vector<int> topology_counts;
    std::map<tree_record,int> topologies_index;

    for(int i=0;i<tree_dist.size();i++)
    {
      std::map<tree_record,int>::iterator record = topologies_index.find(tree_dist[i]);
      if (record == topologies_index.end())
      {
	which_topology.push_back(i);
	topology_counts.push_back(0);

	topologies_index[tree_dist[i]] = which_topology.size()-1;
	record = topologies_index.find(tree_dist[i]);
      }
      topology_counts[record->second]++;
    }

    // order[] lists topology indices from most to least frequent.
    vector<int> order = iota<int>(topology_counts.size());

    std::sort(order.begin(),order.end(),sequence_order<int>(topology_counts));
    std::reverse(order.begin(), order.end());

    //------  Topologies to analyze -----//
    vector<string> topologies;

    cout<<"# n_trees = "<<tree_dist.size()<<"   n_topologies = "<<topology_counts.size()<<endl;
    cout<<"\nTopology support: \n\n";
    for(int i=0;i < args["map-trees"].as<int>() ;i++) 
    {
      // Fewer distinct topologies than requested: nothing more to print.
      if (i >= order.size()) continue;

      string t = tree_dist.T(which_topology[order[i]]).write(false);

      unsigned n = topology_counts[order[i]];
      double PP = double(n)/N;
      double o = odds(n,N,1);

      cout<<"MAP-"<<i<<" = "<<t<<endl;
      cout<<"   PP = "<<PP<<"       LOD = "<<log10(o)<<endl;
      cout<<"\n";
    }

    
    //------ Size of the 95% credible set of topologies ------//
    // Accumulate topologies from most to least frequent (via order[]), so that
    // the reported set is the smallest one reaching 95% of the samples.
    for(int i=0,n=0;i<topology_counts.size();i++) 
    {
      n += topology_counts[order[i]];
      double PP = double(n)/N;

      if (PP >= 0.95) {
	cout<<"95% credible set contains "<<i+1<<" topologies."<<endl;
	break;
      }
    }
    cout<<"\n\n";


    //------- Print out support for each partition --------//
    cout<<"Partition support: \n\n";

    vector<pair<Partition,unsigned> > good_partitions = thin(all_partitions, N, report_ratio);

    sort(good_partitions.begin(),good_partitions.end(), count_more());

    for(int i=0;i<good_partitions.size();i++) 
    {
      if (not informative(good_partitions[i].first))
	continue;

      unsigned n = good_partitions[i].second;

      double PP = double(n)/N;
      double o = odds(n,N,1);

      cout<<"   PP = "<<PP<<"       LOD = "<<log10(o);

      // The odds ratio is only reported for partitions that are not full.
      if (not good_partitions[i].first.full()) {
	double ratio = odds_ratio(good_partitions,i,N,1);
	cout<<"       ratio = "<<log10(ratio);
      }
      cout<<"       pi = "<<good_partitions[i].first<<endl;

      cout<<endl<<endl;
    }


    //----------- display M[l] consensus levels ----------//
    std::cout.precision(4);
    cout<<"\n\nConsensus levels:\n\n";

    vector<Partition> c50_sub_partitions = get_Ml_partitions(all_partitions, 0.5, N);
    vector<Partition> c50_full_partitions = select(c50_sub_partitions,&Partition::full);
    c50_sub_partitions = get_moveable_tree(c50_sub_partitions);


    vector<unsigned> levels = get_Ml_levels(all_partitions,N,min_support);

    // Sentinel above any possible count, so that every requested consensus
    // level is eventually below some entry of levels[].
    levels.push_back(N+1);

    for(int j=0,k=0;j<levels.size() and k < consensus_levels.size();j++) 
    {
      unsigned clevel = (unsigned)(consensus_levels[k]*N);

      // NOTE(review): after k++ the loop condition still tests the clevel of
      // the previous consensus level; the value for the new k is only computed
      // inside the body.  Behavior is left unchanged here -- confirm whether
      // that output ordering is intended.
      while (k<consensus_levels.size() and clevel < levels[j]) 
      {
	clevel = (unsigned)(consensus_levels[k]*N);

	// Split the partitions at this consensus level into full and partial ones.
	vector<Partition> all  = get_Ml_partitions(all_partitions,consensus_levels[k],N);
	vector<Partition> sub;
	vector<Partition> full;
	for(int i=0;i<all.size();i++)
	  if (all[i].full())
	    full.push_back(all[i]);
	  else
	    sub.push_back(all[i]);

	// consensus  : the tree built from the full partitions
	// consensus2 : same topology, with branch lengths replaced by support
	SequenceTree consensus = get_mf_tree(tree_dist.names(),full);
	SequenceTree consensus2 = consensus;
      
	double L = consensus_levels[k]*100;
	
	cout.unsetf(ios::fixed | ios::showpoint);
	
	// Support for each branch; leaf branches are marked with -1.
	vector<double> bf(consensus.n_branches(),1.0);
	for(int i=0;i<bf.size();i++) 
	{
	  if (consensus.branch(i).is_leaf_branch())
	    bf[i] = -1.0;
	  else {
	    dynamic_bitset<> mask = branch_partition(consensus,i);
	    Partition p(tree_dist.names(),mask);
	    unsigned count = get_partition_count(all_partitions,p);
	    bf[i] = double(count)/N;
	  }
	  consensus2.branch(i).set_length(bf[i]);
	}

	
	cout<<" "<<L<<"-consensus-PP = "<<consensus2.write(true)<<std::endl;
	//cout<<" "<<L<<"-consensus-PP2 = "<<consensus.write_with_bootstrap_fraction(bf,false)<<std::endl;
	cout<<" "<<L<<"-consensus = "<<consensus.write(false)<<std::endl;
	
	if (show_sub) {
	  for(int i=0;i<sub.size();i++)
	    cout<<sub[i]<<endl;
	}
	cout<<endl<<endl;

	k++;
      }
	

      // Skip the N+1 sentinel; it is not a real level.
      if (levels[j] <=N) {
	show_level(tree_dist,levels[j],c50_sub_partitions,all_partitions,show_sub,false);
	cout<<endl;
      }

    }
  }
  catch (std::exception& e) {
    std::cerr<<"trees-consensus: Error! "<<e.what()<<endl;
    exit(1);
  }
  return 0;
}
Exemplo n.º 20
0
void print_stats(std::ostream& o,std::ostream& trees,
		 const Parameters& P,
		 bool print_alignment) 
{
  efloat_t Pr_prior = P.prior();
  efloat_t Pr_likelihood = P.likelihood();
  efloat_t Pr = Pr_prior * Pr_likelihood;

  o<<"    prior = "<<Pr_prior;
  for(int i=0;i<P.n_data_partitions();i++) 
    o<<"   prior_A"<<i+1<<" = "<<P[i].prior_alignment();

  o<<"    likelihood = "<<Pr_likelihood<<"    logp = "<<Pr
   <<"    beta = " <<P.beta[0]  <<"\n";

  if (print_alignment)
    for(int i=0;i<P.n_data_partitions();i++)
      o<<standardize(*P[i].A, *P.T)<<"\n";
  
  {
    SequenceTree T = *P.T;
    
    valarray<double> weights(P.n_data_partitions());
    for(int i=0;i<weights.size();i++)
      weights[i] = max(sequence_lengths(*P[i].A, P.T->n_leaves()));
    weights /= weights.sum();

    double mu_scale=0;
    for(int i=0;i<P.n_data_partitions();i++)
      mu_scale += P[i].branch_mean()*weights[i];

    for(int b=0;b<T.n_branches();b++)
      T.branch(b).set_length(mu_scale*T.branch(b).length());
    trees<<T<<std::endl;
    trees.flush();
  }
  
  o<<"\n";
  show_parameters(o,P);
  o.flush();

  for(int m=0;m<P.n_smodels();m++) {
    o<<"smodel"<<m+1<<endl;
    for(int i=0;i<P.SModel(m).n_base_models();i++)
      o<<"    rate"<<i<<" = "<<P.SModel(m).base_model(i).rate();
    o<<"\n\n";

    for(int i=0;i<P.SModel(m).n_base_models();i++)
      o<<"    fraction"<<i<<" = "<<P.SModel(m).distribution()[i];
    o<<"\n\n";

    o<<"frequencies = "<<"\n";
    show_frequencies(o,P.SModel(m));
    o<<"\n\n";

    o.flush();
  }

  // The leaf sequences should NOT change during alignment
#ifndef NDEBUG
  for(int i=0;i<P.n_data_partitions();i++)
    check_alignment(*P[i].A, *P.T,"print_stats:end");
#endif
}
Exemplo n.º 21
0
bool update_lengths(const MC_tree& Q,const SequenceTree& T,
		    const vector<dynamic_bitset<> >& node_masks,
		    const vector<Partition>& partitions2,
		    valarray<double>& branch_lengths, 
		    valarray<double>& node_lengths)
{
  // check that this tree is consistent with the MC Tree Q
  for(int i=0;i<Q.branch_order.size();i++) 
  {
    int b = Q.branch_order[i];
    if (not implies(T,Q.partitions[b]))
      return false;
  }

  // map branches of the input tree
  for(int b=0;b<T.n_branches();b++)
  {
    Partition P = partition_from_branch(T,b);

    // Find mc tree branches implied by branch b
    vector<int> branches;
    for(int i=0;i<Q.branch_order.size();i++) 
    {
      if (implies(P,partitions2[i]))
	branches.push_back(i);
    }

    // Find out which nodes this branch is inside of 
    vector<int> nodes;
    for(int n=0;n<Q.n_nodes();n++) 
    {
      if (Q.degree(n) == 0) continue;

      Partition P2 = P;
      P2.group1 = P2.group1 & node_masks[n];
      P2.group2 = P2.group2 & node_masks[n];
      if (P2.group1.none() or P2.group2.none())
	continue;

      bool ok = true;
      for(int b=0;b<2*Q.n_branches() and ok;b++)
      {
	if (Q.mapping[b] != n)  continue;
	Partition P3 = Q.partitions[b];
	P3.group1 = P3.group1 & node_masks[n];
	P3.group2 = P3.group2 & node_masks[n];
	if (partition_less_than(P3,P2) or partition_less_than(P3,P2.reverse()))
	  ;
	else
	  ok=false;
      }

      if (ok) {
	nodes.push_back(n);
	assert(Q.degree(n) > 3);
      }
    }

    /*
    cerr<<"Branch: "<<P<<endl;
    cerr<<"  - maps to "<<branches.size()<<" branches."<<endl;
    if (nodes.size()) {
      cerr<<"  - inside node(s):"<<endl;
      cerr<<P<<endl;
      for(int i=0;i<nodes.size();i++)
	cerr<<"    "<<Q.partitions[Q.branch_to_node(nodes[i])]<<endl;
    }
    */

    // This branch should be inside only one node, if any.
    assert(nodes.size() < 2);

    // This branch should not be inside a node, if it implies an mc tree branch.
    if (branches.size()) assert(not nodes.size());

    // But this branch should be inside a node if it doesn't imply a branch.
    assert(branches.size() + nodes.size() > 0);

    const double L = T.branch(b).length();

    // Divide the branch length evenly between the branches it implies.
    for(int i=0;i<branches.size();i++)
      branch_lengths[branches[i]] += L/branches.size();

    // Divide the branch length evenly between the nodes (node?) it implies.
    for(int i=0;i<nodes.size();i++)
      node_lengths[nodes[i]] += L/nodes.size();
  }

  return true;
}