Exemplo n.º 1
0
/// \brief Get the M[l] partitions and their counts using a mask that selects every name.
///
/// Convenience overload: builds a bitset with one bit per entry of sample.names(),
/// turns every bit on, and forwards to the masked overload.
///
/// \param sample The tree sample
/// \param l The consensus level, passed through unchanged
///
vector<pair<partition,unsigned> > 
get_Ml_partitions_and_counts(const tree_sample& sample,double l) 
{
    // A freshly sized bitset starts with all bits clear, so set() yields an all-ones mask.
    dynamic_bitset<> everything(sample.names().size());
    everything.set();

    return get_Ml_partitions_and_counts(sample, l, everything);
}
Exemplo n.º 2
0
/// \brief Get the M[l] sub-partitions and their counts using a mask that selects every name.
///
/// Convenience overload: builds a bitset with one bit per entry of sample.names(),
/// turns every bit on, and forwards to the masked overload.
///
/// \param sample The tree sample
/// \param l The consensus level, passed through unchanged
/// \param min_rooting Passed through unchanged
/// \param depth Passed through unchanged
///
vector<pair<partition,unsigned> > 
get_Ml_sub_partitions_and_counts(const tree_sample& sample,double l,double min_rooting,int depth) 
{
    // A freshly sized bitset starts with all bits clear, so set() yields an all-ones mask.
    dynamic_bitset<> everything(sample.names().size());
    everything.set();

    return get_Ml_sub_partitions_and_counts(sample, l, everything, min_rooting, depth);
}
Exemplo n.º 3
0
/// \brief Print a one-line summary of the partitions supported at a given level.
///
/// Writes to cout: the level as a percentage of the sample size, its LOD score,
/// the number of informative full partitions, and optionally the number of
/// informative partitions overall and the PP of the full partitions.
///
/// \param tree_dist The tree sample; its size() is the denominator for the level
/// \param level Support level, expressed as a number of trees
/// \param skeleton Only referenced by the disabled hull computations below
/// \param all_partitions The partitions (with counts) to select from
/// \param show_sub Also print the count of informative partitions in `sub`
/// \param show_PP Also print 100*tree_dist.PP(full)
///
void show_level(const tree_sample& tree_dist,
		unsigned level,
		const vector<Partition>& skeleton,
		const vector<pair<Partition,unsigned> >& all_partitions,
		bool show_sub,
		bool show_PP)
{
  // Nothing live uses the skeleton any more; keep the parameter for interface compatibility.
  (void)skeleton;

  const unsigned N = tree_dist.size();

  cout.unsetf(ios::fixed | ios::showpoint);

  const vector<Partition> sub = get_Ml_partitions(all_partitions,level);
  const vector<Partition> full = select(sub,&Partition::full);

  // Disabled experiments.  The full_skeleton selection was previously computed
  // on every call even though only these disabled lines consume it.
  //  vector<Partition> full_skeleton = select(skeleton,&Partition::full);
  //  const vector<Partition> moveable = get_moveable_tree(sub);
  //  vector<Partition> full_hull = Ml_min_Hull(full_skeleton,sub);
  //  vector<Partition> sub_hull = Ml_min_Hull(skeleton,sub);

  const double fraction = double(level)/N;

  const double LOD = log10(odds(level,N,1));

  cout<<"   level = "<<fraction*100
      <<"   LOD = "<<LOD
      <<"   full = "<<count(full, informative);

  if (show_sub) {
    cout<<"   sub = "       <<count(sub,informative);
    //    cout<<"   consistent = "<<count(moveable,informative);
    //    cout<<"   sub(50) = "<<count(moveable,informative);
    //    cout<<"   sub#1 = "     <<count(full_hull,informative);
    //    cout<<"   sub#2 = "     <<count(sub_hull,informative);
  }

  if (show_PP)
    cout<<"   PP = "<<100*tree_dist.PP(full);

  // endl (not '\n') is kept so the line is flushed exactly as before.
  cout<<endl;
}
Exemplo n.º 4
0
/// \brief Collect the M[l] splits under `mask`, plus extra sub-splits found under smaller masks.
///
/// Starts from get_Ml_partitions_and_counts(sample,l,mask).  Candidate masks are
/// then evaluated in rounds: the first round uses `mask` intersected with each
/// branch partition of the tree built from the 0.5-level partitions; later rounds
/// use newly discovered unit masks and pairwise intersections of "good" masks.
/// For each candidate mask, partial splits that pass the tests in the inner loop
/// are appended to the result.
///
/// \param sample The tree sample
/// \param l The consensus level handed to get_Ml_partitions_and_counts
/// \param mask The starting mask
/// \param min_rooting Threshold compared against the support ratio r
/// \param depth If 0, stop after the first round.  NOTE: depth is never modified
///              in this function, so any non-zero value lets the rounds continue
///              until no new masks are produced.
///
vector<pair<partition,unsigned> > 
get_Ml_sub_partitions_and_counts(const tree_sample& sample,double l,const dynamic_bitset<>& mask,
				 double min_rooting,int depth) 
{
    // get list of branches to consider cutting
    //   FIXME - consider 4n-12 most probable partitions, here?
    //         - Perhaps NOT, though.
    vector<partition> partitions_c50 = get_Ml_partitions(sample, 0.5);
    SequenceTree c50 = get_mf_tree(sample.names(),partitions_c50);
    vector<const_branchview> branches = branches_from_leaves(c50);  

    // construct unit masks
    // - unit masks are masks that come directly from a supported branch (full, or partial)
    list< dynamic_bitset<> > unit_masks;
    for(int b=0;b<branches.size();b++)
	add_unique(unit_masks, mask & branch_partition(c50,branches[b]) );

    // construct beginning masks
    // - new_masks: masks to evaluate in the coming round
    // - masks:     every mask evaluated so far
    list<dynamic_bitset<> > new_masks = unit_masks;
    list<dynamic_bitset<> > masks;

    // start collecting splits at M[l]
    vector<pair<partition,unsigned> > splits = get_Ml_partitions_and_counts(sample,l,mask);

    // any good mask should be combined w/ other good masks
    list<dynamic_bitset<> > good_masks;
    for(int iterations=0;not new_masks.empty();iterations++)
    {
	// Snapshot: partial splits found in this round are matched against the
	// splits known at the start of the round, not against each other.
	vector<pair<partition,unsigned> > full_splits = splits;

	if (log_verbose) cerr<<"iteration: "<<iterations<<"   depth: "<<depth<<"   new_masks: "<<new_masks.size()<<endl;
	list<dynamic_bitset<> > new_good_masks;
	list<dynamic_bitset<> > new_unit_masks;

	// get sub-splits for each mask
	// NOTE: this loop variable shadows the function parameter `mask`.
	for(const auto& mask: new_masks)
	{
	    // get sub-splits of mask
	    vector<pair<partition,unsigned> > partial_splits = get_Ml_partitions_and_counts(sample,l,mask);
    
	    // match up sub-splits and full splits
	    // FIXME - aren't we RE-doing a lot of work, here?
	    // parents[i] indexes into full_splits; -1 is treated below as "no match"
	    // (presumably what match() returns for an unmatched split - confirm).
	    vector<int> parents = match(full_splits,partial_splits);

	    // check for splits with increased support when mask is unplugged
	    // rooting tracks the smallest r among splits that pass the test below.
	    double rooting=1.0;
	    for(int i=0;i<partial_splits.size();i++) 
	    {
		if (not informative(partial_splits[i].first))
		    continue;
	    
		// r = (support of the matching full split) / (support of the partial split).
		// With no matching full split, l*sample.size() is used as the numerator.
		double r = 1;
		if (parents[i] == -1) {
		    r = (l*sample.size())/double(partial_splits[i].second);
		}
		else {
		    r = full_splits[parents[i]].second/double(partial_splits[i].second);
		    assert(r <= 1.0);
		}

		double OD = statistics::odds(partial_splits[i].second-5,sample.size(),10);

		// actually, considering bad rooting of low-probability edges may be a better (or alternate)
		// strategy to unplugging edges that are only slightly bad.

		// Determination of rooting probabilities seems to have the largest effect on computation time
		//  - thus, in the long run, new_good_masks has a larger effect than new_unit_masks.
		//  - actually, this makes kind of makes sense...
		//    + new_unit_masks can add splits they reveal under fairly weak conditions.
		//    + however, unless a new unit mask ends up being a good_mask, it won't trigger the quadratic behavior.

		// What happens when we consider unplugging ratios for branches (now) supported at level l<0.5?
		if (r < min_rooting and OD > 0.5) {
		    // Both sides of the split become candidate unit masks.
		    add_unique(new_unit_masks,unit_masks,partial_splits[i].first.group1);
		    add_unique(new_unit_masks,unit_masks,partial_splits[i].first.group2);
		    rooting = std::min(rooting,r);
		}

		// Store the new sub-splits we found
		if (r < 0.999 or 
		    (parents[i] != -1 and statistics::odds_ratio(partial_splits[i].second,
								 full_splits[parents[i]].second,
								 sample.size(),
								 10) > 1.1)
		    )
		    splits.push_back(partial_splits[i]);
	    }

	    // check if any of our branches make this branch badly rooted
	    if (rooting < min_rooting)
		new_good_masks.push_front(mask);
	}

	if (log_verbose) cerr<<"new unit_masks = "<<new_unit_masks.size()<<endl;

	// The three-argument add_unique presumably appends the elements of the last
	// argument that are in neither of the first two lists - confirm against its definition.

	// 1. masks += new_masks
	add_unique(masks, {}, new_masks);

	// 2. good_masks += new_good_masks
	add_unique(good_masks, {}, new_good_masks);

	// 3. unit_masks += new_unit_masks
	add_unique(unit_masks, {}, new_unit_masks);

	// depth is only ever compared with 0 here; it is not decremented anywhere.
	if (depth == 0) break;

	// 4. good_masks += (good_masks + new_good_masks) * new_good_masks
	// Rationale: pull out every combination of masks known to be "good"
	// The next round evaluates the new unit masks plus each intersection
	// (i & j) of a new good mask with a different good mask.
	new_masks = new_unit_masks;
	for(const auto& i: new_good_masks)
	    for(const auto& j: good_masks)
		if (i != j)
		    add_unique(new_masks,masks,i & j);

	//cerr<<"   new good masks = "<<new_good_masks.size()<<"    new unit masks = "<<new_unit_masks.size()<<endl;
	//cerr<<"       good masks = "<<good_masks.size()    <<"       total masks = "<<masks.size()<<"       found = "<<splits.size()<<endl;

    }

    return splits;
}
Exemplo n.º 5
0
/// \brief Find the splits, restricted to `mask`, that reach consensus level l, with their counts.
///
/// Trees are processed one at a time.  Each split is oriented so that the first
/// bit set in `mask` is also set in the split, and is then intersected with `mask`.
/// A split is counted at most once per tree.  After each tree, the running list of
/// candidates is pruned to those whose count is at least
/// min(1 + floor(l * trees_seen), trees_seen).
///
/// \param sample The tree sample
/// \param l The consensus level; must satisfy 0 < l <= 1
/// \param mask The bits to consider; must have at least one bit set (asserted)
///
/// \return The surviving splits that pass valid(), each paired with its count
///
/// \throws myexception if l <= 0 or l > 1
///
vector<pair<partition,unsigned> >
get_Ml_partitions_and_counts(const tree_sample& sample,double l,const dynamic_bitset<>&  mask) 
{
    // find the first bit; npos means the mask is empty
    const dynamic_bitset<>::size_type first = mask.find_first();
    assert(first != dynamic_bitset<>::npos);

    if (l <= 0.0)
        throw myexception()<<"Consensus level must be > 0.0";
    if (l > 1.0)
        throw myexception()<<"Consensus level must be <= 1.0";

    // use a sorted list of <partition,count>, sorted by partition.
    typedef map<dynamic_bitset<>,p_count> container_t;
    container_t counts;

    // use a linked list of pointers to <partition,count> records.
    list<container_t::iterator> majority;

    // Only the number of names is needed, so don't copy the name list.
    const auto n_names = sample.names().size();

    const int n_trees = sample.trees.size();

    unsigned trees_seen = 0;

    // Reused for every split to avoid reallocating the bitset.
    dynamic_bitset<> split(n_names);

    for(int i=0;i<n_trees;i++)
    {
        const vector<dynamic_bitset<> >& T = sample.trees[i].partitions;

        // Support thresholds before and after including tree i.
        const unsigned min_old = std::min(1+(unsigned)(l*trees_seen),trees_seen);

        trees_seen++;
        const unsigned min_new = std::min(1+(unsigned)(l*trees_seen),trees_seen);

        // for each partition in the next tree
        for(const auto& tree_split: T)
        {
            split = tree_split;

            // Orient the split so that bit `first` is always on its set side.
            if (not split[first])
                split.flip();

            split &= mask;

            // Look up the record for this split, creating it if absent.
            // lower_bound doubles as the insertion hint, so the tree is searched once.
            container_t::iterator record = counts.lower_bound(split);
            if (record == counts.end() or counts.key_comp()(split, record->first))
                record = counts.insert(record, container_t::value_type(split,p_count()));

            p_count& pc = record->second;
            const int before = pc.count;

            // Count each distinct (masked) split at most once per tree.
            if (pc.last_tree != i) {
                pc.last_tree = i;
                pc.count++;
            }
            const int after = pc.count;

            // add the partition if it wasn't good before, but is now
            if ((before==0 or before<min_old) and after >= min_new)
                majority.push_back(record);
        }

        // drop candidates that no longer meet the threshold
        majority.remove_if([min_new](const container_t::iterator& r)
                           {
                               return r->second.count < min_new;
                           });
    }

    vector<pair<partition,unsigned> > partitions;
    partitions.reserve( 2*n_names );
    for(const auto& record : majority)
    {
        partition pi(record->first, mask);
        const unsigned n = record->second.count;

        if (valid(pi))
            partitions.push_back(pair<partition,unsigned>(pi,n));
    }

    return partitions;
}
Exemplo n.º 6
0
/// \brief Get the count and average length for each split
///
/// Leaf-branch splits are seeded with a record constructed as count_and_length(N,0),
/// where N is the number of trees, and accumulate T.branch_lengths[0..L-1] from every
/// tree.  Every split in T.partitions accumulates T.branch_lengths[L+b] and has its
/// count incremented.  Finally each accumulated length is divided by its count.
///
/// \param sample The tree sample
///
/// \return A map from split (oriented so that bit 0 is set) to its count and mean length
///
map<dynamic_bitset<>,count_and_length>
get_partition_counts_and_lengths(const tree_sample& sample)
{
    // use an rbtree of <partition,count_and_length>, sorted by partition.
    typedef map<dynamic_bitset<>,count_and_length> container_t;
    container_t counts;

    // Only the number of names is needed, so don't copy the name list.
    const auto n_names = sample.names().size();
    const int L = n_names;
    const int N = sample.trees.size();

    // Setup: add leaf branch records and store references to them
    vector<container_t::iterator> leaf_branch_records;
    leaf_branch_records.reserve(n_names);
    for(int i=0;i<L;i++)
    {
        // construct leaf branch split
        dynamic_bitset<> leaf_split(n_names);
        leaf_split[i] = 1;
        if (not leaf_split[0]) leaf_split.flip();

        // insert() returns the iterator to the (new or pre-existing) record,
        // so no second lookup is needed.
        leaf_branch_records.push_back(
            counts.insert(container_t::value_type(leaf_split,count_and_length(N,0))).first );
    }

    // Main loop: iterate over all trees
    dynamic_bitset<> split(n_names);   // reused for every split
    for(int t=0;t<N;t++)
    {
        const tree_record& T = sample.trees[t];

        // for each LEAF branch in the next tree
        for(int b=0;b<L;b++) {
            count_and_length& cl = leaf_branch_records[b]->second;
            cl.length += T.branch_lengths[b];
        }

        // for each INTERNAL partition in the next tree
        const int n_internal = T.partitions.size();
        for(int b=0;b<n_internal;b++)
        {
            split = T.partitions[b];
            if (not split[0]) split.flip();

            // Look up the record for this split, creating it if absent.
            // lower_bound doubles as the insertion hint, so the tree is searched once.
            container_t::iterator record = counts.lower_bound(split);
            if (record == counts.end() or counts.key_comp()(split, record->first))
                record = counts.insert(record, container_t::value_type(split,count_and_length()));

            // Increment the count and add in the new length
            count_and_length& cl = record->second;
            cl.count++;
            cl.length += T.branch_lengths[L+b];   // internal lengths follow the L leaf lengths
        }
    }

    // Turn summed lengths into averages.
    // NOTE(review): leaf records are created with N as their first constructor argument
    // (presumably the count), so an empty sample would divide by zero here - confirm
    // callers never pass one.
    for(auto& record : counts)
    {
        count_and_length& cl = record.second;
        cl.length /= cl.count;
    }

    return counts;
}