コード例 #1
0
ファイル: readconciler.cpp プロジェクト: ryneches/PhyloSift
/**
 * Builds the boost graph representation of a tree.
 *
 * Overwrites the following members of pg:
 *   - V, E        : vertex count and edge count (E = V-1, or 0 for an empty tree)
 *   - edge_array  : new[]-allocated array with one (node, parents[0]) edge for
 *                   every node that has at least one parent
 *   - weights     : new[]-allocated array, parallel to edge_array, holding the
 *                   `distance` of the child node of each edge
 *   - g, w        : the Graph built from edge_array and its edge-weight map
 *
 * edge_array and weights are allocated here and are not released by this
 * function.
 *
 * @param tree  tree to convert; only parents[0] of each node is consulted
 * @param pg    output structure
 */
void make_graph( PhyloTree< TreeNode >& tree, PhyloGraph& pg ){
	// use boost's graph algorithms

	// A tree with N nodes has N-1 edges.  The empty tree is handled
	// explicitly: computing size()-1 for it would wrap around (or go
	// negative) and request an enormous / invalid array allocation.
	const size_t node_count = tree.size();
	const size_t edge_count = node_count > 0 ? node_count - 1 : 0;

	pg.V = node_count;
	pg.E = edge_count;
	pg.edge_array = new Edge[ edge_count ];
	pg.weights = new double[ edge_count ];

	// one edge per node that has a parent, pointing at its first parent
	size_t eI = 0;
	for( size_t vI = 0; vI < node_count; ++vI )
	{
		if( tree[vI].parents.size() != 0 )
		{
			pg.edge_array[eI] = Edge( vI, tree[vI].parents[0] );
			pg.weights[eI] = tree[vI].distance;
			eI++;
		}
	}

	pg.g = Graph(pg.edge_array, pg.edge_array + pg.E, pg.V);

	// add edge weights
	// NOTE(review): this copies weights in boost::edges() iteration order and
	// so assumes that order matches the order of edge_array -- confirm for the
	// Graph type in use.
	pg.w = get(boost::edge_weight, pg.g);
	double *wp = pg.weights;
	boost::graph_traits < Graph >::edge_iterator e, e_end;
	for (boost::tie(e, e_end) = boost::edges(pg.g); e != e_end; ++e)
		pg.w[*e] = *wp++;

}
コード例 #2
0
ファイル: readconciler.cpp プロジェクト: ryneches/PhyloSift
/**
 * Entry point of readconciler.
 *
 * Arguments: <reference tree> <gene tree> <gene to species map> <mapping output file>
 *
 * Reads the reference tree and the gene/species map, then calls reconcile()
 * with the gene tree file name and the output file name.
 *
 * @return 0 on success, -1 on bad usage, -2 if the reference tree cannot be
 *         opened, -3 if the map file cannot be opened
 */
int main(int argc, char** argv){

	// argv[1] through argv[4] are all used below, so fewer than five
	// entries in argv is a usage error.
	if(argc < 5){
		cerr << "Usage: readconciler <reference tree> <gene tree> <gene to species map> <mapping output file>\n";
		return -1;
	}
	// read ref tree
	string reftreefile(argv[1]);
	ifstream reftreein(reftreefile.c_str());
	if(!reftreein.is_open()){
		cerr << "Unable to read file " << reftreefile << endl;
		return -2;
	}
	PhyloTree< TreeNode > reftree;
	reftree.readTree( reftreein );
	cout << "The reference tree has " << reftree.size() << " nodes\n";
	
	// read map
	ifstream mapin(argv[3]);
	if(!mapin.is_open()){
		cerr << "Unable to read file " << argv[3] << endl;
		return -3;
	}
	unordered_multimap<string, string> gene_map;
	string line;
	while( getline(mapin, line) ){
		stringstream line_str(line);
		string gene;
		string species;
		// The first tab-separated field is read into `species` and then
		// overwritten by the second field; the remainder of the line is the gene.
		// NOTE(review): presumably the first column is skipped on purpose --
		// confirm against the map file format.
		getline(line_str, species, '\t');
		getline(line_str, species, '\t');
		getline(line_str, gene);
		gene_map.insert(make_pair(species, gene));
		// other_map is declared outside this function (reverse lookup gene -> species)
		other_map.insert(make_pair(gene,species));
	}

	// read & reconcile each of the read trees
	string genetreefile = argv[2];
	reconcile( reftree, genetreefile, gene_map, argv[4] );
	
	return 0;
}
コード例 #3
0
ファイル: PhyloTreeTest.cpp プロジェクト: ehamberg/phyloea
// Builds a random tree over a fixed number of freshly allocated leaves and
// checks the height bounds, the root, and that every leaf got a parent.
TEST_F(PhyloTreeTest, RandomTree) {
    const unsigned int n = 30;

    vector<PhyloTreeNode*> leaves;
    for (unsigned int k = 0; k != n; ++k) {
        PhyloTreeNode* leaf = new PhyloTreeNode();
        leaves.push_back(leaf);
    }

    PhyloTree t;
    t.buildRandomTree(leaves);

    // height bounds: strictly above log2(n) and at most n+1
    ASSERT_LT(log2(n), t.height());
    ASSERT_GE(n+1, t.height());

    // the node reported as root must have no parent
    ASSERT_TRUE(t.getRoot()->isRoot());

    // after tree construction no leaf may still be parentless
    for (unsigned int k = 0; k != n; ++k) {
        ASSERT_FALSE(leaves.at(k)->isRoot());
    }
}
コード例 #4
0
ファイル: uniquifyTrees.cpp プロジェクト: Wyss/mauve-py
/**
 * Entry point of uniquifyTrees.
 *
 * Arguments: <nexus input file> <nexus output file>
 *
 * Reads trees from the input file until an empty tree is returned, puts each
 * one in a canonical form (sortTaxa, then relabelTaxaToStartWithZero),
 * serialises it, and writes every distinct serialisation once to the output
 * file in sorted order.
 *
 * @return 0 on success, -1 on bad usage or if a file cannot be opened,
 *         -2 if a tree has a different size than the first tree
 */
int main( int argc, char* argv[] )
{
	if( argc < 3 )
	{
		cerr << "Usage: uniquifyTrees <nexus input file> <nexus output file>\n";
		cerr << "All trees in the input file must have the same number of taxa and the same taxon labels\n";
		// argv[1] and argv[2] are dereferenced below, so stop here
		return -1;
	}
	string input_filename = argv[1];
	string output_filename = argv[2];
	ifstream input_file( input_filename.c_str() );
	if( !input_file.is_open() )
	{
		cerr << "Error opening \"" << input_filename << "\"\n";
		return -1;
	}
	ofstream output_file( output_filename.c_str() );
	if( !output_file.is_open() )
	{
		cerr << "Error opening \"" << output_filename << "\"\n";
		return -1;
	}
	
	size_t tree_sizes = 0;	// size of the first tree; all others must match
	uint tree_count = 0;	// number of trees accepted so far
	vector< string > tree_list;
	while( true )
	{
		PhyloTree< TreeNode > t;
		t.readTree( input_file );
		// an empty tree marks the end of the input
		if( t.size() == 0 )
			break;
		if( tree_sizes == 0 )
			tree_sizes = t.size();
		if( t.size() != tree_sizes )
		{
			cerr << "Error: tree " << tree_count + 1 << " has a different number of taxa\n";
			return -2;
		}
		// keep the counter in step with the trees read so that the error
		// message above reports the actual position of the offending tree
		tree_count++;
		sortTaxa( t );
		relabelTaxaToStartWithZero( t );
		stringstream ss;
		t.writeTree(ss);
		tree_list.push_back(ss.str());
		// backspace over the previous progress message and rewrite it
		cout << "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b";
		cout << "Read " << tree_list.size() << " trees";
	}
	cout << endl;
	cout << "Writing unique trees to \"" << output_filename << "\"\n";
	// after sorting, identical serialisations are adjacent
	sort(tree_list.begin(), tree_list.end() );
	size_t unique_count = 0;
	for( size_t treeI = 0; treeI < tree_list.size(); treeI++ )
	{
		if( treeI > 0 && tree_list[treeI] == tree_list[treeI - 1] )
			continue;
		output_file << tree_list[treeI] << endl;
		unique_count++;
	}
	cerr << "There are " << unique_count << " unique trees\n";
	return 0;
}
コード例 #5
0
ファイル: uniquifyTrees.cpp プロジェクト: Wyss/mauve-py
/**
 * Shifts the numeric label of every named node down by one.
 * Taxa are assumed to carry numeric labels starting at 1; nodes with an
 * empty name are left untouched.
 */
void relabelTaxaToStartWithZero( PhyloTree< TreeNode >& t )
{
	node_id_t idx = 0;
	while( idx < t.size() )
	{
		if( t[idx].name != "" )
		{
			// parse the current label as an unsigned number
			uint label;
			stringstream reader( t[idx].name );
			reader >> label;

			// write back the label reduced by one
			label--;
			stringstream writer;
			writer << label;
			t[idx].name = writer.str();
		}
		idx++;
	}
}
コード例 #6
0
ファイル: uniquifyTrees.cpp プロジェクト: Wyss/mauve-py
/**
 * Reorders the children of every node that has children.  Each child is keyed
 * by the name of the node returned by getRepresentativeTaxon() for it, and the
 * children are rearranged into the order produced by sorting those keys with
 * TaxonNamePairComparator.
 */
void sortTaxa( PhyloTree< TreeNode >& t )
{
	typedef pair<string, node_id_t> NamedSlot;

	for( node_id_t parentI = 0; parentI < t.size(); parentI++ )
	{
		const size_t child_count = t[parentI].children.size();
		if( child_count == 0 )
			continue;	// nothing to reorder

		// key every child slot by the name of its subtree's representative
		vector< NamedSlot > keyed;
		for( size_t slot = 0; slot < child_count; slot++ )
		{
			node_id_t rep = getRepresentativeTaxon( t, t[parentI].children[ slot ] );
			keyed.push_back( NamedSlot( t[rep].name, slot ) );
		}

		// order the slots by representative name
		TaxonNamePairComparator name_order;
		sort( keyed.begin(), keyed.end(), name_order );

		// rebuild the child list following the sorted slot order
		vector< node_id_t > reordered;
		for( vector< NamedSlot >::const_iterator it = keyed.begin(); it != keyed.end(); ++it )
			reordered.push_back( t[parentI].children[ it->second ] );
		t[parentI].children = reordered;
	}
}
コード例 #7
0
ファイル: PhyloTreeTest.cpp プロジェクト: ehamberg/phyloea
// Round-trips a random tree through its prefix representation and checks
// that the decoded tree has the same log-likelihood as the original.
TEST_F(PhyloTreeTest, PrefixCoding) {
    vector<PhyloTreeNode*> nodes1 = Fasta::readFastaFile("tests/aligned.fasta");
    PhyloTree t;
    t.setEvolutionModel(new Kimura(10.0));
    t.buildRandomTree(nodes1);

    double lh1 = t.logLikelihood();
    ASSERT_LT(lh1, 0.0);

    string prefixCoded = PhyloTreeNode::prefixRepresentation(t.getRoot());

    // decode onto a fresh set of leaf nodes read from the same alignment
    vector<PhyloTreeNode*> nodes2 = Fasta::readFastaFile("tests/aligned.fasta");
    PhyloTree t2 = PhyloTree::decodePrefixNotation(nodes2, prefixCoded, new Kimura(10.0));

    // the likelihood must come from the DECODED tree; evaluating `t` again
    // would compare lh1 with itself and could never fail
    double lh2 = t2.logLikelihood();

    ASSERT_DOUBLE_EQ(lh1, lh2);
}
コード例 #8
0
ファイル: readconciler.cpp プロジェクト: ryneches/PhyloSift
/**
 * Maps each edge of a gene tree onto the best-matching edge(s) of a reference tree.
 *
 * The gene tree is taken from the second line of `treefile` (text between the
 * first and last double quote on that line).  For every gene tree edge the
 * split it induces is compared against the split of every reference tree edge;
 * the reference edges that reach the best score are written to `output_fname`
 * as "<edge number>\t<reference node name>" lines, provided fewer than
 * `placement_limit` (declared outside this function) reference edges tie for
 * the best score.
 *
 * @param reftree       reference tree
 * @param treefile      path of the file holding the gene tree
 * @param gene_map      multimap queried with reference tree node names; the
 *                      mapped values are looked up as gene tree leaf names
 * @param output_fname  path of the mapping output file
 *
 * If `treefile` cannot be opened a message is printed to cerr and the function
 * returns without writing anything.
 */
void reconcile( PhyloTree< TreeNode >& reftree, string treefile, unordered_multimap<string, string>& gene_map, string output_fname ){
	// open the gene tree file
	ifstream treein(treefile.c_str());
	if(!treein.is_open()){
		cerr << "Unable to read file " << treefile << endl;
		return;
	}

//
// read a tree with edge numberings from pplacer
// assume jplace format with treestring on second line
//
	string line;
	string treestring;
	// the first line is read and discarded; the second holds the tree
	getline( treein, line );
	getline( treein, treestring );
	// keep only what lies between the first and the last double quote
	// NOTE(review): no check that a quote was actually found (find/rfind may return npos)
	size_t qpos = treestring.find("\"");
	size_t rqpos = treestring.rfind("\"");
	treestring = treestring.substr( qpos + 1, rqpos - qpos - 1);
	stringstream treestr(treestring);
//	cout << "Trying to read " << treestring << endl;

	PhyloTree< TreeNode > tree;
	tree.readTree( treestr );
	cout << "The read tree has " << tree.size() << " nodes\n";
//
// remove edge numbers
// assume jplace format
//
	// node index -> edge number parsed from the node name
	std::unordered_map<int,int> edgenum_map;
	for(int i=0; i<tree.size(); i++){
		size_t atpos = tree[i].name.find("{");
		size_t ratpos = tree[i].name.rfind("}");
		int edgenum = -1;		
		if( atpos == string::npos ){
			// no "{...}" suffix: the whole name is parsed as the edge number
			edgenum = atoi(tree[i].name.c_str());
		}else{
			// edge number is the text between "{" and the last "}";
			// the node name is truncated to the part before "{"
			edgenum = atoi(tree[i].name.substr(atpos+1, ratpos - atpos - 1).c_str());
//			cerr << "node " << i << " edgenum is " << tree[i].name.substr(atpos+1, ratpos - atpos - 1) << " name is " << tree[i].name.substr(0, atpos) << endl;
			tree[i].name = tree[i].name.substr(0, atpos);
		}
//		cerr << "mapping " << i << " to " << edgenum << "\n";
		edgenum_map.insert(make_pair(i,edgenum));
	}
//	cerr << "Done removing edge numbers\n";

//
// construct boost graphs of the trees
//
	// NOTE(review): make_graph allocates pg.edge_array / pg.weights with new[];
	// nothing in this function releases them
	PhyloGraph pg;	
	make_graph( tree, pg );

	PhyloGraph refpg;	
	make_graph( reftree, refpg );

//
// Phase 3: construct map to reference tree
//
// a) cut gene tree on each edge
// b) compute splits at cut point
// c) cut species tree on each edge
// d) determine which species tree split matches the gene tree split best
// e) write out the split match


// plan for later...
// c) compute PD on either side of cut point
// d) logical AND splits with reftree splits
// e) compute minimum spanning tree among remaining nodes
// f) compute PD of minimum spanning trees
// 	
	// enumerate_splits is defined elsewhere; below, the split lists are indexed
	// by edge number and the vertex maps by vertex number, with -1 entries skipped
	vector< boost::dynamic_bitset<> > pg_splitlist;
	vector<Vertex> pg_vertex_map;
	enumerate_splits( pg, pg_splitlist, pg_vertex_map );
	cout << "Done with gene tree splits\n";
	vector< boost::dynamic_bitset<> > ref_splitlist;
	vector<Vertex> ref_vertex_map;
	enumerate_splits( refpg, ref_splitlist, ref_vertex_map );

	// need a mapping from vertex numbers in refpg to vertex numbers in pg
	cout << "Making gene tree map\n";
	// gene tree leaf name -> gene tree node index (only childless nodes are entered)
	unordered_map< string, int > gtmap;
	for(int i=0; i<tree.size(); i++){
		if(tree[i].children.size()==0){
			gtmap.insert(make_pair(tree[i].name, i));
		}
	}
	cout << gtmap.size() << " genes mapped\n";
	
	cout << "Making species to gene tree map\n";
	vector< vector< int > > species_to_gene_map;	// maps split IDs in species tree to split IDs in gene tree
	for(int i=0; i<refpg.V; i++){
		// reference vertices whose vertex map entry is -1 get no entry at all
		if(ref_vertex_map[i]==-1)
			continue;
		// which genes does this species contain?
		pair< unordered_multimap<string,string>::iterator, unordered_multimap<string,string>::iterator> iter;
		iter = gene_map.equal_range(reftree[i].name);
		vector<int> curmap;
		if(iter.first ==iter.second){
			// reported but not fatal: an empty list is still appended below
			cerr << "Error no mapping found for " << reftree[i].name << endl;
		}
		for(; iter.first !=iter.second; iter.first++){			
			// NOTE(review): gtmap[...] uses operator[], so a gene name that is not a
			// gene tree leaf is silently inserted and mapped to node 0
			if( pg_vertex_map[ gtmap[iter.first->second] ] == -1 )
				continue;
			curmap.push_back( pg_vertex_map[ gtmap[iter.first->second] ] );
//			cout << "mapped ref " << reftree[i].name << "\t" << ref_vertex_map[i] << " to " << curmap.back() << endl;
//			cout << "reverse map to " << tree[gtmap[iter.first->second]].name << " and " << other_map[tree[gtmap[iter.first->second]].name] << endl;
		}
		// add a list of gene vertices for this species
		species_to_gene_map.push_back(curmap);
	}
	cout << species_to_gene_map.size() << " species mapped\n";
	// NOTE(review): element 0 is accessed without checking that ref_splitlist is non-empty
	cout << "rs.size() " << ref_splitlist[0].size() << endl;

	cout << "Finding best edges\n";
	ofstream mapout(output_fname.c_str());
	for( size_t i=0; i < pg.E; i++ ){
		// for each gene tree edge, calculate mapping quality between this edge and all reftree edges
		double scoresum = 0;	// accumulated below but not read afterwards
		double bestscore = 0;
		vector<double> maxscores;	// one score per reference edge, indexed by j
//		cout << "ts1.count()\t" << pg_splitlist[i].count() << endl;
		// This block only computes qq for the commented-out debug output below;
		// it does not influence what is written to mapout.
		if(pg_splitlist[i].count() == 1){
			size_t f = pg_splitlist[i].find_first();
			int qq=0;
			for(int abc=-1; abc<(int)f; qq++)
				if(pg_vertex_map[qq]!=-1)
					abc++;
//			cout << "gene tree " << other_map[ tree[qq].name ] << " treenode " << qq << " split id " << f << " edge " << i << endl;
		}

		// the two sides of the gene tree split: the bitset and its complement
		boost::dynamic_bitset<> treesplit1 = pg_splitlist[i];
		boost::dynamic_bitset<> treesplit2 = pg_splitlist[i];
		treesplit2.flip();

		for( size_t j=0; j < refpg.E; j++ ){
			// logical AND
			// the two sides of the reference split, each passed through normalize_split
			// (defined elsewhere) together with the gene split's bit count
			boost::dynamic_bitset<> refsplit1 = ref_splitlist[j];
			boost::dynamic_bitset<> refsplit2 = ref_splitlist[j];
			refsplit2.flip();
//			cout << "rs1.count() " << refsplit1.count() << "\trs2.count() " << refsplit2.count() << endl;
			normalize_split( refsplit1, species_to_gene_map, pg_splitlist[i].size() );
			normalize_split( refsplit2, species_to_gene_map, pg_splitlist[i].size() );
//			cout << "normalized rs1.count() " << refsplit1.count() << "\trs2.count() " << refsplit2.count() << endl;
			
			// overlap of every gene side with every reference side
			boost::dynamic_bitset<> and11 = treesplit1 & refsplit1;
			boost::dynamic_bitset<> and21 = treesplit2 & refsplit1;
			boost::dynamic_bitset<> and12 = treesplit1 & refsplit2;
			boost::dynamic_bitset<> and22 = treesplit2 & refsplit2;
			// each score is the fraction of a gene side covered by a reference side;
			// the two possible pairings (1-1/2-2 and 1-2/2-1) are averaged separately
			// NOTE(review): a gene side with count() == 0 makes the divisor zero
			double a11score = (double)and11.count() / (double)treesplit1.count();
			double a22score = (double)and22.count() / (double)treesplit2.count();
			double a1122score = (a11score + a22score) / 2.0;
			double a12score = (double)and12.count() / (double)treesplit1.count();
			double a21score = (double)and21.count() / (double)treesplit2.count();
			double a1212score = (a12score + a21score) / 2.0;
			// both averages are raised to the 100th power; the larger one is this edge's score
			a1212score = pow( a1212score, 100.0 );
			a1122score = pow( a1122score, 100.0 );
			maxscores.push_back( max(a1122score, a1212score));
			scoresum += maxscores.back();
			bestscore = max(maxscores.back(), bestscore);
		}
		// count the number of nodes with the max score. if it is more than a threshold, ignore this node since it is too hard to reconcile
		int place_count = 0;
		for(size_t j=0; j<maxscores.size(); j++){
			if(maxscores[j] < bestscore)
				continue;
			place_count++;
		}
		// placement_limit is declared outside this function
		if(place_count < placement_limit ){
			// write one line per reference edge that ties for the best score
			for(size_t j=0; j<maxscores.size(); j++){
				if(maxscores[j] < bestscore)
					continue;
				string refnodename = reftree[ refpg.edge_array[j].first ].name;
	//			cout << "gene tree edge " << i << " linking " << other_map[tree[pg.edge_array[i].first].name] << " best reftree edge " << refnodename << endl; 
	//			cout << "found edge " << pg.edge_array[i].first << "\n";
				mapout << edgenum_map[pg.edge_array[i].first] << "\t" << refnodename << endl;
			}
		}
//		if(pg_splitlist[i].count() == 1)
//			return;
	}
}
コード例 #9
0
//**********************************************************************************************************************
/**
 * Computes the consensus taxonomy of one bin of sequences.
 *
 * Every name that can be resolved to a taxonomy (through the name file when
 * one is set, directly otherwise) is added to a temporary taxonomy tree.  The
 * tree is then walked from the root, always following the child with the most
 * accessions, for as long as that child's share of the bin reaches `cutoff`.
 *
 * @param names   sequence names in the bin
 * @param size    out: number of sequences added to the tree
 * @param conTax  out: consensus taxonomy string, padded with "unclassified;"
 *                up to the tree's maximum level, or "no_consensus;" if empty
 * @return the names that were added to the tree
 */
vector<string> ClassifyOtuCommand::findConsensusTaxonomy(vector<string> names, int& size, string& conTax) {
	try{
		conTax = "";
		vector<string> allNames;

		//tree that will hold the sequences of this bin
		PhyloTree* phylo = new PhyloTree();

		size = 0;
		for (int i = 0; i < names.size(); i++) {
			string& seqName = names[i];

			if (namefile == "") {
				//no name file - look the sequence up in the taxonomy file directly
				map<string, string>::iterator taxIt = taxMap.find(seqName);

				if (taxIt == taxMap.end()) { //not in taxonomy file, skip it
					m->mothurOut(seqName + " is not in your taxonomy file.  I will not include it in the consensus."); m->mothurOutEndLine();
				}else{
					if (countfile == "") {
						phylo->addSeqToTree(seqName, taxIt->second);
						size++;
					}else{
						//count file given - add the sequence once per duplicate it stands for
						int numDups = ct->getNumSeqs(seqName);
						for (int j = 0; j < numDups; j++) {  phylo->addSeqToTree(seqName, taxIt->second);  }
						size += numDups;
					}
					allNames.push_back(seqName);
				}

			}else{
				//name file given - nameMap maps seqName -> repSeqName
				map<string, string>::iterator repIt = nameMap.find(seqName);

				if (repIt == nameMap.end()) { //not in name file, skip it
					m->mothurOut(seqName + " is not in your name file.  I will not include it in the consensus."); m->mothurOutEndLine();
				}else{
					//taxonomy is looked up under the representative's name
					map<string, string>::iterator taxIt = taxMap.find(repIt->second);

					if (taxIt != taxMap.end()) {
						phylo->addSeqToTree(seqName, taxIt->second);
						size++;
						allNames.push_back(seqName);
					}else if (seqName != repIt->second) {
						m->mothurOut(seqName + " is represented by " +  repIt->second + " and is not in your taxonomy file.  I will not include it in the consensus."); m->mothurOutEndLine();
					}else{
						m->mothurOut(seqName + " is not in your taxonomy file.  I will not include it in the consensus."); m->mothurOutEndLine();
					}
				}
			}

			if (m->control_pressed) { delete phylo; return allNames; }
		}

		//build tree
		phylo->assignHeirarchyIDs(0);

		//descend from the root one level at a time
		int myLevel = 0;
		TaxNode currentNode = phylo->get(0);
		while (currentNode.children.size() != 0) {

			//pick the child with the most accessions
			TaxNode bestChild;
			int bestChildSize = 0;
			map<string, int>::iterator itChild = currentNode.children.begin();
			for (; itChild != currentNode.children.end(); itChild++) {
				TaxNode candidate = phylo->get(itChild->second);
				if (candidate.accessions.size() > bestChildSize) {
					bestChildSize = candidate.accessions.size();
					bestChild = candidate;
				}
			}

			//phylotree adds an extra unknown so we want to remove that
			if (bestChild.name == "unknown") { bestChildSize--; }

			//share of the bin assigned to the best child, as a rounded-up percentage
			int consensusConfidence = ceil((bestChildSize / (float) size) * 100);

			//stop descending as soon as the best child falls below the cutoff
			if (consensusConfidence < cutoff) { break; }

			conTax += bestChild.name;
			if (probs) { conTax += "(" + toString(consensusConfidence) + ")"; }
			conTax += ";";
			myLevel++;

			//move down a level
			currentNode = bestChild;
		}

		//pad the remaining levels
		while (myLevel != phylo->getMaxLevel()) {
			conTax += "unclassified;";
			myLevel++;
		}
		if (conTax == "") {  conTax = "no_consensus;";  }

		delete phylo;

		return allNames;

	}
	catch(exception& e) {
		m->errorOut(e, "ClassifyOtuCommand", "findConsensusTaxonomy");
		exit(1);
	}
}
コード例 #10
0
ファイル: splitmatrix.cpp プロジェクト: Cryomics-Lab/mothur
int SplitMatrix::splitClassify() {
    try {
        cutoff = int(cutoff);

        map<string, int> seqGroup;
        map<string, int>::iterator it;
        map<string, int>::iterator it2;

        int numGroups = 0;

        //build tree from users taxonomy file
        PhyloTree* phylo = new PhyloTree();

        map<string, string> temp;
        m->readTax(taxFile, temp, true);

        for (map<string, string>::iterator itTemp = temp.begin(); itTemp != temp.end();) {
            phylo->addSeqToTree(itTemp->first, itTemp->second);
            temp.erase(itTemp++);
        }

        phylo->assignHeirarchyIDs(0);

        //make sure the cutoff is not greater than maxlevel
        if (cutoff > phylo->getMaxLevel()) {
            m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel()));
            m->mothurOutEndLine();
            cutoff = phylo->getMaxLevel();
        }

        //for each node in tree
        for (int i = 0; i < phylo->getNumNodes(); i++) {

            //is this node within the cutoff
            TaxNode taxon = phylo->get(i);

            if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences
                if (taxon.accessions.size() > 1) { //if this taxon just has one seq its a singleton
                    for (int j = 0; j < taxon.accessions.size(); j++) {
                        seqGroup[taxon.accessions[j]] = numGroups;
                    }
                    numGroups++;
                }
            }
        }

        delete phylo;

        if (method == "classify") {
            splitDistanceFileByTax(seqGroup, numGroups);
        } else {
            createDistanceFilesFromTax(seqGroup, numGroups);
        }

        return 0;

    }
    catch(exception& e) {
        m->errorOut(e, "SplitMatrix", "splitClassify");
        exit(1);
    }
}
コード例 #11
0
ファイル: ratefree.cpp プロジェクト: bqminh/IQ-TREE
/**
 * Optimizes the category proportions (prop) and category rates (rates) of this
 * free-rate model with an EM loop of at most `ncategory` steps.
 *
 * Each step computes per-pattern, per-category likelihoods on phylo_tree,
 * turns them into posterior weights (E-step), derives new proportions from
 * them (M-step), and then re-optimizes each category's rate on a scratch copy
 * of the tree whose pattern frequencies are replaced by that category's
 * posterior weights.  The loop ends early on convergence or as soon as any
 * new proportion falls below MIN_PROP.
 *
 * @return the value of phylo_tree->computeLikelihood() after the loop
 */
double RateFree::optimizeWithEM() {
    size_t ptn, c;
    size_t nptn = phylo_tree->aln->getNPattern();
    size_t nmix = ncategory;
    const double MIN_PROP = 1e-4;   // lower bound enforced on every proportion
    
//    double *lk_ptn = aligned_alloc<double>(nptn);
    double *new_prop = aligned_alloc<double>(nmix);   // released with aligned_free at the end
    PhyloTree *tree = new PhyloTree;                  // scratch tree, deleted at the end

    // attach memory to save space
//    tree->central_partial_lh = phylo_tree->central_partial_lh;
//    tree->central_scale_num = phylo_tree->central_scale_num;
//    tree->central_partial_pars = phylo_tree->central_partial_pars;

    // configure the scratch tree like phylo_tree
    tree->copyPhyloTree(phylo_tree);
    tree->optimize_by_newton = phylo_tree->optimize_by_newton;
    tree->setParams(phylo_tree->params);
    tree->setLikelihoodKernel(phylo_tree->sse);
    tree->setNumThreads(phylo_tree->num_threads);

    // initialize model
    // NOTE(review): model_fac and site_rate are allocated with new and never
    // deleted in this function; whether `delete tree` releases them is not
    // visible here -- confirm.
    ModelFactory *model_fac = new ModelFactory();
    model_fac->joint_optimize = phylo_tree->params->optimize_model_rate_joint;
//    model_fac->unobserved_ptns = phylo_tree->getModelFactory()->unobserved_ptns;

    RateHeterogeneity *site_rate = new RateHeterogeneity; 
    tree->setRate(site_rate);
    site_rate->setTree(tree);
            
    model_fac->site_rate = site_rate;
    tree->model_factory = model_fac;
    tree->setParams(phylo_tree->params);   // second call with the same argument as above
    double old_score = 0.0;
    // EM algorithm loop described in Wang, Li, Susko, and Roger (2008)
    for (int step = 0; step < ncategory; step++) {
        // first compute _pattern_lh_cat
        double score;
        score = phylo_tree->computePatternLhCat(WSL_RATECAT);
        // dump diagnostics before the assertion below fires
        if (score > 0.0) {
            phylo_tree->printTree(cout, WT_BR_LEN+WT_NEWLINE);
            writeInfo(cout);
        }
        ASSERT(score < 0);
        
        // from the second step on, the score must not drop by 0.1 or more
        if (step > 0) {
            if (score <= old_score-0.1) {
                phylo_tree->printTree(cout, WT_BR_LEN+WT_NEWLINE);
                writeInfo(cout);
                cout << "Partition " << phylo_tree->aln->name << endl;
                cout << "score: " << score << "  old_score: " << old_score << endl;
            }
            ASSERT(score > old_score-0.1);
        }
            
        old_score = score;
        
        memset(new_prop, 0, nmix*sizeof(double));
                
        // E-step
        // decoupled weights (prop) from _pattern_lh_cat to obtain L_ci and compute pattern likelihood L_i
        for (ptn = 0; ptn < nptn; ptn++) {
            // row of nmix category likelihoods for this pattern
            double *this_lk_cat = phylo_tree->_pattern_lh_cat + ptn*nmix;
            // pattern likelihood = invariant-site term + sum over categories
            double lk_ptn = phylo_tree->ptn_invar[ptn];
            for (c = 0; c < nmix; c++) {
                lk_ptn += this_lk_cat[c];
            }
            ASSERT(lk_ptn != 0.0);
            // from here on lk_ptn is the scaling factor (pattern frequency / pattern likelihood)
            lk_ptn = phylo_tree->ptn_freq[ptn] / lk_ptn;
            
            // transform _pattern_lh_cat into posterior probabilities of each category
            // (scaled by the pattern frequency) and accumulate them per category
            for (c = 0; c < nmix; c++) {
                this_lk_cat[c] *= lk_ptn;
                new_prop[c] += this_lk_cat[c];
            }
            
        } 
        
        // M-step, update weights according to (*)
        int maxpropid = 0;          // index of the largest new proportion
        double new_pinvar = 0.0;    
        for (c = 0; c < nmix; c++) {
            new_prop[c] = new_prop[c] / phylo_tree->getAlnNSite();
            if (new_prop[c] > new_prop[maxpropid])
                maxpropid = c;
        }
        // regularize prop
        // raise every proportion below MIN_PROP to MIN_PROP and take the
        // difference away from the largest proportion
        bool zero_prop = false;
        for (c = 0; c < nmix; c++) {
            if (new_prop[c] < MIN_PROP) {
                new_prop[maxpropid] -= (MIN_PROP - new_prop[c]);
                new_prop[c] = MIN_PROP;
                zero_prop = true;
            }
        }
        // break if some probabilities too small
        // (this leaves the loop before prop[] is updated from new_prop[])
        if (zero_prop) break;

        bool converged = true;
        double sum_prop = 0.0;
        for (c = 0; c < nmix; c++) {
//            new_prop[c] = new_prop[c] / phylo_tree->getAlnNSite();
            // check for convergence
            sum_prop += new_prop[c];
            converged = converged && (fabs(prop[c]-new_prop[c]) < 1e-4);
            prop[c] = new_prop[c];
            new_pinvar += new_prop[c];
        }

        // whatever mass the categories do not cover is the invariant proportion
        new_pinvar = 1.0 - new_pinvar;

        // only applied when the model currently has a non-zero p-invar
        if (new_pinvar > 1e-4 && getPInvar() != 0.0) {
            converged = converged && (fabs(getPInvar()-new_pinvar) < 1e-4);
            if (isFixPInvar())
                outError("Fixed given p-invar is not supported");
            setPInvar(new_pinvar);
//            setOptimizePInvar(false);
            phylo_tree->computePtnInvar();
        }
        
        ASSERT(fabs(sum_prop+new_pinvar-1.0) < MIN_PROP);
        
        // now optimize rates one by one
        double sum = 0.0;   // accumulates prop[c]*rates[c]; not read afterwards
        for (c = 0; c < nmix; c++) {
            tree->copyPhyloTree(phylo_tree);
            // pick the substitution model for this category: the c-th mixture
            // class when model and rates are fused, the tree's model otherwise
            ModelMarkov *subst_model;
            if (phylo_tree->getModel()->isMixture() && phylo_tree->getModelFactory()->fused_mix_rate)
                subst_model = (ModelMarkov*)phylo_tree->getModel()->getMixtureClass(c);
            else
                subst_model = (ModelMarkov*)phylo_tree->getModel();
            // temporarily attach the model to the scratch tree
            tree->setModel(subst_model);
            subst_model->setTree(tree);
            model_fac->model = subst_model;
            if (subst_model->isMixture() || subst_model->isSiteSpecificModel() || !subst_model->isReversible())
                tree->setLikelihoodKernel(phylo_tree->sse);

                        
            // initialize likelihood
            tree->initializeAllPartialLh();
            // copy posterior probability into ptn_freq
            tree->computePtnFreq();
            // column c of _pattern_lh_cat, read with a stride of nmix
            double *this_lk_cat = phylo_tree->_pattern_lh_cat+c;
            for (ptn = 0; ptn < nptn; ptn++)
                tree->ptn_freq[ptn] = this_lk_cat[ptn*nmix];
            // scale the scratch tree by the current rate, then let the tree
            // optimize the scaling factor; the result becomes the new rate
            double scaling = rates[c];
            tree->scaleLength(scaling);
            tree->optimizeTreeLengthScaling(MIN_PROP, scaling, 1.0/prop[c], 0.001);
            converged = converged && (fabs(rates[c] - scaling) < 1e-4);
            rates[c] = scaling;
            sum += prop[c] * rates[c];
            // reset subst model
            // detach it from the scratch tree and hand it back to phylo_tree
            tree->setModel(NULL);
            subst_model->setTree(phylo_tree);
            
        }
        
        phylo_tree->clearAllPartialLH();
        if (converged) break;
    }
    
        // sort the rates in increasing order
        // (prop is passed along as the companion array)
    if (sorted_rates)
        quicksort(rates, 0, ncategory-1, prop);
    
    // deattach memory
//    tree->central_partial_lh = NULL;
//    tree->central_scale_num = NULL;
//    tree->central_partial_pars = NULL;

    delete tree;
    aligned_free(new_prop);
    return phylo_tree->computeLikelihood();
}
コード例 #12
0
int main(int argc, char *argv[])
{

  time_t rawtime;
  struct tm * timeinfo;

  time ( &rawtime );
  timeinfo = localtime ( &rawtime );
  //  std::cout << "Start " << asctime (timeinfo);

  /////////////////////////////////////////
  // Setup
  if( argc < 10 || argc > 12){
    std::cout << "Wrong number of input arguments (" << argc << "), should have format:\n";
    std::cout << "\ttree_to_matrix <infile> <tmpfile> <prunedfile> <refalignment> <outfile> <starting_row> <ending_row> <format M=matrix E=esprit> <Do_Pruning 0=no 1=yes 2=only prune> [outfile_freq] [maxdistance(E format only)]\n";
  }
  char* infilename     = argv[1];    // (input)  Tree file with reference sequences
  char* tempfilename   = argv[2];    // (output) Half-pruned file (after pruning, before cleaning up single-child nodes and internal nodes which have become leaves
  char* prunedfilename = argv[3];    // (in/out) Pruned file name, input if not pruning, output if pruning
  char* refalignname   = argv[4];    // (input)  reference fasta file, only uses the sequence identifiers for pruning
  char* outfilename    = argv[5];    // (output) Output distance matrix/list file name
  int startrow    = atoi(argv[6]);   // (input)  First row to print for the distance matrix (0 for all)
  int endrow      = atoi(argv[7]);   // (input)  Last  row to print for the distance matrix (0 for all)
  char format          = argv[8][0]; // (input)  Format of distance, M=matrix, E=ESPRIT list
  int do_pruning  = atoi(argv[9]);   // (input)  0=no 1=yes 2=only prune
  //    M = matrix format, used by mothur
  //    E = ESPRIT list format
  char* frqfilename;
  float maxdist=0.1;
  if( argc == 12 ){
    frqfilename      = argv[10];     // (Optional output) Frequency file name, used when running ESPRIT
    maxdist     = atof(argv[11]);    // (Optional input)  Maximum distance to print in the distance list (ESPRIT format only)
    std::cout << frqfilename << " " << maxdist << std::endl;
  } else {
    if( format == 'E' ){
      std::cerr << "maximum distance required for ESPRIT printout; quitting\n";
      return EXIT_FAILURE;
    }
  }
  int srow = startrow;
  char* inname;
  if( do_pruning>0 ){
    // Read in raw file, then prune it
    inname = infilename;
  } else {
    // Read in pruned file directly
    inname = prunedfilename;
  }
  if( format == 'E' ){
    std::cout << "Printing output in ESPRIT list format\n";
  } else if( format == 'M' ){
    std::cout << "Printing output in Mothur matrix format\n";
  } else {
    std::cerr << "Unknown format " << format << ". Quitting\n";
    return EXIT_FAILURE;
  }
  std::list<TreeNode>::iterator startit;
  std::list<TreeNode>::iterator endit;

  /////////////////////////////////////////
  // READ IN TREE FROM FILE
  std::cout << "Reading in " << inname << std::endl;
  PhyloTree<TreeNode>* tr = new PhyloTree<TreeNode>();
  std::ifstream infile;
  infile.open(inname);
  if( !infile.is_open() ){ std::cout << "Unable to open file " << inname << std::endl; }
  tr->readTree(infile);
  std::cout << "LEAVES: " << tr->getNleaves() << std::endl;
  tr->check_root();
  /////////////////////////////////////////
  // Prune tree (if necessary)
  if( do_pruning>0 ){
    std::cout << "Pruning tree\n";
    // Read in reference alignment file and grab reference file names
    std::ifstream reffile;
    reffile.open(refalignname);
    if( !reffile.is_open() ){ std::cout << "Unable to open file " << refalignname << std::endl; }
    char line[100];
    reffile >> line;
    while( !reffile.eof() ){
      if( line[0] == '>' ){
	// Clean-up the file name
	std::string name(line);
	int slash = (int)name.find("/");
	name = name.substr(1, slash-1);
	int bar = (int)name.find("|");
	if ( bar != name.npos ){
	  name = name.replace(bar, 1, "_");
	}
	// Remove this leaf from the tree
	tr->deleteLeaf(name.c_str());
      }
      reffile >> line;
    }
    reffile.close();

    // Print to tmp file, just in case 
    std::ofstream treeout;
    treeout.open( tempfilename );
    if( !treeout.is_open() ){ std::cout << "Unable to open file " << tempfilename << std::endl; }
    treeout.precision(5);
    treeout.setf(std::ios::fixed,std::ios::floatfield);
    tr->writeTree( treeout );
    treeout.close();
    std::cout << "Printed to file " << tempfilename << std::endl;
    
    // Remove internal nodes that are now leaves
    while( tr->deleteLeaf("") > 0 );

    // Smooth to remove single child nodes
    while( tr->smooth() > 0 );

    // Check that the root doesn't have only one node
    tr->check_root();

    // Print pruned file, for use by parallel jobs
    treeout.open( prunedfilename );
    if( !treeout.is_open() ){ std::cout << "Unable to open file " << prunedfilename << std::endl; }
    treeout.precision(6);
    treeout.setf(std::ios::fixed,std::ios::floatfield);
    tr->writeTree( treeout );
    treeout.close();
    std::cout << "Printed to file " << prunedfilename << std::endl;

    // If I only needed to prune then I'm done
    if( do_pruning>1 ){
      std::cout << "Done pruning tips, ready to launch parallel tree_to_matrix jobs\n";
      return EXIT_SUCCESS;
    }
  }