/**
 * Converts a phylogenetic tree into a boost graph stored in `pg`.
 *
 * Each node that has at least one parent contributes one edge joining the
 * node to its first parent, weighted by the node's `distance`.  The edge and
 * weight arrays are allocated here with new[] and handed to `pg`; nothing in
 * this function frees them.
 *
 * @param tree  tree to convert
 * @param pg    receives the vertex count, edge count, edge array, weight
 *              array, graph and edge-weight property map
 */
void make_graph( PhyloTree< TreeNode >& tree, PhyloGraph& pg ){
	// use boost's graph algorithms
	const size_t node_count = tree.size();
	// An empty tree has no edges.  Subtracting 1 from an unsigned zero would
	// wrap around and request an enormous allocation, so clamp at zero.
	const size_t edge_count = node_count > 0 ? node_count - 1 : 0;
	pg.V = node_count;
	pg.E = edge_count;
	pg.edge_array = new Edge[ edge_count ];
	// value-initialise so that any slot left unfilled below reads as 0
	// instead of an indeterminate value
	pg.weights = new double[ edge_count ]();

	size_t eI = 0;
	// The eI bound keeps the writes inside the arrays even when every node
	// reports a parent (i.e. the input has no parentless root).
	for( size_t vI = 0; vI < node_count && eI < edge_count; ++vI )
	{
		if( tree[vI].parents.size() != 0 )
		{
			pg.edge_array[eI] = Edge( vI, tree[vI].parents[0] );
			pg.weights[eI] = tree[vI].distance;
			eI++;
		}
	}

	pg.g = Graph(pg.edge_array, pg.edge_array + pg.E, pg.V);

	// add edge weights
	// NOTE(review): weights are handed out in the order boost::edges() visits
	// the edges, which is assumed to match edge_array order -- confirm for
	// the Graph type in use.
	pg.w = get(boost::edge_weight, pg.g);
	double *wp = pg.weights;
	boost::graph_traits < Graph >::edge_iterator e, e_end;
	for (boost::tie(e, e_end) = boost::edges(pg.g); e != e_end; ++e)
		pg.w[*e] = *wp++;
}
int main(int argc, char** argv){ if(argc < 3){ cerr << "Usage: readconciler <reference tree> <gene tree> <gene to species map> <mapping output file>\n"; return -1; } // read ref tree string reftreefile(argv[1]); ifstream reftreein(reftreefile.c_str()); if(!reftreein.is_open()){ cerr << "Unable to read file " << reftreefile << endl; return -2; } PhyloTree< TreeNode > reftree; reftree.readTree( reftreein ); cout << "The reference tree has " << reftree.size() << " nodes\n"; // read map ifstream mapin(argv[3]); if(!mapin.is_open()){ cerr << "Unable to read file " << argv[3] << endl; return -3; } unordered_multimap<string, string> gene_map; string line; while( getline(mapin, line) ){ stringstream line_str(line); string gene; string species; getline(line_str, species, '\t'); getline(line_str, species, '\t'); getline(line_str, gene); gene_map.insert(make_pair(species, gene)); other_map.insert(make_pair(gene,species)); } // read & reconcile each of the read trees string genetreefile = argv[2]; reconcile( reftree, genetreefile, gene_map, argv[4] ); return 0; }
// Builds a random tree over freshly allocated leaves and checks the height
// bounds, the root, and that every leaf ended up attached to a parent.
TEST_F(PhyloTreeTest, RandomTree) {
	const unsigned int n = 30;

	vector<PhyloTreeNode*> leaves;
	while (leaves.size() < n) {
		leaves.push_back(new PhyloTreeNode());
	}

	PhyloTree t;
	t.buildRandomTree(leaves);

	// the height must be strictly greater than log2(n) ...
	ASSERT_LT(log2(n), t.height());
	// ... and must not exceed n+1
	ASSERT_GE(n+1, t.height());

	// the node reported as root must itself be parentless
	ASSERT_TRUE(t.getRoot()->isRoot());

	// none of the leaves handed in may still be a root afterwards
	for (vector<PhyloTreeNode*>::size_type leafIdx = 0; leafIdx != n; ++leafIdx) {
		ASSERT_TRUE(!leaves.at(leafIdx)->isRoot());
	}
}
int main( int argc, char* argv[] ) { if( argc < 3 ) { cerr << "Usage: uniquifyTrees <nexus input file> <nexus output file>\n"; cerr << "All trees in the input file must have the same number of taxa and the same taxon labels\n"; } string input_filename = argv[1]; string output_filename = argv[2]; ifstream input_file( input_filename.c_str() ); if( !input_file.is_open() ) { cerr << "Error opening \"" << input_filename << "\"\n"; return -1; } ofstream output_file( output_filename.c_str() ); if( !output_file.is_open() ) { cerr << "Error opening \"" << output_filename << "\"\n"; return -1; } size_t tree_sizes = 0; uint tree_count = 0; vector< string > tree_list; while( true ) { PhyloTree< TreeNode > t; t.readTree( input_file ); if( t.size() == 0 ) break; if( tree_sizes == 0 ) tree_sizes = t.size(); if( t.size() != tree_sizes ) { cerr << "Error: tree " << tree_count + 1 << " has a different number of taxa\n"; return -2; } sortTaxa( t ); relabelTaxaToStartWithZero( t ); stringstream ss; t.writeTree(ss); tree_list.push_back(ss.str()); cout << "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b"; cout << "Read " << tree_list.size() << " trees"; } cout << endl; cout << "Writing unique trees to \"" << output_filename << "\"\n"; sort(tree_list.begin(), tree_list.end() ); size_t unique_count = 0; for( size_t treeI = 0; treeI < tree_list.size(); treeI++ ) { if( treeI > 0 && tree_list[treeI] == tree_list[treeI - 1] ) continue; output_file << tree_list[treeI] << endl; unique_count++; } cerr << "There are " << unique_count << " unique trees\n"; return 0; }
/**
 * Assumes that taxa have numeric labels starting at 1 and simply
 * subtracts 1 from each node label.
 *
 * Nodes with an empty name are skipped.  A label that does not start with
 * a number, or whose number is already 0, is left untouched: decrementing
 * it would wrap the unsigned value around (and, for a failed extraction,
 * could read a value that was never set).
 *
 * @param t  tree whose node names are rewritten in place
 */
void relabelTaxaToStartWithZero( PhyloTree< TreeNode >& t )
{
	for( node_id_t nodeI = 0; nodeI < t.size(); nodeI++ )
	{
		if( t[nodeI].name == "" )
			continue;
		stringstream name_str( t[nodeI].name );
		uint number = 0;
		if( !(name_str >> number) || number == 0 )
			continue;	// nothing sensible to decrement
		stringstream new_name_str;
		new_name_str << (number - 1);
		t[nodeI].name = new_name_str.str();
	}
}
/**
 * Reorders the children of every node that has children.
 *
 * For each child, getRepresentativeTaxon() picks a node of that child's
 * subtree; the children are then arranged in the order that
 * TaxonNamePairComparator gives to the names of those representatives.
 *
 * @param t  tree whose child lists are rewritten in place
 */
void sortTaxa( PhyloTree< TreeNode >& t )
{
	for( node_id_t parentI = 0; parentI < t.size(); parentI++ )
	{
		const size_t child_count = t[parentI].children.size();
		if( child_count == 0 )
			continue;

		// key every child position by the name of its subtree's representative
		vector< pair<string, node_id_t> > keyed_children;
		keyed_children.reserve( child_count );
		for( size_t childI = 0; childI < child_count; childI++ )
		{
			node_id_t rep_node = getRepresentativeTaxon( t, t[parentI].children[ childI ] );
			keyed_children.push_back( pair<string, node_id_t>( t[rep_node].name, childI ) );
		}

		// order the (name, position) pairs by representative taxon name
		TaxonNamePairComparator by_name;
		sort( keyed_children.begin(), keyed_children.end(), by_name );

		// rebuild the child list following the sorted positions
		vector< node_id_t > reordered;
		reordered.reserve( child_count );
		for( size_t rankI = 0; rankI < child_count; rankI++ )
		{
			reordered.push_back( t[parentI].children[ keyed_children[rankI].second ] );
		}
		t[parentI].children = reordered;
	}
}
// Round-trips a random tree through its prefix representation and checks
// that the decoded tree has the same log-likelihood as the original.
TEST_F(PhyloTreeTest, PrefixCoding) {
	vector<PhyloTreeNode*> nodes1 = Fasta::readFastaFile("tests/aligned.fasta");

	PhyloTree t;
	t.setEvolutionModel(new Kimura(10.0));
	t.buildRandomTree(nodes1);
	double lh1 = t.logLikelihood();
	ASSERT_LT(lh1, 0.0);

	string prefixCoded = PhyloTreeNode::prefixRepresentation(t.getRoot());

	// decode onto a second, independently loaded set of nodes
	vector<PhyloTreeNode*> nodes2 = Fasta::readFastaFile("tests/aligned.fasta");
	PhyloTree t2 = PhyloTree::decodePrefixNotation(nodes2, prefixCoded, new Kimura(10.0));

	// evaluate the DECODED tree; asking `t` again would compare the original
	// tree with itself and never exercise decodePrefixNotation()
	double lh2 = t2.logLikelihood();
	ASSERT_DOUBLE_EQ(lh1, lh2);
}
void reconcile( PhyloTree< TreeNode >& reftree, string treefile, unordered_multimap<string, string>& gene_map, string output_fname ){ // read ref tree ifstream treein(treefile.c_str()); if(!treein.is_open()){ cerr << "Unable to read file " << treefile << endl; return; } // // read a tree with edge numberings from pplacer // assume jplace format with treestring on second line // string line; string treestring; getline( treein, line ); getline( treein, treestring ); size_t qpos = treestring.find("\""); size_t rqpos = treestring.rfind("\""); treestring = treestring.substr( qpos + 1, rqpos - qpos - 1); stringstream treestr(treestring); // cout << "Trying to read " << treestring << endl; PhyloTree< TreeNode > tree; tree.readTree( treestr ); cout << "The read tree has " << tree.size() << " nodes\n"; // // remove edge numbers // assume jplace format // std::unordered_map<int,int> edgenum_map; for(int i=0; i<tree.size(); i++){ size_t atpos = tree[i].name.find("{"); size_t ratpos = tree[i].name.rfind("}"); int edgenum = -1; if( atpos == string::npos ){ edgenum = atoi(tree[i].name.c_str()); }else{ edgenum = atoi(tree[i].name.substr(atpos+1, ratpos - atpos - 1).c_str()); // cerr << "node " << i << " edgenum is " << tree[i].name.substr(atpos+1, ratpos - atpos - 1) << " name is " << tree[i].name.substr(0, atpos) << endl; tree[i].name = tree[i].name.substr(0, atpos); } // cerr << "mapping " << i << " to " << edgenum << "\n"; edgenum_map.insert(make_pair(i,edgenum)); } // cerr << "Done removing edge numbers\n"; // // construct boost graphs of the trees // PhyloGraph pg; make_graph( tree, pg ); PhyloGraph refpg; make_graph( reftree, refpg ); // // Phase 3: construct map to reference tree // // a) cut gene tree on each edge // b) compute splits at cut point // c) cut species tree on each edge // d) determine which species tree split matches the gene tree split best // e) write out the split match // plan for later... 
// c) compute PD on either side of cut point // d) logical AND splits with reftree splits // e) compute minimum spanning tree among remaining nodes // f) compute PD of minimum spanning trees // vector< boost::dynamic_bitset<> > pg_splitlist; vector<Vertex> pg_vertex_map; enumerate_splits( pg, pg_splitlist, pg_vertex_map ); cout << "Done with gene tree splits\n"; vector< boost::dynamic_bitset<> > ref_splitlist; vector<Vertex> ref_vertex_map; enumerate_splits( refpg, ref_splitlist, ref_vertex_map ); // need a mapping from vertex numbers in refpg to vertex numbers in pg cout << "Making gene tree map\n"; unordered_map< string, int > gtmap; for(int i=0; i<tree.size(); i++){ if(tree[i].children.size()==0){ gtmap.insert(make_pair(tree[i].name, i)); } } cout << gtmap.size() << " genes mapped\n"; cout << "Making species to gene tree map\n"; vector< vector< int > > species_to_gene_map; // maps split IDs in species tree to split IDs in gene tree for(int i=0; i<refpg.V; i++){ if(ref_vertex_map[i]==-1) continue; // which genes does this species contain? 
pair< unordered_multimap<string,string>::iterator, unordered_multimap<string,string>::iterator> iter; iter = gene_map.equal_range(reftree[i].name); vector<int> curmap; if(iter.first ==iter.second){ cerr << "Error no mapping found for " << reftree[i].name << endl; } for(; iter.first !=iter.second; iter.first++){ if( pg_vertex_map[ gtmap[iter.first->second] ] == -1 ) continue; curmap.push_back( pg_vertex_map[ gtmap[iter.first->second] ] ); // cout << "mapped ref " << reftree[i].name << "\t" << ref_vertex_map[i] << " to " << curmap.back() << endl; // cout << "reverse map to " << tree[gtmap[iter.first->second]].name << " and " << other_map[tree[gtmap[iter.first->second]].name] << endl; } // add a list of gene vertices for this species species_to_gene_map.push_back(curmap); } cout << species_to_gene_map.size() << " species mapped\n"; cout << "rs.size() " << ref_splitlist[0].size() << endl; cout << "Finding best edges\n"; ofstream mapout(output_fname.c_str()); for( size_t i=0; i < pg.E; i++ ){ // for each reftree edge, calculate mapping quality between this edge and reftree edges double scoresum = 0; double bestscore = 0; vector<double> maxscores; // cout << "ts1.count()\t" << pg_splitlist[i].count() << endl; if(pg_splitlist[i].count() == 1){ size_t f = pg_splitlist[i].find_first(); int qq=0; for(int abc=-1; abc<(int)f; qq++) if(pg_vertex_map[qq]!=-1) abc++; // cout << "gene tree " << other_map[ tree[qq].name ] << " treenode " << qq << " split id " << f << " edge " << i << endl; } boost::dynamic_bitset<> treesplit1 = pg_splitlist[i]; boost::dynamic_bitset<> treesplit2 = pg_splitlist[i]; treesplit2.flip(); for( size_t j=0; j < refpg.E; j++ ){ // logical AND boost::dynamic_bitset<> refsplit1 = ref_splitlist[j]; boost::dynamic_bitset<> refsplit2 = ref_splitlist[j]; refsplit2.flip(); // cout << "rs1.count() " << refsplit1.count() << "\trs2.count() " << refsplit2.count() << endl; normalize_split( refsplit1, species_to_gene_map, pg_splitlist[i].size() ); normalize_split( 
refsplit2, species_to_gene_map, pg_splitlist[i].size() ); // cout << "normalized rs1.count() " << refsplit1.count() << "\trs2.count() " << refsplit2.count() << endl; boost::dynamic_bitset<> and11 = treesplit1 & refsplit1; boost::dynamic_bitset<> and21 = treesplit2 & refsplit1; boost::dynamic_bitset<> and12 = treesplit1 & refsplit2; boost::dynamic_bitset<> and22 = treesplit2 & refsplit2; double a11score = (double)and11.count() / (double)treesplit1.count(); double a22score = (double)and22.count() / (double)treesplit2.count(); double a1122score = (a11score + a22score) / 2.0; double a12score = (double)and12.count() / (double)treesplit1.count(); double a21score = (double)and21.count() / (double)treesplit2.count(); double a1212score = (a12score + a21score) / 2.0; a1212score = pow( a1212score, 100.0 ); a1122score = pow( a1122score, 100.0 ); maxscores.push_back( max(a1122score, a1212score)); scoresum += maxscores.back(); bestscore = max(maxscores.back(), bestscore); } // count the number of nodes with the max score. if it is more than a threshold, ignore this node since it is too hard to reconcile int place_count = 0; for(size_t j=0; j<maxscores.size(); j++){ if(maxscores[j] < bestscore) continue; place_count++; } if(place_count < placement_limit ){ for(size_t j=0; j<maxscores.size(); j++){ if(maxscores[j] < bestscore) continue; string refnodename = reftree[ refpg.edge_array[j].first ].name; // cout << "gene tree edge " << i << " linking " << other_map[tree[pg.edge_array[i].first].name] << " best reftree edge " << refnodename << endl; // cout << "found edge " << pg.edge_array[i].first << "\n"; mapout << edgenum_map[pg.edge_array[i].first] << "\t" << refnodename << endl; } } // if(pg_splitlist[i].count() == 1) // return; } }
//********************************************************************************************************************** vector<string> ClassifyOtuCommand::findConsensusTaxonomy(vector<string> names, int& size, string& conTax) { try{ conTax = ""; vector<string> allNames; map<string, string>::iterator it; map<string, string>::iterator it2; //create a tree containing sequences from this bin PhyloTree* phylo = new PhyloTree(); size = 0; for (int i = 0; i < names.size(); i++) { //if namesfile include the names if (namefile != "") { //is this sequence in the name file - namemap maps seqName -> repSeqName it2 = nameMap.find(names[i]); if (it2 == nameMap.end()) { //this name is not in name file, skip it m->mothurOut(names[i] + " is not in your name file. I will not include it in the consensus."); m->mothurOutEndLine(); }else{ //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique it = taxMap.find(it2->second); if (it == taxMap.end()) { //this name is not in taxonomy file, skip it if (names[i] != it2->second) { m->mothurOut(names[i] + " is represented by " + it2->second + " and is not in your taxonomy file. I will not include it in the consensus."); m->mothurOutEndLine(); } else { m->mothurOut(names[i] + " is not in your taxonomy file. I will not include it in the consensus."); m->mothurOutEndLine(); } }else{ //add seq to tree phylo->addSeqToTree(names[i], it->second); size++; allNames.push_back(names[i]); } } }else{ //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique it = taxMap.find(names[i]); if (it == taxMap.end()) { //this name is not in taxonomy file, skip it m->mothurOut(names[i] + " is not in your taxonomy file. 
I will not include it in the consensus."); m->mothurOutEndLine(); }else{ if (countfile != "") { int numDups = ct->getNumSeqs(names[i]); for (int j = 0; j < numDups; j++) { phylo->addSeqToTree(names[i], it->second); } size += numDups; }else{ //add seq to tree phylo->addSeqToTree(names[i], it->second); size++; } allNames.push_back(names[i]); } } if (m->control_pressed) { delete phylo; return allNames; } } //build tree phylo->assignHeirarchyIDs(0); TaxNode currentNode = phylo->get(0); int myLevel = 0; //at each level while (currentNode.children.size() != 0) { //you still have more to explore TaxNode bestChild; int bestChildSize = 0; //go through children for (map<string, int>::iterator itChild = currentNode.children.begin(); itChild != currentNode.children.end(); itChild++) { TaxNode temp = phylo->get(itChild->second); //select child with largest accesions - most seqs assigned to it if (temp.accessions.size() > bestChildSize) { bestChild = phylo->get(itChild->second); bestChildSize = temp.accessions.size(); } } //phylotree adds an extra unknown so we want to remove that if (bestChild.name == "unknown") { bestChildSize--; } //is this taxonomy above cutoff int consensusConfidence = ceil((bestChildSize / (float) size) * 100); if (consensusConfidence >= cutoff) { //if yes, add it if (probs) { conTax += bestChild.name + "(" + toString(consensusConfidence) + ");"; }else{ conTax += bestChild.name + ";"; } myLevel++; }else{ //if no, quit break; } //move down a level currentNode = bestChild; } if (myLevel != phylo->getMaxLevel()) { while (myLevel != phylo->getMaxLevel()) { conTax += "unclassified;"; myLevel++; } } if (conTax == "") { conTax = "no_consensus;"; } delete phylo; return allNames; } catch(exception& e) { m->errorOut(e, "ClassifyOtuCommand", "findConsensusTaxonomy"); exit(1); } }
int SplitMatrix::splitClassify() { try { cutoff = int(cutoff); map<string, int> seqGroup; map<string, int>::iterator it; map<string, int>::iterator it2; int numGroups = 0; //build tree from users taxonomy file PhyloTree* phylo = new PhyloTree(); map<string, string> temp; m->readTax(taxFile, temp, true); for (map<string, string>::iterator itTemp = temp.begin(); itTemp != temp.end();) { phylo->addSeqToTree(itTemp->first, itTemp->second); temp.erase(itTemp++); } phylo->assignHeirarchyIDs(0); //make sure the cutoff is not greater than maxlevel if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); } //for each node in tree for (int i = 0; i < phylo->getNumNodes(); i++) { //is this node within the cutoff TaxNode taxon = phylo->get(i); if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences if (taxon.accessions.size() > 1) { //if this taxon just has one seq its a singleton for (int j = 0; j < taxon.accessions.size(); j++) { seqGroup[taxon.accessions[j]] = numGroups; } numGroups++; } } } delete phylo; if (method == "classify") { splitDistanceFileByTax(seqGroup, numGroups); } else { createDistanceFilesFromTax(seqGroup, numGroups); } return 0; } catch(exception& e) { m->errorOut(e, "SplitMatrix", "splitClassify"); exit(1); } }
double RateFree::optimizeWithEM() { size_t ptn, c; size_t nptn = phylo_tree->aln->getNPattern(); size_t nmix = ncategory; const double MIN_PROP = 1e-4; // double *lk_ptn = aligned_alloc<double>(nptn); double *new_prop = aligned_alloc<double>(nmix); PhyloTree *tree = new PhyloTree; // attach memory to save space // tree->central_partial_lh = phylo_tree->central_partial_lh; // tree->central_scale_num = phylo_tree->central_scale_num; // tree->central_partial_pars = phylo_tree->central_partial_pars; tree->copyPhyloTree(phylo_tree); tree->optimize_by_newton = phylo_tree->optimize_by_newton; tree->setParams(phylo_tree->params); tree->setLikelihoodKernel(phylo_tree->sse); tree->setNumThreads(phylo_tree->num_threads); // initialize model ModelFactory *model_fac = new ModelFactory(); model_fac->joint_optimize = phylo_tree->params->optimize_model_rate_joint; // model_fac->unobserved_ptns = phylo_tree->getModelFactory()->unobserved_ptns; RateHeterogeneity *site_rate = new RateHeterogeneity; tree->setRate(site_rate); site_rate->setTree(tree); model_fac->site_rate = site_rate; tree->model_factory = model_fac; tree->setParams(phylo_tree->params); double old_score = 0.0; // EM algorithm loop described in Wang, Li, Susko, and Roger (2008) for (int step = 0; step < ncategory; step++) { // first compute _pattern_lh_cat double score; score = phylo_tree->computePatternLhCat(WSL_RATECAT); if (score > 0.0) { phylo_tree->printTree(cout, WT_BR_LEN+WT_NEWLINE); writeInfo(cout); } ASSERT(score < 0); if (step > 0) { if (score <= old_score-0.1) { phylo_tree->printTree(cout, WT_BR_LEN+WT_NEWLINE); writeInfo(cout); cout << "Partition " << phylo_tree->aln->name << endl; cout << "score: " << score << " old_score: " << old_score << endl; } ASSERT(score > old_score-0.1); } old_score = score; memset(new_prop, 0, nmix*sizeof(double)); // E-step // decoupled weights (prop) from _pattern_lh_cat to obtain L_ci and compute pattern likelihood L_i for (ptn = 0; ptn < nptn; ptn++) { double *this_lk_cat = 
phylo_tree->_pattern_lh_cat + ptn*nmix; double lk_ptn = phylo_tree->ptn_invar[ptn]; for (c = 0; c < nmix; c++) { lk_ptn += this_lk_cat[c]; } ASSERT(lk_ptn != 0.0); lk_ptn = phylo_tree->ptn_freq[ptn] / lk_ptn; // transform _pattern_lh_cat into posterior probabilities of each category for (c = 0; c < nmix; c++) { this_lk_cat[c] *= lk_ptn; new_prop[c] += this_lk_cat[c]; } } // M-step, update weights according to (*) int maxpropid = 0; double new_pinvar = 0.0; for (c = 0; c < nmix; c++) { new_prop[c] = new_prop[c] / phylo_tree->getAlnNSite(); if (new_prop[c] > new_prop[maxpropid]) maxpropid = c; } // regularize prop bool zero_prop = false; for (c = 0; c < nmix; c++) { if (new_prop[c] < MIN_PROP) { new_prop[maxpropid] -= (MIN_PROP - new_prop[c]); new_prop[c] = MIN_PROP; zero_prop = true; } } // break if some probabilities too small if (zero_prop) break; bool converged = true; double sum_prop = 0.0; for (c = 0; c < nmix; c++) { // new_prop[c] = new_prop[c] / phylo_tree->getAlnNSite(); // check for convergence sum_prop += new_prop[c]; converged = converged && (fabs(prop[c]-new_prop[c]) < 1e-4); prop[c] = new_prop[c]; new_pinvar += new_prop[c]; } new_pinvar = 1.0 - new_pinvar; if (new_pinvar > 1e-4 && getPInvar() != 0.0) { converged = converged && (fabs(getPInvar()-new_pinvar) < 1e-4); if (isFixPInvar()) outError("Fixed given p-invar is not supported"); setPInvar(new_pinvar); // setOptimizePInvar(false); phylo_tree->computePtnInvar(); } ASSERT(fabs(sum_prop+new_pinvar-1.0) < MIN_PROP); // now optimize rates one by one double sum = 0.0; for (c = 0; c < nmix; c++) { tree->copyPhyloTree(phylo_tree); ModelMarkov *subst_model; if (phylo_tree->getModel()->isMixture() && phylo_tree->getModelFactory()->fused_mix_rate) subst_model = (ModelMarkov*)phylo_tree->getModel()->getMixtureClass(c); else subst_model = (ModelMarkov*)phylo_tree->getModel(); tree->setModel(subst_model); subst_model->setTree(tree); model_fac->model = subst_model; if (subst_model->isMixture() || 
subst_model->isSiteSpecificModel() || !subst_model->isReversible()) tree->setLikelihoodKernel(phylo_tree->sse); // initialize likelihood tree->initializeAllPartialLh(); // copy posterior probability into ptn_freq tree->computePtnFreq(); double *this_lk_cat = phylo_tree->_pattern_lh_cat+c; for (ptn = 0; ptn < nptn; ptn++) tree->ptn_freq[ptn] = this_lk_cat[ptn*nmix]; double scaling = rates[c]; tree->scaleLength(scaling); tree->optimizeTreeLengthScaling(MIN_PROP, scaling, 1.0/prop[c], 0.001); converged = converged && (fabs(rates[c] - scaling) < 1e-4); rates[c] = scaling; sum += prop[c] * rates[c]; // reset subst model tree->setModel(NULL); subst_model->setTree(phylo_tree); } phylo_tree->clearAllPartialLH(); if (converged) break; } // sort the rates in increasing order if (sorted_rates) quicksort(rates, 0, ncategory-1, prop); // deattach memory // tree->central_partial_lh = NULL; // tree->central_scale_num = NULL; // tree->central_partial_pars = NULL; delete tree; aligned_free(new_prop); return phylo_tree->computeLikelihood(); }
int main(int argc, char *argv[]) { time_t rawtime; struct tm * timeinfo; time ( &rawtime ); timeinfo = localtime ( &rawtime ); // std::cout << "Start " << asctime (timeinfo); ///////////////////////////////////////// // Setup if( argc < 10 || argc > 12){ std::cout << "Wrong number of input arguments (" << argc << "), should have format:\n"; std::cout << "\ttree_to_matrix <infile> <tmpfile> <prunedfile> <refalignment> <outfile> <starting_row> <ending_row> <format M=matrix E=esprit> <Do_Pruning 0=no 1=yes 2=only prune> [outfile_freq] [maxdistance(E format only)]\n"; } char* infilename = argv[1]; // (input) Tree file with reference sequences char* tempfilename = argv[2]; // (output) Half-pruned file (after pruning, before cleaning up single-child nodes and internal nodes which have become leaves char* prunedfilename = argv[3]; // (in/out) Pruned file name, input if not pruning, output if pruning char* refalignname = argv[4]; // (input) reference fasta file, only uses the sequence identifiers for pruning char* outfilename = argv[5]; // (output) Output distance matrix/list file name int startrow = atoi(argv[6]); // (input) First row to print for the distance matrix (0 for all) int endrow = atoi(argv[7]); // (input) Last row to print for the distance matrix (0 for all) char format = argv[8][0]; // (input) Format of distance, M=matrix, E=ESPRIT list int do_pruning = atoi(argv[9]); // (input) 0=no 1=yes 2=only prune // M = matrix format, used by mothur // E = ESPRIT list format char* frqfilename; float maxdist=0.1; if( argc == 12 ){ frqfilename = argv[10]; // (Optional output) Frequency file name, used when running ESPRIT maxdist = atof(argv[11]); // (Optional input) Maximum distance to print in the distance list (ESPRIT format only) std::cout << frqfilename << " " << maxdist << std::endl; } else { if( format == 'E' ){ std::cerr << "maximum distance required for ESPRIT printout; quitting\n"; return EXIT_FAILURE; } } int srow = startrow; char* inname; if( do_pruning>0 ){ // 
Read in raw file, then prune it inname = infilename; } else { // Read in pruned file directly inname = prunedfilename; } if( format == 'E' ){ std::cout << "Printing output in ESPRIT list format\n"; } else if( format == 'M' ){ std::cout << "Printing output in Mothur matrix format\n"; } else { std::cerr << "Unknown format " << format << ". Quitting\n"; return EXIT_FAILURE; } std::list<TreeNode>::iterator startit; std::list<TreeNode>::iterator endit; ///////////////////////////////////////// // READ IN TREE FROM FILE std::cout << "Reading in " << inname << std::endl; PhyloTree<TreeNode>* tr = new PhyloTree<TreeNode>(); std::ifstream infile; infile.open(inname); if( !infile.is_open() ){ std::cout << "Unable to open file " << inname << std::endl; } tr->readTree(infile); std::cout << "LEAVES: " << tr->getNleaves() << std::endl; tr->check_root(); ///////////////////////////////////////// // Prune tree (if necessary) if( do_pruning>0 ){ std::cout << "Pruning tree\n"; // Read in reference alignment file and grab reference file names std::ifstream reffile; reffile.open(refalignname); if( !reffile.is_open() ){ std::cout << "Unable to open file " << refalignname << std::endl; } char line[100]; reffile >> line; while( !reffile.eof() ){ if( line[0] == '>' ){ // Clean-up the file name std::string name(line); int slash = (int)name.find("/"); name = name.substr(1, slash-1); int bar = (int)name.find("|"); if ( bar != name.npos ){ name = name.replace(bar, 1, "_"); } // Remove this leaf from the tree tr->deleteLeaf(name.c_str()); } reffile >> line; } reffile.close(); // Print to tmp file, just in case std::ofstream treeout; treeout.open( tempfilename ); if( !treeout.is_open() ){ std::cout << "Unable to open file " << tempfilename << std::endl; } treeout.precision(5); treeout.setf(std::ios::fixed,std::ios::floatfield); tr->writeTree( treeout ); treeout.close(); std::cout << "Printed to file " << tempfilename << std::endl; // Remove internal nodes that are now leaves while( 
tr->deleteLeaf("") > 0 ); // Smooth to remove single child nodes while( tr->smooth() > 0 ); // Check that the root doesn't have only one node tr->check_root(); // Print pruned file, for use by parallel jobs treeout.open( prunedfilename ); if( !treeout.is_open() ){ std::cout << "Unable to open file " << prunedfilename << std::endl; } treeout.precision(6); treeout.setf(std::ios::fixed,std::ios::floatfield); tr->writeTree( treeout ); treeout.close(); std::cout << "Printed to file " << prunedfilename << std::endl; // If I only needed to prune then I'm done if( do_pruning>1 ){ std::cout << "Done pruning tips, ready to launch parallel tree_to_matrix jobs\n"; return EXIT_SUCCESS; } }