void ConditionalCladeProbabilityDistribution::readFromFile(std::string fname) { std::string tree_string; std::ifstream file_stream; file_stream.open(fname.c_str()); std::string reading="#nothing"; if (file_stream.is_open()) // ########## read state ############ { while (! file_stream.eof()) { std::string line; getline (file_stream,line); if (boost::find_first(line, "#")) { boost::trim(line); reading=line; } else if (reading=="#constructor_string") { //cout << reading << std::endl; boost::trim(line); NewickConverter c; BranchLengthTree *blTree = c.convertFromNewick(line); constructorTree = new BranchLengthTree( *blTree ); construct(*constructorTree); reading="#nothing"; } else if (reading=="#observations") { boost::trim(line); numberOfObservedTrees=atof(line.c_str()); } else if (reading=="#Bip_counts") { //cout << reading << std::endl; std::vector<std::string> tokens; boost::trim(line); boost::split(tokens,line,boost::is_any_of("\t "),boost::token_compress_on); BipartitionCounts[atol(tokens[0].c_str())]=atof(tokens[1].c_str()); } else if (reading=="#Bip_bls") { //cout << reading << std::endl; std::vector<std::string> tokens; boost::trim(line); boost::split(tokens,line,boost::is_any_of("\t "),boost::token_compress_on); std::vector< double > branchLengths; for (size_t i = 1; i < tokens.size(); ++i) { branchLengths.push_back( atof(tokens[i].c_str()) ); } BipartitionBranchLengths[atol(tokens[0].c_str())] = branchLengths; } else if (reading=="#Dip_counts") { //cout << reading << std::endl; std::vector<std::string> tokens; boost::trim(line); boost::split(tokens,line,boost::is_any_of("\t "),boost::token_compress_on); std::pair<long int, long int> parts; /*PREVECTORIZATION CODE set<long int> parts; parts.insert(atoi(tokens[1].c_str())); parts.insert(atoi(tokens[2].c_str())); tripletFrequencies[atol(tokens[0].c_str())][parts]=atof(tokens[3].c_str()); */ parts.first = atoi(tokens[1].c_str()); parts.second = atoi(tokens[2].c_str()); if ( atol(tokens[0].c_str()) >= (long int) ( tripletFrequencies.size() ) ) { boost::unordered_map< std::pair<size_t, size_t>,double> temp ; while (atol(tokens[0].c_str()) > (int) tripletFrequencies.size() ) { tripletFrequencies.push_back(temp); } temp[parts]=atof(tokens[3].c_str()); tripletFrequencies.push_back(temp); } else { tripletFrequencies[atol(tokens[0].c_str())][parts]=atof(tokens[3].c_str()); } } else if (reading=="#last_leafset_id") { //cout << reading << std::endl; boost::trim(line); last_leafset_id=atol(line.c_str()); } else if (reading=="#leaf-id") { //cout << reading << std::endl; std::vector<std::string> tokens; boost::trim(line); boost::split(tokens,line,boost::is_any_of("\t "),boost::token_compress_on); int id=atoi(tokens[1].c_str()); std::string leaf_name=tokens[0]; leaf_ids[leaf_name]=id; id_leaves[id]=leaf_name; } else if (reading=="#set-id") { //cout << reading << std::endl; std::vector<std::string> fields; boost::trim(line); boost::split(fields,line,boost::is_any_of(":"),boost::token_compress_on); boost::trim(fields[0]); long int set_id=atol(fields[0].c_str()); std::vector<std::string> tokens; boost::trim(fields[1]); boost::split(tokens,fields[1],boost::is_any_of("\t "),boost::token_compress_on); boost::dynamic_bitset<> temp( numTaxons + 1 ); for (std::vector<std::string>::iterator it=tokens.begin(); it!=tokens.end(); it++) { //Setting the proper bits to 1 temp[static_cast<int>(atoi((*it).c_str()))] = 1; } // std::cout <<"setid : "<< set_id << " READING: " << temp << std::endl; set_ids[temp]=set_id; id_sets[set_id]=temp; } } } //Attempt adding something for the root bipartition: boost::dynamic_bitset<> temp (numTaxons +1); for (size_t i = 1 ; i<numTaxons +1; ++i) { temp[i] = 1; } id_sets[-1] = temp; set_ids[temp] = -1; for ( std::map< size_t, boost::dynamic_bitset<> >::const_iterator it = id_sets.begin(); it != id_sets.end(); it++ ) { size_t size = 0; for (size_t i=0; i< numTaxons + 1; ++i) { // if (BipartitionTools::testBit( (*it).second, static_cast<int>(i) ) ) { if ( (*it).second[i] ) { size+=1; } } set_sizes[ (*it).first ] = size; size_ordered_bips[size].push_back( (*it).first ); // std::cout << size << " AND "<< (*it).first <<std::endl; /* set_sizes[(*it).first]=(*it).second.size(); size_ordered_bips[(*it).second.size()].push_back((*it).first);*/ } /* for ( auto it = size_ordered_bips.begin(); it != size_ordered_bips.end(); it++ ) { VectorTools::print ( (*it).second ); }*/ }
bool TestAdmixtureGraph::run(void) { std::cout << "Running TestAdmixtureGraph\n"; std::cout << "argc: " << argc << "\n"; if (argc > 1) { for (int i = 0; i < argc; i++) { argTokens.push_back(argv[i]); std::cout << i << " " << argTokens[i] << "\n"; } std::cout << argc << " == " << argTokens.size() << " arguments\n"; snpFilename = argTokens[1]; } // MODEL GRAPH std::vector<unsigned int> seed; seed = GLOBAL_RNG->getSeed(); std::cout << "seed " << seed[0] << " " << seed[1] << "\n"; //seed.clear(); seed.push_back(53866); seed.push_back(21201); GLOBAL_RNG->setSeed(seed); // 53866 21201 // read in data std::string fn = snpFilepath + snpFilename; int snpThinBy = 100; SnpData* snps = PopulationDataReader().readSnpData(fn,snpThinBy); // read in tree std::vector<AdmixtureTree*> trees; bool startTree = false; //treeFilename = ""; if (treeFilename != "") { // NclReader does not seem to work for Newick strings at this time /* trees = NclReader::getInstance().readAdmixtureTrees( snpFilepath + treeFilename, "newick" ); std::cout << "Read " << trees.size() << " trees." << std::endl; std::cout << trees[0]->getNewickRepresentation() << std::endl; */ // hacky workaround for now... //std::string newickStr = "(San:1,((Han:0.348958,Dai:0.348958):0.194964,(((Ket:0.351687,(Koryak:0.344761,(SiberianEskimo:0.325112,(((Huichol:0.269256,(Pima:0.233362,((Karitiana:0.104362,Aymara:0.104362):0.0120051,(Yukpa:0.100877,Mayan:0.100877):0.0154902):0.116995):0.0358943):0.0341607,Athabascan:0.303417):0.0128117,((EastGreenland:0.158699,WestGreenland:0.158699):0.0369703,Aleuts:0.195669):0.120559):0.00888335):0.0196487):0.0069259):0.0987521,((Nivhks:0.278599,Buryat:0.278599):0.0642882,Yakut:0.342887):0.107551):0.0756044,Altai:0.526043):0.0178791):0.456078)"; //std::string newickStr = "((A:.5,B:.5):.5,C:.5)"; std::string newickStr = "(San:1,((Koryak:0.926938,(SiberianEskimo:0.812519,((Aleuts:0.762302,(EastGreenland:0.4824,WestGreenland:0.4824):0.279902):0.0144746,((Huichol:0.353522,(Pima:0.324226,((Mayan:0.223067,Yukpa:0.223067):0.0856916,(Aymara:0.173581,Karitiana:0.173581):0.135178):0.015467):0.0292956):0.109212,Athabascan:0.462734):0.314043):0.0357419):0.114419):0.0600046,(Ket:0.772384,((Altai:0.640188,(Han:0.481097,Dai:0.481097):0.159091):0.0445788,((Buryat:0.596074,Nivhks:0.596074):0.015206,Yakut:0.61128):0.0734876):0.0876171):0.214558):0.0130575);"; NewickConverter nc; BranchLengthTree* blt = nc.convertFromNewick(newickStr); AdmixtureTree* at = TreeUtilities::convertToAdmixtureTree(*blt, snps->getPopulationNames()); at->setNames(snps->getPopulationNames()); at->updateTipOrderByNames(snps->getPopulationNames()); trees.push_back(at); startTree = true; } size_t numTaxa = snps->getNumPopulations(); size_t numNodes = 2 * numTaxa - 1; size_t numBranches = numNodes - 1; //size_t numSites = snps->getNumSnps(); int blockSize = 5000; double divGens = 1;//.01; int delay = 1000; int numTreeResults = 500; int numAdmixtureResults = 500; int maxNumberOfAdmixtureEvents = 1; double residualWeight = 2.0; bool useWishart = true; // if false, the composite likelihood function is used bool useBias = true; // if false, no covariance bias correction for small sample size is used bool useAdmixtureEdges = true; // if false, no admixture moves or edges are used bool useBranchRates = true; // if false, all populations are of the same size bool allowSisterAdmixture = true; // if false, admixture events cannot be between internal lineages who share a divergence parent bool discardNonPosDefMtx = true; // if false, round negative eigenvalues to positive eps bool useContrasts = false; // nothing really, need to remove bool updateParameters = true; bool updateTopology = true; bool updateNodeAges = true; bool useParallelMcmcmc = true; int numChains = 4; int numProcesses = numChains; // numProcesses=80; int swapInterval = 1; double deltaTemp = .1; double sigmaTemp = 1.0; double hottestTemp = 0.001; if (!true) { deltaTemp = exp(-log(hottestTemp)/pow(numChains-1,sigmaTemp)) - 1; std::cout << deltaTemp << "\n"; } double startingHeat = 1.0; double likelihoodScaler = 1.0; std::stringstream rndStr; rndStr << std::setw(9) << std::fixed << std::setprecision(0) << std::setfill('0') << std::floor(GLOBAL_RNG->uniform01()*1e9); // std::string outName = "papa." + rndStr.str(); std::string simName = "hgdp"; std::string outName = simName + "." + rndStr.str(); // std::string outName = simName + "." + rndStr.str() + "." + snpFilename; // BM diffusion rate ConstantNode<double>* a_bm = new ConstantNode<double>( "bm_a", new double(3)); ConstantNode<double>* b_bm = new ConstantNode<double>( "bm_b", new double(100)); //ConstantNode<double>* c_bm = new ConstantNode<double>( "bm_c", new double(0)); //ConstantNode<double>* d_bm = new ConstantNode<double>( "bm_d", new double(100)); StochasticNode<double>* diffusionRate = new StochasticNode<double> ("rate_BM", new ExponentialDistribution(b_bm)); //StochasticNode<double>* diffusionRate = new StochasticNode<double> ("rate_BM", new UniformDistribution(c_bm, d_bm)); // CPP rate // MJL 071713: Flat Poisson prior cannot overpower model overfitting when lnL is large. // Consider implementing Conway-Maxewell-Poisson distn instead. // This prior requires admixture events to improve lnL by N units // Negative values -> admixture rare // Positive values -> admixture common (set admixture cap) double adm_th_lnL = 10; double rate_cpp_prior = exp(adm_th_lnL); ConstantNode<double>* c = new ConstantNode<double>( "c", new double(1.0/rate_cpp_prior)); // admixture rate prior StochasticNode<double>* admixtureRate = new StochasticNode<double> ("rate_CPP", new ExponentialDistribution(c)); StochasticNode<int>* admixtureCount = new StochasticNode<int> ("count_CPP", new PoissonDistribution(admixtureRate)); admixtureCount->clamp(new int(0)); admixtureRate->clamp(new double(rate_cpp_prior)); if (!useAdmixtureEdges) { admixtureRate->clamp(new double(rate_cpp_prior)); admixtureCount->clamp(new int(0)); } // birth-death process for ultrametric tree StochasticNode<double>* diversificationRate = new StochasticNode<double>("div", new UniformDistribution(new ConstantNode<double>("div_lower", new double(0.0)), new ConstantNode<double>("div_upper", new double(50.0)) )); StochasticNode<double>* turnover = new StochasticNode<double>("turnover", new UniformDistribution(new ConstantNode<double>("do_lower", new double(0.0)), new ConstantNode<double>("do_upper", new double(1.0)) )); // tree node StochasticNode<AdmixtureTree>* tau = new StochasticNode<AdmixtureTree>( "tau", new AdmixtureConstantBirthDeathProcess(diversificationRate, turnover, (int)numTaxa, snps->getPopulationNames(), snps->getOutgroup()) ); if (startTree) { tau->setValue(new AdmixtureTree(*trees[0])); tau->setIgnoreRedraw(true); } // branch multipliers (mutation rate is clocklike, but population sizes are not) std::vector<const TypedDagNode<double> *> branchRates; std::vector< ContinuousStochasticNode *> branchRates_nonConst; ConstantNode<double>* branchRateA = new ConstantNode<double>( "branchRateA", new double(1)); ConstantNode<double>* branchRateB = new ConstantNode<double>( "branchRateB", new double(2)); ConstantNode<double>* branchRateC = new ConstantNode<double>( "branchRateC", new double(0)); ConstantNode<double>* branchRateD = new ConstantNode<double>( "branchRateD", new double(.01)); //ConstantNode<double>* branchRateE = new ConstantNode<double>( "branchRateE", new double(10)); for( size_t i=0; i<numBranches; i++){ std::ostringstream br_name; br_name << "br_" << i; //ContinuousStochasticNode* tmp_branch_rate = new ContinuousStochasticNode( br_name.str(), new ExponentialDistribution(branchRateD) ); //ContinuousStochasticNode* tmp_branch_rate = new ContinuousStochasticNode( br_name.str(), new InverseGammaDistribution(branchRateA, branchRateB)); //ContinuousStochasticNode* tmp_branch_rate = new ContinuousStochasticNode(br_name.str(), new LognormalDistribution(branchRateC, branchRateA)); //ContinuousStochasticNode* tmp_branch_rate = new ContinuousStochasticNode(br_name.str(), new UniformDistribution(branchRateC, branchRateE)); // ContinuousStochasticNode* tmp_branch_rate = new ContinuousStochasticNode( br_name.str(), new GammaDistribution(branchRateB, branchRateD)); ContinuousStochasticNode* tmp_branch_rate = new ContinuousStochasticNode( br_name.str(), new GammaDistribution(branchRateB, branchRateB)); if (!useBranchRates) { tmp_branch_rate->clamp(new double(1.0)); } branchRates.push_back( tmp_branch_rate ); branchRates_nonConst.push_back( tmp_branch_rate ); } DeterministicNode< std::vector< double > >* br_vector = new DeterministicNode< std::vector< double > >( "br_vector", new VectorFunction< double >( branchRates ) ); // model node BrownianMotionAdmixtureGraph* bmag = new BrownianMotionAdmixtureGraph( tau, diffusionRate, admixtureRate, br_vector, snps, useWishart, useContrasts, useBias, discardNonPosDefMtx, blockSize, likelihoodScaler ); StochasticNode<ContinuousCharacterData >* admixtureModel; admixtureModel = new StochasticNode<ContinuousCharacterData >("AdmixtureGraph", bmag); // have to clamp to distinguish likelihood from prior (incidentally calls setValue(), but this is handled otherwise) admixtureModel->clamp( new ContinuousCharacterData() ); // does it event matter how it's clamped? // residuals DeterministicNode<std::vector<double> >* residuals = new DeterministicNode<std::vector<double> >("residuals", new BrownianMotionAdmixtureGraphResiduals(admixtureModel)); // MOVES std::cout << "Adding moves\n"; // moves vector RbVector<Move> moves; // model parameters if (updateParameters) { moves.push_back( new MetropolisHastingsMove( new ScaleProposal(diffusionRate, 0.1), 5, false ) ); moves.push_back( new MetropolisHastingsMove( new ScaleProposal(diversificationRate, 0.5), 5, false ) ); moves.push_back( new MetropolisHastingsMove( new ScaleProposal(turnover, 0.5), 5, false ) ); } // non-admixture tree updates if (updateTopology) { moves.push_back( new AdmixtureNarrowExchange( tau, 0.1, numTaxa/2) ); moves.push_back( new AdmixtureSubtreePruneRegraft( tau, 0.1, numTaxa/4) ); moves.push_back( new AdmixtureFixedNodeheightPruneRegraft(tau, numTaxa/4)); moves.push_back( new AdmixtureEdgeReplaceNNI( tau, residuals, residualWeight, delay, 0, allowSisterAdmixture, numTaxa)); moves.push_back( new AdmixtureEdgeReplaceFNPR( tau, residuals, residualWeight, delay, 0, allowSisterAdmixture, numTaxa)); moves.push_back( new AdmixtureEdgeReplaceSubtreeRegraft( tau, residuals, residualWeight, delay, 0, allowSisterAdmixture, numTaxa)); } if (updateNodeAges) { for (size_t i = numTaxa; i < numNodes - 1; i++) { moves.push_back( new AdmixtureNodeTimeSlideBeta( tau, (int)i, 15.0, false, 1.0 ) ); moves.push_back( new AdmixtureNodeTimeSlideBeta( tau, (int)i, 1.0, false, 0.5 ) ); } } // branch rate updates if (useBranchRates) { // branch rate multipliers for( size_t i=0; i < numBranches; i++) { moves.push_back( new MetropolisHastingsMove( new ScaleProposal(branchRates_nonConst[i], 0.1), 1, false ) ); moves.push_back( new MetropolisHastingsMove( new ScaleProposal(branchRates_nonConst[i], 1.0), .5, false ) ); } // tree rate shift moves.push_back( new AdmixtureShiftTreeRates(diffusionRate, branchRates_nonConst, 0.5, false, 2.0)); // shift node age for branch rate for (size_t i = numTaxa; i < numNodes - 1; i++) { if (updateNodeAges) moves.push_back( new AdmixtureShiftNodeAgeAndRate(tau, branchRates_nonConst, (int)i, 0.7, false, 1.0) ); // MJL 081513: not working, I think... if (updateTopology) { std::vector<DagNode*> pvec; pvec.push_back(tau); pvec.push_back(branchRates_nonConst[i]); //moves.push_back( new AdmixtureSubtreePruneRegraftAndRateShift(pvec, i, 0.5, 1.0) ); // ... something wrong with how the lnProb is computed using the lnProb ratios... } } // NNI with branch rate modifier (not working quite right, disabled) if (updateTopology) moves.push_back( new AdmixtureNearestNeighborInterchangeAndRateShift( tau, branchRates_nonConst, 0.1, false, numTaxa)); } // admixture tree updates if (useAdmixtureEdges) { moves.push_back( new AdmixtureEdgeAddResidualWeights( tau, admixtureRate, admixtureCount, residuals, residualWeight, delay, maxNumberOfAdmixtureEvents, allowSisterAdmixture, 10.0) ); moves.push_back( new AdmixtureEdgeRemoveResidualWeights( tau, admixtureRate, admixtureCount, residuals, residualWeight, delay, 10.0) ); moves.push_back( new AdmixtureEdgeReplaceResidualWeights( tau, admixtureRate, branchRates_nonConst, residuals, residualWeight, delay, allowSisterAdmixture, 20.0) ); //moves.push_back( new AdmixtureEdgeMultiRemove( tau, admixtureRate, admixtureCount, residuals, residualWeight, delay, 2.0 ) ); //moves.push_back( new AdmixtureReplaceAndNNI( tau, 0.5, 10.0) ); //moves.push_back( new AdmixtureEdgeAddCladeResiduals( tau, admixtureRate, admixtureCount, residuals, delay, maxNumberOfAdmixtureEvents, allowSisterAdmixture, 2.0) ); //moves.push_back( new AdmixtureEdgeReplaceCladeResiduals( tau, admixtureRate, branchRates_nonConst, residuals, delay, allowSisterAdmixture, 15.0) ); if (updateTopology) { moves.push_back( new AdmixtureEdgeDivergenceMerge( tau, admixtureRate, branchRates_nonConst, admixtureCount, residuals, delay, allowSisterAdmixture, 5.0 )); moves.push_back( new AdmixtureEdgeRegraftReplace( tau, residuals, 1.0, delay, maxNumberOfAdmixtureEvents, allowSisterAdmixture, 5.0)); ; } moves.push_back( new AdmixtureEdgeReweight( tau, delay, 10.0, 10.0) ); moves.push_back( new AdmixtureEdgeReversePolarity( tau, delay, 2.0, 10.0) ); moves.push_back( new AdmixtureEdgeSlide( tau, branchRates_nonConst, delay, allowSisterAdmixture, 10.0, 10.0) ); moves.push_back( new AdmixtureEdgeFNPR( tau, branchRates_nonConst, delay, allowSisterAdmixture, 10.0, 10.0) ); moves.push_back( new MetropolisHastingsMove( new ScaleProposal(admixtureRate, 0.1), 5, false ) ); } // MONITORS std::cout << "Adding monitors\n"; RbVector<Monitor> monitors; // parameter monitor std::vector<DagNode*> monitoredNodes; monitoredNodes.push_back( diffusionRate ); monitoredNodes.push_back( admixtureRate ); monitoredNodes.push_back( diversificationRate ); monitoredNodes.push_back( turnover ); monitoredNodes.push_back( admixtureCount ); if (useBranchRates) { for( size_t i=0; i<numBranches; i++){ monitoredNodes.push_back( branchRates_nonConst[i] ); } } monitors.push_back( new FileMonitor( monitoredNodes, 1, "/Users/mlandis/data/admix/output/" + outName + ".parameters.txt", "\t", true, true, true, useParallelMcmcmc, useParallelMcmcmc, useParallelMcmcmc ) ); monitors.push_back( new ScreenMonitor( monitoredNodes, 1, "\t" ) ); monitors.push_back( new AdmixtureBipartitionMonitor(tau, diffusionRate, br_vector, numTreeResults, numAdmixtureResults, 1, "/Users/mlandis/data/admix/output/" + outName + ".bipartitions.txt", "\t", true, true, true, true, true, true ) ); monitors.push_back( new AdmixtureResidualsMonitor(residuals, snps->getPopulationNames(), 10, "/Users/mlandis/data/admix/output/" + outName + ".residuals.txt", "\t", true, true, true, true ) ); //monitors.push_back( new ExtendedNewickAdmixtureTreeMonitor( tau, br_vector, true, true, 10, "/Users/mlandis/data/admix/output/" + outName + ".admixture_trees.txt", "\t", true, true, true, true ) ); //monitors.push_back( new ExtendedNewickAdmixtureTreeMonitor( tau, br_vector, false, true, 10, "/Users/mlandis/data/admix/output/" + outName + ".topology_trees.trees", "\t", true, true, true, true ) ); monitors.push_back( new ExtendedNewickAdmixtureTreeMonitor( tau, br_vector, false, false, 10, "/Users/mlandis/data/admix/output/" + outName + ".time_trees.trees", "\t", true, true, true, true ) ); // MODEL std::cout << "Calling model\n"; std::set<const DagNode*> mset; mset.insert(admixtureRate); Model myModel = Model(mset); // MCMC std::cout << "Calling mcmc\n"; if (!useParallelMcmcmc) { Mcmc myMcmc = Mcmc(myModel, moves, monitors); myMcmc.run(mcmcGenerations); myMcmc.printOperatorSummary(); } else { ParallelMcmcmc myPmc3(myModel, moves, monitors, "random", numChains, numProcesses, swapInterval, deltaTemp, sigmaTemp, startingHeat); myPmc3.run(mcmcGenerations/divGens); myPmc3.printOperatorSummary(); } std::cout << "All done!\n"; // OBJECT CLEANUP delete snps; delete a_bm; delete b_bm; delete c; delete tau; delete diversificationRate; delete turnover; delete diffusionRate; delete admixtureRate; delete admixtureCount; delete branchRateA; delete branchRateB; delete branchRateC; delete branchRateD; branchRates_nonConst.clear(); branchRates.clear(); delete br_vector; //delete bmag; // malloc deallocation error delete admixtureModel; delete residuals; moves.clear(); monitors.clear(); return true; }