int main() { TreeTemplate<Node>* tree = TreeTemplateTools::parenthesisToTree("(((A:0.01, B:0.01):0.02,C:0.03):0.01,D:0.04);"); vector<string> seqNames = tree->getLeavesNames(); vector<int> ids = tree->getNodesId(); //------------- const NucleicAlphabet* alphabet = &AlphabetTools::DNA_ALPHABET; SubstitutionModel* model = new T92(alphabet, 3.); DiscreteDistribution* rdist = new GammaDiscreteDistribution(4, 1.0); rdist->aliasParameters("alpha", "beta"); VectorSiteContainer sites(alphabet); sites.addSequence(BasicSequence("A", "AAATGGCTGTGCACGTC", alphabet)); sites.addSequence(BasicSequence("B", "AACTGGATCTGCATGTC", alphabet)); sites.addSequence(BasicSequence("C", "ATCTGGACGTGCACGTG", alphabet)); sites.addSequence(BasicSequence("D", "CAACGGGAGTGCGCCTA", alphabet)); try { fitModelH(model, rdist, *tree, sites, 93.017264552603336369, 71.265543199977557265); } catch (Exception& ex) { cerr << ex.what() << endl; return 1; } try { fitModelHClock(model, rdist, *tree, sites, 92.27912072473920090943, 71.26554020984087856050); } catch (Exception& ex) { cerr << ex.what() << endl; return 1; } //------------- delete tree; delete model; delete rdist; return 0; }
int main() { TreeTemplate<Node>* tree = TreeTemplateTools::parenthesisToTree("((A:0.01, B:0.02):0.03,C:0.01,D:0.1);"); vector<string> seqNames= tree->getLeavesNames(); vector<int> ids = tree->getNodesId(); //------------- NucleicAlphabet* alphabet = new DNA(); SubstitutionModel* model = new T92(alphabet, 3.); FrequenciesSet* rootFreqs = new GCFrequenciesSet(alphabet); std::vector<std::string> globalParameterNames; globalParameterNames.push_back("T92.kappa"); map<string, string> alias; SubstitutionModelSet* modelSet = SubstitutionModelSetTools::createNonHomogeneousModelSet(model, rootFreqs, tree, alias, globalParameterNames); DiscreteDistribution* rdist = new ConstantRateDistribution(); vector<double> thetas; for (unsigned int i = 0; i < modelSet->getNumberOfModels(); ++i) { double theta = RandomTools::giveRandomNumberBetweenZeroAndEntry(0.99) + 0.005; cout << "Theta" << i << " set to " << theta << endl; modelSet->setParameterValue("T92.theta_" + TextTools::toString(i + 1), theta); thetas.push_back(theta); } NonHomogeneousSequenceSimulator simulator(modelSet, rdist, tree); unsigned int n = 100000; OutputStream* profiler = new StlOutputStream(new ofstream("profile.txt", ios::out)); OutputStream* messenger = new StlOutputStream(new ofstream("messages.txt", ios::out)); //Check fast simulation first: cout << "Fast check:" << endl; //Generate data set: VectorSiteContainer sites(seqNames, alphabet); for (unsigned int i = 0; i < n; ++i) { auto_ptr<Site> site(simulator.simulateSite()); site->setPosition(static_cast<int>(i)); sites.addSite(*site, false); } //Now fit model: SubstitutionModelSet* modelSet2 = modelSet->clone(); RNonHomogeneousTreeLikelihood tl(*tree, sites, modelSet2, rdist); tl.initialize(); OptimizationTools::optimizeNumericalParameters2( &tl, tl.getParameters(), 0, 0.0001, 10000, messenger, profiler, false, false, 1, OptimizationTools::OPTIMIZATION_NEWTON); //Now compare estimated values to real ones: for (size_t i = 0; i < thetas.size(); ++i) { cout << thetas[i] << "\t" << modelSet2->getModel(i)->getParameter("theta").getValue() << endl; double diff = abs(thetas[i] - modelSet2->getModel(i)->getParameter("theta").getValue()); if (diff > 0.1) return 1; } delete modelSet2; //Now try detailed simulations: cout << "Detailed check:" << endl; //Generate data set: VectorSiteContainer sites2(seqNames, alphabet); for (unsigned int i = 0; i < n; ++i) { RASiteSimulationResult* result = simulator.dSimulateSite(); auto_ptr<Site> site(result->getSite(*simulator.getSubstitutionModelSet()->getModel(0))); site->setPosition(static_cast<int>(i)); sites2.addSite(*site, false); delete result; } //Now fit model: SubstitutionModelSet* modelSet3 = modelSet->clone(); RNonHomogeneousTreeLikelihood tl2(*tree, sites2, modelSet3, rdist); tl2.initialize(); OptimizationTools::optimizeNumericalParameters2( &tl2, tl2.getParameters(), 0, 0.0001, 10000, messenger, profiler, false, false, 1, OptimizationTools::OPTIMIZATION_NEWTON); //Now compare estimated values to real ones: for (size_t i = 0; i < thetas.size(); ++i) { cout << thetas[i] << "\t" << modelSet3->getModel(i)->getParameter("theta").getValue() << endl; double diff = abs(thetas[i] - modelSet3->getModel(i)->getParameter("theta").getValue()); if (diff > 0.1) return 1; } delete modelSet3; //------------- delete tree; delete alphabet; delete modelSet; delete rdist; return 0; }
int main() { TreeTemplate<Node>* tree = TreeTemplateTools::parenthesisToTree("((A:0.001, B:0.002):0.008,C:0.01,D:0.1);"); vector<int> ids = tree->getNodesId(); ids.pop_back(); //Ignore root //------------- CodonAlphabet* alphabet = new CodonAlphabet(&AlphabetTools::DNA_ALPHABET); GeneticCode* gc = new StandardGeneticCode(&AlphabetTools::DNA_ALPHABET); CodonSubstitutionModel* model = new YN98(gc, CodonFrequenciesSet::getFrequenciesSetForCodons(CodonFrequenciesSet::F0, gc)); //SubstitutionModel* model = new CodonRateSubstitutionModel( // gc, // new JCnuc(dynamic_cast<CodonAlphabet*>(alphabet)->getNucleicAlphabet())); cout << model->getNumberOfStates() << endl; MatrixTools::printForR(model->getGenerator(), "g"); DiscreteDistribution* rdist = new ConstantDistribution(1.0); HomogeneousSequenceSimulator simulator(model, rdist, tree); TotalSubstitutionRegister* totReg = new TotalSubstitutionRegister(model); DnDsSubstitutionRegister* dndsReg = new DnDsSubstitutionRegister(model); unsigned int n = 20000; vector< vector<double> > realMap(n); vector< vector< vector<double> > > realMapTotal(n); vector< vector< vector<double> > > realMapDnDs(n); VectorSiteContainer sites(tree->getLeavesNames(), alphabet); for (unsigned int i = 0; i < n; ++i) { ApplicationTools::displayGauge(i, n-1, '='); RASiteSimulationResult* result = simulator.dSimulateSite(); realMap[i].resize(ids.size()); realMapTotal[i].resize(ids.size()); realMapDnDs[i].resize(ids.size()); for (size_t j = 0; j < ids.size(); ++j) { realMap[i][j] = static_cast<double>(result->getSubstitutionCount(ids[j])); realMapTotal[i][j].resize(totReg->getNumberOfSubstitutionTypes()); realMapDnDs[i][j].resize(dndsReg->getNumberOfSubstitutionTypes()); result->getSubstitutionCount(ids[j], *totReg, realMapTotal[i][j]); result->getSubstitutionCount(ids[j], *dndsReg, realMapDnDs[i][j]); if (realMapTotal[i][j][0] != realMap[i][j]) { cerr << "Error, total substitution register provides wrong result." << endl; return 1; } //if (abs(VectorTools::sum(realMapDetailed[i][j]) - realMap[i][j]) > 0.000001) { // cerr << "Error, detailed substitution register provides wrong result." << endl; // return 1; //} } auto_ptr<Site> site(result->getSite(*model)); site->setPosition(static_cast<int>(i)); sites.addSite(*site, false); delete result; } ApplicationTools::displayTaskDone(); //------------- //Now build the substitution vectors with the true model: //Fasta fasta; //fasta.write("Simulations.fasta", sites); DRHomogeneousTreeLikelihood drhtl(*tree, sites, model, rdist); drhtl.initialize(); cout << drhtl.getValue() << endl; SubstitutionCount* sCountAna = new LaplaceSubstitutionCount(model, 10); Matrix<double>* m = sCountAna->getAllNumbersOfSubstitutions(0.001,1); cout << "Analytical total count:" << endl; MatrixTools::print(*m); delete m; ProbabilisticSubstitutionMapping* probMapAna = SubstitutionMappingTools::computeSubstitutionVectors(drhtl, ids, *sCountAna); SubstitutionCount* sCountTot = new NaiveSubstitutionCount(model, totReg); m = sCountTot->getAllNumbersOfSubstitutions(0.001,1); cout << "Simple total count:" << endl; MatrixTools::print(*m); delete m; ProbabilisticSubstitutionMapping* probMapTot = SubstitutionMappingTools::computeSubstitutionVectors(drhtl, ids, *sCountTot); SubstitutionCount* sCountDnDs = new NaiveSubstitutionCount(model, dndsReg); m = sCountDnDs->getAllNumbersOfSubstitutions(0.001,1); cout << "Detailed count, type 1:" << endl; MatrixTools::print(*m); delete m; ProbabilisticSubstitutionMapping* probMapDnDs = SubstitutionMappingTools::computeSubstitutionVectors(drhtl, ids, *sCountDnDs); SubstitutionCount* sCountUniTot = new UniformizationSubstitutionCount(model, totReg); m = sCountUniTot->getAllNumbersOfSubstitutions(0.001,1); cout << "Total count, uniformization method:" << endl; MatrixTools::print(*m); delete m; ProbabilisticSubstitutionMapping* probMapUniTot = SubstitutionMappingTools::computeSubstitutionVectors(drhtl, ids, *sCountUniTot); SubstitutionCount* sCountUniDnDs = new UniformizationSubstitutionCount(model, dndsReg); m = sCountUniDnDs->getAllNumbersOfSubstitutions(0.001,2); cout << "Detailed count, uniformization method, type 2:" << endl; MatrixTools::print(*m); delete m; ProbabilisticSubstitutionMapping* probMapUniDnDs = SubstitutionMappingTools::computeSubstitutionVectors(drhtl, ids, *sCountUniDnDs); //Check per branch: /* //1. Total: for (unsigned int j = 0; j < ids.size(); ++j) { double totalReal = 0; double totalObs1 = 0; double totalObs2 = 0; double totalObs3 = 0; double totalObs4 = 0; double totalObs5 = 0; for (unsigned int i = 0; i < n; ++i) { totalReal += realMap[i][j]; totalObs1 += probMapAna->getNumberOfSubstitutions(ids[j], i, 0); totalObs2 += probMapTot->getNumberOfSubstitutions(ids[j], i, 0); //totalObs3 += VectorTools::sum(probMapDet->getNumberOfSubstitutions(ids[j], i)); totalObs4 += probMapDecTot->getNumberOfSubstitutions(ids[j], i, 0); //totalObs5 += VectorTools::sum(probMapDecDet->getNumberOfSubstitutions(ids[j], i)); } if (tree->isLeaf(ids[j])) cout << tree->getNodeName(ids[j]) << "\t"; cout << tree->getDistanceToFather(ids[j]) << "\t" << totalReal << "\t" << totalObs1 << "\t" << totalObs2 << "\t" << totalObs3 << "\t" << totalObs4 << "\t" << totalObs5 << endl; if (abs(totalReal - totalObs1) / totalReal > 0.1) return 1; if (abs(totalReal - totalObs2) / totalReal > 0.1) return 1; if (abs(totalReal - totalObs3) / totalReal > 0.1) return 1; if (abs(totalReal - totalObs4) / totalReal > 0.1) return 1; } //2. Detail: for (unsigned int j = 0; j < ids.size(); ++j) { vector<double> real(4, 0); vector<double> obs1(4, 0); vector<double> obs2(4, 0); for (unsigned int i = 0; i < n; ++i) { real += realMapDetailed[i][j]; //VectorTools::print(real); //vector<double> c = probMapDet->getNumberOfSubstitutions(ids[j], i); //VectorTools::print(c); //obs1 += probMapDet->getNumberOfSubstitutions(ids[j], i); //obs2 += probMapDecDet->getNumberOfSubstitutions(ids[j], i); } if (tree->isLeaf(ids[j])) cout << tree->getNodeName(ids[j]) << "\t"; cout << tree->getDistanceToFather(ids[j]) << "\t"; for (unsigned int t = 0; t < 4; ++t) { cout << obs1[t] << "/" << real[t] << "\t"; cout << obs2[t] << "/" << real[t] << "\t"; } cout << endl; //if (abs(totalReal - totalObs) / totalReal > 0.1) return 1; } */ //------------- delete tree; delete alphabet; delete model; delete rdist; delete sCountTot; delete sCountDnDs; delete probMapAna; delete probMapTot; delete probMapDnDs; delete probMapUniTot; delete probMapUniDnDs; //return (abs(obs - 0.001) < 0.001 ? 0 : 1); return 0; }
int main() { TreeTemplate<Node>* tree = TreeTemplateTools::parenthesisToTree("(((A:0.1, B:0.2):0.3,C:0.1):0.2,(D:0.3,(E:0.2,F:0.05):0.1):0.1);"); vector<string> seqNames= tree->getLeavesNames(); vector<int> ids = tree->getNodesId(); //------------- const NucleicAlphabet* alphabet = &AlphabetTools::DNA_ALPHABET; FrequenciesSet* rootFreqs = new GCFrequenciesSet(alphabet); SubstitutionModel* model = new T92(alphabet, 3.); std::vector<std::string> globalParameterNames; globalParameterNames.push_back("T92.kappa"); map<string, string> alias; SubstitutionModelSet* modelSet = SubstitutionModelSetTools::createNonHomogeneousModelSet(model, rootFreqs, tree, alias, globalParameterNames); //DiscreteDistribution* rdist = new ConstantDistribution(1.0, true); //Very difficult to optimize on small datasets: DiscreteDistribution* rdist = new GammaDiscreteRateDistribution(4, 1.0); size_t nsites = 1000; unsigned int nrep = 20; size_t nmodels = modelSet->getNumberOfModels(); vector<double> thetas(nmodels); vector<double> thetasEst1(nmodels); vector<double> thetasEst2(nmodels); for (size_t i = 0; i < nmodels; ++i) { double theta = RandomTools::giveRandomNumberBetweenZeroAndEntry(0.99) + 0.005; cout << "Theta" << i << " set to " << theta << endl; modelSet->setParameterValue("T92.theta_" + TextTools::toString(i + 1), theta); thetas[i] = theta; } NonHomogeneousSequenceSimulator simulator(modelSet, rdist, tree); for (unsigned int j = 0; j < nrep; j++) { OutputStream* profiler = new StlOutputStream(new ofstream("profile.txt", ios::out)); OutputStream* messenger = new StlOutputStream(new ofstream("messages.txt", ios::out)); //Simulate data: auto_ptr<SiteContainer> sites(simulator.simulate(nsites)); //Now fit model: auto_ptr<SubstitutionModelSet> modelSet2(modelSet->clone()); auto_ptr<SubstitutionModelSet> modelSet3(modelSet->clone()); RNonHomogeneousTreeLikelihood tl(*tree, *sites.get(), modelSet2.get(), rdist, true, true, false); tl.initialize(); RNonHomogeneousTreeLikelihood tl2(*tree, *sites.get(), modelSet3.get(), rdist, true, true, true); tl2.initialize(); unsigned int c1 = OptimizationTools::optimizeNumericalParameters2( &tl, tl.getParameters(), 0, 0.0001, 10000, messenger, profiler, false, false, 1, OptimizationTools::OPTIMIZATION_NEWTON); unsigned int c2 = OptimizationTools::optimizeNumericalParameters2( &tl2, tl2.getParameters(), 0, 0.0001, 10000, messenger, profiler, false, false, 1, OptimizationTools::OPTIMIZATION_NEWTON); cout << c1 << ": " << tl.getValue() << "\t" << c2 << ": " << tl2.getValue() << endl; for (size_t i = 0; i < nmodels; ++i) { cout << modelSet2->getModel(i)->getParameter("theta").getValue() << "\t" << modelSet3->getModel(i)->getParameter("theta").getValue() << endl; //if (abs(modelSet2->getModel(i)->getParameter("theta").getValue() - modelSet3->getModel(i)->getParameter("theta").getValue()) > 0.1) // return 1; thetasEst1[i] += modelSet2->getModel(i)->getParameter("theta").getValue(); thetasEst2[i] += modelSet3->getModel(i)->getParameter("theta").getValue(); } } thetasEst1 /= static_cast<double>(nrep); thetasEst2 /= static_cast<double>(nrep); //Now compare estimated values to real ones: for (size_t i = 0; i < thetas.size(); ++i) { cout << thetas[i] << "\t" << thetasEst1[i] << "\t" << thetasEst2[i] << endl; double diff1 = abs(thetas[i] - thetasEst1[i]); double diff2 = abs(thetas[i] - thetasEst2[i]); if (diff1 > 0.2 || diff2 > 0.2) return 1; } //------------- delete tree; delete modelSet; delete rdist; return 0; }
int main() { TreeTemplate<Node>* tree = TreeTemplateTools::parenthesisToTree("(((A:0.1, B:0.2):0.3,C:0.1):0.2,(D:0.3,(E:0.2,F:0.05):0.1):0.1);"); vector<string> seqNames= tree->getLeavesNames(); vector<int> ids = tree->getNodesId(); //------------- const NucleicAlphabet* alphabet = &AlphabetTools::DNA_ALPHABET; FrequenciesSet* rootFreqs = new GCFrequenciesSet(alphabet); SubstitutionModel* model = new T92(alphabet, 3.); std::vector<std::string> globalParameterNames; globalParameterNames.push_back("T92.kappa"); //Very difficult to optimize on small datasets: DiscreteDistribution* rdist = new GammaDiscreteRateDistribution(4, 1.0); ParametrizableTree* parTree = new ParametrizableTree(*tree); FrequenciesSet* rootFreqs2 = rootFreqs->clone(); DiscreteDistribution* rdist2 = rdist->clone(); SubstitutionModel* model2=model->clone(); map<string, string> alias; SubstitutionModelSet* modelSet = SubstitutionModelSetTools::createNonHomogeneousModelSet(model, rootFreqs, tree, alias, globalParameterNames); unique_ptr<SubstitutionModelSet> modelSetSim(modelSet->clone()); NonHomogeneousSubstitutionProcess* subPro= NonHomogeneousSubstitutionProcess::createNonHomogeneousSubstitutionProcess(model2, rdist2, rootFreqs2, parTree, globalParameterNames); // Simulation size_t nsites = 1000; unsigned int nrep = 20; size_t nmodels = modelSet->getNumberOfModels(); vector<double> thetas(nmodels); vector<double> thetasEst1(nmodels); vector<double> thetasEst2(nmodels); vector<double> thetasEst1n(nmodels); vector<double> thetasEst2n(nmodels); for (size_t i = 0; i < nmodels; ++i) { double theta = RandomTools::giveRandomNumberBetweenZeroAndEntry(0.99) + 0.005; cout << "Theta" << i << " set to " << theta << endl; modelSetSim->setParameterValue("T92.theta_" + TextTools::toString(i + 1), theta); //subPro->setParameterValue("T92.theta_" + TextTools::toString(i + 1), theta); thetas[i] = theta; } NonHomogeneousSequenceSimulator simulator(modelSetSim.get(), rdist, tree); NonHomogeneousSubstitutionProcess* subPro2 = subPro->clone(); for (unsigned int j = 0; j < nrep; j++) { OutputStream* profiler = new StlOutputStream(new ofstream("profile.txt", ios::out)); OutputStream* messenger = new StlOutputStream(new ofstream("messages.txt", ios::out)); //Simulate data: unique_ptr<SiteContainer> sites(simulator.simulate(nsites)); //Now fit model: unique_ptr<SubstitutionModelSet> modelSet2(modelSet->clone()); RNonHomogeneousTreeLikelihood tl(*tree, *sites.get(), modelSet, rdist, true, true, false); tl.initialize(); RNonHomogeneousTreeLikelihood tl2(*tree, *sites.get(), modelSet2.get(), rdist, true, true, true); tl2.initialize(); SubstitutionProcess* nsubPro=subPro->clone(); SubstitutionProcess* nsubPro2=subPro2->clone(); RecursiveLikelihoodTreeCalculation* tlComp = new RecursiveLikelihoodTreeCalculation(*sites->clone(), nsubPro, true, false); SingleProcessPhyloLikelihood ntl(nsubPro, tlComp, true); RecursiveLikelihoodTreeCalculation* tlComp2 = new RecursiveLikelihoodTreeCalculation(*sites->clone(), nsubPro2, true); SingleProcessPhyloLikelihood ntl2(nsubPro2, tlComp2, true); for (size_t i = 0; i < nmodels; ++i) { ntl.setParameterValue("T92.theta_" + TextTools::toString(i + 1), thetas[i]); ntl2.setParameterValue("T92.theta_" + TextTools::toString(i + 1), thetas[i]); } cout << setprecision(10) << "OldTL init: " << tl.getValue() << "\t" << tl2.getValue() << endl; cout << setprecision(10) << "NewTL init: " << ntl.getValue() << "\t" << ntl2.getValue() << endl; unsigned int c1 = OptimizationTools::optimizeNumericalParameters2( &tl, tl.getParameters(), 0, 0.0001, 10000, messenger, profiler, false, false, 1, OptimizationTools::OPTIMIZATION_NEWTON); unsigned int c2 = OptimizationTools::optimizeNumericalParameters2( &tl2, tl2.getParameters(), 0, 0.0001, 10000, messenger, profiler, false, false, 1, OptimizationTools::OPTIMIZATION_NEWTON); unsigned int nc1 = OptimizationTools::optimizeNumericalParameters2( &ntl, ntl.getParameters(), 0, 0.0001, 10000, messenger, profiler, false, false, 1, OptimizationTools::OPTIMIZATION_NEWTON); unsigned int nc2 = OptimizationTools::optimizeNumericalParameters2( &ntl2, ntl2.getParameters(), 0, 0.0001, 10000, messenger, profiler, false, false, 1, OptimizationTools::OPTIMIZATION_NEWTON); cout << "OldTL: " << c1 << ": " << tl.getValue() << "\t" << c2 << ": " << tl2.getValue() << endl; cout << "NewTL: " << nc1 << ": " << ntl.getValue() << "\t" << nc2 << ": " << ntl2.getValue() << endl; cout << "Thetas : " << endl; for (size_t i = 0; i < nmodels; ++i) { // cerr << modelSet->getModel(i)->getParameter("theta").getValue() << "\t" << modelSet2->getModel(i)->getParameter("theta").getValue(); // cerr << "\t" << subPro->getModel(i)->getParameter("theta").getValue() << "\t" << subPro2->getModel(i)->getParameter("theta").getValue() << endl; // if (abs(modelSet2->getModel(i)->getParameter("theta").getValue() - modelSet3->getModel(i)->getParameter("theta").getValue()) > 0.1) // return 1; thetasEst1[i] += modelSet->getModel(i)->getParameter("theta").getValue(); thetasEst2[i] += modelSet2->getModel(i)->getParameter("theta").getValue(); thetasEst1n[i] += dynamic_cast< NonHomogeneousSubstitutionProcess*>(nsubPro)->getModel(i)->getParameter("theta").getValue(); thetasEst2n[i] += dynamic_cast< NonHomogeneousSubstitutionProcess*>(nsubPro2)->getModel(i)->getParameter("theta").getValue(); } } thetasEst1 /= static_cast<double>(nrep); thetasEst2 /= static_cast<double>(nrep); thetasEst1n /= static_cast<double>(nrep); thetasEst2n /= static_cast<double>(nrep); //Now compare estimated values to real ones: cout << "Real" << "\t" << "Est_Old1" << "\t" << "Est_Old2" << "\t"; cout << "Est_New1" << "\t" << "Est_New2" << endl; for (size_t i = 0; i < thetas.size(); ++i) { cout << thetas[i] << "\t" << thetasEst1[i] << "\t" << thetasEst2[i] << "\t"; cout << thetasEst1n[i] << "\t" << thetasEst2n[i] << endl; double diff1 = abs(thetas[i] - thetasEst1[i]); double diff2 = abs(thetas[i] - thetasEst2[i]); double diffn1 = abs(thetas[i] - thetasEst1n[i]); double diffn2 = abs(thetas[i] - thetasEst2n[i]); if (diff1 > 0.2 || diff2 > 0.2 || diffn1 > 0.2 || diffn2 > 0.2) return 1; } return 0; }