int main() { TreeTemplate<Node>* tree = TreeTemplateTools::parenthesisToTree("((A:0.01, B:0.02):0.03,C:0.01,D:0.1);"); vector<string> seqNames= tree->getLeavesNames(); vector<int> ids = tree->getNodesId(); //------------- NucleicAlphabet* alphabet = new DNA(); SubstitutionModel* model = new T92(alphabet, 3.); FrequenciesSet* rootFreqs = new GCFrequenciesSet(alphabet); std::vector<std::string> globalParameterNames; globalParameterNames.push_back("T92.kappa"); map<string, string> alias; SubstitutionModelSet* modelSet = SubstitutionModelSetTools::createNonHomogeneousModelSet(model, rootFreqs, tree, alias, globalParameterNames); DiscreteDistribution* rdist = new ConstantRateDistribution(); vector<double> thetas; for (unsigned int i = 0; i < modelSet->getNumberOfModels(); ++i) { double theta = RandomTools::giveRandomNumberBetweenZeroAndEntry(0.99) + 0.005; cout << "Theta" << i << " set to " << theta << endl; modelSet->setParameterValue("T92.theta_" + TextTools::toString(i + 1), theta); thetas.push_back(theta); } NonHomogeneousSequenceSimulator simulator(modelSet, rdist, tree); unsigned int n = 100000; OutputStream* profiler = new StlOutputStream(new ofstream("profile.txt", ios::out)); OutputStream* messenger = new StlOutputStream(new ofstream("messages.txt", ios::out)); //Check fast simulation first: cout << "Fast check:" << endl; //Generate data set: VectorSiteContainer sites(seqNames, alphabet); for (unsigned int i = 0; i < n; ++i) { auto_ptr<Site> site(simulator.simulateSite()); site->setPosition(static_cast<int>(i)); sites.addSite(*site, false); } //Now fit model: SubstitutionModelSet* modelSet2 = modelSet->clone(); RNonHomogeneousTreeLikelihood tl(*tree, sites, modelSet2, rdist); tl.initialize(); OptimizationTools::optimizeNumericalParameters2( &tl, tl.getParameters(), 0, 0.0001, 10000, messenger, profiler, false, false, 1, OptimizationTools::OPTIMIZATION_NEWTON); //Now compare estimated values to real ones: for (size_t i = 0; i < thetas.size(); ++i) { cout << thetas[i] << "\t" << modelSet2->getModel(i)->getParameter("theta").getValue() << endl; double diff = abs(thetas[i] - modelSet2->getModel(i)->getParameter("theta").getValue()); if (diff > 0.1) return 1; } delete modelSet2; //Now try detailed simulations: cout << "Detailed check:" << endl; //Generate data set: VectorSiteContainer sites2(seqNames, alphabet); for (unsigned int i = 0; i < n; ++i) { RASiteSimulationResult* result = simulator.dSimulateSite(); auto_ptr<Site> site(result->getSite(*simulator.getSubstitutionModelSet()->getModel(0))); site->setPosition(static_cast<int>(i)); sites2.addSite(*site, false); delete result; } //Now fit model: SubstitutionModelSet* modelSet3 = modelSet->clone(); RNonHomogeneousTreeLikelihood tl2(*tree, sites2, modelSet3, rdist); tl2.initialize(); OptimizationTools::optimizeNumericalParameters2( &tl2, tl2.getParameters(), 0, 0.0001, 10000, messenger, profiler, false, false, 1, OptimizationTools::OPTIMIZATION_NEWTON); //Now compare estimated values to real ones: for (size_t i = 0; i < thetas.size(); ++i) { cout << thetas[i] << "\t" << modelSet3->getModel(i)->getParameter("theta").getValue() << endl; double diff = abs(thetas[i] - modelSet3->getModel(i)->getParameter("theta").getValue()); if (diff > 0.1) return 1; } delete modelSet3; //------------- delete tree; delete alphabet; delete modelSet; delete rdist; return 0; }
int main() { TreeTemplate<Node>* tree = TreeTemplateTools::parenthesisToTree("(((A:0.1, B:0.2):0.3,C:0.1):0.2,(D:0.3,(E:0.2,F:0.05):0.1):0.1);"); vector<string> seqNames= tree->getLeavesNames(); vector<int> ids = tree->getNodesId(); //------------- const NucleicAlphabet* alphabet = &AlphabetTools::DNA_ALPHABET; FrequenciesSet* rootFreqs = new GCFrequenciesSet(alphabet); SubstitutionModel* model = new T92(alphabet, 3.); std::vector<std::string> globalParameterNames; globalParameterNames.push_back("T92.kappa"); map<string, string> alias; SubstitutionModelSet* modelSet = SubstitutionModelSetTools::createNonHomogeneousModelSet(model, rootFreqs, tree, alias, globalParameterNames); //DiscreteDistribution* rdist = new ConstantDistribution(1.0, true); //Very difficult to optimize on small datasets: DiscreteDistribution* rdist = new GammaDiscreteRateDistribution(4, 1.0); size_t nsites = 1000; unsigned int nrep = 20; size_t nmodels = modelSet->getNumberOfModels(); vector<double> thetas(nmodels); vector<double> thetasEst1(nmodels); vector<double> thetasEst2(nmodels); for (size_t i = 0; i < nmodels; ++i) { double theta = RandomTools::giveRandomNumberBetweenZeroAndEntry(0.99) + 0.005; cout << "Theta" << i << " set to " << theta << endl; modelSet->setParameterValue("T92.theta_" + TextTools::toString(i + 1), theta); thetas[i] = theta; } NonHomogeneousSequenceSimulator simulator(modelSet, rdist, tree); for (unsigned int j = 0; j < nrep; j++) { OutputStream* profiler = new StlOutputStream(new ofstream("profile.txt", ios::out)); OutputStream* messenger = new StlOutputStream(new ofstream("messages.txt", ios::out)); //Simulate data: auto_ptr<SiteContainer> sites(simulator.simulate(nsites)); //Now fit model: auto_ptr<SubstitutionModelSet> modelSet2(modelSet->clone()); auto_ptr<SubstitutionModelSet> modelSet3(modelSet->clone()); RNonHomogeneousTreeLikelihood tl(*tree, *sites.get(), modelSet2.get(), rdist, true, true, false); tl.initialize(); RNonHomogeneousTreeLikelihood tl2(*tree, *sites.get(), modelSet3.get(), rdist, true, true, true); tl2.initialize(); unsigned int c1 = OptimizationTools::optimizeNumericalParameters2( &tl, tl.getParameters(), 0, 0.0001, 10000, messenger, profiler, false, false, 1, OptimizationTools::OPTIMIZATION_NEWTON); unsigned int c2 = OptimizationTools::optimizeNumericalParameters2( &tl2, tl2.getParameters(), 0, 0.0001, 10000, messenger, profiler, false, false, 1, OptimizationTools::OPTIMIZATION_NEWTON); cout << c1 << ": " << tl.getValue() << "\t" << c2 << ": " << tl2.getValue() << endl; for (size_t i = 0; i < nmodels; ++i) { cout << modelSet2->getModel(i)->getParameter("theta").getValue() << "\t" << modelSet3->getModel(i)->getParameter("theta").getValue() << endl; //if (abs(modelSet2->getModel(i)->getParameter("theta").getValue() - modelSet3->getModel(i)->getParameter("theta").getValue()) > 0.1) // return 1; thetasEst1[i] += modelSet2->getModel(i)->getParameter("theta").getValue(); thetasEst2[i] += modelSet3->getModel(i)->getParameter("theta").getValue(); } } thetasEst1 /= static_cast<double>(nrep); thetasEst2 /= static_cast<double>(nrep); //Now compare estimated values to real ones: for (size_t i = 0; i < thetas.size(); ++i) { cout << thetas[i] << "\t" << thetasEst1[i] << "\t" << thetasEst2[i] << endl; double diff1 = abs(thetas[i] - thetasEst1[i]); double diff2 = abs(thetas[i] - thetasEst2[i]); if (diff1 > 0.2 || diff2 > 0.2) return 1; } //------------- delete tree; delete modelSet; delete rdist; return 0; }