/**
 * @brief Compute the log-likelihood of a tree on an RNA alignment under GTR + discrete Gamma.
 *
 * The alignment is read from a Fasta file, gap-only sites are removed and the
 * remaining gaps are turned into unknown characters. The model is a GTR model
 * whose frequencies are set from the data, combined with a discrete Gamma rate
 * distribution built as GammaDiscreteDistribution(8, 1, 1).
 *
 * @param tree          Tree in parenthesis format, as accepted by
 *                      TreeTemplateTools::parenthesisToTree.
 * @param aln_filename  Path of the Fasta alignment file.
 * @param optimize_bls  If true, branch lengths and rate distribution parameters
 *                      are numerically optimized before the value is read.
 * @param tolerance     Tolerance forwarded to the optimizer.
 * @return A pair holding the tree stored in the likelihood object (parenthesis
 *         format) and the log-likelihood (the negated getValue()).
 */
pair<string,scalar_type> tree_LL_nucl(string tree,string aln_filename,bool optimize_bls,scalar_type tolerance)
{
  const Alphabet* alphabet = new RNA();
  Fasta Reader;

  // Both the raw alignment and the parsed tree are owned by this function
  // and are released at the end (they were leaked previously).
  OrderedSequenceContainer* alignment = Reader.read(aln_filename, alphabet);
  VectorSiteContainer* sites = new VectorSiteContainer(*alignment);
  SiteContainerTools::removeGapOnlySites(*sites);
  SiteContainerTools::changeGapsToUnknownCharacters(*sites);

  TreeTemplate<Node>* ttree1 = TreeTemplateTools::parenthesisToTree(tree, false, "ID");

  SubstitutionModel* model = new GTR(&AlphabetTools::RNA_ALPHABET);
  model->setFreqFromData(*sites);
  DiscreteDistribution* rDist = new GammaDiscreteDistribution(8, 1, 1);

  DiscreteRatesAcrossSitesTreeLikelihood* tl1 =
    new RHomogeneousTreeLikelihood(*ttree1, *sites, model, rDist, true, false, false);
  tl1->initialize();

  if (optimize_bls)
  {
    // Newton for branch lengths, BFGS for the remaining parameters.
    ParameterList parameters;
    parameters.addParameters(tl1->getBranchLengthsParameters());
    parameters.addParameters(tl1->getRateDistributionParameters());
    OptimizationTools::optimizeNumericalParameters(
      tl1,
      parameters,
      0,
      1,
      tolerance,
      1000,
      0,
      0,
      false,
      0,
      OptimizationTools::OPTIMIZATION_NEWTON,
      OptimizationTools::OPTIMIZATION_BFGS);
  }

  // getValue() is the minimized function; its opposite is the log-likelihood.
  scalar_type LL = -tl1->getValue();

  pair<string,scalar_type> return_pair;
  return_pair.first = TreeTemplateTools::treeToParenthesis(tl1->getTree());
  return_pair.second = LL;

  // Release the likelihood first: it was built from every other object below.
  delete tl1;
  delete rDist;
  delete model;
  delete ttree1;
  delete sites;
  delete alignment;
  delete alphabet;

  return return_pair;
}
int main(int args, char** argv) { cout << "******************************************************************" << endl; cout << "* Bio++ Computation of site likelihoods inside mixed models *" << endl; cout << "* Version 2.2.0. *" << endl; cout << "* Author: L. Guéguen Last Modif.: 25/09/14 *" << endl; cout << "******************************************************************" << endl; cout << endl; if (args == 1) { help(); return 0; } try { BppApplication bppmixedlikelihoods(args, argv, "BppMixedLikelihoods"); bppmixedlikelihoods.startTimer(); Alphabet* alphabet = SequenceApplicationTools::getAlphabet(bppmixedlikelihoods.getParams(), "", false); auto_ptr<GeneticCode> gCode; CodonAlphabet* codonAlphabet = dynamic_cast<CodonAlphabet*>(alphabet); if (codonAlphabet) { string codeDesc = ApplicationTools::getStringParameter("genetic_code", bppmixedlikelihoods.getParams(), "Standard", "", true, true); ApplicationTools::displayResult("Genetic Code", codeDesc); gCode.reset(SequenceApplicationTools::getGeneticCode(codonAlphabet->getNucleicAlphabet(), codeDesc)); } // get the data VectorSiteContainer* allSites = SequenceApplicationTools::getSiteContainer(alphabet, bppmixedlikelihoods.getParams()); VectorSiteContainer* sites = SequenceApplicationTools::getSitesToAnalyse(*allSites, bppmixedlikelihoods.getParams(), "", true, false); delete allSites; ApplicationTools::displayResult("Number of sequences", TextTools::toString(sites->getNumberOfSequences())); ApplicationTools::displayResult("Number of sites", TextTools::toString(sites->getNumberOfSites())); // Get the tree Tree* tree = PhylogeneticsApplicationTools::getTree(bppmixedlikelihoods.getParams()); ApplicationTools::displayResult("Number of leaves", TextTools::toString(tree->getNumberOfLeaves())); AbstractDiscreteRatesAcrossSitesTreeLikelihood* tl; string nhOpt = ApplicationTools::getStringParameter("nonhomogeneous", bppmixedlikelihoods.getParams(), "no", "", true, false); ApplicationTools::displayResult("Heterogeneous 
model", nhOpt); MixedSubstitutionModel* model = 0; MixedSubstitutionModelSet* modelSet = 0; DiscreteDistribution* rDist = 0; if (nhOpt == "no") { model = dynamic_cast<MixedSubstitutionModel*>(PhylogeneticsApplicationTools::getSubstitutionModel(alphabet, gCode.get(), sites, bppmixedlikelihoods.getParams())); if (model == 0) { cout << "Model is not a Mixed model" << endl; exit(0); } SiteContainerTools::changeGapsToUnknownCharacters(*sites); if (model->getNumberOfStates() > model->getAlphabet()->getSize()) { // Markov-modulated Markov model! rDist = new ConstantRateDistribution(); } else { rDist = PhylogeneticsApplicationTools::getRateDistribution(bppmixedlikelihoods.getParams()); } tl = new RHomogeneousMixedTreeLikelihood(*tree, *sites, model, rDist, true); } else if (nhOpt == "one_per_branch") { model = dynamic_cast<MixedSubstitutionModel*>(PhylogeneticsApplicationTools::getSubstitutionModel(alphabet, gCode.get(), sites, bppmixedlikelihoods.getParams())); if (model == 0) { cout << "Model is not a Mixed model" << endl; exit(0); } SiteContainerTools::changeGapsToUnknownCharacters(*sites); if (model->getNumberOfStates() > model->getAlphabet()->getSize()) { // Markov-modulated Markov model! rDist = new ConstantRateDistribution(); } else { rDist = PhylogeneticsApplicationTools::getRateDistribution(bppmixedlikelihoods.getParams()); } vector<double> rateFreqs; if (model->getNumberOfStates() != alphabet->getSize()) { // Markov-Modulated Markov Model... unsigned int n = (unsigned int)(model->getNumberOfStates() / alphabet->getSize()); rateFreqs = vector<double>(n, 1. / (double)n); // Equal rates assumed for now, may be changed later (actually, in the most general case, // we should assume a rate distribution for the root also!!! 
} std::map<std::string, std::string> aliasFreqNames; FrequenciesSet* rootFreqs = PhylogeneticsApplicationTools::getRootFrequenciesSet(alphabet, gCode.get(), sites, bppmixedlikelihoods.getParams(), aliasFreqNames, rateFreqs); vector<string> globalParameters = ApplicationTools::getVectorParameter<string>("nonhomogeneous_one_per_branch.shared_parameters", bppmixedlikelihoods.getParams(), ',', ""); modelSet = dynamic_cast<MixedSubstitutionModelSet*>(SubstitutionModelSetTools::createNonHomogeneousModelSet(model, rootFreqs, tree, aliasFreqNames, globalParameters)); model = 0; tl = new RNonHomogeneousMixedTreeLikelihood(*tree, *sites, modelSet, rDist, true); } else if (nhOpt == "general") { modelSet = dynamic_cast<MixedSubstitutionModelSet*>(PhylogeneticsApplicationTools::getSubstitutionModelSet(alphabet, gCode.get(), sites, bppmixedlikelihoods.getParams())); if (modelSet == 0) { cout << "Missing a Mixed model" << endl; exit(0); } SiteContainerTools::changeGapsToUnknownCharacters(*sites); if (modelSet->getNumberOfStates() > modelSet->getAlphabet()->getSize()) { // Markov-modulated Markov model! rDist = new ConstantDistribution(1.); } else { rDist = PhylogeneticsApplicationTools::getRateDistribution(bppmixedlikelihoods.getParams()); } tl = new RNonHomogeneousMixedTreeLikelihood(*tree, *sites, modelSet, rDist, true); } else throw Exception("Unknown option for nonhomogeneous: " + nhOpt); tl->initialize(); double logL = tl->getValue(); if (isinf(logL)) { // This may be due to null branch lengths, leading to null likelihood! ApplicationTools::displayWarning("!!! Warning!!! Likelihood is zero."); ApplicationTools::displayWarning("!!! This may be due to branch length == 0."); ApplicationTools::displayWarning("!!! 
All null branch lengths will be set to 0.000001."); ParameterList pl = tl->getBranchLengthsParameters(); for (unsigned int i = 0; i < pl.size(); i++) { if (pl[i].getValue() < 0.000001) pl[i].setValue(0.000001); } tl->matchParametersValues(pl); logL = tl->getValue(); } if (isinf(logL)) { ApplicationTools::displayError("!!! Unexpected likelihood == 0."); ApplicationTools::displayError("!!! Looking at each site:"); for (unsigned int i = 0; i < sites->getNumberOfSites(); i++) { (*ApplicationTools::error << "Site " << sites->getSite(i).getPosition() << "\tlog likelihood = " << tl->getLogLikelihoodForASite(i)).endLine(); } ApplicationTools::displayError("!!! 0 values (inf in log) may be due to computer overflow, particularily if datasets are big (>~500 sequences)."); exit(-1); } // Write parameters to screen: ApplicationTools::displayResult("Log likelihood", TextTools::toString(tl->getValue(), 15)); ParameterList parameters = tl->getSubstitutionModelParameters(); for (unsigned int i = 0; i < parameters.size(); i++) { ApplicationTools::displayResult(parameters[i].getName(), TextTools::toString(parameters[i].getValue())); } parameters = tl->getRateDistributionParameters(); for (unsigned int i = 0; i < parameters.size(); i++) { ApplicationTools::displayResult(parameters[i].getName(), TextTools::toString(parameters[i].getValue())); } // ///////////////////////////////////////////// // Getting likelihoods per submodel string outputFile; outputFile = ApplicationTools::getAFilePath("output.likelihoods.file", bppmixedlikelihoods.getParams(), true, false); ApplicationTools::displayResult("Output file for likelihoods", outputFile); ofstream out(outputFile.c_str(), ios::out); size_t nSites = sites->getNumberOfSites(); size_t nummodel = ApplicationTools::getParameter<size_t>("likelihoods.model_number", bppmixedlikelihoods.getParams(), 1, "", true, true); string parname = ApplicationTools::getStringParameter("likelihoods.parameter_name", bppmixedlikelihoods.getParams(), "", "", true, 
false); if (modelSet && ((nummodel <= 0) || (nummodel > modelSet->getNumberOfModels()))) { ApplicationTools::displayError("Bad number of model " + TextTools::toString(nummodel) + "."); exit(-1); } MixedSubstitutionModel* p0 = dynamic_cast<MixedSubstitutionModel*>(model ? model : modelSet->getModel(nummodel - 1)); if (!p0) { ApplicationTools::displayError("Model " + TextTools::toString(nummodel) + " is not a Mixed Model."); exit(-1); } const AbstractBiblioMixedSubstitutionModel* ptmp = dynamic_cast<const AbstractBiblioMixedSubstitutionModel*>(p0); if (ptmp) { p0 = ptmp->getMixedModel().clone(); if (nhOpt == "no") model = p0; else { modelSet->replaceModel(nummodel-1, p0); modelSet->isFullySetUpFor(*tree); } } ////////////////////////////////////////////////// // Case of a MixtureOfSubstitutionModels MixtureOfSubstitutionModels* pMSM = dynamic_cast<MixtureOfSubstitutionModels*>(p0); if (pMSM) { vector<string> colNames; colNames.push_back("Sites"); size_t nummod = pMSM->getNumberOfModels(); for (unsigned int i = 0; i < nummod; i++) { colNames.push_back(pMSM->getNModel(i)->getName()); } DataTable* rates = new DataTable(nSites, colNames.size()); rates->setColumnNames(colNames); for (unsigned int i = 0; i < nSites; i++) { const Site* currentSite = &sites->getSite(i); int currentSitePosition = currentSite->getPosition(); (*rates)(i, "Sites") = string("[" + TextTools::toString(currentSitePosition) + "]"); } Vdouble vprob = pMSM->getProbabilities(); for (unsigned int i = 0; i < nummod; i++) { string modname = pMSM->getNModel(i)->getName(); for (unsigned int j = 0; j < nummod; j++) { pMSM->setNProbability(j, (j == i) ? 
1 : 0); } if (tl) delete tl; if (nhOpt == "no") tl = new RHomogeneousMixedTreeLikelihood(*tree, *sites, model, rDist, true, false, true); else tl = new RNonHomogeneousMixedTreeLikelihood(*tree, *sites, modelSet, rDist, false, true); tl->initialize(); logL = tl->getValue(); Vdouble Vd = tl->getLogLikelihoodForEachSite(); for (unsigned int j = 0; j < nSites; j++) { (*rates)(j, modname) = TextTools::toString(Vd[j]); } ApplicationTools::displayMessage("\n"); ApplicationTools::displayMessage("Model " + modname + ":"); ApplicationTools::displayResult("Log likelihood", TextTools::toString(tl->getValue(), 15)); ApplicationTools::displayResult("Probability", TextTools::toString(vprob[i], 15)); } DataTable::write(*rates, out, "\t"); } ////////////////////////////////////////////////// // Case of a MixtureOfASubstitutionModel else { MixtureOfASubstitutionModel* pMSM2 = dynamic_cast<MixtureOfASubstitutionModel*>(p0); if (pMSM2 != NULL) { size_t nummod = pMSM2->getNumberOfModels(); if (parname == "") { ParameterList pl=pMSM2->getParameters(); for (size_t i2 = 0; i2 < pl.size(); i2++) { string pl2n = pl[i2].getName(); if (dynamic_cast<const ConstantDistribution*>(pMSM2->getDistribution(pl2n))==NULL) { parname=pl2n; while (parname.size()>0 && pMSM2->getDistribution(parname)==NULL) parname=pl2n.substr(0,pl2n.rfind("_")); if (parname.size()>0){ ApplicationTools::displayResult("likelihoods.parameter_name", parname); break; } } } } if (parname == "") { ApplicationTools::displayError("Argument likelihoods.parameter_name is required."); exit(-1); } vector< Vint > vvnmod; size_t i2 = 0; while (i2 < nummod) { string par2 = parname + "_" + TextTools::toString(i2 + 1); Vint vnmod = pMSM2->getSubmodelNumbers(par2); if (vnmod.size() == 0) break; vvnmod.push_back(vnmod); i2++; } size_t nbcl = vvnmod.size(); if (nbcl==0) throw Exception("Parameter " + parname + " is not mixed."); Vdouble vprob = pMSM2->getProbabilities(); vector<vector<double> > vvprob; vector<double> vsprob; for (size_t i = 
0; i < nbcl; i++) { vector<double> vprob2; for (size_t j = 0; j < vvnmod[i].size(); j++) { vprob2.push_back(vprob[static_cast<size_t>(vvnmod[i][j])]); } vvprob.push_back(vprob2); vsprob.push_back(VectorTools::sum(vvprob[i])); } vector<string> colNames; colNames.push_back("Sites"); Vdouble dval; for (unsigned int i = 0; i < nbcl; i++) { SubstitutionModel* pSM = pMSM2->getNModel(static_cast<size_t>(vvnmod[i][0])); double valPar = pSM->getParameterValue(pSM->getParameterNameWithoutNamespace(parname)); dval.push_back(valPar); colNames.push_back("Ll_" + parname + "=" + TextTools::toString(valPar)); } for (unsigned int i = 0; i < nbcl; i++) colNames.push_back("Pr_" + parname + "=" + TextTools::toString(dval[i])); colNames.push_back("mean"); DataTable* rates = new DataTable(nSites, colNames.size()); rates->setColumnNames(colNames); for (unsigned int i = 0; i < nSites; i++) { const Site* currentSite = &sites->getSite(i); int currentSitePosition = currentSite->getPosition(); (*rates)(i,"Sites")=TextTools::toString(currentSitePosition); } VVdouble vvd; vector<double> vRates = pMSM2->getVRates(); for (size_t i = 0; i < nbcl; ++i) { string par2 = parname + "_" + TextTools::toString(i + 1); for (unsigned int j = 0; j < nummod; ++j) pMSM2->setNProbability(j, 0); for (size_t j = 0; j < vvprob[i].size(); ++j) pMSM2->setNProbability(static_cast<size_t>(vvnmod[i][j]), vvprob[i][j] / vsprob[i]); if (tl) delete tl; if (nhOpt == "no") tl = new RHomogeneousMixedTreeLikelihood(*tree, *sites, model, rDist, true, false, true); else tl = new RNonHomogeneousMixedTreeLikelihood(*tree, *sites, modelSet, rDist, false, true); tl->initialize(); logL = tl->getValue(); Vdouble vd = tl->getLogLikelihoodForEachSite(); for (unsigned int j = 0; j < nSites; j++) (*rates)(j, i + 1) = TextTools::toString(vd[j]); vvd.push_back(vd); ApplicationTools::displayMessage("\n"); ApplicationTools::displayMessage("Parameter " + par2 + "=" + TextTools::toString(dval[i]) + " with rate=" + 
TextTools::toString(vRates[i])); ApplicationTools::displayResult("Log likelihood", TextTools::toString(tl->getValue(), 15)); ApplicationTools::displayResult("Probability", TextTools::toString(vsprob[i], 15)); } for (unsigned int j = 0; j < nSites; j++) { Vdouble vd; for (unsigned int i = 0; i < nbcl; i++) vd.push_back(std::log(vsprob[i])+vvd[i][j]); VectorTools::logNorm(vd); for (unsigned int i = 0; i < nbcl; i++) (*rates)(j,nbcl + i + 1) = TextTools::toString(std::exp(vd[i])); (*rates)(j, 2 * nbcl + 1) = TextTools::toString(VectorTools::sumExp(vd, dval)); } DataTable::write(*rates, out, "\t"); } } delete alphabet; delete sites; if (model) delete model; if (modelSet) delete modelSet; delete rDist; delete tl; delete tree; ApplicationTools::displayMessage("\n"); bppmixedlikelihoods.done(); } catch (exception& e) { cout << e.what() << endl; return 1; } return 0; }
/**
 * @brief Compute the log-likelihood of a tree on a protein alignment under LG08 + discrete Gamma.
 *
 * The alignment is read from a Fasta file and its gaps are turned into unknown
 * characters. The model is LG08 with a FullProteinFrequenciesSet, frequencies
 * being set from the data, combined with a discrete Gamma rate distribution
 * built as GammaDiscreteDistribution(4, 1, 1).
 *
 * @param tree          Tree in parenthesis format, as accepted by
 *                      TreeTemplateTools::parenthesisToTree.
 * @param aln_filename  Path of the Fasta alignment file.
 * @param optimize_bls  If true, branch lengths and rate distribution parameters
 *                      are numerically optimized before the value is read.
 * @param tolerance     Tolerance forwarded to the optimizer.
 * @return The log-likelihood (the negated getValue() of the likelihood object).
 */
scalar_type tree_LL(string tree,string aln_filename,bool optimize_bls,scalar_type tolerance)
{
  const Alphabet* alphabet = new ProteicAlphabet();
  Fasta Reader;

  // Both the raw alignment and the parsed tree are owned by this function
  // and are released at the end (they were leaked previously).
  OrderedSequenceContainer* alignment = Reader.read(aln_filename, alphabet);
  VectorSiteContainer* sites = new VectorSiteContainer(*alignment);
  SiteContainerTools::changeGapsToUnknownCharacters(*sites);

  TreeTemplate<Node>* ttree1 = TreeTemplateTools::parenthesisToTree(tree, false, "ID");

  SubstitutionModel* model = new LG08(&AlphabetTools::PROTEIN_ALPHABET, new FullProteinFrequenciesSet(&AlphabetTools::PROTEIN_ALPHABET), true);
  model->setFreqFromData(*sites);
  DiscreteDistribution* rDist = new GammaDiscreteDistribution(4, 1, 1);

  DiscreteRatesAcrossSitesTreeLikelihood* tl1 =
    new RHomogeneousTreeLikelihood(*ttree1, *sites, model, rDist, true, false, false);
  tl1->initialize();

  // Note: an earlier, hand-driven PseudoNewtonOptimizer version of the
  // optimization below was kept here as commented-out code; it has been
  // superseded by the OptimizationTools call.
  if (optimize_bls)
  {
    // Newton for branch lengths, BFGS for the remaining parameters.
    ParameterList parameters;
    parameters.addParameters(tl1->getBranchLengthsParameters());
    parameters.addParameters(tl1->getRateDistributionParameters());
    OptimizationTools::optimizeNumericalParameters(
      tl1,
      parameters,
      0,
      1,
      tolerance,
      1000,
      0,
      0,
      false,
      0,
      OptimizationTools::OPTIMIZATION_NEWTON,
      OptimizationTools::OPTIMIZATION_BFGS);
  }

  // getValue() is the minimized function; its opposite is the log-likelihood.
  scalar_type LL = -tl1->getValue();

  // Release the likelihood first: it was built from every other object below.
  delete tl1;
  delete rDist;
  delete model;
  delete ttree1;
  delete sites;
  delete alignment;
  delete alphabet;

  return LL;
}
/**
 * Rebuild the word-level matrices from the position-specific models.
 *
 * In order, this method:
 *  1. pushes the current parameter values to the models in VSubMod_;
 *  2. fills the off-diagonal entries of generator_ from each position model's
 *     generator, scaled by Vrate_, then calls completeMatrices() and sets the
 *     diagonal so that every row sums to zero;
 *  3. if enableEigenDecomposition() is true, computes eigen values/vectors
 *     (after removing the rows/columns whose off-diagonal entries are all
 *     negligible), derives freq_ from a null eigenvector, and falls back to a
 *     Taylor series when that fails; otherwise computes freq_ as a product of
 *     the position models' frequencies;
 *  4. fills exchangeability_ as generator_(i, j) / freq_[j].
 */
void AbstractWordSubstitutionModel::updateMatrices()
{
  // First we update position specific models. This need to be done
  // here and not in fireParameterChanged, as some parameter aliases
  // might have been defined and need to be resolved first.
  // When there is a single model, or the first two entries point to the same
  // model, only VSubMod_[0] is updated.
  if (VSubMod_.size() < 2 || VSubMod_[0] == VSubMod_[1])
    VSubMod_[0]->matchParametersValues(getParameters());
  else
    for (size_t i = 0; i < VSubMod_.size(); i++)
    {
      VSubMod_[i]->matchParametersValues(getParameters());
    }

  size_t nbmod = VSubMod_.size();
  size_t salph = getNumberOfStates();
  size_t nbStop = 0; // number of rows flagged in vnull
  vector<bool> vnull; // vector of the indices of lines with only zeros

  // Generator

  size_t i, j, n, l, k, m;

  // Number of states of each position model.
  vector<size_t> vsize;

  for (k = 0; k < nbmod; k++)
  {
    vsize.push_back(VSubMod_[k]->getNumberOfStates());
  }

  RowMatrix<double> gk, exch;

  // m is the product of the sizes of the positions already processed, i.e.
  // those to the right of position k - 1 (positions are visited last to first).
  m = 1;

  for (k = nbmod; k > 0; k--)
  {
    gk = VSubMod_[k - 1]->getGenerator();
    for (i = 0; i < vsize[k - 1]; i++)
    {
      for (j = 0; j < vsize[k - 1]; j++)
      {
        if (i != j)
        {
          n = 0;
          while (n < salph)
          { // loop on prefix
            for (l = 0; l < m; l++)
            { // loop on suffix
              // Word states that differ only at position k - 1 (i -> j).
              generator_(n + i * m + l, n + j * m + l) = gk(i, j) * Vrate_[k - 1];
            }
            n += m * vsize[k - 1];
          }
        }
      }
    }
    m *= vsize[k - 1];
  }

  // modification of generator_

  this->completeMatrices();

  double x;

  // Diagonal entries: minus the sum of the other entries of the row.
  for (i = 0; i < salph; i++)
  {
    x = 0;
    for (j = 0; j < salph; j++)
    {
      if (j != i)
        x += generator_(i, j);
    }
    generator_(i, i) = -x;
  }

  // at that point generator_ and freq_ are done for models without
  // enableEigenDecomposition

  // Eigen values:

  if (enableEigenDecomposition())
  {
    // Flag every row whose off-diagonal entries are all below TINY in
    // absolute value (presumably stop states, given the name nbStop - TODO confirm).
    for (i = 0; i < salph; i++)
    {
      bool flag = true;
      for (j = 0; j < salph; j++)
      {
        if ((i != j) && abs(generator_(i, j)) > NumConstants::TINY())
        {
          flag = false;
          break;
        }
      }
      if (flag)
        nbStop++;
      vnull.push_back(flag);
    }

    if (nbStop != 0)
    {
      // gi / gj count the flagged rows / columns skipped so far.
      size_t gi = 0, gj = 0;

      // gk is reused to hold generator_ without the flagged rows and columns.
      gk.resize(salph - nbStop, salph - nbStop);
      for (i = 0; i < salph; i++)
      {
        if (!vnull[i])
        {
          gj = 0;
          for (j = 0; j < salph; j++)
          {
            if (!vnull[j])
            {
              gk(i - gi, j - gj) = generator_(i, j);
            }
            else
              gj++;
          }
        }
        else
          gi++;
      }

      EigenValue<double> ev(gk);
      eigenValues_ = ev.getRealEigenValues();
      iEigenValues_ = ev.getImagEigenValues();

      // One extra zero eigenvalue per flagged row, appended at the end.
      for (i = 0; i < nbStop; i++)
      {
        eigenValues_.push_back(0);
        iEigenValues_.push_back(0);
      }

      // Expand the reduced eigenvectors back to salph x salph: a flagged row
      // gets a single 1 in one of the last nbStop columns; other rows copy
      // the reduced eigenvectors and are zero in the last nbStop columns.
      RowMatrix<double> rev = ev.getV();
      rightEigenVectors_.resize(salph, salph);
      gi = 0;
      for (i = 0; i < salph; i++)
      {
        if (vnull[i])
        {
          gi++;
          for (j = 0; j < salph; j++)
          {
            rightEigenVectors_(i, j) = 0;
          }
          rightEigenVectors_(i, salph - nbStop + gi - 1) = 1;
        }
        else
        {
          for (j = 0; j < salph - nbStop; j++)
          {
            rightEigenVectors_(i, j) = rev(i - gi, j);
          }
          for (j = salph - nbStop; j < salph; j++)
          {
            rightEigenVectors_(i, j) = 0;
          }
        }
      }
    }
    else
    {
      // No flagged row: decompose the full generator.
      EigenValue<double> ev(generator_);
      eigenValues_ = ev.getRealEigenValues();
      iEigenValues_ = ev.getImagEigenValues();
      rightEigenVectors_ = ev.getV();
      nbStop = 0;
    }

    try
    {
      MatrixTools::inv(rightEigenVectors_, leftEigenVectors_);

      // is it diagonalizable ?
      // (not if any eigenvalue has a non negligible imaginary part)
      // NOTE(review): this loop is bounded by size_ while the others use
      // salph - presumably both are equal; confirm.

      isDiagonalizable_ = true;
      for (i = 0; i < size_ && isDiagonalizable_; i++)
      {
        if (abs(iEigenValues_[i]) > NumConstants::SMALL())
          isDiagonalizable_ = false;
      }

      // is it singular?

      // looking for the 0 eigenvector for which the non-stop right
      // eigen vector elements are equal.
      //

      size_t nulleigen = 0;
      double val;
      isNonSingular_ = false;

      while (nulleigen < salph - nbStop)
      {
        if ((abs(eigenValues_[nulleigen]) < NumConstants::SMALL()) && (abs(iEigenValues_[nulleigen]) < NumConstants::SMALL()))
        {
          // First non flagged row gives the reference value.
          i = 0;
          while (vnull[i])
            i++;

          val = rightEigenVectors_(i, nulleigen);
          i++;

          // Stop early at the first non flagged row that differs from val.
          while (i < salph)
          {
            if (!vnull[i])
            {
              if (abs(rightEigenVectors_(i, nulleigen) - val) > NumConstants::SMALL())
                break;
            }
            i++;
          }

          if (i < salph)
            nulleigen++;
          else
          {
            isNonSingular_ = true;
            break;
          }
        }
        else
          nulleigen++;
      }

      if (isNonSingular_)
      {
        eigenValues_[nulleigen] = 0; // to avoid approximation errors on long long branches
        iEigenValues_[nulleigen] = 0; // to avoid approximation errors on long long branches

        // freq_ is the matching left eigenvector, normalized to sum to 1.
        for (i = 0; i < salph; i++)
          freq_[i] = leftEigenVectors_(nulleigen, i);

        x = 0;
        for (i = 0; i < salph; i++)
          x += freq_[i];

        for (i = 0; i < salph; i++)
          freq_[i] /= x;
      }
      else
      {
        ApplicationTools::displayMessage("Unable to find eigenvector for eigenvalue 1. Taylor series used instead.");
        isDiagonalizable_ = false;
      }
    }
    // if rightEigenVectors_ is singular
    catch (ZeroDivisionException& e)
    {
      ApplicationTools::displayMessage("Singularity during diagonalization. Taylor series used instead.");
      isNonSingular_ = false;
      isDiagonalizable_ = false;
    }

    if (!isNonSingular_)
    {
      // Scale generator_ by -1 / (its smallest diagonal entry).
      double min = generator_(0, 0);
      for (i = 1; i < salph; i++)
      {
        if (min > generator_(i, i))
          min = generator_(i, i);
      }

      MatrixTools::scale(generator_, -1 / min);

      if (vPowGen_.size() == 0)
        vPowGen_.resize(30);

      MatrixTools::getId(salph, tmpMat_);    // to compute the equilibrium frequency  (Q+Id)^256
      MatrixTools::add(tmpMat_, generator_);
      MatrixTools::pow(tmpMat_, 256, vPowGen_[0]);

      // freq_ is read from the first row of (Q+Id)^256.
      for (i = 0; i < salph; i++)
      {
        freq_[i] = vPowGen_[0](0, i);
      }

      // vPowGen_[0] was used as scratch space; reset it to the identity.
      MatrixTools::getId(salph, vPowGen_[0]);
    }

    // normalization
    // x = sum_i freq_[i] * generator_(i, i); generator_ and the eigenvalues
    // are divided by -x.
    x = 0;
    for (i = 0; i < salph; i++)
      x += freq_[i] * generator_(i, i);

    MatrixTools::scale(generator_, -1. / x);
    for (i = 0; i < salph; i++)
    {
      eigenValues_[i] /= -x;
      iEigenValues_[i] /= -x;
    }

    if (!isNonSingular_)
      MatrixTools::Taylor(generator_, 30, vPowGen_);
  }
  else  // compute freq_ is no eigenDecomposition
  {
    // freq_ of a word is the product, over positions, of the position
    // model's frequency for the letter at that position (same prefix/suffix
    // indexing as for the generator above).
    for (j = 0; j < size_; j++)
      freq_[j] = 1;

    m = 1;
    for (k = nbmod; k > 0; k--)
    {
      SubstitutionModel* pSM = VSubMod_[k - 1];
      for (j = 0; j < vsize[k - 1]; j++)
      {
        n = 0;
        while (n < salph)
        { // loop on prefix
          for (l = 0; l < m; l++)
          { // loop on suffix
            freq_[n + j * m + l] *= pSM->freq(j);
          }
          n += m * vsize[k - 1];
        }
      }
      m *= vsize[k - 1];
    }
  }

  // compute the exchangeability_

  for (i = 0; i < size_; i++)
    for (j = 0; j < size_; j++)
      exchangeability_(i, j) = generator_(i, j) / freq_[j];
}
int main() { TreeTemplate<Node>* tree = TreeTemplateTools::parenthesisToTree("(((A:0.1, B:0.2):0.3,C:0.1):0.2,(D:0.3,(E:0.2,F:0.05):0.1):0.1);"); vector<string> seqNames= tree->getLeavesNames(); vector<int> ids = tree->getNodesId(); //------------- const NucleicAlphabet* alphabet = &AlphabetTools::DNA_ALPHABET; FrequenciesSet* rootFreqs = new GCFrequenciesSet(alphabet); SubstitutionModel* model = new T92(alphabet, 3.); std::vector<std::string> globalParameterNames; globalParameterNames.push_back("T92.kappa"); //Very difficult to optimize on small datasets: DiscreteDistribution* rdist = new GammaDiscreteRateDistribution(4, 1.0); ParametrizableTree* parTree = new ParametrizableTree(*tree); FrequenciesSet* rootFreqs2 = rootFreqs->clone(); DiscreteDistribution* rdist2 = rdist->clone(); SubstitutionModel* model2=model->clone(); map<string, string> alias; SubstitutionModelSet* modelSet = SubstitutionModelSetTools::createNonHomogeneousModelSet(model, rootFreqs, tree, alias, globalParameterNames); unique_ptr<SubstitutionModelSet> modelSetSim(modelSet->clone()); NonHomogeneousSubstitutionProcess* subPro= NonHomogeneousSubstitutionProcess::createNonHomogeneousSubstitutionProcess(model2, rdist2, rootFreqs2, parTree, globalParameterNames); // Simulation size_t nsites = 1000; unsigned int nrep = 20; size_t nmodels = modelSet->getNumberOfModels(); vector<double> thetas(nmodels); vector<double> thetasEst1(nmodels); vector<double> thetasEst2(nmodels); vector<double> thetasEst1n(nmodels); vector<double> thetasEst2n(nmodels); for (size_t i = 0; i < nmodels; ++i) { double theta = RandomTools::giveRandomNumberBetweenZeroAndEntry(0.99) + 0.005; cout << "Theta" << i << " set to " << theta << endl; modelSetSim->setParameterValue("T92.theta_" + TextTools::toString(i + 1), theta); //subPro->setParameterValue("T92.theta_" + TextTools::toString(i + 1), theta); thetas[i] = theta; } NonHomogeneousSequenceSimulator simulator(modelSetSim.get(), rdist, tree); NonHomogeneousSubstitutionProcess* 
subPro2 = subPro->clone(); for (unsigned int j = 0; j < nrep; j++) { OutputStream* profiler = new StlOutputStream(new ofstream("profile.txt", ios::out)); OutputStream* messenger = new StlOutputStream(new ofstream("messages.txt", ios::out)); //Simulate data: unique_ptr<SiteContainer> sites(simulator.simulate(nsites)); //Now fit model: unique_ptr<SubstitutionModelSet> modelSet2(modelSet->clone()); RNonHomogeneousTreeLikelihood tl(*tree, *sites.get(), modelSet, rdist, true, true, false); tl.initialize(); RNonHomogeneousTreeLikelihood tl2(*tree, *sites.get(), modelSet2.get(), rdist, true, true, true); tl2.initialize(); SubstitutionProcess* nsubPro=subPro->clone(); SubstitutionProcess* nsubPro2=subPro2->clone(); RecursiveLikelihoodTreeCalculation* tlComp = new RecursiveLikelihoodTreeCalculation(*sites->clone(), nsubPro, true, false); SingleProcessPhyloLikelihood ntl(nsubPro, tlComp, true); RecursiveLikelihoodTreeCalculation* tlComp2 = new RecursiveLikelihoodTreeCalculation(*sites->clone(), nsubPro2, true); SingleProcessPhyloLikelihood ntl2(nsubPro2, tlComp2, true); for (size_t i = 0; i < nmodels; ++i) { ntl.setParameterValue("T92.theta_" + TextTools::toString(i + 1), thetas[i]); ntl2.setParameterValue("T92.theta_" + TextTools::toString(i + 1), thetas[i]); } cout << setprecision(10) << "OldTL init: " << tl.getValue() << "\t" << tl2.getValue() << endl; cout << setprecision(10) << "NewTL init: " << ntl.getValue() << "\t" << ntl2.getValue() << endl; unsigned int c1 = OptimizationTools::optimizeNumericalParameters2( &tl, tl.getParameters(), 0, 0.0001, 10000, messenger, profiler, false, false, 1, OptimizationTools::OPTIMIZATION_NEWTON); unsigned int c2 = OptimizationTools::optimizeNumericalParameters2( &tl2, tl2.getParameters(), 0, 0.0001, 10000, messenger, profiler, false, false, 1, OptimizationTools::OPTIMIZATION_NEWTON); unsigned int nc1 = OptimizationTools::optimizeNumericalParameters2( &ntl, ntl.getParameters(), 0, 0.0001, 10000, messenger, profiler, false, false, 1, 
OptimizationTools::OPTIMIZATION_NEWTON); unsigned int nc2 = OptimizationTools::optimizeNumericalParameters2( &ntl2, ntl2.getParameters(), 0, 0.0001, 10000, messenger, profiler, false, false, 1, OptimizationTools::OPTIMIZATION_NEWTON); cout << "OldTL: " << c1 << ": " << tl.getValue() << "\t" << c2 << ": " << tl2.getValue() << endl; cout << "NewTL: " << nc1 << ": " << ntl.getValue() << "\t" << nc2 << ": " << ntl2.getValue() << endl; cout << "Thetas : " << endl; for (size_t i = 0; i < nmodels; ++i) { // cerr << modelSet->getModel(i)->getParameter("theta").getValue() << "\t" << modelSet2->getModel(i)->getParameter("theta").getValue(); // cerr << "\t" << subPro->getModel(i)->getParameter("theta").getValue() << "\t" << subPro2->getModel(i)->getParameter("theta").getValue() << endl; // if (abs(modelSet2->getModel(i)->getParameter("theta").getValue() - modelSet3->getModel(i)->getParameter("theta").getValue()) > 0.1) // return 1; thetasEst1[i] += modelSet->getModel(i)->getParameter("theta").getValue(); thetasEst2[i] += modelSet2->getModel(i)->getParameter("theta").getValue(); thetasEst1n[i] += dynamic_cast< NonHomogeneousSubstitutionProcess*>(nsubPro)->getModel(i)->getParameter("theta").getValue(); thetasEst2n[i] += dynamic_cast< NonHomogeneousSubstitutionProcess*>(nsubPro2)->getModel(i)->getParameter("theta").getValue(); } } thetasEst1 /= static_cast<double>(nrep); thetasEst2 /= static_cast<double>(nrep); thetasEst1n /= static_cast<double>(nrep); thetasEst2n /= static_cast<double>(nrep); //Now compare estimated values to real ones: cout << "Real" << "\t" << "Est_Old1" << "\t" << "Est_Old2" << "\t"; cout << "Est_New1" << "\t" << "Est_New2" << endl; for (size_t i = 0; i < thetas.size(); ++i) { cout << thetas[i] << "\t" << thetasEst1[i] << "\t" << thetasEst2[i] << "\t"; cout << thetasEst1n[i] << "\t" << thetasEst2n[i] << endl; double diff1 = abs(thetas[i] - thetasEst1[i]); double diff2 = abs(thetas[i] - thetasEst2[i]); double diffn1 = abs(thetas[i] - thetasEst1n[i]); double 
diffn2 = abs(thetas[i] - thetasEst2n[i]); if (diff1 > 0.2 || diff2 > 0.2 || diffn1 > 0.2 || diffn2 > 0.2) return 1; } return 0; }
int main(int args, char ** argv) { cout << "******************************************************************" << endl; cout << "* Bio++ Distance Methods, version 2.2.0 *" << endl; cout << "* Author: J. Dutheil Created 05/05/07 *" << endl; cout << "* Last Modif. 04/02/15 *" << endl; cout << "******************************************************************" << endl; cout << endl; if(args == 1) { help(); return 0; } try { BppApplication bppdist(args, argv, "BppDist"); bppdist.startTimer(); Alphabet* alphabet = SequenceApplicationTools::getAlphabet(bppdist.getParams(), "", false); auto_ptr<GeneticCode> gCode; CodonAlphabet* codonAlphabet = dynamic_cast<CodonAlphabet*>(alphabet); if (codonAlphabet) { string codeDesc = ApplicationTools::getStringParameter("genetic_code", bppdist.getParams(), "Standard", "", true, true); ApplicationTools::displayResult("Genetic Code", codeDesc); gCode.reset(SequenceApplicationTools::getGeneticCode(codonAlphabet->getNucleicAlphabet(), codeDesc)); } VectorSiteContainer* allSites = SequenceApplicationTools::getSiteContainer(alphabet, bppdist.getParams()); VectorSiteContainer* sites = SequenceApplicationTools::getSitesToAnalyse(* allSites, bppdist.getParams()); delete allSites; ApplicationTools::displayResult("Number of sequences", TextTools::toString(sites->getNumberOfSequences())); ApplicationTools::displayResult("Number of sites", TextTools::toString(sites->getNumberOfSites())); SubstitutionModel* model = PhylogeneticsApplicationTools::getSubstitutionModel(alphabet, gCode.get(), sites, bppdist.getParams()); DiscreteDistribution* rDist = 0; if (model->getNumberOfStates() > model->getAlphabet()->getSize()) { //Markov-modulated Markov model! 
rDist = new ConstantRateDistribution(); } else { rDist = PhylogeneticsApplicationTools::getRateDistribution(bppdist.getParams()); } DistanceEstimation distEstimation(model, rDist, sites, 1, false); string method = ApplicationTools::getStringParameter("method", bppdist.getParams(), "nj"); ApplicationTools::displayResult("Tree reconstruction method", method); TreeTemplate<Node>* tree; AgglomerativeDistanceMethod* distMethod = 0; if(method == "wpgma") { PGMA* wpgma = new PGMA(true); distMethod = wpgma; } else if(method == "upgma") { PGMA* upgma = new PGMA(false); distMethod = upgma; } else if(method == "nj") { NeighborJoining* nj = new NeighborJoining(); nj->outputPositiveLengths(true); distMethod = nj; } else if(method == "bionj") { BioNJ* bionj = new BioNJ(); bionj->outputPositiveLengths(true); distMethod = bionj; } else throw Exception("Unknown tree reconstruction method."); string type = ApplicationTools::getStringParameter("optimization.method", bppdist.getParams(), "init"); ApplicationTools::displayResult("Model parameters estimation method", type); if (type == "init") type = OptimizationTools::DISTANCEMETHOD_INIT; else if (type == "pairwise") type = OptimizationTools::DISTANCEMETHOD_PAIRWISE; else if (type == "iterations") type = OptimizationTools::DISTANCEMETHOD_ITERATIONS; else throw Exception("Unknown parameter estimation procedure '" + type + "'."); unsigned int optVerbose = ApplicationTools::getParameter<unsigned int>("optimization.verbose", bppdist.getParams(), 2); string mhPath = ApplicationTools::getAFilePath("optimization.message_handler", bppdist.getParams(), false, false); OutputStream* messenger = (mhPath == "none") ? 0 : (mhPath == "std") ? ApplicationTools::message : new StlOutputStream(new ofstream(mhPath.c_str(), ios::out)); ApplicationTools::displayResult("Message handler", mhPath); string prPath = ApplicationTools::getAFilePath("optimization.profiler", bppdist.getParams(), false, false); OutputStream* profiler = (prPath == "none") ? 
0 : (prPath == "std") ? ApplicationTools::message : new StlOutputStream(new ofstream(prPath.c_str(), ios::out)); if(profiler) profiler->setPrecision(20); ApplicationTools::displayResult("Profiler", prPath); // Should I ignore some parameters? ParameterList allParameters = model->getParameters(); allParameters.addParameters(rDist->getParameters()); ParameterList parametersToIgnore; string paramListDesc = ApplicationTools::getStringParameter("optimization.ignore_parameter", bppdist.getParams(), "", "", true, false); bool ignoreBrLen = false; StringTokenizer st(paramListDesc, ","); while (st.hasMoreToken()) { try { string param = st.nextToken(); if (param == "BrLen") ignoreBrLen = true; else { if (allParameters.hasParameter(param)) { Parameter* p = &allParameters.getParameter(param); parametersToIgnore.addParameter(*p); } else ApplicationTools::displayWarning("Parameter '" + param + "' not found."); } } catch (ParameterNotFoundException& pnfe) { ApplicationTools::displayError("Parameter '" + pnfe.getParameter() + "' not found, and so can't be ignored!"); } } unsigned int nbEvalMax = ApplicationTools::getParameter<unsigned int>("optimization.max_number_f_eval", bppdist.getParams(), 1000000); ApplicationTools::displayResult("Max # ML evaluations", TextTools::toString(nbEvalMax)); double tolerance = ApplicationTools::getDoubleParameter("optimization.tolerance", bppdist.getParams(), .000001); ApplicationTools::displayResult("Tolerance", TextTools::toString(tolerance)); //Here it is: ofstream warn("warnings", ios::out); ApplicationTools::warning = new StlOutputStreamWrapper(&warn); tree = OptimizationTools::buildDistanceTree(distEstimation, *distMethod, parametersToIgnore, !ignoreBrLen, type, tolerance, nbEvalMax, profiler, messenger, optVerbose); warn.close(); delete ApplicationTools::warning; ApplicationTools::warning = ApplicationTools::message; string matrixPath = ApplicationTools::getAFilePath("output.matrix.file", bppdist.getParams(), false, false, "", false); if 
(matrixPath != "none") { ApplicationTools::displayResult("Output matrix file", matrixPath); string matrixFormat = ApplicationTools::getAFilePath("output.matrix.format", bppdist.getParams(), false, false, "", false); string format = ""; bool extended = false; std::map<std::string, std::string> unparsedArguments_; KeyvalTools::parseProcedure(matrixFormat, format, unparsedArguments_); if (unparsedArguments_.find("type") != unparsedArguments_.end()) { if (unparsedArguments_["type"] == "extended") { extended = true; } else if (unparsedArguments_["type"] == "classic") extended = false; else ApplicationTools::displayWarning("Argument '" + unparsedArguments_["type"] + "' for parameter 'Phylip#type' is unknown. " + "Default used instead: not extended."); } else ApplicationTools::displayWarning("Argument 'Phylip#type' not found. Default used instead: not extended."); ODistanceMatrix* odm = IODistanceMatrixFactory().createWriter(IODistanceMatrixFactory::PHYLIP_FORMAT, extended); odm->write(*distEstimation.getMatrix(), matrixPath, true); delete odm; } PhylogeneticsApplicationTools::writeTree(*tree, bppdist.getParams()); //Output some parameters: if (type == OptimizationTools::DISTANCEMETHOD_ITERATIONS) { // Write parameters to screen: ParameterList parameters = model->getParameters(); for (unsigned int i = 0; i < parameters.size(); i++) { ApplicationTools::displayResult(parameters[i].getName(), TextTools::toString(parameters[i].getValue())); } parameters = rDist->getParameters(); for (unsigned int i = 0; i < parameters.size(); i++) { ApplicationTools::displayResult(parameters[i].getName(), TextTools::toString(parameters[i].getValue())); } // Write parameters to file: string parametersFile = ApplicationTools::getAFilePath("output.estimates", bppdist.getParams(), false, false); if (parametersFile != "none") { ofstream out(parametersFile.c_str(), ios::out); parameters = model->getParameters(); for (unsigned int i = 0; i < parameters.size(); i++) { out << parameters[i].getName() 
<< " = " << parameters[i].getValue() << endl; } parameters = rDist->getParameters(); for (unsigned int i = 0; i < parameters.size(); i++) { out << parameters[i].getName() << " = " << parameters[i].getValue() << endl; } out.close(); } } //Bootstrap: unsigned int nbBS = ApplicationTools::getParameter<unsigned int>("bootstrap.number", bppdist.getParams(), 0); if(nbBS > 0) { ApplicationTools::displayResult("Number of bootstrap samples", TextTools::toString(nbBS)); bool approx = ApplicationTools::getBooleanParameter("bootstrap.approximate", bppdist.getParams(), true); ApplicationTools::displayResult("Use approximate bootstrap", TextTools::toString(approx ? "yes" : "no")); if(approx) { type = OptimizationTools::DISTANCEMETHOD_INIT; parametersToIgnore = allParameters; ignoreBrLen = true; } bool bootstrapVerbose = ApplicationTools::getBooleanParameter("bootstrap.verbose", bppdist.getParams(), false, "", true, false); string bsTreesPath = ApplicationTools::getAFilePath("bootstrap.output.file", bppdist.getParams(), false, false); ofstream *out = NULL; if(bsTreesPath != "none") { ApplicationTools::displayResult("Bootstrap trees stored in file", bsTreesPath); out = new ofstream(bsTreesPath.c_str(), ios::out); } Newick newick; vector<Tree *> bsTrees(nbBS); ApplicationTools::displayTask("Bootstrapping", true); for(unsigned int i = 0; i < nbBS; i++) { ApplicationTools::displayGauge(i, nbBS-1, '='); VectorSiteContainer * sample = SiteContainerTools::bootstrapSites(*sites); if(approx) model->setFreqFromData(*sample); distEstimation.setData(sample); bsTrees[i] = OptimizationTools::buildDistanceTree( distEstimation, *distMethod, parametersToIgnore, ignoreBrLen, type, tolerance, nbEvalMax, NULL, NULL, (bootstrapVerbose ? 
1 : 0) ); if(out && i == 0) newick.write(*bsTrees[i], bsTreesPath, true); if(out && i > 0) newick.write(*bsTrees[i], bsTreesPath, false); delete sample; } if(out) out->close(); if(out) delete out; ApplicationTools::displayTaskDone(); ApplicationTools::displayTask("Compute bootstrap values"); TreeTools::computeBootstrapValues(*tree, bsTrees); ApplicationTools::displayTaskDone(); for(unsigned int i = 0; i < nbBS; i++) delete bsTrees[i]; //Write resulting tree: PhylogeneticsApplicationTools::writeTree(*tree, bppdist.getParams()); } delete alphabet; delete sites; delete distMethod; delete tree; bppdist.done();} catch(exception & e) { cout << e.what() << endl; return 1; } return 0; }