int main(int args, char** argv) { cout << "******************************************************************" << endl; cout << "* Bio++ Computation of site likelihoods inside mixed models *" << endl; cout << "* Version 2.2.0. *" << endl; cout << "* Author: L. Guéguen Last Modif.: 25/09/14 *" << endl; cout << "******************************************************************" << endl; cout << endl; if (args == 1) { help(); return 0; } try { BppApplication bppmixedlikelihoods(args, argv, "BppMixedLikelihoods"); bppmixedlikelihoods.startTimer(); Alphabet* alphabet = SequenceApplicationTools::getAlphabet(bppmixedlikelihoods.getParams(), "", false); auto_ptr<GeneticCode> gCode; CodonAlphabet* codonAlphabet = dynamic_cast<CodonAlphabet*>(alphabet); if (codonAlphabet) { string codeDesc = ApplicationTools::getStringParameter("genetic_code", bppmixedlikelihoods.getParams(), "Standard", "", true, true); ApplicationTools::displayResult("Genetic Code", codeDesc); gCode.reset(SequenceApplicationTools::getGeneticCode(codonAlphabet->getNucleicAlphabet(), codeDesc)); } // get the data VectorSiteContainer* allSites = SequenceApplicationTools::getSiteContainer(alphabet, bppmixedlikelihoods.getParams()); VectorSiteContainer* sites = SequenceApplicationTools::getSitesToAnalyse(*allSites, bppmixedlikelihoods.getParams(), "", true, false); delete allSites; ApplicationTools::displayResult("Number of sequences", TextTools::toString(sites->getNumberOfSequences())); ApplicationTools::displayResult("Number of sites", TextTools::toString(sites->getNumberOfSites())); // Get the tree Tree* tree = PhylogeneticsApplicationTools::getTree(bppmixedlikelihoods.getParams()); ApplicationTools::displayResult("Number of leaves", TextTools::toString(tree->getNumberOfLeaves())); AbstractDiscreteRatesAcrossSitesTreeLikelihood* tl; string nhOpt = ApplicationTools::getStringParameter("nonhomogeneous", bppmixedlikelihoods.getParams(), "no", "", true, false); ApplicationTools::displayResult("Heterogeneous model", nhOpt); MixedSubstitutionModel* model = 0; MixedSubstitutionModelSet* modelSet = 0; DiscreteDistribution* rDist = 0; if (nhOpt == "no") { model = dynamic_cast<MixedSubstitutionModel*>(PhylogeneticsApplicationTools::getSubstitutionModel(alphabet, gCode.get(), sites, bppmixedlikelihoods.getParams())); if (model == 0) { cout << "Model is not a Mixed model" << endl; exit(0); } SiteContainerTools::changeGapsToUnknownCharacters(*sites); if (model->getNumberOfStates() > model->getAlphabet()->getSize()) { // Markov-modulated Markov model! rDist = new ConstantRateDistribution(); } else { rDist = PhylogeneticsApplicationTools::getRateDistribution(bppmixedlikelihoods.getParams()); } tl = new RHomogeneousMixedTreeLikelihood(*tree, *sites, model, rDist, true); } else if (nhOpt == "one_per_branch") { model = dynamic_cast<MixedSubstitutionModel*>(PhylogeneticsApplicationTools::getSubstitutionModel(alphabet, gCode.get(), sites, bppmixedlikelihoods.getParams())); if (model == 0) { cout << "Model is not a Mixed model" << endl; exit(0); } SiteContainerTools::changeGapsToUnknownCharacters(*sites); if (model->getNumberOfStates() > model->getAlphabet()->getSize()) { // Markov-modulated Markov model! rDist = new ConstantRateDistribution(); } else { rDist = PhylogeneticsApplicationTools::getRateDistribution(bppmixedlikelihoods.getParams()); } vector<double> rateFreqs; if (model->getNumberOfStates() != alphabet->getSize()) { // Markov-Modulated Markov Model... unsigned int n = (unsigned int)(model->getNumberOfStates() / alphabet->getSize()); rateFreqs = vector<double>(n, 1. / (double)n); // Equal rates assumed for now, may be changed later (actually, in the most general case, // we should assume a rate distribution for the root also!!! } std::map<std::string, std::string> aliasFreqNames; FrequenciesSet* rootFreqs = PhylogeneticsApplicationTools::getRootFrequenciesSet(alphabet, gCode.get(), sites, bppmixedlikelihoods.getParams(), aliasFreqNames, rateFreqs); vector<string> globalParameters = ApplicationTools::getVectorParameter<string>("nonhomogeneous_one_per_branch.shared_parameters", bppmixedlikelihoods.getParams(), ',', ""); modelSet = dynamic_cast<MixedSubstitutionModelSet*>(SubstitutionModelSetTools::createNonHomogeneousModelSet(model, rootFreqs, tree, aliasFreqNames, globalParameters)); model = 0; tl = new RNonHomogeneousMixedTreeLikelihood(*tree, *sites, modelSet, rDist, true); } else if (nhOpt == "general") { modelSet = dynamic_cast<MixedSubstitutionModelSet*>(PhylogeneticsApplicationTools::getSubstitutionModelSet(alphabet, gCode.get(), sites, bppmixedlikelihoods.getParams())); if (modelSet == 0) { cout << "Missing a Mixed model" << endl; exit(0); } SiteContainerTools::changeGapsToUnknownCharacters(*sites); if (modelSet->getNumberOfStates() > modelSet->getAlphabet()->getSize()) { // Markov-modulated Markov model! rDist = new ConstantDistribution(1.); } else { rDist = PhylogeneticsApplicationTools::getRateDistribution(bppmixedlikelihoods.getParams()); } tl = new RNonHomogeneousMixedTreeLikelihood(*tree, *sites, modelSet, rDist, true); } else throw Exception("Unknown option for nonhomogeneous: " + nhOpt); tl->initialize(); double logL = tl->getValue(); if (isinf(logL)) { // This may be due to null branch lengths, leading to null likelihood! ApplicationTools::displayWarning("!!! Warning!!! Likelihood is zero."); ApplicationTools::displayWarning("!!! This may be due to branch length == 0."); ApplicationTools::displayWarning("!!! All null branch lengths will be set to 0.000001."); ParameterList pl = tl->getBranchLengthsParameters(); for (unsigned int i = 0; i < pl.size(); i++) { if (pl[i].getValue() < 0.000001) pl[i].setValue(0.000001); } tl->matchParametersValues(pl); logL = tl->getValue(); } if (isinf(logL)) { ApplicationTools::displayError("!!! Unexpected likelihood == 0."); ApplicationTools::displayError("!!! Looking at each site:"); for (unsigned int i = 0; i < sites->getNumberOfSites(); i++) { (*ApplicationTools::error << "Site " << sites->getSite(i).getPosition() << "\tlog likelihood = " << tl->getLogLikelihoodForASite(i)).endLine(); } ApplicationTools::displayError("!!! 0 values (inf in log) may be due to computer overflow, particularily if datasets are big (>~500 sequences)."); exit(-1); } // Write parameters to screen: ApplicationTools::displayResult("Log likelihood", TextTools::toString(tl->getValue(), 15)); ParameterList parameters = tl->getSubstitutionModelParameters(); for (unsigned int i = 0; i < parameters.size(); i++) { ApplicationTools::displayResult(parameters[i].getName(), TextTools::toString(parameters[i].getValue())); } parameters = tl->getRateDistributionParameters(); for (unsigned int i = 0; i < parameters.size(); i++) { ApplicationTools::displayResult(parameters[i].getName(), TextTools::toString(parameters[i].getValue())); } // ///////////////////////////////////////////// // Getting likelihoods per submodel string outputFile; outputFile = ApplicationTools::getAFilePath("output.likelihoods.file", bppmixedlikelihoods.getParams(), true, false); ApplicationTools::displayResult("Output file for likelihoods", outputFile); ofstream out(outputFile.c_str(), ios::out); size_t nSites = sites->getNumberOfSites(); size_t nummodel = ApplicationTools::getParameter<size_t>("likelihoods.model_number", bppmixedlikelihoods.getParams(), 1, "", true, true); string parname = ApplicationTools::getStringParameter("likelihoods.parameter_name", bppmixedlikelihoods.getParams(), "", "", true, false); if (modelSet && ((nummodel <= 0) || (nummodel > modelSet->getNumberOfModels()))) { ApplicationTools::displayError("Bad number of model " + TextTools::toString(nummodel) + "."); exit(-1); } MixedSubstitutionModel* p0 = dynamic_cast<MixedSubstitutionModel*>(model ? model : modelSet->getModel(nummodel - 1)); if (!p0) { ApplicationTools::displayError("Model " + TextTools::toString(nummodel) + " is not a Mixed Model."); exit(-1); } const AbstractBiblioMixedSubstitutionModel* ptmp = dynamic_cast<const AbstractBiblioMixedSubstitutionModel*>(p0); if (ptmp) { p0 = ptmp->getMixedModel().clone(); if (nhOpt == "no") model = p0; else { modelSet->replaceModel(nummodel-1, p0); modelSet->isFullySetUpFor(*tree); } } ////////////////////////////////////////////////// // Case of a MixtureOfSubstitutionModels MixtureOfSubstitutionModels* pMSM = dynamic_cast<MixtureOfSubstitutionModels*>(p0); if (pMSM) { vector<string> colNames; colNames.push_back("Sites"); size_t nummod = pMSM->getNumberOfModels(); for (unsigned int i = 0; i < nummod; i++) { colNames.push_back(pMSM->getNModel(i)->getName()); } DataTable* rates = new DataTable(nSites, colNames.size()); rates->setColumnNames(colNames); for (unsigned int i = 0; i < nSites; i++) { const Site* currentSite = &sites->getSite(i); int currentSitePosition = currentSite->getPosition(); (*rates)(i, "Sites") = string("[" + TextTools::toString(currentSitePosition) + "]"); } Vdouble vprob = pMSM->getProbabilities(); for (unsigned int i = 0; i < nummod; i++) { string modname = pMSM->getNModel(i)->getName(); for (unsigned int j = 0; j < nummod; j++) { pMSM->setNProbability(j, (j == i) ? 1 : 0); } if (tl) delete tl; if (nhOpt == "no") tl = new RHomogeneousMixedTreeLikelihood(*tree, *sites, model, rDist, true, false, true); else tl = new RNonHomogeneousMixedTreeLikelihood(*tree, *sites, modelSet, rDist, false, true); tl->initialize(); logL = tl->getValue(); Vdouble Vd = tl->getLogLikelihoodForEachSite(); for (unsigned int j = 0; j < nSites; j++) { (*rates)(j, modname) = TextTools::toString(Vd[j]); } ApplicationTools::displayMessage("\n"); ApplicationTools::displayMessage("Model " + modname + ":"); ApplicationTools::displayResult("Log likelihood", TextTools::toString(tl->getValue(), 15)); ApplicationTools::displayResult("Probability", TextTools::toString(vprob[i], 15)); } DataTable::write(*rates, out, "\t"); } ////////////////////////////////////////////////// // Case of a MixtureOfASubstitutionModel else { MixtureOfASubstitutionModel* pMSM2 = dynamic_cast<MixtureOfASubstitutionModel*>(p0); if (pMSM2 != NULL) { size_t nummod = pMSM2->getNumberOfModels(); if (parname == "") { ParameterList pl=pMSM2->getParameters(); for (size_t i2 = 0; i2 < pl.size(); i2++) { string pl2n = pl[i2].getName(); if (dynamic_cast<const ConstantDistribution*>(pMSM2->getDistribution(pl2n))==NULL) { parname=pl2n; while (parname.size()>0 && pMSM2->getDistribution(parname)==NULL) parname=pl2n.substr(0,pl2n.rfind("_")); if (parname.size()>0){ ApplicationTools::displayResult("likelihoods.parameter_name", parname); break; } } } } if (parname == "") { ApplicationTools::displayError("Argument likelihoods.parameter_name is required."); exit(-1); } vector< Vint > vvnmod; size_t i2 = 0; while (i2 < nummod) { string par2 = parname + "_" + TextTools::toString(i2 + 1); Vint vnmod = pMSM2->getSubmodelNumbers(par2); if (vnmod.size() == 0) break; vvnmod.push_back(vnmod); i2++; } size_t nbcl = vvnmod.size(); if (nbcl==0) throw Exception("Parameter " + parname + " is not mixed."); Vdouble vprob = pMSM2->getProbabilities(); vector<vector<double> > vvprob; vector<double> vsprob; for (size_t i = 0; i < nbcl; i++) { vector<double> vprob2; for (size_t j = 0; j < vvnmod[i].size(); j++) { vprob2.push_back(vprob[static_cast<size_t>(vvnmod[i][j])]); } vvprob.push_back(vprob2); vsprob.push_back(VectorTools::sum(vvprob[i])); } vector<string> colNames; colNames.push_back("Sites"); Vdouble dval; for (unsigned int i = 0; i < nbcl; i++) { SubstitutionModel* pSM = pMSM2->getNModel(static_cast<size_t>(vvnmod[i][0])); double valPar = pSM->getParameterValue(pSM->getParameterNameWithoutNamespace(parname)); dval.push_back(valPar); colNames.push_back("Ll_" + parname + "=" + TextTools::toString(valPar)); } for (unsigned int i = 0; i < nbcl; i++) colNames.push_back("Pr_" + parname + "=" + TextTools::toString(dval[i])); colNames.push_back("mean"); DataTable* rates = new DataTable(nSites, colNames.size()); rates->setColumnNames(colNames); for (unsigned int i = 0; i < nSites; i++) { const Site* currentSite = &sites->getSite(i); int currentSitePosition = currentSite->getPosition(); (*rates)(i,"Sites")=TextTools::toString(currentSitePosition); } VVdouble vvd; vector<double> vRates = pMSM2->getVRates(); for (size_t i = 0; i < nbcl; ++i) { string par2 = parname + "_" + TextTools::toString(i + 1); for (unsigned int j = 0; j < nummod; ++j) pMSM2->setNProbability(j, 0); for (size_t j = 0; j < vvprob[i].size(); ++j) pMSM2->setNProbability(static_cast<size_t>(vvnmod[i][j]), vvprob[i][j] / vsprob[i]); if (tl) delete tl; if (nhOpt == "no") tl = new RHomogeneousMixedTreeLikelihood(*tree, *sites, model, rDist, true, false, true); else tl = new RNonHomogeneousMixedTreeLikelihood(*tree, *sites, modelSet, rDist, false, true); tl->initialize(); logL = tl->getValue(); Vdouble vd = tl->getLogLikelihoodForEachSite(); for (unsigned int j = 0; j < nSites; j++) (*rates)(j, i + 1) = TextTools::toString(vd[j]); vvd.push_back(vd); ApplicationTools::displayMessage("\n"); ApplicationTools::displayMessage("Parameter " + par2 + "=" + TextTools::toString(dval[i]) + " with rate=" + TextTools::toString(vRates[i])); ApplicationTools::displayResult("Log likelihood", TextTools::toString(tl->getValue(), 15)); ApplicationTools::displayResult("Probability", TextTools::toString(vsprob[i], 15)); } for (unsigned int j = 0; j < nSites; j++) { Vdouble vd; for (unsigned int i = 0; i < nbcl; i++) vd.push_back(std::log(vsprob[i])+vvd[i][j]); VectorTools::logNorm(vd); for (unsigned int i = 0; i < nbcl; i++) (*rates)(j,nbcl + i + 1) = TextTools::toString(std::exp(vd[i])); (*rates)(j, 2 * nbcl + 1) = TextTools::toString(VectorTools::sumExp(vd, dval)); } DataTable::write(*rates, out, "\t"); } } delete alphabet; delete sites; if (model) delete model; if (modelSet) delete modelSet; delete rDist; delete tl; delete tree; ApplicationTools::displayMessage("\n"); bppmixedlikelihoods.done(); } catch (exception& e) { cout << e.what() << endl; return 1; } return 0; }
int main(int args, char** argv) { cout << "******************************************************************" << endl; cout << "* Bio++ Sequence Manipulator, version 2.3.0. *" << endl; cout << "* Author: J. Dutheil Last Modif. 25/11/14 *" << endl; cout << "******************************************************************" << endl; cout << endl; if (args == 1) { help(); return 0; } try { BppApplication bppseqman(args, argv, "BppSeqMan"); bppseqman.startTimer(); // Get alphabet Alphabet* alphabet = SequenceApplicationTools::getAlphabet(bppseqman.getParams(), "", false, true, true); unique_ptr<GeneticCode> gCode; CodonAlphabet* codonAlphabet = dynamic_cast<CodonAlphabet*>(alphabet); // Get sequences: bool aligned = ApplicationTools::getBooleanParameter("input.alignment", bppseqman.getParams(), false, "", true, 1); OrderedSequenceContainer* sequences = 0; if (aligned) { VectorSiteContainer* allSites = SequenceApplicationTools::getSiteContainer(alphabet, bppseqman.getParams()); sequences = SequenceApplicationTools::getSitesToAnalyse(*allSites, bppseqman.getParams(), "", true, false); delete allSites; } else { SequenceContainer* tmp = SequenceApplicationTools::getSequenceContainer(alphabet, bppseqman.getParams(), "", true, true); sequences = new VectorSequenceContainer(*tmp); delete tmp; } ApplicationTools::displayResult("Number of sequences", sequences->getNumberOfSequences()); // Perform manipulations vector<string> actions = ApplicationTools::getVectorParameter<string>("sequence.manip", bppseqman.getParams(), ',', "", "", false, 1); for (size_t a = 0; a < actions.size(); a++) { string cmdName; map<string, string> cmdArgs; KeyvalTools::parseProcedure(actions[a], cmdName, cmdArgs); ApplicationTools::displayResult("Performing action", cmdName); // +-----------------+ // | Complementation | // +-----------------+ if (cmdName == "Complement") { OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet()); else sc = new VectorSequenceContainer(sequences->getAlphabet()); for (size_t i = 0; i < sequences->getNumberOfSequences(); i++) { Sequence* seq = SequenceTools::getComplement(sequences->getSequence(i)); sc->addSequence(*seq, false); delete seq; } delete sequences; sequences = sc; } // +------------------------+ // | (Reverse)Transcription | // +------------------------+ else if (cmdName == "Transcript") { if (sequences->getAlphabet()->getAlphabetType() == AlphabetTools::DNA_ALPHABET.getAlphabetType()) { OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(&AlphabetTools::RNA_ALPHABET); else sc = new VectorSequenceContainer(&AlphabetTools::RNA_ALPHABET); for (unsigned int i = 0; i < sequences->getNumberOfSequences(); i++) { Sequence* seq = SequenceTools::transcript(sequences->getSequence(i)); sc->addSequence(*seq, false); delete seq; } delete sequences; sequences = sc; } else if (sequences->getAlphabet()->getAlphabetType() == AlphabetTools::RNA_ALPHABET.getAlphabetType()) { OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(&AlphabetTools::DNA_ALPHABET); else sc = new VectorSequenceContainer(&AlphabetTools::DNA_ALPHABET); for (unsigned int i = 0; i < sequences->getNumberOfSequences(); i++) { Sequence* seq = SequenceTools::reverseTranscript(sequences->getSequence(i)); sc->addSequence(*seq, false); delete seq; } delete sequences; sequences = sc; } else throw Exception("Transcription error: input alphabet must be of type 'nucleic'."); } // +-------------------------------+ // | Switching nucleotide alphabet | // +-------------------------------+ else if (cmdName == "Switch") { const Alphabet* alpha = 0; if (sequences->getAlphabet()->getAlphabetType() == AlphabetTools::DNA_ALPHABET.getAlphabetType()) { alpha = &AlphabetTools::RNA_ALPHABET; } else if (sequences->getAlphabet()->getAlphabetType() == AlphabetTools::RNA_ALPHABET.getAlphabetType()) { alpha = &AlphabetTools::DNA_ALPHABET; } else throw Exception("Cannot switch alphabet type, alphabet is not of type 'nucleic'."); OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(alpha); else sc = new VectorSequenceContainer(alpha); for (size_t i = 0; i < sequences->getNumberOfSequences(); i++) { const Sequence* old = &sequences->getSequence(i); vector<int> content(old->size()); for (size_t j = 0; j < old->size(); ++j) content[j] = (*old)[j]; Sequence* seq = new BasicSequence(old->getName(), content, old->getComments(), alpha); sc->addSequence(*seq, false); delete seq; } delete sequences; sequences = sc; } // +-------------+ // | Translation | // +-------------+ else if (cmdName == "Translate") { if (!AlphabetTools::isCodonAlphabet(sequences->getAlphabet())) throw Exception("Error in translation: alphabet is not of type 'codon'."); if (cmdArgs["code"] != "") throw Exception("ERROR: 'code' argument is deprecated. The genetic code to use for translation is now set by the top-level argument 'genetic_code'."); if (!gCode.get()) { string codeDesc = ApplicationTools::getStringParameter("genetic_code", bppseqman.getParams(), "Standard", "", true, 1); ApplicationTools::displayResult("Genetic Code", codeDesc); gCode.reset(SequenceApplicationTools::getGeneticCode(codonAlphabet->getNucleicAlphabet(), codeDesc)); } OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(&AlphabetTools::PROTEIN_ALPHABET); else sc = new VectorSequenceContainer(&AlphabetTools::PROTEIN_ALPHABET); for (size_t i = 0; i < sequences->getNumberOfSequences(); ++i) { Sequence* seq = gCode->translate(sequences->getSequence(i)); sc->addSequence(*seq, false); delete seq; } delete sequences; sequences = sc; } // +-------------+ // | Remove gaps | // +-------------+ else if (cmdName == "RemoveGaps") { VectorSequenceContainer* sc = new VectorSequenceContainer(sequences->getAlphabet()); for (size_t i = 0; i < sequences->getNumberOfSequences(); i++) { unique_ptr<Sequence> seq(sequences->getSequence(i).clone()); SequenceTools::removeGaps(*seq); sc->addSequence(*seq); } delete sequences; sequences = sc; aligned = false; } // +---------------------------+ // | Change gaps to unresolved | // +---------------------------+ else if (cmdName == "GapToUnknown") { OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet()); else sc = new VectorSequenceContainer(sequences->getAlphabet()); for (size_t i = 0; i < sequences->getNumberOfSequences(); i++) { Sequence* seq = new BasicSequence(sequences->getSequence(i)); SymbolListTools::changeGapsToUnknownCharacters(*seq); sc->addSequence(*seq, false); delete seq; } delete sequences; sequences = sc; } // +---------------------------+ // | Change unresolved to gaps | // +---------------------------+ else if (cmdName == "UnknownToGap") { OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet()); else sc = new VectorSequenceContainer(sequences->getAlphabet()); for (size_t i = 0; i < sequences->getNumberOfSequences(); i++) { Sequence* seq = new BasicSequence(sequences->getSequence(i)); SymbolListTools::changeUnresolvedCharactersToGaps(*seq); sc->addSequence(*seq, false); delete seq; } delete sequences; sequences = sc; } // +--------------+ // | Remove stops | // +--------------+ else if (cmdName == "RemoveStops") { if (!gCode.get()) { string codeDesc = ApplicationTools::getStringParameter("genetic_code", bppseqman.getParams(), "Standard", "", true, 1); ApplicationTools::displayResult("Genetic Code", codeDesc); gCode.reset(SequenceApplicationTools::getGeneticCode(codonAlphabet->getNucleicAlphabet(), codeDesc)); } SiteContainer* sites = dynamic_cast<SiteContainer*>(sequences); if (!sites) { VectorSequenceContainer* sc = new VectorSequenceContainer(sequences->getAlphabet()); for (size_t i = 0; i < sequences->getNumberOfSequences(); ++i) { unique_ptr<Sequence> seq(sequences->getSequence(i).clone()); SequenceTools::removeStops(*seq, *gCode); sc->addSequence(*seq); } delete sequences; sequences = sc; } else { VectorSiteContainer* sc = new VectorSiteContainer(sequences->getAlphabet()); for (size_t i = 0; i < sequences->getNumberOfSequences(); ++i) { unique_ptr<Sequence> seq(sequences->getSequence(i).clone()); SequenceTools::replaceStopsWithGaps(*seq, *gCode); sc->addSequence(*seq); } delete sequences; sequences = sc; } } // +--------------+ // | Remove stops | // +--------------+ else if (cmdName == "RemoveColumnsWithStops") { SiteContainer* sites = dynamic_cast<SiteContainer*>(sequences); if (!sites) { throw Exception("'RemoveColumnsWithStops' can only be used on alignment. You may consider using the 'CoerceToAlignment' command."); } if (!gCode.get()) { string codeDesc = ApplicationTools::getStringParameter("genetic_code", bppseqman.getParams(), "Standard", "", true, 1); ApplicationTools::displayResult("Genetic Code", codeDesc); gCode.reset(SequenceApplicationTools::getGeneticCode(codonAlphabet->getNucleicAlphabet(), codeDesc)); } for (size_t i = sites->getNumberOfSites(); i > 0; i--) { if (CodonSiteTools::hasStop(sites->getSite(i-1), *gCode)) sites->deleteSite(i - 1); } } // +---------+ // | Get CDS | // +---------+ else if (cmdName == "GetCDS") { if (!gCode.get()) { string codeDesc = ApplicationTools::getStringParameter("genetic_code", bppseqman.getParams(), "Standard", "", true, 1); ApplicationTools::displayResult("Genetic Code", codeDesc); gCode.reset(SequenceApplicationTools::getGeneticCode(codonAlphabet->getNucleicAlphabet(), codeDesc)); } OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet()); else sc = new VectorSequenceContainer(sequences->getAlphabet()); for (size_t i = 0; i < sequences->getNumberOfSequences(); ++i) { BasicSequence seq = sequences->getSequence(i); size_t len = seq.size(); SequenceTools::getCDS(seq, *gCode, false, true, true, false); if (aligned) { for (size_t c = seq.size(); c < len; ++c) seq.addElement(seq.getAlphabet()->getGapCharacterCode()); } sc->addSequence(seq, false); } delete sequences; sequences = sc; } // +--------------------------+ // | Resolve dotted alignment | // +--------------------------+ else if (actions[a] == "CoerceToAlignment") { SiteContainer* sites = dynamic_cast<SiteContainer*>(sequences); if(! sites) { sites = new VectorSiteContainer(*sequences); delete sequences; sequences = sites; } aligned = true; } else if (actions[a] == "ResolvedDotted") { SiteContainer* sites = dynamic_cast<SiteContainer *>(sequences); if (!sites) { throw Exception("'ResolvedDotted' can only be used on alignment. You may consider using the 'CoerceToAlignment' command."); } const Alphabet* alpha = 0; string alphastr = ApplicationTools::getStringParameter("alphabet", cmdArgs, "DNA", "", false, 1); if (alphastr == "DNA") alpha = &AlphabetTools::DNA_ALPHABET; else if (alphastr == "RNA") alpha = &AlphabetTools::RNA_ALPHABET; else if (alphastr == "Protein") alpha = &AlphabetTools::PROTEIN_ALPHABET; else throw Exception("Resolved alphabet must be one of [DNA|RNA|Protein] for solving dotted alignment."); OrderedSequenceContainer* resolvedCont = SiteContainerTools::resolveDottedAlignment(*sites, alpha); delete sequences; sequences = resolvedCont; } // +---------------------+ // | Keep complete sites | // +---------------------+ else if (cmdName == "KeepComplete") { SiteContainer* sites = dynamic_cast<SiteContainer *>(sequences); if (!sites) { throw Exception("'KeepComplete' can only be used on alignment. You may consider using the 'CoerceToAlignment' command."); } string maxGapOption = ApplicationTools::getStringParameter("maxGapAllowed", cmdArgs, "100%", "", false, 1); if (maxGapOption[maxGapOption.size()-1] == '%') { double gapFreq = TextTools::toDouble(maxGapOption.substr(0, maxGapOption.size()-1)) / 100.; for (size_t i = sites->getNumberOfSites(); i > 0; i--) { map<int, double> freqs; SiteTools::getFrequencies(sites->getSite(i - 1), freqs); if (freqs[-1] > gapFreq) sites->deleteSite(i - 1); } } else { size_t gapNum = TextTools::to<size_t>(maxGapOption); for (size_t i = sites->getNumberOfSites(); i > 0; i--) { map<int, size_t> counts; SiteTools::getCounts(sites->getSite(i - 1), counts); counts[-1]; //Needed in case this entry does not exist in the map. This will set it to 0. if (counts[-1] > gapNum) sites->deleteSite(i-1); } } } // +-----------------+ // | Invert sequence | // +-----------------+ else if (cmdName == "Invert") { OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet()); else sc = new VectorSequenceContainer(sequences->getAlphabet()); for (size_t i = 0; i < sequences->getNumberOfSequences(); i++) { const Sequence* old = &sequences->getSequence(i); Sequence* seq = SequenceTools::getInvert(*old); sc->addSequence(*seq, false); delete seq; } delete sequences; sequences = sc; } // +------------------+ // | GetCodonPosition | // +------------------+ else if (cmdName == "GetCodonPosition") { unsigned int pos = ApplicationTools::getParameter<unsigned int>("position", cmdArgs, 3, "", false, 1); OrderedSequenceContainer* sc = dynamic_cast<OrderedSequenceContainer*>(SequenceContainerTools::getCodonPosition(*sequences, pos - 1)); delete sequences; if (aligned) { sequences = new VectorSiteContainer(*sc); delete sc; } else { sequences = sc; } } // +-----------------+ // | FilterFromTree | // +-----------------+ else if (cmdName == "FilterFromTree") { unique_ptr<Tree> tree(PhylogeneticsApplicationTools::getTree(cmdArgs, "")); vector<string> names = tree->getLeavesNames(); OrderedSequenceContainer* reorderedSequences = 0; if (aligned) { reorderedSequences = new VectorSiteContainer(sequences->getAlphabet()); } else { reorderedSequences = new VectorSequenceContainer(sequences->getAlphabet()); } for (size_t i = 0; i < names.size(); ++i) { reorderedSequences->addSequence(sequences->getSequence(names[i]), false); } delete sequences; sequences = reorderedSequences; } // +----------------------+ // | RemoveEmptySequences | // +----------------------+ else if (cmdName == "RemoveEmptySequences") { OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet()); else sc = new VectorSequenceContainer(sequences->getAlphabet()); for (size_t i = 0; i < sequences->getNumberOfSequences(); ++i) { if (SequenceTools::getNumberOfSites(sequences->getSequence(i))!=0) sc->addSequence(sequences->getSequence(i), false); } delete sequences; sequences = sc; } else throw Exception("Unknown action: " + cmdName); } // Write sequences ApplicationTools::displayBooleanResult("Final sequences are aligned", aligned); if (aligned) { SequenceApplicationTools::writeAlignmentFile(*dynamic_cast<SiteContainer*>(sequences), bppseqman.getParams(), "", true, 1); } else { SequenceApplicationTools::writeSequenceFile(*sequences, bppseqman.getParams(), "", true, 1); } delete alphabet; delete sequences; bppseqman.done(); } catch(exception & e) { cout << e.what() << endl; return 1; } return 0; }
int main(int args, char ** argv) { cout << "******************************************************************" << endl; cout << "* Bio++ Distance Methods, version 2.2.0 *" << endl; cout << "* Author: J. Dutheil Created 05/05/07 *" << endl; cout << "* Last Modif. 04/02/15 *" << endl; cout << "******************************************************************" << endl; cout << endl; if(args == 1) { help(); return 0; } try { BppApplication bppdist(args, argv, "BppDist"); bppdist.startTimer(); Alphabet* alphabet = SequenceApplicationTools::getAlphabet(bppdist.getParams(), "", false); auto_ptr<GeneticCode> gCode; CodonAlphabet* codonAlphabet = dynamic_cast<CodonAlphabet*>(alphabet); if (codonAlphabet) { string codeDesc = ApplicationTools::getStringParameter("genetic_code", bppdist.getParams(), "Standard", "", true, true); ApplicationTools::displayResult("Genetic Code", codeDesc); gCode.reset(SequenceApplicationTools::getGeneticCode(codonAlphabet->getNucleicAlphabet(), codeDesc)); } VectorSiteContainer* allSites = SequenceApplicationTools::getSiteContainer(alphabet, bppdist.getParams()); VectorSiteContainer* sites = SequenceApplicationTools::getSitesToAnalyse(* allSites, bppdist.getParams()); delete allSites; ApplicationTools::displayResult("Number of sequences", TextTools::toString(sites->getNumberOfSequences())); ApplicationTools::displayResult("Number of sites", TextTools::toString(sites->getNumberOfSites())); SubstitutionModel* model = PhylogeneticsApplicationTools::getSubstitutionModel(alphabet, gCode.get(), sites, bppdist.getParams()); DiscreteDistribution* rDist = 0; if (model->getNumberOfStates() > model->getAlphabet()->getSize()) { //Markov-modulated Markov model! rDist = new ConstantRateDistribution(); } else { rDist = PhylogeneticsApplicationTools::getRateDistribution(bppdist.getParams()); } DistanceEstimation distEstimation(model, rDist, sites, 1, false); string method = ApplicationTools::getStringParameter("method", bppdist.getParams(), "nj"); ApplicationTools::displayResult("Tree reconstruction method", method); TreeTemplate<Node>* tree; AgglomerativeDistanceMethod* distMethod = 0; if(method == "wpgma") { PGMA* wpgma = new PGMA(true); distMethod = wpgma; } else if(method == "upgma") { PGMA* upgma = new PGMA(false); distMethod = upgma; } else if(method == "nj") { NeighborJoining* nj = new NeighborJoining(); nj->outputPositiveLengths(true); distMethod = nj; } else if(method == "bionj") { BioNJ* bionj = new BioNJ(); bionj->outputPositiveLengths(true); distMethod = bionj; } else throw Exception("Unknown tree reconstruction method."); string type = ApplicationTools::getStringParameter("optimization.method", bppdist.getParams(), "init"); ApplicationTools::displayResult("Model parameters estimation method", type); if (type == "init") type = OptimizationTools::DISTANCEMETHOD_INIT; else if (type == "pairwise") type = OptimizationTools::DISTANCEMETHOD_PAIRWISE; else if (type == "iterations") type = OptimizationTools::DISTANCEMETHOD_ITERATIONS; else throw Exception("Unknown parameter estimation procedure '" + type + "'."); unsigned int optVerbose = ApplicationTools::getParameter<unsigned int>("optimization.verbose", bppdist.getParams(), 2); string mhPath = ApplicationTools::getAFilePath("optimization.message_handler", bppdist.getParams(), false, false); OutputStream* messenger = (mhPath == "none") ? 0 : (mhPath == "std") ? ApplicationTools::message : new StlOutputStream(new ofstream(mhPath.c_str(), ios::out)); ApplicationTools::displayResult("Message handler", mhPath); string prPath = ApplicationTools::getAFilePath("optimization.profiler", bppdist.getParams(), false, false); OutputStream* profiler = (prPath == "none") ? 0 : (prPath == "std") ? ApplicationTools::message : new StlOutputStream(new ofstream(prPath.c_str(), ios::out)); if(profiler) profiler->setPrecision(20); ApplicationTools::displayResult("Profiler", prPath); // Should I ignore some parameters? ParameterList allParameters = model->getParameters(); allParameters.addParameters(rDist->getParameters()); ParameterList parametersToIgnore; string paramListDesc = ApplicationTools::getStringParameter("optimization.ignore_parameter", bppdist.getParams(), "", "", true, false); bool ignoreBrLen = false; StringTokenizer st(paramListDesc, ","); while (st.hasMoreToken()) { try { string param = st.nextToken(); if (param == "BrLen") ignoreBrLen = true; else { if (allParameters.hasParameter(param)) { Parameter* p = &allParameters.getParameter(param); parametersToIgnore.addParameter(*p); } else ApplicationTools::displayWarning("Parameter '" + param + "' not found."); } } catch (ParameterNotFoundException& pnfe) { ApplicationTools::displayError("Parameter '" + pnfe.getParameter() + "' not found, and so can't be ignored!"); } } unsigned int nbEvalMax = ApplicationTools::getParameter<unsigned int>("optimization.max_number_f_eval", bppdist.getParams(), 1000000); ApplicationTools::displayResult("Max # ML evaluations", TextTools::toString(nbEvalMax)); double tolerance = ApplicationTools::getDoubleParameter("optimization.tolerance", bppdist.getParams(), .000001); ApplicationTools::displayResult("Tolerance", TextTools::toString(tolerance)); //Here it is: ofstream warn("warnings", ios::out); ApplicationTools::warning = new StlOutputStreamWrapper(&warn); tree = OptimizationTools::buildDistanceTree(distEstimation, *distMethod, parametersToIgnore, !ignoreBrLen, type, tolerance, nbEvalMax, profiler, messenger, optVerbose); warn.close(); delete ApplicationTools::warning; ApplicationTools::warning = ApplicationTools::message; string matrixPath = ApplicationTools::getAFilePath("output.matrix.file", bppdist.getParams(), false, false, "", false); if (matrixPath != "none") { ApplicationTools::displayResult("Output matrix file", matrixPath); string matrixFormat = ApplicationTools::getAFilePath("output.matrix.format", bppdist.getParams(), false, false, "", false); string format = ""; bool extended = false; std::map<std::string, std::string> unparsedArguments_; KeyvalTools::parseProcedure(matrixFormat, format, unparsedArguments_); if (unparsedArguments_.find("type") != unparsedArguments_.end()) { if (unparsedArguments_["type"] == "extended") { extended = true; } else if (unparsedArguments_["type"] == "classic") extended = false; else ApplicationTools::displayWarning("Argument '" + unparsedArguments_["type"] + "' for parameter 'Phylip#type' is unknown. " + "Default used instead: not extended."); } else ApplicationTools::displayWarning("Argument 'Phylip#type' not found. Default used instead: not extended."); ODistanceMatrix* odm = IODistanceMatrixFactory().createWriter(IODistanceMatrixFactory::PHYLIP_FORMAT, extended); odm->write(*distEstimation.getMatrix(), matrixPath, true); delete odm; } PhylogeneticsApplicationTools::writeTree(*tree, bppdist.getParams()); //Output some parameters: if (type == OptimizationTools::DISTANCEMETHOD_ITERATIONS) { // Write parameters to screen: ParameterList parameters = model->getParameters(); for (unsigned int i = 0; i < parameters.size(); i++) { ApplicationTools::displayResult(parameters[i].getName(), TextTools::toString(parameters[i].getValue())); } parameters = rDist->getParameters(); for (unsigned int i = 0; i < parameters.size(); i++) { ApplicationTools::displayResult(parameters[i].getName(), TextTools::toString(parameters[i].getValue())); } // Write parameters to file: string parametersFile = ApplicationTools::getAFilePath("output.estimates", bppdist.getParams(), false, false); if (parametersFile != "none") { ofstream out(parametersFile.c_str(), ios::out); parameters = model->getParameters(); for (unsigned int i = 0; i < parameters.size(); i++) { out << parameters[i].getName() << " = " << parameters[i].getValue() << endl; } parameters = rDist->getParameters(); for (unsigned int i = 0; i < parameters.size(); i++) { out << parameters[i].getName() << " = " << parameters[i].getValue() << endl; } out.close(); } } //Bootstrap: unsigned int nbBS = ApplicationTools::getParameter<unsigned int>("bootstrap.number", bppdist.getParams(), 0); if(nbBS > 0) { ApplicationTools::displayResult("Number of bootstrap samples", TextTools::toString(nbBS)); bool approx = ApplicationTools::getBooleanParameter("bootstrap.approximate", bppdist.getParams(), true); ApplicationTools::displayResult("Use approximate bootstrap", TextTools::toString(approx ? "yes" : "no")); if(approx) { type = OptimizationTools::DISTANCEMETHOD_INIT; parametersToIgnore = allParameters; ignoreBrLen = true; } bool bootstrapVerbose = ApplicationTools::getBooleanParameter("bootstrap.verbose", bppdist.getParams(), false, "", true, false); string bsTreesPath = ApplicationTools::getAFilePath("bootstrap.output.file", bppdist.getParams(), false, false); ofstream *out = NULL; if(bsTreesPath != "none") { ApplicationTools::displayResult("Bootstrap trees stored in file", bsTreesPath); out = new ofstream(bsTreesPath.c_str(), ios::out); } Newick newick; vector<Tree *> bsTrees(nbBS); ApplicationTools::displayTask("Bootstrapping", true); for(unsigned int i = 0; i < nbBS; i++) { ApplicationTools::displayGauge(i, nbBS-1, '='); VectorSiteContainer * sample = SiteContainerTools::bootstrapSites(*sites); if(approx) model->setFreqFromData(*sample); distEstimation.setData(sample); bsTrees[i] = OptimizationTools::buildDistanceTree( distEstimation, *distMethod, parametersToIgnore, ignoreBrLen, type, tolerance, nbEvalMax, NULL, NULL, (bootstrapVerbose ? 1 : 0) ); if(out && i == 0) newick.write(*bsTrees[i], bsTreesPath, true); if(out && i > 0) newick.write(*bsTrees[i], bsTreesPath, false); delete sample; } if(out) out->close(); if(out) delete out; ApplicationTools::displayTaskDone(); ApplicationTools::displayTask("Compute bootstrap values"); TreeTools::computeBootstrapValues(*tree, bsTrees); ApplicationTools::displayTaskDone(); for(unsigned int i = 0; i < nbBS; i++) delete bsTrees[i]; //Write resulting tree: PhylogeneticsApplicationTools::writeTree(*tree, bppdist.getParams()); } delete alphabet; delete sites; delete distMethod; delete tree; bppdist.done();} catch(exception & e) { cout << e.what() << endl; return 1; } return 0; }