/**
 * @brief Build a consensus tree from a set of trees by matrix representation
 *        followed by a parsimony search.
 *
 * The input trees are encoded as a site container (TreeTools::MRPEncode), a
 * BioNJ tree is computed from distances estimated on that matrix, and the
 * BioNJ tree is then used as the starting point of an NNI parsimony search.
 *
 * @param vecTr The trees to combine.
 * @return A newly allocated tree; the caller is responsible for deleting it.
 */
Tree* TreeTools::MRPMultilabel(const vector<Tree*>& vecTr)
{
  // Matrix representation of the input trees.
  VectorSiteContainer* sites = TreeTools::MRPEncode(vecTr);

  // Starting BioNJ tree.
  const DNA* alphabet = dynamic_cast<const DNA*>(sites->getAlphabet());
  JCnuc* jc = new JCnuc(alphabet);
  ConstantDistribution* constRate = new ConstantDistribution(1.);
  BioNJ bionjTreeBuilder(false, false);
  {
    // distFunc is scoped so that it is gone before 'sites' is released below.
    DistanceEstimation distFunc(jc, constRate, sites, 0, true);

    // NOTE(review): assumes getMatrix() hands back a caller-owned copy, as
    // bpp-phyl's OptimizationTools::buildDistanceTree treats it -- confirm
    // against the installed Bio++ version. The original dereferenced the
    // pointer and dropped it.
    DistanceMatrix* distances = distFunc.getMatrix();
    bionjTreeBuilder.setDistanceMatrix(*distances);
    bionjTreeBuilder.computeTree();
    delete distances;
  }
  if (ApplicationTools::message)
    ApplicationTools::message->endLine();
  TreeTemplate<Node>* startTree = new TreeTemplate<Node>(*bionjTreeBuilder.getTree());

  // Maximum parsimony optimization by NNI.
  DRTreeParsimonyScore* MPScore = new DRTreeParsimonyScore(*startTree, *sites, false);
  MPScore = OptimizationTools::optimizeTreeNNI(MPScore, 0);
  delete startTree;
  Tree* retTree = new TreeTemplate<Node>(MPScore->getTree());
  delete MPScore;

  // NOTE(review): the encoded matrix was never freed in the original. This
  // assumes the parsimony score works on its own copy of the data (it takes
  // the container by reference) -- confirm.
  delete sites;

  return retTree;
}
/**
 * @brief Draw sites at random from a container.
 *
 * Each of the nbSites draws picks a position with
 * RandomTools::giveIntRandomNumberBetweenZeroAndEntry, bounded by the number
 * of sites in the input, and appends that site to the result without position
 * checking. The same position may be drawn more than once.
 *
 * @param sites   The container to sample from.
 * @param nbSites The number of sites to draw.
 * @param index   If not null, every drawn position is appended to this vector.
 * @return A newly allocated container holding the sampled sites.
 */
VectorSiteContainer* SiteContainerTools::sampleSites(const SiteContainer& sites, size_t nbSites, vector<size_t>* index)
{
  VectorSiteContainer* sampled = new VectorSiteContainer(sites.getSequencesNames(), sites.getAlphabet());
  const int upperBound = static_cast<int>(sites.getNumberOfSites());

  for (size_t draw = 0; draw != nbSites; ++draw)
  {
    const int picked = RandomTools::giveIntRandomNumberBetweenZeroAndEntry(upperBound);
    const size_t pos = static_cast<size_t>(picked);
    sampled->addSite(sites.getSite(pos), false);
    if (index != 0)
      index->push_back(pos);
  }

  return sampled;
}
/**
 * @brief Copy the sites visited by a CompleteSiteContainerIterator.
 *
 * The result carries the same sequence names and alphabet as the input.
 *
 * @param sites The input container.
 * @return A newly allocated container with the selected sites.
 */
SiteContainer* SiteContainerTools::getCompleteSites(const SiteContainer& sites)
{
  const vector<string> names = sites.getSequencesNames();

  VectorSiteContainer* complete = new VectorSiteContainer(names.size(), sites.getAlphabet());
  complete->setSequencesNames(names, false);

  for (CompleteSiteContainerIterator it(sites); it.hasMoreSites(); )
  {
    complete->addSite(*it.nextSite());
  }

  return complete;
}
/**
 * @brief Keep only the sites whose frequency for state -1 does not exceed a
 *        threshold.
 *
 * Frequencies are computed per site with SiteTools::getFrequencies; a site is
 * copied to the result when its entry for state -1 is <= maxFreqGaps.
 *
 * @param sites       The input container.
 * @param maxFreqGaps The largest accepted frequency for state -1.
 * @return A newly allocated container with the retained sites.
 */
SiteContainer* SiteContainerTools::removeGapSites(const SiteContainer& sites, double maxFreqGaps)
{
  const vector<string> names = sites.getSequencesNames();

  VectorSiteContainer* kept = new VectorSiteContainer(names.size(), sites.getAlphabet());
  kept->setSequencesNames(names, false);

  for (unsigned int pos = 0; pos < sites.getNumberOfSites(); ++pos)
  {
    map<int, double> frequencies;
    SiteTools::getFrequencies(sites.getSite(pos), frequencies);

    // operator[] yields 0 when state -1 was never observed at this site.
    const bool fewEnoughGaps = (frequencies[-1] <= maxFreqGaps);
    if (fewEnoughGaps)
    {
      kept->addSite(sites.getSite(pos), false);
    }
  }

  return kept;
}
SiteContainer* SiteContainerTools::removeGapOrUnresolvedOnlySites(const SiteContainer& sites) { vector<string> seqNames = sites.getSequencesNames(); VectorSiteContainer* noGapCont = new VectorSiteContainer(seqNames.size(), sites.getAlphabet()); noGapCont->setSequencesNames(seqNames, false); for (unsigned int i = 0; i < sites.getNumberOfSites(); i++) { const Site* site = &sites.getSite(i); if (!SiteTools::isGapOrUnresolvedOnly(*site)) noGapCont->addSite(*site, false); } return noGapCont; }
/**
 * @brief Extract the sites listed in a selection, in selection order.
 *
 * Sites are added without position checking, and the general comments of the
 * input container are copied to the result.
 *
 * @param sequences The container to extract sites from.
 * @param selection The indices of the sites to extract.
 * @return A newly allocated container with the selected sites.
 */
SiteContainer* SiteContainerTools::getSelectedSites(
  const SiteContainer& sequences,
  const SiteSelection& selection)
{
  const vector<string> names = sequences.getSequencesNames();

  VectorSiteContainer* picked = new VectorSiteContainer(names.size(), sequences.getAlphabet());
  picked->setSequencesNames(names, false);

  // Names are not checked: the container passed as argument is assumed correct.
  // WARNING: a selection that lists the same index several times is not
  // handled specially here.
  for (unsigned int k = 0; k < selection.size(); k++)
  {
    const Site& column = sequences.getSite(selection[k]);
    picked->addSite(column, false);
  }

  picked->setGeneralComments(sequences.getGeneralComments());
  return picked;
}
/**
 * @brief Drop every site for which SiteTools::hasStopCodon is true.
 *
 * @param sites The input container; its alphabet must be a CodonAlphabet.
 * @return A newly allocated container with the remaining sites.
 * @throw AlphabetException If the alphabet of the container is not a codon
 *        alphabet.
 */
SiteContainer* SiteContainerTools::removeStopCodonSites(const SiteContainer& sites) throw (AlphabetException)
{
  // The cast is only used to validate the alphabet type.
  if (dynamic_cast<const CodonAlphabet*>(sites.getAlphabet()) == 0)
    throw AlphabetException("Not a Codon Alphabet", sites.getAlphabet());

  const vector<string> names = sites.getSequencesNames();

  VectorSiteContainer* kept = new VectorSiteContainer(names.size(), sites.getAlphabet());
  kept->setSequencesNames(names, false);

  for (unsigned int pos = 0; pos < sites.getNumberOfSites(); pos++)
  {
    const Site& column = sites.getSite(pos);
    if (SiteTools::hasStopCodon(column))
      continue;
    kept->addSite(column, false);
  }

  return kept;
}
/**
 * @brief Copy constructor.
 *
 * Names are copied directly; sites are re-added one by one without position
 * checking; each per-sequence comment slot receives a new Comments object
 * copied from the source container.
 *
 * @param vsc The container to copy.
 */
VectorSiteContainer::VectorSiteContainer(const VectorSiteContainer& vsc) :
  AbstractSequenceContainer(vsc),
  sites_(0),
  names_(vsc.names_),
  comments_(vsc.getNumberOfSequences()),
  sequences_(vsc.getNumberOfSequences())
{
  // Sites: positions of the source are assumed to be correct.
  const size_t nbSites = vsc.getNumberOfSites();
  for (size_t pos = 0; pos != nbSites; ++pos)
  {
    addSite(vsc.getSite(pos), false);
  }

  // Per-sequence comments.
  const size_t nbSequences = vsc.getNumberOfSequences();
  for (size_t seq = 0; seq != nbSequences; ++seq)
  {
    comments_[seq] = new Comments(vsc.getComments(seq));
  }
}
/**
 * @brief Read an alignment from a file described by application parameters
 *        and optionally restrict it to a selection of sites.
 *
 * Parameters read from @p params:
 *  - "input.sequence.file":   path of the alignment file;
 *  - "input.sequence.format": format description (default "Fasta()");
 *  - "input.site.selection":  for non-MASE formats, either a list of 1-based
 *    indices parsed by NumCalcApplicationTools::seqFromString (negative
 *    values count from the end), or "Sample(n=..., replace=...)".
 * For the "MASE file" format, the site set name is read from the
 * "siteSelection" argument of the format description instead.
 *
 * If @p alpha is an RNY alphabet, the file is read with the underlying letter
 * alphabet and every sequence is then converted with SequenceTools::RNYslice.
 *
 * @param alpha            The alphabet of the returned container.
 * @param params           The application parameters.
 * @param suffix           Suffix appended to parameter names.
 * @param suffixIsOptional Whether the suffix is optional.
 * @param verbose          Whether to display what is being read/selected.
 * @param warn             Warning level forwarded to the parameter readers.
 * @return A newly allocated container owned by the caller.
 * @throw Exception If the alignment cannot be used as an ordered container,
 *        if the selection description is not understood, or if the selected
 *        site set is empty.
 */
VectorSiteContainer* SequenceApplicationTools::getSiteContainer(
  const Alphabet* alpha,
  map<string, string>& params,
  const string& suffix,
  bool suffixIsOptional,
  bool verbose,
  int warn)
{
  string sequenceFilePath = ApplicationTools::getAFilePath("input.sequence.file", params, true, true, suffix, suffixIsOptional, "none", warn);
  string sequenceFormat = ApplicationTools::getStringParameter("input.sequence.format", params, "Fasta()", suffix, suffixIsOptional, warn);
  BppOAlignmentReaderFormat bppoReader(warn);
  unique_ptr<IAlignment> iAln(bppoReader.read(sequenceFormat));
  map<string, string> args(bppoReader.getUnparsedArguments());
  if (verbose)
  {
    ApplicationTools::displayResult("Sequence file " + suffix, sequenceFilePath);
    ApplicationTools::displayResult("Sequence format " + suffix, iAln->getFormatName());
  }

  // RNY data are stored on disk with the underlying letter alphabet.
  const bool isRNY = AlphabetTools::isRNYAlphabet(alpha);
  const Alphabet* alpha2 = alpha;
  if (isRNY)
    alpha2 = &dynamic_cast<const RNY*>(alpha)->getLetterAlphabet();

  // Every container below is held by a smart pointer so that nothing leaks
  // when one of the exceptions further down is thrown.
  unique_ptr<VectorSiteContainer> sites;
  {
    unique_ptr<const SequenceContainer> seqCont(iAln->readAlignment(sequenceFilePath, alpha2));
    const OrderedSequenceContainer* ordered = dynamic_cast<const OrderedSequenceContainer*>(seqCont.get());
    if (!ordered)
      throw Exception("SequenceApplicationTools::getSiteContainer(). The alignment read is not an ordered sequence container.");
    sites.reset(new VectorSiteContainer(*ordered));
  }

  if (isRNY)
  {
    const SequenceTools ST;
    unique_ptr<VectorSiteContainer> sliced(new VectorSiteContainer(alpha));
    for (unsigned int i = 0; i < sites->getNumberOfSequences(); i++)
    {
      // NOTE(review): RNYslice is assumed to return a caller-owned sequence,
      // which the original never freed -- confirm against SequenceTools.
      unique_ptr<Sequence> slice(ST.RNYslice(sites->getSequence(i)));
      sites->getNumberOfSequences(); // no-op read; keeps evaluation order obvious
      sliced->addSequence(*slice);
    }
    sites.swap(sliced); // the letter-alphabet container is released with 'sliced'
  }

  // Look for site selection:
  if (iAln->getFormatName() == "MASE file")
  {
    // getting site set:
    string siteSet = ApplicationTools::getStringParameter("siteSelection", args, "none", suffix, suffixIsOptional, warn + 1);
    if (siteSet != "none")
    {
      // Exceptions raised by MaseTools simply propagate: the original caught
      // IOException only to rethrow a sliced copy of it.
      VectorSiteContainer* selected = dynamic_cast<VectorSiteContainer*>(MaseTools::getSelectedSites(*sites, siteSet));
      if (!selected)
        throw Exception("SequenceApplicationTools::getSiteContainer(). Unexpected container type for site set '" + siteSet + "'.");
      unique_ptr<VectorSiteContainer> selectedSites(selected);
      if (verbose)
        ApplicationTools::displayResult("Set found", TextTools::toString(siteSet) + " sites.");

      if (selectedSites->getNumberOfSites() == 0)
      {
        throw Exception("Site set '" + siteSet + "' is empty.");
      }
      sites.swap(selectedSites);
    }
  }
  else
  {
    // getting site set:
    size_t nbSites = sites->getNumberOfSites();

    string siteSet = ApplicationTools::getStringParameter("input.site.selection", params, "none", suffix, suffixIsOptional, warn + 1);

    if (siteSet != "none")
    {
      unique_ptr<VectorSiteContainer> selectedSites;
      vector<size_t> vSite;
      try
      {
        // First interpretation: an explicit list of 1-based site indices.
        vector<int> vSite1 = NumCalcApplicationTools::seqFromString(siteSet, ",", ":");
        for (size_t i = 0; i < vSite1.size(); ++i)
        {
          // Negative indices count backwards from the last site.
          int x = (vSite1[i] >= 0 ? vSite1[i] : static_cast<int>(nbSites) + vSite1[i] + 1);
          if (x <= static_cast<int>(nbSites))
          {
            if (x > 0)
              vSite.push_back(static_cast<size_t>(x - 1));
            else
              throw Exception("SequenceApplicationTools::getSiteContainer(). Incorrect null index: " + TextTools::toString(x));
          }
          else
            throw Exception("SequenceApplicationTools::getSiteContainer(). Too large index: " + TextTools::toString(x));
        }
        selectedSites.reset(dynamic_cast<VectorSiteContainer*>(SiteContainerTools::getSelectedSites(*sites, vSite)));
        selectedSites->reindexSites();
      }
      catch (Exception&)
      {
        // Second interpretation: a procedure such as Sample(n=..., replace=...).
        string seln;
        map<string, string> selArgs;
        KeyvalTools::parseProcedure(siteSet, seln, selArgs);
        if (seln == "Sample")
        {
          size_t n = ApplicationTools::getParameter<size_t>("n", selArgs, nbSites, "", true, warn + 1);
          bool replace = ApplicationTools::getBooleanParameter("replace", selArgs, false, "", true, warn + 1);

          vSite.resize(n);
          vector<size_t> vPos;
          for (size_t p = 0; p < nbSites; ++p)
          {
            vPos.push_back(p);
          }

          RandomTools::getSample(vPos, vSite, replace);

          selectedSites.reset(dynamic_cast<VectorSiteContainer*>(SiteContainerTools::getSelectedSites(*sites, vSite)));
          if (replace)
            selectedSites->reindexSites();
        }
        else
          throw Exception("Unknown site selection description: " + siteSet);
      }

      if (verbose)
        ApplicationTools::displayResult("Selected sites", TextTools::toString(siteSet));

      if (selectedSites.get() && (selectedSites->getNumberOfSites() == 0))
      {
        throw Exception("Site set '" + siteSet + "' is empty.");
      }
      sites.swap(selectedSites);
    }
  }
  return sites.release();
}
/**
 * @brief Create a container with no sites that shares this container's
 *        alphabet and general comments.
 *
 * @return A newly allocated container.
 */
VectorSiteContainer* VectorSiteContainer::createEmptyContainer() const
{
  VectorSiteContainer* emptyCopy = new VectorSiteContainer(getAlphabet());
  emptyCopy->setGeneralComments(getGeneralComments());
  return emptyCopy;
}
int main(int args, char** argv) { cout << "******************************************************************" << endl; cout << "* Bio++ Sequence Manipulator, version 2.3.0. *" << endl; cout << "* Author: J. Dutheil Last Modif. 25/11/14 *" << endl; cout << "******************************************************************" << endl; cout << endl; if (args == 1) { help(); return 0; } try { BppApplication bppseqman(args, argv, "BppSeqMan"); bppseqman.startTimer(); // Get alphabet Alphabet* alphabet = SequenceApplicationTools::getAlphabet(bppseqman.getParams(), "", false, true, true); unique_ptr<GeneticCode> gCode; CodonAlphabet* codonAlphabet = dynamic_cast<CodonAlphabet*>(alphabet); // Get sequences: bool aligned = ApplicationTools::getBooleanParameter("input.alignment", bppseqman.getParams(), false, "", true, 1); OrderedSequenceContainer* sequences = 0; if (aligned) { VectorSiteContainer* allSites = SequenceApplicationTools::getSiteContainer(alphabet, bppseqman.getParams()); sequences = SequenceApplicationTools::getSitesToAnalyse(*allSites, bppseqman.getParams(), "", true, false); delete allSites; } else { SequenceContainer* tmp = SequenceApplicationTools::getSequenceContainer(alphabet, bppseqman.getParams(), "", true, true); sequences = new VectorSequenceContainer(*tmp); delete tmp; } ApplicationTools::displayResult("Number of sequences", sequences->getNumberOfSequences()); // Perform manipulations vector<string> actions = ApplicationTools::getVectorParameter<string>("sequence.manip", bppseqman.getParams(), ',', "", "", false, 1); for (size_t a = 0; a < actions.size(); a++) { string cmdName; map<string, string> cmdArgs; KeyvalTools::parseProcedure(actions[a], cmdName, cmdArgs); ApplicationTools::displayResult("Performing action", cmdName); // +-----------------+ // | Complementation | // +-----------------+ if (cmdName == "Complement") { OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet()); else sc = new 
VectorSequenceContainer(sequences->getAlphabet()); for (size_t i = 0; i < sequences->getNumberOfSequences(); i++) { Sequence* seq = SequenceTools::getComplement(sequences->getSequence(i)); sc->addSequence(*seq, false); delete seq; } delete sequences; sequences = sc; } // +------------------------+ // | (Reverse)Transcription | // +------------------------+ else if (cmdName == "Transcript") { if (sequences->getAlphabet()->getAlphabetType() == AlphabetTools::DNA_ALPHABET.getAlphabetType()) { OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(&AlphabetTools::RNA_ALPHABET); else sc = new VectorSequenceContainer(&AlphabetTools::RNA_ALPHABET); for (unsigned int i = 0; i < sequences->getNumberOfSequences(); i++) { Sequence* seq = SequenceTools::transcript(sequences->getSequence(i)); sc->addSequence(*seq, false); delete seq; } delete sequences; sequences = sc; } else if (sequences->getAlphabet()->getAlphabetType() == AlphabetTools::RNA_ALPHABET.getAlphabetType()) { OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(&AlphabetTools::DNA_ALPHABET); else sc = new VectorSequenceContainer(&AlphabetTools::DNA_ALPHABET); for (unsigned int i = 0; i < sequences->getNumberOfSequences(); i++) { Sequence* seq = SequenceTools::reverseTranscript(sequences->getSequence(i)); sc->addSequence(*seq, false); delete seq; } delete sequences; sequences = sc; } else throw Exception("Transcription error: input alphabet must be of type 'nucleic'."); } // +-------------------------------+ // | Switching nucleotide alphabet | // +-------------------------------+ else if (cmdName == "Switch") { const Alphabet* alpha = 0; if (sequences->getAlphabet()->getAlphabetType() == AlphabetTools::DNA_ALPHABET.getAlphabetType()) { alpha = &AlphabetTools::RNA_ALPHABET; } else if (sequences->getAlphabet()->getAlphabetType() == AlphabetTools::RNA_ALPHABET.getAlphabetType()) { alpha = &AlphabetTools::DNA_ALPHABET; } else throw Exception("Cannot switch alphabet type, 
alphabet is not of type 'nucleic'."); OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(alpha); else sc = new VectorSequenceContainer(alpha); for (size_t i = 0; i < sequences->getNumberOfSequences(); i++) { const Sequence* old = &sequences->getSequence(i); vector<int> content(old->size()); for (size_t j = 0; j < old->size(); ++j) content[j] = (*old)[j]; Sequence* seq = new BasicSequence(old->getName(), content, old->getComments(), alpha); sc->addSequence(*seq, false); delete seq; } delete sequences; sequences = sc; } // +-------------+ // | Translation | // +-------------+ else if (cmdName == "Translate") { if (!AlphabetTools::isCodonAlphabet(sequences->getAlphabet())) throw Exception("Error in translation: alphabet is not of type 'codon'."); if (cmdArgs["code"] != "") throw Exception("ERROR: 'code' argument is deprecated. The genetic code to use for translation is now set by the top-level argument 'genetic_code'."); if (!gCode.get()) { string codeDesc = ApplicationTools::getStringParameter("genetic_code", bppseqman.getParams(), "Standard", "", true, 1); ApplicationTools::displayResult("Genetic Code", codeDesc); gCode.reset(SequenceApplicationTools::getGeneticCode(codonAlphabet->getNucleicAlphabet(), codeDesc)); } OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(&AlphabetTools::PROTEIN_ALPHABET); else sc = new VectorSequenceContainer(&AlphabetTools::PROTEIN_ALPHABET); for (size_t i = 0; i < sequences->getNumberOfSequences(); ++i) { Sequence* seq = gCode->translate(sequences->getSequence(i)); sc->addSequence(*seq, false); delete seq; } delete sequences; sequences = sc; } // +-------------+ // | Remove gaps | // +-------------+ else if (cmdName == "RemoveGaps") { VectorSequenceContainer* sc = new VectorSequenceContainer(sequences->getAlphabet()); for (size_t i = 0; i < sequences->getNumberOfSequences(); i++) { unique_ptr<Sequence> seq(sequences->getSequence(i).clone()); SequenceTools::removeGaps(*seq); 
sc->addSequence(*seq); } delete sequences; sequences = sc; aligned = false; } // +---------------------------+ // | Change gaps to unresolved | // +---------------------------+ else if (cmdName == "GapToUnknown") { OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet()); else sc = new VectorSequenceContainer(sequences->getAlphabet()); for (size_t i = 0; i < sequences->getNumberOfSequences(); i++) { Sequence* seq = new BasicSequence(sequences->getSequence(i)); SymbolListTools::changeGapsToUnknownCharacters(*seq); sc->addSequence(*seq, false); delete seq; } delete sequences; sequences = sc; } // +---------------------------+ // | Change unresolved to gaps | // +---------------------------+ else if (cmdName == "UnknownToGap") { OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet()); else sc = new VectorSequenceContainer(sequences->getAlphabet()); for (size_t i = 0; i < sequences->getNumberOfSequences(); i++) { Sequence* seq = new BasicSequence(sequences->getSequence(i)); SymbolListTools::changeUnresolvedCharactersToGaps(*seq); sc->addSequence(*seq, false); delete seq; } delete sequences; sequences = sc; } // +--------------+ // | Remove stops | // +--------------+ else if (cmdName == "RemoveStops") { if (!gCode.get()) { string codeDesc = ApplicationTools::getStringParameter("genetic_code", bppseqman.getParams(), "Standard", "", true, 1); ApplicationTools::displayResult("Genetic Code", codeDesc); gCode.reset(SequenceApplicationTools::getGeneticCode(codonAlphabet->getNucleicAlphabet(), codeDesc)); } SiteContainer* sites = dynamic_cast<SiteContainer*>(sequences); if (!sites) { VectorSequenceContainer* sc = new VectorSequenceContainer(sequences->getAlphabet()); for (size_t i = 0; i < sequences->getNumberOfSequences(); ++i) { unique_ptr<Sequence> seq(sequences->getSequence(i).clone()); SequenceTools::removeStops(*seq, *gCode); sc->addSequence(*seq); } delete sequences; sequences = 
sc; } else { VectorSiteContainer* sc = new VectorSiteContainer(sequences->getAlphabet()); for (size_t i = 0; i < sequences->getNumberOfSequences(); ++i) { unique_ptr<Sequence> seq(sequences->getSequence(i).clone()); SequenceTools::replaceStopsWithGaps(*seq, *gCode); sc->addSequence(*seq); } delete sequences; sequences = sc; } } // +--------------+ // | Remove stops | // +--------------+ else if (cmdName == "RemoveColumnsWithStops") { SiteContainer* sites = dynamic_cast<SiteContainer*>(sequences); if (!sites) { throw Exception("'RemoveColumnsWithStops' can only be used on alignment. You may consider using the 'CoerceToAlignment' command."); } if (!gCode.get()) { string codeDesc = ApplicationTools::getStringParameter("genetic_code", bppseqman.getParams(), "Standard", "", true, 1); ApplicationTools::displayResult("Genetic Code", codeDesc); gCode.reset(SequenceApplicationTools::getGeneticCode(codonAlphabet->getNucleicAlphabet(), codeDesc)); } for (size_t i = sites->getNumberOfSites(); i > 0; i--) { if (CodonSiteTools::hasStop(sites->getSite(i-1), *gCode)) sites->deleteSite(i - 1); } } // +---------+ // | Get CDS | // +---------+ else if (cmdName == "GetCDS") { if (!gCode.get()) { string codeDesc = ApplicationTools::getStringParameter("genetic_code", bppseqman.getParams(), "Standard", "", true, 1); ApplicationTools::displayResult("Genetic Code", codeDesc); gCode.reset(SequenceApplicationTools::getGeneticCode(codonAlphabet->getNucleicAlphabet(), codeDesc)); } OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet()); else sc = new VectorSequenceContainer(sequences->getAlphabet()); for (size_t i = 0; i < sequences->getNumberOfSequences(); ++i) { BasicSequence seq = sequences->getSequence(i); size_t len = seq.size(); SequenceTools::getCDS(seq, *gCode, false, true, true, false); if (aligned) { for (size_t c = seq.size(); c < len; ++c) seq.addElement(seq.getAlphabet()->getGapCharacterCode()); } sc->addSequence(seq, false); } delete 
sequences; sequences = sc; } // +--------------------------+ // | Resolve dotted alignment | // +--------------------------+ else if (actions[a] == "CoerceToAlignment") { SiteContainer* sites = dynamic_cast<SiteContainer*>(sequences); if(! sites) { sites = new VectorSiteContainer(*sequences); delete sequences; sequences = sites; } aligned = true; } else if (actions[a] == "ResolvedDotted") { SiteContainer* sites = dynamic_cast<SiteContainer *>(sequences); if (!sites) { throw Exception("'ResolvedDotted' can only be used on alignment. You may consider using the 'CoerceToAlignment' command."); } const Alphabet* alpha = 0; string alphastr = ApplicationTools::getStringParameter("alphabet", cmdArgs, "DNA", "", false, 1); if (alphastr == "DNA") alpha = &AlphabetTools::DNA_ALPHABET; else if (alphastr == "RNA") alpha = &AlphabetTools::RNA_ALPHABET; else if (alphastr == "Protein") alpha = &AlphabetTools::PROTEIN_ALPHABET; else throw Exception("Resolved alphabet must be one of [DNA|RNA|Protein] for solving dotted alignment."); OrderedSequenceContainer* resolvedCont = SiteContainerTools::resolveDottedAlignment(*sites, alpha); delete sequences; sequences = resolvedCont; } // +---------------------+ // | Keep complete sites | // +---------------------+ else if (cmdName == "KeepComplete") { SiteContainer* sites = dynamic_cast<SiteContainer *>(sequences); if (!sites) { throw Exception("'KeepComplete' can only be used on alignment. 
You may consider using the 'CoerceToAlignment' command."); } string maxGapOption = ApplicationTools::getStringParameter("maxGapAllowed", cmdArgs, "100%", "", false, 1); if (maxGapOption[maxGapOption.size()-1] == '%') { double gapFreq = TextTools::toDouble(maxGapOption.substr(0, maxGapOption.size()-1)) / 100.; for (size_t i = sites->getNumberOfSites(); i > 0; i--) { map<int, double> freqs; SiteTools::getFrequencies(sites->getSite(i - 1), freqs); if (freqs[-1] > gapFreq) sites->deleteSite(i - 1); } } else { size_t gapNum = TextTools::to<size_t>(maxGapOption); for (size_t i = sites->getNumberOfSites(); i > 0; i--) { map<int, size_t> counts; SiteTools::getCounts(sites->getSite(i - 1), counts); counts[-1]; //Needed in case this entry does not exist in the map. This will set it to 0. if (counts[-1] > gapNum) sites->deleteSite(i-1); } } } // +-----------------+ // | Invert sequence | // +-----------------+ else if (cmdName == "Invert") { OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet()); else sc = new VectorSequenceContainer(sequences->getAlphabet()); for (size_t i = 0; i < sequences->getNumberOfSequences(); i++) { const Sequence* old = &sequences->getSequence(i); Sequence* seq = SequenceTools::getInvert(*old); sc->addSequence(*seq, false); delete seq; } delete sequences; sequences = sc; } // +------------------+ // | GetCodonPosition | // +------------------+ else if (cmdName == "GetCodonPosition") { unsigned int pos = ApplicationTools::getParameter<unsigned int>("position", cmdArgs, 3, "", false, 1); OrderedSequenceContainer* sc = dynamic_cast<OrderedSequenceContainer*>(SequenceContainerTools::getCodonPosition(*sequences, pos - 1)); delete sequences; if (aligned) { sequences = new VectorSiteContainer(*sc); delete sc; } else { sequences = sc; } } // +-----------------+ // | FilterFromTree | // +-----------------+ else if (cmdName == "FilterFromTree") { unique_ptr<Tree> 
tree(PhylogeneticsApplicationTools::getTree(cmdArgs, "")); vector<string> names = tree->getLeavesNames(); OrderedSequenceContainer* reorderedSequences = 0; if (aligned) { reorderedSequences = new VectorSiteContainer(sequences->getAlphabet()); } else { reorderedSequences = new VectorSequenceContainer(sequences->getAlphabet()); } for (size_t i = 0; i < names.size(); ++i) { reorderedSequences->addSequence(sequences->getSequence(names[i]), false); } delete sequences; sequences = reorderedSequences; } // +----------------------+ // | RemoveEmptySequences | // +----------------------+ else if (cmdName == "RemoveEmptySequences") { OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet()); else sc = new VectorSequenceContainer(sequences->getAlphabet()); for (size_t i = 0; i < sequences->getNumberOfSequences(); ++i) { if (SequenceTools::getNumberOfSites(sequences->getSequence(i))!=0) sc->addSequence(sequences->getSequence(i), false); } delete sequences; sequences = sc; } else throw Exception("Unknown action: " + cmdName); } // Write sequences ApplicationTools::displayBooleanResult("Final sequences are aligned", aligned); if (aligned) { SequenceApplicationTools::writeAlignmentFile(*dynamic_cast<SiteContainer*>(sequences), bppseqman.getParams(), "", true, 1); } else { SequenceApplicationTools::writeSequenceFile(*sequences, bppseqman.getParams(), "", true, 1); } delete alphabet; delete sequences; bppseqman.done(); } catch(exception & e) { cout << e.what() << endl; return 1; } return 0; }
int main(int args, char** argv) { cout << "******************************************************************" << endl; cout << "* Bio++ Computation of site likelihoods inside mixed models *" << endl; cout << "* Version 2.2.0. *" << endl; cout << "* Author: L. Guéguen Last Modif.: 25/09/14 *" << endl; cout << "******************************************************************" << endl; cout << endl; if (args == 1) { help(); return 0; } try { BppApplication bppmixedlikelihoods(args, argv, "BppMixedLikelihoods"); bppmixedlikelihoods.startTimer(); Alphabet* alphabet = SequenceApplicationTools::getAlphabet(bppmixedlikelihoods.getParams(), "", false); auto_ptr<GeneticCode> gCode; CodonAlphabet* codonAlphabet = dynamic_cast<CodonAlphabet*>(alphabet); if (codonAlphabet) { string codeDesc = ApplicationTools::getStringParameter("genetic_code", bppmixedlikelihoods.getParams(), "Standard", "", true, true); ApplicationTools::displayResult("Genetic Code", codeDesc); gCode.reset(SequenceApplicationTools::getGeneticCode(codonAlphabet->getNucleicAlphabet(), codeDesc)); } // get the data VectorSiteContainer* allSites = SequenceApplicationTools::getSiteContainer(alphabet, bppmixedlikelihoods.getParams()); VectorSiteContainer* sites = SequenceApplicationTools::getSitesToAnalyse(*allSites, bppmixedlikelihoods.getParams(), "", true, false); delete allSites; ApplicationTools::displayResult("Number of sequences", TextTools::toString(sites->getNumberOfSequences())); ApplicationTools::displayResult("Number of sites", TextTools::toString(sites->getNumberOfSites())); // Get the tree Tree* tree = PhylogeneticsApplicationTools::getTree(bppmixedlikelihoods.getParams()); ApplicationTools::displayResult("Number of leaves", TextTools::toString(tree->getNumberOfLeaves())); AbstractDiscreteRatesAcrossSitesTreeLikelihood* tl; string nhOpt = ApplicationTools::getStringParameter("nonhomogeneous", bppmixedlikelihoods.getParams(), "no", "", true, false); ApplicationTools::displayResult("Heterogeneous 
model", nhOpt); MixedSubstitutionModel* model = 0; MixedSubstitutionModelSet* modelSet = 0; DiscreteDistribution* rDist = 0; if (nhOpt == "no") { model = dynamic_cast<MixedSubstitutionModel*>(PhylogeneticsApplicationTools::getSubstitutionModel(alphabet, gCode.get(), sites, bppmixedlikelihoods.getParams())); if (model == 0) { cout << "Model is not a Mixed model" << endl; exit(0); } SiteContainerTools::changeGapsToUnknownCharacters(*sites); if (model->getNumberOfStates() > model->getAlphabet()->getSize()) { // Markov-modulated Markov model! rDist = new ConstantRateDistribution(); } else { rDist = PhylogeneticsApplicationTools::getRateDistribution(bppmixedlikelihoods.getParams()); } tl = new RHomogeneousMixedTreeLikelihood(*tree, *sites, model, rDist, true); } else if (nhOpt == "one_per_branch") { model = dynamic_cast<MixedSubstitutionModel*>(PhylogeneticsApplicationTools::getSubstitutionModel(alphabet, gCode.get(), sites, bppmixedlikelihoods.getParams())); if (model == 0) { cout << "Model is not a Mixed model" << endl; exit(0); } SiteContainerTools::changeGapsToUnknownCharacters(*sites); if (model->getNumberOfStates() > model->getAlphabet()->getSize()) { // Markov-modulated Markov model! rDist = new ConstantRateDistribution(); } else { rDist = PhylogeneticsApplicationTools::getRateDistribution(bppmixedlikelihoods.getParams()); } vector<double> rateFreqs; if (model->getNumberOfStates() != alphabet->getSize()) { // Markov-Modulated Markov Model... unsigned int n = (unsigned int)(model->getNumberOfStates() / alphabet->getSize()); rateFreqs = vector<double>(n, 1. / (double)n); // Equal rates assumed for now, may be changed later (actually, in the most general case, // we should assume a rate distribution for the root also!!! 
} std::map<std::string, std::string> aliasFreqNames; FrequenciesSet* rootFreqs = PhylogeneticsApplicationTools::getRootFrequenciesSet(alphabet, gCode.get(), sites, bppmixedlikelihoods.getParams(), aliasFreqNames, rateFreqs); vector<string> globalParameters = ApplicationTools::getVectorParameter<string>("nonhomogeneous_one_per_branch.shared_parameters", bppmixedlikelihoods.getParams(), ',', ""); modelSet = dynamic_cast<MixedSubstitutionModelSet*>(SubstitutionModelSetTools::createNonHomogeneousModelSet(model, rootFreqs, tree, aliasFreqNames, globalParameters)); model = 0; tl = new RNonHomogeneousMixedTreeLikelihood(*tree, *sites, modelSet, rDist, true); } else if (nhOpt == "general") { modelSet = dynamic_cast<MixedSubstitutionModelSet*>(PhylogeneticsApplicationTools::getSubstitutionModelSet(alphabet, gCode.get(), sites, bppmixedlikelihoods.getParams())); if (modelSet == 0) { cout << "Missing a Mixed model" << endl; exit(0); } SiteContainerTools::changeGapsToUnknownCharacters(*sites); if (modelSet->getNumberOfStates() > modelSet->getAlphabet()->getSize()) { // Markov-modulated Markov model! rDist = new ConstantDistribution(1.); } else { rDist = PhylogeneticsApplicationTools::getRateDistribution(bppmixedlikelihoods.getParams()); } tl = new RNonHomogeneousMixedTreeLikelihood(*tree, *sites, modelSet, rDist, true); } else throw Exception("Unknown option for nonhomogeneous: " + nhOpt); tl->initialize(); double logL = tl->getValue(); if (isinf(logL)) { // This may be due to null branch lengths, leading to null likelihood! ApplicationTools::displayWarning("!!! Warning!!! Likelihood is zero."); ApplicationTools::displayWarning("!!! This may be due to branch length == 0."); ApplicationTools::displayWarning("!!! 
All null branch lengths will be set to 0.000001."); ParameterList pl = tl->getBranchLengthsParameters(); for (unsigned int i = 0; i < pl.size(); i++) { if (pl[i].getValue() < 0.000001) pl[i].setValue(0.000001); } tl->matchParametersValues(pl); logL = tl->getValue(); } if (isinf(logL)) { ApplicationTools::displayError("!!! Unexpected likelihood == 0."); ApplicationTools::displayError("!!! Looking at each site:"); for (unsigned int i = 0; i < sites->getNumberOfSites(); i++) { (*ApplicationTools::error << "Site " << sites->getSite(i).getPosition() << "\tlog likelihood = " << tl->getLogLikelihoodForASite(i)).endLine(); } ApplicationTools::displayError("!!! 0 values (inf in log) may be due to computer overflow, particularily if datasets are big (>~500 sequences)."); exit(-1); } // Write parameters to screen: ApplicationTools::displayResult("Log likelihood", TextTools::toString(tl->getValue(), 15)); ParameterList parameters = tl->getSubstitutionModelParameters(); for (unsigned int i = 0; i < parameters.size(); i++) { ApplicationTools::displayResult(parameters[i].getName(), TextTools::toString(parameters[i].getValue())); } parameters = tl->getRateDistributionParameters(); for (unsigned int i = 0; i < parameters.size(); i++) { ApplicationTools::displayResult(parameters[i].getName(), TextTools::toString(parameters[i].getValue())); } // ///////////////////////////////////////////// // Getting likelihoods per submodel string outputFile; outputFile = ApplicationTools::getAFilePath("output.likelihoods.file", bppmixedlikelihoods.getParams(), true, false); ApplicationTools::displayResult("Output file for likelihoods", outputFile); ofstream out(outputFile.c_str(), ios::out); size_t nSites = sites->getNumberOfSites(); size_t nummodel = ApplicationTools::getParameter<size_t>("likelihoods.model_number", bppmixedlikelihoods.getParams(), 1, "", true, true); string parname = ApplicationTools::getStringParameter("likelihoods.parameter_name", bppmixedlikelihoods.getParams(), "", "", true, 
false); if (modelSet && ((nummodel <= 0) || (nummodel > modelSet->getNumberOfModels()))) { ApplicationTools::displayError("Bad number of model " + TextTools::toString(nummodel) + "."); exit(-1); } MixedSubstitutionModel* p0 = dynamic_cast<MixedSubstitutionModel*>(model ? model : modelSet->getModel(nummodel - 1)); if (!p0) { ApplicationTools::displayError("Model " + TextTools::toString(nummodel) + " is not a Mixed Model."); exit(-1); } const AbstractBiblioMixedSubstitutionModel* ptmp = dynamic_cast<const AbstractBiblioMixedSubstitutionModel*>(p0); if (ptmp) { p0 = ptmp->getMixedModel().clone(); if (nhOpt == "no") model = p0; else { modelSet->replaceModel(nummodel-1, p0); modelSet->isFullySetUpFor(*tree); } } ////////////////////////////////////////////////// // Case of a MixtureOfSubstitutionModels MixtureOfSubstitutionModels* pMSM = dynamic_cast<MixtureOfSubstitutionModels*>(p0); if (pMSM) { vector<string> colNames; colNames.push_back("Sites"); size_t nummod = pMSM->getNumberOfModels(); for (unsigned int i = 0; i < nummod; i++) { colNames.push_back(pMSM->getNModel(i)->getName()); } DataTable* rates = new DataTable(nSites, colNames.size()); rates->setColumnNames(colNames); for (unsigned int i = 0; i < nSites; i++) { const Site* currentSite = &sites->getSite(i); int currentSitePosition = currentSite->getPosition(); (*rates)(i, "Sites") = string("[" + TextTools::toString(currentSitePosition) + "]"); } Vdouble vprob = pMSM->getProbabilities(); for (unsigned int i = 0; i < nummod; i++) { string modname = pMSM->getNModel(i)->getName(); for (unsigned int j = 0; j < nummod; j++) { pMSM->setNProbability(j, (j == i) ? 
1 : 0); } if (tl) delete tl; if (nhOpt == "no") tl = new RHomogeneousMixedTreeLikelihood(*tree, *sites, model, rDist, true, false, true); else tl = new RNonHomogeneousMixedTreeLikelihood(*tree, *sites, modelSet, rDist, false, true); tl->initialize(); logL = tl->getValue(); Vdouble Vd = tl->getLogLikelihoodForEachSite(); for (unsigned int j = 0; j < nSites; j++) { (*rates)(j, modname) = TextTools::toString(Vd[j]); } ApplicationTools::displayMessage("\n"); ApplicationTools::displayMessage("Model " + modname + ":"); ApplicationTools::displayResult("Log likelihood", TextTools::toString(tl->getValue(), 15)); ApplicationTools::displayResult("Probability", TextTools::toString(vprob[i], 15)); } DataTable::write(*rates, out, "\t"); } ////////////////////////////////////////////////// // Case of a MixtureOfASubstitutionModel else { MixtureOfASubstitutionModel* pMSM2 = dynamic_cast<MixtureOfASubstitutionModel*>(p0); if (pMSM2 != NULL) { size_t nummod = pMSM2->getNumberOfModels(); if (parname == "") { ParameterList pl=pMSM2->getParameters(); for (size_t i2 = 0; i2 < pl.size(); i2++) { string pl2n = pl[i2].getName(); if (dynamic_cast<const ConstantDistribution*>(pMSM2->getDistribution(pl2n))==NULL) { parname=pl2n; while (parname.size()>0 && pMSM2->getDistribution(parname)==NULL) parname=pl2n.substr(0,pl2n.rfind("_")); if (parname.size()>0){ ApplicationTools::displayResult("likelihoods.parameter_name", parname); break; } } } } if (parname == "") { ApplicationTools::displayError("Argument likelihoods.parameter_name is required."); exit(-1); } vector< Vint > vvnmod; size_t i2 = 0; while (i2 < nummod) { string par2 = parname + "_" + TextTools::toString(i2 + 1); Vint vnmod = pMSM2->getSubmodelNumbers(par2); if (vnmod.size() == 0) break; vvnmod.push_back(vnmod); i2++; } size_t nbcl = vvnmod.size(); if (nbcl==0) throw Exception("Parameter " + parname + " is not mixed."); Vdouble vprob = pMSM2->getProbabilities(); vector<vector<double> > vvprob; vector<double> vsprob; for (size_t i = 
0; i < nbcl; i++) { vector<double> vprob2; for (size_t j = 0; j < vvnmod[i].size(); j++) { vprob2.push_back(vprob[static_cast<size_t>(vvnmod[i][j])]); } vvprob.push_back(vprob2); vsprob.push_back(VectorTools::sum(vvprob[i])); } vector<string> colNames; colNames.push_back("Sites"); Vdouble dval; for (unsigned int i = 0; i < nbcl; i++) { SubstitutionModel* pSM = pMSM2->getNModel(static_cast<size_t>(vvnmod[i][0])); double valPar = pSM->getParameterValue(pSM->getParameterNameWithoutNamespace(parname)); dval.push_back(valPar); colNames.push_back("Ll_" + parname + "=" + TextTools::toString(valPar)); } for (unsigned int i = 0; i < nbcl; i++) colNames.push_back("Pr_" + parname + "=" + TextTools::toString(dval[i])); colNames.push_back("mean"); DataTable* rates = new DataTable(nSites, colNames.size()); rates->setColumnNames(colNames); for (unsigned int i = 0; i < nSites; i++) { const Site* currentSite = &sites->getSite(i); int currentSitePosition = currentSite->getPosition(); (*rates)(i,"Sites")=TextTools::toString(currentSitePosition); } VVdouble vvd; vector<double> vRates = pMSM2->getVRates(); for (size_t i = 0; i < nbcl; ++i) { string par2 = parname + "_" + TextTools::toString(i + 1); for (unsigned int j = 0; j < nummod; ++j) pMSM2->setNProbability(j, 0); for (size_t j = 0; j < vvprob[i].size(); ++j) pMSM2->setNProbability(static_cast<size_t>(vvnmod[i][j]), vvprob[i][j] / vsprob[i]); if (tl) delete tl; if (nhOpt == "no") tl = new RHomogeneousMixedTreeLikelihood(*tree, *sites, model, rDist, true, false, true); else tl = new RNonHomogeneousMixedTreeLikelihood(*tree, *sites, modelSet, rDist, false, true); tl->initialize(); logL = tl->getValue(); Vdouble vd = tl->getLogLikelihoodForEachSite(); for (unsigned int j = 0; j < nSites; j++) (*rates)(j, i + 1) = TextTools::toString(vd[j]); vvd.push_back(vd); ApplicationTools::displayMessage("\n"); ApplicationTools::displayMessage("Parameter " + par2 + "=" + TextTools::toString(dval[i]) + " with rate=" + 
TextTools::toString(vRates[i])); ApplicationTools::displayResult("Log likelihood", TextTools::toString(tl->getValue(), 15)); ApplicationTools::displayResult("Probability", TextTools::toString(vsprob[i], 15)); } for (unsigned int j = 0; j < nSites; j++) { Vdouble vd; for (unsigned int i = 0; i < nbcl; i++) vd.push_back(std::log(vsprob[i])+vvd[i][j]); VectorTools::logNorm(vd); for (unsigned int i = 0; i < nbcl; i++) (*rates)(j,nbcl + i + 1) = TextTools::toString(std::exp(vd[i])); (*rates)(j, 2 * nbcl + 1) = TextTools::toString(VectorTools::sumExp(vd, dval)); } DataTable::write(*rates, out, "\t"); } } delete alphabet; delete sites; if (model) delete model; if (modelSet) delete modelSet; delete rDist; delete tl; delete tree; ApplicationTools::displayMessage("\n"); bppmixedlikelihoods.done(); } catch (exception& e) { cout << e.what() << endl; return 1; } return 0; }
// Select the sites of an alignment that should be used for an analysis.
//
// The selection mode is read from parameter 'sequence.sites_to_use'
// (default: "complete"):
//  - "all":      start from a copy of every site, then drop the sites holding
//                too many gaps according to 'sequence.max_gap_allowed'
//                (default "100%"). A value ending with '%' is a maximum gap
//                frequency, any other value is a maximum gap count.
//                If gapAsUnknown is set, the remaining gaps are then turned
//                into unknown characters.
//  - "complete": result of SiteContainerTools::getCompleteSites().
//  - "nogap":    result of SiteContainerTools::getSitesWithoutGaps().
// Any other value is reported with displayError() and terminates the program.
//
// allSites          the full alignment (not modified).
// params            the option map the parameters are read from.
// suffix            suffix appended to the parameter names.
// suffixIsOptional  forwarded to the parameter lookups.
// gapAsUnknown      only used in "all" mode, see above.
// verbose           if true, the selected mode and site counts are displayed.
//
// Returns a newly allocated container; the caller is responsible for deleting it.
VectorSiteContainer * SequenceApplicationTools::getSitesToAnalyse(
  const SiteContainer & allSites,
  map<string, string> & params,
  string suffix,
  bool suffixIsOptional,
  bool gapAsUnknown,
  bool verbose)
{
  string option = ApplicationTools::getStringParameter("sequence.sites_to_use", params, "complete", suffix, suffixIsOptional);
  if (verbose) ApplicationTools::displayResult("Sites to use", option);

  // Allocated by exactly one of the branches below. The full copy is only
  // made in "all" mode, so the other modes do not leak an unused copy.
  VectorSiteContainer * sitesToAnalyse = 0;

  if (option == "all")
  {
    string maxGapOption = ApplicationTools::getStringParameter("sequence.max_gap_allowed", params, "100%", suffix, suffixIsOptional);
    if (maxGapOption.empty())
    {
      // Indexing the last character of an empty string is undefined behaviour.
      ApplicationTools::displayError("Parameter 'sequence.max_gap_allowed' has an empty value.");
      exit(-1);
    }

    sitesToAnalyse = new VectorSiteContainer(allSites);

    // Sites are visited from last to first so that deleting one does not
    // shift the indices of the sites still to be examined.
    if (maxGapOption[maxGapOption.size() - 1] == '%')
    {
      // Threshold given as a percentage of gaps per site.
      double gapFreq = TextTools::toDouble(maxGapOption.substr(0, maxGapOption.size() - 1)) / 100.;
      for (unsigned int i = sitesToAnalyse->getNumberOfSites(); i > 0; i--)
      {
        map<int, double> freq;
        SiteTools::getFrequencies(*sitesToAnalyse->getSite(i - 1), freq);
        // State -1 is the one looked up for gaps.
        if (freq[-1] > gapFreq) sitesToAnalyse->deleteSite(i - 1);
      }
    }
    else
    {
      // Threshold given as an absolute number of gaps per site.
      unsigned int gapNum = TextTools::to<unsigned int>(maxGapOption);
      for (unsigned int i = sitesToAnalyse->getNumberOfSites(); i > 0; i--)
      {
        map<int, unsigned int> counts;
        SiteTools::getCounts(*sitesToAnalyse->getSite(i - 1), counts);
        if (counts[-1] > gapNum) sitesToAnalyse->deleteSite(i - 1);
      }
    }

    if (gapAsUnknown)
    {
      SiteContainerTools::changeGapsToUnknownCharacters(*sitesToAnalyse);
    }
  }
  else if (option == "complete")
  {
    sitesToAnalyse = dynamic_cast<VectorSiteContainer *>(SiteContainerTools::getCompleteSites(allSites));
    if (verbose) ApplicationTools::displayResult("Complete sites", TextTools::toString(sitesToAnalyse->getNumberOfSites()));
  }
  else if (option == "nogap")
  {
    sitesToAnalyse = dynamic_cast<VectorSiteContainer *>(SiteContainerTools::getSitesWithoutGaps(allSites));
    if (verbose) ApplicationTools::displayResult("Sites without gap", TextTools::toString(sitesToAnalyse->getNumberOfSites()));
  }
  else
  {
    // The message names the parameter exactly as it is read above.
    ApplicationTools::displayError("Option '" + option + "' unknown in parameter 'sequence.sites_to_use'.");
    exit(-1);
  }

  return sitesToAnalyse;
}
VectorSiteContainer * SequenceApplicationTools::getSiteContainer( const Alphabet * alpha, map<string, string> & params, const string & suffix, bool suffixIsOptional, bool verbose) { string sequenceFilePath = ApplicationTools::getAFilePath("sequence.file",params, true, true, suffix, suffixIsOptional); string sequenceFormat = ApplicationTools::getStringParameter("sequence.format", params, "Fasta", suffix, suffixIsOptional); if(verbose) ApplicationTools::displayResult("Sequence format " + suffix, sequenceFormat); ISequence * iSeq = NULL; if(sequenceFormat == "Mase") { iSeq = new Mase(); } else if(sequenceFormat == "Phylip") { bool sequential = true, extended = true; string split = " "; if(params.find("sequence.format_phylip.order") != params.end()) { if(params["sequence.format_phylip.order"] == "sequential" ) sequential = true; else if(params["sequence.format_phylip.order"] == "interleaved") sequential = false; else ApplicationTools::displayWarning("Argument '" + params["sequence.format_phylip.order"] + "' for parameter 'sequence.format_phylip.order' is unknown. " + "Default used instead: sequential."); } else ApplicationTools::displayWarning("Argument 'sequence.format_phylip.order' not found. Default used instead: sequential."); if(params.find("sequence.format_phylip.ext") != params.end()) { if(params["sequence.format_phylip.ext"] == "extended") { extended = true; split = ApplicationTools::getStringParameter("sequence.format_phylip.extended.split", params, "spaces", suffix, suffixIsOptional); if(split == "spaces") split = " "; else if(split == "tab") split = "\t"; else throw Exception("Unknown option for sequence.format_phylip.extended.split: " + split); } else if(params["sequence.format_phylip.ext"] == "classic" ) extended = false; else ApplicationTools::displayWarning("Argument '" + params["sequence.format_phylip.ext"] + "' for parameter 'sequence.format_phylip.ext' is unknown. 
" + "Default used instead: extended."); } else ApplicationTools::displayWarning("Argument 'sequence.format_phylip.ext' not found. Default used instead: extended."); iSeq = new Phylip(extended, sequential, 100, true, split); } else if(sequenceFormat == "Fasta") iSeq = new Fasta(); else if(sequenceFormat == "Clustal") iSeq = new Clustal(); else { ApplicationTools::displayError("Unknown sequence format."); exit(-1); } const SequenceContainer * seqCont = iSeq->read(sequenceFilePath, alpha); VectorSiteContainer * sites = new VectorSiteContainer(* dynamic_cast<const OrderedSequenceContainer *>(seqCont)); delete seqCont; delete iSeq; if(verbose) ApplicationTools::displayResult("Sequence file " + suffix, sequenceFilePath); // Look for site selection: if(sequenceFormat == "Mase") { //getting site set: string siteSet = ApplicationTools::getStringParameter("sequence.format_mase.site_selection", params, "none", suffix, suffixIsOptional, false); if(siteSet != "none") { VectorSiteContainer * selectedSites; try { selectedSites = dynamic_cast<VectorSiteContainer *>(MaseTools::getSelectedSites(* sites, siteSet)); if(verbose) ApplicationTools::displayResult("Set found", TextTools::toString(siteSet) + " sites."); } catch(IOException ioe) { ApplicationTools::displayError("Site Set '" + siteSet + "' not found."); exit(-1); } if(selectedSites->getNumberOfSites() == 0) { ApplicationTools::displayError("Site Set '" + siteSet + "' is empty."); exit(-1); } delete sites; sites = selectedSites; } } return sites; }
// Resolve a "dotted" alignment, i.e. an alignment where a "." stands for
// "same state as the reference sequence at this site".
//
// The reference is a sequence containing no "." at all. If several sequences
// qualify, the last one in the alignment is used.
//
// dottedAln         the input alignment; its alphabet must be a default
//                   alphabet (checked with AlphabetTools::isDefaultAlphabet).
// resolvedAlphabet  alphabet of the sites of the returned alignment.
//
// Returns a newly allocated container holding the resolved sites, with the
// sequence names of the input alignment; the caller must delete it.
//
// Throws AlphabetException if the input alphabet is not a default alphabet,
// and Exception if the alignment has no sequence or no reference sequence.
SiteContainer* SiteContainerTools::resolveDottedAlignment(
  const SiteContainer& dottedAln,
  const Alphabet* resolvedAlphabet) throw (AlphabetException, Exception)
{
  if (!AlphabetTools::isDefaultAlphabet(dottedAln.getAlphabet()))
    throw AlphabetException("SiteContainerTools::resolveDottedAlignment. Alignment alphabet should of class 'DefaultAlphabet'.", dottedAln.getAlphabet());

  // First we look for the reference sequence:
  const size_t n = dottedAln.getNumberOfSequences();
  if (n == 0)
    throw Exception("SiteContainerTools::resolveDottedAlignment. Input alignment contains no sequence.");

  // Only the index of the reference is recorded while scanning; the value n
  // means "not found". The sequence is copied once afterwards, so no copy is
  // leaked when several sequences are free of dots.
  size_t refIndex = n;
  for (size_t i = 0; i < n; ++i) // Test each sequence
  {
    const Sequence* seq = &dottedAln.getSequence(i);
    bool isRef = true;
    for (unsigned int j = 0; isRef && j < seq->size(); ++j) // For each site in the sequence
    {
      if (seq->getChar(j) == ".") isRef = false;
    }
    if (isRef) // We found a reference sequence (the last one found is kept)
    {
      refIndex = i;
    }
  }

  if (refIndex == n)
    throw Exception("SiteContainerTools::resolveDottedAlignment. No reference sequence was found in the input alignment.");

  // Local copy of the reference sequence, released automatically on every exit path.
  const BasicSequence refSeq(dottedAln.getSequence(refIndex));

  // Now we build a new VectorSiteContainer:
  VectorSiteContainer* sites = new VectorSiteContainer(n, resolvedAlphabet);
  try
  {
    // We add each site one by one:
    const size_t m = dottedAln.getNumberOfSites();
    for (unsigned int i = 0; i < m; ++i)
    {
      const string resolved = refSeq.getChar(i);
      const Site* site = &dottedAln.getSite(i);
      Site resolvedSite(resolvedAlphabet, site->getPosition());
      for (unsigned int j = 0; j < n; j++)
      {
        string state = site->getChar(j);
        if (state == ".")
        {
          // A dot means "same as the reference at this position".
          state = resolved;
        }
        resolvedSite.addElement(state);
      }
      // Add the new site:
      sites->addSite(resolvedSite);
    }

    // Set sequence names:
    sites->setSequencesNames(dottedAln.getSequencesNames());
  }
  catch (...)
  {
    // Do not leak the partially built container if anything above throws.
    delete sites;
    throw;
  }

  // Return result:
  return sites;
}
int main(int args, char ** argv) { cout << "******************************************************************" << endl; cout << "* Bio++ Distance Methods, version 2.2.0 *" << endl; cout << "* Author: J. Dutheil Created 05/05/07 *" << endl; cout << "* Last Modif. 04/02/15 *" << endl; cout << "******************************************************************" << endl; cout << endl; if(args == 1) { help(); return 0; } try { BppApplication bppdist(args, argv, "BppDist"); bppdist.startTimer(); Alphabet* alphabet = SequenceApplicationTools::getAlphabet(bppdist.getParams(), "", false); auto_ptr<GeneticCode> gCode; CodonAlphabet* codonAlphabet = dynamic_cast<CodonAlphabet*>(alphabet); if (codonAlphabet) { string codeDesc = ApplicationTools::getStringParameter("genetic_code", bppdist.getParams(), "Standard", "", true, true); ApplicationTools::displayResult("Genetic Code", codeDesc); gCode.reset(SequenceApplicationTools::getGeneticCode(codonAlphabet->getNucleicAlphabet(), codeDesc)); } VectorSiteContainer* allSites = SequenceApplicationTools::getSiteContainer(alphabet, bppdist.getParams()); VectorSiteContainer* sites = SequenceApplicationTools::getSitesToAnalyse(* allSites, bppdist.getParams()); delete allSites; ApplicationTools::displayResult("Number of sequences", TextTools::toString(sites->getNumberOfSequences())); ApplicationTools::displayResult("Number of sites", TextTools::toString(sites->getNumberOfSites())); SubstitutionModel* model = PhylogeneticsApplicationTools::getSubstitutionModel(alphabet, gCode.get(), sites, bppdist.getParams()); DiscreteDistribution* rDist = 0; if (model->getNumberOfStates() > model->getAlphabet()->getSize()) { //Markov-modulated Markov model! 
rDist = new ConstantRateDistribution(); } else { rDist = PhylogeneticsApplicationTools::getRateDistribution(bppdist.getParams()); } DistanceEstimation distEstimation(model, rDist, sites, 1, false); string method = ApplicationTools::getStringParameter("method", bppdist.getParams(), "nj"); ApplicationTools::displayResult("Tree reconstruction method", method); TreeTemplate<Node>* tree; AgglomerativeDistanceMethod* distMethod = 0; if(method == "wpgma") { PGMA* wpgma = new PGMA(true); distMethod = wpgma; } else if(method == "upgma") { PGMA* upgma = new PGMA(false); distMethod = upgma; } else if(method == "nj") { NeighborJoining* nj = new NeighborJoining(); nj->outputPositiveLengths(true); distMethod = nj; } else if(method == "bionj") { BioNJ* bionj = new BioNJ(); bionj->outputPositiveLengths(true); distMethod = bionj; } else throw Exception("Unknown tree reconstruction method."); string type = ApplicationTools::getStringParameter("optimization.method", bppdist.getParams(), "init"); ApplicationTools::displayResult("Model parameters estimation method", type); if (type == "init") type = OptimizationTools::DISTANCEMETHOD_INIT; else if (type == "pairwise") type = OptimizationTools::DISTANCEMETHOD_PAIRWISE; else if (type == "iterations") type = OptimizationTools::DISTANCEMETHOD_ITERATIONS; else throw Exception("Unknown parameter estimation procedure '" + type + "'."); unsigned int optVerbose = ApplicationTools::getParameter<unsigned int>("optimization.verbose", bppdist.getParams(), 2); string mhPath = ApplicationTools::getAFilePath("optimization.message_handler", bppdist.getParams(), false, false); OutputStream* messenger = (mhPath == "none") ? 0 : (mhPath == "std") ? ApplicationTools::message : new StlOutputStream(new ofstream(mhPath.c_str(), ios::out)); ApplicationTools::displayResult("Message handler", mhPath); string prPath = ApplicationTools::getAFilePath("optimization.profiler", bppdist.getParams(), false, false); OutputStream* profiler = (prPath == "none") ? 
0 : (prPath == "std") ? ApplicationTools::message : new StlOutputStream(new ofstream(prPath.c_str(), ios::out)); if(profiler) profiler->setPrecision(20); ApplicationTools::displayResult("Profiler", prPath); // Should I ignore some parameters? ParameterList allParameters = model->getParameters(); allParameters.addParameters(rDist->getParameters()); ParameterList parametersToIgnore; string paramListDesc = ApplicationTools::getStringParameter("optimization.ignore_parameter", bppdist.getParams(), "", "", true, false); bool ignoreBrLen = false; StringTokenizer st(paramListDesc, ","); while (st.hasMoreToken()) { try { string param = st.nextToken(); if (param == "BrLen") ignoreBrLen = true; else { if (allParameters.hasParameter(param)) { Parameter* p = &allParameters.getParameter(param); parametersToIgnore.addParameter(*p); } else ApplicationTools::displayWarning("Parameter '" + param + "' not found."); } } catch (ParameterNotFoundException& pnfe) { ApplicationTools::displayError("Parameter '" + pnfe.getParameter() + "' not found, and so can't be ignored!"); } } unsigned int nbEvalMax = ApplicationTools::getParameter<unsigned int>("optimization.max_number_f_eval", bppdist.getParams(), 1000000); ApplicationTools::displayResult("Max # ML evaluations", TextTools::toString(nbEvalMax)); double tolerance = ApplicationTools::getDoubleParameter("optimization.tolerance", bppdist.getParams(), .000001); ApplicationTools::displayResult("Tolerance", TextTools::toString(tolerance)); //Here it is: ofstream warn("warnings", ios::out); ApplicationTools::warning = new StlOutputStreamWrapper(&warn); tree = OptimizationTools::buildDistanceTree(distEstimation, *distMethod, parametersToIgnore, !ignoreBrLen, type, tolerance, nbEvalMax, profiler, messenger, optVerbose); warn.close(); delete ApplicationTools::warning; ApplicationTools::warning = ApplicationTools::message; string matrixPath = ApplicationTools::getAFilePath("output.matrix.file", bppdist.getParams(), false, false, "", false); if 
(matrixPath != "none") { ApplicationTools::displayResult("Output matrix file", matrixPath); string matrixFormat = ApplicationTools::getAFilePath("output.matrix.format", bppdist.getParams(), false, false, "", false); string format = ""; bool extended = false; std::map<std::string, std::string> unparsedArguments_; KeyvalTools::parseProcedure(matrixFormat, format, unparsedArguments_); if (unparsedArguments_.find("type") != unparsedArguments_.end()) { if (unparsedArguments_["type"] == "extended") { extended = true; } else if (unparsedArguments_["type"] == "classic") extended = false; else ApplicationTools::displayWarning("Argument '" + unparsedArguments_["type"] + "' for parameter 'Phylip#type' is unknown. " + "Default used instead: not extended."); } else ApplicationTools::displayWarning("Argument 'Phylip#type' not found. Default used instead: not extended."); ODistanceMatrix* odm = IODistanceMatrixFactory().createWriter(IODistanceMatrixFactory::PHYLIP_FORMAT, extended); odm->write(*distEstimation.getMatrix(), matrixPath, true); delete odm; } PhylogeneticsApplicationTools::writeTree(*tree, bppdist.getParams()); //Output some parameters: if (type == OptimizationTools::DISTANCEMETHOD_ITERATIONS) { // Write parameters to screen: ParameterList parameters = model->getParameters(); for (unsigned int i = 0; i < parameters.size(); i++) { ApplicationTools::displayResult(parameters[i].getName(), TextTools::toString(parameters[i].getValue())); } parameters = rDist->getParameters(); for (unsigned int i = 0; i < parameters.size(); i++) { ApplicationTools::displayResult(parameters[i].getName(), TextTools::toString(parameters[i].getValue())); } // Write parameters to file: string parametersFile = ApplicationTools::getAFilePath("output.estimates", bppdist.getParams(), false, false); if (parametersFile != "none") { ofstream out(parametersFile.c_str(), ios::out); parameters = model->getParameters(); for (unsigned int i = 0; i < parameters.size(); i++) { out << parameters[i].getName() 
<< " = " << parameters[i].getValue() << endl; } parameters = rDist->getParameters(); for (unsigned int i = 0; i < parameters.size(); i++) { out << parameters[i].getName() << " = " << parameters[i].getValue() << endl; } out.close(); } } //Bootstrap: unsigned int nbBS = ApplicationTools::getParameter<unsigned int>("bootstrap.number", bppdist.getParams(), 0); if(nbBS > 0) { ApplicationTools::displayResult("Number of bootstrap samples", TextTools::toString(nbBS)); bool approx = ApplicationTools::getBooleanParameter("bootstrap.approximate", bppdist.getParams(), true); ApplicationTools::displayResult("Use approximate bootstrap", TextTools::toString(approx ? "yes" : "no")); if(approx) { type = OptimizationTools::DISTANCEMETHOD_INIT; parametersToIgnore = allParameters; ignoreBrLen = true; } bool bootstrapVerbose = ApplicationTools::getBooleanParameter("bootstrap.verbose", bppdist.getParams(), false, "", true, false); string bsTreesPath = ApplicationTools::getAFilePath("bootstrap.output.file", bppdist.getParams(), false, false); ofstream *out = NULL; if(bsTreesPath != "none") { ApplicationTools::displayResult("Bootstrap trees stored in file", bsTreesPath); out = new ofstream(bsTreesPath.c_str(), ios::out); } Newick newick; vector<Tree *> bsTrees(nbBS); ApplicationTools::displayTask("Bootstrapping", true); for(unsigned int i = 0; i < nbBS; i++) { ApplicationTools::displayGauge(i, nbBS-1, '='); VectorSiteContainer * sample = SiteContainerTools::bootstrapSites(*sites); if(approx) model->setFreqFromData(*sample); distEstimation.setData(sample); bsTrees[i] = OptimizationTools::buildDistanceTree( distEstimation, *distMethod, parametersToIgnore, ignoreBrLen, type, tolerance, nbEvalMax, NULL, NULL, (bootstrapVerbose ? 
1 : 0) ); if(out && i == 0) newick.write(*bsTrees[i], bsTreesPath, true); if(out && i > 0) newick.write(*bsTrees[i], bsTreesPath, false); delete sample; } if(out) out->close(); if(out) delete out; ApplicationTools::displayTaskDone(); ApplicationTools::displayTask("Compute bootstrap values"); TreeTools::computeBootstrapValues(*tree, bsTrees); ApplicationTools::displayTaskDone(); for(unsigned int i = 0; i < nbBS; i++) delete bsTrees[i]; //Write resulting tree: PhylogeneticsApplicationTools::writeTree(*tree, bppdist.getParams()); } delete alphabet; delete sites; delete distMethod; delete tree; bppdist.done();} catch(exception & e) { cout << e.what() << endl; return 1; } return 0; }