void SiteContainerTools::removeGapOrUnresolvedOnlySites(SiteContainer& sites) { size_t n = sites.getNumberOfSites(); size_t i = n; while (i > 1) { ApplicationTools::displayGauge(n - i + 1, n); const Site* site = &sites.getSite(i - 1); if (SiteTools::isGapOnly(*site)) { size_t end = i; while (SiteTools::isGapOrUnresolvedOnly(*site) && i > 1) { --i; site = &sites.getSite(i - 1); } sites.deleteSites(i, end - i); } else { --i; } } ApplicationTools::displayGauge(n, n); const Site* site = &sites.getSite(0); if (SiteTools::isGapOrUnresolvedOnly(*site)) sites.deleteSite(0); }
void RewardMappingTools::writeToStream( const ProbabilisticRewardMapping& rewards, const SiteContainer& sites, ostream& out) throw (IOException) { if (!out) throw IOException("RewardMappingTools::writeToFile. Can't write to stream."); out << "Branches"; out << "\tMean"; for (size_t i = 0; i < rewards.getNumberOfSites(); i++) { out << "\tSite" << sites.getSite(i).getPosition(); } out << endl; for (size_t j = 0; j < rewards.getNumberOfBranches(); j++) { out << rewards.getNode(j)->getId() << "\t" << rewards.getNode(j)->getDistanceToFather(); for (size_t i = 0; i < rewards.getNumberOfSites(); i++) { out << "\t" << rewards(j, i); } out << endl; } }
/**
 * @brief Delete, in place, the sites whose frequency of state -1 exceeds a threshold.
 *
 * For every site the state frequencies are computed with
 * SiteTools::getFrequencies(); the site is deleted when the frequency stored
 * under key -1 (the state this function treats as the gap) is strictly greater
 * than @p maxFreqGaps.
 *
 * @param sites       The container to edit.
 * @param maxFreqGaps The largest tolerated frequency for state -1.
 */
void SiteContainerTools::removeGapSites(SiteContainer& sites, double maxFreqGaps)
{
  // Walk from the last site to the first, so that a deletion never shifts
  // the indices that are still to be visited.
  size_t remaining = sites.getNumberOfSites();
  while (remaining > 0)
  {
    const size_t pos = remaining - 1;

    map<int, double> frequencies;
    SiteTools::getFrequencies(sites.getSite(pos), frequencies);

    // operator[] yields 0 when the site holds no such state.
    const double gapFrequency = frequencies[-1];
    if (gapFrequency > maxFreqGaps)
      sites.deleteSite(pos);

    --remaining;
  }
}
/**
 * @brief Build a VectorSiteContainer from any SiteContainer.
 *
 * Sequence names are copied from @p sc, and the comment and sequence slots are
 * sized after its number of sequences. Every site of @p sc is then appended
 * with addSite(), and finally each sequence's comments are duplicated.
 *
 * @param sc The container to copy.
 */
VectorSiteContainer::VectorSiteContainer(const SiteContainer& sc) :
  AbstractSequenceContainer(sc),
  sites_(0),
  names_(sc.getSequencesNames()),
  comments_(sc.getNumberOfSequences()),
  sequences_(sc.getNumberOfSequences())
{
  // Append the sites one by one. The second argument is 'false' because the
  // positions of the source sites are assumed to be correct already.
  const size_t nbSites = sc.getNumberOfSites();
  size_t sitePos = 0;
  while (sitePos < nbSites)
  {
    addSite(sc.getSite(sitePos), false);
    ++sitePos;
  }

  // Duplicate the comments attached to each sequence.
  const size_t nbSequences = sc.getNumberOfSequences();
  size_t seqPos = 0;
  while (seqPos < nbSequences)
  {
    comments_[seqPos] = new Comments(sc.getComments(seqPos));
    ++seqPos;
  }
}
/**
 * @brief Select, from an alignment, the sites requested by the user options.
 *
 * The parameter 'input.sequence.sites_to_use' (default "complete") chooses the
 * selection:
 *  - "all": copy every site, then drop the sites exceeding
 *    'input.sequence.max_gap_allowed' and 'input.sequence.max_unresolved_allowed'.
 *    Each threshold is either a percentage ("30%") or an absolute number of
 *    sequences ("3"). When @p gapAsUnknown is set, the remaining gaps are then
 *    changed to unknown characters.
 *  - "complete": the result of SiteContainerTools::getCompleteSites().
 *  - "nogap": the result of SiteContainerTools::getSitesWithoutGaps().
 *
 * For a codon alphabet, 'input.sequence.remove_stop_codons' = "yes" further
 * removes the sites containing a stop codon of the code named by 'genetic_code'.
 *
 * @param allSites         The alignment to filter (left untouched).
 * @param params           The option map.
 * @param suffix           Suffix appended to the option names.
 * @param suffixIsOptional Whether the suffix may be omitted by the user.
 * @param gapAsUnknown     Convert gaps to unknown characters (option "all" only).
 * @param verbose          Print the options and progress.
 * @param warn             Warning level forwarded to the option parser.
 * @return A newly allocated container owned by the caller. The result is
 *         obtained through a dynamic_cast to VectorSiteContainer and is
 *         therefore null if the selected container has another dynamic type.
 * @throw Exception If the selection option is unknown or a threshold is empty.
 */
VectorSiteContainer* SequenceApplicationTools::getSitesToAnalyse(
  const SiteContainer& allSites,
  map<string, string>& params,
  string suffix,
  bool suffixIsOptional,
  bool gapAsUnknown,
  bool verbose,
  int warn)
{
  // Owned through a smart pointer so that it is released if any step below throws.
  unique_ptr<SiteContainer> sitesToAnalyse;

  string option = ApplicationTools::getStringParameter("input.sequence.sites_to_use", params, "complete", suffix, suffixIsOptional, warn);
  if (verbose)
    ApplicationTools::displayResult("Sites to use", option);

  if (option == "all")
  {
    sitesToAnalyse.reset(new VectorSiteContainer(allSites));

    // --- Gap threshold ---
    string maxGapOption = ApplicationTools::getStringParameter("input.sequence.max_gap_allowed", params, "100%", suffix, suffixIsOptional, warn);
    // The last character is inspected below: reject an empty value first.
    if (maxGapOption.empty())
      throw Exception("Parameter 'input.sequence.max_gap_allowed' must not be empty.");

    if (maxGapOption[maxGapOption.size() - 1] == '%')
    {
      double gapFreq = TextTools::toDouble(maxGapOption.substr(0, maxGapOption.size() - 1)) / 100.;
      if (gapFreq < 1)
      {
        if (verbose)
          ApplicationTools::displayTask("Remove sites with gaps", true);
        // Backward scan: deleting a site leaves the indices still to visit untouched.
        for (size_t i = sitesToAnalyse->getNumberOfSites(); i > 0; --i)
        {
          if (verbose)
            ApplicationTools::displayGauge(sitesToAnalyse->getNumberOfSites() - i, sitesToAnalyse->getNumberOfSites() - 1, '=');
          map<int, double> freq;
          SiteTools::getFrequencies(sitesToAnalyse->getSite(i - 1), freq);
          if (freq[-1] > gapFreq)
            sitesToAnalyse->deleteSite(i - 1);
        }
        if (verbose)
          ApplicationTools::displayTaskDone();
      }
    }
    else
    {
      size_t gapNum = TextTools::to<size_t>(maxGapOption);
      if (gapNum < sitesToAnalyse->getNumberOfSequences())
      {
        if (verbose)
          ApplicationTools::displayTask("Remove sites with gaps", true);
        for (size_t i = sitesToAnalyse->getNumberOfSites(); i > 0; --i)
        {
          if (verbose)
            ApplicationTools::displayGauge(sitesToAnalyse->getNumberOfSites() - i, sitesToAnalyse->getNumberOfSites() - 1, '=');
          map<int, size_t> counts;
          SiteTools::getCounts(sitesToAnalyse->getSite(i - 1), counts);
          if (counts[-1] > gapNum)
            sitesToAnalyse->deleteSite(i - 1);
        }
        if (verbose)
          ApplicationTools::displayTaskDone();
      }
    }

    // --- Unresolved threshold ---
    string maxUnresolvedOption = ApplicationTools::getStringParameter("input.sequence.max_unresolved_allowed", params, "100%", suffix, suffixIsOptional, warn);
    if (maxUnresolvedOption.empty())
      throw Exception("Parameter 'input.sequence.max_unresolved_allowed' must not be empty.");

    int sAlph = static_cast<int>(sitesToAnalyse->getAlphabet()->getSize());

    if (maxUnresolvedOption[maxUnresolvedOption.size() - 1] == '%')
    {
      double unresolvedFreq = TextTools::toDouble(maxUnresolvedOption.substr(0, maxUnresolvedOption.size() - 1)) / 100.;
      if (unresolvedFreq < 1)
      {
        if (verbose)
          ApplicationTools::displayTask("Remove unresolved sites", true);
        for (size_t i = sitesToAnalyse->getNumberOfSites(); i > 0; --i)
        {
          if (verbose)
            ApplicationTools::displayGauge(sitesToAnalyse->getNumberOfSites() - i, sitesToAnalyse->getNumberOfSites() - 1, '=');
          map<int, double> freq;
          SiteTools::getFrequencies(sitesToAnalyse->getSite(i - 1), freq);
          // x accumulates the frequencies of states 0 .. sAlph - 1; whatever
          // is left (1 - x) is compared with the threshold.
          double x = 0;
          for (int l = 0; l < sAlph; ++l)
          {
            x += freq[l];
          }
          if (1 - x > unresolvedFreq)
            sitesToAnalyse->deleteSite(i - 1);
        }
        if (verbose)
          ApplicationTools::displayTaskDone();
      }
    }
    else
    {
      size_t nbSeq = sitesToAnalyse->getNumberOfSequences();
      size_t unresolvedNum = TextTools::to<size_t>(maxUnresolvedOption);
      if (unresolvedNum < nbSeq)
      {
        // Same task as the percentage branch above, hence the same label.
        if (verbose)
          ApplicationTools::displayTask("Remove unresolved sites", true);
        for (size_t i = sitesToAnalyse->getNumberOfSites(); i > 0; --i)
        {
          if (verbose)
            ApplicationTools::displayGauge(sitesToAnalyse->getNumberOfSites() - i, sitesToAnalyse->getNumberOfSites() - 1, '=');
          map<int, size_t> counts;
          SiteTools::getCounts(sitesToAnalyse->getSite(i - 1), counts);
          size_t x = 0;
          for (int l = 0; l < sAlph; ++l)
          {
            x += counts[l];
          }
          if (nbSeq - x > unresolvedNum)
            sitesToAnalyse->deleteSite(i - 1);
        }
        if (verbose)
          ApplicationTools::displayTaskDone();
      }
    }

    if (gapAsUnknown)
    {
      SiteContainerTools::changeGapsToUnknownCharacters(*sitesToAnalyse);
    }
  }
  else if (option == "complete")
  {
    sitesToAnalyse.reset(SiteContainerTools::getCompleteSites(allSites));
    size_t nbSites = sitesToAnalyse->getNumberOfSites();
    if (verbose)
      ApplicationTools::displayResult("Complete sites", TextTools::toString(nbSites));
  }
  else if (option == "nogap")
  {
    sitesToAnalyse.reset(SiteContainerTools::getSitesWithoutGaps(allSites));
    size_t nbSites = sitesToAnalyse->getNumberOfSites();
    if (verbose)
      ApplicationTools::displayResult("Sites without gap", TextTools::toString(nbSites));
  }
  else
  {
    throw Exception("Option '" + option + "' unknown in parameter 'sequence.sites_to_use'.");
  }

  // --- Optional removal of stop codons (codon alphabets only) ---
  const CodonAlphabet* ca = dynamic_cast<const CodonAlphabet*>(sitesToAnalyse->getAlphabet());
  if (ca)
  {
    option = ApplicationTools::getStringParameter("input.sequence.remove_stop_codons", params, "no", suffix, true, warn);
    if ((option != "") && verbose)
      ApplicationTools::displayResult("Remove Stop Codons", option);

    if (option == "yes")
    {
      string codeDesc = ApplicationTools::getStringParameter("genetic_code", params, "Standard", "", true, warn);
      unique_ptr<GeneticCode> gCode(getGeneticCode(ca->getNucleicAlphabet(), codeDesc));
      // The unfiltered container is released by 'sitesToAnalyse' on return.
      return dynamic_cast<VectorSiteContainer*>(SiteContainerTools::removeStopCodonSites(*sitesToAnalyse, *gCode));
    }
  }

  // Hand ownership over to the caller only when the cast succeeds; otherwise
  // the container is freed here and a null pointer is returned.
  VectorSiteContainer* result = dynamic_cast<VectorSiteContainer*>(sitesToAnalyse.get());
  if (result)
    static_cast<void>(sitesToAnalyse.release());
  return result;
}
int main(int args, char** argv) { cout << "******************************************************************" << endl; cout << "* Bio++ Sequence Manipulator, version 2.3.0. *" << endl; cout << "* Author: J. Dutheil Last Modif. 25/11/14 *" << endl; cout << "******************************************************************" << endl; cout << endl; if (args == 1) { help(); return 0; } try { BppApplication bppseqman(args, argv, "BppSeqMan"); bppseqman.startTimer(); // Get alphabet Alphabet* alphabet = SequenceApplicationTools::getAlphabet(bppseqman.getParams(), "", false, true, true); unique_ptr<GeneticCode> gCode; CodonAlphabet* codonAlphabet = dynamic_cast<CodonAlphabet*>(alphabet); // Get sequences: bool aligned = ApplicationTools::getBooleanParameter("input.alignment", bppseqman.getParams(), false, "", true, 1); OrderedSequenceContainer* sequences = 0; if (aligned) { VectorSiteContainer* allSites = SequenceApplicationTools::getSiteContainer(alphabet, bppseqman.getParams()); sequences = SequenceApplicationTools::getSitesToAnalyse(*allSites, bppseqman.getParams(), "", true, false); delete allSites; } else { SequenceContainer* tmp = SequenceApplicationTools::getSequenceContainer(alphabet, bppseqman.getParams(), "", true, true); sequences = new VectorSequenceContainer(*tmp); delete tmp; } ApplicationTools::displayResult("Number of sequences", sequences->getNumberOfSequences()); // Perform manipulations vector<string> actions = ApplicationTools::getVectorParameter<string>("sequence.manip", bppseqman.getParams(), ',', "", "", false, 1); for (size_t a = 0; a < actions.size(); a++) { string cmdName; map<string, string> cmdArgs; KeyvalTools::parseProcedure(actions[a], cmdName, cmdArgs); ApplicationTools::displayResult("Performing action", cmdName); // +-----------------+ // | Complementation | // +-----------------+ if (cmdName == "Complement") { OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet()); else sc = new 
VectorSequenceContainer(sequences->getAlphabet()); for (size_t i = 0; i < sequences->getNumberOfSequences(); i++) { Sequence* seq = SequenceTools::getComplement(sequences->getSequence(i)); sc->addSequence(*seq, false); delete seq; } delete sequences; sequences = sc; } // +------------------------+ // | (Reverse)Transcription | // +------------------------+ else if (cmdName == "Transcript") { if (sequences->getAlphabet()->getAlphabetType() == AlphabetTools::DNA_ALPHABET.getAlphabetType()) { OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(&AlphabetTools::RNA_ALPHABET); else sc = new VectorSequenceContainer(&AlphabetTools::RNA_ALPHABET); for (unsigned int i = 0; i < sequences->getNumberOfSequences(); i++) { Sequence* seq = SequenceTools::transcript(sequences->getSequence(i)); sc->addSequence(*seq, false); delete seq; } delete sequences; sequences = sc; } else if (sequences->getAlphabet()->getAlphabetType() == AlphabetTools::RNA_ALPHABET.getAlphabetType()) { OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(&AlphabetTools::DNA_ALPHABET); else sc = new VectorSequenceContainer(&AlphabetTools::DNA_ALPHABET); for (unsigned int i = 0; i < sequences->getNumberOfSequences(); i++) { Sequence* seq = SequenceTools::reverseTranscript(sequences->getSequence(i)); sc->addSequence(*seq, false); delete seq; } delete sequences; sequences = sc; } else throw Exception("Transcription error: input alphabet must be of type 'nucleic'."); } // +-------------------------------+ // | Switching nucleotide alphabet | // +-------------------------------+ else if (cmdName == "Switch") { const Alphabet* alpha = 0; if (sequences->getAlphabet()->getAlphabetType() == AlphabetTools::DNA_ALPHABET.getAlphabetType()) { alpha = &AlphabetTools::RNA_ALPHABET; } else if (sequences->getAlphabet()->getAlphabetType() == AlphabetTools::RNA_ALPHABET.getAlphabetType()) { alpha = &AlphabetTools::DNA_ALPHABET; } else throw Exception("Cannot switch alphabet type, 
alphabet is not of type 'nucleic'."); OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(alpha); else sc = new VectorSequenceContainer(alpha); for (size_t i = 0; i < sequences->getNumberOfSequences(); i++) { const Sequence* old = &sequences->getSequence(i); vector<int> content(old->size()); for (size_t j = 0; j < old->size(); ++j) content[j] = (*old)[j]; Sequence* seq = new BasicSequence(old->getName(), content, old->getComments(), alpha); sc->addSequence(*seq, false); delete seq; } delete sequences; sequences = sc; } // +-------------+ // | Translation | // +-------------+ else if (cmdName == "Translate") { if (!AlphabetTools::isCodonAlphabet(sequences->getAlphabet())) throw Exception("Error in translation: alphabet is not of type 'codon'."); if (cmdArgs["code"] != "") throw Exception("ERROR: 'code' argument is deprecated. The genetic code to use for translation is now set by the top-level argument 'genetic_code'."); if (!gCode.get()) { string codeDesc = ApplicationTools::getStringParameter("genetic_code", bppseqman.getParams(), "Standard", "", true, 1); ApplicationTools::displayResult("Genetic Code", codeDesc); gCode.reset(SequenceApplicationTools::getGeneticCode(codonAlphabet->getNucleicAlphabet(), codeDesc)); } OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(&AlphabetTools::PROTEIN_ALPHABET); else sc = new VectorSequenceContainer(&AlphabetTools::PROTEIN_ALPHABET); for (size_t i = 0; i < sequences->getNumberOfSequences(); ++i) { Sequence* seq = gCode->translate(sequences->getSequence(i)); sc->addSequence(*seq, false); delete seq; } delete sequences; sequences = sc; } // +-------------+ // | Remove gaps | // +-------------+ else if (cmdName == "RemoveGaps") { VectorSequenceContainer* sc = new VectorSequenceContainer(sequences->getAlphabet()); for (size_t i = 0; i < sequences->getNumberOfSequences(); i++) { unique_ptr<Sequence> seq(sequences->getSequence(i).clone()); SequenceTools::removeGaps(*seq); 
sc->addSequence(*seq); } delete sequences; sequences = sc; aligned = false; } // +---------------------------+ // | Change gaps to unresolved | // +---------------------------+ else if (cmdName == "GapToUnknown") { OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet()); else sc = new VectorSequenceContainer(sequences->getAlphabet()); for (size_t i = 0; i < sequences->getNumberOfSequences(); i++) { Sequence* seq = new BasicSequence(sequences->getSequence(i)); SymbolListTools::changeGapsToUnknownCharacters(*seq); sc->addSequence(*seq, false); delete seq; } delete sequences; sequences = sc; } // +---------------------------+ // | Change unresolved to gaps | // +---------------------------+ else if (cmdName == "UnknownToGap") { OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet()); else sc = new VectorSequenceContainer(sequences->getAlphabet()); for (size_t i = 0; i < sequences->getNumberOfSequences(); i++) { Sequence* seq = new BasicSequence(sequences->getSequence(i)); SymbolListTools::changeUnresolvedCharactersToGaps(*seq); sc->addSequence(*seq, false); delete seq; } delete sequences; sequences = sc; } // +--------------+ // | Remove stops | // +--------------+ else if (cmdName == "RemoveStops") { if (!gCode.get()) { string codeDesc = ApplicationTools::getStringParameter("genetic_code", bppseqman.getParams(), "Standard", "", true, 1); ApplicationTools::displayResult("Genetic Code", codeDesc); gCode.reset(SequenceApplicationTools::getGeneticCode(codonAlphabet->getNucleicAlphabet(), codeDesc)); } SiteContainer* sites = dynamic_cast<SiteContainer*>(sequences); if (!sites) { VectorSequenceContainer* sc = new VectorSequenceContainer(sequences->getAlphabet()); for (size_t i = 0; i < sequences->getNumberOfSequences(); ++i) { unique_ptr<Sequence> seq(sequences->getSequence(i).clone()); SequenceTools::removeStops(*seq, *gCode); sc->addSequence(*seq); } delete sequences; sequences = 
sc; } else { VectorSiteContainer* sc = new VectorSiteContainer(sequences->getAlphabet()); for (size_t i = 0; i < sequences->getNumberOfSequences(); ++i) { unique_ptr<Sequence> seq(sequences->getSequence(i).clone()); SequenceTools::replaceStopsWithGaps(*seq, *gCode); sc->addSequence(*seq); } delete sequences; sequences = sc; } } // +--------------+ // | Remove stops | // +--------------+ else if (cmdName == "RemoveColumnsWithStops") { SiteContainer* sites = dynamic_cast<SiteContainer*>(sequences); if (!sites) { throw Exception("'RemoveColumnsWithStops' can only be used on alignment. You may consider using the 'CoerceToAlignment' command."); } if (!gCode.get()) { string codeDesc = ApplicationTools::getStringParameter("genetic_code", bppseqman.getParams(), "Standard", "", true, 1); ApplicationTools::displayResult("Genetic Code", codeDesc); gCode.reset(SequenceApplicationTools::getGeneticCode(codonAlphabet->getNucleicAlphabet(), codeDesc)); } for (size_t i = sites->getNumberOfSites(); i > 0; i--) { if (CodonSiteTools::hasStop(sites->getSite(i-1), *gCode)) sites->deleteSite(i - 1); } } // +---------+ // | Get CDS | // +---------+ else if (cmdName == "GetCDS") { if (!gCode.get()) { string codeDesc = ApplicationTools::getStringParameter("genetic_code", bppseqman.getParams(), "Standard", "", true, 1); ApplicationTools::displayResult("Genetic Code", codeDesc); gCode.reset(SequenceApplicationTools::getGeneticCode(codonAlphabet->getNucleicAlphabet(), codeDesc)); } OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet()); else sc = new VectorSequenceContainer(sequences->getAlphabet()); for (size_t i = 0; i < sequences->getNumberOfSequences(); ++i) { BasicSequence seq = sequences->getSequence(i); size_t len = seq.size(); SequenceTools::getCDS(seq, *gCode, false, true, true, false); if (aligned) { for (size_t c = seq.size(); c < len; ++c) seq.addElement(seq.getAlphabet()->getGapCharacterCode()); } sc->addSequence(seq, false); } delete 
sequences; sequences = sc; } // +--------------------------+ // | Resolve dotted alignment | // +--------------------------+ else if (actions[a] == "CoerceToAlignment") { SiteContainer* sites = dynamic_cast<SiteContainer*>(sequences); if(! sites) { sites = new VectorSiteContainer(*sequences); delete sequences; sequences = sites; } aligned = true; } else if (actions[a] == "ResolvedDotted") { SiteContainer* sites = dynamic_cast<SiteContainer *>(sequences); if (!sites) { throw Exception("'ResolvedDotted' can only be used on alignment. You may consider using the 'CoerceToAlignment' command."); } const Alphabet* alpha = 0; string alphastr = ApplicationTools::getStringParameter("alphabet", cmdArgs, "DNA", "", false, 1); if (alphastr == "DNA") alpha = &AlphabetTools::DNA_ALPHABET; else if (alphastr == "RNA") alpha = &AlphabetTools::RNA_ALPHABET; else if (alphastr == "Protein") alpha = &AlphabetTools::PROTEIN_ALPHABET; else throw Exception("Resolved alphabet must be one of [DNA|RNA|Protein] for solving dotted alignment."); OrderedSequenceContainer* resolvedCont = SiteContainerTools::resolveDottedAlignment(*sites, alpha); delete sequences; sequences = resolvedCont; } // +---------------------+ // | Keep complete sites | // +---------------------+ else if (cmdName == "KeepComplete") { SiteContainer* sites = dynamic_cast<SiteContainer *>(sequences); if (!sites) { throw Exception("'KeepComplete' can only be used on alignment. 
You may consider using the 'CoerceToAlignment' command."); } string maxGapOption = ApplicationTools::getStringParameter("maxGapAllowed", cmdArgs, "100%", "", false, 1); if (maxGapOption[maxGapOption.size()-1] == '%') { double gapFreq = TextTools::toDouble(maxGapOption.substr(0, maxGapOption.size()-1)) / 100.; for (size_t i = sites->getNumberOfSites(); i > 0; i--) { map<int, double> freqs; SiteTools::getFrequencies(sites->getSite(i - 1), freqs); if (freqs[-1] > gapFreq) sites->deleteSite(i - 1); } } else { size_t gapNum = TextTools::to<size_t>(maxGapOption); for (size_t i = sites->getNumberOfSites(); i > 0; i--) { map<int, size_t> counts; SiteTools::getCounts(sites->getSite(i - 1), counts); counts[-1]; //Needed in case this entry does not exist in the map. This will set it to 0. if (counts[-1] > gapNum) sites->deleteSite(i-1); } } } // +-----------------+ // | Invert sequence | // +-----------------+ else if (cmdName == "Invert") { OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet()); else sc = new VectorSequenceContainer(sequences->getAlphabet()); for (size_t i = 0; i < sequences->getNumberOfSequences(); i++) { const Sequence* old = &sequences->getSequence(i); Sequence* seq = SequenceTools::getInvert(*old); sc->addSequence(*seq, false); delete seq; } delete sequences; sequences = sc; } // +------------------+ // | GetCodonPosition | // +------------------+ else if (cmdName == "GetCodonPosition") { unsigned int pos = ApplicationTools::getParameter<unsigned int>("position", cmdArgs, 3, "", false, 1); OrderedSequenceContainer* sc = dynamic_cast<OrderedSequenceContainer*>(SequenceContainerTools::getCodonPosition(*sequences, pos - 1)); delete sequences; if (aligned) { sequences = new VectorSiteContainer(*sc); delete sc; } else { sequences = sc; } } // +-----------------+ // | FilterFromTree | // +-----------------+ else if (cmdName == "FilterFromTree") { unique_ptr<Tree> 
tree(PhylogeneticsApplicationTools::getTree(cmdArgs, "")); vector<string> names = tree->getLeavesNames(); OrderedSequenceContainer* reorderedSequences = 0; if (aligned) { reorderedSequences = new VectorSiteContainer(sequences->getAlphabet()); } else { reorderedSequences = new VectorSequenceContainer(sequences->getAlphabet()); } for (size_t i = 0; i < names.size(); ++i) { reorderedSequences->addSequence(sequences->getSequence(names[i]), false); } delete sequences; sequences = reorderedSequences; } // +----------------------+ // | RemoveEmptySequences | // +----------------------+ else if (cmdName == "RemoveEmptySequences") { OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet()); else sc = new VectorSequenceContainer(sequences->getAlphabet()); for (size_t i = 0; i < sequences->getNumberOfSequences(); ++i) { if (SequenceTools::getNumberOfSites(sequences->getSequence(i))!=0) sc->addSequence(sequences->getSequence(i), false); } delete sequences; sequences = sc; } else throw Exception("Unknown action: " + cmdName); } // Write sequences ApplicationTools::displayBooleanResult("Final sequences are aligned", aligned); if (aligned) { SequenceApplicationTools::writeAlignmentFile(*dynamic_cast<SiteContainer*>(sequences), bppseqman.getParams(), "", true, 1); } else { SequenceApplicationTools::writeSequenceFile(*sequences, bppseqman.getParams(), "", true, 1); } delete alphabet; delete sequences; bppseqman.done(); } catch(exception & e) { cout << e.what() << endl; return 1; } return 0; }