void RecursiveLikelihoodTree::initLikelihoods(const SiteContainer& sites, const SubstitutionProcess& process) throw (Exception) { if (sites.getNumberOfSequences() == 1) throw Exception("RecursiveLikelihoodTree::initLikelihoods. Only 1 sequence in data set."); if (sites.getNumberOfSequences() == 0) throw Exception("RecursiveLikelihoodTree::initLikelihoods. No sequence in data set."); if (!process.isCompatibleWith(sites)) throw Exception("RecursiveLikelihoodTree::initLikelihoods. Data and model are not compatible."); alphabet_ = sites.getAlphabet(); nbStates_ = process.getNumberOfStates(); nbSites_ = sites.getNumberOfSites(); unique_ptr<SitePatterns> patterns; if (usePatterns_) { patterns.reset(initLikelihoodsWithPatterns_(process.getTree().getRootNode(), sites, process)); shrunkData_.reset(patterns->getSites()); rootWeights_ = patterns->getWeights(); rootPatternLinks_ = patterns->getIndices(); nbDistinctSites_ = shrunkData_->getNumberOfSites(); setPatterns(patternLinks_); } else { patterns.reset(new SitePatterns(&sites)); shrunkData_.reset(patterns->getSites()); rootWeights_ = patterns->getWeights(); rootPatternLinks_ = patterns->getIndices(); nbDistinctSites_ = shrunkData_->getNumberOfSites(); initLikelihoodsWithoutPatterns_(process.getTree().getRootNode(), *shrunkData_, process); } }
/**
 * Delete every site whose frequency of state -1 exceeds a threshold.
 *
 * Sites are visited from the last to the first so that deleting a site
 * never shifts the index of a site that still has to be examined.
 *
 * @param sites       The container to edit in place.
 * @param maxFreqGaps Maximum tolerated frequency for state -1.
 */
void SiteContainerTools::removeGapSites(SiteContainer& sites, double maxFreqGaps)
{
  size_t remaining = sites.getNumberOfSites();
  while (remaining > 0)
  {
    --remaining;
    const size_t pos = remaining;

    map<int, double> stateFreqs;
    SiteTools::getFrequencies(sites.getSite(pos), stateFreqs);

    if (stateFreqs[-1] > maxFreqGaps)
    {
      sites.deleteSite(pos);
    }
  }
}
void SiteContainerTools::changeUnresolvedCharactersToGaps(SiteContainer& sites) { // NB: use iterators for a better algorithm? int gapCode = sites.getAlphabet()->getGapCharacterCode(); for (unsigned int i = 0; i < sites.getNumberOfSites(); i++) { for (unsigned int j = 0; j < sites.getNumberOfSequences(); j++) { int* element = &sites(j, i); if (sites.getAlphabet()->isUnresolved(*element)) *element = gapCode; } } }
/**
 * Replace, in place, every state the alphabet reports as a gap by the
 * alphabet's unknown-character code.
 *
 * @param sites The container to edit.
 */
void SiteContainerTools::changeGapsToUnknownCharacters(SiteContainer& sites)
{
  // NB: use iterators for a better algorithm?
  const int unknownState = sites.getAlphabet()->getUnknownCharacterCode();

  unsigned int column = 0;
  while (column < sites.getNumberOfSites())
  {
    unsigned int row = 0;
    while (row < sites.getNumberOfSequences())
    {
      int& state = sites(row, column);
      if (sites.getAlphabet()->isGap(state))
      {
        state = unknownState;
      }
      row++;
    }
    column++;
  }
}
void RewardMappingTools::writeToStream( const ProbabilisticRewardMapping& rewards, const SiteContainer& sites, ostream& out) throw (IOException) { if (!out) throw IOException("RewardMappingTools::writeToFile. Can't write to stream."); out << "Branches"; out << "\tMean"; for (size_t i = 0; i < rewards.getNumberOfSites(); i++) { out << "\tSite" << sites.getSite(i).getPosition(); } out << endl; for (size_t j = 0; j < rewards.getNumberOfBranches(); j++) { out << rewards.getNode(j)->getId() << "\t" << rewards.getNode(j)->getDistanceToFather(); for (size_t i = 0; i < rewards.getNumberOfSites(); i++) { out << "\t" << rewards(j, i); } out << endl; } }
/*
 * Inheriting from SubstitutionProcess
 */

/**
 * Tell whether a data set can be analysed with this process.
 *
 * When no model is registered, any data set is accepted. Otherwise the
 * alphabet type of the data is compared with the alphabet type of the model
 * stored under the first key of modelToNodes_.
 *
 * @param data The data set to check.
 * @return true if the data set is compatible.
 */
bool SubstitutionProcessCollectionMember::isCompatibleWith(const SiteContainer& data) const
{
  if (modelToNodes_.empty())
  {
    return true;
  }

  return data.getAlphabet()->getAlphabetType() ==
         pSubProColl_->getModel(modelToNodes_.begin()->first).getAlphabet()->getAlphabetType();
}
/**
 * Simulate one site per rate and gather the result in a new container.
 *
 * Site i is simulated with rates[i] and is given position i. The sequence
 * names of the simulator are applied to the resulting container.
 *
 * The function is exception-safe: if the simulation, the construction of the
 * container or the renaming throws, every object allocated so far is released
 * before the exception is propagated.
 *
 * @param simulator The simulator to use.
 * @param rates     One rate per site to simulate.
 * @return A newly allocated container; the caller becomes its owner.
 */
SiteContainer* SequenceSimulationTools::simulateSites(const SiteSimulator& simulator, const vector<double>& rates)
{
  const size_t numberOfSites = rates.size();

  // Elements are value-initialised to null, so the clean-up loops below are
  // safe even when the simulation stops half-way.
  vector<const Site*> vs(numberOfSites);
  SiteContainer* sites = 0;

  try
  {
    for (size_t i = 0; i < numberOfSites; i++)
    {
      Site* s = simulator.simulateSite(rates[i]);
      vs[i] = s; // Register first so that it is freed if setPosition throws.
      s->setPosition(static_cast<int>(i));
    }
    sites = new VectorSiteContainer(vs, simulator.getAlphabet());
    sites->setSequencesNames(simulator.getSequencesNames(), false);
  }
  catch (...)
  {
    delete sites;
    for (size_t i = 0; i < numberOfSites; i++)
    {
      delete vs[i];
    }
    throw;
  }

  // Freeing memory: the original code deletes the sites once the container is built.
  for (size_t i = 0; i < numberOfSites; i++)
  {
    delete vs[i];
  }

  return sites;
}
/**
 * Read sequences from a DCSE-formatted stream and append them to a container.
 *
 * The first line is read and ignored. Each following line is split at its
 * first " ": the left part is the sequence, from which white spaces and the
 * characters { } [ ] ( ) ^ are removed; the right part is the name. Reading
 * stops at the first empty line or the first line without a " ". Lines whose
 * name contains "Helix numbering" or "mask" are not added.
 *
 * @param input The stream to read from.
 * @param sc    The container receiving the sequences.
 * @throw Exception If the stream is not readable, or if adding a sequence fails.
 */
void DCSE::appendAlignmentFromStream(istream& input, SiteContainer& sc) const throw (Exception)
{
  // Checking the existence of specified file
  if (!input)
  {
    throw IOException ("DCSE::read : fail to open file");
  }

  // Initialization
  const Alphabet* alpha = sc.getAlphabet();
  string line, name, sequence = "";

  // First line ignored for now! (it is expected to carry the site range)
  line = FileTools::getNextLine(input);

  // Structure annotation symbols stripped from the sequence part, in this order.
  static const char annotationSymbols[] = {'{', '}', '[', ']', '(', ')', '^'};
  const size_t nbSymbols = sizeof(annotationSymbols) / sizeof(annotationSymbols[0]);

  // Main loop : for all file lines
  while (!input.eof())
  {
    line = FileTools::getNextLine(input);
    if (line == "")
      break;

    const string::size_type endOfSeq = line.find(" ");
    if (endOfSeq == line.npos)
      break;

    sequence = line.substr(0, endOfSeq);
    sequence = TextTools::removeWhiteSpaces(sequence);
    for (size_t k = 0; k < nbSymbols; ++k)
    {
      sequence = TextTools::removeChar(sequence, annotationSymbols[k]);
    }

    name = line.substr(endOfSeq + 1);
    name = TextTools::removeFirstWhiteSpaces(name);

    const bool isAnnotationLine =
      name.find("Helix numbering") != name.npos || name.find("mask") != name.npos;
    if (!isAnnotationLine)
    {
      sc.addSequence(BasicSequence(name, sequence, alpha), true);
    }
  }
}
void SiteContainerTools::removeGapOrUnresolvedOnlySites(SiteContainer& sites) { size_t n = sites.getNumberOfSites(); size_t i = n; while (i > 1) { ApplicationTools::displayGauge(n - i + 1, n); const Site* site = &sites.getSite(i - 1); if (SiteTools::isGapOnly(*site)) { size_t end = i; while (SiteTools::isGapOrUnresolvedOnly(*site) && i > 1) { --i; site = &sites.getSite(i - 1); } sites.deleteSites(i, end - i); } else { --i; } } ApplicationTools::displayGauge(n, n); const Site* site = &sites.getSite(0); if (SiteTools::isGapOrUnresolvedOnly(*site)) sites.deleteSite(0); }
/**
 * Append the sites of a container to another one.
 *
 * Both containers must have the same alphabet type. If the sequence names of
 * the two containers differ (in content or order), the sequences of seqCont2
 * are first selected and reordered according to the names of seqCont1.
 *
 * The reordered copy used to be allocated with 'new' and was leaked whenever
 * the selection or an addSite() call threw. It is now an automatic object, so
 * it is released on every exit path.
 *
 * @param seqCont1          The container receiving the sites.
 * @param seqCont2          The container providing the sites.
 * @param leavePositionAsIs If true, sites are added with addSite(site, false).
 *                          Otherwise each site is added with the position
 *                          (number of sites of seqCont1 before merging) +
 *                          (position of the site in seqCont2).
 * @throw AlphabetMismatchException If the alphabet types differ.
 * @throw Exception If selecting sequences or adding a site fails.
 */
void SiteContainerTools::merge(SiteContainer& seqCont1, const SiteContainer& seqCont2, bool leavePositionAsIs)
throw (AlphabetMismatchException, Exception)
{
  if (seqCont1.getAlphabet()->getAlphabetType() != seqCont2.getAlphabet()->getAlphabetType())
    throw AlphabetMismatchException("SiteContainerTools::merge.", seqCont1.getAlphabet(), seqCont2.getAlphabet());

  vector<string> seqNames1 = seqCont1.getSequencesNames();
  vector<string> seqNames2 = seqCont2.getSequencesNames();

  // Only filled when sequences have to be reordered; destroyed automatically.
  VectorSiteContainer reordered(seqCont2.getAlphabet());
  const SiteContainer* seqCont2bis = &seqCont2;
  if (seqNames1 != seqNames2)
  {
    // We shall reorder sequences first:
    SequenceContainerTools::getSelectedSequences(seqCont2, seqNames1, reordered);
    seqCont2bis = &reordered;
  }

  if (leavePositionAsIs)
  {
    for (size_t i = 0; i < seqCont2bis->getNumberOfSites(); i++)
    {
      seqCont1.addSite(seqCont2bis->getSite(i), false);
    }
  }
  else
  {
    // Computed once, before any site is added.
    int offset = static_cast<int>(seqCont1.getNumberOfSites());
    for (size_t i = 0; i < seqCont2bis->getNumberOfSites(); i++)
    {
      seqCont1.addSite(seqCont2bis->getSite(i), offset + seqCont2bis->getSite(i).getPosition(), false);
    }
  }
}
/**
 * Build a parsimony score from a tree, a data set and a caller-supplied
 * state map.
 *
 * The tree is copied into a new TreeTemplate<Node>. The alphabet is taken
 * from the data, and the number of states from the state map. data_ starts
 * null; init_() is then called with the data.
 *
 * @param tree      The tree to copy.
 * @param data      The data set, forwarded to init_().
 * @param statesMap The state map to use. It is dereferenced in the
 *                  initialiser list, so it must not be null.
 * @param verbose   Forwarded to init_().
 * @throw Exception Declared by the exception specification; presumably
 *                  raised by init_() -- confirm against its definition.
 */
AbstractTreeParsimonyScore::AbstractTreeParsimonyScore(
  const Tree& tree,
  const SiteContainer& data,
  const StateMap* statesMap,
  bool verbose)
throw (Exception) :
  tree_(new TreeTemplate<Node>(tree)),
  data_(0),
  alphabet_(data.getAlphabet()),
  statesMap_(statesMap),
  nbStates_(statesMap->getNumberOfModelStates())
{
  init_(data, verbose);
}
/**
 * Simulate one site per (state, rate) pair and gather the result in a new
 * container.
 *
 * Site i is simulated with states[i] and rates[i] and is given position i.
 * The sequence names of the simulator are applied to the resulting container.
 *
 * The function is exception-safe: if the simulation, the construction of the
 * container or the renaming throws, every object allocated so far is released
 * before the exception is propagated.
 *
 * @param simulator The simulator to use.
 * @param rates     One rate per site to simulate.
 * @param states    One state per site to simulate, passed to simulateSite().
 * @return A newly allocated container; the caller becomes its owner.
 * @throw Exception If 'rates' and 'states' do not have the same length.
 */
SiteContainer* SequenceSimulationTools::simulateSites(const SiteSimulator& simulator, const vector<double>& rates, const vector<size_t>& states)
throw (Exception)
{
  const size_t numberOfSites = rates.size();
  if (states.size() != numberOfSites)
    throw Exception("SequenceSimulationTools::simulateSites., 'rates' and 'states' must have the same length.");

  // Elements are value-initialised to null, so the clean-up loops below are
  // safe even when the simulation stops half-way.
  vector<const Site*> vs(numberOfSites);
  SiteContainer* sites = 0;

  try
  {
    for (size_t i = 0; i < numberOfSites; i++)
    {
      Site* s = simulator.simulateSite(states[i], rates[i]);
      vs[i] = s; // Register first so that it is freed if setPosition throws.
      s->setPosition(static_cast<int>(i));
    }
    sites = new VectorSiteContainer(vs, simulator.getAlphabet());
    sites->setSequencesNames(simulator.getSequencesNames(), false);
  }
  catch (...)
  {
    delete sites;
    for (size_t i = 0; i < numberOfSites; i++)
    {
      delete vs[i];
    }
    throw;
  }

  // Freeing memory: the original code deletes the sites once the container is built.
  for (size_t i = 0; i < numberOfSites; i++)
  {
    delete vs[i];
  }

  return sites;
}
std::map<size_t, size_t> SiteContainerTools::translateSequence(const SiteContainer& sequences, size_t i1, size_t i2) { const Sequence* seq1 = &sequences.getSequence(i1); const Sequence* seq2 = &sequences.getSequence(i2); map<size_t, size_t> tln; size_t count1 = 0; // Sequence 1 counter size_t count2 = 0; // Sequence 2 counter int state1; int state2; for (size_t i = 0; i < sequences.getNumberOfSites(); i++) { state1 = (*seq1)[i]; if (state1 != -1) count1++; state2 = (*seq2)[i]; if (state2 != -1) count2++; if (state1 != -1) { tln[count1] = (state2 == -1 ? 0 : count2); } } return tln; }
/**
 * Build a parsimony score from a tree and a data set, using a canonical
 * state map created here.
 *
 * The tree is copied into a new TreeTemplate<Node> and the alphabet is taken
 * from the data. A CanonicalStateMap is then allocated from the alphabet and
 * 'includeGaps', the number of states is read from it, and init_() is called.
 *
 * NOTE(review): the state map is allocated with 'new' in the constructor
 * body; if init_() throws, nothing visible here releases it -- confirm how
 * tree_ and statesMap_ are owned.
 *
 * @param tree        The tree to copy.
 * @param data        The data set, forwarded to init_().
 * @param verbose     Forwarded to init_().
 * @param includeGaps Forwarded to the CanonicalStateMap constructor.
 * @throw Exception Declared by the exception specification; presumably
 *                  raised by init_() -- confirm against its definition.
 */
AbstractTreeParsimonyScore::AbstractTreeParsimonyScore(
  const Tree& tree,
  const SiteContainer& data,
  bool verbose,
  bool includeGaps)
throw (Exception) :
  tree_(new TreeTemplate<Node>(tree)),
  data_(0),
  alphabet_(data.getAlphabet()),
  statesMap_(0),
  nbStates_(0)
{
  // The state map needs alphabet_, so it is created after the initialiser list.
  statesMap_ = new CanonicalStateMap(alphabet_, includeGaps);
  nbStates_  = statesMap_->getNumberOfModelStates();
  init_(data, verbose);
}
/**
 * Build a VectorSiteContainer from any SiteContainer.
 *
 * Names are copied from 'sc'; comments_ and sequences_ are sized to the
 * number of sequences of 'sc'. Every site of 'sc' is then added with
 * addSite(site, false), and a new Comments object is allocated for each
 * sequence from the comments of 'sc'.
 *
 * @param sc The container to copy.
 */
VectorSiteContainer::VectorSiteContainer(const SiteContainer& sc) :
  AbstractSequenceContainer(sc),
  sites_(0),
  names_(sc.getSequencesNames()),
  comments_(sc.getNumberOfSequences()),
  sequences_(sc.getNumberOfSequences())
{
  // Now try to add each site:
  for (size_t i = 0; i < sc.getNumberOfSites(); i++)
  {
    addSite(sc.getSite(i), false); // We assume that positions are correct.
  }
  // Seq comments:
  for (size_t i = 0; i < sc.getNumberOfSequences(); i++)
  {
    comments_[i] = new Comments(sc.getComments(i));
  }
}
void SiteContainerTools::getSequencePositions(const SiteContainer& sites, Matrix<size_t>& positions) { positions.resize(sites.getNumberOfSequences(), sites.getNumberOfSites()); int gap = sites.getAlphabet()->getGapCharacterCode(); for (size_t i = 0; i < sites.getNumberOfSequences(); ++i) { const Sequence& seq = sites.getSequence(i); unsigned int pos = 0; for (size_t j = 0; j < sites.getNumberOfSites(); ++j) { if (seq[j] != gap) { ++pos; positions(i, j) = pos; } else { positions(i, j) = 0; } } } }
int main() { //ProteicAlphabet* alpha = new ProteicAlphabet; RNA* alpha = new RNA(); SiteContainer* sites = new VectorSiteContainer(alpha); BasicSequence seq1("seq1", "----AUGCCG---GCGU----UUU----G--G-CCGACGUGUUUU--", alpha); BasicSequence seq2("seq2", "---GAAGGCG---G-GU----UUU----GC-GACCGACG--UUUU--", alpha); sites->addSequence(seq1, false); sites->addSequence(seq2, false); cout << sites->getNumberOfSites() << endl; cout << sites->toString("seq1") << endl; cout << sites->toString("seq2") << endl; SiteContainerTools::removeGapOnlySites(*sites); cout << endl; cout << sites->getNumberOfSites() << endl; cout << sites->toString("seq1") << endl; cout << sites->toString("seq2") << endl; return (sites->getNumberOfSites() == 30 ? 0 : 1); }
void Clustal::writeAlignment(std::ostream& output, const SiteContainer& sc) const throw (Exception) { output << "CLUSTAL W (1.81) multiple sequence alignment" << endl; output << endl; if (sc.getNumberOfSequences() == 0) return; vector<string> text; size_t length = 0; for (size_t i = 0; i < sc.getNumberOfSequences(); ++i ) { const Sequence& seq = sc.getSequence(i); if (seq.getName().size() > length) length = seq.getName().size(); text.push_back(sc.getSequence(i).toString()); } length += nbSpacesBeforeSeq_; for (unsigned int j = 0; j < text[0].size(); j += charsByLine_) { for (unsigned int i = 0; i < sc.getNumberOfSequences(); ++i ) { output << TextTools::resizeRight(sc.getSequence(i).getName(), length); output << text[i].substr(j, charsByLine_) << endl; } output << endl; } }
/**
 * Select the sites of an alignment to use in an analysis, according to the
 * user parameters.
 *
 * Parameter 'input.sequence.sites_to_use' (default "complete") selects:
 *  - "all": a copy of all sites, then filtered by
 *    'input.sequence.max_gap_allowed' and
 *    'input.sequence.max_unresolved_allowed' (default "100%" for both). A
 *    value ending with '%' is a maximum frequency, any other value is a
 *    maximum count. Gaps are then turned into unknown characters if
 *    'gapAsUnknown' is set;
 *  - "complete": the result of SiteContainerTools::getCompleteSites();
 *  - "nogap": the result of SiteContainerTools::getSitesWithoutGaps().
 *
 * For codon alphabets, sites with stop codons are removed when
 * 'input.sequence.remove_stop_codons' is "yes", using the 'genetic_code'
 * parameter (default "Standard").
 *
 * Fixes over the previous version:
 *  - the working container is held by a unique_ptr, so it is no longer leaked
 *    when a parameter cannot be parsed or a filter throws, nor when the final
 *    cast to VectorSiteContainer fails;
 *  - an empty threshold value used to be indexed at size() - 1; it is now
 *    reported with an Exception;
 *  - the task shown while filtering on an unresolved *count* was labelled
 *    "Remove sites with gaps".
 *
 * @param allSites         The full alignment.
 * @param params           The user parameters.
 * @param suffix           Suffix appended to parameter names.
 * @param suffixIsOptional Whether the suffix is optional.
 * @param gapAsUnknown     Convert gaps to unknown characters ("all" only).
 * @param verbose          Print progress messages.
 * @param warn             Warning level, forwarded to the parameter readers.
 * @return A new container owned by the caller, or a null pointer if the
 *         selected container is not a VectorSiteContainer.
 * @throw Exception If the option is unknown or a threshold value is empty.
 */
VectorSiteContainer* SequenceApplicationTools::getSitesToAnalyse(
  const SiteContainer& allSites,
  map<string, string>& params,
  string suffix,
  bool suffixIsOptional,
  bool gapAsUnknown,
  bool verbose,
  int warn)
{
  // Fully resolved sites, i.e. without jokers and gaps:
  unique_ptr<SiteContainer> sitesToAnalyse;

  string option = ApplicationTools::getStringParameter("input.sequence.sites_to_use", params, "complete", suffix, suffixIsOptional, warn);
  if (verbose)
    ApplicationTools::displayResult("Sites to use", option);

  if (option == "all")
  {
    sitesToAnalyse.reset(new VectorSiteContainer(allSites));

    // --- Filter on gaps ---
    string maxGapOption = ApplicationTools::getStringParameter("input.sequence.max_gap_allowed", params, "100%", suffix, suffixIsOptional, warn);
    if (maxGapOption.empty())
      throw Exception("Empty value for parameter 'input.sequence.max_gap_allowed'.");

    if (maxGapOption[maxGapOption.size() - 1] == '%')
    {
      double gapFreq = TextTools::toDouble(maxGapOption.substr(0, maxGapOption.size() - 1)) / 100.;
      if (gapFreq < 1)
      {
        if (verbose)
          ApplicationTools::displayTask("Remove sites with gaps", true);
        // Backward scan: deleting a site does not shift the ones still to visit.
        for (size_t i = sitesToAnalyse->getNumberOfSites(); i > 0; --i)
        {
          if (verbose)
            ApplicationTools::displayGauge(sitesToAnalyse->getNumberOfSites() - i, sitesToAnalyse->getNumberOfSites() - 1, '=');
          map<int, double> freq;
          SiteTools::getFrequencies(sitesToAnalyse->getSite(i - 1), freq);
          if (freq[-1] > gapFreq)
            sitesToAnalyse->deleteSite(i - 1);
        }
        if (verbose)
          ApplicationTools::displayTaskDone();
      }
    }
    else
    {
      size_t gapNum = TextTools::to<size_t>(maxGapOption);
      if (gapNum < sitesToAnalyse->getNumberOfSequences())
      {
        if (verbose)
          ApplicationTools::displayTask("Remove sites with gaps", true);
        for (size_t i = sitesToAnalyse->getNumberOfSites(); i > 0; i--)
        {
          if (verbose)
            ApplicationTools::displayGauge(sitesToAnalyse->getNumberOfSites() - i, sitesToAnalyse->getNumberOfSites() - 1, '=');
          map<int, size_t> counts;
          SiteTools::getCounts(sitesToAnalyse->getSite(i - 1), counts);
          if (counts[-1] > gapNum)
            sitesToAnalyse->deleteSite(i - 1);
        }
        if (verbose)
          ApplicationTools::displayTaskDone();
      }
    }

    // --- Filter on unresolved characters ---
    string maxUnresolvedOption = ApplicationTools::getStringParameter("input.sequence.max_unresolved_allowed", params, "100%", suffix, suffixIsOptional, warn);
    if (maxUnresolvedOption.empty())
      throw Exception("Empty value for parameter 'input.sequence.max_unresolved_allowed'.");

    // States 0 .. sAlph-1 are summed below; whatever is left over is
    // treated as unresolved.
    int sAlph = static_cast<int>(sitesToAnalyse->getAlphabet()->getSize());

    if (maxUnresolvedOption[maxUnresolvedOption.size() - 1] == '%')
    {
      double unresolvedFreq = TextTools::toDouble(maxUnresolvedOption.substr(0, maxUnresolvedOption.size() - 1)) / 100.;
      if (unresolvedFreq < 1)
      {
        if (verbose)
          ApplicationTools::displayTask("Remove unresolved sites", true);
        for (size_t i = sitesToAnalyse->getNumberOfSites(); i > 0; --i)
        {
          if (verbose)
            ApplicationTools::displayGauge(sitesToAnalyse->getNumberOfSites() - i, sitesToAnalyse->getNumberOfSites() - 1, '=');
          map<int, double> freq;
          SiteTools::getFrequencies(sitesToAnalyse->getSite(i - 1), freq);
          double x = 0;
          for (int l = 0; l < sAlph; ++l)
          {
            x += freq[l];
          }
          if (1 - x > unresolvedFreq)
            sitesToAnalyse->deleteSite(i - 1);
        }
        if (verbose)
          ApplicationTools::displayTaskDone();
      }
    }
    else
    {
      size_t nbSeq = sitesToAnalyse->getNumberOfSequences();
      size_t unresolvedNum = TextTools::to<size_t>(maxUnresolvedOption);
      if (unresolvedNum < nbSeq)
      {
        if (verbose)
          ApplicationTools::displayTask("Remove unresolved sites", true);
        for (size_t i = sitesToAnalyse->getNumberOfSites(); i > 0; i--)
        {
          if (verbose)
            ApplicationTools::displayGauge(sitesToAnalyse->getNumberOfSites() - i, sitesToAnalyse->getNumberOfSites() - 1, '=');
          map<int, size_t> counts;
          SiteTools::getCounts(sitesToAnalyse->getSite(i - 1), counts);
          size_t x = 0;
          for (int l = 0; l < sAlph; l++)
          {
            x += counts[l];
          }
          if (nbSeq - x > unresolvedNum)
            sitesToAnalyse->deleteSite(i - 1);
        }
        if (verbose)
          ApplicationTools::displayTaskDone();
      }
    }

    if (gapAsUnknown)
    {
      SiteContainerTools::changeGapsToUnknownCharacters(*sitesToAnalyse);
    }
  }
  else if (option == "complete")
  {
    sitesToAnalyse.reset(SiteContainerTools::getCompleteSites(allSites));
    size_t nbSites = sitesToAnalyse->getNumberOfSites();
    if (verbose)
      ApplicationTools::displayResult("Complete sites", TextTools::toString(nbSites));
  }
  else if (option == "nogap")
  {
    sitesToAnalyse.reset(SiteContainerTools::getSitesWithoutGaps(allSites));
    size_t nbSites = sitesToAnalyse->getNumberOfSites();
    if (verbose)
      ApplicationTools::displayResult("Sites without gap", TextTools::toString(nbSites));
  }
  else
  {
    throw Exception("Option '" + option + "' unknown in parameter 'sequence.sites_to_use'.");
  }

  // --- Stop codons (codon alphabets only) ---
  const CodonAlphabet* ca = dynamic_cast<const CodonAlphabet*>(sitesToAnalyse->getAlphabet());
  if (ca)
  {
    option = ApplicationTools::getStringParameter("input.sequence.remove_stop_codons", params, "no", suffix, true, warn);
    if ((option != "") && verbose)
      ApplicationTools::displayResult("Remove Stop Codons", option);

    if (option == "yes")
    {
      string codeDesc = ApplicationTools::getStringParameter("genetic_code", params, "Standard", "", true, warn);
      unique_ptr<GeneticCode> gCode(getGeneticCode(ca->getNucleicAlphabet(), codeDesc));
      // The argument is evaluated before reset() releases the previous container.
      sitesToAnalyse.reset(SiteContainerTools::removeStopCodonSites(*sitesToAnalyse, *gCode));
    }
  }

  // Hand ownership over only if the container has the promised type.
  VectorSiteContainer* result = dynamic_cast<VectorSiteContainer*>(sitesToAnalyse.get());
  if (result)
    static_cast<void>(sitesToAnalyse.release());
  return result;
}
int main(int args, char** argv) { cout << "******************************************************************" << endl; cout << "* Bio++ Sequence Manipulator, version 2.3.0. *" << endl; cout << "* Author: J. Dutheil Last Modif. 25/11/14 *" << endl; cout << "******************************************************************" << endl; cout << endl; if (args == 1) { help(); return 0; } try { BppApplication bppseqman(args, argv, "BppSeqMan"); bppseqman.startTimer(); // Get alphabet Alphabet* alphabet = SequenceApplicationTools::getAlphabet(bppseqman.getParams(), "", false, true, true); unique_ptr<GeneticCode> gCode; CodonAlphabet* codonAlphabet = dynamic_cast<CodonAlphabet*>(alphabet); // Get sequences: bool aligned = ApplicationTools::getBooleanParameter("input.alignment", bppseqman.getParams(), false, "", true, 1); OrderedSequenceContainer* sequences = 0; if (aligned) { VectorSiteContainer* allSites = SequenceApplicationTools::getSiteContainer(alphabet, bppseqman.getParams()); sequences = SequenceApplicationTools::getSitesToAnalyse(*allSites, bppseqman.getParams(), "", true, false); delete allSites; } else { SequenceContainer* tmp = SequenceApplicationTools::getSequenceContainer(alphabet, bppseqman.getParams(), "", true, true); sequences = new VectorSequenceContainer(*tmp); delete tmp; } ApplicationTools::displayResult("Number of sequences", sequences->getNumberOfSequences()); // Perform manipulations vector<string> actions = ApplicationTools::getVectorParameter<string>("sequence.manip", bppseqman.getParams(), ',', "", "", false, 1); for (size_t a = 0; a < actions.size(); a++) { string cmdName; map<string, string> cmdArgs; KeyvalTools::parseProcedure(actions[a], cmdName, cmdArgs); ApplicationTools::displayResult("Performing action", cmdName); // +-----------------+ // | Complementation | // +-----------------+ if (cmdName == "Complement") { OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet()); else sc = new 
VectorSequenceContainer(sequences->getAlphabet()); for (size_t i = 0; i < sequences->getNumberOfSequences(); i++) { Sequence* seq = SequenceTools::getComplement(sequences->getSequence(i)); sc->addSequence(*seq, false); delete seq; } delete sequences; sequences = sc; } // +------------------------+ // | (Reverse)Transcription | // +------------------------+ else if (cmdName == "Transcript") { if (sequences->getAlphabet()->getAlphabetType() == AlphabetTools::DNA_ALPHABET.getAlphabetType()) { OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(&AlphabetTools::RNA_ALPHABET); else sc = new VectorSequenceContainer(&AlphabetTools::RNA_ALPHABET); for (unsigned int i = 0; i < sequences->getNumberOfSequences(); i++) { Sequence* seq = SequenceTools::transcript(sequences->getSequence(i)); sc->addSequence(*seq, false); delete seq; } delete sequences; sequences = sc; } else if (sequences->getAlphabet()->getAlphabetType() == AlphabetTools::RNA_ALPHABET.getAlphabetType()) { OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(&AlphabetTools::DNA_ALPHABET); else sc = new VectorSequenceContainer(&AlphabetTools::DNA_ALPHABET); for (unsigned int i = 0; i < sequences->getNumberOfSequences(); i++) { Sequence* seq = SequenceTools::reverseTranscript(sequences->getSequence(i)); sc->addSequence(*seq, false); delete seq; } delete sequences; sequences = sc; } else throw Exception("Transcription error: input alphabet must be of type 'nucleic'."); } // +-------------------------------+ // | Switching nucleotide alphabet | // +-------------------------------+ else if (cmdName == "Switch") { const Alphabet* alpha = 0; if (sequences->getAlphabet()->getAlphabetType() == AlphabetTools::DNA_ALPHABET.getAlphabetType()) { alpha = &AlphabetTools::RNA_ALPHABET; } else if (sequences->getAlphabet()->getAlphabetType() == AlphabetTools::RNA_ALPHABET.getAlphabetType()) { alpha = &AlphabetTools::DNA_ALPHABET; } else throw Exception("Cannot switch alphabet type, 
alphabet is not of type 'nucleic'."); OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(alpha); else sc = new VectorSequenceContainer(alpha); for (size_t i = 0; i < sequences->getNumberOfSequences(); i++) { const Sequence* old = &sequences->getSequence(i); vector<int> content(old->size()); for (size_t j = 0; j < old->size(); ++j) content[j] = (*old)[j]; Sequence* seq = new BasicSequence(old->getName(), content, old->getComments(), alpha); sc->addSequence(*seq, false); delete seq; } delete sequences; sequences = sc; } // +-------------+ // | Translation | // +-------------+ else if (cmdName == "Translate") { if (!AlphabetTools::isCodonAlphabet(sequences->getAlphabet())) throw Exception("Error in translation: alphabet is not of type 'codon'."); if (cmdArgs["code"] != "") throw Exception("ERROR: 'code' argument is deprecated. The genetic code to use for translation is now set by the top-level argument 'genetic_code'."); if (!gCode.get()) { string codeDesc = ApplicationTools::getStringParameter("genetic_code", bppseqman.getParams(), "Standard", "", true, 1); ApplicationTools::displayResult("Genetic Code", codeDesc); gCode.reset(SequenceApplicationTools::getGeneticCode(codonAlphabet->getNucleicAlphabet(), codeDesc)); } OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(&AlphabetTools::PROTEIN_ALPHABET); else sc = new VectorSequenceContainer(&AlphabetTools::PROTEIN_ALPHABET); for (size_t i = 0; i < sequences->getNumberOfSequences(); ++i) { Sequence* seq = gCode->translate(sequences->getSequence(i)); sc->addSequence(*seq, false); delete seq; } delete sequences; sequences = sc; } // +-------------+ // | Remove gaps | // +-------------+ else if (cmdName == "RemoveGaps") { VectorSequenceContainer* sc = new VectorSequenceContainer(sequences->getAlphabet()); for (size_t i = 0; i < sequences->getNumberOfSequences(); i++) { unique_ptr<Sequence> seq(sequences->getSequence(i).clone()); SequenceTools::removeGaps(*seq); 
sc->addSequence(*seq); } delete sequences; sequences = sc; aligned = false; } // +---------------------------+ // | Change gaps to unresolved | // +---------------------------+ else if (cmdName == "GapToUnknown") { OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet()); else sc = new VectorSequenceContainer(sequences->getAlphabet()); for (size_t i = 0; i < sequences->getNumberOfSequences(); i++) { Sequence* seq = new BasicSequence(sequences->getSequence(i)); SymbolListTools::changeGapsToUnknownCharacters(*seq); sc->addSequence(*seq, false); delete seq; } delete sequences; sequences = sc; } // +---------------------------+ // | Change unresolved to gaps | // +---------------------------+ else if (cmdName == "UnknownToGap") { OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet()); else sc = new VectorSequenceContainer(sequences->getAlphabet()); for (size_t i = 0; i < sequences->getNumberOfSequences(); i++) { Sequence* seq = new BasicSequence(sequences->getSequence(i)); SymbolListTools::changeUnresolvedCharactersToGaps(*seq); sc->addSequence(*seq, false); delete seq; } delete sequences; sequences = sc; } // +--------------+ // | Remove stops | // +--------------+ else if (cmdName == "RemoveStops") { if (!gCode.get()) { string codeDesc = ApplicationTools::getStringParameter("genetic_code", bppseqman.getParams(), "Standard", "", true, 1); ApplicationTools::displayResult("Genetic Code", codeDesc); gCode.reset(SequenceApplicationTools::getGeneticCode(codonAlphabet->getNucleicAlphabet(), codeDesc)); } SiteContainer* sites = dynamic_cast<SiteContainer*>(sequences); if (!sites) { VectorSequenceContainer* sc = new VectorSequenceContainer(sequences->getAlphabet()); for (size_t i = 0; i < sequences->getNumberOfSequences(); ++i) { unique_ptr<Sequence> seq(sequences->getSequence(i).clone()); SequenceTools::removeStops(*seq, *gCode); sc->addSequence(*seq); } delete sequences; sequences = 
sc; } else { VectorSiteContainer* sc = new VectorSiteContainer(sequences->getAlphabet()); for (size_t i = 0; i < sequences->getNumberOfSequences(); ++i) { unique_ptr<Sequence> seq(sequences->getSequence(i).clone()); SequenceTools::replaceStopsWithGaps(*seq, *gCode); sc->addSequence(*seq); } delete sequences; sequences = sc; } } // +--------------+ // | Remove stops | // +--------------+ else if (cmdName == "RemoveColumnsWithStops") { SiteContainer* sites = dynamic_cast<SiteContainer*>(sequences); if (!sites) { throw Exception("'RemoveColumnsWithStops' can only be used on alignment. You may consider using the 'CoerceToAlignment' command."); } if (!gCode.get()) { string codeDesc = ApplicationTools::getStringParameter("genetic_code", bppseqman.getParams(), "Standard", "", true, 1); ApplicationTools::displayResult("Genetic Code", codeDesc); gCode.reset(SequenceApplicationTools::getGeneticCode(codonAlphabet->getNucleicAlphabet(), codeDesc)); } for (size_t i = sites->getNumberOfSites(); i > 0; i--) { if (CodonSiteTools::hasStop(sites->getSite(i-1), *gCode)) sites->deleteSite(i - 1); } } // +---------+ // | Get CDS | // +---------+ else if (cmdName == "GetCDS") { if (!gCode.get()) { string codeDesc = ApplicationTools::getStringParameter("genetic_code", bppseqman.getParams(), "Standard", "", true, 1); ApplicationTools::displayResult("Genetic Code", codeDesc); gCode.reset(SequenceApplicationTools::getGeneticCode(codonAlphabet->getNucleicAlphabet(), codeDesc)); } OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet()); else sc = new VectorSequenceContainer(sequences->getAlphabet()); for (size_t i = 0; i < sequences->getNumberOfSequences(); ++i) { BasicSequence seq = sequences->getSequence(i); size_t len = seq.size(); SequenceTools::getCDS(seq, *gCode, false, true, true, false); if (aligned) { for (size_t c = seq.size(); c < len; ++c) seq.addElement(seq.getAlphabet()->getGapCharacterCode()); } sc->addSequence(seq, false); } delete 
sequences; sequences = sc; } // +--------------------------+ // | Resolve dotted alignment | // +--------------------------+ else if (actions[a] == "CoerceToAlignment") { SiteContainer* sites = dynamic_cast<SiteContainer*>(sequences); if(! sites) { sites = new VectorSiteContainer(*sequences); delete sequences; sequences = sites; } aligned = true; } else if (actions[a] == "ResolvedDotted") { SiteContainer* sites = dynamic_cast<SiteContainer *>(sequences); if (!sites) { throw Exception("'ResolvedDotted' can only be used on alignment. You may consider using the 'CoerceToAlignment' command."); } const Alphabet* alpha = 0; string alphastr = ApplicationTools::getStringParameter("alphabet", cmdArgs, "DNA", "", false, 1); if (alphastr == "DNA") alpha = &AlphabetTools::DNA_ALPHABET; else if (alphastr == "RNA") alpha = &AlphabetTools::RNA_ALPHABET; else if (alphastr == "Protein") alpha = &AlphabetTools::PROTEIN_ALPHABET; else throw Exception("Resolved alphabet must be one of [DNA|RNA|Protein] for solving dotted alignment."); OrderedSequenceContainer* resolvedCont = SiteContainerTools::resolveDottedAlignment(*sites, alpha); delete sequences; sequences = resolvedCont; } // +---------------------+ // | Keep complete sites | // +---------------------+ else if (cmdName == "KeepComplete") { SiteContainer* sites = dynamic_cast<SiteContainer *>(sequences); if (!sites) { throw Exception("'KeepComplete' can only be used on alignment. 
You may consider using the 'CoerceToAlignment' command."); } string maxGapOption = ApplicationTools::getStringParameter("maxGapAllowed", cmdArgs, "100%", "", false, 1); if (maxGapOption[maxGapOption.size()-1] == '%') { double gapFreq = TextTools::toDouble(maxGapOption.substr(0, maxGapOption.size()-1)) / 100.; for (size_t i = sites->getNumberOfSites(); i > 0; i--) { map<int, double> freqs; SiteTools::getFrequencies(sites->getSite(i - 1), freqs); if (freqs[-1] > gapFreq) sites->deleteSite(i - 1); } } else { size_t gapNum = TextTools::to<size_t>(maxGapOption); for (size_t i = sites->getNumberOfSites(); i > 0; i--) { map<int, size_t> counts; SiteTools::getCounts(sites->getSite(i - 1), counts); counts[-1]; //Needed in case this entry does not exist in the map. This will set it to 0. if (counts[-1] > gapNum) sites->deleteSite(i-1); } } } // +-----------------+ // | Invert sequence | // +-----------------+ else if (cmdName == "Invert") { OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet()); else sc = new VectorSequenceContainer(sequences->getAlphabet()); for (size_t i = 0; i < sequences->getNumberOfSequences(); i++) { const Sequence* old = &sequences->getSequence(i); Sequence* seq = SequenceTools::getInvert(*old); sc->addSequence(*seq, false); delete seq; } delete sequences; sequences = sc; } // +------------------+ // | GetCodonPosition | // +------------------+ else if (cmdName == "GetCodonPosition") { unsigned int pos = ApplicationTools::getParameter<unsigned int>("position", cmdArgs, 3, "", false, 1); OrderedSequenceContainer* sc = dynamic_cast<OrderedSequenceContainer*>(SequenceContainerTools::getCodonPosition(*sequences, pos - 1)); delete sequences; if (aligned) { sequences = new VectorSiteContainer(*sc); delete sc; } else { sequences = sc; } } // +-----------------+ // | FilterFromTree | // +-----------------+ else if (cmdName == "FilterFromTree") { unique_ptr<Tree> 
tree(PhylogeneticsApplicationTools::getTree(cmdArgs, "")); vector<string> names = tree->getLeavesNames(); OrderedSequenceContainer* reorderedSequences = 0; if (aligned) { reorderedSequences = new VectorSiteContainer(sequences->getAlphabet()); } else { reorderedSequences = new VectorSequenceContainer(sequences->getAlphabet()); } for (size_t i = 0; i < names.size(); ++i) { reorderedSequences->addSequence(sequences->getSequence(names[i]), false); } delete sequences; sequences = reorderedSequences; } // +----------------------+ // | RemoveEmptySequences | // +----------------------+ else if (cmdName == "RemoveEmptySequences") { OrderedSequenceContainer* sc = 0; if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet()); else sc = new VectorSequenceContainer(sequences->getAlphabet()); for (size_t i = 0; i < sequences->getNumberOfSequences(); ++i) { if (SequenceTools::getNumberOfSites(sequences->getSequence(i))!=0) sc->addSequence(sequences->getSequence(i), false); } delete sequences; sequences = sc; } else throw Exception("Unknown action: " + cmdName); } // Write sequences ApplicationTools::displayBooleanResult("Final sequences are aligned", aligned); if (aligned) { SequenceApplicationTools::writeAlignmentFile(*dynamic_cast<SiteContainer*>(sequences), bppseqman.getParams(), "", true, 1); } else { SequenceApplicationTools::writeSequenceFile(*sequences, bppseqman.getParams(), "", true, 1); } delete alphabet; delete sequences; bppseqman.done(); } catch(exception & e) { cout << e.what() << endl; return 1; } return 0; }
void Clustal::appendAlignmentFromStream(std::istream& input, SiteContainer & sc) const throw (Exception) { // Checking the existence of specified file if (!input) { throw IOException ("Clustal::read : fail to open file"); } const Alphabet * alpha = sc.getAlphabet(); vector<BasicSequence> sequences; string lineRead(""); Comments comments(1); comments[0] = FileTools::getNextLine(input); // First line gives file generator. lineRead = FileTools::getNextLine(input); // This is the first sequence of the first block. string::size_type beginSeq = 0; unsigned int count = 0; for (size_t i = lineRead.size(); i > 0; i--) { char c = lineRead[i-1]; if (c == ' ') { count++; if (count == nbSpacesBeforeSeq_) { beginSeq = i - 1 + nbSpacesBeforeSeq_; break; } } else count = 0; } if (beginSeq == 0) throw IOException("Clustal::read. Bad intput file."); unsigned int countSequences = 0; //Read first sequences block: bool test = true; do { sequences.push_back(BasicSequence(TextTools::removeSurroundingWhiteSpaces(lineRead.substr(0, beginSeq - nbSpacesBeforeSeq_)), lineRead.substr(beginSeq), alpha)); getline(input, lineRead, '\n'); countSequences++; test = !TextTools::isEmpty(lineRead) && !TextTools::isEmpty(lineRead.substr(0, beginSeq - nbSpacesBeforeSeq_)); } while (input && test); // Read other blocks lineRead = FileTools::getNextLine(input); // Read first sequence of next block. while (!TextTools::isEmpty(lineRead)) { // Read next block: for (unsigned int i = 0; i < countSequences; ++i) { // Complete sequences if (TextTools::isEmpty(lineRead)) throw IOException("Clustal::read. Bad intput file."); sequences[i].append(lineRead.substr(beginSeq)); getline(input, lineRead, '\n'); } //At this point, lineRead is the first line after the current block. lineRead = FileTools::getNextLine(input); } for (unsigned int i = 0; i < countSequences; ++i) sc.addSequence(sequences[i], checkNames_); sc.setGeneralComments(comments); }
void RecursiveLikelihoodTree::initLikelihoodsWithoutPatterns_(const Node* node, const SiteContainer& sequences, const SubstitutionProcess& process) throw (Exception) { int nId = node->getId(); // Initialize likelihood vector: if (!node->hasFather()) { resetAboveLikelihoods(nId, nbDistinctSites_, nbStates_); resetLikelihoods(nId, nbDistinctSites_, nbStates_, ComputingNode::D0); resetLikelihoods(nId, nbDistinctSites_, nbStates_, ComputingNode::D1); resetLikelihoods(nId, nbDistinctSites_, nbStates_, ComputingNode::D2); } resetBelowLikelihoods(nId, nbDistinctSites_, nbStates_, ComputingNode::D0); resetBelowLikelihoods(nId, nbDistinctSites_, nbStates_, ComputingNode::D1); resetBelowLikelihoods(nId, nbDistinctSites_, nbStates_, ComputingNode::D2); // Now initialize likelihood values and pointers: if (node->hasNoSon()) { const Sequence* seq; try { seq = &sequences.getSequence(node->getName()); } catch (SequenceNotFoundException snfe) { throw SequenceNotFoundException("RecursiveLikelihoodTree::initTreelikelihoods. Leaf name in tree not found in site conainer: ", (node->getName())); } for (size_t c = 0; c < nbClasses_; c++) { RecursiveLikelihoodNode& lNode = *dynamic_cast<RecursiveLikelihoodNode*>(vTree_[c]->getNode(nId)); VVdouble& array = lNode.getBelowLikelihoodArray_(ComputingNode::D0); for (size_t i = 0; i < nbDistinctSites_; i++) { Vdouble* array_i = &array[i]; int state = seq->getValue(i); double test = 0.; for (size_t s = 0; s < nbStates_; s++) { double x = process.getInitValue(s, state); if (lNode.usesLog()) { if (x <= 0) (*array_i)[s] = -10000; else (*array_i)[s] = log(x); } else (*array_i)[s] = x; test += x; } if (test < 0.000001) std::cerr << "WARNING!!! Likelihood will be 0 for this site " << TextTools::toString(i) << std::endl; } lNode.updateBelow_(true, ComputingNode::D0); } } else { // 'node' is an internal node. 
std::map<int, std::vector<size_t> >* patternLinks_node = &patternLinks_[nId]; int nbSonNodes = static_cast<int>(node->getNumberOfSons()); for (int l = 0; l < nbSonNodes; ++l) { // For each son node, const Node* son = (*node)[l]; initLikelihoodsWithoutPatterns_(son, sequences, process); std::vector<size_t>* patternLinks_node_son = &(*patternLinks_node)[son->getId()]; // Init map: patternLinks_node_son->resize(nbDistinctSites_); for (size_t i = 0; i < nbDistinctSites_; i++) { (*patternLinks_node_son)[i] = i; } } } if (!node->hasFather()) setAboveLikelihoods(nId, process.getRootFrequencies()); }