void RecursiveLikelihoodTree::initLikelihoods(const SiteContainer& sites, const SubstitutionProcess& process) throw (Exception) { if (sites.getNumberOfSequences() == 1) throw Exception("RecursiveLikelihoodTree::initLikelihoods. Only 1 sequence in data set."); if (sites.getNumberOfSequences() == 0) throw Exception("RecursiveLikelihoodTree::initLikelihoods. No sequence in data set."); if (!process.isCompatibleWith(sites)) throw Exception("RecursiveLikelihoodTree::initLikelihoods. Data and model are not compatible."); alphabet_ = sites.getAlphabet(); nbStates_ = process.getNumberOfStates(); nbSites_ = sites.getNumberOfSites(); unique_ptr<SitePatterns> patterns; if (usePatterns_) { patterns.reset(initLikelihoodsWithPatterns_(process.getTree().getRootNode(), sites, process)); shrunkData_.reset(patterns->getSites()); rootWeights_ = patterns->getWeights(); rootPatternLinks_ = patterns->getIndices(); nbDistinctSites_ = shrunkData_->getNumberOfSites(); setPatterns(patternLinks_); } else { patterns.reset(new SitePatterns(&sites)); shrunkData_.reset(patterns->getSites()); rootWeights_ = patterns->getWeights(); rootPatternLinks_ = patterns->getIndices(); nbDistinctSites_ = shrunkData_->getNumberOfSites(); initLikelihoodsWithoutPatterns_(process.getTree().getRootNode(), *shrunkData_, process); } }
void SiteContainerTools::getSequencePositions(const SiteContainer& sites, Matrix<size_t>& positions) { positions.resize(sites.getNumberOfSequences(), sites.getNumberOfSites()); int gap = sites.getAlphabet()->getGapCharacterCode(); for (size_t i = 0; i < sites.getNumberOfSequences(); ++i) { const Sequence& seq = sites.getSequence(i); unsigned int pos = 0; for (size_t j = 0; j < sites.getNumberOfSites(); ++j) { if (seq[j] != gap) { ++pos; positions(i, j) = pos; } else { positions(i, j) = 0; } } } }
/**
 * @brief Build a VectorSiteContainer from any SiteContainer.
 *
 * Sequence names are copied in the initializer list; the comment and sequence
 * slots are sized to the number of sequences of the source container. Sites
 * are then appended one by one and the per-sequence comments are duplicated.
 *
 * @param sc The container to copy.
 */
VectorSiteContainer::VectorSiteContainer(const SiteContainer& sc) :
  AbstractSequenceContainer(sc),
  sites_(0),
  names_(sc.getSequencesNames()),
  comments_(sc.getNumberOfSequences()),
  sequences_(sc.getNumberOfSequences())
{
  // Append every site of the source container.
  const size_t nbSites = sc.getNumberOfSites();
  for (size_t siteIndex = 0; siteIndex < nbSites; ++siteIndex)
  {
    // The 'false' flag is passed because positions are assumed to be correct.
    addSite(sc.getSite(siteIndex), false);
  }

  // Duplicate the comments attached to each sequence.
  const size_t nbSequences = sc.getNumberOfSequences();
  for (size_t seqIndex = 0; seqIndex < nbSequences; ++seqIndex)
  {
    comments_[seqIndex] = new Comments(sc.getComments(seqIndex));
  }
}
void SiteContainerTools::changeUnresolvedCharactersToGaps(SiteContainer& sites) { // NB: use iterators for a better algorithm? int gapCode = sites.getAlphabet()->getGapCharacterCode(); for (unsigned int i = 0; i < sites.getNumberOfSites(); i++) { for (unsigned int j = 0; j < sites.getNumberOfSequences(); j++) { int* element = &sites(j, i); if (sites.getAlphabet()->isUnresolved(*element)) *element = gapCode; } } }
/**
 * @brief Replace every gap of an alignment by the unknown character.
 *
 * Each element of the container is tested with the alphabet's isGap() and,
 * when the test succeeds, overwritten in place with the alphabet's unknown
 * character code.
 *
 * @param sites The alignment to modify in place.
 */
void SiteContainerTools::changeGapsToUnknownCharacters(SiteContainer& sites)
{
  // NB: use iterators for a better algorithm?
  const int unknownCode = sites.getAlphabet()->getUnknownCharacterCode();

  // Indices are size_t, like the counts they are compared with: a narrower
  // unsigned counter would wrap around before reaching a very large bound.
  for (size_t siteIndex = 0; siteIndex < sites.getNumberOfSites(); ++siteIndex)
  {
    for (size_t seqIndex = 0; seqIndex < sites.getNumberOfSequences(); ++seqIndex)
    {
      int& state = sites(seqIndex, siteIndex);
      if (sites.getAlphabet()->isGap(state))
      {
        state = unknownCode;
      }
    }
  }
}
void Clustal::writeAlignment(std::ostream& output, const SiteContainer& sc) const throw (Exception) { output << "CLUSTAL W (1.81) multiple sequence alignment" << endl; output << endl; if (sc.getNumberOfSequences() == 0) return; vector<string> text; size_t length = 0; for (size_t i = 0; i < sc.getNumberOfSequences(); ++i ) { const Sequence& seq = sc.getSequence(i); if (seq.getName().size() > length) length = seq.getName().size(); text.push_back(sc.getSequence(i).toString()); } length += nbSpacesBeforeSeq_; for (unsigned int j = 0; j < text[0].size(); j += charsByLine_) { for (unsigned int i = 0; i < sc.getNumberOfSequences(); ++i ) { output << TextTools::resizeRight(sc.getSequence(i).getName(), length); output << text[i].substr(j, charsByLine_) << endl; } output << endl; } }
/**
 * @brief Select, from an alignment, the sites to be used in an analysis.
 *
 * The selection is driven by the "input.sequence.sites_to_use" parameter
 * (default "complete"):
 *  - "all": copy every site, then drop the sites exceeding
 *    "input.sequence.max_gap_allowed" and
 *    "input.sequence.max_unresolved_allowed". Each threshold is either a
 *    percentage ("NN%") or an absolute number of sequences. If gapAsUnknown is
 *    set, remaining gaps are then changed to the unknown character.
 *  - "complete": keep the sites returned by SiteContainerTools::getCompleteSites().
 *  - "nogap": keep the sites returned by SiteContainerTools::getSitesWithoutGaps().
 *
 * For codon alphabets, when "input.sequence.remove_stop_codons" is "yes", the
 * sites containing stop codons (according to "genetic_code", default
 * "Standard") are removed afterwards.
 *
 * @param allSites         The full alignment.
 * @param params           The application parameters.
 * @param suffix           Suffix forwarded to the parameter look-ups.
 * @param suffixIsOptional Forwarded to the parameter look-ups.
 * @param gapAsUnknown     In "all" mode, change gaps to unknown characters.
 * @param verbose          Display progress and results.
 * @param warn             Warning level forwarded to the parameter look-ups.
 * @return A newly allocated container that is not released by this function,
 *         or a null pointer if the selected container is not a
 *         VectorSiteContainer.
 * @throw Exception If the "sites to use" option is not recognised.
 */
VectorSiteContainer* SequenceApplicationTools::getSitesToAnalyse(
  const SiteContainer& allSites,
  map<string, string>& params,
  string suffix,
  bool suffixIsOptional,
  bool gapAsUnknown,
  bool verbose,
  int warn)
{
  // Fully resolved sites, i.e. without jokers and gaps.
  // Held by a smart pointer so that the container is released if any of the
  // calls below throws.
  unique_ptr<SiteContainer> sitesToAnalyse;

  string option = ApplicationTools::getStringParameter("input.sequence.sites_to_use", params, "complete", suffix, suffixIsOptional, warn);
  if (verbose)
    ApplicationTools::displayResult("Sites to use", option);

  if (option == "all")
  {
    sitesToAnalyse.reset(new VectorSiteContainer(allSites));

    // ---- Gap threshold ----------------------------------------------------
    string maxGapOption = ApplicationTools::getStringParameter("input.sequence.max_gap_allowed", params, "100%", suffix, suffixIsOptional, warn);

    // Guard against an empty value: indexing size() - 1 would be out of bounds.
    const bool gapIsPercent = !maxGapOption.empty() && maxGapOption[maxGapOption.size() - 1] == '%';
    if (gapIsPercent)
    {
      double gapFreq = TextTools::toDouble(maxGapOption.substr(0, maxGapOption.size() - 1)) / 100.;
      if (gapFreq < 1)
      {
        if (verbose)
          ApplicationTools::displayTask("Remove sites with gaps", true);
        // Walk backwards so that deleting a site does not shift the ones still to visit.
        for (size_t i = sitesToAnalyse->getNumberOfSites(); i > 0; --i)
        {
          if (verbose)
            ApplicationTools::displayGauge(sitesToAnalyse->getNumberOfSites() - i, sitesToAnalyse->getNumberOfSites() - 1, '=');
          map<int, double> freq;
          SiteTools::getFrequencies(sitesToAnalyse->getSite(i - 1), freq);
          // State -1 is looked up as the gap state (presumably the alphabet's gap code).
          if (freq[-1] > gapFreq)
            sitesToAnalyse->deleteSite(i - 1);
        }
        if (verbose)
          ApplicationTools::displayTaskDone();
      }
    }
    else
    {
      size_t gapNum = TextTools::to<size_t>(maxGapOption);
      if (gapNum < sitesToAnalyse->getNumberOfSequences())
      {
        if (verbose)
          ApplicationTools::displayTask("Remove sites with gaps", true);
        for (size_t i = sitesToAnalyse->getNumberOfSites(); i > 0; --i)
        {
          if (verbose)
            ApplicationTools::displayGauge(sitesToAnalyse->getNumberOfSites() - i, sitesToAnalyse->getNumberOfSites() - 1, '=');
          map<int, size_t> counts;
          SiteTools::getCounts(sitesToAnalyse->getSite(i - 1), counts);
          if (counts[-1] > gapNum)
            sitesToAnalyse->deleteSite(i - 1);
        }
        if (verbose)
          ApplicationTools::displayTaskDone();
      }
    }

    // ---- Unresolved threshold ---------------------------------------------
    string maxUnresolvedOption = ApplicationTools::getStringParameter("input.sequence.max_unresolved_allowed", params, "100%", suffix, suffixIsOptional, warn);
    int sAlph = static_cast<int>(sitesToAnalyse->getAlphabet()->getSize());

    const bool unresolvedIsPercent = !maxUnresolvedOption.empty() && maxUnresolvedOption[maxUnresolvedOption.size() - 1] == '%';
    if (unresolvedIsPercent)
    {
      double unresolvedFreq = TextTools::toDouble(maxUnresolvedOption.substr(0, maxUnresolvedOption.size() - 1)) / 100.;
      if (unresolvedFreq < 1)
      {
        if (verbose)
          ApplicationTools::displayTask("Remove unresolved sites", true);
        for (size_t i = sitesToAnalyse->getNumberOfSites(); i > 0; --i)
        {
          if (verbose)
            ApplicationTools::displayGauge(sitesToAnalyse->getNumberOfSites() - i, sitesToAnalyse->getNumberOfSites() - 1, '=');
          map<int, double> freq;
          SiteTools::getFrequencies(sitesToAnalyse->getSite(i - 1), freq);
          // Sum over states 0 .. size-1; whatever remains is counted as unresolved.
          double x = 0;
          for (int l = 0; l < sAlph; ++l)
          {
            x += freq[l];
          }
          if (1 - x > unresolvedFreq)
            sitesToAnalyse->deleteSite(i - 1);
        }
        if (verbose)
          ApplicationTools::displayTaskDone();
      }
    }
    else
    {
      size_t nbSeq = sitesToAnalyse->getNumberOfSequences();
      size_t unresolvedNum = TextTools::to<size_t>(maxUnresolvedOption);
      if (unresolvedNum < nbSeq)
      {
        // This branch filters unresolved characters, so the task is labelled
        // like its percentage counterpart above.
        if (verbose)
          ApplicationTools::displayTask("Remove unresolved sites", true);
        for (size_t i = sitesToAnalyse->getNumberOfSites(); i > 0; --i)
        {
          if (verbose)
            ApplicationTools::displayGauge(sitesToAnalyse->getNumberOfSites() - i, sitesToAnalyse->getNumberOfSites() - 1, '=');
          map<int, size_t> counts;
          SiteTools::getCounts(sitesToAnalyse->getSite(i - 1), counts);
          size_t x = 0;
          for (int l = 0; l < sAlph; ++l)
          {
            x += counts[l];
          }
          if (nbSeq - x > unresolvedNum)
            sitesToAnalyse->deleteSite(i - 1);
        }
        if (verbose)
          ApplicationTools::displayTaskDone();
      }
    }

    if (gapAsUnknown)
    {
      SiteContainerTools::changeGapsToUnknownCharacters(*sitesToAnalyse);
    }
  }
  else if (option == "complete")
  {
    sitesToAnalyse.reset(SiteContainerTools::getCompleteSites(allSites));
    size_t nbSites = sitesToAnalyse->getNumberOfSites();
    if (verbose)
      ApplicationTools::displayResult("Complete sites", TextTools::toString(nbSites));
  }
  else if (option == "nogap")
  {
    sitesToAnalyse.reset(SiteContainerTools::getSitesWithoutGaps(allSites));
    size_t nbSites = sitesToAnalyse->getNumberOfSites();
    if (verbose)
      ApplicationTools::displayResult("Sites without gap", TextTools::toString(nbSites));
  }
  else
  {
    throw Exception("Option '" + option + "' unknown in parameter 'input.sequence.sites_to_use'.");
  }

  // ---- Optional removal of stop codons (codon alphabets only) -------------
  const CodonAlphabet* ca = dynamic_cast<const CodonAlphabet*>(sitesToAnalyse->getAlphabet());
  if (ca)
  {
    option = ApplicationTools::getStringParameter("input.sequence.remove_stop_codons", params, "no", suffix, true, warn);
    if ((option != "") && verbose)
      ApplicationTools::displayResult("Remove Stop Codons", option);

    if (option == "yes")
    {
      string codeDesc = ApplicationTools::getStringParameter("genetic_code", params, "Standard", "", true, warn);
      unique_ptr<GeneticCode> gCode(getGeneticCode(ca->getNucleicAlphabet(), codeDesc));
      // The unfiltered container is released automatically when leaving the function.
      return dynamic_cast<VectorSiteContainer*>(SiteContainerTools::removeStopCodonSites(*sitesToAnalyse, *gCode));
    }
  }

  // Give up ownership only if the container really is a VectorSiteContainer;
  // otherwise it is released here and a null pointer is returned.
  VectorSiteContainer* result = dynamic_cast<VectorSiteContainer*>(sitesToAnalyse.get());
  if (result)
    static_cast<void>(sitesToAnalyse.release());
  return result;
}