void SiteContainerTools::changeUnresolvedCharactersToGaps(SiteContainer& sites) { // NB: use iterators for a better algorithm? int gapCode = sites.getAlphabet()->getGapCharacterCode(); for (unsigned int i = 0; i < sites.getNumberOfSites(); i++) { for (unsigned int j = 0; j < sites.getNumberOfSequences(); j++) { int* element = &sites(j, i); if (sites.getAlphabet()->isUnresolved(*element)) *element = gapCode; } } }
/**
 * @brief Overwrite every gap of a container with the 'unknown' state.
 *
 * Each cell is tested with the isGap() method of the container's alphabet;
 * cells for which the test succeeds receive the alphabet's unknown character
 * code.
 *
 * @param sites The container to modify in place.
 */
void SiteContainerTools::changeGapsToUnknownCharacters(SiteContainer& sites)
{
  // NB: use iterators for a better algorithm?
  const int unknownCode = sites.getAlphabet()->getUnknownCharacterCode();
  const size_t nbSites = sites.getNumberOfSites();
  const size_t nbSequences = sites.getNumberOfSequences();

  for (unsigned int sitePos = 0; sitePos < nbSites; ++sitePos)
  {
    for (unsigned int seqPos = 0; seqPos < nbSequences; ++seqPos)
    {
      int& state = sites(seqPos, sitePos);
      if (sites.getAlphabet()->isGap(state))
        state = unknownCode;
    }
  }
}
void RecursiveLikelihoodTree::initLikelihoods(const SiteContainer& sites, const SubstitutionProcess& process) throw (Exception) { if (sites.getNumberOfSequences() == 1) throw Exception("RecursiveLikelihoodTree::initLikelihoods. Only 1 sequence in data set."); if (sites.getNumberOfSequences() == 0) throw Exception("RecursiveLikelihoodTree::initLikelihoods. No sequence in data set."); if (!process.isCompatibleWith(sites)) throw Exception("RecursiveLikelihoodTree::initLikelihoods. Data and model are not compatible."); alphabet_ = sites.getAlphabet(); nbStates_ = process.getNumberOfStates(); nbSites_ = sites.getNumberOfSites(); unique_ptr<SitePatterns> patterns; if (usePatterns_) { patterns.reset(initLikelihoodsWithPatterns_(process.getTree().getRootNode(), sites, process)); shrunkData_.reset(patterns->getSites()); rootWeights_ = patterns->getWeights(); rootPatternLinks_ = patterns->getIndices(); nbDistinctSites_ = shrunkData_->getNumberOfSites(); setPatterns(patternLinks_); } else { patterns.reset(new SitePatterns(&sites)); shrunkData_.reset(patterns->getSites()); rootWeights_ = patterns->getWeights(); rootPatternLinks_ = patterns->getIndices(); nbDistinctSites_ = shrunkData_->getNumberOfSites(); initLikelihoodsWithoutPatterns_(process.getTree().getRootNode(), *shrunkData_, process); } }
/*
 * Inheriting from SubstitutionProcess
 */

/**
 * @brief Tell whether a data set can be used with this process.
 *
 * When no model is registered in modelToNodes_, any data set is accepted.
 * Otherwise the alphabet type of the data is compared with the alphabet type
 * of the model referenced by the first entry of modelToNodes_.
 *
 * @param data The data set to check.
 * @return true if the data set is accepted.
 */
bool SubstitutionProcessCollectionMember::isCompatibleWith(const SiteContainer& data) const
{
  // Nothing to compare against: accept.
  if (modelToNodes_.empty())
    return true;

  return data.getAlphabet()->getAlphabetType() ==
         pSubProColl_->getModel(modelToNodes_.begin()->first).getAlphabet()->getAlphabetType();
}
/**
 * @brief Append the sites of a container to another one.
 *
 * Both containers must have alphabets of the same type. If the sequence names
 * of the two containers differ (in content or in order), the sequences of
 * seqCont2 are first selected by the names of seqCont1 into a scratch
 * container, and the sites of that scratch container are appended instead.
 *
 * @param seqCont1          The container that receives the sites.
 * @param seqCont2          The container whose sites are appended.
 * @param leavePositionAsIs If true, sites are added without passing a
 *                          position; otherwise each site is added with its own
 *                          position shifted by the number of sites seqCont1
 *                          had on entry.
 * @throw AlphabetMismatchException If the alphabet types differ.
 * @throw Exception Any exception raised while selecting sequences or adding
 *                  sites.
 */
void SiteContainerTools::merge(SiteContainer& seqCont1, const SiteContainer& seqCont2, bool leavePositionAsIs)
throw (AlphabetMismatchException, Exception)
{
  if (seqCont1.getAlphabet()->getAlphabetType() != seqCont2.getAlphabet()->getAlphabetType())
    throw AlphabetMismatchException("SiteContainerTools::merge.", seqCont1.getAlphabet(), seqCont2.getAlphabet());

  vector<string> seqNames1 = seqCont1.getSequencesNames();
  vector<string> seqNames2 = seqCont2.getSequencesNames();

  // Scratch container with automatic storage: it is released on every exit
  // path, including when one of the calls below throws. It stays empty when
  // no reordering is needed.
  VectorSiteContainer reordered(seqCont2.getAlphabet());
  const SiteContainer* source = &seqCont2;
  if (seqNames1 != seqNames2)
  {
    // We shall reorder sequences first:
    SequenceContainerTools::getSelectedSequences(seqCont2, seqNames1, reordered);
    source = &reordered;
  }

  const size_t nbSitesToAdd = source->getNumberOfSites();
  if (leavePositionAsIs)
  {
    for (size_t i = 0; i < nbSitesToAdd; ++i)
    {
      seqCont1.addSite(source->getSite(i), false);
    }
  }
  else
  {
    // Positions of the appended sites are shifted by the initial length.
    const int offset = static_cast<int>(seqCont1.getNumberOfSites());
    for (size_t i = 0; i < nbSitesToAdd; ++i)
    {
      seqCont1.addSite(source->getSite(i), offset + source->getSite(i).getPosition(), false);
    }
  }
}
/**
 * @brief Build the score object from a tree, a data set and a caller-supplied
 * state map.
 *
 * The tree is copied into a newly allocated TreeTemplate<Node>, the alphabet
 * is taken from the data set, the state map pointer is stored as given and
 * queried for the number of model states; the remaining set-up is delegated
 * to init_().
 *
 * NOTE(review): statesMap is dereferenced in the initializer list without a
 * null check, so a null pointer must not be passed. Who owns statesMap
 * afterwards is not visible from this definition - confirm against the class
 * declaration and destructor.
 *
 * @param tree      The tree to copy.
 * @param data      The data set; forwarded to init_().
 * @param statesMap The state map to use (must not be null).
 * @param verbose   Forwarded to init_().
 * @throw Exception Declared in the exception specification; presumably raised
 *                  by init_() - confirm.
 */
AbstractTreeParsimonyScore::AbstractTreeParsimonyScore(
  const Tree& tree,
  const SiteContainer& data,
  const StateMap* statesMap,
  bool verbose)
throw (Exception) :
  tree_(new TreeTemplate<Node>(tree)),
  data_(0), // Left null here; presumably filled in by init_() - confirm.
  alphabet_(data.getAlphabet()),
  statesMap_(statesMap),
  nbStates_(statesMap->getNumberOfModelStates())
{
  init_(data, verbose);
}
/**
 * @brief Build the score object from a tree and a data set, using a canonical
 * state map.
 *
 * The tree is copied into a newly allocated TreeTemplate<Node> and the
 * alphabet is taken from the data set. A CanonicalStateMap is then allocated
 * from that alphabet and the includeGaps flag, the number of model states is
 * read from it, and the remaining set-up is delegated to init_().
 *
 * NOTE(review): tree_ and statesMap_ are allocated with new before init_() is
 * called; whether they are released if init_() throws depends on the member
 * types, which are not visible here - confirm.
 *
 * @param tree        The tree to copy.
 * @param data        The data set; forwarded to init_().
 * @param verbose     Forwarded to init_().
 * @param includeGaps Forwarded to the CanonicalStateMap constructor.
 * @throw Exception Declared in the exception specification; presumably raised
 *                  by init_() - confirm.
 */
AbstractTreeParsimonyScore::AbstractTreeParsimonyScore(
  const Tree& tree,
  const SiteContainer& data,
  bool verbose,
  bool includeGaps)
throw (Exception) :
  tree_(new TreeTemplate<Node>(tree)),
  data_(0),
  alphabet_(data.getAlphabet()),
  statesMap_(0),
  nbStates_(0)
{
  // The state map is built in the body, once alphabet_ has been initialised.
  statesMap_ = new CanonicalStateMap(alphabet_, includeGaps);
  nbStates_ = statesMap_->getNumberOfModelStates();
  init_(data, verbose);
}
void SiteContainerTools::getSequencePositions(const SiteContainer& sites, Matrix<size_t>& positions) { positions.resize(sites.getNumberOfSequences(), sites.getNumberOfSites()); int gap = sites.getAlphabet()->getGapCharacterCode(); for (size_t i = 0; i < sites.getNumberOfSequences(); ++i) { const Sequence& seq = sites.getSequence(i); unsigned int pos = 0; for (size_t j = 0; j < sites.getNumberOfSites(); ++j) { if (seq[j] != gap) { ++pos; positions(i, j) = pos; } else { positions(i, j) = 0; } } } }
/**
 * @brief Read sequences from a stream and append them to a container.
 *
 * The first line of the stream is read and ignored. Every following line is
 * split at the first occurrence of the separator: the left part is the
 * sequence, from which white spaces and the characters { } [ ] ( ) ^ are
 * removed, and the right part, stripped of its leading white spaces, is the
 * name. Lines whose name contains "Helix numbering" or "mask" are skipped.
 * Reading stops at the end of the stream, at the first empty line returned by
 * FileTools::getNextLine(), or at the first line without separator.
 *
 * @param input The stream to read.
 * @param sc    The container that receives the sequences.
 * @throw IOException If the stream is not in a good state on entry.
 */
void DCSE::appendAlignmentFromStream(istream& input, SiteContainer& sc) const throw (Exception)
{
  // Checking the state of the stream
  if (!input)
  {
    throw IOException ("DCSE::read : fail to open file");
  }

  const Alphabet* alpha = sc.getAlphabet();

  // Annotation symbols stripped from the sequence part, in removal order.
  const string annotationMarks("{}[]()^");

  // First line ignored for now!
  string line = FileTools::getNextLine(input);

  // Main loop : for all file lines
  while (!input.eof())
  {
    line = FileTools::getNextLine(input);
    if (line == "")
      break;

    const string::size_type separator = line.find(" ");
    if (separator == string::npos)
      break;

    // Left part: the residues, cleaned from spaces and annotation symbols.
    string residues = TextTools::removeWhiteSpaces(line.substr(0, separator));
    for (string::size_type k = 0; k < annotationMarks.size(); ++k)
    {
      residues = TextTools::removeChar(residues, annotationMarks[k]);
    }

    // Right part: the name of the row.
    const string label = TextTools::removeFirstWhiteSpaces(line.substr(separator + 1));

    const bool isAnnotationRow = label.find("Helix numbering") != string::npos
                                 || label.find("mask") != string::npos;
    if (!isAnnotationRow)
      sc.addSequence(BasicSequence(label, residues, alpha), true);
  }
}
VectorSiteContainer* SequenceApplicationTools::getSitesToAnalyse( const SiteContainer& allSites, map<string, string>& params, string suffix, bool suffixIsOptional, bool gapAsUnknown, bool verbose, int warn) { // Fully resolved sites, i.e. without jokers and gaps: SiteContainer* sitesToAnalyse; VectorSiteContainer* sitesToAnalyse2; string option = ApplicationTools::getStringParameter("input.sequence.sites_to_use", params, "complete", suffix, suffixIsOptional, warn); if (verbose) ApplicationTools::displayResult("Sites to use", option); if (option == "all") { sitesToAnalyse = new VectorSiteContainer(allSites); string maxGapOption = ApplicationTools::getStringParameter("input.sequence.max_gap_allowed", params, "100%", suffix, suffixIsOptional, warn); if (maxGapOption[maxGapOption.size() - 1] == '%') { double gapFreq = TextTools::toDouble(maxGapOption.substr(0, maxGapOption.size() - 1)) / 100.; if (gapFreq < 1) { if (verbose) ApplicationTools::displayTask("Remove sites with gaps", true); for (size_t i = sitesToAnalyse->getNumberOfSites(); i > 0; --i) { if (verbose) ApplicationTools::displayGauge(sitesToAnalyse->getNumberOfSites() - i, sitesToAnalyse->getNumberOfSites() - 1, '='); map<int, double> freq; SiteTools::getFrequencies(sitesToAnalyse->getSite(i - 1), freq); if (freq[-1] > gapFreq) sitesToAnalyse->deleteSite(i - 1); } if (verbose) ApplicationTools::displayTaskDone(); } } else { size_t gapNum = TextTools::to<size_t>(maxGapOption); if (gapNum < sitesToAnalyse->getNumberOfSequences()) { if (verbose) ApplicationTools::displayTask("Remove sites with gaps", true); for (size_t i = sitesToAnalyse->getNumberOfSites(); i > 0; i--) { if (verbose) ApplicationTools::displayGauge(sitesToAnalyse->getNumberOfSites() - i, sitesToAnalyse->getNumberOfSites() - 1, '='); map<int, size_t> counts; SiteTools::getCounts(sitesToAnalyse->getSite(i - 1), counts); if (counts[-1] > gapNum) sitesToAnalyse->deleteSite(i - 1); } if (verbose) ApplicationTools::displayTaskDone(); } } string 
maxUnresolvedOption = ApplicationTools::getStringParameter("input.sequence.max_unresolved_allowed", params, "100%", suffix, suffixIsOptional, warn); int sAlph = static_cast<int>(sitesToAnalyse->getAlphabet()->getSize()); if (maxUnresolvedOption[maxUnresolvedOption.size() - 1] == '%') { double unresolvedFreq = TextTools::toDouble(maxUnresolvedOption.substr(0, maxUnresolvedOption.size() - 1)) / 100.; if (unresolvedFreq < 1) { if (verbose) ApplicationTools::displayTask("Remove unresolved sites", true); for (size_t i = sitesToAnalyse->getNumberOfSites(); i > 0; --i) { if (verbose) ApplicationTools::displayGauge(sitesToAnalyse->getNumberOfSites() - i, sitesToAnalyse->getNumberOfSites() - 1, '='); map<int, double> freq; SiteTools::getFrequencies(sitesToAnalyse->getSite(i - 1), freq); double x = 0; for (int l = 0; l < sAlph; ++l) { x += freq[l]; } if (1 - x > unresolvedFreq) sitesToAnalyse->deleteSite(i - 1); } if (verbose) ApplicationTools::displayTaskDone(); } } else { size_t nbSeq = sitesToAnalyse->getNumberOfSequences(); size_t unresolvedNum = TextTools::to<size_t>(maxUnresolvedOption); if (unresolvedNum < nbSeq) { if (verbose) ApplicationTools::displayTask("Remove sites with gaps", true); for (size_t i = sitesToAnalyse->getNumberOfSites(); i > 0; i--) { if (verbose) ApplicationTools::displayGauge(sitesToAnalyse->getNumberOfSites() - i, sitesToAnalyse->getNumberOfSites() - 1, '='); map<int, size_t> counts; SiteTools::getCounts(sitesToAnalyse->getSite(i - 1), counts); size_t x = 0; for (int l = 0; l < sAlph; l++) { x += counts[l]; } if (nbSeq - x > unresolvedNum) sitesToAnalyse->deleteSite(i - 1); } if (verbose) ApplicationTools::displayTaskDone(); } } if (gapAsUnknown) { SiteContainerTools::changeGapsToUnknownCharacters(*sitesToAnalyse); } } else if (option == "complete") { sitesToAnalyse = SiteContainerTools::getCompleteSites(allSites); size_t nbSites = sitesToAnalyse->getNumberOfSites(); if (verbose) ApplicationTools::displayResult("Complete sites", 
TextTools::toString(nbSites)); } else if (option == "nogap") { sitesToAnalyse = SiteContainerTools::getSitesWithoutGaps(allSites); size_t nbSites = sitesToAnalyse->getNumberOfSites(); if (verbose) ApplicationTools::displayResult("Sites without gap", TextTools::toString(nbSites)); } else { throw Exception("Option '" + option + "' unknown in parameter 'sequence.sites_to_use'."); } const CodonAlphabet* ca = dynamic_cast<const CodonAlphabet*>(sitesToAnalyse->getAlphabet()); if (ca) { option = ApplicationTools::getStringParameter("input.sequence.remove_stop_codons", params, "no", suffix, true, warn); if ((option != "") && verbose) ApplicationTools::displayResult("Remove Stop Codons", option); if (option == "yes") { string codeDesc = ApplicationTools::getStringParameter("genetic_code", params, "Standard", "", true, warn); unique_ptr<GeneticCode> gCode(getGeneticCode(ca->getNucleicAlphabet(), codeDesc)); sitesToAnalyse2 = dynamic_cast<VectorSiteContainer*>(SiteContainerTools::removeStopCodonSites(*sitesToAnalyse, *gCode)); delete sitesToAnalyse; } else sitesToAnalyse2 = dynamic_cast<VectorSiteContainer*>(sitesToAnalyse); } else sitesToAnalyse2 = dynamic_cast<VectorSiteContainer*>(sitesToAnalyse); return sitesToAnalyse2; }
void Clustal::appendAlignmentFromStream(std::istream& input, SiteContainer & sc) const throw (Exception) { // Checking the existence of specified file if (!input) { throw IOException ("Clustal::read : fail to open file"); } const Alphabet * alpha = sc.getAlphabet(); vector<BasicSequence> sequences; string lineRead(""); Comments comments(1); comments[0] = FileTools::getNextLine(input); // First line gives file generator. lineRead = FileTools::getNextLine(input); // This is the first sequence of the first block. string::size_type beginSeq = 0; unsigned int count = 0; for (size_t i = lineRead.size(); i > 0; i--) { char c = lineRead[i-1]; if (c == ' ') { count++; if (count == nbSpacesBeforeSeq_) { beginSeq = i - 1 + nbSpacesBeforeSeq_; break; } } else count = 0; } if (beginSeq == 0) throw IOException("Clustal::read. Bad intput file."); unsigned int countSequences = 0; //Read first sequences block: bool test = true; do { sequences.push_back(BasicSequence(TextTools::removeSurroundingWhiteSpaces(lineRead.substr(0, beginSeq - nbSpacesBeforeSeq_)), lineRead.substr(beginSeq), alpha)); getline(input, lineRead, '\n'); countSequences++; test = !TextTools::isEmpty(lineRead) && !TextTools::isEmpty(lineRead.substr(0, beginSeq - nbSpacesBeforeSeq_)); } while (input && test); // Read other blocks lineRead = FileTools::getNextLine(input); // Read first sequence of next block. while (!TextTools::isEmpty(lineRead)) { // Read next block: for (unsigned int i = 0; i < countSequences; ++i) { // Complete sequences if (TextTools::isEmpty(lineRead)) throw IOException("Clustal::read. Bad intput file."); sequences[i].append(lineRead.substr(beginSeq)); getline(input, lineRead, '\n'); } //At this point, lineRead is the first line after the current block. lineRead = FileTools::getNextLine(input); } for (unsigned int i = 0; i < countSequences; ++i) sc.addSequence(sequences[i], checkNames_); sc.setGeneralComments(comments); }