double SequenceTools::getPercentIdentity(const Sequence& seq1, const Sequence& seq2, bool ignoreGaps) throw (AlphabetMismatchException, SequenceNotAlignedException) { if (seq1.getAlphabet()->getAlphabetType() != seq2.getAlphabet()->getAlphabetType()) throw AlphabetMismatchException("SequenceTools::getPercentIdentity", seq1.getAlphabet(), seq2.getAlphabet()); if (seq1.size() != seq2.size()) throw SequenceNotAlignedException("SequenceTools::getPercentIdentity", &seq2); int gap = seq1.getAlphabet()->getGapCharacterCode(); size_t id = 0; size_t tot = 0; for (size_t i = 0; i < seq1.size(); i++) { int x = seq1.getValue(i); int y = seq2.getValue(i); if (ignoreGaps) { if (x != gap && y != gap) { tot++; if (x == y) id++; } } else { tot++; if (x == y) id++; } } return static_cast<double>(id) / static_cast<double>(tot) * 100.; }
double SiteContainerTools::computeSimilarity(const Sequence& seq1, const Sequence& seq2, bool dist, const std::string& gapOption, bool unresolvedAsGap) throw (SequenceNotAlignedException, AlphabetMismatchException, Exception) { if (seq1.size() != seq2.size()) throw SequenceNotAlignedException("SiteContainerTools::computeSimilarity.", &seq2); if (seq1.getAlphabet()->getAlphabetType() != seq2.getAlphabet()->getAlphabetType()) throw AlphabetMismatchException("SiteContainerTools::computeSimilarity.", seq1.getAlphabet(), seq2.getAlphabet()); const Alphabet* alpha = seq1.getAlphabet(); unsigned int s = 0; unsigned int t = 0; for (size_t i = 0; i < seq1.size(); i++) { int x = seq1[i]; int y = seq2[i]; int gapCode = alpha->getGapCharacterCode(); if (unresolvedAsGap) { if (alpha->isUnresolved(x)) x = gapCode; if (alpha->isUnresolved(y)) y = gapCode; } if (gapOption == SIMILARITY_ALL) { t++; if (x == y && !alpha->isGap(x) && !alpha->isGap(y)) s++; } else if (gapOption == SIMILARITY_NODOUBLEGAP) { if (!alpha->isGap(x) || !alpha->isGap(y)) { t++; if (x == y) s++; } } else if (gapOption == SIMILARITY_NOGAP) { if (!alpha->isGap(x) && !alpha->isGap(y)) { t++; if (x == y) s++; } } else throw Exception("SiteContainerTools::computeSimilarity. Invalid gap option: " + gapOption); } double r = (t == 0 ? 0. : static_cast<double>(s) / static_cast<double>(t)); return dist ? 1 - r : r; }
size_t SequenceTools::findFirstOf(const Sequence& seq, const Sequence& motif, bool strict) { if (motif.size() > seq.size()) return seq.size(); for (size_t seqi = 0; seqi < seq.size() - motif.size() + 1; seqi++) { bool match = false; for (size_t moti = 0; moti < motif.size(); moti++) { if (strict) { match = seq.getValue(seqi + moti) == motif.getValue(moti); } else { match = AlphabetTools::match(seq.getAlphabet(), seq.getValue(seqi + moti), motif.getValue(moti)); } if (!match) { break; } } if (match) { return seqi; } } return seq.size(); }
/**
 * @brief Trim a codon sequence in place to its coding part.
 *
 * @param sequence    The codon sequence to trim; it is modified.
 * @param checkInit   If true, everything before the first init codon is removed.
 * @param checkStop   If true, everything after the first stop codon is removed
 *                    (searched after the init trimming, if any).
 * @param includeInit If true, the init codon itself is kept; otherwise it is removed too.
 * @param includeStop If true, the stop codon itself is kept; otherwise it is removed too.
 * @throw AlphabetException If the sequence does not use a codon alphabet.
 *
 * If checkInit is set and no init codon is found, the whole sequence is removed.
 * If checkStop is set and no stop codon is found, nothing is removed at the end.
 */
void SequenceTools::getCDS(Sequence& sequence, bool checkInit, bool checkStop, bool includeInit, bool includeStop)
{
  const CodonAlphabet* alphabet = dynamic_cast<const CodonAlphabet*>(sequence.getAlphabet());
  if (!alphabet)
    throw AlphabetException("SequenceTools::getCDS. Sequence is not a codon sequence.");

  if (checkInit)
  {
    size_t initPos = 0;
    while (initPos < sequence.size() && !alphabet->isInit(sequence[initPos]))
      ++initPos;

    // Number of leading elements to drop, clamped so that a missing init codon
    // (initPos == size) never asks for more elements than exist.
    size_t nbToRemove = includeInit ? initPos : initPos + 1;
    if (nbToRemove > sequence.size())
      nbToRemove = sequence.size();

    // Delete from the highest index down: indices of the elements still to be
    // removed are not shifted by the deletions already done.
    for (size_t k = nbToRemove; k > 0; --k)
    {
      sequence.deleteElement(k - 1);
    }
  }

  if (checkStop)
  {
    size_t stopPos = 0;
    while (stopPos < sequence.size() && !alphabet->isStop(sequence[stopPos]))
      ++stopPos;

    // Everything at index >= newSize is dropped, always removing the last element
    // so that no remaining index is shifted.
    const size_t newSize = includeStop ? stopPos + 1 : stopPos;
    while (sequence.size() > newSize)
    {
      sequence.deleteElement(sequence.size() - 1);
    }
  }
}
unsigned int SequenceFeatureTools::getOrfs(const Sequence& seq, SequenceFeatureSet& featSet, const GeneticCode& gCode) { if (! AlphabetTools::isNucleicAlphabet(seq.getAlphabet())) { throw AlphabetException("SequenceFeatureTools::getOrfs: Sequence alphabet must be nucleic!", seq.getAlphabet()); } unsigned int orfCpt = 0; const CodonAlphabet* codonAlpha = gCode.getSourceAlphabet(); std::vector< std::vector<size_t> > starts(3), stops(3); size_t phase = 0; for (size_t p = 0 ; p < seq.size() - 2 ; p++) { phase = p % 3; if (gCode.isStart(codonAlpha->getCodon(seq.getValue(p), seq.getValue(p + 1), seq.getValue(p + 2)))) { starts[phase].push_back(p); //std::cerr << "Start: " << p << " (" << phase << ")" << std::endl; } else if (gCode.isStop(codonAlpha->getCodon(seq.getValue(p), seq.getValue(p + 1), seq.getValue(p + 2)))) { stops[phase].push_back(p); //std::cerr << "Stop: " << p << " (" << phase << ")" << std::endl; } } for (size_t i = 0 ; i < 3 ; ++i) { std::vector< size_t >::iterator start(starts[i].begin()), stop(stops[i].begin()); while (stop != stops[i].end() && start != starts[i].end()) { if (*stop < *start) { stop++; } else { orfCpt++; //std::cerr << "ORF: " << *start << " - " << *stop + 2 << " (" << i << ")" << std::endl; bpp::BasicSequenceFeature feat("", seq.getName(), "Bio++", "CDS", *start, *stop + 2, '+'); featSet.addFeature(feat); start++; } } } return orfCpt; }
void BasicSequence::append(const Sequence& seq) throw (AlphabetMismatchException) { if (seq.getAlphabet()->getAlphabetType() != getAlphabet()->getAlphabetType()) throw AlphabetMismatchException("BasicSequence::append"); // Check list for incorrect characters for (size_t i = 0; i < seq.size(); i++) content_.push_back(seq[i]); }
/**
 * @brief Remove every gap from a sequence, in place.
 *
 * @param seq The sequence to modify.
 */
void SequenceTools::removeGaps(Sequence& seq)
{
  const Alphabet* alphabet = seq.getAlphabet();
  // Walk from the last position to the first so that deleting an element
  // does not shift the positions still to be visited.
  size_t pos = seq.size();
  while (pos > 0)
  {
    --pos;
    if (alphabet->isGap(seq[pos]))
    {
      seq.deleteElement(pos);
    }
  }
}
/**
 * @brief Remove every stop codon from a codon sequence, in place.
 *
 * @param seq The sequence to modify.
 * @throw Exception If the sequence does not use a codon alphabet.
 */
void SequenceTools::removeStops(Sequence& seq) throw (Exception)
{
  const CodonAlphabet* codonAlphabet = dynamic_cast<const CodonAlphabet*>(seq.getAlphabet());
  if (!codonAlphabet)
    throw Exception("SequenceTools::removeStops. Input sequence should have a codon alphabet.");

  // Walk from the last position to the first so that deleting an element
  // does not shift the positions still to be visited.
  size_t pos = seq.size();
  while (pos > 0)
  {
    --pos;
    if (codonAlphabet->isStop(seq[pos]))
    {
      seq.deleteElement(pos);
    }
  }
}
size_t SequenceTools::getNumberOfUnresolvedSites(const Sequence& seq) { size_t count = 0; const Alphabet* alpha = seq.getAlphabet(); for (size_t i = 0; i < seq.size(); i++) { if (alpha->isUnresolved(seq[i])) count++; } return count; }
void VectorSiteContainer::setSequence(size_t pos, const Sequence& sequence, bool checkNames) throw (Exception) { if (pos >= getNumberOfSequences()) throw IndexOutOfBoundsException("VectorSiteContainer::setSequence", pos, 0, getNumberOfSequences() - 1); // New sequence's alphabet and site container's alphabet matching verification if (sequence.getAlphabet()->getAlphabetType() != getAlphabet()->getAlphabetType()) throw AlphabetMismatchException("VectorSiteContainer::addSite", getAlphabet(), sequence.getAlphabet()); // If the container has only one sequence, we set the size to the size of this sequence: if (getNumberOfSequences() == 1) realloc(sequence.size()); if (sequence.size() != sites_.size()) throw SequenceException("VectorSiteContainer::setSequence. Sequence has not the appropriate length.", &sequence); if (checkNames) { for (size_t i = 0; i < names_.size(); i++) { if (i != pos && sequence.getName() == names_[i]) throw SequenceException("VectorSiteContainer::settSequence. Name already exists in container.", &sequence); } } // Update name: names_[pos] = sequence.getName(); // Update elements at each site: for (size_t i = 0; i < sites_.size(); i++) { sites_[i]->setElement(pos, sequence.getValue(i)); } // Update comments: if (comments_[pos]) delete comments_[pos]; comments_[pos] = new Comments(sequence.getComments()); // Update sequences: if (sequences_[pos]) delete sequences_[pos]; sequences_[pos] = 0; }
/**
 * @brief Replace every stop codon of a codon sequence by the gap code, in place.
 *
 * @param seq The sequence to modify.
 * @throw Exception If the sequence does not use a codon alphabet.
 */
void SequenceTools::replaceStopsWithGaps(Sequence& seq) throw (Exception)
{
  const CodonAlphabet* codonAlphabet = dynamic_cast<const CodonAlphabet*>(seq.getAlphabet());
  if (!codonAlphabet)
    throw Exception("SequenceTools::replaceStopsWithGaps. Input sequence should have a codon alphabet.");

  int gapCode = codonAlphabet->getGapCharacterCode();
  size_t pos = 0;
  while (pos < seq.size())
  {
    if (codonAlphabet->isStop(seq[pos]))
    {
      seq.setElement(pos, gapCode);
    }
    ++pos;
  }
}
void SequenceTools::getPutativeHaplotypes(const Sequence& seq, std::vector<Sequence*>& hap, unsigned int level) { vector< vector< int > > states(seq.size()); list<Sequence*> t_hap; const Alphabet* alpha = seq.getAlphabet(); unsigned int hap_count = 1; // Vector of available states at each position for (size_t i = 0; i < seq.size(); i++) { vector<int> st = alpha->getAlias(seq[i]); if (!st.size()) { st.push_back(alpha->getGapCharacterCode()); } if (st.size() <= level) { states[i] = st; } else { states[i] = vector<int>(1, seq[i]); } } // Combinatorial haplotypes building (the use of tree may be more accurate) t_hap.push_back(new BasicSequence(seq.getName() + "_hap" + TextTools::toString(hap_count++), "", alpha)); for (size_t i = 0; i < states.size(); i++) { for (list<Sequence*>::iterator it = t_hap.begin(); it != t_hap.end(); it++) { for (unsigned int j = 0; j < states[i].size(); j++) { Sequence* tmp_seq = new BasicSequence(seq.getName() + "_hap", (**it).getContent(), alpha); if (j < states[i].size() - 1) { tmp_seq->setName(tmp_seq->getName() + TextTools::toString(hap_count++)); tmp_seq->addElement(states[i][j]); t_hap.insert(it, tmp_seq); } else { (**it).addElement(states[i][j]); } } } } for (list<Sequence*>::reverse_iterator it = t_hap.rbegin(); it != t_hap.rend(); it++) { hap.push_back(*it); } }
void VectorSiteContainer::addSequence( const Sequence& sequence, size_t pos, bool checkNames) throw (Exception) { if (pos >= getNumberOfSequences()) throw IndexOutOfBoundsException("VectorSiteContainer::addSequence.", pos, 0, getNumberOfSequences() - 1); if (sequence.size() != sites_.size()) throw SequenceNotAlignedException("VectorSiteContainer::setSequence", &sequence); // New sequence's alphabet and site container's alphabet matching verification if (sequence.getAlphabet()->getAlphabetType() != getAlphabet()->getAlphabetType()) { throw AlphabetMismatchException("VectorSiteContainer::addSite", getAlphabet(), sequence.getAlphabet()); } if (checkNames) { for (size_t i = 0; i < names_.size(); i++) { if (sequence.getName() == names_[i]) throw SequenceException("VectorSiteContainer::addSequence. Name already exists in container.", &sequence); } } for (size_t i = 0; i < sites_.size(); i++) { // For each site: sites_[i]->addElement(pos, sequence.getValue(i)); } // Actualize names and comments: names_.insert(names_.begin() + pos, sequence.getName()); comments_.insert(comments_.begin() + pos, new Comments(sequence.getComments())); sequences_.insert(sequences_.begin() + pos, 0); }
void VectorSiteContainer::addSequence(const Sequence& sequence, bool checkNames) throw (Exception) { // If the container has no sequence, we set the size to the size of this sequence: if (getNumberOfSequences() == 0) realloc(sequence.size()); // New sequence's alphabet and site container's alphabet matching verification if (sequence.getAlphabet()->getAlphabetType() != getAlphabet()->getAlphabetType()) throw AlphabetMismatchException("VectorSiteContainer::addSequence", getAlphabet(), sequence.getAlphabet()); if (sequence.size() != sites_.size()) throw SequenceException("VectorSiteContainer::addSequence. Sequence has not the appropriate length: " + TextTools::toString(sequence.size()) + ", should be " + TextTools::toString(sites_.size()) + ".", &sequence); if (checkNames) { for (size_t i = 0; i < names_.size(); i++) { if (sequence.getName() == names_[i]) throw SequenceException("VectorSiteContainer::addSequence. Name already exists in container.", &sequence); } } // Append name: names_.push_back(sequence.getName()); // Append elements at each site: for (size_t i = 0; i < sites_.size(); i++) { sites_[i]->addElement(sequence.getValue(i)); } // Append comments: comments_.push_back(new Comments(sequence.getComments())); // Sequences pointers: sequences_.push_back(0); }
/**
 * @brief Build a map from positions in seq1 to the matching positions in seq2.
 *
 * Both sequences are walked in parallel. States equal to -1 are skipped on either
 * side (presumably -1 is the gap code of the alphabet -- TODO confirm), and the
 * remaining states must be equal pairwise. Positions in the map start at 1.
 *
 * @param seq1 The first sequence; its positions are the keys of the map.
 * @param seq2 The second sequence; its positions are the values of the map.
 * @return The position map; empty if seq1 is empty.
 * @throw AlphabetMismatchException If the two alphabets have different types.
 * @throw Exception If seq2 is empty while seq1 is not, if two compared states
 *                  differ, or if seq1 still holds a state other than -1 after
 *                  the last position of seq2 has been consumed.
 */
std::map<size_t, size_t> SiteContainerTools::translateAlignment(const Sequence& seq1, const Sequence& seq2) throw (AlphabetMismatchException, Exception)
{
  if (seq1.getAlphabet()->getAlphabetType() != seq2.getAlphabet()->getAlphabetType())
    throw AlphabetMismatchException("SiteContainerTools::translateAlignment", seq1.getAlphabet(), seq2.getAlphabet());
  map<size_t, size_t> tln;
  if (seq1.size() == 0)
    return tln;
  // Current 0-based positions in seq1 and seq2.
  unsigned int count1 = 0;
  unsigned int count2 = 0;
  if (seq2.size() == 0)
    throw Exception("SiteContainerTools::translateAlignment. Sequences do not match at position " + TextTools::toString(count1 + 1) + " and " + TextTools::toString(count2 + 1) + ".");
  int state1 = seq1[count1];
  int state2 = seq2[count2];
  bool end = false;
  while (!end)
  {
    // Skip the -1 states of seq1; if the end is reached, state1 stays at -1
    // and count1 equals seq1.size().
    while (state1 == -1)
    {
      count1++;
      if (count1 < seq1.size())
        state1 = seq1[count1];
      else
        break;
    }
    // Same for seq2.
    while (state2 == -1)
    {
      count2++;
      if (count2 < seq2.size())
        state2 = seq2[count2];
      else
        break;
    }
    if (state1 != state2)
      throw Exception("SiteContainerTools::translateAlignment. Sequences do not match at position " + TextTools::toString(count1 + 1) + " and " + TextTools::toString(count2 + 1) + ".");
    // NOTE(review): if both skip loops above ran past the end (both sequences
    // finish with -1 states), state1 == state2 == -1 here, so an entry beyond the
    // sequence sizes seems to be recorded and the final else branch below would
    // index seq1 and seq2 out of range -- confirm and guard.
    tln[count1 + 1] = count2 + 1; // Count start at 1
    if (count1 == seq1.size() - 1)
      end = true;
    else
    {
      if (count2 == seq2.size() - 1)
      {
        // seq2 is exhausted: the rest of seq1 may only contain -1 states.
        state1 = seq1[++count1];
        while (state1 == -1)
        {
          count1++;
          if (count1 < seq1.size())
            state1 = seq1[count1];
          else
            break;
        }
        if (state1 == -1)
          end = true;
        else
          throw Exception("SiteContainerTools::translateAlignment. Sequences do not match at position " + TextTools::toString(count1 + 1) + " and " + TextTools::toString(count2 + 1) + ".");
      }
      else
      {
        // Move to the next position in both sequences.
        state1 = seq1[++count1];
        state2 = seq2[++count2];
      }
    }
  }
  return tln;
}