/**
 * @brief Percentage of compared positions at which two sequences carry the same state.
 *
 * @param seq1       First sequence.
 * @param seq2       Second sequence; must have the same length and alphabet type as seq1.
 * @param ignoreGaps If true, a position is left out of both the numerator and the
 *                   denominator as soon as either sequence holds the gap code of
 *                   seq1's alphabet there.
 * @return 100 * identical / compared, or 0 when no position was compared (empty
 *         sequences, or every position skipped because of gaps).
 * @throw AlphabetMismatchException   If the alphabet types differ.
 * @throw SequenceNotAlignedException If the sequences differ in length.
 */
double SequenceTools::getPercentIdentity(const Sequence& seq1, const Sequence& seq2, bool ignoreGaps) throw (AlphabetMismatchException, SequenceNotAlignedException)
{
  if (seq1.getAlphabet()->getAlphabetType() != seq2.getAlphabet()->getAlphabetType())
    throw AlphabetMismatchException("SequenceTools::getPercentIdentity", seq1.getAlphabet(), seq2.getAlphabet());
  if (seq1.size() != seq2.size())
    throw SequenceNotAlignedException("SequenceTools::getPercentIdentity", &seq2);

  const int gap = seq1.getAlphabet()->getGapCharacterCode();
  const size_t length = seq1.size();
  size_t id = 0;
  size_t tot = 0;
  for (size_t i = 0; i < length; ++i)
  {
    const int x = seq1.getValue(i);
    const int y = seq2.getValue(i);
    // With ignoreGaps, a column containing at least one gap is not compared at all.
    if (ignoreGaps && (x == gap || y == gap))
      continue;
    ++tot;
    if (x == y)
      ++id;
  }

  // Nothing to compare: report 0 instead of the NaN produced by 0/0.
  if (tot == 0)
    return 0.;
  return static_cast<double>(id) / static_cast<double>(tot) * 100.;
}
double SiteContainerTools::computeSimilarity(const Sequence& seq1, const Sequence& seq2, bool dist, const std::string& gapOption, bool unresolvedAsGap) throw (SequenceNotAlignedException, AlphabetMismatchException, Exception)
{
  if (seq1.size() != seq2.size())
    throw SequenceNotAlignedException("SiteContainerTools::computeSimilarity.", &seq2);
  if (seq1.getAlphabet()->getAlphabetType() != seq2.getAlphabet()->getAlphabetType())
    throw AlphabetMismatchException("SiteContainerTools::computeSimilarity.", seq1.getAlphabet(), seq2.getAlphabet());

  const Alphabet* alpha = seq1.getAlphabet();
  unsigned int s = 0;
  unsigned int t = 0;
  for (size_t i = 0; i < seq1.size(); i++)
  {
    int x = seq1[i];
    int y = seq2[i];
    int gapCode = alpha->getGapCharacterCode();
    if (unresolvedAsGap)
    {
      if (alpha->isUnresolved(x))
        x = gapCode;
      if (alpha->isUnresolved(y))
        y = gapCode;
    }
    if (gapOption == SIMILARITY_ALL)
    {
      t++;
      if (x == y && !alpha->isGap(x) && !alpha->isGap(y))
        s++;
    }
    else if (gapOption == SIMILARITY_NODOUBLEGAP)
    {
      if (!alpha->isGap(x) || !alpha->isGap(y))
      {
        t++;
        if (x == y)
          s++;
      }
    }
    else if (gapOption == SIMILARITY_NOGAP)
    {
      if (!alpha->isGap(x) && !alpha->isGap(y))
      {
        t++;
        if (x == y)
          s++;
      }
    }
    else
      throw Exception("SiteContainerTools::computeSimilarity. Invalid gap option: " + gapOption);
  }
  double r = (t == 0 ? 0. : static_cast<double>(s) / static_cast<double>(t));
  return dist ? 1 - r : r;
}
/**
 * @brief Position of the first occurrence of a motif within a sequence.
 *
 * @param seq    Sequence to search.
 * @param motif  Motif to look for.
 * @param strict If true, states must be equal; otherwise they are compared with
 *               AlphabetTools::match using seq's alphabet.
 * @return Index in seq where the first occurrence starts, or seq.size() when there
 *         is none. An empty motif is never reported as found.
 */
size_t SequenceTools::findFirstOf(const Sequence& seq, const Sequence& motif, bool strict)
{
  const size_t seqLength = seq.size();
  const size_t motifLength = motif.size();

  // A motif longer than the sequence cannot occur; an empty one yields "not found" too.
  if (motifLength > seqLength || motifLength == 0)
    return seqLength;

  const size_t lastStart = seqLength - motifLength;
  for (size_t start = 0; start <= lastStart; ++start)
  {
    // Count how many leading motif states agree with the sequence at this offset.
    size_t agreed = 0;
    while (agreed < motifLength)
    {
      const int seqState = seq.getValue(start + agreed);
      const int motifState = motif.getValue(agreed);
      const bool same = strict
        ? (seqState == motifState)
        : AlphabetTools::match(seq.getAlphabet(), seqState, motifState);
      if (!same)
        break;
      ++agreed;
    }
    if (agreed == motifLength)
      return start;
  }
  return seqLength;
}
/**
 * @brief Trim a codon sequence in place down to its coding part.
 *
 * @param sequence    Codon sequence to trim; modified in place.
 * @param checkInit   If true, everything before the first init codon is removed.
 *                    When no init codon is present the whole sequence is removed.
 * @param checkStop   If true, everything after the first stop codon (searched after
 *                    the init trimming) is removed. Without a stop codon nothing is
 *                    trimmed on that side.
 * @param includeInit If true the init codon itself is kept, otherwise it is removed too.
 * @param includeStop If true the stop codon itself is kept, otherwise it is removed too.
 * @throw AlphabetException If the sequence does not use a codon alphabet.
 */
void SequenceTools::getCDS(Sequence& sequence, bool checkInit, bool checkStop, bool includeInit, bool includeStop)
{
  const CodonAlphabet* alphabet = dynamic_cast<const CodonAlphabet*>(sequence.getAlphabet());
  if (!alphabet)
    throw AlphabetException("SequenceTools::getCDS. Sequence is not a codon sequence.");

  if (checkInit)
  {
    size_t init = 0;
    while (init < sequence.size() && !alphabet->isInit(sequence[init]))
      ++init;

    // Number of leading codons to drop, capped so we never delete past the end
    // (init == size when there is no init codon).
    size_t toRemove = includeInit ? init : init + 1;
    if (toRemove > sequence.size())
      toRemove = sequence.size();

    // Each deletion shifts the remaining codons left, so always delete the front one.
    for (size_t k = 0; k < toRemove; ++k)
    {
      sequence.deleteElement(0);
    }
  }

  if (checkStop)
  {
    size_t stop = 0;
    while (stop < sequence.size() && !alphabet->isStop(sequence[stop]))
      ++stop;

    // Codons [0, keep) stay; delete from the back so indices remain valid.
    const size_t keep = includeStop ? stop + 1 : stop;
    for (size_t j = sequence.size(); j > keep; --j)
    {
      sequence.deleteElement(j - 1);
    }
  }
}
/**
 * @brief Detect open reading frames on the forward strand of a nucleic sequence.
 *
 * Every position is read as the first base of a codon and classified as start or
 * stop according to the genetic code. Within each of the three phases, each start
 * is paired with the first stop located at or after it, and a "CDS" feature
 * spanning from the start position to the last base of the stop codon is added.
 * Several starts preceding the same stop each produce their own feature.
 *
 * @param seq     Nucleic sequence to scan.
 * @param featSet Feature set receiving one feature per ORF.
 * @param gCode   Genetic code used to recognise start and stop codons.
 * @return Number of features added.
 * @throw AlphabetException If the sequence alphabet is not nucleic.
 */
unsigned int SequenceFeatureTools::getOrfs(const Sequence& seq, SequenceFeatureSet& featSet, const GeneticCode& gCode)
{
  if (! AlphabetTools::isNucleicAlphabet(seq.getAlphabet())) {
    throw AlphabetException("SequenceFeatureTools::getOrfs: Sequence alphabet must be nucleic!", seq.getAlphabet());
  }
  unsigned int orfCpt = 0;
  const CodonAlphabet* codonAlpha = gCode.getSourceAlphabet();
  std::vector< std::vector<size_t> > starts(3), stops(3);

  // "p + 2 < length" instead of "p < length - 2": the latter wraps around for
  // sequences shorter than two bases because the length is unsigned.
  const size_t length = seq.size();
  for (size_t p = 0 ; p + 2 < length ; p++) {
    const size_t phase = p % 3;
    const int codon = codonAlpha->getCodon(seq.getValue(p), seq.getValue(p + 1), seq.getValue(p + 2));
    if (gCode.isStart(codon)) {
      starts[phase].push_back(p);
    } else if (gCode.isStop(codon)) {
      stops[phase].push_back(p);
    }
  }

  // Positions were collected in increasing order, so a linear merge pairs each
  // start with the nearest downstream stop of the same phase.
  for (size_t i = 0 ; i < 3 ; ++i) {
    std::vector< size_t >::iterator start(starts[i].begin()), stop(stops[i].begin());
    while (stop != stops[i].end() && start != starts[i].end()) {
      if (*stop < *start) {
        ++stop;
      } else {
        orfCpt++;
        // *stop is the first base of the stop codon; + 2 reaches its last base.
        bpp::BasicSequenceFeature feat("", seq.getName(), "Bio++", "CDS", *start, *stop + 2, '+');
        featSet.addFeature(feat);
        ++start;
      }
    }
  }
  return orfCpt;
}
Exemple #6
0
/**
 * @brief Append the states of another sequence at the end of this one.
 *
 * The states are copied as they are; only the alphabet types are compared, no
 * per-state validation is performed.
 *
 * @param seq Sequence whose content is appended. May be this very sequence.
 * @throw AlphabetMismatchException If the alphabet types differ.
 */
void BasicSequence::append(const Sequence& seq) throw (AlphabetMismatchException)
{
  if (seq.getAlphabet()->getAlphabetType() != getAlphabet()->getAlphabetType())
    throw AlphabetMismatchException("BasicSequence::append", getAlphabet(), seq.getAlphabet());

  // Fix the number of states to copy up front: when seq is this sequence, its
  // size grows with every push_back and a live bound would never be reached.
  const size_t toCopy = seq.size();
  for (size_t i = 0; i < toCopy; i++)
  {
    const int state = seq[i];
    content_.push_back(state);
  }
}
/**
 * @brief Delete every gap state from a sequence, in place.
 *
 * @param seq Sequence to clean; gaps are recognised with its own alphabet.
 */
void SequenceTools::removeGaps(Sequence& seq)
{
  const Alphabet* alphabet = seq.getAlphabet();
  // Visit positions from last to first so a deletion never shifts a position
  // that is still to be examined.
  size_t pos = seq.size();
  while (pos > 0)
  {
    --pos;
    if (alphabet->isGap(seq[pos]))
    {
      seq.deleteElement(pos);
    }
  }
}
/**
 * @brief Delete every stop codon from a codon sequence, in place.
 *
 * @param seq Sequence to clean.
 * @throw Exception If the sequence does not use a codon alphabet.
 */
void SequenceTools::removeStops(Sequence& seq) throw (Exception)
{
  const CodonAlphabet* codonAlphabet = dynamic_cast<const CodonAlphabet*>(seq.getAlphabet());
  if (codonAlphabet == 0)
    throw Exception("SequenceTools::removeStops. Input sequence should have a codon alphabet.");

  // Visit positions from last to first so a deletion never shifts a position
  // that is still to be examined.
  size_t pos = seq.size();
  while (pos > 0)
  {
    --pos;
    if (codonAlphabet->isStop(seq[pos]))
    {
      seq.deleteElement(pos);
    }
  }
}
size_t SequenceTools::getNumberOfUnresolvedSites(const Sequence& seq)
{
  size_t count = 0;
  const Alphabet* alpha = seq.getAlphabet();
  for (size_t i = 0; i < seq.size(); i++)
  {
    if (alpha->isUnresolved(seq[i]))
      count++;
  }
  return count;
}
void VectorSiteContainer::setSequence(size_t pos, const Sequence& sequence, bool checkNames)
throw (Exception)
{
  if (pos >= getNumberOfSequences())
    throw IndexOutOfBoundsException("VectorSiteContainer::setSequence", pos, 0, getNumberOfSequences() - 1);

  // New sequence's alphabet and site container's alphabet matching verification
  if (sequence.getAlphabet()->getAlphabetType() != getAlphabet()->getAlphabetType())
    throw AlphabetMismatchException("VectorSiteContainer::addSite", getAlphabet(), sequence.getAlphabet());

  // If the container has only one sequence, we set the size to the size of this sequence:
  if (getNumberOfSequences() == 1)
    realloc(sequence.size());

  if (sequence.size() != sites_.size())
    throw SequenceException("VectorSiteContainer::setSequence. Sequence has not the appropriate length.", &sequence);

  if (checkNames)
  {
    for (size_t i = 0; i < names_.size(); i++)
    {
      if (i != pos && sequence.getName() == names_[i])
        throw SequenceException("VectorSiteContainer::settSequence. Name already exists in container.", &sequence);
    }
  }
  // Update name:
  names_[pos] = sequence.getName();
  // Update elements at each site:
  for (size_t i = 0; i < sites_.size(); i++)
  {
    sites_[i]->setElement(pos, sequence.getValue(i));
  }
  // Update comments:
  if (comments_[pos])
    delete comments_[pos];
  comments_[pos] = new Comments(sequence.getComments());
  // Update sequences:
  if (sequences_[pos])
    delete sequences_[pos];
  sequences_[pos] = 0;
}
/**
 * @brief Overwrite every stop codon of a codon sequence with the gap state, in place.
 *
 * The length of the sequence is unchanged.
 *
 * @param seq Sequence to modify.
 * @throw Exception If the sequence does not use a codon alphabet.
 */
void SequenceTools::replaceStopsWithGaps(Sequence& seq) throw (Exception)
{
  const CodonAlphabet* codonAlphabet = dynamic_cast<const CodonAlphabet*>(seq.getAlphabet());
  if (codonAlphabet == 0)
    throw Exception("SequenceTools::replaceStopsWithGaps. Input sequence should have a codon alphabet.");

  const int gapState = codonAlphabet->getGapCharacterCode();
  size_t pos = 0;
  while (pos < seq.size())
  {
    if (codonAlphabet->isStop(seq[pos]))
    {
      seq.setElement(pos, gapState);
    }
    ++pos;
  }
}
/**
 * @brief Enumerate the sequences obtained by resolving the ambiguous states of a sequence.
 *
 * For each position the alphabet's aliases of the state are looked up. If there
 * are at most @p level of them, each alias is a possible state at that position;
 * otherwise the original state is kept as the only possibility. A state without
 * any alias is replaced by the gap code. All combinations are then generated.
 *
 * @param seq   Sequence to expand.
 * @param hap   Vector to which the generated sequences are appended. They are
 *              allocated with new; releasing them is left to the caller.
 * @param level Largest number of aliases a state may have and still be expanded.
 */
void SequenceTools::getPutativeHaplotypes(const Sequence& seq, std::vector<Sequence*>& hap, unsigned int level)
{
  vector< vector< int > > states(seq.size());
  list<Sequence*> t_hap;
  const Alphabet* alpha = seq.getAlphabet();
  unsigned int hap_count = 1;
  // Vector of available states at each position
  for (size_t i = 0; i < seq.size(); i++)
  {
    vector<int> st = alpha->getAlias(seq[i]);
    if (!st.size())
    {
      st.push_back(alpha->getGapCharacterCode());
    }
    if (st.size() <= level)
    {
      states[i] = st;
    }
    else
    {
      states[i] = vector<int>(1, seq[i]);
    }
  }
  // Combinatorial haplotypes building (the use of tree may be more accurate)
  t_hap.push_back(new BasicSequence(seq.getName() + "_hap" + TextTools::toString(hap_count++), "", alpha));
  for (size_t i = 0; i < states.size(); i++)
  {
    // Never empty: the loop above always stores at least one state per position.
    const vector<int>& choices = states[i];
    const size_t last = choices.size() - 1;
    for (list<Sequence*>::iterator it = t_hap.begin(); it != t_hap.end(); it++)
    {
      // Every alternative but the last gets its own copy of the haplotype built
      // so far. Copies are inserted before 'it', so this pass does not visit them.
      for (size_t j = 0; j < last; j++)
      {
        Sequence* branch = new BasicSequence(seq.getName() + "_hap" + TextTools::toString(hap_count++), (**it).getContent(), alpha);
        branch->addElement(choices[j]);
        t_hap.insert(it, branch);
      }
      // The last alternative extends the current haplotype itself; no copy is
      // allocated for it, so nothing is left unreferenced.
      (**it).addElement(choices[last]);
    }
  }
  for (list<Sequence*>::reverse_iterator it = t_hap.rbegin(); it != t_hap.rend(); it++)
  {
    hap.push_back(*it);
  }
}
/**
 * @brief Insert a sequence as a new row in front of an existing row.
 *
 * @param sequence   Sequence to insert. Its length must equal the number of sites
 *                   and its alphabet type must match the container's.
 * @param pos        Row before which the sequence is inserted; must be lower than
 *                   the current number of sequences.
 * @param checkNames If true, refuse a name already present in the container.
 * @throw IndexOutOfBoundsException   If pos is not an existing row.
 * @throw SequenceNotAlignedException If the length differs from the number of sites.
 * @throw AlphabetMismatchException   If the alphabet types differ.
 * @throw SequenceException           If the name is a duplicate.
 */
void VectorSiteContainer::addSequence(
  const Sequence& sequence,
  size_t pos,
  bool checkNames)
throw (Exception)
{
  if (pos >= getNumberOfSequences())
    throw IndexOutOfBoundsException("VectorSiteContainer::addSequence.", pos, 0, getNumberOfSequences() - 1);
  if (sequence.size() != sites_.size())
    throw SequenceNotAlignedException("VectorSiteContainer::addSequence", &sequence);

  // New sequence's alphabet and site container's alphabet matching verification
  if (sequence.getAlphabet()->getAlphabetType() != getAlphabet()->getAlphabetType())
  {
    throw AlphabetMismatchException("VectorSiteContainer::addSequence", getAlphabet(), sequence.getAlphabet());
  }

  if (checkNames)
  {
    for (size_t i = 0; i < names_.size(); i++)
    {
      if (sequence.getName() == names_[i])
        throw SequenceException("VectorSiteContainer::addSequence. Name already exists in container.", &sequence);
    }
  }

  // Insert the new state at row 'pos' of every site:
  for (size_t i = 0; i < sites_.size(); i++)
  {
    sites_[i]->addElement(pos, sequence.getValue(i));
  }
  // Keep the per-row vectors aligned with the sites:
  names_.insert(names_.begin() + pos, sequence.getName());
  comments_.insert(comments_.begin() + pos, new Comments(sequence.getComments()));
  // No sequence object is stored for the new row yet.
  sequences_.insert(sequences_.begin() + pos, 0);
}
/**
 * @brief Append a sequence as the last row of the container.
 *
 * @param sequence   Sequence to append. Its alphabet type must match the
 *                   container's. If the container holds no sequence yet, it is
 *                   resized to the length of this one; otherwise the length must
 *                   equal the number of sites.
 * @param checkNames If true, refuse a name already present in the container.
 * @throw AlphabetMismatchException If the alphabet types differ.
 * @throw SequenceException         If the length is wrong or the name is a duplicate.
 */
void VectorSiteContainer::addSequence(const Sequence& sequence, bool checkNames) throw (Exception)
{
  // New sequence's alphabet and site container's alphabet matching verification.
  // Done before any resizing so that a rejected sequence leaves the container untouched.
  if (sequence.getAlphabet()->getAlphabetType() != getAlphabet()->getAlphabetType())
    throw AlphabetMismatchException("VectorSiteContainer::addSequence", getAlphabet(), sequence.getAlphabet());

  // If the container has no sequence, we set the size to the size of this sequence:
  if (getNumberOfSequences() == 0)
    realloc(sequence.size());

  if (sequence.size() != sites_.size())
    throw SequenceException("VectorSiteContainer::addSequence. Sequence has not the appropriate length: " + TextTools::toString(sequence.size()) + ", should be " + TextTools::toString(sites_.size()) + ".", &sequence);

  if (checkNames)
  {
    for (size_t i = 0; i < names_.size(); i++)
    {
      if (sequence.getName() == names_[i])
        throw SequenceException("VectorSiteContainer::addSequence. Name already exists in container.", &sequence);
    }
  }

  // Append name:
  names_.push_back(sequence.getName());

  // Append elements at each site:
  for (size_t i = 0; i < sites_.size(); i++)
  {
    sites_[i]->addElement(sequence.getValue(i));
  }

  // Append comments:
  comments_.push_back(new Comments(sequence.getComments()));

  // No sequence object is stored for the new row yet:
  sequences_.push_back(0);
}
/**
 * @brief Map the positions of one sequence onto the positions of another holding
 *        the same states, ignoring states equal to -1 in both.
 *
 * Both sequences are walked in parallel. States equal to -1 are skipped on each
 * side (presumably -1 is the gap code; it is hard-coded here rather than asked
 * from the alphabet -- TODO confirm). Every remaining state of seq1 must equal
 * the next remaining state of seq2.
 *
 * @param seq1 Sequence whose positions are the keys of the map.
 * @param seq2 Sequence whose positions are the values of the map.
 * @return Map from 1-based positions in seq1 to 1-based positions in seq2.
 *         Empty when seq1 is empty.
 * @throw AlphabetMismatchException If the alphabet types differ.
 * @throw Exception If seq2 is empty while seq1 is not, if two paired states
 *        differ, or if seq2 is exhausted while seq1 still holds a state other
 *        than -1.
 */
std::map<size_t, size_t> SiteContainerTools::translateAlignment(const Sequence& seq1, const Sequence& seq2)
throw (AlphabetMismatchException, Exception)
{
  if (seq1.getAlphabet()->getAlphabetType() != seq2.getAlphabet()->getAlphabetType())
    throw AlphabetMismatchException("SiteContainerTools::translateAlignment", seq1.getAlphabet(), seq2.getAlphabet());
  map<size_t, size_t> tln;
  if (seq1.size() == 0)
    return tln;
  // 0-based cursors into seq1 and seq2.
  unsigned int count1 = 0;
  unsigned int count2 = 0;
  if (seq2.size() == 0)
    throw Exception("SiteContainerTools::translateAlignment. Sequences do not match at position " + TextTools::toString(count1 + 1) + " and " + TextTools::toString(count2 + 1) + ".");
  int state1 = seq1[count1];
  int state2 = seq2[count2];
  bool end = false;
  while (!end)
  {
    // Advance each cursor past states equal to -1. If the end of a sequence is
    // reached while skipping, the cursor is left equal to the sequence size and
    // the state stays -1.
    while (state1 == -1)
    {
      count1++;
      if (count1 < seq1.size())
        state1 = seq1[count1];
      else
        break;
    }
    while (state2 == -1)
    {
      count2++;
      if (count2 < seq2.size())
        state2 = seq2[count2];
      else
        break;
    }
    if (state1 != state2)
      throw Exception("SiteContainerTools::translateAlignment. Sequences do not match at position " + TextTools::toString(count1 + 1) + " and " + TextTools::toString(count2 + 1) + ".");
    tln[count1 + 1] = count2 + 1; // Positions stored in the map are 1-based
    if (count1 == seq1.size() - 1)
      end = true; // Last position of seq1 has been mapped.
    else
    {
      if (count2 == seq2.size() - 1)
      {
        // seq2 is exhausted: whatever remains in seq1 must consist of -1 only.
        state1 = seq1[++count1];
        while (state1 == -1)
        {
          count1++;
          if (count1 < seq1.size())
            state1 = seq1[count1];
          else
            break;
        }
        if (state1 == -1)
          end = true;
        else
          throw Exception("SiteContainerTools::translateAlignment. Sequences do not match at position " + TextTools::toString(count1 + 1) + " and " + TextTools::toString(count2 + 1) + ".");
      }
      else
      {
        // NOTE(review): when both skipping loops above ran off the end (both
        // sequences finish with -1 states), count1 and count2 already equal the
        // sequence sizes here, so neither end test above fires and these
        // accesses look like they index past the end -- verify.
        state1 = seq1[++count1];
        state2 = seq2[++count2];
      }
    }
  }
  return tln;
}