void RecursiveLikelihoodTree::initLikelihoods(const SiteContainer& sites, const SubstitutionProcess& process)
throw (Exception)
{
  if (sites.getNumberOfSequences() == 1)
    throw Exception("RecursiveLikelihoodTree::initLikelihoods. Only 1 sequence in data set.");
  if (sites.getNumberOfSequences() == 0)
    throw Exception("RecursiveLikelihoodTree::initLikelihoods. No sequence in data set.");
  if (!process.isCompatibleWith(sites))
    throw Exception("RecursiveLikelihoodTree::initLikelihoods. Data and model are not compatible.");
  alphabet_ = sites.getAlphabet();
  nbStates_ = process.getNumberOfStates();
  nbSites_  = sites.getNumberOfSites();
  unique_ptr<SitePatterns> patterns;

  if (usePatterns_)
  {
    patterns.reset(initLikelihoodsWithPatterns_(process.getTree().getRootNode(), sites, process));
    shrunkData_.reset(patterns->getSites());
    rootWeights_      = patterns->getWeights();
    rootPatternLinks_ = patterns->getIndices();
    nbDistinctSites_  = shrunkData_->getNumberOfSites();

    setPatterns(patternLinks_);
  }
  else
  {
    patterns.reset(new SitePatterns(&sites));
    shrunkData_.reset(patterns->getSites());
    rootWeights_      = patterns->getWeights();
    rootPatternLinks_ = patterns->getIndices();
    nbDistinctSites_  = shrunkData_->getNumberOfSites();
    initLikelihoodsWithoutPatterns_(process.getTree().getRootNode(), *shrunkData_, process);
  }
}
void SiteContainerTools::getSequencePositions(const SiteContainer& sites, Matrix<size_t>& positions)
{
  positions.resize(sites.getNumberOfSequences(), sites.getNumberOfSites());
  int gap = sites.getAlphabet()->getGapCharacterCode();
  for (size_t i = 0; i < sites.getNumberOfSequences(); ++i) {
    const Sequence& seq = sites.getSequence(i);
    unsigned int pos = 0;
    for (size_t j = 0; j < sites.getNumberOfSites(); ++j) {
      if (seq[j] != gap) {
        ++pos;
        positions(i, j) = pos;
      } else {
        positions(i, j) = 0;
      }
    }
  }
}
VectorSiteContainer::VectorSiteContainer(const SiteContainer& sc) :
  AbstractSequenceContainer(sc),
  sites_(0),
  names_(sc.getSequencesNames()),
  comments_(sc.getNumberOfSequences()),
  sequences_(sc.getNumberOfSequences())
{
  // Now try to add each site:
  for (size_t i = 0; i < sc.getNumberOfSites(); i++)
  {
    addSite(sc.getSite(i), false); // We assume that positions are correct.
  }
  // Seq comments:
  for (size_t i = 0; i < sc.getNumberOfSequences(); i++)
  {
    comments_[i] = new Comments(sc.getComments(i));
  }
}
void SiteContainerTools::changeUnresolvedCharactersToGaps(SiteContainer& sites)
{
  // NB: use iterators for a better algorithm?
  int gapCode = sites.getAlphabet()->getGapCharacterCode();
  for (unsigned int i = 0; i < sites.getNumberOfSites(); i++)
  {
    for (unsigned int j = 0; j < sites.getNumberOfSequences(); j++)
    {
      int* element = &sites(j, i);
      if (sites.getAlphabet()->isUnresolved(*element))
        *element = gapCode;
    }
  }
}
void SiteContainerTools::changeGapsToUnknownCharacters(SiteContainer& sites)
{
  // NB: use iterators for a better algorithm?
  int unknownCode = sites.getAlphabet()->getUnknownCharacterCode();
  for (unsigned int i = 0; i < sites.getNumberOfSites(); i++)
  {
    for (unsigned int j = 0; j < sites.getNumberOfSequences(); j++)
    {
      int* element = &sites(j, i);
      if (sites.getAlphabet()->isGap(*element))
        *element = unknownCode;
    }
  }
}
Exemple #6
0
void Clustal::writeAlignment(std::ostream& output, const SiteContainer& sc) const throw (Exception)
{
  output << "CLUSTAL W (1.81) multiple sequence alignment" << endl;
  output << endl;
  if (sc.getNumberOfSequences() == 0)
    return;

  vector<string> text;
  size_t length = 0;
  for (size_t i = 0; i < sc.getNumberOfSequences(); ++i ) {
    const Sequence& seq = sc.getSequence(i);
    if (seq.getName().size() > length)
      length = seq.getName().size();
    text.push_back(sc.getSequence(i).toString());
  }
  length += nbSpacesBeforeSeq_;
  for (unsigned int j = 0; j < text[0].size(); j += charsByLine_) {
    for (unsigned int i = 0; i < sc.getNumberOfSequences(); ++i ) {
      output << TextTools::resizeRight(sc.getSequence(i).getName(), length);
      output << text[i].substr(j, charsByLine_) << endl;
    }
    output << endl;
  }
}
VectorSiteContainer* SequenceApplicationTools::getSitesToAnalyse(
  const SiteContainer& allSites,
  map<string, string>& params,
  string suffix,
  bool suffixIsOptional,
  bool gapAsUnknown,
  bool verbose,
  int warn)
{
  // Fully resolved sites, i.e. without jokers and gaps:
  SiteContainer* sitesToAnalyse;
  VectorSiteContainer* sitesToAnalyse2;

  string option = ApplicationTools::getStringParameter("input.sequence.sites_to_use", params, "complete", suffix, suffixIsOptional, warn);
  if (verbose)
    ApplicationTools::displayResult("Sites to use", option);
  if (option == "all")
  {
    sitesToAnalyse = new VectorSiteContainer(allSites);
    string maxGapOption = ApplicationTools::getStringParameter("input.sequence.max_gap_allowed", params, "100%", suffix, suffixIsOptional, warn);

    if (maxGapOption[maxGapOption.size() - 1] == '%')
    {
      double gapFreq = TextTools::toDouble(maxGapOption.substr(0, maxGapOption.size() - 1)) / 100.;
      if (gapFreq < 1)
      {
        if (verbose)
          ApplicationTools::displayTask("Remove sites with gaps", true);
        for (size_t i = sitesToAnalyse->getNumberOfSites(); i > 0; --i)
        {
          if (verbose)
            ApplicationTools::displayGauge(sitesToAnalyse->getNumberOfSites() - i, sitesToAnalyse->getNumberOfSites() - 1, '=');
          map<int, double> freq;
          SiteTools::getFrequencies(sitesToAnalyse->getSite(i - 1), freq);
          if (freq[-1] > gapFreq)
            sitesToAnalyse->deleteSite(i - 1);
        }
        if (verbose)
          ApplicationTools::displayTaskDone();
      }
    }
    else
    {
      size_t gapNum = TextTools::to<size_t>(maxGapOption);
      if (gapNum < sitesToAnalyse->getNumberOfSequences())
      {
        if (verbose)
          ApplicationTools::displayTask("Remove sites with gaps", true);
        for (size_t i = sitesToAnalyse->getNumberOfSites(); i > 0; i--)
        {
          if (verbose)
            ApplicationTools::displayGauge(sitesToAnalyse->getNumberOfSites() - i, sitesToAnalyse->getNumberOfSites() - 1, '=');
          map<int, size_t> counts;
          SiteTools::getCounts(sitesToAnalyse->getSite(i - 1), counts);
          if (counts[-1] > gapNum)
            sitesToAnalyse->deleteSite(i - 1);
        }
        if (verbose)
          ApplicationTools::displayTaskDone();
      }
    }

    string maxUnresolvedOption = ApplicationTools::getStringParameter("input.sequence.max_unresolved_allowed", params, "100%", suffix, suffixIsOptional, warn);

    int sAlph = static_cast<int>(sitesToAnalyse->getAlphabet()->getSize());

    if (maxUnresolvedOption[maxUnresolvedOption.size() - 1] == '%')
    {
      double unresolvedFreq = TextTools::toDouble(maxUnresolvedOption.substr(0, maxUnresolvedOption.size() - 1)) / 100.;
      if (unresolvedFreq < 1)
      {
        if (verbose)
          ApplicationTools::displayTask("Remove unresolved sites", true);
        for (size_t i = sitesToAnalyse->getNumberOfSites(); i > 0; --i)
        {
          if (verbose)
            ApplicationTools::displayGauge(sitesToAnalyse->getNumberOfSites() - i, sitesToAnalyse->getNumberOfSites() - 1, '=');
          map<int, double> freq;
          SiteTools::getFrequencies(sitesToAnalyse->getSite(i - 1), freq);
          double x = 0;
          for (int l = 0; l < sAlph; ++l)
          {
            x += freq[l];
          }
          if (1 - x > unresolvedFreq)
            sitesToAnalyse->deleteSite(i - 1);
        }
        if (verbose)
          ApplicationTools::displayTaskDone();
      }
    }
    else
    {
      size_t nbSeq = sitesToAnalyse->getNumberOfSequences();
      size_t unresolvedNum = TextTools::to<size_t>(maxUnresolvedOption);
      if (unresolvedNum < nbSeq)
      {
        if (verbose)
          ApplicationTools::displayTask("Remove sites with gaps", true);
        for (size_t i = sitesToAnalyse->getNumberOfSites(); i > 0; i--)
        {
          if (verbose)
            ApplicationTools::displayGauge(sitesToAnalyse->getNumberOfSites() - i, sitesToAnalyse->getNumberOfSites() - 1, '=');
          map<int, size_t> counts;
          SiteTools::getCounts(sitesToAnalyse->getSite(i - 1), counts);
          size_t x = 0;
          for (int l = 0; l < sAlph; l++)
          {
            x += counts[l];
          }

          if (nbSeq - x > unresolvedNum)
            sitesToAnalyse->deleteSite(i - 1);
        }
        if (verbose)
          ApplicationTools::displayTaskDone();
      }
    }

    if (gapAsUnknown)
    {
      SiteContainerTools::changeGapsToUnknownCharacters(*sitesToAnalyse);
    }
  }
  else if (option == "complete")
  {
    sitesToAnalyse = SiteContainerTools::getCompleteSites(allSites);
    size_t nbSites = sitesToAnalyse->getNumberOfSites();
    if (verbose)
      ApplicationTools::displayResult("Complete sites", TextTools::toString(nbSites));
  }
  else if (option == "nogap")
  {
    sitesToAnalyse = SiteContainerTools::getSitesWithoutGaps(allSites);
    size_t nbSites = sitesToAnalyse->getNumberOfSites();
    if (verbose)
      ApplicationTools::displayResult("Sites without gap", TextTools::toString(nbSites));
  }
  else
  {
    throw Exception("Option '" + option + "' unknown in parameter 'sequence.sites_to_use'.");
  }

  const CodonAlphabet* ca = dynamic_cast<const CodonAlphabet*>(sitesToAnalyse->getAlphabet());
  if (ca)
  {
    option = ApplicationTools::getStringParameter("input.sequence.remove_stop_codons", params, "no", suffix, true, warn);
    if ((option != "") && verbose)
      ApplicationTools::displayResult("Remove Stop Codons", option);

    if (option == "yes")
    {
      string codeDesc = ApplicationTools::getStringParameter("genetic_code", params, "Standard", "", true, warn);
      unique_ptr<GeneticCode> gCode(getGeneticCode(ca->getNucleicAlphabet(), codeDesc));
      sitesToAnalyse2 = dynamic_cast<VectorSiteContainer*>(SiteContainerTools::removeStopCodonSites(*sitesToAnalyse, *gCode));
      delete sitesToAnalyse;
    }
    else
      sitesToAnalyse2 = dynamic_cast<VectorSiteContainer*>(sitesToAnalyse);
  }
  else
    sitesToAnalyse2 = dynamic_cast<VectorSiteContainer*>(sitesToAnalyse);

  return sitesToAnalyse2;
}