/**
 * Overwrite, in place, every unresolved state of an alignment with the gap state.
 *
 * A cell is considered unresolved when the alphabet's isUnresolved() says so;
 * such cells receive the alphabet's gap character code, all others are left
 * untouched.
 *
 * @param sites The alignment to modify.
 */
void SiteContainerTools::changeUnresolvedCharactersToGaps(SiteContainer& sites)
{
  // NB: use iterators for a better algorithm?
  const int gapState = sites.getAlphabet()->getGapCharacterCode();
  for (unsigned int site = 0; site < sites.getNumberOfSites(); ++site)
  {
    for (unsigned int seq = 0; seq < sites.getNumberOfSequences(); ++seq)
    {
      // Element access is (sequence index, site index).
      int& state = sites(seq, site);
      if (!sites.getAlphabet()->isUnresolved(state))
        continue;
      state = gapState;
    }
  }
}
/**
 * Overwrite, in place, every gap of an alignment with the "unknown" state.
 *
 * Gaps are detected with the alphabet's isGap(); the replacement value is the
 * alphabet's unknown character code. Non-gap cells are not modified.
 *
 * @param sites The alignment to modify.
 */
void SiteContainerTools::changeGapsToUnknownCharacters(SiteContainer& sites)
{
  // NB: use iterators for a better algorithm?
  const int unknownState = sites.getAlphabet()->getUnknownCharacterCode();
  for (unsigned int column = 0; column < sites.getNumberOfSites(); ++column)
  {
    for (unsigned int row = 0; row < sites.getNumberOfSequences(); ++row)
    {
      // Element access is (sequence index, site index).
      int& cell = sites(row, column);
      cell = sites.getAlphabet()->isGap(cell) ? unknownState : cell;
    }
  }
}
/**
 * Initialise this likelihood tree from a data set and a substitution process.
 *
 * The alphabet, number of states and number of sites are recorded, then the
 * data are summarised through a SitePatterns object whose sites, weights and
 * indices are stored in shrunkData_, rootWeights_ and rootPatternLinks_.
 * Depending on usePatterns_, the per-node initialisation is delegated to
 * initLikelihoodsWithPatterns_() or initLikelihoodsWithoutPatterns_().
 *
 * @param sites   The data set; it must contain at least two sequences.
 * @param process The substitution process; it must be compatible with the data.
 * @throw Exception If the data set holds zero or one sequence, or if
 *                  process.isCompatibleWith(sites) is false.
 */
void RecursiveLikelihoodTree::initLikelihoods(const SiteContainer& sites, const SubstitutionProcess& process)
throw (Exception)
{
  // Input validation; the single-sequence case is reported first.
  const auto nbSequences = sites.getNumberOfSequences();
  if (nbSequences == 1)
    throw Exception("RecursiveLikelihoodTree::initLikelihoods. Only 1 sequence in data set.");
  if (nbSequences == 0)
    throw Exception("RecursiveLikelihoodTree::initLikelihoods. No sequence in data set.");
  if (!process.isCompatibleWith(sites))
    throw Exception("RecursiveLikelihoodTree::initLikelihoods. Data and model are not compatible.");

  alphabet_ = sites.getAlphabet();
  nbStates_ = process.getNumberOfStates();
  nbSites_  = sites.getNumberOfSites();

  // Released automatically when the function returns.
  unique_ptr<SitePatterns> patterns;

  if (!usePatterns_)
  {
    // Patterns are computed on the whole data set, then the nodes are
    // initialised from the resulting shrunk data.
    patterns.reset(new SitePatterns(&sites));
    shrunkData_.reset(patterns->getSites());
    rootWeights_      = patterns->getWeights();
    rootPatternLinks_ = patterns->getIndices();
    nbDistinctSites_  = shrunkData_->getNumberOfSites();
    initLikelihoodsWithoutPatterns_(process.getTree().getRootNode(), *shrunkData_, process);
  }
  else
  {
    // The recursive initialisation itself produces the root patterns.
    patterns.reset(initLikelihoodsWithPatterns_(process.getTree().getRootNode(), sites, process));
    shrunkData_.reset(patterns->getSites());
    rootWeights_      = patterns->getWeights();
    rootPatternLinks_ = patterns->getIndices();
    nbDistinctSites_  = shrunkData_->getNumberOfSites();

    // NOTE(review): patternLinks_ is not assigned in this function; presumably
    // it is filled by initLikelihoodsWithPatterns_() - confirm.
    setPatterns(patternLinks_);
  }
}
/*
 * Inheriting from SubstitutionProcess
 */
/**
 * Tell whether a data set can be analysed with this process.
 *
 * Only the model referenced by the first entry of modelToNodes_ is consulted:
 * the data are compatible when their alphabet type equals that model's
 * alphabet type. A process without any model accepts every data set.
 *
 * @param data The data set to test.
 * @return true if the alphabet types match or if no model is registered.
 */
bool SubstitutionProcessCollectionMember::isCompatibleWith(const SiteContainer& data) const
{
  // Nothing to compare against.
  if (modelToNodes_.empty())
    return true;

  const auto& firstModelIndex = modelToNodes_.begin()->first;
  return pSubProColl_->getModel(firstModelIndex).getAlphabet()->getAlphabetType()
         == data.getAlphabet()->getAlphabetType();
}
/**
 * Append all sites of one alignment to another alignment.
 *
 * Both containers must share the same alphabet type. If the sequence names of
 * the two containers are not identical (same names, same order), a temporary
 * copy of seqCont2 is built with its sequences selected in the order of
 * seqCont1 before the sites are appended.
 *
 * @param seqCont1          The alignment that receives the new sites.
 * @param seqCont2          The alignment whose sites are appended.
 * @param leavePositionAsIs If true, sites are added without an explicit
 *                          position. If false, each site is added at position
 *                          (number of sites of seqCont1 before the merge) +
 *                          (its own position).
 * @throw AlphabetMismatchException If the alphabet types differ.
 * @throw Exception Anything raised by the sequence selection or by addSite().
 */
void SiteContainerTools::merge(SiteContainer& seqCont1, const SiteContainer& seqCont2, bool leavePositionAsIs)
throw (AlphabetMismatchException, Exception)
{
  if (seqCont1.getAlphabet()->getAlphabetType() != seqCont2.getAlphabet()->getAlphabetType())
    throw AlphabetMismatchException("SiteContainerTools::merge.", seqCont1.getAlphabet(), seqCont2.getAlphabet());

  const vector<string> seqNames1 = seqCont1.getSequencesNames();

  // 'source' is what we read sites from: seqCont2 itself, or a reordered copy.
  // The copy is owned by 'reordered' so that it is released on every exit
  // path, including when getSelectedSequences() or addSite() throws.
  unique_ptr<SiteContainer> reordered;
  const SiteContainer* source = &seqCont2;
  if (seqNames1 != seqCont2.getSequencesNames())
  {
    // We shall reorder sequences first:
    reordered.reset(new VectorSiteContainer(seqCont2.getAlphabet()));
    SequenceContainerTools::getSelectedSequences(seqCont2, seqNames1, *reordered);
    source = reordered.get();
  }

  // Evaluated once: the number of sites to append is fixed before seqCont1
  // starts growing.
  const size_t nbNewSites = source->getNumberOfSites();

  if (leavePositionAsIs)
  {
    for (size_t i = 0; i < nbNewSites; ++i)
    {
      seqCont1.addSite(source->getSite(i), false);
    }
  }
  else
  {
    // Shift the incoming positions by the size seqCont1 had before the merge.
    const int offset = static_cast<int>(seqCont1.getNumberOfSites());
    for (size_t i = 0; i < nbNewSites; ++i)
    {
      const auto& site = source->getSite(i);
      seqCont1.addSite(site, offset + site.getPosition(), false);
    }
  }
}
/**
 * Build a parsimony score from a tree, a data set and a caller-supplied state map.
 *
 * A TreeTemplate<Node> copy of the tree is created, the alphabet is taken from
 * the data set and the number of states is read from the state map. The data
 * set itself is processed by init_().
 *
 * @param tree      The tree to copy.
 * @param data      The data set; passed on to init_().
 * @param statesMap The state map to use. It is dereferenced in the initializer
 *                  list, so it must not be null.
 *                  NOTE(review): whether this object takes ownership of the
 *                  pointer is not visible here - confirm against the destructor.
 * @param verbose   Passed on to init_().
 * @throw Exception See init_().
 */
AbstractTreeParsimonyScore::AbstractTreeParsimonyScore(
  const Tree& tree,
  const SiteContainer& data,
  const StateMap* statesMap,
  bool verbose)
throw (Exception) :
  tree_(new TreeTemplate<Node>(tree)),
  data_(0), // Left null here; presumably set by init_() - TODO confirm.
  alphabet_(data.getAlphabet()),
  statesMap_(statesMap),
  nbStates_(statesMap->getNumberOfModelStates()) // Uses the parameter, not the member.
{
  init_(data, verbose);
}
/**
 * Build a parsimony score from a tree and a data set, using a canonical state map.
 *
 * A TreeTemplate<Node> copy of the tree is created and the alphabet is taken
 * from the data set. A CanonicalStateMap is then allocated for that alphabet,
 * the number of states is read from it, and the data set is processed by
 * init_().
 *
 * @param tree        The tree to copy.
 * @param data        The data set; passed on to init_().
 * @param verbose     Passed on to init_().
 * @param includeGaps Forwarded to the CanonicalStateMap constructor.
 * @throw Exception See init_().
 */
AbstractTreeParsimonyScore::AbstractTreeParsimonyScore(
  const Tree& tree,
  const SiteContainer& data,
  bool verbose,
  bool includeGaps)
throw (Exception) :
  tree_(new TreeTemplate<Node>(tree)),
  data_(0), // Left null here; presumably set by init_() - TODO confirm.
  alphabet_(data.getAlphabet()),
  statesMap_(0),
  nbStates_(0)
{
  // Both members are filled in the body, once alphabet_ is initialised.
  statesMap_ = new CanonicalStateMap(alphabet_, includeGaps);
  nbStates_  = statesMap_->getNumberOfModelStates();
  // NOTE(review): if init_() throws, the objects allocated above are only
  // released if tree_ / statesMap_ are owning smart pointers - confirm.
  init_(data, verbose);
}
void SiteContainerTools::getSequencePositions(const SiteContainer& sites, Matrix<size_t>& positions)
{
  positions.resize(sites.getNumberOfSequences(), sites.getNumberOfSites());
  int gap = sites.getAlphabet()->getGapCharacterCode();
  for (size_t i = 0; i < sites.getNumberOfSequences(); ++i) {
    const Sequence& seq = sites.getSequence(i);
    unsigned int pos = 0;
    for (size_t j = 0; j < sites.getNumberOfSites(); ++j) {
      if (seq[j] != gap) {
        ++pos;
        positions(i, j) = pos;
      } else {
        positions(i, j) = 0;
      }
    }
  }
}
// Example #9
// 0
/**
 * Read a DCSE alignment from a stream and add its sequences to a container.
 *
 * The first line is read and ignored. Each following line is split at the
 * first run of five spaces: the left part is the sequence, from which white
 * spaces and the characters { } [ ] ( ) ^ are removed; the right part, minus
 * its leading white spaces, is the sequence name. Lines whose name contains
 * "Helix numbering" or "mask" are skipped. Reading stops at the first empty
 * line, or at the first line without a five-space separator.
 *
 * @param input The stream to read from.
 * @param sc    The container receiving the sequences; it also provides the alphabet.
 * @throw IOException If the stream is in a failed state on entry.
 * @throw Exception   Anything raised while building or adding a sequence.
 */
void DCSE::appendAlignmentFromStream(istream& input, SiteContainer& sc) const throw (Exception)
{
  // Checking the existence of specified file
  if (!input)
    throw IOException ("DCSE::read : fail to open file");

  const Alphabet* alphabet = sc.getAlphabet();

  // The first line is consumed but its content is ignored for now.
  FileTools::getNextLine(input);

  // Structure annotation characters to strip from the sequence column.
  const string annotationChars = "{}[]()^";

  // Main loop : for all file lines
  while (!input.eof())
  {
    const string line = FileTools::getNextLine(input);
    if (line.empty())
      break;

    // Five consecutive spaces separate the sequence from its name.
    const string::size_type endOfSeq = line.find("     ");
    if (endOfSeq == string::npos)
      break;

    string sequence = TextTools::removeWhiteSpaces(line.substr(0, endOfSeq));
    for (string::size_type k = 0; k < annotationChars.size(); ++k)
    {
      sequence = TextTools::removeChar(sequence, annotationChars[k]);
    }

    const string name = TextTools::removeFirstWhiteSpaces(line.substr(endOfSeq + 1));

    // Annotation rows are not sequences.
    const bool isAnnotationRow = name.find("Helix numbering") != string::npos
                                 || name.find("mask") != string::npos;
    if (!isAnnotationRow)
      sc.addSequence(BasicSequence(name, sequence, alphabet), true);
  }
}
VectorSiteContainer* SequenceApplicationTools::getSitesToAnalyse(
  const SiteContainer& allSites,
  map<string, string>& params,
  string suffix,
  bool suffixIsOptional,
  bool gapAsUnknown,
  bool verbose,
  int warn)
{
  // Fully resolved sites, i.e. without jokers and gaps:
  SiteContainer* sitesToAnalyse;
  VectorSiteContainer* sitesToAnalyse2;

  string option = ApplicationTools::getStringParameter("input.sequence.sites_to_use", params, "complete", suffix, suffixIsOptional, warn);
  if (verbose)
    ApplicationTools::displayResult("Sites to use", option);
  if (option == "all")
  {
    sitesToAnalyse = new VectorSiteContainer(allSites);
    string maxGapOption = ApplicationTools::getStringParameter("input.sequence.max_gap_allowed", params, "100%", suffix, suffixIsOptional, warn);

    if (maxGapOption[maxGapOption.size() - 1] == '%')
    {
      double gapFreq = TextTools::toDouble(maxGapOption.substr(0, maxGapOption.size() - 1)) / 100.;
      if (gapFreq < 1)
      {
        if (verbose)
          ApplicationTools::displayTask("Remove sites with gaps", true);
        for (size_t i = sitesToAnalyse->getNumberOfSites(); i > 0; --i)
        {
          if (verbose)
            ApplicationTools::displayGauge(sitesToAnalyse->getNumberOfSites() - i, sitesToAnalyse->getNumberOfSites() - 1, '=');
          map<int, double> freq;
          SiteTools::getFrequencies(sitesToAnalyse->getSite(i - 1), freq);
          if (freq[-1] > gapFreq)
            sitesToAnalyse->deleteSite(i - 1);
        }
        if (verbose)
          ApplicationTools::displayTaskDone();
      }
    }
    else
    {
      size_t gapNum = TextTools::to<size_t>(maxGapOption);
      if (gapNum < sitesToAnalyse->getNumberOfSequences())
      {
        if (verbose)
          ApplicationTools::displayTask("Remove sites with gaps", true);
        for (size_t i = sitesToAnalyse->getNumberOfSites(); i > 0; i--)
        {
          if (verbose)
            ApplicationTools::displayGauge(sitesToAnalyse->getNumberOfSites() - i, sitesToAnalyse->getNumberOfSites() - 1, '=');
          map<int, size_t> counts;
          SiteTools::getCounts(sitesToAnalyse->getSite(i - 1), counts);
          if (counts[-1] > gapNum)
            sitesToAnalyse->deleteSite(i - 1);
        }
        if (verbose)
          ApplicationTools::displayTaskDone();
      }
    }

    string maxUnresolvedOption = ApplicationTools::getStringParameter("input.sequence.max_unresolved_allowed", params, "100%", suffix, suffixIsOptional, warn);

    int sAlph = static_cast<int>(sitesToAnalyse->getAlphabet()->getSize());

    if (maxUnresolvedOption[maxUnresolvedOption.size() - 1] == '%')
    {
      double unresolvedFreq = TextTools::toDouble(maxUnresolvedOption.substr(0, maxUnresolvedOption.size() - 1)) / 100.;
      if (unresolvedFreq < 1)
      {
        if (verbose)
          ApplicationTools::displayTask("Remove unresolved sites", true);
        for (size_t i = sitesToAnalyse->getNumberOfSites(); i > 0; --i)
        {
          if (verbose)
            ApplicationTools::displayGauge(sitesToAnalyse->getNumberOfSites() - i, sitesToAnalyse->getNumberOfSites() - 1, '=');
          map<int, double> freq;
          SiteTools::getFrequencies(sitesToAnalyse->getSite(i - 1), freq);
          double x = 0;
          for (int l = 0; l < sAlph; ++l)
          {
            x += freq[l];
          }
          if (1 - x > unresolvedFreq)
            sitesToAnalyse->deleteSite(i - 1);
        }
        if (verbose)
          ApplicationTools::displayTaskDone();
      }
    }
    else
    {
      size_t nbSeq = sitesToAnalyse->getNumberOfSequences();
      size_t unresolvedNum = TextTools::to<size_t>(maxUnresolvedOption);
      if (unresolvedNum < nbSeq)
      {
        if (verbose)
          ApplicationTools::displayTask("Remove sites with gaps", true);
        for (size_t i = sitesToAnalyse->getNumberOfSites(); i > 0; i--)
        {
          if (verbose)
            ApplicationTools::displayGauge(sitesToAnalyse->getNumberOfSites() - i, sitesToAnalyse->getNumberOfSites() - 1, '=');
          map<int, size_t> counts;
          SiteTools::getCounts(sitesToAnalyse->getSite(i - 1), counts);
          size_t x = 0;
          for (int l = 0; l < sAlph; l++)
          {
            x += counts[l];
          }

          if (nbSeq - x > unresolvedNum)
            sitesToAnalyse->deleteSite(i - 1);
        }
        if (verbose)
          ApplicationTools::displayTaskDone();
      }
    }

    if (gapAsUnknown)
    {
      SiteContainerTools::changeGapsToUnknownCharacters(*sitesToAnalyse);
    }
  }
  else if (option == "complete")
  {
    sitesToAnalyse = SiteContainerTools::getCompleteSites(allSites);
    size_t nbSites = sitesToAnalyse->getNumberOfSites();
    if (verbose)
      ApplicationTools::displayResult("Complete sites", TextTools::toString(nbSites));
  }
  else if (option == "nogap")
  {
    sitesToAnalyse = SiteContainerTools::getSitesWithoutGaps(allSites);
    size_t nbSites = sitesToAnalyse->getNumberOfSites();
    if (verbose)
      ApplicationTools::displayResult("Sites without gap", TextTools::toString(nbSites));
  }
  else
  {
    throw Exception("Option '" + option + "' unknown in parameter 'sequence.sites_to_use'.");
  }

  const CodonAlphabet* ca = dynamic_cast<const CodonAlphabet*>(sitesToAnalyse->getAlphabet());
  if (ca)
  {
    option = ApplicationTools::getStringParameter("input.sequence.remove_stop_codons", params, "no", suffix, true, warn);
    if ((option != "") && verbose)
      ApplicationTools::displayResult("Remove Stop Codons", option);

    if (option == "yes")
    {
      string codeDesc = ApplicationTools::getStringParameter("genetic_code", params, "Standard", "", true, warn);
      unique_ptr<GeneticCode> gCode(getGeneticCode(ca->getNucleicAlphabet(), codeDesc));
      sitesToAnalyse2 = dynamic_cast<VectorSiteContainer*>(SiteContainerTools::removeStopCodonSites(*sitesToAnalyse, *gCode));
      delete sitesToAnalyse;
    }
    else
      sitesToAnalyse2 = dynamic_cast<VectorSiteContainer*>(sitesToAnalyse);
  }
  else
    sitesToAnalyse2 = dynamic_cast<VectorSiteContainer*>(sitesToAnalyse);

  return sitesToAnalyse2;
}
// Example #11
// 0
/**
 * Read a Clustal alignment from a stream and add its sequences to a container.
 *
 * The first line (file generator) is stored as the general comment of the
 * container. The column where sequences start is deduced from the first
 * sequence line: it follows the right-most run of nbSpacesBeforeSeq_ spaces.
 * The first block defines the sequence names and their number; each following
 * block must provide one line per sequence, in the same order, whose residues
 * are appended to the corresponding sequence.
 *
 * @param input The stream to read from.
 * @param sc    The container receiving the sequences; it also provides the alphabet.
 * @throw IOException If the stream is in a failed state on entry, if the
 *                    sequence column cannot be located, if a block is
 *                    incomplete, or if a sequence line stops before the
 *                    sequence column.
 * @throw Exception   Anything raised while building or adding a sequence.
 */
void Clustal::appendAlignmentFromStream(std::istream& input, SiteContainer & sc) const throw (Exception)
{
  // Checking the existence of specified file
  if (!input) { throw IOException ("Clustal::read : fail to open file"); }

  const Alphabet * alpha = sc.getAlphabet();
  vector<BasicSequence> sequences;

  Comments comments(1);
  comments[0] = FileTools::getNextLine(input); // First line gives file generator.

  string lineRead = FileTools::getNextLine(input); // This is the first sequence of the first block.

  // Scan the line backwards for a run of nbSpacesBeforeSeq_ spaces: the
  // sequence starts right after it.
  string::size_type beginSeq = 0;
  unsigned int count = 0;
  for (size_t i = lineRead.size(); i > 0; i--) {
    char c = lineRead[i-1];
    if (c == ' ') {
      count++;
      if (count == nbSpacesBeforeSeq_) {
        beginSeq = i - 1 + nbSpacesBeforeSeq_;
        break;
      }
    }
    else count = 0;
  }
  if (beginSeq == 0) throw IOException("Clustal::read. Bad intput file.");

  unsigned int countSequences = 0;

  //Read first sequences block:
  bool test = true;
  do {
    // substr(beginSeq) would throw std::out_of_range on a short line, which
    // the exception specification does not allow: report it as an IOException.
    if (lineRead.size() < beginSeq)
      throw IOException("Clustal::read. Bad input file: sequence line shorter than the sequence column.");
    sequences.push_back(BasicSequence(TextTools::removeSurroundingWhiteSpaces(lineRead.substr(0, beginSeq - nbSpacesBeforeSeq_)), lineRead.substr(beginSeq), alpha));
    // A getline() that fails before extracting anything leaves its target
    // unchanged: clear it so that the previous line is never processed twice.
    if (!getline(input, lineRead, '\n'))
      lineRead.clear();
    countSequences++;
    // The block ends on an empty line or on a line without a name.
    test = !TextTools::isEmpty(lineRead) && !TextTools::isEmpty(lineRead.substr(0, beginSeq - nbSpacesBeforeSeq_));
  }
  while (input && test);

  // Read other blocks
  lineRead = FileTools::getNextLine(input); // Read first sequence of next block.
  while (!TextTools::isEmpty(lineRead)) {
    // Read next block:
    for (unsigned int i = 0; i < countSequences; ++i) {
      // Complete sequences
      if (TextTools::isEmpty(lineRead))
        throw IOException("Clustal::read. Bad intput file.");
      if (lineRead.size() < beginSeq)
        throw IOException("Clustal::read. Bad input file: sequence line shorter than the sequence column.");
      sequences[i].append(lineRead.substr(beginSeq));
      // Same precaution as above: a truncated block must be seen as an empty
      // line by the next iteration, not as a repetition of this line.
      if (!getline(input, lineRead, '\n'))
        lineRead.clear();
    }
    //At this point, lineRead is the first line after the current block.
    lineRead = FileTools::getNextLine(input);
  }

  for (unsigned int i = 0; i < countSequences; ++i)
    sc.addSequence(sequences[i], checkNames_);
  sc.setGeneralComments(comments);
}