void RecursiveLikelihoodTree::initLikelihoods(const SiteContainer& sites, const SubstitutionProcess& process)
throw (Exception)
{
  if (sites.getNumberOfSequences() == 1)
    throw Exception("RecursiveLikelihoodTree::initLikelihoods. Only 1 sequence in data set.");
  if (sites.getNumberOfSequences() == 0)
    throw Exception("RecursiveLikelihoodTree::initLikelihoods. No sequence in data set.");
  if (!process.isCompatibleWith(sites))
    throw Exception("RecursiveLikelihoodTree::initLikelihoods. Data and model are not compatible.");
  alphabet_ = sites.getAlphabet();
  nbStates_ = process.getNumberOfStates();
  nbSites_  = sites.getNumberOfSites();
  unique_ptr<SitePatterns> patterns;

  if (usePatterns_)
  {
    patterns.reset(initLikelihoodsWithPatterns_(process.getTree().getRootNode(), sites, process));
    shrunkData_.reset(patterns->getSites());
    rootWeights_      = patterns->getWeights();
    rootPatternLinks_ = patterns->getIndices();
    nbDistinctSites_  = shrunkData_->getNumberOfSites();

    setPatterns(patternLinks_);
  }
  else
  {
    patterns.reset(new SitePatterns(&sites));
    shrunkData_.reset(patterns->getSites());
    rootWeights_      = patterns->getWeights();
    rootPatternLinks_ = patterns->getIndices();
    nbDistinctSites_  = shrunkData_->getNumberOfSites();
    initLikelihoodsWithoutPatterns_(process.getTree().getRootNode(), *shrunkData_, process);
  }
}
void SiteContainerTools::removeGapSites(SiteContainer& sites, double maxFreqGaps)
{
  for (size_t i = sites.getNumberOfSites(); i > 0; i--) {
    map<int, double> freq;
    SiteTools::getFrequencies(sites.getSite(i - 1), freq);
    if (freq[-1] > maxFreqGaps)
      sites.deleteSite(i - 1);
  }
}
void SiteContainerTools::changeUnresolvedCharactersToGaps(SiteContainer& sites)
{
  // NB: use iterators for a better algorithm?
  int gapCode = sites.getAlphabet()->getGapCharacterCode();
  for (unsigned int i = 0; i < sites.getNumberOfSites(); i++)
  {
    for (unsigned int j = 0; j < sites.getNumberOfSequences(); j++)
    {
      int* element = &sites(j, i);
      if (sites.getAlphabet()->isUnresolved(*element))
        *element = gapCode;
    }
  }
}
void SiteContainerTools::changeGapsToUnknownCharacters(SiteContainer& sites)
{
  // NB: use iterators for a better algorithm?
  int unknownCode = sites.getAlphabet()->getUnknownCharacterCode();
  for (unsigned int i = 0; i < sites.getNumberOfSites(); i++)
  {
    for (unsigned int j = 0; j < sites.getNumberOfSequences(); j++)
    {
      int* element = &sites(j, i);
      if (sites.getAlphabet()->isGap(*element))
        *element = unknownCode;
    }
  }
}
Esempio n. 5
0
void RewardMappingTools::writeToStream(
  const ProbabilisticRewardMapping& rewards,
  const SiteContainer& sites,
  ostream& out)
throw (IOException)
{
  if (!out)
    throw IOException("RewardMappingTools::writeToFile. Can't write to stream.");
  out << "Branches";
  out << "\tMean";
  for (size_t i = 0; i < rewards.getNumberOfSites(); i++)
  {
    out << "\tSite" << sites.getSite(i).getPosition();
  }
  out << endl;

  for (size_t j = 0; j < rewards.getNumberOfBranches(); j++)
  {
    out << rewards.getNode(j)->getId() << "\t" << rewards.getNode(j)->getDistanceToFather();
    for (size_t i = 0; i < rewards.getNumberOfSites(); i++)
    {
      out << "\t" << rewards(j, i);
    }
    out << endl;
  }
}
/*
 * Inheriting from SubstitutionProcess
 */
bool SubstitutionProcessCollectionMember::isCompatibleWith(const SiteContainer& data) const
{
  if (modelToNodes_.size() > 0)
    return data.getAlphabet()->getAlphabetType() == pSubProColl_->getModel(modelToNodes_.begin()->first).getAlphabet()->getAlphabetType();
  else
    return true;
}
SiteContainer* SequenceSimulationTools::simulateSites(const SiteSimulator& simulator, const vector<double>& rates)
{
  size_t numberOfSites = rates.size();
  vector<const Site*> vs(numberOfSites);
  for (size_t i = 0; i < numberOfSites; i++)
  {
    Site* s = simulator.simulateSite(rates[i]);
    s->setPosition(static_cast<int>(i));
    vs[i] = s;
  }
  SiteContainer* sites = new VectorSiteContainer(vs, simulator.getAlphabet());
  sites->setSequencesNames(simulator.getSequencesNames(), false);
  // Freeing memory:
  for (size_t i = 0; i < numberOfSites; i++)
  {
    delete vs[i];
  }

  return sites;
}
Esempio n. 8
0
void DCSE::appendAlignmentFromStream(istream& input, SiteContainer& sc) const throw (Exception)
{
  // Checking the existence of specified file
  if (!input) { throw IOException ("DCSE::read : fail to open file"); }

  // Initialization
  const Alphabet * alpha = sc.getAlphabet();
  string line, name, sequence = "";

  line = FileTools::getNextLine(input); // Copy current line in temporary string
  //StringTokenizer st(line);
  //st.nextToken();
  //First line ignored for now!
  //int n1 = TextTools::toInt(st.nextToken());
  //int n2 = TextTools::toInt(st.nextToken());
  //int nbSites = n2 - n1
  //cout << nbSpecies << " species and " << nbSites << " sites." << endl;

  // Main loop : for all file lines
  while (!input.eof())
  {
    line = FileTools::getNextLine(input); // Copy current line in temporary string
    if(line == "") break;
    string::size_type endOfSeq = line.find("     ");
    if(endOfSeq == line.npos) break;
    sequence = string(line.begin(), line.begin() + static_cast<ptrdiff_t>(endOfSeq));
    sequence = TextTools::removeWhiteSpaces(sequence);
    sequence = TextTools::removeChar(sequence, '{');
    sequence = TextTools::removeChar(sequence, '}');
    sequence = TextTools::removeChar(sequence, '[');
    sequence = TextTools::removeChar(sequence, ']');
    sequence = TextTools::removeChar(sequence, '(');
    sequence = TextTools::removeChar(sequence, ')');
    sequence = TextTools::removeChar(sequence, '^');
    name     = string(line.begin() + static_cast<ptrdiff_t>(endOfSeq + 1), line.end()),
    name     = TextTools::removeFirstWhiteSpaces(name);
    if(name.find("Helix numbering") == name.npos
    && name.find("mask") == name.npos)
      sc.addSequence(BasicSequence(name, sequence, alpha), true);
  }
}
void SiteContainerTools::removeGapOrUnresolvedOnlySites(SiteContainer& sites)
{
  size_t n = sites.getNumberOfSites();
  size_t i = n;
  while (i > 1)
  {
    ApplicationTools::displayGauge(n - i + 1, n);
    const Site* site = &sites.getSite(i - 1);
    if (SiteTools::isGapOnly(*site))
    {
      size_t end = i;
      while (SiteTools::isGapOrUnresolvedOnly(*site) && i > 1)
      {
        --i;
        site = &sites.getSite(i - 1);
      }
      sites.deleteSites(i, end - i);
    }
    else
    {
      --i;
    }
  }
  ApplicationTools::displayGauge(n, n);
  const Site* site = &sites.getSite(0);
  if (SiteTools::isGapOrUnresolvedOnly(*site))
    sites.deleteSite(0);
}
Esempio n. 10
0
void SiteContainerTools::merge(SiteContainer& seqCont1, const SiteContainer& seqCont2, bool leavePositionAsIs)
throw (AlphabetMismatchException, Exception)
{
  if (seqCont1.getAlphabet()->getAlphabetType() != seqCont2.getAlphabet()->getAlphabetType())
    throw AlphabetMismatchException("SiteContainerTools::merge.", seqCont1.getAlphabet(), seqCont2.getAlphabet());


  vector<string> seqNames1 = seqCont1.getSequencesNames();
  vector<string> seqNames2 = seqCont2.getSequencesNames();
  const SiteContainer* seqCont2bis = 0;
  bool del = false;
  if (seqNames1 == seqNames2)
  {
    seqCont2bis = &seqCont2;
  }
  else
  {
    // We shall reorder sequences first:
    SiteContainer* seqCont2ter = new VectorSiteContainer(seqCont2.getAlphabet());
    SequenceContainerTools::getSelectedSequences(seqCont2, seqNames1, *seqCont2ter);
    seqCont2bis = seqCont2ter;
    del = true;
  }

  if (leavePositionAsIs)
  {
    for (size_t i = 0; i < seqCont2bis->getNumberOfSites(); i++)
    {
      seqCont1.addSite(seqCont2bis->getSite(i), false);
    }
  }
  else
  {
    int offset = static_cast<int>(seqCont1.getNumberOfSites());
    for (size_t i = 0; i < seqCont2bis->getNumberOfSites(); i++)
    {
      seqCont1.addSite(seqCont2bis->getSite(i), offset + seqCont2bis->getSite(i).getPosition(), false);
    }
  }

  if (del)
    delete seqCont2bis;
}
AbstractTreeParsimonyScore::AbstractTreeParsimonyScore(
  const Tree& tree,
  const SiteContainer& data,
  const StateMap* statesMap,
  bool verbose)
throw (Exception) :
  tree_(new TreeTemplate<Node>(tree)),
  data_(0),
  alphabet_(data.getAlphabet()),
  statesMap_(statesMap),
  nbStates_(statesMap->getNumberOfModelStates())
{
  init_(data, verbose);
}
Esempio n. 12
0
SiteContainer* SequenceSimulationTools::simulateSites(const SiteSimulator& simulator, const vector<double>& rates, const vector<size_t>& states)
throw (Exception)
{
  size_t numberOfSites = rates.size();
  if (states.size() != numberOfSites)
    throw Exception("SequenceSimulationTools::simulateSites., 'rates' and 'states' must have the same length.");
  vector<const Site*> vs(numberOfSites);
  for (size_t i = 0; i < numberOfSites; i++)
  {
    Site* s = simulator.simulateSite(states[i], rates[i]);
    s->setPosition(static_cast<int>(i));
    vs[i] = s;
  }
  SiteContainer* sites = new VectorSiteContainer(vs, simulator.getAlphabet());
  sites->setSequencesNames(simulator.getSequencesNames(), false);
  // Freeing memory:
  for (size_t i = 0; i < numberOfSites; i++)
  {
    delete vs[i];
  }

  return sites;
}
Esempio n. 13
0
std::map<size_t, size_t> SiteContainerTools::translateSequence(const SiteContainer& sequences, size_t i1, size_t i2)
{
  const Sequence* seq1 = &sequences.getSequence(i1);
  const Sequence* seq2 = &sequences.getSequence(i2);
  map<size_t, size_t> tln;
  size_t count1 = 0; // Sequence 1 counter
  size_t count2 = 0; // Sequence 2 counter
  int state1;
  int state2;
  for (size_t i = 0; i <  sequences.getNumberOfSites(); i++)
  {
    state1 = (*seq1)[i];
    if (state1 != -1)
      count1++;
    state2 = (*seq2)[i];
    if (state2 != -1)
      count2++;
    if (state1 != -1)
    {
      tln[count1] = (state2 == -1 ? 0 : count2);
    }
  }
  return tln;
}
AbstractTreeParsimonyScore::AbstractTreeParsimonyScore(
  const Tree& tree,
  const SiteContainer& data,
  bool verbose,
  bool includeGaps)
throw (Exception) :
  tree_(new TreeTemplate<Node>(tree)),
  data_(0),
  alphabet_(data.getAlphabet()),
  statesMap_(0),
  nbStates_(0)
{
  statesMap_ = new CanonicalStateMap(alphabet_, includeGaps);
  nbStates_  = statesMap_->getNumberOfModelStates();
  init_(data, verbose);
}
Esempio n. 15
0
VectorSiteContainer::VectorSiteContainer(const SiteContainer& sc) :
  AbstractSequenceContainer(sc),
  sites_(0),
  names_(sc.getSequencesNames()),
  comments_(sc.getNumberOfSequences()),
  sequences_(sc.getNumberOfSequences())
{
  // Now try to add each site:
  for (size_t i = 0; i < sc.getNumberOfSites(); i++)
  {
    addSite(sc.getSite(i), false); // We assume that positions are correct.
  }
  // Seq comments:
  for (size_t i = 0; i < sc.getNumberOfSequences(); i++)
  {
    comments_[i] = new Comments(sc.getComments(i));
  }
}
Esempio n. 16
0
void SiteContainerTools::getSequencePositions(const SiteContainer& sites, Matrix<size_t>& positions)
{
  positions.resize(sites.getNumberOfSequences(), sites.getNumberOfSites());
  int gap = sites.getAlphabet()->getGapCharacterCode();
  for (size_t i = 0; i < sites.getNumberOfSequences(); ++i) {
    const Sequence& seq = sites.getSequence(i);
    unsigned int pos = 0;
    for (size_t j = 0; j < sites.getNumberOfSites(); ++j) {
      if (seq[j] != gap) {
        ++pos;
        positions(i, j) = pos;
      } else {
        positions(i, j) = 0;
      }
    }
  }
}
Esempio n. 17
0
int main() {
  //ProteicAlphabet* alpha = new ProteicAlphabet;
  RNA* alpha = new RNA();
  SiteContainer* sites = new VectorSiteContainer(alpha);
  BasicSequence seq1("seq1", "----AUGCCG---GCGU----UUU----G--G-CCGACGUGUUUU--", alpha);
  BasicSequence seq2("seq2", "---GAAGGCG---G-GU----UUU----GC-GACCGACG--UUUU--", alpha);
  sites->addSequence(seq1, false);
  sites->addSequence(seq2, false);

  cout << sites->getNumberOfSites() << endl;
  cout << sites->toString("seq1") << endl;
  cout << sites->toString("seq2") << endl;
  SiteContainerTools::removeGapOnlySites(*sites);
  cout << endl;
  
  cout << sites->getNumberOfSites() << endl;
  cout << sites->toString("seq1") << endl;
  cout << sites->toString("seq2") << endl;

  return (sites->getNumberOfSites() == 30 ? 0 : 1);
}
Esempio n. 18
0
void Clustal::writeAlignment(std::ostream& output, const SiteContainer& sc) const throw (Exception)
{
  output << "CLUSTAL W (1.81) multiple sequence alignment" << endl;
  output << endl;
  if (sc.getNumberOfSequences() == 0)
    return;

  vector<string> text;
  size_t length = 0;
  for (size_t i = 0; i < sc.getNumberOfSequences(); ++i ) {
    const Sequence& seq = sc.getSequence(i);
    if (seq.getName().size() > length)
      length = seq.getName().size();
    text.push_back(sc.getSequence(i).toString());
  }
  length += nbSpacesBeforeSeq_;
  for (unsigned int j = 0; j < text[0].size(); j += charsByLine_) {
    for (unsigned int i = 0; i < sc.getNumberOfSequences(); ++i ) {
      output << TextTools::resizeRight(sc.getSequence(i).getName(), length);
      output << text[i].substr(j, charsByLine_) << endl;
    }
    output << endl;
  }
}
Esempio n. 19
0
VectorSiteContainer* SequenceApplicationTools::getSitesToAnalyse(
  const SiteContainer& allSites,
  map<string, string>& params,
  string suffix,
  bool suffixIsOptional,
  bool gapAsUnknown,
  bool verbose,
  int warn)
{
  // Fully resolved sites, i.e. without jokers and gaps:
  SiteContainer* sitesToAnalyse;
  VectorSiteContainer* sitesToAnalyse2;

  string option = ApplicationTools::getStringParameter("input.sequence.sites_to_use", params, "complete", suffix, suffixIsOptional, warn);
  if (verbose)
    ApplicationTools::displayResult("Sites to use", option);
  if (option == "all")
  {
    sitesToAnalyse = new VectorSiteContainer(allSites);
    string maxGapOption = ApplicationTools::getStringParameter("input.sequence.max_gap_allowed", params, "100%", suffix, suffixIsOptional, warn);

    if (maxGapOption[maxGapOption.size() - 1] == '%')
    {
      double gapFreq = TextTools::toDouble(maxGapOption.substr(0, maxGapOption.size() - 1)) / 100.;
      if (gapFreq < 1)
      {
        if (verbose)
          ApplicationTools::displayTask("Remove sites with gaps", true);
        for (size_t i = sitesToAnalyse->getNumberOfSites(); i > 0; --i)
        {
          if (verbose)
            ApplicationTools::displayGauge(sitesToAnalyse->getNumberOfSites() - i, sitesToAnalyse->getNumberOfSites() - 1, '=');
          map<int, double> freq;
          SiteTools::getFrequencies(sitesToAnalyse->getSite(i - 1), freq);
          if (freq[-1] > gapFreq)
            sitesToAnalyse->deleteSite(i - 1);
        }
        if (verbose)
          ApplicationTools::displayTaskDone();
      }
    }
    else
    {
      size_t gapNum = TextTools::to<size_t>(maxGapOption);
      if (gapNum < sitesToAnalyse->getNumberOfSequences())
      {
        if (verbose)
          ApplicationTools::displayTask("Remove sites with gaps", true);
        for (size_t i = sitesToAnalyse->getNumberOfSites(); i > 0; i--)
        {
          if (verbose)
            ApplicationTools::displayGauge(sitesToAnalyse->getNumberOfSites() - i, sitesToAnalyse->getNumberOfSites() - 1, '=');
          map<int, size_t> counts;
          SiteTools::getCounts(sitesToAnalyse->getSite(i - 1), counts);
          if (counts[-1] > gapNum)
            sitesToAnalyse->deleteSite(i - 1);
        }
        if (verbose)
          ApplicationTools::displayTaskDone();
      }
    }

    string maxUnresolvedOption = ApplicationTools::getStringParameter("input.sequence.max_unresolved_allowed", params, "100%", suffix, suffixIsOptional, warn);

    int sAlph = static_cast<int>(sitesToAnalyse->getAlphabet()->getSize());

    if (maxUnresolvedOption[maxUnresolvedOption.size() - 1] == '%')
    {
      double unresolvedFreq = TextTools::toDouble(maxUnresolvedOption.substr(0, maxUnresolvedOption.size() - 1)) / 100.;
      if (unresolvedFreq < 1)
      {
        if (verbose)
          ApplicationTools::displayTask("Remove unresolved sites", true);
        for (size_t i = sitesToAnalyse->getNumberOfSites(); i > 0; --i)
        {
          if (verbose)
            ApplicationTools::displayGauge(sitesToAnalyse->getNumberOfSites() - i, sitesToAnalyse->getNumberOfSites() - 1, '=');
          map<int, double> freq;
          SiteTools::getFrequencies(sitesToAnalyse->getSite(i - 1), freq);
          double x = 0;
          for (int l = 0; l < sAlph; ++l)
          {
            x += freq[l];
          }
          if (1 - x > unresolvedFreq)
            sitesToAnalyse->deleteSite(i - 1);
        }
        if (verbose)
          ApplicationTools::displayTaskDone();
      }
    }
    else
    {
      size_t nbSeq = sitesToAnalyse->getNumberOfSequences();
      size_t unresolvedNum = TextTools::to<size_t>(maxUnresolvedOption);
      if (unresolvedNum < nbSeq)
      {
        if (verbose)
          ApplicationTools::displayTask("Remove sites with gaps", true);
        for (size_t i = sitesToAnalyse->getNumberOfSites(); i > 0; i--)
        {
          if (verbose)
            ApplicationTools::displayGauge(sitesToAnalyse->getNumberOfSites() - i, sitesToAnalyse->getNumberOfSites() - 1, '=');
          map<int, size_t> counts;
          SiteTools::getCounts(sitesToAnalyse->getSite(i - 1), counts);
          size_t x = 0;
          for (int l = 0; l < sAlph; l++)
          {
            x += counts[l];
          }

          if (nbSeq - x > unresolvedNum)
            sitesToAnalyse->deleteSite(i - 1);
        }
        if (verbose)
          ApplicationTools::displayTaskDone();
      }
    }

    if (gapAsUnknown)
    {
      SiteContainerTools::changeGapsToUnknownCharacters(*sitesToAnalyse);
    }
  }
  else if (option == "complete")
  {
    sitesToAnalyse = SiteContainerTools::getCompleteSites(allSites);
    size_t nbSites = sitesToAnalyse->getNumberOfSites();
    if (verbose)
      ApplicationTools::displayResult("Complete sites", TextTools::toString(nbSites));
  }
  else if (option == "nogap")
  {
    sitesToAnalyse = SiteContainerTools::getSitesWithoutGaps(allSites);
    size_t nbSites = sitesToAnalyse->getNumberOfSites();
    if (verbose)
      ApplicationTools::displayResult("Sites without gap", TextTools::toString(nbSites));
  }
  else
  {
    throw Exception("Option '" + option + "' unknown in parameter 'sequence.sites_to_use'.");
  }

  const CodonAlphabet* ca = dynamic_cast<const CodonAlphabet*>(sitesToAnalyse->getAlphabet());
  if (ca)
  {
    option = ApplicationTools::getStringParameter("input.sequence.remove_stop_codons", params, "no", suffix, true, warn);
    if ((option != "") && verbose)
      ApplicationTools::displayResult("Remove Stop Codons", option);

    if (option == "yes")
    {
      string codeDesc = ApplicationTools::getStringParameter("genetic_code", params, "Standard", "", true, warn);
      unique_ptr<GeneticCode> gCode(getGeneticCode(ca->getNucleicAlphabet(), codeDesc));
      sitesToAnalyse2 = dynamic_cast<VectorSiteContainer*>(SiteContainerTools::removeStopCodonSites(*sitesToAnalyse, *gCode));
      delete sitesToAnalyse;
    }
    else
      sitesToAnalyse2 = dynamic_cast<VectorSiteContainer*>(sitesToAnalyse);
  }
  else
    sitesToAnalyse2 = dynamic_cast<VectorSiteContainer*>(sitesToAnalyse);

  return sitesToAnalyse2;
}
Esempio n. 20
0
int main(int args, char** argv)
{
  cout << "******************************************************************" << endl;
  cout << "*           Bio++ Sequence Manipulator, version 2.3.0.           *" << endl;
  cout << "* Author: J. Dutheil                        Last Modif. 25/11/14 *" << endl;
  cout << "******************************************************************" << endl;
  cout << endl;
  
  if (args == 1)
  {
    help();
    return 0;
  }
  
  try {

  BppApplication bppseqman(args, argv, "BppSeqMan");
  bppseqman.startTimer();
  
  // Get alphabet
  Alphabet* alphabet = SequenceApplicationTools::getAlphabet(bppseqman.getParams(), "", false, true, true);
  unique_ptr<GeneticCode> gCode;
  CodonAlphabet* codonAlphabet = dynamic_cast<CodonAlphabet*>(alphabet);

  // Get sequences:
  bool aligned = ApplicationTools::getBooleanParameter("input.alignment", bppseqman.getParams(), false, "", true, 1);
  OrderedSequenceContainer* sequences = 0;

  if (aligned) {
    VectorSiteContainer* allSites = SequenceApplicationTools::getSiteContainer(alphabet, bppseqman.getParams());
    sequences = SequenceApplicationTools::getSitesToAnalyse(*allSites, bppseqman.getParams(), "", true, false);
    delete allSites;
  } else {
    SequenceContainer* tmp = SequenceApplicationTools::getSequenceContainer(alphabet, bppseqman.getParams(), "", true, true);
    sequences = new VectorSequenceContainer(*tmp);
    delete tmp;
  }

  ApplicationTools::displayResult("Number of sequences", sequences->getNumberOfSequences());
  
  // Perform manipulations
  
  vector<string> actions = ApplicationTools::getVectorParameter<string>("sequence.manip", bppseqman.getParams(), ',', "", "", false, 1);
  

  for (size_t a = 0; a < actions.size(); a++)
  {
    string cmdName;
    map<string, string> cmdArgs;
    KeyvalTools::parseProcedure(actions[a], cmdName, cmdArgs);
    ApplicationTools::displayResult("Performing action", cmdName);

    // +-----------------+
    // | Complementation |
    // +-----------------+
    if (cmdName == "Complement")
    {
      OrderedSequenceContainer* sc = 0;
      if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet());
      else         sc = new VectorSequenceContainer(sequences->getAlphabet());
      for (size_t i = 0; i < sequences->getNumberOfSequences(); i++)
      {
        Sequence* seq = SequenceTools::getComplement(sequences->getSequence(i));
        sc->addSequence(*seq, false);
        delete seq;
      }
      delete sequences;
      sequences = sc;
    }
    // +------------------------+
    // | (Reverse)Transcription |
    // +------------------------+
    else if (cmdName == "Transcript")
    {
      if (sequences->getAlphabet()->getAlphabetType() == AlphabetTools::DNA_ALPHABET.getAlphabetType())
      {
        OrderedSequenceContainer* sc = 0;
        if (aligned) sc = new VectorSiteContainer(&AlphabetTools::RNA_ALPHABET);
        else         sc = new VectorSequenceContainer(&AlphabetTools::RNA_ALPHABET);
        for (unsigned int i = 0; i < sequences->getNumberOfSequences(); i++)
        {
          Sequence* seq = SequenceTools::transcript(sequences->getSequence(i));
          sc->addSequence(*seq, false);
          delete seq;
        }
        delete sequences;
        sequences = sc;
      }
      else if (sequences->getAlphabet()->getAlphabetType() == AlphabetTools::RNA_ALPHABET.getAlphabetType())
      {
        OrderedSequenceContainer* sc = 0;
        if (aligned) sc = new VectorSiteContainer(&AlphabetTools::DNA_ALPHABET);
        else         sc = new VectorSequenceContainer(&AlphabetTools::DNA_ALPHABET);
        for (unsigned int i = 0; i < sequences->getNumberOfSequences(); i++)
        {
          Sequence* seq = SequenceTools::reverseTranscript(sequences->getSequence(i));
          sc->addSequence(*seq, false);
          delete seq;
        }
        delete sequences;
        sequences = sc;
      }
      else throw Exception("Transcription error: input alphabet must be of type 'nucleic'.");
    }
    // +-------------------------------+
    // | Switching nucleotide alphabet |
    // +-------------------------------+
    else if (cmdName == "Switch")
    {
      const Alphabet* alpha = 0;
      if (sequences->getAlphabet()->getAlphabetType() == AlphabetTools::DNA_ALPHABET.getAlphabetType())
      {
        alpha = &AlphabetTools::RNA_ALPHABET;
      }
      else if (sequences->getAlphabet()->getAlphabetType() == AlphabetTools::RNA_ALPHABET.getAlphabetType())
      {
        alpha = &AlphabetTools::DNA_ALPHABET;
      }
      else throw Exception("Cannot switch alphabet type, alphabet is not of type 'nucleic'.");
      OrderedSequenceContainer* sc = 0;
      if (aligned) sc = new VectorSiteContainer(alpha);
      else         sc = new VectorSequenceContainer(alpha);
      for (size_t i = 0; i < sequences->getNumberOfSequences(); i++)
      {
        const Sequence* old = &sequences->getSequence(i);
        vector<int> content(old->size());
        for (size_t j = 0; j < old->size(); ++j)
          content[j] = (*old)[j];
        Sequence* seq = new BasicSequence(old->getName(), content, old->getComments(), alpha);
        sc->addSequence(*seq, false);
        delete seq;
      }
      delete sequences;
      sequences = sc;
    }
    // +-------------+
    // | Translation |
    // +-------------+
    else if (cmdName == "Translate")
    {
      if (!AlphabetTools::isCodonAlphabet(sequences->getAlphabet()))
        throw Exception("Error in translation: alphabet is not of type 'codon'.");
      if (cmdArgs["code"] != "")
        throw Exception("ERROR: 'code' argument is deprecated. The genetic code to use for translation is now set by the top-level argument 'genetic_code'.");
      if (!gCode.get()) {
        string codeDesc = ApplicationTools::getStringParameter("genetic_code", bppseqman.getParams(), "Standard", "", true, 1);
        ApplicationTools::displayResult("Genetic Code", codeDesc);
        gCode.reset(SequenceApplicationTools::getGeneticCode(codonAlphabet->getNucleicAlphabet(), codeDesc));
      }

      OrderedSequenceContainer* sc = 0;
      if (aligned) sc = new VectorSiteContainer(&AlphabetTools::PROTEIN_ALPHABET);
      else         sc = new VectorSequenceContainer(&AlphabetTools::PROTEIN_ALPHABET);
      for (size_t i = 0; i < sequences->getNumberOfSequences(); ++i)
      {
        Sequence* seq = gCode->translate(sequences->getSequence(i));
        sc->addSequence(*seq, false);
        delete seq;
      }
      delete sequences;
      sequences = sc;      
    }
    // +-------------+
    // | Remove gaps |
    // +-------------+
    else if (cmdName == "RemoveGaps")
    {
      VectorSequenceContainer* sc = new VectorSequenceContainer(sequences->getAlphabet());
      for (size_t i = 0; i < sequences->getNumberOfSequences(); i++)
      {
        unique_ptr<Sequence> seq(sequences->getSequence(i).clone());
        SequenceTools::removeGaps(*seq);
        sc->addSequence(*seq);
      }
      delete sequences;
      sequences = sc;
      aligned = false;
    }
    // +---------------------------+
    // | Change gaps to unresolved |
    // +---------------------------+
    else if (cmdName == "GapToUnknown")
    {
      OrderedSequenceContainer* sc = 0;
      if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet());
      else         sc = new VectorSequenceContainer(sequences->getAlphabet());
      for (size_t i = 0; i < sequences->getNumberOfSequences(); i++)
      {
        Sequence* seq = new BasicSequence(sequences->getSequence(i));
        SymbolListTools::changeGapsToUnknownCharacters(*seq);
        sc->addSequence(*seq, false);
        delete seq;
      }
      delete sequences;
      sequences = sc;
    }
    // +---------------------------+
    // | Change unresolved to gaps |
    // +---------------------------+
    else if (cmdName == "UnknownToGap")
    {
      OrderedSequenceContainer* sc = 0;
      if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet());
      else         sc = new VectorSequenceContainer(sequences->getAlphabet());
      for (size_t i = 0; i < sequences->getNumberOfSequences(); i++)
      {
        Sequence* seq = new BasicSequence(sequences->getSequence(i));
        SymbolListTools::changeUnresolvedCharactersToGaps(*seq);
        sc->addSequence(*seq, false);
        delete seq;
      }
      delete sequences;
      sequences = sc;
    }
    
    // +--------------+
    // | Remove stops |
    // +--------------+
    else if (cmdName == "RemoveStops")
    {
      if (!gCode.get()) {
        string codeDesc = ApplicationTools::getStringParameter("genetic_code", bppseqman.getParams(), "Standard", "", true, 1);
        ApplicationTools::displayResult("Genetic Code", codeDesc);
        gCode.reset(SequenceApplicationTools::getGeneticCode(codonAlphabet->getNucleicAlphabet(), codeDesc));
      }
      SiteContainer* sites = dynamic_cast<SiteContainer*>(sequences);
      if (!sites)
      {
        VectorSequenceContainer* sc = new VectorSequenceContainer(sequences->getAlphabet());
        for (size_t i = 0; i < sequences->getNumberOfSequences(); ++i)
        {
          unique_ptr<Sequence> seq(sequences->getSequence(i).clone());
          SequenceTools::removeStops(*seq, *gCode);
          sc->addSequence(*seq);
        }
        delete sequences;
        sequences = sc;
      } else {
        VectorSiteContainer* sc = new VectorSiteContainer(sequences->getAlphabet());
        for (size_t i = 0; i < sequences->getNumberOfSequences(); ++i)
        {
          unique_ptr<Sequence> seq(sequences->getSequence(i).clone());
          SequenceTools::replaceStopsWithGaps(*seq, *gCode);
          sc->addSequence(*seq);
        }
        delete sequences;
        sequences = sc;
      }
    }

    // +--------------+
    // | Remove stops |
    // +--------------+
    else if (cmdName == "RemoveColumnsWithStops")
    {
      SiteContainer* sites = dynamic_cast<SiteContainer*>(sequences);
      if (!sites)
      {
        throw Exception("'RemoveColumnsWithStops' can only be used on alignment. You may consider using the 'CoerceToAlignment' command.");
      }
      if (!gCode.get()) {
        string codeDesc = ApplicationTools::getStringParameter("genetic_code", bppseqman.getParams(), "Standard", "", true, 1);
        ApplicationTools::displayResult("Genetic Code", codeDesc);
        gCode.reset(SequenceApplicationTools::getGeneticCode(codonAlphabet->getNucleicAlphabet(), codeDesc));
      }

      for (size_t i = sites->getNumberOfSites(); i > 0; i--)
      {
        if (CodonSiteTools::hasStop(sites->getSite(i-1), *gCode))
          sites->deleteSite(i - 1);
      }
    }

    // +---------+
    // | Get CDS |
    // +---------+
    else if (cmdName == "GetCDS")
    {
      if (!gCode.get()) {
        string codeDesc = ApplicationTools::getStringParameter("genetic_code", bppseqman.getParams(), "Standard", "", true, 1);
        ApplicationTools::displayResult("Genetic Code", codeDesc);
        gCode.reset(SequenceApplicationTools::getGeneticCode(codonAlphabet->getNucleicAlphabet(), codeDesc));
      }
      OrderedSequenceContainer* sc = 0;
      if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet());
      else         sc = new VectorSequenceContainer(sequences->getAlphabet());
      for (size_t i = 0; i < sequences->getNumberOfSequences(); ++i)
      {
        BasicSequence seq = sequences->getSequence(i);
        size_t len = seq.size();
        SequenceTools::getCDS(seq, *gCode, false, true, true, false);
        if (aligned) {
          for (size_t c = seq.size(); c < len; ++c)
            seq.addElement(seq.getAlphabet()->getGapCharacterCode());
        }
        sc->addSequence(seq, false);
      }
      delete sequences;
      sequences = sc;
    }

    // +--------------------------+
    // | Resolve dotted alignment |
    // +--------------------------+
    else if (actions[a] == "CoerceToAlignment")
    {
      SiteContainer* sites = dynamic_cast<SiteContainer*>(sequences);
      if(! sites)
      {
        sites = new VectorSiteContainer(*sequences);
        delete sequences;
        sequences = sites;
      }
      aligned = true;
    }
    else if (actions[a] == "ResolvedDotted")
    {
      SiteContainer* sites = dynamic_cast<SiteContainer *>(sequences);
      if (!sites)
      {
        throw Exception("'ResolvedDotted' can only be used on alignment. You may consider using the 'CoerceToAlignment' command.");
      }

      const Alphabet* alpha = 0;
      string alphastr = ApplicationTools::getStringParameter("alphabet", cmdArgs, "DNA", "", false, 1);
      if (alphastr == "DNA") alpha = &AlphabetTools::DNA_ALPHABET;
      else if (alphastr == "RNA") alpha = &AlphabetTools::RNA_ALPHABET;
      else if (alphastr == "Protein") alpha = &AlphabetTools::PROTEIN_ALPHABET;
      else throw Exception("Resolved alphabet must be one of [DNA|RNA|Protein] for solving dotted alignment.");
      OrderedSequenceContainer* resolvedCont = SiteContainerTools::resolveDottedAlignment(*sites, alpha);
      delete sequences;
      sequences = resolvedCont;
    }
    // +---------------------+
    // | Keep complete sites |
    // +---------------------+
    else if (cmdName == "KeepComplete")
    {
      SiteContainer* sites = dynamic_cast<SiteContainer *>(sequences);
      if (!sites)
      {
        throw Exception("'KeepComplete' can only be used on alignment. You may consider using the 'CoerceToAlignment' command.");
      }

      string maxGapOption = ApplicationTools::getStringParameter("maxGapAllowed", cmdArgs, "100%", "", false, 1);
      if (maxGapOption[maxGapOption.size()-1] == '%')
      {
        double gapFreq = TextTools::toDouble(maxGapOption.substr(0, maxGapOption.size()-1)) / 100.;
        for (size_t i = sites->getNumberOfSites(); i > 0; i--)
        {
          map<int, double> freqs;
          SiteTools::getFrequencies(sites->getSite(i - 1), freqs);
          if (freqs[-1] > gapFreq) sites->deleteSite(i - 1);
        }
      }
      else
      {
        size_t gapNum = TextTools::to<size_t>(maxGapOption);
        for (size_t i = sites->getNumberOfSites(); i > 0; i--)
        {
          map<int, size_t> counts;
          SiteTools::getCounts(sites->getSite(i - 1), counts);
          counts[-1]; //Needed in case this entry does not exist in the map. This will set it to 0.
          if (counts[-1] > gapNum) sites->deleteSite(i-1);
        }
      }
    }
    // +-----------------+
    // | Invert sequence |
    // +-----------------+
    else if (cmdName == "Invert")
    {
      OrderedSequenceContainer* sc = 0;
      if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet());
      else         sc = new VectorSequenceContainer(sequences->getAlphabet());
      for (size_t i = 0; i < sequences->getNumberOfSequences(); i++)
      {
        const Sequence* old = &sequences->getSequence(i);
        Sequence* seq = SequenceTools::getInvert(*old);
        sc->addSequence(*seq, false);
        delete seq;
      }
      delete sequences;
      sequences = sc;
    }
    // +------------------+
    // | GetCodonPosition |
    // +------------------+
    else if (cmdName == "GetCodonPosition")
    {
      unsigned int pos = ApplicationTools::getParameter<unsigned int>("position", cmdArgs, 3, "", false, 1);
      OrderedSequenceContainer* sc = dynamic_cast<OrderedSequenceContainer*>(SequenceContainerTools::getCodonPosition(*sequences, pos - 1));
      delete sequences;
      if (aligned) {
        sequences = new VectorSiteContainer(*sc);
        delete sc;
      } else {
        sequences = sc;
      }
    }
    // +-----------------+
    // | FilterFromTree |
    // +-----------------+
    else if (cmdName == "FilterFromTree")
    {
      unique_ptr<Tree> tree(PhylogeneticsApplicationTools::getTree(cmdArgs, ""));
      vector<string> names = tree->getLeavesNames();
      OrderedSequenceContainer* reorderedSequences = 0;
      if (aligned) {
        reorderedSequences = new VectorSiteContainer(sequences->getAlphabet());
      } else {
        reorderedSequences = new VectorSequenceContainer(sequences->getAlphabet());
      }
      for (size_t i = 0; i < names.size(); ++i) {
        reorderedSequences->addSequence(sequences->getSequence(names[i]), false);
      }
      delete sequences;
      sequences = reorderedSequences;
    }
    // +----------------------+
    // | RemoveEmptySequences |
    // +----------------------+
    else if (cmdName == "RemoveEmptySequences")
    {
      OrderedSequenceContainer* sc = 0;
      if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet());
      else         sc = new VectorSequenceContainer(sequences->getAlphabet());
      for (size_t i = 0; i < sequences->getNumberOfSequences(); ++i)
      {
        if (SequenceTools::getNumberOfSites(sequences->getSequence(i))!=0)
          sc->addSequence(sequences->getSequence(i), false);
      }
      delete sequences;
      sequences = sc;
    }

    else throw Exception("Unknown action: " + cmdName);
  }
  
  // Write sequences
  ApplicationTools::displayBooleanResult("Final sequences are aligned", aligned);
  if (aligned)
  {
    SequenceApplicationTools::writeAlignmentFile(*dynamic_cast<SiteContainer*>(sequences), bppseqman.getParams(), "", true, 1);
  }
  else
  {
    SequenceApplicationTools::writeSequenceFile(*sequences, bppseqman.getParams(), "", true, 1);
  }

  delete alphabet;
  delete sequences;

  bppseqman.done();

  } catch(exception & e) {
    cout << e.what() << endl;
    return 1;
  }

  return 0;
}
Esempio n. 21
0
void Clustal::appendAlignmentFromStream(std::istream& input, SiteContainer & sc) const throw (Exception)
{
  // Checking the existence of specified file
  if (!input) { throw IOException ("Clustal::read : fail to open file"); }

  const Alphabet * alpha = sc.getAlphabet();
  vector<BasicSequence> sequences;

  string lineRead("");

  Comments comments(1);
  comments[0] = FileTools::getNextLine(input); // First line gives file generator.

  lineRead = FileTools::getNextLine(input); // This is the first sequence of the first block.
    
  string::size_type beginSeq = 0;
  unsigned int count = 0;
  for (size_t i = lineRead.size(); i > 0; i--) {
    char c = lineRead[i-1];
    if (c == ' ') {
      count++;
      if (count == nbSpacesBeforeSeq_) {
        beginSeq = i - 1 + nbSpacesBeforeSeq_;
        break;
      }
    }
    else count = 0;
  }
  if (beginSeq == 0) throw IOException("Clustal::read. Bad intput file.");

  unsigned int countSequences = 0;

  //Read first sequences block:
  bool test = true;
  do {
    sequences.push_back(BasicSequence(TextTools::removeSurroundingWhiteSpaces(lineRead.substr(0, beginSeq - nbSpacesBeforeSeq_)), lineRead.substr(beginSeq), alpha));
    getline(input, lineRead, '\n');
    countSequences++;
    test = !TextTools::isEmpty(lineRead) && !TextTools::isEmpty(lineRead.substr(0, beginSeq - nbSpacesBeforeSeq_));
  }
  while (input && test);

  // Read other blocks
  lineRead = FileTools::getNextLine(input); // Read first sequence of next block.
  while (!TextTools::isEmpty(lineRead)) {
    // Read next block:
    for (unsigned int i = 0; i < countSequences; ++i) {
      // Complete sequences
      if (TextTools::isEmpty(lineRead))
        throw IOException("Clustal::read. Bad intput file.");
       sequences[i].append(lineRead.substr(beginSeq));
      getline(input, lineRead, '\n');
    }
    //At this point, lineRead is the first line after the current block.
    lineRead = FileTools::getNextLine(input);
  }

  for (unsigned int i = 0; i < countSequences; ++i)
    sc.addSequence(sequences[i], checkNames_);
  sc.setGeneralComments(comments);
}
Esempio n. 22
0
void RecursiveLikelihoodTree::initLikelihoodsWithoutPatterns_(const Node* node, const SiteContainer& sequences, const SubstitutionProcess& process) throw (Exception)
{
  int nId = node->getId();

  // Initialize likelihood vector:
  if (!node->hasFather())
  {
    resetAboveLikelihoods(nId, nbDistinctSites_, nbStates_);
    resetLikelihoods(nId, nbDistinctSites_, nbStates_, ComputingNode::D0);

    resetLikelihoods(nId, nbDistinctSites_, nbStates_, ComputingNode::D1);
    resetLikelihoods(nId, nbDistinctSites_, nbStates_, ComputingNode::D2);
  }


  resetBelowLikelihoods(nId, nbDistinctSites_, nbStates_, ComputingNode::D0);
  resetBelowLikelihoods(nId, nbDistinctSites_, nbStates_, ComputingNode::D1);
  resetBelowLikelihoods(nId, nbDistinctSites_, nbStates_, ComputingNode::D2);


  // Now initialize likelihood values and pointers:

  if (node->hasNoSon())
  {
    const Sequence* seq;
    try
    {
      seq = &sequences.getSequence(node->getName());
    }
    catch (SequenceNotFoundException snfe)
    {
      throw SequenceNotFoundException("RecursiveLikelihoodTree::initTreelikelihoods. Leaf name in tree not found in site conainer: ", (node->getName()));
    }

    for (size_t c = 0; c < nbClasses_; c++)
    {
      RecursiveLikelihoodNode& lNode = *dynamic_cast<RecursiveLikelihoodNode*>(vTree_[c]->getNode(nId));
      VVdouble& array = lNode.getBelowLikelihoodArray_(ComputingNode::D0);

      for (size_t i = 0; i < nbDistinctSites_; i++)
      {
        Vdouble* array_i = &array[i];
        int state = seq->getValue(i);
        double test = 0.;
        for (size_t s = 0; s < nbStates_; s++)
        {
          double x = process.getInitValue(s, state);
          if (lNode.usesLog())
          {
            if (x <= 0)
              (*array_i)[s] = -10000;
            else
              (*array_i)[s] = log(x);
          }
          else
            (*array_i)[s] = x;

          test += x;
        }
        if (test < 0.000001)
          std::cerr << "WARNING!!! Likelihood will be 0 for this site " << TextTools::toString(i) << std::endl;
      }
      lNode.updateBelow_(true, ComputingNode::D0);
    }
  }
  else
  {
    // 'node' is an internal node.
    std::map<int, std::vector<size_t> >* patternLinks_node = &patternLinks_[nId];
    int nbSonNodes = static_cast<int>(node->getNumberOfSons());
    for (int l = 0; l < nbSonNodes; ++l)
    {
      // For each son node,
      const Node* son = (*node)[l];
      initLikelihoodsWithoutPatterns_(son, sequences, process);
      std::vector<size_t>* patternLinks_node_son = &(*patternLinks_node)[son->getId()];

      // Init map:
      patternLinks_node_son->resize(nbDistinctSites_);

      for (size_t i = 0; i < nbDistinctSites_; i++)
      {
        (*patternLinks_node_son)[i] = i;
      }
    }
  }

  if (!node->hasFather())
    setAboveLikelihoods(nId, process.getRootFrequencies());
}