Example #1
0
Tree* TreeTools::MRPMultilabel(const vector<Tree*>& vecTr)
{
    // matrix representation
    VectorSiteContainer* sites = TreeTools::MRPEncode(vecTr);

    // starting bioNJ tree
    const DNA* alphabet = dynamic_cast<const DNA*>(sites->getAlphabet());
    JCnuc* jc = new JCnuc(alphabet);
    ConstantDistribution* constRate = new ConstantDistribution(1.);
    DistanceEstimation distFunc(jc, constRate, sites, 0, true);
    BioNJ bionjTreeBuilder(false, false);
    bionjTreeBuilder.setDistanceMatrix(*(distFunc.getMatrix()));
    bionjTreeBuilder.computeTree();
    if (ApplicationTools::message)
        ApplicationTools::message->endLine();
    TreeTemplate<Node>* startTree = new TreeTemplate<Node>(*bionjTreeBuilder.getTree());

    // MP optimization
    DRTreeParsimonyScore* MPScore = new DRTreeParsimonyScore(*startTree, *sites, false);
    MPScore = OptimizationTools::optimizeTreeNNI(MPScore, 0);
    delete startTree;
    Tree* retTree = new TreeTemplate<Node>(MPScore->getTree());
    delete MPScore;

    return retTree;
}
VectorSiteContainer* SiteContainerTools::sampleSites(const SiteContainer& sites, size_t nbSites, vector<size_t>* index)
{
  VectorSiteContainer* sample = new VectorSiteContainer(sites.getSequencesNames(), sites.getAlphabet());
  for (size_t i = 0; i < nbSites; i++)
  {
    size_t pos = static_cast<size_t>(RandomTools::giveIntRandomNumberBetweenZeroAndEntry(static_cast<int>(sites.getNumberOfSites())));
    sample->addSite(sites.getSite(pos), false);
    if (index)
      index->push_back(pos);
  }
  return sample;
}
SiteContainer* SiteContainerTools::getCompleteSites(const SiteContainer& sites)
{
  vector<string> seqNames = sites.getSequencesNames();
  VectorSiteContainer* noGapCont = new VectorSiteContainer(seqNames.size(), sites.getAlphabet());
  noGapCont->setSequencesNames(seqNames, false);
  CompleteSiteContainerIterator csi(sites);
  while (csi.hasMoreSites())
  {
    noGapCont->addSite(*csi.nextSite());
  }
  return noGapCont;
}
SiteContainer* SiteContainerTools::removeGapSites(const SiteContainer& sites, double maxFreqGaps)
{
  vector<string> seqNames = sites.getSequencesNames();
  VectorSiteContainer* noGapCont = new VectorSiteContainer(seqNames.size(), sites.getAlphabet());
  noGapCont->setSequencesNames(seqNames, false);
  for (unsigned int i = 0; i < sites.getNumberOfSites(); ++i) {
    map<int, double> freq;
    SiteTools::getFrequencies(sites.getSite(i), freq);
    if (freq[-1] <= maxFreqGaps)
      noGapCont->addSite(sites.getSite(i), false);
  }
  return noGapCont;
}
SiteContainer* SiteContainerTools::removeGapOrUnresolvedOnlySites(const SiteContainer& sites)
{
  vector<string> seqNames = sites.getSequencesNames();
  VectorSiteContainer* noGapCont = new VectorSiteContainer(seqNames.size(), sites.getAlphabet());
  noGapCont->setSequencesNames(seqNames, false);
  for (unsigned int i = 0; i < sites.getNumberOfSites(); i++)
  {
    const Site* site = &sites.getSite(i);
    if (!SiteTools::isGapOrUnresolvedOnly(*site))
      noGapCont->addSite(*site, false);
  }
  return noGapCont;
}
SiteContainer* SiteContainerTools::getSelectedSites(
  const SiteContainer& sequences,
  const SiteSelection& selection)
{
  vector<string> seqNames = sequences.getSequencesNames();
  VectorSiteContainer* sc = new VectorSiteContainer(seqNames.size(), sequences.getAlphabet());
  sc->setSequencesNames(seqNames, false);
  for (unsigned int i = 0; i < selection.size(); i++)
  {
    sc->addSite(sequences.getSite(selection[i]), false);
    // We do not check names, we suppose that the container passed as an argument is correct.
    // WARNING: what if selection contains many times the same indice? ...
  }
  sc->setGeneralComments(sequences.getGeneralComments());
  return sc;
}
SiteContainer* SiteContainerTools::removeStopCodonSites(const SiteContainer& sites)  throw (AlphabetException)
{
  const CodonAlphabet* pca = dynamic_cast<const CodonAlphabet*>(sites.getAlphabet());
  if (!pca)
    throw AlphabetException("Not a Codon Alphabet", sites.getAlphabet());
  vector<string> seqNames = sites.getSequencesNames();
  VectorSiteContainer* noStopCont = new VectorSiteContainer(seqNames.size(), sites.getAlphabet());
  noStopCont->setSequencesNames(seqNames, false);
  for (unsigned int i = 0; i < sites.getNumberOfSites(); i++)
  {
    const Site* site = &sites.getSite(i);
    if (!SiteTools::hasStopCodon(*site))
      noStopCont->addSite(*site, false);
  }
  return noStopCont;
}
VectorSiteContainer::VectorSiteContainer(const VectorSiteContainer& vsc) :
  AbstractSequenceContainer(vsc),
  sites_(0),
  names_(vsc.names_),
  comments_(vsc.getNumberOfSequences()),
  sequences_(vsc.getNumberOfSequences())
{
  // Now try to add each site:
  for (size_t i = 0; i < vsc.getNumberOfSites(); i++)
  {
    addSite(vsc.getSite(i), false); // We assume that positions are correct.
  }
  // Seq comments:
  for (size_t i = 0; i < vsc.getNumberOfSequences(); i++)
  {
    comments_[i] = new Comments(vsc.getComments(i));
  }
}
VectorSiteContainer* SequenceApplicationTools::getSiteContainer(
  const Alphabet* alpha,
  map<string, string>& params,
  const string& suffix,
  bool suffixIsOptional,
  bool verbose,
  int warn)
{
  string sequenceFilePath = ApplicationTools::getAFilePath("input.sequence.file", params, true, true, suffix, suffixIsOptional, "none", warn);
  string sequenceFormat = ApplicationTools::getStringParameter("input.sequence.format", params, "Fasta()", suffix, suffixIsOptional, warn);
  BppOAlignmentReaderFormat bppoReader(warn);
  unique_ptr<IAlignment> iAln(bppoReader.read(sequenceFormat));
  map<string, string> args(bppoReader.getUnparsedArguments());
  if (verbose)
  {
    ApplicationTools::displayResult("Sequence file " + suffix, sequenceFilePath);
    ApplicationTools::displayResult("Sequence format " + suffix, iAln->getFormatName());
  }

  const Alphabet* alpha2;
  if (AlphabetTools::isRNYAlphabet(alpha))
    alpha2 = &dynamic_cast<const RNY*>(alpha)->getLetterAlphabet();
  else
    alpha2 = alpha;

  const SequenceContainer* seqCont = iAln->readAlignment(sequenceFilePath, alpha2);

  VectorSiteContainer* sites2 = new VectorSiteContainer(*dynamic_cast<const OrderedSequenceContainer*>(seqCont));

  delete seqCont;

  VectorSiteContainer* sites;

  if (AlphabetTools::isRNYAlphabet(alpha))
  {
    const SequenceTools ST;
    sites = new VectorSiteContainer(alpha);
    for (unsigned int i = 0; i < sites2->getNumberOfSequences(); i++)
    {
      sites->addSequence(*(ST.RNYslice(sites2->getSequence(i))));
    }
    delete sites2;
  }
  else
    sites = sites2;

  // Look for site selection:
  if (iAln->getFormatName() == "MASE file")
  {
    // getting site set:
    string siteSet = ApplicationTools::getStringParameter("siteSelection", args, "none", suffix, suffixIsOptional, warn + 1);
    if (siteSet != "none")
    {
      VectorSiteContainer* selectedSites;
      try
      {
        selectedSites = dynamic_cast<VectorSiteContainer*>(MaseTools::getSelectedSites(*sites, siteSet));
        if (verbose)
          ApplicationTools::displayResult("Set found", TextTools::toString(siteSet) + " sites.");
      }
      catch (IOException& ioe)
      {
        throw ioe;
      }
      if (selectedSites->getNumberOfSites() == 0)
      {
        throw Exception("Site set '" + siteSet + "' is empty.");
      }
      delete sites;
      sites = selectedSites;
    }
  }
  else
  {
    // getting site set:
    size_t nbSites = sites->getNumberOfSites();

    string siteSet = ApplicationTools::getStringParameter("input.site.selection", params, "none", suffix, suffixIsOptional, warn + 1);

    VectorSiteContainer* selectedSites = 0;
    if (siteSet != "none")
    {
      vector<size_t> vSite;
      try
      {
        vector<int> vSite1 = NumCalcApplicationTools::seqFromString(siteSet,",",":");
        for (size_t i = 0; i < vSite1.size(); ++i)
        {
          int x = (vSite1[i] >= 0 ? vSite1[i] : static_cast<int>(nbSites) + vSite1[i]+ 1);
          if (x<=(int)nbSites)
          {
            if (x > 0)
              vSite.push_back(static_cast<size_t>(x - 1));
            else
              throw Exception("SequenceApplicationTools::getSiteContainer(). Incorrect null index: " + TextTools::toString(x));
          }
          else
            throw Exception("SequenceApplicationTools::getSiteContainer(). Too large index: " + TextTools::toString(x));
        }
        selectedSites = dynamic_cast<VectorSiteContainer*>(SiteContainerTools::getSelectedSites(*sites, vSite));
        selectedSites->reindexSites();
      }
      catch (Exception& e)
      {
        string seln;
        map<string, string> selArgs;
        KeyvalTools::parseProcedure(siteSet, seln, selArgs);
        if (seln == "Sample")
        {
          size_t n = ApplicationTools::getParameter<size_t>("n", selArgs, nbSites, "", true, warn + 1);
          bool replace = ApplicationTools::getBooleanParameter("replace", selArgs, false, "", true, warn + 1);

          vSite.resize(n);
          vector<size_t> vPos;
          for (size_t p = 0; p < nbSites; ++p)
          {
            vPos.push_back(p);
          }

          RandomTools::getSample(vPos, vSite, replace);

          selectedSites = dynamic_cast<VectorSiteContainer*>(SiteContainerTools::getSelectedSites(*sites, vSite));
          if (replace)
            selectedSites->reindexSites();
        }
        else
          throw Exception("Unknown site selection description: " + siteSet);
      }

      if (verbose)
        ApplicationTools::displayResult("Selected sites", TextTools::toString(siteSet));

      if (selectedSites && (selectedSites->getNumberOfSites() == 0))
      {
        throw Exception("Site set '" + siteSet + "' is empty.");
      }
      delete sites;
      sites = selectedSites;
    }
  }
  return sites;
}
VectorSiteContainer* VectorSiteContainer::createEmptyContainer() const
{
  VectorSiteContainer* vsc = new VectorSiteContainer(getAlphabet());
  vsc->setGeneralComments(getGeneralComments());
  return vsc;
}
Example #11
0
int main(int args, char** argv)
{
  cout << "******************************************************************" << endl;
  cout << "*           Bio++ Sequence Manipulator, version 2.3.0.           *" << endl;
  cout << "* Author: J. Dutheil                        Last Modif. 25/11/14 *" << endl;
  cout << "******************************************************************" << endl;
  cout << endl;
  
  if (args == 1)
  {
    help();
    return 0;
  }
  
  try {

  BppApplication bppseqman(args, argv, "BppSeqMan");
  bppseqman.startTimer();
  
  // Get alphabet
  Alphabet* alphabet = SequenceApplicationTools::getAlphabet(bppseqman.getParams(), "", false, true, true);
  unique_ptr<GeneticCode> gCode;
  CodonAlphabet* codonAlphabet = dynamic_cast<CodonAlphabet*>(alphabet);

  // Get sequences:
  bool aligned = ApplicationTools::getBooleanParameter("input.alignment", bppseqman.getParams(), false, "", true, 1);
  OrderedSequenceContainer* sequences = 0;

  if (aligned) {
    VectorSiteContainer* allSites = SequenceApplicationTools::getSiteContainer(alphabet, bppseqman.getParams());
    sequences = SequenceApplicationTools::getSitesToAnalyse(*allSites, bppseqman.getParams(), "", true, false);
    delete allSites;
  } else {
    SequenceContainer* tmp = SequenceApplicationTools::getSequenceContainer(alphabet, bppseqman.getParams(), "", true, true);
    sequences = new VectorSequenceContainer(*tmp);
    delete tmp;
  }

  ApplicationTools::displayResult("Number of sequences", sequences->getNumberOfSequences());
  
  // Perform manipulations
  
  vector<string> actions = ApplicationTools::getVectorParameter<string>("sequence.manip", bppseqman.getParams(), ',', "", "", false, 1);
  

  for (size_t a = 0; a < actions.size(); a++)
  {
    string cmdName;
    map<string, string> cmdArgs;
    KeyvalTools::parseProcedure(actions[a], cmdName, cmdArgs);
    ApplicationTools::displayResult("Performing action", cmdName);

    // +-----------------+
    // | Complementation |
    // +-----------------+
    if (cmdName == "Complement")
    {
      OrderedSequenceContainer* sc = 0;
      if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet());
      else         sc = new VectorSequenceContainer(sequences->getAlphabet());
      for (size_t i = 0; i < sequences->getNumberOfSequences(); i++)
      {
        Sequence* seq = SequenceTools::getComplement(sequences->getSequence(i));
        sc->addSequence(*seq, false);
        delete seq;
      }
      delete sequences;
      sequences = sc;
    }
    // +------------------------+
    // | (Reverse)Transcription |
    // +------------------------+
    else if (cmdName == "Transcript")
    {
      if (sequences->getAlphabet()->getAlphabetType() == AlphabetTools::DNA_ALPHABET.getAlphabetType())
      {
        OrderedSequenceContainer* sc = 0;
        if (aligned) sc = new VectorSiteContainer(&AlphabetTools::RNA_ALPHABET);
        else         sc = new VectorSequenceContainer(&AlphabetTools::RNA_ALPHABET);
        for (unsigned int i = 0; i < sequences->getNumberOfSequences(); i++)
        {
          Sequence* seq = SequenceTools::transcript(sequences->getSequence(i));
          sc->addSequence(*seq, false);
          delete seq;
        }
        delete sequences;
        sequences = sc;
      }
      else if (sequences->getAlphabet()->getAlphabetType() == AlphabetTools::RNA_ALPHABET.getAlphabetType())
      {
        OrderedSequenceContainer* sc = 0;
        if (aligned) sc = new VectorSiteContainer(&AlphabetTools::DNA_ALPHABET);
        else         sc = new VectorSequenceContainer(&AlphabetTools::DNA_ALPHABET);
        for (unsigned int i = 0; i < sequences->getNumberOfSequences(); i++)
        {
          Sequence* seq = SequenceTools::reverseTranscript(sequences->getSequence(i));
          sc->addSequence(*seq, false);
          delete seq;
        }
        delete sequences;
        sequences = sc;
      }
      else throw Exception("Transcription error: input alphabet must be of type 'nucleic'.");
    }
    // +-------------------------------+
    // | Switching nucleotide alphabet |
    // +-------------------------------+
    else if (cmdName == "Switch")
    {
      const Alphabet* alpha = 0;
      if (sequences->getAlphabet()->getAlphabetType() == AlphabetTools::DNA_ALPHABET.getAlphabetType())
      {
        alpha = &AlphabetTools::RNA_ALPHABET;
      }
      else if (sequences->getAlphabet()->getAlphabetType() == AlphabetTools::RNA_ALPHABET.getAlphabetType())
      {
        alpha = &AlphabetTools::DNA_ALPHABET;
      }
      else throw Exception("Cannot switch alphabet type, alphabet is not of type 'nucleic'.");
      OrderedSequenceContainer* sc = 0;
      if (aligned) sc = new VectorSiteContainer(alpha);
      else         sc = new VectorSequenceContainer(alpha);
      for (size_t i = 0; i < sequences->getNumberOfSequences(); i++)
      {
        const Sequence* old = &sequences->getSequence(i);
        vector<int> content(old->size());
        for (size_t j = 0; j < old->size(); ++j)
          content[j] = (*old)[j];
        Sequence* seq = new BasicSequence(old->getName(), content, old->getComments(), alpha);
        sc->addSequence(*seq, false);
        delete seq;
      }
      delete sequences;
      sequences = sc;
    }
    // +-------------+
    // | Translation |
    // +-------------+
    else if (cmdName == "Translate")
    {
      if (!AlphabetTools::isCodonAlphabet(sequences->getAlphabet()))
        throw Exception("Error in translation: alphabet is not of type 'codon'.");
      if (cmdArgs["code"] != "")
        throw Exception("ERROR: 'code' argument is deprecated. The genetic code to use for translation is now set by the top-level argument 'genetic_code'.");
      if (!gCode.get()) {
        string codeDesc = ApplicationTools::getStringParameter("genetic_code", bppseqman.getParams(), "Standard", "", true, 1);
        ApplicationTools::displayResult("Genetic Code", codeDesc);
        gCode.reset(SequenceApplicationTools::getGeneticCode(codonAlphabet->getNucleicAlphabet(), codeDesc));
      }

      OrderedSequenceContainer* sc = 0;
      if (aligned) sc = new VectorSiteContainer(&AlphabetTools::PROTEIN_ALPHABET);
      else         sc = new VectorSequenceContainer(&AlphabetTools::PROTEIN_ALPHABET);
      for (size_t i = 0; i < sequences->getNumberOfSequences(); ++i)
      {
        Sequence* seq = gCode->translate(sequences->getSequence(i));
        sc->addSequence(*seq, false);
        delete seq;
      }
      delete sequences;
      sequences = sc;      
    }
    // +-------------+
    // | Remove gaps |
    // +-------------+
    else if (cmdName == "RemoveGaps")
    {
      VectorSequenceContainer* sc = new VectorSequenceContainer(sequences->getAlphabet());
      for (size_t i = 0; i < sequences->getNumberOfSequences(); i++)
      {
        unique_ptr<Sequence> seq(sequences->getSequence(i).clone());
        SequenceTools::removeGaps(*seq);
        sc->addSequence(*seq);
      }
      delete sequences;
      sequences = sc;
      aligned = false;
    }
    // +---------------------------+
    // | Change gaps to unresolved |
    // +---------------------------+
    else if (cmdName == "GapToUnknown")
    {
      OrderedSequenceContainer* sc = 0;
      if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet());
      else         sc = new VectorSequenceContainer(sequences->getAlphabet());
      for (size_t i = 0; i < sequences->getNumberOfSequences(); i++)
      {
        Sequence* seq = new BasicSequence(sequences->getSequence(i));
        SymbolListTools::changeGapsToUnknownCharacters(*seq);
        sc->addSequence(*seq, false);
        delete seq;
      }
      delete sequences;
      sequences = sc;
    }
    // +---------------------------+
    // | Change unresolved to gaps |
    // +---------------------------+
    else if (cmdName == "UnknownToGap")
    {
      OrderedSequenceContainer* sc = 0;
      if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet());
      else         sc = new VectorSequenceContainer(sequences->getAlphabet());
      for (size_t i = 0; i < sequences->getNumberOfSequences(); i++)
      {
        Sequence* seq = new BasicSequence(sequences->getSequence(i));
        SymbolListTools::changeUnresolvedCharactersToGaps(*seq);
        sc->addSequence(*seq, false);
        delete seq;
      }
      delete sequences;
      sequences = sc;
    }
    
    // +--------------+
    // | Remove stops |
    // +--------------+
    else if (cmdName == "RemoveStops")
    {
      if (!gCode.get()) {
        string codeDesc = ApplicationTools::getStringParameter("genetic_code", bppseqman.getParams(), "Standard", "", true, 1);
        ApplicationTools::displayResult("Genetic Code", codeDesc);
        gCode.reset(SequenceApplicationTools::getGeneticCode(codonAlphabet->getNucleicAlphabet(), codeDesc));
      }
      SiteContainer* sites = dynamic_cast<SiteContainer*>(sequences);
      if (!sites)
      {
        VectorSequenceContainer* sc = new VectorSequenceContainer(sequences->getAlphabet());
        for (size_t i = 0; i < sequences->getNumberOfSequences(); ++i)
        {
          unique_ptr<Sequence> seq(sequences->getSequence(i).clone());
          SequenceTools::removeStops(*seq, *gCode);
          sc->addSequence(*seq);
        }
        delete sequences;
        sequences = sc;
      } else {
        VectorSiteContainer* sc = new VectorSiteContainer(sequences->getAlphabet());
        for (size_t i = 0; i < sequences->getNumberOfSequences(); ++i)
        {
          unique_ptr<Sequence> seq(sequences->getSequence(i).clone());
          SequenceTools::replaceStopsWithGaps(*seq, *gCode);
          sc->addSequence(*seq);
        }
        delete sequences;
        sequences = sc;
      }
    }

    // +--------------+
    // | Remove stops |
    // +--------------+
    else if (cmdName == "RemoveColumnsWithStops")
    {
      SiteContainer* sites = dynamic_cast<SiteContainer*>(sequences);
      if (!sites)
      {
        throw Exception("'RemoveColumnsWithStops' can only be used on alignment. You may consider using the 'CoerceToAlignment' command.");
      }
      if (!gCode.get()) {
        string codeDesc = ApplicationTools::getStringParameter("genetic_code", bppseqman.getParams(), "Standard", "", true, 1);
        ApplicationTools::displayResult("Genetic Code", codeDesc);
        gCode.reset(SequenceApplicationTools::getGeneticCode(codonAlphabet->getNucleicAlphabet(), codeDesc));
      }

      for (size_t i = sites->getNumberOfSites(); i > 0; i--)
      {
        if (CodonSiteTools::hasStop(sites->getSite(i-1), *gCode))
          sites->deleteSite(i - 1);
      }
    }

    // +---------+
    // | Get CDS |
    // +---------+
    else if (cmdName == "GetCDS")
    {
      if (!gCode.get()) {
        string codeDesc = ApplicationTools::getStringParameter("genetic_code", bppseqman.getParams(), "Standard", "", true, 1);
        ApplicationTools::displayResult("Genetic Code", codeDesc);
        gCode.reset(SequenceApplicationTools::getGeneticCode(codonAlphabet->getNucleicAlphabet(), codeDesc));
      }
      OrderedSequenceContainer* sc = 0;
      if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet());
      else         sc = new VectorSequenceContainer(sequences->getAlphabet());
      for (size_t i = 0; i < sequences->getNumberOfSequences(); ++i)
      {
        BasicSequence seq = sequences->getSequence(i);
        size_t len = seq.size();
        SequenceTools::getCDS(seq, *gCode, false, true, true, false);
        if (aligned) {
          for (size_t c = seq.size(); c < len; ++c)
            seq.addElement(seq.getAlphabet()->getGapCharacterCode());
        }
        sc->addSequence(seq, false);
      }
      delete sequences;
      sequences = sc;
    }

    // +--------------------------+
    // | Resolve dotted alignment |
    // +--------------------------+
    else if (actions[a] == "CoerceToAlignment")
    {
      SiteContainer* sites = dynamic_cast<SiteContainer*>(sequences);
      if(! sites)
      {
        sites = new VectorSiteContainer(*sequences);
        delete sequences;
        sequences = sites;
      }
      aligned = true;
    }
    else if (actions[a] == "ResolvedDotted")
    {
      SiteContainer* sites = dynamic_cast<SiteContainer *>(sequences);
      if (!sites)
      {
        throw Exception("'ResolvedDotted' can only be used on alignment. You may consider using the 'CoerceToAlignment' command.");
      }

      const Alphabet* alpha = 0;
      string alphastr = ApplicationTools::getStringParameter("alphabet", cmdArgs, "DNA", "", false, 1);
      if (alphastr == "DNA") alpha = &AlphabetTools::DNA_ALPHABET;
      else if (alphastr == "RNA") alpha = &AlphabetTools::RNA_ALPHABET;
      else if (alphastr == "Protein") alpha = &AlphabetTools::PROTEIN_ALPHABET;
      else throw Exception("Resolved alphabet must be one of [DNA|RNA|Protein] for solving dotted alignment.");
      OrderedSequenceContainer* resolvedCont = SiteContainerTools::resolveDottedAlignment(*sites, alpha);
      delete sequences;
      sequences = resolvedCont;
    }
    // +---------------------+
    // | Keep complete sites |
    // +---------------------+
    else if (cmdName == "KeepComplete")
    {
      SiteContainer* sites = dynamic_cast<SiteContainer *>(sequences);
      if (!sites)
      {
        throw Exception("'KeepComplete' can only be used on alignment. You may consider using the 'CoerceToAlignment' command.");
      }

      string maxGapOption = ApplicationTools::getStringParameter("maxGapAllowed", cmdArgs, "100%", "", false, 1);
      if (maxGapOption[maxGapOption.size()-1] == '%')
      {
        double gapFreq = TextTools::toDouble(maxGapOption.substr(0, maxGapOption.size()-1)) / 100.;
        for (size_t i = sites->getNumberOfSites(); i > 0; i--)
        {
          map<int, double> freqs;
          SiteTools::getFrequencies(sites->getSite(i - 1), freqs);
          if (freqs[-1] > gapFreq) sites->deleteSite(i - 1);
        }
      }
      else
      {
        size_t gapNum = TextTools::to<size_t>(maxGapOption);
        for (size_t i = sites->getNumberOfSites(); i > 0; i--)
        {
          map<int, size_t> counts;
          SiteTools::getCounts(sites->getSite(i - 1), counts);
          counts[-1]; //Needed in case this entry does not exist in the map. This will set it to 0.
          if (counts[-1] > gapNum) sites->deleteSite(i-1);
        }
      }
    }
    // +-----------------+
    // | Invert sequence |
    // +-----------------+
    else if (cmdName == "Invert")
    {
      OrderedSequenceContainer* sc = 0;
      if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet());
      else         sc = new VectorSequenceContainer(sequences->getAlphabet());
      for (size_t i = 0; i < sequences->getNumberOfSequences(); i++)
      {
        const Sequence* old = &sequences->getSequence(i);
        Sequence* seq = SequenceTools::getInvert(*old);
        sc->addSequence(*seq, false);
        delete seq;
      }
      delete sequences;
      sequences = sc;
    }
    // +------------------+
    // | GetCodonPosition |
    // +------------------+
    else if (cmdName == "GetCodonPosition")
    {
      unsigned int pos = ApplicationTools::getParameter<unsigned int>("position", cmdArgs, 3, "", false, 1);
      OrderedSequenceContainer* sc = dynamic_cast<OrderedSequenceContainer*>(SequenceContainerTools::getCodonPosition(*sequences, pos - 1));
      delete sequences;
      if (aligned) {
        sequences = new VectorSiteContainer(*sc);
        delete sc;
      } else {
        sequences = sc;
      }
    }
    // +-----------------+
    // | FilterFromTree |
    // +-----------------+
    else if (cmdName == "FilterFromTree")
    {
      unique_ptr<Tree> tree(PhylogeneticsApplicationTools::getTree(cmdArgs, ""));
      vector<string> names = tree->getLeavesNames();
      OrderedSequenceContainer* reorderedSequences = 0;
      if (aligned) {
        reorderedSequences = new VectorSiteContainer(sequences->getAlphabet());
      } else {
        reorderedSequences = new VectorSequenceContainer(sequences->getAlphabet());
      }
      for (size_t i = 0; i < names.size(); ++i) {
        reorderedSequences->addSequence(sequences->getSequence(names[i]), false);
      }
      delete sequences;
      sequences = reorderedSequences;
    }
    // +----------------------+
    // | RemoveEmptySequences |
    // +----------------------+
    else if (cmdName == "RemoveEmptySequences")
    {
      OrderedSequenceContainer* sc = 0;
      if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet());
      else         sc = new VectorSequenceContainer(sequences->getAlphabet());
      for (size_t i = 0; i < sequences->getNumberOfSequences(); ++i)
      {
        if (SequenceTools::getNumberOfSites(sequences->getSequence(i))!=0)
          sc->addSequence(sequences->getSequence(i), false);
      }
      delete sequences;
      sequences = sc;
    }

    else throw Exception("Unknown action: " + cmdName);
  }
  
  // Write sequences
  ApplicationTools::displayBooleanResult("Final sequences are aligned", aligned);
  if (aligned)
  {
    SequenceApplicationTools::writeAlignmentFile(*dynamic_cast<SiteContainer*>(sequences), bppseqman.getParams(), "", true, 1);
  }
  else
  {
    SequenceApplicationTools::writeSequenceFile(*sequences, bppseqman.getParams(), "", true, 1);
  }

  delete alphabet;
  delete sequences;

  bppseqman.done();

  } catch(exception & e) {
    cout << e.what() << endl;
    return 1;
  }

  return 0;
}
Example #12
0
int main(int args, char** argv)
{
  cout << "******************************************************************" << endl;
  cout << "*     Bio++ Computation of site likelihoods inside mixed models  *" << endl;
  cout << "*                        Version 2.2.0.                          *" << endl;
  cout << "* Author: L. Guéguen                       Last Modif.: 25/09/14 *" << endl;
  cout << "******************************************************************" << endl;
  cout << endl;

  if (args == 1)
  {
    help();
    return 0;
  }

  try
  {
    BppApplication bppmixedlikelihoods(args, argv, "BppMixedLikelihoods");
    bppmixedlikelihoods.startTimer();

    Alphabet* alphabet = SequenceApplicationTools::getAlphabet(bppmixedlikelihoods.getParams(), "", false);
    auto_ptr<GeneticCode> gCode;
    CodonAlphabet* codonAlphabet = dynamic_cast<CodonAlphabet*>(alphabet);
    if (codonAlphabet) {
      string codeDesc = ApplicationTools::getStringParameter("genetic_code", bppmixedlikelihoods.getParams(), "Standard", "", true, true);
      ApplicationTools::displayResult("Genetic Code", codeDesc);
      
      gCode.reset(SequenceApplicationTools::getGeneticCode(codonAlphabet->getNucleicAlphabet(), codeDesc));
    }

    // get the data

    VectorSiteContainer* allSites = SequenceApplicationTools::getSiteContainer(alphabet, bppmixedlikelihoods.getParams());

    VectorSiteContainer* sites = SequenceApplicationTools::getSitesToAnalyse(*allSites, bppmixedlikelihoods.getParams(), "", true, false);
    delete allSites;

    ApplicationTools::displayResult("Number of sequences", TextTools::toString(sites->getNumberOfSequences()));
    ApplicationTools::displayResult("Number of sites", TextTools::toString(sites->getNumberOfSites()));

    // Get the tree
    Tree* tree = PhylogeneticsApplicationTools::getTree(bppmixedlikelihoods.getParams());
    ApplicationTools::displayResult("Number of leaves", TextTools::toString(tree->getNumberOfLeaves()));


    AbstractDiscreteRatesAcrossSitesTreeLikelihood* tl;
    string nhOpt = ApplicationTools::getStringParameter("nonhomogeneous", bppmixedlikelihoods.getParams(), "no", "", true, false);
    ApplicationTools::displayResult("Heterogeneous model", nhOpt);

    MixedSubstitutionModel* model       = 0;
    MixedSubstitutionModelSet* modelSet = 0;
    DiscreteDistribution* rDist         = 0;

    if (nhOpt == "no")
    {
      model = dynamic_cast<MixedSubstitutionModel*>(PhylogeneticsApplicationTools::getSubstitutionModel(alphabet, gCode.get(), sites, bppmixedlikelihoods.getParams()));
      if (model == 0)
      {
        cout << "Model is not a Mixed model" << endl;
        exit(0);
      }

      SiteContainerTools::changeGapsToUnknownCharacters(*sites);
      if (model->getNumberOfStates() > model->getAlphabet()->getSize())
      {
        // Markov-modulated Markov model!
        rDist = new ConstantRateDistribution();
      }
      else
      {
        rDist = PhylogeneticsApplicationTools::getRateDistribution(bppmixedlikelihoods.getParams());
      }
      tl = new RHomogeneousMixedTreeLikelihood(*tree, *sites, model, rDist, true);
    }
    else if (nhOpt == "one_per_branch")
    {
      model = dynamic_cast<MixedSubstitutionModel*>(PhylogeneticsApplicationTools::getSubstitutionModel(alphabet, gCode.get(), sites, bppmixedlikelihoods.getParams()));
      if (model == 0)
      {
        cout << "Model is not a Mixed model" << endl;
        exit(0);
      }

      SiteContainerTools::changeGapsToUnknownCharacters(*sites);
      if (model->getNumberOfStates() > model->getAlphabet()->getSize())
      {
        // Markov-modulated Markov model!
        rDist = new ConstantRateDistribution();
      }
      else
      {
        rDist = PhylogeneticsApplicationTools::getRateDistribution(bppmixedlikelihoods.getParams());
      }
      vector<double> rateFreqs;
      if (model->getNumberOfStates() != alphabet->getSize())
      {
        // Markov-Modulated Markov Model...
        unsigned int n = (unsigned int)(model->getNumberOfStates() / alphabet->getSize());
        rateFreqs = vector<double>(n, 1. / (double)n); // Equal rates assumed for now, may be changed later (actually, in the most general case,
        // we should assume a rate distribution for the root also!!!
      }
      
      std::map<std::string, std::string> aliasFreqNames;

      FrequenciesSet* rootFreqs = PhylogeneticsApplicationTools::getRootFrequenciesSet(alphabet, gCode.get(), sites, bppmixedlikelihoods.getParams(), aliasFreqNames, rateFreqs);
      vector<string> globalParameters = ApplicationTools::getVectorParameter<string>("nonhomogeneous_one_per_branch.shared_parameters", bppmixedlikelihoods.getParams(), ',', "");
      modelSet = dynamic_cast<MixedSubstitutionModelSet*>(SubstitutionModelSetTools::createNonHomogeneousModelSet(model, rootFreqs, tree, aliasFreqNames, globalParameters));
      model = 0;
      tl = new RNonHomogeneousMixedTreeLikelihood(*tree, *sites, modelSet, rDist, true);
    }
    else if (nhOpt == "general")
    {
      modelSet = dynamic_cast<MixedSubstitutionModelSet*>(PhylogeneticsApplicationTools::getSubstitutionModelSet(alphabet, gCode.get(), sites, bppmixedlikelihoods.getParams()));
      if (modelSet == 0)
      {
        cout << "Missing a Mixed model" << endl;
        exit(0);
      }

      SiteContainerTools::changeGapsToUnknownCharacters(*sites);
      if (modelSet->getNumberOfStates() > modelSet->getAlphabet()->getSize())
      {
        // Markov-modulated Markov model!
        rDist = new ConstantDistribution(1.);
      }
      else
      {
        rDist = PhylogeneticsApplicationTools::getRateDistribution(bppmixedlikelihoods.getParams());
      }
      tl = new RNonHomogeneousMixedTreeLikelihood(*tree, *sites, modelSet, rDist, true);
    }
    else
      throw Exception("Unknown option for nonhomogeneous: " + nhOpt);

    tl->initialize();

    double logL = tl->getValue();
    if (isinf(logL))
    {
      // This may be due to null branch lengths, leading to null likelihood!
      ApplicationTools::displayWarning("!!! Warning!!! Likelihood is zero.");
      ApplicationTools::displayWarning("!!! This may be due to branch length == 0.");
      ApplicationTools::displayWarning("!!! All null branch lengths will be set to 0.000001.");
      ParameterList pl = tl->getBranchLengthsParameters();
      for (unsigned int i = 0; i < pl.size(); i++)
      {
        if (pl[i].getValue() < 0.000001)
          pl[i].setValue(0.000001);
      }
      tl->matchParametersValues(pl);
      logL = tl->getValue();
    }
    if (isinf(logL))
    {
      ApplicationTools::displayError("!!! Unexpected likelihood == 0.");
      ApplicationTools::displayError("!!! Looking at each site:");
      for (unsigned int i = 0; i < sites->getNumberOfSites(); i++)
      {
        (*ApplicationTools::error << "Site " << sites->getSite(i).getPosition() << "\tlog likelihood = " << tl->getLogLikelihoodForASite(i)).endLine();
      }
      ApplicationTools::displayError("!!! 0 values (inf in log) may be due to computer overflow, particularily if datasets are big (>~500 sequences).");
      exit(-1);
    }


    // Write parameters to screen:
    ApplicationTools::displayResult("Log likelihood", TextTools::toString(tl->getValue(), 15));
    ParameterList parameters = tl->getSubstitutionModelParameters();
    for (unsigned int i = 0; i < parameters.size(); i++)
    {
      ApplicationTools::displayResult(parameters[i].getName(), TextTools::toString(parameters[i].getValue()));
    }
    parameters = tl->getRateDistributionParameters();
    for (unsigned int i = 0; i < parameters.size(); i++)
    {
      ApplicationTools::displayResult(parameters[i].getName(), TextTools::toString(parameters[i].getValue()));
    }


    // /////////////////////////////////////////////
    // Getting likelihoods per submodel

    string outputFile;
    outputFile = ApplicationTools::getAFilePath("output.likelihoods.file", bppmixedlikelihoods.getParams(), true, false);
    ApplicationTools::displayResult("Output file for likelihoods", outputFile);
    ofstream out(outputFile.c_str(), ios::out);

    size_t nSites = sites->getNumberOfSites();

    size_t nummodel = ApplicationTools::getParameter<size_t>("likelihoods.model_number", bppmixedlikelihoods.getParams(), 1, "", true, true);

    string parname = ApplicationTools::getStringParameter("likelihoods.parameter_name", bppmixedlikelihoods.getParams(), "", "", true, false);

    if (modelSet && ((nummodel <= 0) || (nummodel > modelSet->getNumberOfModels())))
    {
      ApplicationTools::displayError("Bad number of model " + TextTools::toString(nummodel) + ".");
      exit(-1);
    }

    MixedSubstitutionModel* p0 = dynamic_cast<MixedSubstitutionModel*>(model ? model : modelSet->getModel(nummodel - 1));

    if (!p0)
    {
      ApplicationTools::displayError("Model " + TextTools::toString(nummodel) + " is not a Mixed Model.");
      exit(-1);
    }

    const AbstractBiblioMixedSubstitutionModel* ptmp = dynamic_cast<const AbstractBiblioMixedSubstitutionModel*>(p0);
    if (ptmp) {
      p0 = ptmp->getMixedModel().clone();

      if (nhOpt == "no")
        model = p0;
      else {
        modelSet->replaceModel(nummodel-1, p0);
        modelSet->isFullySetUpFor(*tree);
      }
    }
    
    //////////////////////////////////////////////////
    // Case of a MixtureOfSubstitutionModels

    MixtureOfSubstitutionModels* pMSM = dynamic_cast<MixtureOfSubstitutionModels*>(p0);

    if (pMSM)
    {
      vector<string> colNames;
      colNames.push_back("Sites");

      size_t nummod = pMSM->getNumberOfModels();
      for (unsigned int i = 0; i < nummod; i++)
      {
        colNames.push_back(pMSM->getNModel(i)->getName());
      }

      DataTable* rates = new DataTable(nSites, colNames.size());
      rates->setColumnNames(colNames);

      for (unsigned int i = 0; i < nSites; i++)
      {
        const Site* currentSite = &sites->getSite(i);
        int currentSitePosition = currentSite->getPosition();
        (*rates)(i, "Sites") = string("[" + TextTools::toString(currentSitePosition) + "]");
      }

      Vdouble vprob = pMSM->getProbabilities();
      for (unsigned int i = 0; i < nummod; i++)
      {
        string modname = pMSM->getNModel(i)->getName();

        for (unsigned int j = 0; j < nummod; j++)
        {
          pMSM->setNProbability(j, (j == i) ? 1 : 0);
        }

        if (tl)
          delete tl;

        if (nhOpt == "no")
          tl = new RHomogeneousMixedTreeLikelihood(*tree, *sites, model, rDist, true, false, true);
        else
          tl = new RNonHomogeneousMixedTreeLikelihood(*tree, *sites, modelSet, rDist, false, true);

        tl->initialize();
        logL = tl->getValue();
        Vdouble Vd = tl->getLogLikelihoodForEachSite();
        for (unsigned int j = 0; j < nSites; j++)
        {
          (*rates)(j, modname) = TextTools::toString(Vd[j]);
        }

        ApplicationTools::displayMessage("\n");
        ApplicationTools::displayMessage("Model " + modname + ":");
        ApplicationTools::displayResult("Log likelihood", TextTools::toString(tl->getValue(), 15));
        ApplicationTools::displayResult("Probability", TextTools::toString(vprob[i], 15));
      }

      DataTable::write(*rates, out, "\t");
    }

    //////////////////////////////////////////////////
    // Case of a MixtureOfASubstitutionModel

    else
    {
      MixtureOfASubstitutionModel* pMSM2 = dynamic_cast<MixtureOfASubstitutionModel*>(p0);
      if (pMSM2 != NULL)
      {
        size_t nummod = pMSM2->getNumberOfModels();
        if (parname == "")
        {
          ParameterList pl=pMSM2->getParameters();

          for (size_t i2 = 0; i2 < pl.size(); i2++)
          {
            string pl2n = pl[i2].getName();

            if (dynamic_cast<const ConstantDistribution*>(pMSM2->getDistribution(pl2n))==NULL)
            {
              parname=pl2n;

              while (parname.size()>0 && pMSM2->getDistribution(parname)==NULL)
                parname=pl2n.substr(0,pl2n.rfind("_"));

              if (parname.size()>0){
                ApplicationTools::displayResult("likelihoods.parameter_name", parname);
                break;
              }
            }
          }
        }

        if (parname == "")
        {
          ApplicationTools::displayError("Argument likelihoods.parameter_name is required.");
          exit(-1);
        }

        vector< Vint > vvnmod;
        size_t i2 = 0;
        while (i2 < nummod)
        {
          string par2 = parname + "_" + TextTools::toString(i2 + 1);
          Vint vnmod = pMSM2->getSubmodelNumbers(par2);
          if (vnmod.size() == 0)
            break;
          vvnmod.push_back(vnmod);
          i2++;
        }

        size_t nbcl = vvnmod.size();
        if (nbcl==0)
          throw Exception("Parameter " + parname + " is not mixed.");
        
        Vdouble vprob = pMSM2->getProbabilities();

        vector<vector<double> > vvprob;
        vector<double> vsprob;
        
        for (size_t i = 0; i < nbcl; i++)
        {
          vector<double> vprob2;
          for (size_t j = 0; j < vvnmod[i].size(); j++)
          {
            vprob2.push_back(vprob[static_cast<size_t>(vvnmod[i][j])]);
          }

          vvprob.push_back(vprob2);
          vsprob.push_back(VectorTools::sum(vvprob[i]));
        }

        vector<string> colNames;
        colNames.push_back("Sites");

        Vdouble dval;
        for (unsigned int i = 0; i < nbcl; i++)
        {
          SubstitutionModel* pSM = pMSM2->getNModel(static_cast<size_t>(vvnmod[i][0]));
          double valPar = pSM->getParameterValue(pSM->getParameterNameWithoutNamespace(parname));
          dval.push_back(valPar);
          colNames.push_back("Ll_" + parname + "=" + TextTools::toString(valPar));
        }
        for (unsigned int i = 0; i < nbcl; i++)
          colNames.push_back("Pr_" + parname + "=" + TextTools::toString(dval[i]));

        colNames.push_back("mean");

        DataTable* rates = new DataTable(nSites, colNames.size());
        rates->setColumnNames(colNames);

        for (unsigned int i = 0; i < nSites; i++)
        {
          const Site* currentSite = &sites->getSite(i);
          int currentSitePosition = currentSite->getPosition();
          (*rates)(i,"Sites")=TextTools::toString(currentSitePosition);
        }

        VVdouble vvd;

          
        vector<double> vRates = pMSM2->getVRates();

        for (size_t i = 0; i < nbcl; ++i)
        {
          string par2 = parname + "_" + TextTools::toString(i + 1);
          
          for (unsigned int j = 0; j < nummod; ++j)
            pMSM2->setNProbability(j, 0);

          for (size_t j = 0; j < vvprob[i].size(); ++j)
            pMSM2->setNProbability(static_cast<size_t>(vvnmod[i][j]), vvprob[i][j] / vsprob[i]);

          if (tl)
            delete tl;

          if (nhOpt == "no")
            tl = new RHomogeneousMixedTreeLikelihood(*tree, *sites, model, rDist, true, false, true);
          else
            tl = new RNonHomogeneousMixedTreeLikelihood(*tree, *sites, modelSet, rDist, false, true);

          tl->initialize();
          logL = tl->getValue();
          Vdouble vd = tl->getLogLikelihoodForEachSite();

          for (unsigned int j = 0; j < nSites; j++)
            (*rates)(j, i + 1) = TextTools::toString(vd[j]);

          vvd.push_back(vd);

          ApplicationTools::displayMessage("\n");
          ApplicationTools::displayMessage("Parameter " + par2 + "=" + TextTools::toString(dval[i]) + " with rate=" + TextTools::toString(vRates[i]));

          ApplicationTools::displayResult("Log likelihood", TextTools::toString(tl->getValue(), 15));
          ApplicationTools::displayResult("Probability", TextTools::toString(vsprob[i], 15));
        }

        for (unsigned int j = 0; j < nSites; j++)
        {
          Vdouble vd;
          for (unsigned int i = 0; i < nbcl; i++)
            vd.push_back(std::log(vsprob[i])+vvd[i][j]);
          
          VectorTools::logNorm(vd);
          for (unsigned int i = 0; i < nbcl; i++)
            (*rates)(j,nbcl + i + 1) = TextTools::toString(std::exp(vd[i]));
          (*rates)(j, 2 * nbcl + 1) = TextTools::toString(VectorTools::sumExp(vd, dval));
        }

        DataTable::write(*rates, out, "\t");
      }
    }

    delete alphabet;
    delete sites;
    if (model)
      delete model;
    if (modelSet)
      delete modelSet;
    delete rDist;
    delete tl;
    delete tree;
    ApplicationTools::displayMessage("\n");
    bppmixedlikelihoods.done();
  }

  catch (exception& e)
  {
    cout << e.what() << endl;
    return 1;
  }

  return 0;
}
Example #13
0
VectorSiteContainer * SequenceApplicationTools::getSitesToAnalyse(
  const SiteContainer & allSites,
  map<string, string> & params,
  string suffix,
  bool suffixIsOptional,
  bool gapAsUnknown,
  bool verbose)
{
  // Fully resolved sites, i.e. without jokers and gaps:
  VectorSiteContainer * sitesToAnalyse;
  
  string option = ApplicationTools::getStringParameter("sequence.sites_to_use", params, "complete", suffix, suffixIsOptional);
  if(verbose) ApplicationTools::displayResult("Sites to use", option);
  sitesToAnalyse = new VectorSiteContainer(allSites);
  if(option == "all")
  {
    string maxGapOption = ApplicationTools::getStringParameter("sequence.max_gap_allowed", params, "100%", suffix, suffixIsOptional);
    if(maxGapOption[maxGapOption.size()-1] == '%')
    {
      double gapFreq = TextTools::toDouble(maxGapOption.substr(0,maxGapOption.size()-1)) / 100.;
      for(unsigned int i = sitesToAnalyse->getNumberOfSites(); i > 0; i--)
      {
        map<int, double> freq;
        SiteTools::getFrequencies(*sitesToAnalyse->getSite(i-1), freq);
        if(freq[-1] > gapFreq) sitesToAnalyse->deleteSite(i-1);
      }
    }
    else
    {
      unsigned int gapNum=TextTools::to<unsigned int>(maxGapOption);
      for(unsigned int i = sitesToAnalyse->getNumberOfSites(); i > 0; i--)
      {
        map<int, unsigned int> counts;
        SiteTools::getCounts(*sitesToAnalyse->getSite(i-1), counts);
        if(counts[-1] > gapNum) sitesToAnalyse->deleteSite(i-1);
      }
    }
    if(gapAsUnknown)
    {
      SiteContainerTools::changeGapsToUnknownCharacters(*sitesToAnalyse);
    }
  }
  else if(option == "complete")
  {
    sitesToAnalyse = dynamic_cast<VectorSiteContainer *>(SiteContainerTools::getCompleteSites(allSites));
    int nbSites = sitesToAnalyse->getNumberOfSites();
    if(verbose) ApplicationTools::displayResult("Complete sites", TextTools::toString(nbSites));
  }
  else if(option == "nogap")
  {
    sitesToAnalyse = dynamic_cast<VectorSiteContainer *>(SiteContainerTools::getSitesWithoutGaps(allSites));
    int nbSites = sitesToAnalyse->getNumberOfSites();
    if(verbose) ApplicationTools::displayResult("Sites without gap", TextTools::toString(nbSites));
  }
  else
  {
    ApplicationTools::displayError("Option '" + option + "' unknown in parameter 'sequence.sitestouse'.");
    exit(-1);
  }

  return sitesToAnalyse;
}
Example #14
0
VectorSiteContainer * SequenceApplicationTools::getSiteContainer(
  const Alphabet * alpha,
  map<string, string> & params,
  const string & suffix,
  bool suffixIsOptional,
  bool verbose)
{
  string sequenceFilePath = ApplicationTools::getAFilePath("sequence.file",params, true, true, suffix, suffixIsOptional);
  string sequenceFormat = ApplicationTools::getStringParameter("sequence.format", params, "Fasta", suffix, suffixIsOptional);
  if(verbose) ApplicationTools::displayResult("Sequence format " + suffix, sequenceFormat);
  ISequence * iSeq = NULL;
  if(sequenceFormat == "Mase")
  {
    iSeq = new Mase();
  }
  else if(sequenceFormat == "Phylip")
  {
    bool sequential = true, extended = true;
    string split = "  ";
    if(params.find("sequence.format_phylip.order") != params.end())
    {
           if(params["sequence.format_phylip.order"] == "sequential" ) sequential = true;
      else if(params["sequence.format_phylip.order"] == "interleaved") sequential = false;
      else ApplicationTools::displayWarning("Argument '" +
             params["sequence.format_phylip.order"] +
             "' for parameter 'sequence.format_phylip.order' is unknown. " +
             "Default used instead: sequential.");
    }
    else ApplicationTools::displayWarning("Argument 'sequence.format_phylip.order' not found. Default used instead: sequential.");
    if(params.find("sequence.format_phylip.ext") != params.end())
    {
      if(params["sequence.format_phylip.ext"] == "extended")
      {
        extended = true;
        split = ApplicationTools::getStringParameter("sequence.format_phylip.extended.split", params, "spaces", suffix, suffixIsOptional);
        if(split == "spaces") split = "  ";
        else if(split == "tab") split = "\t";
        else throw Exception("Unknown option for sequence.format_phylip.extended.split: " + split);
      }
      else if(params["sequence.format_phylip.ext"] == "classic" ) extended = false;
      else ApplicationTools::displayWarning("Argument '" +
             params["sequence.format_phylip.ext"] +
             "' for parameter 'sequence.format_phylip.ext' is unknown. " +
             "Default used instead: extended.");
    }
    else ApplicationTools::displayWarning("Argument 'sequence.format_phylip.ext' not found. Default used instead: extended.");
    iSeq = new Phylip(extended, sequential, 100, true, split);
  }
  else if(sequenceFormat == "Fasta") iSeq = new Fasta();
  else if(sequenceFormat == "Clustal") iSeq = new Clustal();
  else
  {
    ApplicationTools::displayError("Unknown sequence format.");
    exit(-1);
  }
  const SequenceContainer * seqCont = iSeq->read(sequenceFilePath, alpha);
  VectorSiteContainer * sites = new VectorSiteContainer(* dynamic_cast<const OrderedSequenceContainer *>(seqCont));
  delete seqCont;
  delete iSeq;
  
  if(verbose) ApplicationTools::displayResult("Sequence file " + suffix, sequenceFilePath);

  // Look for site selection:
  if(sequenceFormat == "Mase")
  {
    //getting site set:
    string siteSet = ApplicationTools::getStringParameter("sequence.format_mase.site_selection", params, "none", suffix, suffixIsOptional, false);
    if(siteSet != "none")
    {
      VectorSiteContainer * selectedSites;
      try
      {
        selectedSites = dynamic_cast<VectorSiteContainer *>(MaseTools::getSelectedSites(* sites, siteSet));
        if(verbose) ApplicationTools::displayResult("Set found", TextTools::toString(siteSet) + " sites.");
      }
      catch(IOException ioe)
      {
        ApplicationTools::displayError("Site Set '" + siteSet + "' not found.");
        exit(-1);
      }
      if(selectedSites->getNumberOfSites() == 0)
      {
        ApplicationTools::displayError("Site Set '" + siteSet + "' is empty.");
        exit(-1);
      }
      delete sites;
      sites = selectedSites;
    }
  }
  return sites;
}
SiteContainer* SiteContainerTools::resolveDottedAlignment(
  const SiteContainer& dottedAln,
  const Alphabet* resolvedAlphabet) throw (AlphabetException, Exception)
{
  if (!AlphabetTools::isDefaultAlphabet(dottedAln.getAlphabet()))
    throw AlphabetException("SiteContainerTools::resolveDottedAlignment. Alignment alphabet should of class 'DefaultAlphabet'.", dottedAln.getAlphabet());

  // First we look for the reference sequence:
  size_t n = dottedAln.getNumberOfSequences();
  if (n == 0)
    throw Exception("SiteContainerTools::resolveDottedAlignment. Input alignment contains no sequence.");

  const Sequence* refSeq = 0;
  for (size_t i = 0; i < n; ++i) // Test each sequence
  {
    const Sequence* seq = &dottedAln.getSequence(i);
    bool isRef = true;
    for (unsigned int j = 0; isRef && j < seq->size(); ++j) // For each site in the sequence
    {
      if (seq->getChar(j) == ".")
        isRef = false;
    }
    if (isRef) // We found the reference sequence!
    {
      refSeq = new BasicSequence(*seq);
    }
  }
  if (!refSeq)
    throw Exception("SiteContainerTools::resolveDottedAlignment. No reference sequence was found in the input alignment.");

  // Now we build a new VectorSiteContainer:
  VectorSiteContainer* sites = new VectorSiteContainer(n, resolvedAlphabet);

  // We add each site one by one:
  size_t m = dottedAln.getNumberOfSites();
  string state;
  for (unsigned int i = 0; i < m; ++i)
  {
    string resolved = refSeq->getChar(i);
    const Site* site = &dottedAln.getSite(i);
    Site resolvedSite(resolvedAlphabet, site->getPosition());
    for (unsigned int j = 0; j < n; j++)
    {
      state = site->getChar(j);
      if (state == ".")
      {
        state = resolved;
      }
      resolvedSite.addElement(state);
    }
    // Add the new site:
    sites->addSite(resolvedSite);
  }

  // Seq sequence names:
  sites->setSequencesNames(dottedAln.getSequencesNames());

  // Delete the copied sequence:
  delete refSeq;

  // Return result:
  return sites;
}
Example #16
0
int main(int args, char ** argv)
{
  cout << "******************************************************************" << endl;
  cout << "*              Bio++ Distance Methods, version 2.2.0             *" << endl;
  cout << "* Author: J. Dutheil                        Created     05/05/07 *" << endl;
  cout << "*                                           Last Modif. 04/02/15 *" << endl;
  cout << "******************************************************************" << endl;
  cout << endl;

  if(args == 1)
  {
    help();
    return 0;
  }
  
  try {

  BppApplication bppdist(args, argv, "BppDist");
  bppdist.startTimer();

  Alphabet* alphabet = SequenceApplicationTools::getAlphabet(bppdist.getParams(), "", false);
  auto_ptr<GeneticCode> gCode;
  CodonAlphabet* codonAlphabet = dynamic_cast<CodonAlphabet*>(alphabet);
  if (codonAlphabet) {
    string codeDesc = ApplicationTools::getStringParameter("genetic_code", bppdist.getParams(), "Standard", "", true, true);
    ApplicationTools::displayResult("Genetic Code", codeDesc);

    gCode.reset(SequenceApplicationTools::getGeneticCode(codonAlphabet->getNucleicAlphabet(), codeDesc));
  }

  VectorSiteContainer* allSites = SequenceApplicationTools::getSiteContainer(alphabet, bppdist.getParams());
  
  VectorSiteContainer* sites = SequenceApplicationTools::getSitesToAnalyse(* allSites, bppdist.getParams());
  delete allSites;

  ApplicationTools::displayResult("Number of sequences", TextTools::toString(sites->getNumberOfSequences()));
  ApplicationTools::displayResult("Number of sites", TextTools::toString(sites->getNumberOfSites()));
  
  SubstitutionModel* model = PhylogeneticsApplicationTools::getSubstitutionModel(alphabet, gCode.get(), sites, bppdist.getParams());
  
	DiscreteDistribution* rDist = 0;
  if (model->getNumberOfStates() > model->getAlphabet()->getSize())
  {
    //Markov-modulated Markov model!
    rDist = new ConstantRateDistribution();
  }
  else
  {
	  rDist = PhylogeneticsApplicationTools::getRateDistribution(bppdist.getParams());
  }
   
  DistanceEstimation distEstimation(model, rDist, sites, 1, false);
 
  string method = ApplicationTools::getStringParameter("method", bppdist.getParams(), "nj");
  ApplicationTools::displayResult("Tree reconstruction method", method);
  TreeTemplate<Node>* tree;
  AgglomerativeDistanceMethod* distMethod = 0;
  if(method == "wpgma")
  {
    PGMA* wpgma = new PGMA(true);
    distMethod = wpgma;
  }
  else if(method == "upgma")
  {
    PGMA* upgma = new PGMA(false);
    distMethod = upgma;
  }
  else if(method == "nj")
  {
    NeighborJoining* nj = new NeighborJoining();
    nj->outputPositiveLengths(true);
    distMethod = nj;
  }
  else if(method == "bionj")
  {
    BioNJ* bionj = new BioNJ();
    bionj->outputPositiveLengths(true);
    distMethod = bionj;
  }
  else throw Exception("Unknown tree reconstruction method.");
  
  string type = ApplicationTools::getStringParameter("optimization.method", bppdist.getParams(), "init");
  ApplicationTools::displayResult("Model parameters estimation method", type);
  if (type == "init") type = OptimizationTools::DISTANCEMETHOD_INIT;
  else if (type == "pairwise") type = OptimizationTools::DISTANCEMETHOD_PAIRWISE;
  else if (type == "iterations") type = OptimizationTools::DISTANCEMETHOD_ITERATIONS;
  else throw Exception("Unknown parameter estimation procedure '" + type + "'.");
  
	unsigned int optVerbose = ApplicationTools::getParameter<unsigned int>("optimization.verbose", bppdist.getParams(), 2);
	
	string mhPath = ApplicationTools::getAFilePath("optimization.message_handler", bppdist.getParams(), false, false);
	OutputStream* messenger = 
		(mhPath == "none") ? 0 :
			(mhPath == "std") ? ApplicationTools::message :
				new StlOutputStream(new ofstream(mhPath.c_str(), ios::out));
	ApplicationTools::displayResult("Message handler", mhPath);

	string prPath = ApplicationTools::getAFilePath("optimization.profiler", bppdist.getParams(), false, false);
	OutputStream* profiler = 
		(prPath == "none") ? 0 :
			(prPath == "std") ? ApplicationTools::message :
				new StlOutputStream(new ofstream(prPath.c_str(), ios::out));
	if(profiler) profiler->setPrecision(20);
	ApplicationTools::displayResult("Profiler", prPath);

	// Should I ignore some parameters?
  ParameterList allParameters = model->getParameters();
  allParameters.addParameters(rDist->getParameters());
	ParameterList parametersToIgnore;
  string paramListDesc = ApplicationTools::getStringParameter("optimization.ignore_parameter", bppdist.getParams(), "", "", true, false);
	bool ignoreBrLen = false;
  StringTokenizer st(paramListDesc, ",");
	while (st.hasMoreToken())
  {
		try
    {
      string param = st.nextToken();
      if (param == "BrLen")
        ignoreBrLen = true;
      else
      {
        if (allParameters.hasParameter(param))
        {
          Parameter* p = &allParameters.getParameter(param);
          parametersToIgnore.addParameter(*p);
        }
        else ApplicationTools::displayWarning("Parameter '" + param + "' not found."); 
      }
		} 
    catch (ParameterNotFoundException& pnfe)
    {
			ApplicationTools::displayError("Parameter '" + pnfe.getParameter() + "' not found, and so can't be ignored!");
		}
	}
	
	unsigned int nbEvalMax = ApplicationTools::getParameter<unsigned int>("optimization.max_number_f_eval", bppdist.getParams(), 1000000);
	ApplicationTools::displayResult("Max # ML evaluations", TextTools::toString(nbEvalMax));
	
	double tolerance = ApplicationTools::getDoubleParameter("optimization.tolerance", bppdist.getParams(), .000001);
	ApplicationTools::displayResult("Tolerance", TextTools::toString(tolerance));
	
  //Here it is:
  ofstream warn("warnings", ios::out);
  ApplicationTools::warning = new StlOutputStreamWrapper(&warn);
  tree = OptimizationTools::buildDistanceTree(distEstimation, *distMethod, parametersToIgnore, !ignoreBrLen, type, tolerance, nbEvalMax, profiler, messenger, optVerbose);
  warn.close();
  delete ApplicationTools::warning;
  ApplicationTools::warning = ApplicationTools::message;

  string matrixPath = ApplicationTools::getAFilePath("output.matrix.file", bppdist.getParams(), false, false, "", false);
  if (matrixPath != "none")
  {
    ApplicationTools::displayResult("Output matrix file", matrixPath);
    string matrixFormat = ApplicationTools::getAFilePath("output.matrix.format", bppdist.getParams(), false, false, "", false);
    string format = "";
    bool extended = false;
    std::map<std::string, std::string> unparsedArguments_;
    KeyvalTools::parseProcedure(matrixFormat, format, unparsedArguments_);
    if (unparsedArguments_.find("type") != unparsedArguments_.end())
    {
      if (unparsedArguments_["type"] == "extended")
      {
        extended = true;
      }     
      else if (unparsedArguments_["type"] == "classic")
        extended = false;
      else
        ApplicationTools::displayWarning("Argument '" +
                                         unparsedArguments_["type"] + "' for parameter 'Phylip#type' is unknown. " +
                                         "Default used instead: not extended.");
    }    
    else
      ApplicationTools::displayWarning("Argument 'Phylip#type' not found. Default used instead: not extended.");
    

    ODistanceMatrix* odm = IODistanceMatrixFactory().createWriter(IODistanceMatrixFactory::PHYLIP_FORMAT, extended);
    odm->write(*distEstimation.getMatrix(), matrixPath, true);
    delete odm;
  }
  PhylogeneticsApplicationTools::writeTree(*tree, bppdist.getParams());
  
  //Output some parameters:
  if (type == OptimizationTools::DISTANCEMETHOD_ITERATIONS)
  {
    // Write parameters to screen:
    ParameterList parameters = model->getParameters();
    for (unsigned int i = 0; i < parameters.size(); i++)
    {
		  ApplicationTools::displayResult(parameters[i].getName(), TextTools::toString(parameters[i].getValue()));
    }
    parameters = rDist->getParameters();
    for (unsigned int i = 0; i < parameters.size(); i++)
    {
		  ApplicationTools::displayResult(parameters[i].getName(), TextTools::toString(parameters[i].getValue()));
    }
    // Write parameters to file:
	  string parametersFile = ApplicationTools::getAFilePath("output.estimates", bppdist.getParams(), false, false);
    if (parametersFile != "none")
    {
		  ofstream out(parametersFile.c_str(), ios::out);
      parameters = model->getParameters();
      for (unsigned int i = 0; i < parameters.size(); i++)
      {
        out << parameters[i].getName() << " = " << parameters[i].getValue() << endl;
      }
      parameters = rDist->getParameters();
      for (unsigned int i = 0; i < parameters.size(); i++)
      {
        out << parameters[i].getName() << " = " << parameters[i].getValue() << endl;
      }
      out.close();
    }
  }
 
  //Bootstrap:
  unsigned int nbBS = ApplicationTools::getParameter<unsigned int>("bootstrap.number", bppdist.getParams(), 0);
  if(nbBS > 0)
  {
    ApplicationTools::displayResult("Number of bootstrap samples", TextTools::toString(nbBS));
    bool approx = ApplicationTools::getBooleanParameter("bootstrap.approximate", bppdist.getParams(), true);
    ApplicationTools::displayResult("Use approximate bootstrap", TextTools::toString(approx ? "yes" : "no"));
    if(approx)
    {
      type = OptimizationTools::DISTANCEMETHOD_INIT;
      parametersToIgnore = allParameters;
      ignoreBrLen = true;
    }
    bool bootstrapVerbose = ApplicationTools::getBooleanParameter("bootstrap.verbose", bppdist.getParams(), false, "", true, false);
 
    string bsTreesPath = ApplicationTools::getAFilePath("bootstrap.output.file", bppdist.getParams(), false, false);
    ofstream *out = NULL;
    if(bsTreesPath != "none")
    {
      ApplicationTools::displayResult("Bootstrap trees stored in file", bsTreesPath);
      out = new ofstream(bsTreesPath.c_str(), ios::out);
    }
    Newick newick;
    
    vector<Tree *> bsTrees(nbBS);
    ApplicationTools::displayTask("Bootstrapping", true);
    for(unsigned int i = 0; i < nbBS; i++)
    {
      ApplicationTools::displayGauge(i, nbBS-1, '=');
      VectorSiteContainer * sample = SiteContainerTools::bootstrapSites(*sites);
      if(approx) model->setFreqFromData(*sample);
      distEstimation.setData(sample);
      bsTrees[i] = OptimizationTools::buildDistanceTree(
          distEstimation,
          *distMethod,
          parametersToIgnore,
          ignoreBrLen,
          type,
          tolerance,
          nbEvalMax,
          NULL,
          NULL,
          (bootstrapVerbose ? 1 : 0)
        );
      if(out && i == 0) newick.write(*bsTrees[i], bsTreesPath, true);
      if(out && i >  0) newick.write(*bsTrees[i], bsTreesPath, false);
      delete sample;
    }
    if(out) out->close();
    if(out) delete out;
    ApplicationTools::displayTaskDone();
    ApplicationTools::displayTask("Compute bootstrap values");
    TreeTools::computeBootstrapValues(*tree, bsTrees);
    ApplicationTools::displayTaskDone();
    for(unsigned int i = 0; i < nbBS; i++) delete bsTrees[i];

    //Write resulting tree:
    PhylogeneticsApplicationTools::writeTree(*tree, bppdist.getParams());
  }
    
  delete alphabet;
  delete sites;
  delete distMethod;
  delete tree;

  bppdist.done();}
  
      
  catch(exception & e)
  {
    cout << e.what() << endl;
    return 1;
  }

  return 0;
}