Exemplo n.º 1
0
/**
 * @brief Copy constructor.
 *
 * The sequence names are copied directly; the site storage starts empty and
 * is filled by re-adding every site of @p vsc. The comment and sequence
 * slots are sized to the number of sequences of @p vsc, and each comment
 * slot then receives a freshly allocated copy of the source comments.
 *
 * @param vsc The container to copy.
 */
VectorSiteContainer::VectorSiteContainer(const VectorSiteContainer& vsc) :
  AbstractSequenceContainer(vsc),
  sites_(0),
  names_(vsc.names_),
  comments_(vsc.getNumberOfSequences()),
  sequences_(vsc.getNumberOfSequences())
{
  // Rebuild the alignment one site at a time. The 'false' flag is passed
  // because the positions of the source sites are assumed to be correct.
  const size_t siteCount = vsc.getNumberOfSites();
  for (size_t pos = 0; pos != siteCount; ++pos)
    addSite(vsc.getSite(pos), false);

  // Give every sequence its own copy of the source comments.
  const size_t seqCount = vsc.getNumberOfSequences();
  for (size_t seq = 0; seq != seqCount; ++seq)
    comments_[seq] = new Comments(vsc.getComments(seq));
}
Exemplo n.º 2
0
/**
 * @brief Read an alignment from a file and optionally restrict it to a
 * selection of sites.
 *
 * The file path and format are taken from the "input.sequence.file" and
 * "input.sequence.format" entries of @p params. When @p alpha is an RNY
 * alphabet, the file is read with the underlying letter alphabet and every
 * sequence is then converted with SequenceTools::RNYslice.
 *
 * Site selection:
 *  - for the "MASE file" format, the "siteSelection" argument of the format
 *    description names a site set stored in the file;
 *  - otherwise "input.site.selection" is either a list of 1-based indices
 *    (negative values count from the end), or a "Sample(n=..., replace=...)"
 *    procedure drawing sites at random.
 *
 * All intermediate containers are held by smart pointers so that nothing
 * leaks when an exception is thrown.
 *
 * @param alpha            Alphabet of the returned container.
 * @param params           Application parameters.
 * @param suffix           Suffix appended to parameter names.
 * @param suffixIsOptional Whether the suffix may be absent.
 * @param verbose          Print the file, format and selection used.
 * @param warn             Warning level forwarded to the parameter readers.
 * @return A newly allocated container, owned by the caller.
 * @throw Exception If the selection is empty or cannot be interpreted, or
 *        if a container does not have the expected dynamic type.
 */
VectorSiteContainer* SequenceApplicationTools::getSiteContainer(
  const Alphabet* alpha,
  map<string, string>& params,
  const string& suffix,
  bool suffixIsOptional,
  bool verbose,
  int warn)
{
  string sequenceFilePath = ApplicationTools::getAFilePath("input.sequence.file", params, true, true, suffix, suffixIsOptional, "none", warn);
  string sequenceFormat = ApplicationTools::getStringParameter("input.sequence.format", params, "Fasta()", suffix, suffixIsOptional, warn);
  BppOAlignmentReaderFormat bppoReader(warn);
  unique_ptr<IAlignment> iAln(bppoReader.read(sequenceFormat));
  map<string, string> args(bppoReader.getUnparsedArguments());
  if (verbose)
  {
    ApplicationTools::displayResult("Sequence file " + suffix, sequenceFilePath);
    ApplicationTools::displayResult("Sequence format " + suffix, iAln->getFormatName());
  }

  // RNY sequences are stored in the file with their letter alphabet.
  const bool isRNY = AlphabetTools::isRNYAlphabet(alpha);
  const Alphabet* alpha2 = alpha;
  if (isRNY)
    alpha2 = &dynamic_cast<const RNY*>(alpha)->getLetterAlphabet();

  unique_ptr<const SequenceContainer> seqCont(iAln->readAlignment(sequenceFilePath, alpha2));
  const OrderedSequenceContainer* ordered = dynamic_cast<const OrderedSequenceContainer*>(seqCont.get());
  if (!ordered)
    throw Exception("SequenceApplicationTools::getSiteContainer(). Input alignment is not an ordered sequence container.");

  unique_ptr<VectorSiteContainer> sites(new VectorSiteContainer(*ordered));
  seqCont.reset();

  if (isRNY)
  {
    const SequenceTools ST;
    unique_ptr<VectorSiteContainer> rnySites(new VectorSiteContainer(alpha));
    const size_t nbSeq = sites->getNumberOfSequences();
    for (size_t i = 0; i < nbSeq; ++i)
    {
      // The slice is a temporary that addSequence() only reads through a
      // reference, so it must be released here.
      unique_ptr<const Sequence> sliced(ST.RNYslice(sites->getSequence(i)));
      rnySites->addSequence(*sliced);
    }
    sites.reset(rnySites.release());
  }

  // Look for site selection:
  if (iAln->getFormatName() == "MASE file")
  {
    // getting site set:
    string siteSet = ApplicationTools::getStringParameter("siteSelection", args, "none", suffix, suffixIsOptional, warn + 1);
    if (siteSet != "none")
    {
      // Any IOException raised here simply propagates to the caller with its
      // original dynamic type.
      unique_ptr<VectorSiteContainer> selectedSites(
        dynamic_cast<VectorSiteContainer*>(MaseTools::getSelectedSites(*sites, siteSet)));
      if (!selectedSites)
        throw Exception("SequenceApplicationTools::getSiteContainer(). Unexpected container type for site set '" + siteSet + "'.");
      if (verbose)
        ApplicationTools::displayResult("Set found", TextTools::toString(siteSet) + " sites.");

      if (selectedSites->getNumberOfSites() == 0)
      {
        throw Exception("Site set '" + siteSet + "' is empty.");
      }
      sites.reset(selectedSites.release());
    }
  }
  else
  {
    // getting site set:
    size_t nbSites = sites->getNumberOfSites();

    string siteSet = ApplicationTools::getStringParameter("input.site.selection", params, "none", suffix, suffixIsOptional, warn + 1);

    if (siteSet != "none")
    {
      unique_ptr<VectorSiteContainer> selectedSites;
      vector<size_t> vSite;
      try
      {
        // First interpretation: an explicit list of 1-based site indices.
        vector<int> vSite1 = NumCalcApplicationTools::seqFromString(siteSet,",",":");
        for (size_t i = 0; i < vSite1.size(); ++i)
        {
          // Negative indices count backwards from the last site.
          int x = (vSite1[i] >= 0 ? vSite1[i] : static_cast<int>(nbSites) + vSite1[i] + 1);
          if (x <= static_cast<int>(nbSites))
          {
            if (x > 0)
              vSite.push_back(static_cast<size_t>(x - 1));
            else
              throw Exception("SequenceApplicationTools::getSiteContainer(). Incorrect null index: " + TextTools::toString(x));
          }
          else
            throw Exception("SequenceApplicationTools::getSiteContainer(). Too large index: " + TextTools::toString(x));
        }
        selectedSites.reset(dynamic_cast<VectorSiteContainer*>(SiteContainerTools::getSelectedSites(*sites, vSite)));
        if (selectedSites)
          selectedSites->reindexSites();
      }
      catch (Exception&)
      {
        // Second interpretation: a selection procedure such as Sample(...).
        string seln;
        map<string, string> selArgs;
        KeyvalTools::parseProcedure(siteSet, seln, selArgs);
        if (seln == "Sample")
        {
          size_t n = ApplicationTools::getParameter<size_t>("n", selArgs, nbSites, "", true, warn + 1);
          bool replace = ApplicationTools::getBooleanParameter("replace", selArgs, false, "", true, warn + 1);

          vSite.resize(n);
          vector<size_t> vPos;
          for (size_t p = 0; p < nbSites; ++p)
          {
            vPos.push_back(p);
          }

          RandomTools::getSample(vPos, vSite, replace);

          selectedSites.reset(dynamic_cast<VectorSiteContainer*>(SiteContainerTools::getSelectedSites(*sites, vSite)));
          if (replace && selectedSites)
            selectedSites->reindexSites();
        }
        else
          throw Exception("Unknown site selection description: " + siteSet);
      }

      if (verbose)
        ApplicationTools::displayResult("Selected sites", TextTools::toString(siteSet));

      // Never hand a null container back to the caller.
      if (!selectedSites)
        throw Exception("SequenceApplicationTools::getSiteContainer(). Unexpected container type for site selection '" + siteSet + "'.");

      if (selectedSites->getNumberOfSites() == 0)
      {
        throw Exception("Site set '" + siteSet + "' is empty.");
      }
      sites.reset(selectedSites.release());
    }
  }
  return sites.release();
}
Exemplo n.º 3
0
/**
 * @brief Entry point of the bppmixedlikelihoods program.
 *
 * Reads an alphabet, an alignment, a tree and a mixed substitution model
 * (homogeneous, one-per-branch or general non-homogeneous), computes the
 * overall log likelihood, and then writes per-site log likelihoods for each
 * component of the selected mixed model to the file given by
 * "output.likelihoods.file".
 *
 * @param args Number of command line arguments.
 * @param argv Command line arguments.
 * @return 0 on success (or when only the help is shown), 1 when an
 *         exception is caught. Several error paths call exit() directly.
 */
int main(int args, char** argv)
{
  cout << "******************************************************************" << endl;
  cout << "*     Bio++ Computation of site likelihoods inside mixed models  *" << endl;
  cout << "*                        Version 2.2.0.                          *" << endl;
  cout << "* Author: L. Guéguen                       Last Modif.: 25/09/14 *" << endl;
  cout << "******************************************************************" << endl;
  cout << endl;

  // No argument at all: print usage and leave.
  if (args == 1)
  {
    help();
    return 0;
  }

  try
  {
    BppApplication bppmixedlikelihoods(args, argv, "BppMixedLikelihoods");
    bppmixedlikelihoods.startTimer();

    Alphabet* alphabet = SequenceApplicationTools::getAlphabet(bppmixedlikelihoods.getParams(), "", false);
    auto_ptr<GeneticCode> gCode;
    CodonAlphabet* codonAlphabet = dynamic_cast<CodonAlphabet*>(alphabet);
    if (codonAlphabet) {
      // A genetic code is only needed for codon alphabets.
      string codeDesc = ApplicationTools::getStringParameter("genetic_code", bppmixedlikelihoods.getParams(), "Standard", "", true, true);
      ApplicationTools::displayResult("Genetic Code", codeDesc);

      gCode.reset(SequenceApplicationTools::getGeneticCode(codonAlphabet->getNucleicAlphabet(), codeDesc));
    }

    // get the data

    VectorSiteContainer* allSites = SequenceApplicationTools::getSiteContainer(alphabet, bppmixedlikelihoods.getParams());

    VectorSiteContainer* sites = SequenceApplicationTools::getSitesToAnalyse(*allSites, bppmixedlikelihoods.getParams(), "", true, false);
    delete allSites;

    ApplicationTools::displayResult("Number of sequences", TextTools::toString(sites->getNumberOfSequences()));
    ApplicationTools::displayResult("Number of sites", TextTools::toString(sites->getNumberOfSites()));

    // Get the tree
    Tree* tree = PhylogeneticsApplicationTools::getTree(bppmixedlikelihoods.getParams());
    ApplicationTools::displayResult("Number of leaves", TextTools::toString(tree->getNumberOfLeaves()));


    AbstractDiscreteRatesAcrossSitesTreeLikelihood* tl;
    string nhOpt = ApplicationTools::getStringParameter("nonhomogeneous", bppmixedlikelihoods.getParams(), "no", "", true, false);
    ApplicationTools::displayResult("Heterogeneous model", nhOpt);

    // Exactly one of 'model' / 'modelSet' is non-null once the likelihood
    // object has been built, depending on the 'nonhomogeneous' option.
    MixedSubstitutionModel* model       = 0;
    MixedSubstitutionModelSet* modelSet = 0;
    DiscreteDistribution* rDist         = 0;

    if (nhOpt == "no")
    {
      model = dynamic_cast<MixedSubstitutionModel*>(PhylogeneticsApplicationTools::getSubstitutionModel(alphabet, gCode.get(), sites, bppmixedlikelihoods.getParams()));
      if (model == 0)
      {
        cout << "Model is not a Mixed model" << endl;
        exit(0);
      }

      SiteContainerTools::changeGapsToUnknownCharacters(*sites);
      if (model->getNumberOfStates() > model->getAlphabet()->getSize())
      {
        // Markov-modulated Markov model!
        rDist = new ConstantRateDistribution();
      }
      else
      {
        rDist = PhylogeneticsApplicationTools::getRateDistribution(bppmixedlikelihoods.getParams());
      }
      tl = new RHomogeneousMixedTreeLikelihood(*tree, *sites, model, rDist, true);
    }
    else if (nhOpt == "one_per_branch")
    {
      model = dynamic_cast<MixedSubstitutionModel*>(PhylogeneticsApplicationTools::getSubstitutionModel(alphabet, gCode.get(), sites, bppmixedlikelihoods.getParams()));
      if (model == 0)
      {
        cout << "Model is not a Mixed model" << endl;
        exit(0);
      }

      SiteContainerTools::changeGapsToUnknownCharacters(*sites);
      if (model->getNumberOfStates() > model->getAlphabet()->getSize())
      {
        // Markov-modulated Markov model!
        rDist = new ConstantRateDistribution();
      }
      else
      {
        rDist = PhylogeneticsApplicationTools::getRateDistribution(bppmixedlikelihoods.getParams());
      }
      vector<double> rateFreqs;
      if (model->getNumberOfStates() != alphabet->getSize())
      {
        // Markov-Modulated Markov Model...
        unsigned int n = (unsigned int)(model->getNumberOfStates() / alphabet->getSize());
        rateFreqs = vector<double>(n, 1. / (double)n); // Equal rates assumed for now, may be changed later (actually, in the most general case,
        // we should assume a rate distribution for the root also!!!
      }

      std::map<std::string, std::string> aliasFreqNames;

      FrequenciesSet* rootFreqs = PhylogeneticsApplicationTools::getRootFrequenciesSet(alphabet, gCode.get(), sites, bppmixedlikelihoods.getParams(), aliasFreqNames, rateFreqs);
      vector<string> globalParameters = ApplicationTools::getVectorParameter<string>("nonhomogeneous_one_per_branch.shared_parameters", bppmixedlikelihoods.getParams(), ',', "");
      modelSet = dynamic_cast<MixedSubstitutionModelSet*>(SubstitutionModelSetTools::createNonHomogeneousModelSet(model, rootFreqs, tree, aliasFreqNames, globalParameters));
      // From here on only the model set is used; 'model' is cleared so that it
      // is neither used nor deleted below.
      model = 0;
      tl = new RNonHomogeneousMixedTreeLikelihood(*tree, *sites, modelSet, rDist, true);
    }
    else if (nhOpt == "general")
    {
      modelSet = dynamic_cast<MixedSubstitutionModelSet*>(PhylogeneticsApplicationTools::getSubstitutionModelSet(alphabet, gCode.get(), sites, bppmixedlikelihoods.getParams()));
      if (modelSet == 0)
      {
        cout << "Missing a Mixed model" << endl;
        exit(0);
      }

      SiteContainerTools::changeGapsToUnknownCharacters(*sites);
      if (modelSet->getNumberOfStates() > modelSet->getAlphabet()->getSize())
      {
        // Markov-modulated Markov model!
        rDist = new ConstantDistribution(1.);
      }
      else
      {
        rDist = PhylogeneticsApplicationTools::getRateDistribution(bppmixedlikelihoods.getParams());
      }
      tl = new RNonHomogeneousMixedTreeLikelihood(*tree, *sites, modelSet, rDist, true);
    }
    else
      throw Exception("Unknown option for nonhomogeneous: " + nhOpt);

    tl->initialize();

    double logL = tl->getValue();
    if (isinf(logL))
    {
      // This may be due to null branch lengths, leading to null likelihood!
      ApplicationTools::displayWarning("!!! Warning!!! Likelihood is zero.");
      ApplicationTools::displayWarning("!!! This may be due to branch length == 0.");
      ApplicationTools::displayWarning("!!! All null branch lengths will be set to 0.000001.");
      ParameterList pl = tl->getBranchLengthsParameters();
      for (unsigned int i = 0; i < pl.size(); i++)
      {
        if (pl[i].getValue() < 0.000001)
          pl[i].setValue(0.000001);
      }
      tl->matchParametersValues(pl);
      logL = tl->getValue();
    }
    if (isinf(logL))
    {
      // Still infinite after fixing branch lengths: report each site and stop.
      ApplicationTools::displayError("!!! Unexpected likelihood == 0.");
      ApplicationTools::displayError("!!! Looking at each site:");
      for (unsigned int i = 0; i < sites->getNumberOfSites(); i++)
      {
        (*ApplicationTools::error << "Site " << sites->getSite(i).getPosition() << "\tlog likelihood = " << tl->getLogLikelihoodForASite(i)).endLine();
      }
      ApplicationTools::displayError("!!! 0 values (inf in log) may be due to computer overflow, particularily if datasets are big (>~500 sequences).");
      exit(-1);
    }


    // Write parameters to screen:
    ApplicationTools::displayResult("Log likelihood", TextTools::toString(tl->getValue(), 15));
    ParameterList parameters = tl->getSubstitutionModelParameters();
    for (unsigned int i = 0; i < parameters.size(); i++)
    {
      ApplicationTools::displayResult(parameters[i].getName(), TextTools::toString(parameters[i].getValue()));
    }
    parameters = tl->getRateDistributionParameters();
    for (unsigned int i = 0; i < parameters.size(); i++)
    {
      ApplicationTools::displayResult(parameters[i].getName(), TextTools::toString(parameters[i].getValue()));
    }


    // /////////////////////////////////////////////
    // Getting likelihoods per submodel

    string outputFile;
    outputFile = ApplicationTools::getAFilePath("output.likelihoods.file", bppmixedlikelihoods.getParams(), true, false);
    ApplicationTools::displayResult("Output file for likelihoods", outputFile);
    ofstream out(outputFile.c_str(), ios::out);

    size_t nSites = sites->getNumberOfSites();

    // 1-based index of the model to analyse inside a model set.
    size_t nummodel = ApplicationTools::getParameter<size_t>("likelihoods.model_number", bppmixedlikelihoods.getParams(), 1, "", true, true);

    string parname = ApplicationTools::getStringParameter("likelihoods.parameter_name", bppmixedlikelihoods.getParams(), "", "", true, false);

    if (modelSet && ((nummodel <= 0) || (nummodel > modelSet->getNumberOfModels())))
    {
      ApplicationTools::displayError("Bad number of model " + TextTools::toString(nummodel) + ".");
      exit(-1);
    }

    MixedSubstitutionModel* p0 = dynamic_cast<MixedSubstitutionModel*>(model ? model : modelSet->getModel(nummodel - 1));

    if (!p0)
    {
      ApplicationTools::displayError("Model " + TextTools::toString(nummodel) + " is not a Mixed Model.");
      exit(-1);
    }

    // A "biblio" mixed model wraps another mixed model: work on a clone of the
    // wrapped model instead, and plug it where the wrapper was.
    const AbstractBiblioMixedSubstitutionModel* ptmp = dynamic_cast<const AbstractBiblioMixedSubstitutionModel*>(p0);
    if (ptmp) {
      p0 = ptmp->getMixedModel().clone();

      if (nhOpt == "no")
        model = p0;
      else {
        modelSet->replaceModel(nummodel-1, p0);
        modelSet->isFullySetUpFor(*tree);
      }
    }

    //////////////////////////////////////////////////
    // Case of a MixtureOfSubstitutionModels

    MixtureOfSubstitutionModels* pMSM = dynamic_cast<MixtureOfSubstitutionModels*>(p0);

    if (pMSM)
    {
      // One output column per submodel, plus the site label column.
      vector<string> colNames;
      colNames.push_back("Sites");

      size_t nummod = pMSM->getNumberOfModels();
      for (unsigned int i = 0; i < nummod; i++)
      {
        colNames.push_back(pMSM->getNModel(i)->getName());
      }

      DataTable* rates = new DataTable(nSites, colNames.size());
      rates->setColumnNames(colNames);

      for (unsigned int i = 0; i < nSites; i++)
      {
        const Site* currentSite = &sites->getSite(i);
        int currentSitePosition = currentSite->getPosition();
        (*rates)(i, "Sites") = string("[" + TextTools::toString(currentSitePosition) + "]");
      }

      // Keep the original mixture probabilities for display only.
      Vdouble vprob = pMSM->getProbabilities();
      for (unsigned int i = 0; i < nummod; i++)
      {
        string modname = pMSM->getNModel(i)->getName();

        // Put all the probability mass on submodel i.
        for (unsigned int j = 0; j < nummod; j++)
        {
          pMSM->setNProbability(j, (j == i) ? 1 : 0);
        }

        // Rebuild the likelihood object with the modified mixture.
        if (tl)
          delete tl;

        if (nhOpt == "no")
          tl = new RHomogeneousMixedTreeLikelihood(*tree, *sites, model, rDist, true, false, true);
        else
          tl = new RNonHomogeneousMixedTreeLikelihood(*tree, *sites, modelSet, rDist, false, true);

        tl->initialize();
        logL = tl->getValue();
        Vdouble Vd = tl->getLogLikelihoodForEachSite();
        for (unsigned int j = 0; j < nSites; j++)
        {
          (*rates)(j, modname) = TextTools::toString(Vd[j]);
        }

        ApplicationTools::displayMessage("\n");
        ApplicationTools::displayMessage("Model " + modname + ":");
        ApplicationTools::displayResult("Log likelihood", TextTools::toString(tl->getValue(), 15));
        ApplicationTools::displayResult("Probability", TextTools::toString(vprob[i], 15));
      }

      DataTable::write(*rates, out, "\t");
      // The table is only needed for writing; release it.
      delete rates;
    }

    //////////////////////////////////////////////////
    // Case of a MixtureOfASubstitutionModel

    else
    {
      MixtureOfASubstitutionModel* pMSM2 = dynamic_cast<MixtureOfASubstitutionModel*>(p0);
      if (pMSM2 != NULL)
      {
        size_t nummod = pMSM2->getNumberOfModels();
        if (parname == "")
        {
          // No parameter given: look for the first parameter whose
          // distribution is not a ConstantDistribution.
          ParameterList pl=pMSM2->getParameters();

          for (size_t i2 = 0; i2 < pl.size(); i2++)
          {
            string pl2n = pl[i2].getName();

            if (dynamic_cast<const ConstantDistribution*>(pMSM2->getDistribution(pl2n))==NULL)
            {
              parname=pl2n;

              // Strip trailing "_xxx" components one at a time until a
              // distribution is found or nothing is left. The prefix is
              // derived from the current candidate, so every pass shortens
              // it and the loop always terminates.
              while (parname.size()>0 && pMSM2->getDistribution(parname)==NULL)
              {
                string::size_type cut = parname.rfind("_");
                if (cut == string::npos)
                  parname.clear();
                else
                  parname.erase(cut);
              }

              if (parname.size()>0){
                ApplicationTools::displayResult("likelihoods.parameter_name", parname);
                break;
              }
            }
          }
        }

        if (parname == "")
        {
          ApplicationTools::displayError("Argument likelihoods.parameter_name is required.");
          exit(-1);
        }

        // For each class "<parname>_<k>" (k = 1, 2, ...), collect the indices of
        // the submodels belonging to it; stop at the first class with none.
        vector< Vint > vvnmod;
        size_t i2 = 0;
        while (i2 < nummod)
        {
          string par2 = parname + "_" + TextTools::toString(i2 + 1);
          Vint vnmod = pMSM2->getSubmodelNumbers(par2);
          if (vnmod.size() == 0)
            break;
          vvnmod.push_back(vnmod);
          i2++;
        }

        size_t nbcl = vvnmod.size();
        if (nbcl==0)
          throw Exception("Parameter " + parname + " is not mixed.");

        Vdouble vprob = pMSM2->getProbabilities();

        // vvprob[i]: probabilities of the submodels of class i;
        // vsprob[i]: their sum, i.e. the probability of class i.
        vector<vector<double> > vvprob;
        vector<double> vsprob;

        for (size_t i = 0; i < nbcl; i++)
        {
          vector<double> vprob2;
          for (size_t j = 0; j < vvnmod[i].size(); j++)
          {
            vprob2.push_back(vprob[static_cast<size_t>(vvnmod[i][j])]);
          }

          vvprob.push_back(vprob2);
          vsprob.push_back(VectorTools::sum(vvprob[i]));
        }

        // Columns: "Sites", one "Ll_" column per class, one "Pr_" column per
        // class, then "mean".
        vector<string> colNames;
        colNames.push_back("Sites");

        Vdouble dval;
        for (unsigned int i = 0; i < nbcl; i++)
        {
          // The parameter value of a class is read from its first submodel.
          SubstitutionModel* pSM = pMSM2->getNModel(static_cast<size_t>(vvnmod[i][0]));
          double valPar = pSM->getParameterValue(pSM->getParameterNameWithoutNamespace(parname));
          dval.push_back(valPar);
          colNames.push_back("Ll_" + parname + "=" + TextTools::toString(valPar));
        }
        for (unsigned int i = 0; i < nbcl; i++)
          colNames.push_back("Pr_" + parname + "=" + TextTools::toString(dval[i]));

        colNames.push_back("mean");

        DataTable* rates = new DataTable(nSites, colNames.size());
        rates->setColumnNames(colNames);

        for (unsigned int i = 0; i < nSites; i++)
        {
          const Site* currentSite = &sites->getSite(i);
          int currentSitePosition = currentSite->getPosition();
          (*rates)(i,"Sites")=TextTools::toString(currentSitePosition);
        }

        // vvd[i][j]: log likelihood of site j under class i.
        VVdouble vvd;


        vector<double> vRates = pMSM2->getVRates();

        for (size_t i = 0; i < nbcl; ++i)
        {
          string par2 = parname + "_" + TextTools::toString(i + 1);

          // Restrict the mixture to class i, renormalising its probabilities.
          for (unsigned int j = 0; j < nummod; ++j)
            pMSM2->setNProbability(j, 0);

          for (size_t j = 0; j < vvprob[i].size(); ++j)
            pMSM2->setNProbability(static_cast<size_t>(vvnmod[i][j]), vvprob[i][j] / vsprob[i]);

          if (tl)
            delete tl;

          if (nhOpt == "no")
            tl = new RHomogeneousMixedTreeLikelihood(*tree, *sites, model, rDist, true, false, true);
          else
            tl = new RNonHomogeneousMixedTreeLikelihood(*tree, *sites, modelSet, rDist, false, true);

          tl->initialize();
          logL = tl->getValue();
          Vdouble vd = tl->getLogLikelihoodForEachSite();

          for (unsigned int j = 0; j < nSites; j++)
            (*rates)(j, i + 1) = TextTools::toString(vd[j]);

          vvd.push_back(vd);

          ApplicationTools::displayMessage("\n");
          ApplicationTools::displayMessage("Parameter " + par2 + "=" + TextTools::toString(dval[i]) + " with rate=" + TextTools::toString(vRates[i]));

          ApplicationTools::displayResult("Log likelihood", TextTools::toString(tl->getValue(), 15));
          ApplicationTools::displayResult("Probability", TextTools::toString(vsprob[i], 15));
        }

        // Per site: normalised class weights built from
        // log(class probability) + class log likelihood, then the "mean" column.
        for (unsigned int j = 0; j < nSites; j++)
        {
          Vdouble vd;
          for (unsigned int i = 0; i < nbcl; i++)
            vd.push_back(std::log(vsprob[i])+vvd[i][j]);

          VectorTools::logNorm(vd);
          for (unsigned int i = 0; i < nbcl; i++)
            (*rates)(j,nbcl + i + 1) = TextTools::toString(std::exp(vd[i]));
          (*rates)(j, 2 * nbcl + 1) = TextTools::toString(VectorTools::sumExp(vd, dval));
        }

        DataTable::write(*rates, out, "\t");
        // The table is only needed for writing; release it.
        delete rates;
      }
    }

    delete alphabet;
    delete sites;
    if (model)
      delete model;
    if (modelSet)
      delete modelSet;
    delete rDist;
    delete tl;
    delete tree;
    ApplicationTools::displayMessage("\n");
    bppmixedlikelihoods.done();
  }

  catch (exception& e)
  {
    cout << e.what() << endl;
    return 1;
  }

  return 0;
}
Exemplo n.º 4
0
/**
 * @brief Build the set of sites to analyse from a full alignment.
 *
 * The "sequence.sites_to_use" parameter selects the strategy:
 *  - "all": copy every site, then drop those with too many gaps according to
 *    "sequence.max_gap_allowed" (a percentage when it ends with '%', an
 *    absolute count otherwise); gaps are optionally turned into unknown
 *    characters;
 *  - "complete": keep the sites returned by SiteContainerTools::getCompleteSites;
 *  - "nogap": keep the sites returned by SiteContainerTools::getSitesWithoutGaps.
 *
 * The container is allocated only inside the branch that needs it, so no
 * temporary copy of the alignment is left behind.
 *
 * @param allSites         The full alignment.
 * @param params           Application parameters.
 * @param suffix           Suffix appended to parameter names.
 * @param suffixIsOptional Whether the suffix may be absent.
 * @param gapAsUnknown     With "all", convert gaps to unknown characters.
 * @param verbose          Print the option used and the number of sites kept.
 * @return A newly allocated container, owned by the caller. The program
 *         exits if the option is unknown.
 */
VectorSiteContainer * SequenceApplicationTools::getSitesToAnalyse(
  const SiteContainer & allSites,
  map<string, string> & params,
  string suffix,
  bool suffixIsOptional,
  bool gapAsUnknown,
  bool verbose)
{
  VectorSiteContainer * sitesToAnalyse = 0;

  string option = ApplicationTools::getStringParameter("sequence.sites_to_use", params, "complete", suffix, suffixIsOptional);
  if(verbose) ApplicationTools::displayResult("Sites to use", option);
  if(option == "all")
  {
    sitesToAnalyse = new VectorSiteContainer(allSites);
    string maxGapOption = ApplicationTools::getStringParameter("sequence.max_gap_allowed", params, "100%", suffix, suffixIsOptional);
    if(!maxGapOption.empty() && maxGapOption[maxGapOption.size()-1] == '%')
    {
      // Threshold given as a percentage of gaps per site.
      double gapFreq = TextTools::toDouble(maxGapOption.substr(0,maxGapOption.size()-1)) / 100.;
      // Walk backwards so that deleting a site does not shift the ones still
      // to be visited.
      for(unsigned int i = sitesToAnalyse->getNumberOfSites(); i > 0; i--)
      {
        map<int, double> freq;
        SiteTools::getFrequencies(*sitesToAnalyse->getSite(i-1), freq);
        // State -1 is the one tested against the gap threshold.
        if(freq[-1] > gapFreq) sitesToAnalyse->deleteSite(i-1);
      }
    }
    else
    {
      // Threshold given as an absolute number of gaps per site.
      unsigned int gapNum=TextTools::to<unsigned int>(maxGapOption);
      for(unsigned int i = sitesToAnalyse->getNumberOfSites(); i > 0; i--)
      {
        map<int, unsigned int> counts;
        SiteTools::getCounts(*sitesToAnalyse->getSite(i-1), counts);
        if(counts[-1] > gapNum) sitesToAnalyse->deleteSite(i-1);
      }
    }
    if(gapAsUnknown)
    {
      SiteContainerTools::changeGapsToUnknownCharacters(*sitesToAnalyse);
    }
  }
  else if(option == "complete")
  {
    sitesToAnalyse = dynamic_cast<VectorSiteContainer *>(SiteContainerTools::getCompleteSites(allSites));
    int nbSites = sitesToAnalyse->getNumberOfSites();
    if(verbose) ApplicationTools::displayResult("Complete sites", TextTools::toString(nbSites));
  }
  else if(option == "nogap")
  {
    sitesToAnalyse = dynamic_cast<VectorSiteContainer *>(SiteContainerTools::getSitesWithoutGaps(allSites));
    int nbSites = sitesToAnalyse->getNumberOfSites();
    if(verbose) ApplicationTools::displayResult("Sites without gap", TextTools::toString(nbSites));
  }
  else
  {
    ApplicationTools::displayError("Option '" + option + "' unknown in parameter 'sequence.sites_to_use'.");
    exit(-1);
  }

  return sitesToAnalyse;
}
Exemplo n.º 5
0
/**
 * @brief Read an alignment from a file, with optional Mase site selection.
 *
 * The file path and format come from the "sequence.file" and
 * "sequence.format" parameters. Supported formats are "Mase", "Phylip",
 * "Fasta" and "Clustal"; Phylip reading is tuned by the
 * "sequence.format_phylip.*" parameters. For Mase files,
 * "sequence.format_mase.site_selection" may name a site set to extract.
 *
 * @param alpha            Alphabet used to read the sequences.
 * @param params           Application parameters.
 * @param suffix           Suffix appended to parameter names.
 * @param suffixIsOptional Whether the suffix may be absent.
 * @param verbose          Print the format, file and selection used.
 * @return A newly allocated container, owned by the caller. The program
 *         exits on an unknown format or an unusable Mase site set.
 * @throw Exception On an unknown Phylip split option, or if the data read
 *        is not an ordered sequence container.
 */
VectorSiteContainer * SequenceApplicationTools::getSiteContainer(
  const Alphabet * alpha,
  map<string, string> & params,
  const string & suffix,
  bool suffixIsOptional,
  bool verbose)
{
  string sequenceFilePath = ApplicationTools::getAFilePath("sequence.file",params, true, true, suffix, suffixIsOptional);
  string sequenceFormat = ApplicationTools::getStringParameter("sequence.format", params, "Fasta", suffix, suffixIsOptional);
  if(verbose) ApplicationTools::displayResult("Sequence format " + suffix, sequenceFormat);
  ISequence * iSeq = NULL;
  if(sequenceFormat == "Mase")
  {
    iSeq = new Mase();
  }
  else if(sequenceFormat == "Phylip")
  {
    // Defaults: sequential, extended, names separated by two spaces.
    bool sequential = true, extended = true;
    string split = "  ";
    if(params.find("sequence.format_phylip.order") != params.end())
    {
           if(params["sequence.format_phylip.order"] == "sequential" ) sequential = true;
      else if(params["sequence.format_phylip.order"] == "interleaved") sequential = false;
      else ApplicationTools::displayWarning("Argument '" +
             params["sequence.format_phylip.order"] +
             "' for parameter 'sequence.format_phylip.order' is unknown. " +
             "Default used instead: sequential.");
    }
    else ApplicationTools::displayWarning("Argument 'sequence.format_phylip.order' not found. Default used instead: sequential.");
    if(params.find("sequence.format_phylip.ext") != params.end())
    {
      if(params["sequence.format_phylip.ext"] == "extended")
      {
        extended = true;
        split = ApplicationTools::getStringParameter("sequence.format_phylip.extended.split", params, "spaces", suffix, suffixIsOptional);
        if(split == "spaces") split = "  ";
        else if(split == "tab") split = "\t";
        else throw Exception("Unknown option for sequence.format_phylip.extended.split: " + split);
      }
      else if(params["sequence.format_phylip.ext"] == "classic" ) extended = false;
      else ApplicationTools::displayWarning("Argument '" +
             params["sequence.format_phylip.ext"] +
             "' for parameter 'sequence.format_phylip.ext' is unknown. " +
             "Default used instead: extended.");
    }
    else ApplicationTools::displayWarning("Argument 'sequence.format_phylip.ext' not found. Default used instead: extended.");
    iSeq = new Phylip(extended, sequential, 100, true, split);
  }
  else if(sequenceFormat == "Fasta") iSeq = new Fasta();
  else if(sequenceFormat == "Clustal") iSeq = new Clustal();
  else
  {
    ApplicationTools::displayError("Unknown sequence format.");
    exit(-1);
  }

  // Read the file; the reader is released whether or not reading succeeds.
  const SequenceContainer * seqCont = NULL;
  try
  {
    seqCont = iSeq->read(sequenceFilePath, alpha);
  }
  catch(...)
  {
    delete iSeq;
    throw;
  }
  delete iSeq;

  const OrderedSequenceContainer * ordered = dynamic_cast<const OrderedSequenceContainer *>(seqCont);
  if(ordered == NULL)
  {
    delete seqCont;
    throw Exception("SequenceApplicationTools::getSiteContainer(). Input data is not an ordered sequence container.");
  }
  VectorSiteContainer * sites = new VectorSiteContainer(* ordered);
  delete seqCont;

  if(verbose) ApplicationTools::displayResult("Sequence file " + suffix, sequenceFilePath);

  // Look for site selection:
  if(sequenceFormat == "Mase")
  {
    //getting site set:
    string siteSet = ApplicationTools::getStringParameter("sequence.format_mase.site_selection", params, "none", suffix, suffixIsOptional, false);
    if(siteSet != "none")
    {
      VectorSiteContainer * selectedSites = NULL;
      try
      {
        selectedSites = dynamic_cast<VectorSiteContainer *>(MaseTools::getSelectedSites(* sites, siteSet));
        if(verbose) ApplicationTools::displayResult("Set found", TextTools::toString(siteSet) + " sites.");
      }
      catch(const IOException &)
      {
        ApplicationTools::displayError("Site Set '" + siteSet + "' not found.");
        exit(-1);
      }
      if(selectedSites->getNumberOfSites() == 0)
      {
        ApplicationTools::displayError("Site Set '" + siteSet + "' is empty.");
        exit(-1);
      }
      delete sites;
      sites = selectedSites;
    }
  }
  return sites;
}
Exemplo n.º 6
0
/**
 * @brief Entry point of the BppDist program.
 *
 * Prints the program banner, then (unless called without arguments, in which
 * case help() is shown) performs the following steps:
 *  - reads the alphabet, optional genetic code, and alignment from the
 *    application parameters;
 *  - builds the substitution model and rate distribution;
 *  - reconstructs a distance tree with the requested agglomerative method
 *    (wpgma, upgma, nj or bionj) and parameter estimation procedure
 *    (init, pairwise or iterations);
 *  - optionally writes the distance matrix, the estimated parameters and
 *    bootstrap trees, and writes the resulting tree.
 *
 * @param args Number of command line arguments (including the program name).
 * @param argv Command line arguments, forwarded to BppApplication.
 * @return 0 on success (or when only the help is displayed), 1 if an
 *         exception derived from std::exception was caught.
 */
int main(int args, char ** argv)
{
  cout << "******************************************************************" << endl;
  cout << "*              Bio++ Distance Methods, version 2.2.0             *" << endl;
  cout << "* Author: J. Dutheil                        Created     05/05/07 *" << endl;
  cout << "*                                           Last Modif. 04/02/15 *" << endl;
  cout << "******************************************************************" << endl;
  cout << endl;

  // No argument given: only display the help message.
  if (args == 1)
  {
    help();
    return 0;
  }

  try
  {
    BppApplication bppdist(args, argv, "BppDist");
    bppdist.startTimer();

    // Alphabet and, for codon alphabets only, the genetic code.
    Alphabet* alphabet = SequenceApplicationTools::getAlphabet(bppdist.getParams(), "", false);
    auto_ptr<GeneticCode> gCode;
    CodonAlphabet* codonAlphabet = dynamic_cast<CodonAlphabet*>(alphabet);
    if (codonAlphabet)
    {
      string codeDesc = ApplicationTools::getStringParameter("genetic_code", bppdist.getParams(), "Standard", "", true, true);
      ApplicationTools::displayResult("Genetic Code", codeDesc);

      gCode.reset(SequenceApplicationTools::getGeneticCode(codonAlphabet->getNucleicAlphabet(), codeDesc));
    }

    // Read the full alignment, then keep only the sites to analyse.
    VectorSiteContainer* allSites = SequenceApplicationTools::getSiteContainer(alphabet, bppdist.getParams());

    VectorSiteContainer* sites = SequenceApplicationTools::getSitesToAnalyse(*allSites, bppdist.getParams());
    delete allSites;

    ApplicationTools::displayResult("Number of sequences", TextTools::toString(sites->getNumberOfSequences()));
    ApplicationTools::displayResult("Number of sites", TextTools::toString(sites->getNumberOfSites()));

    SubstitutionModel* model = PhylogeneticsApplicationTools::getSubstitutionModel(alphabet, gCode.get(), sites, bppdist.getParams());

    DiscreteDistribution* rDist = 0;
    if (model->getNumberOfStates() > model->getAlphabet()->getSize())
    {
      // Markov-modulated Markov model!
      rDist = new ConstantRateDistribution();
    }
    else
    {
      rDist = PhylogeneticsApplicationTools::getRateDistribution(bppdist.getParams());
    }

    // NOTE(review): model and rDist are never deleted in this function;
    // presumably distEstimation takes ownership of them -- confirm against
    // the DistanceEstimation documentation.
    DistanceEstimation distEstimation(model, rDist, sites, 1, false);

    // Tree reconstruction method.
    string method = ApplicationTools::getStringParameter("method", bppdist.getParams(), "nj");
    ApplicationTools::displayResult("Tree reconstruction method", method);
    TreeTemplate<Node>* tree = 0;
    AgglomerativeDistanceMethod* distMethod = 0;
    if (method == "wpgma")
    {
      PGMA* wpgma = new PGMA(true);
      distMethod = wpgma;
    }
    else if (method == "upgma")
    {
      PGMA* upgma = new PGMA(false);
      distMethod = upgma;
    }
    else if (method == "nj")
    {
      NeighborJoining* nj = new NeighborJoining();
      nj->outputPositiveLengths(true);
      distMethod = nj;
    }
    else if (method == "bionj")
    {
      BioNJ* bionj = new BioNJ();
      bionj->outputPositiveLengths(true);
      distMethod = bionj;
    }
    else throw Exception("Unknown tree reconstruction method.");

    // Parameter estimation procedure: map the user keyword to the
    // corresponding OptimizationTools constant.
    string type = ApplicationTools::getStringParameter("optimization.method", bppdist.getParams(), "init");
    ApplicationTools::displayResult("Model parameters estimation method", type);
    if (type == "init") type = OptimizationTools::DISTANCEMETHOD_INIT;
    else if (type == "pairwise") type = OptimizationTools::DISTANCEMETHOD_PAIRWISE;
    else if (type == "iterations") type = OptimizationTools::DISTANCEMETHOD_ITERATIONS;
    else throw Exception("Unknown parameter estimation procedure '" + type + "'.");

    unsigned int optVerbose = ApplicationTools::getParameter<unsigned int>("optimization.verbose", bppdist.getParams(), 2);

    // Message handler: "none" disables it, "std" reuses the shared
    // application message stream, anything else is a file path.
    string mhPath = ApplicationTools::getAFilePath("optimization.message_handler", bppdist.getParams(), false, false);
    OutputStream* messenger =
      (mhPath == "none") ? 0 :
        (mhPath == "std") ? ApplicationTools::message :
          new StlOutputStream(new ofstream(mhPath.c_str(), ios::out));
    ApplicationTools::displayResult("Message handler", mhPath);

    // Profiler: same conventions as the message handler.
    string prPath = ApplicationTools::getAFilePath("optimization.profiler", bppdist.getParams(), false, false);
    OutputStream* profiler =
      (prPath == "none") ? 0 :
        (prPath == "std") ? ApplicationTools::message :
          new StlOutputStream(new ofstream(prPath.c_str(), ios::out));
    if (profiler) profiler->setPrecision(20);
    ApplicationTools::displayResult("Profiler", prPath);

    // Should I ignore some parameters?
    ParameterList allParameters = model->getParameters();
    allParameters.addParameters(rDist->getParameters());
    ParameterList parametersToIgnore;
    string paramListDesc = ApplicationTools::getStringParameter("optimization.ignore_parameter", bppdist.getParams(), "", "", true, false);
    bool ignoreBrLen = false;
    StringTokenizer st(paramListDesc, ",");
    while (st.hasMoreToken())
    {
      try
      {
        string param = st.nextToken();
        if (param == "BrLen")
          ignoreBrLen = true; // Special keyword: branch lengths, not a model parameter.
        else
        {
          if (allParameters.hasParameter(param))
          {
            Parameter* p = &allParameters.getParameter(param);
            parametersToIgnore.addParameter(*p);
          }
          else ApplicationTools::displayWarning("Parameter '" + param + "' not found.");
        }
      }
      catch (ParameterNotFoundException& pnfe)
      {
        ApplicationTools::displayError("Parameter '" + pnfe.getParameter() + "' not found, and so can't be ignored!");
      }
    }

    unsigned int nbEvalMax = ApplicationTools::getParameter<unsigned int>("optimization.max_number_f_eval", bppdist.getParams(), 1000000);
    ApplicationTools::displayResult("Max # ML evaluations", TextTools::toString(nbEvalMax));

    double tolerance = ApplicationTools::getDoubleParameter("optimization.tolerance", bppdist.getParams(), .000001);
    ApplicationTools::displayResult("Tolerance", TextTools::toString(tolerance));

    // Here it is: warnings are redirected to the file "warnings" while the
    // tree is being built, then the warning stream is pointed back at the
    // message stream.
    ofstream warn("warnings", ios::out);
    ApplicationTools::warning = new StlOutputStreamWrapper(&warn);
    tree = OptimizationTools::buildDistanceTree(distEstimation, *distMethod, parametersToIgnore, !ignoreBrLen, type, tolerance, nbEvalMax, profiler, messenger, optVerbose);
    warn.close();
    delete ApplicationTools::warning;
    ApplicationTools::warning = ApplicationTools::message;

    // Optional distance matrix output.
    string matrixPath = ApplicationTools::getAFilePath("output.matrix.file", bppdist.getParams(), false, false, "", false);
    if (matrixPath != "none")
    {
      ApplicationTools::displayResult("Output matrix file", matrixPath);
      string matrixFormat = ApplicationTools::getAFilePath("output.matrix.format", bppdist.getParams(), false, false, "", false);
      string format = "";
      bool extended = false;
      std::map<std::string, std::string> unparsedArguments_;
      KeyvalTools::parseProcedure(matrixFormat, format, unparsedArguments_);
      if (unparsedArguments_.find("type") != unparsedArguments_.end())
      {
        if (unparsedArguments_["type"] == "extended")
        {
          extended = true;
        }
        else if (unparsedArguments_["type"] == "classic")
          extended = false;
        else
          ApplicationTools::displayWarning("Argument '" +
                                           unparsedArguments_["type"] + "' for parameter 'Phylip#type' is unknown. " +
                                           "Default used instead: not extended.");
      }
      else
        ApplicationTools::displayWarning("Argument 'Phylip#type' not found. Default used instead: not extended.");

      // The parsed format name is not consulted: a Phylip writer is always used.
      ODistanceMatrix* odm = IODistanceMatrixFactory().createWriter(IODistanceMatrixFactory::PHYLIP_FORMAT, extended);
      odm->write(*distEstimation.getMatrix(), matrixPath, true);
      delete odm;
    }
    PhylogeneticsApplicationTools::writeTree(*tree, bppdist.getParams());

    // Output some parameters:
    if (type == OptimizationTools::DISTANCEMETHOD_ITERATIONS)
    {
      // Write parameters to screen:
      ParameterList parameters = model->getParameters();
      for (unsigned int i = 0; i < parameters.size(); i++)
      {
        ApplicationTools::displayResult(parameters[i].getName(), TextTools::toString(parameters[i].getValue()));
      }
      parameters = rDist->getParameters();
      for (unsigned int i = 0; i < parameters.size(); i++)
      {
        ApplicationTools::displayResult(parameters[i].getName(), TextTools::toString(parameters[i].getValue()));
      }
      // Write parameters to file:
      string parametersFile = ApplicationTools::getAFilePath("output.estimates", bppdist.getParams(), false, false);
      if (parametersFile != "none")
      {
        ofstream out(parametersFile.c_str(), ios::out);
        parameters = model->getParameters();
        for (unsigned int i = 0; i < parameters.size(); i++)
        {
          out << parameters[i].getName() << " = " << parameters[i].getValue() << endl;
        }
        parameters = rDist->getParameters();
        for (unsigned int i = 0; i < parameters.size(); i++)
        {
          out << parameters[i].getName() << " = " << parameters[i].getValue() << endl;
        }
        out.close();
      }
    }

    // Bootstrap:
    unsigned int nbBS = ApplicationTools::getParameter<unsigned int>("bootstrap.number", bppdist.getParams(), 0);
    if (nbBS > 0)
    {
      ApplicationTools::displayResult("Number of bootstrap samples", TextTools::toString(nbBS));
      bool approx = ApplicationTools::getBooleanParameter("bootstrap.approximate", bppdist.getParams(), true);
      ApplicationTools::displayResult("Use approximate bootstrap", TextTools::toString(approx ? "yes" : "no"));
      if (approx)
      {
        // Approximate bootstrap: no parameter and no branch length is
        // re-estimated on the replicates.
        type = OptimizationTools::DISTANCEMETHOD_INIT;
        parametersToIgnore = allParameters;
        ignoreBrLen = true;
      }
      bool bootstrapVerbose = ApplicationTools::getBooleanParameter("bootstrap.verbose", bppdist.getParams(), false, "", true, false);

      // Newick::write is given the path directly, so only a flag is needed
      // here rather than a separately opened stream on the same file.
      string bsTreesPath = ApplicationTools::getAFilePath("bootstrap.output.file", bppdist.getParams(), false, false);
      const bool saveBsTrees = (bsTreesPath != "none");
      if (saveBsTrees)
      {
        ApplicationTools::displayResult("Bootstrap trees stored in file", bsTreesPath);
      }
      Newick newick;

      vector<Tree *> bsTrees(nbBS);
      ApplicationTools::displayTask("Bootstrapping", true);
      for (unsigned int i = 0; i < nbBS; i++)
      {
        ApplicationTools::displayGauge(i, nbBS-1, '=');
        VectorSiteContainer * sample = SiteContainerTools::bootstrapSites(*sites);
        if (approx) model->setFreqFromData(*sample);
        distEstimation.setData(sample);
        // The fourth argument uses the same polarity as the main tree build
        // above (!ignoreBrLen); it was previously passed un-negated here.
        bsTrees[i] = OptimizationTools::buildDistanceTree(
            distEstimation,
            *distMethod,
            parametersToIgnore,
            !ignoreBrLen,
            type,
            tolerance,
            nbEvalMax,
            NULL,
            NULL,
            (bootstrapVerbose ? 1 : 0)
          );
        // The first replicate overwrites the file, the following ones append.
        if (saveBsTrees) newick.write(*bsTrees[i], bsTreesPath, i == 0);
        delete sample;
      }
      ApplicationTools::displayTaskDone();
      ApplicationTools::displayTask("Compute bootstrap values");
      TreeTools::computeBootstrapValues(*tree, bsTrees);
      ApplicationTools::displayTaskDone();
      for (unsigned int i = 0; i < nbBS; i++) delete bsTrees[i];

      // Write resulting tree:
      PhylogeneticsApplicationTools::writeTree(*tree, bppdist.getParams());
    }

    // Release the optimization streams allocated above; the shared
    // ApplicationTools::message stream is not ours to delete.
    if (messenger != ApplicationTools::message) delete messenger;
    if (profiler != ApplicationTools::message) delete profiler;

    delete alphabet;
    delete sites;
    delete distMethod;
    delete tree;

    bppdist.done();
  }
  catch (const exception& e)
  {
    cout << e.what() << endl;
    return 1;
  }

  return 0;
}