Example #1
0
int main() {
  TreeTemplate<Node>* tree = TreeTemplateTools::parenthesisToTree("((A:0.01, B:0.02):0.03,C:0.01,D:0.1);");
  vector<string> seqNames= tree->getLeavesNames();
  vector<int> ids = tree->getNodesId();
  //-------------

  NucleicAlphabet* alphabet = new DNA();
  SubstitutionModel* model = new T92(alphabet, 3.);
  FrequenciesSet* rootFreqs = new GCFrequenciesSet(alphabet);
  std::vector<std::string> globalParameterNames;
  globalParameterNames.push_back("T92.kappa");
  map<string, string> alias;

  SubstitutionModelSet* modelSet = SubstitutionModelSetTools::createNonHomogeneousModelSet(model, rootFreqs, tree, alias, globalParameterNames);
  DiscreteDistribution* rdist = new ConstantRateDistribution();
  vector<double> thetas;
  for (unsigned int i = 0; i < modelSet->getNumberOfModels(); ++i) {
    double theta = RandomTools::giveRandomNumberBetweenZeroAndEntry(0.99) + 0.005;
    cout << "Theta" << i << " set to " << theta << endl; 
    modelSet->setParameterValue("T92.theta_" + TextTools::toString(i + 1), theta);
    thetas.push_back(theta);
  }
  NonHomogeneousSequenceSimulator simulator(modelSet, rdist, tree);

  unsigned int n = 100000;
  OutputStream* profiler  = new StlOutputStream(new ofstream("profile.txt", ios::out));
  OutputStream* messenger = new StlOutputStream(new ofstream("messages.txt", ios::out));

  //Check fast simulation first:
 
  cout << "Fast check:" << endl;
 
  //Generate data set:
  VectorSiteContainer sites(seqNames, alphabet);
  for (unsigned int i = 0; i < n; ++i) {
    auto_ptr<Site> site(simulator.simulateSite());
    site->setPosition(static_cast<int>(i));
    sites.addSite(*site, false);
  }

  //Now fit model:
  SubstitutionModelSet* modelSet2 = modelSet->clone();
  RNonHomogeneousTreeLikelihood tl(*tree, sites, modelSet2, rdist);
  tl.initialize();

  OptimizationTools::optimizeNumericalParameters2(
      &tl, tl.getParameters(), 0,
      0.0001, 10000, messenger, profiler, false, false, 1, OptimizationTools::OPTIMIZATION_NEWTON);

  //Now compare estimated values to real ones:
  for (size_t i = 0; i < thetas.size(); ++i) {
    cout << thetas[i] << "\t" << modelSet2->getModel(i)->getParameter("theta").getValue() << endl;
    double diff = abs(thetas[i] - modelSet2->getModel(i)->getParameter("theta").getValue());
    if (diff > 0.1)
      return 1;
  }
  delete modelSet2;

  //Now try detailed simulations:

  cout << "Detailed check:" << endl;
  
  //Generate data set:
  VectorSiteContainer sites2(seqNames, alphabet);
  for (unsigned int i = 0; i < n; ++i) {
    RASiteSimulationResult* result = simulator.dSimulateSite();
    auto_ptr<Site> site(result->getSite(*simulator.getSubstitutionModelSet()->getModel(0)));
    site->setPosition(static_cast<int>(i));
    sites2.addSite(*site, false);
    delete result;
  }

  //Now fit model:
  SubstitutionModelSet* modelSet3 = modelSet->clone();
  RNonHomogeneousTreeLikelihood tl2(*tree, sites2, modelSet3, rdist);
  tl2.initialize();

  OptimizationTools::optimizeNumericalParameters2(
      &tl2, tl2.getParameters(), 0,
      0.0001, 10000, messenger, profiler, false, false, 1, OptimizationTools::OPTIMIZATION_NEWTON);

  //Now compare estimated values to real ones:
  for (size_t i = 0; i < thetas.size(); ++i) {
    cout << thetas[i] << "\t" << modelSet3->getModel(i)->getParameter("theta").getValue() << endl;
    double diff = abs(thetas[i] - modelSet3->getModel(i)->getParameter("theta").getValue());
    if (diff > 0.1)
      return 1;
  }
  delete modelSet3;

  //-------------
  delete tree;
  delete alphabet;
  delete modelSet;
  delete rdist;

  return 0;
}
Example #2
0
int main() {
  TreeTemplate<Node>* tree = TreeTemplateTools::parenthesisToTree("(((A:0.1, B:0.2):0.3,C:0.1):0.2,(D:0.3,(E:0.2,F:0.05):0.1):0.1);");
  vector<string> seqNames= tree->getLeavesNames();
  vector<int> ids = tree->getNodesId();
  //-------------

  const NucleicAlphabet* alphabet = &AlphabetTools::DNA_ALPHABET;
  FrequenciesSet* rootFreqs = new GCFrequenciesSet(alphabet);
  SubstitutionModel* model = new T92(alphabet, 3.);
  std::vector<std::string> globalParameterNames;
  globalParameterNames.push_back("T92.kappa");
  map<string, string> alias;

  SubstitutionModelSet* modelSet = SubstitutionModelSetTools::createNonHomogeneousModelSet(model, rootFreqs, tree, alias, globalParameterNames);
  //DiscreteDistribution* rdist = new ConstantDistribution(1.0, true);
  //Very difficult to optimize on small datasets:
  DiscreteDistribution* rdist = new GammaDiscreteRateDistribution(4, 1.0);

  size_t nsites = 1000;
  unsigned int nrep = 20;
  size_t nmodels = modelSet->getNumberOfModels();
  vector<double> thetas(nmodels);
  vector<double> thetasEst1(nmodels);
  vector<double> thetasEst2(nmodels);

  for (size_t i = 0; i < nmodels; ++i) {
    double theta = RandomTools::giveRandomNumberBetweenZeroAndEntry(0.99) + 0.005;
    cout << "Theta" << i << " set to " << theta << endl; 
    modelSet->setParameterValue("T92.theta_" + TextTools::toString(i + 1), theta);
    thetas[i] = theta;
  }
  NonHomogeneousSequenceSimulator simulator(modelSet, rdist, tree);
 
  for (unsigned int j = 0; j < nrep; j++) {

    OutputStream* profiler  = new StlOutputStream(new ofstream("profile.txt", ios::out));
    OutputStream* messenger = new StlOutputStream(new ofstream("messages.txt", ios::out));

    //Simulate data:
    auto_ptr<SiteContainer> sites(simulator.simulate(nsites));
    //Now fit model:
    auto_ptr<SubstitutionModelSet> modelSet2(modelSet->clone());
    auto_ptr<SubstitutionModelSet> modelSet3(modelSet->clone());
    RNonHomogeneousTreeLikelihood tl(*tree, *sites.get(), modelSet2.get(), rdist, true, true, false);
    tl.initialize();
    RNonHomogeneousTreeLikelihood tl2(*tree, *sites.get(), modelSet3.get(), rdist, true, true, true);
    tl2.initialize();
   
    unsigned int c1 = OptimizationTools::optimizeNumericalParameters2(
        &tl, tl.getParameters(), 0,
        0.0001, 10000, messenger, profiler, false, false, 1, OptimizationTools::OPTIMIZATION_NEWTON);

    unsigned int c2 = OptimizationTools::optimizeNumericalParameters2(
        &tl2, tl2.getParameters(), 0,
        0.0001, 10000, messenger, profiler, false, false, 1, OptimizationTools::OPTIMIZATION_NEWTON);

    cout << c1 << ": " << tl.getValue() << "\t" << c2 << ": " << tl2.getValue() << endl;
      
    for (size_t i = 0; i < nmodels; ++i) {
      cout << modelSet2->getModel(i)->getParameter("theta").getValue() << "\t" << modelSet3->getModel(i)->getParameter("theta").getValue() << endl;
      //if (abs(modelSet2->getModel(i)->getParameter("theta").getValue() - modelSet3->getModel(i)->getParameter("theta").getValue()) > 0.1)
      //  return 1;
      thetasEst1[i] +=  modelSet2->getModel(i)->getParameter("theta").getValue();
      thetasEst2[i] +=  modelSet3->getModel(i)->getParameter("theta").getValue();
    }
  }
  thetasEst1 /= static_cast<double>(nrep);
  thetasEst2 /= static_cast<double>(nrep);

  //Now compare estimated values to real ones:
  for (size_t i = 0; i < thetas.size(); ++i) {
     cout << thetas[i] << "\t" << thetasEst1[i] << "\t" << thetasEst2[i] << endl;
     double diff1 = abs(thetas[i] - thetasEst1[i]);
     double diff2 = abs(thetas[i] - thetasEst2[i]);
     if (diff1 > 0.2 || diff2 > 0.2)
        return 1;
  }

  //-------------
  delete tree;
  delete modelSet;
  delete rdist;

  return 0;
}