Beispiel #1
0
int main() {
  TreeTemplate<Node>* tree = TreeTemplateTools::parenthesisToTree("(((A:0.1, B:0.2):0.3,C:0.1):0.2,(D:0.3,(E:0.2,F:0.05):0.1):0.1);");
  vector<string> seqNames= tree->getLeavesNames();
  vector<int> ids = tree->getNodesId();
  //-------------

  const NucleicAlphabet* alphabet = &AlphabetTools::DNA_ALPHABET;
  FrequenciesSet* rootFreqs = new GCFrequenciesSet(alphabet);
  SubstitutionModel* model = new T92(alphabet, 3.);
  std::vector<std::string> globalParameterNames;
  globalParameterNames.push_back("T92.kappa");
  map<string, string> alias;

  SubstitutionModelSet* modelSet = SubstitutionModelSetTools::createNonHomogeneousModelSet(model, rootFreqs, tree, alias, globalParameterNames);
  //DiscreteDistribution* rdist = new ConstantDistribution(1.0, true);
  //Very difficult to optimize on small datasets:
  DiscreteDistribution* rdist = new GammaDiscreteRateDistribution(4, 1.0);

  size_t nsites = 1000;
  unsigned int nrep = 20;
  size_t nmodels = modelSet->getNumberOfModels();
  vector<double> thetas(nmodels);
  vector<double> thetasEst1(nmodels);
  vector<double> thetasEst2(nmodels);

  for (size_t i = 0; i < nmodels; ++i) {
    double theta = RandomTools::giveRandomNumberBetweenZeroAndEntry(0.99) + 0.005;
    cout << "Theta" << i << " set to " << theta << endl; 
    modelSet->setParameterValue("T92.theta_" + TextTools::toString(i + 1), theta);
    thetas[i] = theta;
  }
  NonHomogeneousSequenceSimulator simulator(modelSet, rdist, tree);
 
  for (unsigned int j = 0; j < nrep; j++) {

    OutputStream* profiler  = new StlOutputStream(new ofstream("profile.txt", ios::out));
    OutputStream* messenger = new StlOutputStream(new ofstream("messages.txt", ios::out));

    //Simulate data:
    auto_ptr<SiteContainer> sites(simulator.simulate(nsites));
    //Now fit model:
    auto_ptr<SubstitutionModelSet> modelSet2(modelSet->clone());
    auto_ptr<SubstitutionModelSet> modelSet3(modelSet->clone());
    RNonHomogeneousTreeLikelihood tl(*tree, *sites.get(), modelSet2.get(), rdist, true, true, false);
    tl.initialize();
    RNonHomogeneousTreeLikelihood tl2(*tree, *sites.get(), modelSet3.get(), rdist, true, true, true);
    tl2.initialize();
   
    unsigned int c1 = OptimizationTools::optimizeNumericalParameters2(
        &tl, tl.getParameters(), 0,
        0.0001, 10000, messenger, profiler, false, false, 1, OptimizationTools::OPTIMIZATION_NEWTON);

    unsigned int c2 = OptimizationTools::optimizeNumericalParameters2(
        &tl2, tl2.getParameters(), 0,
        0.0001, 10000, messenger, profiler, false, false, 1, OptimizationTools::OPTIMIZATION_NEWTON);

    cout << c1 << ": " << tl.getValue() << "\t" << c2 << ": " << tl2.getValue() << endl;
      
    for (size_t i = 0; i < nmodels; ++i) {
      cout << modelSet2->getModel(i)->getParameter("theta").getValue() << "\t" << modelSet3->getModel(i)->getParameter("theta").getValue() << endl;
      //if (abs(modelSet2->getModel(i)->getParameter("theta").getValue() - modelSet3->getModel(i)->getParameter("theta").getValue()) > 0.1)
      //  return 1;
      thetasEst1[i] +=  modelSet2->getModel(i)->getParameter("theta").getValue();
      thetasEst2[i] +=  modelSet3->getModel(i)->getParameter("theta").getValue();
    }
  }
  thetasEst1 /= static_cast<double>(nrep);
  thetasEst2 /= static_cast<double>(nrep);

  //Now compare estimated values to real ones:
  for (size_t i = 0; i < thetas.size(); ++i) {
     cout << thetas[i] << "\t" << thetasEst1[i] << "\t" << thetasEst2[i] << endl;
     double diff1 = abs(thetas[i] - thetasEst1[i]);
     double diff2 = abs(thetas[i] - thetasEst2[i]);
     if (diff1 > 0.2 || diff2 > 0.2)
        return 1;
  }

  //-------------
  delete tree;
  delete modelSet;
  delete rdist;

  return 0;
}
Beispiel #2
0
int main() {
  TreeTemplate<Node>* tree = TreeTemplateTools::parenthesisToTree("((A:0.01, B:0.02):0.03,C:0.01,D:0.1);");
  vector<string> seqNames= tree->getLeavesNames();
  vector<int> ids = tree->getNodesId();
  //-------------

  NucleicAlphabet* alphabet = new DNA();
  SubstitutionModel* model = new T92(alphabet, 3.);
  FrequenciesSet* rootFreqs = new GCFrequenciesSet(alphabet);
  std::vector<std::string> globalParameterNames;
  globalParameterNames.push_back("T92.kappa");
  map<string, string> alias;

  SubstitutionModelSet* modelSet = SubstitutionModelSetTools::createNonHomogeneousModelSet(model, rootFreqs, tree, alias, globalParameterNames);
  DiscreteDistribution* rdist = new ConstantRateDistribution();
  vector<double> thetas;
  for (unsigned int i = 0; i < modelSet->getNumberOfModels(); ++i) {
    double theta = RandomTools::giveRandomNumberBetweenZeroAndEntry(0.99) + 0.005;
    cout << "Theta" << i << " set to " << theta << endl; 
    modelSet->setParameterValue("T92.theta_" + TextTools::toString(i + 1), theta);
    thetas.push_back(theta);
  }
  NonHomogeneousSequenceSimulator simulator(modelSet, rdist, tree);

  unsigned int n = 100000;
  OutputStream* profiler  = new StlOutputStream(new ofstream("profile.txt", ios::out));
  OutputStream* messenger = new StlOutputStream(new ofstream("messages.txt", ios::out));

  //Check fast simulation first:
 
  cout << "Fast check:" << endl;
 
  //Generate data set:
  VectorSiteContainer sites(seqNames, alphabet);
  for (unsigned int i = 0; i < n; ++i) {
    auto_ptr<Site> site(simulator.simulateSite());
    site->setPosition(static_cast<int>(i));
    sites.addSite(*site, false);
  }

  //Now fit model:
  SubstitutionModelSet* modelSet2 = modelSet->clone();
  RNonHomogeneousTreeLikelihood tl(*tree, sites, modelSet2, rdist);
  tl.initialize();

  OptimizationTools::optimizeNumericalParameters2(
      &tl, tl.getParameters(), 0,
      0.0001, 10000, messenger, profiler, false, false, 1, OptimizationTools::OPTIMIZATION_NEWTON);

  //Now compare estimated values to real ones:
  for (size_t i = 0; i < thetas.size(); ++i) {
    cout << thetas[i] << "\t" << modelSet2->getModel(i)->getParameter("theta").getValue() << endl;
    double diff = abs(thetas[i] - modelSet2->getModel(i)->getParameter("theta").getValue());
    if (diff > 0.1)
      return 1;
  }
  delete modelSet2;

  //Now try detailed simulations:

  cout << "Detailed check:" << endl;
  
  //Generate data set:
  VectorSiteContainer sites2(seqNames, alphabet);
  for (unsigned int i = 0; i < n; ++i) {
    RASiteSimulationResult* result = simulator.dSimulateSite();
    auto_ptr<Site> site(result->getSite(*simulator.getSubstitutionModelSet()->getModel(0)));
    site->setPosition(static_cast<int>(i));
    sites2.addSite(*site, false);
    delete result;
  }

  //Now fit model:
  SubstitutionModelSet* modelSet3 = modelSet->clone();
  RNonHomogeneousTreeLikelihood tl2(*tree, sites2, modelSet3, rdist);
  tl2.initialize();

  OptimizationTools::optimizeNumericalParameters2(
      &tl2, tl2.getParameters(), 0,
      0.0001, 10000, messenger, profiler, false, false, 1, OptimizationTools::OPTIMIZATION_NEWTON);

  //Now compare estimated values to real ones:
  for (size_t i = 0; i < thetas.size(); ++i) {
    cout << thetas[i] << "\t" << modelSet3->getModel(i)->getParameter("theta").getValue() << endl;
    double diff = abs(thetas[i] - modelSet3->getModel(i)->getParameter("theta").getValue());
    if (diff > 0.1)
      return 1;
  }
  delete modelSet3;

  //-------------
  delete tree;
  delete alphabet;
  delete modelSet;
  delete rdist;

  return 0;
}
int main() {

  TreeTemplate<Node>* tree = TreeTemplateTools::parenthesisToTree("(((A:0.1, B:0.2):0.3,C:0.1):0.2,(D:0.3,(E:0.2,F:0.05):0.1):0.1);");

  vector<string> seqNames= tree->getLeavesNames();
  vector<int> ids = tree->getNodesId();
  //-------------

  const NucleicAlphabet* alphabet = &AlphabetTools::DNA_ALPHABET;
  FrequenciesSet* rootFreqs = new GCFrequenciesSet(alphabet);
  
  SubstitutionModel* model = new T92(alphabet, 3.);
  std::vector<std::string> globalParameterNames;
  globalParameterNames.push_back("T92.kappa");

  //Very difficult to optimize on small datasets:
  DiscreteDistribution* rdist = new GammaDiscreteRateDistribution(4, 1.0);
  
  ParametrizableTree* parTree = new ParametrizableTree(*tree);
  FrequenciesSet* rootFreqs2 = rootFreqs->clone();
  DiscreteDistribution* rdist2 = rdist->clone();
  SubstitutionModel* model2=model->clone();

  map<string, string> alias;

  SubstitutionModelSet* modelSet = SubstitutionModelSetTools::createNonHomogeneousModelSet(model, rootFreqs, tree, alias, globalParameterNames);
  unique_ptr<SubstitutionModelSet> modelSetSim(modelSet->clone());

  NonHomogeneousSubstitutionProcess* subPro= NonHomogeneousSubstitutionProcess::createNonHomogeneousSubstitutionProcess(model2, rdist2, rootFreqs2, parTree, globalParameterNames);

  // Simulation
    
  size_t nsites = 1000;
  unsigned int nrep = 20;
  size_t nmodels = modelSet->getNumberOfModels();
  vector<double> thetas(nmodels);
  vector<double> thetasEst1(nmodels);
  vector<double> thetasEst2(nmodels);
  vector<double> thetasEst1n(nmodels);
  vector<double> thetasEst2n(nmodels);

  for (size_t i = 0; i < nmodels; ++i) {
    double theta = RandomTools::giveRandomNumberBetweenZeroAndEntry(0.99) + 0.005;
    cout << "Theta" << i << " set to " << theta << endl; 
    modelSetSim->setParameterValue("T92.theta_" + TextTools::toString(i + 1), theta);
    //subPro->setParameterValue("T92.theta_" + TextTools::toString(i + 1), theta);
    thetas[i] = theta;
  }

  NonHomogeneousSequenceSimulator simulator(modelSetSim.get(), rdist, tree);

  NonHomogeneousSubstitutionProcess* subPro2 = subPro->clone();

  for (unsigned int j = 0; j < nrep; j++) {

    OutputStream* profiler  = new StlOutputStream(new ofstream("profile.txt", ios::out));
    OutputStream* messenger = new StlOutputStream(new ofstream("messages.txt", ios::out));

    //Simulate data:
    unique_ptr<SiteContainer> sites(simulator.simulate(nsites));

    //Now fit model:
    unique_ptr<SubstitutionModelSet> modelSet2(modelSet->clone());

    RNonHomogeneousTreeLikelihood tl(*tree, *sites.get(), modelSet, rdist, true, true, false);
    tl.initialize();

    RNonHomogeneousTreeLikelihood tl2(*tree, *sites.get(), modelSet2.get(), rdist, true, true, true);
    tl2.initialize();

    SubstitutionProcess* nsubPro=subPro->clone();
    SubstitutionProcess* nsubPro2=subPro2->clone();
    
    RecursiveLikelihoodTreeCalculation* tlComp = new RecursiveLikelihoodTreeCalculation(*sites->clone(), nsubPro, true, false);
    SingleProcessPhyloLikelihood ntl(nsubPro, tlComp, true);

    RecursiveLikelihoodTreeCalculation* tlComp2 = new RecursiveLikelihoodTreeCalculation(*sites->clone(), nsubPro2, true);
    SingleProcessPhyloLikelihood ntl2(nsubPro2, tlComp2, true);

    for (size_t i = 0; i < nmodels; ++i) {
      ntl.setParameterValue("T92.theta_" + TextTools::toString(i + 1), thetas[i]);
      ntl2.setParameterValue("T92.theta_" + TextTools::toString(i + 1), thetas[i]);
    }

    cout << setprecision(10) << "OldTL init: "  << tl.getValue() << "\t" << tl2.getValue() << endl;
    cout << setprecision(10) << "NewTL init: "  << ntl.getValue() << "\t" << ntl2.getValue() << endl;

    unsigned int c1 = OptimizationTools::optimizeNumericalParameters2(
      &tl, tl.getParameters(), 0,
      0.0001, 10000, messenger, profiler, false, false, 1, OptimizationTools::OPTIMIZATION_NEWTON);
    
    
    unsigned int c2 = OptimizationTools::optimizeNumericalParameters2(
      &tl2, tl2.getParameters(), 0,
      0.0001, 10000, messenger, profiler, false, false, 1, OptimizationTools::OPTIMIZATION_NEWTON);

    unsigned int nc1 = OptimizationTools::optimizeNumericalParameters2(
      &ntl, ntl.getParameters(), 0,
      0.0001, 10000, messenger, profiler, false, false, 1, OptimizationTools::OPTIMIZATION_NEWTON);

    unsigned int nc2 = OptimizationTools::optimizeNumericalParameters2(
      &ntl2, ntl2.getParameters(), 0,
      0.0001, 10000, messenger, profiler, false, false, 1, OptimizationTools::OPTIMIZATION_NEWTON);

    cout << "OldTL: " << c1 << ": " << tl.getValue() << "\t" << c2 << ": " << tl2.getValue() << endl;
    cout << "NewTL: " << nc1 << ": " << ntl.getValue() << "\t" << nc2 << ": " << ntl2.getValue() << endl;

    cout << "Thetas : " << endl;

    for (size_t i = 0; i < nmodels; ++i) {
      //    cerr << modelSet->getModel(i)->getParameter("theta").getValue() << "\t" << modelSet2->getModel(i)->getParameter("theta").getValue();
      
      //      cerr << "\t"  << subPro->getModel(i)->getParameter("theta").getValue() << "\t" << subPro2->getModel(i)->getParameter("theta").getValue() << endl;
      // if (abs(modelSet2->getModel(i)->getParameter("theta").getValue() - modelSet3->getModel(i)->getParameter("theta").getValue()) > 0.1)
      //   return 1;
      thetasEst1[i] +=  modelSet->getModel(i)->getParameter("theta").getValue();
      thetasEst2[i] +=  modelSet2->getModel(i)->getParameter("theta").getValue();
      thetasEst1n[i] +=  dynamic_cast< NonHomogeneousSubstitutionProcess*>(nsubPro)->getModel(i)->getParameter("theta").getValue();
      thetasEst2n[i] +=  dynamic_cast< NonHomogeneousSubstitutionProcess*>(nsubPro2)->getModel(i)->getParameter("theta").getValue();
    }
  }
  thetasEst1 /= static_cast<double>(nrep);
  thetasEst2 /= static_cast<double>(nrep);
  thetasEst1n /= static_cast<double>(nrep);
  thetasEst2n /= static_cast<double>(nrep);

  //Now compare estimated values to real ones:
  cout << "Real" << "\t" << "Est_Old1" << "\t" << "Est_Old2" << "\t";
  cout << "Est_New1" << "\t" << "Est_New2" << endl;
  for (size_t i = 0; i < thetas.size(); ++i) {
    cout << thetas[i] << "\t" << thetasEst1[i] << "\t" << thetasEst2[i] << "\t";
    cout << thetasEst1n[i] << "\t" << thetasEst2n[i] << endl;
     double diff1 = abs(thetas[i] - thetasEst1[i]);
     double diff2 = abs(thetas[i] - thetasEst2[i]);
     double diffn1 = abs(thetas[i] - thetasEst1n[i]);
     double diffn2 = abs(thetas[i] - thetasEst2n[i]);
     if (diff1 > 0.2 || diff2 > 0.2 || diffn1 > 0.2 || diffn2 > 0.2)
       return 1;
  }

  return 0;
}