Пример #1
0
static void printSubsRates(tree *tr,int model, int numSubsRates)
{
  assert(tr->partitionData[model].dataType = DNA_DATA);
  int i;
  printBothOpen("Subs rates: ");
  for(i=0; i<numSubsRates; i++)
    printBothOpen("%d => %.3f, ", i, tr->partitionData[model].substRates[i]);
  printBothOpen("\n\n");
}
Пример #2
0
static void print_state(state *s, double startLH)
{
   assert(startLH == s->tr->startLH);
   printBothOpen("tr LH %f, startLH %f, incr %f\n", s->tr->likelihood, startLH, s->tr->likelihood - startLH);
   printBothOpen("pruned %db%d nb %d, nnb %d, reinsert %db%d \n", s->p->number, s->p->back->number,
                                                    s->nb->number, s->nnb->number,
                                                    s->q->number, s->r->number);
      
}
Пример #3
0
static showNNI_move(nodeptr p)
{
  printBothOpen("NNI from p %d %.6f, pnb %d %.6f, pnnb %d %.6f\n", 
      p->number, p->z[0],
      p->next->back->number, p->next->back->z[0], 
      p->next->next->back->number,  p->next->next->back->z[0]);
  nodeptr q = p->back;
  printBothOpen("NNI from q %d %.6f, qnb %d %.6f, qnnb %d %.6f\n", 
      q->number, q->z[0],
      q->next->back->number, q->next->back->z[0], 
      q->next->next->back->number,  q->next->next->back->z[0]);
}
Пример #4
0
static void printRecomTree(tree *tr, boolean printBranchLengths, char *title)
{
  FILE *nwfile;
  nwfile = myfopen("tmp.nw", "w+");
  pllTreeToNewickRecomREC(tr->tree_string, tr, tr->start->back, printBranchLengths);
  fprintf(nwfile,"%s\n", tr->tree_string);
  fclose(nwfile);
  if(title)
    printBothOpen("%s\n", title);
  if (printBranchLengths)
    printBothOpen("%s\n", tr->tree_string);
  printBothOpen("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n");
  system("bin/nw_display tmp.nw");
}  
Пример #5
0
void pinToCore(int tid)
{
  cpu_set_t cpuset;
         
  CPU_ZERO(&cpuset);    
  CPU_SET( tid, &cpuset);

  if(pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset) != 0)
    {
      printBothOpen("\n\nThere was a problem finding a physical core for thread number %d to run on.\n", tid);
      printBothOpen("Probably this happend because you are trying to run more threads than you have cores available,\n");
      printBothOpen("which is a thing you should never ever do again, good bye .... \n\n");
      assert(0);
    }
}
void printTraversalInfo(tree *tr)
{
  int 
    k, 
    total_steps = 0;

  printBothOpen("Traversals : %d \n", tr->travCounter->numTraversals);
  printBothOpen("Traversals tt: %d \n", tr->travCounter->tt);
  printBothOpen("Traversals ti: %d \n", tr->travCounter->ti);
  printBothOpen("Traversals ii: %d \n", tr->travCounter->ii);
  printBothOpen("all: %d \n", tr->travCounter->tt + tr->travCounter->ii + tr->travCounter->ti);
  printBothOpen("Traversals len freq  : \n");

  for(k = 0; k < tr->mxtips; k++)
  {
    total_steps += tr->travCounter->travlenFreq[k] * (k - 1);
    if(tr->travCounter->travlenFreq[k] > 0)
      printBothOpen("len %d : %d\n", k, tr->travCounter->travlenFreq[k]);
  }
  printBothOpen("all steps: %d \n", total_steps);
}
Пример #7
0
void shSupports(tree *tr, analdef *adef, rawdata *rdta, cruncheddata *cdta)
{
  double 
    diff,
    *lhVectors[3];

  char 
    bestTreeFileName[1024],
    shSupportFileName[1024];
  
  FILE 
    *f;

  int
    interchanges = 0,
    counter = 0;

  assert(adef->restart);
    
  tr->resample = permutationSH(tr, 1000, 12345);
    
  lhVectors[0] = (double *)rax_malloc(sizeof(double) * tr->cdta->endsite);
  lhVectors[1] = (double *)rax_malloc(sizeof(double) * tr->cdta->endsite);
  lhVectors[2] = (double *)rax_malloc(sizeof(double) * tr->cdta->endsite);
  tr->bInf = (branchInfo*)rax_malloc(sizeof(branchInfo) * (tr->mxtips - 3));       

  initModel(tr, rdta, cdta, adef);        
 
 
  getStartingTree(tr, adef);
  
 
  
  if(adef->useBinaryModelFile)
    {
      readBinaryModel(tr);
      evaluateGenericInitrav(tr, tr->start);
      treeEvaluate(tr, 2);
    }
  else
    modOpt(tr, adef, FALSE, 10.0);
  
  printBothOpen("Time after model optimization: %f\n", gettime() - masterTime);
  
  printBothOpen("Initial Likelihood %f\n\n", tr->likelihood);   
    
  do
    {   
      double 
	lh1,
	lh2;
   
      lh1 = tr->likelihood;

      interchanges = encapsulateNNIs(tr, lhVectors, FALSE);       

      evaluateGeneric(tr, tr->start); 		

      lh2 = tr->likelihood;

      diff = ABS(lh1 - lh2);

      printBothOpen("NNI interchanges %d Likelihood %f\n", interchanges, tr->likelihood);
    }
  while(diff > 0.01);

  printBothOpen("\nFinal Likelihood of NNI-optimized tree: %f\n\n", tr->likelihood);

  setupBranchInfo(tr->start->back, tr, &counter);
  assert(counter == tr->mxtips - 3);
 
  interchanges = encapsulateNNIs(tr, lhVectors, TRUE);
              
  strcpy(bestTreeFileName, workdir); 
  strcat(bestTreeFileName, "RAxML_fastTree.");
  strcat(bestTreeFileName,         run_id); 

  Tree2String(tr->tree_string, tr, tr->start->back, FALSE, TRUE, FALSE, FALSE, FALSE, adef, SUMMARIZE_LH, FALSE, FALSE);
    
  f = myfopen(bestTreeFileName, "wb");
  fprintf(f, "%s", tr->tree_string);
  fclose(f);  

  
  strcpy(shSupportFileName, workdir); 
  strcat(shSupportFileName, "RAxML_fastTreeSH_Support.");
  strcat(shSupportFileName,         run_id);
  
  Tree2String(tr->tree_string, tr, tr->start->back, TRUE, TRUE, FALSE, FALSE, FALSE, adef, SUMMARIZE_LH, FALSE, TRUE);
  
  f = myfopen(shSupportFileName, "wb");
  fprintf(f, "%s", tr->tree_string);
  fclose(f);  
      
  printBothOpen("RAxML NNI-optimized tree written to file: %s\n", bestTreeFileName);
  
  printBothOpen("Same tree with SH-like supports written to file: %s\n", shSupportFileName);
  
  printBothOpen("Total execution time: %f\n", gettime() - masterTime);

  exit(0);
}
Пример #8
0
void fastSearch(tree *tr, analdef *adef, rawdata *rdta, cruncheddata *cdta)
{
  double    
    likelihood, 
    startLikelihood,
    *lhVectors[3];

  char 
    bestTreeFileName[1024];
  
  FILE 
    *f;

  int
    model;

 
  
  lhVectors[0] = (double *)NULL;
  lhVectors[1] = (double *)NULL;
  lhVectors[2] = (double *)NULL;  
      
  /* initialize model parameters with standard starting values */

  initModel(tr, rdta, cdta, adef);      

  printBothOpen("Time after init : %f\n", gettime() - masterTime);

  /* 
     compute starting tree, either by reading in a tree specified via -t 
     or by building one 
  */

  getStartingTree(tr, adef);
 
  printBothOpen("Time after init and starting tree: %f\n", gettime() - masterTime);
  
  /* 
     rough model parameter optimization, the log likelihood epsilon should 
     actually be determined based on the initial tree score and not be hard-coded 
  */
  
 if(adef->useBinaryModelFile)
    {
      readBinaryModel(tr);
      evaluateGenericInitrav(tr, tr->start);
      treeEvaluate(tr, 2);
    }
 else
   modOpt(tr, adef, FALSE, 10.0);
  
  printBothOpen("Time after init, starting tree, mod opt: %f\n", gettime() - masterTime);

  /* print out the number of rate categories used for the CAT model, one should 
     use less then the default, e.g., -c 16 works quite well */

  for(model = 0; model < tr->NumberOfModels; model++)
    printBothOpen("Partion %d number of Cats: %d\n", model, tr->partitionData[model].numberOfCategories);

  /* 
     means that we are going to do thorough insertions 
     with real newton-raphson based br-len opt at the three branches 
     adjactent to every insertion point 
  */

  Thorough = 1;

  
  /*
    loop over SPR cycles until the likelihood difference 
     before and after the SPR cycle is <= 0.5 log likelihood units.
     Rather than being hard-coded this should also be determined based on the 
     actual likelihood of the tree 
  */
 
  do
    {      
      startLikelihood = tr->likelihood;
   
      /* conduct a cycle of linear SPRs */

    

      likelihood = linearSPRs(tr, 20, adef->veryFast);          
           
      evaluateGeneric(tr, tr->start); 
      
      /* the NNIs also optimize br-lens of resulting topology a bit */
      encapsulateNNIs(tr, lhVectors, FALSE);                    
 
      printBothOpen("LH after SPRs %f, after NNI %f\n", likelihood, tr->likelihood);
    }
  while(ABS(tr->likelihood - startLikelihood) > 0.5);

 
  
 

  
  /* print out the resulting tree to the RAxML_bestTree. file. 
     note that boosttrapping or doing multiple inferences won't work.
     This thing computes a single tree and that's it */
      
  strcpy(bestTreeFileName, workdir); 
  strcat(bestTreeFileName, "RAxML_fastTree.");
  strcat(bestTreeFileName,         run_id);

 

  Tree2String(tr->tree_string, tr, tr->start->back, FALSE, TRUE, FALSE, FALSE, FALSE, adef, SUMMARIZE_LH, FALSE, FALSE);
    
  f = myfopen(bestTreeFileName, "wb");
  fprintf(f, "%s", tr->tree_string);
  fclose(f);  

 
    
  printBothOpen("RAxML fast tree written to file: %s\n", bestTreeFileName);
  
  writeBinaryModel(tr);
  
  printBothOpen("Total execution time: %f\n", gettime() - masterTime);

  printBothOpen("Good bye ... \n");
}
Пример #9
0
void computePlacementBias(tree *tr, analdef *adef)
{
  int 
    windowSize = adef->slidingWindowSize,
    k,   
    i, 
    tips,
    numTraversalBranches = (2 * (tr->mxtips - 1)) - 3; /* compute number of branches into which we need to insert once we have removed a taxon */    
    
  char 
    fileName[1024];

  FILE 
    *outFile;

  /* data for each sliding window starting position */

  positionData
    *pd = (positionData *)malloc(sizeof(positionData) * (tr->cdta->endsite - windowSize));

  double 
    *nodeDistances = (double*)calloc(tr->cdta->endsite, sizeof(double)), /* array to store node distnces ND for every sliding window position */
    *distances = (double*)calloc(tr->cdta->endsite, sizeof(double)); /* array to store avg distances for every site */

  strcpy(fileName,         workdir);
  strcat(fileName,         "RAxML_SiteSpecificPlacementBias.");
  strcat(fileName,         run_id);

  outFile = myfopen(fileName, "w");

  printBothOpen("Likelihood of comprehensive tree %f\n\n", tr->likelihood);

  if(windowSize > tr->cdta->endsite)
    {
      printBothOpen("The size of your sliding window is %d while the number of sites in the alignment is %d\n\n", windowSize, tr->cdta->endsite);
      exit(-1);
    }

  if(windowSize >=  (int)(0.9 * tr->cdta->endsite))    
    printBothOpen("WARNING: your sliding window of size %d is only slightly smaller than you alignment that has %d sites\n\n",  windowSize, tr->cdta->endsite);

  printBothOpen("Sliding window size: %d\n\n", windowSize);

  /* prune and re-insert on tip at a time into all branches of the remaining tree */

  for(tips = 1; tips <= tr->mxtips; tips++)    
    {
      nodeptr 
	myStart,
	p = tr->nodep[tips]->back, /* this is the node at which we are prunung */
	p1 =  p->next->back,
	p2 =  p->next->next->back;
      
      double
	pz[NUM_BRANCHES],
	p1z[NUM_BRANCHES], 
	p2z[NUM_BRANCHES];

      int 
	branchCounter = 0;
      
      /* reset array values for this tip */

      for(i = 0; i < tr->cdta->endsite; i++)
	{
	  pd[i].lh = unlikely;
	  pd[i].p = (nodeptr)NULL;
	}

      /* store the three branch lengths adjacent to the position at which we prune */

      for(i = 0; i < tr->numBranches; i++)
	{
	  p1z[i] = p1->z[i];
	  p2z[i] = p2->z[i];	   	   
	  pz[i] = p->z[i];
	}           

      /* prune the taxon, optimizing the branch between p1 and p2 */

      removeNodeBIG(tr, p,  tr->numBranches);  

      printBothOpen("Pruning taxon Number %d [%s]\n", tips, tr->nameList[tips]);
      
      /* find any tip to start traversing the tree */

      myStart = findAnyTip(p1, tr->mxtips);      

      /* insert taxon, compute likelihood and remove taxon again from all branches */

      traverseBias(p, myStart->back, tr, &branchCounter, pd, windowSize);     

      assert(branchCounter == numTraversalBranches);                    

      /* for every sliding window position calc ND to the true/correct position at p */

      for(i = 0; i < tr->cdta->endsite - windowSize; i++)		
	nodeDistances[i] = getNodeDistance(p1, pd[i].p, tr->mxtips); 

      /* now analyze */

      for(i = 0; i < tr->cdta->endsite; i++)
	{
	  double 
	    d = 0.0;

	  int 
	    s = 0;	  	  
	  
	  /* 
	     check site position, i.e., doe we have windowSize data points available 
	     or fewer because we are at the start or the end of the alignment 
	  */

	  /*
	    for each site just accumulate the node distances we have for all 
	    sliding windows that passed over this site 
	  */

	  if(i < windowSize)
	    {
	      for(k = 0; k < i + 1; k++, s++)	    		
		d += nodeDistances[k];		 		
	    }
	  else
	    {
	      if(i < tr->cdta->endsite - windowSize)
		{
		  for(k = i - windowSize + 1; k <= i; k++, s++)	    		    
		    d += nodeDistances[k];		      		   
		}	    
	      else
		{				  
		  for(k = i - windowSize; k < (tr->cdta->endsite - windowSize); k++, s++)	    		    
		    d += nodeDistances[k + 1];		     		 
		}
	    }
	   	  
	  /* 
	     now just divide the accumultaed ND distance by 
	     the number of distances we have for this position and then add it to the acc 
	     distances over all taxa.
	     I just realized that the version on which I did the tests 
	     I sent to Simon I used 
	     
	     distances[i] = d / ((double)s);

	     instead of 
	     
	     distances[i] += d / ((double)s);

	     gamo tin poutana mou
	  */
	     
	  distances[i] += (d / ((double)s));	  
	}
      

      /* 
	 re-connect taxon to its original position 
      */

      hookup(p->next,       p1,      p1z, tr->numBranches); 
      hookup(p->next->next, p2,      p2z, tr->numBranches);
      hookup(p,             p->back, pz, tr->numBranches);      

      /*
	fix likelihood vectors 
      */

      newviewGeneric(tr, p);          
    }

  /* 
     now just compute the average ND over all taxa 
  */
    

  for(i = 0; i < tr->cdta->endsite; i++)
    {
      double 
	avg = distances[i] / ((double)tr->mxtips);

      fprintf(outFile, "%d %f\n", i, avg);
    }

  printBothOpen("\nTime for EPA-based site-specific placement bias calculation: %f\n", gettime() - masterTime); 
  printBothOpen("Site-specific placement bias statistics written to file %s\n", fileName);

  fclose(outFile);

  exit(0);
}
Пример #10
0
void computeBIGRAPID (tree *tr, analdef *adef, boolean estimateModel) 
{ 
  unsigned int
    vLength = 0;
  int
    i,
    impr, 
    bestTrav,
    rearrangementsMax = 0, 
    rearrangementsMin = 0,    
    thoroughIterations = 0,
    fastIterations = 0;
   
  double lh, previousLh, difference, epsilon;              
  bestlist *bestT, *bt;  
    
#ifdef _TERRACES
  /* store the 20 best trees found in a dedicated list */

  bestlist
    *terrace;
  
  /* output file names */

  char 
    terraceFileName[1024],
    buf[64];
#endif

  hashtable *h = (hashtable*)NULL;
  unsigned int **bitVectors = (unsigned int**)NULL;
  
 
  if(tr->searchConvergenceCriterion)
    {          
      bitVectors = initBitVector(tr, &vLength);
      h = initHashTable(tr->mxtips * 4);   
    }

  bestT = (bestlist *) rax_malloc(sizeof(bestlist));
  bestT->ninit = 0;
  initBestTree(bestT, 1, tr->mxtips);
      
  bt = (bestlist *) rax_malloc(sizeof(bestlist));      
  bt->ninit = 0;
  initBestTree(bt, 20, tr->mxtips); 

#ifdef _TERRACES 
  /* initialize the tree list and the output file name for the current tree search/replicate */


  terrace = (bestlist *) rax_malloc(sizeof(bestlist));      
  terrace->ninit = 0;
  initBestTree(terrace, 20, tr->mxtips); 
  
  sprintf(buf, "%d", bCount);
  
  strcpy(terraceFileName,         workdir);
  strcat(terraceFileName,         "RAxML_terrace.");
  strcat(terraceFileName,         run_id);
  strcat(terraceFileName,         ".BS.");
  strcat(terraceFileName,         buf);
  
  printf("%s\n", terraceFileName);
#endif

  initInfoList(50);
 
  difference = 10.0;
  epsilon = 0.01;    
    
  Thorough = 0;     
  
  if(estimateModel)
    {
      if(adef->useBinaryModelFile)
	{
	  readBinaryModel(tr);
	  evaluateGenericInitrav(tr, tr->start);
	  treeEvaluate(tr, 2);
	}
      else
	{
	  evaluateGenericInitrav(tr, tr->start);
	  modOpt(tr, adef, FALSE, 10.0);
	}
    }
  else
    treeEvaluate(tr, 2);  


  printLog(tr, adef, FALSE); 

  saveBestTree(bestT, tr);
  
  if(!adef->initialSet)   
    bestTrav = adef->bestTrav = determineRearrangementSetting(tr, adef, bestT, bt);                   
  else
    bestTrav = adef->bestTrav = adef->initial;

  if(estimateModel)
    {
      if(adef->useBinaryModelFile)	
	treeEvaluate(tr, 2);
      else
	{
	  evaluateGenericInitrav(tr, tr->start);
	  modOpt(tr, adef, FALSE, 5.0);
	}
    }
  else
    treeEvaluate(tr, 1);
  
  saveBestTree(bestT, tr); 
  impr = 1;
  if(tr->doCutoff)
    tr->itCount = 0;

 

  while(impr)
    {              
      recallBestTree(bestT, 1, tr); 

      if(tr->searchConvergenceCriterion)
	{
	  int bCounter = 0;	      
	  
	  if(fastIterations > 1)
	    cleanupHashTable(h, (fastIterations % 2));		
	  
	  bitVectorInitravSpecial(bitVectors, tr->nodep[1]->back, tr->mxtips, vLength, h, fastIterations % 2, BIPARTITIONS_RF, (branchInfo *)NULL,
				  &bCounter, 1, FALSE, FALSE);	    
	  
	  assert(bCounter == tr->mxtips - 3);	    	   
	  
	  if(fastIterations > 0)
	    {
	      double rrf = convergenceCriterion(h, tr->mxtips);
	      
	      if(rrf <= 0.01) /* 1% cutoff */
		{
		  printBothOpen("ML fast search converged at fast SPR cycle %d with stopping criterion\n", fastIterations);
		  printBothOpen("Relative Robinson-Foulds (RF) distance between respective best trees after one succseful SPR cycle: %f%s\n", rrf, "%");
		  cleanupHashTable(h, 0);
		  cleanupHashTable(h, 1);
		  goto cleanup_fast;
		}
	      else		    
		printBothOpen("ML search convergence criterion fast cycle %d->%d Relative Robinson-Foulds %f\n", fastIterations - 1, fastIterations, rrf);
	    }
	}

	 
      fastIterations++;	


      treeEvaluate(tr, 1.0);
      
      
      saveBestTree(bestT, tr);           
      printLog(tr, adef, FALSE);         
      printResult(tr, adef, FALSE);    
      lh = previousLh = tr->likelihood;
   
     
      treeOptimizeRapid(tr, 1, bestTrav, adef, bt);   
      
      impr = 0;
	  
      for(i = 1; i <= bt->nvalid; i++)
	{	    		  	   
	  recallBestTree(bt, i, tr);
	  
	  treeEvaluate(tr, 0.25);	    	 		      	 

	  difference = ((tr->likelihood > previousLh)? 
			tr->likelihood - previousLh: 
			previousLh - tr->likelihood); 	    
	  if(tr->likelihood > lh && difference > epsilon)
	    {
	      impr = 1;	       
	      lh = tr->likelihood;	       	     
	      saveBestTree(bestT, tr);
	    }	   	   
	}	
    }

 

  if(tr->searchConvergenceCriterion)
    {
      cleanupHashTable(h, 0);
      cleanupHashTable(h, 1);
    }

 cleanup_fast:

  Thorough = 1;
  impr = 1;
  
  recallBestTree(bestT, 1, tr); 
  if(estimateModel)
    {
      if(adef->useBinaryModelFile)	
	treeEvaluate(tr, 2);
      else
	{
	  evaluateGenericInitrav(tr, tr->start);
	  modOpt(tr, adef, FALSE, 1.0);
	}
    }
  else
    treeEvaluate(tr, 1.0);

  while(1)
    {	
      recallBestTree(bestT, 1, tr);    
      if(impr)
	{	    
	  printResult(tr, adef, FALSE);
	  rearrangementsMin = 1;
	  rearrangementsMax = adef->stepwidth;	

	 

	  if(tr->searchConvergenceCriterion)
	    {
	      int bCounter = 0;	      

	      if(thoroughIterations > 1)
		cleanupHashTable(h, (thoroughIterations % 2));		
		
	      bitVectorInitravSpecial(bitVectors, tr->nodep[1]->back, tr->mxtips, vLength, h, thoroughIterations % 2, BIPARTITIONS_RF, (branchInfo *)NULL,
				      &bCounter, 1, FALSE, FALSE);	    
	      
	      assert(bCounter == tr->mxtips - 3);	    	   
	      
	      if(thoroughIterations > 0)
		{
		  double rrf = convergenceCriterion(h, tr->mxtips);
		  
		  if(rrf <= 0.01) /* 1% cutoff */
		    {
		      printBothOpen("ML search converged at thorough SPR cycle %d with stopping criterion\n", thoroughIterations);
		      printBothOpen("Relative Robinson-Foulds (RF) distance between respective best trees after one succseful SPR cycle: %f%s\n", rrf, "%");
		      goto cleanup;
		    }
		  else		    
		    printBothOpen("ML search convergence criterion thorough cycle %d->%d Relative Robinson-Foulds %f\n", thoroughIterations - 1, thoroughIterations, rrf);
		}
	    }

	 
	   	  
	  thoroughIterations++;	  
	}			  			
      else
	{		       	   
	  rearrangementsMax += adef->stepwidth;
	  rearrangementsMin += adef->stepwidth; 	        	      
	  if(rearrangementsMax > adef->max_rearrange)	     	     	 
	    goto cleanup; 	   
	}
      treeEvaluate(tr, 1.0);
     
      previousLh = lh = tr->likelihood;	      
      saveBestTree(bestT, tr); 
      
      printLog(tr, adef, FALSE);
      treeOptimizeRapid(tr, rearrangementsMin, rearrangementsMax, adef, bt);
	
      impr = 0;			      		            

      for(i = 1; i <= bt->nvalid; i++)
	{		 
	  recallBestTree(bt, i, tr);	 	    	    	
	  
	  treeEvaluate(tr, 0.25);	    	 
	
#ifdef _TERRACES
	  /* save all 20 best trees in the terrace tree list */
	  saveBestTree(terrace, tr);
#endif

	  difference = ((tr->likelihood > previousLh)? 
			tr->likelihood - previousLh: 
			previousLh - tr->likelihood); 	    
	  if(tr->likelihood > lh && difference > epsilon)
	    {
	      impr = 1;	       
	      lh = tr->likelihood;	  	     
	      saveBestTree(bestT, tr);
	    }	   	   
	}  

                      
    }

 cleanup: 

#ifdef _TERRACES
  {
    double
      bestLH = tr->likelihood;
    FILE 
      *f = myfopen(terraceFileName, "w");
    
    /* print out likelihood of best tree found */

    printf("best tree: %f\n", tr->likelihood);

    /* print out likelihoods of 20 best trees found during the tree search */

    for(i = 1; i <= terrace->nvalid; i++)
      {
	recallBestTree(terrace, i, tr);
	
	/* if the likelihood scores are smaller than some epsilon 0.000001
	   print the tree to file */
	   
	if(ABS(bestLH - tr->likelihood) < 0.000001)
	  {
	    printf("%d %f\n", i, tr->likelihood);
	    Tree2String(tr->tree_string, tr, tr->start->back, FALSE, TRUE, FALSE, FALSE, FALSE, adef, NO_BRANCHES, FALSE, FALSE, FALSE, FALSE);
	    
	    fprintf(f, "%s\n", tr->tree_string); 
	  }
      }

    fclose(f);
    /* increment tree search counter */
    bCount++;
  }
#endif
 
  
  if(tr->searchConvergenceCriterion)
    {
      freeBitVectors(bitVectors, 2 * tr->mxtips);
      rax_free(bitVectors);
      freeHashTable(h);
      rax_free(h);
    }
  
  freeBestTree(bestT);
  rax_free(bestT);
  freeBestTree(bt);
  rax_free(bt);

#ifdef _TERRACES
  /* free terrace tree list */

  freeBestTree(terrace);
  rax_free(terrace);
#endif

  freeInfoList();  
  printLog(tr, adef, FALSE);
  printResult(tr, adef, FALSE);
}
Пример #11
0
void plausibilityChecker(tree *tr, analdef *adef)
{
  FILE 
    *treeFile,
    *rfFile;
  
  tree 
    *smallTree = (tree *)rax_malloc(sizeof(tree));

  char 
    rfFileName[1024];
 
  /* init hash table for big reference tree */
  
  hashtable
    *h      = initHashTable(tr->mxtips * 2 * 2);
  
  /* init the bit vectors we need for computing and storing bipartitions during 
     the tree traversal */
  unsigned int 
    vLength, 
    **bitVectors = initBitVector(tr, &vLength);
   
  int
    numberOfTreesAnalyzed = 0,
    branchCounter = 0,
    i;

  double 
    avgRF = 0.0;

  /* set up an output file name */

  strcpy(rfFileName,         workdir);  
  strcat(rfFileName,         "RAxML_RF-Distances.");
  strcat(rfFileName,         run_id);

  rfFile = myfopen(rfFileName, "wb");  

  assert(adef->mode ==  PLAUSIBILITY_CHECKER);

  /* open the big reference tree file and parse it */

  treeFile = myfopen(tree_file, "r");

  printBothOpen("Parsing reference tree %s\n", tree_file);

  treeReadLen(treeFile, tr, FALSE, TRUE, TRUE, adef, TRUE, FALSE);

  assert(tr->mxtips == tr->ntips);

  printBothOpen("The reference tree has %d tips\n", tr->ntips);

  fclose(treeFile);
  
  /* extract all induced bipartitions from the big tree and store them in the hastable */
  
  bitVectorInitravSpecial(bitVectors, tr->nodep[1]->back, tr->mxtips, vLength, h, 0, BIPARTITIONS_RF, (branchInfo *)NULL,
			  &branchCounter, 1, FALSE, FALSE);
     
  assert(branchCounter == tr->mxtips - 3);   
  
  /* now see how many small trees we have */

  treeFile = getNumberOfTrees(tr, bootStrapFile, adef);

  checkTreeNumber(tr->numberOfTrees, bootStrapFile);

  /* allocate a data structure for parsing the potentially mult-furcating tree */

  allocateMultifurcations(tr, smallTree);

  /* loop over all small trees */

  for(i = 0; i < tr->numberOfTrees;  i++)
    {          
      int           
	numberOfSplits = readMultifurcatingTree(treeFile, smallTree, adef, TRUE);

      if(numberOfSplits > 0)
	{
	  unsigned int
	    entryCount = 0,
	    k,
	    j,	
	    *masked    = (unsigned int *)rax_calloc(vLength, sizeof(unsigned int)),
	    *smallTreeMask = (unsigned int *)rax_calloc(vLength, sizeof(unsigned int));

	  hashtable
	    *rehash = initHashTable(tr->mxtips * 2 * 2);

	  double
	    rf,
	    maxRF;

	  int 
	    bCounter = 0,  
	    bips,
	    firstTaxon,
	    taxa = 0;

	  if(numberOfTreesAnalyzed % 100 == 0)
	    printBothOpen("Small tree %d has %d tips and %d bipartitions\n", i, smallTree->ntips, numberOfSplits);    

	  /* compute the maximum RF distance for computing the relative RF distance later-on */
	  
	  /* note that here we need to pay attention, since the RF distance is not normalized 
	     by 2 * (n-3) but we need to account for the fact that the multifurcating small tree 
	     will potentially contain less bipartitions. 
	     Hence the normalization factor is obtained as 2 * numberOfSplits, where numberOfSplits is the number of bipartitions
	     in the small tree.
	  */
	  
	  maxRF = (double)(2 * numberOfSplits);
	  
	  /* now set up a bit mask where only the bits are set to one for those 
	     taxa that are actually present in the small tree we just read */
	  
	  /* note that I had to apply some small changes to this function to make it work for 
	     multi-furcating trees ! */

	  setupMask(smallTreeMask, smallTree->start,       smallTree->mxtips);
	  setupMask(smallTreeMask, smallTree->start->back, smallTree->mxtips);

	  /* now get the index of the first taxon of the small tree.
	     we will use this to unambiguously store the bipartitions 
	  */
	  
	  firstTaxon = smallTree->start->number;
	  
	  /* make sure that this bit vector is set up correctly, i.e., that 
	     it contains as many non-zero bits as there are taxa in this small tree 
	  */
	  
	  for(j = 0; j < vLength; j++)
	    taxa += BIT_COUNT(smallTreeMask[j]);
	  assert(taxa == smallTree->ntips);
	  
	  /* now re-hash the big tree by applying the above bit mask */
	  
	  
	  /* loop over hash table */
	  
	  for(k = 0, entryCount = 0; k < h->tableSize; k++)	     
	    {    
	      if(h->table[k] != NULL)
		{
		  entry *e = h->table[k];
		  
		  /* we resolve collisions by chaining, hence the loop here */
		  
		  do
		    {
		      unsigned int 
			*bitVector = e->bitVector; 
		      
		      hashNumberType 
			position;
		      
		      int 
			count = 0;
		      
		      /* double check that our tree mask contains the first taxon of the small tree */
		      
		      assert(smallTreeMask[(firstTaxon - 1) / MASK_LENGTH] & mask32[(firstTaxon - 1) % MASK_LENGTH]);
		      
		      /* if the first taxon is set then we will re-hash the bit-wise complement of the 
			 bit vector.
			 The count variable is used for a small optimization */
		      
		      if(bitVector[(firstTaxon - 1) / MASK_LENGTH] & mask32[(firstTaxon - 1) % MASK_LENGTH])		    
			{
			  //hash complement
			  
			  for(j = 0; j < vLength; j++)
			    {
			      masked[j] = (~bitVector[j]) & smallTreeMask[j];			     
			      count += BIT_COUNT(masked[j]);
			    }
			}
		      else
			{
			  //hash this vector 
			  
			  for(j = 0; j < vLength; j++)
			    {
			      masked[j] = bitVector[j] & smallTreeMask[j];  
			      count += BIT_COUNT(masked[j]);      
			    }
			}
		      
		      /* note that padding the last bits is not required because they are set to 0 automatically by smallTreeMask */	
		      
		      /* make sure that we will re-hash  the canonic representation of the bipartition 
			 where the bit for firstTaxon is set to 0!
		      */
		      
		      assert(!(masked[(firstTaxon - 1) / MASK_LENGTH] & mask32[(firstTaxon - 1) % MASK_LENGTH]));
		      
		      /* only if the masked bipartition of the large tree is a non-trivial bipartition (two or more bits set to 1 
			 will we re-hash it */
		      
		      if(count > 1)
			{
			  /* compute hash */
			  position = oat_hash((unsigned char *)masked, sizeof(unsigned int) * vLength);
			  position = position % rehash->tableSize;
			  
			  /* re-hash to the new hash table that contains the bips of the large tree, pruned down 
			     to the taxa contained in the small tree
			  */
			  insertHashPlausibility(masked, rehash, vLength, position);
			}		
		      
		      entryCount++;
		      
		      e = e->next;
		    }
		  while(e != NULL);
		}
	    }
	  
	  /* make sure that we tried to re-hash all bipartitions of the original tree */
	  
	  assert(entryCount == (unsigned int)(tr->mxtips - 3));
	  
	  /* now traverse the small tree and count how many bipartitions it shares 
	     with the corresponding induced tree from the large tree */
	  
	  /* the following function also had to be modified to account for multi-furcating trees ! */
	  
	  bips = bitVectorTraversePlausibility(bitVectors, smallTree->start->back, smallTree->mxtips, vLength, rehash, &bCounter, firstTaxon, smallTree, TRUE);
	  
	  /* compute the relative RF */
	  
	  rf = (double)(2 * (numberOfSplits - bips)) / maxRF;           
	  
	  assert(numberOfSplits >= bips);

	  assert(rf <= 1.0);
	  
	  avgRF += rf;
	  
	  if(numberOfTreesAnalyzed % 100 == 0)
	    printBothOpen("Relative RF tree %d: %f\n\n", i, rf);

	  fprintf(rfFile, "%d %f\n", i, rf);
	  
	  /* I also modified this assertion, we nee to make sure here that we checked all non-trivial splits/bipartitions 
	     in the multi-furcating tree whech can be less than n - 3 ! */
	  
	  assert(bCounter == numberOfSplits);         
	  
	  /* free masks and hast table for this iteration */
	  
	  rax_free(smallTreeMask);
	  rax_free(masked);
	  freeHashTable(rehash);
	  rax_free(rehash);
	  numberOfTreesAnalyzed++;
	}
    }

  printBothOpen("Number of small trees skipped: %d\n\n", tr->numberOfTrees - numberOfTreesAnalyzed);
  
  printBothOpen("Average RF distance %f\n\n", avgRF / (double)numberOfTreesAnalyzed);

  printBothOpen("Total execution time: %f secs\n\n", gettime() - masterTime);

  printBothOpen("\nFile containing all %d pair-wise RF distances written to file %s\n\n", numberOfTreesAnalyzed, rfFileName);

  

  fclose(treeFile);
  fclose(rfFile);    
  
  /* free the data structure used for parsing the potentially multi-furcating tree */

  freeMultifurcations(smallTree);
  rax_free(smallTree);

  freeBitVectors(bitVectors, 2 * tr->mxtips);
  rax_free(bitVectors);
  
  freeHashTable(h);
  rax_free(h);
}
Пример #12
0
void doAllInOne(tree *tr, analdef *adef)
{
  int i, n, bestIndex, bootstrapsPerformed;

#ifdef _WAYNE_MPI
  int 
    bootStopTests = 1,
    j,
    bootStrapsPerProcess = 0;
#endif 

  double loopTime; 
  int      *originalRateCategories;
  int      *originalInvariant;
#ifdef _WAYNE_MPI
  int      slowSearches, fastEvery;
#else
  int      slowSearches, fastEvery = 5;
#endif
  int treeVectorLength = -1;
  topolRELL_LIST *rl;  
  double bestLH, mlTime, overallTime;  
  long radiusSeed = adef->rapidBoot;
  FILE *f;
  char bestTreeFileName[1024];  
  hashtable *h = (hashtable*)NULL;
  unsigned int **bitVectors = (unsigned int**)NULL;
  boolean bootStopIt = FALSE;
  double pearsonAverage = 0.0;
  pInfo *catParams         = allocParams(tr);
  pInfo *gammaParams = allocParams(tr);
  unsigned int vLength;

  n = adef->multipleRuns; 

#ifdef _WAYNE_MPI
  if(n % processes != 0)
    n = processes * ((n / processes) + 1);
#endif

  if(adef->bootStopping)
    {    
      h = initHashTable(tr->mxtips * 100);

      treeVectorLength = adef->multipleRuns;
      
      bitVectors = initBitVector(tr, &vLength);          
    }

  rl = (topolRELL_LIST *)rax_malloc(sizeof(topolRELL_LIST));
  initTL(rl, tr, n);
     
  originalRateCategories = (int*)rax_malloc(tr->cdta->endsite * sizeof(int));      
  originalInvariant      = (int*)rax_malloc(tr->cdta->endsite * sizeof(int));

             

  initModel(tr, tr->rdta, tr->cdta, adef);

  if(adef->grouping)
    printBothOpen("\n\nThe topologies of all Bootstrap and ML trees will adhere to the constraint tree specified in %s\n", tree_file);
  if(adef->constraint)
    printBothOpen("\n\nThe topologies of all Bootstrap and ML trees will adhere to the bifurcating backbone constraint tree specified in %s\n", tree_file);
 

#ifdef _WAYNE_MPI
  long parsimonySeed0 = adef->parsimonySeed;
  long replicateSeed0 = adef->rapidBoot;
  n = n / processes;
#endif
 
  for(i = 0; i < n && !bootStopIt; i++)
    {  
#ifdef _WAYNE_MPI
      j = i + n * processID;
      tr->treeID = j;
#else              
      tr->treeID = i;
#endif

      tr->checkPointCounter = 0;
        
      loopTime = gettime();  

#ifdef _WAYNE_MPI
      if(i == 0)
        {
          if(parsimonySeed0 != 0)
            adef->parsimonySeed = parsimonySeed0 + 10000 * processID;
          adef->rapidBoot = replicateSeed0 + 10000 * processID;
          radiusSeed = adef->rapidBoot;
        }
#endif          
     
      if(i % 10 == 0)
	{
	  if(i > 0)	    	    
	    reductionCleanup(tr, originalRateCategories, originalInvariant);	    	  

	  if(adef->grouping || adef->constraint)
	    {
	      FILE *f = myfopen(tree_file, "rb");	

	      assert(adef->restart);	      
	      if (! treeReadLenMULT(f, tr, adef))
		exit(-1);
	     
	      fclose(f);
	    }
	  else
	    makeParsimonyTree(tr, adef);
	  
	  tr->likelihood = unlikely;
	  if(i == 0)
	    {
	      double t;
	          
	      onlyInitrav(tr, tr->start);
	      treeEvaluate(tr, 1);	     	
	     	      
	      t = gettime();    	      

	      modOpt(tr, adef, FALSE, 5.0);	    
#ifdef _WAYNE_MPI
	      printBothOpen("\nTime for BS model parameter optimization on Process %d: %f seconds\n", processID, gettime() - t);	     
#else
	      printBothOpen("\nTime for BS model parameter optimization %f\n", gettime() - t);
#endif
	      
	      memcpy(originalRateCategories, tr->cdta->rateCategory, sizeof(int) * tr->cdta->endsite);
	      memcpy(originalInvariant,      tr->invariant,          sizeof(int) * tr->cdta->endsite);

	      if(adef->bootstrapBranchLengths)
		{
		  if(tr->rateHetModel == CAT)
		    {
		      copyParams(tr->NumberOfModels, catParams, tr->partitionData, tr);		      
		      assert(tr->cdta->endsite == tr->originalCrunchedLength);		 
		      catToGamma(tr, adef);		      
		      modOpt(tr, adef, TRUE, adef->likelihoodEpsilon);
		      copyParams(tr->NumberOfModels, gammaParams, tr->partitionData, tr);		      
		      gammaToCat(tr);
		      copyParams(tr->NumberOfModels, tr->partitionData, catParams, tr);		      
		    }
		  else
		    {		  
		      assert(tr->cdta->endsite == tr->originalCrunchedLength);		 		     		     		      		     
		    }
		}
	    }	  	  
	}

      computeNextReplicate(tr, &adef->rapidBoot, originalRateCategories, originalInvariant, TRUE, TRUE); 
      resetBranches(tr);

     

      evaluateGenericInitrav(tr, tr->start);
    
      treeEvaluate(tr, 1);    	             
     
      computeBOOTRAPID(tr, adef, &radiusSeed);  
#ifdef _WAYNE_MPI
      saveTL(rl, tr, j);
#else                      	  
      saveTL(rl, tr, i);
#endif

      if(adef->bootstrapBranchLengths)
	{
	  double 
	    lh = tr->likelihood;
	  	 
	  if(tr->rateHetModel == CAT)
	    {
	      copyParams(tr->NumberOfModels, tr->partitionData, gammaParams, tr);	      
	     
	      catToGamma(tr, adef);
	      
	      
	      resetBranches(tr);
	      onlyInitrav(tr, tr->start);
	      treeEvaluate(tr, 2.0);
	  
	     
	      gammaToCat(tr);
	     
	
	      copyParams(tr->NumberOfModels, tr->partitionData, catParams, tr);	      
	      tr->likelihood = lh;
	    }
	  else
	    {	     
	      treeEvaluate(tr, 2.0);
	      tr->likelihood = lh;
	    }
	}
      
      printBootstrapResult(tr, adef, TRUE); 

      loopTime = gettime() - loopTime; 
      writeInfoFile(adef, tr, loopTime); 
     
      if(adef->bootStopping)
#ifdef _WAYNE_MPI
	{
	  int 
	    nn = (i + 1) * processes;

	  if((nn > START_BSTOP_TEST) && 
	     (i * processes < FC_SPACING * bootStopTests) &&
	     ((i + 1) * processes >= FC_SPACING * bootStopTests)
	     )	     
	    {
	      MPI_Barrier(MPI_COMM_WORLD);
	                    
	      concatenateBSFiles(processes, bootstrapFileName);                
	      
              MPI_Barrier(MPI_COMM_WORLD);	      
	      
	      bootStopIt = computeBootStopMPI(tr, bootstrapFileName, adef, &pearsonAverage);
	      bootStopTests++;
	    }
	}	
#else	
      bootStopIt = bootStop(tr, h, i, &pearsonAverage, bitVectors, treeVectorLength, vLength, adef);
#endif


    }  
 
#ifdef _WAYNE_MPI      
  MPI_Barrier(MPI_COMM_WORLD);
  
  bootstrapsPerformed = i * processes; 
  bootStrapsPerProcess = i;   
      
  concatenateBSFiles(processes, bootstrapFileName);
  removeBSFiles(processes, bootstrapFileName);  
  
  MPI_Barrier(MPI_COMM_WORLD); 
#else
  bootstrapsPerformed = i;
#endif

  rax_freeParams(tr->NumberOfModels, catParams);
  rax_free(catParams);

  rax_freeParams(tr->NumberOfModels, gammaParams);
  rax_free(gammaParams);

  if(adef->bootStopping)
    {
      freeBitVectors(bitVectors, 2 * tr->mxtips);
      rax_free(bitVectors);
      freeHashTable(h);
      rax_free(h);      
    }

 
  {      
    double t;

    printBothOpenMPI("\n\n");
    
    if(adef->bootStopping)
      {
	if(bootStopIt)
	  {
	    switch(tr->bootStopCriterion)
	      {
	      case FREQUENCY_STOP:
		printBothOpenMPI("Stopped Rapid BS search after %d replicates with FC Bootstopping criterion\n", bootstrapsPerformed);
		printBothOpenMPI("Pearson Average of %d random splits: %f\n",BOOTSTOP_PERMUTATIONS , pearsonAverage);	      
		break;
	      case MR_STOP:
		printBothOpenMPI("Stopped Rapid BS search after %d replicates with MR-based Bootstopping criterion\n", bootstrapsPerformed);
		printBothOpenMPI("WRF Average of %d random splits: %f\n", BOOTSTOP_PERMUTATIONS, pearsonAverage);	     
		break;
	      case MRE_STOP:
		printBothOpenMPI("Stopped Rapid BS search after %d replicates with MRE-based Bootstopping criterion\n", bootstrapsPerformed);
		printBothOpenMPI("WRF Average of %d random splits: %f\n", BOOTSTOP_PERMUTATIONS, pearsonAverage);	     
		break;
	      case MRE_IGN_STOP:
		printBothOpenMPI("Stopped Rapid BS search after %d replicates with MRE_IGN-based Bootstopping criterion\n", bootstrapsPerformed);
		printBothOpenMPI("WRF Average of %d random splits: %f\n", BOOTSTOP_PERMUTATIONS, pearsonAverage);	     
		break;
	      default:
		assert(0);
	      }
	  }
	else
	  { 
	    switch(tr->bootStopCriterion)	     
	      {
	      case FREQUENCY_STOP:
		printBothOpenMPI("Rapid BS search did not converge after %d replicates with FC Bootstopping criterion\n", bootstrapsPerformed);
		printBothOpenMPI("Pearson Average of %d random splits: %f\n",BOOTSTOP_PERMUTATIONS , pearsonAverage);
		break;
	      case MR_STOP:
		printBothOpenMPI("Rapid BS search did not converge after %d replicates with MR-based Bootstopping criterion\n", bootstrapsPerformed);
		printBothOpenMPI("WRF Average of %d random splits: %f\n", BOOTSTOP_PERMUTATIONS, pearsonAverage);
		break;
	      case MRE_STOP:
		printBothOpenMPI("Rapid BS search did not converge after %d replicates with MRE-based Bootstopping criterion\n", bootstrapsPerformed);
		printBothOpenMPI("WRF Average of %d random splits: %f\n", BOOTSTOP_PERMUTATIONS, pearsonAverage);
		break;
	      case MRE_IGN_STOP:
		printBothOpenMPI("Rapid BS search did not converge after %d replicates with MR_IGN-based Bootstopping criterion\n", bootstrapsPerformed);
		printBothOpenMPI("WRF Average of %d random splits: %f\n", BOOTSTOP_PERMUTATIONS, pearsonAverage);
		break;
	      default:
		assert(0);
	      }
	  }
      }
    

    t = gettime() - masterTime;

    printBothOpenMPI("Overall Time for %d Rapid Bootstraps %f seconds\n", bootstrapsPerformed, t);     
    printBothOpenMPI("Average Time per Rapid Bootstrap %f seconds\n", (double)(t/((double)bootstrapsPerformed)));  
        
    if(!adef->allInOne)     
      {
	printBothOpenMPI("All %d bootstrapped trees written to: %s\n", bootstrapsPerformed, bootstrapFileName);

#ifdef _WAYNE_MPI      	 
	MPI_Finalize();
#endif
	exit(0);
      }
  }
 
  
  /* ML-search */ 

  mlTime = gettime();
  double t = mlTime;
  
  printBothOpenMPI("\nStarting ML Search ...\n\n"); 

  /***CLEAN UP reduction stuff */  

  reductionCleanup(tr, originalRateCategories, originalInvariant);  

  /****/     	   
  
#ifdef _WAYNE_MPI 
  restoreTL(rl, tr, n * processID); 
#else
  restoreTL(rl, tr, 0);
#endif

  resetBranches(tr);

  

  evaluateGenericInitrav(tr, tr->start);   

  

  modOpt(tr, adef, TRUE, adef->likelihoodEpsilon);  

#ifdef _WAYNE_MPI
  
  if(bootstrapsPerformed <= 100)
    fastEvery = 5;
  else
    fastEvery = bootstrapsPerformed / 20;

  for(i = 0; i < bootstrapsPerformed; i++)
    rl->t[i]->likelihood = unlikely;

  for(i = 0; i < bootStrapsPerProcess; i++)
    {            
      j = i + n * processID;
    
      if(i % fastEvery == 0)
	{	 
	  restoreTL(rl, tr, j); 	 	    	   	
	  
	  resetBranches(tr);	 

	  evaluateGenericInitrav(tr, tr->start);
	  	  
	  treeEvaluate(tr, 1); 		 
	  	  
	  optimizeRAPID(tr, adef);	  			         	  
	  
	  saveTL(rl, tr, j);  
	}    
    }     
#else
  for(i = 0; i < bootstrapsPerformed; i++)
    {            
      rl->t[i]->likelihood = unlikely;
    
      if(i % fastEvery == 0)
	{
	 
	  
	  restoreTL(rl, tr, i); 	 	    	   	
	  
	  resetBranches(tr);	 

	  evaluateGenericInitrav(tr, tr->start);
	  	  
	  treeEvaluate(tr, 1); 		 
	  	  
	  optimizeRAPID(tr, adef);	  			         	  
	  
	 

	  saveTL(rl, tr, i); 	 
	}    
    }     
#endif
 
  printBothOpenMPI("Fast ML optimization finished\n\n"); 
  t = gettime() - t;
  
#ifdef _WAYNE_MPI
  printBothOpen("Fast ML search on Process %d: Time %f seconds\n\n", processID, t);
  j = n * processID;

  qsort(&(rl->t[j]), n, sizeof(topolRELL*), compareTopolRell);

  restoreTL(rl, tr, j);
#else
  printBothOpen("Fast ML search Time: %f seconds\n\n", t);
  qsort(&(rl->t[0]), bootstrapsPerformed, sizeof(topolRELL*), compareTopolRell);
       
  restoreTL(rl, tr, 0);
#endif
  t = gettime();
  
  resetBranches(tr);

  evaluateGenericInitrav(tr, tr->start);

  modOpt(tr, adef, TRUE, adef->likelihoodEpsilon);     
  
  slowSearches = bootstrapsPerformed / 5;
  if(bootstrapsPerformed % 5 != 0)
    slowSearches++;

  slowSearches  = MIN(slowSearches, 10); 

#ifdef _WAYNE_MPI
   if(processes > 1)
    {
      if(slowSearches % processes == 0)
        slowSearches = slowSearches / processes;
      else
        slowSearches = (slowSearches / processes) + 1;
    }
   
   for(i = 0; i < slowSearches; i++)
    {           
      j = i + n * processID;
      restoreTL(rl, tr, j);     
      rl->t[j]->likelihood = unlikely;  
      
      evaluateGenericInitrav(tr, tr->start);

      treeEvaluate(tr, 1.0);   
      
      thoroughOptimization(tr, adef, rl, j); 
   }   
#else
  for(i = 0; i < slowSearches; i++)
    {           
      restoreTL(rl, tr, i);     
      rl->t[i]->likelihood = unlikely;  
      
      evaluateGenericInitrav(tr, tr->start);

      treeEvaluate(tr, 1.0);   
      
      thoroughOptimization(tr, adef, rl, i); 	 

   }
#endif
  
  

  /*************************************************************************************************************/  
  
  if(tr->rateHetModel == CAT) 
    {      
      catToGamma(tr, adef);    
      modOpt(tr, adef, TRUE, adef->likelihoodEpsilon); 
    }

  bestIndex = -1;
  bestLH = unlikely;
    
#ifdef _WAYNE_MPI
  for(i = 0; i < slowSearches; i++)
    { 
      j = i + n * processID;
      restoreTL(rl, tr, j);
      resetBranches(tr);

      evaluateGenericInitrav(tr, tr->start);

      treeEvaluate(tr, 2);
      
      printBothOpen("Slow ML Search %d Likelihood: %f\n", j, tr->likelihood);
      
      if(tr->likelihood > bestLH)
	{
	  bestLH = tr->likelihood;
	  bestIndex = j;
	}
    }
  /*printf("processID = %d, bestIndex = %d; bestLH = %f\n", processID, bestIndex, bestLH);*/
#else
  for(i = 0; i < slowSearches; i++)
    { 
      restoreTL(rl, tr, i);
      resetBranches(tr);

      evaluateGenericInitrav(tr, tr->start);

      treeEvaluate(tr, 2);
      
      printBothOpen("Slow ML Search %d Likelihood: %f\n", i, tr->likelihood);
      
      if(tr->likelihood > bestLH)
	{
	  bestLH = tr->likelihood;
	  bestIndex = i;
	}
    }
#endif
  
  printBothOpenMPI("Slow ML optimization finished\n\n");

  t = gettime() - t;

#ifdef _WAYNE_MPI
  printBothOpen("Slow ML search on Process %d: Time %f seconds\n", processID, t);
#else
  printBothOpen("Slow ML search Time: %f seconds\n", t);
#endif
  
  t = gettime();
  
  restoreTL(rl, tr, bestIndex);
  resetBranches(tr);

  evaluateGenericInitrav(tr, tr->start);
 
  treeEvaluate(tr, 2); 
         
  Thorough = 1;
  tr->doCutoff = FALSE;  
	 
  treeOptimizeThorough(tr, 1, 10);
  evaluateGenericInitrav(tr, tr->start);
  
  modOpt(tr, adef, TRUE, adef->likelihoodEpsilon);
  t = gettime() - t;

#ifdef _WAYNE_MPI
  printBothOpen("Thorough ML search on Process %d: Time %f seconds\n", processID, t);
#else
  printBothOpen("Thorough ML search Time: %f seconds\n", t);
#endif

#ifdef _WAYNE_MPI
  bestLH = tr->likelihood;

  printf("\nprocessID = %d, bestLH = %f\n", processID,  bestLH);

  if(processes > 1)
    {
      double *buffer;
      int bestProcess;

      buffer = (double *)rax_malloc(sizeof(double) * processes);
      for(i = 0; i < processes; i++)
        buffer[i] = unlikely;
      buffer[processID] = bestLH;
      for(i = 0; i < processes; i++)
        MPI_Bcast(&buffer[i], 1, MPI_DOUBLE, i, MPI_COMM_WORLD);
      bestLH = buffer[0];
      bestProcess = 0;
      for(i = 1; i < processes; i++)
        if(buffer[i] > bestLH)
          {
             bestLH = buffer[i];
             bestProcess = i;
          }
      rax_free(buffer);

      if(processID != bestProcess)
        {
          MPI_Finalize();
          exit(0);
        }
    }
#endif

  printBothOpen("\nFinal ML Optimization Likelihood: %f\n", tr->likelihood);   
  printBothOpen("\nModel Information:\n\n");
  
  printModelParams(tr, adef);    
  
  strcpy(bestTreeFileName, workdir); 
  strcat(bestTreeFileName, "RAxML_bestTree.");
  strcat(bestTreeFileName,         run_id);
   
  Tree2String(tr->tree_string, tr, tr->start->back, TRUE, TRUE, FALSE, FALSE, TRUE, adef, SUMMARIZE_LH, FALSE, FALSE, FALSE, FALSE);
  f = myfopen(bestTreeFileName, "wb");
  fprintf(f, "%s", tr->tree_string);
  fclose(f);

  if(adef->perGeneBranchLengths)
    printTreePerGene(tr, adef, bestTreeFileName, "w");

  
  overallTime = gettime() - masterTime;
  mlTime    = gettime() - mlTime;

  printBothOpen("\nML search took %f secs or %f hours\n", mlTime, mlTime / 3600.0); 
  printBothOpen("\nCombined Bootstrap and ML search took %f secs or %f hours\n", overallTime, overallTime / 3600.0);   
  printBothOpen("\nDrawing Bootstrap Support Values on best-scoring ML tree ...\n\n");
      
  
  freeTL(rl);   
  rax_free(rl);       
  
  calcBipartitions(tr, adef, bestTreeFileName, bootstrapFileName);    
  

  overallTime = gettime() - masterTime;

  printBothOpen("Program execution info written to %s\n", infoFileName);
  printBothOpen("All %d bootstrapped trees written to: %s\n\n", bootstrapsPerformed, bootstrapFileName);
  printBothOpen("Best-scoring ML tree written to: %s\n\n", bestTreeFileName);
  if(adef->perGeneBranchLengths && tr->NumberOfModels > 1)    
    printBothOpen("Per-Partition branch lengths of best-scoring ML tree written to %s.PARTITION.0 to  %s.PARTITION.%d\n\n", bestTreeFileName,  bestTreeFileName, 
		  tr->NumberOfModels - 1);    
  printBothOpen("Best-scoring ML tree with support values written to: %s\n\n", bipartitionsFileName);
  printBothOpen("Best-scoring ML tree with support values as branch labels written to: %s\n\n", bipartitionsFileNameBranchLabels);
  printBothOpen("Overall execution time for full ML analysis: %f secs or %f hours or %f days\n\n", overallTime, overallTime/3600.0, overallTime/86400.0);

#ifdef _WAYNE_MPI
  MPI_Finalize();
#endif      

  exit(0); 
}
Пример #13
0
//Use the plausibility checker overhead
void plausibilityChecker(tree *tr, analdef *adef)
{
  FILE
  *treeFile, 
    *treeFile2,
    *rfFile;
  
  tree 
    *smallTree = (tree *)rax_malloc(sizeof(tree));

  char 
    rfFileName[1024];

  int
    numberOfTreesAnalyzed = 0,
    i;

  double 
    avgRF = 0.0,
    sumEffectivetime = 0.0;

  /* set up an output file name */

  strcpy(rfFileName,         workdir);  
  strcat(rfFileName,         "RAxML_RF-Distances.");
  strcat(rfFileName,         run_id);

  rfFile = myfopen(rfFileName, "wb");  

  assert(adef->mode ==  PLAUSIBILITY_CHECKER);

  /* open the big reference tree file and parse it */

  treeFile = myfopen(tree_file, "r");

  printBothOpen("Parsing reference tree %s\n", tree_file);

  treeReadLen(treeFile, tr, FALSE, TRUE, TRUE, adef, TRUE, FALSE);


  assert(tr->mxtips == tr->ntips);
  
  /*************************************************************************************/
  /* Preprocessing Step */

  double 
    preprocesstime = gettime();
  
  /* taxonToLabel[2*tr->mxtips - 2]; 
  Array storing all 2n-2 labels from the preordertraversal: (Taxonnumber - 1) -> (Preorderlabel) */
  int 
    *taxonToLabel  = (int *)rax_malloc((2*tr->mxtips - 2) * sizeof(int)),

    /* taxonHasDeg[2*tr->mxtips - 2] 
    Array used to store the degree of every taxon, is needed to extract Bipartitions from multifurcating trees 
    (Taxonnumber - 1) -> (degree of node(Taxonnumber)) */

    *taxonHasDeg = (int *)rax_calloc((2*tr->mxtips - 2),sizeof(int)),

    /* taxonToReduction[2*tr->mxtips - 2]; 
  Array used for reducing bitvector and speeding up extraction: 

  (Taxonnumber - 1) -> Index in smallTreeTaxa (starting from 0)
  which is also:
  (Taxonnumber - 1) -> (0..1 (increment count of taxa appearing in small tree))
  (Taxonnumber - 1) -> (0..1 (increment count of inner nodes appearing in small tree)) */

    *taxonToReduction = (int *)rax_malloc((2*tr->mxtips - 2) * sizeof(int));
    
  int 
    newcount = 0; //counter used for correct traversals

  /* labelToTaxon[2*tr->mxtips - 2];
  is used to translate between Perorderlabel and p->number: (Preorderlabel) -> (Taxonnumber) */
  int 
    *labelToTaxon = (int *)rax_malloc((2*tr->mxtips - 2) * sizeof(int));
  
  /* Preorder-Traversal of the large tree */
  preOrderTraversal(tr->start->back,tr->mxtips, tr->start->number, taxonToLabel, labelToTaxon, &newcount);

  newcount = 0; //counter set to 0 to be now used for Eulertraversal

  /* eulerIndexToLabel[4*tr->mxtips - 5]; 
  Array storing all 4n-5 PreOrderlabels created during eulertour: (Eulerindex) -> (Preorderlabel) */
  int* 
    eulerIndexToLabel = (int *)rax_malloc((4*tr->mxtips - 5) * sizeof(int));

  /* taxonToEulerIndex[tr->mxtips]; 
  Stores all indices of the first appearance of a taxa in the eulerTour: (Taxonnumber - 1) -> (Index of the Eulertour where Taxonnumber first appears) 
  is used for efficient computation of the Lowest Common Ancestor during Reconstruction Step
  */
  int*
    taxonToEulerIndex  = (int *)rax_malloc((tr->mxtips) * sizeof(int));

  /* Init taxonToEulerIndex and taxonToReduction */
  int 
    ix;

  for(ix = 0; ix < tr->mxtips; ++ix)    
    taxonToEulerIndex[ix] = -1;    
  
  for(ix = 0; ix < (2*tr->mxtips - 2); ++ix)    
    taxonToReduction[ix] = -1;    


  /* Eulertraversal of the large tree*/
  unrootedEulerTour(tr->start->back,tr->mxtips, eulerIndexToLabel, taxonToLabel, &newcount, taxonToEulerIndex);

  /* Creating RMQ Datastructure for efficient retrieval of LCAs, using Johannes Fischers Library rewritten in C
  Following Files: rmq.h,rmqs.c,rmqs.h are included in Makefile.RMQ.gcc */
  RMQ_succinct(eulerIndexToLabel,4*tr->mxtips - 5);

  double 
    preprocessendtime = gettime() - preprocesstime;

  /* Proprocessing Step End */
  /*************************************************************************************/

  printBothOpen("The reference tree has %d tips\n", tr->ntips);

  fclose(treeFile);
  
  /***********************************************************************************/
  /* RF-OPT Preprocessing Step */
  /***********************************************************************************/

  /* now see how many small trees we have */
  treeFile = getNumberOfTrees(tr, bootStrapFile, adef);
  treeFile2 = getNumberOfTrees(tr, bootStrapFile, adef);

  checkTreeNumber(tr->numberOfTrees, bootStrapFile);

  /* allocate a data structure for parsing the potentially mult-furcating tree */

  allocateMultifurcations(tr, smallTree);

  /* Start Additional preprocessing step */

  int 
    numberOfBips = 0,
    numberOfSets = 0;

  //Stores the number of bips of each tree
  int *bipsPerTree = (int *)rax_malloc(tr->numberOfTrees * sizeof(int));

  //Stores the number of taxa for each tree
  int *taxaPerTree = (int *)rax_malloc(tr->numberOfTrees * sizeof(int));

  //To calculate all bipartitions, I created a new treeFile2 and a new getNumberOfTrees method!!
  for(i = 0; i < tr->numberOfTrees; i++) {

    int this_treeBips = readMultifurcatingTree(treeFile2, smallTree, adef, TRUE);
  
    numberOfBips = numberOfBips + this_treeBips;
  
    numberOfSets = numberOfSets + this_treeBips * this_treeBips;

    bipsPerTree[i] = this_treeBips;
  }

  printf("numberOfBips: %i , numberOfSets: %i \n \n", numberOfBips, numberOfSets);  

  //stores induced bips (OLD?)
  unsigned int *ind_bips = (unsigned int *)rax_malloc(numberOfBips * sizeof(unsigned int));

  //stores smalltree bips (OLD?)
  unsigned int *s_bips = (unsigned int *)rax_malloc(numberOfBips * sizeof(unsigned int));

  //stores small bips per tree
  unsigned int ***sBipsPerTree = (unsigned int ***)rax_malloc(tr->numberOfTrees * sizeof(unsigned int**));

  //stores induced bips per tree
  unsigned int ***indBipsPerTree = (unsigned int ***)rax_malloc(tr->numberOfTrees * sizeof(unsigned int**));

  //stores vLength of each tree for processing bitVectors
  unsigned int *vectorLengthPerTree = (unsigned int *)rax_malloc(tr->numberOfTrees * sizeof(unsigned int*));

  //stores the corresponding tree number for each bip
  int *treenumberOfBip = (int *)rax_malloc(numberOfBips * sizeof(int));

  //Stores all dropsets of all trees 
  int **sets = (int **)rax_malloc(numberOfSets * sizeof(int*)); 
  //int **sets = NULL;

  //For each tree, stores a translation array from taxanumber smalltree->largetree
  int **smallTreeTaxaList = (int **)rax_malloc(tr->numberOfTrees * sizeof(int*)); 

  //For each tree, store a translation array from taxanumber largetree->smalltree
  int **taxonToReductionList = (int **)rax_malloc(tr->numberOfTrees * sizeof(int*));

  //I use these variables as global variables for all trees to determine the max number of possible sets to generate a static array
  int currentBips = 0;
  int currentSmallBips = 0;
  int currentSets = 0;

  //int currentTree = 0; already there in number of trees analyzed
  
  //Prefill sets with -1s
  for(int it = 0;it < (numberOfSets);it++){
  int fill[1] = {-1};
  sets[it] = fill; 
  }
  
  /***********************************************************************************/
  /* RF-OPT Preprocessing Step End */
  /***********************************************************************************/

  /* loop over all small trees */

  for(i = 0; i < tr->numberOfTrees;  i++)
    {      
      int
    numberOfSplits = readMultifurcatingTree(treeFile, smallTree, adef, TRUE);
      
      if(numberOfSplits > 0)
  {
    int
      firstTaxon;           

    double
      rec_rf,
      maxRF;

    if(numberOfTreesAnalyzed % 100 == 0)
      printBothOpen("Small tree %d has %d tips and %d bipartitions\n", i, smallTree->ntips, numberOfSplits);    
    
    /* compute the maximum RF distance for computing the relative RF distance later-on */
    
    /* note that here we need to pay attention, since the RF distance is not normalized 
       by 2 * (n-3) but we need to account for the fact that the multifurcating small tree 
       will potentially contain less bipartitions. 
       Hence the normalization factor is obtained as n-3 + numberOfSplits, where n-3 is the number 
       of bipartitions of the pruned down large reference tree for which we know that it is 
       bifurcating/strictly binary */
    
    maxRF = (double)(2 * numberOfSplits);
    
    /* now get the index of the first taxon of the small tree.
       we will use this to unambiguously store the bipartitions 
    */
    
    firstTaxon = smallTree->start->number;

    //Saves the number of taxa in the tree (for RF-OPT)
    taxaPerTree[numberOfTreesAnalyzed] = smallTree->ntips; 
    
    /***********************************************************************************/
    /* Reconstruction Step */
    
    double 
      time_start = gettime();
    
    /* Init hashtable to store Bipartitions of the induced subtree T|t_i */
    /* 
       using smallTree->ntips instead of smallTree->mxtips yields faster code 
       e.g. 120 versus 128 seconds for 20,000 small trees on my laptop 
     */
    hashtable
      *s_hash = initHashTable(smallTree->ntips * 4);


    /* Init hashtable to store Bipartitions of the reference tree t_i*/
    hashtable
      *ind_hash = initHashTable(smallTree->ntips * 4);
    
    /* smallTreeTaxa[smallTree->ntips]; 
       Stores all taxa numbers from smallTree into an array called smallTreeTaxa: (Index) -> (Taxonnumber)  */
    int* 
      smallTreeTaxa = (int *)rax_malloc((smallTree->ntips) * sizeof(int));
    
    /* counter is set to 0 for correctly extracting taxa of the small tree */
    newcount = 0; 
    
    int 
      newcount2 = 0;
    
    /* seq2[2*smallTree->ntips - 2]; 
       stores PreorderSequence of the reference smalltree: (Preorderindex) -> (Taxonnumber) */
    int* 
      seq2 = (int *)rax_malloc((2*smallTree->ntips - 2) * sizeof(int));
    
    /* used to store the vectorLength of the bitvector */
    unsigned int 
      vectorLength;
    
    /* extract all taxa of the smalltree and store it into an array, 
       also store all counts of taxa and nontaxa in taxonToReduction */
    rec_extractTaxa(smallTreeTaxa, taxonToReduction, smallTree->start, smallTree->mxtips, &newcount, &newcount2);
    
    rec_extractTaxa(smallTreeTaxa, taxonToReduction, smallTree->start->back, smallTree->mxtips, &newcount, &newcount2);
    
    /* counter is set to 0 to correctly preorder traverse the small tree */
    newcount = 0;
    
    /* Preordertraversal of the small reference tree and save its sequence into seq2 for later extracting the bipartitions, it
       also stores information about the degree of every node */
    
    rec_preOrderTraversalMulti(smallTree->start->back,smallTree->mxtips, smallTree->start->number, seq2, taxonHasDeg, &newcount);
    
    /* calculate the bitvector length */
    if(smallTree->ntips % MASK_LENGTH == 0)
      vectorLength = smallTree->ntips / MASK_LENGTH;
    else
      vectorLength = 1 + (smallTree->ntips / MASK_LENGTH); 


    /***********************************************************************************/
    /* RF-OPT Additional Preprocessing storing Bipartitions */
    /***********************************************************************************/    

    vectorLengthPerTree[numberOfTreesAnalyzed] = vectorLength;
    
    unsigned int 
      **bitVectors = rec_initBitVector(smallTree, vectorLength);

    unsigned int
      **sBips;

    /* store all non trivial bitvectors using an subtree approach for the reference subtree and 
       store it into a hashtable, this method was changed for multifurcation */
    sBips = RFOPT_extractBipartitionsMulti(bitVectors, seq2, newcount,tr->mxtips, vectorLength, smallTree->ntips, 
               firstTaxon, s_hash, taxonToReduction, taxonHasDeg, numberOfSplits);

    sBipsPerTree[numberOfTreesAnalyzed] = sBips;

    /***********************************************************************************/
    /* End RF-OPT Additional Preprocessing storing Bipartitions */
    /***********************************************************************************/  
    
    /* counter is set to 0 to be used for correctly storing all EulerIndices */
    newcount = 0; 
    
    /* smallTreeTaxonToEulerIndex[smallTree->ntips]; 
       Saves all first Euler indices for all Taxons appearing in small Tree: 
       (Index) -> (Index of the Eulertour where the taxonnumber of the small tree first appears) */
    int* 
      smallTreeTaxonToEulerIndex = (int *)rax_malloc((smallTree->ntips) * sizeof(int));
    
    /* seq[(smallTree->ntips*2) - 1] 
       Stores the Preordersequence of the induced small tree */
    int* 
      seq = (int *)rax_malloc((2*smallTree->ntips - 1) * sizeof(int));
    
    
    /* iterate through all small tree taxa */
    for(ix = 0; ix < smallTree->ntips; ix++) 
      {        
        int 
          taxanumber = smallTreeTaxa[ix];
        
        /* To create smallTreeTaxonToEulerIndex we filter taxonToEulerIndex for taxa in the small tree*/
        smallTreeTaxonToEulerIndex[newcount] = taxonToEulerIndex[taxanumber-1]; 
        
        /* Saves all Preorderlabel of the smalltree taxa in seq*/
        seq[newcount] = taxonToLabel[taxanumber-1];
        
        newcount++;
      }
    
    /* sort the euler indices to correctly calculate LCA */
    //quicksort(smallTreeTaxonToEulerIndex,0,newcount - 1);             
    
    qsort(smallTreeTaxonToEulerIndex, newcount, sizeof(int), sortIntegers);
    
    //printf("newcount2 %i \n", newcount2);      
    /* Iterate through all small tree taxa */
    for(ix = 1; ix < newcount; ix++)
      {  
        /* query LCAs using RMQ Datastructure */
        seq[newcount - 1 + ix] =  eulerIndexToLabel[query(smallTreeTaxonToEulerIndex[ix - 1],smallTreeTaxonToEulerIndex[ix])];   
        
        /* Used for dynamic programming. We save an index for every inner node:
     For example the reference tree has 3 inner nodes which we saves them as 0,1,2.
     Now we calculate for example 5 LCA to construct the induced subtree, which are also inner nodes. 
     Therefore we mark them as 3,4,5,6,7  */
        
        taxonToReduction[labelToTaxon[seq[newcount - 1 + ix]] - 1] = newcount2;
        
        newcount2 += 1;
      }
    
    /* sort to construct the Preordersequence of the induced subtree */
    //quicksort(seq,0,(2*smallTree->ntips - 2));
    
    qsort(seq, (2 * smallTree->ntips - 2) + 1, sizeof(int), sortIntegers);
    
    /* calculates all bipartitions of the reference small tree and count how many bipartition it 
    shares with the induced small tree and stores those bipartitions in a additional hashtable called ind_hash */
    int 
      rec_bips = 0;

    unsigned int
      **indBips;

    indBips = RFOPT_findAddBipartitions(bitVectors, seq,(2*smallTree->ntips - 1), labelToTaxon, tr->mxtips, vectorLength, smallTree->ntips, firstTaxon, s_hash, ind_hash, taxonToReduction);
      
    indBipsPerTree[numberOfTreesAnalyzed] = indBips; 

    /* calculates all bipartitions of the reference small tree and put them into ind_hash*/
    // rec_extractBipartitionsMulti(bitVectors, seq2, (2*smallTree->ntips - 1),tr->mxtips, vectorLength, smallTree->ntips, 
    // firstTaxon, s_hash, taxonToReduction, taxonHasDeg, numberOfSplits);


    /* Reconstruction Step End */
    /***********************************************************************************/
    
    double 
      effectivetime = gettime() - time_start;
    
    /*
      if(numberOfTreesAnalyzed % 100 == 0)
      printBothOpen("Reconstruction time: %.10f secs\n\n", effectivetime);
    */
    
    /* compute the relative RF */


    /***********************************************************************************/
    /* RF-OPT Save Translation Vectors */
    /***********************************************************************************/
      
    //copy array taxonToReduction because it is originally defined in preprocessing step
    int * taxonToReductionCopy = (int *)rax_malloc((tr->mxtips)*sizeof(int));

    memcpy(taxonToReductionCopy,taxonToReduction,(tr->mxtips)*sizeof(int));

    //storing smallTree and taxonToReduction Arrays for further usage
    smallTreeTaxaList[numberOfTreesAnalyzed] = smallTreeTaxa;

    taxonToReductionList[numberOfTreesAnalyzed] = taxonToReductionCopy;   

    int this_currentSmallBips = 0; //Variable resets everytime for each tree analyzed
    
    
    /***********************************************************************************/
    /* End RF-OPT Save Translation Vectors */
    /***********************************************************************************/
  

    rec_rf = (double)(2 * (numberOfSplits - rec_bips)) / maxRF;
    
    assert(numberOfSplits >= rec_bips);      

    avgRF += rec_rf;
    sumEffectivetime += effectivetime;
    
    //if(numberOfTreesAnalyzed % 100 == 0)
    printBothOpen("Relative RF tree %d: %f\n\n", i, rec_rf);
    
    fprintf(rfFile, "%d %f\n", i, rec_rf);
    
    //rax_free(smallTreeTaxa); //Need it for calculating the SmallTreeTaxaList after all iterations!
    rax_free(seq);
    rax_free(seq2);
    rax_free(smallTreeTaxonToEulerIndex);

    numberOfTreesAnalyzed++; //Counting the number of trees analyzed
    }

  }// End of Small Tree Iterations

  /***********************************************************************************/
  /* RF-OPT DropSet Calculation using BitVectors */
  /***********************************************************************************/

  
  log_info("===> Create DropSet Datastructure \n");

  static Hashmap* map = NULL;
  //Set a hashmap for dropsets with a dropset comparision and standard hash
  map = Hashmap_create(compareDropSet, NULL);

  static Hashmap** mapArray = NULL;
  //Set an array to store the pointers to bitvector hashtables for each tree 
  mapArray = rax_malloc(tr->numberOfTrees * sizeof(Hashmap*));


  printf("===> BitVector Set Calculation \n");

  //Calculate dropsets of two given bips lists and extract all sets into array sets and into a hashmap. Each set has following format
  //dropset = {taxa_1,taxa_2,...,taxa_n,-1};
  //Furtheremore calculate Dropset generates two data structures from type bips and dropsets which are pointing to each other in hashtables
  calculateDropSets(mapArray, map, indBipsPerTree, sBipsPerTree, sets, smallTreeTaxaList, bipsPerTree, 
  taxaPerTree, vectorLengthPerTree, tr->numberOfTrees);

  /***********************************************************************************/
  /* RF-OPT Graph Construction */
  /***********************************************************************************/

  // printf("\n == Sets == \n");
  // for(int fooo = 0; fooo < numberOfSets; fooo++){
  //   printf("Set %i: ", fooo);
  //   int i = 0;
  //   while(sets[fooo][i] > -1) {
  //    printf("%i ",sets[fooo][i]);
  //    i++;
  //   }
  //   printf("\n");
  // }
  // printf("\n");
  /*
    Filter for unique sets
  */
  log_info("===> Hashmap tests...\n");
  
  Hashmap_traverse(map, traverse_cb);

  // int key[2] = {0,-1};

  // Dropset* drop = Hashmap_get(map,key);
  // DArray* bips = drop->bipartitions;

  // for(int i = 0; i < DArray_count(bips); i++) {
  //   Bipartition* bip = DArray_get(bips,i);
  //   printBitVector(bip->bitvector[0]);
  //   printf("matching: %i \n", bip->matching);
  //   printf("tree: %i \n", bip->treenumber);
  // }

  // Bipartition* bipFromHash = DArray_first(bips);
  // Bipartition* testBip = Hashmap_get(mapArray[0],bipFromHash->bitvector);
  // printf("matching before: %i",testBip->matching);
  // testBip->matching = 999;

  // for(int i = 0; i < DArray_count(bips); i++) {
  //   Bipartition* bip = DArray_get(bips,i);
  //   printBitVector(bip->bitvector[0]);
  //   printf("matching: %i \n", bip->matching);
  //   printf("tree: %i \n", bip->treenumber);
  // }


  printf("===> Filter for unique sets (naive)...\n");

  /* unique sets array data structures */
  int** uniqSets = (int **) rax_malloc(sizeof(int*) * numberOfSets);
  int* setsToUniqSets = (int*) rax_malloc(sizeof(int) * numberOfSets);
  int numberOfUniqueSets = 0;
  int dropSetCount = 0;



  //stores the scores for each bips, we are using a bitvector approach (need to scale)
    
  //Legacy Code 
  int bvec_scores = 0;
  
  numberOfUniqueSets = getUniqueDropSets(sets, uniqSets, setsToUniqSets, numberOfSets);

  printf("number of unique sets: %i \n", numberOfUniqueSets);

  /*
    Detect initial matchings, we calculate them using bitvectors to represent our bipartitions
  */
  printf("===> Detect initial matchings...\n");
  int vLengthBip = 0;

  //determine the bitVector Length of our bitVector
  if(numberOfBips % MASK_LENGTH == 0)
    vLengthBip = numberOfBips / MASK_LENGTH; 
  else 
    vLengthBip = numberOfBips / MASK_LENGTH + 1;

  //Initialize a bvecScore vector with 0s
  int* bvecScores = (int*)rax_calloc(vLengthBip,sizeof(int));

  //Calculate Initial Matchings and save the result in bvecScores
  detectInitialMatchings(sets, bvecScores, bipsPerTree, numberOfTreesAnalyzed, vLengthBip); 

  //Short summary until now:
  // - bipsPerTree consists of all bipartitions per tree
  // - bvecScores is the bitvector setting 1 to all bipartition indices which can score 
  // - taxaPerTree number of taxa per tree
  // - smallTreeTaxaList list of all smalltree->largetree translation arrays

  /*
    Generate useful data structures for calculating and updating scores
  */
  printf("===> Create data structures...\n");  
  //Stores the number of bips per Set and initialize it with 0s
  int* numberOfBipsPerSet = (int*)rax_calloc(numberOfUniqueSets,sizeof(int));

  //Stores all sets which includes this taxa
  int **setsOfTaxa = (int**)rax_malloc((tr->mxtips + 1) *sizeof(int*));
  
  //Now calculate number of bipartitions affected by each unique set
  for(int i = 0; i < numberOfSets; i++) {

    int setindex = setsToUniqSets[i];

    numberOfBipsPerSet[setindex]++;
  }

  //Now using the knowledge of how many bips there are per set, generate an array for each unique dropset containing all bips
  int** bipsOfDropSet = (int**)rax_malloc(sizeof(int*)*numberOfUniqueSets);
  
  //Allocate the space needed for storing all bips
  for(int i = 0; i < numberOfUniqueSets; i++) {

    bipsOfDropSet[i] = (int*)rax_malloc(sizeof(int)*numberOfBipsPerSet[i]); 
  }
  
  printf("==> Initialize the Bips Of Taxa \n");
  //Stores the number of bips each taxa is included (ABC|DE is stored by A,B,C,D and E)
  //It can be calculated by iterating through all trees and adding the taxa 
  int **bipsOfTaxa = (int**)rax_malloc((tr->mxtips + 1) * sizeof(int*));
  int *numberOfBipsPerTaxa = (int*)rax_calloc((tr->mxtips + 1), sizeof(int));
  int *taxaBipsCounter = (int*)rax_calloc((tr->mxtips + 1), sizeof(int));

  //Now add up all
  for (int tree = 0; tree < tr->numberOfTrees; tree++) {

    int* list = smallTreeTaxaList[tree];

    for (int j = 0; j < taxaPerTree[tree]; j++) {

      int taxa = list[j];

      numberOfBipsPerTaxa[taxa] = numberOfBipsPerTaxa[taxa] + bipsPerTree[tree];
    } 
  }

  //Now create dummy arrays inside bipsOfTaxa
  for(int i = 1; i < tr->mxtips+1; i++) {
    bipsOfTaxa[i] = (int*)rax_malloc(sizeof(int)*numberOfBipsPerTaxa[i]);
  }

  printf("==> Storing all bip indices of a certain dropset into an array \n");
  //For checking if all dropsets are iterated
  dropSetCount = 0;
  //Arrays of counter to keep in track
  int* counterOfSet = (int*)rax_malloc(sizeof(int)*numberOfUniqueSets);
  for(int i = 0; i < numberOfUniqueSets; i++) {
    counterOfSet[i] = 0;
  }

  currentBips = 0; //Need to keep in track of the number of bips
  //First iterate through all trees 
  for(int i = 0; i < numberOfTreesAnalyzed; i++ ) {

    //get the correct smallTreeTaxa List
    int* list = smallTreeTaxaList[i];

    //For each bipartition in the tree
    for(int j = 0; j < bipsPerTree[i]; j++) {

      //Look at all bips it is compared too
      int dropSetsPerBip = bipsPerTree[i];

      for(int k = 0; k < dropSetsPerBip; k++){

        int indexOfUniqDropSet = setsToUniqSets[dropSetCount + k];

        int* bips_array = bipsOfDropSet[indexOfUniqDropSet]; 

        //add bipartition j into the bips array of its dropset
        bips_array[counterOfSet[indexOfUniqDropSet]] = currentBips; 

        //increment the internal array index 
        counterOfSet[indexOfUniqDropSet]++;
      }
    //Jump to the next correct dropSetCount!
    dropSetCount = dropSetCount + dropSetsPerBip;

    //now insert the bip into bipsOfTaxa Array
    for(int ix = 0; ix < taxaPerTree[i]; ix++) {

      //get the taxa number
      int stree_Taxa = list[ix];

      //get the bips list of this taxa number
      int* bipsList = bipsOfTaxa[stree_Taxa];

      //now get the position of the biplist and put in our bip index
      bipsList[taxaBipsCounter[stree_Taxa]] = currentBips;

      //increment the counter 
      taxaBipsCounter[stree_Taxa]++;

    }

    //increment currentBips
    currentBips++; 
    }

  }

  /***********************************************************************************/
  /* End RF-OPT Graph Construction */
  /***********************************************************************************/

  /* Short summary :
    sets - array of all dropsets
    uniqSets - array of all unique dropsets
    bipsPerTree - bips per tree
    setsToUniqSets - translates the index of sets to the index of its unique dropset index
    bipsOfDropSets - all bips which disappear when dropset i is pruned
    scores - has all scores between 0 and 1 for the bips (however 0s can be found out by looking at all dropsets with link to dropset 0 (because we sort and it will always be the lowest))  
  */


  /***********************************************************************************/
  /* RF-OPT Initial Score Calculation */
  /***********************************************************************************/


  unsigned int bipsVectorLength;

  /* calculate the bitvector length for bips bitvector */
  if(numberOfBips % MASK_LENGTH == 0)
    bipsVectorLength = numberOfBips / MASK_LENGTH;
  else
    bipsVectorLength = 1 + (numberOfBips / MASK_LENGTH); 

  //Starting from index 1 (because 0 stands for all who already matches)
  //We need a score array saving the scores for each uniqset
  int* rf_score = (int*)rax_calloc(numberOfUniqueSets,sizeof(int));

  printf("==> Calculating the score for the first iteration \n \n");

  //Store all bvecs of all merged and destroyed bipartitions per DropSet 
  int* bvecs_bips = (int*)rax_malloc(sizeof(int)*numberOfUniqueSets);
  int* bvecs_destroyed = (int*)rax_malloc(sizeof(int)*numberOfUniqueSets);



  //Iterate through all sets
  for(int i = 0; i < numberOfUniqueSets; i++) {

    //Bitvectors of merged and destroyed
    int bvec_destroyed = 0;

    int* set = uniqSets[i]; //Get the dropset, first dropset is 0 (if something is matching)

    //printf(" ==> Analyze Unique DropSet %i \n", i);

    //We use this data structure to keep track of the to toggled bits
    int* toggleBits = (int*)rax_calloc(numberOfBips, sizeof(int));

    //Now iterate through the set
    int j = 0;

    //Stores the affected bips into a bitvector
    int bvec_bips = 0;

    while(set[j] != -1) {

      int taxa = set[j]; //Get the taxa
      //printf("  Taxa number is %i \n",taxa);

      //Check if set[j] is itself already a set
      int test[2] = {taxa,-1}; 

      //0 if it is not a set, index + 1 otherwise
      int test_index = contains(test, uniqSets, numberOfUniqueSets);

      if(test_index){
        //printf("  It also is in uniqSet %i \n", test_index - 1);
        bvec_bips = getBipsOfDropSet(bvec_bips, (test_index - 1), numberOfBipsPerSet, bipsOfDropSet);

      }

      //Get all bips of this taxa to detect which one will be destroyed
      int* listOfBips = bipsOfTaxa[taxa]; 

      //Go through all bipartitions containing this taxa
      for(int k = 0; k < numberOfBipsPerTaxa[taxa]; k++){

        int bipindex = listOfBips[k]; //Get the index of the Bipartition

        int bip = ind_bips[bipindex];

        //Now analyze this Bipartition

        //Which tree does this bipartition belongs too?
        int treenumber = treenumberOfBip[bipindex];

        //Get the taxonToSmallTree Array of this tree
        int* stTaxa = taxonToReductionList[treenumber];

        //Translate the global taxon number it into the local index used by our bips
        int translated_index = stTaxa[taxa - 1]; //We use taxa - 1 because we start counting at taxa 1 = 0 !

        //Save the to toggle index into toggleBits vector
        toggleBits[bipindex] |= 1 << translated_index;

        //Sort for bits set on one side of the bip and on the other side
        int leftBits = __builtin_popcount(toggleBits[bipindex] & bip);
        int rightBits = __builtin_popcount(toggleBits[bipindex]) - leftBits;

        //Check for the number of bits set in the original bip 
        int leftBip = __builtin_popcount(bip);
        int rightBip = taxaPerTree[treenumber] - leftBip;

        //Subtract the total number of bits set on one side of the bip with the bits we have to toggle
        int leftBip_after = leftBip - leftBits;
        int rightBip_after = rightBip - rightBits;

        //Check if bipartition gets trivial/destroyed due to pruning the taxa and set the bit (representing the bip) which is destroyed
        if((leftBip_after <= 1) | (rightBip_after <=1)) {

        //Add bips to the bits which represent destroyed bipartitions
        bvec_destroyed = setBit(bvec_destroyed,bipindex);

        }
      
      } 

      j++;

    }//End iterate through the set


    int penality = 0;
    int score = 0;

    int bvec_mask = 0;
    bvec_mask = setOffSet(bvec_mask, numberOfBips);

    //Bitvector of already matching bips
    int bvec_tmp = 0;
    bvec_tmp = ~bvec_scores & bvec_mask;

    //Penality score are all bitvectors who were matching but is destroyed 
    penality = __builtin_popcount(bvec_destroyed & bvec_tmp);

    //Now iterate through bipsOfDropSet list and extract all bips which will merge into a bitVector
    bvec_bips = getBipsOfDropSet(bvec_bips, i, numberOfBipsPerSet, bipsOfDropSet);

    //Calculate the bitvectors which remains
    bvec_tmp = ~bvec_destroyed & bvec_mask;

    bvec_tmp = bvec_bips & bvec_tmp;

    score = __builtin_popcount(bvec_scores & bvec_tmp);

    rf_score[i] = score - penality;

    //Save our results for convenience into an array
    bvecs_bips[i] = bvec_bips;
    bvecs_destroyed[i] = bvec_destroyed;

  }//End Score Calculation


  printf("======> Scores:\n");
  for(int i = 0; i < numberOfUniqueSets; i++) {
    printf("RF Score for %i : %i \n", i, rf_score[i]);
    //printBitVector(bvecs_bips[i]);
    //printBitVector(bvecs_destroyed[i]);
  }

  int maxDropSet = getMax(rf_score, numberOfUniqueSets);
  printf("Max Element is %i \n", maxDropSet);




  /***********************************************************************************/
  /* RF-OPT Create Update Data Structures */
  /***********************************************************************************/


  printf("====> Delete DropSet from all bips and update numbers \n");

  //Create a bitVector to store all deleted taxa
  int bvec_deletedTaxa = 0;

  //Create a bitVector to store all still existing bips
  int bvec_existingBips = 0;

  //Create a bitvector to store deleted dropsets
  int bvec_deletedDropSets = 0;

  //Get the dropset
  int* deleteDropSet = uniqSets[maxDropSet];

  //Store it into a BitVector
  bvec_deletedDropSets = setBit(bvec_deletedDropSets,maxDropSet);

  //Select all bips destroyed by removing this dropset
  int bvec_destroyedBips = bvecs_destroyed[maxDropSet];

  //Select all bips that are now matching
  int bvec_matchingBips = bvecs_bips[maxDropSet];

  //Filter for existent bipartitions
  bvec_existingBips = getExistingBips(bvec_existingBips, numberOfBips, bvec_destroyedBips);

  //Iterate through its taxa
  int iterSet = 0;
  while(deleteDropSet[iterSet] != -1) {

    //Get taxon
    int taxon = deleteDropSet[iterSet];

    //Store the taxon into deletedTaxa BitVector
    bvec_deletedTaxa = setBit(bvec_deletedTaxa, taxon - 1);

    //Check if taxon is inside
    int test[2] = {taxon, -1};

    int index = contains(test, uniqSets, numberOfUniqueSets);

    iterSet++;
  }

  //printBitVector(bvec_existingBips);
  //printBitVector(bvec_deletedTaxa);

  //Update the scores with now matching bips
  bvec_scores = bvec_scores & (~bvec_matchingBips);

  //printBitVector(bvec_scores);

  /* Short summary :
    bvec_existingBips - bitVector of all still existing bips
    bvec_deletedTaxa - bitVector to keep track of destroyed taxa
  */

  /***********************************************************************************/
  /* TODO RF-OPT Update function */
  /***********************************************************************************/

  
  /***********************************************************************************/
  /* End RF-OPT Update function */
  /***********************************************************************************/


  //printf("Ind Bipartitions?: ");


  // printf("Induced Bipartitions: ");

  // printBitVector(ind_bips[0]);
  // printBitVector(ind_bips[1]);
  // printBitVector(ind_bips[2]);
  // printBitVector(ind_bips[3]);
  // printBitVector(ind_bips[4]);
  // printBitVector(ind_bips[5]);
  // printBitVector(ind_bips[6]);


  /***********************************************************************************/
  /* Console Logs for debugging */
  /***********************************************************************************/

  //Printing if

  printf("==> Unique Sets: ");
  for(int i = 0; i < numberOfUniqueSets; i++) {
    int j = 0;
    int* set = uniqSets[i];
    while(set[j] > -1) {
      printf("%i ",set[j]);
      j++;
    }
    printf("; ");
  }
  printf("\n");

  printf("\n == Sets == \n");
  for(int fooo = 0; fooo < numberOfSets; fooo++){
    printf("Set %i: ", fooo);
    int i = 0;
    while(sets[fooo][i] > -1) {
     printf("%i ",sets[fooo][i]);
     i++;
    }
    printf("\n");
  }
  printf("\n");

      
    //#define _PRINT_
      
    #ifdef _PRINT_

    for(int i = 0; i < numberOfUniqueSets; i++) {
      printf("Bips of Set %i: ", i);
        for(int j = 0; j < numberOfBipsPerSet[i]; j++) {
          int* bips = bipsOfDropSet[i];
          printf("%i ", bips[j]);
        }
      printf("\n");
    }


    printf("Induced Bips! \n");
    // Now checking which dropset would destroy which bipartition 
    for(int i = 0 ; i < numberOfBips; i++) {
      printf("Bip %i is %i \n",i,ind_bips[i]);
    }


    printf("Taxa Names : \n");
    for(int i = 0; i < tr->mxtips + 1; i++) {
      printf("%s ",tr->nameList[i]);
    }
    printf("\n");

    printf("Small Tree Taxa Names 0 : \n");
    for(int i = 0; i < taxaPerTree[0]; i++) {
      int* list = smallTreeTaxaList[0];
      int taxa = list[i]; 
      printf("%s ",tr->nameList[taxa]);
    }
    printf("\n");

    printf("Small Tree Taxa Names 1 : \n");
    for(int i = 0; i < taxaPerTree[1]; i++) {
      int* list = smallTreeTaxaList[1];
      int taxa = list[i]; 
      printf("%s ",tr->nameList[taxa]);
    }
    printf("\n");

    printf("Small Tree Taxa Names 2 : \n");
    for(int i = 0; i < taxaPerTree[2]; i++) {
      int* list = smallTreeTaxaList[2];
      int taxa = list[i]; 
      printf("%s ",tr->nameList[taxa]);
    }
    printf("\n");

    printf("Number of DropSets extracted%i \n",dropSetCount);
    printf("Number of Bips extracted %i \n",currentBips);

    //Testing ...
    printf("Number of Sets is %i \n",numberOfSets);
    printf("Number of Unique Sets is %i \n",numberOfUniqueSets);

    printf("==> Testing bips of unique sets \n");
    for(int i = 0; i < numberOfUniqueSets; i++) {
      printf("Bips of Set %i: ", i);
        for(int j = 0; j < numberOfBipsPerSet[i]; j++) {
          int* bips = bipsOfDropSet[i];
          printf("%i ", bips[j]);
        }
      printf("\n");
    }

    printf("==> Testing bips of taxa \n");
    for(int i = 1; i < tr->mxtips + 1; i++) {
      printf("Bips of Taxa %i: ", i);
        for(int j = 0; j < numberOfBipsPerTaxa[i]; j++) {
        int* bips = bipsOfTaxa[i];
        printf("%i ", bips[j]);
        }
      printf("\n");
    }



  printf("==> Unique Sets: ");
  for(int i = 0; i < numberOfUniqueSets; i++) {
    int j = 0;
    int* set = uniqSets[i];
    while(set[j] > -1) {
      printf("%i ",set[j]);
      j++;
    }
    printf("; ");
  }
  printf("\n");

  printf("==> setsToUniqSets: ");
  for(int i = 0; i < numberOfSets; i++) {
    printf("%i ",setsToUniqSets[i]);
  }
  printf("\n");

  //=== TREE GRAPH CONSTRUCTION ENDS ===
  printf("Scores: ");
  printBitVector(bvec_scores);
  
  printf("BipsPerTree: ");
  for(int foo = 0; foo < tr->numberOfTrees; foo++) {

    printf("%i ",bipsPerTree[foo]);

  } 

  printf("\nInduced Bips: ");
  for(int foo = 0;foo < numberOfBips; foo++) {
    
    printf("%u ",ind_bips[foo]);
  
  }

  printf("\nSmall Tree Bips: ");
  for(int foo = 0;foo < numberOfBips; foo++) {
  
    printf("%u ",s_bips[foo]);

  }

  printf("\n == Sets == \n");
  for(int fooo = 0; fooo < numberOfSets; fooo++){
    printf("Set %i: ", fooo);
    int i = 0;
    while(sets[fooo][i] > -1) {
     printf("%i ",sets[fooo][i]);
     i++;
    }
    printf("\n");
  }
  printf("\n");

  #endif

  printBothOpen("Number of small trees skipped: %d\n\n", tr->numberOfTrees - numberOfTreesAnalyzed);
  
  printBothOpen("Average RF distance %f\n\n", avgRF / (double)numberOfTreesAnalyzed);
  
  printBothOpen("Large Tree: %i, Number of SmallTrees analyzed: %i \n\n", tr->mxtips, numberOfTreesAnalyzed); 
  
  printBothOpen("Total execution time: %f secs\n\n", gettime() - masterTime);
   
  printBothOpen("File containing all %d pair-wise RF distances written to file %s\n\n", numberOfTreesAnalyzed, rfFileName);

  printBothOpen("execution stats:\n\n");
  printBothOpen("Accumulated time Effective algorithm: %.5f sec \n", sumEffectivetime);
  printBothOpen("Average time for effective: %.10f sec \n",sumEffectivetime / (double)numberOfTreesAnalyzed);
  printBothOpen("Preprocessingtime: %0.5f sec \n\n", preprocessendtime);
 

  fclose(treeFile);
  fclose(rfFile);    
  
  /* free the data structure used for parsing the potentially multi-furcating tree */

  freeMultifurcations(smallTree);
  rax_free(smallTree);

  rax_free(taxonToLabel);
  rax_free(taxonToEulerIndex);
  rax_free(labelToTaxon);
  rax_free(eulerIndexToLabel);
  rax_free(taxonToReduction);
  rax_free(taxonHasDeg);
}
Пример #14
0
void computeAncestralStates(tree *tr, double referenceLikelihood, analdef *adef)
{
  int 
    counter = 0;
  
  char 
    treeFileName[2048],
    ancestralProbsFileName[2048],
    ancestralStatesFileName[2048];

  FILE
    *treeFile,
    *probsFile,
    *statesFile;

#ifdef _USE_PTHREADS
  tr->ancestralStates = (double*)malloc(getContiguousVectorLength(tr) * sizeof(double));
#endif

  /*  assert(!adef->compressPatterns);*/

  strcpy(ancestralProbsFileName,         workdir);
  strcpy(ancestralStatesFileName,         workdir);
  strcpy(treeFileName,         workdir);

  strcat(ancestralProbsFileName,         "RAxML_marginalAncestralProbabilities.");
  strcat(ancestralStatesFileName,        "RAxML_marginalAncestralStates.");
  strcat(treeFileName,                   "RAxML_nodeLabelledRootedTree.");

  strcat(ancestralProbsFileName,         run_id);
  strcat(ancestralStatesFileName,        run_id);
  strcat(treeFileName,                   run_id);
  
  probsFile = myfopen(ancestralProbsFileName,   "w");
  statesFile = myfopen(ancestralStatesFileName, "w");
  treeFile   = myfopen(treeFileName,            "w");

  assert(tr->leftRootNode == tr->rightRootNode->back);
  
  computeAncestralRec(tr, tr->leftRootNode, &counter, probsFile, statesFile, FALSE);
  computeAncestralRec(tr, tr->rightRootNode, &counter, probsFile, statesFile, FALSE);
  computeAncestralRec(tr, tr->rightRootNode, &counter, probsFile, statesFile, TRUE);
  
  evaluateGeneric(tr, tr->rightRootNode);

  if(fabs(tr->likelihood - referenceLikelihood) > 0.5)
    {
      printf("Something suspiciuous is going on with the marginal ancestral probability computations\n");
      assert(0);
    } 
  
  assert(counter == tr->mxtips - 1);
    
  ancestralTree(tr->tree_string, tr);

  fprintf(treeFile, "%s\n", tr->tree_string);

  fclose(probsFile);
  fclose(statesFile);
  fclose(treeFile);

  printBothOpen("Marginal Ancestral Probabilities written to file:\n%s\n\n", ancestralProbsFileName);
  printBothOpen("Ancestral Sequences based on Marginal Ancestral Probabilities written to file:\n%s\n\n", ancestralStatesFileName); 
  printBothOpen("Node-laballed ROOTED tree written to file:\n%s\n", treeFileName);
}
Пример #15
0
int treeReadLen (FILE *fp, tree *tr, boolean readBranches, boolean readNodeLabels, boolean topologyOnly, analdef *adef, boolean completeTree)
{
  nodeptr  
    p;
  
  int      
    i, 
    ch, 
    lcount = 0; 

  for (i = 1; i <= tr->mxtips; i++) 
    {
      tr->nodep[i]->back = (node *) NULL; 
      if(topologyOnly)
	tr->nodep[i]->support = -1;
    }

  for(i = tr->mxtips + 1; i < 2 * tr->mxtips; i++)
    {
      tr->nodep[i]->back = (nodeptr)NULL;
      tr->nodep[i]->next->back = (nodeptr)NULL;
      tr->nodep[i]->next->next->back = (nodeptr)NULL;
      tr->nodep[i]->number = i;
      tr->nodep[i]->next->number = i;
      tr->nodep[i]->next->next->number = i;

      if(topologyOnly)
	{
	  tr->nodep[i]->support = -2;
	  tr->nodep[i]->next->support = -2;
	  tr->nodep[i]->next->next->support = -2;
	}
    }

  if(topologyOnly)
    tr->start       = tr->nodep[tr->mxtips];
  else
    tr->start       = tr->nodep[1];

  tr->ntips       = 0;
  tr->nextnode    = tr->mxtips + 1;      
 
  for(i = 0; i < tr->numBranches; i++)
    tr->partitionSmoothed[i] = FALSE;
  
  tr->rooted      = FALSE;     

  p = tr->nodep[(tr->nextnode)++]; 
  
  while((ch = treeGetCh(fp)) != '(');
      
  if(!topologyOnly)
    assert(readBranches == FALSE && readNodeLabels == FALSE);
  
       
  if (! addElementLen(fp, tr, p, readBranches, readNodeLabels, &lcount))                 
    assert(0);
  if (! treeNeedCh(fp, ',', "in"))                
    assert(0);
  if (! addElementLen(fp, tr, p->next, readBranches, readNodeLabels, &lcount))
    assert(0);
  if (! tr->rooted) 
    {
      if ((ch = treeGetCh(fp)) == ',') 
	{ 
	  if (! addElementLen(fp, tr, p->next->next, readBranches, readNodeLabels, &lcount))
	    assert(0);	    
	}
      else 
	{                                    /*  A rooted format */
	  tr->rooted = TRUE;
	  if (ch != EOF)  (void) ungetc(ch, fp);
	}	
    }
  else 
    {      
      p->next->next->back = (nodeptr) NULL;
    }
  if (! treeNeedCh(fp, ')', "in"))                
    assert(0);

  if(topologyOnly)
    assert(!(tr->rooted && readNodeLabels));

  (void) treeFlushLabel(fp);
  
  if (! treeFlushLen(fp))                         
    assert(0);
 
  if (! treeNeedCh(fp, ';', "at end of"))       
    assert(0);
  
  if (tr->rooted) 
    {     
      assert(!readNodeLabels);

      p->next->next->back = (nodeptr) NULL;      
      tr->start = uprootTree(tr, p->next->next, FALSE, FALSE);      
      if (! tr->start)                              
	{
	  printf("FATAL ERROR UPROOTING TREE\n");
	  assert(0);
	}    
    }
  else    
    tr->start = findAnyTip(p, tr->rdta->numsp);    
  
   if(!topologyOnly)
    {
      setupPointerMesh(tr);

      assert(tr->ntips <= tr->mxtips);
      

      if(tr->ntips < tr->mxtips)
	{
	  if(completeTree)
	    {
	      printBothOpen("Hello this is your friendly RAxML tree parsing routine\n");
	      printBothOpen("The RAxML option you are uisng requires to read in only complete trees\n");
	      printBothOpen("with %d taxa, there is at least one tree with %d taxa though ... exiting\n", tr->mxtips, tr->ntips);
	      exit(-1);
	    }
	  else
	    {
	      if(adef->computeDistance)
		{
		  printBothOpen("Error: pairwise distance computation only allows for complete, i.e., containing all taxa\n");
		  printBothOpen("bifurcating starting trees\n");
		  exit(-1);
		}     
	      if(adef->mode == CLASSIFY_ML)
		{	 
		  printBothOpen("RAxML classifier Algo: You provided a reference tree with %d taxa; alignmnet has %d taxa\n", tr->ntips, tr->mxtips);
		  printBothOpen("%d query taxa will be classifed under ML\n", tr->mxtips - tr->ntips);
		  classifyML(tr, adef);	  
		}
	      else
		{
		  printBothOpen("You provided an incomplete starting tree %d alignmnet has %d taxa\n", tr->ntips, tr->mxtips);	  
		  makeParsimonyTreeIncomplete(tr, adef);	 		 
		}    
	    }
	}
      else
	{
	  if(adef->mode == PARSIMONY_ADDITION)
	    {
	      printBothOpen("Error you want to add sequences to a trees via MP stepwise addition, but \n");
	      printBothOpen("you have provided an input tree that already contains all taxa\n");
	      exit(-1);
	    }
	  if(adef->mode == CLASSIFY_ML)
	    {
	      printBothOpen("Error you want to classify query sequences into a tree via ML, but \n");
	      printBothOpen("you have provided an input tree that already contains all taxa\n");
	      exit(-1);
	    }
	}
   
      onlyInitrav(tr, tr->start);
    }
 
  
  return lcount;
}
Пример #16
0
int treeReadLen (FILE *fp, tree *tr, boolean readBranches, boolean readNodeLabels, boolean topologyOnly, analdef *adef, boolean completeTree, boolean storeBranchLabels)
{
  nodeptr  
    p;
  
  int 
    i, 
    ch, 
    lcount = 0; 

  tr->branchLabelCounter = 0;

  for (i = 1; i <= tr->mxtips; i++) 
    {
      tr->nodep[i]->back = (node *) NULL; 
      if(topologyOnly)
	tr->nodep[i]->support = -1;
    }

  for(i = tr->mxtips + 1; i < 2 * tr->mxtips; i++)
    {
      tr->nodep[i]->back = (nodeptr)NULL;
      tr->nodep[i]->next->back = (nodeptr)NULL;
      tr->nodep[i]->next->next->back = (nodeptr)NULL;
      tr->nodep[i]->number = i;
      tr->nodep[i]->next->number = i;
      tr->nodep[i]->next->next->number = i;

      if(topologyOnly)
	{
	  tr->nodep[i]->support = -2;
	  tr->nodep[i]->next->support = -2;
	  tr->nodep[i]->next->next->support = -2;
	}
    }

  if(topologyOnly)
    tr->start       = tr->nodep[tr->mxtips];
  else
    tr->start       = tr->nodep[1];

  tr->ntips       = 0;
  tr->nextnode    = tr->mxtips + 1;      
 
  for(i = 0; i < tr->numBranches; i++)
    tr->partitionSmoothed[i] = FALSE;
  
  tr->rooted      = FALSE;  
  tr->wasRooted   = FALSE;

  p = tr->nodep[(tr->nextnode)++]; 
  
  while((ch = treeGetCh(fp)) != '(');
      
  if(!topologyOnly)
    {
      if(adef->mode != CLASSIFY_ML)
	{
	  if(adef->mode != OPTIMIZE_BR_LEN_SCALER)
	    assert(readBranches == FALSE && readNodeLabels == FALSE);
	  else		 
	    assert(readBranches == TRUE && readNodeLabels == FALSE);		
	}
      else
	{
	  if(adef->useBinaryModelFile)
	    assert(readBranches == TRUE && readNodeLabels == FALSE);		
	  else
	    assert(readBranches == FALSE && readNodeLabels == FALSE);
	}
    }
  
       
  if (! addElementLen(fp, tr, p, readBranches, readNodeLabels, &lcount, adef, storeBranchLabels))                 
    assert(0);
  if (! treeNeedCh(fp, ',', "in"))                
    assert(0);
  if (! addElementLen(fp, tr, p->next, readBranches, readNodeLabels, &lcount, adef, storeBranchLabels))
    assert(0);
  if (! tr->rooted) 
    {
      if ((ch = treeGetCh(fp)) == ',') 
	{ 
	  if (! addElementLen(fp, tr, p->next->next, readBranches, readNodeLabels, &lcount, adef, storeBranchLabels))
	    assert(0);	    
	}
      else 
	{  	  
	  /*  A rooted format */
	  
	  tr->rooted = TRUE;
	  tr->wasRooted     = TRUE;
	  
	  if (ch != EOF)  (void) ungetc(ch, fp);
	}	
    }
  else 
    {            
      p->next->next->back = (nodeptr) NULL;
      tr->wasRooted     = TRUE;    
    }

  if(!tr->rooted && adef->mode == ANCESTRAL_STATES)
    {
      printf("Error: The ancestral state computation mode requires a rooted tree as input, exiting ....\n");
      exit(0);
    }

  if (! treeNeedCh(fp, ')', "in"))                
    assert(0);

  if(topologyOnly)
    assert(!(tr->rooted && readNodeLabels));

  (void) treeFlushLabel(fp);
  
  if (! treeFlushLen(fp, tr))                         
    assert(0);
 
  if (! treeNeedCh(fp, ';', "at end of"))       
    assert(0);
  
  if (tr->rooted) 
    {     
      assert(!readNodeLabels);

      p->next->next->back = (nodeptr) NULL;      
      tr->start = uprootTree(tr, p->next->next, readBranches, FALSE);      

       
      /*tr->leftRootNode  = p->back;
	tr->rightRootNode = p->next->back;   
      */

      if (! tr->start)                              
	{
	  printf("FATAL ERROR UPROOTING TREE\n");
	  assert(0);
	}    
    }
  else    
    tr->start = findAnyTip(p, tr->rdta->numsp);    
  
   if(!topologyOnly || adef->mode == CLASSIFY_MP)
    {      
      assert(tr->ntips <= tr->mxtips);
      

      if(tr->ntips < tr->mxtips)
	{
	  if(completeTree)
	    {
	      printBothOpen("Hello this is your friendly RAxML tree parsing routine\n");
	      printBothOpen("The RAxML option you are uisng requires to read in only complete trees\n");
	      printBothOpen("with %d taxa, there is at least one tree with %d taxa though ... exiting\n", tr->mxtips, tr->ntips);
	      exit(-1);
	    }
	  else
	    {
	      if(adef->computeDistance)
		{
		  printBothOpen("Error: pairwise distance computation only allows for complete, i.e., containing all taxa\n");
		  printBothOpen("bifurcating starting trees\n");
		  exit(-1);
		}     
	      if(adef->mode == CLASSIFY_ML || adef->mode == CLASSIFY_MP)
		{	 
		  printBothOpen("RAxML placement algorithm: You provided a reference tree with %d taxa; alignmnet has %d taxa\n", tr->ntips, tr->mxtips);		  
		  printBothOpen("%d query taxa will be placed using %s\n", tr->mxtips - tr->ntips, (adef->mode == CLASSIFY_ML)?"maximum likelihood":"parsimony");
		  if(adef->mode == CLASSIFY_ML)
		    classifyML(tr, adef);	  
		  else
		    {
		      assert(adef->mode == CLASSIFY_MP);
		      classifyMP(tr, adef);
		    }
		}
	      else
		{
		  printBothOpen("You provided an incomplete starting tree %d alignmnet has %d taxa\n", tr->ntips, tr->mxtips);	  
		  makeParsimonyTreeIncomplete(tr, adef);	 		 
		}    
	    }
	}
      else
	{
	  if(adef->mode == PARSIMONY_ADDITION)
	    {
	      printBothOpen("Error you want to add sequences to a trees via MP stepwise addition, but \n");
	      printBothOpen("you have provided an input tree that already contains all taxa\n");
	      exit(-1);
	    }
	  if(adef->mode == CLASSIFY_ML || adef->mode == CLASSIFY_MP)
	    {
	      printBothOpen("Error you want to place query sequences into a tree using %s, but\n", tr->mxtips - tr->ntips, (adef->mode == CLASSIFY_ML)?"maximum likelihood":"parsimony");
	      printBothOpen("you have provided an input tree that already contains all taxa\n");
	      exit(-1);
	    }
	}
   
      onlyInitrav(tr, tr->start);
    }
 
  
  return lcount;
}
Пример #17
0
static prop proposal(state * instate)
/* so here the idea would be to randomly choose among proposals? we can use typedef enum to label each, and return that */ 
{
  double randprop = (double)rand()/(double)RAND_MAX;
  boolean proposalSuccess;
  //double start_LH = evaluateGeneric(instate->tr, instate->tr->start); /* for validation */
  prop proposal_type;
  //simple proposal
  if(randprop < 0.25) 
  {
    if(randprop < 0.2)//TOPOLOGICAL MOVE
      {
	if(randprop > 0.1)//SPR MOVE
	  {
	    proposal_type = SPR;
	    // printBothOpen("Propose SPR\n");
	    if (randprop < 0.15)
	      instate->maxradius = 1;
	    else
	      instate->maxradius = 2;
	    
	    doSPR(instate->tr, instate);

	    proposalSuccess = TRUE;

	    /* TODO */
	    /*proposalSuccess = simpleNodeProposal(instate);*/
	  }
	else
	  {
	    proposal_type = stNNI;
	    proposalSuccess = stNNIproposal(instate); 
	    if(proposalSuccess == FALSE)
	      {
		/* TODOFER this came up with ds 20 and GTRPSR, see why */
		printBothOpen("WARNING!! stNNI proposal failed, doing SPR\n");
		proposal_type = SPR;
		instate->maxradius = 1;
		proposalSuccess = simpleNodeProposal(instate);
	      }
	  }
	if(proposalSuccess == FALSE)
	  {
	    assert(FALSE); // this should either never happen or look below and return PROPOSAL_FAILED to react accordingly
	  }
	else
	  {
	    /* A moved has been made, previous state is in instate */
	    if(proposal_type != stNNI) /*TODOFER delete this when bl are changed*/
	      assert(instate->tr->startLH != instate->tr->likelihood);
	  }
      }
    else{//MODEL
      proposal_type = UPDATE_MODEL;
      simpleModelProposal(instate);
    }
  }
  else
    {
      if(randprop < 0.95)//UPDATE_ALL_BL
	{
	  proposal_type = UPDATE_ALL_BL;
	  instate->bl_prior = 0;
	  //printBothOpen("Propose BL_UPDATE\n");
	  assert(proposal_type == UPDATE_ALL_BL);
	  proposalSuccess = simpleBranchLengthProposal(instate);
	  assert(instate->tr->startLH != instate->tr->likelihood);
	  assert(proposalSuccess);
	}
      else//GAMMA
	{
	  proposal_type = UPDATE_GAMMA;
	  simpleGammaProposal(instate);
	}
    }
  //record the curprior
  instate->newprior = instate->bl_prior;
  return proposal_type;
}
Пример #18
0
void doInference(tree *tr, analdef *adef, rawdata *rdta, cruncheddata *cdta)
{
  int i, n;

#ifdef _WAYNE_MPI
  int 
    j,
    bestProcess;
#endif

  double loopTime;
  topolRELL_LIST *rl = (topolRELL_LIST *)NULL; 
  int 
    best = -1,
    newBest = -1;
  double 
    bestLH = unlikely; 
  FILE *f;
  char bestTreeFileName[1024]; 
  double overallTime;

  n = adef->multipleRuns;
     
#ifdef _WAYNE_MPI
  if(n % processes != 0)
    n = processes * ((n / processes) + 1);
#endif 

  if(!tr->catOnly)
    {
      rl = (topolRELL_LIST *)rax_malloc(sizeof(topolRELL_LIST));
      initTL(rl, tr, n);
    }

#ifdef _WAYNE_MPI
  long parsimonySeed0 = adef->parsimonySeed;
  n = n / processes;
#endif

  if(adef->rellBootstrap)
    { 
#ifdef _WAYNE_MPI          
      tr->resample = permutationSH(tr, NUM_RELL_BOOTSTRAPS, parsimonySeed0 + 10000 * processID); 
#else     
      tr->resample = permutationSH(tr, NUM_RELL_BOOTSTRAPS, adef->parsimonySeed);        
#endif

       tr->rellTrees = (treeList *)rax_malloc(sizeof(treeList));
       initTreeList(tr->rellTrees, tr, NUM_RELL_BOOTSTRAPS);
    }
  else
    {
      tr->resample = (int *)NULL;
      tr->rellTrees =  (treeList *)NULL;
    }

  for(i = 0; i < n; i++)
    { 
#ifdef _WAYNE_MPI 
      if(i == 0)
        { 
          if(parsimonySeed0 != 0) 
            adef->parsimonySeed = parsimonySeed0 + 10000 * processID;
        }
      j = i + n * processID;
      tr->treeID = j;
#else    
      tr->treeID = i;
#endif

      tr->checkPointCounter = 0;
         
      loopTime = gettime();
                                             
      initModel(tr, rdta, cdta, adef); 

      if(i == 0)
	printBaseFrequencies(tr);
     
      getStartingTree(tr, adef); 
                       
      computeBIGRAPID(tr, adef, TRUE);  

#ifdef _WAYNE_MPI
      if(tr->likelihood > bestLH)
	{
	  best = j;
	  bestLH = tr->likelihood;
	}

      if(!tr->catOnly)
	saveTL(rl, tr, j);
#else
      if(tr->likelihood > bestLH)
	{
	  best = i;
	  bestLH = tr->likelihood;
	}

      if(!tr->catOnly)
	saveTL(rl, tr, i);
#endif

      loopTime = gettime() - loopTime; 
      writeInfoFile(adef, tr, loopTime);
     
    }     
 
  assert(best >= 0);

#ifdef _WAYNE_MPI
  MPI_Barrier(MPI_COMM_WORLD);
  n = n * processes;
#endif

  if(tr->catOnly)
    {
      printBothOpenMPI("\n\nNOT conducting any final model optimizations on all %d trees under CAT-based model ....\n", n);
      printBothOpenMPI("\nREMEMBER that CAT-based likelihood scores are meaningless!\n\n", n);        
#ifdef _WAYNE_MPI
      if(processID != 0)
        {
          MPI_Finalize();
          exit(0);
        }
#endif
    }
  else
    {
      printBothOpenMPI("\n\nConducting final model optimizations on all %d trees under GAMMA-based models ....\n\n", n);
 
#ifdef _WAYNE_MPI
      n = n / processes;
#endif

      if(tr->rateHetModel == GAMMA ||  tr->rateHetModel == GAMMA_I)
	{
	  restoreTL(rl, tr, best);
	  evaluateGenericInitrav(tr, tr->start);
	  if(!adef->useBinaryModelFile)
	    modOpt(tr, adef, FALSE, adef->likelihoodEpsilon); 
	  else
	    {
	      readBinaryModel(tr, adef);
	      evaluateGenericInitrav(tr, tr->start);
	      treeEvaluate(tr, 2);
	    }
	  bestLH = tr->likelihood;
	  tr->likelihoods[best] = tr->likelihood;
	  saveTL(rl, tr, best);
	  tr->treeID = best; 
	  printResult(tr, adef, TRUE);
	  newBest = best;      
	  
	  for(i = 0; i < n; i++)
	    {
#ifdef _WAYNE_MPI
	      j = i + n * processID;
	      if(j != best)
		{
		  restoreTL(rl, tr, j);
		  evaluateGenericInitrav(tr, tr->start);
		  treeEvaluate(tr, 1);
		  tr->likelihoods[j] = tr->likelihood;
		  
		  if(tr->likelihood > bestLH)
		    {
		      newBest = j;
		      bestLH = tr->likelihood;		  
		      saveTL(rl, tr, j);
		    }
		  tr->treeID = j;
		  printResult(tr, adef, TRUE);
		}
	      if(n == 1 && processes == 1)
		printBothOpen("Inference[%d] final GAMMA-based Likelihood: %f tree written to file %s\n", i, tr->likelihoods[i], resultFileName);	   
	      else	    
		printBothOpen("Inference[%d] final GAMMA-based Likelihood: %f tree written to file %s.RUN.%d\n", j, tr->likelihoods[j], resultFileName, j);
#else	  
	      if(i != best)
		{
		  restoreTL(rl, tr, i);
		  evaluateGenericInitrav(tr, tr->start);
		  treeEvaluate(tr, 1);
		  tr->likelihoods[i] = tr->likelihood;
		  
		  if(tr->likelihood > bestLH)
		    {
		      newBest = i;
		      bestLH = tr->likelihood;		  
		      saveTL(rl, tr, i);
		    }
		  tr->treeID = i;
		  printResult(tr, adef, TRUE);
		}

	      
	      if(n == 1)
		printBothOpen("Inference[%d] final GAMMA-based Likelihood: %f tree written to file %s\n", i, tr->likelihoods[i], resultFileName);	   
	      else	    
		printBothOpen("Inference[%d] final GAMMA-based Likelihood: %f tree written to file %s.RUN.%d\n", i, tr->likelihoods[i], resultFileName, i);
#endif	    	 
	    }    
	}
      else
	{     
	  catToGamma(tr, adef);
	  
#ifdef _WAYNE_MPI
	  for(i = 0; i < n; i++)
            {
              j = i + n*processID;
	      rl->t[j]->likelihood = unlikely;
            }  
#else
	  for(i = 0; i < n; i++)
	    rl->t[i]->likelihood = unlikely;
#endif
	  
	  initModel(tr, rdta, cdta, adef);
	  
	  restoreTL(rl, tr, best);      
	  
	  resetBranches(tr);
	  evaluateGenericInitrav(tr, tr->start);
	  modOpt(tr, adef, TRUE, adef->likelihoodEpsilon);      
	  tr->likelihoods[best] = tr->likelihood;
	  bestLH = tr->likelihood;     
	  saveTL(rl, tr, best);
	  tr->treeID = best;
	  printResult(tr, adef, TRUE);
	  newBest = best;
	  
	  for(i = 0; i < n; i++)
	    {
#ifdef _WAYNE_MPI
	      j = i + n*processID;
	      if(j != best)
		{
		  restoreTL(rl, tr, j);	    
		  resetBranches(tr);
		  evaluateGenericInitrav(tr, tr->start);
		  treeEvaluate(tr, 2);
		  tr->likelihoods[j] = tr->likelihood;
		  
		  if(tr->likelihood > bestLH)
		    { 
		      newBest = j;
		      bestLH = tr->likelihood;		
		      saveTL(rl, tr, j);	  
		    }
		  tr->treeID = j;
		  printResult(tr, adef, TRUE);
		} 
	      
	      if(n == 1 && processes == 1)	    
		printBothOpen("Inference[%d] final GAMMA-based Likelihood: %f tree written to file %s\n", i, tr->likelihoods[i], resultFileName);
	      else
		printBothOpen("Inference[%d] final GAMMA-based Likelihood: %f tree written to file %s.RUN.%d\n", j, tr->likelihoods[j], resultFileName, j);
#else
	      if(i != best)
		{
		  restoreTL(rl, tr, i);	    
		  resetBranches(tr);
		  evaluateGenericInitrav(tr, tr->start);
		  treeEvaluate(tr, 2);
		  tr->likelihoods[i] = tr->likelihood;
		  
		  if(tr->likelihood > bestLH)
		    { 
		      newBest = i;
		      bestLH = tr->likelihood;		
		      saveTL(rl, tr, i);	  
		    }
		  tr->treeID = i;
		  printResult(tr, adef, TRUE);
		} 
	      
	      if(n == 1)	    
		printBothOpen("Inference[%d] final GAMMA-based Likelihood: %f tree written to file %s\n", i, tr->likelihoods[i], resultFileName);
	      else
		printBothOpen("Inference[%d] final GAMMA-based Likelihood: %f tree written to file %s.RUN.%d\n", i, tr->likelihoods[i], resultFileName, i);	   	  
#endif
	    }
	}     
    
      assert(newBest >= 0);

#ifdef _WAYNE_MPI
      if(processes > 1)
	{
	  double 
	    *buffer = (double *)rax_malloc(sizeof(double) * processes);
	  for(i = 0; i < processes; i++)
	    buffer[i] = unlikely;
	  buffer[processID] = bestLH;
	  for(i = 0; i < processes; i++)
	    MPI_Bcast(&buffer[i], 1, MPI_DOUBLE, i, MPI_COMM_WORLD);
	  bestLH = buffer[0];
	  bestProcess = 0;
	  for(i = 1; i < processes; i++)
	    if(buffer[i] > bestLH)
	      {
		bestLH = buffer[i];
		bestProcess = i;
	      }
	  
	  rax_free(buffer);	  	  
	}

      if(processID == bestProcess)
	{
#endif

	  restoreTL(rl, tr, newBest);
	  evaluateGenericInitrav(tr, tr->start);
     
	  printBothOpen("\n\nStarting final GAMMA-based thorough Optimization on tree %d likelihood %f .... \n\n", newBest, tr->likelihoods[newBest]);
	  
	  Thorough = 1;
	  tr->doCutoff = FALSE; 
	  treeOptimizeThorough(tr, 1, 10); 
	  evaluateGenericInitrav(tr, tr->start);
	  
	  printBothOpen("Final GAMMA-based Score of best tree %f\n\n", tr->likelihood); 
	  
	  
	  strcpy(bestTreeFileName, workdir); 
	  strcat(bestTreeFileName, "RAxML_bestTree.");
	  strcat(bestTreeFileName,         run_id);
	  
	  
	  Tree2String(tr->tree_string, tr, tr->start->back, TRUE, TRUE, FALSE, FALSE, TRUE, adef, SUMMARIZE_LH, FALSE, FALSE, FALSE, FALSE);
	  
	  f = myfopen(bestTreeFileName, "wb");
	  fprintf(f, "%s", tr->tree_string);
	  fclose(f);
	  
	  if(adef->perGeneBranchLengths)
	    printTreePerGene(tr, adef, bestTreeFileName, "w");      
#ifdef _WAYNE_MPI
	}
#endif
    }

  if(adef->rellBootstrap)
    {
      //WARNING the functions below need to be invoked after all other trees have been printed
      //don't move this part of the code further up!

      int
	i;
#ifdef _WAYNE_MPI
      FILE 
	*f = myfopen(rellBootstrapFileNamePID, "wb");      
#else
      FILE 
	*f = myfopen(rellBootstrapFileName, "wb");
#endif
      
      for(i = 0; i < NUM_RELL_BOOTSTRAPS; i++)
	{
	   restoreTreeList(tr->rellTrees, tr, i);
	   Tree2String(tr->tree_string, tr, tr->start->back, FALSE, TRUE, FALSE, FALSE, TRUE, adef, SUMMARIZE_LH, FALSE, FALSE, FALSE, FALSE);
	   fprintf(f, "%s", tr->tree_string);
	}

      freeTreeList(tr->rellTrees);
      rax_free(tr->rellTrees);
      rax_free(tr->resample);

      fclose(f);

#ifdef _WAYNE_MPI      
      MPI_Barrier(MPI_COMM_WORLD);      
      
      concatenateBSFiles(processes, rellBootstrapFileName);
      removeBSFiles(processes, rellBootstrapFileName);  
      
      MPI_Barrier(MPI_COMM_WORLD); 
      
      if(processID == 0)
	printBothOpen("\nRELL bootstraps written to file %s\n", rellBootstrapFileName);
#else
      printBothOpen("\nRELL bootstraps written to file %s\n", rellBootstrapFileName);    
#endif
    }
  
#ifdef _WAYNE_MPI 
  if(processID == bestProcess)
    {
#endif
      overallTime = gettime() - masterTime;
      
      printBothOpen("Program execution info written to %s\n", infoFileName);
      
      if(!tr->catOnly)
	{
	  printBothOpen("Best-scoring ML tree written to: %s\n\n", bestTreeFileName);
	  
	  if(adef->perGeneBranchLengths && tr->NumberOfModels > 1)    
	    printBothOpen("Per-Partition branch lengths of best-scoring ML tree written to %s.PARTITION.0 to  %s.PARTITION.%d\n\n", bestTreeFileName,  bestTreeFileName, 
			  tr->NumberOfModels - 1);  
	}
      
      printBothOpen("Overall execution time: %f secs or %f hours or %f days\n\n", overallTime, overallTime/3600.0, overallTime/86400.0);    
#ifdef _WAYNE_MPI 
    }
#endif

  if(!tr->catOnly)
    {
      freeTL(rl);   
      rax_free(rl); 
    }
  
#ifdef _WAYNE_MPI
  MPI_Finalize();
#endif
  exit(0);
}
Пример #19
0
void mcmc(tree *tr, analdef *adef)
{
  int i=0;

  tr->startLH = tr->likelihood;
  printBothOpen("start minimalistic search with LH %f\n", tr->likelihood);
  printBothOpen("tr LH %f, startLH %f\n", tr->likelihood, tr->startLH);
  
  int insert_id;
  int j;

  int maxradius = 30;
  int accepted_spr = 0, accepted_nni = 0, accepted_bl = 0, accepted_model = 0, accepted_gamma = 0, inserts = 0;
  int rejected_spr = 0, rejected_nni = 0, rejected_bl = 0, rejected_model = 0, rejected_gamma = 0;
  int num_moves = 10000;
  boolean proposalAccepted;
  boolean proposalSuccess;
  prop which_proposal;
  double testr;
  double acceptance;

  srand (440);
  double totalTime = 0.0, proposalTime = 0.0, blTime = 0.0, printTime = 0.0;
  double t_start = gettime();
  double t;


  //allocate states
  double bl_prior_exp_lambda = 0.1;
  double bl_sliding_window_w = 0.005;
  double gm_sliding_window_w = 0.75;
  double rt_sliding_window_w = 0.5;
  state *curstate = state_init(tr, adef, maxradius, bl_sliding_window_w, rt_sliding_window_w, gm_sliding_window_w, bl_prior_exp_lambda);
  printStateFileHeader(curstate);
  set_start_bl(curstate);
  printf("start bl_prior: %f\n",curstate->bl_prior);
  set_start_prior(curstate);
  curstate->hastings = 1;//needs to be set by the proposal when necessary

  /* Set the starting LH with a full traversal */
  evaluateGeneric(tr, tr->start, TRUE);	 
  tr->startLH = tr->likelihood;
  printBothOpen("Starting with tr LH %f, startLH %f\n", j, tr->likelihood, tr->startLH);

  /* Set reasonable model parameters */
  evaluateGeneric(curstate->tr, curstate->tr->start, FALSE); // just for validation 
  printBothOpen("tr LH before modOpt %f\n",curstate->tr->likelihood);
  printSubsRates(curstate->tr, curstate->model, curstate->numSubsRates);

  /* optimize the model with Brents method for reasonable starting points */
  modOpt(curstate->tr, curstate->adef, 5.0); /* not by proposal, just using std raxml machinery... */
  evaluateGeneric(curstate->tr, curstate->tr->start, FALSE); // just for validation 
  printBothOpen("tr LH after modOpt %f\n",curstate->tr->likelihood);
  printSubsRates(curstate->tr, curstate->model, curstate->numSubsRates);
  recordSubsRates(curstate->tr, curstate->model, curstate->numSubsRates, curstate->curSubsRates);

  int first = 1;
  /* beginning of the MCMC chain */
  for(j=0; j<num_moves; j++)
  {
    //printBothOpen("iter %d, tr LH %f, startLH %f\n",j, tr->likelihood, tr->startLH);
    //printRecomTree(tr, TRUE, "startiter");
    proposalAccepted = FALSE;
    t = gettime(); 

    /*
      evaluateGeneric(tr, tr->start); // just for validation 
      printBothOpen("before proposal, iter %d tr LH %f, startLH %f\n", j, tr->likelihood, tr->startLH);
    */

    which_proposal = proposal(curstate);
    if (first == 1)
    {
      first = 0;
      curstate->curprior = curstate->newprior;
    }
   //printBothOpen("proposal done, iter %d tr LH %f, startLH %f\n", j, tr->likelihood, tr->startLH);
    assert(which_proposal == SPR || which_proposal == stNNI ||
           which_proposal == UPDATE_ALL_BL || 
           which_proposal == UPDATE_MODEL || which_proposal == UPDATE_GAMMA);
    proposalTime += gettime() - t;
    /* decide upon acceptance */
    testr = (double)rand()/(double)RAND_MAX;
    //should look something like 
    acceptance = fmin(1,(curstate->hastings) * 
		      (exp(curstate->newprior-curstate->curprior)) * (exp(curstate->tr->likelihood-curstate->tr->startLH)));
    
    /*
      //printRecomTree(tr, FALSE, "after proposal");
      printBothOpen("after proposal, iter %d tr LH %f, startLH %f\n", j, tr->likelihood, tr->startLH);
    */
    if(testr < acceptance)
    {
      proposalAccepted = TRUE;

      switch(which_proposal)
	{
	case SPR:      
	  //printRecomTree(tr, TRUE, "after accepted");
	  // printBothOpen("SPR new topology , iter %d tr LH %f, startLH %f\n", j, tr->likelihood, tr->startLH);
	   accepted_spr++;
	  break;
	case stNNI:	  
	  printBothOpen("NNI new topology , iter %d tr LH %f, startLH %f\n", j, tr->likelihood, tr->startLH);
	  accepted_nni++;
	  break;
	case UPDATE_ALL_BL:	  
	  //      printBothOpen("BL new , iter %d tr LH %f, startLH %f\n", j, tr->likelihood, tr->startLH);
	  accepted_bl++;
	  break;
	case UPDATE_MODEL:      
	  //	printBothOpen("Model new, iter %d tr LH %f, startLH %f\n", j, tr->likelihood, tr->startLH);
	  accepted_model++;
	  break;
	case UPDATE_GAMMA:      
	  //	printBothOpen("Gamma new, iter %d tr LH %f, startLH %f\n", j, tr->likelihood, tr->startLH);
	  accepted_gamma++;
	  break;
	default:
	  assert(0);
	}

      curstate->tr->startLH = curstate->tr->likelihood;  //new LH
      curstate->curprior = curstate->newprior;          
    }
    else
    {
      //printBothOpen("rejected , iter %d tr LH %f, startLH %f, %i \n", j, tr->likelihood, tr->startLH, which_proposal);
      resetState(which_proposal,curstate);
      
      switch(which_proposal)
	{
	case SPR:
	  rejected_spr++;
	  break;
	case stNNI:
	  rejected_nni++;
	  break;
	case UPDATE_ALL_BL:
	  rejected_bl++;
	  break;
	case UPDATE_MODEL:
	  rejected_model++;
	  break;
	case UPDATE_GAMMA:
	  rejected_gamma++;
	  break;
	default:
	  assert(0);
	}
      
      evaluateGeneric(tr, tr->start, FALSE); 
      
      // just for validation 

      if(fabs(curstate->tr->startLH - tr->likelihood) > 1.0E-10)
      {
        printBothOpen("WARNING: LH diff %.10f\n", curstate->tr->startLH - tr->likelihood);
      }
      //printRecomTree(tr, TRUE, "after reset");
      //printBothOpen("after reset, iter %d tr LH %f, startLH %f\n", j, tr->likelihood, tr->startLH);
      assert(fabs(curstate->tr->startLH - tr->likelihood) < 1.0E-10);
    }       
    inserts++;
    
    /* need to print status */
    if (j % 50 == 0)
    {
      t = gettime(); 
      printBothOpen("sampled at iter %d, tr LH %f, startLH %f, prior %f, incr %f\n",j, tr->likelihood, tr->startLH, curstate->curprior, tr->likelihood - tr->startLH);
      boolean printBranchLengths = TRUE;
      /*printSimpleTree(tr, printBranchLengths, adef);*/
      //TODO: print some parameters to a file 
      printStateFile(j,curstate);
      printTime += gettime() - t;
    }
  }

  t = gettime(); 
  treeEvaluate(tr, 1);
  blTime += gettime() - t;
  printBothOpen("accepted SPR %d, accepted stNNI %d, accepted BL %d, accepted model %d, accepted gamma %d, num moves tried %d, SPRs with max radius %d\n", 
		accepted_spr, accepted_nni, accepted_bl, accepted_model, accepted_gamma, num_moves, maxradius);
  printBothOpen("rejected SPR %d, rejected stNNI %d, rejected BL %d, rejected model %d, rejected gamma %d\n",
		rejected_spr, rejected_nni, rejected_bl, rejected_model, rejected_gamma);
  printBothOpen("ratio SPR %f, ratio stNNI %f,  ratio BL %f, ratio model %f, ratio gamma %f\n",
		accepted_spr/(double)(rejected_spr+accepted_spr), accepted_nni/(double)(rejected_nni+accepted_nni), accepted_bl/(double)(rejected_bl+accepted_bl), 
		accepted_model/(double)(rejected_model+accepted_model), accepted_gamma/(double)(rejected_gamma+accepted_gamma));
  printBothOpen("total  %f, BL %f, printing %f, proposal %f\n", gettime()- t_start, blTime, printTime, proposalTime);
  assert(inserts == num_moves);
  state_free(curstate);
}
void handleExcludeFile(tree *tr, analdef *adef, rawdata *rdta)
{
  FILE *f;  
  char buf[256];
  int
    ch,
    j, value, i,
    state = 0,
    numberOfModels = 0,
    l = -1,
    excludeRegion   = 0,
    excludedColumns = 0,
    modelCounter    = 1;
  int
    *excludeArray, *countArray, *modelList;
  int
    **partitions;

  printf("\n\n");

  f = myfopen(excludeFileName, "rb");    

  while((ch = getc(f)) != EOF)
  {
    if(ch == '-')
      numberOfModels++;
  } 

  excludeArray = (int*)malloc(sizeof(int) * (rdta->sites + 1));
  countArray   = (int*)malloc(sizeof(int) * (rdta->sites + 1));
  modelList    = (int *)malloc((rdta->sites + 1)* sizeof(int));

  partitions = (int **)malloc(sizeof(int *) * numberOfModels);  
  for(i = 0; i < numberOfModels; i++)
    partitions[i] = (int *)malloc(sizeof(int) * 2);

  rewind(f);

  while((ch = getc(f)) != EOF)
  {     
    switch(state)
    {
      case 0: /* get first number */
        if(!whitechar(ch))
        {
          if(!isNum(ch))
          {
            printf("exclude file must have format: number-number [number-number]*\n");
            exit(-1);
          }
          l = 0;
          buf[l++] = ch;
          state = 1;
        }
        break;
      case 1: /*get the number or detect - */
        if(!isNum(ch) && ch != '-')
        {
          printf("exclude file must have format: number-number [number-number]*\n");
          exit(-1);
        }
        if(isNum(ch))
        {
          buf[l++] = ch;
        }
        else
        {
          buf[l++] = '\0';	     
          value = atoi(buf);
          partitions[excludeRegion][0] = value;
          state = 2;
        }
        break;
      case 2: /*get second number */
        if(!isNum(ch))
        {
          printf("exclude file must have format: number-number [number-number]*\n");
          exit(-1);
        }
        l = 0;
        buf[l++] = ch;
        state = 3;
        break;
      case 3: /* continue second number or find end */	 
        if(!isNum(ch) && !whitechar(ch))
        {
          printf("exclude file must have format: number-number [number-number]*\n");
          exit(-1);
        }
        if(isNum(ch))
        {
          buf[l++] = ch;
        }
        else
        {	      
          buf[l++] = '\0';	     
          value = atoi(buf);
          partitions[excludeRegion][1] = value;
          excludeRegion++;
          state = 0;
        }
        break;
      default:
        assert(0);
    }
  }

  if(state == 3)
  {
    buf[l++] = '\0';     
    value = atoi(buf);
    partitions[excludeRegion][1] = value;
    excludeRegion++;
  }

  assert(excludeRegion == numberOfModels);

  for(i = 0; i <= rdta->sites; i++)
  {
    excludeArray[i] = -1;
    countArray[i] = 0;      
    modelList[i] = -1;
  }  

  for(i = 0; i < numberOfModels; i++)
  {
    int lower = partitions[i][0];
    int upper = partitions[i][1];

    if(lower > upper)
    {
      printf("Misspecified exclude region %d\n", i);
      printf("lower bound %d is greater than upper bound %d\n", lower, upper);
      exit(-1);
    }

    if(lower == 0)
    {
      printf("Misspecified exclude region %d\n", i);
      printf("lower bound must be greater than 0\n");
      exit(-1);
    }

    if(upper > rdta->sites)
    {
      printf("Misspecified exclude region %d\n", i);
      printf("upper bound %d must be smaller than %d\n", upper, (rdta->sites + 1));
      exit(-1);
    }	
    for(j = lower; j <= upper; j++)
    {
      if(excludeArray[j] != -1)
      {
        printf("WARNING: Exclude regions %d and %d overlap at position %d (already excluded %d times)\n", 
            excludeArray[j], i, j, countArray[j]);
      }
      excludeArray[j] = i;
      countArray[j]   =  countArray[j] + 1;	 
    }
  }

  for(i = 1; i <= rdta->sites; i++)
  {
    if(excludeArray[i] != -1)
      excludedColumns++;
    else
    {
      modelList[modelCounter] = tr->model[i];
      modelCounter++;
    }
  }

  printf("You have excluded %d out of %d columns\n", excludedColumns, rdta->sites);

  if(excludedColumns == rdta->sites)
  {
    printf("Error: You have excluded all sites\n");
    exit(-1);
  }

  if(adef->useSecondaryStructure && (excludedColumns > 0))
  {
    char mfn[2048];
    int countColumns;
    FILE *newFile;

    assert(adef->useMultipleModel);

    strcpy(mfn, secondaryStructureFileName);
    strcat(mfn, ".");
    strcat(mfn, excludeFileName);

    newFile = myfopen(mfn, "wb");

    printBothOpen("\nA secondary structure file with analogous structure assignments for non-excluded columns is printed to file %s\n", mfn);        	     	    

    for(i = 1, countColumns = 0; i <= rdta->sites; i++)
    {		  
      if(excludeArray[i] == -1)
        fprintf(newFile, "%c", tr->secondaryStructureInput[i - 1]);
      else
        countColumns++;
    }

    assert(countColumns == excludedColumns);

    fprintf(newFile,"\n");

    fclose(newFile);
  }


  if(adef->useMultipleModel && (excludedColumns > 0))
  {      
    char mfn[2048];
    FILE *newFile;

    strcpy(mfn, modelFileName);
    strcat(mfn, ".");
    strcat(mfn, excludeFileName);

    newFile = myfopen(mfn, "wb");

    printf("\nA partition file with analogous model assignments for non-excluded columns is printed to file %s\n", mfn);     

    for(i = 0; i < tr->NumberOfModels; i++)
    {
      boolean modelStillExists = FALSE;

      for(j = 1; (j <= rdta->sites) && (!modelStillExists); j++)
      {
        if(modelList[j] == i)
          modelStillExists = TRUE;
      }

      if(modelStillExists)
      {	  	      
        int k = 1;
        int lower, upper;
        int parts = 0;

        switch(tr->partitionData[i].dataType)
        {
          case AA_DATA:		      		     
            {
              char AAmodel[1024];

              strcpy(AAmodel, protModels[tr->partitionData[i].protModels]);
              if(tr->partitionData[i].protFreqs)
                strcat(AAmodel, "F");		  

              fprintf(newFile, "%s, ", AAmodel);
            }
            break;
          case DNA_DATA:
            fprintf(newFile, "DNA, ");
            break;
          case BINARY_DATA:
            fprintf(newFile, "BIN, ");
            break;
          case GENERIC_32:
            fprintf(newFile, "MULTI, ");
            break;
          case GENERIC_64:
            fprintf(newFile, "CODON, ");
            break;
          default:
            assert(0);
        }

        fprintf(newFile, "%s = ", tr->partitionData[i].partitionName);

        while(k <= rdta->sites)
        {
          if(modelList[k] == i)
          {
            lower = k;
            while((modelList[k + 1] == i) && (k <= rdta->sites))		      			
              k++;
            upper = k;

            if(lower == upper)		  
            {
              if(parts == 0)
                fprintf(newFile, "%d", lower);
              else
                fprintf(newFile, ",%d", lower);
            }
            else
            {
              if(parts == 0)
                fprintf(newFile, "%d-%d", lower, upper);
              else
                fprintf(newFile, ",%d-%d", lower, upper);
            }		  
            parts++;
          }
          k++;
        }
        fprintf(newFile, "\n");
      }		  
    }	
    fclose(newFile);
  }


  {
    FILE *newFile;
    char mfn[2048];


    strcpy(mfn, seq_file);
    strcat(mfn, ".");
    strcat(mfn, excludeFileName);

    newFile = myfopen(mfn, "wb");

    printf("\nAn alignment file with excluded columns is printed to file %s\n\n\n", mfn);

    fprintf(newFile, "%d %d\n", tr->mxtips, rdta->sites - excludedColumns);

    for(i = 1; i <= tr->mxtips; i++)
    {   
      unsigned char *tipI =  &(rdta->y[i][1]);
      fprintf(newFile, "%s ", tr->nameList[i]);

      for(j = 0; j < rdta->sites; j++)
      {
        if(excludeArray[j + 1] == -1)	      
          fprintf(newFile, "%c", getInverseMeaning(tr->dataVector[j + 1], tipI[j]));	       	  
      }

      fprintf(newFile, "\n");
    }

    fclose(newFile);
  }


  fclose(f);
  for(i = 0; i < numberOfModels; i++)
    free(partitions[i]);
  free(partitions);  
  free(excludeArray);
  free(countArray);
  free(modelList);
}
Пример #21
0
void plausibilityChecker(tree *tr, analdef *adef)
{
  FILE 
    *treeFile,
    *rfFile;
  
  tree 
    *smallTree = (tree *)rax_malloc(sizeof(tree));

  char 
    rfFileName[1024];

  int
    numberOfTreesAnalyzed = 0,
    i;

  double 
    avgRF = 0.0,
    sumEffectivetime = 0.0;

  /* set up an output file name */

  strcpy(rfFileName,         workdir);  
  strcat(rfFileName,         "RAxML_RF-Distances.");
  strcat(rfFileName,         run_id);

  rfFile = myfopen(rfFileName, "wb");  

  assert(adef->mode ==  PLAUSIBILITY_CHECKER);

  /* open the big reference tree file and parse it */

  treeFile = myfopen(tree_file, "r");

  printBothOpen("Parsing reference tree %s\n", tree_file);

  treeReadLen(treeFile, tr, FALSE, TRUE, TRUE, adef, TRUE, FALSE);

  assert(tr->mxtips == tr->ntips);
  
  /*************************************************************************************/
  /* Preprocessing Step */

  double 
    preprocesstime = gettime();
  
  /* taxonToLabel[2*tr->mxtips - 2]; 
  Array storing all 2n-2 labels from the preordertraversal: (Taxonnumber - 1) -> (Preorderlabel) */
  int 
    *taxonToLabel  = (int *)rax_malloc((2*tr->mxtips - 2) * sizeof(int)),

    /* taxonHasDeg[2*tr->mxtips - 2] 
    Array used to store the degree of every taxon, is needed to extract Bipartitions from multifurcating trees 
    (Taxonnumber - 1) -> (degree of node(Taxonnumber)) */

    *taxonHasDeg = (int *)rax_calloc((2*tr->mxtips - 2),sizeof(int)),

    /* taxonToReduction[2*tr->mxtips - 2]; 
  Array used for reducing bitvector and speeding up extraction: 
  (Taxonnumber - 1) -> (0..1 (increment count of taxa appearing in small tree))
  (Taxonnumber - 1) -> (0..1 (increment count of inner nodes appearing in small tree)) */

    *taxonToReduction = (int *)rax_malloc((2*tr->mxtips - 2) * sizeof(int));
    
  int 
    newcount = 0; //counter used for correct traversals

  /* labelToTaxon[2*tr->mxtips - 2];
  is used to translate between Perorderlabel and p->number: (Preorderlabel) -> (Taxonnumber) */
  int 
    *labelToTaxon = (int *)rax_malloc((2*tr->mxtips - 2) * sizeof(int));
  
  /* Preorder-Traversal of the large tree */
  preOrderTraversal(tr->start->back,tr->mxtips, tr->start->number, taxonToLabel, labelToTaxon, &newcount);

  newcount = 0; //counter set to 0 to be now used for Eulertraversal

  /* eulerIndexToLabel[4*tr->mxtips - 5]; 
  Array storing all 4n-5 PreOrderlabels created during eulertour: (Eulerindex) -> (Preorderlabel) */
  int* 
    eulerIndexToLabel = (int *)rax_malloc((4*tr->mxtips - 5) * sizeof(int));

  /* taxonToEulerIndex[tr->mxtips]; 
  Stores all indices of the first appearance of a taxa in the eulerTour: (Taxonnumber - 1) -> (Index of the Eulertour where Taxonnumber first appears) 
  is used for efficient computation of the Lowest Common Ancestor during Reconstruction Step
  */
  int*
    taxonToEulerIndex  = (int *)rax_malloc((tr->mxtips) * sizeof(int));

  /* Init taxonToEulerIndex and taxonToReduction */
  int 
    ix;

  for(ix = 0; ix < tr->mxtips; ++ix)    
    taxonToEulerIndex[ix] = -1;    
  
  for(ix = 0; ix < (2*tr->mxtips - 2); ++ix)    
    taxonToReduction[ix] = -1;    


  /* Eulertraversal of the large tree*/
  unrootedEulerTour(tr->start->back,tr->mxtips, eulerIndexToLabel, taxonToLabel, &newcount, taxonToEulerIndex);

  /* Creating RMQ Datastructure for efficient retrieval of LCAs, using Johannes Fischers Library rewritten in C
  Following Files: rmq.h,rmqs.c,rmqs.h are included in Makefile.RMQ.gcc */
  RMQ_succinct(eulerIndexToLabel,4*tr->mxtips - 5);

  double 
    preprocessendtime = gettime() - preprocesstime;

  /* Proprocessing Step End */
  /*************************************************************************************/

  printBothOpen("The reference tree has %d tips\n", tr->ntips);

  fclose(treeFile);
  
  /* now see how many small trees we have */

  treeFile = getNumberOfTrees(tr, bootStrapFile, adef);

  checkTreeNumber(tr->numberOfTrees, bootStrapFile);

  /* allocate a data structure for parsing the potentially mult-furcating tree */

  allocateMultifurcations(tr, smallTree);

  /* loop over all small trees */

  for(i = 0; i < tr->numberOfTrees;  i++)
    {      
      int
	numberOfSplits = readMultifurcatingTree(treeFile, smallTree, adef, TRUE);
      
      if(numberOfSplits > 0)
	{
	  int
	    firstTaxon;           

	  double
	    rec_rf,
	    maxRF;

	  if(numberOfTreesAnalyzed % 100 == 0)
	    printBothOpen("Small tree %d has %d tips and %d bipartitions\n", i, smallTree->ntips, numberOfSplits);    
	  
	  /* compute the maximum RF distance for computing the relative RF distance later-on */
	  
	  /* note that here we need to pay attention, since the RF distance is not normalized 
	     by 2 * (n-3) but we need to account for the fact that the multifurcating small tree 
	     will potentially contain less bipartitions. 
	     Hence the normalization factor is obtained as n-3 + numberOfSplits, where n-3 is the number 
	     of bipartitions of the pruned down large reference tree for which we know that it is 
	     bifurcating/strictly binary */
	  
	  maxRF = (double)(2 * numberOfSplits);
	  
	  /* now get the index of the first taxon of the small tree.
	     we will use this to unambiguously store the bipartitions 
	  */
	  
	  firstTaxon = smallTree->start->number;
	  
	  /***********************************************************************************/
	  /* Reconstruction Step */
	  
	  double 
	    time_start = gettime();
	  
	  /* Init hashtable to store Bipartitions of the induced subtree */
	  /* 
	     using smallTree->ntips instead of smallTree->mxtips yields faster code 
	     e.g. 120 versus 128 seconds for 20,000 small trees on my laptop 
	   */
	  hashtable
	    *s_hash = initHashTable(smallTree->ntips * 4);
	  
	  /* smallTreeTaxa[smallTree->ntips]; 
	     Stores all taxa numbers from smallTree into an array called smallTreeTaxa: (Index) -> (Taxonnumber)  */
	  int* 
	    smallTreeTaxa = (int *)rax_malloc((smallTree->ntips) * sizeof(int));
	  
	  /* counter is set to 0 for correctly extracting taxa of the small tree */
	  newcount = 0; 
	  
	  int 
	    newcount2 = 0;
	  
	  /* seq2[2*smallTree->ntips - 2]; 
	     stores PreorderSequence of the reference smalltree: (Preorderindex) -> (Taxonnumber) */
	  int* 
	    seq2 = (int *)rax_malloc((2*smallTree->ntips - 2) * sizeof(int));
	  /* used to store the vectorLength of the bitvector */
	  unsigned int 
	    vectorLength;
	  
	  /* extract all taxa of the smalltree and store it into an array, 
	     also store all counts of taxa and nontaxa in taxonToReduction */
	  rec_extractTaxa(smallTreeTaxa, taxonToReduction, smallTree->start, smallTree->mxtips, &newcount, &newcount2);
	  
	  rec_extractTaxa(smallTreeTaxa, taxonToReduction, smallTree->start->back, smallTree->mxtips, &newcount, &newcount2);
	  
	  /* counter is set to 0 to correctly preorder traverse the small tree */
	  newcount = 0;
	  
	  /* Preordertraversal of the small tree and save its sequence into seq2 for later extracting the bipartitions, it
	     also stores information about the degree of every node */
	  
	  rec_preOrderTraversalMulti(smallTree->start->back,smallTree->mxtips, smallTree->start->number, seq2, taxonHasDeg, &newcount);
	  
	  /* calculate the bitvector length */
	  if(smallTree->ntips % MASK_LENGTH == 0)
	    vectorLength = smallTree->ntips / MASK_LENGTH;
	  else
	    vectorLength = 1 + (smallTree->ntips / MASK_LENGTH); 
	  
	  unsigned int 
	    **bitVectors = rec_initBitVector(smallTree, vectorLength);
	  
	  /* store all non trivial bitvectors using an subtree approach for the induced subtree and 
	     store it into a hashtable, this method was changed for multifurcation */
	  rec_extractBipartitionsMulti(bitVectors, seq2, newcount,tr->mxtips, vectorLength, smallTree->ntips, 
				       firstTaxon, s_hash, taxonToReduction, taxonHasDeg, numberOfSplits);
	  
	  /* counter is set to 0 to be used for correctly storing all EulerIndices */
	  newcount = 0; 
	  
	  /* smallTreeTaxonToEulerIndex[smallTree->ntips]; 
	     Saves all first Euler indices for all Taxons appearing in small Tree: 
	     (Index) -> (Index of the Eulertour where the taxonnumber of the small tree first appears) */
	  int* 
	    smallTreeTaxonToEulerIndex = (int *)rax_malloc((smallTree->ntips) * sizeof(int));
	  
	  /* seq[(smallTree->ntips*2) - 1] 
	     Stores the Preordersequence of the induced small tree */
	  int* 
	    seq = (int *)rax_malloc((2*smallTree->ntips - 1) * sizeof(int));
	  
	  
	  /* iterate through all small tree taxa */
	  for(ix = 0; ix < smallTree->ntips; ix++) 
	    {        
	      int 
		taxanumber = smallTreeTaxa[ix];
	      
	      /* To create smallTreeTaxonToEulerIndex we filter taxonToEulerIndex for taxa in the small tree*/
	      smallTreeTaxonToEulerIndex[newcount] = taxonToEulerIndex[taxanumber-1]; 
	      
	      /* Saves all Preorderlabel of the smalltree taxa in seq*/
	      seq[newcount] = taxonToLabel[taxanumber-1];
	      
	      newcount++;
	    }
	  
	  /* sort the euler indices to correctly calculate LCA */
	  //quicksort(smallTreeTaxonToEulerIndex,0,newcount - 1);             
	  
	  qsort(smallTreeTaxonToEulerIndex, newcount, sizeof(int), sortIntegers);
	  
	  //printf("newcount2 %i \n", newcount2);      
	  /* Iterate through all small tree taxa */
	  for(ix = 1; ix < newcount; ix++)
	    {  
	      /* query LCAs using RMQ Datastructure */
	      seq[newcount - 1 + ix] =  eulerIndexToLabel[query(smallTreeTaxonToEulerIndex[ix - 1],smallTreeTaxonToEulerIndex[ix])]; 	 
	      
	      /* Used for dynamic programming. We save an index for every inner node:
		 For example the reference tree has 3 inner nodes which we saves them as 0,1,2.
		 Now we calculate for example 5 LCA to construct the induced subtree, which are also inner nodes. 
		 Therefore we mark them as 3,4,5,6,7  */
	      
	      taxonToReduction[labelToTaxon[seq[newcount - 1 + ix]] - 1] = newcount2;
	      
	      newcount2 += 1;
	    }
	  
	  /* sort to construct the Preordersequence of the induced subtree */
	  //quicksort(seq,0,(2*smallTree->ntips - 2));
	  
	  qsort(seq, (2 * smallTree->ntips - 2) + 1, sizeof(int), sortIntegers);
	  
	  /* calculates all bipartitions of the reference small tree and count how many bipartition it shares with the induced small tree */
	  int 
	    rec_bips = rec_findBipartitions(bitVectors, seq,(2*smallTree->ntips - 1), labelToTaxon, tr->mxtips, vectorLength, smallTree->ntips, firstTaxon, s_hash, taxonToReduction);
	  
	  /* Reconstruction Step End */
	  /***********************************************************************************/
	  
	  double 
	    effectivetime = gettime() - time_start;
	  
	  /*
	    if(numberOfTreesAnalyzed % 100 == 0)
	    printBothOpen("Reconstruction time: %.10f secs\n\n", effectivetime);
	  */
	  
	  /* compute the relative RF */
	  
	  rec_rf = (double)(2 * (numberOfSplits - rec_bips)) / maxRF;
	  
	  assert(numberOfSplits >= rec_bips);	  	 

	  avgRF += rec_rf;
	  sumEffectivetime += effectivetime;
	  
	  if(numberOfTreesAnalyzed % 100 == 0)
	    printBothOpen("Relative RF tree %d: %f\n\n", i, rec_rf);
	  
	  fprintf(rfFile, "%d %f\n", i, rec_rf);
	  
	  /* free masks and hast table for this iteration */
	  rec_freeBitVector(smallTree, bitVectors);
	  rax_free(bitVectors);
	  
	  freeHashTable(s_hash);
	  rax_free(s_hash);
	  
	  rax_free(smallTreeTaxa);
	  rax_free(seq);
	  rax_free(seq2);
	  rax_free(smallTreeTaxonToEulerIndex);

	  numberOfTreesAnalyzed++;
	}
    }
  
  printBothOpen("Number of small trees skipped: %d\n\n", tr->numberOfTrees - numberOfTreesAnalyzed);
  
  printBothOpen("Average RF distance %f\n\n", avgRF / (double)numberOfTreesAnalyzed);
  
  printBothOpen("Large Tree: %i, Number of SmallTrees analyzed: %i \n\n", tr->mxtips, numberOfTreesAnalyzed); 
  
  printBothOpen("Total execution time: %f secs\n\n", gettime() - masterTime);
   
  printBothOpen("File containing all %d pair-wise RF distances written to file %s\n\n", numberOfTreesAnalyzed, rfFileName);

  printBothOpen("execution stats:\n\n");
  printBothOpen("Accumulated time Effective algorithm: %.5f sec \n", sumEffectivetime);
  printBothOpen("Average time for effective: %.10f sec \n",sumEffectivetime / (double)numberOfTreesAnalyzed);
  printBothOpen("Preprocessingtime: %0.5f sec \n\n", preprocessendtime);
 

  fclose(treeFile);
  fclose(rfFile);    
  
  /* free the data structure used for parsing the potentially multi-furcating tree */

  freeMultifurcations(smallTree);
  rax_free(smallTree);

  rax_free(taxonToLabel);
  rax_free(taxonToEulerIndex);
  rax_free(labelToTaxon);
  rax_free(eulerIndexToLabel);
  rax_free(taxonToReduction);
  rax_free(taxonHasDeg);
}