/*
 * Copies the per-partition model arrays (EIGN, EV, EI, substRates,
 * frequencies, tipVector) and the data type of the first numberOfModels
 * entries from src into dst.
 *
 * The number of doubles copied for each array comes from
 * getPartitionLengths() evaluated on the source partition.
 * src and dst must be different arrays (asserted).
 *
 * In Pthreads builds masterBarrier() is invoked afterwards with
 * THREAD_COPY_PARAMS; tr is only used for that call.
 */
static void copyParams(int numberOfModels, pInfo *dst, pInfo *src, tree *tr)
{
  int model;

  assert(src != dst);

  for(model = 0; model < numberOfModels; model++)
    {
      pInfo
        *from = &src[model],
        *to   = &dst[model];

      const partitionLengths
        *lengths = getPartitionLengths(from);

      to->dataType = from->dataType;

      memcpy(to->EIGN,        from->EIGN,        lengths->eignLength * sizeof(double));
      memcpy(to->EV,          from->EV,          lengths->evLength * sizeof(double));
      memcpy(to->EI,          from->EI,          lengths->eiLength * sizeof(double));
      memcpy(to->substRates,  from->substRates,  lengths->substRatesLength * sizeof(double));
      memcpy(to->frequencies, from->frequencies, lengths->frequenciesLength * sizeof(double));
      memcpy(to->tipVector,   from->tipVector,   lengths->tipVectorLength * sizeof(double));
    }

#ifdef _USE_PTHREADS
  masterBarrier(THREAD_COPY_PARAMS, tr);
#endif
}
/*
 * Switches the rate heterogeneity model stored in tr from GAMMA or GAMMA_I
 * to CAT. Any other starting model trips the assertion.
 *
 * In Pthreads builds masterBarrier() is invoked afterwards with
 * THREAD_GAMMA_TO_CAT.
 */
void gammaToCat(tree *tr)
{
  /* only a GAMMA-type model may be converted */
  assert(tr->rateHetModel == GAMMA || tr->rateHetModel == GAMMA_I);

  tr->rateHetModel = CAT;

#ifdef _USE_PTHREADS
  masterBarrier(THREAD_GAMMA_TO_CAT, tr);
#endif
}
/*
 * Builds the traversal descriptor tr->td[0] for node p and, if it holds more
 * than the initial entry, runs the ancestral newview computation.
 *
 * atRoot == TRUE : the descriptor is filled by traversalInfoAncestralRoot().
 * atRoot == FALSE: tips are skipped (nothing to do); otherwise the descriptor
 *                  is filled by computeTraversalInfo().
 *
 * tr->multiGene is not supported in either mode (asserted).
 *
 * In Pthreads builds the computation is dispatched through masterBarrier()
 * with THREAD_NEWVIEW_ANCESTRAL, otherwise newviewIterativeAncestral() is
 * called directly.
 */
void newviewGenericAncestral (tree *tr, nodeptr p, boolean atRoot)
{
  if(!atRoot)
    {
      if(isTip(p->number, tr->mxtips))
        return;

      if(tr->multiGene)
        {
          /* unsupported configuration; nothing is computed when asserts are disabled */
          assert(0);
          return;
        }

      /* slot 0 of the descriptor counts as already used */
      tr->td[0].count = 1;
      computeTraversalInfo(p, &(tr->td[0].ti[0]), &(tr->td[0].count), tr->mxtips, tr->numBranches);
    }
  else
    {
      assert(!tr->multiGene);

      /* slot 0 of the descriptor counts as already used */
      tr->td[0].count = 1;
      traversalInfoAncestralRoot(p, &(tr->td[0].ti[0]), &(tr->td[0].count), tr->mxtips, tr->numBranches);
    }

  /* nothing beyond the initial entry was added: no work to do */
  if(tr->td[0].count <= 1)
    return;

#ifdef _USE_PTHREADS
  masterBarrier(THREAD_NEWVIEW_ANCESTRAL, tr);
#else
  newviewIterativeAncestral(tr);
#endif
}
/*
 * Switches the rate heterogeneity model stored in tr from CAT to GAMMA, or
 * to GAMMA_I when adef->useInvariant is set. The tree must currently be
 * using CAT (asserted).
 *
 * In Pthreads builds masterBarrier() is invoked afterwards with
 * THREAD_CAT_TO_GAMMA.
 */
void catToGamma(tree *tr, analdef *adef)
{
  assert(tr->rateHetModel == CAT);

  tr->rateHetModel = adef->useInvariant ? GAMMA_I : GAMMA;

#ifdef _USE_PTHREADS
  masterBarrier(THREAD_CAT_TO_GAMMA, tr);
#endif
}
/*
 * Recomputes the [lower, upper) site ranges and widths of all partitions
 * from the per-site model assignment tr->model[0 .. endsite-1].
 *
 * Every change of model between consecutive sites closes the current
 * partition and opens partition (model + 1) at that site; the last
 * partition always ends at endsite.
 *
 * Sequential builds then re-point the per-partition views (perSiteLL, wgt,
 * invariant, rateCategory, yVector) at the partition's first site and rebuild
 * the gap bit vectors. Pthreads builds call masterBarrier() with
 * THREAD_FIX_MODEL_INDICES instead.
 *
 * If fixRates is set, updatePerSiteRates(tr, TRUE) is called at the end.
 */
void fixModelIndices(tree *tr, int endsite, boolean fixRates)
{
  int model, i;

  assert(tr->NumberOfModels > 0);

  tr->partitionData[0].lower = 0;

  model = tr->model[0];

  /* scan for partition boundaries */
  for(i = 1; i < endsite; i++)
    {
      if(tr->model[i] == model)
        continue;

      tr->partitionData[model].upper = i;
      tr->partitionData[model + 1].lower = i;
      model = tr->model[i];
    }

  tr->partitionData[tr->NumberOfModels - 1].upper = endsite;

  for(model = 0; model < tr->NumberOfModels; model++)
    tr->partitionData[model].width = tr->partitionData[model].upper - tr->partitionData[model].lower;

#ifdef _USE_PTHREADS
  masterBarrier(THREAD_FIX_MODEL_INDICES, tr);
#else
  for(model = 0; model < tr->NumberOfModels; model++)
    {
      int
        tip,
        width,
        undetermined,
        first = tr->partitionData[model].lower;

      /* SOS what about sumBuffer? */
      /* tr->partitionData[model].sumBuffer    = &tr->sumBuffer[offset]; */

      /* per-partition views into the global per-site arrays */
      tr->partitionData[model].perSiteLL    = &tr->perSiteLL[first];
      tr->partitionData[model].wgt          = &tr->cdta->aliaswgt[first];
      tr->partitionData[model].invariant    = &tr->invariant[first];
      tr->partitionData[model].rateCategory = &tr->cdta->rateCategory[first];

      for(tip = 1; tip <= tr->mxtips; tip++)
        tr->partitionData[model].yVector[tip] = &(tr->yVector[tip][first]);

      width        = tr->partitionData[model].width;
      undetermined = getUndetermined(tr->partitionData[model].dataType);

      /* one bit per site, 32 sites per word */
      tr->partitionData[model].gapVectorLength = (width / 32) + 1;

      memset(tr->partitionData[model].gapVector, 0, tr->partitionData[model].initialGapVectorSize);

      /* set the bit of every site at which a tip carries the undetermined code */
      for(tip = 1; tip <= tr->mxtips; tip++)
        {
          for(i = 0; i < width; i++)
            {
              if(tr->partitionData[model].yVector[tip][i] == undetermined)
                tr->partitionData[model].gapVector[tr->partitionData[model].gapVectorLength * tip + i / 32] |= mask32[i % 32];
            }
        }
    }
#endif

  if(fixRates)
    updatePerSiteRates(tr, TRUE);
}
/*
 * Computes the log likelihood of every site pattern and writes the values
 * into logLikelihoods.
 *
 * tr             tree whose likelihood is evaluated at tr->start
 * logLikelihoods caller-provided output buffer; the parallel path writes
 *                tr->originalCrunchedLength doubles, the sequential path
 *                writes indices lower..upper-1 of every partition
 *
 * A full-traversal evaluateGeneric() is run first so that tr->likelihood and
 * the traversal descriptor are up to date.
 *
 * Sequential builds only support CAT and GAMMA; any other rate model trips
 * an assertion. The sum of the per-site values is asserted to agree with
 * tr->likelihood to within 1e-5.
 */
void perSiteLogLikelihoods(tree *tr, double *logLikelihoods)
{
  /* compute the likelihood of the tree with the standard function to:
     1. obtain the current score for error checking
     2. store a full tree traversal in the traversal descriptor that
        is then used for calculating the per-site log likelihoods
        for each site individually and independently */

  evaluateGeneric (tr, tr->start, TRUE);

  /* now compute per-site log likelihoods using the respective functions */

#if (defined( _USE_PTHREADS ) || defined(_FINE_GRAIN_MPI))
  /* start the parallel region for per-site likelihoods; once it has
     finished, the values are read from tr->lhs and copied to the
     result buffer */

  masterBarrier(THREAD_PER_SITE_LIKELIHOODS, tr);

  memcpy(logLikelihoods, tr->lhs, sizeof(double) * tr->originalCrunchedLength);

#else

  /* sequential case: just loop over all partitions and compute per-site log likelihoods */

  {
    double
      accumulatedPerSiteLikelihood = 0.0;

    int
      model;

    for(model = 0; model < tr->NumberOfModels; model++)
      {
        size_t
          i,
          localCount,
          lower = tr->partitionData[model].lower,
          upper = tr->partitionData[model].upper;

        for(i = lower, localCount = 0; i < upper; i++, localCount++)
          {
            /* initialised so that an unsupported rate model cannot lead to
               a read of an indeterminate value when asserts are compiled out */
            double
              l = 0.0;

            /*
               we need to switch over the rate heterogeneity implementations here.
               under CAT (per-site rates) the rate of the site's category is
               passed to evaluatePartialGeneric(); under GAMMA 1.0 is passed.
            */

            switch(tr->rateHetModel)
              {
              case CAT:
                l = evaluatePartialGeneric (tr, i, tr->partitionData[model].perSiteRates[tr->partitionData[model].rateCategory[localCount]], model);
                break;
              case GAMMA:
                l = evaluatePartialGeneric (tr, i, 1.0, model);
                break;
              default:
                /* other rate models (e.g. GAMMA_I) are not handled here */
                assert(0);
              }

            /* store value in result array and add the likelihood of this site to the overall likelihood */

            logLikelihoods[i] = l;
            accumulatedPerSiteLikelihood += l;
          }
      }

    /* error checking. We need a coarse ABS() < epsilon here, because the implementations
       (standard versus per-site) are pretty different and hence slight numerical
       deviations are expected */

    assert(ABS(tr->likelihood - accumulatedPerSiteLikelihood) < 0.00001);
  }

#endif
}
/*
 * Evaluates the log likelihood of the tree at the branch between p and
 * p->back and stores the total in tr->likelihood.
 *
 * tr             tree to evaluate
 * p              one end of the branch at which the likelihood is computed
 * fullTraversal  TRUE:  a traversal is computed starting from p->back
 *                       (p is then asserted to be a tip)
 *                FALSE: only p and/or p->back are traversed, and only when
 *                       needsRecomp() (or a missing recomputation vector)
 *                       asks for it
 *
 * The total is the sum of tr->perPartitionLH[] over all partitions after
 * either evaluateIterative() (sequential build) or the THREAD_EVALUATE
 * barrier (Pthreads / fine-grain MPI builds) has run.
 */
void evaluateGeneric (tree *tr, nodeptr p, boolean fullTraversal)
{
  volatile double
    total = 0.0;

  nodeptr
    q = p->back;

  int
    branch,
    partition;

  boolean
    pMissing = FALSE,   /* set from getxVector(); forces a traversal of p below */
    qMissing = FALSE;   /* same for q */

  /* entry 0 of the traversal descriptor describes the branch itself:
     the node numbers of p and q ... */

  tr->td[0].ti[0].pNumber = p->number;
  tr->td[0].ti[0].qNumber = q->number;

  /* ... and the branch length(s) between them. Without -M
     tr->numBranches must be 1 */

  for(branch = 0; branch < tr->numBranches; branch++)
    tr->td[0].ti[0].qz[branch] = q->z[branch];

  /* recomputation mode: obtain a vector slot for every inner end of the branch */
  if(tr->useRecom)
    {
      int
        slotIndex = -1;

      if(!isTip(q->number, tr->mxtips))
        {
          qMissing = getxVector(tr->rvec, q->number, &slotIndex, tr->mxtips);
          tr->td[0].ti[0].slot_q = slotIndex;
        }

      if(!isTip(p->number, tr->mxtips))
        {
          pMissing = getxVector(tr->rvec, p->number, &slotIndex, tr->mxtips);
          tr->td[0].ti[0].slot_p = slotIndex;
        }

      /* two inner nodes must never share a slot */
      if(!isTip(p->number, tr->mxtips) && !isTip(q->number, tr->mxtips))
        assert(tr->td[0].ti[0].slot_q != tr->td[0].ti[0].slot_p);
    }

  /* determine which conditional likelihood vectors newview has to
     (re-)compute before the likelihood at the branch p--q can be evaluated.
     Entry 0 is already taken, so the traversal length starts at 1 */

  tr->td[0].count = 1;

  if(!fullTraversal)
    {
      if(pMissing || needsRecomp(tr->useRecom, tr->rvec, p, tr->mxtips))
        computeTraversal(tr, p, TRUE);

      if(qMissing || needsRecomp(tr->useRecom, tr->rvec, q, tr->mxtips))
        computeTraversal(tr, q, TRUE);
    }
  else
    {
      assert(isTip(q->back->number, tr->mxtips));
      computeTraversal(tr, q, FALSE);
    }

  /* copy the partition execute mask into the traversal descriptor; the mask
     itself must come from the calling program, that logic should not form
     part of the library */

  storeExecuteMaskInTraversalDescriptor(tr);

  /* flag the descriptor as modified, i.e., in the parallel case the list of
     nodes to traverse has to be broadcast again */

  tr->td[0].traversalHasChanged = TRUE;
#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))

  /* fork-join region: tell all threads/processes to compute the log
     likelihood for their fraction of the data. The call is implemented in
     the case switch of execFunction in axml.c */

  masterBarrier(THREAD_EVALUATE, tr);

  /* the reduction over the per-partition log likelihoods is done by the
     summation loop below, which is shared with the sequential case */

#else
  /* sequential case: call evaluateIterative() directly, no threads or
     processes need to be told to compute anything */

  evaluateIterative(tr);
#endif

  /* add up the per-partition log likelihoods */
  for(partition = 0; partition < tr->NumberOfModels; partition++)
    total += tr->perPartitionLH[partition];

  /* the tree's likelihood is the total over all partitions */

  tr->likelihood = total;

  if(tr->useRecom)
    {
      unpinNode(tr->rvec, p->number, tr->mxtips);
      unpinNode(tr->rvec, q->number, tr->mxtips);
    }

  /* bookkeeping: leave traversalHasChanged in a consistent state */

  tr->td[0].traversalHasChanged = FALSE;
}
/*
 * Recursively computes marginal ancestral state probabilities and writes
 * them to the two output files.
 *
 * tr          tree being processed
 * p           current node; tips are skipped when atRoot is FALSE
 * counter     incremented once for every node that is written
 * probsFile   receives the node label followed by one line of state
 *             probabilities per alignment column
 * statesFile  receives the node label followed by one character per column:
 *             the most probable state, or '?' when all states are
 *             approximately equally probable
 * atRoot      TRUE: only the (virtual) root at p is handled, no recursion
 *             FALSE: both subtrees below p are processed first, then p
 *
 * The working buffers are allocated only after the tip check and after the
 * recursion, so nothing leaks on tips and a recursion level holds its
 * buffers only while it writes its own node. Both buffers are zero-filled
 * so that entries which are never written can be printed and freed safely.
 */
static void computeAncestralRec(tree *tr, nodeptr p, int *counter, FILE *probsFile, FILE *statesFile, boolean atRoot)
{
#ifdef _USE_PTHREADS
  size_t 
    accumulatedOffset = 0;
#endif

  int 
    model,
    globalIndex = 0;

  ancestralState 
    *a,           /* one entry per site pattern, in pattern order */
    *unsortedA;   /* one entry per alignment column, in column order */

  if(!atRoot)
    {
      if(isTip(p->number, tr->mxtips))
        return;

      computeAncestralRec(tr, p->next->back,       counter, probsFile, statesFile, atRoot);
      computeAncestralRec(tr, p->next->next->back, counter, probsFile, statesFile, atRoot);

      newviewGeneric(tr, p);
    }

  newviewGenericAncestral(tr, p, atRoot);

#ifdef _USE_PTHREADS
  masterBarrier(THREAD_GATHER_ANCESTRAL, tr);
#endif

  a         = (ancestralState *)calloc((size_t)tr->cdta->endsite, sizeof(ancestralState));
  unsortedA = (ancestralState *)calloc((size_t)tr->rdta->sites,   sizeof(ancestralState));

  if(atRoot)
    {
      fprintf(probsFile, "ROOT\n");
      fprintf(statesFile, "ROOT ");
    }
  else
    {
      fprintf(probsFile, "%d\n", p->number);
      fprintf(statesFile, "%d ", p->number);
    }

  for(model = 0; model < tr->NumberOfModels; model++)
    {
      int	
        offset,
        i,
        width = tr->partitionData[model].upper - tr->partitionData[model].lower,	
        states = tr->partitionData[model].states;
#ifdef _USE_PTHREADS
      /* NOTE(review): assumes the THREAD_GATHER_ANCESTRAL step packs the
         partitions back to back into tr->ancestralStates, which is the layout
         accumulatedOffset tracks. Previously every partition read from the
         start of the buffer -- confirm against the gather implementation */
      double
        *ancestral = &tr->ancestralStates[accumulatedOffset];
#else
      double 
        *ancestral = tr->partitionData[model].sumBuffer;
#endif

      /* number of state blocks stored per site: 1 under CAT, 4 otherwise */
      if(tr->rateHetModel == CAT)
        offset = 1;
      else
        offset = 4;            

      for(i = 0; i < width; i++, globalIndex++)
        {
          double
            equal = 1.0 / (double)states,
            max = -1.0;

          boolean
            approximatelyEqual = TRUE;

          int
            max_l = -1,
            l;

          char 
            c;

          a[globalIndex].states = states;
          a[globalIndex].probs = (double *)malloc(sizeof(double) * states);

          for(l = 0; l < states; l++)
            {
              double 
                value = ancestral[offset * states * i + l];

              if(value > max)
                {
                  max = value;
                  max_l = l;
                }

              approximatelyEqual = approximatelyEqual && (ABS(equal - value) < 0.000001);

              a[globalIndex].probs[l] = value;	      	      
            }

          /* no state stands out: report the site as ambiguous */
          if(approximatelyEqual)
            c = '?';	  
          else
            c = getStateCharacter(tr->partitionData[model].dataType, max_l);

          a[globalIndex].c = c;	  
        }

#ifdef _USE_PTHREADS
      accumulatedOffset += width * offset * states;
#endif            
    }

  {
    int 
      j, 
      k;

    const int
      patterns = tr->cdta->endsite,
      sites = tr->rdta->sites;

    /* map the pattern-ordered results back to the original column order.
       A single pass over the sites replaces the former patterns x sites
       double loop; only sites whose pattern index lies in [0, patterns)
       are mapped, exactly as before */
    for(k = 0; k < sites; k++)
      {
        int
          sorted = tr->patternPosition[k],
          unsorted;

        if(sorted < 0 || sorted >= patterns)
          continue;

        unsorted = tr->columnPosition[k] - 1;

        assert(unsorted >= 0 && unsorted < sites);

        /* NULL unless two sites name the same column; avoids a leak in that case */
        free(unsortedA[unsorted].probs);

        unsortedA[unsorted].states = a[sorted].states;
        unsortedA[unsorted].c = a[sorted].c;
        unsortedA[unsorted].probs = (double*)malloc(sizeof(double) * unsortedA[unsorted].states);
        memcpy(unsortedA[unsorted].probs,  a[sorted].probs, sizeof(double) * a[sorted].states);	      
      }

    for(k = 0; k < sites; k++)
      {
        for(j = 0; j < unsortedA[k].states; j++)
          fprintf(probsFile, "%f ", unsortedA[k].probs[j]);
        fprintf(probsFile, "\n");
        fprintf(statesFile, "%c", unsortedA[k].c);
      }
    fprintf(probsFile, "\n");
    fprintf(statesFile, "\n");
  }


  *counter = *counter + 1;

  {
    int j;

    /* entries that were never filled hold NULL, which free() accepts */
    for(j = 0; j < tr->rdta->sites; j++)
      free(unsortedA[j].probs);
    for(j = 0; j < tr->cdta->endsite; j++)
      free(a[j].probs);
  }

  free(a);
  free(unsortedA);
}