Example #1
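/* Function: GaussianSetHistogram()
 * 
 * Purpose:  Instead of fitting the histogram to a Gaussian, set the
 *           Gaussian parameters mean and sd from an external source,
 *           then calculate the expected counts and a chi-square test
 *           for how well the observed histogram matches them.
 *
 * Args:     h    - histogram to set
 *           mean - mean of the Gaussian
 *           sd   - standard deviation of the Gaussian
 */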
void GaussianSetHistogram(Histogram * h, float mean, float sd)
{
  int   sc;
  int   hsize, idx;
  int   nbins;
  float delta;

  UnfitHistogram(h);
  h->fit_type          = HISTFIT_GAUSSIAN;
  h->param[GAUSS_MEAN] = mean;
  h->param[GAUSS_SD]   = sd;

  /* Calculate the expected values for the histogram.
   */
  hsize     = h->max - h->min + 1;
  h->expect = (float *) ckalloc(sizeof(float) * hsize);
  if (h->expect == NULL) {
      fatal("Unable to allocate the expected-value array in GaussianSetHistogram()");
  }

  for (idx = 0; idx < hsize; idx++)
    h->expect[idx] = 0.;

  /* Note: ideally we'd use the Gaussian distribution function (CDF)
   * to find the expected occupancy of the window sc..sc+1.
   * However, the Gaussian CDF has no simple closed form.
   * Instead, estimate the bin occupancy from the density at the
   * midpoint, sc + 0.5.
   */
  for (sc = h->min; sc <= h->max; sc++)
    { 
      delta = ((float)sc + 0.5) - h->param[GAUSS_MEAN];
      h->expect[sc - h->min] =
	(float) h->total * ((1. / (h->param[GAUSS_SD] * sqrt(2.*3.14159))) * 
	    (exp(-1.*delta*delta / (2. * h->param[GAUSS_SD] * h->param[GAUSS_SD]))));
    }

  /* Calculate the goodness-of-fit (within whole region)
   */
  h->chisq = 0.;
  nbins    = 0;
  for (sc = h->lowscore; sc <= h->highscore; sc++)
    if (h->expect[sc-h->min] >= 5. && h->histogram[sc-h->min] >= 5)
      {
	delta = (float) h->histogram[sc-h->min] - h->expect[sc-h->min];
	h->chisq += delta * delta / h->expect[sc-h->min];
	nbins++;
      }
	/* -1 d.f. for normalization */
  if (nbins > 1)
    h->chip = (float) IncompleteGamma((double)(nbins-1)/2., 
				      (double) h->chisq/2.);
  else
    h->chip = 0.;		
}
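/* The following is a minimal, standalone sketch of the bin-expectation rule
 * used in GaussianSetHistogram() above: the expected count in the score bin
 * [sc, sc+1) is approximated as h->total times the Gaussian density at the
 * bin midpoint sc + 0.5.  The helper name gaussian_expected_count is
 * hypothetical, for illustration only; it is not part of this module.
 */
#include <math.h>	/* sqrt(), exp() */

static float
gaussian_expected_count(int sc, float mean, float sd, int total)
{
  float delta = ((float) sc + 0.5) - mean;	 /* distance of the bin midpoint from the mean */

  return (float) total
    * (1. / (sd * sqrt(2. * 3.14159)))		 /* Gaussian normalization, 1/(sd*sqrt(2*pi))  */
    * exp(-1. * delta * delta / (2. * sd * sd)); /* Gaussian density term, exp(-delta^2/2sd^2) */
}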
Example #2
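/* Function: ExtremeValueSetHistogram()
 * 
 * Purpose:  Instead of fitting the histogram to an EVD, set the EVD
 *           parameters mu, lambda, and wonka from an external source,
 *           then calculate the expected counts and a chi-square test
 *           over the region lowbound..highbound.
 *
 * Args:     h         - histogram to set
 *           mu        - EVD location parameter
 *           lambda    - EVD lambda parameter (inverse scale)
 *           lowbound  - low score bound of the region used for chi-square
 *           highbound - high score bound of the region used for chi-square
 *           wonka     - fraction of the hits expected to obey the EVD
 *           ndegrees  - extra degrees of freedom to subtract, beyond the
 *                       normalization constraint (e.g. 1 if mu, lambda
 *                       were fit to this same region)
 */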
void ExtremeValueSetHistogram(Histogram * h, float mu, float lambda, float lowbound, float highbound, float wonka, int ndegrees)
{
  int   sc;
  int   hsize, idx;
  int   nbins;
  float delta;

  UnfitHistogram(h);
  h->fit_type          = HISTFIT_EVD;
  h->param[EVD_LAMBDA] = lambda;
  h->param[EVD_MU]     = mu;
  h->param[EVD_WONKA]  = wonka;

  hsize     = h->max - h->min + 1;
  h->expect = (float *) ckalloc(sizeof(float) * hsize);
  if (h->expect == NULL) {
     fatal("Unable to allocate the expected-value array in ExtremeValueSetHistogram()");
  }
  for (idx = 0; idx < hsize; idx++)
    h->expect[idx] = 0.;

  /* Calculate the expected values for the histogram.
   */
  for (sc = h->min; sc <= h->max; sc++)
    h->expect[sc - h->min] =
      ExtremeValueE((float)(sc), h->param[EVD_MU], h->param[EVD_LAMBDA], 
		    h->total) -
      ExtremeValueE((float)(sc+1), h->param[EVD_MU], h->param[EVD_LAMBDA],
		    h->total);
  
  /* Calculate the goodness-of-fit (within the region lowbound..highbound
   * that was fitted)
   */
  h->chisq = 0.;
  nbins    = 0;
  for (sc = lowbound; sc <= highbound; sc++)
    if (h->expect[sc-h->min] >= 5. && h->histogram[sc-h->min] >= 5)
      {
	delta = (float) h->histogram[sc-h->min] - h->expect[sc-h->min];
	h->chisq += delta * delta / h->expect[sc-h->min];
	nbins++;
      }

  /* The expected counts are normalized to h->total, so there is at
   * least one constraint on chi-square; ndegrees accounts for any
   * parameters that were fit from the data in this region.
   */
  if (nbins > 1 + ndegrees)
    h->chip = (float) IncompleteGamma((double)(nbins-1-ndegrees)/2., 
				      (double) h->chisq/2.);
  else
    h->chip = 0.;		
}
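/* A minimal sketch of the bin expectation used in ExtremeValueSetHistogram()
 * above, under the assumption (as in the HMMER/SQUID histogram code this
 * resembles) that ExtremeValueE(x, mu, lambda, N) returns the expected number
 * of scores >= x, i.e. N * (1 - exp(-exp(-lambda * (x - mu)))).  Under that
 * assumption, the difference of two ExtremeValueE() calls is N times the EVD
 * probability mass of the bin [sc, sc+1).  The helper name evd_expected_count
 * is hypothetical, for illustration only.
 */
#include <math.h>	/* exp() */

static float
evd_expected_count(int sc, float mu, float lambda, int total)
{
  double p_lo = 1. - exp(-exp(-(double) lambda * ((double) sc       - (double) mu)));	/* P(score >= sc)   */
  double p_hi = 1. - exp(-exp(-(double) lambda * ((double) (sc + 1) - (double) mu)));	/* P(score >= sc+1) */

  return (float) ((double) total * (p_lo - p_hi));	/* expected count in the bin [sc, sc+1) */
}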
/* Function: GaussianFitHistogram()
 * 
 * Purpose:  Fit a score histogram to a Gaussian distribution.
 *           Set the parameters mean and sd in the histogram
 *           structure, as well as a chi-squared test for
 *           goodness of fit.
 *
 * Args:     h         - histogram to fit
 *           high_hint - score cutoff; above this are `real' hits that aren't fit
 *           
 * Return:   1 if fit is judged to be valid.
 *           else 0 if fit is invalid (too few seqs.)           
 */
int
GaussianFitHistogram(struct histogram_s *h, float high_hint)
{
  float sum;
  float sqsum;
  float delta;
  int   sc;
  int   nbins;
  int   hsize, idx;
  
  /* Clear any previous fitting from the histogram.
   */
  UnfitHistogram(h);

  /* Determine if we have enough hits to fit the histogram;
   * arbitrarily require 1000.
   */
  if (h->total < 1000) { h->fit_type = HISTFIT_NONE; return 0; }

  /* Simplest algorithm for mean and sd;
   * no outlier detection yet (not even using high_hint)
   * 
   * Magic 0.5 correction is because our histogram is for
   * scores between x and x+1; we estimate the expectation
   * (roughly) as x + 0.5. 
   */
  sum = sqsum = 0.;
  for (sc = h->lowscore; sc <= h->highscore; sc++)
    {
      delta  = (float) sc + 0.5;
      sum   += (float) h->histogram[sc-h->min] * delta;
      sqsum += (float) h->histogram[sc-h->min] * delta * delta;
    }
  h->fit_type          = HISTFIT_GAUSSIAN;
  h->param[GAUSS_MEAN] = sum / (float) h->total;
  h->param[GAUSS_SD]   = sqrt((sqsum - (sum*sum/(float)h->total)) / 
			      (float)(h->total-1));
  
  /* Calculate the expected values for the histogram.
   * Note that the magic 0.5 correction appears again.
   * Calculating difference between distribution functions for Gaussian 
   * would be correct but hard.
   */
  hsize     = h->max - h->min + 1;
  h->expect = (float *) MallocOrDie(sizeof(float) * hsize);
  for (idx = 0; idx < hsize; idx++)
    h->expect[idx] = 0.;

  for (sc = h->min; sc <= h->max; sc++)
    {
      delta = (float) sc + 0.5 - h->param[GAUSS_MEAN];
      h->expect[sc - h->min] =
	(float) h->total * ((1. / (h->param[GAUSS_SD] * sqrt(2.*3.14159))) * 
        (exp(-1.* delta*delta / (2. * h->param[GAUSS_SD] * h->param[GAUSS_SD]))));
    }

  /* Calculate the goodness-of-fit (within region that was fitted)
   */
  h->chisq = 0.;
  nbins    = 0;
  for (sc = h->lowscore; sc <= h->highscore; sc++)
    if (h->expect[sc-h->min] >= 5. && h->histogram[sc-h->min] >= 5)
      {
	delta = (float) h->histogram[sc-h->min] - h->expect[sc-h->min];
	h->chisq += delta * delta / h->expect[sc-h->min];
	nbins++;
      }
	/* -1 d.f. for normalization; -2 d.f. for two free parameters */
  if (nbins > 3)
    h->chip = (float) IncompleteGamma((double)(nbins-3)/2., 
				      (double) h->chisq/2.);
  else
    h->chip = 0.;		

  return 1;
}
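/* A sketch of the goodness-of-fit step shared by the functions above,
 * assuming that IncompleteGamma(a, x) returns the regularized upper
 * incomplete gamma ratio Q(a, x); its use as a chi-square tail probability
 * here suggests it does, and the extern declaration below simply mirrors
 * how it is called.  obs[] and expect[] are assumed to cover just the
 * region being tested; bins with fewer than 5 expected or observed counts
 * are skipped.  ndof_lost is the number of constraints to subtract: 1 for
 * the normalization, plus one per parameter estimated from the same data
 * (hence nbins-1 in GaussianSetHistogram(), nbins-3 in
 * GaussianFitHistogram()).  The helper name chi_square_pvalue is
 * hypothetical, for illustration only.
 */
extern double IncompleteGamma(double a, double x);

static float
chi_square_pvalue(const int *obs, const float *expect, int hsize, int ndof_lost)
{
  float chisq = 0.;
  int   nbins = 0;
  int   i;
  float delta;

  for (i = 0; i < hsize; i++)
    if (expect[i] >= 5. && obs[i] >= 5)
      {
	delta  = (float) obs[i] - expect[i];
	chisq += delta * delta / expect[i];
	nbins++;
      }

  if (nbins > ndof_lost)	/* need at least one degree of freedom left */
    return (float) IncompleteGamma((double) (nbins - ndof_lost) / 2.,
				   (double) chisq / 2.);
  return 0.;			/* too few informative bins to test */
}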
/* Function: ExtremeValueFitHistogram()
 * Date:     SRE, Sat Nov 15 17:16:15 1997 [St. Louis]
 * 
 * Purpose:  Fit a score histogram to the extreme value 
 *           distribution. Set the parameters lambda
 *           and mu in the histogram structure. Calculate
 *           a chi-square test as a measure of goodness of fit. 
 *           
 * Methods:  Uses a maximum likelihood method [Lawless82].
 *           Lower outliers are removed by censoring the data below the peak.
 *           Upper outliers are removed iteratively using method 
 *           described by [Mott92].
 *           
 * Args:     h         - histogram to fit
 *           censor    - TRUE to censor data left of the peak
 *           high_hint - score cutoff; above this are `real' hits that aren't fit
 *           
 * Return:   1 if fit is judged to be valid.
 *           else 0 if fit is invalid (too few seqs.)
 */
int
ExtremeValueFitHistogram(struct histogram_s *h, int censor, float high_hint) 
{
  float *x;                     /* array of EVD samples to fit */
  int   *y;                     /* histogram counts            */ 
  int    n;			/* number of observed samples  */
  int    z;			/* number of censored samples  */
  int    hsize;			/* size of histogram           */
  float  lambda, mu;		/* new estimates of lambda, mu */
  int    sc;		        /* loop index for score        */
  int    lowbound;		/* lower bound of fitted region*/
  int    highbound;		/* upper bound of fitted region*/
  int    new_highbound;
  int    iteration;

  /* Determine lower bound on fitted region;
   * if we're censoring the data, choose the peak of the histogram.
   * if we're not, then we take the whole histogram.
   */
  lowbound = h->lowscore;
  if (censor) 
    {
      int max = -1;
      for (sc = h->lowscore; sc <= h->highscore; sc++)
	if (h->histogram[sc - h->min] > max) 
	  {
	    max      = h->histogram[sc - h->min];
	    lowbound = sc;
	  }
    }

  /* Determine initial upper bound on fitted region.
   */
  highbound = MIN(high_hint, h->highscore);

  /* Now, iteratively converge on our lambda, mu:
   */
  for (iteration = 0; iteration < 100; iteration++)
    {
      /* Construct x, y vectors.
       */
      x = NULL;
      y = NULL;
      hsize = highbound - lowbound + 1;
      if (hsize < 5) goto FITFAILED; /* require at least 5 bins or we don't fit */

      x = MallocOrDie(sizeof(float) * hsize);
      y = MallocOrDie(sizeof(int)   * hsize);
      n = 0;
      for (sc = lowbound; sc <= highbound; sc++)
	{
	  x[sc-lowbound] = (float) sc + 0.5; /* crude, but tests OK */
	  y[sc-lowbound] = h->histogram[sc - h->min];
	  n             += h->histogram[sc - h->min];
	}

      if (n < 100) goto FITFAILED;  /* require fitting to at least 100 points */

      /* If we're censoring, estimate z, the number of censored samples
       * left of the bound. Our initial estimate is crudely that we're
       * missing e^-1 of the total distribution (which would be exact
       * if we censored exactly at mu; but we censored at the observed peak);
       * with z/(n+z) = 1/e that gives z = n/(e-1), the 0.58198*n below.
       * Subsequent estimates are more exact, based on our current estimate of mu.
       */
      if (censor)
	{
	  if (iteration == 0)
	    z = MIN(h->total-n, (int) (0.58198 * (float) n));
	  else
	    {
	      double psx;
	      psx = EVDDistribution((float) lowbound, mu, lambda);
	      z = MIN(h->total-n, (int) ((double) n * psx / (1. - psx)));
	    }
	}
      else
	z = 0;		/* no samples are censored if we're not censoring */

      /* Do an ML fit
       */
      if (censor) {
	if (! EVDCensoredFit(x, y, hsize, z, (float) lowbound, &mu, &lambda))
	  goto FITFAILED;
      } else  
	if (! EVDMaxLikelyFit(x, y, hsize, &mu, &lambda))
	  goto FITFAILED;

      /* Find the Eval = 1 point as a new highbound;
       * the total number of samples estimated to "belong" to the EVD is n+z  
       */
      new_highbound = (int)
	(mu - (log (-1. * log((double) (n+z-1) / (double)(n+z))) / lambda));

      free(x);
      free(y);
      if (new_highbound >= highbound) break; 
      highbound = new_highbound;
    }

  /* Set the histogram parameters;
   * - we fit from lowbound to highbound; thus we lose 2 degrees of freedom
   *   for fitting mu, lambda, but we get 1 back because we're unnormalized
   *   in this interval, hence we pass 2-1 = 1 as ndegrees.
   */    
  ExtremeValueSetHistogram(h, mu, lambda, lowbound, highbound, 1); 
  return 1;

FITFAILED:
  UnfitHistogram(h);
  if (x != NULL) free(x);
  if (y != NULL) free(y);
  return 0;
}
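/* A sketch of the "E = 1" cutoff used above to choose new_highbound: with
 * N = n + z samples believed to obey the EVD, solve N * P(score >= x) = 1,
 * i.e. N * (1 - exp(-exp(-lambda*(x - mu)))) = 1, for x, which gives
 * x = mu - log(-log((N-1)/N)) / lambda.  Scores above x are expected to
 * occur less than once by chance, so they are treated as real hits and
 * excluded from the next fitting iteration.  The helper name
 * evd_score_at_E1 is hypothetical, for illustration only.
 */
#include <math.h>	/* log() */

static float
evd_score_at_E1(float mu, float lambda, int N)
{
  return (float) (mu - log(-1. * log((double) (N - 1) / (double) N)) / lambda);
}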
Example #5
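/* Function: ExtremeValueFitHistogram()
 *
 * Note: a variant of the function documented above; the fitting logic is
 * the same, but each failure path emits a warn() diagnostic, and a wonka
 * factor (currently fixed at 1.0) is passed through to
 * ExtremeValueSetHistogram().
 */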
int ExtremeValueFitHistogram(Histogram * h, int censor, float high_hint) 
{
  float *x;                     /* array of EVD samples to fit */
  int   *y;                     /* histogram counts            */ 
  int    n;			/* number of observed samples  */
  int    z;			/* number of censored samples  */
  int    hsize;			/* size of histogram           */
  float  lambda, mu;		/* new estimates of lambda, mu */
  int    sc;		        /* loop index for score        */
  int    lowbound;		/* lower bound of fitted region*/
  int    highbound;		/* upper bound of fitted region*/
  int    new_highbound;
  int    iteration;

  /* Determine lower bound on fitted region;
   * if we're censoring the data, choose the peak of the histogram.
   * if we're not, then we take the whole histogram.
   */
  lowbound = h->lowscore;
  if (censor) 
    {
      int max = -1;
      for (sc = h->lowscore; sc <= h->highscore; sc++)
	if (h->histogram[sc - h->min] > max) 
	  {
	    max      = h->histogram[sc - h->min];
	    lowbound = sc;
	  }
    }

  /* Determine initial upper bound on fitted region.
   */
  highbound = MIN(high_hint, h->highscore);

  /* Now, iteratively converge on our lambda, mu:
   */
  for (iteration = 0; iteration < 100; iteration++)
    {
      /* Construct x, y vectors.
       */
      x = NULL;
      y = NULL;
      hsize = highbound - lowbound + 1;
      if (hsize < 5) {
	warn("On iteration %d, got %d bins, which is not fitable",iteration,hsize);
	goto FITFAILED; /* require at least 5 bins or we don't fit */
      }


      x = ckalloc(sizeof(float) * hsize);
      y = ckalloc(sizeof(int)   * hsize);
      if (x == NULL || y == NULL) {
	  warn("Out of temporary memory for EVD fitting");
	  goto FITFAILED;	/* FITFAILED frees whichever of x, y was allocated */
      }

      n = 0;
      for (sc = lowbound; sc <= highbound; sc++)
	{
	  x[sc-lowbound] = (float) sc + 0.5; /* crude, but tests OK */
	  y[sc-lowbound] = h->histogram[sc - h->min];
	  n             += h->histogram[sc - h->min];
	}

      if (n < 100) {
	warn("On iteration %d, got only %d points, which is not fitable",iteration,n);
	goto FITFAILED;  /* require fitting to at least 100 points */
      }


      /* If we're censoring, estimate z, the number of censored samples
       * left of the bound. Our initial estimate is crudely that we're
       * missing e^-1 of the total distribution (which would be exact
       * if we censored exactly at mu; but we censored at the observed peak);
       * with z/(n+z) = 1/e that gives z = n/(e-1), the 0.58198*n below.
       * Subsequent estimates are more exact, based on our current estimate of mu.
       */
      if (censor)
	{
	  if (iteration == 0)
	    z = MIN(h->total-n, (int) (0.58198 * (float) n));
	  else
	    {
	      double psx;
	      psx = EVDDistribution((float) lowbound, mu, lambda);
	      z = MIN(h->total-n, (int) ((double) n * psx / (1. - psx)));
	    }
	}
      else
	z = 0;		/* no samples are censored if we're not censoring */

      /* Do an ML fit
       */
      if (censor) {
	if (! EVDCensoredFit(x, y, hsize, z, (float) lowbound, &mu, &lambda)) {
	  warn("On iteration %d, unable to make maxlikehood evd fit with censor",iteration);
	  goto FITFAILED;
	}
      } else {
	if (! EVDMaxLikelyFit(x, y, hsize, &mu, &lambda)) {
	  warn("On iteration %d, unable to make maxlikehood evd fit without censor",iteration);
	  goto FITFAILED;
	}
      }


      /* Find the Eval = 1 point as a new highbound;
       * the total number of samples estimated to "belong" to the EVD is n+z  
       */
      new_highbound = (int)
	(mu - (log (-1. * log((double) (n+z-1) / (double)(n+z))) / lambda));

      free(x);
      free(y);
      if (new_highbound >= highbound) break; 
      highbound = new_highbound;
    }

  /* Set the histogram parameters;
   * - the wonka factor is (n+z) / h->total, i.e. the fraction of the
   *   hits that we expect to match the EVD; the rest generally score lower.
   * - we fit from lowbound to highbound; thus we lose 2 degrees of freedom
   *   for fitting mu, lambda, but we get 1 back because we're unnormalized
   *   in this interval, hence we pass 2-1 = 1 as ndegrees.
   *   
   *   Mon Jan 19 06:18:14 1998: wonka temporarily disabled; 1.0 is passed.
   */    
  ExtremeValueSetHistogram(h, mu, lambda, lowbound, highbound, 1.0, 1); 
  return 1;

FITFAILED:
  UnfitHistogram(h);
  if (x != NULL) free(x);
  if (y != NULL) free(y);
  return 0;
}
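/* A sketch of the censored-count estimate used above.  If the current EVD
 * fit assigns probability p = P(score < lowbound) to the censored region
 * and n observed scores lie in the fitted region, then the expected number
 * of censored samples satisfies z / (n + z) = p, i.e. z = n * p / (1 - p)
 * (the code additionally clamps this at h->total - n).  On iteration 0 the
 * data are censored at the observed peak, taken as roughly mu, so
 * p = exp(-exp(0)) = 1/e and z = n / (e - 1) = 0.58198 * n, which is the
 * magic constant above.  The helper name evd_censored_count is
 * hypothetical, for illustration only.
 */
static int
evd_censored_count(int n, double p_censored)
{
  return (int) ((double) n * p_censored / (1. - p_censored));
}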