void GaussianSetHistogram(Histogram * h, float mean, float sd) { int sc; int hsize, idx; int nbins; float delta; UnfitHistogram(h); h->fit_type = HISTFIT_GAUSSIAN; h->param[GAUSS_MEAN] = mean; h->param[GAUSS_SD] = sd; /* Calculate the expected values for the histogram. */ hsize = h->max - h->min + 1; h->expect = (float *) ckalloc(sizeof(float) * hsize); if( h->expect == NULL ) { fatal("Unable to allocate expect size in expected histogram..."); } for (idx = 0; idx < hsize; idx++) h->expect[idx] = 0.; /* Note: ideally we'd use the Gaussian distribution function * to find the histogram occupancy in the window sc..sc+1. * However, the distribution function is hard to calculate. * Instead, estimate the histogram by taking the density at sc+0.5. */ for (sc = h->min; sc <= h->max; sc++) { delta = ((float)sc + 0.5) - h->param[GAUSS_MEAN]; h->expect[sc - h->min] = (float) h->total * ((1. / (h->param[GAUSS_SD] * sqrt(2.*3.14159))) * (exp(-1.*delta*delta / (2. * h->param[GAUSS_SD] * h->param[GAUSS_SD])))); } /* Calculate the goodness-of-fit (within whole region) */ h->chisq = 0.; nbins = 0; for (sc = h->lowscore; sc <= h->highscore; sc++) if (h->expect[sc-h->min] >= 5. && h->histogram[sc-h->min] >= 5) { delta = (float) h->histogram[sc-h->min] - h->expect[sc-h->min]; h->chisq += delta * delta / h->expect[sc-h->min]; nbins++; } /* -1 d.f. for normalization */ if (nbins > 1) h->chip = (float) IncompleteGamma((double)(nbins-1)/2., (double) h->chisq/2.); else h->chip = 0.; }
void ExtremeValueSetHistogram(Histogram * h, float mu, float lambda, float lowbound, float highbound, float wonka, int ndegrees) { int sc; int hsize, idx; int nbins; float delta; UnfitHistogram(h); h->fit_type = HISTFIT_EVD; h->param[EVD_LAMBDA] = lambda; h->param[EVD_MU] = mu; h->param[EVD_WONKA] = wonka; hsize = h->max - h->min + 1; h->expect = (float *) ckalloc(sizeof(float) * hsize); if( h->expect == NULL ) { fatal("Cannot make memory for expect thing... "); } for (idx = 0; idx < hsize; idx++) h->expect[idx] = 0.; /* Calculate the expected values for the histogram. */ for (sc = h->min; sc <= h->max; sc++) h->expect[sc - h->min] = ExtremeValueE((float)(sc), h->param[EVD_MU], h->param[EVD_LAMBDA], h->total) - ExtremeValueE((float)(sc+1), h->param[EVD_MU], h->param[EVD_LAMBDA], h->total); /* Calculate the goodness-of-fit (within whole region) */ h->chisq = 0.; nbins = 0; for (sc = lowbound; sc <= highbound; sc++) if (h->expect[sc-h->min] >= 5. && h->histogram[sc-h->min] >= 5) { delta = (float) h->histogram[sc-h->min] - h->expect[sc-h->min]; h->chisq += delta * delta / h->expect[sc-h->min]; nbins++; } /* Since we fit the whole histogram, there is at least * one constraint on chi-square: the normalization to h->total. */ if (nbins > 1 + ndegrees) h->chip = (float) IncompleteGamma((double)(nbins-1-ndegrees)/2., (double) h->chisq/2.); else h->chip = 0.; }
/* Function: GaussianFitHistogram() * * Purpose: Fit a score histogram to a Gaussian distribution. * Set the parameters mean and sd in the histogram * structure, as well as a chi-squared test for * goodness of fit. * * Args: h - histogram to fit * high_hint - score cutoff; above this are `real' hits that aren't fit * * Return: 1 if fit is judged to be valid. * else 0 if fit is invalid (too few seqs.) */ int GaussianFitHistogram(struct histogram_s *h, float high_hint) { float sum; float sqsum; float delta; int sc; int nbins; int hsize, idx; /* Clear any previous fitting from the histogram. */ UnfitHistogram(h); /* Determine if we have enough hits to fit the histogram; * arbitrarily require 1000. */ if (h->total < 1000) { h->fit_type = HISTFIT_NONE; return 0; } /* Simplest algorithm for mean and sd; * no outlier detection yet (not even using high_hint) * * Magic 0.5 correction is because our histogram is for * scores between x and x+1; we estimate the expectation * (roughly) as x + 0.5. */ sum = sqsum = 0.; for (sc = h->lowscore; sc <= h->highscore; sc++) { delta = (float) sc + 0.5; sum += (float) h->histogram[sc-h->min] * delta; sqsum += (float) h->histogram[sc-h->min] * delta * delta; } h->fit_type = HISTFIT_GAUSSIAN; h->param[GAUSS_MEAN] = sum / (float) h->total; h->param[GAUSS_SD] = sqrt((sqsum - (sum*sum/(float)h->total)) / (float)(h->total-1)); /* Calculate the expected values for the histogram. * Note that the magic 0.5 correction appears again. * Calculating difference between distribution functions for Gaussian * would be correct but hard. */ hsize = h->max - h->min + 1; h->expect = (float *) MallocOrDie(sizeof(float) * hsize); for (idx = 0; idx < hsize; idx++) h->expect[idx] = 0.; for (sc = h->min; sc <= h->max; sc++) { delta = (float) sc + 0.5 - h->param[GAUSS_MEAN]; h->expect[sc - h->min] = (float) h->total * ((1. / (h->param[GAUSS_SD] * sqrt(2.*3.14159))) * (exp(-1.* delta*delta / (2. * h->param[GAUSS_SD] * h->param[GAUSS_SD])))); } /* Calculate the goodness-of-fit (within region that was fitted) */ h->chisq = 0.; nbins = 0; for (sc = h->lowscore; sc <= h->highscore; sc++) if (h->expect[sc-h->min] >= 5. && h->histogram[sc-h->min] >= 5) { delta = (float) h->histogram[sc-h->min] - h->expect[sc-h->min]; h->chisq += delta * delta / h->expect[sc-h->min]; nbins++; } /* -1 d.f. for normalization; -2 d.f. for two free parameters */ if (nbins > 3) h->chip = (float) IncompleteGamma((double)(nbins-3)/2., (double) h->chisq/2.); else h->chip = 0.; return 1; }
/* Function: ExtremeValueFitHistogram() * Date: SRE, Sat Nov 15 17:16:15 1997 [St. Louis] * * Purpose: Fit a score histogram to the extreme value * distribution. Set the parameters lambda * and mu in the histogram structure. Calculate * a chi-square test as a measure of goodness of fit. * * Methods: Uses a maximum likelihood method [Lawless82]. * Lower outliers are removed by censoring the data below the peak. * Upper outliers are removed iteratively using method * described by [Mott92]. * * Args: h - histogram to fit * censor - TRUE to censor data left of the peak * high_hint - score cutoff; above this are `real' hits that aren't fit * * Return: 1 if fit is judged to be valid. * else 0 if fit is invalid (too few seqs.) */ int ExtremeValueFitHistogram(struct histogram_s *h, int censor, float high_hint) { float *x; /* array of EVD samples to fit */ int *y; /* histogram counts */ int n; /* number of observed samples */ int z; /* number of censored samples */ int hsize; /* size of histogram */ float lambda, mu; /* new estimates of lambda, mu */ int sc; /* loop index for score */ int lowbound; /* lower bound of fitted region*/ int highbound; /* upper bound of fitted region*/ int new_highbound; int iteration; /* Determine lower bound on fitted region; * if we're censoring the data, choose the peak of the histogram. * if we're not, then we take the whole histogram. */ lowbound = h->lowscore; if (censor) { int max = -1; for (sc = h->lowscore; sc <= h->highscore; sc++) if (h->histogram[sc - h->min] > max) { max = h->histogram[sc - h->min]; lowbound = sc; } } /* Determine initial upper bound on fitted region. */ highbound = MIN(high_hint, h->highscore); /* Now, iteratively converge on our lambda, mu: */ for (iteration = 0; iteration < 100; iteration++) { /* Construct x, y vectors. */ x = NULL; y = NULL; hsize = highbound - lowbound + 1; if (hsize < 5) goto FITFAILED; /* require at least 5 bins or we don't fit */ x = MallocOrDie(sizeof(float) * hsize); y = MallocOrDie(sizeof(int) * hsize); n = 0; for (sc = lowbound; sc <= highbound; sc++) { x[sc-lowbound] = (float) sc + 0.5; /* crude, but tests OK */ y[sc-lowbound] = h->histogram[sc - h->min]; n += h->histogram[sc - h->min]; } if (n < 100) goto FITFAILED; /* require fitting to at least 100 points */ /* If we're censoring, estimate z, the number of censored guys * left of the bound. Our initial estimate is crudely that we're * missing e^-1 of the total distribution (which would be exact * if we censored exactly at mu; but we censored at the observed peak). * Subsequent estimates are more exact based on our current estimate of mu. */ if (censor) { if (iteration == 0) z = MIN(h->total-n, (int) (0.58198 * (float) n)); else { double psx; psx = EVDDistribution((float) lowbound, mu, lambda); z = MIN(h->total-n, (int) ((double) n * psx / (1. - psx))); } } /* Do an ML fit */ if (censor) { if (! EVDCensoredFit(x, y, hsize, z, (float) lowbound, &mu, &lambda)) goto FITFAILED; } else if (! EVDMaxLikelyFit(x, y, hsize, &mu, &lambda)) goto FITFAILED; /* Find the Eval = 1 point as a new highbound; * the total number of samples estimated to "belong" to the EVD is n+z */ new_highbound = (int) (mu - (log (-1. * log((double) (n+z-1) / (double)(n+z))) / lambda)); free(x); free(y); if (new_highbound >= highbound) break; highbound = new_highbound; } /* Set the histogram parameters; * - we fit from lowbound to highbound; thus we lose 2 degrees of freedom * for fitting mu, lambda, but we get 1 back because we're unnormalized * in this interval, hence we pass 2-1 = 1 as ndegrees. */ ExtremeValueSetHistogram(h, mu, lambda, lowbound, highbound, 1); return 1; FITFAILED: UnfitHistogram(h); if (x != NULL) free(x); if (y != NULL) free(y); return 0; }
int ExtremeValueFitHistogram(Histogram * h, int censor, float high_hint) { float *x; /* array of EVD samples to fit */ int *y; /* histogram counts */ int n; /* number of observed samples */ int z; /* number of censored samples */ int hsize; /* size of histogram */ float lambda, mu; /* new estimates of lambda, mu */ int sc; /* loop index for score */ int lowbound; /* lower bound of fitted region*/ int highbound; /* upper bound of fitted region*/ int new_highbound; int iteration; /* Determine lower bound on fitted region; * if we're censoring the data, choose the peak of the histogram. * if we're not, then we take the whole histogram. */ lowbound = h->lowscore; if (censor) { int max = -1; for (sc = h->lowscore; sc <= h->highscore; sc++) if (h->histogram[sc - h->min] > max) { max = h->histogram[sc - h->min]; lowbound = sc; } } /* Determine initial upper bound on fitted region. */ highbound = MIN(high_hint, h->highscore); /* Now, iteratively converge on our lambda, mu: */ for (iteration = 0; iteration < 100; iteration++) { /* Construct x, y vectors. */ x = NULL; y = NULL; hsize = highbound - lowbound + 1; if (hsize < 5) { warn("On iteration %d, got %d bins, which is not fitable",iteration,hsize); goto FITFAILED; /* require at least 5 bins or we don't fit */ } x = ckalloc(sizeof(float) * hsize); y = ckalloc(sizeof(int) * hsize); if( x == NULL || y == NULL ) { warn("Out of temporary memory for evd fitting... exiting with error, though I'd worry about this"); return 0; } n = 0; for (sc = lowbound; sc <= highbound; sc++) { x[sc-lowbound] = (float) sc + 0.5; /* crude, but tests OK */ y[sc-lowbound] = h->histogram[sc - h->min]; n += h->histogram[sc - h->min]; } if (n < 100) { warn("On iteration %d, got only %d points, which is not fitable",iteration,n); goto FITFAILED; /* require fitting to at least 100 points */ } /* If we're censoring, estimate z, the number of censored guys * left of the bound. Our initial estimate is crudely that we're * missing e^-1 of the total distribution (which would be exact * if we censored exactly at mu; but we censored at the observed peak). * Subsequent estimates are more exact based on our current estimate of mu. */ if (censor) { if (iteration == 0) z = MIN(h->total-n, (int) (0.58198 * (float) n)); else { double psx; psx = EVDDistribution((float) lowbound, mu, lambda); z = MIN(h->total-n, (int) ((double) n * psx / (1. - psx))); } } /* Do an ML fit */ if (censor) { if (! EVDCensoredFit(x, y, hsize, z, (float) lowbound, &mu, &lambda)) { warn("On iteration %d, unable to make maxlikehood evd fit with censor",iteration); goto FITFAILED; } } else { if (! EVDMaxLikelyFit(x, y, hsize, &mu, &lambda)) { warn("On iteration %d, unable to make maxlikehood evd fit without censor",iteration); goto FITFAILED; } } /* Find the Eval = 1 point as a new highbound; * the total number of samples estimated to "belong" to the EVD is n+z */ new_highbound = (int) (mu - (log (-1. * log((double) (n+z-1) / (double)(n+z))) / lambda)); free(x); free(y); if (new_highbound >= highbound) break; highbound = new_highbound; } /* Set the histogram parameters; * - the wonka factor is n+z / h->total : e.g. that's the fraction of the * hits that we expect to match the EVD, others are generally lower * - we fit from lowbound to highbound; thus we lose 2 degrees of freedom * for fitting mu, lambda, but we get 1 back because we're unnormalized * in this interval, hence we pass 2-1 = 1 as ndegrees. * * Mon Jan 19 06:18:14 1998: wonka = 1.0, temporarily disabled. */ ExtremeValueSetHistogram(h, mu, lambda, lowbound, highbound, 1.0, 1); return 1; FITFAILED: UnfitHistogram(h); if (x != NULL) free(x); if (y != NULL) free(y); return 0; }