Example #1
0
/* Function:  esl_hxp_FitGuessBinned()
 *
 * Purpose:   Given a histogram <g> of binned observations, fill the
 *            hyperexponential <h> with a very crude parameter guess.
 *            The result is only intended as a starting point for a
 *            later optimization.
 *
 *            $\mu$ is set to <g->phi> for a tail fit, to the lower
 *            bound of bin <g->imin> for rounded data, and to
 *            <g->xmin> otherwise.
 *
 *            Bins <g->cmin>..<g->imax> are walked from the top down
 *            while a running observation count and a running sum of
 *            (count * bin lower bound) are kept; lower bounds below
 *            <g->xmin> are raised to <g->xmin>. With <nb> bins and
 *            <K> components, each time the walk reaches bin
 *            <cmin + (k*nb)/K> (integer division, <k> counting down
 *            from <K-1>), $\lambda_k$ is set to
 *            1 / (running mean - that bin's lower bound).
 *
 *            Unless <h->fixmix> is set, every mixture coefficient
 *            $q_k$ is then set to $1/K$; if <h->fixmix> is set the
 *            coefficients are left untouched.
 *
 * Returns:   <eslOK>.
 *
 * Note:      The $\lambda_k$ expression is not guarded against a zero
 *            running count or a zero denominator.
 */
int
esl_hxp_FitGuessBinned(ESL_HISTOGRAM *g, ESL_HYPEREXP *h)
{
  double weighted = 0;          /* running sum of count * lower bound   */
  double lower;                 /* (clamped) lower bound of current bin */
  int    count    = 0;          /* running number of observations       */
  int    nbins;                 /* number of bins being walked          */
  int    bin;
  int    comp;                  /* next component awaiting a lambda     */

  /* Choose the offset mu. */
  h->mu = g->is_tailfit ? g->phi
        : (g->is_rounded ? esl_histogram_Bin2LBound(g, g->imin)
                         : g->xmin);

  /* Walk the bins from high to low, handing out one lambda per chunk. */
  nbins = g->imax - g->cmin + 1;
  comp  = h->K - 1;
  for (bin = g->imax; bin >= g->cmin; bin--)
    {
      lower = esl_histogram_Bin2LBound(g, bin);
      if (lower < g->xmin) lower = g->xmin;

      count    += g->obs[bin];
      weighted += g->obs[bin] * lower;

      if (bin == g->cmin + (comp * nbins) / h->K)
        {
          h->lambda[comp] = 1 / ((weighted / (double) count) - lower);
          comp--;
        }
    }

  /* Caller-fixed mixture coefficients are respected as they are. */
  if (h->fixmix) return eslOK;

  for (comp = 0; comp < h->K; comp++)
    h->q[comp] = 1 / (double) h->K;

  return eslOK;
}
/* Function:  esl_exp_FitCompleteBinned()
* Incept:    SRE, Sun Aug 21 13:07:22 2005 [St. Louis]
*
* Purpose:   Fit a complete exponential distribution to the binned
*            data in histogram <g>, where each bin i holds observed
*            samples x between a lower bound l and an upper bound u
*            (that is, $l < x \leq u$). The maximum likelihood
*            parameters $\mu,\lambda$ are returned in <*ret_mu> and
*            <*ret_lambda>.
*
*            If the binned data in <g> were set to focus on a tail
*            by virtual censoring, the "complete" exponential is
*            fitted to that tail. The caller then also needs to
*            remember what fraction of the probability mass was in
*            the tail.
*
*            The ML estimate for $\mu$ is the smallest observed
*            sample. For complete data, <*ret_mu> is set to
*            <g->xmin>, except for a "rounded" complete dataset,
*            where it is set to the lower bound of bin <g->imin>.
*            For virtually censored data (tails), <*ret_mu> is set
*            to the cutoff threshold <g->phi>, which the histogram
*            object is expected to have placed at the lower bound of
*            a bin.
*
*            The ML estimate for $\lambda$ has an analytical
*            solution, so no iterative optimization is needed:
*            with bin width $\delta$,
*            $\lambda = \frac{1}{\delta} (\log \sum_i n_i (b_i - \mu)
*                                       - \log \sum_i n_i (a_i - \mu))$
*            where $a_i, b_i$ are the lower and upper bounds of bin i
*            and $n_i$ its count.
*
*            If all the data are in one bin, the ML estimate of
*            $\lambda$ will be $\infty$. This is mathematically
*            correct, but is probably a situation the caller wants to
*            avoid, perhaps by choosing smaller bins.
*
*            This function currently cannot fit an exponential tail
*            to truly censored, binned data, because it assumes that
*            all bins have equal width, but in true censored data the
*            lower cutoff <phi> may fall anywhere in the first bin.
*
* Returns:   <eslOK> on success.
*
* Throws:    <eslEINVAL> if dataset is true-censored.
*/
int
esl_exp_FitCompleteBinned(ESL_HISTOGRAM *g, double *ret_mu, double *ret_lambda)
{
    double offset = 0.;     /* mu; stays 0 if dataset_is matches no case below */
    double sum_lo = 0.;     /* sum of count * (lower bound - mu)               */
    double sum_hi = 0.;     /* sum of count * (upper bound - mu)               */
    double width;           /* common bin width                                */
    double lo, hi;
    int    bin;

    /* True-censored data cannot be handled here. */
    if (g->dataset_is == ESL_HISTOGRAM::TRUE_CENSORED)
    {
        ESL_EXCEPTION(eslEINVAL, "can't fit true censored dataset");
    }

    if (g->dataset_is == ESL_HISTOGRAM::VIRTUAL_CENSORED)
    {
        offset = g->phi;    /* fitting to the tail */
    }
    else if (g->dataset_is == ESL_HISTOGRAM::COMPLETE)
    {
        if (g->is_rounded) offset = esl_histogram_Bin2LBound(g, g->imin);
        else               offset = g->xmin;
    }

    width = g->w;
    for (bin = g->cmin; bin <= g->imax; bin++)
    {
        if (g->obs[bin] != 0)   /* only occupied bins contribute */
        {
            lo = esl_histogram_Bin2LBound(g, bin);
            hi = esl_histogram_Bin2UBound(g, bin);
            sum_lo += g->obs[bin] * (lo - offset);
            sum_hi += g->obs[bin] * (hi - offset);
        }
    }

    *ret_mu     = offset;
    *ret_lambda = 1/width * (log(sum_hi) - log(sum_lo));
    return eslOK;
}
Example #3
0
/* Function:  esl_sxp_FitCompleteBinned()
 *
 * Purpose:   Given a histogram <g> with binned observations, where each
 *            bin i holds some number of observed samples x with values
 *            from lower bound l to upper bound u (that is,
 *            $l < x \leq u$), fit parameters mu, lambda, tau by
 *            conjugate gradient descent on the binned negative log
 *            likelihood.
 *
 *            mu is fixed up front: <g->phi> for a tail fit, the lower
 *            bound of bin <g->imin> for rounded data, <g->xmin>
 *            otherwise. The optimizer only moves log(lambda) and
 *            log(tau). lambda starts at 1/(mean - mu), where the mean
 *            is estimated from bin midpoints; tau starts at an
 *            arbitrary 0.9.
 *
 * Args:      g          - histogram holding the binned data
 *            ret_mu     - RETURN: the fixed mu that was used
 *            ret_lambda - RETURN: fitted lambda
 *            ret_tau    - RETURN: fitted tau
 *
 * Returns:   the status reported by esl_min_ConjugateGradientDescent().
 *            The three results are written whatever that status is.
 */
int
esl_sxp_FitCompleteBinned(ESL_HISTOGRAM *g,
                          double *ret_mu, double *ret_lambda, double *ret_tau)
{
  struct sxp_binned_data data;
  double params[2];             /* log(lambda), log(tau)                 */
  double steps[2];              /* step vector handed to the optimizer   */
  double scratch[8];            /* optimizer workspace                   */
  double tol = 1e-6;
  double fx;                    /* objective value reported by optimizer */
  double mu, lambda, tau;
  double midpoint;
  double mean = 0.;
  int    bin;
  int    status;

  /* The offset mu is fixed, not optimized. */
  if (g->is_tailfit)
    mu = g->phi;                /* all x > mu in this case */
  else if (g->is_rounded)
    mu = esl_histogram_Bin2LBound(g, g->imin);
  else
    mu = g->xmin;

  /* Initial lambda from an exponential-style guess: 1 / (mean - mu),
   * with each bin represented by its midpoint.
   */
  for (bin = g->cmin; bin <= g->imax; bin++)
    {
      midpoint = esl_histogram_Bin2LBound(g, bin) + 0.5*g->w;
      mean    += (double) g->obs[bin] * midpoint;
    }
  mean   = mean / g->No;
  lambda = 1 / (mean - mu);

  /* Arbitrary initial tau. */
  tau = 0.9;

  /* Package the problem: fixed data, log-space parameters, unit steps. */
  data.g    = g;
  data.mu   = mu;
  params[0] = log(lambda);
  params[1] = log(tau);
  steps[0]  = 1.0;
  steps[1]  = 1.0;

  status = esl_min_ConjugateGradientDescent(params, steps, 2,
                                            &sxp_complete_binned_func,
                                            NULL,
                                            (void *) (&data), tol, scratch, &fx);

  /* Map the parameters back out of log space. */
  *ret_mu     = mu;
  *ret_lambda = exp(params[0]);
  *ret_tau    = exp(params[1]);
  return status;
}
Example #4
0
/* hyperexp_complete_binned_func():
 * Objective function for fitting a hyperexponential to binned data.
 * Unpacks the parameter vector <p> (length <np>) into the
 * hyperexponential carried in <dptr>, then returns the negative log
 * likelihood of the occupied bins <cmin>..<imax> of the histogram
 * carried in <dptr>.
 *
 * For a bin with (clamped) lower bound a and width delta, component k
 * contributes
 *    log q_k - lambda_k (a - mu) + log(1 - exp(-delta lambda_k)),
 * and the per-bin log probability is the log-sum over components,
 * weighted by the bin's count. <h->wrk> is used as scratch space.
 */
static double 
hyperexp_complete_binned_func(double *p, int np, void *dptr)
{
  struct hyperexp_binned_data *ctx  = (struct hyperexp_binned_data *) dptr;
  ESL_HISTOGRAM               *hist = ctx->g;
  ESL_HYPEREXP                *hxp  = ctx->h;
  double loglik = 0.;
  double width;                 /* common bin width             */
  double lower;                 /* lower bound of current bin   */
  double term;
  int    bin, comp;

  hyperexp_unpack_paramvector(p, np, hxp);
  width = hist->w;

  for (bin = hist->cmin; bin <= hist->imax; bin++)
    {
      if (hist->obs[bin] != 0)  /* unoccupied bins add nothing */
        {
          /* No sample lies below mu, so the left edge is clamped there. */
          lower = esl_histogram_Bin2LBound(hist, bin);
          if (lower < hxp->mu) lower = hxp->mu;

          for (comp = 0; comp < hxp->K; comp++)
            {
              term = log(hxp->q[comp]) - hxp->lambda[comp]*(lower-hxp->mu);

              /* For tiny delta*lambda, 1 - exp(-x) is replaced by x. */
              term += (width * hxp->lambda[comp] < eslSMALLX1)
                        ? log(width * hxp->lambda[comp])
                        : log(1 - exp(-width * hxp->lambda[comp]));

              hxp->wrk[comp] = term;
            }

          loglik += hist->obs[bin] * esl_vec_DLogSum(hxp->wrk, hxp->K);
        }
    }

  return -loglik;
}
Example #5
0
/* sxp_complete_binned_func():
 * Objective function for fitting a stretched exponential to binned
 * data. <p[0]> and <p[1]> are log(lambda) and log(tau); <dptr> carries
 * the histogram and the fixed offset mu. Returns the negative log
 * likelihood of the occupied bins <cmin>..<imax>, where each bin's
 * probability is the difference of esl_sxp_cdf() at its upper and
 * (clamped) lower bounds.
 *
 * Returns <eslINFINITY> if any occupied bin gets a probability that is
 * zero or negative. A negative value can only come from numerical
 * roundoff in the CDF difference; taking its log would give NaN and
 * corrupt the objective, so it is rejected the same way as zero.
 */
static double 
sxp_complete_binned_func(double *p, int np, void *dptr)
{
  struct sxp_binned_data *data = (struct sxp_binned_data *) dptr;
  ESL_HISTOGRAM          *g    = data->g;
  double logL = 0.;
  double ai, bi;		/* lower, upper bounds on bin */
  double lambda, tau;
  int    i;
  double tmp;			/* probability mass in bin i  */

  /* The optimizer works in log space to keep both parameters positive. */
  lambda = exp(p[0]);
  tau    = exp(p[1]);  

  ESL_DASSERT1(( ! isnan(lambda) ));
  ESL_DASSERT1(( ! isnan(tau) ));
  
  for (i = g->cmin; i <= g->imax; i++) /* for each occupied bin */
    {
      if (g->obs[i] == 0) continue;
      
      ai = esl_histogram_Bin2LBound(g, i);
      bi = esl_histogram_Bin2UBound(g, i);
      if (ai < data->mu) ai = data->mu; /* careful at leftmost bound */

      tmp = esl_sxp_cdf(bi, data->mu, lambda, tau) -
            esl_sxp_cdf(ai, data->mu, lambda, tau);

      /* Zero mass means -log is infinite; a roundoff-negative mass
       * would make log() return NaN. Treat both as an infinite NLL.
       */
      if (tmp <= 0.) return eslINFINITY;
      logL += g->obs[i] * log(tmp);
    }
  return -logL;			/* minimizing NLL */
}
Example #6
0
/* hyperexp_complete_binned_gradient():
 * Gradient of the binned hyperexponential negative log likelihood.
 * Unpacks <p> (length <np>) into the hyperexponential carried in
 * <dptr>, zeroes <dp>, and accumulates the gradient over the occupied
 * bins <cmin>..<imax> of the histogram carried in <dptr>.
 *
 * Layout of <dp>, rebuilt for every bin: if <h->fixmix> is not set,
 * the first K-1 slots take the mixture terms for components 1..K-1;
 * the following slots take one lambda term for each component whose
 * <h->fixlambda[k]> is not set, in increasing k.
 *
 * <h->wrk> is used as scratch space.
 */
static void
hyperexp_complete_binned_gradient(double *p, int np, void *dptr, double *dp)
{
  struct hyperexp_binned_data *ctx  = (struct hyperexp_binned_data *) dptr;
  ESL_HISTOGRAM               *hist = ctx->g;
  ESL_HYPEREXP                *hxp  = ctx->h;
  double width;                 /* common bin width                      */
  double lower;                 /* lower bound of current bin            */
  double logz;                  /* log of the bin's total probability    */
  double wgt;
  int    bin, comp;
  int    slot;                  /* next position to update in dp         */

  hyperexp_unpack_paramvector(p, np, hxp);
  esl_vec_DSet(dp, np, 0.);
  width = hist->w;

  for (bin = hist->cmin; bin <= hist->imax; bin++)
    {
      if (hist->obs[bin] == 0) continue;   /* only occupied bins */

      /* No sample lies below mu, so the left edge is clamped there. */
      lower = esl_histogram_Bin2LBound(hist, bin);
      if (lower < hxp->mu) lower = hxp->mu;

      /* Per-component log terms, log(q_k alpha_k(a_i)), into wrk[]. */
      for (comp = 0; comp < hxp->K; comp++)
        {
          hxp->wrk[comp] = log(hxp->q[comp]) - hxp->lambda[comp]*(lower-hxp->mu);
          if (width * hxp->lambda[comp] < eslSMALLX1)
            hxp->wrk[comp] += log(width * hxp->lambda[comp]);
          else
            hxp->wrk[comp] += log(1 - exp(-width * hxp->lambda[comp]));
        }
      logz = esl_vec_DLogSum(hxp->wrk, hxp->K);   /* log \sum_k q_k alpha_k(a_i) */

      /* Mixture coefficient terms, components 1..K-1. */
      slot = 0;
      if (! hxp->fixmix)
        {
          for (comp = 1; comp < hxp->K; comp++)
            {
              dp[slot] -= hist->obs[bin] * (exp(hxp->wrk[comp] - logz) - hxp->q[comp]);
              slot++;
            }
        }

      /* Lambda terms, one per component that is free to move. */
      for (comp = 0; comp < hxp->K; comp++)
        {
          if (hxp->fixlambda[comp]) continue;

          wgt  = log(hxp->q[comp]) + log(hxp->lambda[comp]) - hxp->lambda[comp]*(lower-hxp->mu);
          wgt  = exp(wgt - logz);
          wgt *= (lower + width - hxp->mu) * exp(-width * hxp->lambda[comp]) - (lower - hxp->mu);
          dp[slot] -= hist->obs[bin] * wgt;
          slot++;
        }
    }
}
Example #7
0
/* wei_binned_func():
 * Objective function for fitting a Weibull distribution to binned
 * data, in the API of the conjugate gradient descent optimizer in
 * esl_minimizer. <p[0]> and <p[1]> are log(lambda) and log(tau);
 * <dptr> carries the histogram and the fixed offset mu.
 *
 * Returns the negative log likelihood of the occupied bins
 * <cmin>..<imax>, or <eslINFINITY> if any occupied bin is assigned a
 * probability <= 0.
 */
static double
wei_binned_func(double *p, int nparam, void *dptr)
{
  struct wei_binned_data *ctx    = (struct wei_binned_data *) dptr;
  ESL_HISTOGRAM          *hist   = ctx->h;
  double                  lambda = exp(p[0]);   /* optimizer works on log(lambda) */
  double                  tau    = exp(p[1]);   /* ... and on log(tau)            */
  double                  loglik = 0.;
  double                  lower, upper;         /* bounds of current bin          */
  double                  mass;                 /* probability of current bin     */
  int                     bin;

  for (bin = hist->cmin; bin <= hist->imax; bin++)
    {
      if (hist->obs[bin] == 0) continue;        /* only occupied bins */

      lower = esl_histogram_Bin2LBound(hist, bin);
      upper = esl_histogram_Bin2UBound(hist, bin);
      if (lower < ctx->mu) lower = ctx->mu;     /* nothing lies below mu */

      mass = esl_wei_cdf(upper, ctx->mu, lambda, tau) -
             esl_wei_cdf(lower, ctx->mu, lambda, tau);

      /* When the cdf is ~1.0, roundoff can push the difference a hair
       * below zero. That much is tolerated (and mapped to an infinite
       * NLL just below); anything worse trips the debugging assertion.
       */
      ESL_DASSERT1( (mass + 1e-7 > 0.));
      if (mass <= 0.) return eslINFINITY;

      loglik += hist->obs[bin] * log(mass);
    }

  return -loglik;               /* the optimizer minimizes, so hand back NLL */
}
Example #8
0
/* Function:  esl_wei_FitCompleteBinned()
 *
 * Purpose:   Given a histogram <h> with binned observations, where each
 *            bin i holds some number of observed samples x with values
 *            from lower bound l to upper bound u (that is,
 *            $l < x \leq u$), fit a Weibull distribution by conjugate
 *            gradient descent on the binned negative log likelihood.
 *
 *            The offset mu is not optimized. It is taken from the
 *            histogram: <h->phi> for a tail fit, the lower bound of
 *            bin <h->imin> for rounded data, <h->xmin> otherwise.
 *            lambda starts at 1/(mean - mu), with the mean estimated
 *            from bin midpoints; tau starts at an arbitrary 0.9.
 *
 * Args:      h          - histogram holding the binned data
 *            ret_mu     - RETURN: the fixed offset mu that was used
 *            ret_lambda - RETURN: fitted lambda
 *            ret_tau    - RETURN: fitted tau
 *
 * Returns:   the status reported by esl_min_ConjugateGradientDescent()
 *            (presumably <eslOK> on success and <eslENOHALT> if the
 *            fit doesn't converge -- confirm against the minimizer).
 *            The three results are written whatever that status is.
 *
 * Xref:      STL9/136-137
 */
int
esl_wei_FitCompleteBinned(ESL_HISTOGRAM *h, double *ret_mu,
                          double *ret_lambda, double *ret_tau)
{
  struct wei_binned_data data;
  double params[2];             /* parameter vector: log(lambda), log(tau) */
  double steps[2];              /* max initial step size vector            */
  double scratch[8];            /* 4 tmp vectors of length 2               */
  double tol = 1e-6;            /* convergence criterion for CG            */
  double fx;                    /* f(x) at minimum; currently unused       */
  double mu, lambda, tau;       /* initial param guesses                   */
  double midpoint;
  double mean = 0.;
  int    bin;
  int    status;

  /* The offset mu is fixed, not optimized. */
  if (h->is_tailfit)
    mu = h->phi;                /* all x > mu in this case */
  else if (h->is_rounded)
    mu = esl_histogram_Bin2LBound(h, h->imin);
  else
    mu = h->xmin;

  /* Initial lambda from an exponential-style guess: 1 / (mean - mu),
   * with each bin represented by its midpoint.
   */
  for (bin = h->cmin; bin <= h->imax; bin++)
    {
      midpoint = esl_histogram_Bin2LBound(h, bin) + 0.5*h->w;
      mean    += (double) h->obs[bin] * midpoint;
    }
  mean   = mean / h->No;
  lambda = 1 / (mean - mu);

  /* Arbitrary initial tau. */
  tau = 0.9;

  /* Data handed through to the objective function. */
  data.h  = h;
  data.mu = mu;

  /* Change of variables: lambda > 0 and tau > 0, so the optimizer
   * works on w = log(lambda) and log(tau) instead.
   */
  params[0] = log(lambda);
  params[1] = log(tau);
  steps[0]  = 1.0;
  steps[1]  = 1.0;

  status = esl_min_ConjugateGradientDescent(params, steps, 2,
                                            &wei_binned_func, NULL,
                                            (void *)(&data),
                                            tol, scratch, &fx);

  /* Undo the change of variables. */
  *ret_mu     = mu;
  *ret_lambda = exp(params[0]);
  *ret_tau    = exp(params[1]);
  return status;
}