/* Function:  esl_hxp_FitGuessBinned()
 *
 * Purpose:   Given a histogram <g> of binned observations, produce a
 *            very crude initial parameter guess in <h>. The result is
 *            suitable only as a starting point for further
 *            optimization.
 *
 *            $\mu$ is set to <g->phi> for a tail fit, to the lower
 *            bound of bin <g->imin> for rounded data, and to
 *            <g->xmin> otherwise.
 *
 *            Bins <g->cmin..g->imax> are walked from the top down,
 *            accumulating the observation count and the
 *            count-weighted sum of bin lower bounds (each clamped to
 *            be no smaller than <g->xmin>). When the walk reaches the
 *            boundary bin <cmin + (k*nb)/K> of component $k$,
 *            $\lambda_k$ is set to 1 / (running mean - lower bound of
 *            that bin), and $k$ is decremented.
 *
 *            Unless <h->fixmix> is set, every mixture coefficient
 *            $q_k$ is set to the uniform value $1/K$.
 *
 * Note:      At most one $\lambda_k$ is assigned per visited bin, so
 *            with fewer bins than components some $\lambda_k$ are
 *            left untouched. If all observations accumulated so far
 *            share the boundary bin's lower bound, the denominator
 *            is zero.
 *
 * Returns:   <eslOK>.
 */
int
esl_hxp_FitGuessBinned(ESL_HISTOGRAM *g, ESL_HYPEREXP *h)
{
  double weighted = 0;   /* running sum of obs[b] * (clamped) lower bound */
  int    seen     = 0;   /* running count of observations                 */
  int    nbins;          /* number of bins in cmin..imax                  */
  int    comp;           /* component whose boundary we look for next     */
  int    b;              /* current bin index                             */
  double lo;             /* (clamped) lower bound of bin b                */

  /* Choose the fixed offset mu. */
  if (g->is_tailfit) {
    h->mu = g->phi;                               /* all x > mu in this case */
  } else if (g->is_rounded) {
    h->mu = esl_histogram_Bin2LBound(g, g->imin);
  } else {
    h->mu = g->xmin;
  }

  /* Walk bins from the highest down, emitting one lambda per boundary. */
  nbins = g->imax - g->cmin + 1;
  comp  = h->K - 1;
  b     = g->imax;
  while (b >= g->cmin)
    {
      lo = esl_histogram_Bin2LBound(g, b);
      if (lo < g->xmin) lo = g->xmin;

      seen     += g->obs[b];
      weighted += g->obs[b] * lo;

      if (b == g->cmin + (comp * nbins) / h->K)
        {
          h->lambda[comp] = 1 / ((weighted / (double) seen) - lo);
          comp--;
        }
      b--;
    }

  /* Uniform mixture coefficients, unless the caller fixed them. */
  if (! h->fixmix)
    {
      for (comp = 0; comp < h->K; comp++)
        h->q[comp] = 1 / (double) h->K;
    }
  return eslOK;
}
/* Function:  esl_exp_FitCompleteBinned()
 * Incept:    SRE, Sun Aug 21 13:07:22 2005 [St. Louis]
 *
 * Purpose:   Fit a complete exponential distribution to the binned
 *            data in histogram <g>, where bin i holds the samples x
 *            with $l < x \leq u$ for its lower bound l and upper
 *            bound u. The maximum likelihood parameters
 *            $\mu,\lambda$ are returned in <*ret_mu>, <*ret_lambda>.
 *
 *            If the binned data in <g> were set to focus on a tail by
 *            virtual censoring, the "complete" exponential is fitted
 *            to that tail. The caller then also needs to remember
 *            what fraction of the probability mass was in the tail.
 *
 *            Choice of $\mu$:
 *              - complete data:          <g->xmin>, the smallest
 *                                        observed sample;
 *              - rounded complete data:  lower bound of bin <g->imin>;
 *              - virtually censored:     the cutoff <g->phi>, which
 *                                        the histogram object is said
 *                                        to place on a bin lower bound.
 *
 *            $\lambda$ has the closed form
 *              (1/w) * ( log(sum_i n_i (b_i - mu))
 *                      - log(sum_i n_i (a_i - mu)) )
 *            over occupied bins with lower/upper bounds a_i, b_i,
 *            counts n_i, and common bin width w, so this routine is
 *            fast.
 *
 *            As the original notes state, if all the data are in one
 *            bin the estimate of $\lambda$ is $\infty$; callers will
 *            probably want to avoid that, perhaps by choosing smaller
 *            bins.
 *
 *            Truly censored binned data cannot be fitted here: the
 *            formula assumes all bins have equal width, but in true
 *            censored data the cutoff <phi> may fall anywhere in the
 *            first bin.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEINVAL> if the dataset is true-censored.
 */
int
esl_exp_FitCompleteBinned(ESL_HISTOGRAM *g, double *ret_mu, double *ret_lambda)
{
  double offset = 0.;    /* mu                                       */
  double lo_sum = 0.;    /* sum_i n_i (a_i - mu)                     */
  double hi_sum = 0.;    /* sum_i n_i (b_i - mu)                     */
  double width;          /* common bin width                         */
  double lo, hi;         /* bounds of the current bin                */
  int    b;

  /* Reject the one dataset type the closed form cannot handle. */
  if (g->dataset_is == ESL_HISTOGRAM::TRUE_CENSORED)
    {
      ESL_EXCEPTION(eslEINVAL, "can't fit true censored dataset");
    }

  if (g->dataset_is == ESL_HISTOGRAM::VIRTUAL_CENSORED)
    {                           /* i.e., we'll fit to the tail */
      offset = g->phi;
    }
  else if (g->dataset_is == ESL_HISTOGRAM::COMPLETE)
    {
      offset = g->is_rounded ? esl_histogram_Bin2LBound(g, g->imin) : g->xmin;
    }

  width = g->w;
  for (b = g->cmin; b <= g->imax; b++)
    {
      if (g->obs[b] != 0)       /* only occupied bins contribute */
        {
          lo = esl_histogram_Bin2LBound(g, b);
          hi = esl_histogram_Bin2UBound(g, b);
          lo_sum += g->obs[b] * (lo - offset);
          hi_sum += g->obs[b] * (hi - offset);
        }
    }

  *ret_mu     = offset;
  *ret_lambda = 1/width * (log(hi_sum) - log(lo_sum));
  return eslOK;
}
/* Function: esl_sxp_FitCompleteBinned() * * Purpose: Given a histogram <g> with binned observations, where each * bin i holds some number of observed samples x with values from * lower bound l to upper bound u (that is, $l < x \leq u$); * find maximum likelihood parameters mu, lambda, tau by conjugate * gradient descent optimization. */ int esl_sxp_FitCompleteBinned(ESL_HISTOGRAM *g, double *ret_mu, double *ret_lambda, double *ret_tau) { struct sxp_binned_data data; double p[2], u[2], wrk[8]; double mu, tau, lambda; double tol = 1e-6; double fx; int status; double ai, mean; int i; /* Set the fixed mu. * Make a good initial guess of lambda, based on exponential fit. * Choose an arbitrary tau. */ if (g->is_tailfit) mu = g->phi; /* all x > mu in this case */ else if (g->is_rounded) mu = esl_histogram_Bin2LBound(g, g->imin); else mu = g->xmin; mean = 0.; for (i = g->cmin; i <= g->imax; i++) { ai = esl_histogram_Bin2LBound(g, i); ai += 0.5*g->w; /* midpoint in bin */ mean += (double)g->obs[i] * ai; } mean /= g->No; lambda = 1 / (mean - mu); tau = 0.9; /* load data structure, param vector, and step vector */ data.g = g; data.mu = mu; p[0] = log(lambda); p[1] = log(tau); u[0] = 1.0; u[1] = 1.0; /* hand it off */ status = esl_min_ConjugateGradientDescent(p, u, 2, &sxp_complete_binned_func, NULL, (void *) (&data), tol, wrk, &fx); *ret_mu = mu; *ret_lambda = exp(p[0]); *ret_tau = exp(p[1]); return status; }
static double hyperexp_complete_binned_func(double *p, int np, void *dptr) { struct hyperexp_binned_data *data = (struct hyperexp_binned_data *) dptr; ESL_HISTOGRAM *g = data->g; ESL_HYPEREXP *h = data->h; double logL = 0.; double ai, delta; int i,k; hyperexp_unpack_paramvector(p, np, h); delta = g->w; /* counting over occupied, uncensored histogram bins */ for (i = g->cmin; i <= g->imax; i++) { if (g->obs[i] == 0) continue; /* skip unoccupied ones */ ai = esl_histogram_Bin2LBound(g, i); if (ai < h->mu) ai = h->mu; /* careful about the left boundary: no x < h->mu */ for (k = 0; k < h->K; k++) { h->wrk[k] = log(h->q[k]) - h->lambda[k]*(ai-h->mu); if (delta * h->lambda[k] < eslSMALLX1) h->wrk[k] += log(delta * h->lambda[k]); else h->wrk[k] += log(1 - exp(-delta * h->lambda[k])); } logL += g->obs[i] * esl_vec_DLogSum(h->wrk, h->K); } return -logL; }
static double sxp_complete_binned_func(double *p, int np, void *dptr) { struct sxp_binned_data *data = (struct sxp_binned_data *) dptr; ESL_HISTOGRAM *g = data->g; double logL = 0.; double ai, bi; /* lower, upper bounds on bin */ double lambda, tau; int i; double tmp; lambda = exp(p[0]); tau = exp(p[1]); ESL_DASSERT1(( ! isnan(lambda) )); ESL_DASSERT1(( ! isnan(tau) )); for (i = g->cmin; i <= g->imax; i++) /* for each occupied bin */ { if (g->obs[i] == 0) continue; ai = esl_histogram_Bin2LBound(g, i); bi = esl_histogram_Bin2UBound(g, i); if (ai < data->mu) ai = data->mu; /* careful at leftmost bound */ tmp = esl_sxp_cdf(bi, data->mu, lambda, tau) - esl_sxp_cdf(ai, data->mu, lambda, tau); if (tmp == 0.) return eslINFINITY; logL += g->obs[i] * log(tmp); } return -logL; /* minimizing NLL */ }
/* hyperexp_complete_binned_gradient():
 * Gradient callback in the (p, np, dptr, dp) calling convention;
 * presumably the companion of hyperexp_complete_binned_func().
 *
 * Args:     p    - parameter vector, unpacked into <h> by
 *                  hyperexp_unpack_paramvector()
 *           np   - number of entries in <p> and <dp>
 *           dptr - pointer to a <struct hyperexp_binned_data>
 *                  supplying the histogram <g> and hyperexponential <h>
 *           dp   - RETURN: gradient vector; reset to 0. and then
 *                  accumulated over all occupied bins
 *
 * Layout of <dp>, as filled here: if <h->fixmix> is not set, the first
 * K-1 entries are for the mixture parameters of components 1..K-1;
 * after those comes one entry for each component k whose
 * <h->fixlambda[k]> is not set, in order of k.
 */
static void hyperexp_complete_binned_gradient(double *p, int np, void *dptr, double *dp)
{
  struct hyperexp_binned_data *data = (struct hyperexp_binned_data *) dptr;
  ESL_HISTOGRAM *g = data->g;
  ESL_HYPEREXP *h = data->h;
  int i,k;
  int pidx;                     /* next slot of dp to update for this bin */
  double z;                     /* log sum over components for this bin   */
  double tmp;
  double ai, delta;             /* bin lower bound; bin width             */

  hyperexp_unpack_paramvector(p, np, h);
  esl_vec_DSet(dp, np, 0.);
  delta = g->w;

  /* counting over occupied, uncensored histogram bins */
  for (i = g->cmin; i <= g->imax; i++)
    {
      if (g->obs[i] == 0) continue;
      ai = esl_histogram_Bin2LBound(g, i);
      if (ai < h->mu) ai = h->mu; /* careful about the left boundary: no x < h->mu */

      /* Calculate the log(q_k alpha_k(a_i)) terms:
       *   log q_k - lambda_k (a_i - mu) + log(1 - exp(-delta lambda_k)),
       * using log(delta lambda_k) for the last term when
       * delta*lambda_k is below eslSMALLX1 (1 - exp(-x) ~ x for small x).
       */
      for (k = 0; k < h->K; k++)
        {
          h->wrk[k] = log(h->q[k]) - h->lambda[k]*(ai-h->mu);
          if (delta * h->lambda[k] < eslSMALLX1)
            h->wrk[k] += log(delta * h->lambda[k]);
          else
            h->wrk[k] += log(1 - exp(-delta * h->lambda[k]));
        }
      z = esl_vec_DLogSum(h->wrk, h->K); /* z = log \sum_k q_k alpha_k(a_i) */

      /* Bump the gradients for Q_1..Q_{K-1}.
       * pidx restarts at 0 for every bin so that each bin adds into
       * the same dp slots.
       */
      pidx = 0;
      if (! h->fixmix) {
        for (k = 1; k < h->K; k++)
          dp[pidx++] -= g->obs[i] * (exp(h->wrk[k] - z) - h->q[k]);
      }

      /* Bump the gradients for w_0..w_{K-1}; components with a fixed
       * lambda have no slot in dp and are skipped.
       */
      for (k = 0; k < h->K; k++)
        if (! h->fixlambda[k])
          {
            tmp = log(h->q[k]) + log(h->lambda[k])- h->lambda[k]*(ai-h->mu);
            tmp = exp(tmp - z);
            tmp *= (ai + delta - h->mu) * exp(-delta * h->lambda[k]) - (ai - h->mu);
            dp[pidx++] -= g->obs[i] * tmp;
          }
    }
}
/* wei_binned_func(): * Returns the negative log likelihood of a binned data sample, * in the API of the conjugate gradient descent optimizer in esl_minimizer. */ static double wei_binned_func(double *p, int nparam, void *dptr) { struct wei_binned_data *data = (struct wei_binned_data *) dptr; ESL_HISTOGRAM *h = data->h; double lambda, tau; double logL; double ai,bi; int i; double tmp; /* Unpack what the optimizer gave us. */ lambda = exp(p[0]); /* see below for c.o.v. notes */ tau = exp(p[1]); logL = 0.; for (i = h->cmin; i <= h->imax; i++) { if (h->obs[i] == 0) continue; ai = esl_histogram_Bin2LBound(h,i); bi = esl_histogram_Bin2UBound(h,i); if (ai < data->mu) ai = data->mu; tmp = esl_wei_cdf(bi, data->mu, lambda, tau) - esl_wei_cdf(ai, data->mu, lambda, tau); /* for cdf~1.0, numerical roundoff error can create tmp<0 by a * teensy amount; tolerate that, but catch anything worse */ ESL_DASSERT1( (tmp + 1e-7 > 0.)); if (tmp <= 0.) return eslINFINITY; logL += h->obs[i] * log(tmp); } return -logL; /* goal: minimize NLL */ }
/* Function: esl_wei_FitCompleteBinned() * * Purpose: Given a histogram <g> with binned observations, where each * bin i holds some number of observed samples x with values from * lower bound l to upper bound u (that is, $l < x \leq u$), and * <mu>, the known offset (minimum value) of the distribution; * return maximum likelihood parameters <ret_lambda> * and <ret_tau>. * * Args: x - complete GEV-distributed data [0..n-1] * n - number of samples in <x> * ret_mu - lower bound of the distribution (all x_i > mu) * ret_lambda - RETURN: maximum likelihood estimate of lambda * ret_tau - RETURN: maximum likelihood estimate of tau * * Returns: <eslOK> on success. * * Throws: <eslENOHALT> if the fit doesn't converge. * * Xref: STL9/136-137 */ int esl_wei_FitCompleteBinned(ESL_HISTOGRAM *h, double *ret_mu, double *ret_lambda, double *ret_tau) { struct wei_binned_data data; double p[2]; /* parameter vector */ double u[2]; /* max initial step size vector */ double wrk[8]; /* 4 tmp vectors of length 2 */ double mean; double mu, lambda, tau; /* initial param guesses */ double tol = 1e-6; /* convergence criterion for CG */ double fx; /* f(x) at minimum; currently unused */ int status; int i; double ai; /* Set the fixed mu. * Make a good initial guess of lambda, based on exponential fit. * Choose an arbitrary tau. */ if (h->is_tailfit) mu = h->phi; /* all x > mu in this case */ else if (h->is_rounded) mu = esl_histogram_Bin2LBound(h, h->imin); else mu = h->xmin; mean = 0.; for (i = h->cmin; i <= h->imax; i++) { ai = esl_histogram_Bin2LBound(h, i); ai += 0.5*h->w; /* midpoint in bin */ mean += (double)h->obs[i] * ai; } mean /= h->No; lambda = 1 / (mean - mu); tau = 0.9; /* load the data structure */ data.h = h; data.mu = mu; /* Change of variables; * lambda > 0, so c.o.v. lambda = exp^w, w = log(lambda); * tau > 0, same c.o.v. 
*/ p[0] = log(lambda); p[1] = log(tau); u[0] = 1.0; u[1] = 1.0; /* pass problem to the optimizer */ status = esl_min_ConjugateGradientDescent(p, u, 2, &wei_binned_func, NULL, (void *)(&data), tol, wrk, &fx); *ret_mu = mu; *ret_lambda = exp(p[0]); *ret_tau = exp(p[1]); return status; }