/* Function:  esl_mixgev_invcdf()
 *
 * Purpose:   Calculates the inverse CDF for a mixture GEV <mg>,
 *            returning the quantile <x> at which the CDF is <p>,
 *            where $0 < p < 1$.
 *
 *            The inverse CDF of a mixture model has no analytical
 *            expression as far as I'm aware. The calculation here is
 *            a brute force bisection search in <x> using the CDF
 *            function. It will suffice for a small number of calls
 *            (for plotting applications, for example), but beware, it is not
 *            efficient.
 *
 * Args:      p  - cumulative probability to invert. The caller must
 *                 keep it strictly inside (0,1); the bracketing loops
 *                 below have no other exit condition.
 *            mg - mixture GEV. The smallest of its <mu[0..K-1]> seeds
 *                 the initial bracket.
 *
 * Returns:   the quantile <x>. Bisection stops once the bracket width
 *            is no more than 1e-6 of |x1+x2| (plus a tiny constant).
 */
double
esl_mixgev_invcdf(double p, ESL_MIXGEV *mg)
{
  double x1, x2, xm;		/* low, high, and midpoint guesses at x */
  double f1, f2, fm;		/* CDF evaluated at x1, x2, xm          */
  double scale;			/* |x1+x2|, for the relative tolerance  */
  double tol = 1e-6;

  x2 = esl_vec_DMin(mg->mu, mg->K);
  x1 = x2 - 1.;
  do {				/* bracket, left side: push x1 DOWN, tripling the interval each pass */
    x1 = x1 - 2.*(x2-x1);
    f1 = esl_mixgev_cdf(x1, mg);
  } while (f1 > p);
  do {				/* bracket, right side: push x2 up the same way */
    x2 = x2 + 2.*(x2-x1);
    f2 = esl_mixgev_cdf(x2, mg);
  } while (f2 < p);

  do {				/* bisection */
    xm = (x1+x2) / 2.;
    fm = esl_mixgev_cdf(xm, mg);
    if      (fm > p) x2 = xm;
    else if (fm < p) x1 = xm;
    else return xm;		/* unlikely case of fm==p */

    /* Use the magnitude of x1+x2: a signed denominator goes negative
     * for quantiles below zero, which would end the search after a
     * single step. The 1e-9 guards against a bracket centered on 0.
     */
    scale = x1 + x2;
    if (scale < 0.) scale = -scale;
  } while ( (x2-x1) / (scale + 1e-9) > tol);

  xm = (x1+x2) / 2.;
  return xm;
}
/* Function: esl_wei_FitComplete() * * Purpose: Given an array of <n> samples <x[0]..x[n-1>, fit * them to a stretched exponential distribution starting * at lower bound <mu> (all $x_i > \mu$), and * return maximum likelihood parameters <ret_lambda> * and <ret_tau>. * * Args: x - complete GEV-distributed data [0..n-1] * n - number of samples in <x> * ret_mu - RETURN: lower bound of the distribution (all x_i >= mu) * ret_lambda - RETURN: maximum likelihood estimate of lambda * ret_tau - RETURN: maximum likelihood estimate of tau * * Returns: <eslOK> on success. * * Throws: <eslENOHALT> if the fit doesn't converge. * * Xref: STL9/136-137 */ int esl_wei_FitComplete(double *x, int n, double *ret_mu, double *ret_lambda, double *ret_tau) { struct wei_data data; double p[2]; /* parameter vector */ double u[2]; /* max initial step size vector */ double wrk[8]; /* 4 tmp vectors of length 2 */ double mean; double mu, lambda, tau; /* initial param guesses */ double tol = 1e-6; /* convergence criterion for CG */ double fx; /* f(x) at minimum; currently unused */ int status; /* Make a good initial guess, based on exponential fit; * set an arbitrary tau. */ mu = esl_vec_DMin(x, n); esl_stats_DMean(x, n, &mean, NULL); lambda = 1 / (mean - mu); tau = 0.9; /* Load the data structure */ data.x = x; data.n = n; data.mu = mu; /* Change of variables; * lambda > 0, so c.o.v. lambda = exp^w, w = log(lambda); * tau > 0, same c.o.v. */ p[0] = log(lambda); p[1] = log(tau); u[0] = 1.0; u[1] = 1.0; /* pass problem to the optimizer */ status = esl_min_ConjugateGradientDescent(p, u, 2, &wei_func, NULL, (void *)(&data), tol, wrk, &fx); *ret_mu = mu; *ret_lambda = exp(p[0]); *ret_tau = exp(p[1]); return status; }
/* Function:  esl_sxp_FitComplete()
 *
 * Purpose:   Given a vector of <n> observed data samples <x[]>,
 *            find maximum likelihood parameters by conjugate gradient
 *            descent optimization.
 *
 *            <mu> is taken to be the minimum of <x> and is held
 *            fixed. Only <lambda> and <tau> are optimized, as
 *            log(lambda) and log(tau), starting from
 *            lambda = 1/(mean - mu) and tau = 0.9.
 *
 * Args:      x          - observed samples [0..n-1]
 *            n          - number of samples in <x>
 *            ret_mu     - RETURN: the minimum of <x>
 *            ret_lambda - RETURN: fitted lambda
 *            ret_tau    - RETURN: fitted tau
 *
 * Returns:   whatever status the optimizer returns. The RETURN
 *            values are written regardless of that status.
 *
 * Note:      NOTE(review): mean == mu (all samples equal) makes the
 *            initial lambda a division by zero.
 */
int
esl_sxp_FitComplete(double *x, int n, double *ret_mu, double *ret_lambda, double *ret_tau)
{
  struct sxp_data data;
  double logparam[2];		/* log(lambda), log(tau)              */
  double stepmax[2];		/* max initial step in each dimension */
  double scratch[8];		/* optimizer workspace                */
  double lowest;		/* min of x; used as mu               */
  double avg;			/* mean of x                          */
  double lambda0;		/* starting lambda                    */
  double tau0;			/* starting tau                       */
  double fval;			/* objective at the optimum; unused   */
  int    status;

  /* Starting point: mu pinned to the smallest sample, lambda from
   * the distance between mean and mu, tau arbitrary.
   */
  lowest = esl_vec_DMin(x, n);
  esl_stats_DMean(x, n, &avg, NULL);
  tau0    = 0.9;
  lambda0 = 1 / (avg - lowest);

  /* Package the fixed inputs for the objective function. */
  data.x  = x;
  data.n  = n;
  data.mu = lowest;

  /* Optimize in log space; unit maximum step in both dimensions. */
  logparam[0] = log(lambda0);
  stepmax[0]  = 1.0;
  logparam[1] = log(tau0);
  stepmax[1]  = 1.0;

  status = esl_min_ConjugateGradientDescent(logparam, stepmax, 2,
					    &sxp_complete_func, NULL,
					    (void *) (&data),
					    1e-6, scratch, &fval);

  /* Back to the natural parameterization. */
  *ret_mu     = lowest;
  *ret_lambda = exp(logparam[0]);
  *ret_tau    = exp(logparam[1]);
  return status;
}
/* hxp_driver_optarg()
 * Returns the word that follows option <argv[*opti]> and advances
 * <*opti> onto it. Dies through esl_fatal() when the option is the
 * last word on the command line, instead of reading argv[argc].
 */
static char *
hxp_driver_optarg(int argc, char **argv, int *opti)
{
  if (*opti + 1 >= argc)
    esl_fatal("option %s requires an argument", argv[*opti]);
  (*opti)++;
  return argv[*opti];
}

/* hxp_driver_check_fit()
 * Compares the fitted <ehxp> against the generating <hxp> and dies
 * through esl_fatal() on the first check that fails:
 *   - mu must be within 1% (relative to the true mu);
 *   - every fitted lambda must lie within 50% of some true lambda;
 *   - the q of each fitted component must be within 2-fold of the q
 *     of the true component it was matched to.
 * <label> ("complete" or "binned") only appears in the messages.
 */
static void
hxp_driver_check_fit(ESL_HYPEREXP *hxp, ESL_HYPEREXP *ehxp, const char *label)
{
  double mindiff, diff;
  int    k, ek, mink;

  if (fabs( (ehxp->mu - hxp->mu) / hxp->mu ) > 0.01)
    esl_fatal("Error in (%s) fitted mu > 1%%\n", label);

  for (ek = 0; ek < ehxp->K; ek++)
    {  /* try to match each estimated lambda up to a parametric lambda */
      mindiff = 1.0;
      mink    = -1;
      for (k = 0; k < hxp->K; k++)
	{
	  diff = fabs( (ehxp->lambda[ek] - hxp->lambda[k]) / hxp->lambda[k]);
	  if (diff < mindiff) {
	    mindiff = diff;
	    mink    = k;
	  }
	}
      /* mink < 0 means nothing matched; mindiff is then still 1.0 */
      if (mink < 0 || mindiff > 0.50)
	esl_fatal("Error in (%s) fitted lambda > 50%%\n", label);
      if (fabs( (ehxp->q[ek] - hxp->q[mink]) / hxp->q[mink]) > 1.0)
	esl_fatal("Error in (%s) fitted q > 2-fold\n", label);
    }
}

/* main()
 * Test driver for hyperexponential fitting. Samples <n> points from a
 * known hyperexponential (read from a parameter file, or a built-in
 * 3-component default), fits the complete data and then the binned
 * histogram, and dies through esl_fatal() if either fit strays too
 * far from the generating parameters. Optionally plots curves of the
 * generating distribution.
 *
 * Options:
 *   -f          use a fixed uniform mixture (for both true and fitted)
 *   -i <file>   read the generating parameters from <file>
 *   -n <n>      number of samples (default 20000)
 *   -o <file>   write plots to <file> instead of stdout
 *   -v          dump the true and fitted parameters
 *   -w <x>      histogram bin width (default 0.1)
 *   -C -LC      plot cdf, log cdf
 *   -P -LP      plot pdf, log pdf
 *   -S -LS      plot survival, log survival
 *   -XL -XH -XS <x>   plot range minimum, maximum, and step
 */
int
main(int argc, char **argv)
{
  ESL_HISTOGRAM  *h;
  ESL_RANDOMNESS *r;
  ESL_HYPEREXP   *hxp;
  ESL_HYPEREXP   *ehxp;
  int     n            = 20000;
  double  binwidth     = 0.1;
  int     i;
  double  x;
  double *data;
  int     ndata;
  int     opti;
  int     be_verbose   = FALSE;
  char   *paramfile    = NULL;
  char   *plotfile     = NULL;
  FILE   *pfp          = stdout;
  int     plot_pdf     = FALSE;
  int     plot_logpdf  = FALSE;
  int     plot_cdf     = FALSE;
  int     plot_logcdf  = FALSE;
  int     plot_surv    = FALSE;
  int     plot_logsurv = FALSE;
  int     xmin_set     = FALSE;
  double  xmin         = 0.0;
  int     xmax_set     = FALSE;
  double  xmax         = 0.0;
  int     xstep_set    = FALSE;
  double  xstep        = 0.0;
  int     do_fixmix    = FALSE;
  int     status;

  for (opti = 1; opti < argc && *(argv[opti]) == '-'; opti++)
    {
      if      (strcmp(argv[opti], "-f")  == 0) do_fixmix    = TRUE;
      else if (strcmp(argv[opti], "-i")  == 0) paramfile    = hxp_driver_optarg(argc, argv, &opti);
      else if (strcmp(argv[opti], "-n")  == 0) n            = atoi(hxp_driver_optarg(argc, argv, &opti));
      else if (strcmp(argv[opti], "-o")  == 0) plotfile     = hxp_driver_optarg(argc, argv, &opti);
      else if (strcmp(argv[opti], "-v")  == 0) be_verbose   = TRUE;
      else if (strcmp(argv[opti], "-w")  == 0) binwidth     = atof(hxp_driver_optarg(argc, argv, &opti));
      else if (strcmp(argv[opti], "-C")  == 0) plot_cdf     = TRUE;
      else if (strcmp(argv[opti], "-LC") == 0) plot_logcdf  = TRUE;
      else if (strcmp(argv[opti], "-P")  == 0) plot_pdf     = TRUE;
      else if (strcmp(argv[opti], "-LP") == 0) plot_logpdf  = TRUE;
      else if (strcmp(argv[opti], "-S")  == 0) plot_surv    = TRUE;
      else if (strcmp(argv[opti], "-LS") == 0) plot_logsurv = TRUE;
      else if (strcmp(argv[opti], "-XL") == 0) { xmin_set  = TRUE; xmin  = atof(hxp_driver_optarg(argc, argv, &opti)); }
      else if (strcmp(argv[opti], "-XH") == 0) { xmax_set  = TRUE; xmax  = atof(hxp_driver_optarg(argc, argv, &opti)); }
      else if (strcmp(argv[opti], "-XS") == 0) { xstep_set = TRUE; xstep = atof(hxp_driver_optarg(argc, argv, &opti)); }
      else esl_fatal("bad option");
    }

  /* The generating ("true") distribution: from file, or a default. */
  if (paramfile != NULL)
    {
      status = esl_hyperexp_ReadFile(paramfile, &hxp);
      if      (status == eslENOTFOUND) esl_fatal("Param file %s not found", paramfile);
      else if (status == eslEFORMAT)   esl_fatal("Parse failed: param file %s invalid format", paramfile);
      else if (status != eslOK)        esl_fatal("Unusual failure opening param file %s", paramfile);
    }
  else
    {
      hxp = esl_hyperexp_Create(3);
      hxp->mu = -2.0;
      hxp->q[0]      = 0.5;    hxp->q[1]      = 0.3;   hxp->q[2]      = 0.2;
      hxp->lambda[0] = 1.0;    hxp->lambda[1] = 0.3;   hxp->lambda[2] = 0.1;
    }
  if (do_fixmix) esl_hyperexp_FixedUniformMixture(hxp);	/* overrides q's above */

  if (be_verbose) esl_hyperexp_Dump(stdout, hxp);

  r = esl_randomness_Create(42);
  h = esl_histogram_CreateFull(hxp->mu, 100., binwidth);
  if (plotfile != NULL) {
    if ((pfp = fopen(plotfile, "w")) == NULL)
      esl_fatal("Failed to open plotfile");
  }

  /* Plot range defaults: start at mu, run 20 mean-lengths of the
   * slowest-decaying component, in steps of 0.1.
   */
  if (! xmin_set)  xmin  = hxp->mu;
  if (! xmax_set)  xmax  = hxp->mu+ 20*(1. / esl_vec_DMin(hxp->lambda, hxp->K));
  if (! xstep_set) xstep = 0.1;

  for (i = 0; i < n; i++)
    {
      x = esl_hxp_Sample(r, hxp);
      esl_histogram_Add(h, x);
    }

  esl_histogram_GetData(h, &data, &ndata); /* get sorted data vector */

  /* Fit to the complete data. */
  ehxp = esl_hyperexp_Create(hxp->K);
  if (do_fixmix) esl_hyperexp_FixedUniformMixture(ehxp);
  esl_hxp_FitGuess(data, ndata, ehxp);
  if ( esl_hxp_FitComplete(data, ndata, ehxp) != eslOK) esl_fatal("Failed to fit hyperexponential");

  if (be_verbose) esl_hyperexp_Dump(stdout, ehxp);
  hxp_driver_check_fit(hxp, ehxp, "complete");

  /* Fit again, to the binned histogram, reusing <ehxp>. */
  esl_hxp_FitGuessBinned(h, ehxp);
  if ( esl_hxp_FitCompleteBinned(h, ehxp) != eslOK) esl_fatal("Failed to fit binned hyperexponential");

  if (be_verbose) esl_hyperexp_Dump(stdout, ehxp);
  hxp_driver_check_fit(hxp, ehxp, "binned");

  /* Requested plots, all of the generating distribution. */
  if (plot_pdf)     esl_hxp_Plot(pfp, hxp, &esl_hxp_pdf,     xmin, xmax, xstep);
  if (plot_logpdf)  esl_hxp_Plot(pfp, hxp, &esl_hxp_logpdf,  xmin, xmax, xstep);
  if (plot_cdf)     esl_hxp_Plot(pfp, hxp, &esl_hxp_cdf,     xmin, xmax, xstep);
  if (plot_logcdf)  esl_hxp_Plot(pfp, hxp, &esl_hxp_logcdf,  xmin, xmax, xstep);
  if (plot_surv)    esl_hxp_Plot(pfp, hxp, &esl_hxp_surv,    xmin, xmax, xstep);
  if (plot_logsurv) esl_hxp_Plot(pfp, hxp, &esl_hxp_logsurv, xmin, xmax, xstep);

  if (plotfile != NULL) fclose(pfp);	/* pfp is stdout otherwise; leave it open */

  esl_histogram_Destroy(h);
  esl_hyperexp_Destroy(hxp);
  esl_hyperexp_Destroy(ehxp);
  esl_randomness_Destroy(r);
  return 0;
}
int main(int argc, char **argv) { ESL_GETOPTS *go; char *msafile; ESLX_MSAFILE *afp; ESL_MSA *msa; int do_gsc; int do_pb; int do_blosum; int maxN; double maxid; int nsmall, nbig; int i; int status; /* Process command line */ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) esl_fatal("%s", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) esl_fatal("%s", go->errbuf); if (esl_opt_GetBoolean(go, "-h") == TRUE){ puts(usage); puts("\n where options are:"); esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=all docgroups; 2=indentation; 80=width */ return 0; } do_blosum = esl_opt_GetBoolean(go, "--blosum"); do_gsc = esl_opt_GetBoolean(go, "--gsc"); do_pb = esl_opt_GetBoolean(go, "--pb"); maxid = esl_opt_GetReal (go, "--id"); maxN = esl_opt_GetInteger(go, "--maxN"); if (esl_opt_ArgNumber(go) != 1) { puts("Incorrect number of command line arguments."); puts(usage); return 1; } if ((msafile = esl_opt_GetArg(go, 1)) == NULL) esl_fatal("%s", go->errbuf); esl_getopts_Destroy(go); /* Weight one or more alignments from input file */ if ((status = eslx_msafile_Open(NULL, msafile, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK) eslx_msafile_OpenFailure(afp, status); while ( (status = eslx_msafile_Read(afp, &msa)) != eslEOF) { if (status != eslOK) eslx_msafile_ReadFailure(afp, status); if (maxN > 0 && msa->nseq > maxN) { esl_msa_Destroy(msa); continue; } if (do_gsc) esl_msaweight_GSC(msa); else if (do_pb) esl_msaweight_PB(msa); else if (do_blosum) esl_msaweight_BLOSUM(msa, maxid); for (nsmall = 0, nbig = 0, i = 0; i < msa->nseq; i++) { if (msa->wgt[i] < 0.2) nsmall++; if (msa->wgt[i] > 5.0) nbig++; } printf("%-20s %5d %5d %8.4f %8.4f %5d %5d\n", msa->name, msa->nseq, msa->alen, esl_vec_DMin(msa->wgt, msa->nseq), esl_vec_DMax(msa->wgt, msa->nseq), nsmall, nbig); esl_msa_Destroy(msa); } eslx_msafile_Close(afp); return eslOK; }