/* Function:  esl_mixgev_invcdf()
 *
 * Purpose:   Calculates the inverse CDF for a mixture GEV <mg>,
 *            returning the quantile <x> at which the CDF is <p>,
 *            where $0 < p < 1$.
 *            
 *            The inverse CDF of a mixture model has no analytical
 *            expression as far as I'm aware. The calculation here is
 *            a brute force bisection search in <x> using the CDF
 *            function. It will suffice for a small number of calls
 *            (for plotting applications, for example), but beware, it is not
 *            efficient.
 */
double
esl_mixgev_invcdf(double p, ESL_MIXGEV *mg)
{
  double x1, x2, xm;		/* low, high guesses at x */
  double f1, f2, fm;
  double tol = 1e-6;

  x2 = esl_vec_DMin(mg->mu, mg->K);
  x1 = x2 - 1.;
  do {				/* bracket, left side */
    x1 = x1 + 2.*(x2-x1);
    f1 = esl_mixgev_cdf(x1, mg);
  } while (f1 > p);
  do {				/* bracket, right side */
    x2 = x2 + 2.*(x2-x1);
    f2 = esl_mixgev_cdf(x2, mg);
  } while (f2 < p);		

  do {				/* bisection */
    xm = (x1+x2) / 2.;
    fm = esl_mixgev_cdf(xm, mg);
    
    if      (fm > p) x2 = xm;
    else if (fm < p) x1 = xm;
    else return xm;		/* unlikely case of fm==p */
  } while ( (x2-x1)/(x1+x2+1e-9) > tol);

  xm = (x1+x2) / 2.;
  return xm;
}
示例#2
0
/* Function:  esl_wei_FitComplete()
 *
 * Purpose:   Given an array of <n> samples <x[0]..x[n-1>, fit
 *            them to a stretched exponential distribution starting
 *            at lower bound <mu> (all $x_i > \mu$), and 
 *            return maximum likelihood parameters <ret_lambda>
 *            and <ret_tau>.
 *            
 * Args:      x          - complete GEV-distributed data [0..n-1]
 *            n          - number of samples in <x>
 *            ret_mu     - RETURN: lower bound of the distribution (all x_i >= mu)
 *            ret_lambda - RETURN: maximum likelihood estimate of lambda
 *            ret_tau    - RETURN: maximum likelihood estimate of tau
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslENOHALT> if the fit doesn't converge.
 *
 * Xref:      STL9/136-137
 */
int
esl_wei_FitComplete(double *x, int n, double *ret_mu,
		    double *ret_lambda, double *ret_tau)
{
  struct wei_data data;
  double p[2];			/* parameter vector                  */
  double u[2];			/* max initial step size vector      */
  double wrk[8];		/* 4 tmp vectors of length 2         */
  double mean;
  double mu, lambda, tau;      	/* initial param guesses             */
  double tol = 1e-6;		/* convergence criterion for CG      */
  double fx;			/* f(x) at minimum; currently unused */
  int    status;

  /* Make a good initial guess, based on exponential fit;
   * set an arbitrary tau.
   */
  mu =  esl_vec_DMin(x, n);
  esl_stats_DMean(x, n, &mean, NULL);
  lambda = 1 / (mean - mu);
  tau    = 0.9;

  /* Load the data structure
   */
  data.x   = x;
  data.n   = n;
  data.mu  = mu;

  /* Change of variables;
   *   lambda > 0, so c.o.v.  lambda = exp^w,  w = log(lambda);
   *   tau > 0, same c.o.v.
   */
  p[0] = log(lambda);		
  p[1] = log(tau);

  u[0] = 1.0;
  u[1] = 1.0;

  /* pass problem to the optimizer
   */
  status = esl_min_ConjugateGradientDescent(p, u, 2, 
					    &wei_func, NULL,
					    (void *)(&data),
					    tol, wrk, &fx);
  *ret_mu     = mu;
  *ret_lambda = exp(p[0]);
  *ret_tau    = exp(p[1]);
  return status;
}
示例#3
0
/* Function:  esl_sxp_FitComplete()
 *
 * Purpose:   Given a vector of <n> observed data samples <x[]>,
 *            find maximum likelihood parameters by conjugate gradient 
 *            descent optimization.
 */
int
esl_sxp_FitComplete(double *x, int n,
		    double *ret_mu, double *ret_lambda, double *ret_tau)

{
  struct sxp_data data;
  double p[2], u[2], wrk[8];
  double mu, tau, lambda;
  double mean;
  double tol = 1e-6;
  double fx;
  int    status;

  /* initial guesses; mu is definitely = minimum x,
   * and just use arbitrary #'s to init lambda, tau
   */
  mu =  esl_vec_DMin(x, n);
  esl_stats_DMean(x, n, &mean, NULL);
  lambda = 1 / (mean - mu);
  tau    = 0.9;


  /* load data structure, param vector, and step vector */
  data.x  = x;
  data.n  = n;
  data.mu = mu;
  p[0]    = log(lambda);
  p[1]    = log(tau);
  u[0]    = 1.0;
  u[1]    = 1.0;

  /* hand it off */
  status =  esl_min_ConjugateGradientDescent(p, u, 2, 
					     &sxp_complete_func, 
					     NULL,
					     (void *) (&data), tol, wrk, &fx);
  *ret_mu     = mu;
  *ret_lambda = exp(p[0]);
  *ret_tau    = exp(p[1]);
  return status;
}
示例#4
0
int
main(int argc, char **argv)
{
  ESL_HISTOGRAM  *h;
  ESL_RANDOMNESS *r;
  ESL_HYPEREXP   *hxp;
  ESL_HYPEREXP   *ehxp;
  int     n         = 20000;
  double  binwidth  = 0.1;
  int     i;
  double  x;
  double *data;
  int     ndata;
  int     k, ek, mink;
  double  mindiff, diff;

  int     opti;
  int     be_verbose   = FALSE;
  char   *paramfile    = NULL;
  char   *plotfile     = NULL;
  FILE   *pfp          = stdout;
  int     plot_pdf     = FALSE;
  int     plot_logpdf  = FALSE;
  int     plot_cdf     = FALSE;
  int     plot_logcdf  = FALSE;
  int     plot_surv    = FALSE;
  int     plot_logsurv = FALSE;
  int     xmin_set     = FALSE;
  double  xmin;
  int     xmax_set     = FALSE;
  double  xmax;
  int     xstep_set    = FALSE;
  double  xstep;
  int     do_fixmix    = FALSE;
  int     status;

  for (opti = 1; opti < argc && *(argv[opti]) == '-'; opti++)
    {
      if      (strcmp(argv[opti], "-f")  == 0) do_fixmix    = TRUE;
      else if (strcmp(argv[opti], "-i")  == 0) paramfile    = argv[++opti];
      else if (strcmp(argv[opti], "-n")  == 0) n            = atoi(argv[++opti]);
      else if (strcmp(argv[opti], "-o")  == 0) plotfile     = argv[++opti];
      else if (strcmp(argv[opti], "-v")  == 0) be_verbose   = TRUE;
      else if (strcmp(argv[opti], "-w")  == 0) binwidth     = atof(argv[++opti]);
      else if (strcmp(argv[opti], "-C")  == 0) plot_cdf     = TRUE;
      else if (strcmp(argv[opti], "-LC") == 0) plot_logcdf  = TRUE;
      else if (strcmp(argv[opti], "-P")  == 0) plot_pdf     = TRUE;
      else if (strcmp(argv[opti], "-LP") == 0) plot_logpdf  = TRUE;
      else if (strcmp(argv[opti], "-S")  == 0) plot_surv    = TRUE;
      else if (strcmp(argv[opti], "-LS") == 0) plot_logsurv = TRUE;
      else if (strcmp(argv[opti], "-XL") == 0) { xmin_set  = TRUE; xmin  = atof(argv[++opti]); }
      else if (strcmp(argv[opti], "-XH") == 0) { xmax_set  = TRUE; xmax  = atof(argv[++opti]); }
      else if (strcmp(argv[opti], "-XS") == 0) { xstep_set = TRUE; xstep = atof(argv[++opti]); }
      else esl_fatal("bad option");
    }

  if (paramfile != NULL)
    {
      status = esl_hyperexp_ReadFile(paramfile, &hxp);
      if      (status == eslENOTFOUND) esl_fatal("Param file %s not found", paramfile);
      else if (status == eslEFORMAT)   esl_fatal("Parse failed: param file %s invalid format", paramfile);
      else if (status != eslOK)        esl_fatal("Unusual failure opening param file %s", paramfile);
    }
  else 
    {
      hxp = esl_hyperexp_Create(3);
      hxp->mu = -2.0;
      hxp->q[0]      = 0.5;    hxp->q[1]      = 0.3;   hxp->q[2]      = 0.2; 
      hxp->lambda[0] = 1.0;    hxp->lambda[1] = 0.3;   hxp->lambda[2] = 0.1;
    }
  if (do_fixmix) esl_hyperexp_FixedUniformMixture(hxp);	/* overrides q's above */

  if (be_verbose) esl_hyperexp_Dump(stdout, hxp);

  r = esl_randomness_Create(42);
  h = esl_histogram_CreateFull(hxp->mu, 100., binwidth);
  if (plotfile != NULL) {
    if ((pfp = fopen(plotfile, "w")) == NULL) 
      esl_fatal("Failed to open plotfile");
  }
  if (! xmin_set)  xmin  = hxp->mu;
  if (! xmax_set)  xmax  = hxp->mu+ 20*(1. / esl_vec_DMin(hxp->lambda, hxp->K));
  if (! xstep_set) xstep = 0.1;

  for (i = 0; i < n; i++)
    {
      x = esl_hxp_Sample(r, hxp);
      esl_histogram_Add(h, x);
    }

  esl_histogram_GetData(h, &data, &ndata); /* get sorted data vector */

  ehxp = esl_hyperexp_Create(hxp->K);
  if (do_fixmix) esl_hyperexp_FixedUniformMixture(ehxp);
  esl_hxp_FitGuess(data, ndata, ehxp);  
  if ( esl_hxp_FitComplete(data, ndata, ehxp) != eslOK) esl_fatal("Failed to fit hyperexponential");

  if (be_verbose) esl_hyperexp_Dump(stdout, ehxp);

  if (fabs( (ehxp->mu-hxp->mu)/hxp->mu ) > 0.01)
    esl_fatal("Error in (complete) fitted mu > 1%\n");
  for (ek = 0; ek < ehxp->K; ek++)
    {  /* try to match each estimated lambda up to a parametric lambda */
      mindiff = 1.0;
      mink    = -1;
      for (k = 0; k < hxp->K; k++)
	{
	  diff =  fabs( (ehxp->lambda[ek] - hxp->lambda[k]) / hxp->lambda[k]);
	  if (diff < mindiff) {
	    mindiff = diff;
	    mink    = k;
	  }
	}
      if (mindiff > 0.50)
	esl_fatal("Error in (complete) fitted lambda > 50%\n");
      if (fabs( (ehxp->q[ek] - hxp->q[mink]) / hxp->q[mink]) > 1.0)
	esl_fatal("Error in (complete) fitted q > 2-fold%\n");
    }

  esl_hxp_FitGuessBinned(h, ehxp);  
  if ( esl_hxp_FitCompleteBinned(h, ehxp) != eslOK) esl_fatal("Failed to fit binned hyperexponential");
  if (be_verbose)  esl_hyperexp_Dump(stdout, ehxp);

  if (fabs( (ehxp->mu-hxp->mu)/hxp->mu ) > 0.01)
    esl_fatal("Error in (binned) fitted mu > 1%\n");
  for (ek = 0; ek < ehxp->K; ek++)
    {  /* try to match each estimated lambda up to a parametric lambda */
      mindiff = 1.0;
      mink    = -1;
      for (k = 0; k < hxp->K; k++)
	{
	  diff =  fabs( (ehxp->lambda[ek] - hxp->lambda[k]) / hxp->lambda[k]);
	  if (diff < mindiff) {
	    mindiff = diff;
	    mink    = k;
	  }
	}
      if (mindiff > 0.50)
	esl_fatal("Error in (binned) fitted lambda > 50%\n");
      if (fabs( (ehxp->q[ek] - hxp->q[mink]) / hxp->q[mink]) > 1.0)
	esl_fatal("Error in (binned) fitted q > 2-fold\n");
    }

  if (plot_pdf)     esl_hxp_Plot(pfp, hxp, &esl_hxp_pdf,     xmin, xmax, xstep);
  if (plot_logpdf)  esl_hxp_Plot(pfp, hxp, &esl_hxp_logpdf,  xmin, xmax, xstep);
  if (plot_cdf)     esl_hxp_Plot(pfp, hxp, &esl_hxp_cdf,     xmin, xmax, xstep);
  if (plot_logcdf)  esl_hxp_Plot(pfp, hxp, &esl_hxp_logcdf,  xmin, xmax, xstep);
  if (plot_surv)    esl_hxp_Plot(pfp, hxp, &esl_hxp_surv,    xmin, xmax, xstep);
  if (plot_logsurv) esl_hxp_Plot(pfp, hxp, &esl_hxp_logsurv, xmin, xmax, xstep);

  if (plotfile != NULL) fclose(pfp);
  esl_histogram_Destroy(h);
  esl_hyperexp_Destroy(hxp);
  esl_hyperexp_Destroy(ehxp);
  esl_randomness_Destroy(r);
  return 0;
}
示例#5
0
int 
main(int argc, char **argv)
{
  ESL_GETOPTS  *go;
  char         *msafile;
  ESLX_MSAFILE *afp;
  ESL_MSA      *msa;
  int           do_gsc;
  int           do_pb;
  int           do_blosum;
  int           maxN;
  double        maxid;
  int           nsmall, nbig;
  int           i;
  int           status;

  /* Process command line  */
  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) esl_fatal("%s", go->errbuf);
  if (esl_opt_VerifyConfig(go)               != eslOK) esl_fatal("%s", go->errbuf);
  if (esl_opt_GetBoolean(go, "-h") == TRUE){
    puts(usage); 
    puts("\n  where options are:");
    esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=all docgroups; 2=indentation; 80=width */
    return 0;
  }
  do_blosum = esl_opt_GetBoolean(go, "--blosum");
  do_gsc    = esl_opt_GetBoolean(go, "--gsc");
  do_pb     = esl_opt_GetBoolean(go, "--pb");
  maxid     = esl_opt_GetReal   (go, "--id");
  maxN      = esl_opt_GetInteger(go, "--maxN");
  if (esl_opt_ArgNumber(go) != 1) {
    puts("Incorrect number of command line arguments.");
    puts(usage);
    return 1;
  }
  if ((msafile = esl_opt_GetArg(go, 1)) == NULL) esl_fatal("%s", go->errbuf);
  esl_getopts_Destroy(go);

  /* Weight one or more alignments from input file
   */
  if ((status = eslx_msafile_Open(NULL, msafile, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK)
    eslx_msafile_OpenFailure(afp, status);

  while ( (status = eslx_msafile_Read(afp, &msa)) != eslEOF)
    {
      if (status != eslOK) eslx_msafile_ReadFailure(afp, status);
      if (maxN > 0 && msa->nseq > maxN) { esl_msa_Destroy(msa); continue; }

      if      (do_gsc) 	  esl_msaweight_GSC(msa);
      else if (do_pb) 	  esl_msaweight_PB(msa);
      else if (do_blosum) esl_msaweight_BLOSUM(msa, maxid);

      for (nsmall = 0, nbig = 0, i = 0; i < msa->nseq; i++) {
	if (msa->wgt[i] < 0.2) nsmall++;
	if (msa->wgt[i] > 5.0) nbig++;
      }

      printf("%-20s  %5d %5d %8.4f  %8.4f  %5d  %5d\n", 
	     msa->name, 
	     msa->nseq, 
	     msa->alen,
	     esl_vec_DMin(msa->wgt, msa->nseq),
	     esl_vec_DMax(msa->wgt, msa->nseq),
	     nsmall,
	     nbig);
      esl_msa_Destroy(msa);
    } 
  eslx_msafile_Close(afp);
  return eslOK;
}