/* The DChoose() and FChoose() unit tests.
 */
static void
utest_choose(ESL_RANDOMNESS *r, int n, int nbins, int be_verbose)
{
  double *pd = NULL;
  float  *pf = NULL;
  int    *ct = NULL;
  int     i;
  double  X2, diff, exp, X2p;

  if ((pd = malloc(sizeof(double) * nbins)) == NULL) esl_fatal("malloc failed"); 
  if ((pf = malloc(sizeof(float)  * nbins)) == NULL) esl_fatal("malloc failed");
  if ((ct = malloc(sizeof(int)    * nbins)) == NULL) esl_fatal("malloc failed");

  /* Sample a random multinomial probability vector.  */
  if (esl_dirichlet_DSampleUniform(r, nbins, pd) != eslOK) esl_fatal("dirichlet sample failed");
  esl_vec_D2F(pd, nbins, pf);

  /* Sample observed counts using DChoose(). */
  esl_vec_ISet(ct, nbins, 0);
  for (i = 0; i < n; i++)
    ct[esl_rnd_DChoose(r, pd, nbins)]++;

  /* X^2 test on those observed counts. */
  for (X2 = 0., i=0; i < nbins; i++) {
    exp = (double) n * pd[i];
    diff = (double) ct[i] - exp;
    X2 += diff*diff/exp;
  }
  if (esl_stats_ChiSquaredTest(nbins, X2, &X2p) != eslOK) esl_fatal("chi square eval failed");
  if (be_verbose) printf("DChoose():  \t%g\n", X2p);
  if (X2p < 0.01) esl_fatal("chi squared test failed");

  /* Repeat above for FChoose(). */
  esl_vec_ISet(ct, nbins, 0);
  for (i = 0; i < n; i++)
    ct[esl_rnd_FChoose(r, pf, nbins)]++;
  for (X2 = 0., i=0; i < nbins; i++) {
    exp = (double) n * pd[i];
    diff = (double) ct[i] - exp;
    X2 += diff*diff/exp;
  }
  if (esl_stats_ChiSquaredTest(nbins, X2, &X2p) != eslOK) esl_fatal("chi square eval failed");
  if (be_verbose) printf("FChoose():  \t%g\n", X2p);
  if (X2p < 0.01) esl_fatal("chi squared test failed");
  
  free(pd);
  free(pf);
  free(ct);
  return;
}
/* The esl_random() unit test:
 * a binned frequency test.
 */
static void
utest_random(long seed, int n, int nbins, int be_verbose)
{
  ESL_RANDOMNESS *r      = NULL;
  int            *counts = NULL;
  double          X2p    = 0.;
  int             i;
  double          X2, exp, diff;

  if ((counts = malloc(sizeof(int) * nbins)) == NULL) esl_fatal("malloc failed");
  esl_vec_ISet(counts, nbins, 0);

  /* This contrived call sequence exercises CreateTimeseeded() and
   * Init(), while leaving us a reproducible chain. Because it's
   * reproducible, we know this test succeeds, despite being
   * statistical in nature.
   */
  if ((r = esl_randomness_CreateTimeseeded()) == NULL)  esl_fatal("randomness create failed");
  if (esl_randomness_Init(r, seed)            != eslOK) esl_fatal("randomness init failed");

  for (i = 0; i < n; i++)
    counts[esl_rnd_Roll(r, nbins)]++;

  /* X^2 value: \sum (o_i - e_i)^2 / e_i */
  for (X2 = 0., i = 0; i < nbins; i++) {
    exp  = (double) n / (double) nbins;
    diff = (double) counts[i] - exp;
    X2 +=  diff*diff/exp;
  }
  if (esl_stats_ChiSquaredTest(nbins, X2, &X2p) != eslOK) esl_fatal("chi squared eval failed");
  if (be_verbose) printf("random():  \t%g\n", X2p);
  if (X2p < 0.01) esl_fatal("chi squared test failed");

  esl_randomness_Destroy(r);
  free(counts);
  return;
}
示例#3
0
/* Function:  esl_stats_LinearRegression()
 * Synopsis:  Fit data to a straight line.
 * Incept:    SRE, Sat May 26 11:33:46 2007 [Janelia]
 *
 * Purpose:   Fit <n> points <x[i]>, <y[i]> to a straight line
 *            $y = a + bx$ by linear regression. 
 *            
 *            The $x_i$ are taken to be known, and the $y_i$ are taken
 *            to be observed quantities associated with a sampling
 *            error $\sigma_i$. If known, the standard deviations
 *            $\sigma_i$ for $y_i$ are provided in the <sigma> array.
 *            If they are unknown, pass <sigma = NULL>, and the
 *            routine will proceed with the assumption that $\sigma_i
 *            = 1$ for all $i$.
 *            
 *            The maximum likelihood estimates for $a$ and $b$ are
 *            optionally returned in <opt_a> and <opt_b>.
 *            
 *            The estimated standard deviations of $a$ and $b$ and
 *            their estimated covariance are optionally returned in
 *            <opt_sigma_a>, <opt_sigma_b>, and <opt_cov_ab>.
 *            
 *            The Pearson correlation coefficient is optionally
 *            returned in <opt_cc>. 
 *            
 *            The $\chi^2$ P-value for the regression fit is
 *            optionally returned in <opt_Q>. This P-value may only be
 *            obtained when the $\sigma_i$ are known. If <sigma> is
 *            passed as <NULL> and <opt_Q> is requested, <*opt_Q> is
 *            set to 1.0.
 *            
 *            This routine follows the description and algorithm in
 *            \citep[pp.661-666]{Press93}.
 *
 *            <n> must be greater than 2; at least two x[i] must
 *            differ; and if <sigma> is provided, all <sigma[i]> must
 *            be $>0$. If any of these conditions isn't met, the
 *            routine throws <eslEINVAL>.
 *
 * Args:      x            - x[0..n-1]
 *            y            - y[0..n-1]
 *            sigma        - sample error in observed y_i
 *            n            - number of data points
 *            opt_a        - optRETURN: intercept estimate		
 *            opt_b        - optRETURN: slope estimate
 *            opt_sigma_a  - optRETURN: error in estimate of a
 *            opt_sigma_b  - optRETURN: error in estimate of b
 *            opt_cov_ab   - optRETURN: covariance of a,b estimates
 *            opt_cc       - optRETURN: Pearson correlation coefficient for x,y
 *            opt_Q        - optRETURN: X^2 P-value for linear fit
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEMEM> on allocation error;
 *            <eslEINVAL> if a contract condition isn't met;
 *            <eslENORESULT> if the chi-squared test fails.
 *            In these cases, all optional return values are set to 0.
 */
int
esl_stats_LinearRegression(const double *x, const double *y, const double *sigma, int n,
			   double *opt_a,       double *opt_b,
			   double *opt_sigma_a, double *opt_sigma_b, double *opt_cov_ab,
			   double *opt_cc,      double *opt_Q)
{
  int     status;
  double *t      = NULL;
  double  S, Sx, Sy, Stt;
  double  Sxy, Sxx, Syy;
  double  a, b, sigma_a, sigma_b, cov_ab, cc, X2, Q;
  double  xdev, ydev;
  double  tmp;
  int     i;

  /* Contract checks. */
  if (n <= 2) ESL_XEXCEPTION(eslEINVAL, "n must be > 2 for linear regression fitting");
  if (sigma != NULL) 
    for (i = 0; i < n; i++) if (sigma[i] <= 0.) ESL_XEXCEPTION(eslEINVAL, "sigma[%d] <= 0", i);
  status = eslEINVAL;
  for (i = 0; i < n; i++) if (x[i] != 0.) { status = eslOK; break; }
  if (status != eslOK) ESL_XEXCEPTION(eslEINVAL, "all x[i] are 0.");

  /* Allocations */
  ESL_ALLOC(t, sizeof(double) * n);

  /* S = \sum_{i=1}{n} \frac{1}{\sigma_i^2}.  (S > 0.) */
  if (sigma != NULL) { for (S = 0., i = 0; i < n; i++) S += 1./ (sigma[i] * sigma[i]);  }
  else S = (double) n;

  /* S_x = \sum_{i=1}{n} \frac{x[i]}{ \sigma_i^2}  (Sx real.) */
  for (Sx = 0., i = 0; i < n; i++) { 
    if (sigma == NULL) Sx += x[i];
    else               Sx += x[i] / (sigma[i] * sigma[i]);
  }

  /* S_y = \sum_{i=1}{n} \frac{y[i]}{\sigma_i^2}  (Sy real.) */
  for (Sy = 0., i = 0; i < n; i++) { 
    if (sigma == NULL) Sy += y[i];
    else               Sy += y[i] / (sigma[i] * sigma[i]);
  }

  /* t_i = \frac{1}{\sigma_i} \left( x_i - \frac{S_x}{S} \right)   (t_i real) */
  for (i = 0; i < n; i++) {
    t[i] = x[i] - Sx/S;
    if (sigma != NULL) t[i] /= sigma[i];
  }

  /* S_{tt} = \sum_{i=1}^n t_i^2  (if at least one x is != 0, Stt > 0) */
  for (Stt = 0., i = 0; i < n; i++) { Stt += t[i] * t[i]; }

  /* b = \frac{1}{S_{tt}} \sum_{i=1}^{N} \frac{t_i y_i}{\sigma_i}  */
  for (b = 0., i = 0; i < n; i++) {
    if (sigma != NULL) { b += t[i]*y[i] / sigma[i]; }
    else               { b += t[i]*y[i]; }
  }
  b /= Stt;

  /* a = \frac{ S_y - S_x b } {S}   */
  a = (Sy - Sx * b) / S;
  
  /* \sigma_a^2 = \frac{1}{S} \left( 1 + \frac{ S_x^2 }{S S_{tt}} \right) */
  sigma_a = sqrt ((1. + (Sx*Sx) / (S*Stt)) / S);

  /* \sigma_b = \frac{1}{S_{tt}} */
  sigma_b = sqrt (1. / Stt);

  /* Cov(a,b) = - \frac{S_x}{S S_{tt}}    */
  cov_ab = -Sx / (S * Stt);
  
  /* Pearson correlation coefficient */
  Sxy = Sxx = Syy = 0.;
  for (i = 0; i < n; i++) {
    if (sigma != NULL) { 
      xdev = (x[i] / (sigma[i] * sigma[i])) - (Sx / n);
      ydev = (y[i] / (sigma[i] * sigma[i])) - (Sy / n);
    } else {
      xdev = x[i] - (Sx / n);
      ydev = y[i] - (Sy / n);
    }
    Sxy += xdev * ydev;
    Sxx += xdev * xdev;
    Syy += ydev * ydev;
  }
  cc = Sxy / (sqrt(Sxx) * sqrt(Syy));

  /* \chi^2 */
  for (X2 = 0., i = 0; i < n; i++) {
    tmp =  y[i] - a - b*x[i];
    if (sigma != NULL) tmp /= sigma[i];
    X2 += tmp*tmp;
  }
  
  /* We can calculate a goodness of fit if we know the \sigma_i */
  if (sigma != NULL) {
    if (esl_stats_ChiSquaredTest(n-2, X2, &Q) != eslOK) { status = eslENORESULT; goto ERROR; }
  } else Q = 1.0;

  /* If we didn't use \sigma_i, adjust the sigmas for a,b */
  if (sigma == NULL) {
    tmp = sqrt(X2 / (double)(n-2));
    sigma_a *= tmp;
    sigma_b *= tmp;
  }
    
  /* Done. Set up for normal return.
   */
  free(t);
  if (opt_a       != NULL) *opt_a       = a;
  if (opt_b       != NULL) *opt_b       = b;
  if (opt_sigma_a != NULL) *opt_sigma_a = sigma_a;
  if (opt_sigma_b != NULL) *opt_sigma_b = sigma_b;
  if (opt_cov_ab  != NULL) *opt_cov_ab  = cov_ab;
  if (opt_cc      != NULL) *opt_cc      = cc;
  if (opt_Q       != NULL) *opt_Q       = Q;
  return eslOK;
  
 ERROR:
  if (t != NULL) free(t);
  if (opt_a       != NULL) *opt_a       = 0.;
  if (opt_b       != NULL) *opt_b       = 0.;
  if (opt_sigma_a != NULL) *opt_sigma_a = 0.;
  if (opt_sigma_b != NULL) *opt_sigma_b = 0.;
  if (opt_cov_ab  != NULL) *opt_cov_ab  = 0.;
  if (opt_cc      != NULL) *opt_cc      = 0.;
  if (opt_Q       != NULL) *opt_Q       = 0.;
  return status;
}