/* The DChoose() and FChoose() unit tests. */ static void utest_choose(ESL_RANDOMNESS *r, int n, int nbins, int be_verbose) { double *pd = NULL; float *pf = NULL; int *ct = NULL; int i; double X2, diff, exp, X2p; if ((pd = malloc(sizeof(double) * nbins)) == NULL) esl_fatal("malloc failed"); if ((pf = malloc(sizeof(float) * nbins)) == NULL) esl_fatal("malloc failed"); if ((ct = malloc(sizeof(int) * nbins)) == NULL) esl_fatal("malloc failed"); /* Sample a random multinomial probability vector. */ if (esl_dirichlet_DSampleUniform(r, nbins, pd) != eslOK) esl_fatal("dirichlet sample failed"); esl_vec_D2F(pd, nbins, pf); /* Sample observed counts using DChoose(). */ esl_vec_ISet(ct, nbins, 0); for (i = 0; i < n; i++) ct[esl_rnd_DChoose(r, pd, nbins)]++; /* X^2 test on those observed counts. */ for (X2 = 0., i=0; i < nbins; i++) { exp = (double) n * pd[i]; diff = (double) ct[i] - exp; X2 += diff*diff/exp; } if (esl_stats_ChiSquaredTest(nbins, X2, &X2p) != eslOK) esl_fatal("chi square eval failed"); if (be_verbose) printf("DChoose(): \t%g\n", X2p); if (X2p < 0.01) esl_fatal("chi squared test failed"); /* Repeat above for FChoose(). */ esl_vec_ISet(ct, nbins, 0); for (i = 0; i < n; i++) ct[esl_rnd_FChoose(r, pf, nbins)]++; for (X2 = 0., i=0; i < nbins; i++) { exp = (double) n * pd[i]; diff = (double) ct[i] - exp; X2 += diff*diff/exp; } if (esl_stats_ChiSquaredTest(nbins, X2, &X2p) != eslOK) esl_fatal("chi square eval failed"); if (be_verbose) printf("FChoose(): \t%g\n", X2p); if (X2p < 0.01) esl_fatal("chi squared test failed"); free(pd); free(pf); free(ct); return; }
/* The esl_random() unit test: * a binned frequency test. */ static void utest_random(long seed, int n, int nbins, int be_verbose) { ESL_RANDOMNESS *r = NULL; int *counts = NULL; double X2p = 0.; int i; double X2, exp, diff; if ((counts = malloc(sizeof(int) * nbins)) == NULL) esl_fatal("malloc failed"); esl_vec_ISet(counts, nbins, 0); /* This contrived call sequence exercises CreateTimeseeded() and * Init(), while leaving us a reproducible chain. Because it's * reproducible, we know this test succeeds, despite being * statistical in nature. */ if ((r = esl_randomness_CreateTimeseeded()) == NULL) esl_fatal("randomness create failed"); if (esl_randomness_Init(r, seed) != eslOK) esl_fatal("randomness init failed"); for (i = 0; i < n; i++) counts[esl_rnd_Roll(r, nbins)]++; /* X^2 value: \sum (o_i - e_i)^2 / e_i */ for (X2 = 0., i = 0; i < nbins; i++) { exp = (double) n / (double) nbins; diff = (double) counts[i] - exp; X2 += diff*diff/exp; } if (esl_stats_ChiSquaredTest(nbins, X2, &X2p) != eslOK) esl_fatal("chi squared eval failed"); if (be_verbose) printf("random(): \t%g\n", X2p); if (X2p < 0.01) esl_fatal("chi squared test failed"); esl_randomness_Destroy(r); free(counts); return; }
/* Function: esl_stats_LinearRegression() * Synopsis: Fit data to a straight line. * Incept: SRE, Sat May 26 11:33:46 2007 [Janelia] * * Purpose: Fit <n> points <x[i]>, <y[i]> to a straight line * $y = a + bx$ by linear regression. * * The $x_i$ are taken to be known, and the $y_i$ are taken * to be observed quantities associated with a sampling * error $\sigma_i$. If known, the standard deviations * $\sigma_i$ for $y_i$ are provided in the <sigma> array. * If they are unknown, pass <sigma = NULL>, and the * routine will proceed with the assumption that $\sigma_i * = 1$ for all $i$. * * The maximum likelihood estimates for $a$ and $b$ are * optionally returned in <opt_a> and <opt_b>. * * The estimated standard deviations of $a$ and $b$ and * their estimated covariance are optionally returned in * <opt_sigma_a>, <opt_sigma_b>, and <opt_cov_ab>. * * The Pearson correlation coefficient is optionally * returned in <opt_cc>. * * The $\chi^2$ P-value for the regression fit is * optionally returned in <opt_Q>. This P-value may only be * obtained when the $\sigma_i$ are known. If <sigma> is * passed as <NULL> and <opt_Q> is requested, <*opt_Q> is * set to 1.0. * * This routine follows the description and algorithm in * \citep[pp.661-666]{Press93}. * * <n> must be greater than 2; at least two x[i] must * differ; and if <sigma> is provided, all <sigma[i]> must * be $>0$. If any of these conditions isn't met, the * routine throws <eslEINVAL>. * * Args: x - x[0..n-1] * y - y[0..n-1] * sigma - sample error in observed y_i * n - number of data points * opt_a - optRETURN: intercept estimate * opt_b - optRETURN: slope estimate * opt_sigma_a - optRETURN: error in estimate of a * opt_sigma_b - optRETURN: error in estimate of b * opt_cov_ab - optRETURN: covariance of a,b estimates * opt_cc - optRETURN: Pearson correlation coefficient for x,y * opt_Q - optRETURN: X^2 P-value for linear fit * * Returns: <eslOK> on success. * * Throws: <eslEMEM> on allocation error; * <eslEINVAL> if a contract condition isn't met; * <eslENORESULT> if the chi-squared test fails. * In these cases, all optional return values are set to 0. */ int esl_stats_LinearRegression(const double *x, const double *y, const double *sigma, int n, double *opt_a, double *opt_b, double *opt_sigma_a, double *opt_sigma_b, double *opt_cov_ab, double *opt_cc, double *opt_Q) { int status; double *t = NULL; double S, Sx, Sy, Stt; double Sxy, Sxx, Syy; double a, b, sigma_a, sigma_b, cov_ab, cc, X2, Q; double xdev, ydev; double tmp; int i; /* Contract checks. */ if (n <= 2) ESL_XEXCEPTION(eslEINVAL, "n must be > 2 for linear regression fitting"); if (sigma != NULL) for (i = 0; i < n; i++) if (sigma[i] <= 0.) ESL_XEXCEPTION(eslEINVAL, "sigma[%d] <= 0", i); status = eslEINVAL; for (i = 0; i < n; i++) if (x[i] != 0.) { status = eslOK; break; } if (status != eslOK) ESL_XEXCEPTION(eslEINVAL, "all x[i] are 0."); /* Allocations */ ESL_ALLOC(t, sizeof(double) * n); /* S = \sum_{i=1}{n} \frac{1}{\sigma_i^2}. (S > 0.) */ if (sigma != NULL) { for (S = 0., i = 0; i < n; i++) S += 1./ (sigma[i] * sigma[i]); } else S = (double) n; /* S_x = \sum_{i=1}{n} \frac{x[i]}{ \sigma_i^2} (Sx real.) */ for (Sx = 0., i = 0; i < n; i++) { if (sigma == NULL) Sx += x[i]; else Sx += x[i] / (sigma[i] * sigma[i]); } /* S_y = \sum_{i=1}{n} \frac{y[i]}{\sigma_i^2} (Sy real.) */ for (Sy = 0., i = 0; i < n; i++) { if (sigma == NULL) Sy += y[i]; else Sy += y[i] / (sigma[i] * sigma[i]); } /* t_i = \frac{1}{\sigma_i} \left( x_i - \frac{S_x}{S} \right) (t_i real) */ for (i = 0; i < n; i++) { t[i] = x[i] - Sx/S; if (sigma != NULL) t[i] /= sigma[i]; } /* S_{tt} = \sum_{i=1}^n t_i^2 (if at least one x is != 0, Stt > 0) */ for (Stt = 0., i = 0; i < n; i++) { Stt += t[i] * t[i]; } /* b = \frac{1}{S_{tt}} \sum_{i=1}^{N} \frac{t_i y_i}{\sigma_i} */ for (b = 0., i = 0; i < n; i++) { if (sigma != NULL) { b += t[i]*y[i] / sigma[i]; } else { b += t[i]*y[i]; } } b /= Stt; /* a = \frac{ S_y - S_x b } {S} */ a = (Sy - Sx * b) / S; /* \sigma_a^2 = \frac{1}{S} \left( 1 + \frac{ S_x^2 }{S S_{tt}} \right) */ sigma_a = sqrt ((1. + (Sx*Sx) / (S*Stt)) / S); /* \sigma_b = \frac{1}{S_{tt}} */ sigma_b = sqrt (1. / Stt); /* Cov(a,b) = - \frac{S_x}{S S_{tt}} */ cov_ab = -Sx / (S * Stt); /* Pearson correlation coefficient */ Sxy = Sxx = Syy = 0.; for (i = 0; i < n; i++) { if (sigma != NULL) { xdev = (x[i] / (sigma[i] * sigma[i])) - (Sx / n); ydev = (y[i] / (sigma[i] * sigma[i])) - (Sy / n); } else { xdev = x[i] - (Sx / n); ydev = y[i] - (Sy / n); } Sxy += xdev * ydev; Sxx += xdev * xdev; Syy += ydev * ydev; } cc = Sxy / (sqrt(Sxx) * sqrt(Syy)); /* \chi^2 */ for (X2 = 0., i = 0; i < n; i++) { tmp = y[i] - a - b*x[i]; if (sigma != NULL) tmp /= sigma[i]; X2 += tmp*tmp; } /* We can calculate a goodness of fit if we know the \sigma_i */ if (sigma != NULL) { if (esl_stats_ChiSquaredTest(n-2, X2, &Q) != eslOK) { status = eslENORESULT; goto ERROR; } } else Q = 1.0; /* If we didn't use \sigma_i, adjust the sigmas for a,b */ if (sigma == NULL) { tmp = sqrt(X2 / (double)(n-2)); sigma_a *= tmp; sigma_b *= tmp; } /* Done. Set up for normal return. */ free(t); if (opt_a != NULL) *opt_a = a; if (opt_b != NULL) *opt_b = b; if (opt_sigma_a != NULL) *opt_sigma_a = sigma_a; if (opt_sigma_b != NULL) *opt_sigma_b = sigma_b; if (opt_cov_ab != NULL) *opt_cov_ab = cov_ab; if (opt_cc != NULL) *opt_cc = cc; if (opt_Q != NULL) *opt_Q = Q; return eslOK; ERROR: if (t != NULL) free(t); if (opt_a != NULL) *opt_a = 0.; if (opt_b != NULL) *opt_b = 0.; if (opt_sigma_a != NULL) *opt_sigma_a = 0.; if (opt_sigma_b != NULL) *opt_sigma_b = 0.; if (opt_cov_ab != NULL) *opt_cov_ab = 0.; if (opt_cc != NULL) *opt_cc = 0.; if (opt_Q != NULL) *opt_Q = 0.; return status; }