/** \brief Estimate by ML the effect size of the genotype, the std deviation * of the errors and the std error of the estimated effect size in the * multiple linear regression Y = XB + E with E~MVN(0,sigma^2I) * \note genotype supposed to be 2nd column of X */ void FitSingleGeneWithSingleSnp(const gsl_matrix * X, const gsl_vector * y, double & pve, double & sigmahat, double & betahat_geno, double & sebetahat_geno, double & betapval_geno) { size_t N = X->size1, P = X->size2, rank; double rss; gsl_vector * Bhat = gsl_vector_alloc(P); gsl_matrix * covBhat = gsl_matrix_alloc(P, P); gsl_multifit_linear_workspace * work = gsl_multifit_linear_alloc(N, P); gsl_multifit_linear_svd(X, y, GSL_DBL_EPSILON, &rank, Bhat, covBhat, &rss, work); pve = 1 - rss / gsl_stats_tss(y->data, y->stride, y->size); sigmahat = sqrt(rss / (double)(N-rank)); betahat_geno = gsl_vector_get(Bhat, 1); sebetahat_geno = sqrt(gsl_matrix_get (covBhat, 1, 1)); betapval_geno = 2 * gsl_cdf_tdist_Q(fabs(betahat_geno / sebetahat_geno), N-rank); gsl_vector_free(Bhat); gsl_matrix_free(covBhat); gsl_multifit_linear_free(work); }
double rsComputePValueFromTValue(const double T, const int df) { if ( T < 0 ) { return -gsl_cdf_tdist_P(T, df); } else { return gsl_cdf_tdist_Q(T, df); } }
/* Purpose: calculate z test for proportions, two-tailed for two-sided hypothesis H1 != H0 Parameters: p_sample = proportion of events in sample [0..1] p_population = proportion of events in population [0..1] n = sample size Returns: Probability of making an error when rejecting the NULL hypothesis */ double gstats_test_ZP2 (double p_sample, double p_population, int n) { double z; double result = 0.0; if ((p_sample < 0.0) || (p_sample > 1.0)) { gstats_error ("z-stat: sample proportion must be given as 0.0 - 1.0."); } if ((p_population < 0.0) || (p_population > 1.0)) { gstats_error ("z-stat: population proportion must be given as 0.0 - 1.0."); } if (n < 2) { gstats_error ("z-stat: size of population must be a positive integer > 0."); } z = (p_sample - p_population) / (sqrt ( (p_population * (1-p_population)) / n) ); /* determine direction of numeric integration */ if ( p_sample > p_population ) { /* we use a normal distribution, if n is at least 30, */ /* t-distribution with n - 1 degress of freedom, otherwise */ if (n >= 30) { result = gsl_cdf_ugaussian_Q (z); } else { result = gsl_cdf_tdist_Q (z, (double) n-1); } } if ( p_sample < p_population ) { /* we use a normal distribution, if n is at least 30, */ /* t-distribution with n - 1 degress of freedom, otherwise */ if (n >= 30) { result = gsl_cdf_ugaussian_P (z); } else { result = gsl_cdf_tdist_P (z, (double) n-1); } } if ( p_sample == p_population ) { result = 0.5; } return (result * 2); }
int main(){ apop_db_open("data-census.db"); gsl_vector *n = apop_query_to_vector("select in_per_capita from income " "where state= (select state from geography where name ='North Dakota')"); gsl_vector *s = apop_query_to_vector("select in_per_capita from income " "where state= (select state from geography where name ='South Dakota')"); double n_count = n->size, n_mean = apop_vector_mean(n), n_var = apop_vector_var(n), s_count = s->size, s_mean = apop_vector_mean(s), s_var = apop_vector_var(s); double stat = fabs(n_mean - s_mean)/ sqrt(n_var/ (n_count-1) + s_var/(s_count-1)); double confidence = 1 - (2 * gsl_cdf_tdist_Q(stat, n_count + s_count -2)); printf("Reject the null with %g%% confidence\n", confidence*100); }
double significance_of_correlation (double rho, double w) { double t = w - 2; /* |rho| will mathematically always be in the range [0, 1.0]. Inaccurate calculations sometimes cause it to be slightly greater than 1.0, so force it into the correct range to avoid NaN from sqrt(). */ t /= 1 - MIN (1, pow2 (rho)); t = sqrt (t); t *= rho; if (t > 0) return gsl_cdf_tdist_Q (t, w - 2); else return gsl_cdf_tdist_P (t, w - 2); }
bool TTestReductionPop2(WiggleIterator * wi) { if (wi->done) return true; SetComparisonData * data = (SetComparisonData *) wi->data; Multiset * multi = data->multi; if (multi->done) { wi->done = true; return true; } // Go to first position where both of the sets have at least one value while (!multi->inplay[0] || !multi->inplay[1]) { popMultiset(multi); if (multi->done) { wi->done = true; return true; } } wi->chrom = multi->chrom; wi->start = multi->start; wi->finish = multi->finish; // Compute measurements double sum1, sum2, sumSq1, sumSq2; int count1, count2; int index; sum1 = sum2 = 0; sumSq1 = sumSq2 = 0; count1 = multi->multis[0]->count; count2 = multi->multis[1]->count; for (index = 0; index < multi->multis[0]->count; index++) { if (multi->multis[0]->inplay[index]) { sum1 += multi->values[0][index]; sumSq1 += multi->values[0][index] * multi->values[0][index]; } } for (index = 0; index < multi->multis[1]->count; index++) { if (multi->multis[1]->inplay[index]) { sum2 += multi->values[1][index]; sumSq2 += multi->values[1][index] * multi->values[1][index]; } } // To avoid divisions by 0: if (count1 == 0 || count2 == 0) { popMultiset(multi); pop(wi); return false; } double mean1 = sum1 / count1; double mean2 = sum2 / count2; double meanSq1 = sumSq1 / count1; double meanSq2 = sumSq2 / count2; double var1 = meanSq1 - mean1 * mean1; double var2 = meanSq2 - mean2 * mean2; // To avoid divisions by 0: if (var1 + var2 == 0) { popMultiset(multi); return false; } // T-statistic double t = (mean1 - mean2) / sqrt(var1 / count1 + var2 / count2); if (t < 0) t = -t; // Degrees of freedom double nu = (var1 / count1 + var2 / count2) * (var1 / count1 + var2 / count2) / ((var1 * var1) / (count1 * count1 * (count1 - 1)) + (var2 * var2) / (count2 * count2 * (count2 - 1))); // P-value wi->value = 2 * gsl_cdf_tdist_Q(t, nu); // Update inputs popMultiset(multi); return true; }
double gsl_cdf_tdist_Qinv (const double Q, const double nu) { double x, qtail; if (Q == 0.0) { return GSL_POSINF; } else if (Q == 1.0) { return GSL_NEGINF; } if (nu == 1.0) { x = tan (M_PI * (0.5 - Q)); return x; } else if (nu == 2.0) { x = (1 - 2 * Q) / sqrt (2 * Q * (1 - Q)); return x; } qtail = (Q < 0.5) ? Q : 1 - Q; if (sqrt (M_PI * nu / 2) * qtail > pow (0.05, nu / 2)) { double xg = gsl_cdf_ugaussian_Qinv (Q); x = inv_cornish_fisher (xg, nu); } else { /* Use an asymptotic expansion of the tail of integral */ double beta = gsl_sf_beta (0.5, nu / 2); if (Q < 0.5) { x = sqrt (nu) * pow (beta * nu * Q, -1.0 / nu); } else { x = -sqrt (nu) * pow (beta * nu * (1 - Q), -1.0 / nu); } /* Correct nu -> nu/(1+nu/x^2) in the leading term to account for higher order terms. This avoids overestimating x, which makes the iteration unstable due to the rapidly decreasing tails of the distribution. */ x /= sqrt (1 + nu / (x * x)); } { double dQ, phi; unsigned int n = 0; start: dQ = Q - gsl_cdf_tdist_Q (x, nu); phi = gsl_ran_tdist_pdf (x, nu); if (dQ == 0.0 || n++ > 32) goto end; { double lambda = - dQ / phi; double step0 = lambda; double step1 = ((nu + 1) * x / (x * x + nu)) * (lambda * lambda / 4.0); double step = step0; if (fabs (step1) < fabs (step0)) { step += step1; } if (Q < 0.5 && x + step < 0) x /= 2; else if (Q > 0.5 && x + step > 0) x /= 2; else x += step; if (fabs (step) > 1e-10 * fabs (x)) goto start; } } end: return x; }
int main(int argc, char **argv){ distlist distribution = Normal; char msg[10000], c; int pval = 0, qval = 0; double param1 = GSL_NAN, param2 =GSL_NAN, findme = GSL_NAN; char number[1000]; sprintf(msg, "%s [opts] number_to_lookup\n\n" "Look up a probability or p-value for a given standard distribution.\n" "[This is still loosely written and counts as beta. Notably, negative numbers are hard to parse.]\n" "E.g.:\n" "%s -dbin 100 .5 34\n" "sets the distribution to a Binomial(100, .5), and find the odds of 34 appearing.\n" "%s -p 2 \n" "find the area of the Normal(0,1) between -infty and 2. \n" "\n" "-pval Find the p-value: integral from -infinity to your value\n" "-qval Find the q-value: integral from your value to infinity\n" "\n" "After giving an optional -p or -q, specify the distribution. \n" "Default is Normal(0, 1). Other options:\n" "\t\t-binom Binomial(n, p)\n" "\t\t-beta Beta(a, b)\n" "\t\t-f F distribution(df1, df2)\n" "\t\t-norm Normal(mu, sigma)\n" "\t\t-negative bin Negative binomial(n, p)\n" "\t\t-poisson Poisson(L)\n" "\t\t-t t distribution(df)\n" "I just need enough letters to distinctly identify a distribution.\n" , argv[0], argv[0], argv[0]); opterr=0; if(argc==1){ printf("%s", msg); return 0; } while ((c = getopt (argc, argv, "B:b:F:f:N:n:pqT:t:")) != -1){ switch (c){ case 'B': case 'b': if (optarg[0]=='i') distribution = Binomial; else if (optarg[0]=='e') distribution = Beta; else { printf("I can't parse the option -b%s\n", optarg); exit(0); } param1 = atof(argv[optind]); param2 = atof(argv[optind+1]); findme = atof(argv[optind+2]); break; case 'F': case 'f': distribution = F; param1 = atof(argv[optind]); findme = atof(argv[optind+1]); break; case 'H': case 'h': printf("%s", msg); return 0; case 'n': case 'N': if (optarg[0]=='o'){ //normal param1 = atof(argv[optind]); param2 = atof(argv[optind+1]); findme = atof(argv[optind+2]); } else if (optarg[0]=='e'){ distribution = Negbinom; param1 = atof(argv[optind]); param2 = atof(argv[optind+1]); findme = atof(argv[optind+2]); } else { printf("I can't parse the option -n%s\n", optarg); exit(0); } break; case 'p': if (!optarg || optarg[0] == 'v') pval++; else if (optarg[0] == 'o'){ distribution = Poisson; param1 = atof(argv[optind]); findme = atof(argv[optind+1]); } else { printf("I can't parse the option -p%s\n", optarg); exit(0); } break; case 'q': qval++; break; case 'T': case 't': distribution = T; param1 = atof(argv[optind]); findme = atof(argv[optind+1]); break; case '?'://probably a negative number if (optarg) snprintf(number, 1000, "%c%s", optopt, optarg); else snprintf(number, 1000, "%c", optopt); if (gsl_isnan(param1)) param1 = -atof(number); else if (gsl_isnan(param2)) param2 = -atof(number); else if (gsl_isnan(findme)) findme = -atof(number); } } if (gsl_isnan(findme)) findme = atof(argv[optind]); //defaults, as promised if (gsl_isnan(param1)) param1 = 0; if (gsl_isnan(param2)) param2 = 1; if (!pval && !qval){ double val = distribution == Beta ? gsl_ran_beta_pdf(findme, param1, param2) : distribution == Binomial ? gsl_ran_binomial_pdf(findme, param2, param1) : distribution == F ? gsl_ran_fdist_pdf(findme, param1, param2) : distribution == Negbinom ? gsl_ran_negative_binomial_pdf(findme, param2, param1) : distribution == Normal ? gsl_ran_gaussian_pdf(findme, param2)+param1 : distribution == Poisson ? gsl_ran_poisson_pdf(findme, param1) : distribution == T ? gsl_ran_tdist_pdf(findme, param1) : GSL_NAN; printf("%g\n", val); return 0; } if (distribution == Binomial){ printf("Sorry, the GSL doesn't have a Binomial CDF.\n"); return 0; } if (distribution == Negbinom){ printf("Sorry, the GSL doesn't have a Negative Binomial CDF.\n"); return 0; } if (distribution == Poisson){ printf("Sorry, the GSL doesn't have a Poisson CDF.\n"); return 0; } if (pval){ double val = distribution == Beta ? gsl_cdf_beta_P(findme, param1, param2) : distribution == F ? gsl_cdf_fdist_P(findme, param1, param2) : distribution == Normal ? gsl_cdf_gaussian_P(findme-param1, param2) : distribution == T ? gsl_cdf_tdist_P(findme, param1) : GSL_NAN; printf("%g\n", val); return 0; } if (qval){ double val = distribution == Beta ? gsl_cdf_beta_Q(findme, param1, param2) : distribution == F ? gsl_cdf_fdist_Q(findme, param1, param2) : distribution == Normal ? gsl_cdf_gaussian_Q(findme-param1, param2) : distribution == T ? gsl_cdf_tdist_Q(findme, param1) : GSL_NAN; printf("%g\n", val); } }