Example #1
0
/** \brief Estimate by ML the effect size of the genotype, the std deviation 
 *  of the errors and the std error of the estimated effect size in the 
 *  multiple linear regression Y = XB + E with E~MVN(0,sigma^2I)
 *  \note genotype supposed to be 2nd column of X
 */
  void FitSingleGeneWithSingleSnp(const gsl_matrix * X,
				  const gsl_vector * y,
				  double & pve,
				  double & sigmahat,
				  double & betahat_geno,
				  double & sebetahat_geno,
				  double & betapval_geno)
  {
    size_t N = X->size1, P = X->size2, rank;
    double rss;
    gsl_vector * Bhat = gsl_vector_alloc(P);
    gsl_matrix * covBhat = gsl_matrix_alloc(P, P);
    gsl_multifit_linear_workspace * work = gsl_multifit_linear_alloc(N, P);
    gsl_multifit_linear_svd(X, y, GSL_DBL_EPSILON, &rank, Bhat, covBhat,
			    &rss, work);
    pve = 1 - rss / gsl_stats_tss(y->data, y->stride, y->size);
    sigmahat = sqrt(rss / (double)(N-rank));
    betahat_geno = gsl_vector_get(Bhat, 1);
    sebetahat_geno = sqrt(gsl_matrix_get (covBhat, 1, 1));
    betapval_geno = 2 * gsl_cdf_tdist_Q(fabs(betahat_geno / sebetahat_geno),
					N-rank);
    gsl_vector_free(Bhat);
    gsl_matrix_free(covBhat);
    gsl_multifit_linear_free(work);
  }
Example #2
0
double rsComputePValueFromTValue(const double T, const int df)
{
	if ( T < 0 ) {
		return -gsl_cdf_tdist_P(T, df);
	} else {
		return gsl_cdf_tdist_Q(T, df);
	}
}
Example #3
0
/* Purpose: calculate z test for proportions, two-tailed for two-sided hypothesis H1 != H0
   Parameters:  p_sample = proportion of events in sample [0..1] 
	 	p_population  = proportion of events in population [0..1] 
 		n  = sample size 
   Returns: Probability of making an error when rejecting the NULL hypothesis 
*/
double gstats_test_ZP2 (double p_sample, double p_population, int n) {
	double z;
	double result = 0.0;
	
	
	if ((p_sample < 0.0) || (p_sample > 1.0)) {
		gstats_error ("z-stat: sample proportion must be given as 0.0 - 1.0.");
	}
	
	if ((p_population < 0.0) || (p_population > 1.0)) {
		gstats_error ("z-stat: population proportion must be given as 0.0 - 1.0.");
	}
	
	if (n < 2) {
		gstats_error ("z-stat: size of population must be a positive integer > 0.");
	}
	
	z = (p_sample - p_population) / 
	    (sqrt ( (p_population * (1-p_population)) / n) );
	
	/* determine direction of numeric integration */
	if ( p_sample > p_population ) {		
		/* we use a normal distribution, if n is at least 30, */
		/* t-distribution with n - 1 degress of freedom, otherwise */
		if (n >= 30) {
			result = gsl_cdf_ugaussian_Q (z);
		} else {
			result = gsl_cdf_tdist_Q (z, (double) n-1);
		}
	}
	if ( p_sample < p_population ) {
		/* we use a normal distribution, if n is at least 30, */
		/* t-distribution with n - 1 degress of freedom, otherwise */
		if (n >= 30) {
			result = gsl_cdf_ugaussian_P (z);
		} else {
			result = gsl_cdf_tdist_P (z, (double) n-1);
		}
	}
	if ( p_sample == p_population ) {
		result = 0.5;
	}
	
	return (result * 2);	
}
Example #4
0
int main(){
    apop_db_open("data-census.db");
    gsl_vector *n = apop_query_to_vector("select in_per_capita from income "
            "where state= (select state from geography where name ='North Dakota')");
    gsl_vector *s = apop_query_to_vector("select in_per_capita from income  "
            "where state= (select state from geography where name ='South Dakota')");

    double n_count = n->size,
           n_mean  = apop_vector_mean(n),
           n_var   = apop_vector_var(n),
           s_count = s->size,
           s_mean  = apop_vector_mean(s),
           s_var   = apop_vector_var(s);

    double  stat      = fabs(n_mean - s_mean)/ sqrt(n_var/ (n_count-1) + s_var/(s_count-1));
    double confidence = 1 - (2 * gsl_cdf_tdist_Q(stat, n_count + s_count -2));
    printf("Reject the null with %g%% confidence\n", confidence*100);
}
Example #5
0
double
significance_of_correlation (double rho, double w)
{
  double t = w - 2;

  /* |rho| will mathematically always be in the range [0, 1.0].  Inaccurate
     calculations sometimes cause it to be slightly greater than 1.0, so
     force it into the correct range to avoid NaN from sqrt(). */
  t /= 1 - MIN (1, pow2 (rho));

  t = sqrt (t);
  t *= rho;
  
  if (t > 0)
    return  gsl_cdf_tdist_Q (t, w - 2);
  else
    return  gsl_cdf_tdist_P (t, w - 2);
}
Example #6
0
bool TTestReductionPop2(WiggleIterator * wi) {
	if (wi->done)
		return true;

	SetComparisonData * data = (SetComparisonData *) wi->data;
	Multiset * multi = data->multi;

	if (multi->done) {
		wi->done = true;
		return true;
	}

	// Go to first position where both of the sets have at least one value
	while (!multi->inplay[0] || !multi->inplay[1]) {
		popMultiset(multi);
		if (multi->done) {
			wi->done = true;
			return true;
		}
	}
	wi->chrom = multi->chrom;
	wi->start = multi->start;
	wi->finish = multi->finish;

	// Compute measurements
	double sum1, sum2, sumSq1, sumSq2;
	int count1, count2;
	int index;

	sum1 = sum2 = 0;
	sumSq1 = sumSq2 = 0;
	count1 = multi->multis[0]->count;
	count2 = multi->multis[1]->count;

	for (index = 0; index < multi->multis[0]->count; index++) {
		if (multi->multis[0]->inplay[index]) {
			sum1 += multi->values[0][index];
			sumSq1 += multi->values[0][index] * multi->values[0][index];
		}
	}

	for (index = 0; index < multi->multis[1]->count; index++) {
		if (multi->multis[1]->inplay[index]) {
			sum2 += multi->values[1][index];
			sumSq2 += multi->values[1][index] * multi->values[1][index];
		}
	}

	// To avoid divisions by 0:
	if (count1 == 0 || count2 == 0) {
		popMultiset(multi);
		pop(wi);	
		return false;
	}

	double mean1 = sum1 / count1;
	double mean2 = sum2 / count2;
	double meanSq1 = sumSq1 / count1;
	double meanSq2 = sumSq2 / count2;
	double var1 = meanSq1 - mean1 * mean1;
	double var2 = meanSq2 - mean2 * mean2;

	// To avoid divisions by 0:
	if (var1 + var2 == 0) {
		popMultiset(multi);
		return false;
	}

	// T-statistic

	double t = (mean1 - mean2) / sqrt(var1 / count1 + var2 / count2);

	if (t < 0)
		t = -t;

	// Degrees of freedom
	
	double nu = (var1 / count1 + var2 / count2) * (var1 / count1 + var2 / count2) / ((var1 * var1) / (count1 * count1 * (count1 - 1)) + (var2 * var2) / (count2 * count2 * (count2 - 1)));

	// P-value

	wi->value = 2 * gsl_cdf_tdist_Q(t, nu);

	// Update inputs
	popMultiset(multi);
	return true;
}
Example #7
0
double
gsl_cdf_tdist_Qinv (const double Q, const double nu)
{
  double x, qtail;

  if (Q == 0.0)
    {
      return GSL_POSINF;
    }
  else if (Q == 1.0)
    {
      return GSL_NEGINF;
    }

  if (nu == 1.0)
    {
      x = tan (M_PI * (0.5 - Q));
      return x;
    }
  else if (nu == 2.0)
    {
      x = (1 - 2 * Q) / sqrt (2 * Q * (1 - Q));
      return x;
    }

  qtail = (Q < 0.5) ? Q : 1 - Q;

  if (sqrt (M_PI * nu / 2) * qtail > pow (0.05, nu / 2))
    {
      double xg = gsl_cdf_ugaussian_Qinv (Q);
      x = inv_cornish_fisher (xg, nu);
    }
  else
    {
      /* Use an asymptotic expansion of the tail of integral */

      double beta = gsl_sf_beta (0.5, nu / 2);

      if (Q < 0.5)
        {
          x = sqrt (nu) * pow (beta * nu * Q, -1.0 / nu);
        }
      else
        {
          x = -sqrt (nu) * pow (beta * nu * (1 - Q), -1.0 / nu);
        }

      /* Correct nu -> nu/(1+nu/x^2) in the leading term to account
         for higher order terms. This avoids overestimating x, which
         makes the iteration unstable due to the rapidly decreasing
         tails of the distribution. */

      x /= sqrt (1 + nu / (x * x));
    }

  {
    double dQ, phi;
    unsigned int n = 0;

  start:
    dQ = Q - gsl_cdf_tdist_Q (x, nu);
    phi = gsl_ran_tdist_pdf (x, nu);

    if (dQ == 0.0 || n++ > 32)
      goto end;

    {
      double lambda = - dQ / phi;
      double step0 = lambda;
      double step1 = ((nu + 1) * x / (x * x + nu)) * (lambda * lambda / 4.0);

      double step = step0;

      if (fabs (step1) < fabs (step0))
        {
          step += step1;
        }

      if (Q < 0.5 && x + step < 0)
        x /= 2;
      else if (Q > 0.5 && x + step > 0)
        x /= 2;
      else
        x += step;

      if (fabs (step) > 1e-10 * fabs (x))
        goto start;
    }
  }

end:

  return x;
}
Example #8
0
int main(int argc, char **argv){
    distlist distribution = Normal;
    char	 msg[10000], c;
    int      pval = 0, qval = 0;
    double   param1 = GSL_NAN, param2 =GSL_NAN, findme = GSL_NAN;
    char     number[1000];
	sprintf(msg, "%s [opts] number_to_lookup\n\n"
    "Look up a probability or p-value for a given standard distribution.\n"
    "[This is still loosely written and counts as beta. Notably, negative numbers are hard to parse.]\n"
    "E.g.:\n"
    "%s -dbin 100 .5 34\n"
    "sets the distribution to a Binomial(100, .5), and find the odds of 34 appearing.\n"
    "%s -p 2     \n"
    "find the area of the Normal(0,1) between -infty and 2.  \n"
    "\n"
    "-pval Find the p-value: integral from -infinity to your value\n"
    "-qval Find the q-value: integral from your value to infinity\n"
    "\n"
    "After giving an optional -p or -q, specify the distribution. \n"
    "Default is Normal(0, 1). Other options:\n"
    "\t\t-binom Binomial(n, p)\n"
    "\t\t-beta Beta(a, b)\n"
    "\t\t-f F distribution(df1, df2)\n"
    "\t\t-norm Normal(mu, sigma)\n"
    "\t\t-negative bin Negative binomial(n, p)\n"
    "\t\t-poisson Poisson(L)\n"
    "\t\t-t t distribution(df)\n"
    "I just need enough letters to distinctly identify a distribution.\n"
, argv[0], argv[0], argv[0]); 

    opterr=0;
	if(argc==1){
		printf("%s", msg);
		return 0;
	}
	while ((c = getopt (argc, argv, "B:b:F:f:N:n:pqT:t:")) != -1){
		switch (c){
		  case 'B':
		  case 'b':
              if (optarg[0]=='i')
                  distribution = Binomial;
              else if (optarg[0]=='e')
                  distribution = Beta;
            else {
                printf("I can't parse the option -b%s\n", optarg);
                exit(0);
            }
              param1 = atof(argv[optind]);
              param2 = atof(argv[optind+1]);
              findme =  atof(argv[optind+2]);
			  break;
          case 'F':
          case 'f':
            distribution = F;
            param1 = atof(argv[optind]);
            findme =  atof(argv[optind+1]);
            break;
          case 'H':
		  case 'h':
			printf("%s", msg);
			return 0;
          case 'n':
          case 'N':
            if (optarg[0]=='o'){ //normal
                  param1 = atof(argv[optind]);
                  param2 = atof(argv[optind+1]);
                  findme =  atof(argv[optind+2]);
            } else if (optarg[0]=='e'){
                  distribution = Negbinom;
                  param1 = atof(argv[optind]);
                  param2 = atof(argv[optind+1]);
                  findme =  atof(argv[optind+2]);
            } else {
                printf("I can't parse the option -n%s\n", optarg);
                exit(0);
            }
			  break;
          case 'p':
            if (!optarg || optarg[0] == 'v')
                pval++;
            else if (optarg[0] == 'o'){
                distribution = Poisson;
                param1 = atof(argv[optind]);
                findme =  atof(argv[optind+1]);
            } else {
                printf("I can't parse the option -p%s\n", optarg);
                exit(0);
            }
            break;
          case 'q':
            qval++;
            break;
          case 'T':
          case 't':
            distribution = T;
            param1 = atof(argv[optind]);
            findme =  atof(argv[optind+1]);
            break;
          case '?'://probably a negative number
            if (optarg)
                 snprintf(number, 1000, "%c%s", optopt, optarg);
            else snprintf(number, 1000, "%c", optopt);
            if (gsl_isnan(param1)) param1 = -atof(number);
            else if (gsl_isnan(param2)) param2 = -atof(number);
            else if (gsl_isnan(findme)) findme = -atof(number);
		}
	}
    if (gsl_isnan(findme)) findme =  atof(argv[optind]);
    //defaults, as promised
    if (gsl_isnan(param1)) param1 = 0;
    if (gsl_isnan(param2)) param2 = 1;
    if (!pval && !qval){
        double val =
        distribution == Beta ? gsl_ran_beta_pdf(findme, param1, param2)
        : distribution == Binomial ? gsl_ran_binomial_pdf(findme, param2, param1)
        : distribution == F ? gsl_ran_fdist_pdf(findme, param1, param2) 
        : distribution == Negbinom ? gsl_ran_negative_binomial_pdf(findme, param2, param1)
        : distribution == Normal ? gsl_ran_gaussian_pdf(findme, param2)+param1
        : distribution == Poisson ? gsl_ran_poisson_pdf(findme, param1) 
        : distribution == T ? gsl_ran_tdist_pdf(findme, param1) : GSL_NAN;
        printf("%g\n", val); 
        return 0;
    }
    if (distribution == Binomial){
        printf("Sorry, the GSL doesn't have a Binomial CDF.\n");
        return 0; }
    if (distribution == Negbinom){
        printf("Sorry, the GSL doesn't have a Negative Binomial CDF.\n");
        return 0; }
    if (distribution == Poisson){
        printf("Sorry, the GSL doesn't have a Poisson CDF.\n");
        return 0; }
    if (pval){
        double val =
        distribution == Beta ? gsl_cdf_beta_P(findme, param1, param2)
        : distribution == F ? gsl_cdf_fdist_P(findme, param1, param2) 
        : distribution == Normal ? gsl_cdf_gaussian_P(findme-param1, param2)
        : distribution == T ? gsl_cdf_tdist_P(findme, param1) : GSL_NAN;
        printf("%g\n", val); 
        return 0;
    }
    if (qval){
        double val =
        distribution == Beta ? gsl_cdf_beta_Q(findme, param1, param2)
        : distribution == F ? gsl_cdf_fdist_Q(findme, param1, param2) 
        : distribution == Normal ? gsl_cdf_gaussian_Q(findme-param1, param2)
        : distribution == T ? gsl_cdf_tdist_Q(findme, param1) : GSL_NAN;
        printf("%g\n", val); 
    }
}