/// Incomlete beta functions. double ibeta(double a, double b, double x) { gsl_sf_result result; int stat = gsl_sf_beta_inc_e(a, b, x, &result); if (stat != GSL_SUCCESS) { std::ostringstream msg("Error in ibeta:"); msg << " a=" << a << " b=" << b << " x=" << x; throw std::runtime_error(msg.str()); } else return result.val; }
/* To adjust for local mappable K-mer density, just reduce the densityWindowSize (from 50,000 etc) to however many sites in that window are uniquely mappable by K-mers. But that throws off the probZ calc in a bad way. I use adjustedNumSitesInCluster as an estimate of how many sites I would observe in the density window if all the sites were actually mappable. */ double calculateZScore( int basesSpannedByCluster, int numSitesInCluster, int densityWindowSize, int numSitesInDensityWindow, int numMappableSites ) { int fewEnoughSites = 2; //densityWinSmall; //densityWindowSize / 2; // Now using exact counts from the interval, so we shouldn't need this anymore: if (numMappableSites < fewEnoughSites) { // let's put a limit on the adjustment std::cerr << "Warning: only " << numMappableSites << " of " << densityWindowSize << " are mappable, increasing to " << fewEnoughSites << "..." << std::endl; numMappableSites = fewEnoughSites; } // Adjust probZ using mappable sites. double probZ = ((double)basesSpannedByCluster) / (double)numMappableSites; double meanZ = numSitesInDensityWindow * probZ; double sdZ = std::sqrt(numSitesInDensityWindow * probZ * (1-probZ)); double zScore = (numSitesInCluster - meanZ) / sdZ; // p-value = P(X >= k), where k = number tags in the cluster // = 1 - P(X < k) = 1 - P(X <= k - 1). For the binomial distribution, the cdf is given by // the normalized (or regularized) incomplete beta function, I_x(a,b) (see wikipedia), and // P(X <= k-1) = I_(1-p)(n - (k-1), k), where p = probability, n = number of tags in the // density window. 
// Number mappable bases genome-wide, from ~rthurman/proj/dhs-peaks/results/fdr/fdr.R if ( !useGenomeDensWin ) return zScore; else { double probZgw = ((double)basesSpannedByCluster) / mpblGenomeSize; double meanZgw = backgroundTotalTagCount * probZgw; double sdZgw = std::sqrt(backgroundTotalTagCount * probZgw * (1-probZgw)); double zScoregw = (numSitesInCluster - meanZgw) / sdZgw; //std::printf("zScoregw = %f, zScore = %f\n", zScoregw, zScore); gsl_sf_result beta_result; double pVal, pValgw; pVal = 0; pValgw = 0; int status = gsl_sf_beta_inc_e(numSitesInCluster, numSitesInDensityWindow - numSitesInCluster + 1, probZ, &beta_result); if(status == 15){ //std::printf("Warning: underflow in pval computation, setting pVal = 0.\n"); pVal = 0.0; } else if(status){ //std::printf("gsl_sf_beta_inc_e error: status = %d, error = %s, setting pVal = 1.\n", status, gsl_strerror(status)); pVal = 1.0; } else { pVal = beta_result.val; } status = gsl_sf_beta_inc_e(numSitesInCluster, backgroundTotalTagCount - numSitesInCluster + 1, probZgw, &beta_result); if(status == 15){ //std::printf("Warning: underflow in pval computation, setting pValgw = 0.\n"); pValgw = 0.0; } else if(status){ //std::printf("gsl_sf_beta_inc_e error: status = %d, error = %s, setting pValgw = 1.\n", status, gsl_strerror(status)); pValgw = 1.0; } else { pValgw = beta_result.val; } //std::printf("zScoregw = %f, zScore = %f, pVal = %g, pValgw = %g\n", zScoregw, zScore, pVal, pValgw); if (zScoregw < zScore){ //std::printf("Genome-wide density used.\n"); genomeDensZ += zScoregw; numGenomeDens++; return zScoregw; }else{ numLocalDens++; localDensZ += zScore; return zScore; } } }
/* Value-returning form of gsl_sf_beta_inc_e(): takes the same a, b, x
 * arguments and yields a plain double.
 *
 * The whole body is one invocation of the EVAL_RESULT macro, whose definition
 * is not visible here.  `result` is not declared in this function, so
 * presumably the macro declares it, evaluates the call it is given, handles a
 * failing status and returns result.val -- confirm against the macro
 * definition before changing this line.
 * NOTE(review): if the macro stringifies its argument for error reporting,
 * even reformatting the call expression would change the reported text.
 */
double gsl_sf_beta_inc(const double a, const double b, const double x) { EVAL_RESULT(gsl_sf_beta_inc_e(a, b, x, &result)); }