Beispiel #1
0
/// Incomplete beta function.
///
/// Thin wrapper around gsl_sf_beta_inc_e() that converts a non-success GSL
/// status into a C++ exception.
///
/// @param a  first argument forwarded to gsl_sf_beta_inc_e
/// @param b  second argument forwarded to gsl_sf_beta_inc_e
/// @param x  third argument forwarded to gsl_sf_beta_inc_e
/// @return   the `val` field of the gsl_sf_result filled in by GSL
/// @throws std::runtime_error if GSL reports anything other than GSL_SUCCESS;
///         the message has the form "Error in ibeta: a=<a> b=<b> x=<x>".
double
ibeta(double a, double b, double x)
{
  gsl_sf_result result;
  const int stat = gsl_sf_beta_inc_e(a, b, x, &result);
  if (stat != GSL_SUCCESS)
    {
      // Stream the prefix instead of handing it to the constructor: an
      // ostringstream constructed from a string starts writing at position 0
      // (no std::ios_base::ate), so later insertions would overwrite the
      // prefix rather than append to it.
      std::ostringstream msg;
      msg << "Error in ibeta:" << " a=" << a << " b=" << b << " x=" << x;
      throw std::runtime_error(msg.str());
    }
  return result.val;
}
Beispiel #2
0
	/* To adjust for local mappable K-mer density, just reduce the densityWindowSize
	   (from 50,000 etc) to however many sites in that window are uniquely mappable
	   by K-mers.   But that throws off the probZ calc in a bad way.
	   I use adjustedNumSitesInCluster as an estimate of how many sites I would observe
	   in the density window if all the sites were actually mappable.
	*/
	double calculateZScore( int basesSpannedByCluster, int numSitesInCluster,
			int densityWindowSize, int numSitesInDensityWindow, int numMappableSites )
	{

		int fewEnoughSites = 2; //densityWinSmall; //densityWindowSize / 2;
		// Now using exact counts from the interval, so we shouldn't need this anymore:
		if (numMappableSites < fewEnoughSites)
		{
			// let's put a limit on the adjustment
			std::cerr << "Warning: only " << numMappableSites
						<< " of " << densityWindowSize
						<< " are mappable, increasing to "
						<< fewEnoughSites << "..." << std::endl;
			numMappableSites = fewEnoughSites;
		}

		// Adjust probZ using mappable sites.
		double probZ = ((double)basesSpannedByCluster) / (double)numMappableSites;
		double meanZ = numSitesInDensityWindow * probZ;
		double sdZ = std::sqrt(numSitesInDensityWindow * probZ * (1-probZ));
		double zScore = (numSitesInCluster - meanZ) / sdZ;
		// p-value = P(X >= k), where k = number tags in the cluster
		// = 1 - P(X < k) = 1 - P(X <= k - 1).  For the binomial distribution, the cdf is given by
		// the normalized (or regularized) incomplete beta function, I_x(a,b) (see wikipedia), and
		// P(X <= k-1) = I_(1-p)(n - (k-1), k), where p = probability, n = number of tags in the
		// density window.

		// Number mappable bases genome-wide, from ~rthurman/proj/dhs-peaks/results/fdr/fdr.R
		if ( !useGenomeDensWin )
		  return zScore;
		else
		{
			double probZgw = ((double)basesSpannedByCluster) / mpblGenomeSize;
			double meanZgw = backgroundTotalTagCount * probZgw;
			double sdZgw = std::sqrt(backgroundTotalTagCount * probZgw * (1-probZgw));
			double zScoregw = (numSitesInCluster - meanZgw) / sdZgw;
			//std::printf("zScoregw = %f, zScore = %f\n", zScoregw, zScore);
			gsl_sf_result beta_result;
			double pVal, pValgw;
			pVal = 0;
			pValgw = 0;
			int status = gsl_sf_beta_inc_e(numSitesInCluster, numSitesInDensityWindow - numSitesInCluster + 1, probZ, &beta_result);
			if(status == 15){
				//std::printf("Warning: underflow in pval computation, setting pVal = 0.\n");
				pVal = 0.0;
			}
			else if(status){
				//std::printf("gsl_sf_beta_inc_e error: status = %d, error = %s, setting pVal = 1.\n", status, gsl_strerror(status));
				pVal = 1.0;
			}
			else
			{
				pVal = beta_result.val;
			}
			status = gsl_sf_beta_inc_e(numSitesInCluster, backgroundTotalTagCount - numSitesInCluster + 1, probZgw, &beta_result);
			if(status == 15){
				//std::printf("Warning: underflow in pval computation, setting pValgw = 0.\n");
				pValgw = 0.0;
			}
			else if(status){
				//std::printf("gsl_sf_beta_inc_e error: status = %d, error = %s, setting pValgw = 1.\n", status, gsl_strerror(status));
				pValgw = 1.0;
			}
			else
			{
				pValgw = beta_result.val;
			}
			//std::printf("zScoregw = %f, zScore = %f, pVal = %g, pValgw = %g\n", zScoregw, zScore, pVal, pValgw);
			if (zScoregw < zScore){
				//std::printf("Genome-wide density used.\n");
				genomeDensZ += zScoregw;
				numGenomeDens++;
				return zScoregw;
			}else{
				numLocalDens++;
				localDensZ += zScore;
				return zScore;
			}
		}
	}
Beispiel #3
0
/* Convenience form of gsl_sf_beta_inc_e() that returns a plain double.
 *
 * Forwards a, b and x unchanged to gsl_sf_beta_inc_e().  The body consists
 * solely of the EVAL_RESULT macro, which is defined outside this excerpt.
 * NOTE(review): since neither `result` nor a return statement appears here,
 * the macro presumably declares `result`, handles a non-success status, and
 * returns the computed value -- confirm against the macro's definition.
 */
double gsl_sf_beta_inc(const double a, const double b, const double x)
{
  EVAL_RESULT(gsl_sf_beta_inc_e(a, b, x, &result));
}