Example #1
0
/*
 * SQL-callable wrapper around studentT_cdf().
 *
 * Validates the call before delegating: rejects calls with the wrong
 * argument count, is strict with respect to NULL inputs, and returns
 * NULL for a non-positive degree of freedom.
 */
Datum student_t_cdf(PG_FUNCTION_ARGS)
{
    int32		degreeOfFreedom;
    float8		tValue;

    /*
     * Defensive check: make sure the function was not registered in the
     * catalog with a signature we do not support.
     */
    if (PG_NARGS() != 2)
        ereport(ERROR,
                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                 errmsg("function \"%s\" called with invalid parameters",
                        format_procedure(fcinfo->flinfo->fn_oid))));

    /* Strict NULL handling: any NULL input yields a NULL result. */
    if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
        PG_RETURN_NULL();

    degreeOfFreedom = PG_GETARG_INT32(0);
    tValue = PG_GETARG_FLOAT8(1);

    /* The distribution is only defined for a positive degree of freedom. */
    if (degreeOfFreedom <= 0)
        PG_RETURN_NULL();

    PG_RETURN_FLOAT8(studentT_cdf(degreeOfFreedom, tValue));
}
Example #2
0
/**
 * In-database entry point: unpack the SQL arguments and perform the
 * additional domain checking needed before delegating to studentT_cdf().
 */
AnyValue student_t_cdf(AbstractDBInterface &db, AnyValue args) {
    AnyValue::iterator argument(args);

    // Unpack the arguments passed in from the SQL call
    const int64_t degreeOfFreedom = *argument++;
    const double tValue = *argument;

    // The distribution is only defined for a positive degree of freedom
    if (degreeOfFreedom <= 0) {
        throw std::domain_error("Student-t distribution undefined for "
            "degree of freedom <= 0");
    }

    return studentT_cdf(degreeOfFreedom, tValue);
}
Example #3
0
/*----------
 * Do the computations requested from final functions.
 *
 * Compute regression coefficients, coefficient of determination (R^2),
 * t-statistics, and p-values whenever the respective argument is non-NULL.
 * Since these functions share a lot of computation, they have been distilled
 * into this function.
 *
 * inState    - aggregated sufficient statistics (len, X^T X, X^T y,
 *              sum(y), sum(y^2), row count)
 * outCoef    - if non-NULL, receives the coefficient array
 * outR2      - if non-NULL, receives the coefficient of determination
 * outTStats  - if non-NULL, receives the array of t-statistics
 * outPValues - if non-NULL, receives the array of p-values
 *
 * First, we compute the regression coefficients as:
 *   
 *   c = (X^T X)^+ * X^T * y = X^+ * y
 *
 * where:
 * 
 *   X^T = the transpose of X
 *   X^+ = the pseudo-inverse of X
 *
 * The identity X^+ = (X^T X)^+ X^T holds for all matrices X, a proof
 * can be found here:
 * http://en.wikipedia.org/wiki/Proofs_involving_the_Moore%2DPenrose_pseudoinverse
 *
 * Note that when the system X c = y is satisfiable (because (X|c) has rank at
 * most inState->len), then setting c = X^+ y means that |c|_2 <= |d|_2 for all
 * solutions d satisfying X d = y.
 * (See http://en.wikipedia.org/wiki/Moore%2DPenrose_pseudoinverse)
 *
 * Caveat: Explicitly computing (X^T X)^+ can become a significant source of
 * numerical rounding errors (see, e.g., 
 * http://en.wikipedia.org/wiki/Moore%2DPenrose_pseudoinverse#Construction
 * or http://www.mathworks.com/moler/leastsquares.pdf p.16).
 *----------
 */
static void
float8_mregr_compute(MRegrState	*inState,
					 ArrayType	**outCoef,
					 float8		*outR2,
					 ArrayType	**outTStats,
					 ArrayType	**outPValues)
{
	ArrayType   *coef_array, *stdErr_array, *tStats_array = NULL, *pValues_array = NULL;
	float8      ess = 0., /* explained sum of squares (regression sum of squares) */
	            tss = 0., /* total sum of squares */
	            rss, /* residual sum of squares */
	            r2,
	            variance;
	float8      *XtX_inv, *coef, *stdErr, *tStats = NULL, *pValues = NULL;
	uint32      i;
	
	/*
	 * Precondition: inState->len * inState->len * sizeof(float8) < STATE_LEN(inState->len)
	 *           and IS_FEASIBLE_STATE_LEN(STATE_LEN(inState->len))
	 */
	XtX_inv = palloc((uint64) inState->len * inState->len * sizeof(float8));
	pinv(inState->len, inState->len, inState->XtX, XtX_inv);
	
	/*
	 * FIXME: Decide on whether we want to display an info message or rather
	 * provide a second function that tells how well the data is conditioned.
	 *
	 * Check if we should expect multicollinearity. [MPP-13582]
	 *
	 * See:
	 * Lichtblau, Daniel and Weisstein, Eric W. "Condition Number."
	 * From MathWorld--A Wolfram Web Resource.
	 * http://mathworld.wolfram.com/ConditionNumber.html
	 *
	if (condNoOfXtX > 1000)
		ereport(INFO, 
		        (errmsg("matrix X^T X is ill-conditioned"),
		         errdetail("condition number = %f", condNoOfXtX),
		         errhint("This can indicate strong multicollinearity.")));
	 */
	
	/*
	 * We want to return a one-dimensional array (as opposed to a
	 * two-dimensional array).
	 *
	 * Note: Calling construct_array with NULL as first arguments is a Greenplum
	 * extension
	 */
	coef_array = construct_array(NULL, inState->len,
								 FLOAT8OID, sizeof(float8), true, 'd');
	coef = (float8 *) ARR_DATA_PTR(coef_array);
	/* c = (X^T X)^+ * (X^T y) */
	symmetricMatrixTimesVector(inState->len, XtX_inv, inState->Xty, coef);

	if (outCoef)
		*outCoef = coef_array;
	
	if (outR2 || outTStats || outPValues)
	{	
		/*----------
		 * Computing the total sum of squares (tss) and the explained sum of squares (ess)
		 *
		 *   ess = y'X * c - sum(y)^2/n
		 *   tss = sum(y^2) - sum(y)^2/n
		 *   R^2 = ess/tss
		 *----------
		 */
		ess = dotProduct(inState->len, inState->Xty, coef)
			  - inState->sumy*inState->sumy/inState->count;
		tss = inState->sumy2 - inState->sumy * inState->sumy / inState->count;
		
		/*
		 * With infinite precision, the following checks are pointless. But due to
		 * floating-point arithmetic, this need not hold at this point.
		 * Without a formal proof convincing us of the contrary, we should
		 * anticipate that numerical peculiarities might occur.
		 */
		if (tss < 0)
			tss = 0;
		if (ess < 0)
			ess = 0;
		/*
		 * Since we know tss with greater accuracy than ess, we do the following
		 * sanity adjustment to ess:
		 */
		if (ess > tss)
			ess = tss;
	}
	
	if (outR2)
	{
		/*
		 * coefficient of determination
		 * If tss == 0, then the regression perfectly fits the data, so the
		 * coefficient of determination is 1.
		 */
		r2 = (tss == 0 ? 1 : ess / tss);
		*outR2 = r2;
	}
	
	if (outTStats || outPValues)
	{
		/* One flat float8 array each for standard errors, t-stats, p-values */
		stdErr_array = construct_array(NULL, inState->len,
		                               FLOAT8OID, sizeof(float8), true, 'd');
		stdErr = (float8 *) ARR_DATA_PTR(stdErr_array);
		tStats_array = construct_array(NULL, inState->len,
		                               FLOAT8OID, sizeof(float8), true, 'd');
		tStats = (float8 *) ARR_DATA_PTR(tStats_array);
		pValues_array = construct_array(NULL, inState->len,
		                                FLOAT8OID, sizeof(float8), true, 'd');
		pValues = (float8 *) ARR_DATA_PTR(pValues_array);

		/*
		 * Computing t-statistics and p-values
		 *
		 * Total sum of squares (tss) = Residual Sum of squares (rss) +
		 * Explained Sum of Squares (ess) for linear regression.
		 * Proof: http://en.wikipedia.org/wiki/Sum_of_squares
		 */
		rss = tss - ess;
		
		/* Variance is also called the Mean Square Error */
		variance = rss / (inState->count - inState->len);
		
		/*
		 * The t-statistic for each coef[i] is coef[i] / stdErr[i]
		 * where stdErr[i] is the standard error of coef[i], i.e.,
		 * the square root of the i'th diagonal element of
		 * variance * (X^T X)^{-1}.
		 */
		for (i = 0; i < inState->len; i++) {
			/*
			 * In an abundance of caution, we see a tiny possibility that numerical
			 * instabilities in the pinv operation can lead to negative values on
			 * the main diagonal of even a SPD matrix
			 */
			if (XtX_inv[i * (inState->len + 1)] < 0) {
				stdErr[i] = 0;
			} else {
				stdErr[i] = sqrt( variance * XtX_inv[i * (inState->len + 1)] );
			}
			
			if (coef[i] == 0 && stdErr[i] == 0) {
				/*
				 * In this special case, 0/0 should be interpreted as 0:
				 * We know that 0 is the exact value for the coefficient, so
				 * the t-value should be 0 (corresponding to a p-value of 1)
				 */
				tStats[i] = 0;
			} else {
				/*
				 * If stdErr[i] == 0 then abs(tStats[i]) will be infinity, which is
				 * what we need.
				 */
				tStats[i] = coef[i] / stdErr[i];
			}
		}
	}
	
	if (outTStats)
		*outTStats = tStats_array;
	
	if (outPValues) {
		for (i = 0; i < inState->len; i++)
			if (inState->count <= inState->len) {
				/*
				 * This test is purely for detecting long int overflows because
				 * studentT_cdf expects an unsigned long int as first argument.
				 */
				pValues[i] = NAN;
			} else {
				/* Two-sided p-value for the t-statistic with (count - len) d.o.f. */
				pValues[i] = 2. * (1. - studentT_cdf(
						(uint64) (inState->count - inState->len), fabs( tStats[i] )
					));
			}
		
		*outPValues = pValues_array;
	}
}