Datum student_t_cdf(PG_FUNCTION_ARGS) { int32 nu; float8 t; /* * Perform all the error checking needed to ensure that no one is * trying to call this in some sort of crazy way. */ if (PG_NARGS() != 2) { ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("function \"%s\" called with invalid parameters", format_procedure(fcinfo->flinfo->fn_oid)))); } if (PG_ARGISNULL(0) || PG_ARGISNULL(1)) PG_RETURN_NULL(); nu = PG_GETARG_INT32(0); t = PG_GETARG_FLOAT8(1); /* We want to ensure nu > 0 */ if (nu <= 0) PG_RETURN_NULL(); PG_RETURN_FLOAT8(studentT_cdf(nu, t)); }
/**
 * @brief In-database interface for the Student-t CDF.
 *
 * The SQL layer cannot express the full domain restriction, so it is
 * enforced here: the degree of freedom must be strictly positive.
 *
 * @throws std::domain_error if the degree of freedom is <= 0
 */
AnyValue
student_t_cdf(AbstractDBInterface &db, AnyValue args)
{
    AnyValue::iterator it(args);    // walks the arguments of the SQL call

    const int64_t degFreedom = *it++;
    const double tValue = *it;

    // Reject the part of the parameter space where the CDF is undefined.
    if (degFreedom <= 0)
        throw std::domain_error("Student-t distribution undefined for "
            "degree of freedom <= 0");

    return studentT_cdf(degFreedom, tValue);
}
/*----------
 * Do the computations requested from final functions.
 *
 * Compute regression coefficients, coefficient of determination (R^2),
 * t-statistics, and p-values whenever the respective argument is non-NULL.
 * Since these functions share a lot of computation, they have been distilled
 * into this function.
 *
 * First, we compute the regression coefficients as:
 *
 *   c = (X^T X)^+ * X^T * y = X^+ * y
 *
 * where:
 *
 *   X^T = the transpose of X
 *   X^+ = the pseudo-inverse of X
 *
 * The identity X^+ = (X^T X)^+ X^T holds for all matrices X, a proof
 * can be found here:
 * http://en.wikipedia.org/wiki/Proofs_involving_the_Moore%2DPenrose_pseudoinverse
 *
 * Note that when the system X c = y is satisfiable (because (X|c) has rank at
 * most inState->len), then setting c = X^+ y means that |c|_2 <= |d|_2 for all
 * solutions d satisfying X d = y.
 * (See http://en.wikipedia.org/wiki/Moore%2DPenrose_pseudoinverse)
 *
 * Caveat: Explicitly computing (X^T X)^+ can become a significant source of
 * numerical rounding erros (see, e.g.,
 * http://en.wikipedia.org/wiki/Moore%2DPenrose_pseudoinverse#Construction
 * or http://www.mathworks.com/moler/leastsquares.pdf p.16).
 *
 * Parameters:
 *   inState    - accumulated transition state (X^T X, X^T y, sums, counts)
 *   outCoef    - if non-NULL, receives the coefficient array
 *   outR2      - if non-NULL, receives the coefficient of determination
 *   outTStats  - if non-NULL, receives the per-coefficient t-statistics
 *   outPValues - if non-NULL, receives the per-coefficient two-sided p-values
 *----------
 */
static void
float8_mregr_compute(MRegrState *inState,
					 ArrayType **outCoef,
					 float8 *outR2,
					 ArrayType **outTStats,
					 ArrayType **outPValues)
{
	ArrayType  *coef_array,
			   *stdErr_array,
			   *tStats_array = NULL,
			   *pValues_array = NULL;
	float8		ess = 0.,	/* explained sum of squares (regression sum of squares) */
				tss = 0.,	/* total sum of squares */
				rss,		/* residual sum of squares */
				r2,
				variance;
	float8	   *XtX_inv,
			   *coef,
			   *stdErr,
			   *tStats = NULL,
			   *pValues = NULL;
	uint32		i;

	/*
	 * Precondition: inState->len * inState->len * sizeof(float8) < STATE_LEN(inState->len)
	 * and IS_FEASIBLE_STATE_LEN(STATE_LEN(inState->len))
	 */

	/* Invert (pseudo-invert) the accumulated X^T X matrix. */
	XtX_inv = palloc((uint64) inState->len * inState->len * sizeof(float8));
	pinv(inState->len, inState->len, inState->XtX, XtX_inv);

	/*
	 * FIXME: Decide on whether we want to display an info message or rather
	 * provide a second function that tells how well the data is conditioned.
	 *
	 * Check if we should expect multicollineratiy. [MPP-13582]
	 *
	 * See:
	 * Lichtblau, Daniel and Weisstein, Eric W. "Condition Number."
	 * From MathWorld--A Wolfram Web Resource.
	 * http://mathworld.wolfram.com/ConditionNumber.html
	 *
	 * if (condNoOfXtX > 1000)
	 *     ereport(INFO,
	 *             (errmsg("matrix X^T X is ill-conditioned"),
	 *              errdetail("condition number = %f", condNoOfXtX),
	 *              errhint("This can indicate strong multicollinearity.")));
	 */

	/*
	 * We want to return a one-dimensional array (as opposed to a
	 * two-dimensional array).
	 *
	 * Note: Calling construct_array with NULL as first arguments is a Greenplum
	 * extension
	 */
	coef_array = construct_array(NULL, inState->len,
								 FLOAT8OID, sizeof(float8), true, 'd');
	coef = (float8 *) ARR_DATA_PTR(coef_array);
	/* c = (X^T X)^+ * (X^T y) */
	symmetricMatrixTimesVector(inState->len, XtX_inv, inState->Xty, coef);

	if (outCoef)
		*outCoef = coef_array;

	if (outR2 || outTStats || outPValues)
	{
		/*----------
		 * Computing the total sum of squares (tss) and the explained sum of squares (ess)
		 *
		 *   ess = y'X * c - sum(y)^2/n
		 *   tss = sum(y^2) - sum(y)^2/n
		 *   R^2 = ess/tss
		 *----------
		 */
		ess = dotProduct(inState->len, inState->Xty, coef)
			- inState->sumy*inState->sumy/inState->count;
		tss = inState->sumy2
			- inState->sumy * inState->sumy / inState->count;

		/*
		 * With infinite precision, the following checks are pointless. But due to
		 * floating-point arithmetic, this need not hold at this point.
		 * Without a formal proof convincing us of the contrary, we should
		 * anticipate that numerical peculiarities might occur.
		 */
		if (tss < 0)
			tss = 0;
		if (ess < 0)
			ess = 0;
		/*
		 * Since we know tss with greater accuracy than ess, we do the following
		 * sanity adjustment to ess:
		 */
		if (ess > tss)
			ess = tss;
	}

	if (outR2)
	{
		/*
		 * coefficient of determination
		 * If tss == 0, then the regression perfectly fits the data, so the
		 * coefficient of determination is 1.
		 */
		r2 = (tss == 0 ? 1 : ess / tss);
		*outR2 = r2;
	}

	if (outTStats || outPValues)
	{
		/* Allocate the three per-coefficient result arrays (1-D, see above). */
		stdErr_array = construct_array(NULL, inState->len,
									   FLOAT8OID, sizeof(float8), true, 'd');
		stdErr = (float8 *) ARR_DATA_PTR(stdErr_array);
		tStats_array = construct_array(NULL, inState->len,
									   FLOAT8OID, sizeof(float8), true, 'd');
		tStats = (float8 *) ARR_DATA_PTR(tStats_array);
		pValues_array = construct_array(NULL, inState->len,
										FLOAT8OID, sizeof(float8), true, 'd');
		pValues = (float8 *) ARR_DATA_PTR(pValues_array);

		/*
		 * Computing t-statistics and p-values
		 *
		 * Total sum of squares (tss) = Residual Sum of sqaures (rss) +
		 * Explained Sum of Squares (ess) for linear regression.
		 * Proof: http://en.wikipedia.org/wiki/Sum_of_squares
		 */
		rss = tss - ess;

		/*
		 * Variance is also called the Mean Square Error.
		 * NOTE(review): if count == len this divides by zero; IEEE float8
		 * yields +/-inf or NaN here rather than trapping, and the p-value
		 * path below explicitly guards against count <= len.
		 */
		variance = rss / (inState->count - inState->len);

		/*
		 * The t-statistic for each coef[i] is coef[i] / stdErr[i]
		 * where stdErr[i] is the standard error of coef[ii], i.e.,
		 * the square root of the i'th diagonoal element of
		 * variance * (X^T X)^{-1}.
		 */
		for (i = 0; i < inState->len; i++)
		{
			/*
			 * In an abundance of caution, we see a tiny possibility that numerical
			 * instabilities in the pinv operation can lead to negative values on
			 * the main diagonal of even a SPD matrix
			 */
			if (XtX_inv[i * (inState->len + 1)] < 0)
			{
				stdErr[i] = 0;
			}
			else
			{
				/* i * (len + 1) indexes the i'th diagonal element. */
				stdErr[i] = sqrt( variance * XtX_inv[i * (inState->len + 1)] );
			}

			if (coef[i] == 0 && stdErr[i] == 0)
			{
				/*
				 * In this special case, 0/0 should be interpreted as 0:
				 * We know that 0 is the exact value for the coefficient, so
				 * the t-value should be 0 (corresponding to a p-value of 1)
				 */
				tStats[i] = 0;
			}
			else
			{
				/*
				 * If stdErr[i] == 0 then abs(tStats[i]) will be infinity, which is
				 * what we need.
				 */
				tStats[i] = coef[i] / stdErr[i];
			}
		}
	}

	if (outTStats)
		*outTStats = tStats_array;

	if (outPValues)
	{
		/* Two-sided p-value from the Student-t CDF with (n - k) dof. */
		for (i = 0; i < inState->len; i++)
			if (inState->count <= inState->len)
			{
				/*
				 * This test is purely for detecting long int overflows because
				 * studentT_cdf expects an unsigned long int as first argument.
				 */
				pValues[i] = NAN;
			}
			else
			{
				pValues[i] = 2. * (1. - studentT_cdf(
					(uint64) (inState->count - inState->len), fabs( tStats[i] )
				));
			}
		*outPValues = pValues_array;
	}
}