/* conditional linear Gaussian mutual information test. */ static double ut_micg(SEXP xx, SEXP yy, int nobs, int ntests, double *pvalue, double *df) { int i = 0, xtype = 0, ytype = TYPEOF(yy), llx = 0, lly = 0; double xm = 0, xsd = 0, ym = 0, ysd = 0, statistic = 0; void *xptr = NULL, *yptr = NULL; SEXP xdata; if (ytype == INTSXP) { /* cache the number of levels. */ lly = NLEVELS(yy); yptr = INTEGER(yy); }/*THEN*/ else { /* cache mean and variance. */ yptr = REAL(yy); ym = c_mean(yptr, nobs); ysd = c_sse(yptr, ym, nobs); }/*ELSE*/ for (i = 0; i < ntests; i++) { xdata = VECTOR_ELT(xx, i); xtype = TYPEOF(xdata); if ((ytype == INTSXP) && (xtype == INTSXP)) { /* if both nodes are discrete, the test reverts back to a discrete * mutual information test. */ xptr = INTEGER(xdata); llx = NLEVELS(xdata); DISCRETE_SWAP_X(); statistic = 2 * nobs * c_chisqtest(xptr, llx, yptr, lly, nobs, df, MI); pvalue[i] = pchisq(statistic, *df, FALSE, FALSE); }/*THEN*/ else if ((ytype == REALSXP) && (xtype == REALSXP)) { /* if both nodes are continuous, the test reverts back to a Gaussian * mutual information test. */ xptr = REAL(xdata); xm = c_mean(xptr, nobs); xsd = c_sse(xptr, xm, nobs); statistic = c_fast_cor(xptr, yptr, nobs, xm, ym, xsd, ysd); *df = 1; statistic = 2 * nobs * cor_mi_trans(statistic); pvalue[i] = pchisq(statistic, *df, FALSE, FALSE); }/*THEN*/ else { if (xtype == INTSXP) { xptr = INTEGER(xdata); llx = NLEVELS(xdata); ysd = sqrt(ysd / (nobs - 1)); statistic = 2 * nobs * c_micg(yptr, ym, ysd, xptr, llx, nobs); *df = llx - 1; pvalue[i] = pchisq(statistic, *df, FALSE, FALSE); }/*THEN*/ else { xptr = REAL(xdata); xm = c_mean(xptr, nobs); xsd = sqrt(c_sse(xptr, xm, nobs) / (nobs - 1)); statistic = 2 * nobs * c_micg(xptr, xm, xsd, yptr, lly, nobs); *df = lly - 1; pvalue[i] = pchisq(statistic, *df, FALSE, FALSE); }/*ELSE*/ }/*THEN*/ }/*FOR*/ return statistic; }/*UT_MICG*/
/* parametric tests for Gaussian variables. */ static double ut_gaustests(SEXP xx, SEXP yy, int nobs, int ntests, double *pvalue, double *df, test_e test) { int i = 0; double transform = 0, *xptr = NULL, *yptr = REAL(yy); double xm = 0, ym = 0, xsd = 0, ysd = 0, statistic = 0; /* compute the degrees of freedom for correlation and mutual information. */ if (test == COR) *df = nobs - 2; else if ((test == MI_G) || (test == MI_G_SH)) *df = 1; if (((test == COR) && (*df < 1)) || ((test == ZF) && (nobs - 2 < 2))) { /* if there are not enough degrees of freedom, return independence. */ warning("trying to do an independence test with zero degrees of freedom."); *df = 0; statistic = 0; for (i = 0; i < ntests; i++) pvalue[i] = 1; return statistic; }/*THEN*/ /* cache mean and variance. */ ym = c_mean(yptr, nobs); ysd = c_sse(yptr, ym, nobs); for (i = 0; i < ntests; i++) { GAUSSIAN_SWAP_X(); statistic = c_fast_cor(xptr, yptr, nobs, xm, ym, xsd, ysd); if (test == COR) { transform = cor_t_trans(statistic, *df); pvalue[i] = 2 * pt(fabs(transform), *df, FALSE, FALSE); }/*THEN*/ else if (test == MI_G) { statistic = 2 * nobs * cor_mi_trans(statistic); pvalue[i] = pchisq(statistic, *df, FALSE, FALSE); }/*THEN*/ else if (test == MI_G_SH) { statistic *= 1 - cor_lambda(xptr, yptr, nobs, xm, ym, xsd, ysd, statistic); statistic = 2 * nobs * cor_mi_trans(statistic); pvalue[i] = pchisq(statistic, *df, FALSE, FALSE); }/*THEN*/ else if (test == ZF) { statistic = cor_zf_trans(statistic, (double)nobs - 2); pvalue[i] = 2 * pnorm(fabs(statistic), 0, 1, FALSE, FALSE); }/*THEN*/ }/*FOR*/ return statistic; }/*UT_GAUSTESTS*/
/* remove one variable in each highly-correlated pair. */ SEXP dedup (SEXP data, SEXP threshold, SEXP complete, SEXP debug) { int i = 0, j = 0, k = 0, dropped = 0, nc = 0; int debuglevel = isTRUE(debug); double *mean = NULL, *sse = NULL, *xx = NULL, *yy = NULL; double cur_mean[2], cur_sse[2]; double tol = MACHINE_TOL, t = NUM(threshold); long double sum = 0; SEXP result, colnames; gdata dt = { 0 }; /* extract the columns from the data frame. */ dt = gdata_from_SEXP(data, 0); meta_init_flags(&(dt.m), 0, complete, R_NilValue); meta_copy_names(&(dt.m), 0, data); /* set up the vectors for the pairwise complete observations. */ xx = Calloc1D(dt.m.nobs, sizeof(double)); yy = Calloc1D(dt.m.nobs, sizeof(double)); if (debuglevel > 0) Rprintf("* caching means and variances.\n"); mean = Calloc1D(dt.m.ncols, sizeof(double)); sse = Calloc1D(dt.m.ncols, sizeof(double)); /* cache the mean and variance of complete variables. */ for (j = 0; j < dt.m.ncols; j++) { if (!dt.m.flag[j].complete) continue; mean[j] = c_mean(dt.col[j], dt.m.nobs); sse[j] = c_sse(dt.col[j], mean[j], dt.m.nobs); }/*FOR*/ /* main loop. */ for (j = 0; j < dt.m.ncols - 1; j++) { /* skip variables already flagged for removal. */ if (dt.m.flag[j].drop) continue; if (debuglevel > 0) Rprintf("* looking at %s with %d variables still to check.\n", dt.m.names[j], dt.m.ncols - (j + 1)); for (k = j + 1; k < dt.m.ncols; k++) { /* skip variables already flagged for removal. */ if (dt.m.flag[k].drop) continue; if (dt.m.flag[j].complete && dt.m.flag[k].complete) { /* use the cached means and variances. */ cur_mean[0] = mean[j]; cur_mean[1] = mean[k]; cur_sse[0] = sse[j]; cur_sse[1] = sse[k]; /* compute the covariance. */ for (i = 0, sum = 0; i < dt.m.nobs; i++) sum += (dt.col[j][i] - cur_mean[0]) * (dt.col[k][i] - cur_mean[1]); }/*THEN*/ else { for (i = 0, nc = 0; i < dt.m.nobs; i++) { if (ISNAN(dt.col[j][i]) || ISNAN(dt.col[k][i])) continue; xx[nc] = dt.col[j][i]; yy[nc++] = dt.col[k][i]; }/*FOR*/ /* if there are no complete observations, take the variables to be * independent. */ if (nc == 0) continue; cur_mean[0] = c_mean(xx, nc); cur_mean[1] = c_mean(yy, nc); cur_sse[0] = c_sse(xx, cur_mean[0], nc); cur_sse[1] = c_sse(yy, cur_mean[1], nc); /* compute the covariance. */ for (i = 0, sum = 0; i < nc; i++) sum += (xx[i] - cur_mean[0]) * (yy[i] - cur_mean[1]); }/*ELSE*/ /* safety check against "divide by zero" errors. */ if ((cur_sse[0] < tol) || (cur_sse[1] < tol)) sum = 0; else sum /= sqrt(cur_sse[0] * cur_sse[1]); /* test the correlation against the threshold. */ if (fabsl(sum) > t) { if (debuglevel > 0) Rprintf("%s is collinear with %s, dropping %s with COR = %.4Lf\n", dt.m.names[j], dt.m.names[k], dt.m.names[k], sum); /* flag the variable for removal. */ dt.m.flag[k].drop = TRUE; dropped++; }/*THEN*/ }/*FOR*/ }/*FOR*/ /* set up the return value. */ PROTECT(result = allocVector(VECSXP, dt.m.ncols - dropped)); PROTECT(colnames = allocVector(STRSXP, dt.m.ncols - dropped)); for (j = 0, k = 0; j < dt.m.ncols; j++) if (!dt.m.flag[j].drop) { SET_STRING_ELT(colnames, k, mkChar(dt.m.names[j])); SET_VECTOR_ELT(result, k++, VECTOR_ELT(data, j)); }/*THEN*/ setAttrib(result, R_NamesSymbol, colnames); /* make it a data frame. */ minimal_data_frame(result); Free1D(mean); Free1D(sse); Free1D(xx); Free1D(yy); FreeGDT(dt, FALSE); UNPROTECT(2); return result; }/*DEDUP*/