Example #1
0
/* conditional linear Gaussian mutual information test. */
static double ut_micg(SEXP xx, SEXP yy, int nobs, int ntests, double *pvalue,
  double *df) {

int i = 0, xtype = 0, ytype = TYPEOF(yy), llx = 0, lly = 0;
double xm = 0, xsd = 0, ym = 0, ysd = 0, statistic = 0;
void *xptr = NULL, *yptr = NULL;
SEXP xdata;

  if (ytype == INTSXP) {

    /* cache the number of levels. */
    lly = NLEVELS(yy);
    yptr = INTEGER(yy);

  }/*THEN*/
  else {

    /* cache mean and variance. */
    yptr = REAL(yy);
    ym = c_mean(yptr, nobs);
    ysd = c_sse(yptr, ym, nobs);

  }/*ELSE*/

  for (i = 0; i < ntests; i++) {

    xdata = VECTOR_ELT(xx, i);
    xtype = TYPEOF(xdata);

    if ((ytype == INTSXP) && (xtype == INTSXP)) {

      /* if both nodes are discrete, the test reverts back to a discrete
       * mutual information test. */
      xptr = INTEGER(xdata);
      llx = NLEVELS(xdata);
      DISCRETE_SWAP_X();
      statistic = 2 * nobs * c_chisqtest(xptr, llx, yptr, lly, nobs, df, MI);
      pvalue[i] = pchisq(statistic, *df, FALSE, FALSE);

    }/*THEN*/
    else if ((ytype == REALSXP) && (xtype == REALSXP)) {

      /* if both nodes are continuous, the test reverts back to a Gaussian
       * mutual information test. */
      xptr = REAL(xdata);
      xm = c_mean(xptr, nobs);
      xsd = c_sse(xptr, xm, nobs);
      statistic = c_fast_cor(xptr, yptr, nobs, xm, ym, xsd, ysd);
      *df = 1;
      statistic = 2 * nobs * cor_mi_trans(statistic);
      pvalue[i] = pchisq(statistic, *df, FALSE, FALSE);

    }/*THEN*/
    else {

      if (xtype == INTSXP) {

        xptr = INTEGER(xdata);
        llx = NLEVELS(xdata);
        ysd = sqrt(ysd / (nobs - 1));
        statistic = 2 * nobs * c_micg(yptr, ym, ysd, xptr, llx, nobs);
        *df = llx - 1;
        pvalue[i] = pchisq(statistic, *df, FALSE, FALSE);

      }/*THEN*/
      else {

        xptr = REAL(xdata);
        xm = c_mean(xptr, nobs);
        xsd = sqrt(c_sse(xptr, xm, nobs) / (nobs - 1));
        statistic = 2 * nobs * c_micg(xptr, xm, xsd, yptr, lly, nobs);
        *df = lly - 1;
        pvalue[i] = pchisq(statistic, *df, FALSE, FALSE);

      }/*ELSE*/

    }/*THEN*/

  }/*FOR*/

  return statistic;

}/*UT_MICG*/
Example #2
0
/* parametric tests for Gaussian variables. */
static double ut_gaustests(SEXP xx, SEXP yy, int nobs, int ntests,
    double *pvalue, double *df, test_e test) {

int i = 0;
double transform = 0, *xptr = NULL, *yptr = REAL(yy);
double xm = 0, ym = 0, xsd = 0, ysd = 0, statistic = 0;

  /* compute the degrees of freedom for correlation and mutual information. */
  if (test == COR)
    *df = nobs - 2;
  else if ((test == MI_G) || (test == MI_G_SH))
    *df = 1;

  if (((test == COR) && (*df < 1)) || ((test == ZF) && (nobs - 2 < 2))) {

    /* if there are not enough degrees of freedom, return independence. */
    warning("trying to do an independence test with zero degrees of freedom.");

    *df = 0;
    statistic = 0;
    for (i = 0; i < ntests; i++)
      pvalue[i] = 1;

     return statistic;

  }/*THEN*/

  /* cache mean and variance. */
  ym = c_mean(yptr, nobs);
  ysd = c_sse(yptr, ym, nobs);

  for (i = 0; i < ntests; i++) {

    GAUSSIAN_SWAP_X();
    statistic = c_fast_cor(xptr, yptr, nobs, xm, ym, xsd, ysd);

    if (test == COR) {

      transform = cor_t_trans(statistic, *df);
      pvalue[i] = 2 * pt(fabs(transform), *df, FALSE, FALSE);

    }/*THEN*/
    else if (test == MI_G) {

      statistic = 2 * nobs * cor_mi_trans(statistic);
      pvalue[i] = pchisq(statistic, *df, FALSE, FALSE);

    }/*THEN*/
    else if (test == MI_G_SH) {

      statistic *= 1 - cor_lambda(xptr, yptr, nobs, xm, ym, xsd, ysd, statistic);
      statistic = 2 * nobs * cor_mi_trans(statistic);
      pvalue[i] = pchisq(statistic, *df, FALSE, FALSE);

    }/*THEN*/
    else if (test == ZF) {

      statistic = cor_zf_trans(statistic, (double)nobs - 2);
      pvalue[i] = 2 * pnorm(fabs(statistic), 0, 1, FALSE, FALSE);

    }/*THEN*/

  }/*FOR*/

  return statistic;

}/*UT_GAUSTESTS*/
Example #3
0
File: dedup.c Project: cran/bnlearn
/* remove one variable in each highly-correlated pair. */
SEXP dedup (SEXP data, SEXP threshold, SEXP complete, SEXP debug) {

int i = 0, j = 0, k = 0, dropped = 0, nc = 0;
int debuglevel = isTRUE(debug);
double *mean = NULL, *sse = NULL, *xx = NULL, *yy = NULL;
double cur_mean[2], cur_sse[2];
double tol = MACHINE_TOL, t = NUM(threshold);
long double sum = 0;
SEXP result, colnames;
gdata dt = { 0 };

  /* extract the columns from the data frame. */
  dt = gdata_from_SEXP(data, 0);
  meta_init_flags(&(dt.m), 0, complete, R_NilValue);
  meta_copy_names(&(dt.m), 0, data);
  /* set up the vectors for the pairwise complete observations. */
  xx = Calloc1D(dt.m.nobs, sizeof(double));
  yy = Calloc1D(dt.m.nobs, sizeof(double));

  if (debuglevel > 0)
    Rprintf("* caching means and variances.\n");

  mean = Calloc1D(dt.m.ncols, sizeof(double));
  sse = Calloc1D(dt.m.ncols, sizeof(double));

  /* cache the mean and variance of complete variables. */
  for (j = 0; j < dt.m.ncols; j++) {

    if (!dt.m.flag[j].complete)
      continue;

    mean[j] = c_mean(dt.col[j], dt.m.nobs);
    sse[j] = c_sse(dt.col[j], mean[j], dt.m.nobs);

  }/*FOR*/

  /* main loop. */
  for (j = 0; j < dt.m.ncols - 1; j++) {

    /* skip variables already flagged for removal. */
    if (dt.m.flag[j].drop)
      continue;

    if (debuglevel > 0)
      Rprintf("* looking at %s with %d variables still to check.\n",
        dt.m.names[j], dt.m.ncols - (j + 1));

    for (k = j + 1; k < dt.m.ncols; k++) {

      /* skip variables already flagged for removal. */
      if (dt.m.flag[k].drop)
        continue;

      if (dt.m.flag[j].complete && dt.m.flag[k].complete) {

        /* use the cached means and variances. */
        cur_mean[0] = mean[j];
        cur_mean[1] = mean[k];
        cur_sse[0] = sse[j];
        cur_sse[1] = sse[k];

        /* compute the covariance. */
        for (i = 0, sum = 0; i < dt.m.nobs; i++)
          sum += (dt.col[j][i] - cur_mean[0]) * (dt.col[k][i] - cur_mean[1]);

      }/*THEN*/
      else {

        for (i = 0, nc = 0; i < dt.m.nobs; i++) {

          if (ISNAN(dt.col[j][i]) || ISNAN(dt.col[k][i]))
            continue;

          xx[nc] = dt.col[j][i];
          yy[nc++] = dt.col[k][i];

        }/*FOR*/


        /* if there are no complete observations, take the variables to be
         * independent. */
        if (nc == 0)
          continue;

        cur_mean[0] = c_mean(xx, nc);
        cur_mean[1] = c_mean(yy, nc);
        cur_sse[0] = c_sse(xx, cur_mean[0], nc);
        cur_sse[1] = c_sse(yy, cur_mean[1], nc);

        /* compute the covariance. */
        for (i = 0, sum = 0; i < nc; i++)
          sum += (xx[i] - cur_mean[0]) * (yy[i] - cur_mean[1]);

      }/*ELSE*/

      /* safety check against "divide by zero" errors. */
      if ((cur_sse[0] < tol) || (cur_sse[1] < tol))
        sum = 0;
      else
        sum /= sqrt(cur_sse[0] * cur_sse[1]);

      /* test the correlation against the threshold. */
      if (fabsl(sum) > t) {

        if (debuglevel > 0)
          Rprintf("%s is collinear with %s, dropping %s with COR = %.4Lf\n",
            dt.m.names[j], dt.m.names[k], dt.m.names[k], sum);

        /* flag the variable for removal. */
        dt.m.flag[k].drop = TRUE;
        dropped++;

      }/*THEN*/

    }/*FOR*/

  }/*FOR*/

  /* set up the return value. */
  PROTECT(result = allocVector(VECSXP, dt.m.ncols - dropped));
  PROTECT(colnames = allocVector(STRSXP, dt.m.ncols - dropped));

  for (j = 0, k = 0; j < dt.m.ncols; j++)
    if (!dt.m.flag[j].drop) {

      SET_STRING_ELT(colnames, k, mkChar(dt.m.names[j]));
      SET_VECTOR_ELT(result, k++, VECTOR_ELT(data, j));

    }/*THEN*/

  setAttrib(result, R_NamesSymbol, colnames);

  /* make it a data frame. */
  minimal_data_frame(result);

  Free1D(mean);
  Free1D(sse);
  Free1D(xx);
  Free1D(yy);
  FreeGDT(dt, FALSE);

  UNPROTECT(2);

  return result;

}/*DEDUP*/