SEXP cdlik(SEXP x, SEXP y) { int i = 0, j = 0, k = 0; int **n = NULL, *nj = NULL; int llx = NLEVELS(x), lly = NLEVELS(y), num = LENGTH(x); int *xx = INTEGER(x), *yy = INTEGER(y); double *res = NULL; SEXP result; /* allocate and initialize result to zero. */ PROTECT(result = allocVector(REALSXP, 1)); res = REAL(result); *res = 0; /* initialize the contingency table and the marginal frequencies. */ n = alloc2dcont(llx, lly); nj = alloc1dcont(lly); /* compute the joint frequency of x and y. */ for (k = 0; k < num; k++) { n[xx[k] - 1][yy[k] - 1]++; }/*FOR*/ /* compute the marginals. */ for (i = 0; i < llx; i++) for (j = 0; j < lly; j++) { nj[j] += n[i][j]; }/*FOR*/ /* compute the conditional entropy from the joint and marginal frequencies. */ for (i = 0; i < llx; i++) for (j = 0; j < lly; j++) { if (n[i][j] != 0) *res += (double)n[i][j] * log((double)n[i][j] / (double)nj[j]); }/*FOR*/ UNPROTECT(1); return result; }/*CDLIK*/
SEXP dlik(SEXP x) { int i = 0, k = 0; int *n = NULL, *xx = INTEGER(x), llx = NLEVELS(x), num = LENGTH(x); double *res = NULL; SEXP result; /* allocate and initialize result to zero. */ PROTECT(result = allocVector(REALSXP, 1)); res = REAL(result); *res = 0; /* initialize the contingency table. */ n = alloc1dcont(llx); /* compute the joint frequency of x and y. */ for (k = 0; k < num; k++) { n[xx[k] - 1]++; }/*FOR*/ /* compute the entropy from the joint and marginal frequencies. */ for (i = 0; i < llx; i++) { if (n[i] != 0) *res += (double)n[i] * log((double)n[i] / num); }/*FOR*/ UNPROTECT(1); return result; }/*DLIK*/
/* get the number of parameters of the whole network (mixed case, also handles * discrete and Gaussian networks). */ SEXP nparams_cgnet(SEXP graph, SEXP data, SEXP debug) { int i = 0, j = 0, nnodes = 0, debuglevel = isTRUE(debug); int *nlevels = NULL, *index = NULL, ngp = 0; double nconfig = 0, node_params = 0, all_params = 0; SEXP nodes = R_NilValue, node_data, parents, temp; /* get nodes' number and data. */ node_data = getListElement(graph, "nodes"); nnodes = length(node_data); nodes = getAttrib(node_data, R_NamesSymbol); /* cache the number of levels of each variables (zero = continuous). */ nlevels = Calloc1D(nnodes, sizeof(int)); for (i = 0; i < nnodes; i++) { temp = VECTOR_ELT(data, i); if (TYPEOF(temp) == INTSXP) nlevels[i] = NLEVELS(temp); }/*FOR*/ for (i = 0; i < nnodes; i++) { /* extract the parents of the node and match them. */ parents = getListElement(VECTOR_ELT(node_data, i), "parents"); PROTECT(temp = match(nodes, parents, 0)); index = INTEGER(temp); /* compute the number of regressors and of configurations. */ for (j = 0, ngp = 0, nconfig = 1; j < length(parents); j++) { if (nlevels[index[j] - 1] == 0) ngp++; else nconfig *= nlevels[index[j] - 1]; }/*FOR*/ /* compute the overall number of parameters as regressors plus intercept * times configurations. */ node_params = nconfig * (nlevels[i] == 0 ? ngp + 1 : nlevels[i] - 1); if (debuglevel > 0) Rprintf("* node %s has %.0lf parameter(s).\n", NODE(i), node_params); /* update the return value. */ all_params += node_params; UNPROTECT(1); }/*FOR*/ Free1D(nlevels); return ScalarReal(all_params); }/*NPARAMS_CGNET*/
/* unconditional mutual information, to be used for the asymptotic test. */ SEXP mi(SEXP x, SEXP y, SEXP gsquare) { int llx = NLEVELS(x), lly = NLEVELS(y), num = LENGTH(x); int *xx = INTEGER(x), *yy = INTEGER(y); double *res = NULL; SEXP result; PROTECT(result = allocVector(REALSXP, 2)); res = REAL(result); res[0] = c_mi(xx, &llx, yy, &lly, &num); res[1] = (double)(llx - 1) * (double)(lly - 1); /* rescale to match the G^2 test. */ if (isTRUE(gsquare)) res[0] *= 2 * num; UNPROTECT(1); return result; }/*MI*/
double cdlik(SEXP x, SEXP y, double *nparams) { int i = 0, j = 0, k = 0; int **n = NULL, *nj = NULL; int llx = NLEVELS(x), lly = NLEVELS(y), num = length(x); int *xx = INTEGER(x), *yy = INTEGER(y); double res = 0; /* initialize the contingency table and the marginal frequencies. */ n = (int **) Calloc2D(llx, lly, sizeof(int)); nj = Calloc1D(lly, sizeof(int)); /* compute the joint frequency of x and y. */ for (k = 0; k < num; k++) n[xx[k] - 1][yy[k] - 1]++; /* compute the marginals. */ for (i = 0; i < llx; i++) for (j = 0; j < lly; j++) nj[j] += n[i][j]; /* compute the conditional entropy from the joint and marginal frequencies. */ for (i = 0; i < llx; i++) for (j = 0; j < lly; j++) if (n[i][j] != 0) res += (double)n[i][j] * log((double)n[i][j] / (double)nj[j]); /* we may want to store the number of parameters. */ if (nparams) *nparams = (llx - 1) * lly; Free1D(nj); Free2D(n, llx); return res; }/*CDLIK*/
/* conditional mutual information, to be used for the asymptotic test. */ SEXP cmi(SEXP x, SEXP y, SEXP z, SEXP gsquare) { int llx = NLEVELS(x), lly = NLEVELS(y), llz = NLEVELS(z); int num = LENGTH(x); int *xx = INTEGER(x), *yy = INTEGER(y), *zz = INTEGER(z); double *res = NULL; SEXP result; /* allocate and initialize result to zero. */ PROTECT(result = allocVector(REALSXP, 2)); res = REAL(result); res[0] = c_cmi(xx, &llx, yy, &lly, zz, &llz, &num); res[1] = (double)(llx - 1) * (double)(lly - 1) * (double)llz; /* rescale to match the G^2 test. */ if (isTRUE(gsquare)) res[0] *= 2 * num; UNPROTECT(1); return result; }/*CMI*/
/* unconditional mutual information, to be used for the asymptotic test. */ SEXP mi(SEXP x, SEXP y, SEXP gsquare, SEXP adjusted) { int llx = NLEVELS(x), lly = NLEVELS(y), num = length(x); int *xx = INTEGER(x), *yy = INTEGER(y); double *res = NULL; SEXP result; PROTECT(result = allocVector(REALSXP, 2)); res = REAL(result); if (isTRUE(adjusted)) res[0] = c_chisqtest(xx, llx, yy, lly, num, res + 1, MI_ADF); else res[0] = c_chisqtest(xx, llx, yy, lly, num, res + 1, MI); /* rescale to match the G^2 test. */ if (isTRUE(gsquare)) res[0] *= 2 * num; UNPROTECT(1); return result; }/*MI*/
/* parametric tests for discrete variables. */ static double ct_discrete(SEXP xx, SEXP yy, SEXP zz, int nobs, int ntests, double *pvalue, double *df, test_e test) { int i = 0, llx = 0, lly = NLEVELS(yy), llz = 0; int *xptr = NULL, *yptr = INTEGER(yy), *zptr = NULL; double statistic = 0; SEXP xdata, config; DISCRETE_CACHE(); for (i = 0; i < ntests; i++) { DISCRETE_SWAP_X(); if (test == MI || test == MI_ADF || test == X2 || test == X2_ADF) { /* mutual information and Pearson's X^2 asymptotic tests. */ statistic = c_cchisqtest(xptr, llx, yptr, lly, zptr, llz, nobs, df, test); if ((test == MI) || (test == MI_ADF)) statistic = 2 * nobs * statistic; pvalue[i] = pchisq(statistic, *df, FALSE, FALSE); }/*THEN*/ else if (test == MI_SH) { /* shrinkage mutual information test. */ statistic = 2 * nobs * c_shcmi(xptr, llx, yptr, lly, zptr, llz, nobs, df); pvalue[i] = pchisq(statistic, *df, FALSE, FALSE); }/*THEN*/ else if (test == JT) { /* Jonckheere-Terpstra test. */ statistic = c_cjt(xptr, llx, yptr, lly, zptr, llz, nobs); pvalue[i] = 2 * pnorm(fabs(statistic), 0, 1, FALSE, FALSE); }/*THEN*/ }/*FOR*/ UNPROTECT(1); return statistic; }/*CT_DISCRETE*/
/* discrete permutation tests. */ static double ut_dperm(SEXP xx, SEXP yy, int nobs, int ntests, double *pvalue, double *df, test_e type, int B, double a) { int i = 0, *xptr = NULL, *yptr = INTEGER(yy); int llx = 0, lly = NLEVELS(yy); double statistic = 0; SEXP xdata; for (i = 0; i < ntests; i++) { DISCRETE_SWAP_X(); statistic = 0; c_mcarlo(xptr, llx, yptr, lly, nobs, B, &statistic, pvalue + i, a, type, df); }/*FOR*/ return statistic; }/*UT_DPERM*/
/* parametric tests for discrete variables. */ static double ut_discrete(SEXP xx, SEXP yy, int nobs, int ntests, double *pvalue, double *df, test_e test) { int i = 0, llx = 0, lly = NLEVELS(yy), *xptr = NULL, *yptr = INTEGER(yy); double statistic = 0; SEXP xdata; for (i = 0; i < ntests; i++) { DISCRETE_SWAP_X(); if (test == MI || test == MI_ADF || test == X2 || test == X2_ADF) { /* mutual information and Pearson's X^2 asymptotic tests. */ statistic = c_chisqtest(xptr, llx, yptr, lly, nobs, df, test); if ((test == MI) || (test == MI_ADF)) statistic = 2 * nobs * statistic; pvalue[i] = pchisq(statistic, *df, FALSE, FALSE); }/*THEN*/ else if (test == MI_SH) { /* shrinkage mutual information test. */ statistic = 2 * nobs * c_shmi(xptr, llx, yptr, lly, nobs); *df = ((double)(llx - 1) * (double)(lly - 1)); pvalue[i] = pchisq(statistic, *df, FALSE, FALSE); }/*THEN*/ else if (test == JT) { /* Jonckheere-Terpstra test. */ statistic = c_jt(xptr, llx, yptr, lly, nobs); pvalue[i] = 2 * pnorm(fabs(statistic), 0, 1, FALSE, FALSE); }/*THEN*/ }/*FOR*/ return statistic; }/*UT_DISCRETE*/
double dlik(SEXP x, double *nparams) { int i = 0; int *n = NULL, *xx = INTEGER(x), llx = NLEVELS(x), num = length(x); double res = 0; /* initialize the contingency table. */ fill_1d_table(xx, &n, llx, num); /* compute the entropy from the marginal frequencies. */ for (i = 0; i < llx; i++) if (n[i] != 0) res += (double)n[i] * log((double)n[i] / num); /* we may want to store the number of parameters. */ if (nparams) *nparams = llx - 1; Free1D(n); return res; }/*DLIK*/
/* discrete permutation tests. */ static double ct_dperm(SEXP xx, SEXP yy, SEXP zz, int nobs, int ntests, double *pvalue, double *df, test_e type, int B, double a) { int i = 0, *xptr = NULL, *yptr = INTEGER(yy), *zptr = NULL; int llx = 0, lly = NLEVELS(yy), llz = 0; double statistic = 0; SEXP xdata, config; DISCRETE_CACHE(); for (i = 0; i < ntests; i++) { DISCRETE_SWAP_X(); statistic = 0; c_cmcarlo(xptr, llx, yptr, lly, zptr, llz, nobs, B, &statistic, pvalue + i, a, type, df); }/*FOR*/ UNPROTECT(1); return statistic; }/*CT_DPERM*/
/* conditional linear Gaussian mutual information test. */ static double ut_micg(SEXP xx, SEXP yy, int nobs, int ntests, double *pvalue, double *df) { int i = 0, xtype = 0, ytype = TYPEOF(yy), llx = 0, lly = 0; double xm = 0, xsd = 0, ym = 0, ysd = 0, statistic = 0; void *xptr = NULL, *yptr = NULL; SEXP xdata; if (ytype == INTSXP) { /* cache the number of levels. */ lly = NLEVELS(yy); yptr = INTEGER(yy); }/*THEN*/ else { /* cache mean and variance. */ yptr = REAL(yy); ym = c_mean(yptr, nobs); ysd = c_sse(yptr, ym, nobs); }/*ELSE*/ for (i = 0; i < ntests; i++) { xdata = VECTOR_ELT(xx, i); xtype = TYPEOF(xdata); if ((ytype == INTSXP) && (xtype == INTSXP)) { /* if both nodes are discrete, the test reverts back to a discrete * mutual information test. */ xptr = INTEGER(xdata); llx = NLEVELS(xdata); DISCRETE_SWAP_X(); statistic = 2 * nobs * c_chisqtest(xptr, llx, yptr, lly, nobs, df, MI); pvalue[i] = pchisq(statistic, *df, FALSE, FALSE); }/*THEN*/ else if ((ytype == REALSXP) && (xtype == REALSXP)) { /* if both nodes are continuous, the test reverts back to a Gaussian * mutual information test. */ xptr = REAL(xdata); xm = c_mean(xptr, nobs); xsd = c_sse(xptr, xm, nobs); statistic = c_fast_cor(xptr, yptr, nobs, xm, ym, xsd, ysd); *df = 1; statistic = 2 * nobs * cor_mi_trans(statistic); pvalue[i] = pchisq(statistic, *df, FALSE, FALSE); }/*THEN*/ else { if (xtype == INTSXP) { xptr = INTEGER(xdata); llx = NLEVELS(xdata); ysd = sqrt(ysd / (nobs - 1)); statistic = 2 * nobs * c_micg(yptr, ym, ysd, xptr, llx, nobs); *df = llx - 1; pvalue[i] = pchisq(statistic, *df, FALSE, FALSE); }/*THEN*/ else { xptr = REAL(xdata); xm = c_mean(xptr, nobs); xsd = sqrt(c_sse(xptr, xm, nobs) / (nobs - 1)); statistic = 2 * nobs * c_micg(xptr, xm, xsd, yptr, lly, nobs); *df = lly - 1; pvalue[i] = pchisq(statistic, *df, FALSE, FALSE); }/*ELSE*/ }/*THEN*/ }/*FOR*/ return statistic; }/*UT_MICG*/
/* conditional posterior dirichlet probability (covers BDe and K2 scores). */ SEXP cdpost(SEXP x, SEXP y, SEXP iss, SEXP exp, SEXP nparams) { int i = 0, j = 0, k = 0, imaginary = 0, num = LENGTH(x); int llx = NLEVELS(x), lly = NLEVELS(y), *xx = INTEGER(x), *yy = INTEGER(y); int **n = NULL, *nj = NULL; double alpha = 0, *res = NULL, *p = REAL(nparams); SEXP result; if (isNull(iss)) { /* this is for K2, which does not define an imaginary sample size; * all hyperparameters are set to 1 in the prior distribution. */ imaginary = (int) *p; alpha = 1; }/*THEN*/ else { /* this is for the BDe score. */ imaginary = INT(iss); alpha = (double) imaginary / *p; }/*ELSE*/ /* allocate and initialize result to zero. */ PROTECT(result = allocVector(REALSXP, 1)); res = REAL(result); *res = 0; /* initialize the contingency table. */ n = alloc2dcont(llx, lly); nj = alloc1dcont(lly); /* compute the joint frequency of x and y. */ if (exp == R_NilValue) { for (i = 0; i < num; i++) { n[xx[i] - 1][yy[i] - 1]++; nj[yy[i] - 1]++; }/*FOR*/ }/*THEN*/ else { int *e = INTEGER(exp); for (i = 0, k = 0; i < num; i++) { if (i != e[k] - 1) { n[xx[i] - 1][yy[i] - 1]++; nj[yy[i] - 1]++; }/*THEN*/ else { k++; }/*ELSE*/ }/*FOR*/ /* adjust the sample size to match the number of observational data. */ num -= LENGTH(exp); }/*ELSE*/ /* compute the conditional posterior probability. */ for (i = 0; i < llx; i++) for (j = 0; j < lly; j++) *res += lgammafn(n[i][j] + alpha) - lgammafn(alpha); for (j = 0; j < lly; j++) *res += lgammafn((double)imaginary / lly) - lgammafn(nj[j] + (double)imaginary / lly); UNPROTECT(1); return result; }/*CDPOST*/
/* posterior dirichlet probability (covers BDe and K2 scores). */ SEXP dpost(SEXP x, SEXP iss, SEXP exp) { int i = 0, k = 0, num = LENGTH(x); int llx = NLEVELS(x), *xx = INTEGER(x), *n = NULL, *imaginary = NULL; double alpha = 0, *res = NULL; SEXP result; /* the correct vaules for the hyperparameters alpha are documented in * "Learning Bayesian Networks: The Combination of Knowledge and Statistical * Data" by Heckerman, Geiger & Chickering (1995), page 17. */ if (isNull(iss)) { /* this is for K2, which does not define an imaginary sample size; * all hyperparameters are set to 1 in the prior distribution. */ imaginary = &llx; alpha = 1; }/*THEN*/ else { /* this is for the BDe score. */ imaginary = INTEGER(iss); alpha = (double) *imaginary / (double) llx; }/*ELSE*/ /* allocate and initialize result to zero. */ PROTECT(result = allocVector(REALSXP, 1)); res = REAL(result); *res = 0; /* initialize the contingency table. */ n = alloc1dcont(llx); /* compute the frequency table of x, disregarding experimental data. */ if (exp == R_NilValue) { for (i = 0; i < num; i++) n[xx[i] - 1]++; }/*THEN*/ else { int *e = INTEGER(exp); for (i = 0, k = 0; i < num; i++) { if (i != e[k] - 1) n[xx[i] - 1]++; else k++; }/*FOR*/ /* adjust the sample size to match the number of observational data. */ num -= LENGTH(exp); }/*ELSE*/ /* compute the posterior probability. */ for (i = 0; i < llx; i++) *res += lgammafn(n[i] + alpha) - lgammafn(alpha); *res += lgammafn((double)(*imaginary)) - lgammafn((double)(*imaginary + num)); UNPROTECT(1); return result; }/*DPOST*/
/* Chow-Liu structure learning algorithm. */ SEXP chow_liu(SEXP data, SEXP nodes, SEXP estimator, SEXP whitelist, SEXP blacklist, SEXP conditional, SEXP debug) { int i = 0, j = 0, k = 0, debug_coord[2], ncol = length(data); int num = length(VECTOR_ELT(data, 0)), narcs = 0, nwl = 0, nbl = 0; int *nlevels = NULL, clevels = 0, *est = INTEGER(estimator), *depth = NULL; int *wl = NULL, *bl = NULL, *poset = NULL, debuglevel = isTRUE(debug); void **columns = NULL, *cond = NULL; short int *include = NULL; double *mim = NULL, *means = NULL, *sse = NULL; SEXP arcs, wlist, blist; /* dereference the columns of the data frame. */ DEREFERENCE_DATA_FRAME() /* only TAN uses a conditional variable, so assume it's discrete and go ahead. */ if (conditional != R_NilValue) { cond = (void *) INTEGER(conditional); clevels = NLEVELS(conditional); }/*THEN*/ /* allocate the mutual information matrix and the status vector. */ mim = Calloc1D(UPTRI3_MATRIX(ncol), sizeof(double)); include = Calloc1D(UPTRI3_MATRIX(ncol), sizeof(short int)); /* compute the pairwise mutual information coefficients. */ if (debuglevel > 0) Rprintf("* computing pairwise mutual information coefficients.\n"); mi_matrix(mim, columns, ncol, nlevels, &num, cond, &clevels, means, sse, est); LIST_MUTUAL_INFORMATION_COEFS(); /* add whitelisted arcs first. */ if ((!isNull(whitelist)) && (length(whitelist) > 0)) { PROTECT(wlist = arc_hash(whitelist, nodes, TRUE, TRUE)); wl = INTEGER(wlist); nwl = length(wlist); for (i = 0; i < nwl; i++) { if (debuglevel > 0) { Rprintf("* adding whitelisted arcs first.\n"); if (include[wl[i]] == 0) { Rprintf(" > arc %s - %s has been added to the graph.\n", CHAR(STRING_ELT(whitelist, i)), CHAR(STRING_ELT(whitelist, i + nwl))); }/*THEN*/ else { Rprintf(" > arc %s - %s was already present in the graph.\n", CHAR(STRING_ELT(whitelist, i)), CHAR(STRING_ELT(whitelist, i + nwl))); }/*ELSE*/ }/*THEN*/ /* update the counter if need be. */ if (include[wl[i]] == 0) narcs++; /* include the arc in the graph. */ include[wl[i]] = 1; }/*FOR*/ UNPROTECT(1); }/*THEN*/ /* cache blacklisted arcs. */ if ((!isNull(blacklist)) && (length(blacklist) > 0)) { PROTECT(blist = arc_hash(blacklist, nodes, TRUE, TRUE)); bl = INTEGER(blist); nbl = length(blist); }/*THEN*/ /* sort the mutual information coefficients and keep track of the elements' index. */ poset = Calloc1D(UPTRI3_MATRIX(ncol), sizeof(int)); for (i = 0; i < UPTRI3_MATRIX(ncol); i++) poset[i] = i; R_qsort_I(mim, poset, 1, UPTRI3_MATRIX(ncol)); depth = Calloc1D(ncol, sizeof(int)); for (i = UPTRI3_MATRIX(ncol) - 1; i >= 0; i--) { /* get back the coordinates from the position in the half-matrix. */ INV_UPTRI3(poset[i], ncol, debug_coord); /* already included all the arcs we had to, exiting. */ if (narcs >= ncol - 1) break; /* arc already present in the graph, nothing to do. */ if (include[poset[i]] == 1) continue; if (bl) { if (chow_liu_blacklist(bl, &nbl, poset + i)) { if (debuglevel > 0) { Rprintf("* arc %s - %s is blacklisted, skipping.\n", NODE(debug_coord[0]), NODE(debug_coord[1])); }/*THEN*/ continue; }/*THEN*/ }/*THEN*/ if (c_uptri3_path(include, depth, debug_coord[0], debug_coord[1], ncol, nodes, FALSE)) { if (debuglevel > 0) { Rprintf("* arc %s - %s introduces cycles, skipping.\n", NODE(debug_coord[0]), NODE(debug_coord[1])); }/*THEN*/ continue; }/*THEN*/ if (debuglevel > 0) { Rprintf("* adding arc %s - %s with mutual information %lf.\n", NODE(debug_coord[0]), NODE(debug_coord[1]), mim[i]); }/*THEN*/ /* include the arc in the graph. */ include[poset[i]] = 1; /* update the counter. */ narcs++; }/*FOR*/ if ((!isNull(blacklist)) && (length(blacklist) > 0)) UNPROTECT(1); /* sanity check for blacklist-related madnes. */ if (narcs != ncol - 1) error("learned %d arcs instead of %d, this is not a tree spanning all the nodes.", narcs, ncol - 1); CONVERT_TO_ARC_SET(include, 0, 2 * (ncol - 1)); Free1D(depth); Free1D(mim); Free1D(include); Free1D(poset); Free1D(columns); if (nlevels) Free1D(nlevels); if (means) Free1D(means); if (sse) Free1D(sse); return arcs; }/*CHOW_LIU*/
/* conditional linear Gaussian mutual information test. */ static double ct_micg(SEXP xx, SEXP yy, SEXP zz, int nobs, int ntests, double *pvalue, double *df) { int xtype = 0, ytype = TYPEOF(yy), *nlvls = NULL, llx = 0, lly = 0, llz = 0; int ndp = 0, ngp = 0, nsx = length(zz), **dp = NULL, *dlvls = NULL, j = 0, k = 0; int i = 0, *zptr = 0; void *xptr = NULL, *yptr = NULL, **columns = NULL; double **gp = NULL; double statistic = 0; SEXP xdata; if (ytype == INTSXP) { /* cache the number of levels. */ lly = NLEVELS(yy); yptr = INTEGER(yy); }/*THEN*/ else { yptr = REAL(yy); }/*ELSE*/ /* extract the conditioning variables and cache their types. */ columns = Calloc1D(nsx, sizeof(void *)); nlvls = Calloc1D(nsx, sizeof(int)); df2micg(zz, columns, nlvls, &ndp, &ngp); dp = Calloc1D(ndp + 1, sizeof(int *)); gp = Calloc1D(ngp + 1, sizeof(double *)); dlvls = Calloc1D(ndp + 1, sizeof(int)); for (i = 0, j = 0, k = 0; i < nsx; i++) if (nlvls[i] > 0) { dlvls[1 + j] = nlvls[i]; dp[1 + j++] = columns[i]; }/*THEN*/ else { gp[1 + k++] = columns[i]; }/*ELSE*/ /* allocate vector for the configurations of the discrete parents; or, if * there no discrete parents, for the means of the continuous parents. */ if (ndp > 0) { zptr = Calloc1D(nobs, sizeof(int)); c_fast_config(dp + 1, nobs, ndp, dlvls + 1, zptr, &llz, 1); }/*THEN*/ for (i = 0; i < ntests; i++) { xdata = VECTOR_ELT(xx, i); xtype = TYPEOF(xdata); if (xtype == INTSXP) { xptr = INTEGER(xdata); llx = NLEVELS(xdata); }/*THEN*/ else { xptr = REAL(xdata); }/*ELSE*/ if ((ytype == INTSXP) && (xtype == INTSXP)) { if (ngp > 0) { /* need to reverse conditioning to actually compute the test. */ statistic = 2 * nobs * nobs * c_cmicg_unroll(xptr, llx, yptr, lly, zptr, llz, gp + 1, ngp, df, nobs); }/*THEN*/ else { /* the test reverts back to a discrete mutual information test. */ statistic = 2 * nobs * c_cchisqtest(xptr, llx, yptr, lly, zptr, llz, nobs, df, MI); }/*ELSE*/ }/*THEN*/ else if ((ytype == REALSXP) && (xtype == REALSXP)) { gp[0] = xptr; statistic = 2 * nobs * c_cmicg(yptr, gp, ngp + 1, NULL, 0, zptr, llz, dlvls, nobs); /* one regression coefficient for each conditioning level is added; * if all conditioning variables are continuous that's just one global * regression coefficient. */ *df = (llz == 0) ? 1 : llz; }/*THEN*/ else if ((ytype == INTSXP) && (xtype == REALSXP)) { dp[0] = yptr; dlvls[0] = lly; statistic = 2 * nobs * c_cmicg(xptr, gp + 1, ngp, dp, ndp + 1, zptr, llz, dlvls, nobs); /* for each additional configuration of the discrete conditioning * variables plus the discrete yptr, one whole set of regression * coefficients (plus the intercept) is added. */ *df = (lly - 1) * ((llz == 0) ? 1 : llz) * (ngp + 1); }/*THEN*/ else if ((ytype == REALSXP) && (xtype == INTSXP)) { dp[0] = xptr; dlvls[0] = llx; statistic = 2 * nobs * c_cmicg(yptr, gp + 1, ngp, dp, ndp + 1, zptr, llz, dlvls, nobs); /* same as above, with xptr and yptr swapped. */ *df = (llx - 1) * ((llz == 0) ? 1 : llz) * (ngp + 1); }/*ELSE*/ pvalue[i] = pchisq(statistic, *df, FALSE, FALSE); }/*FOR*/ Free1D(columns); Free1D(nlvls); Free1D(dlvls); Free1D(zptr); Free1D(dp); Free1D(gp); return statistic; }/*CT_MICG*/
/* predict the value of the training variable in a naive Bayes or Tree-Augmented * naive Bayes classifier. */ SEXP naivepred(SEXP fitted, SEXP data, SEXP parents, SEXP training, SEXP prior, SEXP prob, SEXP debug) { int i = 0, j = 0, k = 0, n = 0, nvars = LENGTH(fitted), nmax = 0, tr_nlevels = 0; int *res = NULL, **ex = NULL, *ex_nlevels = NULL; int idx = 0, *tr_id = INTEGER(training); int *iscratch = NULL, *maxima = NULL, *prn = NULL, *debuglevel = LOGICAL(debug); int *include_prob = LOGICAL(prob); double **cpt = NULL, *pr = NULL, *scratch = NULL, *buf = NULL, *pt = NULL; double sum = 0; SEXP class, temp, tr, tr_levels, result, nodes, probtab, dimnames; /* cache the node labels. */ nodes = getAttrib(fitted, R_NamesSymbol); /* cache the pointers to all the variables. */ ex = (int **) alloc1dpointer(nvars); ex_nlevels = alloc1dcont(nvars); for (i = 0; i < nvars; i++) { temp = VECTOR_ELT(data, i); ex[i] = INTEGER(temp); ex_nlevels[i] = NLEVELS(temp); }/*FOR*/ /* get the training variable and its levels. */ n = LENGTH(VECTOR_ELT(data, 0)); tr = getListElement(VECTOR_ELT(fitted, *tr_id - 1), "prob"); tr_levels = VECTOR_ELT(getAttrib(tr, R_DimNamesSymbol), 0); tr_nlevels = LENGTH(tr_levels); /* get the prior distribution. */ pr = REAL(prior); if (*debuglevel > 0) { Rprintf("* the prior distribution for the target variable is:\n"); PrintValue(prior); }/*THEN*/ /* allocate the scratch space used to compute posterior probabilities. */ scratch = alloc1dreal(tr_nlevels); buf = alloc1dreal(tr_nlevels); /* cache the pointers to the conditional probability tables. */ cpt = (double **) alloc1dpointer(nvars); for (i = 0; i < nvars; i++) cpt[i] = REAL(getListElement(VECTOR_ELT(fitted, i), "prob")); /* dereference the parents' vector. */ prn = INTEGER(parents); /* create the vector of indexes. */ iscratch = alloc1dcont(tr_nlevels); /* allocate the array for the indexes of the maxima. */ maxima = alloc1dcont(tr_nlevels); /* allocate the return value. */ PROTECT(result = allocVector(INTSXP, n)); res = INTEGER(result); /* allocate and initialize the table of the posterior probabilities. */ if (*include_prob > 0) { PROTECT(probtab = allocMatrix(REALSXP, tr_nlevels, n)); pt = REAL(probtab); memset(pt, '\0', n * tr_nlevels * sizeof(double)); }/*THEN*/ /* initialize the random seed, just in case we need it for tie breaking. */ GetRNGstate(); /* for each observation... */ for (i = 0; i < n; i++) { /* ... reset the scratch space and the indexes array... */ for (k = 0; k < tr_nlevels; k++) { scratch[k] = log(pr[k]); iscratch[k] = k + 1; }/*FOR*/ if (*debuglevel > 0) Rprintf("* predicting the value of observation %d.\n", i + 1); /* ... and for each conditional probability table... */ for (j = 0; j < nvars; j++) { /* ... skip the training variable... */ if (*tr_id == j + 1) continue; /* ... (this is the root node of the Chow-Liu tree) ... */ if (prn[j] == NA_INTEGER) { /* ... and for each row of the conditional probability table... */ for (k = 0; k < tr_nlevels; k++) { if (*debuglevel > 0) { Rprintf(" > node %s: picking cell %d (%d, %d) from the CPT (p = %lf).\n", NODE(j), CMC(ex[j][i] - 1, k, ex_nlevels[j]), ex[j][i], k + 1, cpt[j][CMC(ex[j][i] - 1, k, ex_nlevels[j])]); }/*THEN*/ /* ... update the posterior probability. */ scratch[k] += log(cpt[j][CMC(ex[j][i] - 1, k, ex_nlevels[j])]); }/*FOR*/ }/*THEN*/ else { /* ... and for each row of the conditional probability table... */ for (k = 0; k < tr_nlevels; k++) { /* (the first dimension corresponds to the current node [X], the second * to the training node [Y], the third to the only parent of the current * node [Z]; CMC coordinates are computed as X + Y * NX + Z * NX * NY. */ idx = (ex[j][i] - 1) + k * ex_nlevels[j] + (ex[prn[j] - 1][i] - 1) * ex_nlevels[j] * tr_nlevels; if (*debuglevel > 0) { Rprintf(" > node %s: picking cell %d (%d, %d, %d) from the CPT (p = %lf).\n", NODE(j), idx, ex[j][i], k + 1, ex[prn[j] - 1][i], cpt[j][idx]); }/*THEN*/ /* ... update the posterior probability. */ scratch[k] += log(cpt[j][idx]); }/*FOR*/ }/*ELSE*/ }/*FOR*/ /* find out the mode(s). */ all_max(scratch, tr_nlevels, maxima, &nmax, iscratch, buf); /* compute the posterior probabilities on the right scale, to attach them * to the return value. */ if (*include_prob) { /* copy the log-probabilities from scratch. */ memcpy(pt + i * tr_nlevels, scratch, tr_nlevels * sizeof(double)); /* transform log-probabilitiees into plain probabilities. */ for (k = 0, sum = 0; k < tr_nlevels; k++) sum += pt[i * tr_nlevels + k] = exp(pt[i * tr_nlevels + k] - scratch[maxima[0] - 1]); /* rescale them to sum up to 1. */ for (k = 0; k < tr_nlevels; k++) pt[i * tr_nlevels + k] /= sum; }/*THEN*/ if (nmax == 1) { res[i] = maxima[0]; if (*debuglevel > 0) { Rprintf(" @ prediction for observation %d is '%s' with (log-)posterior:\n", i + 1, CHAR(STRING_ELT(tr_levels, res[i] - 1))); Rprintf(" "); for (k = 0; k < tr_nlevels; k++) Rprintf(" %lf", scratch[k]); Rprintf("\n"); }/*THEN*/ }/*THEN*/ else { /* break ties: sample with replacement from all the maxima. */ SampleReplace(1, nmax, res + i, maxima); if (*debuglevel > 0) { Rprintf(" @ there are %d levels tied for prediction of observation %d, applying tie breaking.\n", nmax, i + 1); Rprintf(" "); for (k = 0; k < tr_nlevels; k++) Rprintf(" %lf", scratch[k]); Rprintf("\n"); Rprintf(" @ tied levels are:"); for (k = 0; k < nmax; k++) Rprintf(" %s", CHAR(STRING_ELT(tr_levels, maxima[k] - 1))); Rprintf(".\n"); }/*THEN*/ }/*ELSE*/ }/*FOR*/ /* save the state of the random number generator. */ PutRNGstate(); /* add back the attributes and the class to the return value. */ PROTECT(class = allocVector(STRSXP, 1)); SET_STRING_ELT(class, 0, mkChar("factor")); setAttrib(result, R_LevelsSymbol, tr_levels); setAttrib(result, R_ClassSymbol, class); if (*include_prob > 0) { /* set the levels of the taregt variable as rownames. */ PROTECT(dimnames = allocVector(VECSXP, 2)); SET_VECTOR_ELT(dimnames, 0, tr_levels); setAttrib(probtab, R_DimNamesSymbol, dimnames); /* add the posterior probabilities to the return value. */ setAttrib(result, install("prob"), probtab); UNPROTECT(4); }/*THEN*/ else { UNPROTECT(2); }/*ELSE*/ return result; }/*NAIVEPRED*/
/* conditional posterior Dirichlet probability (covers BDe and K2 scores). */ double cdpost(SEXP x, SEXP y, SEXP iss, SEXP exp) { int i = 0, j = 0, k = 0, imaginary = 0, num = length(x); int llx = NLEVELS(x), lly = NLEVELS(y), p = llx * lly; int *xx = INTEGER(x), *yy = INTEGER(y), **n = NULL, *nj = NULL; double alpha = 0, res = 0; if (isNull(iss)) { /* this is for K2, which does not define an imaginary sample size; * all hyperparameters are set to 1 in the prior distribution. */ imaginary = p; alpha = 1; }/*THEN*/ else { /* this is for the BDe score. */ imaginary = INT(iss); alpha = ((double) imaginary) / ((double) p); }/*ELSE*/ /* initialize the contingency table. */ n = (int **) Calloc2D(llx, lly, sizeof(int)); nj = Calloc1D(lly, sizeof(int)); /* compute the joint frequency of x and y. */ if (exp == R_NilValue) { for (i = 0; i < num; i++) { n[xx[i] - 1][yy[i] - 1]++; nj[yy[i] - 1]++; }/*FOR*/ }/*THEN*/ else { int *e = INTEGER(exp); for (i = 0, k = 0; i < num; i++) { if (i != e[k] - 1) { n[xx[i] - 1][yy[i] - 1]++; nj[yy[i] - 1]++; }/*THEN*/ else { k++; }/*ELSE*/ }/*FOR*/ /* adjust the sample size to match the number of observational data. */ num -= length(exp); }/*ELSE*/ /* compute the conditional posterior probability. */ for (i = 0; i < llx; i++) for (j = 0; j < lly; j++) res += lgammafn(n[i][j] + alpha) - lgammafn(alpha); for (j = 0; j < lly; j++) res += lgammafn((double)imaginary / lly) - lgammafn(nj[j] + (double)imaginary / lly); Free1D(nj); Free2D(n, llx); return res; }/*CDPOST*/