/* C-level wrapper around the dgesdd() F77 routine. Note that the input * matrix A is overwritten by dgesdd(), so it's sensible to have a * backup copy in case it's needed later. */ void c_svd(double *A, double *U, double *D, double *V, int *nrows, int *ncols, int *mindim, int strict, int *errcode) { int lwork = -1, *iwork = NULL; char jobz = 'A'; double tmp = 0, *work = NULL; iwork = Calloc1D(8 * (*mindim), sizeof(int)); /* ask for the optimal size of the work array. */ F77_CALL(dgesdd)(&jobz, nrows, ncols, A, nrows, D, U, nrows, V, mindim, &tmp, &lwork, iwork, errcode); lwork = (int)tmp; work = Calloc1D(lwork, sizeof(double)); /* actual call */ F77_NAME(dgesdd)(&jobz, nrows, ncols, A, nrows, D, U, nrows, V, mindim, work, &lwork, iwork, errcode); Free1D(work); Free1D(iwork); if (*errcode && strict) error("an error (%d) occurred in the call to dgesdd().\n", *errcode); }/*C_SVD*/
SEXP smart_network_averaging(SEXP arcs, SEXP nodes, SEXP weights) { int k = 0, from = 0, to = 0, nrows = length(arcs) / 2, dims = length(nodes); int *a = NULL, *coords = NULL, *poset = NULL, *path = NULL, *scratch = NULL; double *w = NULL; SEXP weights2, amat, try, acyclic; /* allocate and initialize the adjacency matrix. */ PROTECT(amat = allocMatrix(INTSXP, dims, dims)); a = INTEGER(amat); memset(a, '\0', sizeof(int) * dims * dims); /* match the node labels in the arc set. */ PROTECT(try = match(nodes, arcs, 0)); coords = INTEGER(try); /* duplicate the weights to preserve the original ones. */ PROTECT(weights2 = duplicate(weights)); w = REAL(weights2); /* sort the strength coefficients. */ poset = Calloc1D(nrows, sizeof(int)); for (k = 0; k < nrows; k++) poset[k] = k; R_qsort_I(w, poset, 1, nrows); /* allocate buffers for c_has_path(). */ path = Calloc1D(dims, sizeof(int)); scratch = Calloc1D(dims, sizeof(int)); /* iterate over the arcs in reverse order wrt their strength coefficients. */ for (k = 0; k < nrows; k++) { from = coords[poset[k]] - 1; to = coords[poset[k] + nrows] - 1; /* add an arc only if it does not introduce cycles. */ if (!c_has_path(to, from, a, dims, nodes, FALSE, TRUE, path, scratch, FALSE)) a[CMC(from, to, dims)] = 1; else warning("arc %s -> %s would introduce cycles in the graph, ignoring.", NODE(from), NODE(to)); }/*FOR*/ /* convert the adjacency matrix back to an arc set and return it. */ acyclic = amat2arcs(amat, nodes); Free1D(path); Free1D(scratch); Free1D(poset); UNPROTECT(3); return acyclic; }/*SMART_NETWORK_AVERAGING*/
/* C-level function to compute Moore-Penrose Generalized Inverse of a square matrix. */ void c_ginv(double *covariance, int ncols, double *mpinv) { int i = 0, j = 0, errcode = 0; double *u = NULL, *d = NULL, *vt = NULL, *backup = NULL; double sv_tol = 0, zero = 0, one = 1; char transa = 'N', transb = 'N'; c_udvt(&u, &d, &vt, ncols); if (covariance != mpinv) { backup = Calloc1D(ncols * ncols, sizeof(double)); memcpy(backup, covariance, ncols * ncols * sizeof(double)); }/*THEN*/ /* compute the SVD decomposition. */ c_svd(covariance, u, d, vt, &ncols, &ncols, &ncols, FALSE, &errcode); /* if SVD fails, catch the error code and free all buffers. */ if (errcode == 0) { /* set the threshold for the singular values as in corpcor. */ sv_tol = ncols * d[0] * MACHINE_TOL * MACHINE_TOL; /* the first multiplication, U * D^{-1} is easy. */ for (i = 0; i < ncols; i++) for (j = 0; j < ncols; j++) u[CMC(i, j, ncols)] = u[CMC(i, j, ncols)] * ((d[j] > sv_tol) ? 1/d[j] : 0); /* the second one, (U * D^{-1}) * Vt is a real matrix multiplication. */ F77_CALL(dgemm)(&transa, &transb, &ncols, &ncols, &ncols, &one, u, &ncols, vt, &ncols, &zero, mpinv, &ncols); }/*THEN*/ if (covariance != mpinv) { memcpy(covariance, backup, ncols * ncols * sizeof(double)); Free1D(backup); }/*THEN*/ Free1D(u); Free1D(d); Free1D(vt); if (errcode) error("an error (%d) occurred in the call to c_ginv().\n", errcode); }/*C_GINV*/
/* C-level function to perform OLS via QR decomposition. */ void c_qr_ols (double *qr, double *y, int nrow, int ncol, double *fitted, long double *sd) { int i = 0, job = 1, rank = 0, info = 0, *pivot = NULL; double tol = MACHINE_TOL, *qraux = NULL, *work = NULL; /* safety check for sample size = 1. */ if (nrow == 1) { *sd = 0; *fitted = *y; return; }/*THEN*/ /* allocate the working space. */ qraux = Calloc1D(ncol, sizeof(double)); work = Calloc1D(2 * ncol, sizeof(double)); pivot = Calloc1D(ncol, sizeof(int)); for (i = 0; i < ncol; i++) pivot[i] = i + 1; /* perform the QR decomposition. */ F77_CALL(dqrdc2)(qr, &nrow, &nrow, &ncol, &tol, &rank, qraux, pivot, work); /* operate on a backup copy of the response variable. */ memcpy(fitted, y, nrow * sizeof(double)); /* compute the fitted values. */ /* dqrsl( x, ldx, n, k, qraux, y, qy, qty, */ F77_CALL(dqrsl)(qr, &nrow, &nrow, &rank, qraux, fitted, NULL, fitted, /* b, rsd, xb, job, info) */ NULL, NULL, fitted, &job, &info); if (info != 0) error("an error (%d) occurred in the call to dqrsl().\n", &info); /* compute the standard deviation of the residuals. */ for (i = 0, *sd = 0; i < nrow; i++) *sd += (y[i] - fitted[i]) * (y[i] - fitted[i]); *sd = sqrt(*sd / (nrow - 1)); Free1D(pivot); Free1D(work); Free1D(qraux); }/*C_QR_OLS*/
/* continuous permutation tests. */ static double ct_gperm(SEXP xx, SEXP yy, SEXP zz, int nobs, int ntests, double *pvalue, double *df, test_e type, int B, double a) { int i = 0, nsx = length(zz), ncols = nsx + 2; double **column = NULL, *yptr = REAL(yy), statistic = 0; GAUSSIAN_COLUMN_CACHE(); for (i = 0; i < ntests; i++) { /* swap the first column and restore the second, which is that undergoing * permutation (backward compatibility from set random seed). */ column[0] = REAL(VECTOR_ELT(xx, i)); column[1] = yptr; statistic = 0; c_gauss_cmcarlo(column, ncols, nobs, B, &statistic, pvalue + i, a, type); }/*FOR*/ Free1D(column); return statistic; }/*CT_GPERM*/
/* shrinked mutual information, to be used in C code. */ double c_shmi(int *xx, int llx, int *yy, int lly, int num) { int i = 0, j = 0, k = 0; double **n = NULL, *ni = NULL, *nj = NULL; double lambda = 0, target = 1/(double)(llx * lly); double res = 0; /* initialize the contingency table and the marginal frequencies. */ n = (double **) Calloc2D(llx, lly, sizeof(double)); ni = Calloc1D(llx, sizeof(double)); nj = Calloc1D(lly, sizeof(double)); /* compute the joint frequency of x and y. */ for (k = 0; k < num; k++) n[xx[k] - 1][yy[k] - 1]++; /* estimate the optimal lambda for the data. */ mi_lambda((double *)n, &lambda, target, num, llx, lly, 0); /* switch to the probability scale and shrink the estimates. */ for (i = 0; i < llx; i++) for (j = 0; j < lly; j++) n[i][j] = lambda * target + (1 - lambda) * n[i][j] / num; /* compute the marginals. */ for (i = 0; i < llx; i++) for (j = 0; j < lly; j++) { ni[i] += n[i][j]; nj[j] += n[i][j]; }/*FOR*/ /* compute the mutual information from the joint and marginal frequencies. */ for (i = 0; i < llx; i++) for (j = 0; j < lly; j++) if (n[i][j] != 0) res += n[i][j] * log(n[i][j] / (ni[i] * nj[j])); Free1D(ni); Free1D(nj); Free2D(n, llx); return res; }/*C_SHMI*/
/* get the number of parameters of the whole network (mixed case, also handles * discrete and Gaussian networks). */ SEXP nparams_cgnet(SEXP graph, SEXP data, SEXP debug) { int i = 0, j = 0, nnodes = 0, debuglevel = isTRUE(debug); int *nlevels = NULL, *index = NULL, ngp = 0; double nconfig = 0, node_params = 0, all_params = 0; SEXP nodes = R_NilValue, node_data, parents, temp; /* get nodes' number and data. */ node_data = getListElement(graph, "nodes"); nnodes = length(node_data); nodes = getAttrib(node_data, R_NamesSymbol); /* cache the number of levels of each variables (zero = continuous). */ nlevels = Calloc1D(nnodes, sizeof(int)); for (i = 0; i < nnodes; i++) { temp = VECTOR_ELT(data, i); if (TYPEOF(temp) == INTSXP) nlevels[i] = NLEVELS(temp); }/*FOR*/ for (i = 0; i < nnodes; i++) { /* extract the parents of the node and match them. */ parents = getListElement(VECTOR_ELT(node_data, i), "parents"); PROTECT(temp = match(nodes, parents, 0)); index = INTEGER(temp); /* compute the number of regressors and of configurations. */ for (j = 0, ngp = 0, nconfig = 1; j < length(parents); j++) { if (nlevels[index[j] - 1] == 0) ngp++; else nconfig *= nlevels[index[j] - 1]; }/*FOR*/ /* compute the overall number of parameters as regressors plus intercept * times configurations. */ node_params = nconfig * (nlevels[i] == 0 ? ngp + 1 : nlevels[i] - 1); if (debuglevel > 0) Rprintf("* node %s has %.0lf parameter(s).\n", NODE(i), node_params); /* update the return value. */ all_params += node_params; UNPROTECT(1); }/*FOR*/ Free1D(nlevels); return ScalarReal(all_params); }/*NPARAMS_CGNET*/
/* unconditional parametric asymptotic tests for categorical data. */ double c_chisqtest(int *xx, int llx, int *yy, int lly, int num, double *df, test_e test) { int **n = NULL, *ni = NULL, *nj = NULL, adj = IS_ADF(test); double res = 0; if (adj) { /* if there are less than 5 observations per cell on average, assume the * test does not have enough power and return independence. */ if (num < 5 * llx * lly) { if (df) *df = 1; return 0; }/*THEN*/ }/*THEN*/ /* initialize the contingency table and the marginal frequencies. */ fill_2d_table(xx, yy, &n, &ni, &nj, llx, lly, num); /* compute the mutual information or Pearson's X^2. */ if ((test == MI) || (test == MI_ADF)) res = mi_kernel(n, ni, nj, llx, lly, num) / num; else if ((test == X2) || (test == X2_ADF)) res = x2_kernel(n, ni, nj, llx, lly, num); /* compute the degrees of freedom. */ if (df) *df = adj ? df_adjust(ni, llx, nj, lly) : (llx - 1) * (lly - 1); Free2D(n, llx); Free1D(ni); Free1D(nj); return res; }/*C_CHISQTEST*/
double cdlik(SEXP x, SEXP y, double *nparams) { int i = 0, j = 0, k = 0; int **n = NULL, *nj = NULL; int llx = NLEVELS(x), lly = NLEVELS(y), num = length(x); int *xx = INTEGER(x), *yy = INTEGER(y); double res = 0; /* initialize the contingency table and the marginal frequencies. */ n = (int **) Calloc2D(llx, lly, sizeof(int)); nj = Calloc1D(lly, sizeof(int)); /* compute the joint frequency of x and y. */ for (k = 0; k < num; k++) n[xx[k] - 1][yy[k] - 1]++; /* compute the marginals. */ for (i = 0; i < llx; i++) for (j = 0; j < lly; j++) nj[j] += n[i][j]; /* compute the conditional entropy from the joint and marginal frequencies. */ for (i = 0; i < llx; i++) for (j = 0; j < lly; j++) if (n[i][j] != 0) res += (double)n[i][j] * log((double)n[i][j] / (double)nj[j]); /* we may want to store the number of parameters. */ if (nparams) *nparams = (llx - 1) * lly; Free1D(nj); Free2D(n, llx); return res; }/*CDLIK*/
double dlik(SEXP x, double *nparams) { int i = 0; int *n = NULL, *xx = INTEGER(x), llx = NLEVELS(x), num = length(x); double res = 0; /* initialize the contingency table. */ fill_1d_table(xx, &n, llx, num); /* compute the entropy from the marginal frequencies. */ for (i = 0; i < llx; i++) if (n[i] != 0) res += (double)n[i] * log((double)n[i] / num); /* we may want to store the number of parameters. */ if (nparams) *nparams = llx - 1; Free1D(n); return res; }/*DLIK*/
/* Chow-Liu structure learning algorithm. */ SEXP chow_liu(SEXP data, SEXP nodes, SEXP estimator, SEXP whitelist, SEXP blacklist, SEXP conditional, SEXP debug) { int i = 0, j = 0, k = 0, debug_coord[2], ncol = length(data); int num = length(VECTOR_ELT(data, 0)), narcs = 0, nwl = 0, nbl = 0; int *nlevels = NULL, clevels = 0, *est = INTEGER(estimator), *depth = NULL; int *wl = NULL, *bl = NULL, *poset = NULL, debuglevel = isTRUE(debug); void **columns = NULL, *cond = NULL; short int *include = NULL; double *mim = NULL, *means = NULL, *sse = NULL; SEXP arcs, wlist, blist; /* dereference the columns of the data frame. */ DEREFERENCE_DATA_FRAME() /* only TAN uses a conditional variable, so assume it's discrete and go ahead. */ if (conditional != R_NilValue) { cond = (void *) INTEGER(conditional); clevels = NLEVELS(conditional); }/*THEN*/ /* allocate the mutual information matrix and the status vector. */ mim = Calloc1D(UPTRI3_MATRIX(ncol), sizeof(double)); include = Calloc1D(UPTRI3_MATRIX(ncol), sizeof(short int)); /* compute the pairwise mutual information coefficients. */ if (debuglevel > 0) Rprintf("* computing pairwise mutual information coefficients.\n"); mi_matrix(mim, columns, ncol, nlevels, &num, cond, &clevels, means, sse, est); LIST_MUTUAL_INFORMATION_COEFS(); /* add whitelisted arcs first. */ if ((!isNull(whitelist)) && (length(whitelist) > 0)) { PROTECT(wlist = arc_hash(whitelist, nodes, TRUE, TRUE)); wl = INTEGER(wlist); nwl = length(wlist); for (i = 0; i < nwl; i++) { if (debuglevel > 0) { Rprintf("* adding whitelisted arcs first.\n"); if (include[wl[i]] == 0) { Rprintf(" > arc %s - %s has been added to the graph.\n", CHAR(STRING_ELT(whitelist, i)), CHAR(STRING_ELT(whitelist, i + nwl))); }/*THEN*/ else { Rprintf(" > arc %s - %s was already present in the graph.\n", CHAR(STRING_ELT(whitelist, i)), CHAR(STRING_ELT(whitelist, i + nwl))); }/*ELSE*/ }/*THEN*/ /* update the counter if need be. */ if (include[wl[i]] == 0) narcs++; /* include the arc in the graph. */ include[wl[i]] = 1; }/*FOR*/ UNPROTECT(1); }/*THEN*/ /* cache blacklisted arcs. */ if ((!isNull(blacklist)) && (length(blacklist) > 0)) { PROTECT(blist = arc_hash(blacklist, nodes, TRUE, TRUE)); bl = INTEGER(blist); nbl = length(blist); }/*THEN*/ /* sort the mutual information coefficients and keep track of the elements' index. */ poset = Calloc1D(UPTRI3_MATRIX(ncol), sizeof(int)); for (i = 0; i < UPTRI3_MATRIX(ncol); i++) poset[i] = i; R_qsort_I(mim, poset, 1, UPTRI3_MATRIX(ncol)); depth = Calloc1D(ncol, sizeof(int)); for (i = UPTRI3_MATRIX(ncol) - 1; i >= 0; i--) { /* get back the coordinates from the position in the half-matrix. */ INV_UPTRI3(poset[i], ncol, debug_coord); /* already included all the arcs we had to, exiting. */ if (narcs >= ncol - 1) break; /* arc already present in the graph, nothing to do. */ if (include[poset[i]] == 1) continue; if (bl) { if (chow_liu_blacklist(bl, &nbl, poset + i)) { if (debuglevel > 0) { Rprintf("* arc %s - %s is blacklisted, skipping.\n", NODE(debug_coord[0]), NODE(debug_coord[1])); }/*THEN*/ continue; }/*THEN*/ }/*THEN*/ if (c_uptri3_path(include, depth, debug_coord[0], debug_coord[1], ncol, nodes, FALSE)) { if (debuglevel > 0) { Rprintf("* arc %s - %s introduces cycles, skipping.\n", NODE(debug_coord[0]), NODE(debug_coord[1])); }/*THEN*/ continue; }/*THEN*/ if (debuglevel > 0) { Rprintf("* adding arc %s - %s with mutual information %lf.\n", NODE(debug_coord[0]), NODE(debug_coord[1]), mim[i]); }/*THEN*/ /* include the arc in the graph. */ include[poset[i]] = 1; /* update the counter. */ narcs++; }/*FOR*/ if ((!isNull(blacklist)) && (length(blacklist) > 0)) UNPROTECT(1); /* sanity check for blacklist-related madnes. */ if (narcs != ncol - 1) error("learned %d arcs instead of %d, this is not a tree spanning all the nodes.", narcs, ncol - 1); CONVERT_TO_ARC_SET(include, 0, 2 * (ncol - 1)); Free1D(depth); Free1D(mim); Free1D(include); Free1D(poset); Free1D(columns); if (nlevels) Free1D(nlevels); if (means) Free1D(means); if (sse) Free1D(sse); return arcs; }/*CHOW_LIU*/
/* ARACNE structure learning algorithm. */ SEXP aracne(SEXP data, SEXP estimator, SEXP whitelist, SEXP blacklist, SEXP debug) { int i = 0, j = 0, k = 0, coord = 0, ncol = length(data); int num = length(VECTOR_ELT(data, i)), narcs = ncol * (ncol - 1) / 2; int *nlevels = NULL, *est = INTEGER(estimator), *wl = NULL, *bl = NULL; int debuglevel = isTRUE(debug); void **columns = NULL; short int *exclude = NULL; double *mim = NULL, *means = NULL, *sse = NULL; SEXP arcs, nodes, wlist, blist; PROTECT(nodes = getAttrib(data, R_NamesSymbol)); /* dereference the columns of the data frame. */ DEREFERENCE_DATA_FRAME() /* allocate the mutual information matrix and the status vector. */ mim = Calloc1D(UPTRI3_MATRIX(ncol), sizeof(double)); exclude = Calloc1D(UPTRI3_MATRIX(ncol), sizeof(short int)); /* compute the pairwise mutual information coefficients. */ if (debuglevel > 0) Rprintf("* computing pairwise mutual information coefficients.\n"); mi_matrix(mim, columns, ncol, nlevels, &num, NULL, NULL, means, sse, est); LIST_MUTUAL_INFORMATION_COEFS() /* compare all the triplets. */ for (i = 0; i < ncol; i++) { for (j = i + 1; j < ncol; j++) { for (k = 0; k < ncol; k++) { if ((k == i) || (k == j)) continue; /* cache the UPTRI3 coordinates of the arc. */ coord = UPTRI3(i + 1, j + 1, ncol); /* if MI(X, Y) < min(MI(X, Z), MI(Z, Y)) drop arc X - Y. */ if ((mim[coord] < mim[UPTRI3(i + 1, k + 1, ncol)]) && (mim[coord] < mim[UPTRI3(j + 1, k + 1, ncol)])) { if (debuglevel > 0) { Rprintf("* dropping arc %s - %s because of %s, %lf < min(%lf, %lf)\n", NODE(i), NODE(j), NODE(k), mim[UPTRI3(i + 1, j + 1, ncol)], mim[UPTRI3(i + 1, k + 1, ncol)], mim[UPTRI3(j + 1, k + 1, ncol)]); }/*THEN*/ /* update the status vector. */ exclude[coord] = 1; /* decrement the number of arcs. */ narcs--; break; }/*THEN*/ }/*FOR*/ }/*FOR*/ }/*FOR*/ /* add back whitelisted arcs. */ if ((!isNull(whitelist)) && (length(whitelist) > 0)) { PROTECT(wlist = arc_hash(whitelist, nodes, TRUE, TRUE)); wl = INTEGER(wlist); for (i = 0; i < length(wlist); i++) { if (debuglevel > 0) { Rprintf("* adding back whitelisted arcs.\n"); if (exclude[wl[i]] == 1) { Rprintf(" > arc %s - %s has been added to the graph.\n", CHAR(STRING_ELT(whitelist, i)), CHAR(STRING_ELT(whitelist, i + length(wlist)))); }/*THEN*/ else { Rprintf(" > arc %s - %s was already present in the graph.\n", CHAR(STRING_ELT(whitelist, i)), CHAR(STRING_ELT(whitelist, i + length(wlist)))); }/*ELSE*/ }/*THEN*/ /* update the counter if need be. */ if (exclude[wl[i]] == 1) narcs++; /* include the arc in the graph. */ exclude[wl[i]] = 0; }/*FOR*/ UNPROTECT(1); }/*THEN*/ /* remove blacklisted arcs. */ if ((!isNull(blacklist)) && (length(blacklist) > 0)) { PROTECT(blist = arc_hash(blacklist, nodes, TRUE, TRUE)); bl = INTEGER(blist); for (i = 0; i < length(blist); i++) { if (debuglevel > 0) { Rprintf("* removing blacklisted arcs.\n"); if (exclude[bl[i]] == 0) { Rprintf(" > arc %s - %s has been dropped from the graph.\n", CHAR(STRING_ELT(blacklist, i)), CHAR(STRING_ELT(blacklist, i + length(blist)))); }/*THEN*/ else { Rprintf(" > arc %s - %s was not present in the graph.\n", CHAR(STRING_ELT(blacklist, i)), CHAR(STRING_ELT(blacklist, i + length(blist)))); }/*ELSE*/ }/*THEN*/ /* update the counter if need be. */ if (exclude[bl[i]] == 0) narcs--; /* remove the arc from the graph. */ exclude[bl[i]] = 1; }/*FOR*/ UNPROTECT(1); }/*THEN*/ CONVERT_TO_ARC_SET(exclude, 1, 2 * narcs); Free1D(mim); Free1D(exclude); Free1D(columns); if (nlevels) Free1D(nlevels); if (means) Free1D(means); if (sse) Free1D(sse); UNPROTECT(1); return arcs; }/*ARACNE*/
/* return the skeleton of a DAG/PDAG. */ SEXP dag2ug(SEXP bn, SEXP moral, SEXP debug) { int i = 0, j = 0, k = 0, nnodes = 0, narcs = 0, row = 0; int debuglevel = isTRUE(debug), *moralize = LOGICAL(moral); int *nparents = NULL, *nnbr = NULL; SEXP node_data, current, nodes, result, temp; /* get the nodes' data. */ node_data = getListElement(bn, "nodes"); nnodes = length(node_data); PROTECT(nodes = getAttrib(node_data, R_NamesSymbol)); /* allocate and initialize parents' and neighbours' counters. */ nnbr = Calloc1D(nnodes, sizeof(int)); if (*moralize > 0) nparents = Calloc1D(nnodes, sizeof(int)); /* first pass: count neighbours, parents and resulting arcs. */ for (i = 0; i < nnodes; i++) { /* get the number of neighbours. */ current = VECTOR_ELT(node_data, i); nnbr[i] = length(getListElement(current, "nbr")); /* update the number of arcs to be returned. */ if (*moralize > 0) { /* get also the number of parents, needed to account for the arcs added * for their moralization. */ nparents[i] = length(getListElement(current, "parents")); narcs += nnbr[i] + nparents[i] * (nparents[i] - 1); }/*THEN*/ else { narcs += nnbr[i]; }/*ELSE*/ if (debuglevel > 0) { if (*moralize > 0) { Rprintf("* scanning node %s, found %d neighbours and %d parents.\n", NODE(i), nnbr[i], nparents[i]); Rprintf(" > adding %d arcs, for a total of %d.\n", nnbr[i] + nparents[i] * (nparents[i] - 1), narcs); }/*THEN*/ else { Rprintf("* scanning node %s, found %d neighbours.\n", NODE(i), nnbr[i]); Rprintf(" > adding %d arcs, for a total of %d.\n", nnbr[i], narcs); }/*ELSE*/ }/*THEN*/ }/*FOR*/ /* allocate the return value. */ PROTECT(result = allocMatrix(STRSXP, narcs, 2)); /* allocate and set the column names. */ setDimNames(result, R_NilValue, mkStringVec(2, "from", "to")); /* second pass: fill the return value. */ for (i = 0; i < nnodes; i++) { /* get to the current node. */ current = VECTOR_ELT(node_data, i); /* get the neighbours. */ temp = getListElement(current, "nbr"); for (j = 0; j < nnbr[i]; j++) { SET_STRING_ELT(result, CMC(row, 0, narcs), STRING_ELT(nodes, i)); SET_STRING_ELT(result, CMC(row, 1, narcs), STRING_ELT(temp, j)); row++; }/*FOR*/ /* if we are not creating a moral graph we are done with this node. */ if (*moralize == 0) continue; /* get the parents. */ temp = getListElement(current, "parents"); for (j = 0; j < nparents[i]; j++) { for (k = j+1; k < nparents[i]; k++) { SET_STRING_ELT(result, CMC(row, 0, narcs), STRING_ELT(temp, k)); SET_STRING_ELT(result, CMC(row, 1, narcs), STRING_ELT(temp, j)); row++; SET_STRING_ELT(result, CMC(row, 0, narcs), STRING_ELT(temp, j)); SET_STRING_ELT(result, CMC(row, 1, narcs), STRING_ELT(temp, k)); row++; }/*FOR*/ }/*FOR*/ }/*FOR*/ Free1D(nnbr); if (*moralize > 0) { /* be really sure not to return duplicate arcs in moral graphs when * shielded parents are present (the "shielding" nodes are counted * twice). */ result = c_unique_arcs(result, nodes, FALSE); Free1D(nparents); }/*THEN*/ UNPROTECT(2); return result; }/*DAG2UG*/
SEXP hc_to_be_added(SEXP arcs, SEXP blacklist, SEXP whitelist, SEXP nparents, SEXP maxp, SEXP nodes, SEXP convert) { int i = 0, j = 0, narcs = 0, dims = length(nodes); int *a = NULL, *coords = NULL; double *mp = REAL(maxp), *np = NULL; short int referenced = 0; SEXP try, result = R_NilValue, result2; /* transform the arc set into an adjacency matrix, if it's not one already. */ if (isInteger(arcs)) { if ((referenced = MAYBE_REFERENCED(arcs))) PROTECT(result = duplicate(arcs)); }/*THEN*/ else { PROTECT(result = arcs2amat(arcs, nodes)); }/*ELSE*/ /* dereference the adjacency matrix once and for all. */ a = INTEGER(result); /* compute the number the parents of each node, unless provided. */ if (nparents == R_NilValue) { np = Calloc1D(dims, sizeof(double)); for (i = 0; i < dims; i++) for (j = 0; j < dims; j++) np[j] = a[CMC(i, j, dims)]; }/*THEN*/ else { np = REAL(nparents); }/*ELSE*/ /* flip all the nondiagonal cells. */ for (j = 0; j < dims; j++) { for (i = 0; i < dims; i++) { /* diagonal elements are always equal to zero, skip them. */ if (i == j) continue; a[CMC(i, j, dims)] = 1 - a[CMC(i, j, dims)]; }/*FOR*/ }/*FOR*/ /* if an arc is present in the graph in one direction, you cannot add it in * the other direction (it would be a reversal); flip both in the adjacency * matrix. */ for (j = 0; j < dims; j++) for (i = j + 1; i < dims; i++) a[CMC(j, i, dims)] = a[CMC(i, j, dims)] = a[CMC(i, j, dims)] * a[CMC(j, i, dims)]; /* if a node has already reached its maximum number parents, do not add * more arcs pointing to that node. */ for (j = 0; j < dims; j++) if (np[j] >= *mp) memset(a + j * dims, '\0', dims * sizeof(int)); #define FLIP_FROM_LIST(list, value) \ if (!isNull(list)) { \ if (!isInteger(list)) { \ PROTECT(try = match(nodes, list, 0)); \ coords = INTEGER(try); \ narcs = length(try)/2; \ for (i = 0; i < narcs; i++) \ a[CMC(coords[i] - 1, coords[i + narcs] - 1, dims)] = value; \ UNPROTECT(1); \ }/*THEN*/ \ else { \ coords = INTEGER(list); \ for (i = 0; i < dims * dims; i ++) \ if (coords[i] == 1) \ a[i] = value; \ }/*ELSE*/ \ }/*THEN*/ /* now the blacklist gets involved. */ FLIP_FROM_LIST(blacklist, 0); /* and, last but not least, the whitelist gets involved. */ FLIP_FROM_LIST(whitelist, 1); if (nparents == R_NilValue) Free1D(np); /* return either the adjacency matrix or the arc set. */ if (isTRUE(convert)) { PROTECT(result2 = amat2arcs(result, nodes)); if (referenced || !isInteger(arcs)) UNPROTECT(2); else UNPROTECT(1); return result2; }/*THEN*/ else { if (referenced || !isInteger(arcs)) UNPROTECT(1); return result; }/*ELSE*/ }/*HC_TO_BE_ADDED*/
SEXP score_cache_fill(SEXP nodes, SEXP data, SEXP network, SEXP score, SEXP extra, SEXP reference, SEXP equivalence, SEXP decomposability, SEXP updated, SEXP amat, SEXP cache, SEXP blmat, SEXP debug) { int *colsum = NULL, nnodes = length(nodes), lupd = length(updated); int *a = NULL, *upd = NULL, *b = NULL, debuglevel = isTRUE(debug); int i = 0, j = 0, k = 0; double *cache_value = NULL; SEXP arc, delta, op, temp; /* save a pointer to the adjacency matrix, the blacklist and the * updated nodes. */ a = INTEGER(amat); b = INTEGER(blmat); upd = INTEGER(updated); /* if there are no nodes to update, return. */ if (lupd == 0) return cache; /* set up row and column total to check for score equivalence; * zero means no parent nodes. */ if (isTRUE(equivalence)) { colsum = Calloc1D(nnodes, sizeof(int)); for (i = 0; i < nnodes; i++) for (j = 0; j < nnodes; j++) colsum[j] += a[CMC(i, j, nnodes)]; }/*THEN*/ /* allocate and initialize the cache. */ cache_value = REAL(cache); /* allocate a two-slot character vector. */ PROTECT(arc = allocVector(STRSXP, 2)); /* allocate and initialize the fake score delta. */ PROTECT(delta = ScalarReal(0)); /* allocate and initialize the score.delta() operator. */ PROTECT(op = mkString("set")); for (i = 0; i < nnodes; i++) { for (j = 0; j < nnodes; j++) { /* incident nodes must be different from each other. */ if (i == j) continue; /* if only one or two nodes' caches need updating, skip the rest. */ for (k = 0; k < lupd; k++) if (upd[k] == j) goto there; continue; there: /* no need to compute the score delta for blacklisted arcs. */ if (b[CMC(i, j, nnodes)] == 1) continue; /* use score equivalence if possible to check only one orientation. */ if (isTRUE(equivalence)) { /* if the following conditions are met, look up the score delta of * the reverse of the current arc: * 1) that score delta has already been computed. * 2) both incident nodes have no parent, so the arc is really * score equivalent (no v-structures). * 3) the reversed arc has not been blacklisted, as the score delta * is not computed in this case. */ if ((i > j) && (colsum[i] + colsum[j] == 0) && (b[CMC(j, i, nnodes)] == 0)) { cache_value[CMC(i, j, nnodes)] = cache_value[CMC(j, i, nnodes)]; continue; }/*THEN*/ }/*THEN*/ /* save the nodes incident on the arc. */ SET_STRING_ELT(arc, 0, STRING_ELT(nodes, i)); SET_STRING_ELT(arc, 1, STRING_ELT(nodes, j)); /* if the arc is not present in the graph it should be added; * otherwise it should be removed. */ if (a[CMC(i, j, nnodes)] == 0) SET_STRING_ELT(op, 0, mkChar("set")); else SET_STRING_ELT(op, 0, mkChar("drop")); /* checkpoint allocated memory. */ /* evaluate the call to score.delta() for the arc. */ PROTECT(temp = score_delta(arc, network, data, score, delta, reference, op, extra, decomposability)); cache_value[CMC(i, j, nnodes)] = NUM(VECTOR_ELT(temp, 1)); UNPROTECT(1); if (debuglevel > 0) Rprintf("* caching score delta for arc %s -> %s (%lf).\n", CHAR(STRING_ELT(nodes, i)), CHAR(STRING_ELT(nodes, j)), cache_value[CMC(i, j, nnodes)]); }/*FOR*/ }/*FOR*/ UNPROTECT(3); if (isTRUE(equivalence)) Free1D(colsum); return cache; }/*HC_CACHE_FILL*/
/* conditional posterior Dirichlet probability (covers BDe and K2 scores). */ double cdpost(SEXP x, SEXP y, SEXP iss, SEXP exp) { int i = 0, j = 0, k = 0, imaginary = 0, num = length(x); int llx = NLEVELS(x), lly = NLEVELS(y), p = llx * lly; int *xx = INTEGER(x), *yy = INTEGER(y), **n = NULL, *nj = NULL; double alpha = 0, res = 0; if (isNull(iss)) { /* this is for K2, which does not define an imaginary sample size; * all hyperparameters are set to 1 in the prior distribution. */ imaginary = p; alpha = 1; }/*THEN*/ else { /* this is for the BDe score. */ imaginary = INT(iss); alpha = ((double) imaginary) / ((double) p); }/*ELSE*/ /* initialize the contingency table. */ n = (int **) Calloc2D(llx, lly, sizeof(int)); nj = Calloc1D(lly, sizeof(int)); /* compute the joint frequency of x and y. */ if (exp == R_NilValue) { for (i = 0; i < num; i++) { n[xx[i] - 1][yy[i] - 1]++; nj[yy[i] - 1]++; }/*FOR*/ }/*THEN*/ else { int *e = INTEGER(exp); for (i = 0, k = 0; i < num; i++) { if (i != e[k] - 1) { n[xx[i] - 1][yy[i] - 1]++; nj[yy[i] - 1]++; }/*THEN*/ else { k++; }/*ELSE*/ }/*FOR*/ /* adjust the sample size to match the number of observational data. */ num -= length(exp); }/*ELSE*/ /* compute the conditional posterior probability. */ for (i = 0; i < llx; i++) for (j = 0; j < lly; j++) res += lgammafn(n[i][j] + alpha) - lgammafn(alpha); for (j = 0; j < lly; j++) res += lgammafn((double)imaginary / lly) - lgammafn(nj[j] + (double)imaginary / lly); Free1D(nj); Free2D(n, llx); return res; }/*CDPOST*/
/* shrinked conditional mutual information, to be used in C code. */ double c_shcmi(int *xx, int llx, int *yy, int lly, int *zz, int llz, int num, double *df) { int i = 0, j = 0, k = 0; double ***n = NULL, **ni = NULL, **nj = NULL, *nk = NULL; double lambda = 0, target = 1/(double)(llx * lly * llz); double res = 0; /* compute the degrees of freedom. */ *df = (double)(llx - 1) * (double)(lly - 1) * (double)(llz); /* initialize the contingency table and the marginal frequencies. */ n = (double ***) Calloc3D(llx, lly, llz, sizeof(double)); ni = (double **) Calloc2D(llx, llz, sizeof(double)); nj = (double **) Calloc2D(lly, llz, sizeof(double)); nk = Calloc1D(llz, sizeof(double)); /* compute the joint frequency of x, y, and z. */ for (k = 0; k < num; k++) n[xx[k] - 1][yy[k] - 1][zz[k] - 1]++; /* estimate the optimal lambda for the data. */ mi_lambda((double *)n, &lambda, target, num, llx, lly, llz); /* switch to the probability scale and shrink the estimates. */ for (i = 0; i < llx; i++) for (j = 0; j < lly; j++) for (k = 0; k < llz; k++) n[i][j][k] = lambda * target + (1 - lambda) * n[i][j][k] / num; /* compute the marginals. */ for (i = 0; i < llx; i++) for (j = 0; j < lly; j++) for (k = 0; k < llz; k++) { ni[i][k] += n[i][j][k]; nj[j][k] += n[i][j][k]; nk[k] += n[i][j][k]; }/*FOR*/ for (k = 0; k < llz; k++) { /* check each level of the conditioning variable to avoid (again) * "divide by zero" errors. */ if (nk[k] == 0) continue; for (j = 0; j < lly; j++) { for (i = 0; i < llx; i++) { if (n[i][j][k] > 0) res += n[i][j][k] * log( (n[i][j][k] * nk[k]) / (ni[i][k] * nj[j][k]) ); }/*FOR*/ }/*FOR*/ }/*FOR*/ Free1D(nk); Free2D(ni, llx); Free2D(nj, lly); Free3D(n, llx, lly); return res; }/*C_SHCMI*/
/* set the directions of the arcs in a tree given the root node. */ SEXP tree_directions(SEXP arcs, SEXP nodes, SEXP root, SEXP debug) { int i = 0, j = 0, d = 0, traversed = 1; int narcs = length(arcs)/2, nnodes = length(nodes); int *a = NULL, *depth = 0, debuglevel = isTRUE(debug); SEXP try, try2, result; /* match the node labels in the arc set. */ PROTECT(try = match(nodes, arcs, 0)); a = INTEGER(try); /* match the root node. */ PROTECT(try2 = match(nodes, root, 0)); /* allocate and initialize the statust vector. */ depth = Calloc1D(nnodes, sizeof(int)); depth[INT(try2) - 1] = 1; if (debuglevel > 0) Rprintf("> root node (depth 1) is %s.\n", NODE(INT(try2) - 1)); for (d = 1; d <= nnodes; d++) { if (debuglevel > 0) Rprintf("> considering nodes at depth %d.\n", d + 1); for (i = 0; i < narcs; i++) { for (j = 0; j < nnodes; j++) { /* disregard nodes at the wrong depth. */ if (depth[j] != d) continue; if ((a[i + narcs] == (j + 1)) && (depth[a[i] - 1] == 0)) { if (debuglevel > 0) Rprintf(" * found node %s.\n", NODE(a[i] - 1)); /* save the depth at which the node was found. */ depth[a[i] - 1] = d + 1; /* update the counter of the traversed nodes. */ traversed++; }/*THEN*/ }/*FOR*/ }/*FOR*/ /* check whether all nodes have been traversed. */ if (traversed == nnodes) break; }/*FOR*/ /* allocate and initialize the return value. */ PROTECT(result = allocMatrix(STRSXP, narcs/2, 2)); for (i = 0, j = 0; i < narcs; i++) { if (depth[a[i] - 1] < depth[a[i + narcs] - 1]) { SET_STRING_ELT(result, j, STRING_ELT(arcs, i)); SET_STRING_ELT(result, j + narcs/2, STRING_ELT(arcs, i + narcs)); j++; }/*THEN*/ }/*FOR*/ UNPROTECT(3); Free1D(depth); return result; }/*TREE_DIRECTIONS*/
/* parametric tests for Gaussian variables. */ static double ct_gaustests(SEXP xx, SEXP yy, SEXP zz, int nobs, int ntests, double *pvalue, double *df, test_e test) { int i = 0, nsx = length(zz), ncols = nsx + 2; double transform = 0, **column = NULL, *mean = NULL, statistic = 0, lambda = 0; double *u = NULL, *d = NULL, *vt = NULL, *cov = NULL, *basecov = 0; /* compute the degrees of freedom for correlation and mutual information. */ if (test == COR) *df = nobs - ncols; else if ((test == MI_G) || (test == MI_G_SH)) *df = 1; if (((test == COR) && (*df < 1)) || ((test == ZF) && (nobs - ncols < 2))) { /* if there are not enough degrees of freedom, return independence. */ warning("trying to do a conditional independence test with zero degrees of freedom."); *df = 0; statistic = 0; for (i = 0; i < ntests; i++) pvalue[i] = 1; return statistic; }/*THEN*/ GAUSSIAN_CACHE(); if (ntests > 1) { /* allocate and compute mean values and the covariance matrix. */ mean = Calloc1D(ncols, sizeof(double)); c_meanvec(column, mean, nobs, ncols, 1); c_covmat(column, mean, ncols, nobs, cov, 1); memcpy(basecov, cov, ncols * ncols * sizeof(double)); for (i = 0; i < ntests; i++) { GAUSSIAN_PCOR_CACHE(); if (test == COR) { COMPUTE_PCOR(); transform = cor_t_trans(statistic, *df); pvalue[i] = 2 * pt(fabs(transform), *df, FALSE, FALSE); }/*THEN*/ else if (test == MI_G) { COMPUTE_PCOR(); statistic = 2 * nobs * cor_mi_trans(statistic); pvalue[i] = pchisq(statistic, *df, FALSE, FALSE); }/*THEN*/ else if (test == MI_G_SH) { lambda = covmat_lambda(column, mean, cov, nobs, ncols); covmat_shrink(cov, ncols, lambda); COMPUTE_PCOR(); statistic = 2 * nobs * cor_mi_trans(statistic); pvalue[i] = pchisq(statistic, *df, FALSE, FALSE); }/*THEN*/ else if (test == ZF) { COMPUTE_PCOR(); statistic = cor_zf_trans(statistic, (double)nobs - ncols); pvalue[i] = 2 * pnorm(fabs(statistic), 0, 1, FALSE, FALSE); }/*THEN*/ }/*FOR*/ }/*THEN*/ else { GAUSSIAN_PCOR_NOCACHE(); if (test == COR) { COMPUTE_PCOR(); transform = cor_t_trans(statistic, *df); pvalue[0] = 2 * pt(fabs(transform), *df, FALSE, FALSE); }/*THEN*/ else if (test == MI_G) { COMPUTE_PCOR(); statistic = 2 * nobs * cor_mi_trans(statistic); pvalue[0] = pchisq(statistic, *df, FALSE, FALSE); }/*THEN*/ else if (test == MI_G_SH) { lambda = covmat_lambda(column, mean, cov, nobs, ncols); covmat_shrink(cov, ncols, lambda); COMPUTE_PCOR(); statistic = 2 * nobs * cor_mi_trans(statistic); pvalue[0] = pchisq(statistic, *df, FALSE, FALSE); }/*THEN*/ else if (test == ZF) { COMPUTE_PCOR(); statistic = cor_zf_trans(statistic, (double)nobs - ncols); pvalue[0] = 2 * pnorm(fabs(statistic), 0, 1, FALSE, FALSE); }/*THEN*/ }/*ELSE*/ GAUSSIAN_FREE(); Free1D(mean); Free1D(column); return statistic; }/*CT_GAUSTESTS*/
/* conditional linear Gaussian mutual information test. */ static double ct_micg(SEXP xx, SEXP yy, SEXP zz, int nobs, int ntests, double *pvalue, double *df) { int xtype = 0, ytype = TYPEOF(yy), *nlvls = NULL, llx = 0, lly = 0, llz = 0; int ndp = 0, ngp = 0, nsx = length(zz), **dp = NULL, *dlvls = NULL, j = 0, k = 0; int i = 0, *zptr = 0; void *xptr = NULL, *yptr = NULL, **columns = NULL; double **gp = NULL; double statistic = 0; SEXP xdata; if (ytype == INTSXP) { /* cache the number of levels. */ lly = NLEVELS(yy); yptr = INTEGER(yy); }/*THEN*/ else { yptr = REAL(yy); }/*ELSE*/ /* extract the conditioning variables and cache their types. */ columns = Calloc1D(nsx, sizeof(void *)); nlvls = Calloc1D(nsx, sizeof(int)); df2micg(zz, columns, nlvls, &ndp, &ngp); dp = Calloc1D(ndp + 1, sizeof(int *)); gp = Calloc1D(ngp + 1, sizeof(double *)); dlvls = Calloc1D(ndp + 1, sizeof(int)); for (i = 0, j = 0, k = 0; i < nsx; i++) if (nlvls[i] > 0) { dlvls[1 + j] = nlvls[i]; dp[1 + j++] = columns[i]; }/*THEN*/ else { gp[1 + k++] = columns[i]; }/*ELSE*/ /* allocate vector for the configurations of the discrete parents; or, if * there no discrete parents, for the means of the continuous parents. */ if (ndp > 0) { zptr = Calloc1D(nobs, sizeof(int)); c_fast_config(dp + 1, nobs, ndp, dlvls + 1, zptr, &llz, 1); }/*THEN*/ for (i = 0; i < ntests; i++) { xdata = VECTOR_ELT(xx, i); xtype = TYPEOF(xdata); if (xtype == INTSXP) { xptr = INTEGER(xdata); llx = NLEVELS(xdata); }/*THEN*/ else { xptr = REAL(xdata); }/*ELSE*/ if ((ytype == INTSXP) && (xtype == INTSXP)) { if (ngp > 0) { /* need to reverse conditioning to actually compute the test. */ statistic = 2 * nobs * nobs * c_cmicg_unroll(xptr, llx, yptr, lly, zptr, llz, gp + 1, ngp, df, nobs); }/*THEN*/ else { /* the test reverts back to a discrete mutual information test. */ statistic = 2 * nobs * c_cchisqtest(xptr, llx, yptr, lly, zptr, llz, nobs, df, MI); }/*ELSE*/ }/*THEN*/ else if ((ytype == REALSXP) && (xtype == REALSXP)) { gp[0] = xptr; statistic = 2 * nobs * c_cmicg(yptr, gp, ngp + 1, NULL, 0, zptr, llz, dlvls, nobs); /* one regression coefficient for each conditioning level is added; * if all conditioning variables are continuous that's just one global * regression coefficient. */ *df = (llz == 0) ? 1 : llz; }/*THEN*/ else if ((ytype == INTSXP) && (xtype == REALSXP)) { dp[0] = yptr; dlvls[0] = lly; statistic = 2 * nobs * c_cmicg(xptr, gp + 1, ngp, dp, ndp + 1, zptr, llz, dlvls, nobs); /* for each additional configuration of the discrete conditioning * variables plus the discrete yptr, one whole set of regression * coefficients (plus the intercept) is added. */ *df = (lly - 1) * ((llz == 0) ? 1 : llz) * (ngp + 1); }/*THEN*/ else if ((ytype == REALSXP) && (xtype == INTSXP)) { dp[0] = xptr; dlvls[0] = llx; statistic = 2 * nobs * c_cmicg(yptr, gp + 1, ngp, dp, ndp + 1, zptr, llz, dlvls, nobs); /* same as above, with xptr and yptr swapped. */ *df = (llx - 1) * ((llz == 0) ? 1 : llz) * (ngp + 1); }/*ELSE*/ pvalue[i] = pchisq(statistic, *df, FALSE, FALSE); }/*FOR*/ Free1D(columns); Free1D(nlvls); Free1D(dlvls); Free1D(zptr); Free1D(dp); Free1D(gp); return statistic; }/*CT_MICG*/
/* remove one variable in each highly-correlated pair. */ SEXP dedup (SEXP data, SEXP threshold, SEXP complete, SEXP debug) { int i = 0, j = 0, k = 0, dropped = 0, nc = 0; int debuglevel = isTRUE(debug); double *mean = NULL, *sse = NULL, *xx = NULL, *yy = NULL; double cur_mean[2], cur_sse[2]; double tol = MACHINE_TOL, t = NUM(threshold); long double sum = 0; SEXP result, colnames; gdata dt = { 0 }; /* extract the columns from the data frame. */ dt = gdata_from_SEXP(data, 0); meta_init_flags(&(dt.m), 0, complete, R_NilValue); meta_copy_names(&(dt.m), 0, data); /* set up the vectors for the pairwise complete observations. */ xx = Calloc1D(dt.m.nobs, sizeof(double)); yy = Calloc1D(dt.m.nobs, sizeof(double)); if (debuglevel > 0) Rprintf("* caching means and variances.\n"); mean = Calloc1D(dt.m.ncols, sizeof(double)); sse = Calloc1D(dt.m.ncols, sizeof(double)); /* cache the mean and variance of complete variables. */ for (j = 0; j < dt.m.ncols; j++) { if (!dt.m.flag[j].complete) continue; mean[j] = c_mean(dt.col[j], dt.m.nobs); sse[j] = c_sse(dt.col[j], mean[j], dt.m.nobs); }/*FOR*/ /* main loop. */ for (j = 0; j < dt.m.ncols - 1; j++) { /* skip variables already flagged for removal. */ if (dt.m.flag[j].drop) continue; if (debuglevel > 0) Rprintf("* looking at %s with %d variables still to check.\n", dt.m.names[j], dt.m.ncols - (j + 1)); for (k = j + 1; k < dt.m.ncols; k++) { /* skip variables already flagged for removal. */ if (dt.m.flag[k].drop) continue; if (dt.m.flag[j].complete && dt.m.flag[k].complete) { /* use the cached means and variances. */ cur_mean[0] = mean[j]; cur_mean[1] = mean[k]; cur_sse[0] = sse[j]; cur_sse[1] = sse[k]; /* compute the covariance. */ for (i = 0, sum = 0; i < dt.m.nobs; i++) sum += (dt.col[j][i] - cur_mean[0]) * (dt.col[k][i] - cur_mean[1]); }/*THEN*/ else { for (i = 0, nc = 0; i < dt.m.nobs; i++) { if (ISNAN(dt.col[j][i]) || ISNAN(dt.col[k][i])) continue; xx[nc] = dt.col[j][i]; yy[nc++] = dt.col[k][i]; }/*FOR*/ /* if there are no complete observations, take the variables to be * independent. */ if (nc == 0) continue; cur_mean[0] = c_mean(xx, nc); cur_mean[1] = c_mean(yy, nc); cur_sse[0] = c_sse(xx, cur_mean[0], nc); cur_sse[1] = c_sse(yy, cur_mean[1], nc); /* compute the covariance. */ for (i = 0, sum = 0; i < nc; i++) sum += (xx[i] - cur_mean[0]) * (yy[i] - cur_mean[1]); }/*ELSE*/ /* safety check against "divide by zero" errors. */ if ((cur_sse[0] < tol) || (cur_sse[1] < tol)) sum = 0; else sum /= sqrt(cur_sse[0] * cur_sse[1]); /* test the correlation against the threshold. */ if (fabsl(sum) > t) { if (debuglevel > 0) Rprintf("%s is collinear with %s, dropping %s with COR = %.4Lf\n", dt.m.names[j], dt.m.names[k], dt.m.names[k], sum); /* flag the variable for removal. */ dt.m.flag[k].drop = TRUE; dropped++; }/*THEN*/ }/*FOR*/ }/*FOR*/ /* set up the return value. */ PROTECT(result = allocVector(VECSXP, dt.m.ncols - dropped)); PROTECT(colnames = allocVector(STRSXP, dt.m.ncols - dropped)); for (j = 0, k = 0; j < dt.m.ncols; j++) if (!dt.m.flag[j].drop) { SET_STRING_ELT(colnames, k, mkChar(dt.m.names[j])); SET_VECTOR_ELT(result, k++, VECTOR_ELT(data, j)); }/*THEN*/ setAttrib(result, R_NamesSymbol, colnames); /* make it a data frame. */ minimal_data_frame(result); Free1D(mean); Free1D(sse); Free1D(xx); Free1D(yy); FreeGDT(dt, FALSE); UNPROTECT(2); return result; }/*DEDUP*/
/* a single step of the optimized hill climbing (one arc addition/removal/reversal). */ SEXP hc_opt_step(SEXP amat, SEXP nodes, SEXP added, SEXP cache, SEXP reference, SEXP wlmat, SEXP blmat, SEXP nparents, SEXP maxp, SEXP debug) { int nnodes = length(nodes), i = 0, j = 0; int *am = NULL, *ad = NULL, *w = NULL, *b = NULL, debuglevel = isTRUE(debug); int counter = 0, update = 1, from = 0, to = 0, *path = NULL, *scratch = NULL; double *cache_value = NULL, temp = 0, max = 0, tol = MACHINE_TOL; double *mp = REAL(maxp), *np = REAL(nparents); SEXP bestop; /* allocate and initialize the return value (use FALSE as a canary value). */ PROTECT(bestop = allocVector(VECSXP, 3)); setAttrib(bestop, R_NamesSymbol, mkStringVec(3, "op", "from", "to")); /* allocate and initialize a dummy FALSE object. */ SET_VECTOR_ELT(bestop, 0, ScalarLogical(FALSE)); /* allocate buffers for c_has_path(). */ path = Calloc1D(nnodes, sizeof(int)); scratch = Calloc1D(nnodes, sizeof(int)); /* save pointers to the numeric/integer matrices. */ cache_value = REAL(cache); ad = INTEGER(added); am = INTEGER(amat); w = INTEGER(wlmat); b = INTEGER(blmat); if (debuglevel > 0) { /* count how may arcs are to be tested. */ for (i = 0; i < nnodes * nnodes; i++) counter += ad[i]; Rprintf("----------------------------------------------------------------\n"); Rprintf("* trying to add one of %d arcs.\n", counter); }/*THEN*/ for (i = 0; i < nnodes; i++) { for (j = 0; j < nnodes; j++) { /* nothing to see, move along. */ if (ad[CMC(i, j, nnodes)] == 0) continue; /* retrieve the score delta from the cache. */ temp = cache_value[CMC(i, j, nnodes)]; if (debuglevel > 0) { Rprintf(" > trying to add %s -> %s.\n", NODE(i), NODE(j)); Rprintf(" > delta between scores for nodes %s %s is %lf.\n", NODE(i), NODE(j), temp); }/*THEN*/ /* this score delta is the best one at the moment, so add the arc if it * does not introduce cycles in the graph. */ if (temp - max > tol) { if (c_has_path(j, i, am, nnodes, nodes, FALSE, FALSE, path, scratch, FALSE)) { if (debuglevel > 0) Rprintf(" > not adding, introduces cycles in the graph.\n"); continue; }/*THEN*/ if (debuglevel > 0) Rprintf(" @ adding %s -> %s.\n", NODE(i), NODE(j)); /* update the return value. */ bestop_update(bestop, "set", NODE(i), NODE(j)); /* store the node indices to update the reference scores. */ from = i; to = j; /* update the threshold score delta. */ max = temp; }/*THEN*/ }/*FOR*/ }/*FOR*/ if (debuglevel > 0) { /* count how may arcs are to be tested. */ for (i = 0, counter = 0; i < nnodes * nnodes; i++) counter += am[i] * (1 - w[i]); Rprintf("----------------------------------------------------------------\n"); Rprintf("* trying to remove one of %d arcs.\n", counter); }/*THEN*/ for (i = 0; i < nnodes; i++) { for (j = 0; j < nnodes; j++) { /* nothing to see, move along. */ if (am[CMC(i, j, nnodes)] == 0) continue; /* whitelisted arcs are not to be removed, ever. */ if (w[CMC(i, j, nnodes)] == 1) continue; /* retrieve the score delta from the cache. */ temp = cache_value[CMC(i, j, nnodes)]; if (debuglevel > 0) { Rprintf(" > trying to remove %s -> %s.\n", NODE(i), NODE(j)); Rprintf(" > delta between scores for nodes %s %s is %lf.\n", NODE(i), NODE(j), temp); }/*THEN*/ if (temp - max > tol) { if (debuglevel > 0) Rprintf(" @ removing %s -> %s.\n", NODE(i), NODE(j)); /* update the return value. */ bestop_update(bestop, "drop", NODE(i), NODE(j)); /* store the node indices to update the reference scores. */ from = i; to = j; /* update the threshold score delta. */ max = temp; }/*THEN*/ }/*FOR*/ }/*FOR*/ if (debuglevel > 0) { /* count how may arcs are to be tested. */ for (i = 0, counter = 0; i < nnodes; i++) for (j = 0; j < nnodes; j++) counter += am[CMC(i, j, nnodes)] * (1 - b[CMC(j, i, nnodes)]); Rprintf("----------------------------------------------------------------\n"); Rprintf("* trying to reverse one of %d arcs.\n", counter); }/*THEN*/ for (i = 0; i < nnodes; i++) { for (j = 0; j < nnodes; j++) { /* nothing to see, move along. */ if (am[CMC(i, j, nnodes)] == 0) continue; /* don't reverse an arc if the one in the opposite direction is * blacklisted, ever. */ if (b[CMC(j, i, nnodes)] == 1) continue; /* do not reverse an arc if that means violating the limit on the * maximum number of parents. */ if (np[i] >= *mp) continue; /* retrieve the score delta from the cache. */ temp = cache_value[CMC(i, j, nnodes)] + cache_value[CMC(j, i, nnodes)]; /* nuke small values and negative zeroes. */ if (fabs(temp) < tol) temp = 0; if (debuglevel > 0) { Rprintf(" > trying to reverse %s -> %s.\n", NODE(i), NODE(j)); Rprintf(" > delta between scores for nodes %s %s is %lf.\n", NODE(i), NODE(j), temp); }/*THEN*/ if (temp - max > tol) { if (c_has_path(i, j, am, nnodes, nodes, FALSE, TRUE, path, scratch, FALSE)) { if (debuglevel > 0) Rprintf(" > not reversing, introduces cycles in the graph.\n"); continue; }/*THEN*/ if (debuglevel > 0) Rprintf(" @ reversing %s -> %s.\n", NODE(i), NODE(j)); /* update the return value. */ bestop_update(bestop, "reverse", NODE(i), NODE(j)); /* store the node indices to update the reference scores. */ from = i; to = j; /* both nodes' reference scores must be updated. */ update = 2; /* update the threshold score delta. */ max = temp; }/*THEN*/ }/*FOR*/ }/*FOR*/ /* update the reference scores. */ REAL(reference)[to] += cache_value[CMC(from, to, nnodes)]; if (update == 2) REAL(reference)[from] += cache_value[CMC(to, from, nnodes)]; Free1D(path); Free1D(scratch); UNPROTECT(1); return bestop; }/*HC_OPT_STEP*/
/* construct a consistent DAG extension of a CPDAG. */ SEXP pdag_extension(SEXP arcs, SEXP nodes, SEXP debug) { int i = 0, j = 0, k = 0, t = 0, nnodes = length(nodes); int changed = 0, left = nnodes; int *a = NULL, *nbr = NULL, debuglevel = isTRUE(debug); short int *matched = NULL; SEXP amat, result; /* build and dereference the adjacency matrix. */ PROTECT(amat = arcs2amat(arcs, nodes)); a = INTEGER(amat); /* allocate and initialize the neighbours and matched vectors. */ nbr = Calloc1D(nnodes, sizeof(int)); matched = Calloc1D(nnodes, sizeof(short int)); for (t = 0; t < nnodes; t++) { if (debuglevel > 0) { Rprintf("----------------------------------------------------------------\n"); Rprintf("> performing pass %d.\n", t + 1); Rprintf("> candidate nodes: "); for (j = 0; j < nnodes; j++) if (matched[j] == 0) Rprintf("%s ", NODE(j)); Rprintf("\n"); }/*THEN*/ for (i = 0; i < nnodes; i++) { /* if the node is already ok, skip it. */ if (matched[i] != 0) continue; /* check whether the node is a sink (that is, whether is does not have * any child). */ is_a_sink(a, i, &k, nnodes, nbr, matched); /* if the node is not a sink move on. */ if (k == -1) { if (debuglevel > 0) Rprintf(" * node %s is not a sink.\n", NODE(i)); continue; }/*THEN*/ else { if (debuglevel > 0) Rprintf(" * node %s is a sink.\n", NODE(i)); }/*ELSE*/ if (!all_adjacent(a, i, k, nnodes, nbr)) { if (debuglevel > 0) Rprintf(" * not all nodes linked to %s by an undirected arc are adjacent.\n", NODE(i)); continue; }/*THEN*/ else { if (debuglevel > 0) { if (k == 0) Rprintf(" * no node is linked to %s by an undirected arc.\n", NODE(i)); else Rprintf(" * all nodes linked to %s by an undirected arc are adjacent.\n", NODE(i)); }/*THEN*/ }/*ELSE*/ /* the current node meets all the conditions, direct all the arcs towards it. */ if (k == 0) { if (debuglevel > 0) Rprintf(" @ no undirected arc to direct towards %s.\n", NODE(i)); }/*THEN*/ else { for (j = 0; j < k; j++) a[CMC(i, nbr[j], nnodes)] = 0; if (debuglevel > 0) Rprintf(" @ directing all incident undirected arcs towards %s.\n", NODE(i)); }/*ELSE*/ /* set the changed flag. */ changed = 1; /* exclude the node from later iterations. */ matched[i] = 1; left--; }/*FOR*/ /* if nothing changed in the last iteration or there are no more candidate * nodes, there is nothing else to do. */ if ((changed == 0) || (left == 0)) break; else changed = 0; }/*FOR*/ /* build the new arc set from the adjacency matrix. */ PROTECT(result = amat2arcs(amat, nodes)); Free1D(nbr); Free1D(matched); UNPROTECT(2); return result; }/*PDAG_EXTENSION*/
/* posterior Dirichlet probability (covers BDe and K2 scores). */ double dpost(SEXP x, SEXP iss, SEXP exp) { int i = 0, k = 0, num = length(x); int llx = NLEVELS(x), *xx = INTEGER(x), *n = NULL, *imaginary = NULL; double alpha = 0, res = 0; /* the correct vaules for the hyperparameters alpha are documented in * "Learning Bayesian Networks: The Combination of Knowledge and Statistical * Data" by Heckerman, Geiger & Chickering (1995), page 17. */ if (isNull(iss)) { /* this is for K2, which does not define an imaginary sample size; * all hyperparameters are set to 1 in the prior distribution. */ imaginary = &llx; alpha = 1; }/*THEN*/ else { /* this is for the BDe score. */ imaginary = INTEGER(iss); alpha = (double) *imaginary / (double) llx; }/*ELSE*/ /* initialize the contingency table. */ n = Calloc1D(llx, sizeof(int)); /* compute the frequency table of x, disregarding experimental data. */ if (exp == R_NilValue) { for (i = 0; i < num; i++) n[xx[i] - 1]++; }/*THEN*/ else { int *e = INTEGER(exp); for (i = 0, k = 0; i < num; i++) { if (i != e[k] - 1) n[xx[i] - 1]++; else k++; }/*FOR*/ /* adjust the sample size to match the number of observational data. */ num -= length(exp); }/*ELSE*/ /* compute the posterior probability. */ for (i = 0; i < llx; i++) res += lgammafn(n[i] + alpha) - lgammafn(alpha); res += lgammafn((double)(*imaginary)) - lgammafn((double)(*imaginary + num)); Free1D(n); return res; }/*DPOST*/