/* generate a connected graph with uniform probability, subject to some * constraints on the degree of the nodes. */ SEXP ide_cozman_graph(SEXP nodes, SEXP num, SEXP burn_in, SEXP max_in_degree, SEXP max_out_degree, SEXP max_degree, SEXP connected, SEXP debug) { SEXP graphlist; switch(LENGTH(nodes)) { case 1: /* there is only one graph with 1 node, and it's empty. */ graphlist = empty_graph(nodes, num); break; case 2: /* Ide-Cozman has no mixing with only 2 nodes, work around with i.i.d. * sampling.*/ if (isTRUE(connected)) { graphlist = ic_2nodes(nodes, num, burn_in, max_in_degree, max_out_degree, max_degree, connected, debug); break; }/*THEN*/ default: graphlist = c_ide_cozman(nodes, num, burn_in, max_in_degree, max_out_degree, max_degree, connected, debug); }/*SWITCH*/ return graphlist; }/*IDE_COZMAN_GRAPH*/
/* compute the cached values fro all nodes. */ SEXP cache_structure(SEXP nodes, SEXP amat, SEXP debug) { int i = 0, debuglevel = LOGICAL(debug)[0], length_nodes = LENGTH(nodes); int *status = NULL, *a = INTEGER(amat); SEXP bn, temp; /* allocate the list and set its attributes.*/ PROTECT(bn = allocVector(VECSXP, length_nodes)); setAttrib(bn, R_NamesSymbol, nodes); /* allocate and intialize the status vector. */ status = alloc1dcont(length_nodes); if (isTRUE(debug)) Rprintf("* (re)building cached information about network structure.\n"); /* populate the list with nodes' data. */ for (i = 0; i < length_nodes; i++) { /* (re)initialize the status vector. */ memset(status, '\0', sizeof(int) * length_nodes); temp = cache_node_structure(i, nodes, a, length_nodes, status, debuglevel); /* save the returned list. */ SET_VECTOR_ELT(bn, i, temp); }/*FOR*/ UNPROTECT(1); return bn; }/*CACHE_STRUCTURE*/
static SEXP build_return_array(SEXP nodes, short int *status, int nrows, int check_status, SEXP return_nodes) { int i = 0, j = 0; SEXP res; if (check_status < 3) { if (isTRUE(return_nodes)) { PROTECT(res = allocVector(STRSXP, 0)); }/*THEN*/ else { PROTECT(res = allocVector(LGLSXP, 1)); LOGICAL(res)[0] = TRUE; }/*ELSE*/ }/*THEN*/ else { if (isTRUE(return_nodes)) { PROTECT(res = allocVector(STRSXP, check_status)); for (i = 0; i < nrows; i++) if (status[i] == BAD) SET_STRING_ELT(res, j++, STRING_ELT(nodes, i)); }/*THEN*/ else { PROTECT(res = allocVector(LGLSXP, 1)); LOGICAL(res)[0] = FALSE; }/*ELSE*/ }/*ELSE*/ UNPROTECT(1); return res; }/*BUILD_RETURN_ARRAY*/
/* get the number of parameters of the whole network (mixed case, also handles * discrete and Gaussian networks). */ SEXP nparams_cgnet(SEXP graph, SEXP data, SEXP debug) { int i = 0, j = 0, nnodes = 0, debuglevel = isTRUE(debug); int *nlevels = NULL, *index = NULL, ngp = 0; double nconfig = 0, node_params = 0, all_params = 0; SEXP nodes = R_NilValue, node_data, parents, temp; /* get nodes' number and data. */ node_data = getListElement(graph, "nodes"); nnodes = length(node_data); nodes = getAttrib(node_data, R_NamesSymbol); /* cache the number of levels of each variables (zero = continuous). */ nlevels = Calloc1D(nnodes, sizeof(int)); for (i = 0; i < nnodes; i++) { temp = VECTOR_ELT(data, i); if (TYPEOF(temp) == INTSXP) nlevels[i] = NLEVELS(temp); }/*FOR*/ for (i = 0; i < nnodes; i++) { /* extract the parents of the node and match them. */ parents = getListElement(VECTOR_ELT(node_data, i), "parents"); PROTECT(temp = match(nodes, parents, 0)); index = INTEGER(temp); /* compute the number of regressors and of configurations. */ for (j = 0, ngp = 0, nconfig = 1; j < length(parents); j++) { if (nlevels[index[j] - 1] == 0) ngp++; else nconfig *= nlevels[index[j] - 1]; }/*FOR*/ /* compute the overall number of parameters as regressors plus intercept * times configurations. */ node_params = nconfig * (nlevels[i] == 0 ? ngp + 1 : nlevels[i] - 1); if (debuglevel > 0) Rprintf("* node %s has %.0lf parameter(s).\n", NODE(i), node_params); /* update the return value. */ all_params += node_params; UNPROTECT(1); }/*FOR*/ Free1D(nlevels); return ScalarReal(all_params); }/*NPARAMS_CGNET*/
/* unconditional mutual information, to be used for the asymptotic test. */ SEXP mi(SEXP x, SEXP y, SEXP gsquare, SEXP adjusted) { int llx = NLEVELS(x), lly = NLEVELS(y), num = length(x); int *xx = INTEGER(x), *yy = INTEGER(y); double *res = NULL; SEXP result; PROTECT(result = allocVector(REALSXP, 2)); res = REAL(result); if (isTRUE(adjusted)) res[0] = c_chisqtest(xx, llx, yy, lly, num, res + 1, MI_ADF); else res[0] = c_chisqtest(xx, llx, yy, lly, num, res + 1, MI); /* rescale to match the G^2 test. */ if (isTRUE(gsquare)) res[0] *= 2 * num; UNPROTECT(1); return result; }/*MI*/
/* R frontend: compute the score component for each target node. */ SEXP per_node_score(SEXP network, SEXP data, SEXP score, SEXP targets, SEXP extra_args, SEXP debug) { SEXP result; /* allocate the return value. */ PROTECT(result = allocVector(REALSXP, length(targets))); /* compute the score componenets. */ c_per_node_score(network, data, score, targets, extra_args, isTRUE(debug), REAL(result)); /* set labels on the computed score components. */ setAttrib(result, R_NamesSymbol, targets); UNPROTECT(1); return result; }/*PER_NODE_SCORE*/
/* compute the cached values for a single node (R-friendly). */ SEXP cache_partial_structure(SEXP nodes, SEXP target, SEXP amat, SEXP debug) { int i = 0, debuglevel = LOGICAL(debug)[0], length_nodes = LENGTH(nodes); char *t = (char *)CHAR(STRING_ELT(target, 0)); int *status = NULL, *a = INTEGER(amat); if (isTRUE(debug)) Rprintf("* (re)building cached information about node %s.\n", t); /* allocate and initialize the status vector. */ status = alloc1dcont(length_nodes); /* iterate fo find the node position in the array. */ for (i = 0; i < length_nodes; i++) if (!strcmp(t, CHAR(STRING_ELT(nodes, i)))) break; /* return the corresponding part of the bn structure. */ return cache_node_structure(i, nodes, a, length_nodes, status, debuglevel); }/*CACHE_PARTIAL_STRUCTURE*/
/* unconditional mutual information, to be used for the asymptotic test. */ SEXP mi(SEXP x, SEXP y, SEXP gsquare) { int llx = NLEVELS(x), lly = NLEVELS(y), num = LENGTH(x); int *xx = INTEGER(x), *yy = INTEGER(y); double *res = NULL; SEXP result; PROTECT(result = allocVector(REALSXP, 2)); res = REAL(result); res[0] = c_mi(xx, &llx, yy, &lly, &num); res[1] = (double)(llx - 1) * (double)(lly - 1); /* rescale to match the G^2 test. */ if (isTRUE(gsquare)) res[0] *= 2 * num; UNPROTECT(1); return result; }/*MI*/
/* conditional mutual information, to be used for the asymptotic test. */ SEXP cmi(SEXP x, SEXP y, SEXP z, SEXP gsquare) { int llx = NLEVELS(x), lly = NLEVELS(y), llz = NLEVELS(z); int num = LENGTH(x); int *xx = INTEGER(x), *yy = INTEGER(y), *zz = INTEGER(z); double *res = NULL; SEXP result; /* allocate and initialize result to zero. */ PROTECT(result = allocVector(REALSXP, 2)); res = REAL(result); res[0] = c_cmi(xx, &llx, yy, &lly, zz, &llz, &num); res[1] = (double)(llx - 1) * (double)(lly - 1) * (double)llz; /* rescale to match the G^2 test. */ if (isTRUE(gsquare)) res[0] *= 2 * num; UNPROTECT(1); return result; }/*CMI*/
/* faster rbind() implementation for arc sets. */ SEXP arcs_rbind (SEXP matrix1, SEXP matrix2, SEXP reverse2) { int i = 0, j = 0, m1 = length(matrix1)/2, m2 = length(matrix2)/2; SEXP res; /* allocate the return value. */ PROTECT(res = allocMatrix(STRSXP, m1 + m2, 2)); /* allocate and initialize the column names. */ finalize_arcs(res); /* copy the elements of the first matrix. */ for (i = 0; i < m1; i++) for (j = 0; j < 2; j++) SET_STRING_ELT(res, CMC(i, j, m1 + m2), STRING_ELT(matrix1, CMC(i, j, m1))); /* copy the elements of the second matrix, reversing the order of the * columns as needed. */ if (isTRUE(reverse2)) { for (i = 0; i < m2; i++) for(j = 0; j < 2; j++) SET_STRING_ELT(res, CMC(i + m1, j, m1 + m2), STRING_ELT(matrix2, CMC(i, 1 - j, m2))); }/*THEN*/ else { for (i = 0; i < m2; i++) for(j = 0; j < 2; j++) SET_STRING_ELT(res, CMC(i + m1, j, m1 + m2), STRING_ELT(matrix2, CMC(i, j, m2))); }/*ELSE*/ UNPROTECT(1); return res; }/*ARCS_RBIND*/
/* unconditional independence tests. */ SEXP utest(SEXP x, SEXP y, SEXP data, SEXP test, SEXP B, SEXP alpha, SEXP learning) { int ntests = length(x), nobs = 0; double *pvalue = NULL, statistic = 0, df = NA_REAL; const char *t = CHAR(STRING_ELT(test, 0)); test_e test_type = test_label(t); SEXP xx, yy, result; /* allocate the return value, which has the same length as x. */ PROTECT(result = allocVector(REALSXP, ntests)); setAttrib(result, R_NamesSymbol, x); pvalue = REAL(result); /* set all elements to zero. */ memset(pvalue, '\0', ntests * sizeof(double)); /* extract the variables from the data. */ PROTECT(xx = c_dataframe_column(data, x, FALSE, FALSE)); PROTECT(yy = c_dataframe_column(data, y, TRUE, FALSE)); nobs = length(yy); if (IS_DISCRETE_ASYMPTOTIC_TEST(test_type)) { /* parametric tests for discrete variables. */ statistic = ut_discrete(xx, yy, nobs, ntests, pvalue, &df, test_type); }/*THEN*/ else if ((test_type == COR) || (test_type == ZF) || (test_type == MI_G) || (test_type == MI_G_SH)) { /* parametric tests for Gaussian variables. */ statistic = ut_gaustests(xx, yy, nobs, ntests, pvalue, &df, test_type); }/*THEN*/ else if (test_type == MI_CG) { /* conditional linear Gaussian mutual information test. */ statistic = ut_micg(xx, yy, nobs, ntests, pvalue, &df); }/*THEN*/ else if (IS_DISCRETE_PERMUTATION_TEST(test_type)) { statistic = ut_dperm(xx, yy, nobs, ntests, pvalue, &df, test_type, INT(B), IS_SMC(test_type) ? NUM(alpha) : 1); }/*THEN*/ else if (IS_CONTINUOUS_PERMUTATION_TEST(test_type)) { statistic = ut_gperm(xx, yy, nobs, ntests, pvalue, test_type, INT(B), IS_SMC(test_type) ? NUM(alpha) : 1); }/*THEN*/ UNPROTECT(3); /* catch-all for unknown tests (after deallocating memory.) */ if (test_type == ENOTEST) error("unknown test statistic '%s'.", t); /* increase the test counter. */ test_counter += ntests; if (isTRUE(learning)) return result; else return c_create_htest(statistic, test, pvalue[ntests - 1], df, B); }/*UTEST*/
/* check neighbourhood sets and markov blankets for consistency.. */ SEXP bn_recovery(SEXP bn, SEXP strict, SEXP mb, SEXP debug, SEXP filter) { int i = 0, j = 0, k = 0, n = 0, counter = 0; short int *checklist = NULL, err = 0; int *debuglevel = NULL, *checkmb = NULL, *nbrfilter = NULL; SEXP temp, temp2, nodes, elnames = NULL, fixed; /* get the names of the nodes. */ nodes = getAttrib(bn, R_NamesSymbol); n = LENGTH(nodes); /* allocate and initialize the checklist. */ checklist = allocstatus(UPTRI_MATRIX(n)); /* dereference the debug, mb and filter parameters. */ debuglevel = LOGICAL(debug); checkmb = LOGICAL(mb); nbrfilter = INTEGER(filter); if (*debuglevel > 0) { Rprintf("----------------------------------------------------------------\n"); if (*checkmb) Rprintf("* checking consistency of markov blankets.\n"); else Rprintf("* checking consistency of neighbourhood sets.\n"); }/*THEN*/ /* scan the structure to determine the number of arcs. */ for (i = 0; i < n; i++) { if (*debuglevel > 0) Rprintf(" > checking node %s.\n", NODE(i)); /* get the entry for the (neighbours|elements of the markov blanket) of the node.*/ temp = getListElement(bn, (char *)NODE(i)); if (!(*checkmb)) temp = getListElement(temp, "nbr"); /* check each element of the array and identify which variable it corresponds to. */ for (j = 0; j < LENGTH(temp); j++) { for (k = 0; k < n; k++) { /* increment the right element of checklist. */ if (!strcmp(NODE(k), (char *)CHAR(STRING_ELT(temp, j)))) checklist[UPTRI(i + 1, k + 1, n)]++; }/*FOR*/ }/*FOR*/ }/*FOR*/ /* if A is a neighbour of B, B is a neighbour of A; therefore each entry in * the checklist array must be equal to either zero (if the corresponding * nodes are not neighbours) or two (if the corresponding nodes are neighbours). * Any other value (typically one) is caused by an incorrect (i.e. asymmetric) * neighbourhood structure. The same logic holds for the markov blankets. */ for (i = 0; i < n; i++) for (j = i; j < n; j++) { if ((checklist[UPTRI(i + 1, j + 1, n)] != 0) && (checklist[UPTRI(i + 1, j + 1, n)] != 2)) { if (*debuglevel > 0) { if (*checkmb) Rprintf("@ asymmetry in the markov blankets for %s and %s.\n", NODE(i), NODE(j)); else Rprintf("@ asymmetry in the neighbourhood sets for %s and %s.\n", NODE(i), NODE(j)); }/*THEN*/ err = 1; }/*THEN*/ }/*FOR*/ /* no need to go on if the (neighbourhood sets|markov blankets) are symmetric; * otherwise throw either an error or a warning according to the value of the * strict parameter. */ if (!err) { return bn; }/*THEN*/ else if (isTRUE(strict)) { if (*checkmb) error("markov blankets are not symmetric.\n"); else error("neighbourhood sets are not symmetric.\n"); }/*THEN*/ else { if (*checkmb) warning("markov blankets are not symmetric.\n"); else warning("neighbourhood sets are not symmetric.\n"); }/*ELSE*/ /* build a correct structure to return. */ PROTECT(fixed = allocVector(VECSXP, n)); setAttrib(fixed, R_NamesSymbol, nodes); if (!(*checkmb)) { /* allocate colnames. */ PROTECT(elnames = allocVector(STRSXP, 2)); SET_STRING_ELT(elnames, 0, mkChar("mb")); SET_STRING_ELT(elnames, 1, mkChar("nbr")); }/*THEN*/ for (i = 0; i < n; i++) { if (!(*checkmb)) { /* allocate the "mb" and "nbr" elements of the node. */ PROTECT(temp = allocVector(VECSXP, 2)); SET_VECTOR_ELT(fixed, i, temp); setAttrib(temp, R_NamesSymbol, elnames); /* copy the "mb" part from the old structure. */ temp2 = getListElement(bn, (char *)NODE(i)); temp2 = getListElement(temp2, "mb"); SET_VECTOR_ELT(temp, 0, temp2); }/*THEN*/ /* fix the neighbourhoods with an AND or an OR filter. * AND means that both neighbours have to see each other * to be neighbours, OR means that one neighbour at least * as to see the other one for both to be neighbours. */ switch(*nbrfilter) { case AND_FILTER: /* rescan the checklist. */ for (j = 0; j < n; j++) if (checklist[UPTRI(i + 1, j + 1, n)] == 2) if (i != j) counter++; /* allocate and fill the "nbr" element. */ PROTECT(temp2 = allocVector(STRSXP, counter)); for (j = 0; j < n; j++) if (checklist[UPTRI(i + 1, j + 1, n)] == 2) if (i != j) SET_STRING_ELT(temp2, --counter, STRING_ELT(nodes, j)); break; case OR_FILTER: /* rescan the checklist. */ for (j = 0; j < n; j++) if (checklist[UPTRI(i + 1, j + 1, n)] >= 1) if (i != j) counter++; /* allocate and fill the "nbr" element. */ PROTECT(temp2 = allocVector(STRSXP, counter)); for (j = 0; j < n; j++) if (checklist[UPTRI(i + 1, j + 1, n)] >= 1) if (i != j) SET_STRING_ELT(temp2, --counter, STRING_ELT(nodes, j)); break; } if (*checkmb) { SET_VECTOR_ELT(fixed, i, temp2); UNPROTECT(1); }/*THEN*/ else { SET_VECTOR_ELT(temp, 1, temp2); UNPROTECT(2); }/*ELSE*/ }/*FOR*/ if (*checkmb) UNPROTECT(1); else UNPROTECT(2); return fixed; }/*BN_RECOVERY*/
/* set the directions of the arcs in a tree given the root node. */ SEXP tree_directions(SEXP arcs, SEXP nodes, SEXP root, SEXP debug) { int i = 0, j = 0, d = 0, traversed = 1; int narcs = length(arcs)/2, nnodes = length(nodes); int *a = NULL, *depth = 0, debuglevel = isTRUE(debug); SEXP try, try2, result; /* match the node labels in the arc set. */ PROTECT(try = match(nodes, arcs, 0)); a = INTEGER(try); /* match the root node. */ PROTECT(try2 = match(nodes, root, 0)); /* allocate and initialize the statust vector. */ depth = Calloc1D(nnodes, sizeof(int)); depth[INT(try2) - 1] = 1; if (debuglevel > 0) Rprintf("> root node (depth 1) is %s.\n", NODE(INT(try2) - 1)); for (d = 1; d <= nnodes; d++) { if (debuglevel > 0) Rprintf("> considering nodes at depth %d.\n", d + 1); for (i = 0; i < narcs; i++) { for (j = 0; j < nnodes; j++) { /* disregard nodes at the wrong depth. */ if (depth[j] != d) continue; if ((a[i + narcs] == (j + 1)) && (depth[a[i] - 1] == 0)) { if (debuglevel > 0) Rprintf(" * found node %s.\n", NODE(a[i] - 1)); /* save the depth at which the node was found. */ depth[a[i] - 1] = d + 1; /* update the counter of the traversed nodes. */ traversed++; }/*THEN*/ }/*FOR*/ }/*FOR*/ /* check whether all nodes have been traversed. */ if (traversed == nnodes) break; }/*FOR*/ /* allocate and initialize the return value. */ PROTECT(result = allocMatrix(STRSXP, narcs/2, 2)); for (i = 0, j = 0; i < narcs; i++) { if (depth[a[i] - 1] < depth[a[i + narcs] - 1]) { SET_STRING_ELT(result, j, STRING_ELT(arcs, i)); SET_STRING_ELT(result, j + narcs/2, STRING_ELT(arcs, i + narcs)); j++; }/*THEN*/ }/*FOR*/ UNPROTECT(3); Free1D(depth); return result; }/*TREE_DIRECTIONS*/
/* Chow-Liu structure learning algorithm. */ SEXP chow_liu(SEXP data, SEXP nodes, SEXP estimator, SEXP whitelist, SEXP blacklist, SEXP conditional, SEXP debug) { int i = 0, j = 0, k = 0, debug_coord[2], ncol = length(data); int num = length(VECTOR_ELT(data, 0)), narcs = 0, nwl = 0, nbl = 0; int *nlevels = NULL, clevels = 0, *est = INTEGER(estimator), *depth = NULL; int *wl = NULL, *bl = NULL, *poset = NULL, debuglevel = isTRUE(debug); void **columns = NULL, *cond = NULL; short int *include = NULL; double *mim = NULL, *means = NULL, *sse = NULL; SEXP arcs, wlist, blist; /* dereference the columns of the data frame. */ DEREFERENCE_DATA_FRAME() /* only TAN uses a conditional variable, so assume it's discrete and go ahead. */ if (conditional != R_NilValue) { cond = (void *) INTEGER(conditional); clevels = NLEVELS(conditional); }/*THEN*/ /* allocate the mutual information matrix and the status vector. */ mim = Calloc1D(UPTRI3_MATRIX(ncol), sizeof(double)); include = Calloc1D(UPTRI3_MATRIX(ncol), sizeof(short int)); /* compute the pairwise mutual information coefficients. */ if (debuglevel > 0) Rprintf("* computing pairwise mutual information coefficients.\n"); mi_matrix(mim, columns, ncol, nlevels, &num, cond, &clevels, means, sse, est); LIST_MUTUAL_INFORMATION_COEFS(); /* add whitelisted arcs first. */ if ((!isNull(whitelist)) && (length(whitelist) > 0)) { PROTECT(wlist = arc_hash(whitelist, nodes, TRUE, TRUE)); wl = INTEGER(wlist); nwl = length(wlist); for (i = 0; i < nwl; i++) { if (debuglevel > 0) { Rprintf("* adding whitelisted arcs first.\n"); if (include[wl[i]] == 0) { Rprintf(" > arc %s - %s has been added to the graph.\n", CHAR(STRING_ELT(whitelist, i)), CHAR(STRING_ELT(whitelist, i + nwl))); }/*THEN*/ else { Rprintf(" > arc %s - %s was already present in the graph.\n", CHAR(STRING_ELT(whitelist, i)), CHAR(STRING_ELT(whitelist, i + nwl))); }/*ELSE*/ }/*THEN*/ /* update the counter if need be. */ if (include[wl[i]] == 0) narcs++; /* include the arc in the graph. */ include[wl[i]] = 1; }/*FOR*/ UNPROTECT(1); }/*THEN*/ /* cache blacklisted arcs. */ if ((!isNull(blacklist)) && (length(blacklist) > 0)) { PROTECT(blist = arc_hash(blacklist, nodes, TRUE, TRUE)); bl = INTEGER(blist); nbl = length(blist); }/*THEN*/ /* sort the mutual information coefficients and keep track of the elements' index. */ poset = Calloc1D(UPTRI3_MATRIX(ncol), sizeof(int)); for (i = 0; i < UPTRI3_MATRIX(ncol); i++) poset[i] = i; R_qsort_I(mim, poset, 1, UPTRI3_MATRIX(ncol)); depth = Calloc1D(ncol, sizeof(int)); for (i = UPTRI3_MATRIX(ncol) - 1; i >= 0; i--) { /* get back the coordinates from the position in the half-matrix. */ INV_UPTRI3(poset[i], ncol, debug_coord); /* already included all the arcs we had to, exiting. */ if (narcs >= ncol - 1) break; /* arc already present in the graph, nothing to do. */ if (include[poset[i]] == 1) continue; if (bl) { if (chow_liu_blacklist(bl, &nbl, poset + i)) { if (debuglevel > 0) { Rprintf("* arc %s - %s is blacklisted, skipping.\n", NODE(debug_coord[0]), NODE(debug_coord[1])); }/*THEN*/ continue; }/*THEN*/ }/*THEN*/ if (c_uptri3_path(include, depth, debug_coord[0], debug_coord[1], ncol, nodes, FALSE)) { if (debuglevel > 0) { Rprintf("* arc %s - %s introduces cycles, skipping.\n", NODE(debug_coord[0]), NODE(debug_coord[1])); }/*THEN*/ continue; }/*THEN*/ if (debuglevel > 0) { Rprintf("* adding arc %s - %s with mutual information %lf.\n", NODE(debug_coord[0]), NODE(debug_coord[1]), mim[i]); }/*THEN*/ /* include the arc in the graph. */ include[poset[i]] = 1; /* update the counter. */ narcs++; }/*FOR*/ if ((!isNull(blacklist)) && (length(blacklist) > 0)) UNPROTECT(1); /* sanity check for blacklist-related madnes. */ if (narcs != ncol - 1) error("learned %d arcs instead of %d, this is not a tree spanning all the nodes.", narcs, ncol - 1); CONVERT_TO_ARC_SET(include, 0, 2 * (ncol - 1)); Free1D(depth); Free1D(mim); Free1D(include); Free1D(poset); Free1D(columns); if (nlevels) Free1D(nlevels); if (means) Free1D(means); if (sse) Free1D(sse); return arcs; }/*CHOW_LIU*/
/* ARACNE structure learning algorithm. */ SEXP aracne(SEXP data, SEXP estimator, SEXP whitelist, SEXP blacklist, SEXP debug) { int i = 0, j = 0, k = 0, coord = 0, ncol = length(data); int num = length(VECTOR_ELT(data, i)), narcs = ncol * (ncol - 1) / 2; int *nlevels = NULL, *est = INTEGER(estimator), *wl = NULL, *bl = NULL; int debuglevel = isTRUE(debug); void **columns = NULL; short int *exclude = NULL; double *mim = NULL, *means = NULL, *sse = NULL; SEXP arcs, nodes, wlist, blist; PROTECT(nodes = getAttrib(data, R_NamesSymbol)); /* dereference the columns of the data frame. */ DEREFERENCE_DATA_FRAME() /* allocate the mutual information matrix and the status vector. */ mim = Calloc1D(UPTRI3_MATRIX(ncol), sizeof(double)); exclude = Calloc1D(UPTRI3_MATRIX(ncol), sizeof(short int)); /* compute the pairwise mutual information coefficients. */ if (debuglevel > 0) Rprintf("* computing pairwise mutual information coefficients.\n"); mi_matrix(mim, columns, ncol, nlevels, &num, NULL, NULL, means, sse, est); LIST_MUTUAL_INFORMATION_COEFS() /* compare all the triplets. */ for (i = 0; i < ncol; i++) { for (j = i + 1; j < ncol; j++) { for (k = 0; k < ncol; k++) { if ((k == i) || (k == j)) continue; /* cache the UPTRI3 coordinates of the arc. */ coord = UPTRI3(i + 1, j + 1, ncol); /* if MI(X, Y) < min(MI(X, Z), MI(Z, Y)) drop arc X - Y. */ if ((mim[coord] < mim[UPTRI3(i + 1, k + 1, ncol)]) && (mim[coord] < mim[UPTRI3(j + 1, k + 1, ncol)])) { if (debuglevel > 0) { Rprintf("* dropping arc %s - %s because of %s, %lf < min(%lf, %lf)\n", NODE(i), NODE(j), NODE(k), mim[UPTRI3(i + 1, j + 1, ncol)], mim[UPTRI3(i + 1, k + 1, ncol)], mim[UPTRI3(j + 1, k + 1, ncol)]); }/*THEN*/ /* update the status vector. */ exclude[coord] = 1; /* decrement the number of arcs. */ narcs--; break; }/*THEN*/ }/*FOR*/ }/*FOR*/ }/*FOR*/ /* add back whitelisted arcs. */ if ((!isNull(whitelist)) && (length(whitelist) > 0)) { PROTECT(wlist = arc_hash(whitelist, nodes, TRUE, TRUE)); wl = INTEGER(wlist); for (i = 0; i < length(wlist); i++) { if (debuglevel > 0) { Rprintf("* adding back whitelisted arcs.\n"); if (exclude[wl[i]] == 1) { Rprintf(" > arc %s - %s has been added to the graph.\n", CHAR(STRING_ELT(whitelist, i)), CHAR(STRING_ELT(whitelist, i + length(wlist)))); }/*THEN*/ else { Rprintf(" > arc %s - %s was already present in the graph.\n", CHAR(STRING_ELT(whitelist, i)), CHAR(STRING_ELT(whitelist, i + length(wlist)))); }/*ELSE*/ }/*THEN*/ /* update the counter if need be. */ if (exclude[wl[i]] == 1) narcs++; /* include the arc in the graph. */ exclude[wl[i]] = 0; }/*FOR*/ UNPROTECT(1); }/*THEN*/ /* remove blacklisted arcs. */ if ((!isNull(blacklist)) && (length(blacklist) > 0)) { PROTECT(blist = arc_hash(blacklist, nodes, TRUE, TRUE)); bl = INTEGER(blist); for (i = 0; i < length(blist); i++) { if (debuglevel > 0) { Rprintf("* removing blacklisted arcs.\n"); if (exclude[bl[i]] == 0) { Rprintf(" > arc %s - %s has been dropped from the graph.\n", CHAR(STRING_ELT(blacklist, i)), CHAR(STRING_ELT(blacklist, i + length(blist)))); }/*THEN*/ else { Rprintf(" > arc %s - %s was not present in the graph.\n", CHAR(STRING_ELT(blacklist, i)), CHAR(STRING_ELT(blacklist, i + length(blist)))); }/*ELSE*/ }/*THEN*/ /* update the counter if need be. */ if (exclude[bl[i]] == 0) narcs--; /* remove the arc from the graph. */ exclude[bl[i]] = 1; }/*FOR*/ UNPROTECT(1); }/*THEN*/ CONVERT_TO_ARC_SET(exclude, 1, 2 * narcs); Free1D(mim); Free1D(exclude); Free1D(columns); if (nlevels) Free1D(nlevels); if (means) Free1D(means); if (sse) Free1D(sse); UNPROTECT(1); return arcs; }/*ARACNE*/
/* compute the number of parameters of a fitted model. */ SEXP nparams_fitted(SEXP bn, SEXP effective, SEXP debug) { int i = 0, j = 0, k = 0, node_params = 0, nnodes = length(bn), *pd = NULL; int res = 0, debuglevel = isTRUE(debug), eff = isTRUE(effective); double *ps = NULL, counter = 0; SEXP nodes = R_NilValue, node_data, param_set, param_dims; if (debuglevel > 0) nodes = getAttrib(bn, R_NamesSymbol); for (i = 0; i < nnodes; i++) { /* get the node's data. */ node_data = VECTOR_ELT(bn, i); /* get its probability distribution (if discrete). */ param_set = getListElement(node_data, "prob"); if (!isNull(param_set)) { /* get the dimensions of the conditional probability table. */ param_dims = getAttrib(param_set, R_DimSymbol); pd = INTEGER(param_dims); ps = REAL(param_set); if (eff) { /* count the number of non-zero free parameters. */ for (node_params = 0, k = 0; k < length(param_set) / pd[0]; k++) { /* check the elements of each conditional probability distribution. */ for (counter = 0, j = 0; j < pd[0]; j++) counter += !ISNAN(ps[CMC(j, k, pd[0])]) && (ps[CMC(j, k, pd[0])] > 0); /* subtract the column total to get the free parameters. */ if (counter > 0) counter--; node_params += counter; }/*FOR*/ }/*THEN*/ else { /* compute the number of parameters. */ for (node_params = 1, j = 1; j < length(param_dims); j++) node_params *= pd[j]; node_params *= pd[0] - 1; }/*ELSE*/ }/*THEN*/ else { /* get the vector (or matrix) of regression coefficients. */ param_set = getListElement(node_data, "coefficients"); ps = REAL(param_set); /* this is a continuous node, so it's a lot easier. */ if (eff) { /* count the number of non-zero regression coefficients. */ for (node_params = 0, j = 0; j < length(param_set); j++) node_params += (ps[j] != 0); }/*THEN*/ else { /* compute the number of parameters. */ node_params = length(param_set); }/*ELSE*/ }/*ELSE*/ if (debuglevel > 0) Rprintf("* node %s has %d parameter(s).\n", NODE(i), node_params); res += node_params; }/*FOR*/ return ScalarInteger(res); }/*NPARAMS_FITTED*/
SEXP hc_to_be_added(SEXP arcs, SEXP blacklist, SEXP whitelist, SEXP nparents, SEXP maxp, SEXP nodes, SEXP convert) { int i = 0, j = 0, narcs = 0, dims = length(nodes); int *a = NULL, *coords = NULL; double *mp = REAL(maxp), *np = NULL; short int referenced = 0; SEXP try, result = R_NilValue, result2; /* transform the arc set into an adjacency matrix, if it's not one already. */ if (isInteger(arcs)) { if ((referenced = MAYBE_REFERENCED(arcs))) PROTECT(result = duplicate(arcs)); }/*THEN*/ else { PROTECT(result = arcs2amat(arcs, nodes)); }/*ELSE*/ /* dereference the adjacency matrix once and for all. */ a = INTEGER(result); /* compute the number the parents of each node, unless provided. */ if (nparents == R_NilValue) { np = Calloc1D(dims, sizeof(double)); for (i = 0; i < dims; i++) for (j = 0; j < dims; j++) np[j] = a[CMC(i, j, dims)]; }/*THEN*/ else { np = REAL(nparents); }/*ELSE*/ /* flip all the nondiagonal cells. */ for (j = 0; j < dims; j++) { for (i = 0; i < dims; i++) { /* diagonal elements are always equal to zero, skip them. */ if (i == j) continue; a[CMC(i, j, dims)] = 1 - a[CMC(i, j, dims)]; }/*FOR*/ }/*FOR*/ /* if an arc is present in the graph in one direction, you cannot add it in * the other direction (it would be a reversal); flip both in the adjacency * matrix. */ for (j = 0; j < dims; j++) for (i = j + 1; i < dims; i++) a[CMC(j, i, dims)] = a[CMC(i, j, dims)] = a[CMC(i, j, dims)] * a[CMC(j, i, dims)]; /* if a node has already reached its maximum number parents, do not add * more arcs pointing to that node. */ for (j = 0; j < dims; j++) if (np[j] >= *mp) memset(a + j * dims, '\0', dims * sizeof(int)); #define FLIP_FROM_LIST(list, value) \ if (!isNull(list)) { \ if (!isInteger(list)) { \ PROTECT(try = match(nodes, list, 0)); \ coords = INTEGER(try); \ narcs = length(try)/2; \ for (i = 0; i < narcs; i++) \ a[CMC(coords[i] - 1, coords[i + narcs] - 1, dims)] = value; \ UNPROTECT(1); \ }/*THEN*/ \ else { \ coords = INTEGER(list); \ for (i = 0; i < dims * dims; i ++) \ if (coords[i] == 1) \ a[i] = value; \ }/*ELSE*/ \ }/*THEN*/ /* now the blacklist gets involved. */ FLIP_FROM_LIST(blacklist, 0); /* and, last but not least, the whitelist gets involved. */ FLIP_FROM_LIST(whitelist, 1); if (nparents == R_NilValue) Free1D(np); /* return either the adjacency matrix or the arc set. */ if (isTRUE(convert)) { PROTECT(result2 = amat2arcs(result, nodes)); if (referenced || !isInteger(arcs)) UNPROTECT(2); else UNPROTECT(1); return result2; }/*THEN*/ else { if (referenced || !isInteger(arcs)) UNPROTECT(1); return result; }/*ELSE*/ }/*HC_TO_BE_ADDED*/
/* return the skeleton of a DAG/PDAG. */ SEXP dag2ug(SEXP bn, SEXP moral, SEXP debug) { int i = 0, j = 0, k = 0, nnodes = 0, narcs = 0, row = 0; int debuglevel = isTRUE(debug), *moralize = LOGICAL(moral); int *nparents = NULL, *nnbr = NULL; SEXP node_data, current, nodes, result, temp; /* get the nodes' data. */ node_data = getListElement(bn, "nodes"); nnodes = length(node_data); PROTECT(nodes = getAttrib(node_data, R_NamesSymbol)); /* allocate and initialize parents' and neighbours' counters. */ nnbr = Calloc1D(nnodes, sizeof(int)); if (*moralize > 0) nparents = Calloc1D(nnodes, sizeof(int)); /* first pass: count neighbours, parents and resulting arcs. */ for (i = 0; i < nnodes; i++) { /* get the number of neighbours. */ current = VECTOR_ELT(node_data, i); nnbr[i] = length(getListElement(current, "nbr")); /* update the number of arcs to be returned. */ if (*moralize > 0) { /* get also the number of parents, needed to account for the arcs added * for their moralization. */ nparents[i] = length(getListElement(current, "parents")); narcs += nnbr[i] + nparents[i] * (nparents[i] - 1); }/*THEN*/ else { narcs += nnbr[i]; }/*ELSE*/ if (debuglevel > 0) { if (*moralize > 0) { Rprintf("* scanning node %s, found %d neighbours and %d parents.\n", NODE(i), nnbr[i], nparents[i]); Rprintf(" > adding %d arcs, for a total of %d.\n", nnbr[i] + nparents[i] * (nparents[i] - 1), narcs); }/*THEN*/ else { Rprintf("* scanning node %s, found %d neighbours.\n", NODE(i), nnbr[i]); Rprintf(" > adding %d arcs, for a total of %d.\n", nnbr[i], narcs); }/*ELSE*/ }/*THEN*/ }/*FOR*/ /* allocate the return value. */ PROTECT(result = allocMatrix(STRSXP, narcs, 2)); /* allocate and set the column names. */ setDimNames(result, R_NilValue, mkStringVec(2, "from", "to")); /* second pass: fill the return value. */ for (i = 0; i < nnodes; i++) { /* get to the current node. */ current = VECTOR_ELT(node_data, i); /* get the neighbours. */ temp = getListElement(current, "nbr"); for (j = 0; j < nnbr[i]; j++) { SET_STRING_ELT(result, CMC(row, 0, narcs), STRING_ELT(nodes, i)); SET_STRING_ELT(result, CMC(row, 1, narcs), STRING_ELT(temp, j)); row++; }/*FOR*/ /* if we are not creating a moral graph we are done with this node. */ if (*moralize == 0) continue; /* get the parents. */ temp = getListElement(current, "parents"); for (j = 0; j < nparents[i]; j++) { for (k = j+1; k < nparents[i]; k++) { SET_STRING_ELT(result, CMC(row, 0, narcs), STRING_ELT(temp, k)); SET_STRING_ELT(result, CMC(row, 1, narcs), STRING_ELT(temp, j)); row++; SET_STRING_ELT(result, CMC(row, 0, narcs), STRING_ELT(temp, j)); SET_STRING_ELT(result, CMC(row, 1, narcs), STRING_ELT(temp, k)); row++; }/*FOR*/ }/*FOR*/ }/*FOR*/ Free1D(nnbr); if (*moralize > 0) { /* be really sure not to return duplicate arcs in moral graphs when * shielded parents are present (the "shielding" nodes are counted * twice). */ result = c_unique_arcs(result, nodes, FALSE); Free1D(nparents); }/*THEN*/ UNPROTECT(2); return result; }/*DAG2UG*/
/* construct a consistent DAG extension of a CPDAG. */ SEXP pdag_extension(SEXP arcs, SEXP nodes, SEXP debug) { int i = 0, j = 0, k = 0, t = 0, nnodes = length(nodes); int changed = 0, left = nnodes; int *a = NULL, *nbr = NULL, debuglevel = isTRUE(debug); short int *matched = NULL; SEXP amat, result; /* build and dereference the adjacency matrix. */ PROTECT(amat = arcs2amat(arcs, nodes)); a = INTEGER(amat); /* allocate and initialize the neighbours and matched vectors. */ nbr = Calloc1D(nnodes, sizeof(int)); matched = Calloc1D(nnodes, sizeof(short int)); for (t = 0; t < nnodes; t++) { if (debuglevel > 0) { Rprintf("----------------------------------------------------------------\n"); Rprintf("> performing pass %d.\n", t + 1); Rprintf("> candidate nodes: "); for (j = 0; j < nnodes; j++) if (matched[j] == 0) Rprintf("%s ", NODE(j)); Rprintf("\n"); }/*THEN*/ for (i = 0; i < nnodes; i++) { /* if the node is already ok, skip it. */ if (matched[i] != 0) continue; /* check whether the node is a sink (that is, whether is does not have * any child). */ is_a_sink(a, i, &k, nnodes, nbr, matched); /* if the node is not a sink move on. */ if (k == -1) { if (debuglevel > 0) Rprintf(" * node %s is not a sink.\n", NODE(i)); continue; }/*THEN*/ else { if (debuglevel > 0) Rprintf(" * node %s is a sink.\n", NODE(i)); }/*ELSE*/ if (!all_adjacent(a, i, k, nnodes, nbr)) { if (debuglevel > 0) Rprintf(" * not all nodes linked to %s by an undirected arc are adjacent.\n", NODE(i)); continue; }/*THEN*/ else { if (debuglevel > 0) { if (k == 0) Rprintf(" * no node is linked to %s by an undirected arc.\n", NODE(i)); else Rprintf(" * all nodes linked to %s by an undirected arc are adjacent.\n", NODE(i)); }/*THEN*/ }/*ELSE*/ /* the current node meets all the conditions, direct all the arcs towards it. */ if (k == 0) { if (debuglevel > 0) Rprintf(" @ no undirected arc to direct towards %s.\n", NODE(i)); }/*THEN*/ else { for (j = 0; j < k; j++) a[CMC(i, nbr[j], nnodes)] = 0; if (debuglevel > 0) Rprintf(" @ directing all incident undirected arcs towards %s.\n", NODE(i)); }/*ELSE*/ /* set the changed flag. */ changed = 1; /* exclude the node from later iterations. */ matched[i] = 1; left--; }/*FOR*/ /* if nothing changed in the last iteration or there are no more candidate * nodes, there is nothing else to do. */ if ((changed == 0) || (left == 0)) break; else changed = 0; }/*FOR*/ /* build the new arc set from the adjacency matrix. */ PROTECT(result = amat2arcs(amat, nodes)); Free1D(nbr); Free1D(matched); UNPROTECT(2); return result; }/*PDAG_EXTENSION*/
SEXP is_pdag_acyclic(SEXP arcs, SEXP nodes, SEXP return_nodes, SEXP directed, SEXP debug) { int i = 0, j = 0, z = 0; int nrows = LENGTH(nodes); int check_status = nrows, check_status_old = nrows; int *rowsums = NULL, *colsums = NULL, *crossprod = NULL, *a = NULL; int *debuglevel = NULL; short int *status = NULL; SEXP amat; /* dereference the debug parameter. */ debuglevel = LOGICAL(debug); /* build the adjacency matrix from the arc set. */ if (*debuglevel > 0) Rprintf("* building the adjacency matrix.\n"); PROTECT(amat = arcs2amat(arcs, nodes)); a = INTEGER(amat); /* should we consider only directed arcs? */ if (isTRUE(directed)) { /* removing undirected arcs, so that only cycles made only of directed * arcs will make the function return TRUE. */ for (i = 0; i < nrows; i++) for (j = 0; j < nrows; j++) if ((a[CMC(i, j, nrows)] == 1) && (a[CMC(j, i, nrows)] == 1)) a[CMC(i, j, nrows)] = a[CMC(j, i, nrows)] = 0; }/*THEN*/ /* initialize the status, {row,col}sums and crossprod arrays. */ status = allocstatus(nrows); rowsums = alloc1dcont(nrows); colsums = alloc1dcont(nrows); crossprod = alloc1dcont(nrows); if (*debuglevel > 0) Rprintf("* checking whether the partially directed graph is acyclic.\n"); /* even in the worst case scenario at least two nodes are marked as * good at each iteration, so even ceil(nrows/2) iterations should be * enough. */ for (z = 0; z < nrows; z++) { start: if (*debuglevel > 0) Rprintf("* beginning iteration %d.\n", z + 1); for (i = 0; i < nrows; i++) { /* skip known-good nodes. */ if (status[i] == GOOD) continue; /* reset and update row and column totals. */ rowsums[i] = colsums[i] = crossprod[i] = 0; /* compute row and column totals for the i-th node. */ for (j = 0; j < nrows; j++) { rowsums[i] += a[CMC(i, j, nrows)]; colsums[i] += a[CMC(j, i, nrows)]; crossprod[i] += a[CMC(i, j, nrows)] * a[CMC(j, i, nrows)]; }/*FOR*/ there: if (*debuglevel > 0) Rprintf(" > checking node %s (%d child(ren), %d parent(s), %d neighbours).\n", NODE(i), rowsums[i], colsums[i], crossprod[i]); /* if either total is zero, the node is either a root node or a * leaf node, and is not part of any cycle. */ if (((rowsums[i] == 0) || (colsums[i] == 0)) || ((crossprod[i] == 1) && (rowsums[i] == 1) && (colsums[i] == 1))) { if (*debuglevel > 0) Rprintf(" @ node %s is cannot be part of a cycle.\n", NODE(i)); /* update the adjacency matrix and the row/column totals. */ for (j = 0; j < nrows; j++) a[CMC(i, j, nrows)] = a[CMC(j, i, nrows)] = 0; rowsums[i] = colsums[i] = crossprod[i] = 0; /* mark the node as good. */ status[i] = GOOD; check_status--; }/*THEN*/ else if (crossprod[i] == 1) { /* find the other of the undirected arc. */ for (j = 0; j < i; j++) if (a[CMC(i, j, nrows)] * a[CMC(j, i, nrows)] == 1) break; /* safety check, just in case. */ if (i == j) continue; if (((colsums[i] == 1) && (colsums[j] == 1)) || ((rowsums[i] == 1) && (rowsums[j] == 1))) { if (*debuglevel > 0) Rprintf(" @ arc %s - %s is cannot be part of a cycle.\n", NODE(i), NODE(j)); /* update the adjacency matrix and the row/column totals. */ a[CMC(i, j, nrows)] = a[CMC(j, i, nrows)] = 0; crossprod[i] = 0; rowsums[i]--; colsums[i]--; rowsums[j]--; colsums[j]--; /* jump back to the first check; if either the row or column total * was equal to 1 only because of the undirected arc, the node can * now be marked as good. */ if ((rowsums[i] == 0) || (colsums[i] == 0)) goto there; }/*THEN*/ }/*THEN*/ }/*FOR*/ /* at least three nodes are needed to have a cycle. */ if (check_status < 3) { if (*debuglevel > 0) Rprintf("@ at least three nodes are needed to have a cycle.\n"); UNPROTECT(1); return build_return_array(nodes, status, nrows, check_status, return_nodes); }/*THEN*/ /* if there are three or more bad nodes and there was no change in * the last iteration, the algorithm is stuck on a cycle. */ if (check_status_old == check_status) { if (*debuglevel > 0) Rprintf("@ no change in the last iteration.\n"); /* give up and call c_has_path() to kill some undirected arcs. */ for (i = 0; i < nrows; i++) for (j = 0; j < i; j++) if (a[CMC(i, j, nrows)] * a[CMC(j, i, nrows)] == 1) { /* remove the arc from the adjacency matrix while testing it, * there's a path is always found (the arc itself). */ a[CMC(i, j, nrows)] = a[CMC(j, i, nrows)] = 0; if(!c_has_path(i, j, INTEGER(amat), nrows, nodes, FALSE, TRUE, FALSE) && !c_has_path(j, i, INTEGER(amat), nrows, nodes, FALSE, TRUE, FALSE)) { if (*debuglevel > 0) Rprintf("@ arc %s - %s is not part of any cycle, removing.\n", NODE(i), NODE(j)); /* increase the iteration counter and start again. */ z++; goto start; }/*THEN*/ else { /* at least one cycle is really present; give up and return. */ UNPROTECT(1); return build_return_array(nodes, status, nrows, check_status, return_nodes); }/*ELSE*/ }/*THEN*/ /* give up if there are no undirected arcs, cycles composed * entirely by directed arcs are never false positives. */ UNPROTECT(1); return build_return_array(nodes, status, nrows, check_status, return_nodes); }/*THEN*/ else { check_status_old = check_status; }/*ELSE*/ }/*FOR*/ UNPROTECT(1); return build_return_array(nodes, status, nrows, check_status, return_nodes); }/*IS_PDAG_ACYCLIC*/
/* remove duplicate arcs from the arc set. */ SEXP unique_arcs(SEXP arcs, SEXP nodes, SEXP warn) { return c_unique_arcs(arcs, nodes, isTRUE(warn)); }/*UNIQUE_ARCS*/
/* predict the values of one or more variables given one or more variables by * maximum a posteriori (MAP). */ SEXP mappred(SEXP node, SEXP fitted, SEXP data, SEXP n, SEXP from, SEXP debug) { int i = 0, j = 0, k = 0, nobs = 0, nev = 0, nlvls = 0; int *vartypes = NULL, nsims = INT(n), debuglevel = isTRUE(debug); void **varptrs = NULL, **evptrs = NULL, *pred = NULL, *res = NULL; SEXP result, colnames, evidence, evmatch, temp = R_NilValue; SEXP cpdist, predicted, lvls = R_NilValue; double *wgt = NULL; long double *lvls_counts = NULL; /* extract the names of the variables in the data. */ colnames = getAttrib(data, R_NamesSymbol); /* remove the name of the variable to predict. */ nev = length(from); PROTECT(evmatch = match(colnames, from, 0)); /* cache variable types and pointers. */ vartypes = alloc1dcont(nev); varptrs = alloc1dpointer(nev); for (j = 0, k = 0; j < nev; j++) { temp = VECTOR_ELT(data, INTEGER(evmatch)[j] - 1); vartypes[k] = TYPEOF(temp); varptrs[k++] = DATAPTR(temp); }/*FOR*/ /* cache the sample size. */ nobs = length(temp); /* allocate a list to hold the evidence. */ PROTECT(evidence = allocVector(VECSXP, nev)); setAttrib(evidence, R_NamesSymbol, from); /* cache pointers to the elements of the evidence .*/ evptrs = alloc1dpointer(nev); for (j = 0; j < nev; j++) { PROTECT(temp = allocVector(vartypes[j], 1)); evptrs[j] = DATAPTR(temp); SET_VECTOR_ELT(evidence, j, temp); UNPROTECT(1); }/*FOR*/ /* make the evidence a data frame to compact debugging output. */ minimal_data_frame(evidence); /* allocate the return value. */ PROTECT(result = fitnode2df(fitted, STRING_ELT(node, 0), nobs)); res = DATAPTR(result); /* in the case of discrete variables, allocate scratch space for levels' * frequencies. */ if (TYPEOF(result) == INTSXP) { lvls = getAttrib(result, R_LevelsSymbol); nlvls = length(lvls); lvls_counts = allocldouble(nlvls); }/*THEN*/ /* allocate the weights. */ wgt = alloc1dreal(nsims); /* allocate sratch space for the random samplings. */ PROTECT(cpdist = fit2df(fitted, nsims)); predicted = getListElement(cpdist, (char *)CHAR(STRING_ELT(node, 0))); pred = DATAPTR(predicted); /* iterate over the observations. */ for (i = 0; i < nobs; i++) { /* copy the values into the list. */ for (j = 0; j < nev; j++) { switch(vartypes[j]) { case REALSXP: *((double *)evptrs[j]) = ((double *)varptrs[j])[i]; break; case INTSXP: *((int *)evptrs[j]) = ((int *)varptrs[j])[i]; break; }/*SWITCH*/ }/*FOR*/ if (debuglevel > 0) { Rprintf("* predicting observation %d conditional on:\n", i); PrintValue(evidence); }/*THEN*/ /* generate samples from the conditional posterior distribution. */ c_rbn_master(fitted, cpdist, n, evidence, FALSE); /* compute the weights. */ c_lw_weights(fitted, cpdist, nsims, wgt, from, FALSE); /* compute the posterior estimate. */ switch(TYPEOF(predicted)) { case REALSXP: /* average the predicted values. */ ((double *)res)[i] = posterior_mean((double *)pred, wgt, nsims, debuglevel); break; case INTSXP: /* pick the most frequent value. */ ((int *)res)[i] = posterior_mode((int *)pred, wgt, nsims, lvls_counts, lvls, nlvls, debuglevel); break; }/*SWITCH*/ }/*FOR*/ UNPROTECT(4); return result; }/*MAPPRED*/
SEXP score_cache_fill(SEXP nodes, SEXP data, SEXP network, SEXP score, SEXP extra, SEXP reference, SEXP equivalence, SEXP decomposability, SEXP updated, SEXP amat, SEXP cache, SEXP blmat, SEXP debug) { int *colsum = NULL, nnodes = length(nodes), lupd = length(updated); int *a = NULL, *upd = NULL, *b = NULL, debuglevel = isTRUE(debug); int i = 0, j = 0, k = 0; double *cache_value = NULL; SEXP arc, delta, op, temp; /* save a pointer to the adjacency matrix, the blacklist and the * updated nodes. */ a = INTEGER(amat); b = INTEGER(blmat); upd = INTEGER(updated); /* if there are no nodes to update, return. */ if (lupd == 0) return cache; /* set up row and column total to check for score equivalence; * zero means no parent nodes. */ if (isTRUE(equivalence)) { colsum = Calloc1D(nnodes, sizeof(int)); for (i = 0; i < nnodes; i++) for (j = 0; j < nnodes; j++) colsum[j] += a[CMC(i, j, nnodes)]; }/*THEN*/ /* allocate and initialize the cache. */ cache_value = REAL(cache); /* allocate a two-slot character vector. */ PROTECT(arc = allocVector(STRSXP, 2)); /* allocate and initialize the fake score delta. */ PROTECT(delta = ScalarReal(0)); /* allocate and initialize the score.delta() operator. */ PROTECT(op = mkString("set")); for (i = 0; i < nnodes; i++) { for (j = 0; j < nnodes; j++) { /* incident nodes must be different from each other. */ if (i == j) continue; /* if only one or two nodes' caches need updating, skip the rest. */ for (k = 0; k < lupd; k++) if (upd[k] == j) goto there; continue; there: /* no need to compute the score delta for blacklisted arcs. */ if (b[CMC(i, j, nnodes)] == 1) continue; /* use score equivalence if possible to check only one orientation. */ if (isTRUE(equivalence)) { /* if the following conditions are met, look up the score delta of * the reverse of the current arc: * 1) that score delta has already been computed. * 2) both incident nodes have no parent, so the arc is really * score equivalent (no v-structures). * 3) the reversed arc has not been blacklisted, as the score delta * is not computed in this case. */ if ((i > j) && (colsum[i] + colsum[j] == 0) && (b[CMC(j, i, nnodes)] == 0)) { cache_value[CMC(i, j, nnodes)] = cache_value[CMC(j, i, nnodes)]; continue; }/*THEN*/ }/*THEN*/ /* save the nodes incident on the arc. */ SET_STRING_ELT(arc, 0, STRING_ELT(nodes, i)); SET_STRING_ELT(arc, 1, STRING_ELT(nodes, j)); /* if the arc is not present in the graph it should be added; * otherwise it should be removed. */ if (a[CMC(i, j, nnodes)] == 0) SET_STRING_ELT(op, 0, mkChar("set")); else SET_STRING_ELT(op, 0, mkChar("drop")); /* checkpoint allocated memory. */ /* evaluate the call to score.delta() for the arc. */ PROTECT(temp = score_delta(arc, network, data, score, delta, reference, op, extra, decomposability)); cache_value[CMC(i, j, nnodes)] = NUM(VECTOR_ELT(temp, 1)); UNPROTECT(1); if (debuglevel > 0) Rprintf("* caching score delta for arc %s -> %s (%lf).\n", CHAR(STRING_ELT(nodes, i)), CHAR(STRING_ELT(nodes, j)), cache_value[CMC(i, j, nnodes)]); }/*FOR*/ }/*FOR*/ UNPROTECT(3); if (isTRUE(equivalence)) Free1D(colsum); return cache; }/*HC_CACHE_FILL*/
/* convert an arc set to a weighted edge list. */ SEXP arcs2welist(SEXP arcs, SEXP nodes, SEXP weights, SEXP nid, SEXP sublist, SEXP parents) { int i = 0, j = 0, k = 0, nnodes = length(nodes), narcs = length(arcs)/2; int *e = NULL, *coords = NULL, *adjacent = NULL; int num_id = isTRUE(nid), sub = isTRUE(sublist), up = isTRUE(parents); double *w = REAL(weights), *ew = NULL; SEXP try, elist, edges, edge_weights, temp, temp_name = R_NilValue; /* allocate the return value. */ PROTECT(elist = allocVector(VECSXP, nnodes)); /* set the node names. */ setAttrib(elist, R_NamesSymbol, nodes); /* allocate and initialize the subset name. */ if (sub > 0) PROTECT(temp_name = mkStringVec(2, "edges", "weight")); /* allocate the scratch space to keep track adjacent nodes. */ adjacent = alloc1dcont(nnodes); /* match the node labels in the arc set. */ PROTECT(try = match(nodes, arcs, 0)); coords = INTEGER(try); for (i = 0; i < narcs; i++) adjacent[coords[i + up * narcs] - 1]++; for (i = 0; i < nnodes; i++) { /* allocate and set up the edge array. */ if (num_id > 0) { PROTECT(edges = allocVector(INTSXP, adjacent[i])); e = INTEGER(edges); }/*THEN*/ else { PROTECT(edges = allocVector(STRSXP, adjacent[i])); }/*ELSE*/ /* allocate and set up the weights array. */ PROTECT(edge_weights = allocVector(REALSXP, adjacent[i])); ew = REAL(edge_weights); /* copy the coordinates or the labels of adjacent nodes. */ for (j = 0, k = 0; j < narcs; j++) { if (coords[j + up * narcs] != i + 1) continue; /* copy the weight as well. */ ew[k] = w[j]; if (num_id > 0) e[k++] = coords[(1 - up) * narcs + j]; else SET_STRING_ELT(edges, k++, STRING_ELT(arcs, (1 - up) * narcs + j)); if (k == adjacent[i]) break; }/*FOR*/ if (sub > 0) { /* allocate and set up the "edge" sublist for graphNEL. */ PROTECT(temp = allocVector(VECSXP, 2)); setAttrib(temp, R_NamesSymbol, temp_name); SET_VECTOR_ELT(temp, 0, edges); SET_VECTOR_ELT(temp, 1, edge_weights); SET_VECTOR_ELT(elist, i, temp); UNPROTECT(1); }/*THEN*/ else { /* save weights with edges as names. */ setAttrib(edge_weights, R_NamesSymbol, edges); SET_VECTOR_ELT(elist, i, edge_weights); }/*ELSE*/ UNPROTECT(2); }/*FOR*/ if (sub > 0) UNPROTECT(3); else UNPROTECT(2); return elist; }/*ARCS2WELIST*/
/* does "arc" match any elements of "set"? */ SEXP is_listed(SEXP arc, SEXP set, SEXP either, SEXP both, SEXP debug) { int i = 0, matched = 0, nrows = length(set) / 2; const char *from = CHAR(STRING_ELT(arc, 0)); const char *to = CHAR(STRING_ELT(arc, 1)); int debuglevel = isTRUE(debug); /* if the arc set is NULL, return immediately. */ if (isNull(set)) return ScalarLogical(FALSE); for (i = 0; i < nrows; i++) { if (debuglevel > 0) Rprintf("* checking %s -> %s\n", ARC(i, 0), ARC(i, 1)); /* check the first element; if it does not match skip to the second one. */ if (!strcmp(from, ARC(i, 0)) ) { /* if the second element matches, return if "both = FALSE" or if * "both = TRUE" and the reversed arc has been already found out. */ if (!strcmp(to, ARC(i, 1)) ) { /* increment the "matched" counter, which is needed to be sure both * A -> B and B -> A are in the arc set when "both = TRUE". */ matched++; if (debuglevel > 0) Rprintf(" > matched %s -> %s (matched is %d).\n", ARC(i, 0), ARC(i, 1), matched); /* return TRUE if one of the following conditions is met: * * 1) exact match (either = both = FALSE). * 2) match regardless of direction (either = TRUE). * 3) match both directions (both = TRUE) when the other * one has already been found (matched = 2). */ if ((!isTRUE(either) && !isTRUE(both)) || isTRUE(either) || ((matched == 2) && isTRUE(both))) goto success; }/*THEN*/ }/*THEN*/ else if (isTRUE(either) || isTRUE(both)) { /* the same as above, but with the reversed arc; this part is * skipped if "both = FALSE" and "either = FALSE", since the * reversed arc should not be matched in that case. */ if (!strcmp(to, ARC(i, 0)) ) { if (!strcmp(from, ARC(i, 1)) ) { /* increment the "matched" counter, which is needed to be sure both * A -> B and B -> A are in the arc set when "both = TRUE". */ matched++; if (debuglevel > 0) Rprintf(" > matched %s -> %s (matched is %d).\n", ARC(i, 0), ARC(i, 1), matched); /* return TRUE if one of the following conditions is met: * * 1) match regardless of direction (either = TRUE). * 2) match both directions (both = TRUE) when the other * one has already been found (matched = 2). */ if (isTRUE(either) || ((matched == 2) && isTRUE(both))) goto success; }/*THEN*/ }/*THEN */ }/*THEN*/ }/*FOR*/ return ScalarLogical(FALSE); success: return ScalarLogical(TRUE); }/*IS_LISTED*/
/* remove one variable in each highly-correlated pair. */ SEXP dedup (SEXP data, SEXP threshold, SEXP complete, SEXP debug) { int i = 0, j = 0, k = 0, dropped = 0, nc = 0; int debuglevel = isTRUE(debug); double *mean = NULL, *sse = NULL, *xx = NULL, *yy = NULL; double cur_mean[2], cur_sse[2]; double tol = MACHINE_TOL, t = NUM(threshold); long double sum = 0; SEXP result, colnames; gdata dt = { 0 }; /* extract the columns from the data frame. */ dt = gdata_from_SEXP(data, 0); meta_init_flags(&(dt.m), 0, complete, R_NilValue); meta_copy_names(&(dt.m), 0, data); /* set up the vectors for the pairwise complete observations. */ xx = Calloc1D(dt.m.nobs, sizeof(double)); yy = Calloc1D(dt.m.nobs, sizeof(double)); if (debuglevel > 0) Rprintf("* caching means and variances.\n"); mean = Calloc1D(dt.m.ncols, sizeof(double)); sse = Calloc1D(dt.m.ncols, sizeof(double)); /* cache the mean and variance of complete variables. */ for (j = 0; j < dt.m.ncols; j++) { if (!dt.m.flag[j].complete) continue; mean[j] = c_mean(dt.col[j], dt.m.nobs); sse[j] = c_sse(dt.col[j], mean[j], dt.m.nobs); }/*FOR*/ /* main loop. */ for (j = 0; j < dt.m.ncols - 1; j++) { /* skip variables already flagged for removal. */ if (dt.m.flag[j].drop) continue; if (debuglevel > 0) Rprintf("* looking at %s with %d variables still to check.\n", dt.m.names[j], dt.m.ncols - (j + 1)); for (k = j + 1; k < dt.m.ncols; k++) { /* skip variables already flagged for removal. */ if (dt.m.flag[k].drop) continue; if (dt.m.flag[j].complete && dt.m.flag[k].complete) { /* use the cached means and variances. */ cur_mean[0] = mean[j]; cur_mean[1] = mean[k]; cur_sse[0] = sse[j]; cur_sse[1] = sse[k]; /* compute the covariance. */ for (i = 0, sum = 0; i < dt.m.nobs; i++) sum += (dt.col[j][i] - cur_mean[0]) * (dt.col[k][i] - cur_mean[1]); }/*THEN*/ else { for (i = 0, nc = 0; i < dt.m.nobs; i++) { if (ISNAN(dt.col[j][i]) || ISNAN(dt.col[k][i])) continue; xx[nc] = dt.col[j][i]; yy[nc++] = dt.col[k][i]; }/*FOR*/ /* if there are no complete observations, take the variables to be * independent. */ if (nc == 0) continue; cur_mean[0] = c_mean(xx, nc); cur_mean[1] = c_mean(yy, nc); cur_sse[0] = c_sse(xx, cur_mean[0], nc); cur_sse[1] = c_sse(yy, cur_mean[1], nc); /* compute the covariance. */ for (i = 0, sum = 0; i < nc; i++) sum += (xx[i] - cur_mean[0]) * (yy[i] - cur_mean[1]); }/*ELSE*/ /* safety check against "divide by zero" errors. */ if ((cur_sse[0] < tol) || (cur_sse[1] < tol)) sum = 0; else sum /= sqrt(cur_sse[0] * cur_sse[1]); /* test the correlation against the threshold. */ if (fabsl(sum) > t) { if (debuglevel > 0) Rprintf("%s is collinear with %s, dropping %s with COR = %.4Lf\n", dt.m.names[j], dt.m.names[k], dt.m.names[k], sum); /* flag the variable for removal. */ dt.m.flag[k].drop = TRUE; dropped++; }/*THEN*/ }/*FOR*/ }/*FOR*/ /* set up the return value. */ PROTECT(result = allocVector(VECSXP, dt.m.ncols - dropped)); PROTECT(colnames = allocVector(STRSXP, dt.m.ncols - dropped)); for (j = 0, k = 0; j < dt.m.ncols; j++) if (!dt.m.flag[j].drop) { SET_STRING_ELT(colnames, k, mkChar(dt.m.names[j])); SET_VECTOR_ELT(result, k++, VECTOR_ELT(data, j)); }/*THEN*/ setAttrib(result, R_NamesSymbol, colnames); /* make it a data frame. */ minimal_data_frame(result); Free1D(mean); Free1D(sse); Free1D(xx); Free1D(yy); FreeGDT(dt, FALSE); UNPROTECT(2); return result; }/*DEDUP*/
/* a single step of the optimized hill climbing (one arc addition/removal/reversal). */ SEXP hc_opt_step(SEXP amat, SEXP nodes, SEXP added, SEXP cache, SEXP reference, SEXP wlmat, SEXP blmat, SEXP nparents, SEXP maxp, SEXP debug) { int nnodes = length(nodes), i = 0, j = 0; int *am = NULL, *ad = NULL, *w = NULL, *b = NULL, debuglevel = isTRUE(debug); int counter = 0, update = 1, from = 0, to = 0, *path = NULL, *scratch = NULL; double *cache_value = NULL, temp = 0, max = 0, tol = MACHINE_TOL; double *mp = REAL(maxp), *np = REAL(nparents); SEXP bestop; /* allocate and initialize the return value (use FALSE as a canary value). */ PROTECT(bestop = allocVector(VECSXP, 3)); setAttrib(bestop, R_NamesSymbol, mkStringVec(3, "op", "from", "to")); /* allocate and initialize a dummy FALSE object. */ SET_VECTOR_ELT(bestop, 0, ScalarLogical(FALSE)); /* allocate buffers for c_has_path(). */ path = Calloc1D(nnodes, sizeof(int)); scratch = Calloc1D(nnodes, sizeof(int)); /* save pointers to the numeric/integer matrices. */ cache_value = REAL(cache); ad = INTEGER(added); am = INTEGER(amat); w = INTEGER(wlmat); b = INTEGER(blmat); if (debuglevel > 0) { /* count how may arcs are to be tested. */ for (i = 0; i < nnodes * nnodes; i++) counter += ad[i]; Rprintf("----------------------------------------------------------------\n"); Rprintf("* trying to add one of %d arcs.\n", counter); }/*THEN*/ for (i = 0; i < nnodes; i++) { for (j = 0; j < nnodes; j++) { /* nothing to see, move along. */ if (ad[CMC(i, j, nnodes)] == 0) continue; /* retrieve the score delta from the cache. */ temp = cache_value[CMC(i, j, nnodes)]; if (debuglevel > 0) { Rprintf(" > trying to add %s -> %s.\n", NODE(i), NODE(j)); Rprintf(" > delta between scores for nodes %s %s is %lf.\n", NODE(i), NODE(j), temp); }/*THEN*/ /* this score delta is the best one at the moment, so add the arc if it * does not introduce cycles in the graph. */ if (temp - max > tol) { if (c_has_path(j, i, am, nnodes, nodes, FALSE, FALSE, path, scratch, FALSE)) { if (debuglevel > 0) Rprintf(" > not adding, introduces cycles in the graph.\n"); continue; }/*THEN*/ if (debuglevel > 0) Rprintf(" @ adding %s -> %s.\n", NODE(i), NODE(j)); /* update the return value. */ bestop_update(bestop, "set", NODE(i), NODE(j)); /* store the node indices to update the reference scores. */ from = i; to = j; /* update the threshold score delta. */ max = temp; }/*THEN*/ }/*FOR*/ }/*FOR*/ if (debuglevel > 0) { /* count how may arcs are to be tested. */ for (i = 0, counter = 0; i < nnodes * nnodes; i++) counter += am[i] * (1 - w[i]); Rprintf("----------------------------------------------------------------\n"); Rprintf("* trying to remove one of %d arcs.\n", counter); }/*THEN*/ for (i = 0; i < nnodes; i++) { for (j = 0; j < nnodes; j++) { /* nothing to see, move along. */ if (am[CMC(i, j, nnodes)] == 0) continue; /* whitelisted arcs are not to be removed, ever. */ if (w[CMC(i, j, nnodes)] == 1) continue; /* retrieve the score delta from the cache. */ temp = cache_value[CMC(i, j, nnodes)]; if (debuglevel > 0) { Rprintf(" > trying to remove %s -> %s.\n", NODE(i), NODE(j)); Rprintf(" > delta between scores for nodes %s %s is %lf.\n", NODE(i), NODE(j), temp); }/*THEN*/ if (temp - max > tol) { if (debuglevel > 0) Rprintf(" @ removing %s -> %s.\n", NODE(i), NODE(j)); /* update the return value. */ bestop_update(bestop, "drop", NODE(i), NODE(j)); /* store the node indices to update the reference scores. */ from = i; to = j; /* update the threshold score delta. */ max = temp; }/*THEN*/ }/*FOR*/ }/*FOR*/ if (debuglevel > 0) { /* count how may arcs are to be tested. */ for (i = 0, counter = 0; i < nnodes; i++) for (j = 0; j < nnodes; j++) counter += am[CMC(i, j, nnodes)] * (1 - b[CMC(j, i, nnodes)]); Rprintf("----------------------------------------------------------------\n"); Rprintf("* trying to reverse one of %d arcs.\n", counter); }/*THEN*/ for (i = 0; i < nnodes; i++) { for (j = 0; j < nnodes; j++) { /* nothing to see, move along. */ if (am[CMC(i, j, nnodes)] == 0) continue; /* don't reverse an arc if the one in the opposite direction is * blacklisted, ever. */ if (b[CMC(j, i, nnodes)] == 1) continue; /* do not reverse an arc if that means violating the limit on the * maximum number of parents. */ if (np[i] >= *mp) continue; /* retrieve the score delta from the cache. */ temp = cache_value[CMC(i, j, nnodes)] + cache_value[CMC(j, i, nnodes)]; /* nuke small values and negative zeroes. */ if (fabs(temp) < tol) temp = 0; if (debuglevel > 0) { Rprintf(" > trying to reverse %s -> %s.\n", NODE(i), NODE(j)); Rprintf(" > delta between scores for nodes %s %s is %lf.\n", NODE(i), NODE(j), temp); }/*THEN*/ if (temp - max > tol) { if (c_has_path(i, j, am, nnodes, nodes, FALSE, TRUE, path, scratch, FALSE)) { if (debuglevel > 0) Rprintf(" > not reversing, introduces cycles in the graph.\n"); continue; }/*THEN*/ if (debuglevel > 0) Rprintf(" @ reversing %s -> %s.\n", NODE(i), NODE(j)); /* update the return value. */ bestop_update(bestop, "reverse", NODE(i), NODE(j)); /* store the node indices to update the reference scores. */ from = i; to = j; /* both nodes' reference scores must be updated. */ update = 2; /* update the threshold score delta. */ max = temp; }/*THEN*/ }/*FOR*/ }/*FOR*/ /* update the reference scores. */ REAL(reference)[to] += cache_value[CMC(from, to, nnodes)]; if (update == 2) REAL(reference)[from] += cache_value[CMC(to, from, nnodes)]; Free1D(path); Free1D(scratch); UNPROTECT(1); return bestop; }/*HC_OPT_STEP*/
/* find out the partial ordering of the nodes of a DAG. */ SEXP topological_ordering(SEXP bn, SEXP root_nodes, SEXP reverse, SEXP debug) { int *depth = NULL, *matched = NULL, debuglevel = isTRUE(debug); int d = 0, i = 0, j = 0, changed = 0, nnodes = 0; char *direction = NULL; SEXP nodes_data, nodes, try, children, ordering; if (isTRUE(reverse)) direction = "parents"; else direction = "children"; /* get to the nodes' data in both 'bn' and 'bn.fit' objects. */ nodes_data = getListElement(bn, "nodes"); if (isNull(nodes_data)) nodes_data = bn; /* get and count the node labels. */ PROTECT(nodes = getAttrib(nodes_data, R_NamesSymbol)); nnodes = length(nodes); /* allocate a status vector to trak the ordering of the nodes. */ PROTECT(ordering = allocVector(INTSXP, nnodes)); depth = INTEGER(ordering); memset(depth, '\0', nnodes * sizeof(int)); if (debuglevel > 0) Rprintf("* currently at depth 1 (starting BFS).\n"); /* set the root nodes as the starting point of the BFS. */ PROTECT(try = match(nodes, root_nodes, 0)); matched = INTEGER(try); for (i = 0; i < length(try); i++) { if (debuglevel > 0) Rprintf(" > got node %s.\n", NODE(matched[i] - 1)); depth[matched[i] - 1] = 1; }/*FOR*/ UNPROTECT(1); /* now let's go down from the roots to the leafs, one layer at a time. */ for (d = 1; d <= nnodes; d++) { if (debuglevel > 0) Rprintf("* currently at depth %d.\n", d + 1); /* reset the changed flag. */ changed = 0; for (i = 0; i < nnodes; i++) { /* this node has already been visisted, skip. */ if (depth[i] < d) continue; children = getListElement(VECTOR_ELT(nodes_data, i), direction); /* this node is a leaf, nothing to do, move along. */ if (length(children) == 0) continue; /* set the changed flag. */ changed = 1; PROTECT(try = match(nodes, children, 0)); matched = INTEGER(try); /* set the correct depth to the children of this node. */ for (j = 0; j < length(try); j++) { if (debuglevel > 0) Rprintf(" > got node %s from %s.\n", NODE(matched[j] - 1), NODE(i)); depth[matched[j] - 1] = d + 1; }/*FOR*/ UNPROTECT(1); }/*FOR*/ /* all nodes have been visited, break. */ if (!changed) break; }/*FOR*/ if (debuglevel > 0) Rprintf("* all nodes have been scheduled.\n"); /* add the node labels to the return value. */ setAttrib(ordering, R_NamesSymbol, nodes); UNPROTECT(2); return ordering; }/*TOPOLOGICAL_ORDERING*/
double castelo_prior(SEXP beta, SEXP target, SEXP parents, SEXP children, int debuglevel) { int i = 0, k = 0, t = 0, nnodes = 0, cur_arc = 0; int nbeta = length(VECTOR_ELT(beta, 0)); int *temp = NULL, *aid = INTEGER(VECTOR_ELT(beta, 2)); double prior = 0, result = 0; double *bkwd = REAL(VECTOR_ELT(beta, 4)), *fwd = REAL(VECTOR_ELT(beta, 3)); short int *adjacent = NULL; SEXP nodes, try; /* get the node labels. */ nodes = getAttrib(beta, BN_NodesSymbol); nnodes = length(nodes); /* match the target node. */ PROTECT(try = match(nodes, target, 0)); t = INT(try); UNPROTECT(1); /* find out which nodes are parents and which nodes are children. */ adjacent = allocstatus(nnodes); PROTECT(try = match(nodes, parents, 0)); temp = INTEGER(try); for (i = 0; i < length(try); i++) adjacent[temp[i] - 1] = PARENT; UNPROTECT(1); PROTECT(try = match(nodes, children, 0)); temp = INTEGER(try); for (i = 0; i < length(try); i++) adjacent[temp[i] - 1] = CHILD; UNPROTECT(1); /* prior probabilities table lookup. */ for (i = t + 1; i <= nnodes; i++) { /* compute the arc id. */ cur_arc = UPTRI3(t, i, nnodes); /* look up the prior probability. */ for (/*k,*/ prior = ((double)1/3); k < nbeta; k++) { /* arcs are ordered, so we can stop early in the lookup. */ if (aid[k] > cur_arc) break; if (aid[k] == cur_arc) { switch(adjacent[i - 1]) { case PARENT: prior = bkwd[k]; break; case CHILD: prior = fwd[k]; break; default: prior = fmax2(0, 1 - bkwd[k] - fwd[k]); }/*SWITCH*/ break; }/*THEN*/ }/*FOR*/ if (debuglevel > 0) { switch(adjacent[i - 1]) { case PARENT: Rprintf(" > found arc %s -> %s, prior pobability is %lf.\n", NODE(i - 1), NODE(t - 1), prior); break; case CHILD: Rprintf(" > found arc %s -> %s, prior probability is %lf.\n", NODE(t - 1), NODE(i - 1), prior); break; default: Rprintf(" > no arc between %s and %s, prior probability is %lf.\n", NODE(t - 1), NODE(i - 1), prior); }/*SWITCH*/ }/*THEN*/ /* move to log-scale and divide by the non-informative log(1/3), so that * the contribution of each arc whose prior has not been specified by the * user is zero; overflow is likely otherwise. */ result += log(prior / ((double)1/3)); }/*FOR*/ return result; }/*CASTELO_PRIOR*/ /* complete a prior as per Castelo & Siebes. */ SEXP castelo_completion(SEXP prior, SEXP nodes, SEXP learning) { int i = 0, k = 0, cur = 0, narcs1 = 0, narcs2 = 0, nnodes = length(nodes); int *m1 = NULL, *m2 = NULL, *und = NULL, *aid = NULL, *poset = NULL, *id = NULL; double *d1 = NULL, *d2 = NULL, *p = NULL; SEXP df, arc_id, undirected, a1, a2, match1, match2, prob; SEXP result, from, to, nid, dir1, dir2; /* compute numeric IDs for the arcs. */ a1 = VECTOR_ELT(prior, 0); a2 = VECTOR_ELT(prior, 1); narcs1 = length(a1); PROTECT(match1 = match(nodes, a1, 0)); PROTECT(match2 = match(nodes, a2, 0)); m1 = INTEGER(match1); m2 = INTEGER(match2); PROTECT(arc_id = allocVector(INTSXP, narcs1)); aid = INTEGER(arc_id); c_arc_hash(narcs1, nnodes, m1, m2, aid, NULL, FALSE); /* duplicates correspond to undirected arcs. */ PROTECT(undirected = dupe(arc_id)); und = INTEGER(undirected); /* extract the components from the prior. */ prob = VECTOR_ELT(prior, 2); p = REAL(prob); /* count output arcs. */ for (i = 0; i < narcs1; i++) narcs2 += 2 - und[i]; narcs2 /= 2; /* allocate the columns of the return value. */ PROTECT(from = allocVector(STRSXP, narcs2)); PROTECT(to = allocVector(STRSXP, narcs2)); PROTECT(nid = allocVector(INTSXP, narcs2)); id = INTEGER(nid); PROTECT(dir1 = allocVector(REALSXP, narcs2)); d1 = REAL(dir1); PROTECT(dir2 = allocVector(REALSXP, narcs2)); d2 = REAL(dir2); /* sort the strength coefficients. */ poset = alloc1dcont(narcs1); for (k = 0; k < narcs1; k++) poset[k] = k; R_qsort_int_I(aid, poset, 1, narcs1); for (i = 0, k = 0; i < narcs1; i++) { cur = poset[i]; #define ASSIGN(A1, A2, D1, D2) \ SET_STRING_ELT(from, k, STRING_ELT(A1, cur)); \ SET_STRING_ELT(to, k, STRING_ELT(A2, cur)); \ id[k] = aid[i]; \ D1[k] = p[cur]; \ if ((und[cur] == TRUE) && (i < narcs1 - 1)) \ D2[k] = p[poset[++i]]; \ else \ D2[k] = (1 - D1[k])/2; /* copy the node labels. */ if (m1[cur] < m2[cur]) { ASSIGN(a1, a2, d1, d2); }/*THEN*/ else { ASSIGN(a2, a1, d2, d1); }/*ELSE*/ /* check the probabilities do not exceed 1; fail only for large errors. */ if (d1[k] + d2[k] > 1) { if (d1[k] + d2[k] < 1 + 2 * MACHINE_TOL) { d1[k] = d1[k] / (d1[k] + d2[k]); d2[k] = d2[k] / (d1[k] + d2[k]); }/*THEN*/ else { UNPROTECT(9); error("the probabilities for arc %s -> %s sum to %lf.", CHAR(STRING_ELT(from, k)), CHAR(STRING_ELT(to, k)), d1[k] + d2[k]); }/*ELSE*/ }/*THEN*/ /* bound the probability of not including an arc away from zero, structure * learning otherwise fails when starting from the empty graph and gets * stuck very easily in general. */ if (isTRUE(learning) && (fabs(1 - d1[k] - d2[k]) < MACHINE_TOL)) { d1[k] = d1[k] - MACHINE_TOL; d2[k] = d2[k] - MACHINE_TOL; }/*THEN*/ /* move to the next arc. */ k++; }/*FOR*/ /* set up the return value. */ PROTECT(result = allocVector(VECSXP, 5)); SET_VECTOR_ELT(result, 0, from); SET_VECTOR_ELT(result, 1, to); SET_VECTOR_ELT(result, 2, nid); SET_VECTOR_ELT(result, 3, dir1); SET_VECTOR_ELT(result, 4, dir2); setAttrib(result, R_NamesSymbol, mkStringVec(5, "from", "to", "aid", "fwd", "bkwd")); PROTECT(df = minimal_data_frame(result)); UNPROTECT(11); return df; }/*CASTELO_COMPLETION*/