/* determine whether a graph is DAG or a PDAG/UG. */ SEXP is_dag(SEXP arcs, SEXP nnodes) { int i = 0, nrows = length(arcs)/2, n = INT(nnodes); int *a = INTEGER(arcs); short int *checklist = NULL; /* allocate and initialize the checklist. */ checklist = allocstatus(UPTRI_MATRIX(n)); for (i = 0; i < nrows; i++) { if (checklist[UPTRI(a[CMC(i, 0, nrows)], a[CMC(i, 1, nrows)], n)] == 0) { /* this arc is not present in the checklist; add it. */ checklist[UPTRI(a[CMC(i, 0, nrows)], a[CMC(i, 1, nrows)], n)] = 1; }/*THEN*/ else { /* this arc or its opposite already present in the checklist; the graph * has at least an undirected arc, so return FALSE. */ return ScalarLogical(FALSE); }/*THEN*/ }/*FOR*/ return ScalarLogical(TRUE); }/*IS_DAG*/
/* get the Markov blanket of a node from a fitted model. */ SEXP fitted_mb(SEXP bn, SEXP target) { int i = 0, j = 0, target_node = 0, nnodes = 0, counter = 0; int *matched = NULL; short int *status = NULL; SEXP mb, labels, try, temp, node_data; /* get the nodes' labels. */ labels = getAttrib(bn, R_NamesSymbol); nnodes = LENGTH(labels); /* allocate and initialize a status vector. */ status = allocstatus(nnodes); /* match the label of the target node. */ PROTECT(try = match(labels, target, 0)); target_node = INT(try); UNPROTECT(1); /* mark the target node as such. */ status[target_node - 1] = TARGET; /* match the parents and the children of the target node. */ node_data = VECTOR_ELT(bn, target_node - 1); MATCH_NODES("parents", PARENT); MATCH_NODES("children", CHILD); /* now match the parents of each child. */ for (j = 0; j < nnodes; j++) { /* this is not a child, go on. */ if (status[j] != CHILD) continue; node_data = VECTOR_ELT(bn, j); MATCH_NODES("parents", BLANKET); }/*FOR*/ /* a node is not considered part of its own Markov blanket. */ status[target_node - 1] = 0; /* allocate and initialize the result. */ PROTECT(mb = allocVector(STRSXP, counter)); for (i = 0, j = 0; i < nnodes; i++) if (status[i] != 0) SET_STRING_ELT(mb, j++, STRING_ELT(labels, i)); UNPROTECT(1); return mb; }/*FITTED_MB*/
SEXP is_pdag_acyclic(SEXP arcs, SEXP nodes, SEXP return_nodes, SEXP directed, SEXP debug) { int i = 0, j = 0, z = 0; int nrows = LENGTH(nodes); int check_status = nrows, check_status_old = nrows; int *rowsums = NULL, *colsums = NULL, *crossprod = NULL, *a = NULL; int *debuglevel = NULL; short int *status = NULL; SEXP amat; /* dereference the debug parameter. */ debuglevel = LOGICAL(debug); /* build the adjacency matrix from the arc set. */ if (*debuglevel > 0) Rprintf("* building the adjacency matrix.\n"); PROTECT(amat = arcs2amat(arcs, nodes)); a = INTEGER(amat); /* should we consider only directed arcs? */ if (isTRUE(directed)) { /* removing undirected arcs, so that only cycles made only of directed * arcs will make the function return TRUE. */ for (i = 0; i < nrows; i++) for (j = 0; j < nrows; j++) if ((a[CMC(i, j, nrows)] == 1) && (a[CMC(j, i, nrows)] == 1)) a[CMC(i, j, nrows)] = a[CMC(j, i, nrows)] = 0; }/*THEN*/ /* initialize the status, {row,col}sums and crossprod arrays. */ status = allocstatus(nrows); rowsums = alloc1dcont(nrows); colsums = alloc1dcont(nrows); crossprod = alloc1dcont(nrows); if (*debuglevel > 0) Rprintf("* checking whether the partially directed graph is acyclic.\n"); /* even in the worst case scenario at least two nodes are marked as * good at each iteration, so even ceil(nrows/2) iterations should be * enough. */ for (z = 0; z < nrows; z++) { start: if (*debuglevel > 0) Rprintf("* beginning iteration %d.\n", z + 1); for (i = 0; i < nrows; i++) { /* skip known-good nodes. */ if (status[i] == GOOD) continue; /* reset and update row and column totals. */ rowsums[i] = colsums[i] = crossprod[i] = 0; /* compute row and column totals for the i-th node. */ for (j = 0; j < nrows; j++) { rowsums[i] += a[CMC(i, j, nrows)]; colsums[i] += a[CMC(j, i, nrows)]; crossprod[i] += a[CMC(i, j, nrows)] * a[CMC(j, i, nrows)]; }/*FOR*/ there: if (*debuglevel > 0) Rprintf(" > checking node %s (%d child(ren), %d parent(s), %d neighbours).\n", NODE(i), rowsums[i], colsums[i], crossprod[i]); /* if either total is zero, the node is either a root node or a * leaf node, and is not part of any cycle. */ if (((rowsums[i] == 0) || (colsums[i] == 0)) || ((crossprod[i] == 1) && (rowsums[i] == 1) && (colsums[i] == 1))) { if (*debuglevel > 0) Rprintf(" @ node %s is cannot be part of a cycle.\n", NODE(i)); /* update the adjacency matrix and the row/column totals. */ for (j = 0; j < nrows; j++) a[CMC(i, j, nrows)] = a[CMC(j, i, nrows)] = 0; rowsums[i] = colsums[i] = crossprod[i] = 0; /* mark the node as good. */ status[i] = GOOD; check_status--; }/*THEN*/ else if (crossprod[i] == 1) { /* find the other of the undirected arc. */ for (j = 0; j < i; j++) if (a[CMC(i, j, nrows)] * a[CMC(j, i, nrows)] == 1) break; /* safety check, just in case. */ if (i == j) continue; if (((colsums[i] == 1) && (colsums[j] == 1)) || ((rowsums[i] == 1) && (rowsums[j] == 1))) { if (*debuglevel > 0) Rprintf(" @ arc %s - %s is cannot be part of a cycle.\n", NODE(i), NODE(j)); /* update the adjacency matrix and the row/column totals. */ a[CMC(i, j, nrows)] = a[CMC(j, i, nrows)] = 0; crossprod[i] = 0; rowsums[i]--; colsums[i]--; rowsums[j]--; colsums[j]--; /* jump back to the first check; if either the row or column total * was equal to 1 only because of the undirected arc, the node can * now be marked as good. */ if ((rowsums[i] == 0) || (colsums[i] == 0)) goto there; }/*THEN*/ }/*THEN*/ }/*FOR*/ /* at least three nodes are needed to have a cycle. */ if (check_status < 3) { if (*debuglevel > 0) Rprintf("@ at least three nodes are needed to have a cycle.\n"); UNPROTECT(1); return build_return_array(nodes, status, nrows, check_status, return_nodes); }/*THEN*/ /* if there are three or more bad nodes and there was no change in * the last iteration, the algorithm is stuck on a cycle. */ if (check_status_old == check_status) { if (*debuglevel > 0) Rprintf("@ no change in the last iteration.\n"); /* give up and call c_has_path() to kill some undirected arcs. */ for (i = 0; i < nrows; i++) for (j = 0; j < i; j++) if (a[CMC(i, j, nrows)] * a[CMC(j, i, nrows)] == 1) { /* remove the arc from the adjacency matrix while testing it, * there's a path is always found (the arc itself). */ a[CMC(i, j, nrows)] = a[CMC(j, i, nrows)] = 0; if(!c_has_path(i, j, INTEGER(amat), nrows, nodes, FALSE, TRUE, FALSE) && !c_has_path(j, i, INTEGER(amat), nrows, nodes, FALSE, TRUE, FALSE)) { if (*debuglevel > 0) Rprintf("@ arc %s - %s is not part of any cycle, removing.\n", NODE(i), NODE(j)); /* increase the iteration counter and start again. */ z++; goto start; }/*THEN*/ else { /* at least one cycle is really present; give up and return. */ UNPROTECT(1); return build_return_array(nodes, status, nrows, check_status, return_nodes); }/*ELSE*/ }/*THEN*/ /* give up if there are no undirected arcs, cycles composed * entirely by directed arcs are never false positives. */ UNPROTECT(1); return build_return_array(nodes, status, nrows, check_status, return_nodes); }/*THEN*/ else { check_status_old = check_status; }/*ELSE*/ }/*FOR*/ UNPROTECT(1); return build_return_array(nodes, status, nrows, check_status, return_nodes); }/*IS_PDAG_ACYCLIC*/
/* get root or leaf nodes of the graph. */ SEXP root_nodes(SEXP bn, SEXP leaves) { short int *status = NULL; int *get_leaves = INTEGER(leaves); int i = 0, k = 0, counter = 0; SEXP temp, temp2, nodes, node_data, labels, result; /* get to the nodes' data. */ nodes = getListElement(bn, "nodes"); /* this is for "bn.fit" objects. */ if (isNull(nodes)) nodes = bn; /* get the nodes' labels. */ labels = getAttrib(nodes, R_NamesSymbol); /* allocate and initialize a status vector. */ status = allocstatus(LENGTH(nodes)); for (i = 0; i < LENGTH(nodes); i++) { /* get the parents/children of this node. */ node_data = VECTOR_ELT(nodes, i); if (*get_leaves == FALSE) temp = getListElement(node_data, "parents"); else temp = getListElement(node_data, "children"); /* this is not a root/leaf node, go on. */ if (LENGTH(temp) != 0) continue; /* this takes care of dubious neighbours in "bn" objects. */ temp = getListElement(node_data, "nbr"); if (!isNull(temp)) { if (*get_leaves == FALSE) temp2 = getListElement(node_data, "children"); else temp2 = getListElement(node_data, "parents"); /* in partially directed graphs not all neighbours can be classified as * parents or children due to undirected arcs; return only nodes which * are not incident on any undirected arc. */ if (LENGTH(temp) != LENGTH(temp2)) continue; }/*THEN*/ /* this is a root/leaf node, all right. */ status[i] = 1; /* increase the counter. */ counter++; }/*FOR*/ /* allocate and initialize the result. */ PROTECT(result = allocVector(STRSXP, counter)); for (i = 0; i < LENGTH(nodes); i++) if (status[i] == 1) SET_STRING_ELT(result, k++, STRING_ELT(labels, i)); UNPROTECT(1); return result; }/*ROOT_NODES*/
double castelo_prior(SEXP beta, SEXP target, SEXP parents, SEXP children, SEXP debug) { int i = 0, k = 0, t = 0, nnodes = 0, cur_arc = 0; int nbeta = LENGTH(VECTOR_ELT(beta, 0)); int *temp = NULL, *debuglevel = LOGICAL(debug), *aid = INTEGER(VECTOR_ELT(beta, 2)); double prior = 0, result = 0; double *bkwd = REAL(VECTOR_ELT(beta, 4)), *fwd = REAL(VECTOR_ELT(beta, 3)); short int *adjacent = NULL; SEXP nodes, try; /* get the node labels. */ nodes = getAttrib(beta, install("nodes")); nnodes = LENGTH(nodes); /* match the target node. */ PROTECT(try = match(nodes, target, 0)); t = INT(try); UNPROTECT(1); /* find out which nodes are parents and which nodes are children. */ adjacent = allocstatus(nnodes); PROTECT(try = match(nodes, parents, 0)); temp = INTEGER(try); for (i = 0; i < LENGTH(try); i++) adjacent[temp[i] - 1] = PARENT; UNPROTECT(1); PROTECT(try = match(nodes, children, 0)); temp = INTEGER(try); for (i = 0; i < LENGTH(try); i++) adjacent[temp[i] - 1] = CHILD; UNPROTECT(1); /* prior probabilities table lookup. */ for (i = t + 1; i <= nnodes; i++) { /* compute the arc id. */ cur_arc = UPTRI3(t, i, nnodes); /* look up the prior probability. */ for (/*k,*/ prior = ((double)1/3); k < nbeta; k++) { /* arcs are ordered, so we can stop early in the lookup. */ if (aid[k] > cur_arc) break; if (aid[k] == cur_arc) { switch(adjacent[i - 1]) { case PARENT: prior = bkwd[k]; break; case CHILD: prior = fwd[k]; break; default: prior = 1 - bkwd[k] - fwd[k]; }/*SWITCH*/ break; }/*THEN*/ }/*FOR*/ if (*debuglevel > 0) { switch(adjacent[i - 1]) { case PARENT: Rprintf(" > found arc %s -> %s, prior pobability is %lf.\n", NODE(i - 1), NODE(t - 1), prior); break; case CHILD: Rprintf(" > found arc %s -> %s, prior probability is %lf.\n", NODE(t - 1), NODE(i - 1), prior); break; default: Rprintf(" > no arc between %s and %s, prior probability is %lf.\n", NODE(t - 1), NODE(i - 1), prior); }/*SWITCH*/ }/*THEN*/ /* move to log-scale and divide by the non-informative log(1/3), so that * the contribution of each arc whose prior has not been not specified by * the user is zero; overflow is likely otherwise. */ result += log(prior / ((double)1/3)); }/*FOR*/ return result; }/*CASTELO_PRIOR*/ /* complete a prior as per Castelo & Siebes. */ SEXP castelo_completion(SEXP prior, SEXP nodes) { int i = 0, k = 0, cur = 0, narcs1 = 0, narcs2 = 0, nnodes = LENGTH(nodes); int *m1 = NULL, *m2 = NULL, *und = NULL, *aid = NULL, *poset = NULL, *id = NULL; double *d1 = NULL, *d2 = NULL, *p = NULL; SEXP df, arc_id, undirected, a1, a2, match1, match2, prob; SEXP result, colnames, from, to, nid, dir1, dir2; /* compute numeric IDs for the arcs. */ a1 = VECTOR_ELT(prior, 0); a2 = VECTOR_ELT(prior, 1); narcs1 = LENGTH(a1); PROTECT(match1 = match(nodes, a1, 0)); PROTECT(match2 = match(nodes, a2, 0)); m1 = INTEGER(match1); m2 = INTEGER(match2); PROTECT(arc_id = allocVector(INTSXP, narcs1)); aid = INTEGER(arc_id); c_arc_hash(&narcs1, &nnodes, m1, m2, aid, NULL, TRUE); /* duplicates correspond to undirected arcs. */ PROTECT(undirected = dupe(arc_id)); und = INTEGER(undirected); /* extract the components from the prior. */ prob = VECTOR_ELT(prior, 2); p = REAL(prob); /* count output arcs. */ for (i = 0; i < narcs1; i++) narcs2 += 2 - und[i]; narcs2 /= 2; /* allocate the columns of the return value. */ PROTECT(from = allocVector(STRSXP, narcs2)); PROTECT(to = allocVector(STRSXP, narcs2)); PROTECT(nid = allocVector(INTSXP, narcs2)); id = INTEGER(nid); PROTECT(dir1 = allocVector(REALSXP, narcs2)); d1 = REAL(dir1); PROTECT(dir2 = allocVector(REALSXP, narcs2)); d2 = REAL(dir2); /* sort the strength coefficients. */ poset = alloc1dcont(narcs1); for (k = 0; k < narcs1; k++) poset[k] = k; R_qsort_int_I(aid, poset, 1, narcs1); for (i = 0, k = 0; i < narcs1; i++) { cur = poset[i]; #define ASSIGN(A1, A2, D1, D2) \ SET_STRING_ELT(from, k, STRING_ELT(A1, cur)); \ SET_STRING_ELT(to, k, STRING_ELT(A2, cur)); \ id[k] = aid[i]; \ D1[k] = p[cur]; \ if ((und[cur] == TRUE) && (i < narcs1 - 1)) \ D2[k] = p[poset[++i]]; \ else \ D2[k] = (1 - D1[k])/2; /* copy the node labels. */ if (m1[cur] < m2[cur]) { ASSIGN(a1, a2, d1, d2); }/*THEN*/ else { ASSIGN(a2, a1, d2, d1); }/*ELSE*/ if (d1[k] + d2[k] > 1) { UNPROTECT(9); error("the probabilities for arc %s -> %s sum to %lf.", CHAR(STRING_ELT(from, k)), CHAR(STRING_ELT(to, k)), d1[k] + d2[k]); }/*THEN*/ /* move to the next arc. */ k++; }/*FOR*/ /* set up the return value. */ PROTECT(result = allocVector(VECSXP, 5)); SET_VECTOR_ELT(result, 0, from); SET_VECTOR_ELT(result, 1, to); SET_VECTOR_ELT(result, 2, nid); SET_VECTOR_ELT(result, 3, dir1); SET_VECTOR_ELT(result, 4, dir2); PROTECT(colnames = allocVector(STRSXP, 5)); SET_STRING_ELT(colnames, 0, mkChar("from")); SET_STRING_ELT(colnames, 1, mkChar("to")); SET_STRING_ELT(colnames, 2, mkChar("aid")); SET_STRING_ELT(colnames, 3, mkChar("fwd")); SET_STRING_ELT(colnames, 4, mkChar("bkwd")); setAttrib(result, R_NamesSymbol, colnames); PROTECT(df = minimal_data_frame(result)); UNPROTECT(12); return df; }/*CASTELO_COMPLETION*/
/* Chow-Liu structure learning algorithm. */ SEXP chow_liu(SEXP data, SEXP nodes, SEXP estimator, SEXP whitelist, SEXP blacklist, SEXP conditional, SEXP debug) { int i = 0, j = 0, k = 0, debug_coord[2], ncols = LENGTH(data); int num = LENGTH(VECTOR_ELT(data, i)), narcs = 0, nwl = 0, nbl = 0; int *nlevels = NULL, *clevels = NULL, *est = INTEGER(estimator); int *wl = NULL, *bl = NULL, *poset = NULL, *debuglevel = LOGICAL(debug); void **columns = NULL, *cond = NULL; short int *include = NULL; double *mim = NULL; SEXP arcs, wlist, blist; /* dereference the columns of the data frame. */ DEREFERENCE_DATA_FRAME() /* only TAN uses a conditional variable, so assume it's discrete and go ahead. */ if (conditional != R_NilValue) { cond = (void *) INTEGER(conditional); clevels = alloc1dcont(1); *clevels = NLEVELS(conditional); }/*THEN*/ /* allocate the mutual information matrix and the status vector. */ mim = alloc1dreal(UPTRI3_MATRIX(ncols)); include = allocstatus(UPTRI3_MATRIX(ncols)); /* compute the pairwise mutual information coefficients. */ if (*debuglevel > 0) Rprintf("* computing pairwise mutual information coefficients.\n"); mi_matrix(mim, columns, ncols, nlevels, &num, cond, clevels, est); LIST_MUTUAL_INFORMATION_COEFS() /* add whitelisted arcs first. */ if ((!isNull(whitelist)) && (LENGTH(whitelist) > 0)) { PROTECT(wlist = arc_hash(whitelist, nodes, TRUE, TRUE)); wl = INTEGER(wlist); nwl = LENGTH(wlist); for (i = 0; i < nwl; i++) { if (*debuglevel > 0) { Rprintf("* adding whitelisted arcs first.\n"); if (include[wl[i]] == 0) { Rprintf(" > arc %s - %s has been added to the graph.\n", CHAR(STRING_ELT(whitelist, i)), CHAR(STRING_ELT(whitelist, i + nwl))); }/*THEN*/ else { Rprintf(" > arc %s - %s was already present in the graph.\n", CHAR(STRING_ELT(whitelist, i)), CHAR(STRING_ELT(whitelist, i + nwl))); }/*ELSE*/ }/*THEN*/ /* update the counter if need be. */ if (include[wl[i]] == 0) narcs++; /* include the arc in the graph. */ include[wl[i]] = 1; }/*FOR*/ UNPROTECT(1); }/*THEN*/ /* cache blacklisted arcs. */ if ((!isNull(blacklist)) && (LENGTH(blacklist) > 0)) { PROTECT(blist = arc_hash(blacklist, nodes, TRUE, TRUE)); bl = INTEGER(blist); nbl = LENGTH(blist); }/*THEN*/ /* sort the mutual information coefficients and keep track of the elements' index. */ poset = alloc1dcont(UPTRI3_MATRIX(ncols)); for (i = 0; i < UPTRI3_MATRIX(ncols); i++) poset[i] = i; R_qsort_I(mim, poset, 1, UPTRI3_MATRIX(ncols)); for (i = UPTRI3_MATRIX(ncols) - 1; i > 0; i--) { /* get back the coordinates from the position in the half-matrix. */ INV_UPTRI3(poset[i], ncols, debug_coord); /* already included all the arcs we had to, exiting. */ if (narcs >= ncols - 1) break; /* arc already present in the graph, nothing to do. */ if (include[poset[i]] == 1) continue; if (bl) { if (chow_liu_blacklist(bl, &nbl, poset + i)) { if (*debuglevel > 0) { Rprintf("* arc %s - %s is blacklisted, skipping.\n", NODE(debug_coord[0]), NODE(debug_coord[1])); }/*THEN*/ continue; }/*THEN*/ }/*THEN*/ if (c_uptri3_path(include, debug_coord[0], debug_coord[1], ncols, nodes, FALSE)) { if (*debuglevel > 0) { Rprintf("* arc %s - %s introduces cycles, skipping.\n", NODE(debug_coord[0]), NODE(debug_coord[1])); }/*THEN*/ continue; }/*THEN*/ if (*debuglevel > 0) { Rprintf("* adding arc %s - %s with mutual information %lf.\n", NODE(debug_coord[0]), NODE(debug_coord[1]), mim[i]); }/*THEN*/ /* include the arc in the graph. */ include[poset[i]] = 1; /* update the counter. */ narcs++; }/*FOR*/ if ((!isNull(blacklist)) && (LENGTH(blacklist) > 0)) UNPROTECT(1); /* sanity check for blacklist-related madnes. */ if (narcs != ncols - 1) error("learned %d arcs instead of %d, this is not a tree spanning all the nodes.", narcs, ncols - 1); CONVERT_TO_ARC_SET(include, 0, 2 * (ncols - 1)); return arcs; }/*CHOW_LIU*/
/* ARACNE structure learning algorithm. */ SEXP aracne(SEXP data, SEXP estimator, SEXP whitelist, SEXP blacklist, SEXP debug) { int i = 0, j = 0, k = 0, coord = 0, ncols = LENGTH(data); int num = LENGTH(VECTOR_ELT(data, i)), narcs = ncols * (ncols - 1) / 2; int *nlevels = NULL, *est = INTEGER(estimator), *wl = NULL, *bl = NULL; int *debuglevel = LOGICAL(debug); void **columns = NULL; short int *exclude = NULL; double *mim = NULL; SEXP arcs, nodes, wlist, blist; nodes = getAttrib(data, R_NamesSymbol); /* dereference the columns of the data frame. */ DEREFERENCE_DATA_FRAME() /* allocate the mutual information matrix and the status vector. */ mim = alloc1dreal(UPTRI3_MATRIX(ncols)); exclude = allocstatus(UPTRI3_MATRIX(ncols)); /* compute the pairwise mutual information coefficients. */ if (*debuglevel > 0) Rprintf("* computing pairwise mutual information coefficients.\n"); mi_matrix(mim, columns, ncols, nlevels, &num, NULL, NULL, est); LIST_MUTUAL_INFORMATION_COEFS() /* compare all the triplets. */ for (i = 0; i < ncols; i++) { for (j = i + 1; j < ncols; j++) { for (k = 0; k < ncols; k++) { if ((k == i) || (k == j)) continue; /* cache the UPTRI3 coordinates of the arc. */ coord = UPTRI3(i + 1, j + 1, ncols); /* if MI(X, Y) < min(MI(X, Z), MI(Z, Y)) drop arc X - Y. */ if ((mim[coord] < mim[UPTRI3(i + 1, k + 1, ncols)]) && (mim[coord] < mim[UPTRI3(j + 1, k + 1, ncols)])) { if (*debuglevel > 0) { Rprintf("* dropping arc %s - %s because of %s, %lf < min(%lf, %lf)\n", NODE(i), NODE(j), NODE(k), mim[UPTRI3(i + 1, j + 1, ncols)], mim[UPTRI3(i + 1, k + 1, ncols)], mim[UPTRI3(j + 1, k + 1, ncols)]); }/*THEN*/ /* update the status vector. */ exclude[coord] = 1; /* decrement the number of arcs. */ narcs--; break; }/*THEN*/ }/*FOR*/ }/*FOR*/ }/*FOR*/ /* add back whitelisted arcs. */ if ((!isNull(whitelist)) && (LENGTH(whitelist) > 0)) { PROTECT(wlist = arc_hash(whitelist, nodes, TRUE, TRUE)); wl = INTEGER(wlist); for (i = 0; i < LENGTH(wlist); i++) { if (*debuglevel > 0) { Rprintf("* adding back whitelisted arcs.\n"); if (exclude[wl[i]] == 1) { Rprintf(" > arc %s - %s has been added to the graph.\n", CHAR(STRING_ELT(whitelist, i)), CHAR(STRING_ELT(whitelist, i + LENGTH(wlist)))); }/*THEN*/ else { Rprintf(" > arc %s - %s was already present in the graph.\n", CHAR(STRING_ELT(whitelist, i)), CHAR(STRING_ELT(whitelist, i + LENGTH(wlist)))); }/*ELSE*/ }/*THEN*/ /* update the counter if need be. */ if (exclude[wl[i]] == 1) narcs++; /* include the arc in the graph. */ exclude[wl[i]] = 0; }/*FOR*/ UNPROTECT(1); }/*THEN*/ /* remove blacklisted arcs. */ if ((!isNull(blacklist)) && (LENGTH(blacklist) > 0)) { PROTECT(blist = arc_hash(blacklist, nodes, TRUE, TRUE)); bl = INTEGER(blist); for (i = 0; i < LENGTH(blist); i++) { if (*debuglevel > 0) { Rprintf("* removing blacklisted arcs.\n"); if (exclude[bl[i]] == 0) { Rprintf(" > arc %s - %s has been dropped from the graph.\n", CHAR(STRING_ELT(blacklist, i)), CHAR(STRING_ELT(blacklist, i + LENGTH(blist)))); }/*THEN*/ else { Rprintf(" > arc %s - %s was not present in the graph.\n", CHAR(STRING_ELT(blacklist, i)), CHAR(STRING_ELT(blacklist, i + LENGTH(blist)))); }/*ELSE*/ }/*THEN*/ /* update the counter if need be. */ if (exclude[bl[i]] == 0) narcs--; /* remove the arc from the graph. */ exclude[bl[i]] = 1; }/*FOR*/ UNPROTECT(1); }/*THEN*/ CONVERT_TO_ARC_SET(exclude, 1, 2 * narcs); return arcs; }/*ARACNE*/
SEXP is_dag_acyclic(SEXP arcs, SEXP nodes, SEXP return_nodes, SEXP debug) { SEXP amat; int i = 0, j = 0, z = 0; int nrows = LENGTH(nodes); int check_status = nrows; int check_status_old = nrows; int rowsums, colsums; int *a = NULL, *debuglevel = NULL; short int *status = NULL; /* dereference the debug parameter. */ debuglevel = LOGICAL(debug); /* build the adjacency matrix from the arc set. */ if (*debuglevel > 0) Rprintf("* building the adjacency matrix.\n"); PROTECT(amat = arcs2amat(arcs, nodes)); a = INTEGER(amat); /* allocate and initialize the status array. */ status = allocstatus(nrows); if (*debuglevel > 0) Rprintf("* checking whether the directed graph is acyclic.\n"); /* even in the worst case scenario at least two nodes are marked as * good at each iteration, so even ceil(nrows/2) iterations should be * enough. */ for (z = 0; z < nrows; z++) { if (*debuglevel > 0) Rprintf("* beginning iteration %d.\n", z + 1); for (i = 0; i < nrows; i++) { /* skip known-good nodes. */ if (status[i] == GOOD) continue; /* reset and update row and column totals. */ rowsums = colsums = 0; /* compute row and column totals for the i-th node. */ for (j = 0; j < nrows; j++) { rowsums += a[CMC(i, j, nrows)]; colsums += a[CMC(j, i, nrows)]; }/*FOR*/ if (*debuglevel > 0) Rprintf(" > checking node %s (%d child(ren), %d parent(s)).\n", NODE(i), rowsums, colsums); /* if either total is zero, the node is either a root node or a * leaf node, and is not part of any cycle. */ if ((rowsums == 0) || (colsums == 0)) { if (*debuglevel > 0) Rprintf(" @ node %s is cannot be part of a cycle.\n", NODE(i)); for (j = 0; j < nrows; j++) a[CMC(i, j, nrows)] = a[CMC(j, i, nrows)] = 0; /* mark the node as good. */ status[i] = GOOD; check_status--; }/*THEN*/ }/*FOR*/ /* at least three nodes are needed to have a cycle. */ if (check_status < 3) { if (*debuglevel > 0) Rprintf("@ at least three nodes are needed to have a cycle.\n"); UNPROTECT(1); return build_return_array(nodes, status, nrows, check_status, return_nodes); }/*THEN*/ /* if there are three or more bad nodes and there was no change in * the last iteration, the algorithm is stuck on a cycle. */ if (check_status_old == check_status) { if (*debuglevel > 0) Rprintf("@ no change in the last iteration.\n"); UNPROTECT(1); return build_return_array(nodes, status, nrows, check_status, return_nodes); }/*THEN*/ else check_status_old = check_status; }/*FOR*/ UNPROTECT(1); return build_return_array(nodes, status, nrows, check_status, return_nodes); }/*IS_DAG_ACYCLIC*/
/* check neighbourhood sets and markov blankets for consistency.. */ SEXP bn_recovery(SEXP bn, SEXP strict, SEXP mb, SEXP debug, SEXP filter) { int i = 0, j = 0, k = 0, n = 0, counter = 0; short int *checklist = NULL, err = 0; int *debuglevel = NULL, *checkmb = NULL, *nbrfilter = NULL; SEXP temp, temp2, nodes, elnames = NULL, fixed; /* get the names of the nodes. */ nodes = getAttrib(bn, R_NamesSymbol); n = LENGTH(nodes); /* allocate and initialize the checklist. */ checklist = allocstatus(UPTRI_MATRIX(n)); /* dereference the debug, mb and filter parameters. */ debuglevel = LOGICAL(debug); checkmb = LOGICAL(mb); nbrfilter = INTEGER(filter); if (*debuglevel > 0) { Rprintf("----------------------------------------------------------------\n"); if (*checkmb) Rprintf("* checking consistency of markov blankets.\n"); else Rprintf("* checking consistency of neighbourhood sets.\n"); }/*THEN*/ /* scan the structure to determine the number of arcs. */ for (i = 0; i < n; i++) { if (*debuglevel > 0) Rprintf(" > checking node %s.\n", NODE(i)); /* get the entry for the (neighbours|elements of the markov blanket) of the node.*/ temp = getListElement(bn, (char *)NODE(i)); if (!(*checkmb)) temp = getListElement(temp, "nbr"); /* check each element of the array and identify which variable it corresponds to. */ for (j = 0; j < LENGTH(temp); j++) { for (k = 0; k < n; k++) { /* increment the right element of checklist. */ if (!strcmp(NODE(k), (char *)CHAR(STRING_ELT(temp, j)))) checklist[UPTRI(i + 1, k + 1, n)]++; }/*FOR*/ }/*FOR*/ }/*FOR*/ /* if A is a neighbour of B, B is a neighbour of A; therefore each entry in * the checklist array must be equal to either zero (if the corresponding * nodes are not neighbours) or two (if the corresponding nodes are neighbours). * Any other value (typically one) is caused by an incorrect (i.e. asymmetric) * neighbourhood structure. The same logic holds for the markov blankets. */ for (i = 0; i < n; i++) for (j = i; j < n; j++) { if ((checklist[UPTRI(i + 1, j + 1, n)] != 0) && (checklist[UPTRI(i + 1, j + 1, n)] != 2)) { if (*debuglevel > 0) { if (*checkmb) Rprintf("@ asymmetry in the markov blankets for %s and %s.\n", NODE(i), NODE(j)); else Rprintf("@ asymmetry in the neighbourhood sets for %s and %s.\n", NODE(i), NODE(j)); }/*THEN*/ err = 1; }/*THEN*/ }/*FOR*/ /* no need to go on if the (neighbourhood sets|markov blankets) are symmetric; * otherwise throw either an error or a warning according to the value of the * strict parameter. */ if (!err) { return bn; }/*THEN*/ else if (isTRUE(strict)) { if (*checkmb) error("markov blankets are not symmetric.\n"); else error("neighbourhood sets are not symmetric.\n"); }/*THEN*/ else { if (*checkmb) warning("markov blankets are not symmetric.\n"); else warning("neighbourhood sets are not symmetric.\n"); }/*ELSE*/ /* build a correct structure to return. */ PROTECT(fixed = allocVector(VECSXP, n)); setAttrib(fixed, R_NamesSymbol, nodes); if (!(*checkmb)) { /* allocate colnames. */ PROTECT(elnames = allocVector(STRSXP, 2)); SET_STRING_ELT(elnames, 0, mkChar("mb")); SET_STRING_ELT(elnames, 1, mkChar("nbr")); }/*THEN*/ for (i = 0; i < n; i++) { if (!(*checkmb)) { /* allocate the "mb" and "nbr" elements of the node. */ PROTECT(temp = allocVector(VECSXP, 2)); SET_VECTOR_ELT(fixed, i, temp); setAttrib(temp, R_NamesSymbol, elnames); /* copy the "mb" part from the old structure. */ temp2 = getListElement(bn, (char *)NODE(i)); temp2 = getListElement(temp2, "mb"); SET_VECTOR_ELT(temp, 0, temp2); }/*THEN*/ /* fix the neighbourhoods with an AND or an OR filter. * AND means that both neighbours have to see each other * to be neighbours, OR means that one neighbour at least * as to see the other one for both to be neighbours. */ switch(*nbrfilter) { case AND_FILTER: /* rescan the checklist. */ for (j = 0; j < n; j++) if (checklist[UPTRI(i + 1, j + 1, n)] == 2) if (i != j) counter++; /* allocate and fill the "nbr" element. */ PROTECT(temp2 = allocVector(STRSXP, counter)); for (j = 0; j < n; j++) if (checklist[UPTRI(i + 1, j + 1, n)] == 2) if (i != j) SET_STRING_ELT(temp2, --counter, STRING_ELT(nodes, j)); break; case OR_FILTER: /* rescan the checklist. */ for (j = 0; j < n; j++) if (checklist[UPTRI(i + 1, j + 1, n)] >= 1) if (i != j) counter++; /* allocate and fill the "nbr" element. */ PROTECT(temp2 = allocVector(STRSXP, counter)); for (j = 0; j < n; j++) if (checklist[UPTRI(i + 1, j + 1, n)] >= 1) if (i != j) SET_STRING_ELT(temp2, --counter, STRING_ELT(nodes, j)); break; } if (*checkmb) { SET_VECTOR_ELT(fixed, i, temp2); UNPROTECT(1); }/*THEN*/ else { SET_VECTOR_ELT(temp, 1, temp2); UNPROTECT(2); }/*ELSE*/ }/*FOR*/ if (*checkmb) UNPROTECT(1); else UNPROTECT(2); return fixed; }/*BN_RECOVERY*/