value arc_hash_insert(arc *c, value hash, value key, value val) { unsigned int hv, index, i; value e; index = 0; /* First of all, look for the key if a binding already exists for it */ e = hash_lookup(c, hash, key, &index); if (BOUND_P(e)) { /* if we are already bound, overwrite the old value */ e = VINDEX(HASH_TABLE(hash), index); BVALUE(e) = val; return(val); } /* Not yet bound. Look for a slot where we can put it */ if (HASH_NENTRIES(hash)+1 > HASH_LLIMIT(hash)) hashtable_expand(c, hash); SET_NENTRIES(hash, HASH_NENTRIES(hash)+1); hv = arc_hash(c, key); index = hv & TABLEMASK(hash); for (i=0;; i++) { e = VINDEX(HASH_TABLE(hash), index); /* If we see an empty bucket in our search, or if we see a bucket whose key is the same as the key specified, we have found the place where the element should go. This second case should never happen, based on what we did above, but hey, belt and suspenders. */ if (EMPTYP(e) || arc_is2(c, BKEY(e), key) == CTRUE) break; /* We found a bucket, but it is occupied by some other key. Continue probing. */ index = (index + PROBE(i)) & TABLEMASK(hash); } if (EMPTYP(e)) { /* No such key in the hash table yet. Create a bucket and assign it to the table. */ e = mkhashbucket(c, key, val, index, hash, INT2FIX(hv)); SVINDEX(HASH_TABLE(hash), index, e); } else { /* The key already exists. Use the current bucket but change the value to the value specified. */ BVALUE(e) = val; } return(val); }
static value hash_lookup(arc *c, value hash, value key, unsigned int *index) { unsigned int hv, i; value e; hv = arc_hash(c, key); *index = hv & TABLEMASK(hash); for (i=0;; i++) { *index = (*index + PROBE(i)) & TABLEMASK(hash); e = HASH_INDEX(hash, *index); /* CUNBOUND means there was never any element at that index, so we can stop. */ if (e == CUNBOUND) return(CUNBOUND); /* CUNDEF means that there was an element at that index, but it was deleted at some point, so we may need to continue probing. */ if (e == CUNDEF) continue; if (arc_is2(c, BKEY(e), key) == CTRUE) return(BVALUE(e)); } return(CUNBOUND); }
/* backend for the all.equal() function for bn objects. */ SEXP all_equal(SEXP target, SEXP current) { int nnodes = 0, narcs = 0; int *t = NULL, *c = NULL; SEXP tnodes, cnodes, cmatch, tarcs, carcs, thash, chash; /* get the node set of each network. */ tnodes = getAttrib(getListElement(target, "nodes"), R_NamesSymbol); cnodes = getAttrib(getListElement(current, "nodes"), R_NamesSymbol); /* first check: node sets must have the same size. */ if (length(tnodes) != length(cnodes)) return mkString("Different number of nodes"); /* store for future use. */ nnodes = length(tnodes); /* second check: node sets must contain the same node labels. */ PROTECT(cmatch = match(tnodes, cnodes, 0)); c = INTEGER(cmatch); /* sorting takes care of different node orderings. */ R_isort(c, nnodes); /* check that every node in the first network is also present in the * second one; this is enough because the node sets have the same size * and the nodes in each set are guaranteed to be unique. */ for (int i = 0; i < nnodes; i++) { if (c[i] != i + 1) { UNPROTECT(1); return mkString("Different node sets"); }/*THEN*/ }/*FOR*/ UNPROTECT(1); /* get the node set of each network. */ tarcs = getListElement(target, "arcs"); carcs = getListElement(current, "arcs"); /* third check: arc sets must have the same size. */ if (length(tarcs) != length(carcs)) return mkString("Different number of directed/undirected arcs"); /* store for future use. */ narcs = length(tarcs)/2; /* fourth check: arcs sets must contain the same arcs. */ if (narcs > 0) { /* compute the numeric hashes of both arc sets (against the same * node set to make comparisons meaningful) and sort them. */ PROTECT(thash = arc_hash(tarcs, tnodes, FALSE, TRUE)); PROTECT(chash = arc_hash(carcs, tnodes, FALSE, TRUE)); /* dereference the resulting integer vectors. */ t = INTEGER(thash); c = INTEGER(chash); /* sorting takes care of different arc orderings. */ R_isort(t, narcs); R_isort(c, narcs); /* compare the integer vectors as generic memory areas. */ if (memcmp(t, c, narcs * sizeof(int))) { UNPROTECT(2); return mkString("Different arc sets"); }/*THEN*/ UNPROTECT(2); }/*THEN*/ /* all checks completed successfully, returning TRUE. */ return ScalarLogical(TRUE); }/*ALL_EQUAL*/
/* Chow-Liu structure learning algorithm. */ SEXP chow_liu(SEXP data, SEXP nodes, SEXP estimator, SEXP whitelist, SEXP blacklist, SEXP conditional, SEXP debug) { int i = 0, j = 0, k = 0, debug_coord[2], ncol = length(data); int num = length(VECTOR_ELT(data, 0)), narcs = 0, nwl = 0, nbl = 0; int *nlevels = NULL, clevels = 0, *est = INTEGER(estimator), *depth = NULL; int *wl = NULL, *bl = NULL, *poset = NULL, debuglevel = isTRUE(debug); void **columns = NULL, *cond = NULL; short int *include = NULL; double *mim = NULL, *means = NULL, *sse = NULL; SEXP arcs, wlist, blist; /* dereference the columns of the data frame. */ DEREFERENCE_DATA_FRAME() /* only TAN uses a conditional variable, so assume it's discrete and go ahead. */ if (conditional != R_NilValue) { cond = (void *) INTEGER(conditional); clevels = NLEVELS(conditional); }/*THEN*/ /* allocate the mutual information matrix and the status vector. */ mim = Calloc1D(UPTRI3_MATRIX(ncol), sizeof(double)); include = Calloc1D(UPTRI3_MATRIX(ncol), sizeof(short int)); /* compute the pairwise mutual information coefficients. */ if (debuglevel > 0) Rprintf("* computing pairwise mutual information coefficients.\n"); mi_matrix(mim, columns, ncol, nlevels, &num, cond, &clevels, means, sse, est); LIST_MUTUAL_INFORMATION_COEFS(); /* add whitelisted arcs first. */ if ((!isNull(whitelist)) && (length(whitelist) > 0)) { PROTECT(wlist = arc_hash(whitelist, nodes, TRUE, TRUE)); wl = INTEGER(wlist); nwl = length(wlist); for (i = 0; i < nwl; i++) { if (debuglevel > 0) { Rprintf("* adding whitelisted arcs first.\n"); if (include[wl[i]] == 0) { Rprintf(" > arc %s - %s has been added to the graph.\n", CHAR(STRING_ELT(whitelist, i)), CHAR(STRING_ELT(whitelist, i + nwl))); }/*THEN*/ else { Rprintf(" > arc %s - %s was already present in the graph.\n", CHAR(STRING_ELT(whitelist, i)), CHAR(STRING_ELT(whitelist, i + nwl))); }/*ELSE*/ }/*THEN*/ /* update the counter if need be. */ if (include[wl[i]] == 0) narcs++; /* include the arc in the graph. */ include[wl[i]] = 1; }/*FOR*/ UNPROTECT(1); }/*THEN*/ /* cache blacklisted arcs. */ if ((!isNull(blacklist)) && (length(blacklist) > 0)) { PROTECT(blist = arc_hash(blacklist, nodes, TRUE, TRUE)); bl = INTEGER(blist); nbl = length(blist); }/*THEN*/ /* sort the mutual information coefficients and keep track of the elements' index. */ poset = Calloc1D(UPTRI3_MATRIX(ncol), sizeof(int)); for (i = 0; i < UPTRI3_MATRIX(ncol); i++) poset[i] = i; R_qsort_I(mim, poset, 1, UPTRI3_MATRIX(ncol)); depth = Calloc1D(ncol, sizeof(int)); for (i = UPTRI3_MATRIX(ncol) - 1; i >= 0; i--) { /* get back the coordinates from the position in the half-matrix. */ INV_UPTRI3(poset[i], ncol, debug_coord); /* already included all the arcs we had to, exiting. */ if (narcs >= ncol - 1) break; /* arc already present in the graph, nothing to do. */ if (include[poset[i]] == 1) continue; if (bl) { if (chow_liu_blacklist(bl, &nbl, poset + i)) { if (debuglevel > 0) { Rprintf("* arc %s - %s is blacklisted, skipping.\n", NODE(debug_coord[0]), NODE(debug_coord[1])); }/*THEN*/ continue; }/*THEN*/ }/*THEN*/ if (c_uptri3_path(include, depth, debug_coord[0], debug_coord[1], ncol, nodes, FALSE)) { if (debuglevel > 0) { Rprintf("* arc %s - %s introduces cycles, skipping.\n", NODE(debug_coord[0]), NODE(debug_coord[1])); }/*THEN*/ continue; }/*THEN*/ if (debuglevel > 0) { Rprintf("* adding arc %s - %s with mutual information %lf.\n", NODE(debug_coord[0]), NODE(debug_coord[1]), mim[i]); }/*THEN*/ /* include the arc in the graph. */ include[poset[i]] = 1; /* update the counter. */ narcs++; }/*FOR*/ if ((!isNull(blacklist)) && (length(blacklist) > 0)) UNPROTECT(1); /* sanity check for blacklist-related madnes. */ if (narcs != ncol - 1) error("learned %d arcs instead of %d, this is not a tree spanning all the nodes.", narcs, ncol - 1); CONVERT_TO_ARC_SET(include, 0, 2 * (ncol - 1)); Free1D(depth); Free1D(mim); Free1D(include); Free1D(poset); Free1D(columns); if (nlevels) Free1D(nlevels); if (means) Free1D(means); if (sse) Free1D(sse); return arcs; }/*CHOW_LIU*/
/* ARACNE structure learning algorithm. */ SEXP aracne(SEXP data, SEXP estimator, SEXP whitelist, SEXP blacklist, SEXP debug) { int i = 0, j = 0, k = 0, coord = 0, ncol = length(data); int num = length(VECTOR_ELT(data, i)), narcs = ncol * (ncol - 1) / 2; int *nlevels = NULL, *est = INTEGER(estimator), *wl = NULL, *bl = NULL; int debuglevel = isTRUE(debug); void **columns = NULL; short int *exclude = NULL; double *mim = NULL, *means = NULL, *sse = NULL; SEXP arcs, nodes, wlist, blist; PROTECT(nodes = getAttrib(data, R_NamesSymbol)); /* dereference the columns of the data frame. */ DEREFERENCE_DATA_FRAME() /* allocate the mutual information matrix and the status vector. */ mim = Calloc1D(UPTRI3_MATRIX(ncol), sizeof(double)); exclude = Calloc1D(UPTRI3_MATRIX(ncol), sizeof(short int)); /* compute the pairwise mutual information coefficients. */ if (debuglevel > 0) Rprintf("* computing pairwise mutual information coefficients.\n"); mi_matrix(mim, columns, ncol, nlevels, &num, NULL, NULL, means, sse, est); LIST_MUTUAL_INFORMATION_COEFS() /* compare all the triplets. */ for (i = 0; i < ncol; i++) { for (j = i + 1; j < ncol; j++) { for (k = 0; k < ncol; k++) { if ((k == i) || (k == j)) continue; /* cache the UPTRI3 coordinates of the arc. */ coord = UPTRI3(i + 1, j + 1, ncol); /* if MI(X, Y) < min(MI(X, Z), MI(Z, Y)) drop arc X - Y. */ if ((mim[coord] < mim[UPTRI3(i + 1, k + 1, ncol)]) && (mim[coord] < mim[UPTRI3(j + 1, k + 1, ncol)])) { if (debuglevel > 0) { Rprintf("* dropping arc %s - %s because of %s, %lf < min(%lf, %lf)\n", NODE(i), NODE(j), NODE(k), mim[UPTRI3(i + 1, j + 1, ncol)], mim[UPTRI3(i + 1, k + 1, ncol)], mim[UPTRI3(j + 1, k + 1, ncol)]); }/*THEN*/ /* update the status vector. */ exclude[coord] = 1; /* decrement the number of arcs. */ narcs--; break; }/*THEN*/ }/*FOR*/ }/*FOR*/ }/*FOR*/ /* add back whitelisted arcs. */ if ((!isNull(whitelist)) && (length(whitelist) > 0)) { PROTECT(wlist = arc_hash(whitelist, nodes, TRUE, TRUE)); wl = INTEGER(wlist); for (i = 0; i < length(wlist); i++) { if (debuglevel > 0) { Rprintf("* adding back whitelisted arcs.\n"); if (exclude[wl[i]] == 1) { Rprintf(" > arc %s - %s has been added to the graph.\n", CHAR(STRING_ELT(whitelist, i)), CHAR(STRING_ELT(whitelist, i + length(wlist)))); }/*THEN*/ else { Rprintf(" > arc %s - %s was already present in the graph.\n", CHAR(STRING_ELT(whitelist, i)), CHAR(STRING_ELT(whitelist, i + length(wlist)))); }/*ELSE*/ }/*THEN*/ /* update the counter if need be. */ if (exclude[wl[i]] == 1) narcs++; /* include the arc in the graph. */ exclude[wl[i]] = 0; }/*FOR*/ UNPROTECT(1); }/*THEN*/ /* remove blacklisted arcs. */ if ((!isNull(blacklist)) && (length(blacklist) > 0)) { PROTECT(blist = arc_hash(blacklist, nodes, TRUE, TRUE)); bl = INTEGER(blist); for (i = 0; i < length(blist); i++) { if (debuglevel > 0) { Rprintf("* removing blacklisted arcs.\n"); if (exclude[bl[i]] == 0) { Rprintf(" > arc %s - %s has been dropped from the graph.\n", CHAR(STRING_ELT(blacklist, i)), CHAR(STRING_ELT(blacklist, i + length(blist)))); }/*THEN*/ else { Rprintf(" > arc %s - %s was not present in the graph.\n", CHAR(STRING_ELT(blacklist, i)), CHAR(STRING_ELT(blacklist, i + length(blist)))); }/*ELSE*/ }/*THEN*/ /* update the counter if need be. */ if (exclude[bl[i]] == 0) narcs--; /* remove the arc from the graph. */ exclude[bl[i]] = 1; }/*FOR*/ UNPROTECT(1); }/*THEN*/ CONVERT_TO_ARC_SET(exclude, 1, 2 * narcs); Free1D(mim); Free1D(exclude); Free1D(columns); if (nlevels) Free1D(nlevels); if (means) Free1D(means); if (sse) Free1D(sse); UNPROTECT(1); return arcs; }/*ARACNE*/
/* mean strength for p-values and score deltas. */ static void mean_strength_overall(SEXP *mean_df, SEXP strength, SEXP nodes, int nrows, int nstr, SEXP ref_hash, double *w) { int i = 0, j = 0, *t = NULL; double *mstr = NULL, *cur_strength = NULL; long double cumw = 0; SEXP mean_str, cur, cur_hash, try; /* allocate the strength accumulator vector. */ PROTECT(mean_str = allocVector(REALSXP, nrows)); SET_VECTOR_ELT(*mean_df, 2, mean_str); mstr = REAL(mean_str); memset(mstr, '\0', nrows * sizeof(double)); for (i = 0; i < nstr; i++) { /* move to the next object. */ cur = VECTOR_ELT(strength, i); /* get the strength values from the bn.strength object. */ cur_strength = REAL(VECTOR_ELT(cur, 2)); /* get the arc IDs to use to correctly match strengths. */ PROTECT(cur_hash = arc_hash(cur, nodes, FALSE, FALSE)); /* match the current arc IDs to the reference arc IDs. */ PROTECT(try = match(ref_hash, cur_hash, 0)); t = INTEGER(try); for (j = 0; j < nrows; j++) mstr[t[j] - 1] += w[i] * cur_strength[j]; /* update the total weight mass. */ cumw += w[i]; UNPROTECT(2); }/*FOR*/ /* rescale by the total weight mass. */ for (j = 0; j < nrows; j++) mstr[j] /= cumw; UNPROTECT(1); }/*MEAN_STRENGTH_OVERALL*/ /* mean strength for bootstrap probabilities. */ static void mean_strength_direction(SEXP *mean_df, SEXP strength, SEXP nodes, int nrows, int nstr, SEXP ref_hash, double *w) { int i = 0, j = 0, *t = NULL, nnodes = length(nodes); double *mstr = NULL, *mdir = NULL, *cur_strength = NULL, *cur_dir = NULL; double fwd = 0, bkwd = 0; long double cumw = 0; SEXP mean_str, mean_dir, cur, cur_hash, try; /* allocate vectors for strength and direction. */ PROTECT(mean_str = allocVector(REALSXP, nrows)); SET_VECTOR_ELT(*mean_df, 2, mean_str); mstr = REAL(mean_str); memset(mstr, '\0', nrows * sizeof(double)); PROTECT(mean_dir = allocVector(REALSXP, nrows)); SET_VECTOR_ELT(*mean_df, 3, mean_dir); mdir = REAL(mean_dir); memset(mdir, '\0', nrows * sizeof(double)); for (i = 0; i < nstr; i++) { /* move to the next object. */ cur = VECTOR_ELT(strength, i); /* get the strength and direction values from the bn.strength object. */ cur_strength = REAL(VECTOR_ELT(cur, 2)); cur_dir = REAL(VECTOR_ELT(cur, 3)); /* get the arc IDs to use to correctly match strengths. */ PROTECT(cur_hash = arc_hash(cur, nodes, FALSE, FALSE)); /* match the current arc IDs to the reference arc IDs. */ PROTECT(try = match(ref_hash, cur_hash, 0)); t = INTEGER(try); for (j = 0; j < nrows; j++) mstr[t[j] - 1] += w[i] * (cur_strength[j] * cur_dir[j]); /* update the total weight mass. */ cumw += w[i]; UNPROTECT(2); }/*FOR*/ /* rescale by the total weight mass. */ for (j = 0; j < nrows; j++) mstr[j] /= cumw; /* split arc strength from direction strength. */ for (i = 0; i < nnodes; i++) { for (j = i + 1; j < nnodes; j++) { fwd = mstr[CMC(j, i, nnodes) - i - 1]; bkwd = mstr[CMC(i, j, nnodes) - j]; mstr[CMC(j, i, nnodes) - i - 1] = mstr[CMC(i, j, nnodes) - j] = fwd + bkwd; if (bkwd + fwd > 0) { mdir[CMC(j, i, nnodes) - i - 1] = fwd / (fwd + bkwd); mdir[CMC(i, j, nnodes) - j] = bkwd / (fwd + bkwd); }/*THEN*/ else { mdir[CMC(j, i, nnodes) - i - 1] = mdir[CMC(i, j, nnodes) - j] = 0; }/*ELSE*/ }/*FOR*/ }/*FOR*/ UNPROTECT(2); }/*MEAN_STRENGTH_DIRECTION*/ /* average multiple bn.strength objects, with weights. */ SEXP mean_strength(SEXP strength, SEXP nodes, SEXP weights) { int nstr = length(weights), ncols = 0, nrows = 0; double *w = REAL(weights); const char *m = NULL; SEXP ref, ref_hash, mean_df, method; /* initialize the result using the first bn.strength object as a reference. */ ref = VECTOR_ELT(strength, 0); ncols = length(ref); nrows = length(VECTOR_ELT(ref, 0)); PROTECT(mean_df = allocVector(VECSXP, ncols)); setAttrib(mean_df, R_NamesSymbol, getAttrib(ref, R_NamesSymbol)); SET_VECTOR_ELT(mean_df, 0, VECTOR_ELT(ref, 0)); SET_VECTOR_ELT(mean_df, 1, VECTOR_ELT(ref, 1)); /* make it a data frame */ minimal_data_frame(mean_df); /* compute the arc IDs to match arcs of later bn.strength objects. */ PROTECT(ref_hash = arc_hash(ref, nodes, FALSE, FALSE)); /* switch backend according to how the strengths were computed. */ method = getAttrib(ref, BN_MethodSymbol); m = CHAR(STRING_ELT(method, 0)); if ((strcmp(m, "score") == 0) || (strcmp(m, "test") == 0)) mean_strength_overall(&mean_df, strength, nodes, nrows, nstr, ref_hash, w); else if (strcmp(m, "bootstrap") == 0) mean_strength_direction(&mean_df, strength, nodes, nrows, nstr, ref_hash, w); UNPROTECT(2); return mean_df; }/*MEAN_STRENGTH*/
/* backend for the all.equal() function for bn objects. */ SEXP all_equal(SEXP target, SEXP current) { int nnodes = 0, narcs = 0; int *t = NULL, *c = NULL; SEXP tnodes, cnodes, cmatch, tarcs, carcs, thash, chash, result; /* get the node set of each network. */ tnodes = getAttrib(getListElement(target, "nodes"), R_NamesSymbol); cnodes = getAttrib(getListElement(current, "nodes"), R_NamesSymbol); /* first check: node sets must have the same size. */ if (LENGTH(tnodes) != LENGTH(cnodes)) { PROTECT(result = allocVector(STRSXP, 1)); SET_STRING_ELT(result, 0, mkChar("Different number of nodes")); UNPROTECT(1); return result; }/*THEN*/ /* store for future use. */ nnodes = LENGTH(tnodes); /* second check: node sets must contain the same node labels. */ PROTECT(cmatch = match(tnodes, cnodes, 0)); c = INTEGER(cmatch); R_isort(c, narcs); /* check that every node in the first network is also present in the * second one; this is enough because the node sets have the same size * and the nodes in each set are guaranteed to be unique. */ for (int i = 0; i < nnodes; i++) { if (c[i] != i + 1) { PROTECT(result = allocVector(STRSXP, 1)); SET_STRING_ELT(result, 0, mkChar("Different node sets")); UNPROTECT(2); return result; }/*THEN*/ }/*FOR*/ UNPROTECT(1); /* get the node set of each network. */ tarcs = getListElement(target, "arcs"); carcs = getListElement(current, "arcs"); /* third check: arc sets must have the same size. */ if (LENGTH(tarcs) != LENGTH(carcs)) { PROTECT(result = allocVector(STRSXP, 1)); SET_STRING_ELT(result, 0, mkChar("Different number of directed/undirected arcs")); UNPROTECT(1); return result; }/*THEN*/ /* store for future use. */ narcs = LENGTH(tarcs)/2; /* fourth check: arcs sets must contain the same arcs. */ if (narcs > 0) { /* compute the numeric hash of the arcs and sort them. */ PROTECT(thash = arc_hash(tarcs, tnodes, FALSE, TRUE)); PROTECT(chash = arc_hash(carcs, cnodes, FALSE, TRUE)); /* dereference the resulting integer vectors. */ t = INTEGER(thash); c = INTEGER(chash); /* compare the integer vectors as generic memory areas. */ if (memcmp(t, c, narcs * sizeof(int))) { PROTECT(result = allocVector(STRSXP, 1)); SET_STRING_ELT(result, 0, mkChar("Different arc sets")); UNPROTECT(3); return result; }/*THEN*/ UNPROTECT(2); }/*THEN*/ /* all checks completed successfully, returning TRUE. */ PROTECT(result = allocVector(LGLSXP, 1)); LOGICAL(result)[0] = TRUE; UNPROTECT(1); return result; }/*ALL_EQUAL*/