double castelo_prior(SEXP beta, SEXP target, SEXP parents, SEXP children, SEXP debug) { int i = 0, k = 0, t = 0, nnodes = 0, cur_arc = 0; int nbeta = LENGTH(VECTOR_ELT(beta, 0)); int *temp = NULL, *debuglevel = LOGICAL(debug), *aid = INTEGER(VECTOR_ELT(beta, 2)); double prior = 0, result = 0; double *bkwd = REAL(VECTOR_ELT(beta, 4)), *fwd = REAL(VECTOR_ELT(beta, 3)); short int *adjacent = NULL; SEXP nodes, try; /* get the node labels. */ nodes = getAttrib(beta, install("nodes")); nnodes = LENGTH(nodes); /* match the target node. */ PROTECT(try = match(nodes, target, 0)); t = INT(try); UNPROTECT(1); /* find out which nodes are parents and which nodes are children. */ adjacent = allocstatus(nnodes); PROTECT(try = match(nodes, parents, 0)); temp = INTEGER(try); for (i = 0; i < LENGTH(try); i++) adjacent[temp[i] - 1] = PARENT; UNPROTECT(1); PROTECT(try = match(nodes, children, 0)); temp = INTEGER(try); for (i = 0; i < LENGTH(try); i++) adjacent[temp[i] - 1] = CHILD; UNPROTECT(1); /* prior probabilities table lookup. */ for (i = t + 1; i <= nnodes; i++) { /* compute the arc id. */ cur_arc = UPTRI3(t, i, nnodes); /* look up the prior probability. */ for (/*k,*/ prior = ((double)1/3); k < nbeta; k++) { /* arcs are ordered, so we can stop early in the lookup. */ if (aid[k] > cur_arc) break; if (aid[k] == cur_arc) { switch(adjacent[i - 1]) { case PARENT: prior = bkwd[k]; break; case CHILD: prior = fwd[k]; break; default: prior = 1 - bkwd[k] - fwd[k]; }/*SWITCH*/ break; }/*THEN*/ }/*FOR*/ if (*debuglevel > 0) { switch(adjacent[i - 1]) { case PARENT: Rprintf(" > found arc %s -> %s, prior pobability is %lf.\n", NODE(i - 1), NODE(t - 1), prior); break; case CHILD: Rprintf(" > found arc %s -> %s, prior probability is %lf.\n", NODE(t - 1), NODE(i - 1), prior); break; default: Rprintf(" > no arc between %s and %s, prior probability is %lf.\n", NODE(t - 1), NODE(i - 1), prior); }/*SWITCH*/ }/*THEN*/ /* move to log-scale and divide by the non-informative log(1/3), so that * the contribution of each arc whose prior has not been not specified by * the user is zero; overflow is likely otherwise. */ result += log(prior / ((double)1/3)); }/*FOR*/ return result; }/*CASTELO_PRIOR*/ /* complete a prior as per Castelo & Siebes. */ SEXP castelo_completion(SEXP prior, SEXP nodes) { int i = 0, k = 0, cur = 0, narcs1 = 0, narcs2 = 0, nnodes = LENGTH(nodes); int *m1 = NULL, *m2 = NULL, *und = NULL, *aid = NULL, *poset = NULL, *id = NULL; double *d1 = NULL, *d2 = NULL, *p = NULL; SEXP df, arc_id, undirected, a1, a2, match1, match2, prob; SEXP result, colnames, from, to, nid, dir1, dir2; /* compute numeric IDs for the arcs. */ a1 = VECTOR_ELT(prior, 0); a2 = VECTOR_ELT(prior, 1); narcs1 = LENGTH(a1); PROTECT(match1 = match(nodes, a1, 0)); PROTECT(match2 = match(nodes, a2, 0)); m1 = INTEGER(match1); m2 = INTEGER(match2); PROTECT(arc_id = allocVector(INTSXP, narcs1)); aid = INTEGER(arc_id); c_arc_hash(&narcs1, &nnodes, m1, m2, aid, NULL, TRUE); /* duplicates correspond to undirected arcs. */ PROTECT(undirected = dupe(arc_id)); und = INTEGER(undirected); /* extract the components from the prior. */ prob = VECTOR_ELT(prior, 2); p = REAL(prob); /* count output arcs. */ for (i = 0; i < narcs1; i++) narcs2 += 2 - und[i]; narcs2 /= 2; /* allocate the columns of the return value. */ PROTECT(from = allocVector(STRSXP, narcs2)); PROTECT(to = allocVector(STRSXP, narcs2)); PROTECT(nid = allocVector(INTSXP, narcs2)); id = INTEGER(nid); PROTECT(dir1 = allocVector(REALSXP, narcs2)); d1 = REAL(dir1); PROTECT(dir2 = allocVector(REALSXP, narcs2)); d2 = REAL(dir2); /* sort the strength coefficients. */ poset = alloc1dcont(narcs1); for (k = 0; k < narcs1; k++) poset[k] = k; R_qsort_int_I(aid, poset, 1, narcs1); for (i = 0, k = 0; i < narcs1; i++) { cur = poset[i]; #define ASSIGN(A1, A2, D1, D2) \ SET_STRING_ELT(from, k, STRING_ELT(A1, cur)); \ SET_STRING_ELT(to, k, STRING_ELT(A2, cur)); \ id[k] = aid[i]; \ D1[k] = p[cur]; \ if ((und[cur] == TRUE) && (i < narcs1 - 1)) \ D2[k] = p[poset[++i]]; \ else \ D2[k] = (1 - D1[k])/2; /* copy the node labels. */ if (m1[cur] < m2[cur]) { ASSIGN(a1, a2, d1, d2); }/*THEN*/ else { ASSIGN(a2, a1, d2, d1); }/*ELSE*/ if (d1[k] + d2[k] > 1) { UNPROTECT(9); error("the probabilities for arc %s -> %s sum to %lf.", CHAR(STRING_ELT(from, k)), CHAR(STRING_ELT(to, k)), d1[k] + d2[k]); }/*THEN*/ /* move to the next arc. */ k++; }/*FOR*/ /* set up the return value. */ PROTECT(result = allocVector(VECSXP, 5)); SET_VECTOR_ELT(result, 0, from); SET_VECTOR_ELT(result, 1, to); SET_VECTOR_ELT(result, 2, nid); SET_VECTOR_ELT(result, 3, dir1); SET_VECTOR_ELT(result, 4, dir2); PROTECT(colnames = allocVector(STRSXP, 5)); SET_STRING_ELT(colnames, 0, mkChar("from")); SET_STRING_ELT(colnames, 1, mkChar("to")); SET_STRING_ELT(colnames, 2, mkChar("aid")); SET_STRING_ELT(colnames, 3, mkChar("fwd")); SET_STRING_ELT(colnames, 4, mkChar("bkwd")); setAttrib(result, R_NamesSymbol, colnames); PROTECT(df = minimal_data_frame(result)); UNPROTECT(12); return df; }/*CASTELO_COMPLETION*/
/* compute all the pairwise mutual information coefficients between the variables. */ void mi_matrix(double *mim, void **columns, int dim, int *nlevels, int *num, void *cond, int *clevels, double *means, double *sse, int *est) { int i = 0, j = 0; switch(*est) { case DISCRETE_MAXIMUM_LIKELIHOOD: if (!cond) { for (i = 0; i < dim; i++) { for (j = i + 1; j < dim; j++) { mim[UPTRI3(i + 1, j + 1, dim)] = c_chisqtest(((int **)columns)[i], nlevels[i], ((int **)columns)[j], nlevels[j], *num, NULL, MI, FALSE); }/*FOR*/ }/*FOR*/ }/*THEN*/ else { for (i = 0; i < dim; i++) { for (j = i + 1; j < dim; j++) { mim[UPTRI3(i + 1, j + 1, dim)] = c_cchisqtest(((int **)columns)[i], nlevels[i], ((int **)columns)[j], nlevels[j], (int *)cond, *clevels, *num, NULL, MI, FALSE); }/*FOR*/ }/*FOR*/ }/*ELSE*/ break; case GAUSSIAN_MAXIMUM_LIKELIHOOD: for (i = 0; i < dim; i++) { for (j = i + 1; j < dim; j++) { mim[UPTRI3(i + 1, j + 1, dim)] = cor_mi_trans( c_fast_cor(((double **)columns)[i], ((double **)columns)[j], *num, means[i], means[j], sse[i], sse[j])); }/*FOR*/ }/*FOR*/ break; }/*SWITCH*/ }/*MI_MATRIX*/
/* compute all the pairwise mutual information coefficients between the variables. */ void mi_matrix(double *mim, void **columns, int dim, int *nlevels, int *num, void *cond, int *clevels, int *est) { int i = 0, j = 0; switch(*est) { case DISCRETE_MAXIMUM_LIKELIHOOD: if (cond == NULL) { for (i = 0; i < dim; i++) { for (j = i + 1; j < dim; j++) { mim[UPTRI3(i + 1, j + 1, dim)] = c_mi(((int **)columns)[i], nlevels + i, ((int **)columns)[j], nlevels + j, num); }/*FOR*/ }/*FOR*/ }/*THEN*/ else { for (i = 0; i < dim; i++) { for (j = i + 1; j < dim; j++) { mim[UPTRI3(i + 1, j + 1, dim)] = c_cmi(((int **)columns)[i], nlevels + i, ((int **)columns)[j], nlevels + j, (int *)cond, clevels, num); }/*FOR*/ }/*FOR*/ }/*ELSE*/ break; case GAUSSIAN_MAXIMUM_LIKELIHOOD: for (i = 0; i < dim; i++) { for (j = i + 1; j < dim; j++) { mim[UPTRI3(i + 1, j + 1, dim)] = c_mig(((double **)columns)[i], ((double **)columns)[j], num); }/*FOR*/ }/*FOR*/ break; }/*SWITCH*/ }/*MI_MATRIX*/
/* ARACNE structure learning algorithm. */ SEXP aracne(SEXP data, SEXP estimator, SEXP whitelist, SEXP blacklist, SEXP debug) { int i = 0, j = 0, k = 0, coord = 0, ncol = length(data); int num = length(VECTOR_ELT(data, i)), narcs = ncol * (ncol - 1) / 2; int *nlevels = NULL, *est = INTEGER(estimator), *wl = NULL, *bl = NULL; int debuglevel = isTRUE(debug); void **columns = NULL; short int *exclude = NULL; double *mim = NULL, *means = NULL, *sse = NULL; SEXP arcs, nodes, wlist, blist; PROTECT(nodes = getAttrib(data, R_NamesSymbol)); /* dereference the columns of the data frame. */ DEREFERENCE_DATA_FRAME() /* allocate the mutual information matrix and the status vector. */ mim = Calloc1D(UPTRI3_MATRIX(ncol), sizeof(double)); exclude = Calloc1D(UPTRI3_MATRIX(ncol), sizeof(short int)); /* compute the pairwise mutual information coefficients. */ if (debuglevel > 0) Rprintf("* computing pairwise mutual information coefficients.\n"); mi_matrix(mim, columns, ncol, nlevels, &num, NULL, NULL, means, sse, est); LIST_MUTUAL_INFORMATION_COEFS() /* compare all the triplets. */ for (i = 0; i < ncol; i++) { for (j = i + 1; j < ncol; j++) { for (k = 0; k < ncol; k++) { if ((k == i) || (k == j)) continue; /* cache the UPTRI3 coordinates of the arc. */ coord = UPTRI3(i + 1, j + 1, ncol); /* if MI(X, Y) < min(MI(X, Z), MI(Z, Y)) drop arc X - Y. */ if ((mim[coord] < mim[UPTRI3(i + 1, k + 1, ncol)]) && (mim[coord] < mim[UPTRI3(j + 1, k + 1, ncol)])) { if (debuglevel > 0) { Rprintf("* dropping arc %s - %s because of %s, %lf < min(%lf, %lf)\n", NODE(i), NODE(j), NODE(k), mim[UPTRI3(i + 1, j + 1, ncol)], mim[UPTRI3(i + 1, k + 1, ncol)], mim[UPTRI3(j + 1, k + 1, ncol)]); }/*THEN*/ /* update the status vector. */ exclude[coord] = 1; /* decrement the number of arcs. */ narcs--; break; }/*THEN*/ }/*FOR*/ }/*FOR*/ }/*FOR*/ /* add back whitelisted arcs. */ if ((!isNull(whitelist)) && (length(whitelist) > 0)) { PROTECT(wlist = arc_hash(whitelist, nodes, TRUE, TRUE)); wl = INTEGER(wlist); for (i = 0; i < length(wlist); i++) { if (debuglevel > 0) { Rprintf("* adding back whitelisted arcs.\n"); if (exclude[wl[i]] == 1) { Rprintf(" > arc %s - %s has been added to the graph.\n", CHAR(STRING_ELT(whitelist, i)), CHAR(STRING_ELT(whitelist, i + length(wlist)))); }/*THEN*/ else { Rprintf(" > arc %s - %s was already present in the graph.\n", CHAR(STRING_ELT(whitelist, i)), CHAR(STRING_ELT(whitelist, i + length(wlist)))); }/*ELSE*/ }/*THEN*/ /* update the counter if need be. */ if (exclude[wl[i]] == 1) narcs++; /* include the arc in the graph. */ exclude[wl[i]] = 0; }/*FOR*/ UNPROTECT(1); }/*THEN*/ /* remove blacklisted arcs. */ if ((!isNull(blacklist)) && (length(blacklist) > 0)) { PROTECT(blist = arc_hash(blacklist, nodes, TRUE, TRUE)); bl = INTEGER(blist); for (i = 0; i < length(blist); i++) { if (debuglevel > 0) { Rprintf("* removing blacklisted arcs.\n"); if (exclude[bl[i]] == 0) { Rprintf(" > arc %s - %s has been dropped from the graph.\n", CHAR(STRING_ELT(blacklist, i)), CHAR(STRING_ELT(blacklist, i + length(blist)))); }/*THEN*/ else { Rprintf(" > arc %s - %s was not present in the graph.\n", CHAR(STRING_ELT(blacklist, i)), CHAR(STRING_ELT(blacklist, i + length(blist)))); }/*ELSE*/ }/*THEN*/ /* update the counter if need be. */ if (exclude[bl[i]] == 0) narcs--; /* remove the arc from the graph. */ exclude[bl[i]] = 1; }/*FOR*/ UNPROTECT(1); }/*THEN*/ CONVERT_TO_ARC_SET(exclude, 1, 2 * narcs); Free1D(mim); Free1D(exclude); Free1D(columns); if (nlevels) Free1D(nlevels); if (means) Free1D(means); if (sse) Free1D(sse); UNPROTECT(1); return arcs; }/*ARACNE*/
int c_uptri3_path(short int *uptri, int from, int to, int nnodes, SEXP nodes, int debuglevel) { int i = 0, j = 0, d = 0, *depth = NULL; depth = alloc1dcont(nnodes); depth[from] = 1; /* for each depth level... */ for (d = 1; d <= nnodes; d++) { if (debuglevel) Rprintf("* considering depth %d.\n", d); /* ... for each node... */ for (i = 0; i < nnodes; i++) { /* ... if it is at the current depth level... */ if (depth[i] != d) continue; if (debuglevel) Rprintf(" > found node %s.\n", NODE(i)); for (j = 0; j < nnodes; j++) { /* ... and it is adjacent to another node... */ if (!(uptri[UPTRI3(i + 1, j + 1, nnodes)] == 1) || (i == j)) continue; /* ... that hasn't already been visited... */ if (depth[j] != 0) { if (debuglevel) Rprintf(" @ node '%s' already visited, skipping.\n", NODE(j)); continue; }/*THEN*/ if (j == to) { /* ... and it's the destination, exit. */ if (debuglevel) Rprintf(" @ arrived at node %s, exiting.\n", NODE(to)); return TRUE; }/*THEN*/ else { /* ...and it's not the destination, add it to the next depth level. */ depth[j] = d + 1; }/*ELSE*/ if (debuglevel) Rprintf(" > added node %s at depth %d\n", NODE(j), d + 1); }/*FOR*/ }/*FOR*/ }/*FOR*/ return FALSE; }/*HAS_UPTRI3_PATH*/