/* unconditional mutual information, to be used in C code. */ double c_mi(int *xx, int *llx, int *yy, int *lly, int *num) { int i = 0, j = 0, k = 0; int **n = NULL, *ni = NULL, *nj = NULL; double res = 0; /* initialize the contingency table and the marginal frequencies. */ n = alloc2dcont(*llx, *lly); ni = alloc1dcont(*llx); nj = alloc1dcont(*lly); /* compute the joint frequency of x and y. */ for (k = 0; k < *num; k++) { n[xx[k] - 1][yy[k] - 1]++; }/*FOR*/ /* compute the marginals. */ for (i = 0; i < *llx; i++) for (j = 0; j < *lly; j++) { ni[i] += n[i][j]; nj[j] += n[i][j]; }/*FOR*/ /* compute the mutual information from the joint and marginal frequencies. */ for (i = 0; i < *llx; i++) for (j = 0; j < *lly; j++) res += MI_PART(n[i][j], ni[i], nj[j], *num); return (res)/(*num); }/*C_MI*/
SEXP dlik(SEXP x) { int i = 0, k = 0; int *n = NULL, *xx = INTEGER(x), llx = NLEVELS(x), num = LENGTH(x); double *res = NULL; SEXP result; /* allocate and initialize result to zero. */ PROTECT(result = allocVector(REALSXP, 1)); res = REAL(result); *res = 0; /* initialize the contingency table. */ n = alloc1dcont(llx); /* compute the joint frequency of x and y. */ for (k = 0; k < num; k++) { n[xx[k] - 1]++; }/*FOR*/ /* compute the entropy from the joint and marginal frequencies. */ for (i = 0; i < llx; i++) { if (n[i] != 0) *res += (double)n[i] * log((double)n[i] / num); }/*FOR*/ UNPROTECT(1); return result; }/*DLIK*/
/* compute the cached values fro all nodes. */ SEXP cache_structure(SEXP nodes, SEXP amat, SEXP debug) { int i = 0, debuglevel = LOGICAL(debug)[0], length_nodes = LENGTH(nodes); int *status = NULL, *a = INTEGER(amat); SEXP bn, temp; /* allocate the list and set its attributes.*/ PROTECT(bn = allocVector(VECSXP, length_nodes)); setAttrib(bn, R_NamesSymbol, nodes); /* allocate and intialize the status vector. */ status = alloc1dcont(length_nodes); if (isTRUE(debug)) Rprintf("* (re)building cached information about network structure.\n"); /* populate the list with nodes' data. */ for (i = 0; i < length_nodes; i++) { /* (re)initialize the status vector. */ memset(status, '\0', sizeof(int) * length_nodes); temp = cache_node_structure(i, nodes, a, length_nodes, status, debuglevel); /* save the returned list. */ SET_VECTOR_ELT(bn, i, temp); }/*FOR*/ UNPROTECT(1); return bn; }/*CACHE_STRUCTURE*/
/* initialize a three-dimensional contingency table and the marginals. */ void fill_3d_table(int *xx, int *yy, int *zz, int ****n, int ***ni, int ***nj, int **nk, int llx, int lly, int llz, int num) { int i = 0, j = 0, k = 0; *n = alloc3dcont(llz, llx, lly); *ni = alloc2dcont(llz, llx); *nj = alloc2dcont(llz, lly); *nk = alloc1dcont(llz); /* compute the joint frequency of x, y, and z. */ for (k = 0; k < num; k++) (*n)[zz[k] - 1][xx[k] - 1][yy[k] - 1]++; /* compute the marginals. */ for (i = 0; i < llx; i++) for (j = 0; j < lly; j++) for (k = 0; k < llz; k++) { (*ni)[k][i] += (*n)[k][i][j]; (*nj)[k][j] += (*n)[k][i][j]; (*nk)[k] += (*n)[k][i][j]; }/*FOR*/ }/*FILL_3D_TABLE*/
/* compute the cached values for a single node (C-friendly). */ SEXP c_cache_partial_structure(int target, SEXP nodes, SEXP amat, int *status, SEXP debug) { int debuglevel = LOGICAL(debug)[0], length_nodes = LENGTH(nodes); int *a = INTEGER(amat); /* allocate and initialize the status vector. */ if (!(*status)) status = alloc1dcont(length_nodes); /* return the corresponding part of the bn structure. */ return cache_node_structure(target, nodes, a, length_nodes, status, debuglevel); }/*C_CACHE_PARTIAL_STRUCTURE*/
/* initialize a two-dimensional contingency table and the marginals. */ void fill_2d_table(int *xx, int *yy, int ***n, int **ni, int **nj, int llx, int lly, int num) { int i = 0, j = 0, k = 0; *n = alloc2dcont(llx, lly); *ni = alloc1dcont(llx); *nj = alloc1dcont(lly); /* compute the joint frequency of x and y. */ for (k = 0; k < num; k++) (*n)[xx[k] - 1][yy[k] - 1]++; /* compute the marginals. */ for (i = 0; i < llx; i++) for (j = 0; j < lly; j++) { (*ni)[i] += (*n)[i][j]; (*nj)[j] += (*n)[i][j]; }/*FOR*/ }/*FILL_2D_TABLE*/
SEXP smart_network_averaging(SEXP arcs, SEXP nodes, SEXP weights) { int k = 0, from = 0, to = 0, nrows = LENGTH(arcs) / 2, dims = LENGTH(nodes); int *a = NULL, *coords = NULL, *poset = NULL; double *w = NULL; SEXP weights2, amat, try, acyclic; /* allocate and initialize the adjacency matrix. */ PROTECT(amat = allocMatrix(INTSXP, dims, dims)); a = INTEGER(amat); memset(a, '\0', sizeof(int) * dims * dims); /* match the node labels in the arc set. */ PROTECT(try = match(nodes, arcs, 0)); coords = INTEGER(try); /* duplicate the weights to preserve the oroginal ones. */ PROTECT(weights2 = duplicate(weights)); w = REAL(weights2); /* sort the strength coefficients. */ poset = alloc1dcont(nrows); for (k = 0; k < nrows; k++) poset[k] = k; R_qsort_I(w, poset, 1, nrows); /* iterate over the arcs in reverse order wrt their strength coefficients. */ for (k = 0; k < nrows; k++) { from = coords[poset[k]] - 1; to = coords[poset[k] + nrows] - 1; /* add an arc only if it does not introduce cycles. */ if (!c_has_path(to, from, a, dims, nodes, FALSE, TRUE, FALSE)) a[CMC(from, to, dims)] = 1; else warning("arc %s -> %s would introduce cycles in the graph, ignoring.", NODE(from), NODE(to)); }/*FOR*/ /* convert the adjacency matrix back to an arc set and return it. */ acyclic = amat2arcs(amat, nodes); UNPROTECT(3); return acyclic; }/*SMART_NETWORK_AVERAGING*/
SEXP cdlik(SEXP x, SEXP y) { int i = 0, j = 0, k = 0; int **n = NULL, *nj = NULL; int llx = NLEVELS(x), lly = NLEVELS(y), num = LENGTH(x); int *xx = INTEGER(x), *yy = INTEGER(y); double *res = NULL; SEXP result; /* allocate and initialize result to zero. */ PROTECT(result = allocVector(REALSXP, 1)); res = REAL(result); *res = 0; /* initialize the contingency table and the marginal frequencies. */ n = alloc2dcont(llx, lly); nj = alloc1dcont(lly); /* compute the joint frequency of x and y. */ for (k = 0; k < num; k++) { n[xx[k] - 1][yy[k] - 1]++; }/*FOR*/ /* compute the marginals. */ for (i = 0; i < llx; i++) for (j = 0; j < lly; j++) { nj[j] += n[i][j]; }/*FOR*/ /* compute the conditional entropy from the joint and marginal frequencies. */ for (i = 0; i < llx; i++) for (j = 0; j < lly; j++) { if (n[i][j] != 0) *res += (double)n[i][j] * log((double)n[i][j] / (double)nj[j]); }/*FOR*/ UNPROTECT(1); return result; }/*CDLIK*/
/* unconditional discrete sampling. */ void rbn_discrete_root(SEXP result, int cur, SEXP cpt, int *num, int ordinal, SEXP fixed) { int np = LENGTH(cpt), *gen = NULL, *workplace = NULL; double *p = NULL; SEXP generated, class, lvls; /* get the levels of the curent variable .*/ lvls = VECTOR_ELT(getAttrib(cpt, R_DimNamesSymbol), 0); /* allocate the memory for the generated observations. */ PROTECT(generated = allocVector(INTSXP, *num)); gen = INTEGER(generated); if (fixed != R_NilValue) { int constant = INTEGER(match(lvls, fixed, 0))[0]; for (int i = 0; i < *num; i++) gen[i] = constant; }/*THEN*/ else { workplace = alloc1dcont(np); /* duplicate the probability table to save the original copy from tampering. */ p = alloc1dreal(np); memcpy(p, REAL(cpt), np * sizeof(double)); /* perform the random sampling. */ ProbSampleReplace(np, p, workplace, *num, gen); }/*ELSE*/ /* set up all the attributes for the newly generated observations. */ setAttrib(generated, R_LevelsSymbol, lvls); if (ordinal) { PROTECT(class = allocVector(STRSXP, 2)); SET_STRING_ELT(class, 0, mkChar("ordered")); SET_STRING_ELT(class, 1, mkChar("factor")); }/*THEN*/ else {
/* conditional mutual information, to be used in C code. */ double c_cmi(int *xx, int *llx, int *yy, int *lly, int *zz, int *llz, int *num) { int i = 0, j = 0, k = 0; int ***n = NULL, **ni = NULL, **nj = NULL, *nk = NULL; double res = 0; /* initialize the contingency table and the marginal frequencies. */ n = alloc3dcont(*llx, *lly, *llz); ni = alloc2dcont(*llx, *llz); nj = alloc2dcont(*lly, *llz); nk = alloc1dcont(*llz); /* compute the joint frequency of x, y, and z. */ for (k = 0; k < *num; k++) { n[xx[k] - 1][yy[k] - 1][zz[k] - 1]++; }/*FOR*/ /* compute the marginals. */ for (i = 0; i < *llx; i++) for (j = 0; j < *lly; j++) for (k = 0; k < *llz; k++) { ni[i][k] += n[i][j][k]; nj[j][k] += n[i][j][k]; nk[k] += n[i][j][k]; }/*FOR*/ /* compute the conditional mutual information from the joint and marginal frequencies. */ for (i = 0; i < *llx; i++) for (j = 0; j < *lly; j++) for (k = 0; k < *llz; k++) res += MI_PART(n[i][j][k], ni[i][k], nj[j][k], nk[k]); res = res/(*num); return res; }/*C_CMI*/
/* compute the cached values for a single node (R-friendly). */ SEXP cache_partial_structure(SEXP nodes, SEXP target, SEXP amat, SEXP debug) { int i = 0, debuglevel = LOGICAL(debug)[0], length_nodes = LENGTH(nodes); char *t = (char *)CHAR(STRING_ELT(target, 0)); int *status = NULL, *a = INTEGER(amat); if (isTRUE(debug)) Rprintf("* (re)building cached information about node %s.\n", t); /* allocate and initialize the status vector. */ status = alloc1dcont(length_nodes); /* iterate fo find the node position in the array. */ for (i = 0; i < length_nodes; i++) if (!strcmp(t, CHAR(STRING_ELT(nodes, i)))) break; /* return the corresponding part of the bn structure. */ return cache_node_structure(i, nodes, a, length_nodes, status, debuglevel); }/*CACHE_PARTIAL_STRUCTURE*/
/* predict the values of one or more variables given one or more variables by * maximum a posteriori (MAP). */ SEXP mappred(SEXP node, SEXP fitted, SEXP data, SEXP n, SEXP from, SEXP debug) { int i = 0, j = 0, k = 0, nobs = 0, nev = 0, nlvls = 0; int *vartypes = NULL, nsims = INT(n), debuglevel = isTRUE(debug); void **varptrs = NULL, **evptrs = NULL, *pred = NULL, *res = NULL; SEXP result, colnames, evidence, evmatch, temp = R_NilValue; SEXP cpdist, predicted, lvls = R_NilValue; double *wgt = NULL; long double *lvls_counts = NULL; /* extract the names of the variables in the data. */ colnames = getAttrib(data, R_NamesSymbol); /* remove the name of the variable to predict. */ nev = length(from); PROTECT(evmatch = match(colnames, from, 0)); /* cache variable types and pointers. */ vartypes = alloc1dcont(nev); varptrs = alloc1dpointer(nev); for (j = 0, k = 0; j < nev; j++) { temp = VECTOR_ELT(data, INTEGER(evmatch)[j] - 1); vartypes[k] = TYPEOF(temp); varptrs[k++] = DATAPTR(temp); }/*FOR*/ /* cache the sample size. */ nobs = length(temp); /* allocate a list to hold the evidence. */ PROTECT(evidence = allocVector(VECSXP, nev)); setAttrib(evidence, R_NamesSymbol, from); /* cache pointers to the elements of the evidence .*/ evptrs = alloc1dpointer(nev); for (j = 0; j < nev; j++) { PROTECT(temp = allocVector(vartypes[j], 1)); evptrs[j] = DATAPTR(temp); SET_VECTOR_ELT(evidence, j, temp); UNPROTECT(1); }/*FOR*/ /* make the evidence a data frame to compact debugging output. */ minimal_data_frame(evidence); /* allocate the return value. */ PROTECT(result = fitnode2df(fitted, STRING_ELT(node, 0), nobs)); res = DATAPTR(result); /* in the case of discrete variables, allocate scratch space for levels' * frequencies. */ if (TYPEOF(result) == INTSXP) { lvls = getAttrib(result, R_LevelsSymbol); nlvls = length(lvls); lvls_counts = allocldouble(nlvls); }/*THEN*/ /* allocate the weights. */ wgt = alloc1dreal(nsims); /* allocate sratch space for the random samplings. */ PROTECT(cpdist = fit2df(fitted, nsims)); predicted = getListElement(cpdist, (char *)CHAR(STRING_ELT(node, 0))); pred = DATAPTR(predicted); /* iterate over the observations. */ for (i = 0; i < nobs; i++) { /* copy the values into the list. */ for (j = 0; j < nev; j++) { switch(vartypes[j]) { case REALSXP: *((double *)evptrs[j]) = ((double *)varptrs[j])[i]; break; case INTSXP: *((int *)evptrs[j]) = ((int *)varptrs[j])[i]; break; }/*SWITCH*/ }/*FOR*/ if (debuglevel > 0) { Rprintf("* predicting observation %d conditional on:\n", i); PrintValue(evidence); }/*THEN*/ /* generate samples from the conditional posterior distribution. */ c_rbn_master(fitted, cpdist, n, evidence, FALSE); /* compute the weights. */ c_lw_weights(fitted, cpdist, nsims, wgt, from, FALSE); /* compute the posterior estimate. */ switch(TYPEOF(predicted)) { case REALSXP: /* average the predicted values. */ ((double *)res)[i] = posterior_mean((double *)pred, wgt, nsims, debuglevel); break; case INTSXP: /* pick the most frequent value. */ ((int *)res)[i] = posterior_mode((int *)pred, wgt, nsims, lvls_counts, lvls, nlvls, debuglevel); break; }/*SWITCH*/ }/*FOR*/ UNPROTECT(4); return result; }/*MAPPRED*/
SEXP tiers(SEXP nodes, SEXP debug) { int i = 0, j = 0, k = 0, narcs = 0, nnodes = 0, ntiers = LENGTH(nodes); int *tier_size = NULL, *debuglevel = LOGICAL(debug), tier_start = 0, cur = 0; SEXP flattened, blacklist, temp; /* allocate the counters for tiers' sizes.*/ tier_size = alloc1dcont(ntiers); if (!isString(nodes)) { /* "node" is a list, each tier is an element. */ for (i = ntiers - 1; i >= 0; i--) { temp = VECTOR_ELT(nodes, i); tier_size[i] = LENGTH(temp); nnodes += tier_size[i]; narcs += (nnodes - tier_size[i]) * tier_size[i]; }/*FOR*/ /* flatten the tiers to keep manipulation later on simple. */ PROTECT(flattened = allocVector(STRSXP, nnodes)); for (i = 0, k = 0; i < ntiers; i++) { temp = VECTOR_ELT(nodes, i); for (j = 0; j < tier_size[i]; j++) SET_STRING_ELT(flattened, k++, STRING_ELT(temp, j)); }/*FOR*/ }/*THEN*/ else { /* "node" is a character vector, which means that each node is in its own tier * and that there is no need to flatted it. */ flattened = nodes; nnodes = LENGTH(nodes); for (i = 0; i < ntiers; i++) tier_size[i] = 1; /* the blacklist is the one resulting from a complete node ordering. */ narcs = ntiers * (ntiers - 1) / 2; }/*ELSE*/ /* allocate the return value. */ PROTECT(blacklist = allocMatrix(STRSXP, narcs, 2)); for (k = 0, i = 0; k < nnodes; k++) { temp = STRING_ELT(flattened, k); if (*debuglevel > 0) Rprintf("* current node is %s in tier %d.\n", CHAR(temp), i + 1); for (j = tier_start + tier_size[i]; j < nnodes; j++) { if (*debuglevel) Rprintf(" > blacklisting %s -> %s\n", CHAR(STRING_ELT(flattened, j)), CHAR(temp)); SET_STRING_ELT(blacklist, cur, STRING_ELT(flattened, j)); SET_STRING_ELT(blacklist, cur + narcs, temp); cur++; }/*FOR*/ if (k >= tier_start + tier_size[i] - 1) tier_start += tier_size[i++]; if (i == ntiers) break; } /* allocate, initialize and set the column names. */ finalize_arcs(blacklist); if (!isString(nodes)) UNPROTECT(2); else UNPROTECT(1); return blacklist; }/*TIERS*/
/* conditional posterior dirichlet probability (covers BDe and K2 scores). */ SEXP cdpost(SEXP x, SEXP y, SEXP iss, SEXP exp, SEXP nparams) { int i = 0, j = 0, k = 0, imaginary = 0, num = LENGTH(x); int llx = NLEVELS(x), lly = NLEVELS(y), *xx = INTEGER(x), *yy = INTEGER(y); int **n = NULL, *nj = NULL; double alpha = 0, *res = NULL, *p = REAL(nparams); SEXP result; if (isNull(iss)) { /* this is for K2, which does not define an imaginary sample size; * all hyperparameters are set to 1 in the prior distribution. */ imaginary = (int) *p; alpha = 1; }/*THEN*/ else { /* this is for the BDe score. */ imaginary = INT(iss); alpha = (double) imaginary / *p; }/*ELSE*/ /* allocate and initialize result to zero. */ PROTECT(result = allocVector(REALSXP, 1)); res = REAL(result); *res = 0; /* initialize the contingency table. */ n = alloc2dcont(llx, lly); nj = alloc1dcont(lly); /* compute the joint frequency of x and y. */ if (exp == R_NilValue) { for (i = 0; i < num; i++) { n[xx[i] - 1][yy[i] - 1]++; nj[yy[i] - 1]++; }/*FOR*/ }/*THEN*/ else { int *e = INTEGER(exp); for (i = 0, k = 0; i < num; i++) { if (i != e[k] - 1) { n[xx[i] - 1][yy[i] - 1]++; nj[yy[i] - 1]++; }/*THEN*/ else { k++; }/*ELSE*/ }/*FOR*/ /* adjust the sample size to match the number of observational data. */ num -= LENGTH(exp); }/*ELSE*/ /* compute the conditional posterior probability. */ for (i = 0; i < llx; i++) for (j = 0; j < lly; j++) *res += lgammafn(n[i][j] + alpha) - lgammafn(alpha); for (j = 0; j < lly; j++) *res += lgammafn((double)imaginary / lly) - lgammafn(nj[j] + (double)imaginary / lly); UNPROTECT(1); return result; }/*CDPOST*/
/* posterior dirichlet probability (covers BDe and K2 scores). */ SEXP dpost(SEXP x, SEXP iss, SEXP exp) { int i = 0, k = 0, num = LENGTH(x); int llx = NLEVELS(x), *xx = INTEGER(x), *n = NULL, *imaginary = NULL; double alpha = 0, *res = NULL; SEXP result; /* the correct vaules for the hyperparameters alpha are documented in * "Learning Bayesian Networks: The Combination of Knowledge and Statistical * Data" by Heckerman, Geiger & Chickering (1995), page 17. */ if (isNull(iss)) { /* this is for K2, which does not define an imaginary sample size; * all hyperparameters are set to 1 in the prior distribution. */ imaginary = &llx; alpha = 1; }/*THEN*/ else { /* this is for the BDe score. */ imaginary = INTEGER(iss); alpha = (double) *imaginary / (double) llx; }/*ELSE*/ /* allocate and initialize result to zero. */ PROTECT(result = allocVector(REALSXP, 1)); res = REAL(result); *res = 0; /* initialize the contingency table. */ n = alloc1dcont(llx); /* compute the frequency table of x, disregarding experimental data. */ if (exp == R_NilValue) { for (i = 0; i < num; i++) n[xx[i] - 1]++; }/*THEN*/ else { int *e = INTEGER(exp); for (i = 0, k = 0; i < num; i++) { if (i != e[k] - 1) n[xx[i] - 1]++; else k++; }/*FOR*/ /* adjust the sample size to match the number of observational data. */ num -= LENGTH(exp); }/*ELSE*/ /* compute the posterior probability. */ for (i = 0; i < llx; i++) *res += lgammafn(n[i] + alpha) - lgammafn(alpha); *res += lgammafn((double)(*imaginary)) - lgammafn((double)(*imaginary + num)); UNPROTECT(1); return result; }/*DPOST*/
/* conditional Monte Carlo simulation for discrete tests. */ SEXP cmcarlo_mean(SEXP x, SEXP y, SEXP z, SEXP lx, SEXP ly, SEXP lz, SEXP length, SEXP samples, SEXP test) { double *fact = NULL, *res = NULL, observed = 0; int **n = NULL, **ncolt = NULL, **nrowt = NULL, *ncond = NULL, *workspace = NULL; int *num = INTEGER(length), *B = INTEGER(samples); int *nr = INTEGER(lx), *nc = INTEGER(ly), *nl = INTEGER(lz); int *xx = INTEGER(x), *yy = INTEGER(y), *zz = INTEGER(z); int i = 0, j = 0, k = 0, npermuts = 0; SEXP result; /* allocate and initialize the result */ PROTECT(result = allocVector(REALSXP, 3)); res = REAL(result); res[0] = res[1] = res[2] = 0; // initial test score / mean score / nb permutations /* allocate and compute the factorials needed by rcont2. */ allocfact(*num); /* allocate and initialize the workspace for rcont2. */ workspace = alloc1dcont(*nc); /* initialize the contingency table. */ n = alloc2dcont(*nl, (*nr) * (*nc)); /* initialize the marginal frequencies. */ nrowt = alloc2dcont(*nl, *nr); ncolt = alloc2dcont(*nl, *nc); ncond = alloc1dcont(*nl); /* compute the joint frequency of x and y. */ for (k = 0; k < *num; k++) n[zz[k] - 1][CMC(xx[k] - 1, yy[k] - 1, *nr)]++; /* compute the marginals. */ for (i = 0; i < *nr; i++) for (j = 0; j < *nc; j++) for (k = 0; k < *nl; k++) { nrowt[k][i] += n[k][CMC(i, j, *nr)]; ncolt[k][j] += n[k][CMC(i, j, *nr)]; ncond[k] += n[k][CMC(i, j, *nr)]; }/*FOR*/ /* initialize the random number generator. */ GetRNGstate(); /* pick up the observed value of the test statistic, then generate a set of random contingency tables (given row and column totals) and adds their test scores to compute the mean.*/ switch(INT(test)) { case MUTUAL_INFORMATION: observed = 2 * _cmi(n, nrowt, ncolt, ncond, nr, nc, nl); for (j = 0; j < *B; j++) { for (k = 0; k < *nl; k++) rcont2(nr, nc, nrowt[k], ncolt[k], &(ncond[k]), fact, workspace, n[k]); res[1] += 2 * _cmi(n, nrowt, ncolt, ncond, nr, nc, nl); npermuts++; } break; case PEARSON_X2: observed = _cx2(n, nrowt, ncolt, ncond, nr, nc, nl); for (j = 0; j < *B; j++) { for (k = 0; k < *nl; k++) rcont2(nr, nc, nrowt[k], ncolt[k], &(ncond[k]), fact, workspace, n[k]); res[1] += _cx2(n, nrowt, ncolt, ncond, nr, nc, nl); npermuts++; } break; }/*SWITCH*/ PutRNGstate(); /* save the observed and mean values of the statistic, and the number of permutations performed. */ res[0] = observed; res[1] /= *B; // mean res[2] = npermuts; UNPROTECT(1); return result; }/*CMCARLO_MEAN*/
/* predict the value of a discrete node with one or more parents. */ SEXP cdpred(SEXP fitted, SEXP data, SEXP parents, SEXP debug) { int i = 0, k = 0, ndata = LENGTH(data), nrows = 0, ncols = 0; int *configs = INTEGER(parents), *debuglevel = LOGICAL(debug); int *iscratch = NULL, *maxima = NULL, *nmax = NULL, *res = NULL; double *prob = NULL, *dscratch = NULL, *buf = NULL; SEXP temp, result, tr_levels = getAttrib(data, R_LevelsSymbol); /* get the probabilities of the multinomial distribution. */ temp = getListElement(fitted, "prob"); nrows = INT(getAttrib(temp, R_DimSymbol)); ncols = LENGTH(temp) / nrows; prob = REAL(temp); /* create the vector of indexes. */ iscratch = alloc1dcont(nrows); /* create a scratch copy of the array. */ buf = alloc1dreal(nrows); dscratch = alloc1dreal(nrows * ncols); memcpy(dscratch, prob, nrows * ncols * sizeof(double)); /* allocate the array for the indexes of the maxima. */ maxima = alloc1dcont(nrows * ncols); /* allocate the maxima counters. */ nmax = alloc1dcont(ncols); /* get the mode for each configuration. */ for (i = 0; i < ncols; i++) { /* initialize the vector of indexes. */ for (k = 0; k < nrows; k++) iscratch[k] = k + 1; /* find out the mode(s). */ all_max(dscratch + i * nrows, nrows, maxima + i * nrows, nmax + i, iscratch, buf); }/*FOR*/ /* allocate and initialize the return value. */ PROTECT(result = allocVector(INTSXP, ndata)); res = INTEGER(result); /* initialize the random seed, just in case we need it for tie breaking. */ GetRNGstate(); /* copy the index of the mode in the return value. */ for (i = 0; i < ndata; i++) { if (nmax[configs[i] - 1] == 1) { res[i] = maxima[CMC(0, configs[i] - 1, nrows)]; if (*debuglevel > 0) { if (res[i] == NA_INTEGER) Rprintf(" > prediction for observation %d is NA with probabilities:\n"); else Rprintf(" > prediction for observation %d is '%s' with probabilities:\n", i + 1, CHAR(STRING_ELT(tr_levels, res[i] - 1))); Rprintf(" "); for (int k = 0; k < nrows; k++) Rprintf(" %lf", (prob + nrows * (configs[i] - 1))[k]); Rprintf("\n"); }/*THEN*/ }/*THEN*/ else { /* break ties: sample with replacement from all the maxima. */ SampleReplace(1, nmax[configs[i] - 1], res + i, maxima + (configs[i] - 1) * nrows); if (*debuglevel > 0) { Rprintf(" > there are %d levels tied for prediction of observation %d, applying tie breaking.\n", nmax[configs[i] - 1], i + 1); Rprintf(" > tied levels are:"); for (k = 0; k < nmax[configs[i] - 1]; k++) Rprintf(" %s", CHAR(STRING_ELT(tr_levels, maxima[CMC(k, configs[i] - 1, nrows)] - 1))); Rprintf(".\n"); }/*THEN*/ }/*ELSE*/ }/*FOR*/ /* save the state of the random number generator. */ PutRNGstate(); /* copy the labels and the class from the input data. */ setAttrib(result, R_LevelsSymbol, tr_levels); setAttrib(result, R_ClassSymbol, getAttrib(data, R_ClassSymbol)); UNPROTECT(1); return result; }/*CDPRED*/
/* set the directions of the arcs in a tree given the root node. */ SEXP tree_directions(SEXP arcs, SEXP nodes, SEXP root, SEXP debug) { int i = 0, j = 0, d = 0, traversed = 1; int narcs = LENGTH(arcs)/2, nnodes = LENGTH(nodes); int *a = NULL, *depth = 0, *debuglevel = LOGICAL(debug); SEXP try, try2, result; /* match the node labels in the arc set. */ PROTECT(try = match(nodes, arcs, 0)); a = INTEGER(try); /* match the root node. */ PROTECT(try2 = match(nodes, root, 0)); /* allocate and initialize the statust vector. */ depth = alloc1dcont(nnodes); depth[INT(try2) - 1] = 1; if (*debuglevel > 0) Rprintf("> root node (depth 1) is %s.\n", NODE(INT(try2) - 1)); for (d = 1; d <= nnodes; d++) { if (*debuglevel > 0) Rprintf("> considering nodes at depth %d.\n", d + 1); for (i = 0; i < narcs; i++) { for (j = 0; j < nnodes; j++) { /* disregard nodes at the wrong depth. */ if (depth[j] != d) continue; if ((a[i + narcs] == (j + 1)) && (depth[a[i] - 1] == 0)) { if (*debuglevel > 0) Rprintf(" * found node %s.\n", NODE(a[i] - 1)); /* save the depth at which the node was found. */ depth[a[i] - 1] = d + 1; /* update the counter of the traversed nodes. */ traversed++; }/*THEN*/ }/*FOR*/ }/*FOR*/ /* check whether all nodes have been traversed. */ if (traversed == nnodes) break; }/*FOR*/ /* allocate and initialize the return value. */ PROTECT(result = allocMatrix(STRSXP, narcs/2, 2)); for (i = 0, j = 0; i < narcs; i++) { if (depth[a[i] - 1] < depth[a[i + narcs] - 1]) { SET_STRING_ELT(result, j, STRING_ELT(arcs, i)); SET_STRING_ELT(result, j + narcs/2, STRING_ELT(arcs, i + narcs)); j++; }/*THEN*/ }/*FOR*/ UNPROTECT(3); return result; }/*TREE_DIRECTIONS*/
SEXP score_cache_fill(SEXP nodes, SEXP data, SEXP network, SEXP score, SEXP extra, SEXP reference, SEXP equivalence, SEXP decomposability, SEXP updated, SEXP amat, SEXP cache, SEXP blmat, SEXP debug) { int *colsum = NULL, nnodes = length(nodes), lupd = length(updated); int *a = NULL, *upd = NULL, *b = NULL, debuglevel = isTRUE(debug); int i = 0, j = 0, k = 0; double *cache_value = NULL; void *p = NULL; SEXP arc, delta, op, temp; /* save a pointer to the adjacency matrix, the blacklist and the * updated nodes. */ a = INTEGER(amat); b = INTEGER(blmat); upd = INTEGER(updated); /* if there are no nodes to update, return. */ if (lupd == 0) return cache; /* set up row and column total to check for score equivalence; * zero means no parent nodes. */ if (isTRUE(equivalence)) { colsum = alloc1dcont(nnodes); for (i = 0; i < nnodes; i++) for (j = 0; j < nnodes; j++) colsum[j] += a[CMC(i, j, nnodes)]; }/*THEN*/ /* allocate and initialize the cache. */ cache_value = REAL(cache); /* allocate a two-slot character vector. */ PROTECT(arc = allocVector(STRSXP, 2)); /* allocate and initialize the fake score delta. */ PROTECT(delta = ScalarReal(0)); /* allocate and initialize the score.delta() operator. */ PROTECT(op = mkString("set")); for (i = 0; i < nnodes; i++) { for (j = 0; j < nnodes; j++) { /* incident nodes must be different from each other. */ if (i == j) continue; /* if only one or two nodes' caches need updating, skip the rest. */ for (k = 0; k < lupd; k++) if (upd[k] == j) goto there; continue; there: /* no need to compute the score delta for blacklisted arcs. */ if (b[CMC(i, j, nnodes)] == 1) continue; /* use score equivalence if possible to check only one orientation. */ if (isTRUE(equivalence)) { /* if the following conditions are met, look up the score delta of * the reverse of the current arc: * 1) that score delta has already been computed. * 2) both incident nodes have no parent, so the arc is really * score equivalent (no v-structures). * 3) the reversed arc has not been blacklisted, as the score delta * is not computed in this case. */ if ((i > j) && (colsum[i] + colsum[j] == 0) && (b[CMC(j, i, nnodes)] == 0)) { cache_value[CMC(i, j, nnodes)] = cache_value[CMC(j, i, nnodes)]; continue; }/*THEN*/ }/*THEN*/ /* save the nodes incident on the arc. */ SET_STRING_ELT(arc, 0, STRING_ELT(nodes, i)); SET_STRING_ELT(arc, 1, STRING_ELT(nodes, j)); /* if the arc is not present in the graph it should be added; * otherwise it should be removed. */ if (a[CMC(i, j, nnodes)] == 0) SET_STRING_ELT(op, 0, mkChar("set")); else SET_STRING_ELT(op, 0, mkChar("drop")); /* checkpoint allocated memory. */ p = vmaxget(); /* evaluate the call to score.delta() for the arc. */ PROTECT(temp = score_delta(arc, network, data, score, delta, reference, op, extra, decomposability)); vmaxset(p); cache_value[CMC(i, j, nnodes)] = NUM(VECTOR_ELT(temp, 1)); UNPROTECT(1); if (debuglevel > 0) Rprintf("* caching score delta for arc %s -> %s (%lf).\n", CHAR(STRING_ELT(nodes, i)), CHAR(STRING_ELT(nodes, j)), cache_value[CMC(i, j, nnodes)]); }/*FOR*/ }/*FOR*/ UNPROTECT(3); return cache; }/*HC_CACHE_FILL*/
/* predict the value of a discrete node without parents. */ SEXP dpred(SEXP fitted, SEXP data, SEXP debug) { int i = 0, nmax = 0, ndata = LENGTH(data), length = 0; int *res = NULL, *debuglevel = LOGICAL(debug), *iscratch = NULL, *maxima = NULL; double *prob = NULL, *dscratch = NULL, *buf = NULL; SEXP ptab, result, tr_levels = getAttrib(data, R_LevelsSymbol); /* get the probabilities of the multinomial distribution. */ ptab = getListElement(fitted, "prob"); length = LENGTH(ptab); prob = REAL(ptab); /* create the vector of indexes. */ iscratch = alloc1dcont(length); for (i = 0; i < length; i++) iscratch[i] = i + 1; /* create a scratch copy of the array. */ buf = alloc1dreal(length); dscratch = alloc1dreal(length); memcpy(dscratch, prob, length * sizeof(double)); /* allocate the array for the indexes of the maxima. */ maxima = alloc1dcont(length); /* find out the mode(s). */ all_max(dscratch, length, maxima, &nmax, iscratch, buf); /* allocate and initialize the return value. */ PROTECT(result = allocVector(INTSXP, ndata)); res = INTEGER(result); if (nmax == 1) { /* copy the index of the mode in the return value. */ for (i = 0; i < ndata; i++) res[i] = maxima[0]; if (*debuglevel > 0) { if (res[0] == NA_INTEGER) Rprintf(" > prediction for all observations is NA with probabilities:\n"); else Rprintf(" > prediction for all observations is '%s' with probabilities:\n", CHAR(STRING_ELT(tr_levels, res[0] - 1))); Rprintf(" "); for (i = 0; i < LENGTH(ptab); i++) Rprintf(" %lf", prob[i]); Rprintf("\n"); }/*THEN*/ }/*THEN*/ else { /* break ties: sample with replacement from all the maxima. */ GetRNGstate(); SampleReplace(ndata, nmax, res, maxima); PutRNGstate(); if (*debuglevel > 0) { Rprintf(" > there are %d levels tied for prediction, applying tie breaking.\n", nmax); Rprintf(" > tied levels are:"); for (i = 0; i < nmax; i++) Rprintf(" %s", CHAR(STRING_ELT(tr_levels, maxima[i] - 1))); Rprintf(".\n"); }/*THEN*/ }/*ELSE*/ /* copy the labels and the class from the input data. */ setAttrib(result, R_LevelsSymbol, tr_levels); setAttrib(result, R_ClassSymbol, getAttrib(data, R_ClassSymbol)); UNPROTECT(1); return result; }/*DPRED*/
/* unconditional Monte Carlo simulation for correlation-based tests. */ SEXP gauss_mcarlo(SEXP x, SEXP y, SEXP samples, SEXP test, SEXP alpha) { int j = 0, k = 0, num = LENGTH(x), *B = INTEGER(samples); double *xx = REAL(x), *yy = REAL(y), *yperm = NULL, *res = NULL; double observed = 0, enough = ceil(NUM(alpha) * (*B)) + 1, xm = 0, ym = 0; int *perm = NULL, *work = NULL; SEXP result; /* allocate the arrays needed by RandomPermutation. */ perm = alloc1dcont(num); work = alloc1dcont(num); /* allocate the array for the pemutations. */ yperm = alloc1dreal(num); /* cache the means of the two variables (they are invariant under permutation). */ for (j = 0; j < num; j++) { xm += xx[j]; ym += yy[j]; }/*FOR*/ xm /= num; ym /= num; /* allocate the result. */ PROTECT(result = allocVector(REALSXP, 1)); res = REAL(result); *res = 0; /* initialize the random number generator. */ GetRNGstate(); /* pick up the observed value of the test statistic, then generate a set of random permutations (all variable but the second are fixed) and check how many tests are greater (in absolute value) than the original one.*/ switch(INT(test)) { case GAUSSIAN_MUTUAL_INFORMATION: case LINEAR_CORRELATION: case FISHER_Z: observed = _cov(xx, yy, &xm, &ym, &num); for (j = 0; j < *B; j++) { RandomPermutation(num, perm, work); for (k = 0; k < num; k++) yperm[k] = yy[perm[k]]; if (fabs(_cov(xx, yperm, &xm, &ym, &num)) > fabs(observed)) { sequential_counter_check(*res); }/*THEN*/ }/*FOR*/ break; }/*SWITCH*/ PutRNGstate(); /* save the observed p-value. */ *res /= *B; UNPROTECT(1); return result; }/*GAUSS_MCARLO*/
/* conditional Monte Carlo simulation for correlation-based tests. */ SEXP gauss_cmcarlo(SEXP data, SEXP length, SEXP samples, SEXP test, SEXP alpha) { int j = 0, k = 0, ncols = LENGTH(data), errcode = 0, *work = NULL, *perm = NULL; int error_counter = 0, *B = INTEGER(samples), *num = INTEGER(length); double observed = 0, permuted = 0, *yperm = NULL, *yorig = NULL, *res = NULL; double enough = ceil(NUM(alpha) * (*B)) + 1; double **column = NULL, *mean = NULL, *covariance = NULL, *covariance_backup = NULL; double *u = NULL, *d = NULL, *vt = NULL; SEXP result; /* allocate the matrices needed for the SVD decomposition. */ u = alloc1dreal(ncols * ncols); d = alloc1dreal(ncols); vt = alloc1dreal(ncols * ncols); /* allocate and initialize the result. */ PROTECT(result = allocVector(REALSXP, 1)); res = REAL(result); *res = 0; /* allocate and initialize an array of pointers for the variables. */ column = (double **) alloc1dpointer(ncols); for (j = 0; j < ncols; j++) column[j] = REAL(VECTOR_ELT(data, j)); /* cache the means of the variables (they are invariant under permutation). */ mean = alloc1dreal(ncols); /* compute the mean values */ for (j = 0; j < ncols; j++) { for (k = 0 ; k < *num; k++) mean[j] += column[j][k]; mean[j] /= (*num); }/*FOR*/ /* allocate and initialize the covariance matrix. */ covariance = alloc1dreal(ncols * ncols); covariance_backup = alloc1dreal(ncols * ncols); c_covmat(column, mean, &ncols, num, covariance); memcpy(covariance_backup, covariance, ncols * ncols * sizeof(double)); /* substitute the original data with the fake column that will be permuted. */ yperm = alloc1dreal(*num); yorig = column[1]; memcpy(yperm, yorig, *num * sizeof(double)); column[1] = yperm; /* allocate the arrays needed by RandomPermutation. */ perm = alloc1dcont(*num); work = alloc1dcont(*num); /* initialize the random number generator. */ GetRNGstate(); /* pick up the observed value of the test statistic, then generate a set of random permutations (all variable but the second are fixed) and check how many tests are greater (in absolute value) than the original one.*/ switch(INT(test)) { case GAUSSIAN_MUTUAL_INFORMATION: case LINEAR_CORRELATION: case FISHER_Z: observed = c_fast_pcor(covariance, &ncols, u, d, vt, &errcode); if (errcode) error("an error (%d) occurred in the call to dgesvd().\n", errcode); for (j = 0; j < (*B); j++) { /* reset the error flag of the SVD Fortran routine. */ errcode = 0; RandomPermutation(*num, perm, work); for (k = 0; k < *num; k++) yperm[k] = yorig[perm[k]]; /* restore the covariance matrix from the good copy. */ memcpy(covariance, covariance_backup, ncols * ncols * sizeof(double)); /* update the relevant covariances. */ c_update_covmat(column, mean, 1, &ncols, num, covariance); permuted = c_fast_pcor(covariance, &ncols, u, d, vt, &errcode); if (errcode != 0) error_counter++; if (fabs(permuted) > fabs(observed)) { sequential_counter_check(*res); }/*THEN*/ }/*FOR*/ if (error_counter > 0) warning("unable to compute %d permutations due to errors in dgesvd().\n", error_counter); break; }/*SWITCH*/ PutRNGstate(); /* save the observed p-value. */ *res /= *B; UNPROTECT(1); return result; }/*GAUSS_CMCARLO*/
/* convert an arc set to a weighted edge list. */ SEXP arcs2welist(SEXP arcs, SEXP nodes, SEXP weights, SEXP nid, SEXP sublist, SEXP parents) { int i = 0, j = 0, k = 0, nnodes = length(nodes), narcs = length(arcs)/2; int *e = NULL, *coords = NULL, *adjacent = NULL; int num_id = isTRUE(nid), sub = isTRUE(sublist), up = isTRUE(parents); double *w = REAL(weights), *ew = NULL; SEXP try, elist, edges, edge_weights, temp, temp_name = R_NilValue; /* allocate the return value. */ PROTECT(elist = allocVector(VECSXP, nnodes)); /* set the node names. */ setAttrib(elist, R_NamesSymbol, nodes); /* allocate and initialize the subset name. */ if (sub > 0) PROTECT(temp_name = mkStringVec(2, "edges", "weight")); /* allocate the scratch space to keep track adjacent nodes. */ adjacent = alloc1dcont(nnodes); /* match the node labels in the arc set. */ PROTECT(try = match(nodes, arcs, 0)); coords = INTEGER(try); for (i = 0; i < narcs; i++) adjacent[coords[i + up * narcs] - 1]++; for (i = 0; i < nnodes; i++) { /* allocate and set up the edge array. */ if (num_id > 0) { PROTECT(edges = allocVector(INTSXP, adjacent[i])); e = INTEGER(edges); }/*THEN*/ else { PROTECT(edges = allocVector(STRSXP, adjacent[i])); }/*ELSE*/ /* allocate and set up the weights array. */ PROTECT(edge_weights = allocVector(REALSXP, adjacent[i])); ew = REAL(edge_weights); /* copy the coordinates or the labels of adjacent nodes. */ for (j = 0, k = 0; j < narcs; j++) { if (coords[j + up * narcs] != i + 1) continue; /* copy the weight as well. */ ew[k] = w[j]; if (num_id > 0) e[k++] = coords[(1 - up) * narcs + j]; else SET_STRING_ELT(edges, k++, STRING_ELT(arcs, (1 - up) * narcs + j)); if (k == adjacent[i]) break; }/*FOR*/ if (sub > 0) { /* allocate and set up the "edge" sublist for graphNEL. */ PROTECT(temp = allocVector(VECSXP, 2)); setAttrib(temp, R_NamesSymbol, temp_name); SET_VECTOR_ELT(temp, 0, edges); SET_VECTOR_ELT(temp, 1, edge_weights); SET_VECTOR_ELT(elist, i, temp); UNPROTECT(1); }/*THEN*/ else { /* save weights with edges as names. */ setAttrib(edge_weights, R_NamesSymbol, edges); SET_VECTOR_ELT(elist, i, edge_weights); }/*ELSE*/ UNPROTECT(2); }/*FOR*/ if (sub > 0) UNPROTECT(3); else UNPROTECT(2); return elist; }/*ARCS2WELIST*/
/* convert an arc set to an edge list. */ SEXP arcs2elist(SEXP arcs, SEXP nodes, SEXP id, SEXP sublist) { int i = 0, j = 0, k = 0, nnodes = LENGTH(nodes), narcs = LENGTH(arcs)/2; int *e = NULL, *coords = NULL, *children = NULL; int *convert = LOGICAL(id), *sub = LOGICAL(sublist); SEXP try, elist, edges, temp, temp_name = R_NilValue; /* allocate the return value. */ PROTECT(elist = allocVector(VECSXP, nnodes)); /* set the node names. */ setAttrib(elist, R_NamesSymbol, nodes); if (*sub > 0) { /* allocate and initialize the subset name. */ PROTECT(temp_name = allocVector(STRSXP, 1)); SET_STRING_ELT(temp_name, 0, mkChar("edges")); }/*THEN*/ /* allocate the scratch space to keep track of the children of each node. */ children = alloc1dcont(nnodes); /* match the node labels in the arc set. */ PROTECT(try = match(nodes, arcs, 0)); coords = INTEGER(try); for (i = 0; i < narcs; i++) { children[coords[i] - 1]++; }/*FOR*/ for (i = 0; i < nnodes; i++) { /* allocate and set up the edge array. */ if (*convert > 0) { PROTECT(edges = allocVector(INTSXP, children[i])); e = INTEGER(edges); }/*THEN*/ else { PROTECT(edges = allocVector(STRSXP, children[i])); }/*ELSE*/ /* copy the coordinates of the adjacent nodes. */ for (j = 0, k = 0; j < narcs; j++) { if (coords[j] != i + 1) continue; if (*convert > 0) e[k++] = coords[narcs + j]; else SET_STRING_ELT(edges, k++, STRING_ELT(arcs, narcs + j)); if (k == children[i]) break; }/*FOR*/ if (*sub > 0) { /* allocate and set up the "edge" sublist for graphNEL. */ PROTECT(temp = allocVector(VECSXP, 1)); setAttrib(temp, R_NamesSymbol, temp_name); SET_VECTOR_ELT(temp, 0, edges); SET_VECTOR_ELT(elist, i, temp); UNPROTECT(1); }/*THEN*/ else { SET_VECTOR_ELT(elist, i, edges); }/*ELSE*/ UNPROTECT(1); }/*FOR*/ if (*sub > 0) UNPROTECT(3); else UNPROTECT(2); return elist; }/*ARCS2ELIST*/
SEXP is_pdag_acyclic(SEXP arcs, SEXP nodes, SEXP return_nodes, SEXP directed, SEXP debug) { int i = 0, j = 0, z = 0; int nrows = LENGTH(nodes); int check_status = nrows, check_status_old = nrows; int *rowsums = NULL, *colsums = NULL, *crossprod = NULL, *a = NULL; int *debuglevel = NULL; short int *status = NULL; SEXP amat; /* dereference the debug parameter. */ debuglevel = LOGICAL(debug); /* build the adjacency matrix from the arc set. */ if (*debuglevel > 0) Rprintf("* building the adjacency matrix.\n"); PROTECT(amat = arcs2amat(arcs, nodes)); a = INTEGER(amat); /* should we consider only directed arcs? */ if (isTRUE(directed)) { /* removing undirected arcs, so that only cycles made only of directed * arcs will make the function return TRUE. */ for (i = 0; i < nrows; i++) for (j = 0; j < nrows; j++) if ((a[CMC(i, j, nrows)] == 1) && (a[CMC(j, i, nrows)] == 1)) a[CMC(i, j, nrows)] = a[CMC(j, i, nrows)] = 0; }/*THEN*/ /* initialize the status, {row,col}sums and crossprod arrays. */ status = allocstatus(nrows); rowsums = alloc1dcont(nrows); colsums = alloc1dcont(nrows); crossprod = alloc1dcont(nrows); if (*debuglevel > 0) Rprintf("* checking whether the partially directed graph is acyclic.\n"); /* even in the worst case scenario at least two nodes are marked as * good at each iteration, so even ceil(nrows/2) iterations should be * enough. */ for (z = 0; z < nrows; z++) { start: if (*debuglevel > 0) Rprintf("* beginning iteration %d.\n", z + 1); for (i = 0; i < nrows; i++) { /* skip known-good nodes. */ if (status[i] == GOOD) continue; /* reset and update row and column totals. */ rowsums[i] = colsums[i] = crossprod[i] = 0; /* compute row and column totals for the i-th node. */ for (j = 0; j < nrows; j++) { rowsums[i] += a[CMC(i, j, nrows)]; colsums[i] += a[CMC(j, i, nrows)]; crossprod[i] += a[CMC(i, j, nrows)] * a[CMC(j, i, nrows)]; }/*FOR*/ there: if (*debuglevel > 0) Rprintf(" > checking node %s (%d child(ren), %d parent(s), %d neighbours).\n", NODE(i), rowsums[i], colsums[i], crossprod[i]); /* if either total is zero, the node is either a root node or a * leaf node, and is not part of any cycle. */ if (((rowsums[i] == 0) || (colsums[i] == 0)) || ((crossprod[i] == 1) && (rowsums[i] == 1) && (colsums[i] == 1))) { if (*debuglevel > 0) Rprintf(" @ node %s is cannot be part of a cycle.\n", NODE(i)); /* update the adjacency matrix and the row/column totals. */ for (j = 0; j < nrows; j++) a[CMC(i, j, nrows)] = a[CMC(j, i, nrows)] = 0; rowsums[i] = colsums[i] = crossprod[i] = 0; /* mark the node as good. */ status[i] = GOOD; check_status--; }/*THEN*/ else if (crossprod[i] == 1) { /* find the other of the undirected arc. */ for (j = 0; j < i; j++) if (a[CMC(i, j, nrows)] * a[CMC(j, i, nrows)] == 1) break; /* safety check, just in case. */ if (i == j) continue; if (((colsums[i] == 1) && (colsums[j] == 1)) || ((rowsums[i] == 1) && (rowsums[j] == 1))) { if (*debuglevel > 0) Rprintf(" @ arc %s - %s is cannot be part of a cycle.\n", NODE(i), NODE(j)); /* update the adjacency matrix and the row/column totals. */ a[CMC(i, j, nrows)] = a[CMC(j, i, nrows)] = 0; crossprod[i] = 0; rowsums[i]--; colsums[i]--; rowsums[j]--; colsums[j]--; /* jump back to the first check; if either the row or column total * was equal to 1 only because of the undirected arc, the node can * now be marked as good. */ if ((rowsums[i] == 0) || (colsums[i] == 0)) goto there; }/*THEN*/ }/*THEN*/ }/*FOR*/ /* at least three nodes are needed to have a cycle. */ if (check_status < 3) { if (*debuglevel > 0) Rprintf("@ at least three nodes are needed to have a cycle.\n"); UNPROTECT(1); return build_return_array(nodes, status, nrows, check_status, return_nodes); }/*THEN*/ /* if there are three or more bad nodes and there was no change in * the last iteration, the algorithm is stuck on a cycle. */ if (check_status_old == check_status) { if (*debuglevel > 0) Rprintf("@ no change in the last iteration.\n"); /* give up and call c_has_path() to kill some undirected arcs. */ for (i = 0; i < nrows; i++) for (j = 0; j < i; j++) if (a[CMC(i, j, nrows)] * a[CMC(j, i, nrows)] == 1) { /* remove the arc from the adjacency matrix while testing it, * there's a path is always found (the arc itself). */ a[CMC(i, j, nrows)] = a[CMC(j, i, nrows)] = 0; if(!c_has_path(i, j, INTEGER(amat), nrows, nodes, FALSE, TRUE, FALSE) && !c_has_path(j, i, INTEGER(amat), nrows, nodes, FALSE, TRUE, FALSE)) { if (*debuglevel > 0) Rprintf("@ arc %s - %s is not part of any cycle, removing.\n", NODE(i), NODE(j)); /* increase the iteration counter and start again. */ z++; goto start; }/*THEN*/ else { /* at least one cycle is really present; give up and return. */ UNPROTECT(1); return build_return_array(nodes, status, nrows, check_status, return_nodes); }/*ELSE*/ }/*THEN*/ /* give up if there are no undirected arcs, cycles composed * entirely by directed arcs are never false positives. */ UNPROTECT(1); return build_return_array(nodes, status, nrows, check_status, return_nodes); }/*THEN*/ else { check_status_old = check_status; }/*ELSE*/ }/*FOR*/ UNPROTECT(1); return build_return_array(nodes, status, nrows, check_status, return_nodes); }/*IS_PDAG_ACYCLIC*/
double castelo_prior(SEXP beta, SEXP target, SEXP parents, SEXP children, SEXP debug) { int i = 0, k = 0, t = 0, nnodes = 0, cur_arc = 0; int nbeta = LENGTH(VECTOR_ELT(beta, 0)); int *temp = NULL, *debuglevel = LOGICAL(debug), *aid = INTEGER(VECTOR_ELT(beta, 2)); double prior = 0, result = 0; double *bkwd = REAL(VECTOR_ELT(beta, 4)), *fwd = REAL(VECTOR_ELT(beta, 3)); short int *adjacent = NULL; SEXP nodes, try; /* get the node labels. */ nodes = getAttrib(beta, install("nodes")); nnodes = LENGTH(nodes); /* match the target node. */ PROTECT(try = match(nodes, target, 0)); t = INT(try); UNPROTECT(1); /* find out which nodes are parents and which nodes are children. */ adjacent = allocstatus(nnodes); PROTECT(try = match(nodes, parents, 0)); temp = INTEGER(try); for (i = 0; i < LENGTH(try); i++) adjacent[temp[i] - 1] = PARENT; UNPROTECT(1); PROTECT(try = match(nodes, children, 0)); temp = INTEGER(try); for (i = 0; i < LENGTH(try); i++) adjacent[temp[i] - 1] = CHILD; UNPROTECT(1); /* prior probabilities table lookup. */ for (i = t + 1; i <= nnodes; i++) { /* compute the arc id. */ cur_arc = UPTRI3(t, i, nnodes); /* look up the prior probability. */ for (/*k,*/ prior = ((double)1/3); k < nbeta; k++) { /* arcs are ordered, so we can stop early in the lookup. */ if (aid[k] > cur_arc) break; if (aid[k] == cur_arc) { switch(adjacent[i - 1]) { case PARENT: prior = bkwd[k]; break; case CHILD: prior = fwd[k]; break; default: prior = 1 - bkwd[k] - fwd[k]; }/*SWITCH*/ break; }/*THEN*/ }/*FOR*/ if (*debuglevel > 0) { switch(adjacent[i - 1]) { case PARENT: Rprintf(" > found arc %s -> %s, prior pobability is %lf.\n", NODE(i - 1), NODE(t - 1), prior); break; case CHILD: Rprintf(" > found arc %s -> %s, prior probability is %lf.\n", NODE(t - 1), NODE(i - 1), prior); break; default: Rprintf(" > no arc between %s and %s, prior probability is %lf.\n", NODE(t - 1), NODE(i - 1), prior); }/*SWITCH*/ }/*THEN*/ /* move to log-scale and divide by the non-informative log(1/3), so that * the contribution of each arc whose prior has not been not specified by * the user is zero; overflow is likely otherwise. */ result += log(prior / ((double)1/3)); }/*FOR*/ return result; }/*CASTELO_PRIOR*/ /* complete a prior as per Castelo & Siebes. */ SEXP castelo_completion(SEXP prior, SEXP nodes) { int i = 0, k = 0, cur = 0, narcs1 = 0, narcs2 = 0, nnodes = LENGTH(nodes); int *m1 = NULL, *m2 = NULL, *und = NULL, *aid = NULL, *poset = NULL, *id = NULL; double *d1 = NULL, *d2 = NULL, *p = NULL; SEXP df, arc_id, undirected, a1, a2, match1, match2, prob; SEXP result, colnames, from, to, nid, dir1, dir2; /* compute numeric IDs for the arcs. */ a1 = VECTOR_ELT(prior, 0); a2 = VECTOR_ELT(prior, 1); narcs1 = LENGTH(a1); PROTECT(match1 = match(nodes, a1, 0)); PROTECT(match2 = match(nodes, a2, 0)); m1 = INTEGER(match1); m2 = INTEGER(match2); PROTECT(arc_id = allocVector(INTSXP, narcs1)); aid = INTEGER(arc_id); c_arc_hash(&narcs1, &nnodes, m1, m2, aid, NULL, TRUE); /* duplicates correspond to undirected arcs. */ PROTECT(undirected = dupe(arc_id)); und = INTEGER(undirected); /* extract the components from the prior. */ prob = VECTOR_ELT(prior, 2); p = REAL(prob); /* count output arcs. */ for (i = 0; i < narcs1; i++) narcs2 += 2 - und[i]; narcs2 /= 2; /* allocate the columns of the return value. */ PROTECT(from = allocVector(STRSXP, narcs2)); PROTECT(to = allocVector(STRSXP, narcs2)); PROTECT(nid = allocVector(INTSXP, narcs2)); id = INTEGER(nid); PROTECT(dir1 = allocVector(REALSXP, narcs2)); d1 = REAL(dir1); PROTECT(dir2 = allocVector(REALSXP, narcs2)); d2 = REAL(dir2); /* sort the strength coefficients. */ poset = alloc1dcont(narcs1); for (k = 0; k < narcs1; k++) poset[k] = k; R_qsort_int_I(aid, poset, 1, narcs1); for (i = 0, k = 0; i < narcs1; i++) { cur = poset[i]; #define ASSIGN(A1, A2, D1, D2) \ SET_STRING_ELT(from, k, STRING_ELT(A1, cur)); \ SET_STRING_ELT(to, k, STRING_ELT(A2, cur)); \ id[k] = aid[i]; \ D1[k] = p[cur]; \ if ((und[cur] == TRUE) && (i < narcs1 - 1)) \ D2[k] = p[poset[++i]]; \ else \ D2[k] = (1 - D1[k])/2; /* copy the node labels. */ if (m1[cur] < m2[cur]) { ASSIGN(a1, a2, d1, d2); }/*THEN*/ else { ASSIGN(a2, a1, d2, d1); }/*ELSE*/ if (d1[k] + d2[k] > 1) { UNPROTECT(9); error("the probabilities for arc %s -> %s sum to %lf.", CHAR(STRING_ELT(from, k)), CHAR(STRING_ELT(to, k)), d1[k] + d2[k]); }/*THEN*/ /* move to the next arc. */ k++; }/*FOR*/ /* set up the return value. */ PROTECT(result = allocVector(VECSXP, 5)); SET_VECTOR_ELT(result, 0, from); SET_VECTOR_ELT(result, 1, to); SET_VECTOR_ELT(result, 2, nid); SET_VECTOR_ELT(result, 3, dir1); SET_VECTOR_ELT(result, 4, dir2); PROTECT(colnames = allocVector(STRSXP, 5)); SET_STRING_ELT(colnames, 0, mkChar("from")); SET_STRING_ELT(colnames, 1, mkChar("to")); SET_STRING_ELT(colnames, 2, mkChar("aid")); SET_STRING_ELT(colnames, 3, mkChar("fwd")); SET_STRING_ELT(colnames, 4, mkChar("bkwd")); setAttrib(result, R_NamesSymbol, colnames); PROTECT(df = minimal_data_frame(result)); UNPROTECT(12); return df; }/*CASTELO_COMPLETION*/
/* predict the value of the training variable in a naive Bayes or Tree-Augmented * naive Bayes classifier. */ SEXP naivepred(SEXP fitted, SEXP data, SEXP parents, SEXP training, SEXP prior, SEXP prob, SEXP debug) { int i = 0, j = 0, k = 0, n = 0, nvars = LENGTH(fitted), nmax = 0, tr_nlevels = 0; int *res = NULL, **ex = NULL, *ex_nlevels = NULL; int idx = 0, *tr_id = INTEGER(training); int *iscratch = NULL, *maxima = NULL, *prn = NULL, *debuglevel = LOGICAL(debug); int *include_prob = LOGICAL(prob); double **cpt = NULL, *pr = NULL, *scratch = NULL, *buf = NULL, *pt = NULL; double sum = 0; SEXP class, temp, tr, tr_levels, result, nodes, probtab, dimnames; /* cache the node labels. */ nodes = getAttrib(fitted, R_NamesSymbol); /* cache the pointers to all the variables. */ ex = (int **) alloc1dpointer(nvars); ex_nlevels = alloc1dcont(nvars); for (i = 0; i < nvars; i++) { temp = VECTOR_ELT(data, i); ex[i] = INTEGER(temp); ex_nlevels[i] = NLEVELS(temp); }/*FOR*/ /* get the training variable and its levels. */ n = LENGTH(VECTOR_ELT(data, 0)); tr = getListElement(VECTOR_ELT(fitted, *tr_id - 1), "prob"); tr_levels = VECTOR_ELT(getAttrib(tr, R_DimNamesSymbol), 0); tr_nlevels = LENGTH(tr_levels); /* get the prior distribution. */ pr = REAL(prior); if (*debuglevel > 0) { Rprintf("* the prior distribution for the target variable is:\n"); PrintValue(prior); }/*THEN*/ /* allocate the scratch space used to compute posterior probabilities. */ scratch = alloc1dreal(tr_nlevels); buf = alloc1dreal(tr_nlevels); /* cache the pointers to the conditional probability tables. */ cpt = (double **) alloc1dpointer(nvars); for (i = 0; i < nvars; i++) cpt[i] = REAL(getListElement(VECTOR_ELT(fitted, i), "prob")); /* dereference the parents' vector. */ prn = INTEGER(parents); /* create the vector of indexes. */ iscratch = alloc1dcont(tr_nlevels); /* allocate the array for the indexes of the maxima. */ maxima = alloc1dcont(tr_nlevels); /* allocate the return value. */ PROTECT(result = allocVector(INTSXP, n)); res = INTEGER(result); /* allocate and initialize the table of the posterior probabilities. */ if (*include_prob > 0) { PROTECT(probtab = allocMatrix(REALSXP, tr_nlevels, n)); pt = REAL(probtab); memset(pt, '\0', n * tr_nlevels * sizeof(double)); }/*THEN*/ /* initialize the random seed, just in case we need it for tie breaking. */ GetRNGstate(); /* for each observation... */ for (i = 0; i < n; i++) { /* ... reset the scratch space and the indexes array... */ for (k = 0; k < tr_nlevels; k++) { scratch[k] = log(pr[k]); iscratch[k] = k + 1; }/*FOR*/ if (*debuglevel > 0) Rprintf("* predicting the value of observation %d.\n", i + 1); /* ... and for each conditional probability table... */ for (j = 0; j < nvars; j++) { /* ... skip the training variable... */ if (*tr_id == j + 1) continue; /* ... (this is the root node of the Chow-Liu tree) ... */ if (prn[j] == NA_INTEGER) { /* ... and for each row of the conditional probability table... */ for (k = 0; k < tr_nlevels; k++) { if (*debuglevel > 0) { Rprintf(" > node %s: picking cell %d (%d, %d) from the CPT (p = %lf).\n", NODE(j), CMC(ex[j][i] - 1, k, ex_nlevels[j]), ex[j][i], k + 1, cpt[j][CMC(ex[j][i] - 1, k, ex_nlevels[j])]); }/*THEN*/ /* ... update the posterior probability. */ scratch[k] += log(cpt[j][CMC(ex[j][i] - 1, k, ex_nlevels[j])]); }/*FOR*/ }/*THEN*/ else { /* ... and for each row of the conditional probability table... */ for (k = 0; k < tr_nlevels; k++) { /* (the first dimension corresponds to the current node [X], the second * to the training node [Y], the third to the only parent of the current * node [Z]; CMC coordinates are computed as X + Y * NX + Z * NX * NY. */ idx = (ex[j][i] - 1) + k * ex_nlevels[j] + (ex[prn[j] - 1][i] - 1) * ex_nlevels[j] * tr_nlevels; if (*debuglevel > 0) { Rprintf(" > node %s: picking cell %d (%d, %d, %d) from the CPT (p = %lf).\n", NODE(j), idx, ex[j][i], k + 1, ex[prn[j] - 1][i], cpt[j][idx]); }/*THEN*/ /* ... update the posterior probability. */ scratch[k] += log(cpt[j][idx]); }/*FOR*/ }/*ELSE*/ }/*FOR*/ /* find out the mode(s). */ all_max(scratch, tr_nlevels, maxima, &nmax, iscratch, buf); /* compute the posterior probabilities on the right scale, to attach them * to the return value. */ if (*include_prob) { /* copy the log-probabilities from scratch. */ memcpy(pt + i * tr_nlevels, scratch, tr_nlevels * sizeof(double)); /* transform log-probabilitiees into plain probabilities. */ for (k = 0, sum = 0; k < tr_nlevels; k++) sum += pt[i * tr_nlevels + k] = exp(pt[i * tr_nlevels + k] - scratch[maxima[0] - 1]); /* rescale them to sum up to 1. */ for (k = 0; k < tr_nlevels; k++) pt[i * tr_nlevels + k] /= sum; }/*THEN*/ if (nmax == 1) { res[i] = maxima[0]; if (*debuglevel > 0) { Rprintf(" @ prediction for observation %d is '%s' with (log-)posterior:\n", i + 1, CHAR(STRING_ELT(tr_levels, res[i] - 1))); Rprintf(" "); for (k = 0; k < tr_nlevels; k++) Rprintf(" %lf", scratch[k]); Rprintf("\n"); }/*THEN*/ }/*THEN*/ else { /* break ties: sample with replacement from all the maxima. */ SampleReplace(1, nmax, res + i, maxima); if (*debuglevel > 0) { Rprintf(" @ there are %d levels tied for prediction of observation %d, applying tie breaking.\n", nmax, i + 1); Rprintf(" "); for (k = 0; k < tr_nlevels; k++) Rprintf(" %lf", scratch[k]); Rprintf("\n"); Rprintf(" @ tied levels are:"); for (k = 0; k < nmax; k++) Rprintf(" %s", CHAR(STRING_ELT(tr_levels, maxima[k] - 1))); Rprintf(".\n"); }/*THEN*/ }/*ELSE*/ }/*FOR*/ /* save the state of the random number generator. */ PutRNGstate(); /* add back the attributes and the class to the return value. */ PROTECT(class = allocVector(STRSXP, 1)); SET_STRING_ELT(class, 0, mkChar("factor")); setAttrib(result, R_LevelsSymbol, tr_levels); setAttrib(result, R_ClassSymbol, class); if (*include_prob > 0) { /* set the levels of the taregt variable as rownames. */ PROTECT(dimnames = allocVector(VECSXP, 2)); SET_VECTOR_ELT(dimnames, 0, tr_levels); setAttrib(probtab, R_DimNamesSymbol, dimnames); /* add the posterior probabilities to the return value. */ setAttrib(result, install("prob"), probtab); UNPROTECT(4); }/*THEN*/ else { UNPROTECT(2); }/*ELSE*/ return result; }/*NAIVEPRED*/
/* unconditional Monte Carlo simulation for discrete tests. */ SEXP mcarlo(SEXP x, SEXP y, SEXP lx, SEXP ly, SEXP length, SEXP samples, SEXP test, SEXP alpha) { double *fact = NULL, *res = NULL, observed = 0; int *n = NULL, *ncolt = NULL, *nrowt = NULL, *workspace = NULL; int *num = INTEGER(length), *nr = INTEGER(lx), *nc = INTEGER(ly); int *xx = INTEGER(x), *yy = INTEGER(y), *B = INTEGER(samples); int i = 0, k = 0, npermuts = 0, enough = ceil(NUM(alpha) * (*B)) + 1; SEXP result; /* allocate and initialize the result. */ PROTECT(result = allocVector(REALSXP, 3)); res = REAL(result); res[0] = res[1] = res[2] = 0; // initial test score / p-value / nb permutations /* allocate and compute the factorials needed by rcont2. */ allocfact(*num); /* allocate and initialize the workspace for rcont2. */ workspace = alloc1dcont(*nc); /* initialize the contingency table. */ n = alloc1dcont(*nr * (*nc)); /* initialize the marginal frequencies. */ nrowt = alloc1dcont(*nr); ncolt = alloc1dcont(*nc); /* compute the joint frequency of x and y. */ for (k = 0; k < *num; k++) n[CMC(xx[k] - 1, yy[k] - 1, *nr)]++; /* compute the marginals. */ for (i = 0; i < *nr; i++) for (k = 0; k < *nc; k++) { nrowt[i] += n[CMC(i, k, *nr)]; ncolt[k] += n[CMC(i, k, *nr)]; }/*FOR*/ /* initialize the random number generator. */ GetRNGstate(); /* pick up the observed value of the test statistic, then generate a set of random contingency tables (given row and column totals) and check how many tests are greater than the original one.*/ switch(INT(test)) { case MUTUAL_INFORMATION: observed = _mi(n, nrowt, ncolt, nr, nc, num); for (k = 0; k < *B; k++) { rcont2(nr, nc, nrowt, ncolt, num, fact, workspace, n); if (_mi(n, nrowt, ncolt, nr, nc, num) > observed) { sequential_counter_check(res[1]); }/*THEN*/ npermuts++; }/*FOR*/ observed = 2 * observed; break; case PEARSON_X2: observed = _x2(n, nrowt, ncolt, nr, nc, num); for (k = 0; k < *B; k++) { rcont2(nr, nc, nrowt, ncolt, num, fact, workspace, n); if (_x2(n, nrowt, ncolt, nr, nc, num) > observed) { sequential_counter_check(res[1]); }/*THEN*/ npermuts++; }/*FOR*/ break; }/*SWITCH*/ PutRNGstate(); /* save the observed value of the statistic, the corresponding p-value, and the number of permutations performed. */ res[0] = observed; res[1] /= (*B); res[2] = npermuts; UNPROTECT(1); return result; }/*MCARLO*/
/* generate a connected graph with uniform probability, subject to some * constraints on the degree of the nodes. */ SEXP ide_cozman_graph(SEXP nodes, SEXP num, SEXP burn_in, SEXP max_in_degree, SEXP max_out_degree, SEXP max_degree, SEXP connected, SEXP debug) { int i = 0, k = 0, nnodes = LENGTH(nodes), *n = INTEGER(num); int changed = 0, *work = NULL, *arc = NULL, *a = NULL, *burn = INTEGER(burn_in); int *degree = NULL, *in_degree = NULL, *out_degree = NULL; int *debuglevel = LOGICAL(debug), *cozman = LOGICAL(connected); double *max_in = REAL(max_in_degree), *max_out = REAL(max_out_degree), *max = REAL(max_degree); SEXP list, res, args, argnames, amat, arcs, cached, debug2, null, temp; char *label = (*cozman > 0) ? "ic-dag" : "melancon"; /* a fake debug argument (set to FALSE) for cache_structure(). */ PROTECT(debug2 = allocVector(LGLSXP, 1)); LOGICAL(debug2)[0] = FALSE; /* the list of optional arguments. */ PROTECT(argnames = allocVector(STRSXP, 4)); SET_STRING_ELT(argnames, 0, mkChar("burn.in")); SET_STRING_ELT(argnames, 1, mkChar("max.in.degree")); SET_STRING_ELT(argnames, 2, mkChar("max.out.degree")); SET_STRING_ELT(argnames, 3, mkChar("max.degree")); PROTECT(args = allocVector(VECSXP, 4)); setAttrib(args, R_NamesSymbol, argnames); SET_VECTOR_ELT(args, 0, burn_in); SET_VECTOR_ELT(args, 1, max_in_degree); SET_VECTOR_ELT(args, 2, max_out_degree); SET_VECTOR_ELT(args, 3, max_degree); /* allocate and initialize the adjacency matrix. */ PROTECT(amat = allocMatrix(INTSXP, nnodes, nnodes)); a = INTEGER(amat); memset(a, '\0', nnodes * nnodes * sizeof(int)); /* initialize a simple ordered tree with n nodes, where all nodes * have just one parent, except the first one that does not have * any parent. */ for (i = 1; i < nnodes; i++) a[CMC(i - 1, i, nnodes)] = 1; /* allocate the arrays needed by SampleNoReplace. */ arc = alloc1dcont(2); work = alloc1dcont(nnodes); /* allocate and initialize the degree arrays. */ degree = alloc1dcont(nnodes); in_degree = alloc1dcont(nnodes); out_degree = alloc1dcont(nnodes); for (i = 0; i < nnodes; i++) { in_degree[i] = out_degree[i] = 1; degree[i] = 2; }/*FOR*/ in_degree[0] = out_degree[nnodes - 1] = 0; degree[0] = degree[nnodes - 1] = 1; GetRNGstate(); /* wait for the markov chain monte carlo simulation to reach stationarity. */ for (k = 0; k < *burn; k++) { if (*debuglevel > 0) Rprintf("* current model (%d):\n", k + 1); changed = ic_logic(a, nodes, &nnodes, arc, work, degree, max, in_degree, max_in, out_degree, max_out, cozman, debuglevel); /* print the model string to allow a sane debugging experience; note that this * has a huge impact on performance, so use it with care. */ if ((*debuglevel > 0) && (changed)) { PROTECT(null = allocVector(NILSXP, 1)); PROTECT(res = bn_base_structure(nodes, args, null, null, 0, "none", label)); PROTECT(arcs = amat2arcs(amat, nodes)); PROTECT(cached = cache_structure(nodes, amat, debug2)); SET_VECTOR_ELT(res, 1, cached); SET_VECTOR_ELT(res, 2, arcs); print_modelstring(res); UNPROTECT(4); }/*THEN*/ }/*FOR*/ #define UPDATE_NODE_CACHE(cur) \ if (*debuglevel > 0) \ Rprintf(" > updating cached information about node %s.\n", NODE(cur)); \ memset(work, '\0', nnodes * sizeof(int)); \ PROTECT(temp = c_cache_partial_structure(cur, nodes, amat, work, debug2)); \ SET_VECTOR_ELT(cached, cur, temp); \ UNPROTECT(1); /* return a list if more than one bn is generated. */ if (*n > 1) { if (*debuglevel > 0) Rprintf("* end of the burn-in iterations.\n"); PROTECT(list = allocVector(VECSXP, *n)); PROTECT(null = allocVector(NILSXP, 1)); /* generate the "bn" structure, with dummy NULLs for the "arcs" and * "nodes" elements (which will be initialized later on). */ PROTECT(res = bn_base_structure(nodes, args, null, null, 0, "none", label)); for (k = 0; k < *n; k++) { if (*debuglevel > 0) Rprintf("* current model (%d):\n", *burn + k + 1); changed = ic_logic(a, nodes, &nnodes, arc, work, degree, max, in_degree, max_in, out_degree, max_out, cozman, debuglevel); if (changed || (k == 0)) { /* generate the arc set and the cached information from the adjacency * matrix. */ if (k > 0) { /* if a complete "bn" object is available, we can retrieve the cached * information about the nodes from the structure stored in the last * iteration and update only the elements that really need it. */ temp = VECTOR_ELT(VECTOR_ELT(list, k - 1), 1); PROTECT(cached = duplicate(temp)); /* update the first sampled nodes; both of them gain/lose either * a parent or a child. */ UPDATE_NODE_CACHE(arc[0] - 1); UPDATE_NODE_CACHE(arc[1] - 1); /* all the parents of the second sampled node gain/lose a node in * the markov blanket (the first sampled node, which shares a child * with all of them). */ for (i = 0; i < nnodes; i++) { if ((i != arc[0] - 1) && (a[CMC(i, arc[1] - 1, nnodes)] == 1)) { UPDATE_NODE_CACHE(i); }/*THEN*/ }/*FOR*/ }/*THEN*/ else { PROTECT(cached = cache_structure(nodes, amat, debug2)); }/*ELSE*/ PROTECT(arcs = amat2arcs(amat, nodes)); SET_VECTOR_ELT(res, 1, cached); SET_VECTOR_ELT(res, 2, arcs); /* print the model string to allow a sane debugging experience. */ if (*debuglevel > 0) print_modelstring(res); /* save the structure in the list. */ PROTECT(temp = duplicate(res)); SET_VECTOR_ELT(list, k, temp); UNPROTECT(3); }/*THEN*/ else { /* the adjacency matrix is unchanged; so we can just copy the bayesian * network from the previous iteration in the k-th slot of the list. */ SET_VECTOR_ELT(list, k, VECTOR_ELT(list, k - 1)); }/*ELSE*/ }/*FOR*/ PutRNGstate(); UNPROTECT(7); return list; }/*THEN*/ else { if (*debuglevel > 0) Rprintf("* end of the burn-in.\n* current model (%d):\n", *burn + 1); ic_logic(a, nodes, &nnodes, arc, work, degree, max, in_degree, max_in, out_degree, max_out, cozman, debuglevel); /* generate the arc set and the cached information form the adjacency * matrix. */ PROTECT(arcs = amat2arcs(amat, nodes)); PROTECT(cached = cache_structure(nodes, amat, debug2)); /* generate the "bn" structure. */ PROTECT(res = bn_base_structure(nodes, args, arcs, cached, 0, "none", label)); /* print the model string to allow a sane debugging experience. */ if (*debuglevel > 0) print_modelstring(res); PutRNGstate(); UNPROTECT(7); return res; }/*ELSE*/ }/*IDE_COZMAN_GRAPH*/
/* Chow-Liu structure learning algorithm. */ SEXP chow_liu(SEXP data, SEXP nodes, SEXP estimator, SEXP whitelist, SEXP blacklist, SEXP conditional, SEXP debug) { int i = 0, j = 0, k = 0, debug_coord[2], ncols = LENGTH(data); int num = LENGTH(VECTOR_ELT(data, i)), narcs = 0, nwl = 0, nbl = 0; int *nlevels = NULL, *clevels = NULL, *est = INTEGER(estimator); int *wl = NULL, *bl = NULL, *poset = NULL, *debuglevel = LOGICAL(debug); void **columns = NULL, *cond = NULL; short int *include = NULL; double *mim = NULL; SEXP arcs, wlist, blist; /* dereference the columns of the data frame. */ DEREFERENCE_DATA_FRAME() /* only TAN uses a conditional variable, so assume it's discrete and go ahead. */ if (conditional != R_NilValue) { cond = (void *) INTEGER(conditional); clevels = alloc1dcont(1); *clevels = NLEVELS(conditional); }/*THEN*/ /* allocate the mutual information matrix and the status vector. */ mim = alloc1dreal(UPTRI3_MATRIX(ncols)); include = allocstatus(UPTRI3_MATRIX(ncols)); /* compute the pairwise mutual information coefficients. */ if (*debuglevel > 0) Rprintf("* computing pairwise mutual information coefficients.\n"); mi_matrix(mim, columns, ncols, nlevels, &num, cond, clevels, est); LIST_MUTUAL_INFORMATION_COEFS() /* add whitelisted arcs first. */ if ((!isNull(whitelist)) && (LENGTH(whitelist) > 0)) { PROTECT(wlist = arc_hash(whitelist, nodes, TRUE, TRUE)); wl = INTEGER(wlist); nwl = LENGTH(wlist); for (i = 0; i < nwl; i++) { if (*debuglevel > 0) { Rprintf("* adding whitelisted arcs first.\n"); if (include[wl[i]] == 0) { Rprintf(" > arc %s - %s has been added to the graph.\n", CHAR(STRING_ELT(whitelist, i)), CHAR(STRING_ELT(whitelist, i + nwl))); }/*THEN*/ else { Rprintf(" > arc %s - %s was already present in the graph.\n", CHAR(STRING_ELT(whitelist, i)), CHAR(STRING_ELT(whitelist, i + nwl))); }/*ELSE*/ }/*THEN*/ /* update the counter if need be. */ if (include[wl[i]] == 0) narcs++; /* include the arc in the graph. */ include[wl[i]] = 1; }/*FOR*/ UNPROTECT(1); }/*THEN*/ /* cache blacklisted arcs. */ if ((!isNull(blacklist)) && (LENGTH(blacklist) > 0)) { PROTECT(blist = arc_hash(blacklist, nodes, TRUE, TRUE)); bl = INTEGER(blist); nbl = LENGTH(blist); }/*THEN*/ /* sort the mutual information coefficients and keep track of the elements' index. */ poset = alloc1dcont(UPTRI3_MATRIX(ncols)); for (i = 0; i < UPTRI3_MATRIX(ncols); i++) poset[i] = i; R_qsort_I(mim, poset, 1, UPTRI3_MATRIX(ncols)); for (i = UPTRI3_MATRIX(ncols) - 1; i > 0; i--) { /* get back the coordinates from the position in the half-matrix. */ INV_UPTRI3(poset[i], ncols, debug_coord); /* already included all the arcs we had to, exiting. */ if (narcs >= ncols - 1) break; /* arc already present in the graph, nothing to do. */ if (include[poset[i]] == 1) continue; if (bl) { if (chow_liu_blacklist(bl, &nbl, poset + i)) { if (*debuglevel > 0) { Rprintf("* arc %s - %s is blacklisted, skipping.\n", NODE(debug_coord[0]), NODE(debug_coord[1])); }/*THEN*/ continue; }/*THEN*/ }/*THEN*/ if (c_uptri3_path(include, debug_coord[0], debug_coord[1], ncols, nodes, FALSE)) { if (*debuglevel > 0) { Rprintf("* arc %s - %s introduces cycles, skipping.\n", NODE(debug_coord[0]), NODE(debug_coord[1])); }/*THEN*/ continue; }/*THEN*/ if (*debuglevel > 0) { Rprintf("* adding arc %s - %s with mutual information %lf.\n", NODE(debug_coord[0]), NODE(debug_coord[1]), mim[i]); }/*THEN*/ /* include the arc in the graph. */ include[poset[i]] = 1; /* update the counter. */ narcs++; }/*FOR*/ if ((!isNull(blacklist)) && (LENGTH(blacklist) > 0)) UNPROTECT(1); /* sanity check for blacklist-related madnes. */ if (narcs != ncols - 1) error("learned %d arcs instead of %d, this is not a tree spanning all the nodes.", narcs, ncols - 1); CONVERT_TO_ARC_SET(include, 0, 2 * (ncols - 1)); return arcs; }/*CHOW_LIU*/