/* return the complete orientation of a graph (the nodes argument gives * the node ordering). */ SEXP pdag2dag(SEXP arcs, SEXP nodes) { int i = 0, j = 0, n = length(nodes); int *a = NULL; SEXP amat, res; /* build the adjacency matrix. */ PROTECT(amat = arcs2amat(arcs, nodes)); a = INTEGER(amat); /* scan the adjacency matrix. */ for (i = 0; i < n; i++) { for (j = i + 1; j < n; j++) { /* if an arc is undirected, kill the orientation that violates the * specified node ordering (the one which is located in the lower * half of the matrix). */ if ((a[CMC(i, j, n)] == 1) && (a[CMC(j, i, n)] == 1)) a[CMC(j, i, n)] = 0; }/*FOR*/ }/*FOR*/ /* build the return value. */ PROTECT(res = amat2arcs(amat, nodes)); UNPROTECT(2); return res; }/*PDAG2DAG*/
static int all_adjacent(int *a, int node, int k, int nnodes, int *nbr) { int j = 0, l = 0; for (j = 0; j < k; j++) { /* for every node that is connected to the current node by an undirected * arc, we need to check that that node is adjacent to all other nodes * that are adjacent to the current node; the implication is that we can * skip nodes that are connected to the current node by a directed arc. */ if ((a[CMC(nbr[j], node, nnodes)] == 0) || (a[CMC(node, nbr[j], nnodes)] == 0)) continue; for (l = 0; l < k; l++) { if (l == j) continue; if ((a[CMC(nbr[j], nbr[l], nnodes)] == 0) && (a[CMC(nbr[l], nbr[j], nnodes)] == 0)) { /* this node violates the condition above. */ return FALSE; }/*THEN*/ }/*FOR*/ }/*FOR*/ return TRUE; }/*ALL_ADJACENT*/
/* determine whether a graph is DAG or a PDAG/UG. */ SEXP is_dag(SEXP arcs, SEXP nnodes) { int i = 0, nrows = length(arcs)/2, n = INT(nnodes); int *a = INTEGER(arcs); short int *checklist = NULL; /* allocate and initialize the checklist. */ checklist = allocstatus(UPTRI_MATRIX(n)); for (i = 0; i < nrows; i++) { if (checklist[UPTRI(a[CMC(i, 0, nrows)], a[CMC(i, 1, nrows)], n)] == 0) { /* this arc is not present in the checklist; add it. */ checklist[UPTRI(a[CMC(i, 0, nrows)], a[CMC(i, 1, nrows)], n)] = 1; }/*THEN*/ else { /* this arc or its opposite already present in the checklist; the graph * has at least an undirected arc, so return FALSE. */ return ScalarLogical(FALSE); }/*THEN*/ }/*FOR*/ return ScalarLogical(TRUE); }/*IS_DAG*/
static void is_a_sink(int *a, int node, int *k, int nnodes, int *nbr, short int *matched) { int j = 0; /* check whether the current node has outgoing arcs. */ for (j = 0, *k = 0; j < nnodes; j++) { /* nodes that has satisfied the conditions and had their undirected arcs * changed into directed arcs should be ignored in later iterations, along * with any incident arcs. */ if (matched[j] != 0) continue; if ((a[CMC(j, node, nnodes)] == 0) && (a[CMC(node, j, nnodes)] == 1)) { /* this node is not a candidate, go to the next one. */ *k = -1; break; }/*THEN*/ else if ((a[CMC(j, node, nnodes)] == 1) || (a[CMC(node, j, nnodes)] == 1)) { /* save adjacent nodes (connected by either an undirected or a directed * arc). */ nbr[(*k)++] = j; }/*THEN*/ }/*FOR*/ }/*IS_A_SINK*/
void build_tau(double **data, double *tau, int *ncols, int *nrows, int *imaginary, double *phi) { int i = 0, j = 0, res_ncols = *ncols + 1; double temp = 0; double *mean = NULL, *mat = NULL; /* allocate mean vector and covariance matrix. */ mean = alloc1dreal(*ncols); mat = alloc1dreal((*ncols) * (*ncols)); /* compute the mean values. */ for (i = 0; i < *ncols; i++) { for (j = 0 ; j < *nrows; j++) mean[i] += data[i][j]; mean[i] /= (*nrows); }/*FOR*/ /* compute the covariance matrix... */ c_covmat(data, mean, ncols, nrows, mat); /* ... multiply it by the phi coefficient... */ for (i = 0; i < *ncols; i++) for (j = 0; j < *ncols; j++) mat[CMC(i, j, *ncols)] *= (*phi); /* ... compute the pseudoinverse... */ c_ginv(mat, ncols, mat); /* ... and store it in the bottom-right corner of the tau matrix. */ for (i = 1; i < res_ncols; i++) for (j = 1; j < res_ncols; j++) tau[CMC(i, j, res_ncols)] = mat[CMC(i - 1, j - 1, *ncols)]; /* fill the top-right and bottom-left corners. */ for (i = 1; i < *ncols + 1; i++) { temp = 0; for (j = 0; j < *ncols; j++) temp += mean[j] * mat[CMC(j, i - 1, *ncols)]; tau[CMC(i, 0, res_ncols)] = tau[CMC(0, i, res_ncols)] = -temp; }/*FOR*/ /* fill the top-left corner. */ for (i = 1; i < res_ncols; i++) tau[CMC(0, 0, res_ncols)] += - mean[i - 1] * tau[CMC(i, 0, res_ncols)]; tau[CMC(0, 0, res_ncols)] += 1/((double) *imaginary); /* perform the final (pseudo)inversion. */ c_ginv(tau, &res_ncols, tau); }/*BUILD_TAU*/
/* C-level function to compute Moore-Penrose Generalized Inverse of a square matrix. */ void c_ginv(double *covariance, int ncols, double *mpinv) { int i = 0, j = 0, errcode = 0; double *u = NULL, *d = NULL, *vt = NULL, *backup = NULL; double sv_tol = 0, zero = 0, one = 1; char transa = 'N', transb = 'N'; c_udvt(&u, &d, &vt, ncols); if (covariance != mpinv) { backup = Calloc1D(ncols * ncols, sizeof(double)); memcpy(backup, covariance, ncols * ncols * sizeof(double)); }/*THEN*/ /* compute the SVD decomposition. */ c_svd(covariance, u, d, vt, &ncols, &ncols, &ncols, FALSE, &errcode); /* if SVD fails, catch the error code and free all buffers. */ if (errcode == 0) { /* set the threshold for the singular values as in corpcor. */ sv_tol = ncols * d[0] * MACHINE_TOL * MACHINE_TOL; /* the first multiplication, U * D^{-1} is easy. */ for (i = 0; i < ncols; i++) for (j = 0; j < ncols; j++) u[CMC(i, j, ncols)] = u[CMC(i, j, ncols)] * ((d[j] > sv_tol) ? 1/d[j] : 0); /* the second one, (U * D^{-1}) * Vt is a real matrix multiplication. */ F77_CALL(dgemm)(&transa, &transb, &ncols, &ncols, &ncols, &one, u, &ncols, vt, &ncols, &zero, mpinv, &ncols); }/*THEN*/ if (covariance != mpinv) { memcpy(covariance, backup, ncols * ncols * sizeof(double)); Free1D(backup); }/*THEN*/ Free1D(u); Free1D(d); Free1D(vt); if (errcode) error("an error (%d) occurred in the call to c_ginv().\n", errcode); }/*C_GINV*/
/* multinomial loss for a single node. */ double c_dloss(int *cur, SEXP cur_parents, int *configs, double *prob, SEXP data, SEXP nodes, int ndata, int nlevels, double *per_sample) { int i = 0, dropped = 0, *obs = NULL; double logprob = 0, result = 0; SEXP temp_df; /* get the target variable. */ obs = INTEGER(VECTOR_ELT(data, *cur)); /* get the parents' configurations. */ if (LENGTH(cur_parents) > 0) { PROTECT(temp_df = c_dataframe_column(data, cur_parents, FALSE, FALSE)); cfg(temp_df, configs, NULL); for (i = 0; i < ndata; i++) { logprob = log(prob[CMC(obs[i] - 1, configs[i], nlevels)]); if (!R_FINITE(logprob) || ISNAN(logprob)) dropped++; else result += logprob; if (per_sample) per_sample[i] += logprob; }/*FOR*/ UNPROTECT(1); }/*THEN*/ else { for (i = 0; i < ndata; i++) { logprob = log(prob[obs[i] - 1]); if (!R_FINITE(logprob) || ISNAN(logprob)) dropped++; else result += logprob; if (per_sample) per_sample[i] += logprob; }/*FOR*/ }/*ELSE*/ /* switch to the negentropy. */ result /= -(ndata - dropped); /* print a warning if data were dropped. */ if (dropped > 0) warning("%d observations were dropped because the corresponding probabilities for node %s were 0 or NaN.", dropped, NODE(*cur)); return result; }/*C_DLOSS*/
/* enumerate all subsets of a certain size (R interface). */ SEXP r_subsets(SEXP elems, SEXP size) { int i = 0, k = 0, n = length(elems), r = INT(size), *id = NULL; double nsub = choose(n, r); SEXP result; if (nsub * r > INT_MAX) error("too many subsets of size %d.", r); /* allocate the scratch space and the return value. */ id = Calloc(r, int); PROTECT(result = allocMatrix(STRSXP, nsub, r)); /* iterate over subsets. */ first_subset(id, r, 0); for (k = 0; k < nsub; k++) { for (i = 0; i < r; i++) SET_STRING_ELT(result, CMC(k, i, nsub), STRING_ELT(elems, id[i])); next_subset(id, r, n, 0); }/*FOR*/ Free(id); UNPROTECT(1); return result; }/*R_SUBSETS*/
SEXP smart_network_averaging(SEXP arcs, SEXP nodes, SEXP weights) { int k = 0, from = 0, to = 0, nrows = length(arcs) / 2, dims = length(nodes); int *a = NULL, *coords = NULL, *poset = NULL, *path = NULL, *scratch = NULL; double *w = NULL; SEXP weights2, amat, try, acyclic; /* allocate and initialize the adjacency matrix. */ PROTECT(amat = allocMatrix(INTSXP, dims, dims)); a = INTEGER(amat); memset(a, '\0', sizeof(int) * dims * dims); /* match the node labels in the arc set. */ PROTECT(try = match(nodes, arcs, 0)); coords = INTEGER(try); /* duplicate the weights to preserve the original ones. */ PROTECT(weights2 = duplicate(weights)); w = REAL(weights2); /* sort the strength coefficients. */ poset = Calloc1D(nrows, sizeof(int)); for (k = 0; k < nrows; k++) poset[k] = k; R_qsort_I(w, poset, 1, nrows); /* allocate buffers for c_has_path(). */ path = Calloc1D(dims, sizeof(int)); scratch = Calloc1D(dims, sizeof(int)); /* iterate over the arcs in reverse order wrt their strength coefficients. */ for (k = 0; k < nrows; k++) { from = coords[poset[k]] - 1; to = coords[poset[k] + nrows] - 1; /* add an arc only if it does not introduce cycles. */ if (!c_has_path(to, from, a, dims, nodes, FALSE, TRUE, path, scratch, FALSE)) a[CMC(from, to, dims)] = 1; else warning("arc %s -> %s would introduce cycles in the graph, ignoring.", NODE(from), NODE(to)); }/*FOR*/ /* convert the adjacency matrix back to an arc set and return it. */ acyclic = amat2arcs(amat, nodes); Free1D(path); Free1D(scratch); Free1D(poset); UNPROTECT(3); return acyclic; }/*SMART_NETWORK_AVERAGING*/
/* compute Pearson's X^2 coefficient from the joint and marginal frequencies. */ static double _x2(int *n, int *nrowt, int *ncolt, int *nrows, int *ncols, int *length) { int i = 0, j = 0; double res = 0; for (i = 0; i < *nrows; i++) for (j = 0; j < *ncols; j++) { if (n[CMC(i, j, *nrows)] != 0) res += (n[CMC(i, j, *nrows)] - nrowt[i] * (double)ncolt[j] / (*length)) * (n[CMC(i, j, *nrows)] - nrowt[i] * (double)ncolt[j] / (*length)) / (nrowt[i] * (double)ncolt[j] / (*length)); }/*FOR*/ return res; }/*_X2*/
/* shrink the covariance matrix (except the diagonal, which stays the same). */ void covmat_shrink(double *var, int ncols, double lambda) { int i = 0, j = 0; for (i = 0; i < ncols; i++) for (j = 0; j < ncols; j++) if (i != j) var[CMC(i, j, ncols)] *= 1 - lambda; }/*COVMAT_SHRINK*/
/* compute the shrinkage intensity lambda for a covariance matrix. */ double covmat_lambda(double **column, double *mean, double *var, int n, int ncols) { int i = 0, j = 0, k = 0, cur = 0; long double lambda = 0, sumcors = 0, sumvars = 0, temp = 0; for (i = 0; i < ncols; i++) { for (j = i; j < ncols; j++) { cur = CMC(i, j, ncols); if (i != j) { /* do the first round of computations for the shrinkage intensity. */ for (k = 0; k < n; k++) { temp = (column[i][k] - mean[i]) * (column[j][k] - mean[j]) - (var[cur] * (double)(n - 1) / (double)n); sumvars += temp * temp; }/*FOR*/ sumcors += var[cur] * var[cur]; }/*THEN*/ }/*FOR*/ }/*FOR*/ if (sumvars > MACHINE_TOL) { /* compute lambda, the shrinkage intensity, on a log-scale for numerical * stability (if lambda is equal to zero, just keep it as it is). */ lambda = exp(log(sumvars) + log((double)n) - 3 * log((double)(n - 1)) - log(sumcors)); /* truncate the shrinkage intensity in the [0,1] interval; this is not an * error, but a measure to increase the quality of the shrinked estimate. */ TRUNCATE_LAMBDA(lambda); }/*THEN*/ else { lambda = 0; }/*ELSE*/ return (double)lambda; }/*COVMAT_LAMBDA*/
/* compute the mutual information from the joint and marginal frequencies. */ static double _mi(int *n, int *nrowt, int *ncolt, int *nrows, int *ncols, int *length) { int i = 0, j = 0; double res = 0; for (i = 0; i < *nrows; i++) for (j = 0; j < *ncols; j++) res += MI_PART(n[CMC(i, j, *nrows)], nrowt[i], ncolt[j], *length); return res; }/*_MI*/
/* check if a square matrix is symmetric in a zero-copy way. */ SEXP is_symmetric(SEXP matrix) { int i = 0, j = 0, n = nrows(matrix); double *m = NULL; SEXP result; /* dereference the matrix. */ m = REAL(matrix); /* allocate and initialize the return value. */ PROTECT(result = allocVector(LGLSXP, 1)); LOGICAL(result)[0] = TRUE; for (i = 0; i < n; i++) { for (j = i + 1; j < n; j++) { /* two cells do no match; it's useless to go on, set the return * value to FALSE and jump at the end of the function. */ if (m[CMC(i, j, n)] != m[CMC(j, i, n)]) { LOGICAL(result)[0] = FALSE; goto end; }/*THEN*/ }/*FOR*/ }/*FOR*/ end: UNPROTECT(1); return result; }/*IS_SYMMETRIC*/
/* adjusted arc counting for boot.strength(). */ SEXP bootstrap_strength_counters(SEXP prob, SEXP weight, SEXP arcs, SEXP nodes) { int i = 0, j = 0, n = length(nodes), *a = NULL; double *p = NULL, *w = NULL; SEXP amat; /* build the adjacency matrix for the current network. */ PROTECT(amat = arcs2amat(arcs, nodes)); /* map the contents of the SEXPs for easy access. */ a = INTEGER(amat); p = REAL(prob); w = REAL(weight); for (i = 0; i < n; i++) { for (j = 0; j < n; j++) { /* increase the counter of 1/2 for an undirected arc (the other half * is added to the symmetric element in the matrix) or of 1 for a * direcxted arc. */ if (a[CMC(i, j, n)] == 1) { if (a[CMC(j, i, n)] == 1) p[CMC(i, j, n)] += 0.5 * (*w); else p[CMC(i, j, n)] += 1 * (*w); }/*THEN*/ }/*FOR*/ }/*FOR*/ UNPROTECT(1); return prob; }/*BOOTSTRAP_STRENGTH*/
/* compute the Pearson's conditional X^2 coefficient from the joint and marginal frequencies. */ static double _cx2(int **n, int **nrowt, int **ncolt, int *ncond, int *nr, int *nc, int *nl) { int i = 0, j = 0, k = 0; double res = 0; for (k = 0; k < *nl; k++) for (j = 0; j < *nc; j++) for (i = 0; i < *nr; i++) { if (n[k][CMC(i, j, *nr)] != 0) { res += (n[k][CMC(i, j, *nr)] - nrowt[k][i] * (double)ncolt[k][j] / ncond[k]) * (n[k][CMC(i, j, *nr)] - nrowt[k][i] * (double)ncolt[k][j] / ncond[k]) / (nrowt[k][i] * (double)ncolt[k][j] / ncond[k]); }/*THEN*/ }/*FOR*/ return res; }/*_CX2*/
/* compute the conditional mutual information from the joint and marginal frequencies. */ static double _cmi(int **n, int **nrowt, int **ncolt, int *ncond, int *nr, int *nc, int *nl) { int i = 0, j = 0, k = 0; double res = 0; for (k = 0; k < *nl; k++) for (j = 0; j < *nc; j++) for (i = 0; i < *nr; i++) res += MI_PART(n[k][CMC(i, j, *nr)], nrowt[k][i], ncolt[k][j], ncond[k]); return res; }/*_CMI*/
/* check whether a symmetric square matrix is Cauchy-Schwarz compliant. */ SEXP is_cauchy_schwarz(SEXP matrix) { int i = 0, j = 0, n = nrows(matrix); double *m = NULL; SEXP result; /* dereference the matrix. */ m = REAL(matrix); /* allocate and initialize the return value. */ PROTECT(result = allocVector(LGLSXP, 1)); LOGICAL(result)[0] = TRUE; for (i = 0; i < n; i++) { for (j = i + 1; j < n; j++) { if (m[CMC(i, j, n)] * m[CMC(i, j, n)] > m[CMC(i, i, n)] * m[CMC(j, j, n)]) { LOGICAL(result)[0] = FALSE; goto end; }/*THEN*/ }/*FOR*/ }/*FOR*/ end: UNPROTECT(1); return result; }/*IS_CAUCHY_SCHWARZ*/
void CondProbSampleReplace(int r, int c, double *p, int *conf, int *perm, int nans, int *ans, int *warn) { int i = 0, j = 0; double rU = 0; /* record element identities. */ for (i = 0; i < r; i ++) for (j = 0; j < c; j++) perm[CMC(i, j, r)] = i + 1; /* sort the probabilities into descending order. */ for (j = 0; j < c; j++) revsort(p + j * r, perm + j * r, r); /* compute cumulative probabilities. */ for (j = 0; j < c; j++) for (i = 1 ; i < r; i++) p[CMC(i, j, r)] += p[CMC(i - 1, j, r)]; /* compute the sample. */ for (i = 0; i < nans; i++) { /* check whether the parents' configuration is missing. */ if (conf[i] == NA_INTEGER) { ans[i] = NA_INTEGER; *warn = TRUE; continue; }/*THEN*/ /* check whether the conditional distribution is missing. */ if (ISNAN(p[CMC(0, conf[i], r)])) { ans[i] = NA_INTEGER; *warn = TRUE; continue; }/*THEN*/ rU = unif_rand(); for (j = 0; j < r; j++) if (rU <= p[CMC(j, conf[i], r)]) break; ans[i] = perm[CMC(j, conf[i], r)]; }/*FOR*/ }/*CONDPROBSAMPLEREPLACE*/
/* faster rbind() implementation for arc sets. */ SEXP arcs_rbind (SEXP matrix1, SEXP matrix2, SEXP reverse2) { int i = 0, j = 0, m1 = length(matrix1)/2, m2 = length(matrix2)/2; SEXP res; /* allocate the return value. */ PROTECT(res = allocMatrix(STRSXP, m1 + m2, 2)); /* allocate and initialize the column names. */ finalize_arcs(res); /* copy the elements of the first matrix. */ for (i = 0; i < m1; i++) for (j = 0; j < 2; j++) SET_STRING_ELT(res, CMC(i, j, m1 + m2), STRING_ELT(matrix1, CMC(i, j, m1))); /* copy the elements of the second matrix, reversing the order of the * columns as needed. */ if (isTRUE(reverse2)) { for (i = 0; i < m2; i++) for(j = 0; j < 2; j++) SET_STRING_ELT(res, CMC(i + m1, j, m1 + m2), STRING_ELT(matrix2, CMC(i, 1 - j, m2))); }/*THEN*/ else { for (i = 0; i < m2; i++) for(j = 0; j < 2; j++) SET_STRING_ELT(res, CMC(i + m1, j, m1 + m2), STRING_ELT(matrix2, CMC(i, j, m2))); }/*ELSE*/ UNPROTECT(1); return res; }/*ARCS_RBIND*/
SEXP is_pdag_acyclic(SEXP arcs, SEXP nodes, SEXP return_nodes, SEXP directed, SEXP debug) { int i = 0, j = 0, z = 0; int nrows = LENGTH(nodes); int check_status = nrows, check_status_old = nrows; int *rowsums = NULL, *colsums = NULL, *crossprod = NULL, *a = NULL; int *debuglevel = NULL; short int *status = NULL; SEXP amat; /* dereference the debug parameter. */ debuglevel = LOGICAL(debug); /* build the adjacency matrix from the arc set. */ if (*debuglevel > 0) Rprintf("* building the adjacency matrix.\n"); PROTECT(amat = arcs2amat(arcs, nodes)); a = INTEGER(amat); /* should we consider only directed arcs? */ if (isTRUE(directed)) { /* removing undirected arcs, so that only cycles made only of directed * arcs will make the function return TRUE. */ for (i = 0; i < nrows; i++) for (j = 0; j < nrows; j++) if ((a[CMC(i, j, nrows)] == 1) && (a[CMC(j, i, nrows)] == 1)) a[CMC(i, j, nrows)] = a[CMC(j, i, nrows)] = 0; }/*THEN*/ /* initialize the status, {row,col}sums and crossprod arrays. */ status = allocstatus(nrows); rowsums = alloc1dcont(nrows); colsums = alloc1dcont(nrows); crossprod = alloc1dcont(nrows); if (*debuglevel > 0) Rprintf("* checking whether the partially directed graph is acyclic.\n"); /* even in the worst case scenario at least two nodes are marked as * good at each iteration, so even ceil(nrows/2) iterations should be * enough. */ for (z = 0; z < nrows; z++) { start: if (*debuglevel > 0) Rprintf("* beginning iteration %d.\n", z + 1); for (i = 0; i < nrows; i++) { /* skip known-good nodes. */ if (status[i] == GOOD) continue; /* reset and update row and column totals. */ rowsums[i] = colsums[i] = crossprod[i] = 0; /* compute row and column totals for the i-th node. */ for (j = 0; j < nrows; j++) { rowsums[i] += a[CMC(i, j, nrows)]; colsums[i] += a[CMC(j, i, nrows)]; crossprod[i] += a[CMC(i, j, nrows)] * a[CMC(j, i, nrows)]; }/*FOR*/ there: if (*debuglevel > 0) Rprintf(" > checking node %s (%d child(ren), %d parent(s), %d neighbours).\n", NODE(i), rowsums[i], colsums[i], crossprod[i]); /* if either total is zero, the node is either a root node or a * leaf node, and is not part of any cycle. */ if (((rowsums[i] == 0) || (colsums[i] == 0)) || ((crossprod[i] == 1) && (rowsums[i] == 1) && (colsums[i] == 1))) { if (*debuglevel > 0) Rprintf(" @ node %s is cannot be part of a cycle.\n", NODE(i)); /* update the adjacency matrix and the row/column totals. */ for (j = 0; j < nrows; j++) a[CMC(i, j, nrows)] = a[CMC(j, i, nrows)] = 0; rowsums[i] = colsums[i] = crossprod[i] = 0; /* mark the node as good. */ status[i] = GOOD; check_status--; }/*THEN*/ else if (crossprod[i] == 1) { /* find the other of the undirected arc. */ for (j = 0; j < i; j++) if (a[CMC(i, j, nrows)] * a[CMC(j, i, nrows)] == 1) break; /* safety check, just in case. */ if (i == j) continue; if (((colsums[i] == 1) && (colsums[j] == 1)) || ((rowsums[i] == 1) && (rowsums[j] == 1))) { if (*debuglevel > 0) Rprintf(" @ arc %s - %s is cannot be part of a cycle.\n", NODE(i), NODE(j)); /* update the adjacency matrix and the row/column totals. */ a[CMC(i, j, nrows)] = a[CMC(j, i, nrows)] = 0; crossprod[i] = 0; rowsums[i]--; colsums[i]--; rowsums[j]--; colsums[j]--; /* jump back to the first check; if either the row or column total * was equal to 1 only because of the undirected arc, the node can * now be marked as good. */ if ((rowsums[i] == 0) || (colsums[i] == 0)) goto there; }/*THEN*/ }/*THEN*/ }/*FOR*/ /* at least three nodes are needed to have a cycle. */ if (check_status < 3) { if (*debuglevel > 0) Rprintf("@ at least three nodes are needed to have a cycle.\n"); UNPROTECT(1); return build_return_array(nodes, status, nrows, check_status, return_nodes); }/*THEN*/ /* if there are three or more bad nodes and there was no change in * the last iteration, the algorithm is stuck on a cycle. */ if (check_status_old == check_status) { if (*debuglevel > 0) Rprintf("@ no change in the last iteration.\n"); /* give up and call c_has_path() to kill some undirected arcs. */ for (i = 0; i < nrows; i++) for (j = 0; j < i; j++) if (a[CMC(i, j, nrows)] * a[CMC(j, i, nrows)] == 1) { /* remove the arc from the adjacency matrix while testing it, * there's a path is always found (the arc itself). */ a[CMC(i, j, nrows)] = a[CMC(j, i, nrows)] = 0; if(!c_has_path(i, j, INTEGER(amat), nrows, nodes, FALSE, TRUE, FALSE) && !c_has_path(j, i, INTEGER(amat), nrows, nodes, FALSE, TRUE, FALSE)) { if (*debuglevel > 0) Rprintf("@ arc %s - %s is not part of any cycle, removing.\n", NODE(i), NODE(j)); /* increase the iteration counter and start again. */ z++; goto start; }/*THEN*/ else { /* at least one cycle is really present; give up and return. */ UNPROTECT(1); return build_return_array(nodes, status, nrows, check_status, return_nodes); }/*ELSE*/ }/*THEN*/ /* give up if there are no undirected arcs, cycles composed * entirely by directed arcs are never false positives. */ UNPROTECT(1); return build_return_array(nodes, status, nrows, check_status, return_nodes); }/*THEN*/ else { check_status_old = check_status; }/*ELSE*/ }/*FOR*/ UNPROTECT(1); return build_return_array(nodes, status, nrows, check_status, return_nodes); }/*IS_PDAG_ACYCLIC*/
/* Shrinked Covariance Matrix. */ SEXP cov_lambda(SEXP data, SEXP length) { int i = 0, j = 0, k = 0, cur = 0; int *n = INTEGER(length), ncols = LENGTH(data); double *mean = NULL, *var = NULL, **column = NULL; double lambda = 0, sumcors = 0, sumvars = 0; SEXP res; /* allocate the covariance matrix. */ PROTECT(res = allocMatrix(REALSXP, ncols, ncols)); var = REAL(res); memset(var, '\0', ncols * ncols * sizeof(double)); /* allocate an array to store the mean values. */ mean = alloc1dreal(ncols); /* allocate and initialize an array of pointers for the variables. */ column = (double **) alloc1dpointer(ncols); for (i = 0; i < ncols; i++) column[i] = REAL(VECTOR_ELT(data, i)); /* compute the mean values */ for (i = 0; i < ncols; i++) { for (j = 0 ; j < *n; j++) mean[i] += column[i][j]; mean[i] /= (*n); }/*FOR*/ for (i = 0; i < ncols; i++) { for (j = i; j < ncols; j++) { cur = CMC(i, j, ncols); /* compute the actual variance/covariance. */ for (k = 0; k < *n; k++) var[cur] += (column[i][k] - mean[i]) * (column[j][k] - mean[j]); if (i != j) { /* do the first round of computations for the shrinkage intensity. */ for (k = 0; k < *n; k++) { sumvars += ((column[i][k] - mean[i]) * (column[j][k] - mean[j]) - var[cur] / (*n)) * ((column[i][k] - mean[i]) * (column[j][k] - mean[j]) - var[cur] / (*n)); }/*FOR*/ sumcors += (var[cur] / (*n - 1)) * (var[cur] / (*n - 1)); }/*THEN*/ /* use the unbiased estimator for variances/covariances. */ var[cur] /= (*n) - 1; /* fill in the symmetric element of the matrix. */ var[CMC(j, i, ncols)] = var[cur]; }/*FOR*/ }/*FOR*/ /* wrap up the computation of the shrinkage intensity. */ lambda = sumvars * (*n) / (*n - 1) / (*n -1) / (*n -1) / sumcors; /* truncate the shrinkage intensity in the [0,1] interval; this is not an * error, but a measure to increase the quality of the shrinked estimate. */ if (lambda > 1) { lambda = 1; }/*THEN*/ else if (lambda < 0) { lambda = 0; }/*THEN*/ /* shrink the covariance matrix (except the diagonal, which stays the same). */ for (i = 0; i < ncols; i++) for (j = 0; j < ncols; j++) if (i != j) var[CMC(i, j, ncols)] *= 1 - lambda; UNPROTECT(1); return res; }/*COV_LAMBDA*/
/** * Main driver. * Read in all program options from the user using boost::program_options and setup the simulation * cell, initial conditions and both the interaction and external potential. Either equilibrate or * restart a simulation, then start measuring. We output all the simulation parameters to disk as a * log file so that it can be restart again assigning it a unique PIMCID. * @see boost::program_options -- http://www.boost.org/doc/libs/1_43_0/doc/html/program_options.html */ int main (int argc, char *argv[]) { /* Get initial time */ time_t start_time = time(NULL); time_t current_time; //current time bool wallClockReached = false; uint32 seed = 139853; // The seed for the random number generator Setup setup; /* Attempt to parse the command line options */ try { setup.getOptions(argc,argv); } catch(exception& ex) { cerr << "error: " << ex.what() << "\n"; return 1; } catch(...) { cerr << "Exception of unknown type!\n"; } /* Parse the setup options and possibly exit */ if (setup.parseOptions()) return 1; /* The global random number generator, we add the process number to the seed (for * use in parallel simulations.*/ seed = setup.seed(seed); MTRand random(seed); /* Get the simulation box */ Container *boxPtr = NULL; boxPtr = setup.cell(); /* Create the worldlines */ if (setup.worldlines()) return 1; /* Setup the simulation constants */ setup.setConstants(); /* Setup the simulation communicator */ setup.communicator(); /* Get number of paths to use */ int Npaths = constants()->Npaths(); /* Create and initialize the Nearest Neighbor Lookup Table */ boost::ptr_vector<LookupTable> lookupPtrVec; for(int i=0; i<Npaths; i++){ lookupPtrVec.push_back( new LookupTable(boxPtr,constants()->numTimeSlices(), constants()->initialNumParticles())); } /* Create and initialize the potential pointers */ PotentialBase *interactionPotentialPtr = NULL; interactionPotentialPtr = setup.interactionPotential(); PotentialBase *externalPotentialPtr = NULL; externalPotentialPtr = setup.externalPotential(boxPtr); /* Get the initial conditions associated with the external potential */ /* Must use the copy constructor as we return a copy */ Array<dVec,1> initialPos = externalPotentialPtr->initialConfig(boxPtr,random,constants()->initialNumParticles()); externalPotentialPtr->output(9.0); exit(-1); // dVec pos; // pos = 0.0; // cout << "Testing Potential = " << externalPotentialPtr->V(pos) << endl; // exit(-1); /* Perform a classical canonical pre-equilibration to obtain a suitable * initial state */ if (!constants()->restart()) { ClassicalMonteCarlo CMC(externalPotentialPtr,interactionPotentialPtr,random,boxPtr, initialPos); CMC.run(constants()->numEqSteps(),0); } /* Setup the path data variable */ vector<Path *> pathPtrVec; for(int i=0; i<Npaths; i++){ pathPtrVec.push_back( new Path(boxPtr,lookupPtrVec[i],constants()->numTimeSlices(), initialPos,constants()->numBroken())); } /* The Trial Wave Function (constant for pimc) */ WaveFunctionBase *waveFunctionPtr = NULL; waveFunctionPtr = setup.waveFunction(*pathPtrVec.front()); /* Setup the action */ vector<ActionBase *> actionPtrVec; for(int i=0; i<Npaths; i++){ actionPtrVec.push_back( setup.action(*pathPtrVec[i],lookupPtrVec[i],externalPotentialPtr, interactionPotentialPtr,waveFunctionPtr) ); } /* The list of Monte Carlo updates (moves) that will be performed */ vector< boost::ptr_vector<MoveBase> * > movesPtrVec; for(int i=0; i<Npaths;i++){ movesPtrVec.push_back( setup.moves(*pathPtrVec[i],actionPtrVec[i],random).release()); } /* The list of estimators that will be performed */ /*vector< boost::ptr_vector<EstimatorBase> * > estimatorsPtrVec; for(int i=0; i<Npaths;i++){ estimatorsPtrVec.push_back( setup.estimators(*pathPtrVec[i],actionPtrVec[i],random).release()); if(i > 0) { for(uint32 j=0; j<estimatorsPtrVec.back()->size(); j++) estimatorsPtrVec.back()->at(j).appendLabel(str(format("%d") % (i+1))); } } */ /* The list of estimators that will be performed */ vector< boost::ptr_vector<EstimatorBase> * > estimatorsPtrVec; for(int i=0; i<Npaths;i++){ estimatorsPtrVec.push_back(setup.estimators(*pathPtrVec[i],actionPtrVec[i],random).release()); if(i>0){ stringstream tmpSS; for(unsigned j=0; j<estimatorsPtrVec.back()->size(); j++){ tmpSS.str(""); tmpSS << i+1 ; estimatorsPtrVec.back()->at(j).appendLabel(tmpSS.str()); } } } /* Setup the multi-path estimators */ if(Npaths>1){ estimatorsPtrVec.push_back(setup.multiPathEstimators(pathPtrVec,actionPtrVec).release()); } /* Setup the pimc object */ PathIntegralMonteCarlo pimc(pathPtrVec,random,movesPtrVec,estimatorsPtrVec, !setup.params["start_with_state"].as<string>().empty(), setup.params["bin_size"].as<int>()); /* If this is a fresh run, we equilibrate and output simulation parameters to disk */ if (!constants()->restart()) { /* Equilibrate */ cout << format("[PIMCID: %09d] - Equilibration Stage.") % constants()->id() << endl; for (uint32 n = 0; n < constants()->numEqSteps(); n++) pimc.equilStep(n,setup.params.count("relax"),setup.params.count("relaxmu")); /* Output simulation details/parameters */ setup.outputOptions(argc,argv,seed,boxPtr,lookupPtrVec.front().getNumNNGrid()); } cout << format("[PIMCID: %09d] - Measurement Stage.") % constants()->id() << endl; /* Sample */ int oldNumStored = 0; int outNum = 0; int numOutput = setup.params["output_config"].as<int>(); uint32 n = 0; do { pimc.step(); if (pimc.numStoredBins > oldNumStored) { oldNumStored = pimc.numStoredBins; cout << format("[PIMCID: %09d] - Bin #%4d stored to disk.") % constants()->id() % oldNumStored << endl; } n++; /* Output configurations to disk */ if ((numOutput > 0) && ((n % numOutput) == 0)) { pathPtrVec.front()->outputConfig(outNum); outNum++; } /* Check if we've reached the wall clock limit*/ if(constants()->wallClockOn()){ current_time = time(NULL); if ( uint32(current_time) > (uint32(start_time) + constants()->wallClock()) ){ wallClockReached = true; break; } } } while (pimc.numStoredBins < setup.params["number_bins_stored"].as<int>()); if (wallClockReached) cout << format("[PIMCID: %09d] - Wall clock limit reached.") % constants()->id() << endl; else cout << format("[PIMCID: %09d] - Measurement complete.") % constants()->id() << endl; /* Output Results */ if (!constants()->saveStateFiles()) pimc.saveStateFromStr(); pimc.finalOutput(); /* Free up memory */ delete interactionPotentialPtr; delete externalPotentialPtr; delete boxPtr; delete waveFunctionPtr; initialPos.free(); return 1; }
/* construct a consistent DAG extension of a CPDAG. */ SEXP pdag_extension(SEXP arcs, SEXP nodes, SEXP debug) { int i = 0, j = 0, k = 0, t = 0, nnodes = length(nodes); int changed = 0, left = nnodes; int *a = NULL, *nbr = NULL, debuglevel = isTRUE(debug); short int *matched = NULL; SEXP amat, result; /* build and dereference the adjacency matrix. */ PROTECT(amat = arcs2amat(arcs, nodes)); a = INTEGER(amat); /* allocate and initialize the neighbours and matched vectors. */ nbr = Calloc1D(nnodes, sizeof(int)); matched = Calloc1D(nnodes, sizeof(short int)); for (t = 0; t < nnodes; t++) { if (debuglevel > 0) { Rprintf("----------------------------------------------------------------\n"); Rprintf("> performing pass %d.\n", t + 1); Rprintf("> candidate nodes: "); for (j = 0; j < nnodes; j++) if (matched[j] == 0) Rprintf("%s ", NODE(j)); Rprintf("\n"); }/*THEN*/ for (i = 0; i < nnodes; i++) { /* if the node is already ok, skip it. */ if (matched[i] != 0) continue; /* check whether the node is a sink (that is, whether is does not have * any child). */ is_a_sink(a, i, &k, nnodes, nbr, matched); /* if the node is not a sink move on. */ if (k == -1) { if (debuglevel > 0) Rprintf(" * node %s is not a sink.\n", NODE(i)); continue; }/*THEN*/ else { if (debuglevel > 0) Rprintf(" * node %s is a sink.\n", NODE(i)); }/*ELSE*/ if (!all_adjacent(a, i, k, nnodes, nbr)) { if (debuglevel > 0) Rprintf(" * not all nodes linked to %s by an undirected arc are adjacent.\n", NODE(i)); continue; }/*THEN*/ else { if (debuglevel > 0) { if (k == 0) Rprintf(" * no node is linked to %s by an undirected arc.\n", NODE(i)); else Rprintf(" * all nodes linked to %s by an undirected arc are adjacent.\n", NODE(i)); }/*THEN*/ }/*ELSE*/ /* the current node meets all the conditions, direct all the arcs towards it. */ if (k == 0) { if (debuglevel > 0) Rprintf(" @ no undirected arc to direct towards %s.\n", NODE(i)); }/*THEN*/ else { for (j = 0; j < k; j++) a[CMC(i, nbr[j], nnodes)] = 0; if (debuglevel > 0) Rprintf(" @ directing all incident undirected arcs towards %s.\n", NODE(i)); }/*ELSE*/ /* set the changed flag. */ changed = 1; /* exclude the node from later iterations. */ matched[i] = 1; left--; }/*FOR*/ /* if nothing changed in the last iteration or there are no more candidate * nodes, there is nothing else to do. */ if ((changed == 0) || (left == 0)) break; else changed = 0; }/*FOR*/ /* build the new arc set from the adjacency matrix. */ PROTECT(result = amat2arcs(amat, nodes)); Free1D(nbr); Free1D(matched); UNPROTECT(2); return result; }/*PDAG_EXTENSION*/
/* mean strength for p-values and score deltas. */ static void mean_strength_overall(SEXP *mean_df, SEXP strength, SEXP nodes, int nrows, int nstr, SEXP ref_hash, double *w) { int i = 0, j = 0, *t = NULL; double *mstr = NULL, *cur_strength = NULL; long double cumw = 0; SEXP mean_str, cur, cur_hash, try; /* allocate the strength accumulator vector. */ PROTECT(mean_str = allocVector(REALSXP, nrows)); SET_VECTOR_ELT(*mean_df, 2, mean_str); mstr = REAL(mean_str); memset(mstr, '\0', nrows * sizeof(double)); for (i = 0; i < nstr; i++) { /* move to the next object. */ cur = VECTOR_ELT(strength, i); /* get the strength values from the bn.strength object. */ cur_strength = REAL(VECTOR_ELT(cur, 2)); /* get the arc IDs to use to correctly match strengths. */ PROTECT(cur_hash = arc_hash(cur, nodes, FALSE, FALSE)); /* match the current arc IDs to the reference arc IDs. */ PROTECT(try = match(ref_hash, cur_hash, 0)); t = INTEGER(try); for (j = 0; j < nrows; j++) mstr[t[j] - 1] += w[i] * cur_strength[j]; /* update the total weight mass. */ cumw += w[i]; UNPROTECT(2); }/*FOR*/ /* rescale by the total weight mass. */ for (j = 0; j < nrows; j++) mstr[j] /= cumw; UNPROTECT(1); }/*MEAN_STRENGTH_OVERALL*/ /* mean strength for bootstrap probabilities. */ static void mean_strength_direction(SEXP *mean_df, SEXP strength, SEXP nodes, int nrows, int nstr, SEXP ref_hash, double *w) { int i = 0, j = 0, *t = NULL, nnodes = length(nodes); double *mstr = NULL, *mdir = NULL, *cur_strength = NULL, *cur_dir = NULL; double fwd = 0, bkwd = 0; long double cumw = 0; SEXP mean_str, mean_dir, cur, cur_hash, try; /* allocate vectors for strength and direction. */ PROTECT(mean_str = allocVector(REALSXP, nrows)); SET_VECTOR_ELT(*mean_df, 2, mean_str); mstr = REAL(mean_str); memset(mstr, '\0', nrows * sizeof(double)); PROTECT(mean_dir = allocVector(REALSXP, nrows)); SET_VECTOR_ELT(*mean_df, 3, mean_dir); mdir = REAL(mean_dir); memset(mdir, '\0', nrows * sizeof(double)); for (i = 0; i < nstr; i++) { /* move to the next object. */ cur = VECTOR_ELT(strength, i); /* get the strength and direction values from the bn.strength object. */ cur_strength = REAL(VECTOR_ELT(cur, 2)); cur_dir = REAL(VECTOR_ELT(cur, 3)); /* get the arc IDs to use to correctly match strengths. */ PROTECT(cur_hash = arc_hash(cur, nodes, FALSE, FALSE)); /* match the current arc IDs to the reference arc IDs. */ PROTECT(try = match(ref_hash, cur_hash, 0)); t = INTEGER(try); for (j = 0; j < nrows; j++) mstr[t[j] - 1] += w[i] * (cur_strength[j] * cur_dir[j]); /* update the total weight mass. */ cumw += w[i]; UNPROTECT(2); }/*FOR*/ /* rescale by the total weight mass. */ for (j = 0; j < nrows; j++) mstr[j] /= cumw; /* split arc strength from direction strength. */ for (i = 0; i < nnodes; i++) { for (j = i + 1; j < nnodes; j++) { fwd = mstr[CMC(j, i, nnodes) - i - 1]; bkwd = mstr[CMC(i, j, nnodes) - j]; mstr[CMC(j, i, nnodes) - i - 1] = mstr[CMC(i, j, nnodes) - j] = fwd + bkwd; if (bkwd + fwd > 0) { mdir[CMC(j, i, nnodes) - i - 1] = fwd / (fwd + bkwd); mdir[CMC(i, j, nnodes) - j] = bkwd / (fwd + bkwd); }/*THEN*/ else { mdir[CMC(j, i, nnodes) - i - 1] = mdir[CMC(i, j, nnodes) - j] = 0; }/*ELSE*/ }/*FOR*/ }/*FOR*/ UNPROTECT(2); }/*MEAN_STRENGTH_DIRECTION*/ /* average multiple bn.strength objects, with weights. */ SEXP mean_strength(SEXP strength, SEXP nodes, SEXP weights) { int nstr = length(weights), ncols = 0, nrows = 0; double *w = REAL(weights); const char *m = NULL; SEXP ref, ref_hash, mean_df, method; /* initialize the result using the first bn.strength object as a reference. */ ref = VECTOR_ELT(strength, 0); ncols = length(ref); nrows = length(VECTOR_ELT(ref, 0)); PROTECT(mean_df = allocVector(VECSXP, ncols)); setAttrib(mean_df, R_NamesSymbol, getAttrib(ref, R_NamesSymbol)); SET_VECTOR_ELT(mean_df, 0, VECTOR_ELT(ref, 0)); SET_VECTOR_ELT(mean_df, 1, VECTOR_ELT(ref, 1)); /* make it a data frame */ minimal_data_frame(mean_df); /* compute the arc IDs to match arcs of later bn.strength objects. */ PROTECT(ref_hash = arc_hash(ref, nodes, FALSE, FALSE)); /* switch backend according to how the strengths were computed. */ method = getAttrib(ref, BN_MethodSymbol); m = CHAR(STRING_ELT(method, 0)); if ((strcmp(m, "score") == 0) || (strcmp(m, "test") == 0)) mean_strength_overall(&mean_df, strength, nodes, nrows, nstr, ref_hash, w); else if (strcmp(m, "bootstrap") == 0) mean_strength_direction(&mean_df, strength, nodes, nrows, nstr, ref_hash, w); UNPROTECT(2); return mean_df; }/*MEAN_STRENGTH*/
/* predict the value of the training variable in a naive Bayes or Tree-Augmented * naive Bayes classifier. */ SEXP naivepred(SEXP fitted, SEXP data, SEXP parents, SEXP training, SEXP prior, SEXP prob, SEXP debug) { int i = 0, j = 0, k = 0, n = 0, nvars = LENGTH(fitted), nmax = 0, tr_nlevels = 0; int *res = NULL, **ex = NULL, *ex_nlevels = NULL; int idx = 0, *tr_id = INTEGER(training); int *iscratch = NULL, *maxima = NULL, *prn = NULL, *debuglevel = LOGICAL(debug); int *include_prob = LOGICAL(prob); double **cpt = NULL, *pr = NULL, *scratch = NULL, *buf = NULL, *pt = NULL; double sum = 0; SEXP class, temp, tr, tr_levels, result, nodes, probtab, dimnames; /* cache the node labels. */ nodes = getAttrib(fitted, R_NamesSymbol); /* cache the pointers to all the variables. */ ex = (int **) alloc1dpointer(nvars); ex_nlevels = alloc1dcont(nvars); for (i = 0; i < nvars; i++) { temp = VECTOR_ELT(data, i); ex[i] = INTEGER(temp); ex_nlevels[i] = NLEVELS(temp); }/*FOR*/ /* get the training variable and its levels. */ n = LENGTH(VECTOR_ELT(data, 0)); tr = getListElement(VECTOR_ELT(fitted, *tr_id - 1), "prob"); tr_levels = VECTOR_ELT(getAttrib(tr, R_DimNamesSymbol), 0); tr_nlevels = LENGTH(tr_levels); /* get the prior distribution. */ pr = REAL(prior); if (*debuglevel > 0) { Rprintf("* the prior distribution for the target variable is:\n"); PrintValue(prior); }/*THEN*/ /* allocate the scratch space used to compute posterior probabilities. */ scratch = alloc1dreal(tr_nlevels); buf = alloc1dreal(tr_nlevels); /* cache the pointers to the conditional probability tables. */ cpt = (double **) alloc1dpointer(nvars); for (i = 0; i < nvars; i++) cpt[i] = REAL(getListElement(VECTOR_ELT(fitted, i), "prob")); /* dereference the parents' vector. */ prn = INTEGER(parents); /* create the vector of indexes. */ iscratch = alloc1dcont(tr_nlevels); /* allocate the array for the indexes of the maxima. */ maxima = alloc1dcont(tr_nlevels); /* allocate the return value. */ PROTECT(result = allocVector(INTSXP, n)); res = INTEGER(result); /* allocate and initialize the table of the posterior probabilities. */ if (*include_prob > 0) { PROTECT(probtab = allocMatrix(REALSXP, tr_nlevels, n)); pt = REAL(probtab); memset(pt, '\0', n * tr_nlevels * sizeof(double)); }/*THEN*/ /* initialize the random seed, just in case we need it for tie breaking. */ GetRNGstate(); /* for each observation... */ for (i = 0; i < n; i++) { /* ... reset the scratch space and the indexes array... */ for (k = 0; k < tr_nlevels; k++) { scratch[k] = log(pr[k]); iscratch[k] = k + 1; }/*FOR*/ if (*debuglevel > 0) Rprintf("* predicting the value of observation %d.\n", i + 1); /* ... and for each conditional probability table... */ for (j = 0; j < nvars; j++) { /* ... skip the training variable... */ if (*tr_id == j + 1) continue; /* ... (this is the root node of the Chow-Liu tree) ... */ if (prn[j] == NA_INTEGER) { /* ... and for each row of the conditional probability table... */ for (k = 0; k < tr_nlevels; k++) { if (*debuglevel > 0) { Rprintf(" > node %s: picking cell %d (%d, %d) from the CPT (p = %lf).\n", NODE(j), CMC(ex[j][i] - 1, k, ex_nlevels[j]), ex[j][i], k + 1, cpt[j][CMC(ex[j][i] - 1, k, ex_nlevels[j])]); }/*THEN*/ /* ... update the posterior probability. */ scratch[k] += log(cpt[j][CMC(ex[j][i] - 1, k, ex_nlevels[j])]); }/*FOR*/ }/*THEN*/ else { /* ... and for each row of the conditional probability table... */ for (k = 0; k < tr_nlevels; k++) { /* (the first dimension corresponds to the current node [X], the second * to the training node [Y], the third to the only parent of the current * node [Z]; CMC coordinates are computed as X + Y * NX + Z * NX * NY. */ idx = (ex[j][i] - 1) + k * ex_nlevels[j] + (ex[prn[j] - 1][i] - 1) * ex_nlevels[j] * tr_nlevels; if (*debuglevel > 0) { Rprintf(" > node %s: picking cell %d (%d, %d, %d) from the CPT (p = %lf).\n", NODE(j), idx, ex[j][i], k + 1, ex[prn[j] - 1][i], cpt[j][idx]); }/*THEN*/ /* ... update the posterior probability. */ scratch[k] += log(cpt[j][idx]); }/*FOR*/ }/*ELSE*/ }/*FOR*/ /* find out the mode(s). */ all_max(scratch, tr_nlevels, maxima, &nmax, iscratch, buf); /* compute the posterior probabilities on the right scale, to attach them * to the return value. */ if (*include_prob) { /* copy the log-probabilities from scratch. */ memcpy(pt + i * tr_nlevels, scratch, tr_nlevels * sizeof(double)); /* transform log-probabilitiees into plain probabilities. */ for (k = 0, sum = 0; k < tr_nlevels; k++) sum += pt[i * tr_nlevels + k] = exp(pt[i * tr_nlevels + k] - scratch[maxima[0] - 1]); /* rescale them to sum up to 1. */ for (k = 0; k < tr_nlevels; k++) pt[i * tr_nlevels + k] /= sum; }/*THEN*/ if (nmax == 1) { res[i] = maxima[0]; if (*debuglevel > 0) { Rprintf(" @ prediction for observation %d is '%s' with (log-)posterior:\n", i + 1, CHAR(STRING_ELT(tr_levels, res[i] - 1))); Rprintf(" "); for (k = 0; k < tr_nlevels; k++) Rprintf(" %lf", scratch[k]); Rprintf("\n"); }/*THEN*/ }/*THEN*/ else { /* break ties: sample with replacement from all the maxima. */ SampleReplace(1, nmax, res + i, maxima); if (*debuglevel > 0) { Rprintf(" @ there are %d levels tied for prediction of observation %d, applying tie breaking.\n", nmax, i + 1); Rprintf(" "); for (k = 0; k < tr_nlevels; k++) Rprintf(" %lf", scratch[k]); Rprintf("\n"); Rprintf(" @ tied levels are:"); for (k = 0; k < nmax; k++) Rprintf(" %s", CHAR(STRING_ELT(tr_levels, maxima[k] - 1))); Rprintf(".\n"); }/*THEN*/ }/*ELSE*/ }/*FOR*/ /* save the state of the random number generator. */ PutRNGstate(); /* add back the attributes and the class to the return value. */ PROTECT(class = allocVector(STRSXP, 1)); SET_STRING_ELT(class, 0, mkChar("factor")); setAttrib(result, R_LevelsSymbol, tr_levels); setAttrib(result, R_ClassSymbol, class); if (*include_prob > 0) { /* set the levels of the taregt variable as rownames. */ PROTECT(dimnames = allocVector(VECSXP, 2)); SET_VECTOR_ELT(dimnames, 0, tr_levels); setAttrib(probtab, R_DimNamesSymbol, dimnames); /* add the posterior probabilities to the return value. */ setAttrib(result, install("prob"), probtab); UNPROTECT(4); }/*THEN*/ else { UNPROTECT(2); }/*ELSE*/ return result; }/*NAIVEPRED*/
/* predict the value of a discrete node with one or more parents. */ SEXP cdpred(SEXP fitted, SEXP data, SEXP parents, SEXP debug) { int i = 0, k = 0, ndata = LENGTH(data), nrows = 0, ncols = 0; int *configs = INTEGER(parents), *debuglevel = LOGICAL(debug); int *iscratch = NULL, *maxima = NULL, *nmax = NULL, *res = NULL; double *prob = NULL, *dscratch = NULL, *buf = NULL; SEXP temp, result, tr_levels = getAttrib(data, R_LevelsSymbol); /* get the probabilities of the multinomial distribution. */ temp = getListElement(fitted, "prob"); nrows = INT(getAttrib(temp, R_DimSymbol)); ncols = LENGTH(temp) / nrows; prob = REAL(temp); /* create the vector of indexes. */ iscratch = alloc1dcont(nrows); /* create a scratch copy of the array. */ buf = alloc1dreal(nrows); dscratch = alloc1dreal(nrows * ncols); memcpy(dscratch, prob, nrows * ncols * sizeof(double)); /* allocate the array for the indexes of the maxima. */ maxima = alloc1dcont(nrows * ncols); /* allocate the maxima counters. */ nmax = alloc1dcont(ncols); /* get the mode for each configuration. */ for (i = 0; i < ncols; i++) { /* initialize the vector of indexes. */ for (k = 0; k < nrows; k++) iscratch[k] = k + 1; /* find out the mode(s). */ all_max(dscratch + i * nrows, nrows, maxima + i * nrows, nmax + i, iscratch, buf); }/*FOR*/ /* allocate and initialize the return value. */ PROTECT(result = allocVector(INTSXP, ndata)); res = INTEGER(result); /* initialize the random seed, just in case we need it for tie breaking. */ GetRNGstate(); /* copy the index of the mode in the return value. */ for (i = 0; i < ndata; i++) { if (nmax[configs[i] - 1] == 1) { res[i] = maxima[CMC(0, configs[i] - 1, nrows)]; if (*debuglevel > 0) { if (res[i] == NA_INTEGER) Rprintf(" > prediction for observation %d is NA with probabilities:\n"); else Rprintf(" > prediction for observation %d is '%s' with probabilities:\n", i + 1, CHAR(STRING_ELT(tr_levels, res[i] - 1))); Rprintf(" "); for (int k = 0; k < nrows; k++) Rprintf(" %lf", (prob + nrows * (configs[i] - 1))[k]); Rprintf("\n"); }/*THEN*/ }/*THEN*/ else { /* break ties: sample with replacement from all the maxima. */ SampleReplace(1, nmax[configs[i] - 1], res + i, maxima + (configs[i] - 1) * nrows); if (*debuglevel > 0) { Rprintf(" > there are %d levels tied for prediction of observation %d, applying tie breaking.\n", nmax[configs[i] - 1], i + 1); Rprintf(" > tied levels are:"); for (k = 0; k < nmax[configs[i] - 1]; k++) Rprintf(" %s", CHAR(STRING_ELT(tr_levels, maxima[CMC(k, configs[i] - 1, nrows)] - 1))); Rprintf(".\n"); }/*THEN*/ }/*ELSE*/ }/*FOR*/ /* save the state of the random number generator. */ PutRNGstate(); /* copy the labels and the class from the input data. */ setAttrib(result, R_LevelsSymbol, tr_levels); setAttrib(result, R_ClassSymbol, getAttrib(data, R_ClassSymbol)); UNPROTECT(1); return result; }/*CDPRED*/
SEXP hc_to_be_added(SEXP arcs, SEXP blacklist, SEXP whitelist, SEXP nparents, SEXP maxp, SEXP nodes, SEXP convert) { int i = 0, j = 0, narcs = 0, dims = length(nodes); int *a = NULL, *coords = NULL; double *mp = REAL(maxp), *np = NULL; short int referenced = 0; SEXP try, result = R_NilValue, result2; /* transform the arc set into an adjacency matrix, if it's not one already. */ if (isInteger(arcs)) { if ((referenced = MAYBE_REFERENCED(arcs))) PROTECT(result = duplicate(arcs)); }/*THEN*/ else { PROTECT(result = arcs2amat(arcs, nodes)); }/*ELSE*/ /* dereference the adjacency matrix once and for all. */ a = INTEGER(result); /* compute the number the parents of each node, unless provided. */ if (nparents == R_NilValue) { np = Calloc1D(dims, sizeof(double)); for (i = 0; i < dims; i++) for (j = 0; j < dims; j++) np[j] = a[CMC(i, j, dims)]; }/*THEN*/ else { np = REAL(nparents); }/*ELSE*/ /* flip all the nondiagonal cells. */ for (j = 0; j < dims; j++) { for (i = 0; i < dims; i++) { /* diagonal elements are always equal to zero, skip them. */ if (i == j) continue; a[CMC(i, j, dims)] = 1 - a[CMC(i, j, dims)]; }/*FOR*/ }/*FOR*/ /* if an arc is present in the graph in one direction, you cannot add it in * the other direction (it would be a reversal); flip both in the adjacency * matrix. */ for (j = 0; j < dims; j++) for (i = j + 1; i < dims; i++) a[CMC(j, i, dims)] = a[CMC(i, j, dims)] = a[CMC(i, j, dims)] * a[CMC(j, i, dims)]; /* if a node has already reached its maximum number parents, do not add * more arcs pointing to that node. */ for (j = 0; j < dims; j++) if (np[j] >= *mp) memset(a + j * dims, '\0', dims * sizeof(int)); #define FLIP_FROM_LIST(list, value) \ if (!isNull(list)) { \ if (!isInteger(list)) { \ PROTECT(try = match(nodes, list, 0)); \ coords = INTEGER(try); \ narcs = length(try)/2; \ for (i = 0; i < narcs; i++) \ a[CMC(coords[i] - 1, coords[i + narcs] - 1, dims)] = value; \ UNPROTECT(1); \ }/*THEN*/ \ else { \ coords = INTEGER(list); \ for (i = 0; i < dims * dims; i ++) \ if (coords[i] == 1) \ a[i] = value; \ }/*ELSE*/ \ }/*THEN*/ /* now the blacklist gets involved. */ FLIP_FROM_LIST(blacklist, 0); /* and, last but not least, the whitelist gets involved. */ FLIP_FROM_LIST(whitelist, 1); if (nparents == R_NilValue) Free1D(np); /* return either the adjacency matrix or the arc set. */ if (isTRUE(convert)) { PROTECT(result2 = amat2arcs(result, nodes)); if (referenced || !isInteger(arcs)) UNPROTECT(2); else UNPROTECT(1); return result2; }/*THEN*/ else { if (referenced || !isInteger(arcs)) UNPROTECT(1); return result; }/*ELSE*/ }/*HC_TO_BE_ADDED*/
/* generate a connected graph with uniform probability, subject to some * constraints on the degree of the nodes. */ SEXP ide_cozman_graph(SEXP nodes, SEXP num, SEXP burn_in, SEXP max_in_degree, SEXP max_out_degree, SEXP max_degree, SEXP connected, SEXP debug) { int i = 0, k = 0, nnodes = LENGTH(nodes), *n = INTEGER(num); int changed = 0, *work = NULL, *arc = NULL, *a = NULL, *burn = INTEGER(burn_in); int *degree = NULL, *in_degree = NULL, *out_degree = NULL; int *debuglevel = LOGICAL(debug), *cozman = LOGICAL(connected); double *max_in = REAL(max_in_degree), *max_out = REAL(max_out_degree), *max = REAL(max_degree); SEXP list, res, args, argnames, amat, arcs, cached, debug2, null, temp; char *label = (*cozman > 0) ? "ic-dag" : "melancon"; /* a fake debug argument (set to FALSE) for cache_structure(). */ PROTECT(debug2 = allocVector(LGLSXP, 1)); LOGICAL(debug2)[0] = FALSE; /* the list of optional arguments. */ PROTECT(argnames = allocVector(STRSXP, 4)); SET_STRING_ELT(argnames, 0, mkChar("burn.in")); SET_STRING_ELT(argnames, 1, mkChar("max.in.degree")); SET_STRING_ELT(argnames, 2, mkChar("max.out.degree")); SET_STRING_ELT(argnames, 3, mkChar("max.degree")); PROTECT(args = allocVector(VECSXP, 4)); setAttrib(args, R_NamesSymbol, argnames); SET_VECTOR_ELT(args, 0, burn_in); SET_VECTOR_ELT(args, 1, max_in_degree); SET_VECTOR_ELT(args, 2, max_out_degree); SET_VECTOR_ELT(args, 3, max_degree); /* allocate and initialize the adjacency matrix. */ PROTECT(amat = allocMatrix(INTSXP, nnodes, nnodes)); a = INTEGER(amat); memset(a, '\0', nnodes * nnodes * sizeof(int)); /* initialize a simple ordered tree with n nodes, where all nodes * have just one parent, except the first one that does not have * any parent. */ for (i = 1; i < nnodes; i++) a[CMC(i - 1, i, nnodes)] = 1; /* allocate the arrays needed by SampleNoReplace. */ arc = alloc1dcont(2); work = alloc1dcont(nnodes); /* allocate and initialize the degree arrays. */ degree = alloc1dcont(nnodes); in_degree = alloc1dcont(nnodes); out_degree = alloc1dcont(nnodes); for (i = 0; i < nnodes; i++) { in_degree[i] = out_degree[i] = 1; degree[i] = 2; }/*FOR*/ in_degree[0] = out_degree[nnodes - 1] = 0; degree[0] = degree[nnodes - 1] = 1; GetRNGstate(); /* wait for the markov chain monte carlo simulation to reach stationarity. */ for (k = 0; k < *burn; k++) { if (*debuglevel > 0) Rprintf("* current model (%d):\n", k + 1); changed = ic_logic(a, nodes, &nnodes, arc, work, degree, max, in_degree, max_in, out_degree, max_out, cozman, debuglevel); /* print the model string to allow a sane debugging experience; note that this * has a huge impact on performance, so use it with care. */ if ((*debuglevel > 0) && (changed)) { PROTECT(null = allocVector(NILSXP, 1)); PROTECT(res = bn_base_structure(nodes, args, null, null, 0, "none", label)); PROTECT(arcs = amat2arcs(amat, nodes)); PROTECT(cached = cache_structure(nodes, amat, debug2)); SET_VECTOR_ELT(res, 1, cached); SET_VECTOR_ELT(res, 2, arcs); print_modelstring(res); UNPROTECT(4); }/*THEN*/ }/*FOR*/ #define UPDATE_NODE_CACHE(cur) \ if (*debuglevel > 0) \ Rprintf(" > updating cached information about node %s.\n", NODE(cur)); \ memset(work, '\0', nnodes * sizeof(int)); \ PROTECT(temp = c_cache_partial_structure(cur, nodes, amat, work, debug2)); \ SET_VECTOR_ELT(cached, cur, temp); \ UNPROTECT(1); /* return a list if more than one bn is generated. */ if (*n > 1) { if (*debuglevel > 0) Rprintf("* end of the burn-in iterations.\n"); PROTECT(list = allocVector(VECSXP, *n)); PROTECT(null = allocVector(NILSXP, 1)); /* generate the "bn" structure, with dummy NULLs for the "arcs" and * "nodes" elements (which will be initialized later on). */ PROTECT(res = bn_base_structure(nodes, args, null, null, 0, "none", label)); for (k = 0; k < *n; k++) { if (*debuglevel > 0) Rprintf("* current model (%d):\n", *burn + k + 1); changed = ic_logic(a, nodes, &nnodes, arc, work, degree, max, in_degree, max_in, out_degree, max_out, cozman, debuglevel); if (changed || (k == 0)) { /* generate the arc set and the cached information from the adjacency * matrix. */ if (k > 0) { /* if a complete "bn" object is available, we can retrieve the cached * information about the nodes from the structure stored in the last * iteration and update only the elements that really need it. */ temp = VECTOR_ELT(VECTOR_ELT(list, k - 1), 1); PROTECT(cached = duplicate(temp)); /* update the first sampled nodes; both of them gain/lose either * a parent or a child. */ UPDATE_NODE_CACHE(arc[0] - 1); UPDATE_NODE_CACHE(arc[1] - 1); /* all the parents of the second sampled node gain/lose a node in * the markov blanket (the first sampled node, which shares a child * with all of them). */ for (i = 0; i < nnodes; i++) { if ((i != arc[0] - 1) && (a[CMC(i, arc[1] - 1, nnodes)] == 1)) { UPDATE_NODE_CACHE(i); }/*THEN*/ }/*FOR*/ }/*THEN*/ else { PROTECT(cached = cache_structure(nodes, amat, debug2)); }/*ELSE*/ PROTECT(arcs = amat2arcs(amat, nodes)); SET_VECTOR_ELT(res, 1, cached); SET_VECTOR_ELT(res, 2, arcs); /* print the model string to allow a sane debugging experience. */ if (*debuglevel > 0) print_modelstring(res); /* save the structure in the list. */ PROTECT(temp = duplicate(res)); SET_VECTOR_ELT(list, k, temp); UNPROTECT(3); }/*THEN*/ else { /* the adjacency matrix is unchanged; so we can just copy the bayesian * network from the previous iteration in the k-th slot of the list. */ SET_VECTOR_ELT(list, k, VECTOR_ELT(list, k - 1)); }/*ELSE*/ }/*FOR*/ PutRNGstate(); UNPROTECT(7); return list; }/*THEN*/ else { if (*debuglevel > 0) Rprintf("* end of the burn-in.\n* current model (%d):\n", *burn + 1); ic_logic(a, nodes, &nnodes, arc, work, degree, max, in_degree, max_in, out_degree, max_out, cozman, debuglevel); /* generate the arc set and the cached information form the adjacency * matrix. */ PROTECT(arcs = amat2arcs(amat, nodes)); PROTECT(cached = cache_structure(nodes, amat, debug2)); /* generate the "bn" structure. */ PROTECT(res = bn_base_structure(nodes, args, arcs, cached, 0, "none", label)); /* print the model string to allow a sane debugging experience. */ if (*debuglevel > 0) print_modelstring(res); PutRNGstate(); UNPROTECT(7); return res; }/*ELSE*/ }/*IDE_COZMAN_GRAPH*/
SEXP score_cache_fill(SEXP nodes, SEXP data, SEXP network, SEXP score, SEXP extra, SEXP reference, SEXP equivalence, SEXP decomposability, SEXP updated, SEXP amat, SEXP cache, SEXP blmat, SEXP debug) { int *colsum = NULL, nnodes = length(nodes), lupd = length(updated); int *a = NULL, *upd = NULL, *b = NULL, debuglevel = isTRUE(debug); int i = 0, j = 0, k = 0; double *cache_value = NULL; SEXP arc, delta, op, temp; /* save a pointer to the adjacency matrix, the blacklist and the * updated nodes. */ a = INTEGER(amat); b = INTEGER(blmat); upd = INTEGER(updated); /* if there are no nodes to update, return. */ if (lupd == 0) return cache; /* set up row and column total to check for score equivalence; * zero means no parent nodes. */ if (isTRUE(equivalence)) { colsum = Calloc1D(nnodes, sizeof(int)); for (i = 0; i < nnodes; i++) for (j = 0; j < nnodes; j++) colsum[j] += a[CMC(i, j, nnodes)]; }/*THEN*/ /* allocate and initialize the cache. */ cache_value = REAL(cache); /* allocate a two-slot character vector. */ PROTECT(arc = allocVector(STRSXP, 2)); /* allocate and initialize the fake score delta. */ PROTECT(delta = ScalarReal(0)); /* allocate and initialize the score.delta() operator. */ PROTECT(op = mkString("set")); for (i = 0; i < nnodes; i++) { for (j = 0; j < nnodes; j++) { /* incident nodes must be different from each other. */ if (i == j) continue; /* if only one or two nodes' caches need updating, skip the rest. */ for (k = 0; k < lupd; k++) if (upd[k] == j) goto there; continue; there: /* no need to compute the score delta for blacklisted arcs. */ if (b[CMC(i, j, nnodes)] == 1) continue; /* use score equivalence if possible to check only one orientation. */ if (isTRUE(equivalence)) { /* if the following conditions are met, look up the score delta of * the reverse of the current arc: * 1) that score delta has already been computed. * 2) both incident nodes have no parent, so the arc is really * score equivalent (no v-structures). * 3) the reversed arc has not been blacklisted, as the score delta * is not computed in this case. */ if ((i > j) && (colsum[i] + colsum[j] == 0) && (b[CMC(j, i, nnodes)] == 0)) { cache_value[CMC(i, j, nnodes)] = cache_value[CMC(j, i, nnodes)]; continue; }/*THEN*/ }/*THEN*/ /* save the nodes incident on the arc. */ SET_STRING_ELT(arc, 0, STRING_ELT(nodes, i)); SET_STRING_ELT(arc, 1, STRING_ELT(nodes, j)); /* if the arc is not present in the graph it should be added; * otherwise it should be removed. */ if (a[CMC(i, j, nnodes)] == 0) SET_STRING_ELT(op, 0, mkChar("set")); else SET_STRING_ELT(op, 0, mkChar("drop")); /* checkpoint allocated memory. */ /* evaluate the call to score.delta() for the arc. */ PROTECT(temp = score_delta(arc, network, data, score, delta, reference, op, extra, decomposability)); cache_value[CMC(i, j, nnodes)] = NUM(VECTOR_ELT(temp, 1)); UNPROTECT(1); if (debuglevel > 0) Rprintf("* caching score delta for arc %s -> %s (%lf).\n", CHAR(STRING_ELT(nodes, i)), CHAR(STRING_ELT(nodes, j)), cache_value[CMC(i, j, nnodes)]); }/*FOR*/ }/*FOR*/ UNPROTECT(3); if (isTRUE(equivalence)) Free1D(colsum); return cache; }/*HC_CACHE_FILL*/