SEXP smart_network_averaging(SEXP arcs, SEXP nodes, SEXP weights) { int k = 0, from = 0, to = 0, nrows = length(arcs) / 2, dims = length(nodes); int *a = NULL, *coords = NULL, *poset = NULL, *path = NULL, *scratch = NULL; double *w = NULL; SEXP weights2, amat, try, acyclic; /* allocate and initialize the adjacency matrix. */ PROTECT(amat = allocMatrix(INTSXP, dims, dims)); a = INTEGER(amat); memset(a, '\0', sizeof(int) * dims * dims); /* match the node labels in the arc set. */ PROTECT(try = match(nodes, arcs, 0)); coords = INTEGER(try); /* duplicate the weights to preserve the original ones. */ PROTECT(weights2 = duplicate(weights)); w = REAL(weights2); /* sort the strength coefficients. */ poset = Calloc1D(nrows, sizeof(int)); for (k = 0; k < nrows; k++) poset[k] = k; R_qsort_I(w, poset, 1, nrows); /* allocate buffers for c_has_path(). */ path = Calloc1D(dims, sizeof(int)); scratch = Calloc1D(dims, sizeof(int)); /* iterate over the arcs in reverse order wrt their strength coefficients. */ for (k = 0; k < nrows; k++) { from = coords[poset[k]] - 1; to = coords[poset[k] + nrows] - 1; /* add an arc only if it does not introduce cycles. */ if (!c_has_path(to, from, a, dims, nodes, FALSE, TRUE, path, scratch, FALSE)) a[CMC(from, to, dims)] = 1; else warning("arc %s -> %s would introduce cycles in the graph, ignoring.", NODE(from), NODE(to)); }/*FOR*/ /* convert the adjacency matrix back to an arc set and return it. */ acyclic = amat2arcs(amat, nodes); Free1D(path); Free1D(scratch); Free1D(poset); UNPROTECT(3); return acyclic; }/*SMART_NETWORK_AVERAGING*/
/* R function qsort(x, index.return) */ SEXP attribute_hidden do_qsort(SEXP call, SEXP op, SEXP args, SEXP rho) { SEXP x, sx; int indx_ret, n; double *vx = NULL; int *ivx = NULL; Rboolean x_real, x_int; checkArity(op, args); x = CAR(args); if (!isNumeric(x)) error(_("argument is not a numeric vector")); x_real= TYPEOF(x) == REALSXP; x_int = !x_real && (TYPEOF(x) == INTSXP || TYPEOF(x) == LGLSXP); PROTECT(sx = (x_real || x_int) ? duplicate(x) : coerceVector(x, REALSXP)); SET_ATTRIB(sx, R_NilValue); SET_OBJECT(sx, 0); /* if x has names, drop them, since they won't be ordered if (!isNull(getAttrib(sx, R_NamesSymbol))) setAttrib(sx, R_NamesSymbol, R_NilValue); */ indx_ret = asLogical(CADR(args)); n = LENGTH(x); if(x_int) ivx = INTEGER(sx); else vx = REAL(sx); if(indx_ret) { SEXP ans, ansnames, indx; int i, *ix; /* answer will have x = sorted x , ix = index :*/ PROTECT(ans = allocVector(VECSXP, 2)); PROTECT(ansnames = allocVector(STRSXP, 2)); PROTECT(indx = allocVector(INTSXP, n)); ix = INTEGER(indx); for(i = 0; i < n; i++) ix[i] = i+1; if(x_int) R_qsort_int_I(ivx, ix, 1, n); else R_qsort_I(vx, ix, 1, n); SET_VECTOR_ELT(ans, 0, sx); SET_VECTOR_ELT(ans, 1, indx); SET_STRING_ELT(ansnames, 0, mkChar("x")); SET_STRING_ELT(ansnames, 1, mkChar("ix")); setAttrib(ans, R_NamesSymbol, ansnames); UNPROTECT(4); return ans; } else { if(x_int) R_qsort_int(ivx, 1, n); else R_qsort(vx, 1, n); UNPROTECT(1); return sx; } }
void makeA(double *x, const int mdim, const int nsample, int *cat, int *a, int *b) { /* makeA() constructs the mdim by nsample integer array a. For each numerical variable with values x(m, n), n=1, ...,nsample, the x-values are sorted from lowest to highest. Denote these by xs(m, n). Then a(m,n) is the case number in which xs(m, n) occurs. The b matrix is also contructed here. If the mth variable is categorical, then a(m, n) is the category of the nth case number. */ int i, j, n1, n2; double *v= (double *) calloc(nsample, sizeof(double)); int *index = (int *) calloc(nsample, sizeof(int)); for (i = 0; i < mdim; ++i) { if (cat[i] == 1) { /* numerical predictor */ for (j = 0; j < nsample; ++j) { v[j] = x[i + j * mdim]; index[j] = j + 1; } R_qsort_I(v, index, 1, nsample); /* this sorts the v(n) in ascending order. index(n) is the case number of that v(n) nth from the lowest (assume the original case numbers are 1,2,...). */ for (j = 0; j < nsample-1; ++j) { n1 = index[j]; n2 = index[j + 1]; a[i + j * mdim] = n1; if (j == 0) b[i + (n1-1) * mdim] = 1; b[i + (n2-1) * mdim] = (v[j] < v[j + 1]) ? b[i + (n1-1) * mdim] + 1 : b[i + (n1-1) * mdim]; } a[i + (nsample-1) * mdim] = index[nsample-1]; } else { /* categorical predictor */ for (j = 0; j < nsample; ++j) a[i + j*mdim] = (int) x[i + j * mdim]; } } free(index); free(v); }
/* Chow-Liu structure learning algorithm. */ SEXP chow_liu(SEXP data, SEXP nodes, SEXP estimator, SEXP whitelist, SEXP blacklist, SEXP conditional, SEXP debug) { int i = 0, j = 0, k = 0, debug_coord[2], ncol = length(data); int num = length(VECTOR_ELT(data, 0)), narcs = 0, nwl = 0, nbl = 0; int *nlevels = NULL, clevels = 0, *est = INTEGER(estimator), *depth = NULL; int *wl = NULL, *bl = NULL, *poset = NULL, debuglevel = isTRUE(debug); void **columns = NULL, *cond = NULL; short int *include = NULL; double *mim = NULL, *means = NULL, *sse = NULL; SEXP arcs, wlist, blist; /* dereference the columns of the data frame. */ DEREFERENCE_DATA_FRAME() /* only TAN uses a conditional variable, so assume it's discrete and go ahead. */ if (conditional != R_NilValue) { cond = (void *) INTEGER(conditional); clevels = NLEVELS(conditional); }/*THEN*/ /* allocate the mutual information matrix and the status vector. */ mim = Calloc1D(UPTRI3_MATRIX(ncol), sizeof(double)); include = Calloc1D(UPTRI3_MATRIX(ncol), sizeof(short int)); /* compute the pairwise mutual information coefficients. */ if (debuglevel > 0) Rprintf("* computing pairwise mutual information coefficients.\n"); mi_matrix(mim, columns, ncol, nlevels, &num, cond, &clevels, means, sse, est); LIST_MUTUAL_INFORMATION_COEFS(); /* add whitelisted arcs first. */ if ((!isNull(whitelist)) && (length(whitelist) > 0)) { PROTECT(wlist = arc_hash(whitelist, nodes, TRUE, TRUE)); wl = INTEGER(wlist); nwl = length(wlist); for (i = 0; i < nwl; i++) { if (debuglevel > 0) { Rprintf("* adding whitelisted arcs first.\n"); if (include[wl[i]] == 0) { Rprintf(" > arc %s - %s has been added to the graph.\n", CHAR(STRING_ELT(whitelist, i)), CHAR(STRING_ELT(whitelist, i + nwl))); }/*THEN*/ else { Rprintf(" > arc %s - %s was already present in the graph.\n", CHAR(STRING_ELT(whitelist, i)), CHAR(STRING_ELT(whitelist, i + nwl))); }/*ELSE*/ }/*THEN*/ /* update the counter if need be. */ if (include[wl[i]] == 0) narcs++; /* include the arc in the graph. */ include[wl[i]] = 1; }/*FOR*/ UNPROTECT(1); }/*THEN*/ /* cache blacklisted arcs. */ if ((!isNull(blacklist)) && (length(blacklist) > 0)) { PROTECT(blist = arc_hash(blacklist, nodes, TRUE, TRUE)); bl = INTEGER(blist); nbl = length(blist); }/*THEN*/ /* sort the mutual information coefficients and keep track of the elements' index. */ poset = Calloc1D(UPTRI3_MATRIX(ncol), sizeof(int)); for (i = 0; i < UPTRI3_MATRIX(ncol); i++) poset[i] = i; R_qsort_I(mim, poset, 1, UPTRI3_MATRIX(ncol)); depth = Calloc1D(ncol, sizeof(int)); for (i = UPTRI3_MATRIX(ncol) - 1; i >= 0; i--) { /* get back the coordinates from the position in the half-matrix. */ INV_UPTRI3(poset[i], ncol, debug_coord); /* already included all the arcs we had to, exiting. */ if (narcs >= ncol - 1) break; /* arc already present in the graph, nothing to do. */ if (include[poset[i]] == 1) continue; if (bl) { if (chow_liu_blacklist(bl, &nbl, poset + i)) { if (debuglevel > 0) { Rprintf("* arc %s - %s is blacklisted, skipping.\n", NODE(debug_coord[0]), NODE(debug_coord[1])); }/*THEN*/ continue; }/*THEN*/ }/*THEN*/ if (c_uptri3_path(include, depth, debug_coord[0], debug_coord[1], ncol, nodes, FALSE)) { if (debuglevel > 0) { Rprintf("* arc %s - %s introduces cycles, skipping.\n", NODE(debug_coord[0]), NODE(debug_coord[1])); }/*THEN*/ continue; }/*THEN*/ if (debuglevel > 0) { Rprintf("* adding arc %s - %s with mutual information %lf.\n", NODE(debug_coord[0]), NODE(debug_coord[1]), mim[i]); }/*THEN*/ /* include the arc in the graph. */ include[poset[i]] = 1; /* update the counter. */ narcs++; }/*FOR*/ if ((!isNull(blacklist)) && (length(blacklist) > 0)) UNPROTECT(1); /* sanity check for blacklist-related madnes. */ if (narcs != ncol - 1) error("learned %d arcs instead of %d, this is not a tree spanning all the nodes.", narcs, ncol - 1); CONVERT_TO_ARC_SET(include, 0, 2 * (ncol - 1)); Free1D(depth); Free1D(mim); Free1D(include); Free1D(poset); Free1D(columns); if (nlevels) Free1D(nlevels); if (means) Free1D(means); if (sse) Free1D(sse); return arcs; }/*CHOW_LIU*/
/* R function qsort(x, index.return) */ SEXP attribute_hidden do_qsort(SEXP call, SEXP op, SEXP args, SEXP rho) { SEXP x, sx; int indx_ret; double *vx = NULL; int *ivx = NULL; Rboolean x_real, x_int; checkArity(op, args); x = CAR(args); if (!isNumeric(x)) error(_("argument is not a numeric vector")); x_real= TYPEOF(x) == REALSXP; x_int = !x_real && (TYPEOF(x) == INTSXP || TYPEOF(x) == LGLSXP); PROTECT(sx = (x_real || x_int) ? duplicate(x) : coerceVector(x, REALSXP)); SET_ATTRIB(sx, R_NilValue); SET_OBJECT(sx, 0); indx_ret = asLogical(CADR(args)); R_xlen_t n = XLENGTH(x); #ifdef LONG_VECTOR_SUPPORT Rboolean isLong = n > INT_MAX; #endif if(x_int) ivx = INTEGER(sx); else vx = REAL(sx); if(indx_ret) { SEXP ans, ansnames, indx; /* answer will have x = sorted x , ix = index :*/ PROTECT(ans = allocVector(VECSXP, 2)); PROTECT(ansnames = allocVector(STRSXP, 2)); #ifdef LONG_VECTOR_SUPPORT if (isLong) { PROTECT(indx = allocVector(REALSXP, n)); double *ix = REAL(indx); for(R_xlen_t i = 0; i < n; i++) ix[i] = (double) (i+1); if(x_int) R_qsort_int_R(ivx, ix, 1, n); else R_qsort_R(vx, ix, 1, n); } else #endif { PROTECT(indx = allocVector(INTSXP, n)); int *ix = INTEGER(indx); int nn = (int) n; for(int i = 0; i < nn; i++) ix[i] = i+1; if(x_int) R_qsort_int_I(ivx, ix, 1, nn); else R_qsort_I(vx, ix, 1, nn); } SET_VECTOR_ELT(ans, 0, sx); SET_VECTOR_ELT(ans, 1, indx); SET_STRING_ELT(ansnames, 0, mkChar("x")); SET_STRING_ELT(ansnames, 1, mkChar("ix")); setAttrib(ans, R_NamesSymbol, ansnames); UNPROTECT(4); return ans; } else { if(x_int) R_qsort_int(ivx, 1, n); else R_qsort(vx, 1, n); UNPROTECT(1); return sx; } }
/* These are exposed in Utils.h and are misguidely in the API */ void F77_SUB(qsort4)(double *v, int *indx, int *ii, int *jj) { R_qsort_I(v, indx, *ii, *jj); }
void findBestSplit(double *x, int *jdex, double *y, int mdim, int nsample, int ndstart, int ndend, int *msplit, double *decsplit, double *ubest, int *ndendl, int *jstat, int mtry, double sumnode, int nodecnt, int *cat) { int last, ncat[32], icat[32], lc, nl, nr, npopl, npopr; int i, j, kv, l; static int *mind, *ncase; static double *xt, *ut, *v, *yl; double sumcat[32], avcat[32], tavcat[32], ubestt; double crit, critmax, critvar, suml, sumr, d, critParent; if (in_findBestSplit==-99){ free(ncase); free(mind); //had to remove this so that it wont crash for when mdim=0, strangely happened for replace=0 free(v); free(yl); free(xt); free(ut); // PRINTF("giving up mem in findBestSplit\n"); return; } if (in_findBestSplit==0){ in_findBestSplit=1; ut = (double *) calloc(nsample, sizeof(double)); xt = (double *) calloc(nsample, sizeof(double)); v = (double *) calloc(nsample, sizeof(double)); yl = (double *) calloc(nsample, sizeof(double)); mind = (int *) calloc(mdim+1, sizeof(int)); //seems that the sometimes i am asking for kv[10] and that causes problesmms //so allocate 1 more. helps with not crashing in windows ncase = (int *) calloc(nsample, sizeof(int)); } zeroDouble(ut, nsample); zeroDouble(xt, nsample); zeroDouble(v, nsample); zeroDouble(yl, nsample); zeroInt(mind, mdim); zeroInt(ncase, nsample); zeroDouble(avcat, 32); zeroDouble(tavcat, 32); /* START BIG LOOP */ *msplit = -1; *decsplit = 0.0; critmax = 0.0; ubestt = 0.0; for (i=0; i < mdim; ++i) mind[i] = i; last = mdim - 1; for (i = 0; i < mtry; ++i) { critvar = 0.0; j = (int) (unif_rand() * (last+1)); //PRINTF("j=%d, last=%d mind[j]=%d\n", j, last, mind[j]);fflush(stdout); kv = mind[j]; //if(kv>100){ // 1; // getchar(); //} swapInt(mind[j], mind[last]); /* mind[j] = mind[last]; * mind[last] = kv; */ last--; lc = cat[kv]; if (lc == 1) { /* numeric variable */ for (j = ndstart; j <= ndend; ++j) { xt[j] = x[kv + (jdex[j] - 1) * mdim]; yl[j] = y[jdex[j] - 1]; } } else { /* categorical variable */ zeroInt(ncat, 32); zeroDouble(sumcat, 32); for (j = ndstart; j <= ndend; ++j) { l = (int) x[kv + (jdex[j] - 1) * mdim]; sumcat[l - 1] += y[jdex[j] - 1]; ncat[l - 1] ++; } /* Compute means of Y by category. */ for (j = 0; j < lc; ++j) { avcat[j] = ncat[j] ? sumcat[j] / ncat[j] : 0.0; } /* Make the category mean the `pseudo' X data. */ for (j = 0; j < nsample; ++j) { xt[j] = avcat[(int) x[kv + (jdex[j] - 1) * mdim] - 1]; yl[j] = y[jdex[j] - 1]; } } /* copy the x data in this node. */ for (j = ndstart; j <= ndend; ++j) v[j] = xt[j]; for (j = 1; j <= nsample; ++j) ncase[j - 1] = j; R_qsort_I(v, ncase, ndstart + 1, ndend + 1); if (v[ndstart] >= v[ndend]) continue; /* ncase(n)=case number of v nth from bottom */ /* Start from the right and search to the left. */ critParent = sumnode * sumnode / nodecnt; suml = 0.0; sumr = sumnode; npopl = 0; npopr = nodecnt; crit = 0.0; /* Search through the "gaps" in the x-variable. */ for (j = ndstart; j <= ndend - 1; ++j) { d = yl[ncase[j] - 1]; suml += d; sumr -= d; npopl++; npopr--; if (v[j] < v[j+1]) { crit = (suml * suml / npopl) + (sumr * sumr / npopr) - critParent; if (crit > critvar) { ubestt = (v[j] + v[j+1]) / 2.0; critvar = crit; } } } if (critvar > critmax) { *ubest = ubestt; *msplit = kv + 1; critmax = critvar; for (j = ndstart; j <= ndend; ++j) { ut[j] = xt[j]; } if (cat[kv] > 1) { for (j = 0; j < cat[kv]; ++j) tavcat[j] = avcat[j]; } } } *decsplit = critmax; /* If best split can not be found, set to terminal node and return. */ if (*msplit != -1) { nl = ndstart; for (j = ndstart; j <= ndend; ++j) { if (ut[j] <= *ubest) { nl++; ncase[nl-1] = jdex[j]; } } *ndendl = imax2(nl - 1, ndstart); nr = *ndendl + 1; for (j = ndstart; j <= ndend; ++j) { if (ut[j] > *ubest) { if (nr >= nsample) break; nr++; ncase[nr - 1] = jdex[j]; } } if (*ndendl >= ndend) *ndendl = ndend - 1; for (j = ndstart; j <= ndend; ++j) jdex[j] = ncase[j]; lc = cat[*msplit - 1]; if (lc > 1) { for (j = 0; j < lc; ++j) { icat[j] = (tavcat[j] < *ubest) ? 1 : 0; } *ubest = pack(lc, icat); } } else *jstat = 1; }