/* Dirichlet posterior probabilities (covers BDe and K2 scores). */ double dirichlet_node(SEXP target, SEXP x, SEXP data, SEXP iss, SEXP prior, SEXP beta, SEXP experimental, int sparse, int debuglevel) { char *t = (char *)CHAR(STRING_ELT(target, 0)); double prob = 0, prior_prob = 0; SEXP nodes, node_t, data_t, exp_data, parents, parent_vars, config; /* get the node cached information. */ nodes = getListElement(x, "nodes"); node_t = getListElement(nodes, t); /* get the parents of the node. */ parents = getListElement(node_t, "parents"); /* extract the node's column from the data frame. */ data_t = c_dataframe_column(data, target, TRUE, FALSE); /* extract the list of eperimental data. */ exp_data = c_dataframe_column(experimental, target, TRUE, FALSE); /* compute the prior probability component for the node. */ prior_prob = graph_prior_prob(prior, target, node_t, beta, debuglevel); if (length(parents) == 0) { prob = dpost(data_t, iss, exp_data); }/*THEN*/ else { /* generate the configurations of the parents. */ PROTECT(parent_vars = c_dataframe_column(data, parents, FALSE, FALSE)); PROTECT(config = c_configurations(parent_vars, TRUE, !sparse)); /* compute the marginal likelihood. */ prob = cdpost(data_t, config, iss, exp_data); UNPROTECT(2); }/*ELSE*/ if (debuglevel > 0) { Rprintf(" > (log)prior probability is %lf.\n", prior_prob); Rprintf(" > (log)posterior density is %lf.\n", prob); }/*THEN*/ /* add the (log)prior to the marginal (log)likelihood to get the (log)posterior. */ prob += prior_prob; return prob; }/*DIRICHLET_NODE*/
/* multinomial loss for a single node. */ double c_dloss(int *cur, SEXP cur_parents, int *configs, double *prob, SEXP data, SEXP nodes, int ndata, int nlevels, double *per_sample) { int i = 0, dropped = 0, *obs = NULL; double logprob = 0, result = 0; SEXP temp_df; /* get the target variable. */ obs = INTEGER(VECTOR_ELT(data, *cur)); /* get the parents' configurations. */ if (LENGTH(cur_parents) > 0) { PROTECT(temp_df = c_dataframe_column(data, cur_parents, FALSE, FALSE)); cfg(temp_df, configs, NULL); for (i = 0; i < ndata; i++) { logprob = log(prob[CMC(obs[i] - 1, configs[i], nlevels)]); if (!R_FINITE(logprob) || ISNAN(logprob)) dropped++; else result += logprob; if (per_sample) per_sample[i] += logprob; }/*FOR*/ UNPROTECT(1); }/*THEN*/ else { for (i = 0; i < ndata; i++) { logprob = log(prob[obs[i] - 1]); if (!R_FINITE(logprob) || ISNAN(logprob)) dropped++; else result += logprob; if (per_sample) per_sample[i] += logprob; }/*FOR*/ }/*ELSE*/ /* switch to the negentropy. */ result /= -(ndata - dropped); /* print a warning if data were dropped. */ if (dropped > 0) warning("%d observations were dropped because the corresponding probabilities for node %s were 0 or NaN.", dropped, NODE(*cur)); return result; }/*C_DLOSS*/
double loglik_dnode(SEXP target, SEXP x, SEXP data, double *nparams, int debuglevel) { double loglik = 0; char *t = (char *)CHAR(STRING_ELT(target, 0)); SEXP nodes, node_t, parents, data_t, parent_vars, config; /* get the node cached information. */ nodes = getListElement(x, "nodes"); node_t = getListElement(nodes, t); /* get the parents of the node. */ parents = getListElement(node_t, "parents"); /* extract the node's column from the data frame. */ PROTECT(data_t = c_dataframe_column(data, target, TRUE, FALSE)); if (length(parents) == 0) { loglik = dlik(data_t, nparams); }/*THEN*/ else { /* generate the configurations of the parents. */ PROTECT(parent_vars = c_dataframe_column(data, parents, FALSE, FALSE)); PROTECT(config = c_configurations(parent_vars, TRUE, TRUE)); /* compute the log-likelihood. */ loglik = cdlik(data_t, config, nparams); UNPROTECT(2); }/*ELSE*/ if (debuglevel > 0) Rprintf(" > loglikelihood is %lf.\n", loglik); UNPROTECT(1); return loglik; }/*LOGLIK_DNODE*/
/* unconditional independence tests. */ SEXP utest(SEXP x, SEXP y, SEXP data, SEXP test, SEXP B, SEXP alpha, SEXP learning) { int ntests = length(x), nobs = 0; double *pvalue = NULL, statistic = 0, df = NA_REAL; const char *t = CHAR(STRING_ELT(test, 0)); test_e test_type = test_label(t); SEXP xx, yy, result; /* allocate the return value, which has the same length as x. */ PROTECT(result = allocVector(REALSXP, ntests)); setAttrib(result, R_NamesSymbol, x); pvalue = REAL(result); /* set all elements to zero. */ memset(pvalue, '\0', ntests * sizeof(double)); /* extract the variables from the data. */ PROTECT(xx = c_dataframe_column(data, x, FALSE, FALSE)); PROTECT(yy = c_dataframe_column(data, y, TRUE, FALSE)); nobs = length(yy); if (IS_DISCRETE_ASYMPTOTIC_TEST(test_type)) { /* parametric tests for discrete variables. */ statistic = ut_discrete(xx, yy, nobs, ntests, pvalue, &df, test_type); }/*THEN*/ else if ((test_type == COR) || (test_type == ZF) || (test_type == MI_G) || (test_type == MI_G_SH)) { /* parametric tests for Gaussian variables. */ statistic = ut_gaustests(xx, yy, nobs, ntests, pvalue, &df, test_type); }/*THEN*/ else if (test_type == MI_CG) { /* conditional linear Gaussian mutual information test. */ statistic = ut_micg(xx, yy, nobs, ntests, pvalue, &df); }/*THEN*/ else if (IS_DISCRETE_PERMUTATION_TEST(test_type)) { statistic = ut_dperm(xx, yy, nobs, ntests, pvalue, &df, test_type, INT(B), IS_SMC(test_type) ? NUM(alpha) : 1); }/*THEN*/ else if (IS_CONTINUOUS_PERMUTATION_TEST(test_type)) { statistic = ut_gperm(xx, yy, nobs, ntests, pvalue, test_type, INT(B), IS_SMC(test_type) ? NUM(alpha) : 1); }/*THEN*/ UNPROTECT(3); /* catch-all for unknown tests (after deallocating memory.) */ if (test_type == ENOTEST) error("unknown test statistic '%s'.", t); /* increase the test counter. */ test_counter += ntests; if (isTRUE(learning)) return result; else return c_create_htest(statistic, test, pvalue[ntests - 1], df, B); }/*UTEST*/
SEXP entropy_loss(SEXP fitted, SEXP orig_data, SEXP by_sample, SEXP keep, SEXP debug) { int i = 0, k = 0, ndata = 0, nnodes = LENGTH(fitted), nlevels = 0, type = 0; int *configs = NULL, *debuglevel = LOGICAL(debug), *by = LOGICAL(by_sample); int *to_keep = NULL; double *res = 0, *res_sample = NULL, **columns = 0, cur_loss = 0; const char *class = NULL; SEXP data, cur_node, nodes, result, result_sample, coefs, sd, parents, try; /* get the node labels. */ nodes = getAttrib(fitted, R_NamesSymbol); /* rearrange the columns of the data to match the network. */ PROTECT(data = c_dataframe_column(orig_data, nodes, FALSE, TRUE)); /* get the sample size. */ ndata = LENGTH(VECTOR_ELT(data, 0)); /* allocate and initialize the return value. */ PROTECT(result = allocVector(REALSXP, 1)); res = REAL(result); *res = 0; /* allocate the sample's contributions if needed. */ if (*by > 0) { PROTECT(result_sample = allocVector(REALSXP, ndata)); res_sample = REAL(result_sample); memset(res_sample, '\0', ndata * sizeof(double)); }/*THEN*/ /* find out which nodes to use in computing the entropy loss. */ PROTECT(try = match(nodes, keep, 0)); to_keep = INTEGER(try); R_isort(to_keep, LENGTH(try)); /* determine the class of the fitted network. */ class = CHAR(STRING_ELT(getAttrib(VECTOR_ELT(fitted, 0), R_ClassSymbol), 0)); if (strcmp(class, "bn.fit.gnode") == 0) { /* dereference the data set's columns. */ columns = (double **) alloc1dpointer(nnodes); for (i = 0; i < nnodes; i++) columns[i] = REAL(VECTOR_ELT(data, i)); type = GAUSSIAN; }/*THEN*/ else if ((strcmp(class, "bn.fit.dnode") == 0) || (strcmp(class, "bn.fit.onode") == 0)) { /* allocate an array for parents' configurations. */ configs = alloc1dcont(ndata); type = DISCRETE; }/*THEN*/ /* iterate over the nodes. */ for (i = 0; i < nnodes; i++) { if (i == to_keep[k] - 1) { k++; }/*THEN*/ else { if (*debuglevel > 0) Rprintf(" > skipping node %s.\n", NODE(i)); continue; }/*ELSE*/ /* get the current node. */ cur_node = VECTOR_ELT(fitted, i); /* get the parents of the node. */ parents = getListElement(cur_node, "parents"); /* get the parameters (regression coefficients and residuals' standard * deviation for Gaussian nodes, conditional probabilities for discrete * nodes), and compute the loss. */ switch(type) { case GAUSSIAN: coefs = getListElement(cur_node, "coefficients"); sd = getListElement(cur_node, "sd"); cur_loss = c_gloss(&i, parents, REAL(coefs), REAL(sd), columns, nodes, ndata, res_sample); break; case DISCRETE: coefs = getListElement(cur_node, "prob"); nlevels = INT(getAttrib(coefs, R_DimSymbol)); cur_loss = c_dloss(&i, parents, configs, REAL(coefs), data, nodes, ndata, nlevels, res_sample); break; }/*SWITCH*/ if (*debuglevel > 0) Rprintf(" > log-likelihood loss for node %s is %lf.\n", NODE(i), cur_loss); /* add the node contribution to the return value. */ *res += cur_loss; }/*FOR*/ if (*by > 0) { UNPROTECT(4); return result_sample; }/*THEN*/ else { UNPROTECT(3); return result; }/*ELSE*/ }/*ENTROPY_LOSS*/