/* Dirichlet posterior probabilities (covers BDe and K2 scores). */
double dirichlet_node(SEXP target, SEXP x, SEXP data, SEXP iss, SEXP prior,
    SEXP beta, SEXP experimental, int sparse, int debuglevel) {

char *t = (char *)CHAR(STRING_ELT(target, 0));
double prob = 0, prior_prob = 0;
SEXP nodes, node_t, data_t, exp_data, parents, parent_vars, config;

  /* get the node cached information. */
  nodes = getListElement(x, "nodes");
  node_t = getListElement(nodes, t);
  /* get the parents of the node. */
  parents = getListElement(node_t, "parents");
  /* extract the node's column from the data frame. */
  data_t = c_dataframe_column(data, target, TRUE, FALSE);
  /* extract the list of experimental data. */
  exp_data = c_dataframe_column(experimental, target, TRUE, FALSE);
  /* compute the prior probability component for the node. */
  prior_prob = graph_prior_prob(prior, target, node_t, beta, debuglevel);

  if (length(parents) == 0) {

    prob = dpost(data_t, iss, exp_data);

  }/*THEN*/
  else {

    /* generate the configurations of the parents. */
    PROTECT(parent_vars = c_dataframe_column(data, parents, FALSE, FALSE));
    PROTECT(config = c_configurations(parent_vars, TRUE, !sparse));
    /* compute the marginal likelihood. */
    prob = cdpost(data_t, config, iss, exp_data);

    UNPROTECT(2);

  }/*ELSE*/

  if (debuglevel > 0) {

    Rprintf("  > (log)prior probability is %lf.\n", prior_prob);
    Rprintf("  > (log)posterior density is %lf.\n", prob);

  }/*THEN*/

  /* add the (log)prior to the marginal (log)likelihood to get the (log)posterior. */
  prob += prior_prob;

  return prob;

}/*DIRICHLET_NODE*/
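/* For reference, a minimal sketch (assumed, not the bnlearn implementation)
 * of the BDeu marginal log-likelihood that dpost() and cdpost() are expected
 * to compute for the node above: counts[j][k] is the frequency of the node's
 * k-th level under the j-th parents' configuration, r is the number of
 * levels, q the number of configurations and iss the imaginary sample size,
 * so that each Dirichlet hyperparameter equals iss / (r * q) (the K2 score
 * instead sets every hyperparameter to 1). */
#include <Rmath.h>

static double bdeu_marginal_loglik(int **counts, int r, int q, double iss) {

double alpha_jk = iss / (r * q), alpha_j = iss / q, res = 0;
int j = 0, k = 0, n_j = 0;

  for (j = 0; j < q; j++) {

    /* accumulate the counts and the per-cell gamma-function terms. */
    for (k = 0, n_j = 0; k < r; k++) {

      n_j += counts[j][k];
      res += lgammafn(alpha_jk + counts[j][k]) - lgammafn(alpha_jk);

    }/*FOR*/

    /* add the per-configuration normalizing term. */
    res += lgammafn(alpha_j) - lgammafn(alpha_j + n_j);

  }/*FOR*/

  return res;

}/*BDEU_MARGINAL_LOGLIK*/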
/* multinomial loss for a single node. */
double c_dloss(int *cur, SEXP cur_parents, int *configs, double *prob,
    SEXP data, SEXP nodes, int ndata, int nlevels, double *per_sample) {

int i = 0, dropped = 0, *obs = NULL;
double logprob = 0, result = 0;
SEXP temp_df;

  /* get the target variable. */
  obs = INTEGER(VECTOR_ELT(data, *cur));
  /* get the parents' configurations. */
  if (LENGTH(cur_parents) > 0) {

    PROTECT(temp_df = c_dataframe_column(data, cur_parents, FALSE, FALSE));
    cfg(temp_df, configs, NULL);

    for (i = 0; i < ndata; i++) {

      logprob = log(prob[CMC(obs[i] - 1, configs[i], nlevels)]);

      if (!R_FINITE(logprob) || ISNAN(logprob))
        dropped++;
      else
        result += logprob;

      if (per_sample)
        per_sample[i] += logprob;

    }/*FOR*/

    UNPROTECT(1);

  }/*THEN*/
  else {

    for (i = 0; i < ndata; i++) {

      logprob = log(prob[obs[i] - 1]);

      if (!R_FINITE(logprob) || ISNAN(logprob))
        dropped++;
      else
        result += logprob;

      if (per_sample)
        per_sample[i] += logprob;

    }/*FOR*/

  }/*ELSE*/

  /* switch to the negentropy. */
  result /= -(ndata - dropped);

  /* print a warning if data were dropped. */
  if (dropped > 0)
    warning("%d observations were dropped because the corresponding probabilities for node %s were 0 or NaN.", dropped, NODE(*cur));

  return result;

}/*C_DLOSS*/
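/* CMC() above is assumed to be the usual column-major indexing macro, so
 * that prob[] is the node's conditional probability table stored with the
 * target's levels as rows and the parents' configurations as columns, i.e.
 * prob[CMC(level, config, nlevels)] is the probability of the (level + 1)-th
 * level of the node given that parents' configuration. */
#define CMC(i, j, nrow) ((i) + (j) * (nrow))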
double loglik_dnode(SEXP target, SEXP x, SEXP data, double *nparams, int debuglevel) {

double loglik = 0;
char *t = (char *)CHAR(STRING_ELT(target, 0));
SEXP nodes, node_t, parents, data_t, parent_vars, config;

  /* get the node cached information. */
  nodes = getListElement(x, "nodes");
  node_t = getListElement(nodes, t);
  /* get the parents of the node. */
  parents = getListElement(node_t, "parents");
  /* extract the node's column from the data frame. */
  PROTECT(data_t = c_dataframe_column(data, target, TRUE, FALSE));

  if (length(parents) == 0) {

    loglik = dlik(data_t, nparams);

  }/*THEN*/
  else {

    /* generate the configurations of the parents. */
    PROTECT(parent_vars = c_dataframe_column(data, parents, FALSE, FALSE));
    PROTECT(config = c_configurations(parent_vars, TRUE, TRUE));
    /* compute the log-likelihood. */
    loglik = cdlik(data_t, config, nparams);

    UNPROTECT(2);

  }/*ELSE*/

  if (debuglevel > 0)
    Rprintf("  > loglikelihood is %lf.\n", loglik);

  UNPROTECT(1);

  return loglik;

}/*LOGLIK_DNODE*/
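/* A minimal sketch of the multinomial log-likelihood that dlik() is assumed
 * to compute for a parentless node (cdlik() does the same within each
 * parents' configuration): given the frequencies n_k of the node's r levels
 * over n observations, loglik = sum_k n_k * log(n_k / n), with zero counts
 * contributing zero. */
#include <math.h>

static double multinomial_loglik(const int *n_k, int r, int n) {

double res = 0;
int k = 0;

  for (k = 0; k < r; k++)
    if (n_k[k] > 0)
      res += n_k[k] * log((double) n_k[k] / n);

  return res;

}/*MULTINOMIAL_LOGLIK*/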
/* unconditional independence tests. */
SEXP utest(SEXP x, SEXP y, SEXP data, SEXP test, SEXP B, SEXP alpha,
    SEXP learning) {

int ntests = length(x), nobs = 0;
double *pvalue = NULL, statistic = 0, df = NA_REAL;
const char *t = CHAR(STRING_ELT(test, 0));
test_e test_type = test_label(t);
SEXP xx, yy, result;

  /* allocate the return value, which has the same length as x. */
  PROTECT(result = allocVector(REALSXP, ntests));
  setAttrib(result, R_NamesSymbol, x);
  pvalue = REAL(result);
  /* set all elements to zero. */
  memset(pvalue, '\0', ntests * sizeof(double));

  /* extract the variables from the data. */
  PROTECT(xx = c_dataframe_column(data, x, FALSE, FALSE));
  PROTECT(yy = c_dataframe_column(data, y, TRUE, FALSE));
  nobs = length(yy);

  if (IS_DISCRETE_ASYMPTOTIC_TEST(test_type)) {

    /* parametric tests for discrete variables. */
    statistic = ut_discrete(xx, yy, nobs, ntests, pvalue, &df, test_type);

  }/*THEN*/
  else if ((test_type == COR) || (test_type == ZF) || (test_type == MI_G) ||
           (test_type == MI_G_SH)) {

    /* parametric tests for Gaussian variables. */
    statistic = ut_gaustests(xx, yy, nobs, ntests, pvalue, &df, test_type);

  }/*THEN*/
  else if (test_type == MI_CG) {

    /* conditional linear Gaussian mutual information test. */
    statistic = ut_micg(xx, yy, nobs, ntests, pvalue, &df);

  }/*THEN*/
  else if (IS_DISCRETE_PERMUTATION_TEST(test_type)) {

    statistic = ut_dperm(xx, yy, nobs, ntests, pvalue, &df, test_type, INT(B),
                  IS_SMC(test_type) ? NUM(alpha) : 1);

  }/*THEN*/
  else if (IS_CONTINUOUS_PERMUTATION_TEST(test_type)) {

    statistic = ut_gperm(xx, yy, nobs, ntests, pvalue, test_type, INT(B),
                  IS_SMC(test_type) ? NUM(alpha) : 1);

  }/*THEN*/

  UNPROTECT(3);

  /* catch-all for unknown tests (after deallocating memory). */
  if (test_type == ENOTEST)
    error("unknown test statistic '%s'.", t);

  /* increase the test counter. */
  test_counter += ntests;

  if (isTRUE(learning))
    return result;
  else
    return c_create_htest(statistic, test, pvalue[ntests - 1], df, B);

}/*UTEST*/
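/* A minimal sketch (assumed, not taken from ut_discrete()) of how an
 * asymptotic discrete test such as the mutual information test maps its
 * statistic into the p-value stored in pvalue[]: the G^2 statistic is twice
 * the sample size times the empirical mutual information (in nats), and it
 * is compared against the upper tail of a chi-square distribution with the
 * appropriate degrees of freedom. */
#include <Rmath.h>

static double mi_test_pvalue(double mi, int nobs, double df) {

double statistic = 2 * nobs * mi;

  /* upper tail of the chi-square distribution, not on the log scale. */
  return pchisq(statistic, df, FALSE, FALSE);

}/*MI_TEST_PVALUE*/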
SEXP entropy_loss(SEXP fitted, SEXP orig_data, SEXP by_sample, SEXP keep,
    SEXP debug) {

int i = 0, k = 0, ndata = 0, nnodes = LENGTH(fitted), nlevels = 0, type = 0;
int *configs = NULL, *debuglevel = LOGICAL(debug), *by = LOGICAL(by_sample);
int *to_keep = NULL;
double *res = NULL, *res_sample = NULL, **columns = NULL, cur_loss = 0;
const char *class = NULL;
SEXP data, cur_node, nodes, result, result_sample, coefs, sd, parents, try;

  /* get the node labels. */
  nodes = getAttrib(fitted, R_NamesSymbol);
  /* rearrange the columns of the data to match the network. */
  PROTECT(data = c_dataframe_column(orig_data, nodes, FALSE, TRUE));
  /* get the sample size. */
  ndata = LENGTH(VECTOR_ELT(data, 0));
  /* allocate and initialize the return value. */
  PROTECT(result = allocVector(REALSXP, 1));
  res = REAL(result);
  *res = 0;
  /* allocate the sample's contributions if needed. */
  if (*by > 0) {

    PROTECT(result_sample = allocVector(REALSXP, ndata));
    res_sample = REAL(result_sample);
    memset(res_sample, '\0', ndata * sizeof(double));

  }/*THEN*/

  /* find out which nodes to use in computing the entropy loss. */
  PROTECT(try = match(nodes, keep, 0));
  to_keep = INTEGER(try);
  R_isort(to_keep, LENGTH(try));

  /* determine the class of the fitted network. */
  class = CHAR(STRING_ELT(getAttrib(VECTOR_ELT(fitted, 0), R_ClassSymbol), 0));

  if (strcmp(class, "bn.fit.gnode") == 0) {

    /* dereference the data set's columns. */
    columns = (double **) alloc1dpointer(nnodes);
    for (i = 0; i < nnodes; i++)
      columns[i] = REAL(VECTOR_ELT(data, i));

    type = GAUSSIAN;

  }/*THEN*/
  else if ((strcmp(class, "bn.fit.dnode") == 0) || (strcmp(class, "bn.fit.onode") == 0)) {

    /* allocate an array for parents' configurations. */
    configs = alloc1dcont(ndata);

    type = DISCRETE;

  }/*THEN*/

  /* iterate over the nodes. */
  for (i = 0; i < nnodes; i++) {

    if (i == to_keep[k] - 1) {

      k++;

    }/*THEN*/
    else {

      if (*debuglevel > 0)
        Rprintf("  > skipping node %s.\n", NODE(i));

      continue;

    }/*ELSE*/

    /* get the current node. */
    cur_node = VECTOR_ELT(fitted, i);
    /* get the parents of the node. */
    parents = getListElement(cur_node, "parents");
    /* get the parameters (regression coefficients and residuals' standard
     * deviation for Gaussian nodes, conditional probabilities for discrete
     * nodes), and compute the loss. */
    switch(type)  {

      case GAUSSIAN:

        coefs = getListElement(cur_node, "coefficients");
        sd = getListElement(cur_node, "sd");

        cur_loss = c_gloss(&i, parents, REAL(coefs), REAL(sd), columns, nodes,
                     ndata, res_sample);
        break;

      case DISCRETE:

        coefs = getListElement(cur_node, "prob");
        nlevels = INT(getAttrib(coefs, R_DimSymbol));

        cur_loss = c_dloss(&i, parents, configs, REAL(coefs), data, nodes,
                     ndata, nlevels, res_sample);
        break;

    }/*SWITCH*/

    if (*debuglevel > 0)
      Rprintf("  > log-likelihood loss for node %s is %lf.\n", NODE(i), cur_loss);

    /* add the node contribution to the return value. */
    *res += cur_loss;

  }/*FOR*/

  if (*by > 0) {

    UNPROTECT(4);
    return result_sample;

  }/*THEN*/
  else {

    UNPROTECT(3);
    return result;

  }/*ELSE*/

}/*ENTROPY_LOSS*/
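/* A minimal sketch, assuming c_gloss() is the Gaussian counterpart of
 * c_dloss() used above: the loss of a bn.fit.gnode is minus the average
 * log-density of its column under the local linear regression, with the
 * fitted means derived from "coefficients" and the residuals' standard
 * deviation taken from "sd". */
#include <Rmath.h>

static double gaussian_logloss(const double *x, const double *fitted,
    double sd, int n) {

double res = 0;
int i = 0;

  /* accumulate the Gaussian log-densities of the observations. */
  for (i = 0; i < n; i++)
    res += dnorm(x[i], fitted[i], sd, TRUE);

  /* return the negated average, matching the negentropy convention above. */
  return -res / n;

}/*GAUSSIAN_LOGLOSS*/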