Ejemplo n.º 1
0
/*
 * calc_entropy
 *
 * Derives per-column and whole-motif entropy statistics from the model's
 * observed letter frequencies and the dataset's background frequencies,
 * and stores them in the model:
 *
 *   model->rentropy[i]  sum over letters of f * LOG(f/p), divided by LOG(2)
 *   model->llr          sum over columns of the rounded N * (column sum)
 *   model->logev        get_log_sig() applied to the negated sum of the
 *                       per-column get_llr_pv() values
 *   model->rel          (total f*LOG2(f) - total f*LOG2(p)) / w
 *   model->ic           sum over columns of (max_ic - H), rounded
 *
 * Terms whose frequency (f or p, as relevant) is zero contribute 0.
 * No small-sample correction is applied to the information content.
 */
void calc_entropy (
  MODEL *model,			/* the model; results are written into it */
  DATASET *dataset  		/* the dataset; supplies alphabet and background */
)
{
  const int alength = dataset->alength;		/* length of alphabet */
  double *back = dataset->back;			/* background model freqs */
  /* NOTE(review): the accessor obs(col, letter) used below presumably
   * expands through a project macro tied to this local's name; keep the
   * name `obs` unchanged. */
  THETA obs = model->obs;			/* observed frequencies */
  double *rentropy = model->rentropy;		/* per-column output array */
  int w = model->w;				/* width of motif */
  int N = model->nsites_dis;			/* number of sites */
  double max_ic = LOG(alength)/LOG(2);		/* maximum IC per column */
  double ent = 0;				/* running sum of f*LOG2(f) */
  double rel = 0;				/* running sum of f*LOG2(p) */
  double log_pop = 0;				/* sum of get_llr_pv() results */
  double ic = 0;				/* running information content */
  int col;

  model->llr = 0;

  /* walk the motif one column at a time */
  for (col = 0; col < w; col++) {
    double H = 0;			/* -sum of f*LOG2(f) for this column */
    double col_llr;			/* log likelihood ratio of this column */
    int letter;

    rentropy[col] = 0.0;

    for (letter = 0; letter < alength; letter++) {
      double f = obs(col, letter);	/* motif freq */
      double p = back[letter];		/* background freq */
      double h = 0;			/* f*LOG2(f), or 0 when f is 0 */
      double cross = 0;			/* f*LOG2(p), or 0 when p is 0 */
      double ratio_term = 0;		/* f*LOG(f/p), or 0 if f or p is 0 */

      if (f) {
        h = f * LOG2(f);
      }
      if (p) {
        cross = f * LOG2(p);
        if (f) {
          ratio_term = f * LOG(f/p);
        }
      }

      /* always accumulate, even when the term is zero */
      rel += cross;
      ent += h;
      H -= h;
      rentropy[col] += ratio_term;
    } /* alphabet letter */

    /* column LLR is taken before rentropy[col] is rescaled by LOG(2) */
    col_llr = N * rentropy[col];
    RND(col_llr, RNDDIG, col_llr);	/* round to RNDDIG places */
    model->llr += col_llr;
    log_pop += get_llr_pv(col_llr, N, 1, LLR_RANGE, 1.0, alength, back);

    rentropy[col] /= LOG(2);
    ic += max_ic - H;
  } /* position in motif */

  /* log E-value of the motif */
  model->logev = get_log_sig(-log_pop, model->mtype, w, N, N, model->invcomp,
    model->pal, dataset);

  /* relative entropy per column */
  model->rel = (ent - rel)/w;

  /* total information content, rounded to RNDDIG places */
  RND(ic, RNDDIG, ic);
  model->ic = ic;

} /* calc_entropy */
Ejemplo n.º 2
0
/**
 * search_matrix_partition
 *
 * Visits every (width, nsites) cell inside the given partition of the
 * matrix. For each cell whose S_POINT is initialised (non-NULL, non-empty
 * cons0 string) the significance is computed with get_log_sig(), stored in
 * the S_POINT's sig field, and compared against the best value seen so far.
 * Uninitialised S_POINTs are skipped and left untouched.
 *
 * \return The initialised S_POINT with the smallest significance value; on
 * ties the S_POINT visited later wins. Returns NULL if no initialised
 * S_POINT in the partition had a significance less than or equal to BIG.
 */
static S_POINT *search_matrix_partition (
  SP_MATRIX *sp_mat, ///< The matrix containing the partition of interest
  DATASET *data,     ///< Sequence dataset, forwarded to get_log_sig().
  MODEL *model,      ///< The nascent motif model; its mtype, invcomp and pal
                     ///< fields are forwarded to get_log_sig().
  PARTITION *part    ///< The coordinates of the partition within the matrix
) {
  S_POINT *best = NULL;   // Best S_POINT found so far (none yet).
  double best_sig = BIG;  // Significance of that S_POINT.
  int width;

  for (width = part->min_w; width <= part->max_w; width++) {
    int nsites;

    assert (width <= get_max_width(sp_mat));

    for (nsites = part->min_n; nsites <= part->max_n; nsites++) {
      int w_off, n_off;
      S_POINT *sp;
      double sig;

      assert (nsites <= get_max_nsites(sp_mat));

      // Translate absolute width/nsites into offsets from the matrix minimums.
      w_off = width - get_min_width(sp_mat);
      n_off = nsites - get_min_nsites(sp_mat);
      sp = get_spoint(sp_mat, w_off, n_off);

      // An s_point counts as initialised only if it holds a non-empty cons0
      // string; anything else is ignored.
      if (sp->cons0 == NULL || sp->cons0[0] == '\0') {
        continue;
      }

      sig = get_log_sig(sp->score, model->mtype, sp->w0, sp->wgt_nsites,
                        sp->nsites0, model->invcomp, model->pal, data);
      sp->sig = sig;

      // "<=" means a later s_point replaces an earlier one of equal value.
      if (sig <= best_sig) {
        best_sig = sig;
        best = sp;
      }
    } // nsites
  } // width

  return best;
} // search_matrix_partition