Example #1
0
/*
  init_llr_pv_tables

  Build the single-column (w = 1) LLR p-value tables for every number of
  sites from min to max by calling get_llr_pv() once per site count.
  If pal is set, min and max are doubled first and only every second
  site count is visited.  Progress messages are written to stderr unless
  NO_STATUS is set.
*/
extern void init_llr_pv_tables(
  int min,				/* minimum number of sites */
  int max,				/* maximum number of sites */
  int alength,				/* alphabet length */
  double *back,				/* background frequencies */
  BOOLEAN pal				/* sites are palindromes */
)
{
  int nsites;				/* number of sites */
  int step;				/* distance between tabulated site counts */

  /* set effective number of sites to double if pal */
  if (pal) { min *= 2; max *= 2; }
  step = pal ? 2 : 1;

  if (!NO_STATUS) {
    fprintf(stderr,
      "Initializing the motif probability tables for %d to %d sites...\n",
      min, max);
  }

  /* make sure the distr table gets initialized on all nodes */
  (void) get_llr_pv(0, 1, 1, LLR_RANGE, 1.0, alength, back);

  for (nsites=min; nsites<=max; nsites += step) {	/* nsites */

    /* allocate space for table (w = 0: no distribution is computed) */
    (void) get_llr_pv(0, nsites, 0, LLR_RANGE, 1.0, alength, back);

    if (!load_balance_llr(nsites, pal)) {
      continue;	/* for parallel */
    }

    /* create table */
    if (!NO_STATUS) { fprintf(stderr, "nsites = %d\r", nsites); }
    (void) get_llr_pv(0, nsites, 1, LLR_RANGE, 1.0, alength, back);

  } /* nsites */
  broadcast_llr(min, max, pal);		/* for parallel; collect the tables */

#ifdef DEBUG
  /* print results */
  {
    int n;
    /*
      Only visit site counts for which a table can exist:
      get_llr_pv() returns before building anything when n <= 1, and when
      pal is set only every second site count was tabulated above.
      Visiting any other n would dereference the NULL range/cdf pointers
      of an empty entry.
    */
    int first = (min < 2) ? 2 : min;
    for (n=first; n<=max; n += step) {
      int I;
      int w = 1;
      printf("# N    I    llr         1-cdf\n");
      for (I=0; I<=distrs[n].range[w]; I++) {		/* LLR */
        double m2, e2;
        if (distrs[n].cdf[w][I] == LOGZERO) {
          m2 = e2 = 0;
        } else {
          exp10_logx(distrs[n].cdf[w][I]/log(10.0), m2, e2, 1);
        }
        printf("%3d %3d %5.1f %3.1fe%+05.0f\n",
          n, I, (distrs[n].offset[w]+I)/distrs[n].alpha, m2, e2);
      } /* LLR */
    }
  }
#endif

  if (!NO_STATUS) { fprintf(stderr, "\nDone initializing\n"); }

} /* init_llr_pv_tables */
/*
  calc_entropy

  Derive the entropy-based summary fields of a model from its observed
  letter frequencies and the dataset's background frequencies.

  Fields written:
    model->rentropy[i]	relative entropy of column i, divided by LOG(2)
    model->llr		sum over columns of the rounded N * column
			relative entropy
    model->logev	result of get_log_sig() on the negated sum of the
			per-column get_llr_pv() values
    model->rel		(total entropy term - total cross term) / w
    model->ic		sum over columns of (max_ic - H), rounded
*/
void calc_entropy (
  MODEL *model,			/* the model */
  DATASET *dataset  		/* the dataset */
)
{
  int col, a;					/* column and letter indices */
  double *rentropy = model->rentropy;		/* per-column output array */
  double *back = dataset->back;			/* background frequencies */
  THETA obs = model->obs;			/* observed frequencies */
  int alength = dataset->alength;		/* alphabet size */
  int w = model->w;				/* motif width */
  int N = model->nsites_dis;			/* number of sites */
  double max_ic = LOG(alength)/LOG(2);		/* largest IC a column can have */
  double sum_h = 0;				/* sum of f*LOG2(f) over all cells */
  double sum_cross = 0;				/* sum of f*LOG2(p) over all cells */
  double log_pop = 0;				/* sum of column log p-values */
  double ic = 0;				/* accumulated information content */

  model->llr = 0;

  for (col = 0; col < w; col++) {		/* each motif column */
    double col_llr;				/* N * relative entropy of column */
    double H = 0;				/* minus the sum of f*LOG2(f) here */

    rentropy[col] = 0.0;

    for (a = 0; a < alength; a++) {		/* each letter */
      double f = obs(col, a);			/* motif frequency */
      double p = back[a];			/* background frequency */
      double h = 0;				/* f*LOG2(f), or 0 when f is 0 */
      double cross = 0;				/* f*LOG2(p), or 0 when p is 0 */
      double kl = 0;				/* f*LOG(f/p), or 0 if either is 0 */

      if (f) { h = f * LOG2(f); }
      if (p) { cross = f * LOG2(p); }
      if (f && p) { kl = f * LOG(f/p); }

      sum_cross += cross;
      sum_h += h;
      H -= h;
      rentropy[col] += kl;
    } /* letter */

    /* the column LLR is taken before rentropy is rescaled by LOG(2) */
    col_llr = N * rentropy[col];
    RND(col_llr, RNDDIG, col_llr);
    model->llr += col_llr;
    log_pop += get_llr_pv(col_llr, N, 1, LLR_RANGE, 1.0, alength, back);

    rentropy[col] /= LOG(2);
    ic += max_ic - H;
  } /* column */

  /* log E-value of the motif from the summed column log p-values */
  model->logev = get_log_sig(-log_pop, model->mtype, w, N, N, model->invcomp,
    model->pal, dataset);

  /* relative entropy per column */
  model->rel = (sum_h - sum_cross)/w;

  /* total information content, rounded to RNDDIG places */
  RND(ic, RNDDIG, ic);
  model->ic = ic;

} /* calc_entropy */
Example #3
0
/*
  get_llr_pv

  Maintains the file-level table distrs[], indexed by number of sites N,
  holding for each width 1..w a distribution (d), its cdf, an offset and
  a range.  Missing entries are built on demand: the w = 1 distribution
  comes from llr_distr() and wider ones are built by repeatedly combining
  with the w = 1 distribution via sum_distr().

  Early returns visible here:
    n <= 1	returns 0.0 before anything is allocated
    n not whole	returns a linear interpolation between the results for
		floor(n) and ceil(n)
    w == 0	returns 0.0 after the distrs[] array has been grown

  NOTE(review): only the table-building part of the body is visible in
  this view; how llr is looked up in the table is not shown here.
*/
extern double get_llr_pv(
  double llr,				/* log likelihood ratio */
  double n,				/* wgtd number sequences in alignment */
  int w,				/* width of alignment */
  int range,				/* desired range for resolution */
  double frac,				/* speedup factor */
  int alength,				/* length of alphabet */
  double *dd 				/* alphabet frequency distribution */
)
{
  int i, N;
  double I;				/* weighted log likelihood ratio */
  int I0, I1;				/* position of llr in table */
  double logpv;				/* log pvalue */
  double n0, n1;			/* floor and ceil of n */
  double alpha;				/* scale factor used */

  if (n<=1) return 0.0;			/* only one site p-value = 1.0 */

  /* n is not integral: interpolate linearly between the results for
     floor(n) and ceil(n) (a weighted geometric mean of the p-values,
     presumably, since the result is described as a log p-value) */
  if ( (n0=floor(n)) != (n1=ceil(n)) )
    return ( 
      (n1-n)*get_llr_pv(llr, n0, w, range, frac, alength, dd) +
      (n-n0)*get_llr_pv(llr, n1, w, range, frac, alength, dd) 
    );

  N = (int) n;				/* make n an integer */

  /* N larger than any previous N? */
  if (ndistrs < N) {			/* table array must grow */
    Resize(distrs, N+1, DISTR);		/* create array of distributions */
    /* mark every newly added entry as empty (no widths computed yet);
       NOTE(review): entries at or below the old ndistrs are not touched
       here -- confirm entry 0 is never read */
    for (i=ndistrs+1; i<=N; i++) {
      distrs[i].w = 0;
      distrs[i].offset = NULL;
      distrs[i].range = NULL;
      distrs[i].d = NULL;
      distrs[i].cdf = NULL;
      distrs[i].mean = 0;
    }
    ndistrs = N;				/* set maximum N */
  }

  /* done if w == 0 */
  if (w == 0) return 0.0;

  /* w larger than any previous w for this N? */
  if (distrs[N].w < w) {			/* larger w */
    /* per-width arrays are indexed 1..w, hence w+1 slots */
    Resize(distrs[N].d, w+1, double *);
    Resize(distrs[N].cdf, w+1, double *);
    Resize(distrs[N].offset, w+1, int);
    Resize(distrs[N].range, w+1, int);

    /* first time? */ 
    if (distrs[N].w == 0) {			/* get the w=1 distribution */
      distrs[N].d[1] = llr_distr(alength, dd, N, range, frac,
        &distrs[N].alpha, &distrs[N].offset[1], &distrs[N].range[1]);
      /* get mean of LLR for w = 1 */
      for (i=0; i<=distrs[N].range[1]; i++) {
        /* this local llr shadows the llr parameter inside the loop;
           table index i maps to the value (i + offset) / alpha */
        double llr = (i + distrs[N].offset[1]) / distrs[N].alpha;
        distrs[N].mean += exp(distrs[N].d[1][i])*llr;
      }
      distrs[N].cdf[1] = cdf(distrs[N].d[1], distrs[N].range[1]);
      distrs[N].w = 1;
    } /* first time */

    /* get the distributions for widths oldw .. maxw */
    /*fprintf(stderr, "enter cdf N= %d w= %d oldw= %d\n", N, w, distrs[N].w);*/
    for (i=distrs[N].w+1; i<=w; i++) {		/* width */
      /* width i is built from width i-1 combined with the w=1 distribution */
      distrs[N].d[i] = sum_distr(
        distrs[N].d[i-1], 
        distrs[N].range[i-1], 
        distrs[N].d[1], 
        distrs[N].range[1], 
        &distrs[N].range[i]
      );
      /* offsets add up across widths */
      distrs[N].offset[i] = distrs[N].offset[i-1] + distrs[N].offset[1];
      distrs[N].cdf[i] = cdf(distrs[N].d[i], distrs[N].range[i]);
    } /* width */
    /*fprintf(stderr, "leave cdf\n");*/
    distrs[N].w = w; 				/* set maximum w */
  } /* new w */
Example #4
0
File: subseq7.c Project: CPFL/gmeme
/*
  score_llr_pop

  Score one candidate start for each of the n_nsites0 entries of
  s_points[].  Sites are taken from maxima[] in order and their letters
  are accumulated into per-column weighted counts; the counts carry over
  from one nsites0 value to the next, so each site is aligned only once.
  For every s_point flagged "evaluate", the column LLRs and their
  p-values (get_llr_pv) are computed, col_scores[] is overwritten with
  the column log p-values, and the s_point is updated if the new score
  (-log_pop) beats its current score.

  An s_point is skipped when fewer than nsites0 maxima are available.

  Returns the number of nsites0 values that were not skipped.
*/
static int score_llr_pop(
  MOTYPE mtype,				/* type of model */
  int w,				/* width of motif */
  DATASET *dataset,			/* the dataset */
  int iseq,				/* sequence number of starting point */
  int ioff,				/* sequence offset of starting point */
  char *eseq,				/* integer encoded subsequence */
  char *name,				/* name of sequence */
  int n_nsites0,			/* number of nsites0 values to try */
  int n_maxima,				/* number of local maxima */
  P_PROB maxima,			/* sorted local maxima indices */
  double *col_scores,			/* column scores for last start point */
  S_POINT s_points[]			/* array of starting points */
)
{
  /* natural log of 2: converts a per-site LLR in nats to bits */
  const double ln2 = 0.69314718055994530942;

  int i, j, k, i_nsites0;
  int next_seq;				/* index of next subsequence to align */
  int n_starts = 0;			/* number of nsites0 tried */
  int nsites0;				/* starting nsites rounded down */
  int alength = dataset->alength;	/* length of alphabet */
  double *back = dataset->back;		/* background frequencies */
  SAMPLE **samples = dataset->samples;	/* the sequences */
  double counts[MAXSITE][MAXALPH];	/* array to hold observed counts */
  double wN;				/* weighted number of sites */
  double log_pop;			/* log product of p-values */
  double min_ic = dataset->min_ic;	/* min. per-column IC */

  /* initialize letter counts to 0 */
  wN = 0;				/* weighted number of sites */
  for (i=0; i<w; i++) {
    for (j=0; j<alength; j++) { counts[i][j] = 0; }
  }

  /* calculate the product of p-values of information content
     of the top nsite0 probability positions
  */
  for (i_nsites0=0, next_seq=0; i_nsites0 < n_nsites0; i_nsites0++) {

    /* don't score this start if not enough maxima found */
    nsites0 = (int) s_points[i_nsites0].nsites0;	/* round down */
    if (n_maxima < nsites0) {
      continue;
    }
    n_starts++;					/* number of nsites0 tried */

    /* Align the next highest probability sites:
       add the weighted letters of each new site to the column counts
       and its weight to wN.  Sites before next_seq were already added
       while handling an earlier nsites0 value.
    */
    for (k=next_seq; k<nsites0; k++) {		/* site */
      int jj;
      BOOLEAN on_minus = maxima[k].ic;		/* on - strand */
      int y = maxima[k].y;			/* position of site */
      SAMPLE *s = samples[maxima[k].x];		/* sequence */
      int off = on_minus ? s->length-w-y : y;	/* - strand offset from rgt. */
      char *res = on_minus ? s->resic+off : s->res+off;	/* integer sequence */
      double sw = s->sw;			/* sequence weight */
      //
      // TLB: Note that log_not_o contains Pr(site) scaled to have max=1
      // when called from subseq7() but not when called from discretize().
      //
      // Why not revert to not_o[y] here?  TLB: Because the other one works
      // much better, although it's kind of a hack.
      //
      // FIXME: We are assuming that priors are always symmetrical here.
      double esw = sw * INT_DELOG(s->log_not_o[y]);	// Pr(site not overlapped) * Pr(site)
      wN += esw;				/* total sequence wgt */

      /* residue counts */
      for (j=0; j<w; j++) {			/* position in sequence */
        int c = res[j];
        if (c < alength) {			/* normal letter */
          counts[j][c] += esw;
        } else {				/* 'X' : esw * back[letter] */
          for (jj=0; jj<alength; jj++) { counts[j][jj] += esw * back[jj]; }
        }
      } /* position */

    } /* site */
    next_seq = k;				/* next site to align */

    /*
      For DNA palindromes, combine the counts in symmetrically opposing columns
    */
    if (dataset->pal) { palindrome(counts, counts, w, alength); }

    // Updated on 13-12-06: Only calculate objective function score if the
    // current s_point is supposed to be evaluated:
    if (s_points[i_nsites0].evaluate) {
      /*
        convert COUNTS to FREQUENCIES and calculate log likelihood ratio
      */
      log_pop = 0;				/* product of p-values */
      for (i=0; i<w; i++) {			/* position in site */
        double llr = 0;				/* log-like-ratio of column */
        double log_pv;				/* log of column p-value */
        double col_ic;				/* info content of column (bits) */

        /* compute log likelihood for position i */
        for (j=0; j<alength; j++) {		/* letter */
          double f = wN ? counts[i][j] / wN : 1;	/* observed letter frequency */
          double p = back[j];			/* backgrnd letter frequency */
          double log_f = LOGL(f);
          double log_p = LOGL(p);
          double llr_ij = (f&&p) ? f*(log_f - log_p) : 0;
          llr += llr_ij;
        } /* letter */

        /* info content in bits: divide by ln(2) at full precision rather
           than by a truncated approximation, so the min_ic cut is exact */
        RND(llr/ln2, RNDDIG, col_ic);
        llr *= wN;				/* convert entropy to ll */
        RND(llr, RNDDIG, llr);			/* round to RNDDIG places */
        log_pv = get_llr_pv(llr, wN, 1, LLR_RANGE, 1.0, alength, back);
        if (col_ic < min_ic) { log_pv = 0; }	/* ignore low ic columns */

        if (dataset->use_llr) {
          // Using llr instead of pop:
          col_scores[i] = log_pv;
          log_pop -= llr;
        } else {
          log_pop += col_scores[i] = log_pv;
        }
      } /* position in site */
      RND(log_pop, RNDDIG, log_pop);

      /* print the start sequence and other stuff */
      if (TRACE) {
        if (eseq) {
          char seq[MAXSITE+1];
          r2seq(seq, eseq, w);
          fprintf(stdout,
            "( %3d %3d ) ( %*.*s ) %.*s logpop %8.3f nsites0 %6d\n",
            iseq+1, ioff+1, MSN, MSN, name, w, seq, -log_pop, nsites0);
        } else {
          fprintf(stdout,
            "l_off %3d w %d logpop %8.3f nsites0 %6d\n",
            iseq, w, -log_pop, nsites0);
        }
      }

      /* save the best start */
      if (-log_pop > s_points[i_nsites0].score) {
        /* Save the starting point and offset so we can re-calculate
           eseq later. */
        s_points[i_nsites0].iseq = iseq;
        s_points[i_nsites0].ioff = ioff;
        s_points[i_nsites0].e_cons0 = eseq;
        s_points[i_nsites0].wgt_nsites = wN;
        s_points[i_nsites0].score = -log_pop;
      }
    } // Evaluating only if told to do so.
  } /* nsites0 */

  return n_starts;
} /* score_llr_pop */