예제 #1
0
/*
  sum_distr

  Build the (log) distribution of RV1 + RV2 from the (log) distributions
  of RV1 (entries 0..r1) and RV2 (entries 0..r2).

  Every pair (a, b) whose entries are both different from LOGZERO adds
  d1[a]+d2[b] into slot a+b of the result through LOGL_SUM (presumably a
  log-space addition; the macro is defined outside this block).

  The result array has r1+r2+1 entries and is allocated here with Resize.
  *r_sum receives the largest index whose entry is greater than LOGZERO,
  or -1 when no entry is.

  Returns the newly allocated array.
*/
double *sum_distr(
  double *d1,				/* (log) distribution of RV1 */
  int r1,				/* range of RV1 */
  double *d2,				/* (log) distribution of RV2 */
  int r2, 				/* range of RV2 */
  int *r_sum				/* range of sum of RV1 and RV2 */
)
{
  int a, b, top;
  int range = r1 + r2;			/* largest possible value of the sum */
  double *d_sum = NULL;			/* result: distribution of the sum */

  Resize(d_sum, range+1, double);	/* one slot per value 0..range */

  /* start with every value of the sum marked as impossible */
  for (top = 0; top <= range; top++) d_sum[top] = LOGZERO;

  /* accumulate the contribution of each (a, b) pair into slot a+b */
  for (a = 0; a <= r1; a++) {
    if (d1[a] != LOGZERO) {
      for (b = 0; b <= r2; b++) {
        if (d2[b] != LOGZERO) {
          int s = a + b;
          d_sum[s] = LOGL_SUM(d_sum[s], d1[a]+d2[b]);
        }
      }
    }
  }

  /* scan down from the top for the last entry above LOGZERO */
  top = range;
  while (top >= 0 && !(d_sum[top] > LOGZERO)) top--;
  *r_sum = top;

  return d_sum;
} /* sum_distr */
예제 #2
0
/*
  cdf

  From an integer valued (log) distribution d[0..r], build an array of
  the same length in which entry x combines d[x..r] via LOGL_SUM, i.e.
  the accumulation runs from the top of the range downwards.

  Entries x (0 < x < r) for which d[x] equals LOGZERO are then replaced
  by a straight line drawn between the two nearest surrounding anchor
  points (an anchor is index r, index 0, or any index whose d[] entry
  is not LOGZERO).

  Returns the array, allocated here with Resize.
*/
static double *cdf(
  double *d,				/* integer valued distribution */
  int r					/* range [0..r] */
)
{
  double *cdf = NULL;
  int hi, lo, x;

  Resize(cdf, r+1, double);

  /* accumulate from the top of the range down to 0 */
  cdf[r] = d[r];
  for (x = r; x > 0; x--) {
    cdf[x-1] = LOGL_SUM(cdf[x], d[x-1]);
  }

  /* walk down from r, one segment [lo, hi] between anchors at a time */
  hi = r;
  while (hi > 0) {
    double slope;

    /* next anchor below hi: first index with d[] != LOGZERO, else 0 */
    lo = hi - 1;
    while (lo > 0 && d[lo] == LOGZERO) lo--;

    /* lo < hi always holds here, so the divisor is never zero */
    slope = (cdf[hi] - cdf[lo]) / (hi - lo);

    /* overwrite the interior of the segment with the interpolated line */
    for (x = lo + 1; x < hi; x++) {
      cdf[x] = cdf[lo] + (x - lo) * slope;
    }

    hi = lo;
  }

  return cdf;
} /* cdf */
예제 #3
0
/*
  llr_distr

  Compute a (log) probability table over integer-scaled LLR values for N
  samples drawn from the discrete distribution dd[0..A-1], using a
  recursion over the letters of the alphabet.

  Side effects visible in this block:
    - dd[] is renormalized IN PLACE (EPSILON is added to every entry and
      the result is divided by the total).
    - *alpha  is set to desired_range/(max-min).
    - *offset is set so that entry 0 of the returned array corresponds to
      the scaled LLR value *offset.
    - *range  is set to maxI[N]-minI[N]; *range+1 entries of the returned
      array are filled.

  Returns an array allocated with Resize (*range+2 entries); all other
  work arrays are released before returning.
  NOTE(review): the caller presumably owns and frees the result - confirm.
*/
double *llr_distr(
  int A,				/* dimension of discrete distribution */
  double *dd,				/* discrete distribution */
  int N,				/* number of samples */
  int desired_range,			/* desired range for scaled LLR */
  double frac,				/* fraction of scores to use */
  double *alpha,			/* scale factor for scaled LLR */
  int *offset,				/* prob[0] = prob(offset) */
  int *range				/* range for scaled LLR */
)
{
  int i; 				/* index over alphabet */
  int n;				/* index over samples */
  int k;				/* other index */
  int I;				/* LLR */
  double dd_sum;			/* sum of dd */
  int **IP;				/* I'_i[n] */
  int *minI=NULL;			/* minimum intermediate value of I */
  int *maxI=NULL;			/* maximum intermediate value of I */
  int Irange;				/* maxI-minI+1 */
  double logNfact;			/* log N! */
  double **logP;			/* log P_i[n] */
  double **logSP;			/* log script_P[# samples][LLR] */
  double *prob=NULL;			/* final probability distribution */
  double min, max, min_dd;

  /* create space for IP, P, minI and maxI */
  create_2array(IP, int, A, N+1);
  create_2array(logP, double, A, N+1);
  Resize(minI, N+1, int);
  Resize(maxI, N+1, int);

  /* make sure distribution sums to 1.0 and has no 0's;
     note that this overwrites the caller's dd[] */
  for (i=dd_sum=0; i<A; i++) dd_sum += dd[i] + EPSILON;
  for (i=0; i<A; i++) dd[i] = (dd[i]+EPSILON)/dd_sum;

  /* compute log N! as the sum of log(i) for i = 2..N */
  logNfact = 0;
  for (i=2; i<=N; i++) logNfact += log(i); 

  /* get estimates of minimum and maximum values of llr */
  for (i=0, min_dd=1; i<A; i++) min_dd = MIN(min_dd, dd[i]);
  max = NINT(-N * log(min_dd));
  /* NOTE(review): every term below is multiplied by
     log(dd[i]) - log(dd[i]), so min evaluates to 0 - confirm intended */
  for (i=min=0; i<A; i++) min += dd[i]*N*(log(dd[i]) - log(dd[i]));
  min = NINT(min);
  /*printf("min = %f max = %f\n", min, max);*/

  /* set alpha to achieve the desired range */
  *alpha = desired_range/((max-min));
  /* *alpha = NINT(((int)desired_range)/((max-min)));
  if (*alpha < 1) *alpha = 1;*/
  /*fprintf(stderr, "range %d max %f min %f alpha = %f\n",desired_range, max, min, *alpha);*/

  /* compute I', P, minI and maxI */ 
  for (n=0; n<=N; n++) minI[n] = maxI[n] = 0;
  for (i=0; i<A; i++) {				/* index over alphabet */
    double logdd = LOG(dd[i]);			/* log(dd[i]) */
    IP[i][0] = 0; 				/* I'_i(0) */
    logP[i][0] = 0;				/* log P_i(0) */
    for (n=1; n<=N; n++) {			/* index over samples */
      /* scaled, rounded contribution of letter i occurring n times */
      IP[i][n] = NINT(*alpha*n*log(n/(N*dd[i]))); 	/* I'_i(n) */
      /* running sum: adds logdd and subtracts log(n) at each step */
      logP[i][n] = logP[i][n-1] + logdd - log(n);	/* log P_i(n) */
      /* widen the bounds on intermediate I for n samples */
      for (k=1; k<=n; k++) {			/* index over samples of new */
	minI[n] = MIN(minI[n], minI[n-k] + IP[i][k]);
	maxI[n] = MAX(maxI[n], maxI[n-k] + IP[i][k]);
      }
    }
  }

  /* get overall minI and maxI (collected in element 0); elements 1..N
     are reset so they can be reused as per-row index bounds below */
  for (n=1; n<=N; n++) {
    /*printf("minI[%d] %d maxI[%d] %d\n", n, minI[n], n, maxI[n]);*/
    minI[0] = MIN(minI[0], minI[n]);		/* min for intermediates */
    maxI[0] = MAX(maxI[0], maxI[n]);		/* max for intermediates */
    minI[n] = LOGZEROI;
    maxI[n] = 0;
  }
  Irange = maxI[0] - minI[0] + 2;
  *offset = minI[0] - 1;			/* I offset: I=-1 is array 0 */
  /*printf("minI %d maxI %d Irange %d\n", minI[0], maxI[0], Irange);*/
  minI[0] = LOGZEROI;
  maxI[0] = 0;

  /* create script_P arrays with enough space for intermediate calculations */
  create_2array(logSP, double, N+1, Irange+1);
  
  /* clear intermediate probability array;
     NOTE(review): the rows hold Irange+1 columns but only columns
     0..Irange-1 are cleared here - confirm column Irange is never read */
  for (n=0; n<=N; n++) for(I=0; I<Irange; I++) logSP[n][I] = LOGZERO;

  /* init probability array for first letter in alphabet;
     from here on minI[n]/maxI[n] are array indices (I minus *offset) */
  for (n=0; n<=N; n++) {
    I = IP[0][n] - *offset;			/* offset I */
    logSP[n][I] = logNfact + logP[0][n];	/* init */
    minI[n] = maxI[n] = I;
  }

  /* compute probabilities recursively; n runs downwards so that rows
     n-k (k>=1) read below have not yet been updated for letter i */
  for (i=1; i<A; i++) {			/* index over (rest of) alphabet */
    for (n=N; n>=0; n--) {		/* index over samples */
      for (k=1; k<=n; k++) {		/* index over samples of new letter */
        /* these locals shadow the function-level doubles min and max */
        int min = minI[n-k];
        /* upper index is pulled down by (1-frac) of the row's width;
           the double result is converted to int on assignment */
        int max = MAX(min, maxI[n-k] - (1-frac)*(maxI[n-k]-minI[n-k]+1));
        /*printf("min %d maxI %d max %d\n", min, maxI[n-k], max);*/
        for (I=min; I<=max; I++) {	/* index over I */
          if (logSP[n-k][I] > LOGZERO) {
	    /*printf("i %d old: %d %d new: %d %d\n", i, n-k, I, n,I+IP[i][k]);*/
            logSP[n][I+IP[i][k]] = 
              LOGL_SUM(logSP[n][I+IP[i][k]], logP[i][k] + logSP[n-k][I]);
          }
	}
	/* get current minimum and maximum I in intermediate arrays */
	minI[n] = MIN(minI[n], minI[n-k]+IP[i][k]);
	maxI[n] = MAX(maxI[n], maxI[n-k]+IP[i][k]);
      }
      /* for the last letter only row N is copied out below */
      if (n==N && i==A-1) break;	/* all done */
    }
  }

  /* compute range */
  /*printf("minI[N] %d maxI[N] %d\n", minI[N], maxI[N]);*/
  *range = maxI[N] - minI[N]; 

  /* move to probability array with prob(offset) in position 0;
     NOTE(review): *range+2 entries are allocated but only *range+1 are
     written, so the last entry is left uninitialized by this function */
  *offset += minI[N];			/* prob[0] = prob(offset) */
  Resize(prob, *range+2, double);
  for (I=minI[N]; I<=maxI[N]; I++) prob[I-minI[N]] = logSP[N][I];
  /*fprintf(stderr, "N= %d range= %d offset= %d alpha= %f\n", N, *range, 
    *offset, *alpha);*/
      
  /* free up space */
  free_2array(IP, A);
  free_2array(logP, A);
  free_2array(logSP, N+1);
  myfree(minI);
  myfree(maxI);

  return prob;
} /* llr_distr */
예제 #4
0
파일: tcm.c 프로젝트: BackofenLab/MEMERIS
/*
  tcm_e_step

  E step for the tcm model.  For every sequence in the dataset that is at
  least as long as the motif width w, and for each of its m = length-w+1
  site starts, this function:
    1. computes log-space site scores under the background and motif
       models for each strand and combines them with LOGL_SUM into z[j];
    2. adds z[j] into the running total logpX;
    3. turns sz[k][j] into exp(sz[k][j] - z[j]) * not_o[j], capped at 1.0;
    4. sets z[j] to the sum of sz[k][j] over strands, capped at 1.0, and
       zeroes z[] for the tail positions m..length-1.
  Afterwards smooth() is called (its return value is ignored).

  When dataset->secondaryStructureFilename is not NULL, the per-site value
  MIN(1.0, sigma[j]*lambda*m) replaces lambda, and s->sigma[] is
  recomputed for each sequence (possibly with an enlarged pseudocount;
  the largest enlargement is recorded in MAXADJUST, which is defined
  outside this block).

  Returns logpX divided by log(2.0).
*/
double tcm_e_step(
  MODEL *model,			/* the model */
  DATASET *dataset  		/* the dataset */
)
{
  int i, j, k, ii;
  /* NOTE(review): model->logtheta is read here, before
     convert_theta_to_log() runs below; this presumably relies on that
     call filling the same storage rather than replacing it - confirm */
  THETA logtheta1 = model->logtheta;	/* motif log(theta) */
  int w = model->w;			/* motif width */
  int n_samples = dataset->n_samples;	/* number of sequences */
  BOOLEAN invcomp = model->invcomp;     /* use reverse complement strand, too */
  int ndir = invcomp ? 2 : 1;           /* number of strands */
  double log_sigma = log(1.0/ndir);	/* log \sigma */
  double lambda = model->lambda;	/* \lambda of tcm model */
  /* the next two are overwritten per site in the secondary structure case */
  double log_lambda = LOG(lambda);	/* log \lambda */
  double log_1mlambda = LOG(1-lambda);	/* log (1 - \lambda) */
  double logpX;				/* log likelihood; no erase or smooth */

  /* E step */

  convert_theta_to_log(model, dataset);

  /* calculate all the posterior offset probabilities */
  logpX = 0;

  for (i=0; i < n_samples; i++) {	/* sequence */
    SAMPLE *s = dataset->samples[i];
    int lseq = s->length;
    int m = lseq - w + 1;		/* number of possible sites */
    double *zi = s->z;			/* Pr(z_ij=1 | X_i, \theta) */
    double *not_o = s->not_o;		/* Pr(v_ij = 1) */
    double *lcb = s->logcumback;	/* cumulative background probability */

    if (lseq < w) continue;		/* sequence too short for motif */

	 /* added by M.H. */
	 /* use log sigma_ij * lambda * m instead of lambda if secondary structure information is given */
	 /* NOTE: log_sigma is the prior for + or - strand --> here only + strand --> log_sigma = 0 */
	 if (dataset->secondaryStructureFilename != NULL) {
	 
	 	/* first check if the maximum sigma * (lambda*m) > 1   --> if so P(Zij=1 | \phi) can be > 1 */
	   double Pcount = dataset->secondaryStructurePseudocount;
		double maxPrior = ((s->max_ss_value + Pcount) / (s->sum_ss_value + (m * Pcount))) * (lambda*m);

		if (maxPrior > 1.0 ) {
			/* compute new pseudocount that gives sigma_i_max = 1 = (max_ss_value + pseudocount) / (\sum (ss_value[i] + pseudocount)) * lambda * m */
			/* NOTE(review): the divisor m * (lambda - 1) is zero when lambda == 1 - confirm that cannot occur here */
			Pcount = (-1 * s->max_ss_value * lambda * m + s->sum_ss_value) / (m * (lambda - 1));
	
			/* for statistics keep maximum adjustment */
			if (Pcount - dataset->secondaryStructurePseudocount > MAXADJUST) {
				MAXADJUST = Pcount - dataset->secondaryStructurePseudocount;
			}
		}
	 	
		/* compute new sigmas with this pseudocount */
		double sum = s->sum_ss_value + (m * Pcount);  		 /* \sum ss_value[i] + m*pseudocount */
    	for (j=0; j < m; j++) {
	   	s->sigma[j] = (s->ss_value[j] + Pcount) / sum;
		}
	 }


    for (k=0; k<ndir; k++) {		/* strand */
      BOOLEAN ic = (k==1);		/* doing - strand */
      double *szik = s->sz[k];		/* Pr(X_i | z_ij=1, s_ijk=1, \theta) */

      for (j=0; j<m; j++) {		/* site start */
		
	 	   /* added by M.H. */
		   /* use the prior instead of lambda */
		   /* NOTE(review): sigma[j] is indexed by j on both strands, although the - strand reads the sequence at offset lseq-w-j - confirm intended */
	 	   if (dataset->secondaryStructureFilename != NULL) {
			  double p = MIN(1.0, ( s->sigma[j] * lambda * m ) ) ;		/* rounding */
			  log_lambda = LOG(p);	
  			  log_1mlambda = LOG(1-p);	
		   }
		
       	 /* log Pr(X_ij | s_ijk=1, \theta0) \sigma (1-\lambda) */
			 double log_pXijtheta0 = log_sigma + log_1mlambda;	
         /* log Pr(X_ij | s_ijk=1, \theta1) \sigma \lambda */
			double log_pXijtheta1 = log_sigma + log_lambda;
         int off = ic ? lseq-w-j : j;	/* - strand offset from rgt. */
         char *res = ic ? s->resic+off : s->res+off;	/* integer sequence */

        /* calculate the probability of positions in the site under the
  	  background and foreground models
        */
        log_pXijtheta0 += Log_back(lcb, j, w);
        for (ii=0; ii<w; ii++) log_pXijtheta1 += logtheta1(ii, (int)res[ii]);
 
        /* set log szik to:
          Pr(X_i | z_ij=1, s_ijk=1, \theta) \sigma \lambda
        */
        szik[j] = log_pXijtheta1;
 
        /* set z_ij to log Pr(X_ij | \phi): (6-21-99 tlb)
          log(
	    \sigma * sum_{k=0}^{ndir-1} ( 
	      Pr(X_i|z_ij=1, s_ijk=1, \theta) \lambda +
	      Pr(X_i|z_ij=0, s_ijk=1, \theta) (1-\lambda) 
	    )
          )
          The first strand (k==0) initializes zi[j]; the second strand
          is combined into the existing value.
        */
        zi[j] = k==0 ? LOGL_SUM(log_pXijtheta0, log_pXijtheta1) : 
          LOGL_SUM(zi[j], LOGL_SUM(log_pXijtheta0, log_pXijtheta1));
      } /* site start */
    } /* strand */

    /* compute log Pr(X | \phi) = sum_i,j log(Pr(X_ij)) */
    for (j=0; j<m; j++) {			/* site start */
      logpX += zi[j];				/* z_ij = log Pr(X_ij | \phi) */
    }

    /* sz_ijk : normalize, delog and account for erasing
      Pr(z_ij=1, s_ijk=1 | X_i, \phi) \approx
           P(z_ij=1, s_ijk=1 | X_i, \phi) P(v_ij = 1)
    */
    for (k=0; k<ndir; k++) {		/* strand */
      double *szik = s->sz[k];		/* Pr(X_i | z_ij=1, s_ijk=1, \phi) */
      for (j=0; j<m; j++) {		/* site start */
        /* note zi[j] holds Pr(X_ij|\phi) */
        szik[j] = MIN(1.0, exp(szik[j] - zi[j]) * not_o[j]);	/* roundoff */
      } /* site */
    } /* strand */

    /* z_ij : sum of sz_ijk (zi[j] is overwritten; it no longer holds a log value) */
    for (j=0; j<m; j++) {		/* site start */
      for (k=zi[j]=0; k<ndir; k++) {	/* strand */
        zi[j] += s->sz[k][j];
      } /* strand */
      zi[j] = MIN(1.0, zi[j]);		/* avoid roundoff errors */
    } /* site */
    /* positions where a width-w site cannot start get z = 0 */
    for (j=m; j<lseq; j++) {		/* tail of sequence */
      zi[j] = 0;
    }

  } /* sequence */

  /* smooth so no window of size w has z_i which sum to greater than 1.0 */
  (void) smooth(w, model, dataset);

  /* dividing by log(2.0) rescales the accumulated total; this yields a
     base-2 value if LOG/LOGL_SUM work in natural logs - TODO confirm */
  return (logpX/log(2.0));
} /* tcm_e_step */