/*
 * sum_distr
 *
 * Combine the (log) distributions of two integer-valued random variables,
 * RV1 on [0..r1] and RV2 on [0..r2], into the (log) distribution of their
 * sum.  Entries equal to LOGZERO are treated as impossible values and are
 * skipped.
 *
 * Returns a newly allocated array of r1+r2+1 entries (obtained via Resize).
 * *r_sum receives the largest index whose entry is greater than LOGZERO,
 * or -1 when no entry is.
 */
double *sum_distr(
  double *d1,				/* (log) distribution of RV1 */
  int r1,				/* range of RV1 */
  double *d2,				/* (log) distribution of RV2 */
  int r2,				/* range of RV2 */
  int *r_sum				/* range of sum of RV1 and RV2 */
)
{
  int range = r1 + r2;			/* potential range of sum */
  double *d_sum = NULL;			/* distribution of sum */
  int a;				/* value of RV1 */
  int b;				/* value of RV2 */
  int top;				/* scan position in d_sum */

  Resize(d_sum, range+1, double);	/* space for distribution */

  /* every value of the sum starts out impossible */
  for (top = 0; top <= range; top++) {
    d_sum[top] = LOGZERO;
  }

  /* accumulate each possible pair (a, b) into the slot for a+b */
  for (a = 0; a <= r1; a++) {
    if (d1[a] != LOGZERO) {
      for (b = 0; b <= r2; b++) {
        if (d2[b] != LOGZERO) {
          d_sum[a+b] = LOGL_SUM(d_sum[a+b], d1[a]+d2[b]);
        }
      }
    }
  }

  /* walk down from the top until an entry above LOGZERO is found */
  top = range;
  while (top >= 0 && !(d_sum[top] > LOGZERO)) {
    top--;
  }
  *r_sum = top;				/* non-zero range */

  return d_sum;
} /* sum_distr */
/*
 * cdf
 *
 * Build an array of r+1 values from the integer-valued distribution d on
 * [0..r]: entry x is the LOGL_SUM of d[x], d[x+1], ..., d[r], i.e. an
 * accumulation running from the top of the range downwards.
 *
 * Afterwards every interior position whose d[] entry equals LOGZERO is
 * replaced by a straight-line interpolation between the nearest positions
 * on either side that were kept (position 0 and position r always count
 * as kept).
 *
 * Returns the newly allocated array (obtained via Resize).
 */
static double *cdf(
  double *d,				/* integer valued distribution */
  int r					/* range [0..r] */
)
{
  double *cdf = NULL;			/* result; local shadows the function name */
  int hi;				/* upper anchor of current gap */
  int lo;				/* lower anchor of current gap */
  int pos;				/* running position */

  Resize(cdf, r+1, double);

  /* accumulate from the top of the range downwards */
  cdf[r] = d[r];
  pos = r;
  while (--pos >= 0) {
    cdf[pos] = LOGL_SUM(cdf[pos+1], d[pos]);
  }

  /* smooth cdf by linear interpolation in logs */
  hi = r;
  while (hi > 0) {
    double slope;			/* change per unit step between anchors */

    /* find next lower position with non-zero p (or position 0) */
    lo = hi - 1;
    while (lo > 0 && d[lo] == LOGZERO) {
      lo--;
    }

    /* lo < hi always holds here, so the divisor is at least 1 */
    slope = (cdf[hi] - cdf[lo]) / (hi - lo);
    for (pos = lo + 1; pos < hi; pos++) {
      cdf[pos] = cdf[lo] + (pos - lo) * slope;
    }

    hi = lo;
  }

  return cdf;
} /* cdf */
/*
 * llr_distr
 *
 * Tabulate, in log space, the probability of each integer-scaled log
 * likelihood ratio (LLR) value for N samples drawn from the discrete
 * distribution dd over A letters.
 *
 * For letter i observed n times the scaled, rounded LLR term is
 *	IP[i][n] = NINT(alpha * n * log(n / (N * dd[i])))
 * and the matching log probability term is
 *	logP[i][n] = n * log(dd[i]) - log(n!)
 * The table logSP[n][I] is built one letter at a time; after the last
 * letter, row N holds log N! plus the accumulated logP terms for every
 * reachable total I.
 *
 * Outputs and side effects:
 *	dd	is rewritten in place as (dd[i]+EPSILON)/sum, so that it sums
 *		to 1.0 and has no zero entries.
 *	*alpha	is set to desired_range/(max-min).
 *	*offset	is set so that entry 0 of the result corresponds to the
 *		scaled LLR value *offset.
 *	*range	is set to maxI[N]-minI[N].
 *
 * Returns an array obtained via Resize with *range+2 entries.  Entries
 * 0..*range are assigned here; the final entry is left unassigned.
 * All work arrays are released before returning; the result is not
 * (presumably the caller frees it -- TODO confirm against callers).
 */
double *llr_distr(
  int A,				/* dimension of discrete distribution */
  double *dd,				/* discrete distribution; modified in place */
  int N,				/* number of samples */
  int desired_range,			/* desired range for scaled LLR */
  double frac,				/* fraction of scores to use */
  double *alpha,			/* OUT: scale factor for scaled LLR */
  int *offset,				/* OUT: prob[0] = prob(offset) */
  int *range				/* OUT: range for scaled LLR */
)
{
  int i;				/* index over alphabet */
  int n;				/* index over samples */
  int k;				/* other index */
  int I;				/* LLR */
  double dd_sum;			/* sum of dd */
  int **IP;				/* IP[i][n]: scaled LLR term, letter i seen n times */
  int *minI=NULL;			/* minimum intermediate value of I */
  int *maxI=NULL;			/* maximum intermediate value of I */
  int Irange;				/* width of the I axis of logSP */
  double logNfact;			/* log N! */
  double **logP;			/* logP[i][n]: log probability term */
  double **logSP;			/* log script_P[# samples][LLR] */
  double *prob=NULL;			/* final probability distribution */
  double min, max, min_dd;		/* LLR estimates; smallest entry of dd */

  /* create space for IP, P, minI and maxI */
  create_2array(IP, int, A, N+1);
  create_2array(logP, double, A, N+1);
  Resize(minI, N+1, int);
  Resize(maxI, N+1, int);

  /* make sure distribution sums to 1.0 and has no zeros */
  for (i=dd_sum=0; i<A; i++) dd_sum += dd[i] + EPSILON;
  for (i=0; i<A; i++) dd[i] = (dd[i]+EPSILON)/dd_sum;

  /* compute log N! */
  logNfact = 0;
  for (i=2; i<=N; i++) logNfact += log(i);

  /* get estimates of minimum and maximum values of llr */
  for (i=0, min_dd=1; i<A; i++) min_dd = MIN(min_dd, dd[i]);
  max = NINT(-N * log(min_dd));
  /*
   * NOTE(review): the bracketed difference below subtracts log(dd[i]) from
   * itself, so for finite logs every term is zero and min ends up 0.
   */
  for (i=min=0; i<A; i++) min += dd[i]*N*(log(dd[i]) - log(dd[i]));
  min = NINT(min);

  /*
   * set alpha to achieve the desired range
   * NOTE(review): there is no guard here for max == min.
   */
  *alpha = desired_range/((max-min));

  /*
   * compute IP, logP, and first estimates of minI and maxI.
   * minI[n] and maxI[n] track the running extremes of sums of IP terms
   * whose counts add up to n.
   */
  for (n=0; n<=N; n++) minI[n] = maxI[n] = 0;
  for (i=0; i<A; i++) {			/* index over alphabet */
    double logdd = LOG(dd[i]);		/* log(dd[i]) */
    IP[i][0] = 0;			/* LLR term for zero occurrences */
    logP[i][0] = 0;			/* log probability term for zero occurrences */
    for (n=1; n<=N; n++) {		/* index over samples */
      IP[i][n] = NINT(*alpha*n*log(n/(N*dd[i])));	/* scaled LLR term */
      logP[i][n] = logP[i][n-1] + logdd - log(n);	/* running n*logdd - log n! */
      for (k=1; k<=n; k++) {		/* index over samples of new */
        minI[n] = MIN(minI[n], minI[n-k] + IP[i][k]);
        maxI[n] = MAX(maxI[n], maxI[n-k] + IP[i][k]);
      }
    }
  }

  /*
   * get overall minI and maxI: fold every entry into slot 0, then reset
   * the per-n entries so they can be reused by the recursion below
   */
  for (n=1; n<=N; n++) {
    minI[0] = MIN(minI[0], minI[n]);	/* min for intermediates */
    maxI[0] = MAX(maxI[0], maxI[n]);	/* max for intermediates */
    minI[n] = LOGZEROI;
    maxI[n] = 0;
  }
  Irange = maxI[0] - minI[0] + 2;
  *offset = minI[0] - 1;		/* I offset: I=-1 is array 0 */
  minI[0] = LOGZEROI;
  maxI[0] = 0;

  /* create script_P arrays with enough space for intermediate calculations */
  create_2array(logSP, double, N+1, Irange+1);

  /* clear intermediate probability array (columns 0..Irange-1) */
  for (n=0; n<=N; n++)
    for(I=0; I<Irange; I++) logSP[n][I] = LOGZERO;

  /* init probability array for first letter in alphabet */
  for (n=0; n<=N; n++) {
    I = IP[0][n] - *offset;		/* offset I */
    logSP[n][I] = logNfact + logP[0][n];	/* init */
    minI[n] = maxI[n] = I;
  }

  /*
   * compute probabilities recursively, adding one letter at a time.
   * Row n is updated in place from rows n-k with k >= 1; n runs downwards
   * so that those source rows have not yet been updated for letter i.
   */
  for (i=1; i<A; i++) {			/* index over (rest of) alphabet */
    for (n=N; n>=0; n--) {		/* index over samples */
      for (k=1; k<=n; k++) {		/* index over samples of new letter */
        /*
         * Source columns scanned in row n-k.  These two ints shadow the
         * outer doubles min and max.  The upper bound drops the top
         * (1-frac) share of the occupied span; the double result is
         * converted to int by the initialisation.
         */
        int min = minI[n-k];
        int max = MAX(min, maxI[n-k] - (1-frac)*(maxI[n-k]-minI[n-k]+1));
        for (I=min; I<=max; I++) {	/* index over I */
          if (logSP[n-k][I] > LOGZERO) {
            logSP[n][I+IP[i][k]] = LOGL_SUM(logSP[n][I+IP[i][k]], logP[i][k] + logSP[n-k][I]);
          }
        }
        /* get current minimum and maximum I in intermediate arrays */
        minI[n] = MIN(minI[n], minI[n-k]+IP[i][k]);
        maxI[n] = MAX(maxI[n], maxI[n-k]+IP[i][k]);
      }
      /* for the last letter only row N is needed */
      if (n==N && i==A-1) break;	/* all done */
    }
  }

  /* compute range */
  *range = maxI[N] - minI[N];

  /* move to probability array with prob(offset) in position 0 */
  *offset += minI[N];			/* prob[0] = prob(offset) */
  Resize(prob, *range+2, double);
  for (I=minI[N]; I<=maxI[N]; I++) prob[I-minI[N]] = logSP[N][I];

  /* free up space */
  free_2array(IP, A);
  free_2array(logP, A);
  free_2array(logSP, N+1);
  myfree(minI);
  myfree(maxI);

  return prob;
} /* llr_distr */
/*
 * tcm_e_step
 *
 * E step for the tcm model.  For every sequence at least w long and every
 * candidate site start j it fills in
 *	s->sz[k][j]	per-strand site probability, scaled by not_o[j] and
 *			capped at 1.0
 *	s->z[j]		sum of s->sz[k][j] over strands, capped at 1.0
 * and zeroes s->z[j] for the tail positions j >= m.  Sequences shorter
 * than w are skipped untouched.
 *
 * When dataset->secondaryStructureFilename is set, s->sigma[] is
 * recomputed for the sequence and a per-site prior
 * MIN(1.0, sigma[j] * lambda * m) is used in place of lambda.  The variable
 * MAXADJUST, defined outside this function, is raised to the largest
 * pseudocount adjustment seen.
 *
 * Calls convert_theta_to_log() before and smooth() after the main loop.
 *
 * Returns the sum over all processed sites of log Pr(X_ij | phi), divided
 * by log(2.0).
 */
double tcm_e_step(
  MODEL *model,				/* the model */
  DATASET *dataset			/* the dataset */
)
{
  /*
   * motif log(theta); kept under this exact name because the site loop
   * invokes logtheta1(col, letter) -- presumably a macro over this local
   */
  THETA logtheta1 = model->logtheta;
  int w = model->w;			/* motif width */
  int n_samples = dataset->n_samples;	/* number of sequences */
  int ndir = model->invcomp ? 2 : 1;	/* number of strands */
  double log_sigma = log(1.0/ndir);	/* log \sigma */
  double lambda = model->lambda;	/* \lambda of tcm model */
  double log_lambda = LOG(lambda);	/* log \lambda (or log of site prior) */
  double log_1mlambda = LOG(1-lambda);	/* log (1 - \lambda) (or 1 - site prior) */
  double logpX = 0;			/* log likelihood; no erase or smooth */
  int seq;				/* sequence index */

  /* E step */
  convert_theta_to_log(model, dataset);

  /* calculate all the posterior offset probabilities */
  for (seq = 0; seq < n_samples; seq++) {
    SAMPLE *s = dataset->samples[seq];
    int lseq = s->length;
    int m = lseq - w + 1;		/* number of possible sites */
    double *zi = s->z;			/* Pr(z_ij=1 | X_i, \theta) */
    double *not_o = s->not_o;		/* Pr(v_ij = 1) */
    double *lcb = s->logcumback;	/* cumulative background probability */
    int use_ss;				/* secondary structure prior in use? */
    int site;				/* site start */
    int strand;				/* strand index */
    int col;				/* motif column */

    if (lseq < w) continue;		/* sequence too short for motif */

    use_ss = (dataset->secondaryStructureFilename != NULL);

    /*
     * With secondary structure information, derive sigma_ij for this
     * sequence; sigma_ij * lambda * m then replaces lambda below.
     */
    if (use_ss) {
      double Pcount = dataset->secondaryStructurePseudocount;
      /* largest prior any site would get with the configured pseudocount */
      double maxPrior = ((s->max_ss_value + Pcount) / (s->sum_ss_value + (m * Pcount))) * (lambda*m);
      double sum;			/* \sum ss_value[i] + m*pseudocount */

      if (maxPrior > 1.0) {
        /*
         * choose the pseudocount for which the largest prior is exactly 1:
         * (max_ss_value + P) / (sum_ss_value + m*P) * lambda * m = 1
         */
        Pcount = (-1 * s->max_ss_value * lambda * m + s->sum_ss_value) / (m * (lambda - 1));
        /* for statistics keep maximum adjustment */
        if (Pcount - dataset->secondaryStructurePseudocount > MAXADJUST) {
          MAXADJUST = Pcount - dataset->secondaryStructurePseudocount;
        }
      }

      /* compute new sigmas with this pseudocount */
      sum = s->sum_ss_value + (m * Pcount);
      for (site = 0; site < m; site++) {
        s->sigma[site] = (s->ss_value[site] + Pcount) / sum;
      }
    }

    /* per strand, per site: log terms under background and motif models */
    for (strand = 0; strand < ndir; strand++) {
      BOOLEAN ic = (strand==1);		/* doing - strand */
      double *szik = s->sz[strand];	/* Pr(X_i | z_ij=1, s_ijk=1, \theta) */

      for (site = 0; site < m; site++) {
        double log_pXijtheta0;		/* background term */
        double log_pXijtheta1;		/* motif term */
        double log_pXij;		/* both terms combined for this strand */
        int off;			/* - strand offset from rgt. */
        char *res;			/* integer sequence */

        /* use the site prior instead of lambda */
        if (use_ss) {
          double p = MIN(1.0, ( s->sigma[site] * lambda * m ) );	/* rounding */
          log_lambda = LOG(p);
          log_1mlambda = LOG(1-p);
        }

        /* log Pr(X_ij | s_ijk=1, \theta0) \sigma (1-\lambda) */
        log_pXijtheta0 = log_sigma + log_1mlambda;
        log_pXijtheta0 += Log_back(lcb, site, w);

        /* log Pr(X_ij | s_ijk=1, \theta1) \sigma \lambda */
        off = ic ? lseq-w-site : site;
        res = ic ? s->resic+off : s->res+off;
        log_pXijtheta1 = log_sigma + log_lambda;
        for (col = 0; col < w; col++) {
          log_pXijtheta1 += logtheta1(col, (int)res[col]);
        }

        /* set log szik to: Pr(X_i | z_ij=1, s_ijk=1, \theta) \sigma \lambda */
        szik[site] = log_pXijtheta1;

        /*
         * set z_ij to log Pr(X_ij | \phi): the first strand starts the
         * total, later strands are added to it in log space
         */
        log_pXij = LOGL_SUM(log_pXijtheta0, log_pXijtheta1);
        if (strand == 0) {
          zi[site] = log_pXij;
        } else {
          zi[site] = LOGL_SUM(zi[site], log_pXij);
        }
      } /* site start */
    } /* strand */

    /* compute log Pr(X | \phi) = sum_i,j log(Pr(X_ij)) */
    for (site = 0; site < m; site++) {
      logpX += zi[site];		/* z_ij = log Pr(X_ij | \phi) */
    }

    /*
     * sz_ijk : normalize, delog and account for erasing;
     * zi[] still holds log Pr(X_ij | \phi) at this point
     */
    for (strand = 0; strand < ndir; strand++) {
      double *szik = s->sz[strand];
      for (site = 0; site < m; site++) {
        szik[site] = MIN(1.0, exp(szik[site] - zi[site]) * not_o[site]);	/* roundoff */
      }
    }

    /* z_ij : sum of sz_ijk */
    for (site = 0; site < m; site++) {
      zi[site] = 0;
      for (strand = 0; strand < ndir; strand++) {
        zi[site] += s->sz[strand][site];
      }
      zi[site] = MIN(1.0, zi[site]);	/* avoid roundoff errors */
    }

    /* tail of sequence */
    for (site = m; site < lseq; site++) {
      zi[site] = 0;
    }
  } /* sequence */

  /* smooth so no window of size w has z_i which sum to greater than 1.0 */
  (void) smooth(w, model, dataset);

  return (logpX/log(2.0));
} /* tcm_e_step */