int main( int argc, char **argv ) { int i; int order = atoi(argv[1]); double logcumback[1000]; //char *string = "ACGTT"; char *string = "ATTTTTTTT"; int len = strlen(string); char *seq = NULL; Resize(seq, len+1, char); strcpy(seq, string); printf("order = %d len = %d string = %s\n", order, len, string); setup_hash_alph(DNAB); char *alpha = "ACGTX"; BOOLEAN rc = TRUE; double *a_cp = get_markov_from_sequence(seq, alpha, rc, order); printf("# conditional probabilities\n"); print_prob(a_cp, order+1, "", 0, alpha); log_cum_back(seq, a_cp, order, logcumback); for (i=0; i<len; i++) printf("%c %f %f\n", seq[i], exp(logcumback[i]), exp(Log_back(logcumback,i,6)) ); printf("All done\n"); return(0); } /* main */
/*
 * tcm_e_step
 *
 * E step for the tcm model.
 *
 * For every sequence at least as long as the motif width, and for every
 * candidate site start on each strand considered, this:
 *   - combines the strand prior, the site prior (lambda, or a per-site
 *     prior derived from sigma when a secondary structure file name is set)
 *     and the background / motif log probabilities of the site;
 *   - stores the motif term in sz[strand][site] and the log of the summed
 *     terms in z[site];
 *   - adds z[site] into the running log likelihood;
 *   - replaces sz with MIN(1, exp(sz - z) * not_o) and z with the
 *     (capped) sum of sz over strands;
 *   - zeroes z for the positions past the last possible site start.
 * Finally smooth() is called on the model and dataset.
 *
 * Side effects visible here: writes sample->sigma[] and may raise the
 * global MAXADJUST when a secondary structure file name is set; writes
 * sample->sz[][] and sample->z[].
 *
 * Returns the accumulated log likelihood divided by log(2.0).
 */
double tcm_e_step(
  MODEL *model,                         /* the model */
  DATASET *dataset                      /* the dataset */
)
{
  int seq_no, pos, strand, col;
  /*
   * motif log(theta).  This local is used below with call syntax,
   * logtheta1(col, letter); NOTE(review): presumably a macro that indexes
   * this variable by name, so the name must not change -- confirm.
   */
  THETA logtheta1 = model->logtheta;
  int w = model->w;                     /* motif width */
  int n_samples = dataset->n_samples;   /* number of sequences */
  BOOLEAN invcomp = model->invcomp;     /* use reverse complement strand, too */
  int ndir = invcomp ? 2 : 1;           /* number of strands */
  double log_sigma = log(1.0/ndir);     /* log of the strand prior */
  double lambda = model->lambda;        /* lambda of tcm model */
  double log_lambda = LOG(lambda);      /* log lambda */
  double log_1mlambda = LOG(1-lambda);  /* log (1 - lambda) */
  double logpX;                         /* log likelihood; no erase or smooth */

  /* E step */
  convert_theta_to_log(model, dataset);

  /* per-site priors are used only when a secondary structure file is named */
  BOOLEAN have_ss = (dataset->secondaryStructureFilename != NULL);

  /* calculate all the posterior offset probabilities */
  logpX = 0;

  for (seq_no = 0; seq_no < n_samples; seq_no++) {      /* sequence */
    SAMPLE *sample = dataset->samples[seq_no];
    int lseq = sample->length;
    int n_sites = lseq - w + 1;             /* number of possible sites */
    double *z = sample->z;                  /* per-site z values */
    double *not_erased = sample->not_o;     /* per-site erasing factor */
    double *cum_back = sample->logcumback;  /* cumulative background probability */

    /* sequence too short for motif */
    if (lseq < w) {
      continue;
    }

    /*
     * Secondary structure prior (added by M.H.): sigma[site] * lambda *
     * n_sites replaces lambda as the site prior.  The strand prior
     * log_sigma is still added separately below.
     */
    if (have_ss) {
      /*
       * First check whether the largest prior would exceed 1; if so,
       * recompute the pseudocount so that the largest prior equals 1:
       *   1 = (max_ss_value + pseudocount)
       *       / (sum_ss_value + n_sites*pseudocount) * lambda * n_sites
       */
      double pseudo = dataset->secondaryStructurePseudocount;
      double largest_prior =
        ((sample->max_ss_value + pseudo) / (sample->sum_ss_value + (n_sites * pseudo))) * (lambda*n_sites);

      if (largest_prior > 1.0) {
        pseudo = (-1 * sample->max_ss_value * lambda * n_sites + sample->sum_ss_value) / (n_sites * (lambda - 1));
        /* for statistics keep maximum adjustment */
        if (pseudo - dataset->secondaryStructurePseudocount > MAXADJUST) {
          MAXADJUST = pseudo - dataset->secondaryStructurePseudocount;
        }
      }

      /* compute new sigmas with this pseudocount */
      double denom = sample->sum_ss_value + (n_sites * pseudo);
      for (pos = 0; pos < n_sites; pos++) {
        sample->sigma[pos] = (sample->ss_value[pos] + pseudo) / denom;
      }
    }

    for (strand = 0; strand < ndir; strand++) {         /* strand */
      BOOLEAN minus_strand = (strand == 1);             /* doing - strand */
      double *sz_strand = sample->sz[strand];

      for (pos = 0; pos < n_sites; pos++) {             /* site start */
        /* site prior: lambda, or the sigma-based prior (added by M.H.) */
        double site_log_lambda = log_lambda;
        double site_log_1mlambda = log_1mlambda;
        if (have_ss) {
          double prior = MIN(1.0, ( sample->sigma[pos] * lambda * n_sites ) );  /* rounding */
          site_log_lambda = LOG(prior);
          site_log_1mlambda = LOG(1-prior);
        }

        /* locate the site's letters; - strand offset counts from the right */
        int start;
        char *letters;
        if (minus_strand) {
          start = lseq - w - pos;
          letters = sample->resic + start;
        } else {
          start = pos;
          letters = sample->res + start;
        }

        /* background term: strand prior, (1 - site prior), background */
        double log_bg = log_sigma + site_log_1mlambda;
        log_bg += Log_back(cum_back, pos, w);

        /* motif term: strand prior, site prior, motif columns */
        double log_fg = log_sigma + site_log_lambda;
        for (col = 0; col < w; col++) {
          log_fg += logtheta1(col, (int)letters[col]);
        }

        /* keep the motif term for this strand */
        sz_strand[pos] = log_fg;

        /* z accumulates, over strands, the log of (background + motif) */
        double log_both = LOGL_SUM(log_bg, log_fg);
        if (strand == 0) {
          z[pos] = log_both;
        } else {
          z[pos] = LOGL_SUM(z[pos], log_both);
        }
      } /* site start */
    } /* strand */

    /* add this sequence's per-site log values to the log likelihood */
    for (pos = 0; pos < n_sites; pos++) {
      logpX += z[pos];
    }

    /* sz: normalize by z, delog, and account for erasing */
    for (strand = 0; strand < ndir; strand++) {         /* strand */
      double *sz_strand = sample->sz[strand];
      for (pos = 0; pos < n_sites; pos++) {             /* site start */
        /* z[pos] still holds the log value computed above */
        sz_strand[pos] = MIN(1.0, exp(sz_strand[pos] - z[pos]) * not_erased[pos]);  /* roundoff */
      }
    }

    /* z: sum of sz over strands, capped at 1 */
    for (pos = 0; pos < n_sites; pos++) {               /* site start */
      z[pos] = 0;
      for (strand = 0; strand < ndir; strand++) {
        z[pos] += sample->sz[strand][pos];
      }
      z[pos] = MIN(1.0, z[pos]);                        /* avoid roundoff errors */
    }

    /* tail of sequence: no site can start here */
    for (pos = n_sites; pos < lseq; pos++) {
      z[pos] = 0;
    }
  } /* sequence */

  /* smooth so no window of size w has z_i which sum to greater than 1.0 */
  (void) smooth(w, model, dataset);

  return (logpX/log(2.0));
} /* tcm_e_step */