/*
	init_llr_pv_tables

	Fill the tables maintained by get_llr_pv() (the "motif probability
	tables" of the status message) for every number of sites from min
	to max.

	When pal is set, min and max are doubled and only every second
	site count is tabulated.  For each tabulated count the table is
	first allocated, then built only if load_balance_llr() returns
	nonzero; broadcast_llr() is called once after the loop.

	Progress messages are written to stderr unless NO_STATUS is set.
	With DEBUG defined, the finished tables are dumped to stdout.
*/
extern void init_llr_pv_tables(
  int min,				/* minimum number of sites */
  int max,				/* maximum number of sites */
  int alength,				/* alphabet length */
  double *back,				/* background frequencies */
  BOOLEAN pal				/* sites are palindromes */
)
{
  int nsites;				/* number of sites */
  int step;				/* gap between tabulated site counts */

  /* set effective number of sites to double if pal */
  if (pal) {
    min *= 2;
    max *= 2;
  }

  /* palindromes only ever use even site counts */
  step = pal ? 2 : 1;

  if (!NO_STATUS) {
    fprintf(stderr, "Initializing the motif probability tables for %d to %d sites...\n", min, max);
  }

  /* make sure the distr table gets initialized on all nodes */
  (void) get_llr_pv(0, 1, 1, LLR_RANGE, 1.0, alength, back);

  for (nsites=min; nsites<=max; nsites+=step) {		/* nsites */
    /* allocate space for table */
    (void) get_llr_pv(0, nsites, 0, LLR_RANGE, 1.0, alength, back);

    /* for parallel: skip tables that load_balance_llr() declines */
    if (!load_balance_llr(nsites, pal)) {
      continue;
    }

    /* create table */
    if (!NO_STATUS) {
      fprintf(stderr, "nsites = %d\r", nsites);
    }
    (void) get_llr_pv(0, nsites, 1, LLR_RANGE, 1.0, alength, back);
  } /* nsites */

  broadcast_llr(min, max, pal);		/* for parallel; collect the tables */

#ifdef DEBUG
  /* print results */
  {
    int n;

    /*
      Walk the site counts with the same stride as the construction
      loop above, so that only tables this function asked for are read.
    */
    for (n=min; n<=max; n+=step) {
      int I;
      int w = 1;

      printf("# N I llr 1-cdf\n");
      for (I=0; I<=distrs[n].range[w]; I++) {		/* LLR */
        double m2, e2;

        if (distrs[n].cdf[w][I] == LOGZERO) {
          m2 = e2 = 0;
        } else {
          exp10_logx(distrs[n].cdf[w][I]/log(10.0), m2, e2, 1);
        }
        printf("%3d %3d %5.1f %3.1fe%+05.0f\n", n, I, (distrs[n].offset[w]+I)/distrs[n].alpha, m2, e2);
      } /* LLR */
    }
  }
#endif

  if (!NO_STATUS) {
    fprintf(stderr, "\nDone initializing\n");
  }
} /* init_llr_pv_tables */
/*
	em

	Alternate expectation and maximization steps on the model for at
	most maxiter iterations, stopping early once check_convergence()
	reports convergence.

	The expectation step is chosen from model->mtype (Oops, Zoops or
	Tcm); an unknown type prints a message to stderr and exits.  If
	dataset->nmotifs > 0, like_e_step is used for iteration 0 and the
	type-specific step afterwards.

	On return the number of iterations performed has been added to
	model->iter and calc_like() has been called on the model.
*/
extern void em(
  MODEL *model,			/* the model */
  DATASET *dataset,		/* the dataset */
  PRIORS *priors,		/* the priors */
  int maxiter,			/* maximum number of iterations */
  double distance		/* stopping criterion */
)
{
  const int alength = dataset->alength;
  const int nc = model->c;		/* number of components of model */
  const int max_w = model->w[nc-1];	/* width of last component */
  THETA prev_theta;			/* theta as it was before the current iteration */
  int iter;				/* iteration number */
  BOOLEAN converged = FALSE;		/* EM has converged */

  /* expectation step used on the next iteration */
  double (*expect)(MODEL *, DATASET *) = e_step;
  /* expectation step to switch to after iteration 0 */
  double (*expect_later)(MODEL *, DATASET *) = e_step;
  /* maximization step */
  void (*maximize)(MODEL *, DATASET *, PRIORS *, int) = m_step;

  /* create a place to save old value of theta */
  create_2array(prev_theta, double, max_w, alength);

  /* pick the expectation step that matches the model type */
  switch (model->mtype) {
    case Oops:
      /* e_step is already selected */
      break;
    case Zoops:
      expect = zoops_e_step;
      break;
    case Tcm:
      expect = tcm_e_step;
      break;
    default:
      fprintf(stderr,"Unknown model type in em()! \n");
      exit(1);
      break;
  }

  /* use like_e_step to set z matrix on iteration 0 if motifs were given */
  if (dataset->nmotifs > 0) {
    expect_later = expect;
    expect = like_e_step;
  }

  /*
    get the probability that a site starting at position x_ij
    would NOT overlap a previously found motif; used in the
    expectation step.
  */
  get_not_o(dataset, model->w[1], FALSE);

  /*
    Perform EM for number of iterations or until no improvement.
    The loop increment still runs after the converging iteration, so
    iter ends up as the number of iterations actually performed.
  */
  for (iter = 0; iter < maxiter && !converged; iter++) {
    int width = model->w[nc-1];			/* width of model */
    THETA cur_theta = model->theta[nc-1];	/* final theta of last component */

    /* after iteration 0, fall back to the regular expectation step */
    if (iter > 0 && dataset->nmotifs > 0) {
      expect = expect_later;
    }

    if (PRINTALL) {
      ajFmtPrintF(outf,"\niter %d\n", iter);
    }

#ifdef PARALLEL
    /* If we're running in parallel, only print from one node. */
    if (mpMyID() == 0)
#endif
    {
      if ((!NO_STATUS) && ((iter % 10) == 0)) {
        fprintf(stderr, "\rem: w=%4d, iter=%4d ", width, iter);
      }
    }

    /* fix this later: save current contents of theta */
    copy_theta(cur_theta, prev_theta, width, alength);

    /* expectation step */
    model->ll = expect(model, dataset);

    /* maximization step */
    maximize(model, dataset, priors, iter);

    /* print status if requested */
    if (PRINT_LL) {
      double sig_mant, sig_exp;
      double nsites = model->lambda[1] * ps(dataset, model->w[1]);

      calc_like(model, dataset);
      exp10_logx(model->sig, sig_mant, sig_exp);
      ajFmtPrintF(outf,"iter=%d w=%d ll=%8.2f e_ll=%8.2f nsites=%6.1f sig=%5.3fe%+04.0f", iter, model->w[1], model->ll, model->e_ll, nsites, sig_mant, sig_exp);
    }

    if (PRINTALL) {
      int comp;

      for (comp = 0; comp < nc; comp++) {
        ajFmtPrintF(outf,"component %2d: lambda= %8.6f\n", comp, model->lambda[comp]);
        print_theta(2, model->theta[comp], model->w[comp], "", dataset, NULL);
        print_theta(2, model->obs[comp], model->w[comp], "", dataset, NULL);
      }
    }

    if (PRINT_Z) {
      print_zij(dataset, model);
    }

    /* see if EM has converged */
    converged = check_convergence(prev_theta, cur_theta, width, distance, alength, iter, maxiter);
  }

  /* save the number of iterations (counting from zero) */
  model->iter += iter;

  /* get the consensus of component 1 of the model */
  {
    THETA motif_theta = model->theta[1];
    int motif_w = model->w[1];
    /*
      NOTE(review): the value returned by get_consensus() is kept only
      in this local and never stored back into the model -- confirm
      whether it was meant to be copied into model->cons (and who
      owns the returned buffer).
    */
    char *cons = model->cons;

    cons = get_consensus(motif_theta, motif_w, dataset, 1, MINCONS);
  }

  /* calculate the expected likelihood of the model */
  calc_like(model, dataset);

  free_2array(prev_theta, max_w);
}