Ejemplo n.º 1
0
/*
  init_llr_pv_tables

  Ask get_llr_pv() to allocate and build one LLR p-value table for each
  site count in the range min..max.

  When pal is set the range is doubled (min*2 .. max*2) and only every
  second site count is visited, so tables exist only for the counts
  min*2, min*2+2, ..., and not for the counts in between.

  Progress messages go to stderr unless NO_STATUS is set.  When compiled
  with DEBUG, the contents of the tables are dumped to stdout.
*/
extern void init_llr_pv_tables(
  int min,				/* minimum number of sites */
  int max,				/* maximum number of sites */
  int alength,				/* alphabet length */
  double *back,				/* background frequencies */
  BOOLEAN pal				/* sites are palindromes */
)
{
  int nsites;				/* number of sites */
  int step;				/* increment between tabulated site counts */

  /* set effective number of sites to double if pal */
  if (pal) {
    min *= 2;
    max *= 2;
  }

  /* palindromes only need every second site count */
  step = pal ? 2 : 1;

  if (!NO_STATUS) {
    fprintf(stderr,
      "Initializing the motif probability tables for %d to %d sites...\n",
	min, max);
  }

  /* make sure the distr table gets initialized on all nodes */
  (void) get_llr_pv(0, 1, 1, LLR_RANGE, 1.0, alength, back);

  for (nsites=min; nsites<=max; nsites += step) {   /* nsites */

    /* allocate space for table */
    (void) get_llr_pv(0, nsites, 0, LLR_RANGE, 1.0, alength, back);

    /* for parallel: skip site counts that load_balance_llr() rejects */
    if (!load_balance_llr(nsites, pal)) {
      continue;
    }

    /* create table */
    if (!NO_STATUS) { fprintf(stderr, "nsites = %d\r", nsites); }
    (void) get_llr_pv(0, nsites, 1, LLR_RANGE, 1.0, alength, back);

  } /* nsites */
  broadcast_llr(min, max, pal);		/* for parallel; collect the tables */

#ifdef DEBUG
  /*
    Print results.  Walk the site counts with the same step as the loop
    above: with pal set, the counts in between were never passed to
    get_llr_pv() here, so their distrs[] entries must not be read.
    The declarations live in their own block so that they do not follow
    statements in the enclosing function body.
  */
  {
    int n;
    for (n=min; n<=max; n += step) {
      int I;
      int w = 1;
      printf("# N    I    llr         1-cdf\n");
      for (I=0; I<=distrs[n].range[w]; I++) {		/* LLR */
        double m2, e2;
        if (distrs[n].cdf[w][I] == LOGZERO) {
          m2 = e2 = 0;
        } else {
          exp10_logx(distrs[n].cdf[w][I]/log(10.0), m2, e2, 1);
        }
        printf("%3d %3d %5.1f %3.1fe%+05.0f\n",
          n, I, (distrs[n].offset[w]+I)/distrs[n].alpha, m2, e2);
      } /* LLR */
    }
  }
#endif

  if (!NO_STATUS) {
    fprintf(stderr, "\nDone initializing\n");
  }

} /* init_llr_pv_tables */
Ejemplo n.º 2
0
/*
  em

  Run EM on the model: repeat an expectation step followed by a
  maximization step, for at most maxiter iterations or until
  check_convergence() reports that theta has stopped changing
  (using distance as the stopping criterion).

  The expectation step is chosen from model->mtype (Oops, Zoops or Tcm);
  an unknown type prints an error and exits.  If the dataset carries
  motifs (dataset->nmotifs > 0), like_e_step is used on iteration 0 and
  the type-specific step on all later iterations.

  On return, model->ll holds the value of the last expectation step, the
  number of iterations performed has been added to model->iter, and
  calc_like() has been called on the model.
*/
extern void em(
  MODEL *model,			/* the model */
  DATASET *dataset,		/* the dataset */
  PRIORS *priors,		/* the priors */
  int maxiter,			/* maximum number of iterations */
  double distance		/* stopping criterion */
)
{
  int alength = dataset->alength;
  int nc = model->c;			/* number of components of model */
  int max_w = model->w[nc-1];		/* width of last component */
  int iter;				/* iteration number */
  BOOLEAN converged = FALSE;		/* EM has converged */
  THETA theta_save;			/* copy of theta from before each iteration */
  /* expectation step used on the current iteration */
  double (*run_e_step)(MODEL *, DATASET *);
  /* expectation step to switch to after iteration 0 */
  double (*later_e_step)(MODEL *, DATASET *);
  /* maximization step */
  void (*run_m_step)(MODEL *, DATASET *, PRIORS *, int);

  /* create a place to save old value of theta */
  create_2array(theta_save, double, max_w, alength);

  /* pick the EM flavour that matches the model type */
  run_m_step = m_step;
  run_e_step = e_step;
  later_e_step = e_step;
  if (model->mtype == Oops) {
    run_e_step = e_step;
  } else if (model->mtype == Zoops) {
    run_e_step = zoops_e_step;
  } else if (model->mtype == Tcm) {
    run_e_step = tcm_e_step;
  } else {
    fprintf(stderr,"Unknown model type in em()! \n");
    exit(1);
  }

  /* if motifs were given, iteration 0 sets the z matrix with like_e_step;
     remember the regular step so it can be restored afterwards */
  if (dataset->nmotifs > 0) {
    later_e_step = run_e_step;
    run_e_step = like_e_step;
  }

  /* get the probability that a site starting at position x_ij would
     NOT overlap a previously found motif; used in the expectation step
  */
  get_not_o(dataset, model->w[1], FALSE);

  /* iterate until maxiter is reached or theta stops changing */
  iter = 0;
  while (iter < maxiter) {
    int w = model->w[nc-1];		/* width of model */
    THETA theta = model->theta[nc-1];	/* theta of last component */

    /* after iteration 0, go back to the regular expectation step */
    if (iter > 0 && dataset->nmotifs > 0) {
      run_e_step = later_e_step;
    }

    if (PRINTALL) {
      ajFmtPrintF(outf,"\niter %d\n", iter);
    }

#ifdef PARALLEL
    /* If we're running in parallel, only print from one node. */
    if (mpMyID() == 0)
#endif
    {
      if ((!NO_STATUS) && ((iter % 10) == 0)) {
        fprintf(stderr, "\rem: w=%4d, iter=%4d                       ", w, iter);
      }
    }

    /* keep the current theta so convergence can be measured against it */
    copy_theta(theta, theta_save, w, alength);

    /* expectation step, then maximization step */
    model->ll = run_e_step(model, dataset);
    run_m_step(model, dataset, priors, iter);

    /* optional per-iteration reporting */
    if (PRINT_LL) {
      double m1, e1;
      double nsites = model->lambda[1] * ps(dataset, model->w[1]);
      calc_like(model, dataset);
      exp10_logx(model->sig, m1, e1);
      ajFmtPrintF(outf,"iter=%d w=%d ll=%8.2f e_ll=%8.2f nsites=%6.1f sig=%5.3fe%+04.0f",
      iter, model->w[1], model->ll, model->e_ll, nsites, m1, e1);
    }
    if (PRINTALL) {
      int c;
      for (c=0; c<nc; c++) {
        ajFmtPrintF(outf,"component %2d: lambda= %8.6f\n", c,
		    model->lambda[c]);
        print_theta(2, model->theta[c], model->w[c], "", dataset, NULL);
        print_theta(2, model->obs[c], model->w[c], "", dataset, NULL);
      }
    }
    if (PRINT_Z) {
      print_zij(dataset, model);
    }

    /* compare the saved theta with the updated one */
    converged = check_convergence(theta_save, theta, w, distance, alength,
      iter, maxiter);

    /* count this iteration whether or not it was the last one */
    iter++;
    if (converged) {
      break;
    }
  }

  /* add the number of iterations performed to the model's running total */
  model->iter += iter;

  /* get the consensus of component 1 of the model */
  {
    /* NOTE(review): the value returned by get_consensus() is stored only
       in this local and then dropped, so model->cons is not updated here.
       Looks unintended -- confirm against the declaration of model->cons
       and who owns the returned buffer. */
    char *cons = model->cons;
    cons = get_consensus(model->theta[1], model->w[1], dataset, 1, MINCONS);
  }

  /* calculate the expected likelihood of the model */
  calc_like(model, dataset);

  free_2array(theta_save, max_w);
}