/*
 * get_n_unique_multisets
 *
 * Count the distinct multisets of size n that can be formed from m item
 * types, using a dynamic programming table f where f[i][j] is the count
 * for multisets of size i over j item types:
 *
 *   f[0][j] = 1, f[i][1] = 1, f[1][j] = j,
 *   f[i][j] = sum over k = 0..i of f[k][j-1]     (i >= 2, j >= 2)
 *
 * (the k-th term fixes i-k copies of the j-th type and fills the remaining
 * k slots from the first j-1 types).
 *
 * The count is also reported on stderr.
 *
 * Returns the count f[n][m] as a double.  n and m are used directly as
 * array bounds and indices; no check is made that they are non-negative.
 */
static double get_n_unique_multisets (
  int n,					/* size of multisets */
  int m 					/* number of item types */
)
{
  int i, j, k;
  double **f;					/* dynamic programming array */
  double result;				/* f[n][m], saved before f is freed */

  create_2array(f, double, n+1, m+1);

  /* initialize: base cases get their final value, everything else 0 */
  for (i=0; i<=n; i++) {
    for (j=0; j<=m; j++) {
      if (i==0 || j==1) {
        f[i][j] = 1;
      } else if (i==1) {
        f[i][j] = j;
      } else {
        f[i][j] = 0;
      }
    }
  }

  /* fill in array; column j only depends on column j-1 */
  for (j=2; j<=m; j++) {
    for (i=2; i<=n; i++) {
      for (k=0; k<=i; k++) {
        f[i][j] += f[k][j-1];
      }
    }
  }

  /* read the answer while the table is still alive */
  result = f[n][m];
  fprintf(stderr, "unique multisets (%d %d) %g\n", n, m, result);

  /* the table has n+1 rows, so release all n+1 of them */
  free_2array(f, n+1);

  return result;
} /* get_n_unique_multisets */
Example #2
0
// gendb
//
// Generate random sequences over the alphabet selected by `type`.
//
// The letter set and its default frequencies come from get_letters(); the
// caller's frequency vector `f` takes precedence over the defaults when it
// is non-NULL.  A cumulative distribution is built with get_cum_distr()
// and handed to print_random_seqs(), whose return value is passed straight
// back to the caller.  If `out` is non-NULL it is flushed before returning.
//
// The cumulative distribution, the letter table and the default frequency
// vector are released here; the alphabet string filled in by get_letters()
// is not.
SEQ_T *gendb(
  FILE *out, // Output stream; return output if null.
  int type,  // Type of alphabet.
                // 0: protein w/ambigs
                // 1: dna w/ambigs
                // 2: codons
                // 3: dna w/o ambigs
                // 4: protein w/o ambigs
  char *bfile,  // Name of Markov model file.
  int use_order,// Order of Markov model to use.
  double *f, // model; 0-order model used if bfile==NULL.
  int nseqs, // Number of sequences.
  int min,   // Shortest sequence.
  int max,   // Longest sequence.
  int seed   // Random seed.
)
{
  char *alphabet;         // alphabet string from get_letters()
  double *default_freqs;  // built-in letter or codon frequencies
  char **symbols;         // letters/codons to print
  int n_symbols;          // number of letters/codons
  int symbol_len;         // length of each letter/codon string
  int model_order;        // order reported by get_cum_distr()
  double *cum_distr;      // letter/codon cumulative distribution
  double *freqs;          // frequencies actually used for the model
  SEQ_T *result;

  // Look up the alphabet and everything that goes with it.
  symbols = get_letters(type, &alphabet, &n_symbols, &symbol_len,
    &default_freqs);

  // Caller-supplied frequencies win over the built-in ones.
  freqs = (f != NULL) ? f : default_freqs;

  // Build the cumulative distribution(s).
  cum_distr = get_cum_distr(bfile, freqs, alphabet, n_symbols, use_order,
    &model_order);

  // Produce the random sequences.
  result = print_random_seqs(out, seed, nseqs, min, max, symbols, n_symbols,
    symbol_len, model_order, cum_distr);

  if (out != NULL) fflush(out);

  // Release the working storage.
  myfree(cum_distr);
  free_2array(symbols, n_symbols);
  myfree(default_freqs);

  return result;
} // gendb
Example #3
0
/*
 * llr_distr
 *
 * Build a table of log-space probabilities for a scaled, integer-rounded
 * log-likelihood-ratio (LLR) statistic of N samples over an alphabet of A
 * letters with letter probabilities dd.
 *
 * For letter i seen n times the scaled integer contribution is
 *   IP[i][n]   = NINT(alpha * n * log(n / (N*dd[i])))
 * and the matching log weight is
 *   logP[i][n] = n*log(dd[i]) - log(n!)      (accumulated incrementally).
 * Letters are folded in one at a time: logSP[n][I] accumulates (via
 * LOGL_SUM, presumably a log-domain addition -- confirm) the weight of
 * reaching offset score I using n samples from the letters seen so far.
 *
 * Side effects and outputs:
 *   dd      is overwritten in place with (dd[i]+EPSILON) renormalised.
 *   *alpha  is set to desired_range/(max-min) from the estimated LLR bounds.
 *   *offset is set so that entry 0 of the result corresponds to that score.
 *   *range  is set to maxI[N]-minI[N]; entries 0..*range of the result are
 *           filled from logSP[N].
 *
 * Returns an array obtained through Resize() with *range+2 entries; it is
 * not freed here.  The last entry (index *range+1) is not written by this
 * function.
 */
double *llr_distr(
  int A,				/* dimension of discrete distribution */
  double *dd,				/* discrete distribution */
  int N,				/* number of samples */
  int desired_range,			/* desired range for scaled LLR */
  double frac,				/* fraction of scores to use */
  double *alpha,			/* scale factor for scaled LLR */
  int *offset,				/* prob[0] = prob(offset) */
  int *range				/* range for scaled LLR */
)
{
  int i; 				/* index over alphabet */
  int n;				/* index over samples */
  int k;				/* other index */
  int I;				/* LLR */
  double dd_sum;			/* sum of dd */
  int **IP;				/* I'_i[n] */
  int *minI=NULL;			/* minimum intermediate value of I */
  int *maxI=NULL;			/* maximum intermediate value of I */
  int Irange;				/* maxI-minI+1 */
  double logNfact;			/* log N! */
  double **logP;			/* log P_i[n] */
  double **logSP;			/* log script_P[# samples][LLR] */
  double *prob=NULL;			/* final probability distribution */
  double min, max, min_dd;

  /* create space for IP, P, minI and maxI */
  create_2array(IP, int, A, N+1);
  create_2array(logP, double, A, N+1);
  Resize(minI, N+1, int);
  Resize(maxI, N+1, int);

  /* make sure distribution sums to 1.0 and has no 0's */
  /* (this modifies the caller's dd array in place) */
  for (i=dd_sum=0; i<A; i++) dd_sum += dd[i] + EPSILON;
  for (i=0; i<A; i++) dd[i] = (dd[i]+EPSILON)/dd_sum;

  /* compute N! */
  /* (as a sum of logs, i.e. log N!) */
  logNfact = 0;
  for (i=2; i<=N; i++) logNfact += log(i); 

  /* get estimates of minimum and maximum values of llr */
  for (i=0, min_dd=1; i<A; i++) min_dd = MIN(min_dd, dd[i]);
  max = NINT(-N * log(min_dd));
  /* NOTE(review): every term below is multiplied by
     log(dd[i]) - log(dd[i]), which is 0, so min always ends up 0 --
     confirm this is intended. */
  for (i=min=0; i<A; i++) min += dd[i]*N*(log(dd[i]) - log(dd[i]));
  min = NINT(min);
  /*printf("min = %f max = %f\n", min, max);*/

  /* set alpha to achieve the desired range */
  /* NOTE(review): there is no guard for max == min here */
  *alpha = desired_range/((max-min));
  /* *alpha = NINT(((int)desired_range)/((max-min)));
  if (*alpha < 1) *alpha = 1;*/
  /*fprintf(stderr, "range %d max %f min %f alpha = %f\n",desired_range, max, min, *alpha);*/

  /* compute I', P, minI and maxI */ 
  for (n=0; n<=N; n++) minI[n] = maxI[n] = 0;
  for (i=0; i<A; i++) {				/* index over alphabet */
    double logdd = LOG(dd[i]);			/* log(dd[i]) */
    IP[i][0] = 0; 				/* I'_i(0) */
    logP[i][0] = 0;				/* log P_i(0) */
    for (n=1; n<=N; n++) {			/* index over samples */
      IP[i][n] = NINT(*alpha*n*log(n/(N*dd[i]))); 	/* I'_i(n) */
      logP[i][n] = logP[i][n-1] + logdd - log(n);	/* log P_i(n) */
      /* widen the bounds for n samples by giving k of them to letter i */
      for (k=1; k<=n; k++) {			/* index over samples of new */
	minI[n] = MIN(minI[n], minI[n-k] + IP[i][k]);
	maxI[n] = MAX(maxI[n], maxI[n-k] + IP[i][k]);
      }
    }
  }

  /* get overall minI and maxI */
  /* (collapsed into minI[0]/maxI[0]; entries 1..N are reset for reuse) */
  for (n=1; n<=N; n++) {
    /*printf("minI[%d] %d maxI[%d] %d\n", n, minI[n], n, maxI[n]);*/
    minI[0] = MIN(minI[0], minI[n]);		/* min for intermediates */
    maxI[0] = MAX(maxI[0], maxI[n]);		/* max for intermediates */
    minI[n] = LOGZEROI;
    maxI[n] = 0;
  }
  Irange = maxI[0] - minI[0] + 2;
  *offset = minI[0] - 1;			/* I offset: I=-1 is array 0 */
  /*printf("minI %d maxI %d Irange %d\n", minI[0], maxI[0], Irange);*/
  minI[0] = LOGZEROI;
  maxI[0] = 0;

  /* create script_P arrays with enough space for intermediate calculations */
  create_2array(logSP, double, N+1, Irange+1);
  
  /* clear intermediate probability array */
  /* (columns 0..Irange-1; the extra column Irange is not initialised here) */
  for (n=0; n<=N; n++) for(I=0; I<Irange; I++) logSP[n][I] = LOGZERO;

  /* init probability array for first letter in alphabet */
  /* (from here on minI/maxI hold offset array indices, not raw scores) */
  for (n=0; n<=N; n++) {
    I = IP[0][n] - *offset;			/* offset I */
    logSP[n][I] = logNfact + logP[0][n];	/* init */
    minI[n] = maxI[n] = I;
  }

  /* compute probabilities recursively */
  for (i=1; i<A; i++) {			/* index over (rest of) alphabet */
    /* n runs downwards so that the rows n-k read below have not yet been
       updated for letter i during this pass */
    for (n=N; n>=0; n--) {		/* index over samples */
      for (k=1; k<=n; k++) {		/* index over samples of new letter */
        /* these locals shadow the outer double min/max; max is pulled
           down so only the lower `frac` part of [minI, maxI] is scanned */
        int min = minI[n-k];
        int max = MAX(min, maxI[n-k] - (1-frac)*(maxI[n-k]-minI[n-k]+1));
        /*printf("min %d maxI %d max %d\n", min, maxI[n-k], max);*/
        for (I=min; I<=max; I++) {	/* index over I */
          if (logSP[n-k][I] > LOGZERO) {
	    /*printf("i %d old: %d %d new: %d %d\n", i, n-k, I, n,I+IP[i][k]);*/
            logSP[n][I+IP[i][k]] = 
              LOGL_SUM(logSP[n][I+IP[i][k]], logP[i][k] + logSP[n-k][I]);
          }
	}
	/* get current minimum and maximum I in intermediate arrays */
	minI[n] = MIN(minI[n], minI[n-k]+IP[i][k]);
	maxI[n] = MAX(maxI[n], maxI[n-k]+IP[i][k]);
      }
      /* on the last letter only row N is needed: the result below is read
         from logSP[N] alone */
      if (n==N && i==A-1) break;	/* all done */
    }
  }

  /* compute range */
  /*printf("minI[N] %d maxI[N] %d\n", minI[N], maxI[N]);*/
  *range = maxI[N] - minI[N]; 

  /* move to probability array with prob(offset) in position 0 */
  *offset += minI[N];			/* prob[0] = prob(offset) */
  Resize(prob, *range+2, double);
  for (I=minI[N]; I<=maxI[N]; I++) prob[I-minI[N]] = logSP[N][I];
  /*fprintf(stderr, "N= %d range= %d offset= %d alpha= %f\n", N, *range, 
    *offset, *alpha);*/
      
  /* free up space */
  /* (prob is returned and therefore not freed here) */
  free_2array(IP, A);
  free_2array(logP, A);
  free_2array(logSP, N+1);
  myfree(minI);
  myfree(maxI);

  return prob;
} /* llr_distr */
Example #4
0
/*
 * em
 *
 * Run up to maxiter rounds of expectation/maximization on the model.
 *
 * The expectation step is selected from model->mtype (Oops, Zoops or Tcm;
 * any other value prints an error and exits).  When dataset->nmotifs > 0
 * the first iteration uses like_e_step instead and the selected step takes
 * over from iteration 1 onwards.  Each round saves the current theta of
 * the last component, runs the E and M steps, optionally prints status,
 * and stops early when check_convergence() reports convergence.
 *
 * On return model->iter has been increased by the number of rounds run and
 * calc_like() has been called on the model.
 */
extern void em(
  MODEL *model,			/* the model */
  DATASET *dataset,		/* the dataset */
  PRIORS *priors,		/* the priors */
  int maxiter,			/* maximum number of iterations */
  double distance		/* stopping criterion */
)
{
  /* step functions; the defaults are adjusted below */
  double (*estep)(MODEL *, DATASET *) = e_step;		/* E step in use */
  double (*estep_later)(MODEL *, DATASET *) = e_step;	/* E step after iter 0 */
  void (*mstep)(MODEL *, DATASET *, PRIORS *, int) = m_step;

  int alen = dataset->alength;		/* alphabet length */
  int ncomp = model->c;			/* number of components of model */
  int last_w = model->w[ncomp-1];	/* width of last component */
  THETA prev_theta;			/* theta before the current round */
  BOOLEAN done = FALSE;			/* EM has converged */
  int iter = 0;				/* iteration number */

  /* storage for the previous value of theta */
  create_2array(prev_theta, double, last_w, alen);

  /* choose the expectation step for this model type */
  switch (model->mtype) {
    case Oops:
      /* already e_step */
      break;
    case Zoops:
      estep = zoops_e_step;
      break;
    case Tcm:
      estep = tcm_e_step;
      break;
    default:
      fprintf(stderr,"Unknown model type in em()! \n");
      exit(1);
  }

  /* if motifs were given, like_e_step sets the z matrix on iteration 0
     and the regular step is kept aside for later iterations */
  if (dataset->nmotifs > 0) {
    estep_later = estep;
    estep = like_e_step;
  }

  /* probability that a site starting at x_ij would NOT overlap a
     previously found motif; used by the E step */
  get_not_o(dataset, model->w[1], FALSE);

  /* iterate until the budget is used up or nothing improves */
  while (iter < maxiter) {
    int width = model->w[ncomp-1];		/* width of model */
    THETA cur_theta = model->theta[ncomp-1];	/* theta of last component */

    /* switch back to the regular E step after the first round */
    if (iter > 0 && dataset->nmotifs > 0) {
      estep = estep_later;
    }

    if (PRINTALL) {
      ajFmtPrintF(outf,"\niter %d\n", iter);
    }
#ifdef PARALLEL
    /* when running in parallel, only one node prints */
    if (mpMyID() == 0)
#endif
    if ((!NO_STATUS) && ((iter % 10) == 0))
      fprintf(stderr, "\rem: w=%4d, iter=%4d                       ", width, iter);

    /* remember theta as it is before this round */
    copy_theta(cur_theta, prev_theta, width, alen);

    /* expectation, then maximization */
    model->ll = estep(model, dataset);
    mstep(model, dataset, priors, iter);

    /* optional status output */
    if (PRINT_LL) {
      double mant, expo;
      double nsites = model->lambda[1] * ps(dataset, model->w[1]);
      calc_like(model, dataset);
      exp10_logx(model->sig, mant, expo);
      ajFmtPrintF(outf,"iter=%d w=%d ll=%8.2f e_ll=%8.2f nsites=%6.1f sig=%5.3fe%+04.0f",
        iter, model->w[1], model->ll, model->e_ll, nsites, mant, expo);
    }
    if (PRINTALL) {
      int comp;
      for (comp=0; comp<ncomp; comp++) {
        ajFmtPrintF(outf,"component %2d: lambda= %8.6f\n", comp,
          model->lambda[comp]);
        print_theta(2, model->theta[comp], model->w[comp], "", dataset, NULL);
        print_theta(2, model->obs[comp], model->w[comp], "", dataset, NULL);
      }
    }
    if (PRINT_Z) {
      print_zij(dataset, model);
    }

    /* convergence test uses the index of the round just finished */
    done = check_convergence(prev_theta, cur_theta, width, distance, alen,
      iter, maxiter);

    /* count this round whether or not it converged */
    iter++;
    if (done) break;
  }

  /* add the number of rounds performed to the model's total */
  model->iter += iter;

  /* compute the consensus for component 1.
     NOTE(review): the value returned by get_consensus() is only held in a
     local and is not stored in the model -- confirm this is intended. */
  {
    char *consensus = get_consensus(model->theta[1], model->w[1], dataset,
      1, MINCONS);
    (void) consensus;
  }

  /* calculate the expected likelihood of the model */
  calc_like(model, dataset);

  free_2array(prev_theta, last_w);
}