static double get_n_unique_multisets(
  int n,                           /* size of multisets */
  int m                            /* number of item types */
)
{
  int i, j, k;
  double **f;                      /* dynamic programming array */
  double nsets;                    /* number of unique multisets */

  create_2array(f, double, n+1, m+1);

  /* initialize */
  for (i=0; i<=n; i++) {
    for (j=0; j<=m; j++) {
      if (i==0 || j==1) {
        f[i][j] = 1;
      } else if (i==1) {
        f[i][j] = j;
      } else {
        f[i][j] = 0;
      }
    }
  }

  /* fill in array */
  for (j=2; j<=m; j++)
    for (i=2; i<=n; i++)
      for (k=0; k<=i; k++)
        f[i][j] += f[k][j-1];

  fprintf(stderr, "unique multisets (%d %d) %g\n", n, m, f[n][m]);

  /* save the result before freeing the array; the array has n+1 rows */
  nsets = f[n][m];
  free_2array(f, n+1);

  return nsets;
} /* get_n_unique_multisets */
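/*
 * Illustrative cross-check (not part of the original source): the number of
 * multisets of size n drawn from m item types has the closed form
 * C(n+m-1, n), so the dynamic-programming result above can be checked
 * against it. The helper name below is hypothetical, and it assumes n and m
 * are small enough that the result fits in a double.
 */
static double n_multisets_closed_form(
  int n,                           /* size of multisets */
  int m                            /* number of item types */
)
{
  int i;
  double result = 1.0;
  /* C(n+m-1, n) = prod_{i=1..n} (m-1+i)/i */
  for (i=1; i<=n; i++) result *= (double)(m-1+i) / i;
  return result;
} /* n_multisets_closed_form */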
SEQ_T *gendb(
  FILE *out,      // Output stream; output is returned if NULL.
  int type,       // Type of alphabet.
                  //   0: protein w/ambigs
                  //   1: dna w/ambigs
                  //   2: codons
                  //   3: dna w/o ambigs
                  //   4: protein w/o ambigs
  char *bfile,    // Name of Markov model file.
  int use_order,  // Order of Markov model to use.
  double *f,      // Model; 0-order model used if bfile==NULL.
  int nseqs,      // Number of sequences.
  int min,        // Shortest sequence.
  int max,        // Longest sequence.
  int seed        // Random seed.
)
{
  char *alph;     // alphabet
  double *def_f;  // letter or codon frequencies
  double *cum;    // letter/codon cumulative distribution
  char **letters; // letters/codons to print
  int r;          // number of letters/codons
  int c;          // length of letter/codon strings
  int order;

  // Get the letters and alphabet stuff.
  letters = get_letters(type, &alph, &r, &c, &def_f);
  if (f == NULL) f = def_f;

  // Get the cumulative distribution(s).
  cum = get_cum_distr(bfile, f, alph, r, use_order, &order);

  // Print the random sequences.
  SEQ_T *seq = print_random_seqs(out, seed, nseqs, min, max, letters, r, c,
    order, cum);

  if (out) { fflush(out); }

  myfree(cum);
  free_2array(letters, r);
  myfree(def_f);

  return(seq);
} // gendb
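/*
 * Hypothetical usage sketch (not part of the original source): generate 10
 * random DNA sequences without ambiguity codes (type 3), between 50 and 200
 * letters long, drawn from the default 0-order letter frequencies (bfile and
 * f both NULL), and write them to stdout. The argument meanings follow the
 * parameter comments of gendb() above; the wrapper name is illustrative only.
 */
static SEQ_T *example_gendb_call(void)
{
  return gendb(
    stdout,   /* output stream */
    3,        /* alphabet type: dna w/o ambigs */
    NULL,     /* no Markov model file; 0-order model used */
    0,        /* order of Markov model to use */
    NULL,     /* use default letter frequencies */
    10,       /* number of sequences */
    50,       /* shortest sequence */
    200,      /* longest sequence */
    42        /* random seed */
  );
} /* example_gendb_call */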
double *llr_distr(
  int A,                /* dimension of discrete distribution */
  double *dd,           /* discrete distribution */
  int N,                /* number of samples */
  int desired_range,    /* desired range for scaled LLR */
  double frac,          /* fraction of scores to use */
  double *alpha,        /* scale factor for scaled LLR */
  int *offset,          /* prob[0] = prob(offset) */
  int *range            /* range for scaled LLR */
)
{
  int i;                /* index over alphabet */
  int n;                /* index over samples */
  int k;                /* other index */
  int I;                /* LLR */
  double dd_sum;        /* sum of dd */
  int **IP;             /* I'_i[n] */
  int *minI=NULL;       /* minimum intermediate value of I */
  int *maxI=NULL;       /* maximum intermediate value of I */
  int Irange;           /* maxI-minI+1 */
  double logNfact;      /* log N! */
  double **logP;        /* log P_i[n] */
  double **logSP;       /* log script_P[# samples][LLR] */
  double *prob=NULL;    /* final probability distribution */
  double min, max, min_dd;

  /* create space for IP, P, minI and maxI */
  create_2array(IP, int, A, N+1);
  create_2array(logP, double, A, N+1);
  Resize(minI, N+1, int);
  Resize(maxI, N+1, int);

  /* make sure distribution sums to 1.0 and has no 0's */
  for (i=dd_sum=0; i<A; i++) dd_sum += dd[i] + EPSILON;
  for (i=0; i<A; i++) dd[i] = (dd[i]+EPSILON)/dd_sum;

  /* compute N! */
  logNfact = 0;
  for (i=2; i<=N; i++) logNfact += log(i);

  /* get estimates of minimum and maximum values of llr */
  for (i=0, min_dd=1; i<A; i++) min_dd = MIN(min_dd, dd[i]);
  max = NINT(-N * log(min_dd));
  for (i=min=0; i<A; i++) min += dd[i]*N*(log(dd[i]) - log(dd[i]));
  min = NINT(min);
  /*printf("min = %f max = %f\n", min, max);*/

  /* set alpha to achieve the desired range */
  *alpha = desired_range/((max-min));
  /* *alpha = NINT(((int)desired_range)/((max-min))); if (*alpha < 1) *alpha = 1;*/
  /*fprintf(stderr, "range %d max %f min %f alpha = %f\n", desired_range, max, min, *alpha);*/

  /* compute I', P, minI and maxI */
  for (n=0; n<=N; n++) minI[n] = maxI[n] = 0;
  for (i=0; i<A; i++) {                 /* index over alphabet */
    double logdd = LOG(dd[i]);          /* log(dd[i]) */
    IP[i][0] = 0;                       /* I'_i(0) */
    logP[i][0] = 0;                     /* log P_i(0) */
    for (n=1; n<=N; n++) {              /* index over samples */
      IP[i][n] = NINT(*alpha*n*log(n/(N*dd[i])));       /* I'_i(n) */
      logP[i][n] = logP[i][n-1] + logdd - log(n);       /* log P_i(n) */
      for (k=1; k<=n; k++) {            /* index over samples of new letter */
        minI[n] = MIN(minI[n], minI[n-k] + IP[i][k]);
        maxI[n] = MAX(maxI[n], maxI[n-k] + IP[i][k]);
      }
    }
  }

  /* get overall minI and maxI */
  for (n=1; n<=N; n++) {
    /*printf("minI[%d] %d maxI[%d] %d\n", n, minI[n], n, maxI[n]);*/
    minI[0] = MIN(minI[0], minI[n]);    /* min for intermediates */
    maxI[0] = MAX(maxI[0], maxI[n]);    /* max for intermediates */
    minI[n] = LOGZEROI;
    maxI[n] = 0;
  }
  Irange = maxI[0] - minI[0] + 2;
  *offset = minI[0] - 1;                /* I offset: I=-1 is array 0 */
  /*printf("minI %d maxI %d Irange %d\n", minI[0], maxI[0], Irange);*/
  minI[0] = LOGZEROI;
  maxI[0] = 0;

  /* create script_P arrays with enough space for intermediate calculations */
  create_2array(logSP, double, N+1, Irange+1);

  /* clear intermediate probability array */
  for (n=0; n<=N; n++)
    for (I=0; I<Irange; I++)
      logSP[n][I] = LOGZERO;

  /* init probability array for first letter in alphabet */
  for (n=0; n<=N; n++) {
    I = IP[0][n] - *offset;                     /* offset I */
    logSP[n][I] = logNfact + logP[0][n];        /* init */
    minI[n] = maxI[n] = I;
  }

  /* compute probabilities recursively */
  for (i=1; i<A; i++) {                 /* index over (rest of) alphabet */
    for (n=N; n>=0; n--) {              /* index over samples */
      for (k=1; k<=n; k++) {            /* index over samples of new letter */
        int min = minI[n-k];
        int max = MAX(min, maxI[n-k] - (1-frac)*(maxI[n-k]-minI[n-k]+1));
        /*printf("min %d maxI %d max %d\n", min, maxI[n-k], max);*/
        for (I=min; I<=max; I++) {      /* index over I */
          if (logSP[n-k][I] > LOGZERO) {
            /*printf("i %d old: %d %d new: %d %d\n", i, n-k, I, n, I+IP[i][k]);*/
            logSP[n][I+IP[i][k]] =
              LOGL_SUM(logSP[n][I+IP[i][k]], logP[i][k] + logSP[n-k][I]);
          }
        }
        /* get current minimum and maximum I in intermediate arrays */
        minI[n] = MIN(minI[n], minI[n-k]+IP[i][k]);
        maxI[n] = MAX(maxI[n], maxI[n-k]+IP[i][k]);
      }
      if (n==N && i==A-1) break;        /* all done */
    }
  }

  /* compute range */
  /*printf("minI[N] %d maxI[N] %d\n", minI[N], maxI[N]);*/
  *range = maxI[N] - minI[N];

  /* move to probability array with prob(offset) in position 0 */
  *offset += minI[N];                   /* prob[0] = prob(offset) */
  Resize(prob, *range+2, double);
  for (I=minI[N]; I<=maxI[N]; I++) prob[I-minI[N]] = logSP[N][I];
  /*fprintf(stderr, "N= %d range= %d offset= %d alpha= %f\n", N, *range, *offset, *alpha);*/

  /* free up space */
  free_2array(IP, A);
  free_2array(logP, A);
  free_2array(logSP, N+1);
  myfree(minI);
  myfree(maxI);

  return prob;
} /* llr_distr */
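/*
 * Hypothetical usage sketch (not part of the original source): compute the
 * scaled LLR distribution for N=20 samples drawn from a uniform 4-letter
 * (DNA) background, using all scores (frac=1.0) and a desired range of 100.
 * On return, prob[0] corresponds to the scaled LLR value *offset (see the
 * parameter comments of llr_distr() above), and the entries are on the log
 * scale used internally by llr_distr(). The wrapper name and the argument
 * values are illustrative only.
 */
static double *example_llr_distr_call(void)
{
  double dd[4] = {0.25, 0.25, 0.25, 0.25};      /* background distribution */
  double alpha;                                 /* scale factor (output) */
  int offset, range;                            /* offset and range (output) */
  return llr_distr(4, dd, 20, 100, 1.0, &alpha, &offset, &range);
} /* example_llr_distr_call */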
extern void em(
  MODEL *model,         /* the model */
  DATASET *dataset,     /* the dataset */
  PRIORS *priors,       /* the priors */
  int maxiter,          /* maximum number of iterations */
  double distance       /* stopping criterion */
)
{
  int alength = dataset->alength;
  THETA theta_save;
  int iter;                                     /* iteration number */
  double (*E_STEP)(MODEL *, DATASET *);         /* expectation step function */
  double (*E_STEP0)(MODEL *, DATASET *);        /* expectation step function */
  /* maximization step function */
  void (*M_STEP)(MODEL *, DATASET *, PRIORS *, int);
  int nc = model->c;                            /* number of components of model */
  int max_w = model->w[nc-1];                   /* width of last component */
  BOOLEAN converged = FALSE;                    /* EM has converged */

  /* create a place to save old value of theta */
  create_2array(theta_save, double, max_w, alength);

  /* set up the correct type of EM to run */
  M_STEP = m_step;
  E_STEP = e_step;
  E_STEP0 = e_step;
  switch (model->mtype) {
    case Oops:
      E_STEP = e_step;
      break;
    case Zoops:
      E_STEP = zoops_e_step;
      break;
    case Tcm:
      E_STEP = tcm_e_step;
      break;
    default:
      fprintf(stderr, "Unknown model type in em()! \n");
      exit(1);
      break;
  }

  /* use like_e_step to set z matrix on iteration 0 if motifs were given */
  if (dataset->nmotifs > 0) { E_STEP0 = E_STEP; E_STEP = like_e_step; }

  /* get the probability that a site starting at position x_ij would NOT
     overlap a previously found motif; used in E_STEP. */
  get_not_o(dataset, model->w[1], FALSE);

  /* Perform EM for number of iterations or until no improvement */
  for (iter=0; iter < maxiter; iter++) {
    int w = model->w[nc-1];                     /* width of model */
    THETA theta = model->theta[nc-1];           /* final theta of last component */

    if (iter > 0 && dataset->nmotifs > 0) E_STEP = E_STEP0;

    if (PRINTALL) ajFmtPrintF(outf, "\niter %d\n", iter);

#ifdef PARALLEL
    /* If we're running in parallel, only print from one node. */
    if (mpMyID() == 0)
#endif
      if ((!NO_STATUS) && ((iter % 10) == 0))
        fprintf(stderr, "\rem: w=%4d, iter=%4d ", w, iter);

    /* fix this later: save current contents of theta */
    copy_theta(theta, theta_save, w, alength);

    /* expectation step */
    model->ll = E_STEP(model, dataset);

    /* maximization step */
    M_STEP(model, dataset, priors, iter);

    /* print status if requested */
    if (PRINT_LL) {
      double m1, e1;
      double nsites = model->lambda[1] * ps(dataset, model->w[1]);
      calc_like(model, dataset);
      exp10_logx(model->sig, m1, e1);
      ajFmtPrintF(outf,
        "iter=%d w=%d ll=%8.2f e_ll=%8.2f nsites=%6.1f sig=%5.3fe%+04.0f",
        iter, model->w[1], model->ll, model->e_ll, nsites, m1, e1);
    }
    if (PRINTALL) {
      int c;
      for (c=0; c<nc; c++) {
        ajFmtPrintF(outf, "component %2d: lambda= %8.6f\n", c, model->lambda[c]);
        print_theta(2, model->theta[c], model->w[c], "", dataset, NULL);
        print_theta(2, model->obs[c], model->w[c], "", dataset, NULL);
      }
    }
    if (PRINT_Z) print_zij(dataset, model);

    /* see if EM has converged */
    converged = check_convergence(theta_save, theta, w, distance, alength,
      iter, maxiter);

    if (converged) { iter++; break; }           /* done */
  }

  /* save the number of iterations (counting from zero) */
  model->iter += iter;

  /* get the consensus of each component of the model */
  {
    THETA theta = model->theta[1];
    int w = model->w[1];
    char *cons = model->cons;
    cons = get_consensus(theta, w, dataset, 1, MINCONS);
  }

  /* calculate the expected likelihood of the model */
  calc_like(model, dataset);

  free_2array(theta_save, max_w);
} /* em */
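/*
 * Hypothetical usage sketch (not part of the original source): run EM for at
 * most 50 iterations, stopping early once check_convergence() reports that
 * the change in theta between iterations is within a distance of 0.001.
 * The model, dataset and priors are assumed to have been initialized
 * elsewhere; the wrapper name and argument values are illustrative only.
 */
static void example_em_call(MODEL *model, DATASET *dataset, PRIORS *priors)
{
  em(model, dataset, priors, 50 /* maxiter */, 0.001 /* distance */);
} /* example_em_call */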