/* Function: p7_ViterbiMu() * Synopsis: Determines the local Viterbi Gumbel mu parameter for a model. * Incept: SRE, Tue May 19 10:26:19 2009 [Janelia] * * Purpose: Identical to p7_MSVMu(), above, except that it fits * Viterbi scores instead of MSV scores. * * The difference between the two mus is small, but can be * up to ~1 bit or so for large, low-info models [J4/126] so * decided to calibrate the two mus separately [J5/8]. * * Args: r : source of random numbers * om : score profile (length config is changed upon return!) * bg : null model (length config is changed upon return!) * L : length of sequences to simulate * N : number of sequences to simulate * lambda : known Gumbel lambda parameter * ret_vmu : RETURN: ML estimate of location param mu * * Returns: <eslOK> on success, and <ret_mu> contains the ML estimate * of $\mu$. * * Throws: (no abnormal error conditions) */ int p7_ViterbiMu(ESL_RANDOMNESS *r, P7_OPROFILE *om, P7_BG *bg, int L, int N, double lambda, double *ret_vmu) { P7_OMX *ox = p7_omx_Create(om->M, 0, 0); /* DP matrix: 1 row version */ ESL_DSQ *dsq = NULL; double *xv = NULL; int i; float sc, nullsc; #ifndef p7_IMPL_DUMMY float maxsc = (32767.0 - om->base_w) / om->scale_w; /* if score overflows, use this [J4/139] */ #endif int status; if (ox == NULL) { status = eslEMEM; goto ERROR; } ESL_ALLOC(xv, sizeof(double) * N); ESL_ALLOC(dsq, sizeof(ESL_DSQ) * (L+2)); p7_oprofile_ReconfigLength(om, L); p7_bg_SetLength(bg, L); for (i = 0; i < N; i++) { if ((status = esl_rsq_xfIID(r, bg->f, om->abc->K, L, dsq)) != eslOK) goto ERROR; if ((status = p7_bg_NullOne(bg, dsq, L, &nullsc)) != eslOK) goto ERROR; status = p7_ViterbiFilter(dsq, L, om, ox, &sc); #ifndef p7_IMPL_DUMMY if (status == eslERANGE) { sc = maxsc; status = eslOK; } #endif if (status != eslOK) goto ERROR; xv[i] = (sc - nullsc) / eslCONST_LOG2; } if ((status = esl_gumbel_FitCompleteLoc(xv, N, lambda, ret_vmu)) != eslOK) goto ERROR; p7_omx_Destroy(ox); free(xv); free(dsq); return eslOK; ERROR: *ret_vmu = 0.0; if (ox != NULL) p7_omx_Destroy(ox); if (xv != NULL) free(xv); if (dsq != NULL) free(dsq); return status; }
static int output_result(ESL_GETOPTS *go, struct cfg_s *cfg, char *errbuf, P7_HMM *hmm, double *scores, int *alilens) { ESL_HISTOGRAM *h = NULL; int i; double tailp; double x10; double mu, lambda, E10; double mufix, E10fix; double mufix2, E10fix2; double E10p; double almean, alvar; /* alignment length mean and variance (optional output) */ double pmu, plambda; int status; /* fetch statistical params from HMM for expected distribution */ if (esl_opt_GetBoolean(go, "--vit")) { pmu = hmm->evparam[p7_VMU]; plambda = hmm->evparam[p7_VLAMBDA]; } else if (esl_opt_GetBoolean(go, "--msv")) { pmu = hmm->evparam[p7_MMU]; plambda = hmm->evparam[p7_MLAMBDA]; } else if (esl_opt_GetBoolean(go, "--fwd")) { pmu = hmm->evparam[p7_FTAU]; plambda = hmm->evparam[p7_FLAMBDA]; } /* Optional output of scores/alignment lengths: */ if (cfg->xfp) fwrite(scores, sizeof(double), cfg->N, cfg->xfp); if (cfg->alfp) for (i = 0; i < cfg->N; i++) fprintf(cfg->alfp, "%d %.3f\n", alilens[i], scores[i]); if (esl_opt_GetBoolean(go, "-v")) for (i = 0; i < cfg->N; i++) printf("%.3f\n", scores[i]); /* optional "filter power" data file: <hmm name> <# seqs <= P threshold> <fraction of seqs <= P threshold> */ if (cfg->ffp) output_filter_power(go, cfg, errbuf, hmm, scores); /* Count the scores into a histogram object. */ if ((h = esl_histogram_CreateFull(-50., 50., 0.2)) == NULL) ESL_XFAIL(eslEMEM, errbuf, "allocation failed"); for (i = 0; i < cfg->N; i++) esl_histogram_Add(h, scores[i]); /* For viterbi, MSV, and hybrid, fit data to a Gumbel, either with known lambda or estimated lambda. */ if (esl_opt_GetBoolean(go, "--vit") || esl_opt_GetBoolean(go, "--msv")) { esl_histogram_GetRank(h, 10, &x10); tailp = 1.0; /* mu, lambda, E10 fields are for ML Gumbel fit to the observed data */ if (esl_gumbel_FitComplete(scores, cfg->N, &mu, &lambda) != eslOK) esl_fatal("gumbel complete data fit failed"); E10 = cfg->N * esl_gumbel_surv(x10, mu, lambda); /* mufix, E10fix fields: assume lambda = log2; fit an ML mu to the data */ if (esl_gumbel_FitCompleteLoc(scores, cfg->N, 0.693147, &mufix) != eslOK) esl_fatal("gumbel mu- (location-)only data fit failed for lambda = log2"); E10fix = cfg->N * esl_gumbel_surv(x10, mufix, 0.693147); /* mufix2, E10fix2 fields: assume H3's own lambda estimate; fit ML mu */ if (esl_gumbel_FitCompleteLoc(scores, cfg->N, plambda, &mufix2) != eslOK) esl_fatal("gumbel mu- (location-)only data fit failed for fitted lambda"); E10fix2 = cfg->N * esl_gumbel_surv(x10, mufix2, plambda); /* pmu, plambda, E10p: use H3 expectation estimates (pmu, plambda) */ E10p = cfg->N * esl_gumbel_surv(x10, pmu, plambda); fprintf(cfg->ofp, "%-20s %8.4f %8.4f %8.4f %8.4f %8.4f %8.4f %8.4f %8.4f %8.4f %8.4f %8.4f", hmm->name, tailp, mu, lambda, E10, mufix, E10fix, mufix2, E10fix2, pmu, plambda, E10p); if (esl_opt_GetBoolean(go, "-a")) { esl_stats_IMean(alilens, cfg->N, &almean, &alvar); fprintf(cfg->ofp, " %8.4f %8.4f\n", almean, sqrt(alvar)); } else fprintf(cfg->ofp, "\n"); if (cfg->survfp != NULL) { double xmax = esl_opt_IsOn(go, "--xmax") ? esl_opt_GetReal(go, "--xmax") : h->xmax + 5.; esl_histogram_PlotSurvival(cfg->survfp, h); esl_gumbel_Plot(cfg->survfp, pmu, plambda, esl_gumbel_surv, h->xmin - 5., xmax, 0.1); esl_gumbel_Plot(cfg->survfp, mu, lambda, esl_gumbel_surv, h->xmin - 5., xmax, 0.1); esl_gumbel_Plot(cfg->survfp, mufix, 0.693147, esl_gumbel_surv, h->xmin - 5., xmax, 0.1); } if (cfg->efp != NULL) { double x; fprintf(cfg->efp, "# %s\n", hmm->name); for (i = 1; i <= 1000 && i <= cfg->N; i++) { esl_histogram_GetRank(h, i, &x); fprintf(cfg->efp, "%d %g\n", i, cfg->N * esl_gumbel_surv(x, pmu, plambda)); } fprintf(cfg->efp, "&\n"); } } /* For Forward, fit tail to exponential tails, for a range of tail mass choices. */ else if (esl_opt_GetBoolean(go, "--fwd")) { double tmin = esl_opt_GetReal(go, "--tmin"); double tmax = esl_opt_GetReal(go, "--tmax"); double tpoints = (double) esl_opt_GetInteger(go, "--tpoints"); int do_linear = esl_opt_GetBoolean(go, "--tlinear"); double *xv; double tau; int n; esl_histogram_GetRank(h, 10, &x10); tailp = tmin; do { if (tailp > 1.0) tailp = 1.0; esl_histogram_GetTailByMass(h, tailp, &xv, &n, NULL); if (esl_exp_FitComplete(xv, n, &mu, &lambda) != eslOK) esl_fatal("exponential fit failed"); E10 = cfg->N * tailp * esl_exp_surv(x10, mu, lambda); mufix = mu; E10fix = cfg->N * tailp * esl_exp_surv(x10, mu, 0.693147); E10p = cfg->N * esl_exp_surv(x10, pmu, plambda); /* the pmu is relative to a P=1.0 tail origin. */ tau = mu + log(tailp) / lambda; fprintf(cfg->ofp, "%-20s %8.4f %8.4f %8.4f %8.4f %8.4f %8.4f %8.4f %8.4f %8.4f\n", hmm->name, tailp, mu, lambda, E10, mufix, E10fix, pmu, plambda, E10p); if (tpoints == 1) break; else if (do_linear) tailp += (tmax-tmin) / (tpoints-1); else tailp *= exp(log(tmax/tmin) / (tpoints-1)); } while (tailp <= tmax+1e-7); if (cfg->survfp) { double xmax = esl_opt_IsOn(go, "--xmax") ? esl_opt_GetReal(go, "--xmax") : h->xmax + 5.; esl_histogram_PlotSurvival(cfg->survfp, h); esl_exp_Plot(cfg->survfp, pmu, plambda, esl_exp_surv, pmu, xmax, 0.1); esl_exp_Plot(cfg->survfp, tau, lambda, esl_exp_surv, tau, xmax, 0.1); esl_exp_Plot(cfg->survfp, tau, 0.693147, esl_exp_surv, tau, xmax, 0.1); } if (cfg->efp != NULL) { double x; fprintf(cfg->efp, "# %s\n", hmm->name); for (i = 1; i <= 1000 && i <= cfg->N; i++) { esl_histogram_GetRank(h, i, &x); fprintf(cfg->efp, "%d %g\n", i, cfg->N * esl_exp_surv(x, pmu, plambda)); } fprintf(cfg->efp, "&\n"); } } /* fallthrough: both normal, error cases execute same cleanup code */ status = eslOK; ERROR: esl_histogram_Destroy(h); return status; }