/* tests: * 1. each sampled trace must validate. * 2. each trace must be <= viterbi trace score * 3. in a large # of traces, one is "equal" to the viterbi trace score. * (this of course is stochastic; but it's true for the particular * choice of RNG seed used in tests here.) */ static void utest_stotrace(ESL_GETOPTS *go, ESL_RANDOMNESS *rng, ESL_ALPHABET *abc, P7_PROFILE *gm, P7_OPROFILE *om, ESL_DSQ *dsq, int L, int ntrace) { P7_GMX *gx = NULL; P7_OMX *ox = NULL; P7_TRACE *tr = NULL; char errbuf[eslERRBUFSIZE]; int idx; float maxsc = -eslINFINITY; float vsc, sc; if ((gx = p7_gmx_Create(gm->M, L)) == NULL) esl_fatal("generic DP matrix creation failed"); if ((ox = p7_omx_Create(gm->M, L, L)) == NULL) esl_fatal("optimized DP matrix create failed"); if ((tr = p7_trace_Create()) == NULL) esl_fatal("trace creation failed"); if (p7_GViterbi(dsq, L, gm, gx, &vsc) != eslOK) esl_fatal("viterbi failed"); if (p7_Forward (dsq, L, om, ox, NULL) != eslOK) esl_fatal("forward failed"); for (idx = 0; idx < ntrace; idx++) { if (p7_StochasticTrace(rng, dsq, L, om, ox, tr) != eslOK) esl_fatal("stochastic trace failed"); if (p7_trace_Validate(tr, abc, dsq, errbuf) != eslOK) esl_fatal("trace invalid:\n%s", errbuf); if (p7_trace_Score(tr, dsq, gm, &sc) != eslOK) esl_fatal("trace scoring failed"); maxsc = ESL_MAX(sc, maxsc); if (sc > vsc) esl_fatal("sampled trace has score > optimal Viterbi path; not possible"); p7_trace_Reuse(tr); } if (esl_FCompare(maxsc, vsc, 0.1) != eslOK) esl_fatal("stochastic trace failed to sample the Viterbi path"); p7_trace_Destroy(tr); p7_omx_Destroy(ox); p7_gmx_Destroy(gx); }
/* Example driver for profile emission.
 *
 * Reads the first HMM from <hmmfile> (argument 1), configures a
 * p7_LOCAL profile for target length -L, then emits -N sequences from
 * that profile. Each sequence is named "<hmmname>-sample<i>", written to
 * stdout in FASTA format, and its trace is validated. A failed emission
 * or an invalid trace is fatal.
 */
int
main(int argc, char **argv)
{
  ESL_GETOPTS    *go      = p7_CreateDefaultApp(options, 1, argc, argv, banner, usage);
  ESL_RANDOMNESS *rng     = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s"));
  char           *hmmfile = esl_opt_GetArg(go, 1);
  int             L       = esl_opt_GetInteger(go, "-L");
  int             N       = esl_opt_GetInteger(go, "-N");
  ESL_ALPHABET   *abc     = NULL;
  P7_HMMFILE     *hfp     = NULL;
  P7_HMM         *hmm     = NULL;
  P7_BG          *bg      = NULL;
  P7_PROFILE     *gm      = NULL;
  P7_TRACE       *tr      = p7_trace_Create();
  ESL_SQ         *sq      = NULL;
  char            errbuf[eslERRBUFSIZE];
  int             i;
  int             status;

  /* Open the HMM file, and read the one model we will sample from. */
  status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf);
  if      (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf);
  else if (status == eslEFORMAT)   p7_Fail("File format problem in trying to open HMM file %s.\n%s\n",                hmmfile, errbuf);
  else if (status != eslOK)        p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n",               status, hmmfile, errbuf);

  status = p7_hmmfile_Read(hfp, &abc, &hmm);
  if      (status == eslEFORMAT)   p7_Fail("Bad file format in HMM file %s:\n%s\n",          hfp->fname, hfp->errbuf);
  else if (status == eslEINCOMPAT) p7_Fail("HMM in %s is not in the expected %s alphabet\n", hfp->fname, esl_abc_DecodeType(abc->type));
  else if (status == eslEOF)       p7_Fail("Empty HMM file %s? No HMM data found.\n",        hfp->fname);
  else if (status != eslOK)        p7_Fail("Unexpected error in reading HMMs from %s\n",     hfp->fname);

  p7_hmmfile_Close(hfp);

  /* Null model and search profile, both configured for length L. */
  bg = p7_bg_Create(abc);
  p7_bg_SetLength(bg, L);
  gm = p7_profile_Create(hmm->M, abc);
  p7_ProfileConfig(hmm, bg, gm, L, p7_LOCAL);

  sq = esl_sq_CreateDigital(abc);

  for (i = 0; i < N; i++)
    {
      /* Do not write or validate anything if the emission itself failed. */
      if (p7_ProfileEmit(rng, hmm, gm, bg, sq, tr) != eslOK)
        esl_fatal("Failed to emit sequence\n");

      esl_sq_FormatName(sq, "%s-sample%d", hmm->name, i);
      esl_sqio_Write(stdout, sq, eslSQFILE_FASTA, FALSE);

      /* <errbuf> is data, not a format: a '%' in it must not be interpreted. */
      if (p7_trace_Validate(tr, abc, sq->dsq, errbuf) != eslOK)
        esl_fatal("%s", errbuf);

      esl_sq_Reuse(sq);
      p7_trace_Reuse(tr);
    }

  esl_sq_Destroy(sq);
  p7_trace_Destroy(tr);
  p7_profile_Destroy(gm);
  p7_bg_Destroy(bg);
  p7_hmm_Destroy(hmm);
  esl_alphabet_Destroy(abc);
  esl_randomness_Destroy(rng);
  esl_getopts_Destroy(go);
  return 0;
}
/* emit_sequences()
 *
 * Sample -N sequences from <hmm> and write each one to <ofp> in format
 * <outfmt>, named "<hmmname>-sample<n>" for n = 1..N.
 *
 * With -p, sequences are emitted from a profile configured for length -L
 * in the mode chosen by --local / --unilocal / --glocal / --uniglocal
 * (p7_LOCAL if none of them is set). Without -p, they are emitted from
 * the core model. The profile, null model, and validation steps are run
 * in either case. Any failure is fatal.
 */
static void
emit_sequences(ESL_GETOPTS *go, FILE *ofp, int outfmt, ESL_RANDOMNESS *r, P7_HMM *hmm)
{
  ESL_SQ     *seq         = NULL;
  P7_TRACE   *trace       = NULL;
  P7_BG      *null1       = NULL;
  P7_PROFILE *profile     = NULL;
  int         use_profile = esl_opt_GetBoolean(go, "-p");
  int         nwanted     = esl_opt_GetInteger(go, "-N");
  int         len         = esl_opt_GetInteger(go, "-L");
  int         mode        = p7_LOCAL;
  int         nth;
  int         rc;

  /* First option that is set wins; the default is already p7_LOCAL. */
  if      (esl_opt_GetBoolean(go, "--local"))     { mode = p7_LOCAL;     }
  else if (esl_opt_GetBoolean(go, "--unilocal"))  { mode = p7_UNILOCAL;  }
  else if (esl_opt_GetBoolean(go, "--glocal"))    { mode = p7_GLOCAL;    }
  else if (esl_opt_GetBoolean(go, "--uniglocal")) { mode = p7_UNIGLOCAL; }

  seq = esl_sq_CreateDigital(hmm->abc);
  if (seq == NULL) esl_fatal("failed to allocate sequence");

  trace = p7_trace_Create();
  if (trace == NULL) esl_fatal("failed to allocate trace");

  null1 = p7_bg_Create(hmm->abc);
  if (null1 == NULL) esl_fatal("failed to create null model");

  profile = p7_profile_Create(hmm->M, hmm->abc);
  if (profile == NULL) esl_fatal("failed to create profile");

  if (p7_ProfileConfig(hmm, null1, profile, len, mode) != eslOK)
    esl_fatal("failed to configure profile");
  if (p7_bg_SetLength(null1, len) != eslOK)
    esl_fatal("failed to reconfig null model length");
  if (p7_hmm_Validate(hmm, NULL, 0.0001) != eslOK)
    esl_fatal("whoops, HMM is bad!");
  if (p7_profile_Validate(profile, NULL, 0.0001) != eslOK)
    esl_fatal("whoops, profile is bad!");

  for (nth = 1; nth <= nwanted; nth++)
    {
      if (use_profile)
        rc = p7_ProfileEmit(r, hmm, profile, null1, seq, trace);
      else
        rc = p7_CoreEmit(r, hmm, seq, trace);
      if (rc != 0) esl_fatal("Failed to emit sequence\n");

      rc = esl_sq_FormatName(seq, "%s-sample%d", hmm->name, nth);
      if (rc != 0) esl_fatal("Failed to set sequence name\n");

      rc = esl_sqio_Write(ofp, seq, outfmt, FALSE);
      if (rc != eslOK) esl_fatal("Failed to write sequence\n");

      p7_trace_Reuse(trace);
      esl_sq_Reuse(seq);
    }

  esl_sq_Destroy(seq);
  p7_trace_Destroy(trace);
  p7_bg_Destroy(null1);
  p7_profile_Destroy(profile);
}
/* Viterbi validation is done by comparing the returned score * to the score of the optimal trace. Not foolproof, but catches * many kinds of errors. * * Another check is that the average score should be <= 0, * since the random sequences are drawn from the null model. */ static void utest_viterbi(ESL_GETOPTS *go, ESL_RANDOMNESS *r, ESL_ALPHABET *abc, P7_BG *bg, P7_PROFILE *gm, int nseq, int L) { float avg_sc = 0.; char errbuf[eslERRBUFSIZE]; ESL_DSQ *dsq = NULL; P7_GMX *gx = NULL; P7_TRACE *tr = NULL; int idx; float sc1, sc2; if ((dsq = malloc(sizeof(ESL_DSQ) *(L+2))) == NULL) esl_fatal("malloc failed"); if ((tr = p7_trace_Create()) == NULL) esl_fatal("trace creation failed"); if ((gx = p7_gmx_Create(gm->M, L)) == NULL) esl_fatal("matrix creation failed"); for (idx = 0; idx < nseq; idx++) { if (esl_rsq_xfIID(r, bg->f, abc->K, L, dsq) != eslOK) esl_fatal("seq generation failed"); if (p7_GViterbi(dsq, L, gm, gx, &sc1) != eslOK) esl_fatal("viterbi failed"); if (p7_GTrace (dsq, L, gm, gx, tr) != eslOK) esl_fatal("trace failed"); if (p7_trace_Validate(tr, abc, dsq, errbuf) != eslOK) esl_fatal("trace invalid:\n%s", errbuf); if (p7_trace_Score(tr, dsq, gm, &sc2) != eslOK) esl_fatal("trace score failed"); if (esl_FCompare(sc1, sc2, 1e-6) != eslOK) esl_fatal("Trace score != Viterbi score"); if (p7_bg_NullOne(bg, dsq, L, &sc2) != eslOK) esl_fatal("null score failed"); avg_sc += (sc1 - sc2); if (esl_opt_GetBoolean(go, "--vv")) printf("utest_viterbi: Viterbi score: %.4f (null %.4f) (total so far: %.4f)\n", sc1, sc2, avg_sc); p7_trace_Reuse(tr); } avg_sc /= (float) nseq; if (avg_sc > 0.) esl_fatal("Viterbi scores have positive expectation (%f nats)", avg_sc); p7_gmx_Destroy(gx); p7_trace_Destroy(tr); free(dsq); return; }
int main(int argc, char **argv) { ESL_GETOPTS *go = p7_CreateDefaultApp(options, 2, argc, argv, banner, usage); ESL_RANDOMNESS *rng = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s")); ESL_ALPHABET *abc = NULL; char *ghmmfile = esl_opt_GetArg(go, 1); /* HMMs parameterized for sequence generation */ char *ahmmfile = esl_opt_GetArg(go, 2); /* HMMs parameterized for alignment */ int N = esl_opt_GetInteger(go, "-N"); P7_HMMFILE *ghfp = NULL; P7_HMMFILE *ahfp = NULL; P7_HMM *ghmm = NULL; P7_HMM *ahmm = NULL; P7_PROFILE *ggm = NULL; P7_PROFILE *agm = NULL; P7_OPROFILE *aom = NULL; P7_BG *bg = NULL; ESL_SQ *sq = NULL; P7_TRACE *reftr = p7_trace_Create(); P7_TRACE *testtr = p7_trace_Create(); P7_TRACE_METRICS *tmetrics = p7_trace_metrics_Create(); P7_REFMX *rmx = p7_refmx_Create(100,100); // P7_FILTERMX *ox = NULL; P7_HARDWARE *hw; if ((hw = p7_hardware_Create ()) == NULL) p7_Fail("Couldn't get HW information data structure"); P7_SPARSEMASK *sm = p7_sparsemask_Create(100, 100, hw->simd); P7_SPARSEMX *sxv = p7_sparsemx_Create(NULL); int idx; char errbuf[eslERRBUFSIZE]; int status; p7_Init(); /* open HMM file containing models parameterized for generation (sampling) of seqs */ status = p7_hmmfile_OpenE(ghmmfile, NULL, &ghfp, errbuf); if (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", ghmmfile, errbuf); else if (status == eslEFORMAT) p7_Fail("File format problem in trying to open HMM file %s.\n%s\n", ghmmfile, errbuf); else if (status != eslOK) p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n", status, ghmmfile, errbuf); /* open HMM file containing models parameterized for alignment (may be the same as ghmmfile) */ status = p7_hmmfile_OpenE(ahmmfile, NULL, &ahfp, errbuf); if (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", ahmmfile, errbuf); else if (status == eslEFORMAT) p7_Fail("File format problem in trying to open HMM file %s.\n%s\n", 
ahmmfile, errbuf); else if (status != eslOK) p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n", status, ahmmfile, errbuf); while ( (status = p7_hmmfile_Read(ghfp, &abc, &ghmm)) == eslOK) /* <abc> gets set on first read */ { /* read the counterpart HMM from <ahfp> */ status = p7_hmmfile_Read(ahfp, &abc, &ahmm); if (status == eslEFORMAT) p7_Fail("Bad file format in HMM file %s:\n%s\n", ahfp->fname, ahfp->errbuf); else if (status == eslEINCOMPAT) p7_Fail("HMM in %s is not in the expected %s alphabet\n", ahfp->fname, esl_abc_DecodeType(abc->type)); else if (status == eslEOF) p7_Fail("Empty HMM file %s? No HMM data found.\n", ahfp->fname); else if (status != eslOK) p7_Fail("Unexpected error in reading HMMs from %s\n", ahfp->fname); /* try to validate that they're the "same" */ if (ahmm->M != ghmm->M || strcmp(ahmm->name, ghmm->name) != 0) p7_Fail("<gen-hmmfile>, <ali-hmmfile> contain different set or order of models"); /* deferred one-time creation of structures that need to know the alphabet */ if (!bg) bg = p7_bg_Create(abc); if (!sq) sq = esl_sq_CreateDigital(abc); ggm = p7_profile_Create(ghmm->M, abc); agm = p7_profile_Create(ahmm->M, abc); aom = p7_oprofile_Create(ahmm->M, abc, hw->simd); p7_profile_ConfigCustom(ggm, ghmm, bg, esl_opt_GetInteger(go, "--gL"), esl_opt_GetReal(go, "--gnj"), esl_opt_GetReal(go, "--gpglocal")); p7_profile_ConfigCustom(agm, ahmm, bg, 100, esl_opt_GetReal(go, "--anj"), esl_opt_GetReal(go, "--apglocal")); p7_oprofile_Convert(agm, aom); for (idx = 1; idx <= N; idx++) { p7_ProfileEmit(rng, ghmm, ggm, bg, sq, reftr); if (esl_opt_GetBoolean(go, "--dumpseqs")) { esl_sq_FormatName(sq, "seq%d", idx); esl_sqio_Write(stdout, sq, eslSQFILE_FASTA, FALSE); } p7_bg_SetLength(bg, sq->n); p7_profile_SetLength(agm, sq->n); p7_sparsemask_Reinit(sm, agm->M, sq->n); p7_sparsemask_AddAll(sm); if (esl_opt_GetBoolean(go, "--vit")) p7_ReferenceViterbi(sq->dsq, sq->n, agm, rmx, testtr, /*opt_vsc=*/NULL); else p7_SparseViterbi (sq->dsq, sq->n, agm, sm, 
sxv, testtr, /*opt_vsc=*/NULL); p7_trace_metrics(reftr, testtr, tmetrics); p7_sparsemask_Reuse(sm); p7_sparsemx_Reuse(sxv); //p7_filtermx_Reuse(ox); p7_refmx_Reuse(rmx); esl_sq_Reuse(sq); p7_trace_Reuse(reftr); p7_trace_Reuse(testtr); } p7_oprofile_Destroy(aom); p7_profile_Destroy(ggm); p7_profile_Destroy(agm); p7_hmm_Destroy(ghmm); p7_hmm_Destroy(ahmm); } /* we leave the loop with <status> set by a p7_hmmfile_Read() on ghfp; if all is well, status=eslEOF */ if (status == eslEFORMAT) p7_Fail("Bad file format in HMM file %s:\n%s\n", ghfp->fname, ghfp->errbuf); else if (status == eslEINCOMPAT) p7_Fail("HMM in %s is not in the expected %s alphabet\n", ghfp->fname, esl_abc_DecodeType(abc->type)); else if (status != eslEOF) p7_Fail("Unexpected error in reading HMMs from %s\n", ghfp->fname); p7_trace_metrics_Dump(stdout, tmetrics); p7_hmmfile_Close(ghfp); p7_hmmfile_Close(ahfp); // p7_filtermx_Destroy(ox); p7_sparsemask_Destroy(sm); p7_sparsemx_Destroy(sxv); p7_refmx_Destroy(rmx); p7_trace_metrics_Destroy(tmetrics); p7_trace_Destroy(testtr); p7_trace_Destroy(reftr); p7_bg_Destroy(bg); esl_alphabet_Destroy(abc); esl_randomness_Destroy(rng); esl_getopts_Destroy(go); }
int main(int argc, char **argv) { ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage); char *hmmfile = esl_opt_GetArg(go, 1); int N = esl_opt_GetInteger(go, "-N"); ESL_STOPWATCH *w = esl_stopwatch_Create(); ESL_RANDOMNESS *r = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s")); ESL_ALPHABET *abc = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; P7_PROFILE *gm = NULL; P7_OPROFILE *om = NULL; P7_TRACE *tr = NULL; ESL_SQ *sq = NULL; P7_ALIDISPLAY *ad = NULL; int i,z; if (p7_hmmfile_Open(hmmfile, NULL, &hfp) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile); if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM"); p7_hmmfile_Close(hfp); bg = p7_bg_Create(abc); p7_bg_SetLength(bg, 0); gm = p7_profile_Create(hmm->M, abc); p7_ProfileConfig(hmm, bg, gm, 0, p7_UNIGLOCAL); /* that sets N,C,J to generate nothing */ om = p7_oprofile_Create(gm->M, abc); p7_oprofile_Convert(gm, om); if (esl_opt_GetBoolean(go, "-p")) tr = p7_trace_CreateWithPP(); else tr = p7_trace_Create(); sq = esl_sq_CreateDigital(abc); esl_stopwatch_Start(w); for (i = 0; i < N; i++) { p7_ProfileEmit(r, hmm, gm, bg, sq, tr); esl_sq_SetName(sq, "random"); if (! esl_opt_GetBoolean(go, "-b")) { if (esl_opt_GetBoolean(go, "-p")) for (z = 0; z < tr->N; z++) if (tr->i[z] > 0) tr->pp[z] = esl_random(r); ad = p7_alidisplay_Create(tr, 0, om, sq); p7_alidisplay_Print(stdout, ad, 40, 80, FALSE); p7_alidisplay_Destroy(ad); } p7_trace_Reuse(tr); esl_sq_Reuse(sq); } esl_stopwatch_Stop(w); esl_stopwatch_Display(stdout, w, "# CPU time: "); esl_sq_Destroy(sq); p7_trace_Destroy(tr); p7_oprofile_Destroy(om); p7_profile_Destroy(gm); p7_bg_Destroy(bg); p7_hmm_Destroy(hmm); esl_alphabet_Destroy(abc); esl_randomness_Destroy(r); esl_stopwatch_Destroy(w); esl_getopts_Destroy(go); return 0; }
/* Stochastic traceback example driver.
 *
 * Reads one HMM (argument 1) and one sequence (argument 2), configures
 * a p7_LOCAL profile for the sequence length, runs Viterbi and Forward,
 * then samples -N stochastic tracebacks from the Forward matrix. Each
 * sampled trace is scored, optionally dumped (-t), validated, and its
 * score printed. Forward and Viterbi scores are printed last.
 * -p dumps the optimized profile; -m turns on matrix dumping.
 */
int
main(int argc, char **argv)
{
  ESL_GETOPTS    *go      = esl_getopts_CreateDefaultApp(options, 2, argc, argv, banner, usage);
  char           *hmmfile = esl_opt_GetArg(go, 1);
  char           *seqfile = esl_opt_GetArg(go, 2);
  ESL_ALPHABET   *abc     = NULL;
  ESL_RANDOMNESS *rng     = esl_randomness_CreateFast(0);
  P7_HMMFILE     *hfp     = NULL;
  P7_HMM         *hmm     = NULL;
  P7_BG          *bg      = NULL;
  P7_PROFILE     *gm      = NULL;
  P7_OPROFILE    *om      = NULL;
  P7_GMX         *vitmx   = NULL;
  P7_OMX         *fwdmx   = NULL;
  P7_TRACE       *trace   = NULL;
  ESL_SQ         *sq      = NULL;
  ESL_SQFILE     *sqfp    = NULL;
  int             format  = eslSQFILE_UNKNOWN;
  int             N       = esl_opt_GetInteger(go, "-N");
  int             n;
  float           vitsc;
  float           fwdsc;
  float           samplesc;
  char            errbuf[eslERRBUFSIZE];
  int             status;

  /* Read in one HMM */
  if (p7_hmmfile_Open(hmmfile, NULL, &hfp) != eslOK)
    p7_Fail("Failed to open HMM file %s", hmmfile);
  if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK)
    p7_Fail("Failed to read HMM");

  /* Read in one sequence */
  sq     = esl_sq_CreateDigital(abc);
  status = esl_sqfile_Open(seqfile, format, NULL, &sqfp);
  switch (status)
    {
    case eslOK:                                                   break;
    case eslENOTFOUND: p7_Fail("No such file.");                  break;
    case eslEFORMAT:   p7_Fail("Format unrecognized.");           break;
    case eslEINVAL:    p7_Fail("Can't autodetect stdin or .gz."); break;
    default:           p7_Fail("Open failed, code %d.", status);  break;
    }
  if (esl_sqio_Read(sqfp, sq) != eslOK)
    p7_Fail("Failed to read sequence");

  /* create default null model, then create and optimize profile */
  bg = p7_bg_Create(abc);
  p7_bg_SetLength(bg, sq->n);

  gm = p7_profile_Create(hmm->M, abc);
  p7_ProfileConfig(hmm, bg, gm, sq->n, p7_LOCAL);

  om = p7_oprofile_Create(gm->M, abc);
  p7_oprofile_Convert(gm, om);

  if (esl_opt_GetBoolean(go, "-p"))
    p7_oprofile_Dump(stdout, om);

  fwdmx = p7_omx_Create(gm->M, sq->n, sq->n);
  vitmx = p7_gmx_Create(gm->M, sq->n);
  trace = p7_trace_Create();

  if (esl_opt_GetBoolean(go, "-m") == TRUE)
    p7_omx_SetDumpMode(stdout, fwdmx, TRUE);

  p7_GViterbi(sq->dsq, sq->n, gm, vitmx, &vitsc);
  p7_Forward (sq->dsq, sq->n, om, fwdmx, &fwdsc);

  n = 0;
  while (n < N)
    {
      p7_StochasticTrace(rng, sq->dsq, sq->n, om, fwdmx, trace);
      p7_trace_Score(trace, sq->dsq, gm, &samplesc);

      if (esl_opt_GetBoolean(go, "-t") == TRUE)
        p7_trace_Dump(stdout, trace, gm, sq->dsq);

      if (p7_trace_Validate(trace, abc, sq->dsq, errbuf) != eslOK)
        p7_Die("trace %d fails validation:\n%s\n", n, errbuf);

      printf("Sampled trace:  %.4f nats\n", samplesc);
      p7_trace_Reuse(trace);
      n++;
    }
  printf("Forward score:  %.4f nats\n", fwdsc);
  printf("Viterbi score:  %.4f nats\n", vitsc);

  /* cleanup */
  esl_sq_Destroy(sq);
  esl_sqfile_Close(sqfp);
  p7_trace_Destroy(trace);
  p7_omx_Destroy(fwdmx);
  p7_gmx_Destroy(vitmx);
  p7_oprofile_Destroy(om);
  p7_profile_Destroy(gm);
  p7_bg_Destroy(bg);
  p7_hmm_Destroy(hmm);
  p7_hmmfile_Close(hfp);
  esl_randomness_Destroy(rng);
  esl_alphabet_Destroy(abc);
  esl_getopts_Destroy(go);
  return 0;
}
/* glocal_region_trace_ensemble()
 * EPN, Tue Oct  5 10:13:25 2010
 *
 * Based on p7_domaindef.c's region_trace_ensemble(). Modified so that
 * generic matrices (which can be used for glocally configured models)
 * can be used. An additional parameter <do_null2> has been added,
 * so that null2-related calculations are only done if necessary.
 * That is, they're skipped if null2 has been turned off in the pipeline.
 *
 * Summary of what the code below does:
 *   1. zeroes <ddef->n2sc[ireg..jreg]>;
 *   2. if <ddef->do_reseeding> is set, reinitializes <ddef->r> with its
 *      own seed;
 *   3. samples <ddef->nsamples> traces over the region with
 *      p7_GStochasticTrace(), adding each domain of each trace to
 *      <ddef->sp> (in full-sequence coordinates) and, if <do_null2>,
 *      accumulating per-residue null2 ratios in <ddef->n2sc>;
 *   4. if <do_null2>, converts the accumulated ratios to logs of their
 *      per-sample averages;
 *   5. clusters the ensemble, then drops "dominated" clusters from
 *      <ddef->sp->sigc> in place.
 *
 * Returns <eslOK>; the number of surviving clusters is left in both
 * <ddef->sp->nc> and <*ret_nc>. Return codes of the functions called
 * here are not checked.
 *
 * Notes from p7_domaindef.c::region_trace_ensemble(). These were written
 * for the optimized-profile version, so they refer to <om> where this
 * function takes the generic profile <gm>:
 *~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 * SRE, Fri Feb  8 11:49:44 2008 [Janelia]
 *
 * Here, we've decided that region <ireg>..<jreg> in sequence <dsq> might be
 * composed of more than one domain, and we're going to use clustering
 * of a posterior ensemble of stochastic tracebacks to sort it out.
 *
 * Caller provides a filled Forward matrix in <fwd> for the sequence
 * region <dsq+ireg-1>, length <jreg-ireg+1>, for the model <om>
 * configured in multihit mode with its target length distribution
 * set to the total length of <dsq>: i.e., the same model
 * configuration used to score the complete sequence (if it weren't
 * multihit, we wouldn't be worried about multiple domains).
 *
 * Caller also provides a DP matrix in <wrk> containing at least one
 * row, for use as temporary workspace. (This will typically be the
 * caller's Backwards matrix, which we haven't yet used at this point
 * in the processing pipeline.)
 *
 * Caller provides <ddef>, which defines heuristic parameters that
 * control the clustering, and provides working space for the
 * calculation and the answers. The <ddef->sp> object must have been
 * reused (i.e., it needs to be fresh; we're going to use it here);
 * the caller needs to Reuse() it specifically, because it can't just
 * Reuse() the whole <ddef>, when it's in the process of analyzing
 * regions.
 *
 * Upon return, <*ret_nc> contains the number of clusters that were
 * defined.
 *
 * The caller can retrieve info on each cluster by calling
 * <p7_spensemble_GetClusterCoords(ddef->sp...)> on the
 * <P7_SPENSEMBLE> object in <ddef>.
 *
 * Other information on what's happened in working memory:
 *
 * <ddef->n2sc[ireg..jreg]> now contains log f'(x_i) / f(x_i) null2 scores
 *    for each residue.
 *
 * <ddef->sp> gets filled in, and upon return, it's holding the answers
 *    (the cluster definitions). When the caller is done retrieving those
 *    answers, it needs to <esl_spensemble_Reuse()> it before calling
 *    <region_trace_ensemble()> again.
 *    NOTE(review): the other ensemble calls here use the p7_spensemble_
 *    prefix, so "esl_spensemble_Reuse" is presumably a typo - confirm.
 *
 * <ddef->tr> is used as working memory for sampled traces.
 *
 * <wrk> has had its zero row clobbered as working space for a null2 calculation.
 *~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
static int
glocal_region_trace_ensemble(P7_DOMAINDEF *ddef, const P7_PROFILE *gm, const ESL_DSQ *dsq, int ireg, int jreg,
                             const P7_GMX *fwd, P7_GMX *wrk, int do_null2, int *ret_nc)
{
  int    Lr  = jreg-ireg+1;    /* length of the region */
  int    t, d, d2;             /* t: sample index; d, d2: domain / cluster indices */
  int    nov, n;               /* nov: overlap of two clusters; n: length of the shorter one */
  int    nc;                   /* number of clusters reported by p7_spensemble_Cluster() */
  int    pos;                  /* position in the region, 1..Lr; absolute only in the log conversion loop */
  float  null2[p7_MAXCODE];    /* per-residue null2 ratios for one domain of one trace */

  esl_vec_FSet(ddef->n2sc+ireg, Lr, 0.0); /* zero the null2 scores in region */

  /* By default, we make results reproducible by forcing a reset of
   * the RNG to its originally seeded state.
   */
  if (ddef->do_reseeding)
    esl_randomness_Init(ddef->r, esl_randomness_GetSeed(ddef->r));

  /* Collect an ensemble of sampled traces; calculate null2 odds ratios from these if nec */
  for (t = 0; t < ddef->nsamples; t++)
    {
      /* Trace coordinates are relative to the region; +ireg-1 below maps them back to <dsq>. */
      p7_GStochasticTrace(ddef->r, dsq+ireg-1, Lr, gm, fwd, ddef->tr);
      p7_trace_Index(ddef->tr);

      /* <pos> is a cursor that only moves forward across all domains of this trace. */
      pos = 1;
      for (d = 0; d < ddef->tr->ndom; d++)
        {
          p7_spensemble_Add(ddef->sp, t, ddef->tr->sqfrom[d]+ireg-1, ddef->tr->sqto[d]+ireg-1, ddef->tr->hmmfrom[d], ddef->tr->hmmto[d]);

          if(do_null2) {
            p7_GNull2_ByTrace(gm, ddef->tr, ddef->tr->tfrom[d], ddef->tr->tto[d], wrk, null2);

            /* residues outside domains get bumped +1: because f'(x) = f(x), so f'(x)/f(x) = 1 in these segments */
            /* NOTE(review): the <= bound also gives position sqfrom[d] itself +1 rather than its null2 ratio - confirm intended. */
            for (; pos <= ddef->tr->sqfrom[d]; pos++) ddef->n2sc[ireg+pos-1] += 1.0;

            /* Residues inside domains get bumped by their null2 ratio */
            for (; pos <= ddef->tr->sqto[d];   pos++) ddef->n2sc[ireg+pos-1] += null2[dsq[ireg+pos-1]];
          }
        }
      if(do_null2) {
        /* the remaining residues in the region outside any domains get +1 */
        for (; pos <= Lr; pos++)  ddef->n2sc[ireg+pos-1] += 1.0;
      }
      p7_trace_Reuse(ddef->tr);
    }

  /* Convert the accumulated n2sc[] ratios in this region to log odds null2 scores on each residue. */
  if(do_null2) {
    for (pos = ireg; pos <= jreg; pos++)
      ddef->n2sc[pos] = logf(ddef->n2sc[pos] / (float) ddef->nsamples);
  }

  /* Cluster the ensemble of traces to break region into envelopes. */
  p7_spensemble_Cluster(ddef->sp, ddef->min_overlap, ddef->of_smaller, ddef->max_diagdiff, ddef->min_posterior, ddef->min_endpointp, &nc);

  /* A little hacky now. Remove "dominated" domains relative to seq coords. */
  for (d = 0; d < nc; d++)
    ddef->sp->assignment[d] = 0; /* overload <assignment> to flag that a domain is dominated */

  /* who dominates who? (by post prob) */
  for (d = 0; d < nc; d++)
    {
      for (d2 = d+1; d2 < nc; d2++)
        {
          /* nov: shared residues of clusters d and d2 (<= 0 if they do not overlap) */
          nov = ESL_MIN(ddef->sp->sigc[d].j, ddef->sp->sigc[d2].j) - ESL_MAX(ddef->sp->sigc[d].i, ddef->sp->sigc[d2].i) + 1;
          /* NOTE(review): only an exactly-zero overlap ends the inner loop; a negative one falls through and fails the 0.8 test below. */
          if (nov == 0) break;
          n   = ESL_MIN(ddef->sp->sigc[d].j - ddef->sp->sigc[d].i + 1,  ddef->sp->sigc[d2].j - ddef->sp->sigc[d2].i + 1);
          if ((float) nov / (float) n >= 0.8) /* overlap */
            {
              /* the cluster with the lower prob is flagged; on a tie, d is flagged */
              if (ddef->sp->sigc[d].prob > ddef->sp->sigc[d2].prob) ddef->sp->assignment[d2] = 1;
              else                                                  ddef->sp->assignment[d]  = 1;
            }
        }
    }

  /* shrink the sigc list, removing dominated domains */
  d = 0;  /* from here on, d is the write index and d2 the read index */
  for (d2 = 0; d2 < nc; d2++)
    {
      if (ddef->sp->assignment[d2]) continue; /* skip domain d2, it's dominated. */
      if (d != d2) memcpy(ddef->sp->sigc + d, ddef->sp->sigc + d2, sizeof(struct p7_spcoord_s));
      d++;
    }
  ddef->sp->nc = d;
  *ret_nc = d;
  return eslOK;
}
int main(int argc, char **argv) { ESL_GETOPTS *go = p7_CreateDefaultApp(options, 1, argc, argv, banner, usage); char *hmmfile = esl_opt_GetArg(go, 1); ESL_STOPWATCH *w = esl_stopwatch_Create(); ESL_RANDOMNESS *r = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s")); ESL_ALPHABET *abc = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; P7_PROFILE *gm = NULL; P7_OPROFILE *om = NULL; P7_OMX *ox1 = NULL; P7_OMX *ox2 = NULL; int L = esl_opt_GetInteger(go, "-L"); int N = esl_opt_GetInteger(go, "-N"); ESL_DSQ *dsq = malloc(sizeof(ESL_DSQ) * (L+2)); float null2[p7_MAXCODE]; int i,j,d,pos; int nsamples = 200; float fsc, bsc; double Mcs; if (p7_hmmfile_OpenE(hmmfile, NULL, &hfp, NULL) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile); if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM"); bg = p7_bg_Create(abc); p7_bg_SetLength(bg, L); gm = p7_profile_Create(hmm->M, abc); p7_ProfileConfig(hmm, bg, gm, L, p7_LOCAL); om = p7_oprofile_Create(gm->M, abc); p7_oprofile_Convert(gm, om); p7_oprofile_ReconfigLength(om, L); ox1 = p7_omx_Create(gm->M, L, L); ox2 = p7_omx_Create(gm->M, L, L); esl_rsq_xfIID(r, bg->f, abc->K, L, dsq); p7_Forward (dsq, L, om, ox1, &fsc); if (esl_opt_GetBoolean(go, "-t")) { P7_TRACE *tr = p7_trace_Create(); float *n2sc = malloc(sizeof(float) * (L+1)); esl_stopwatch_Start(w); for (i = 0; i < N; i++) { /* This is approximately what p7_domaindef.c::region_trace_ensemble() is doing: */ for (j = 0; j < nsamples; j++) { p7_StochasticTrace(r, dsq, L, om, ox1, tr); p7_trace_Index(tr); pos = 1; for (d = 0; d < tr->ndom; d++) { p7_Null2_ByTrace(om, tr, tr->tfrom[d], tr->tto[d], ox2, null2); for (; pos <= tr->sqfrom[d]; pos++) n2sc[pos] += 1.0; for (; pos < tr->sqto[d]; pos++) n2sc[pos] += null2[dsq[pos]]; } for (; pos <= L; pos++) n2sc[pos] += 1.0; p7_trace_Reuse(tr); } for (pos = 1; pos <= L; pos++) n2sc[pos] = logf(n2sc[pos] / nsamples); } esl_stopwatch_Stop(w); free(n2sc); p7_trace_Destroy(tr); } else { 
p7_Backward(dsq, L, om, ox1, ox2, &bsc); p7_Decoding(om, ox1, ox2, ox2); esl_stopwatch_Start(w); for (i = 0; i < N; i++) p7_Null2_ByExpectation(om, ox2, null2); esl_stopwatch_Stop(w); } Mcs = (double) N * (double) L * (double) gm->M * 1e-6 / (double) w->user; esl_stopwatch_Display(stdout, w, "# CPU time: "); printf("# M = %d\n", gm->M); printf("# %.1f Mc/s\n", Mcs); free(dsq); p7_omx_Destroy(ox1); p7_omx_Destroy(ox2); p7_oprofile_Destroy(om); p7_profile_Destroy(gm); p7_bg_Destroy(bg); p7_hmm_Destroy(hmm); p7_hmmfile_Close(hfp); esl_alphabet_Destroy(abc); esl_stopwatch_Destroy(w); esl_randomness_Destroy(r); esl_getopts_Destroy(go); return 0; }
int main(int argc, char **argv) { ESL_GETOPTS *go = p7_CreateDefaultApp(options, 1, argc, argv, banner, usage); char *hmmfile = esl_opt_GetArg(go, 1); ESL_STOPWATCH *w = esl_stopwatch_Create(); ESL_RANDOMNESS *r = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s")); ESL_ALPHABET *abc = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; P7_PROFILE *gm = NULL; P7_GMX *gx1 = NULL; P7_GMX *gx2 = NULL; P7_TRACE *tr = NULL; int L = esl_opt_GetInteger(go, "-L"); int N = esl_opt_GetInteger(go, "-N"); ESL_DSQ *dsq = malloc(sizeof(ESL_DSQ) * (L+2)); int i; float fsc, bsc, accscore; double Mcs; p7_FLogsumInit(); if (p7_hmmfile_OpenE(hmmfile, NULL, &hfp, NULL) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile); if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM"); bg = p7_bg_Create(abc); p7_bg_SetLength(bg, L); gm = p7_profile_Create(hmm->M, abc); p7_ProfileConfig(hmm, bg, gm, L, p7_UNILOCAL); gx1 = p7_gmx_Create(gm->M, L); gx2 = p7_gmx_Create(gm->M, L); tr = p7_trace_CreateWithPP(); esl_rsq_xfIID(r, bg->f, abc->K, L, dsq); p7_GForward (dsq, L, gm, gx1, &fsc); p7_GBackward(dsq, L, gm, gx2, &bsc); p7_GDecoding(gm, gx1, gx2, gx2); /* <gx2> is now the posterior decoding matrix */ esl_stopwatch_Start(w); for (i = 0; i < N; i++) { p7_GOptimalAccuracy(gm, gx2, gx1, &accscore); /* <gx1> is now the OA matrix */ if (! esl_opt_GetBoolean(go, "--notrace")) { p7_GOATrace(gm, gx2, gx1, tr); p7_trace_Reuse(tr); } } esl_stopwatch_Stop(w); Mcs = (double) N * (double) L * (double) gm->M * 1e-6 / w->user; esl_stopwatch_Display(stdout, w, "# CPU time: "); printf("# M = %d\n", gm->M); printf("# %.1f Mc/s\n", Mcs); free(dsq); p7_trace_Destroy(tr); p7_gmx_Destroy(gx1); p7_gmx_Destroy(gx2); p7_profile_Destroy(gm); p7_bg_Destroy(bg); p7_hmm_Destroy(hmm); p7_hmmfile_Close(hfp); esl_alphabet_Destroy(abc); esl_stopwatch_Destroy(w); esl_randomness_Destroy(r); esl_getopts_Destroy(go); return 0; }
/* Function: p7_ProfileEmit() * Synopsis: Sample a sequence from the search form of the model. * Incept: SRE, Mon Jan 22 10:23:28 2007 [Janelia] * * Purpose: Sample a sequence from the implicit * probabilistic model of a Plan7 profile <gm>. This * requires also having the core probabilities of * the accompanying <hmm>, and the background * frequencies of null1 model <bg>. * * Optionally return the sequence and/or its trace in <sq> * and <tr>, respectively. Caller has allocated space for * both of these, though they may get reallocated/grown * here. Either can be passed as <NULL> if unneeded. * * Only the sequence field is set in the <sq>. Caller must * set the name, plus any other fields it wants to set. If * the <sq> was created in digital mode, this is the <sq->dsq>; * if the <sq> was created in text mode, this is <sq->seq>. * * <p7_ProfileEmit()> deliberately uses an <ESL_SQ> object * instead of a plain <ESL_DSQ *> or <char *> string, to * take advantage of the object's support for dynamic * reallocation of seq length, and to allow both digital and * text mode generation. 
* * Args: r - source of randomness * hmm - core probabilities of the profile * gm - configured search profile * sq - optRETURN: sampled sequence * tr - optRETURN: sampled trace * * Throws: (no abnormal error conditions) */ int p7_ProfileEmit(ESL_RANDOMNESS *r, const P7_HMM *hmm, const P7_PROFILE *gm, const P7_BG *bg, ESL_SQ *sq, P7_TRACE *tr) { char prv, st; /* prev, current state type */ int k = 0; /* position in model nodes 1..M */ int i = 0; /* position in sequence 1..L */ int x; /* sampled residue */ int kend = hmm->M; /* predestined end node */ int status; float xt[p7P_NXSTATES][p7P_NXTRANS]; /* Backcalculate the probabilities in the special states (loop and length model) */ for (i = 0; i < p7P_NXSTATES; i++) for (x = 0; x < p7P_NXTRANS; x++) xt[i][x] = exp(gm->xsc[i][x]); if (sq != NULL) esl_sq_Reuse(sq); if (tr != NULL) { if ((status = p7_trace_Reuse(tr)) != eslOK) goto ERROR; if ((status = p7_trace_Append(tr, p7T_S, k, i)) != eslOK) goto ERROR; if ((status = p7_trace_Append(tr, p7T_N, k, i)) != eslOK) goto ERROR; } st = p7T_N; i = 0; while (st != p7T_T) { /* Sample a state transition. After this section, prv and st (prev->current state) are set; * k also gets set if we make a B->Mk entry transition. */ prv = st; switch (st) { case p7T_B: if (p7_profile_IsLocal(gm)) { /* local mode: enter the implicit profile: choose our entry and our predestined exit */ if ((status = sample_endpoints(r, gm, &k, &kend)) != eslOK) goto ERROR; st = p7T_M; /* must be, because left wing is retracted */ } else { /* glocal mode: treat B as M_0, use its transitions to MID. */ /* FIXME: this is wrong. It should sample from B->Mk distribution! 
*/ switch (esl_rnd_FChoose(r, P7H_TMAT(hmm, 0), p7H_NTMAT)) { case 0: st = p7T_M; k = 1; break; case 1: st = p7T_I; k = 0; break; case 2: st = p7T_D; k = 1; break; default: ESL_XEXCEPTION(eslEINCONCEIVABLE, "impossible."); } } break; case p7T_M: if (k == kend) st = p7T_E; /* check our preordained fate */ else { switch (esl_rnd_FChoose(r, P7H_TMAT(hmm, k), p7H_NTMAT)) { case 0: st = p7T_M; break; case 1: st = p7T_I; break; case 2: st = p7T_D; break; default: ESL_XEXCEPTION(eslEINCONCEIVABLE, "impossible."); } } break; case p7T_D: if (k == kend) st = p7T_E; else st = (esl_rnd_FChoose(r, P7H_TDEL(hmm, k), p7H_NTDEL) == 0) ? p7T_M : p7T_D; break; case p7T_I: st = (esl_rnd_FChoose(r, P7H_TINS(hmm, k), p7H_NTINS) == 0) ? p7T_M : p7T_I; break; case p7T_N: st = (esl_rnd_FChoose(r, xt[p7P_N], p7P_NXTRANS) == p7P_MOVE) ? p7T_B : p7T_N; break; case p7T_E: st = (esl_rnd_FChoose(r, xt[p7P_E], p7P_NXTRANS) == p7P_MOVE) ? p7T_C : p7T_J; break; case p7T_C: st = (esl_rnd_FChoose(r, xt[p7P_C], p7P_NXTRANS) == p7P_MOVE) ? p7T_T : p7T_C; break; case p7T_J: st = (esl_rnd_FChoose(r, xt[p7P_J], p7P_NXTRANS) == p7P_MOVE) ? p7T_B : p7T_J; break; default: ESL_XEXCEPTION(eslECORRUPT, "impossible state reached during emission"); } /* Based on the transition we just sampled, update k. */ if (st == p7T_E) k = 0; else if (st == p7T_M && prv != p7T_B) k++; /* be careful about B->Mk, where we already set k */ else if (st == p7T_D) k++; /* Based on the transition we just sampled, generate a residue. */ if (st == p7T_M) x = esl_rnd_FChoose(r, hmm->mat[k], hmm->abc->K); else if (st == p7T_I) x = esl_rnd_FChoose(r, hmm->ins[k], hmm->abc->K); else if ((st == p7T_N || st == p7T_C || st == p7T_J) && prv==st) x = esl_rnd_FChoose(r, bg->f, hmm->abc->K); else x = eslDSQ_SENTINEL; if (x != eslDSQ_SENTINEL) i++; /* Add residue (if any) to sequence */ if (sq != NULL && x != eslDSQ_SENTINEL && (status = esl_sq_XAddResidue(sq, x)) != eslOK) goto ERROR; /* Add state to trace. 
*/ if (tr != NULL) { if ((status = p7_trace_Append(tr, st, k, i)) != eslOK) goto ERROR; } } /* Terminate the trace and sequence (both are optional, remember) */ if (tr != NULL) { tr->M = hmm->M; tr->L = i; } if (sq != NULL && (status = esl_sq_XAddResidue(sq, eslDSQ_SENTINEL)) != eslOK) goto ERROR; return eslOK; ERROR: return status; }
/* utest_generation()
 *
 * "Generation" unit test: sample one profile, emit <N> sequences from
 * it, push each through the reference envelope-definition pipeline,
 * and sanity check what comes out.
 *
 * The true envelopes are unknown, so this is not a stringent test; it
 * mostly catches crashes and obviously incoherent results. Checks:
 *
 *   1. Per-domain coordinates are coherent:
 *         1 <= oa <= ia <= i0 <= ib <= ob <= L
 *
 *   2. Envelopes do not overlap (default threshold of 0.5 assumed):
 *         ia(d) > ib(d-1)
 *      (Outer envelopes are allowed to overlap.)
 *
 *   3. envsc(d) <= asc_sc <= fwdsc   (within <slop>)
 *
 *   4. When both the generated trace and the inferred envelope set
 *      have exactly one domain, and the generated domain lies inside
 *      the outer envelope, then envsc >= generated trace score.
 *
 * Any failure is fatal via esl_fatal().
 */
static void
utest_generation(ESL_RANDOMNESS *rng, int M, const ESL_ALPHABET *abc, int N)
{
  char           msg[]   = "reference_envelopes:: generation unit test failed";
  ESL_SQ        *seq     = esl_sq_CreateDigital(abc);
  P7_BG         *bgm     = p7_bg_Create(abc);
  P7_HMM        *model   = NULL;
  P7_PROFILE    *prof    = p7_profile_Create(M, abc);
  P7_TRACE      *emit_tr = p7_trace_Create();        /* trace of the emitted sequence   */
  P7_TRACE      *vit_tr  = p7_trace_Create();        /* Viterbi trace                   */
  P7_REFMX      *mx_fwd  = p7_refmx_Create(M, 20);   /* Viterbi, then Forward           */
  P7_REFMX      *mx_bck  = p7_refmx_Create(M, 20);   /* Backward, then Decoding         */
  P7_REFMX      *asc_fu  = p7_refmx_Create(M, 20);   /* ASC Forward, UP                 */
  P7_REFMX      *asc_fd  = p7_refmx_Create(M, 20);   /* ASC Forward, DOWN               */
  P7_REFMX      *asc_pu  = mx_fwd;                   /* alias: ASC Bck/Decoding UP      */
  P7_REFMX      *asc_pd  = mx_bck;                   /* alias: ASC Bck/Decoding DOWN    */
  float         *scratch = NULL;
  P7_ANCHORS    *anchors = p7_anchors_Create();
  P7_ANCHORHASH *hash    = p7_anchorhash_Create();
  P7_ENVELOPES  *envs    = p7_envelopes_Create();
  float          slop    = 0.001;
  float          gen_sc, fwd_sc, asc_sc;
  int            trial;
  int            dom;

  if ( p7_modelsample(rng, M, abc, &model)  != eslOK) esl_fatal(msg);
  if ( p7_profile_Config(prof, model, bgm)  != eslOK) esl_fatal(msg);

  trial = 0;
  while (trial < N)
    {
      /* Emit under a length model of <M>; resample until the sequence
       * is no longer than 6M (an arbitrary cap to keep things small).
       */
      if ( p7_profile_SetLength(prof, M) != eslOK) esl_fatal(msg);
      for (;;)
        {
          esl_sq_Reuse(seq);
          if (p7_ProfileEmit(rng, model, prof, bgm, seq, emit_tr) != eslOK) esl_fatal(msg);
          if (! (seq->n > M * 6)) break;
        }
      if (p7_trace_Index(emit_tr)                         != eslOK) esl_fatal(msg);
      if (p7_trace_Score(emit_tr, seq->dsq, prof, &gen_sc) != eslOK) esl_fatal(msg);

      /* Switch the length model to the actual sequence length before analysis. */
      if ( p7_profile_SetLength(prof, seq->n) != eslOK) esl_fatal(msg);

      /* First pass: Viterbi, Forward, Backward, Decoding. */
      if ( p7_ReferenceViterbi (seq->dsq, seq->n, prof, mx_fwd, vit_tr, NULL)  != eslOK) esl_fatal(msg);
      if ( p7_ReferenceForward (seq->dsq, seq->n, prof, mx_fwd, &fwd_sc)       != eslOK) esl_fatal(msg);
      if ( p7_ReferenceBackward(seq->dsq, seq->n, prof, mx_bck, NULL)          != eslOK) esl_fatal(msg);
      if ( p7_ReferenceDecoding(seq->dsq, seq->n, prof, mx_fwd, mx_bck, mx_bck) != eslOK) esl_fatal(msg);

      /* Anchor set determination (MPAS algorithm). */
      if ( p7_reference_Anchors(rng, seq->dsq, seq->n, prof, mx_fwd, mx_bck, vit_tr,
                                &scratch, hash, asc_fu, asc_fd, anchors, &asc_sc,
                                NULL, NULL) != eslOK) esl_fatal(msg);

      /* Recycle the first-pass matrices (under their alias names) for
       * ASC Backward and ASC Decoding.
       */
      p7_refmx_Reuse(asc_pu);
      p7_refmx_Reuse(asc_pd);
      if ( p7_ReferenceASCBackward(seq->dsq, seq->n, prof, anchors->a, anchors->D,
                                   asc_pu, asc_pd, NULL) != eslOK) esl_fatal(msg);
      if ( p7_ReferenceASCDecoding(seq->dsq, seq->n, prof, anchors->a, anchors->D,
                                   asc_fu, asc_fd, asc_pu, asc_pd, asc_pu, asc_pd) != eslOK) esl_fatal(msg);

      /* Envelope calculation. */
      if ( p7_reference_Envelopes(seq->dsq, seq->n, prof, anchors->a, anchors->D,
                                  asc_pu, asc_pd, asc_fu, asc_fd, envs) != eslOK) esl_fatal(msg);

      /* Test 1: one envelope per anchor, each with coherent coordinates. */
      if (anchors->D != envs->D) esl_fatal(msg);
      for (dom = 1; dom <= anchors->D; dom++)
        {
          if (1                 <= envs->arr[dom].oa &&
              envs->arr[dom].oa <= envs->arr[dom].ia &&
              envs->arr[dom].ia <= envs->arr[dom].i0 &&
              envs->arr[dom].i0 <= envs->arr[dom].ib &&
              envs->arr[dom].ib <= envs->arr[dom].ob &&
              envs->arr[dom].ob <= seq->n)
            continue;
          esl_fatal(msg);
        }

      /* Test 2: each envelope starts after the previous array entry ends. */
      for (dom = 1; dom <= anchors->D; dom++)
        {
          if (envs->arr[dom].ia > envs->arr[dom-1].ib) continue;
          esl_fatal(msg);
        }

      /* Test 3: envsc(d) <= asc_sc <= fwdsc, within tolerance. */
      for (dom = 1; dom <= anchors->D; dom++)
        {
          if (envs->arr[dom].env_sc <= asc_sc+slop && asc_sc <= fwd_sc+slop) continue;
          esl_fatal(msg);
        }

      /* Test 4: single-domain case only, and only if the generated
       * domain sits inside the outer envelope. Domains are numbered
       * 0..D-1 in the trace index but 1..D in the envelope array.
       */
      if (emit_tr->ndom     == 1 &&
          anchors->D        == 1 &&
          emit_tr->sqfrom[0] >= envs->arr[1].oa &&
          emit_tr->sqto[0]   <= envs->arr[1].ob &&
          ! (envs->arr[1].env_sc >= gen_sc))
        esl_fatal(msg);

      /* Recycle everything for the next sampled sequence. */
      p7_envelopes_Reuse(envs);
      p7_anchors_Reuse(anchors);
      p7_anchorhash_Reuse(hash);
      p7_refmx_Reuse(mx_fwd);
      p7_refmx_Reuse(mx_bck);
      p7_refmx_Reuse(asc_fu);
      p7_refmx_Reuse(asc_fd);
      p7_trace_Reuse(emit_tr);
      p7_trace_Reuse(vit_tr);
      esl_sq_Reuse(seq);

      trial++;
    }

  free(scratch);
  p7_envelopes_Destroy(envs);
  p7_anchors_Destroy(anchors);
  p7_anchorhash_Destroy(hash);
  p7_refmx_Destroy(asc_fu);
  p7_refmx_Destroy(asc_fd);
  p7_refmx_Destroy(mx_fwd);
  p7_refmx_Destroy(mx_bck);
  p7_trace_Destroy(vit_tr);
  p7_trace_Destroy(emit_tr);
  p7_profile_Destroy(prof);
  p7_hmm_Destroy(model);
  p7_bg_Destroy(bgm);
  esl_sq_Destroy(seq);
}
void run_hmmer_pipeline(const char* seq) { int index, i, status; ESL_SQ* sq = esl_sq_CreateFrom(NULL, seq, NULL, NULL, NULL); P7_OPROFILE *om = NULL; P7_PROFILE *gm = NULL; float usc, vfsc, fwdsc; /* filter scores */ float filtersc; /* HMM null filter score */ float nullsc; /* null model score */ float seqbias; float seq_score; /* the corrected per-seq bit score */ double P; WRAPPER_RESULT* result; num_results = 0; if(sq->n == 0) { esl_sq_Destroy(sq); return; } esl_sq_Digitize(abc, sq); int n = 0; float oasc; for(index = 0;index < num_models;index++) { om = models[index]; p7_omx_Reuse(oxf); p7_omx_Reuse(oxb); p7_omx_GrowTo(oxf, om->M, sq->n, sq->n); p7_omx_GrowTo(oxb, om->M, sq->n, sq->n); p7_oprofile_ReconfigLength(om, sq->n); p7_bg_SetFilter(bg, om->M, om->compo); p7_bg_SetLength(bg, sq->n); //Calibrate null model p7_bg_NullOne(bg, sq->dsq, sq->n, &nullsc); //MSV Filter p7_MSVFilter(sq->dsq, sq->n, om, oxf, &usc); seq_score = (usc - nullsc) / eslCONST_LOG2; P = esl_gumbel_surv(seq_score, om->evparam[p7_MMU], om->evparam[p7_MLAMBDA]); if (P > f1) continue; //Bias filter (model compo) p7_bg_FilterScore(bg, sq->dsq, sq->n, &filtersc); seq_score = (usc - filtersc) / eslCONST_LOG2; P = esl_gumbel_surv(seq_score, om->evparam[p7_MMU], om->evparam[p7_MLAMBDA]); if (P > f1) continue; //Viterbi filter (Only do if P value from Bias is high) if(P > f2) { p7_ViterbiFilter(sq->dsq, sq->n, om, oxf, &vfsc); seq_score = (vfsc - filtersc) / eslCONST_LOG2; P = esl_gumbel_surv(seq_score, om->evparam[p7_VMU], om->evparam[p7_VLAMBDA]); if (P > f2) continue; } //Get the real probability (forward) p7_Forward(sq->dsq, sq->n, om, oxf, &fwdsc); seq_score = (fwdsc - filtersc) / eslCONST_LOG2; P = esl_exp_surv(seq_score, om->evparam[p7_FTAU], om->evparam[p7_FLAMBDA]); if(hmmer_error) { fprintf(stderr, "HMM: %s, seq: %s", om->name, seq); hmmer_error = 0; continue; } if (P > f3) continue; //Real hit, go in to posterior decoding and alignment p7_omx_Reuse(oxb); p7_trace_Reuse(tr); 
p7_Backward(sq->dsq, sq->n, om, oxf, oxb, NULL); status = p7_Decoding(om, oxf, oxb, oxb); if(status == eslOK) { //And then trace the result p7_OptimalAccuracy(om, oxb, oxf, &oasc); p7_OATrace(om, oxb, oxf, tr); } else if(status == eslERANGE) { fprintf(stderr, "Decoding overflow on model %s\n", om->name); gm = gmodels[index]; if(gxf == NULL) { gxf = p7_gmx_Create(gm->M, sq->n); gxb = p7_gmx_Create(gm->M, sq->n); } else { p7_gmx_GrowTo(gxf, gm->M, sq->n); p7_gmx_GrowTo(gxb, gm->M, sq->n); } p7_ReconfigLength(gm, sq->n); p7_GForward (sq->dsq, sq->n, gm, gxf, &fwdsc); p7_GBackward(sq->dsq, sq->n, gm, gxb, NULL); p7_GDecoding(gm, gxf, gxb, gxb); p7_GOptimalAccuracy(gm, gxb, gxf, &oasc); p7_GOATrace (gm, gxb, gxf, tr); p7_gmx_Reuse(gxf); p7_gmx_Reuse(gxb); } if(hmmer_error) { fprintf(stderr, "HMM: %s, seq: %s", om->name, seq); hmmer_error = 0; continue; } result = wrapper_results[num_results]; reuse_result(result, tr->N + om->M, om->name); //We're way overallocating here, but it's hard to know at this point how much space we'll need for the alignment (plus leading and trailing gaps) trace_into(tr, result, sq, abc, om->M); result->bits = seq_score; num_results++; } esl_sq_Destroy(sq); }
/* glocal_rescore_isolated_domain()
 * EPN, Tue Oct 5 10:16:12 2010
 *
 * Glocal counterpart of p7_domaindef.c's rescore_isolated_domain():
 * it works on generic DP matrices (usable with glocally configured
 * models) and identifies a single glocal domain rather than a single
 * local one.
 *
 * Backward, posterior decoding and the OA alignment are optional
 * here. Three flags decide what gets computed:
 *   <null2_is_done>: TRUE if null2 scores for this region have
 *                    already been stored in <ddef->n2sc>.
 *   <do_null2>:      TRUE if a null2 penalty will eventually be
 *                    applied to this domain.
 *   <do_aln>:        TRUE if the OA alignment is needed.
 * Backward and Decoding run only if <do_null2> or <do_aln> is TRUE.
 *
 * Background, paraphrasing the notes on rescore_isolated_domain()
 * (SRE, Fri Feb 8 09:18:33 2008 [Janelia]):
 *
 * A single domain envelope <i>..<j> on <sq> has been isolated; we
 * score it in isolation and obtain an alignment for it. (Individual
 * domain scores can later be summed into a new per-sequence score.)
 * In the original, the model is configured in unihit mode so that
 * exactly one domain is forced in the envelope, and the alignment is
 * an optimal accuracy alignment (sensu IH Holmes) in that same mode.
 *
 * The caller supplies two DP matrices big enough for Forward and
 * Backward on this domain -- typically the ones it already holds for
 * the whole sequence -- and a <P7_DOMAINDEF> that manages temporary
 * workspace and the result list.
 *
 * Returns:  <eslOK> if the domain was scored (and aligned, if
 *           requested); its record has been appended to <ddef->dcl>.
 *           <eslFAIL> if posterior decoding overflowed (<eslERANGE>).
 *           <eslEMEM> (via ESL_RALLOC) if <ddef->dcl> cannot grow.
 *
 * Working memory on return:
 *   <ddef->tr>: used (and possibly reallocated) for the OA trace when
 *               <do_aln> is TRUE; Reuse()'d before returning.
 *   <gx1>:      contents undefined by spec (happens to hold the OA
 *               matrix if <do_aln>, else the Forward matrix).
 *   <gx2>:      contents undefined by spec (happens to hold the
 *               posterior probabilities if decoding was done).
 */
static int
glocal_rescore_isolated_domain(P7_DOMAINDEF *ddef, const P7_PROFILE *gm, const ESL_SQ *sq,
                               P7_GMX *gx1, P7_GMX *gx2, int i, int j,
                               int null2_is_done, int do_null2, int do_aln)
{
  P7_DOMAIN *slot          = NULL;         /* next free record in ddef->dcl              */
  int        off           = i-1;          /* offset of the envelope within sq->dsq      */
  int        Ld            = j-i+1;        /* envelope length                            */
  float      domcorrection = 0.0;          /* summed null2 correction, in nats           */
  float      oasc          = 0.;           /* OA score; stays 0 when no alignment done   */
  float      envsc;                        /* Forward score of the envelope, in nats     */
  float      null2[p7_MAXCODE];
  int        step;
  int        r;
  int        status;

  p7_GForward(sq->dsq + off, Ld, gm, gx1, &envsc);

  if (do_null2 || do_aln)
    {
      p7_GBackward(sq->dsq + off, Ld, gm, gx2, NULL);

      /* <gx2> is overwritten with posterior probabilities here. */
      status = p7_GDecoding(gm, gx1, gx2, gx2);
      if (status == eslERANGE) return eslFAIL;  /* rare: numeric overflow; domain is assumed to be repetitive garbage [J3/119-212] */

      if (do_null2)
        {
          /* null2 is already set for i..j when the domain came from
           * stochastic traceback clustering in a multidomain region;
           * in a simple one-domain region it is not, so compute it now
           * by the expectation (posterior decoding) method.
           */
          if (! null2_is_done)
            {
              p7_GNull2_ByExpectation(gm, gx2, null2);
              for (r = i; r <= j; r++)
                ddef->n2sc[r] = logf(null2[sq->dsq[r]]);
            }

          for (r = i; r <= j; r++)
            domcorrection += ddef->n2sc[r];     /* units: NATS */
        }

      if (do_aln)
        {
          /* Optimal accuracy fill (overwrites <gx1>), then traceback. */
          p7_GOptimalAccuracy(gm, gx2, gx1, &oasc);
          p7_GOATrace        (gm, gx2, gx1, ddef->tr);

          /* The trace's seq coords are relative to the envelope;
           * shift them so they index the original dsq.
           */
          for (step = 0; step < ddef->tr->N; step++)
            {
              if (ddef->tr->i[step] > 0) ddef->tr->i[step] += off;
            }
        }
    }

  /* Double the result list if it is full, then claim the next record. */
  if (ddef->ndom == ddef->nalloc)
    {
      void *tmp;
      ESL_RALLOC(ddef->dcl, tmp, sizeof(P7_DOMAIN) * (ddef->nalloc*2));
      ddef->nalloc *= 2;
    }
  slot = &(ddef->dcl[ddef->ndom]);

  slot->ienv          = i;
  slot->jenv          = j;
  slot->envsc         = envsc;          /* in units of NATS                                        */
  slot->domcorrection = domcorrection;  /* in units of NATS; 0. if do_null2 == FALSE               */
  slot->oasc          = oasc;           /* expected # of correctly aligned residues; 0. if no aln  */
  slot->dombias       = 0.0;            /* set later                                               */
  slot->bitscore      = 0.0;            /* set later by caller                                     */
  slot->lnP           = 1.0;            /* set later by caller                                     */
  slot->is_reported   = FALSE;          /* set later by caller                                     */
  slot->is_included   = FALSE;          /* set later by caller                                     */
  slot->ad            = NULL;
  slot->iali          = i;
  slot->jali          = j;

  ddef->ndom++;

  if (do_aln) p7_trace_Reuse(ddef->tr);
  return eslOK;

 ERROR:
  if (do_aln) p7_trace_Reuse(ddef->tr);
  return status;
}
int main(int argc, char **argv) { ESL_GETOPTS *go = p7_CreateDefaultApp(options, 1, argc, argv, banner, usage); char *hmmfile = esl_opt_GetArg(go, 1); ESL_STOPWATCH *w = esl_stopwatch_Create(); ESL_RANDOMNESS *r = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s")); ESL_ALPHABET *abc = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; P7_PROFILE *gm = NULL; P7_OPROFILE *om = NULL; P7_GMX *gx1 = NULL; P7_GMX *gx2 = NULL; P7_OMX *ox1 = NULL; P7_OMX *ox2 = NULL; P7_TRACE *tr = NULL; int L = esl_opt_GetInteger(go, "-L"); int N = esl_opt_GetInteger(go, "-N"); ESL_DSQ *dsq = malloc(sizeof(ESL_DSQ) * (L+2)); int i; float fsc, bsc, accscore; float fsc_g, bsc_g, accscore_g; double Mcs; p7_FLogsumInit(); if (p7_hmmfile_OpenE(hmmfile, NULL, &hfp, NULL) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile); if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM"); bg = p7_bg_Create(abc); p7_bg_SetLength(bg, L); gm = p7_profile_Create(hmm->M, abc); p7_ProfileConfig(hmm, bg, gm, L, p7_LOCAL); om = p7_oprofile_Create(gm->M, abc); p7_oprofile_Convert(gm, om); p7_oprofile_ReconfigLength(om, L); if (esl_opt_GetBoolean(go, "-x") && p7_FLogsumError(-0.4, -0.5) > 0.0001) p7_Fail("-x here requires p7_Logsum() recompiled in slow exact mode"); ox1 = p7_omx_Create(gm->M, L, L); ox2 = p7_omx_Create(gm->M, L, L); tr = p7_trace_CreateWithPP(); esl_rsq_xfIID(r, bg->f, abc->K, L, dsq); p7_Forward (dsq, L, om, ox1, &fsc); p7_Backward(dsq, L, om, ox1, ox2, &bsc); p7_Decoding(om, ox1, ox2, ox2); esl_stopwatch_Start(w); for (i = 0; i < N; i++) { p7_OptimalAccuracy(om, ox2, ox1, &accscore); if (! 
esl_opt_GetBoolean(go, "--notrace")) { p7_OATrace(om, ox2, ox1, tr); p7_trace_Reuse(tr); } } esl_stopwatch_Stop(w); Mcs = (double) N * (double) L * (double) gm->M * 1e-6 / (double) w->user; esl_stopwatch_Display(stdout, w, "# CPU time: "); printf("# M = %d\n", gm->M); printf("# %.1f Mc/s\n", Mcs); if (esl_opt_GetBoolean(go, "-c") || esl_opt_GetBoolean(go, "-x") ) { gx1 = p7_gmx_Create(gm->M, L); gx2 = p7_gmx_Create(gm->M, L); p7_GForward (dsq, L, gm, gx1, &fsc_g); p7_GBackward(dsq, L, gm, gx2, &bsc_g); p7_GDecoding(gm, gx1, gx2, gx2); p7_GOptimalAccuracy(gm, gx2, gx1, &accscore_g); printf("generic: fwd=%8.4f bck=%8.4f acc=%8.4f\n", fsc_g, bsc_g, accscore_g); printf("VMX: fwd=%8.4f bck=%8.4f acc=%8.4f\n", fsc, bsc, accscore); p7_gmx_Destroy(gx1); p7_gmx_Destroy(gx2); } free(dsq); p7_omx_Destroy(ox1); p7_omx_Destroy(ox2); p7_trace_Destroy(tr); p7_oprofile_Destroy(om); p7_profile_Destroy(gm); p7_bg_Destroy(bg); p7_hmm_Destroy(hmm); p7_hmmfile_Close(hfp); esl_alphabet_Destroy(abc); esl_stopwatch_Destroy(w); esl_randomness_Destroy(r); esl_getopts_Destroy(go); return 0; }
/* utest_optacc()
 *
 * Unit test for the vectorized optimal accuracy (OA) code. A random
 * model is sampled, <N> sequences are emitted from it, and for each
 * one the vector results are compared against the generic
 * implementation:
 *   1. Compare accscore to GOptimalAccuracy().
 *   2. Compare trace to GOATrace().
 *
 * Args:  go  - application options; only "--traces" is read here
 *        r   - random number generator
 *        abc - alphabet
 *        bg  - background model
 *        M   - length of the sampled model
 *        L   - length passed to p7_oprofile_Sample() and used to
 *              size the initial DP matrices
 *        N   - number of sequences to emit and test
 *
 * Any failure is fatal via esl_fatal().
 *
 * Note: This test is subject to some expected noise and can fail
 * for entirely innocent reasons. Generic Forward/Backward calculations with
 * p7_GForward(), p7_GBackward() use coarse-grain table lookups to sum
 * log probabilities, and sufficient roundoff error can accumulate to
 * change the optimal accuracy traceback, causing this test to fail.
 * So, if optacc_utest fails, before you go looking for bugs, first
 * go to ../logsum.c, change the #ifdef to activate the slow/accurate
 * version, recompile and rerun optacc_utest. If the failure goes away,
 * you can ignore it. - SRE, Wed Dec 17 09:45:31 2008
 */
static void
utest_optacc(ESL_GETOPTS *go, ESL_RANDOMNESS *r, ESL_ALPHABET *abc, P7_BG *bg, int M, int L, int N)
{
  char        *msg = "optimal accuracy unit test failed";
  P7_HMM      *hmm = NULL;
  P7_PROFILE  *gm  = NULL;
  P7_OPROFILE *om  = NULL;
  ESL_SQ      *sq  = esl_sq_CreateDigital(abc);
  P7_OMX      *ox1 = p7_omx_Create(M, L, L);      /* vector: Forward, later overwritten by the OA fill  */
  P7_OMX      *ox2 = p7_omx_Create(M, L, L);      /* vector: Backward, later overwritten by Decoding    */
  P7_GMX      *gx1 = p7_gmx_Create(M, L);         /* generic: Forward, later overwritten by the OA fill */
  P7_GMX      *gx2 = p7_gmx_Create(M, L);         /* generic: Backward, later overwritten by Decoding   */
  P7_TRACE    *tr  = p7_trace_CreateWithPP();     /* OA trace from the vector code                      */
  P7_TRACE    *trg = p7_trace_CreateWithPP();     /* OA trace from the generic code                     */
  P7_TRACE    *tro = p7_trace_CreateWithPP();     /* trace of the originally emitted sequence           */
  float        accscore_o;
  float        fsc, bsc, accscore;
  float        fsc_g, bsc_g, accscore_g, accscore_g2;
  float        pptol = 0.01;                      /* tolerance passed to p7_trace_Compare()             */
  float        sctol = 0.001;                     /* tolerance for vector fwd vs bck                    */
  float        gtol;                              /* tolerance for comparisons involving generic scores */

  p7_FLogsumInit();
  /* Use a looser tolerance when p7_FLogsumError() reports a measurable
   * error (> 0.0001) for this probe; presumably that indicates the
   * fast table-driven logsum is compiled in -- see the note above.
   */
  gtol = ( (p7_FLogsumError(-0.4, -0.5) > 0.0001) ?  0.1 : 0.001);

  if (p7_oprofile_Sample(r, abc, bg, M, L, &hmm, &gm, &om)!= eslOK) esl_fatal(msg);
  while (N--)
    {
      /* Emit a sequence and its true trace; grow all four matrices to fit. */
      if (p7_ProfileEmit(r, hmm, gm, bg, sq, tro)         != eslOK) esl_fatal(msg);

      if (p7_omx_GrowTo(ox1, M, sq->n, sq->n)             != eslOK) esl_fatal(msg);
      if (p7_omx_GrowTo(ox2, M, sq->n, sq->n)             != eslOK) esl_fatal(msg);
      if (p7_gmx_GrowTo(gx1, M, sq->n)                    != eslOK) esl_fatal(msg);
      if (p7_gmx_GrowTo(gx2, M, sq->n)                    != eslOK) esl_fatal(msg);

      /* Vector pipeline. Order matters: Decoding writes into <ox2>,
       * and the OA fill writes into <ox1>.
       */
      if (p7_Forward (sq->dsq, sq->n, om, ox1,      &fsc) != eslOK) esl_fatal(msg);
      if (p7_Backward(sq->dsq, sq->n, om, ox1, ox2, &bsc) != eslOK) esl_fatal(msg);
      if (p7_Decoding(om, ox1, ox2, ox2)                  != eslOK) esl_fatal(msg);
      if (p7_OptimalAccuracy(om, ox2, ox1, &accscore)     != eslOK) esl_fatal(msg);

#if 0
      p7_omx_FDeconvert(ox1, gx1);
      p7_gmx_Dump(stdout, gx1, p7_DEFAULT);
      p7_omx_FDeconvert(ox2, gx1);
      p7_gmx_Dump(stdout, gx1, p7_DEFAULT);
#endif
      if (p7_OATrace(om, ox2, ox1, tr)                    != eslOK) esl_fatal(msg);

      /* Generic pipeline, same sequence, same in-place reuse pattern. */
      if (p7_GForward (sq->dsq, sq->n, gm, gx1, &fsc_g)   != eslOK) esl_fatal(msg);
      if (p7_GBackward(sq->dsq, sq->n, gm, gx2, &bsc_g)   != eslOK) esl_fatal(msg);

#if 0
      p7_gmx_Dump(stdout, gx1, p7_DEFAULT); /* fwd */
      p7_gmx_Dump(stdout, gx2, p7_DEFAULT); /* bck */
#endif
      if (p7_GDecoding(gm, gx1, gx2, gx2)                 != eslOK) esl_fatal(msg);
      if (p7_GOptimalAccuracy(gm, gx2, gx1, &accscore_g)  != eslOK) esl_fatal(msg);

#if 0
      p7_gmx_Dump(stdout, gx1, p7_DEFAULT); /* oa */
      p7_gmx_Dump(stdout, gx2, p7_DEFAULT); /* pp */
#endif
      if (p7_GOATrace(gm, gx2, gx1, trg)                  != eslOK) esl_fatal(msg);

      /* Annotate the emitted trace using the generic decoding matrix <gx2>. */
      if (p7_trace_SetPP(tro, gx2)                        != eslOK) esl_fatal(msg);

      if (esl_opt_GetBoolean(go, "--traces"))
        {
          p7_trace_Dump(stdout, tro, gm, sq->dsq);
          p7_trace_Dump(stdout, tr,  gm, sq->dsq);
          p7_trace_Dump(stdout, trg, gm, sq->dsq);
        }

      /* Both OA traces must be valid, and must agree within <pptol>. */
      if (p7_trace_Validate(tr,  abc, sq->dsq, NULL)      != eslOK) esl_fatal(msg);
      if (p7_trace_Validate(trg, abc, sq->dsq, NULL)      != eslOK) esl_fatal(msg);
      if (p7_trace_Compare(tr, trg, pptol)                != eslOK) esl_fatal(msg);

      accscore_o  = p7_trace_GetExpectedAccuracy(tro); /* according to gx2; see p7_trace_SetPP() call above */
      accscore_g2 = p7_trace_GetExpectedAccuracy(trg);

#if 0
      printf("%f %f %f %f\n", accscore, accscore_g, accscore_g2, accscore_o);
#endif

      if (esl_FCompare(fsc,        bsc,         sctol)    != eslOK) esl_fatal(msg);
      if (esl_FCompare(fsc_g,      bsc_g,       gtol)     != eslOK) esl_fatal(msg);
      if (esl_FCompare(fsc,        fsc_g,       gtol)     != eslOK) esl_fatal(msg);
      if (esl_FCompare(accscore,   accscore_g,  gtol)     != eslOK) esl_fatal(msg);
      if (esl_FCompare(accscore_g, accscore_g2, gtol)     != eslOK) esl_fatal(msg);
      if (accscore_g2 < accscore_o)                                 esl_fatal(msg);
      /* The comparisons above deserve explanation:
       *  - accscore_o is the accuracy of the originally emitted trace, according
       *      to the generic posterior decoding matrix <gx2>. This is a lower bound
       *      on the expected # of accurately aligned residues found by a DP
       *      optimization.
       *  - accscore is the accuracy found by the fast (vector) code DP implementation.
       *  - accscore_g is the accuracy found by the generic DP implementation.
       *      accscore and accscore_g should be nearly identical,
       *      within tolerance of roundoff error accumulation and
       *      the imprecision of Logsum() tables.
       *  - accscore_g2 is the accuracy of the traceback identified by the generic
       *      DP implementation. It should be identical (within order-of-evaluation
       *      roundoff error) to accscore_g.
       *
       * The "accscore_g2 < accscore_o" test is carefully contrived.
       * accscore_o is a theoretical lower bound, but because of fp error,
       * accscore and (much more rarely) even accscore_g can exceed accscore_o.
       * accscore_g2, however, is calculated with identical order of evaluation
       * as accscore_o if the optimal trace does turn out to be identical to
       * the originally emitted trace. It should be extremely unlikely (though
       * not impossible) for accscore_o to exceed accscore_g2. (The DP algorithm
       * would have to identify a trace that was different from the original trace,
       * to which the DP algorithm, by order-of-evaluation, assigned higher accuracy,
       * but to which order-of-evaluation in traceback-dependent code assigned
       * lower accuracy.)
       * [xref J5/29]
       */

      esl_sq_Reuse(sq);
      p7_trace_Reuse(tr);
      p7_trace_Reuse(trg);
      p7_trace_Reuse(tro);
    }

  p7_trace_Destroy(tro);
  p7_trace_Destroy(trg);
  p7_trace_Destroy(tr);
  p7_gmx_Destroy(gx2);
  p7_gmx_Destroy(gx1);
  p7_omx_Destroy(ox2);
  p7_omx_Destroy(ox1);
  esl_sq_Destroy(sq);
  p7_oprofile_Destroy(om);
  p7_profile_Destroy(gm);
  p7_hmm_Destroy(hmm);
}
/* Function:  p7_CoreEmit()
 * Incept:    SRE, Tue Jan  9 10:20:51 2007 [Janelia]
 *
 * Purpose:   Generate (sample) a sequence from a core HMM <hmm>.
 *
 *            Optionally return the sequence and/or its trace in <sq>
 *            and <tr>, respectively, which the caller has
 *            allocated. Having the caller provide these reusable
 *            objects allows re-use of both <sq> and <tr> in repeated
 *            calls, saving malloc/free wastage. Either can be passed
 *            as <NULL> if it isn't needed.
 *
 *            This does not set any fields in the <sq> except for the
 *            sequence itself. Caller must set the name, and any other
 *            annotation it wants to add.
 *
 *            Trace is relative to the core model: it may include
 *            I_0 and I_M states, B->DD->M entry is explicit, and a
 *            0 length generated sequence is possible.
 *
 *            On return, <tr->M> is the model length and <tr->L> is
 *            the number of residues emitted.
 *
 * Args:      r     -  source of randomness
 *            hmm   -  core HMM to generate from
 *            sq    -  opt: digital sequence sampled (or NULL)
 *            tr    -  opt: trace sampled            (or NULL)
 *
 * Returns:   <eslOK> on success;
 *            optionally return the digital sequence through <sq>,
 *            and optionally return its trace in <tr>.
 *
 * Throws:    <eslECORRUPT> if emission gets us into an illegal state,
 *            probably indicating that a probability that should have
 *            been zero wasn't.
 *
 *            Throws <eslEMEM> on a reallocation error.
 *
 *            In these cases, the contents of <sq> and <tr> may be
 *            corrupted. Caller should not trust their data, but may
 *            safely reuse them.
 *
 * Xref:      STL11/124.
 */
int
p7_CoreEmit(ESL_RANDOMNESS *r, const P7_HMM *hmm, ESL_SQ *sq, P7_TRACE *tr)
{
  int       k   = 0;        /* position in model nodes 1..M */
  int       i   = 0;        /* position in sequence 1..L    */
  char      st  = p7T_B;    /* state type                   */
  int       x;              /* sampled residue              */
  int       status;

  if (sq != NULL) esl_sq_Reuse(sq);
  if (tr != NULL) {
    if ((status = p7_trace_Reuse(tr))            != eslOK) goto ERROR;
    if ((status = p7_trace_Append(tr, st, k, i)) != eslOK) goto ERROR;
  }

  while (st != p7T_E)
    {
      /* Sample next state type, given current state type (and current k).
       * Each node's transition vector hmm->t[k] is sampled in three
       * slices: [0..2] leaving B/M, [3..4] leaving I, [5..6] leaving D.
       */
      switch (st) {
      case p7T_B:
      case p7T_M:
        switch (esl_rnd_FChoose(r, hmm->t[k], 3)) {
        case 0:  st = p7T_M; break;
        case 1:  st = p7T_I; break;
        case 2:  st = p7T_D; break;
        default: ESL_XEXCEPTION(eslEINCONCEIVABLE, "impossible.");
        }
        break;

      case p7T_I:
        switch (esl_rnd_FChoose(r, hmm->t[k]+3, 2)) {
        case 0:  st = p7T_M; break;
        case 1:  st = p7T_I; break;
        default: ESL_XEXCEPTION(eslEINCONCEIVABLE, "impossible.");
        }
        break;

      case p7T_D:
        switch (esl_rnd_FChoose(r, hmm->t[k]+5, 2)) {
        case 0:  st = p7T_M; break;
        case 1:  st = p7T_D; break;
        default: ESL_XEXCEPTION(eslEINCONCEIVABLE, "impossible.");
        }
        break;

      default: ESL_XEXCEPTION(eslECORRUPT, "impossible state reached during emission");
      }

      /* Bump k if the new state consumes a model node. */
      if (st == p7T_M || st == p7T_D) k++;

      /* A transit to M_M+1 is a transit to the E state. */
      if (k == hmm->M+1) {
        if (st == p7T_M) { st = p7T_E; k = 0; }
        else             { ESL_XEXCEPTION(eslECORRUPT, "failed to reach E state properly"); }
      }

      /* Bump i only now, after the E conversion: the virtual M_M+1
       * step emits nothing, so it must not count toward <tr->L>.
       */
      if (st == p7T_M || st == p7T_I) i++;

      /* Sample new residue x if in match or insert */
      if      (st == p7T_M) x = esl_rnd_FChoose(r, hmm->mat[k], hmm->abc->K);
      else if (st == p7T_I) x = esl_rnd_FChoose(r, hmm->ins[k], hmm->abc->K);
      else                  x = eslDSQ_SENTINEL;

      /* Add state to trace */
      if (tr != NULL) {
        if ((status = p7_trace_Append(tr, st, k, i)) != eslOK) goto ERROR;
      }
      /* Add x to sequence */
      if (sq != NULL && x != eslDSQ_SENTINEL) {
        if ((status = esl_sq_XAddResidue(sq, x)) != eslOK) goto ERROR;
      }
    }

  /* Terminate the trace and sequence (both are optional, remember) */
  if (tr != NULL) {
    tr->M = hmm->M;
    tr->L = i;
  }
  if (sq != NULL && (status = esl_sq_XAddResidue(sq, eslDSQ_SENTINEL)) != eslOK) goto ERROR;
  return eslOK;

 ERROR:
  return status;
}
/* main()
 *
 * Example driver for the generic optimal accuracy (OA) alignment code.
 *
 * Takes an HMM file and a sequence file on the command line, reads
 * one model and one sequence, runs Forward, Backward, posterior
 * decoding, the OA fill and the OA traceback, then dumps the trace
 * and the scores. The Viterbi alignment of the same sequence is then
 * computed and annotated with the same posterior probabilities, so
 * the two can be compared.
 *
 * Options read here: -d dumps the posterior decoding matrix, -m dumps
 * the OA matrix.
 */
int
main(int argc, char **argv)
{
  ESL_GETOPTS  *go       = p7_CreateDefaultApp(options, 2, argc, argv, banner, usage);
  char         *hmmfile  = esl_opt_GetArg(go, 1);
  char         *seqfile  = esl_opt_GetArg(go, 2);
  ESL_ALPHABET *abc      = NULL;
  P7_HMMFILE   *hmmfp    = NULL;
  P7_HMM       *hmm      = NULL;
  P7_BG        *bg       = NULL;
  P7_PROFILE   *gm       = NULL;
  P7_GMX       *mxa      = NULL;    /* Forward; then OA matrix; then Viterbi */
  P7_GMX       *mxb      = NULL;    /* Backward; then posterior decoding     */
  ESL_SQ       *sq       = NULL;
  ESL_SQFILE   *seqfp    = NULL;
  P7_TRACE     *tr       = NULL;
  int           infmt    = eslSQFILE_UNKNOWN;
  char          errbuf[eslERRBUFSIZE];
  float         fwd_sc, bck_sc, vit_sc;
  float         oa_sc;
  int           status;

  /* One HMM in. */
  if (p7_hmmfile_OpenE(hmmfile, NULL, &hmmfp, NULL) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile);
  if (p7_hmmfile_Read(hmmfp, &abc, &hmm)            != eslOK) p7_Fail("Failed to read HMM");
  p7_hmmfile_Close(hmmfp);

  /* One sequence in. */
  sq     = esl_sq_CreateDigital(abc);
  status = esl_sqfile_OpenDigital(abc, seqfile, infmt, NULL, &seqfp);
  switch (status) {
  case eslOK:                                                       break;
  case eslENOTFOUND: p7_Fail("No such file.");                      break;
  case eslEFORMAT:   p7_Fail("Format unrecognized.");               break;
  case eslEINVAL:    p7_Fail("Can't autodetect stdin or .gz.");     break;
  default:           p7_Fail("Open failed, code %d.", status);      break;
  }
  if (esl_sqio_Read(seqfp, sq) != eslOK) p7_Fail("Failed to read sequence");
  esl_sqfile_Close(seqfp);

  /* Background and profile, both sized to the sequence. */
  bg = p7_bg_Create(abc);
  p7_bg_SetLength(bg, sq->n);
  gm = p7_profile_Create(hmm->M, abc);
  p7_ProfileConfig(hmm, bg, gm, sq->n, p7_LOCAL); /* multihit local: H3 default */

  /* Workspace. */
  mxa = p7_gmx_Create(gm->M, sq->n);
  mxb = p7_gmx_Create(gm->M, sq->n);
  tr  = p7_trace_CreateWithPP();
  p7_FLogsumInit();

  /* Forward, Backward, Decoding, then OA fill and traceback. */
  p7_GForward (sq->dsq, sq->n, gm, mxa, &fwd_sc);
  p7_GBackward(sq->dsq, sq->n, gm, mxb, &bck_sc);
  p7_GDecoding(gm, mxa, mxb, mxb);               /* <mxb> now holds posterior decoding */
  p7_GOptimalAccuracy(gm, mxb, mxa, &oa_sc);     /* <mxa> now holds the OA matrix      */
  p7_GOATrace(gm, mxb, mxa, tr);

  if (esl_opt_GetBoolean(go, "-d")) p7_gmx_Dump(stdout, mxb, p7_DEFAULT);
  if (esl_opt_GetBoolean(go, "-m")) p7_gmx_Dump(stdout, mxa, p7_DEFAULT);

  p7_trace_Dump(stdout, tr, gm, sq->dsq);
  if (p7_trace_Validate(tr, abc, sq->dsq, errbuf) != eslOK) p7_Die("trace fails validation:\n%s\n", errbuf);

  printf("fwd = %.4f nats\n", fwd_sc);
  printf("bck = %.4f nats\n", bck_sc);
  printf("acc = %.4f (%.2f%%)\n", oa_sc, oa_sc * 100. / (float) sq->n);

  /* Viterbi alignment of the same sequence, annotated with the
   * posterior probabilities still held in <mxb>, for comparison.
   */
  p7_trace_Reuse(tr);
  p7_GViterbi(sq->dsq, sq->n, gm, mxa, &vit_sc);
  p7_GTrace  (sq->dsq, sq->n, gm, mxa, tr);
  p7_trace_SetPP(tr, mxb);
  p7_trace_Dump(stdout, tr, gm, sq->dsq);

  printf("vit = %.4f nats\n", vit_sc);
  printf("acc = %.4f\n", p7_trace_GetExpectedAccuracy(tr));

  /* Teardown. */
  esl_sq_Destroy(sq);
  p7_trace_Destroy(tr);
  p7_gmx_Destroy(mxa);
  p7_gmx_Destroy(mxb);
  p7_profile_Destroy(gm);
  p7_bg_Destroy(bg);
  p7_hmm_Destroy(hmm);
  esl_alphabet_Destroy(abc);
  esl_getopts_Destroy(go);
  return 0;
}
/* process_workunit()
 *
 * This is the routine that actually does the work.
 *
 * A work unit consists of one HMM, <hmm>. The profile is configured
 * as the command line requests, and <cfg->N> i.i.d. random sequences
 * of length -L are scored against it.
 *
 * Args:     go      - application options
 *           cfg     - application configuration (alphabet, background,
 *                     RNG, number of sequences N)
 *           errbuf  - message buffer, written on allocation failure
 *           hmm     - the model to score against
 *           scores  - RETURN: N bit scores; caller provides the memory
 *           alilens - RETURN: N alignment lengths, written only with -a;
 *                     caller provides the memory
 *
 * Returns:  <eslOK> on success.
 *           <eslEMEM> on allocation failure, with a message in <errbuf>.
 */
static int
process_workunit(ESL_GETOPTS *go, struct cfg_s *cfg, char *errbuf, P7_HMM *hmm, double *scores, int *alilens)
{
  int             L   = esl_opt_GetInteger(go, "-L");
  P7_PROFILE     *gm  = NULL;
  P7_OPROFILE    *om  = NULL;
  P7_REFMX       *rmx = NULL;
  P7_CHECKPTMX   *cx  = NULL;
  P7_FILTERMX    *fx  = NULL;
  P7_TRACE       *tr  = NULL;
  ESL_DSQ        *dsq = NULL;
  int             i;
  int             scounts[p7T_NSTATETYPES]; /* state usage counts from a trace */
  float           sc;
  float           nullsc;
  int             status;
  P7_HARDWARE    *hw;

  /* NOTE(review): <hw> is never released in this function; if a
   * matching destructor exists, call it in the cleanup section below.
   */
  if ((hw = p7_hardware_Create ()) == NULL)  p7_Fail("Couldn't get HW information data structure");

  /* Optionally set a custom background, determined by model composition;
   * an experimental hack.
   */
  if (esl_opt_GetBoolean(go, "--bgcomp"))
    {
      float *p = NULL;
      float  KL;

      p7_hmm_CompositionKLDist(hmm, cfg->bg, &KL, &p);
      /* Don't copy from a NULL vector if the call above failed. */
      if (p != NULL) esl_vec_FCopy(p, cfg->abc->K, cfg->bg->f);
      /* NOTE(review): <p> is presumably allocated by the callee and
       * owned by us afterwards; if so it leaks here -- confirm and free.
       */
    }

  /* Create and configure our generic profile, as requested */
  gm = p7_profile_Create(hmm->M, cfg->abc);
  if (esl_opt_GetBoolean(go, "--multi"))
    {
      if      (esl_opt_GetBoolean(go, "--dual"))    { p7_profile_Config      (gm, hmm, cfg->bg);    }
      else if (esl_opt_GetBoolean(go, "--local"))   { p7_profile_ConfigLocal (gm, hmm, cfg->bg, L); }
      else if (esl_opt_GetBoolean(go, "--glocal"))  { p7_profile_ConfigGlocal(gm, hmm, cfg->bg, L); }
    }
  else if (esl_opt_GetBoolean(go, "--uni"))
    {
      if      (esl_opt_GetBoolean(go, "--dual"))    { p7_profile_ConfigCustom   (gm, hmm, cfg->bg, L, 0.0, 0.5); }
      else if (esl_opt_GetBoolean(go, "--local"))   { p7_profile_ConfigUnilocal (gm, hmm, cfg->bg, L); }
      else if (esl_opt_GetBoolean(go, "--glocal"))  { p7_profile_ConfigUniglocal(gm, hmm, cfg->bg, L); }
    }

  p7_profile_SetLength(gm, L);
  p7_bg_SetLength(cfg->bg, L);

  if (esl_opt_GetBoolean(go, "--x-no-lengthmodel")) elide_length_model(gm, cfg->bg);

  /* Allocate DP matrix for <gm>. */
  rmx = p7_refmx_Create(gm->M, L);

  /* Create and configure the vectorized profile, if needed;
   * and allocate its DP matrix
   */
  if (esl_opt_GetBoolean(go, "--vector"))
    {
      /* <om> is still NULL at this point, so the SIMD flavor has to
       * come from the hardware descriptor, not from <om> itself.
       */
      om = p7_oprofile_Create(gm->M, cfg->abc, hw->simd);
      p7_oprofile_Convert(gm, om);
      cx = p7_checkptmx_Create(gm->M, L, ESL_MBYTES(32), om->simd);
      fx = p7_filtermx_Create(gm->M, om->simd);
    }

  /* Remaining allocation */
  ESL_ALLOC(dsq, sizeof(ESL_DSQ) * (L+2));
  tr = p7_trace_Create();

  /* Collect scores from N random sequences of length L  */
  for (i = 0; i < cfg->N; i++)
    {
      esl_rsq_xfIID(cfg->r, cfg->bg->f, cfg->abc->K, L, dsq);
      sc = eslINFINITY;

      /* Vectorized implementations of Viterbi, MSV may overflow.
       * In this case, they'll leave sc=eslINFINITY.
       * Then we fail over to the nonvector "generic" implementation.
       * That's why this next block isn't an if/else.
       */
      if (esl_opt_GetBoolean(go, "--vector"))
        {
          if      (esl_opt_GetBoolean(go, "--vit")) p7_ViterbiFilter(dsq, L, om, fx, &sc);
          else if (esl_opt_GetBoolean(go, "--fwd")) p7_ForwardFilter(dsq, L, om, cx, &sc);
          else if (esl_opt_GetBoolean(go, "--msv")) p7_MSVFilter    (dsq, L, om, fx, &sc);
        }

      /* If we tried a vector calculation above but it overflowed,
       * or if we're to do --generic DP calculations, sc==eslINFINITY now;
       * hence the if condition here:
       */
      if (sc == eslINFINITY)
        {
          if      (esl_opt_GetBoolean(go, "--fwd"))  p7_ReferenceForward(dsq, L, gm, rmx,     &sc); /* any mode: dual,local,glocal; gm's config takes care of this */
          else if (esl_opt_GetBoolean(go, "--vit"))  p7_ReferenceViterbi(dsq, L, gm, rmx, tr, &sc); /* local-only mode. cmdline opts processing has already assured that --local set */
          else if (esl_opt_GetBoolean(go, "--msv"))  p7_Die("We used to be able to do a generic MSV algorithm - but no longer");
        }

      /* Optional: get Viterbi alignment length too. */
      if (esl_opt_GetBoolean(go, "-a"))  /* -a only works with Viterbi; getopts has checked this already; <tr> must be valid */
        {
          p7_trace_GetStateUseCounts(tr, scounts);

          /* There are various ways we could count "alignment length".
           * This sums the local and glocal M, D and I state counts.
           */
          /* alilens[i] = scounts[p7T_D] + scounts[p7T_I]; SRE: temporarily testing this instead */
          alilens[i] = scounts[p7T_ML] + scounts[p7T_DL] + scounts[p7T_IL] +
                       scounts[p7T_MG] + scounts[p7T_DG] + scounts[p7T_IG];
          p7_trace_Reuse(tr);
        }

      p7_bg_NullOne(cfg->bg, dsq, L, &nullsc);
      scores[i] = (sc - nullsc) / eslCONST_LOG2;

      if (cx) p7_checkptmx_Reuse(cx);
      if (fx) p7_filtermx_Reuse(fx);
      p7_refmx_Reuse(rmx);
    }
  status = eslOK;
  /* deliberate flowthru */
 ERROR:
  if (dsq != NULL) free(dsq);
  p7_checkptmx_Destroy(cx);
  p7_filtermx_Destroy(fx);
  p7_oprofile_Destroy(om);
  p7_profile_Destroy(gm);
  p7_refmx_Destroy(rmx);
  p7_trace_Destroy(tr);
  if (status == eslEMEM) sprintf(errbuf, "allocation failure");
  return status;
}
int main(int argc, char **argv) { ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage); char *hmmfile = esl_opt_GetArg(go, 1); ESL_STOPWATCH *w = esl_stopwatch_Create(); ESL_RANDOMNESS *r = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s")); ESL_ALPHABET *abc = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; P7_PROFILE *gm = NULL; P7_OPROFILE *om = NULL; P7_GMX *gx = NULL; P7_OMX *fwd = NULL; P7_TRACE *tr = NULL; int L = esl_opt_GetInteger(go, "-L"); int N = esl_opt_GetInteger(go, "-N"); ESL_DSQ *dsq = malloc(sizeof(ESL_DSQ) * (L+2)); int i; float sc, fsc, vsc; float bestsc = -eslINFINITY; if (p7_hmmfile_Open(hmmfile, NULL, &hfp) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile); if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM"); bg = p7_bg_Create(abc); p7_bg_SetLength(bg, L); gm = p7_profile_Create(hmm->M, abc); p7_ProfileConfig(hmm, bg, gm, L, p7_UNILOCAL); om = p7_oprofile_Create(gm->M, abc); p7_oprofile_Convert(gm, om); fwd = p7_omx_Create(gm->M, L, L); gx = p7_gmx_Create(gm->M, L); tr = p7_trace_Create(); esl_rsq_xfIID(r, bg->f, abc->K, L, dsq); p7_GViterbi(dsq, L, gm, gx, &vsc); p7_Forward (dsq, L, om, fwd, &fsc); esl_stopwatch_Start(w); for (i = 0; i < N; i++) { p7_StochasticTrace(r, dsq, L, om, fwd, tr); p7_trace_Score(tr, dsq, gm, &sc); bestsc = ESL_MAX(bestsc, sc); p7_trace_Reuse(tr); } esl_stopwatch_Stop(w); esl_stopwatch_Display(stdout, w, "# CPU time: "); printf("forward sc = %.4f nats\n", fsc); printf("viterbi sc = %.4f nats\n", vsc); printf("max trace sc = %.4f nats\n", bestsc); free(dsq); p7_trace_Destroy(tr); p7_gmx_Destroy(gx); p7_omx_Destroy(fwd); p7_oprofile_Destroy(om); p7_profile_Destroy(gm); p7_bg_Destroy(bg); p7_hmm_Destroy(hmm); p7_hmmfile_Close(hfp); esl_alphabet_Destroy(abc); esl_stopwatch_Destroy(w); esl_randomness_Destroy(r); esl_getopts_Destroy(go); return 0; }
/* Function: p7_domaindef_GlocalByPosteriorHeuristics() * Synopsis: Define glocal domains in a sequence using posterior probs. * Incept: EPN, Tue Oct 5 10:02:34 2010 * SRE, Sat Feb 23 08:17:44 2008 [Janelia] (p7_domaindef_ByPosteriorHeuristics()) * * Purpose: Given a sequence <sq> and model <gm> for which we have * already calculated a Forward and Backward parsing * matrices <gxf> and <gxb>; use posterior probability * heuristics to determine an annotated domain structure; * and for each domain found, score it (with null2 * calculations) and obtain an optimal accuracy alignment, * using <fwd> and <bck> matrices as workspace for the * necessary full-matrix DP calculations. Caller provides a * new or reused <ddef> object to hold these results. * * As a special case, if the profile is in unihit mode * upon entering, we don't ever modify its configuration. * This is especially important if this function is * being used within a search/scan pipeline with a * specially configured p7 profile in which N->N and/or * C->C transitions have been set to IMPOSSIBLE. (If * we were to call ReconfigLength() on such a profile * we would make those transitions possible.) * * One case in which profile reconfiguration is necessary * is when multiple domains are suspected. However, we * guard against this if the profile enters in unihit mode * by no allowing multiple domains (in fact, it should * never happen because J states are unreachable in unihit * profiles). If multiple domains are suspected in this case, * we return eslEINCONCEIVABLE. * * Upon return, <ddef> contains the definitions of all the * domains: their bounds, their null-corrected Forward * scores, and their optimal posterior accuracy alignments. * * <do_null2> is TRUE if we'll eventually apply a null2 * penalty FALSE if not. If FALSE, we can save time by * skipping Backward calls at some stages. * * Returns: <eslOK> on success. * * <eslERANGE> on numeric overflow in posterior * decoding. 
This should not be possible for multihit * models. * * <eslEINCONCEIVABLE> if profile enters as unihit but * multiple domains are suspected. */ int p7_domaindef_GlocalByPosteriorHeuristics(const ESL_SQ *sq, P7_PROFILE *gm, P7_GMX *gxf, P7_GMX *gxb, P7_GMX *fwd, P7_GMX *bck, P7_DOMAINDEF *ddef, int do_null2) { int i, j; int triggered; int d; int i2,j2; int last_j2; int nc; int saveL = gm->L; /* Save the length config of <om>; will restore upon return */ int save_mode = gm->mode; /* Likewise for the mode. */ int status; int save_mode_is_unihit; save_mode_is_unihit = (p7_IsMulti(save_mode)) ? FALSE : TRUE; /* if save_mode_is_unihit is TRUE, we never modify profile's configuration (length nor mode) */ if ((status = p7_domaindef_GrowTo(ddef, sq->n)) != eslOK) return status; /* ddef's btot,etot,mocc now ready for seq of length n */ /*printf("GDD P7 mode: %d\n", gm->mode);*/ if ((status = p7_GDomainDecoding(gm, gxf, gxb, ddef)) != eslOK) return status; /* ddef->{btot,etot,mocc} now made. */ /*printf("In p7_domaindef_GlocalByPosteriorHeuristics(): mode: %d rt1: %g rt2: %g rt3: %g nsamples: %d reseed: %d\n", save_mode, ddef->rt1, ddef->rt2, ddef->rt3, ddef->nsamples, ddef->do_reseeding);*/ esl_vec_FSet(ddef->n2sc, sq->n+1, 0.0); /* ddef->n2sc null2 scores are initialized */ ddef->nexpected = ddef->btot[sq->n]; /* posterior expectation for # of domains (same as etot[sq->n]) */ if(! save_mode_is_unihit) p7_ReconfigUnihit(gm, saveL); /* process each domain in unihit mode, regardless of gm->mode */ i = -1; triggered = FALSE; for (j = 1; j <= sq->n; j++) { /*printf("GDD j: %5d m: %.5f b: %8.3f e: %8.3f bhere: %8.3f ehere: %8.3f\n", j, ddef->mocc[j], ddef->btot[j], ddef->etot[j], ddef->btot[j] - ddef->btot[j-1], ddef->etot[j] - ddef->etot[j-1]); */ if (! 
triggered) { /* xref J2/101 for what the logic below is: */ if (ddef->mocc[j] - (ddef->btot[j] - ddef->btot[j-1]) < ddef->rt2) i = j; else if (i == -1) i = j; if (ddef->mocc[j] >= ddef->rt1) triggered = TRUE; } else if (ddef->mocc[j] - (ddef->etot[j] - ddef->etot[j-1]) < ddef->rt2) { /* We have a region i..j to evaluate. */ p7_gmx_GrowTo(fwd, gm->M, j-i+1); p7_gmx_GrowTo(bck, gm->M, j-i+1); ddef->nregions++; if (is_multidomain_region(ddef, i, j)) { if(save_mode_is_unihit) return eslEINCONCEIVABLE; /* This region appears to contain more than one domain, so we have to * resolve it by cluster analysis of posterior trace samples, to define * one or more domain envelopes. */ ddef->nclustered++; /* Resolve the region into domains by stochastic trace * clustering; assign position-specific null2 model by * stochastic trace clustering; there is redundancy * here; we will consolidate later if null2 strategy * works */ p7_ReconfigMultihit(gm, saveL); p7_GForward(sq->dsq+i-1, j-i+1, gm, fwd, NULL); glocal_region_trace_ensemble(ddef, gm, sq->dsq, i, j, fwd, bck, do_null2, &nc); p7_ReconfigUnihit(gm, saveL); /* ddef->n2sc is now set on i..j by the traceback-dependent method */ last_j2 = 0; for (d = 0; d < nc; d++) { p7_spensemble_GetClusterCoords(ddef->sp, d, &i2, &j2, NULL, NULL, NULL); if (i2 <= last_j2) ddef->noverlaps++; /* Note that k..m coords on model are available, but * we're currently ignoring them. This leads to a * rare clustering bug that we eventually need to fix * properly [xref J3/32]: two different regions in one * profile HMM might have hit same seq domain, and * when we now go to calculate an OA trace, nothing * constrains us to find the two different alignments * to the HMM; in fact, because OA is optimal, we'll * find one and the *same* alignment, leading to an * apparent duplicate alignment in the output. * * Registered as #h74, Dec 2009, after EBI finds and * reports it. 
#h74 is worked around in p7_tophits.c * by hiding all but one envelope with an identical * alignment, in the rare event that this * happens. [xref J5/130]. */ ddef->nenvelopes++; if (glocal_rescore_isolated_domain(ddef, gm, sq, fwd, bck, i2, j2, TRUE, do_null2, FALSE) == eslOK) last_j2 = j2; } p7_spensemble_Reuse(ddef->sp); p7_trace_Reuse(ddef->tr); } else { /* The region looks simple, single domain; convert the region to an envelope. */ ddef->nenvelopes++; glocal_rescore_isolated_domain(ddef, gm, sq, fwd, bck, i, j, FALSE, do_null2, FALSE); } i = -1; triggered = FALSE; } } /* If profile was unihit upon entrance, we didn't modify its configuration (length nor mode), * else restore it to its original multihit mode, and to its original length model */ if (! save_mode_is_unihit) { p7_ReconfigMultihit(gm, saveL); } return eslOK; }