/* compare results to GDecoding(). */ static void utest_decoding(ESL_RANDOMNESS *r, ESL_ALPHABET *abc, P7_BG *bg, int M, int L, int N, float tolerance) { char *msg = "decoding unit test failed"; P7_HMM *hmm = NULL; P7_PROFILE *gm = NULL; P7_OPROFILE *om = NULL; ESL_DSQ *dsq = malloc(sizeof(ESL_DSQ) * (L+2)); P7_OMX *fwd = p7_omx_Create(M, L, L); P7_OMX *bck = p7_omx_Create(M, L, L); P7_OMX *pp = p7_omx_Create(M, L, L); P7_GMX *gxf = p7_gmx_Create(M, L); P7_GMX *gxb = p7_gmx_Create(M, L); P7_GMX *gxp2 = p7_gmx_Create(M, L); float fsc1, fsc2; float bsc1, bsc2; if (p7_oprofile_Sample(r, abc, bg, M, L, &hmm, &gm, &om) != eslOK) esl_fatal(msg); while (N--) { if (esl_rsq_xfIID(r, bg->f, abc->K, L, dsq) != eslOK) esl_fatal(msg); if (p7_Forward (dsq, L, om, fwd, &fsc1) != eslOK) esl_fatal(msg); if (p7_Backward (dsq, L, om, fwd, bck, &bsc1) != eslOK) esl_fatal(msg); if (p7_Decoding(om, fwd, bck, pp) != eslOK) esl_fatal(msg); if (p7_GForward (dsq, L, gm, gxf, &fsc2) != eslOK) esl_fatal(msg); if (p7_GBackward(dsq, L, gm, gxb, &bsc2) != eslOK) esl_fatal(msg); if (p7_GDecoding(gm, gxf, gxb, gxp2) != eslOK) esl_fatal(msg); if (p7_gmx_Compare(pp, gxp2, tolerance) != eslOK) esl_fatal(msg); } p7_gmx_Destroy(gxp2); p7_gmx_Destroy(gxf); p7_gmx_Destroy(gxb); p7_omx_Destroy(fwd); p7_omx_Destroy(bck); p7_omx_Destroy(pp); free(dsq); p7_oprofile_Destroy(om); p7_profile_Destroy(gm); p7_hmm_Destroy(hmm); }
/* Forward is hard to validate. * We do know that the Forward score is >= Viterbi. * We also know that the expected score on random seqs is <= 0 (not * exactly - we'd have to sample the random length from the background * model too, not just use a fixed L - but it's close enough to * being true to be a useful test.) */ static void utest_forward(ESL_GETOPTS *go, ESL_RANDOMNESS *r, ESL_ALPHABET *abc, P7_BG *bg, P7_PROFILE *gm, int nseq, int L) { float avg_sc; ESL_DSQ *dsq = NULL; P7_GMX *fwd = NULL; P7_GMX *bck = NULL; int idx; float fsc, bsc; float vsc, nullsc; if ((dsq = malloc(sizeof(ESL_DSQ) *(L+2))) == NULL) esl_fatal("malloc failed"); if ((fwd = p7_gmx_Create(gm->M, L)) == NULL) esl_fatal("matrix creation failed"); if ((bck = p7_gmx_Create(gm->M, L)) == NULL) esl_fatal("matrix creation failed"); avg_sc = 0.; for (idx = 0; idx < nseq; idx++) { if (esl_rsq_xfIID(r, bg->f, abc->K, L, dsq) != eslOK) esl_fatal("seq generation failed"); if (p7_GViterbi(dsq, L, gm, fwd, &vsc) != eslOK) esl_fatal("viterbi failed"); if (p7_GForward(dsq, L, gm, fwd, &fsc) != eslOK) esl_fatal("forward failed"); if (p7_GBackward(dsq, L, gm, bck, &bsc) != eslOK) esl_fatal("backward failed"); if (fsc < vsc) esl_fatal("Foward score can't be less than Viterbi score"); if (fabs(fsc-bsc) > 0.001) esl_fatal("Forward/Backward failed: %f %f\n", fsc, bsc); if (p7_bg_NullOne(bg, dsq, L, &nullsc) != eslOK) esl_fatal("null score failed"); avg_sc += fsc - nullsc; if (esl_opt_GetBoolean(go, "--vv")) printf("utest_forward: Forward score: %.4f (total so far: %.4f)\n", fsc, avg_sc); } avg_sc /= (float) nseq; if (avg_sc > 0.) esl_fatal("Forward scores have positive expectation (%f nats)", avg_sc); p7_gmx_Destroy(fwd); p7_gmx_Destroy(bck); free(dsq); return; }
static void utest_correct_normalization(ESL_RANDOMNESS *r, P7_PROFILE *gm, P7_BG *bg, ESL_DSQ *dsq, int L, P7_GMX *fwd, P7_GMX *bck) { char *msg = "normalization unit test failed"; float null2[p7_MAXABET]; float sum; int x; esl_rsq_xfIID(r, bg->f, gm->abc->K, L, dsq); /* sample a random digital seq of length L */ p7_GForward (dsq, L, gm, fwd, NULL); p7_GBackward(dsq, L, gm, bck, NULL); p7_PosteriorNull2(L, gm, fwd, bck, bck); /* <bck> now contains posterior probs */ p7_Null2Corrections(gm, dsq, L, 0, bck, fwd, null2, NULL, NULL); /* use <fwd> as workspace */ /* Convert null2 from lod score to frequencies f_d */ for (x = 0; x < gm->abc->K; x++) null2[x] = exp(null2[x]) * bg->f[x]; sum = esl_vec_FSum(null2, gm->abc->K); if (sum < 0.99 || sum > 1.01) esl_fatal(msg); }
/* The "generation" test scores sequences generated by the same profile. * Each Viterbi and Forward score should be >= the trace score of the emitted seq. * The expectation of Forward scores should be positive. */ static void utest_generation(ESL_GETOPTS *go, ESL_RANDOMNESS *r, ESL_ALPHABET *abc, P7_PROFILE *gm, P7_HMM *hmm, P7_BG *bg, int nseq) { ESL_SQ *sq = esl_sq_CreateDigital(abc); P7_GMX *gx = p7_gmx_Create(gm->M, 100); P7_TRACE *tr = p7_trace_Create(); float vsc, fsc, nullsc, tracesc; float avg_fsc; int idx; avg_fsc = 0.0; for (idx = 0; idx < nseq; idx++) { if (p7_ProfileEmit(r, hmm, gm, bg, sq, tr) != eslOK) esl_fatal("profile emission failed"); if (p7_gmx_GrowTo(gx, gm->M, sq->n) != eslOK) esl_fatal("failed to reallocate gmx"); if (p7_GViterbi(sq->dsq, sq->n, gm, gx, &vsc) != eslOK) esl_fatal("viterbi failed"); if (p7_GForward(sq->dsq, sq->n, gm, gx, &fsc) != eslOK) esl_fatal("forward failed"); if (p7_trace_Score(tr, sq->dsq, gm, &tracesc) != eslOK) esl_fatal("trace score failed"); if (p7_bg_NullOne(bg, sq->dsq, sq->n, &nullsc) != eslOK) esl_fatal("null score failed"); if (vsc < tracesc) esl_fatal("viterbi score is less than trace"); if (fsc < tracesc) esl_fatal("forward score is less than trace"); if (vsc > fsc) esl_fatal("viterbi score is greater than forward"); if (esl_opt_GetBoolean(go, "--vv")) printf("generated: len=%d v=%8.4f f=%8.4f t=%8.4f\n", (int) sq->n, vsc, fsc, tracesc); avg_fsc += (fsc - nullsc); } avg_fsc /= (float) nseq; if (avg_fsc < 0.) esl_fatal("generation: Forward scores have negative expectation (%f nats)", avg_fsc); p7_gmx_Destroy(gx); p7_trace_Destroy(tr); esl_sq_Destroy(sq); }
void run_hmmer_pipeline(const char* seq) { int index, i, status; ESL_SQ* sq = esl_sq_CreateFrom(NULL, seq, NULL, NULL, NULL); P7_OPROFILE *om = NULL; P7_PROFILE *gm = NULL; float usc, vfsc, fwdsc; /* filter scores */ float filtersc; /* HMM null filter score */ float nullsc; /* null model score */ float seqbias; float seq_score; /* the corrected per-seq bit score */ double P; WRAPPER_RESULT* result; num_results = 0; if(sq->n == 0) { esl_sq_Destroy(sq); return; } esl_sq_Digitize(abc, sq); int n = 0; float oasc; for(index = 0;index < num_models;index++) { om = models[index]; p7_omx_Reuse(oxf); p7_omx_Reuse(oxb); p7_omx_GrowTo(oxf, om->M, sq->n, sq->n); p7_omx_GrowTo(oxb, om->M, sq->n, sq->n); p7_oprofile_ReconfigLength(om, sq->n); p7_bg_SetFilter(bg, om->M, om->compo); p7_bg_SetLength(bg, sq->n); //Calibrate null model p7_bg_NullOne(bg, sq->dsq, sq->n, &nullsc); //MSV Filter p7_MSVFilter(sq->dsq, sq->n, om, oxf, &usc); seq_score = (usc - nullsc) / eslCONST_LOG2; P = esl_gumbel_surv(seq_score, om->evparam[p7_MMU], om->evparam[p7_MLAMBDA]); if (P > f1) continue; //Bias filter (model compo) p7_bg_FilterScore(bg, sq->dsq, sq->n, &filtersc); seq_score = (usc - filtersc) / eslCONST_LOG2; P = esl_gumbel_surv(seq_score, om->evparam[p7_MMU], om->evparam[p7_MLAMBDA]); if (P > f1) continue; //Viterbi filter (Only do if P value from Bias is high) if(P > f2) { p7_ViterbiFilter(sq->dsq, sq->n, om, oxf, &vfsc); seq_score = (vfsc - filtersc) / eslCONST_LOG2; P = esl_gumbel_surv(seq_score, om->evparam[p7_VMU], om->evparam[p7_VLAMBDA]); if (P > f2) continue; } //Get the real probability (forward) p7_Forward(sq->dsq, sq->n, om, oxf, &fwdsc); seq_score = (fwdsc - filtersc) / eslCONST_LOG2; P = esl_exp_surv(seq_score, om->evparam[p7_FTAU], om->evparam[p7_FLAMBDA]); if(hmmer_error) { fprintf(stderr, "HMM: %s, seq: %s", om->name, seq); hmmer_error = 0; continue; } if (P > f3) continue; //Real hit, go in to posterior decoding and alignment p7_omx_Reuse(oxb); p7_trace_Reuse(tr); p7_Backward(sq->dsq, sq->n, om, oxf, oxb, NULL); status = p7_Decoding(om, oxf, oxb, oxb); if(status == eslOK) { //And then trace the result p7_OptimalAccuracy(om, oxb, oxf, &oasc); p7_OATrace(om, oxb, oxf, tr); } else if(status == eslERANGE) { fprintf(stderr, "Decoding overflow on model %s\n", om->name); gm = gmodels[index]; if(gxf == NULL) { gxf = p7_gmx_Create(gm->M, sq->n); gxb = p7_gmx_Create(gm->M, sq->n); } else { p7_gmx_GrowTo(gxf, gm->M, sq->n); p7_gmx_GrowTo(gxb, gm->M, sq->n); } p7_ReconfigLength(gm, sq->n); p7_GForward (sq->dsq, sq->n, gm, gxf, &fwdsc); p7_GBackward(sq->dsq, sq->n, gm, gxb, NULL); p7_GDecoding(gm, gxf, gxb, gxb); p7_GOptimalAccuracy(gm, gxb, gxf, &oasc); p7_GOATrace (gm, gxb, gxf, tr); p7_gmx_Reuse(gxf); p7_gmx_Reuse(gxb); } if(hmmer_error) { fprintf(stderr, "HMM: %s, seq: %s", om->name, seq); hmmer_error = 0; continue; } result = wrapper_results[num_results]; reuse_result(result, tr->N + om->M, om->name); //We're way overallocating here, but it's hard to know at this point how much space we'll need for the alignment (plus leading and trailing gaps) trace_into(tr, result, sq, abc, om->M); result->bits = seq_score; num_results++; } esl_sq_Destroy(sq); }
int main(int argc, char **argv) { ESL_GETOPTS *go = p7_CreateDefaultApp(options, 2, argc, argv, banner, usage); char *hmmfile = esl_opt_GetArg(go, 1); char *seqfile = esl_opt_GetArg(go, 2); ESL_ALPHABET *abc = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; P7_PROFILE *gm = NULL; P7_GMX *gx1 = NULL; P7_GMX *gx2 = NULL; ESL_SQ *sq = NULL; ESL_SQFILE *sqfp = NULL; P7_TRACE *tr = NULL; int format = eslSQFILE_UNKNOWN; char errbuf[eslERRBUFSIZE]; float fsc, bsc, vsc; float accscore; int status; /* Read in one HMM */ if (p7_hmmfile_OpenE(hmmfile, NULL, &hfp, NULL) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile); if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM"); p7_hmmfile_Close(hfp); /* Read in one sequence */ sq = esl_sq_CreateDigital(abc); status = esl_sqfile_OpenDigital(abc, seqfile, format, NULL, &sqfp); if (status == eslENOTFOUND) p7_Fail("No such file."); else if (status == eslEFORMAT) p7_Fail("Format unrecognized."); else if (status == eslEINVAL) p7_Fail("Can't autodetect stdin or .gz."); else if (status != eslOK) p7_Fail("Open failed, code %d.", status); if (esl_sqio_Read(sqfp, sq) != eslOK) p7_Fail("Failed to read sequence"); esl_sqfile_Close(sqfp); /* Configure a profile from the HMM */ bg = p7_bg_Create(abc); p7_bg_SetLength(bg, sq->n); gm = p7_profile_Create(hmm->M, abc); p7_ProfileConfig(hmm, bg, gm, sq->n, p7_LOCAL); /* multihit local: H3 default */ /* Allocations */ gx1 = p7_gmx_Create(gm->M, sq->n); gx2 = p7_gmx_Create(gm->M, sq->n); tr = p7_trace_CreateWithPP(); p7_FLogsumInit(); /* Run Forward, Backward; do OA fill and trace */ p7_GForward (sq->dsq, sq->n, gm, gx1, &fsc); p7_GBackward(sq->dsq, sq->n, gm, gx2, &bsc); p7_GDecoding(gm, gx1, gx2, gx2); /* <gx2> is now the posterior decoding matrix */ p7_GOptimalAccuracy(gm, gx2, gx1, &accscore); /* <gx1> is now the OA matrix */ p7_GOATrace(gm, gx2, gx1, tr); if (esl_opt_GetBoolean(go, "-d")) p7_gmx_Dump(stdout, gx2, p7_DEFAULT); if (esl_opt_GetBoolean(go, "-m")) p7_gmx_Dump(stdout, gx1, p7_DEFAULT); p7_trace_Dump(stdout, tr, gm, sq->dsq); if (p7_trace_Validate(tr, abc, sq->dsq, errbuf) != eslOK) p7_Die("trace fails validation:\n%s\n", errbuf); printf("fwd = %.4f nats\n", fsc); printf("bck = %.4f nats\n", bsc); printf("acc = %.4f (%.2f%%)\n", accscore, accscore * 100. / (float) sq->n); p7_trace_Reuse(tr); p7_GViterbi(sq->dsq, sq->n, gm, gx1, &vsc); p7_GTrace (sq->dsq, sq->n, gm, gx1, tr); p7_trace_SetPP(tr, gx2); p7_trace_Dump(stdout, tr, gm, sq->dsq); printf("vit = %.4f nats\n", vsc); printf("acc = %.4f\n", p7_trace_GetExpectedAccuracy(tr)); /* Cleanup */ esl_sq_Destroy(sq); p7_trace_Destroy(tr); p7_gmx_Destroy(gx1); p7_gmx_Destroy(gx2); p7_profile_Destroy(gm); p7_bg_Destroy(bg); p7_hmm_Destroy(hmm); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return 0; }
int main(int argc, char **argv) { ESL_GETOPTS *go = p7_CreateDefaultApp(options, 1, argc, argv, banner, usage); char *hmmfile = esl_opt_GetArg(go, 1); ESL_STOPWATCH *w = esl_stopwatch_Create(); ESL_RANDOMNESS *r = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s")); ESL_ALPHABET *abc = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; P7_PROFILE *gm = NULL; P7_GMX *gx1 = NULL; P7_GMX *gx2 = NULL; P7_TRACE *tr = NULL; int L = esl_opt_GetInteger(go, "-L"); int N = esl_opt_GetInteger(go, "-N"); ESL_DSQ *dsq = malloc(sizeof(ESL_DSQ) * (L+2)); int i; float fsc, bsc, accscore; double Mcs; p7_FLogsumInit(); if (p7_hmmfile_OpenE(hmmfile, NULL, &hfp, NULL) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile); if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM"); bg = p7_bg_Create(abc); p7_bg_SetLength(bg, L); gm = p7_profile_Create(hmm->M, abc); p7_ProfileConfig(hmm, bg, gm, L, p7_UNILOCAL); gx1 = p7_gmx_Create(gm->M, L); gx2 = p7_gmx_Create(gm->M, L); tr = p7_trace_CreateWithPP(); esl_rsq_xfIID(r, bg->f, abc->K, L, dsq); p7_GForward (dsq, L, gm, gx1, &fsc); p7_GBackward(dsq, L, gm, gx2, &bsc); p7_GDecoding(gm, gx1, gx2, gx2); /* <gx2> is now the posterior decoding matrix */ esl_stopwatch_Start(w); for (i = 0; i < N; i++) { p7_GOptimalAccuracy(gm, gx2, gx1, &accscore); /* <gx1> is now the OA matrix */ if (! esl_opt_GetBoolean(go, "--notrace")) { p7_GOATrace(gm, gx2, gx1, tr); p7_trace_Reuse(tr); } } esl_stopwatch_Stop(w); Mcs = (double) N * (double) L * (double) gm->M * 1e-6 / w->user; esl_stopwatch_Display(stdout, w, "# CPU time: "); printf("# M = %d\n", gm->M); printf("# %.1f Mc/s\n", Mcs); free(dsq); p7_trace_Destroy(tr); p7_gmx_Destroy(gx1); p7_gmx_Destroy(gx2); p7_profile_Destroy(gm); p7_bg_Destroy(bg); p7_hmm_Destroy(hmm); p7_hmmfile_Close(hfp); esl_alphabet_Destroy(abc); esl_stopwatch_Destroy(w); esl_randomness_Destroy(r); esl_getopts_Destroy(go); return 0; }
/* * 1. Compare accscore to GOptimalAccuracy(). * 2. Compare trace to GOATrace(). * * Note: This test is subject to some expected noise and can fail * for entirely innocent reasons. Generic Forward/Backward calculations with * p7_GForward(), p7_GBackward() use coarse-grain table lookups to sum * log probabilities, and sufficient roundoff error can accumulate to * change the optimal accuracy traceback, causing this test to fail. * So, if optacc_utest fails, before you go looking for bugs, first * go to ../logsum.c, change the #ifdef to activate the slow/accurate * version, recompile and rerun optacc_utest. If the failure goes away, * you can ignore it. - SRE, Wed Dec 17 09:45:31 2008 */ static void utest_optacc(ESL_GETOPTS *go, ESL_RANDOMNESS *r, ESL_ALPHABET *abc, P7_BG *bg, int M, int L, int N) { char *msg = "optimal accuracy unit test failed"; P7_HMM *hmm = NULL; P7_PROFILE *gm = NULL; P7_OPROFILE *om = NULL; ESL_SQ *sq = esl_sq_CreateDigital(abc); P7_OMX *ox1 = p7_omx_Create(M, L, L); P7_OMX *ox2 = p7_omx_Create(M, L, L); P7_GMX *gx1 = p7_gmx_Create(M, L); P7_GMX *gx2 = p7_gmx_Create(M, L); P7_TRACE *tr = p7_trace_CreateWithPP(); P7_TRACE *trg = p7_trace_CreateWithPP(); P7_TRACE *tro = p7_trace_CreateWithPP(); float accscore_o; float fsc, bsc, accscore; float fsc_g, bsc_g, accscore_g, accscore_g2; float pptol = 0.01; float sctol = 0.001; float gtol; p7_FLogsumInit(); gtol = ( (p7_FLogsumError(-0.4, -0.5) > 0.0001) ? 0.1 : 0.001); if (p7_oprofile_Sample(r, abc, bg, M, L, &hmm, &gm, &om)!= eslOK) esl_fatal(msg); while (N--) { if (p7_ProfileEmit(r, hmm, gm, bg, sq, tro) != eslOK) esl_fatal(msg); if (p7_omx_GrowTo(ox1, M, sq->n, sq->n) != eslOK) esl_fatal(msg); if (p7_omx_GrowTo(ox2, M, sq->n, sq->n) != eslOK) esl_fatal(msg); if (p7_gmx_GrowTo(gx1, M, sq->n) != eslOK) esl_fatal(msg); if (p7_gmx_GrowTo(gx2, M, sq->n) != eslOK) esl_fatal(msg); if (p7_Forward (sq->dsq, sq->n, om, ox1, &fsc) != eslOK) esl_fatal(msg); if (p7_Backward(sq->dsq, sq->n, om, ox1, ox2, &bsc) != eslOK) esl_fatal(msg); if (p7_Decoding(om, ox1, ox2, ox2) != eslOK) esl_fatal(msg); if (p7_OptimalAccuracy(om, ox2, ox1, &accscore) != eslOK) esl_fatal(msg); #if 0 p7_omx_FDeconvert(ox1, gx1); p7_gmx_Dump(stdout, gx1, p7_DEFAULT); p7_omx_FDeconvert(ox2, gx1); p7_gmx_Dump(stdout, gx1, p7_DEFAULT); #endif if (p7_OATrace(om, ox2, ox1, tr) != eslOK) esl_fatal(msg); if (p7_GForward (sq->dsq, sq->n, gm, gx1, &fsc_g) != eslOK) esl_fatal(msg); if (p7_GBackward(sq->dsq, sq->n, gm, gx2, &bsc_g) != eslOK) esl_fatal(msg); #if 0 p7_gmx_Dump(stdout, gx1, p7_DEFAULT); /* fwd */ p7_gmx_Dump(stdout, gx2, p7_DEFAULT); /* bck */ #endif if (p7_GDecoding(gm, gx1, gx2, gx2) != eslOK) esl_fatal(msg); if (p7_GOptimalAccuracy(gm, gx2, gx1, &accscore_g) != eslOK) esl_fatal(msg); #if 0 p7_gmx_Dump(stdout, gx1, p7_DEFAULT); /* oa */ p7_gmx_Dump(stdout, gx2, p7_DEFAULT); /* pp */ #endif if (p7_GOATrace(gm, gx2, gx1, trg) != eslOK) esl_fatal(msg); if (p7_trace_SetPP(tro, gx2) != eslOK) esl_fatal(msg); if (esl_opt_GetBoolean(go, "--traces")) { p7_trace_Dump(stdout, tro, gm, sq->dsq); p7_trace_Dump(stdout, tr, gm, sq->dsq); p7_trace_Dump(stdout, trg, gm, sq->dsq); } if (p7_trace_Validate(tr, abc, sq->dsq, NULL) != eslOK) esl_fatal(msg); if (p7_trace_Validate(trg, abc, sq->dsq, NULL) != eslOK) esl_fatal(msg); if (p7_trace_Compare(tr, trg, pptol) != eslOK) esl_fatal(msg); accscore_o = p7_trace_GetExpectedAccuracy(tro); /* according to gx2; see p7_trace_SetPP() call above */ accscore_g2 = p7_trace_GetExpectedAccuracy(trg); #if 0 printf("%f %f %f %f\n", accscore, accscore_g, accscore_g2, accscore_o); #endif if (esl_FCompare(fsc, bsc, sctol) != eslOK) esl_fatal(msg); if (esl_FCompare(fsc_g, bsc_g, gtol) != eslOK) esl_fatal(msg); if (esl_FCompare(fsc, fsc_g, gtol) != eslOK) esl_fatal(msg); if (esl_FCompare(accscore, accscore_g, gtol) != eslOK) esl_fatal(msg); if (esl_FCompare(accscore_g, accscore_g2, gtol) != eslOK) esl_fatal(msg); if (accscore_g2 < accscore_o) esl_fatal(msg); /* the above deserves explanation: * - accscore_o is the accuracy of the originally emitted trace, according * to the generic posterior decoding matrix <gx2>. This is a lower bound * on the expected # of accurately aligned residues found by a DP * optimization. * - accscore is the accuracy found by the fast (vector) code DP implementation. * - accscore_g is the accuracy found by the generic DP implementation. * accscore and accscore_g should be nearly identical, * within tolerance of roundoff error accumulation and * the imprecision of Logsum() tables. * - accscore_g2 is the accuracy of the traceback identified by the generic * DP implementation. It should be identical (within order-of-evaluation * roundoff error) to accscore_g. * * the "accscore_g2 < accscore_o" test is carefully contrived. * accscore_o is a theoretical lower bound but because of fp error, * accscore and (much more rarely) even accscore_g can exceed accscore_o. * accscore_g2, however, is calculated with identical order of evaluation * as accscore_o if the optimal trace does turn out to be identical to * the originally emitted trace. It should be extremely unlikely (though * not impossible) for accscore_o to exceed accscore_g2. (The DP algorithm * would have to identify a trace that was different than the original trace, * which the DP algorithm, by order-of-evaluation, assigned higher accuracy, * but order-of-evaluation in traceback dependent code assigned lower accuracy. * [xref J5/29] */ esl_sq_Reuse(sq); p7_trace_Reuse(tr); p7_trace_Reuse(trg); p7_trace_Reuse(tro); } p7_trace_Destroy(tro); p7_trace_Destroy(trg); p7_trace_Destroy(tr); p7_gmx_Destroy(gx2); p7_gmx_Destroy(gx1); p7_omx_Destroy(ox2); p7_omx_Destroy(ox1); esl_sq_Destroy(sq); p7_oprofile_Destroy(om); p7_profile_Destroy(gm); p7_hmm_Destroy(hmm); }
int main(int argc, char **argv) { ESL_GETOPTS *go = p7_CreateDefaultApp(options, 1, argc, argv, banner, usage); char *hmmfile = esl_opt_GetArg(go, 1); ESL_STOPWATCH *w = esl_stopwatch_Create(); ESL_RANDOMNESS *r = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s")); ESL_ALPHABET *abc = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; P7_PROFILE *gm = NULL; P7_OPROFILE *om = NULL; P7_GMX *gx1 = NULL; P7_GMX *gx2 = NULL; P7_OMX *ox1 = NULL; P7_OMX *ox2 = NULL; P7_TRACE *tr = NULL; int L = esl_opt_GetInteger(go, "-L"); int N = esl_opt_GetInteger(go, "-N"); ESL_DSQ *dsq = malloc(sizeof(ESL_DSQ) * (L+2)); int i; float fsc, bsc, accscore; float fsc_g, bsc_g, accscore_g; double Mcs; p7_FLogsumInit(); if (p7_hmmfile_OpenE(hmmfile, NULL, &hfp, NULL) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile); if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM"); bg = p7_bg_Create(abc); p7_bg_SetLength(bg, L); gm = p7_profile_Create(hmm->M, abc); p7_ProfileConfig(hmm, bg, gm, L, p7_LOCAL); om = p7_oprofile_Create(gm->M, abc); p7_oprofile_Convert(gm, om); p7_oprofile_ReconfigLength(om, L); if (esl_opt_GetBoolean(go, "-x") && p7_FLogsumError(-0.4, -0.5) > 0.0001) p7_Fail("-x here requires p7_Logsum() recompiled in slow exact mode"); ox1 = p7_omx_Create(gm->M, L, L); ox2 = p7_omx_Create(gm->M, L, L); tr = p7_trace_CreateWithPP(); esl_rsq_xfIID(r, bg->f, abc->K, L, dsq); p7_Forward (dsq, L, om, ox1, &fsc); p7_Backward(dsq, L, om, ox1, ox2, &bsc); p7_Decoding(om, ox1, ox2, ox2); esl_stopwatch_Start(w); for (i = 0; i < N; i++) { p7_OptimalAccuracy(om, ox2, ox1, &accscore); if (! esl_opt_GetBoolean(go, "--notrace")) { p7_OATrace(om, ox2, ox1, tr); p7_trace_Reuse(tr); } } esl_stopwatch_Stop(w); Mcs = (double) N * (double) L * (double) gm->M * 1e-6 / (double) w->user; esl_stopwatch_Display(stdout, w, "# CPU time: "); printf("# M = %d\n", gm->M); printf("# %.1f Mc/s\n", Mcs); if (esl_opt_GetBoolean(go, "-c") || esl_opt_GetBoolean(go, "-x") ) { gx1 = p7_gmx_Create(gm->M, L); gx2 = p7_gmx_Create(gm->M, L); p7_GForward (dsq, L, gm, gx1, &fsc_g); p7_GBackward(dsq, L, gm, gx2, &bsc_g); p7_GDecoding(gm, gx1, gx2, gx2); p7_GOptimalAccuracy(gm, gx2, gx1, &accscore_g); printf("generic: fwd=%8.4f bck=%8.4f acc=%8.4f\n", fsc_g, bsc_g, accscore_g); printf("VMX: fwd=%8.4f bck=%8.4f acc=%8.4f\n", fsc, bsc, accscore); p7_gmx_Destroy(gx1); p7_gmx_Destroy(gx2); } free(dsq); p7_omx_Destroy(ox1); p7_omx_Destroy(ox2); p7_trace_Destroy(tr); p7_oprofile_Destroy(om); p7_profile_Destroy(gm); p7_bg_Destroy(bg); p7_hmm_Destroy(hmm); p7_hmmfile_Close(hfp); esl_alphabet_Destroy(abc); esl_stopwatch_Destroy(w); esl_randomness_Destroy(r); esl_getopts_Destroy(go); return 0; }
int main(int argc, char **argv) { ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage); char *hmmfile = esl_opt_GetArg(go, 1); ESL_STOPWATCH *w = esl_stopwatch_Create(); ESL_RANDOMNESS *r = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s")); ESL_ALPHABET *abc = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; P7_PROFILE *gm = NULL; P7_OPROFILE *om = NULL; P7_GMX *gx = NULL; P7_OMX *fwd = NULL; P7_OMX *bck = NULL; int L = esl_opt_GetInteger(go, "-L"); int N = esl_opt_GetInteger(go, "-N"); ESL_DSQ *dsq = malloc(sizeof(ESL_DSQ) * (L+2)); int i; float fsc, bsc; float fsc2, bsc2; double base_time, bench_time, Mcs; p7_FLogsumInit(); if (p7_hmmfile_Open(hmmfile, NULL, &hfp) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile); if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM"); bg = p7_bg_Create(abc); p7_bg_SetLength(bg, L); gm = p7_profile_Create(hmm->M, abc); p7_ProfileConfig(hmm, bg, gm, L, p7_LOCAL); om = p7_oprofile_Create(gm->M, abc); p7_oprofile_Convert(gm, om); p7_oprofile_ReconfigLength(om, L); if (esl_opt_GetBoolean(go, "-x") && p7_FLogsumError(-0.4, -0.5) > 0.0001) p7_Fail("-x here requires p7_Logsum() recompiled in slow exact mode"); if (esl_opt_GetBoolean(go, "-P")) { fwd = p7_omx_Create(gm->M, 0, L); bck = p7_omx_Create(gm->M, 0, L); } else { fwd = p7_omx_Create(gm->M, L, L); bck = p7_omx_Create(gm->M, L, L); } gx = p7_gmx_Create(gm->M, L); /* Get a baseline time: how long it takes just to generate the sequences */ esl_stopwatch_Start(w); for (i = 0; i < N; i++) esl_rsq_xfIID(r, bg->f, abc->K, L, dsq); esl_stopwatch_Stop(w); base_time = w->user; esl_stopwatch_Start(w); for (i = 0; i < N; i++) { esl_rsq_xfIID(r, bg->f, abc->K, L, dsq); if (esl_opt_GetBoolean(go, "-P")) { if (! esl_opt_GetBoolean(go, "-B")) p7_ForwardParser (dsq, L, om, fwd, &fsc); if (! esl_opt_GetBoolean(go, "-F")) p7_BackwardParser(dsq, L, om, fwd, bck, &bsc); } else { if (! esl_opt_GetBoolean(go, "-B")) p7_Forward (dsq, L, om, fwd, &fsc); if (! esl_opt_GetBoolean(go, "-F")) p7_Backward(dsq, L, om, fwd, bck, &bsc); } if (esl_opt_GetBoolean(go, "-c") || esl_opt_GetBoolean(go, "-x")) { p7_GForward (dsq, L, gm, gx, &fsc2); p7_GBackward(dsq, L, gm, gx, &bsc2); printf("%.4f %.4f %.4f %.4f\n", fsc, bsc, fsc2, bsc2); } } esl_stopwatch_Stop(w); bench_time = w->user - base_time; Mcs = (double) N * (double) L * (double) gm->M * 1e-6 / (double) bench_time; esl_stopwatch_Display(stdout, w, "# CPU time: "); printf("# M = %d\n", gm->M); printf("# %.1f Mc/s\n", Mcs); free(dsq); p7_omx_Destroy(bck); p7_omx_Destroy(fwd); p7_gmx_Destroy(gx); p7_oprofile_Destroy(om); p7_profile_Destroy(gm); p7_bg_Destroy(bg); p7_hmm_Destroy(hmm); p7_hmmfile_Close(hfp); esl_alphabet_Destroy(abc); esl_stopwatch_Destroy(w); esl_randomness_Destroy(r); esl_getopts_Destroy(go); return 0; }
int main(int argc, char **argv) { ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 2, argc, argv, banner, usage); char *hmmfile = esl_opt_GetArg(go, 1); char *seqfile = esl_opt_GetArg(go, 2); ESL_ALPHABET *abc = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; P7_PROFILE *gm = NULL; P7_OPROFILE *om = NULL; P7_GMX *gx = NULL; P7_OMX *fwd = NULL; P7_OMX *bck = NULL; ESL_SQ *sq = NULL; ESL_SQFILE *sqfp = NULL; int format = eslSQFILE_UNKNOWN; float fraw, braw, nullsc, fsc; float gfraw, gbraw, gfsc; double P, gP; int status; /* Read in one HMM */ if (p7_hmmfile_Open(hmmfile, NULL, &hfp) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile); if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM"); /* Open sequence file for reading */ sq = esl_sq_CreateDigital(abc); status = esl_sqfile_Open(seqfile, format, NULL, &sqfp); if (status == eslENOTFOUND) p7_Fail("No such file."); else if (status == eslEFORMAT) p7_Fail("Format unrecognized."); else if (status == eslEINVAL) p7_Fail("Can't autodetect stdin or .gz."); else if (status != eslOK) p7_Fail("Open failed, code %d.", status); /* create default null model, then create and optimize profile */ bg = p7_bg_Create(abc); p7_bg_SetLength(bg, sq->n); gm = p7_profile_Create(hmm->M, abc); p7_ProfileConfig(hmm, bg, gm, sq->n, p7_UNILOCAL); om = p7_oprofile_Create(gm->M, abc); p7_oprofile_Convert(gm, om); /* p7_oprofile_Dump(stdout, om); */ /* allocate DP matrices for O(M+L) parsers */ fwd = p7_omx_Create(gm->M, 0, sq->n); bck = p7_omx_Create(gm->M, 0, sq->n); gx = p7_gmx_Create(gm->M, sq->n); /* allocate DP matrices for O(ML) fills */ /* fwd = p7_omx_Create(gm->M, sq->n, sq->n); */ /* bck = p7_omx_Create(gm->M, sq->n, sq->n); */ /* p7_omx_SetDumpMode(stdout, fwd, TRUE); */ /* makes the fast DP algorithms dump their matrices */ /* p7_omx_SetDumpMode(stdout, bck, TRUE); */ while ((status = esl_sqio_Read(sqfp, sq)) == eslOK) { p7_oprofile_ReconfigLength(om, sq->n); p7_ReconfigLength(gm, sq->n); p7_bg_SetLength(bg, sq->n); p7_omx_GrowTo(fwd, om->M, 0, sq->n); p7_omx_GrowTo(bck, om->M, 0, sq->n); p7_gmx_GrowTo(gx, gm->M, sq->n); p7_bg_NullOne (bg, sq->dsq, sq->n, &nullsc); p7_ForwardParser (sq->dsq, sq->n, om, fwd, &fraw); p7_BackwardParser(sq->dsq, sq->n, om, fwd, bck, &braw); /* p7_Forward (sq->dsq, sq->n, om, fwd, &fsc); printf("forward: %.2f nats\n", fsc); */ /* p7_Backward(sq->dsq, sq->n, om, fwd, bck, &bsc); printf("backward: %.2f nats\n", bsc); */ /* Comparison to other F/B implementations */ p7_GForward (sq->dsq, sq->n, gm, gx, &gfraw); p7_GBackward (sq->dsq, sq->n, gm, gx, &gbraw); /* p7_gmx_Dump(stdout, gx); */ fsc = (fraw-nullsc) / eslCONST_LOG2; gfsc = (gfraw-nullsc) / eslCONST_LOG2; P = esl_exp_surv(fsc, om->evparam[p7_FTAU], om->evparam[p7_FLAMBDA]); gP = esl_exp_surv(gfsc, gm->evparam[p7_FTAU], gm->evparam[p7_FLAMBDA]); if (esl_opt_GetBoolean(go, "-1")) { printf("%-30s\t%-20s\t%9.2g\t%6.1f\t%9.2g\t%6.1f\n", sq->name, hmm->name, P, fsc, gP, gfsc); } else if (esl_opt_GetBoolean(go, "-P")) { /* output suitable for direct use in profmark benchmark postprocessors: */ printf("%g\t%.2f\t%s\t%s\n", P, fsc, sq->name, hmm->name); } else { printf("target sequence: %s\n", sq->name); printf("fwd filter raw score: %.2f nats\n", fraw); printf("bck filter raw score: %.2f nats\n", braw); printf("null score: %.2f nats\n", nullsc); printf("per-seq score: %.2f bits\n", fsc); printf("P-value: %g\n", P); printf("GForward raw score: %.2f nats\n", gfraw); printf("GBackward raw score: %.2f nats\n", gbraw); printf("GForward seq score: %.2f bits\n", gfsc); printf("GForward P-value: %g\n", gP); } esl_sq_Reuse(sq); } /* cleanup */ esl_sq_Destroy(sq); esl_sqfile_Close(sqfp); p7_omx_Destroy(bck); p7_omx_Destroy(fwd); p7_gmx_Destroy(gx); p7_oprofile_Destroy(om); p7_profile_Destroy(gm); p7_bg_Destroy(bg); p7_hmm_Destroy(hmm); p7_hmmfile_Close(hfp); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return 0; }
int main(int argc, char **argv) { ESL_GETOPTS *go = p7_CreateDefaultApp(options, 2, argc, argv, banner, usage); ESL_RANDOMNESS *r = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s")); char *hmmfile = esl_opt_GetArg(go, 1); char *seqfile = esl_opt_GetArg(go, 2); ESL_ALPHABET *abc = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; P7_PROFILE *gm = NULL; P7_GMX *fwd = NULL; P7_TRACE *tr = NULL; ESL_SQ *sq = NULL; ESL_SQFILE *sqfp = NULL; int format = eslSQFILE_UNKNOWN; char errbuf[eslERRBUFSIZE]; int N = esl_opt_GetInteger(go, "-N"); int i; float vsc, fsc, tsc; int status; /* Read in one HMM */ if (p7_hmmfile_OpenE(hmmfile, NULL, &hfp, NULL) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile); if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM"); p7_hmmfile_Close(hfp); /* Read in one sequence */ sq = esl_sq_CreateDigital(abc); status = esl_sqfile_Open(seqfile, format, NULL, &sqfp); if (status == eslENOTFOUND) p7_Fail("No such file."); else if (status == eslEFORMAT) p7_Fail("Format unrecognized."); else if (status == eslEINVAL) p7_Fail("Can't autodetect stdin or .gz."); else if (status != eslOK) p7_Fail("Open failed, code %d.", status); if (esl_sqio_Read(sqfp, sq) != eslOK) p7_Fail("Failed to read sequence"); esl_sqfile_Close(sqfp); /* Configure a profile from the HMM */ bg = p7_bg_Create(abc); p7_bg_SetLength(bg, sq->n); gm = p7_profile_Create(hmm->M, abc); p7_ProfileConfig(hmm, bg, gm, sq->n, p7_LOCAL); fwd = p7_gmx_Create(gm->M, sq->n); tr = p7_trace_Create(); p7_GViterbi(sq->dsq, sq->n, gm, fwd, &vsc); p7_GForward(sq->dsq, sq->n, gm, fwd, &fsc); if (esl_opt_GetBoolean(go, "-m") == TRUE) p7_gmx_Dump(stdout, fwd, p7_DEFAULT); for (i = 0; i < N; i++) { p7_GStochasticTrace(r, sq->dsq, sq->n, gm, fwd, tr); p7_trace_Score(tr, sq->dsq, gm, &tsc); if (esl_opt_GetBoolean(go, "-t") == TRUE) p7_trace_Dump(stdout, tr, gm, sq->dsq); if (p7_trace_Validate(tr, abc, sq->dsq, errbuf) != eslOK) esl_fatal("trace failed validation: %s\n", errbuf); printf("Sampled trace: %.4f nats\n", tsc); p7_trace_Reuse(tr); } printf("Forward score: %.4f nats\n", fsc); printf("Viterbi score: %.4f nats\n", vsc); /* Cleanup */ p7_gmx_Destroy(fwd); p7_trace_Destroy(tr); p7_profile_Destroy(gm); p7_bg_Destroy(bg); p7_hmm_Destroy(hmm); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return 0; }
int main(int argc, char **argv) { ESL_GETOPTS *go = p7_CreateDefaultApp(options, 1, argc, argv, banner, usage); char *hmmfile = esl_opt_GetArg(go, 1); ESL_STOPWATCH *w = esl_stopwatch_Create(); ESL_RANDOMNESS *r = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s")); ESL_ALPHABET *abc = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; P7_PROFILE *gm = NULL; P7_GMX *fwd = NULL; P7_TRACE *tr = NULL; int L = esl_opt_GetInteger(go, "-L"); int N = esl_opt_GetInteger(go, "-N"); ESL_DSQ *dsq = malloc(sizeof(ESL_DSQ) * (L+2)); int i; float sc, fsc, vsc; float bestsc = -eslINFINITY; if (p7_hmmfile_OpenE(hmmfile, NULL, &hfp, NULL) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile); if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM"); bg = p7_bg_Create(abc); p7_bg_SetLength(bg, L); gm = p7_profile_Create(hmm->M, abc); p7_ProfileConfig(hmm, bg, gm, L, p7_UNILOCAL); fwd = p7_gmx_Create(gm->M, L); tr = p7_trace_Create(); esl_rsq_xfIID(r, bg->f, abc->K, L, dsq); p7_GViterbi(dsq, L, gm, fwd, &vsc); p7_GForward(dsq, L, gm, fwd, &fsc); esl_stopwatch_Start(w); for (i = 0; i < N; i++) { p7_GStochasticTrace(r, dsq, L, gm, fwd, tr); p7_trace_Score(tr, dsq, gm, &sc); /* this doesn't add significantly to benchmark time */ bestsc = ESL_MAX(bestsc, sc); p7_trace_Reuse(tr); } esl_stopwatch_Stop(w); esl_stopwatch_Display(stdout, w, "# CPU time: "); printf("forward sc = %.4f nats\n", fsc); printf("viterbi sc = %.4f nats\n", vsc); printf("max trace sc = %.4f nats\n", bestsc); free(dsq); p7_trace_Destroy(tr); p7_gmx_Destroy(fwd); p7_profile_Destroy(gm); p7_bg_Destroy(bg); p7_hmm_Destroy(hmm); p7_hmmfile_Close(hfp); esl_alphabet_Destroy(abc); esl_stopwatch_Destroy(w); esl_randomness_Destroy(r); esl_getopts_Destroy(go); return 0; }
int main(int argc, char **argv) { ESL_GETOPTS *go = p7_CreateDefaultApp(options, 2, argc, argv, banner, usage); char *hmmfile = esl_opt_GetArg(go, 1); char *seqfile = esl_opt_GetArg(go, 2); ESL_ALPHABET *abc = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; P7_PROFILE *gm = NULL; P7_GMX *fwd = NULL; P7_GMX *bck = NULL; ESL_SQ *sq = NULL; ESL_SQFILE *sqfp = NULL; int format = eslSQFILE_UNKNOWN; float fsc, bsc; float nullsc; int status; /* Initialize log-sum calculator */ p7_FLogsumInit(); /* Read in one HMM */ if (p7_hmmfile_OpenE(hmmfile, NULL, &hfp, NULL) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile); if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM"); p7_hmmfile_Close(hfp); /* Read in one sequence */ sq = esl_sq_CreateDigital(abc); status = esl_sqfile_Open(seqfile, format, NULL, &sqfp); if (status == eslENOTFOUND) p7_Fail("No such file."); else if (status == eslEFORMAT) p7_Fail("Format unrecognized."); else if (status == eslEINVAL) p7_Fail("Can't autodetect stdin or .gz."); else if (status != eslOK) p7_Fail("Open failed, code %d.", status); /* Configure a profile from the HMM */ bg = p7_bg_Create(abc); gm = p7_profile_Create(hmm->M, abc); /* Now reconfig the models however we were asked to */ if (esl_opt_GetBoolean(go, "--fs")) p7_ProfileConfig(hmm, bg, gm, sq->n, p7_LOCAL); else if (esl_opt_GetBoolean(go, "--sw")) p7_ProfileConfig(hmm, bg, gm, sq->n, p7_UNILOCAL); else if (esl_opt_GetBoolean(go, "--ls")) p7_ProfileConfig(hmm, bg, gm, sq->n, p7_GLOCAL); else if (esl_opt_GetBoolean(go, "--s")) p7_ProfileConfig(hmm, bg, gm, sq->n, p7_UNIGLOCAL); /* Allocate matrices */ fwd = p7_gmx_Create(gm->M, sq->n); bck = p7_gmx_Create(gm->M, sq->n); printf("%-30s %-10s %-10s %-10s %-10s\n", "# seq name", "fwd (raw)", "bck (raw) ", "fwd (bits)", "bck (bits)"); printf("%-30s %10s %10s %10s %10s\n", "#--------------", "----------", "----------", "----------", "----------"); while ( (status = esl_sqio_Read(sqfp, sq)) != eslEOF) { if (status == eslEFORMAT) p7_Fail("Parse failed (sequence file %s)\n%s\n", sqfp->filename, sqfp->get_error(sqfp)); else if (status != eslOK) p7_Fail("Unexpected error %d reading sequence file %s", status, sqfp->filename); /* Resize the DP matrices if necessary */ p7_gmx_GrowTo(fwd, gm->M, sq->n); p7_gmx_GrowTo(bck, gm->M, sq->n); /* Set the profile and null model's target length models */ p7_bg_SetLength(bg, sq->n); p7_ReconfigLength(gm, sq->n); /* Run Forward, Backward */ p7_GForward (sq->dsq, sq->n, gm, fwd, &fsc); p7_GBackward(sq->dsq, sq->n, gm, bck, &bsc); p7_gmx_Dump(stdout, fwd, p7_DEFAULT); /* Those scores are partial log-odds likelihoods in nats. * Subtract off the rest of the null model, convert to bits. */ p7_bg_NullOne(bg, sq->dsq, sq->n, &nullsc); printf("%-30s %10.4f %10.4f %10.4f %10.4f\n", sq->name, fsc, bsc, (fsc - nullsc) / eslCONST_LOG2, (bsc - nullsc) / eslCONST_LOG2); p7_gmx_Reuse(fwd); p7_gmx_Reuse(bck); esl_sq_Reuse(sq); } /* Cleanup */ esl_sqfile_Close(sqfp); esl_sq_Destroy(sq); p7_gmx_Destroy(fwd); p7_gmx_Destroy(bck); p7_profile_Destroy(gm); p7_bg_Destroy(bg); p7_hmm_Destroy(hmm); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return 0; }
/* glocal_rescore_isolated_domain() * EPN, Tue Oct 5 10:16:12 2010 * * Based on p7_domaindef.c's rescore_isolated_domain(). Modified * so that generic matrices (which can be used for glocally configured * models) can be used. This function finds a single glocal domain, not a * single local one. * * Also modified to optionally remove the Backward and OA alignment. * The decision to do these is determined by three input parameters: * <null2_is_done>: TRUE if we've already computed the null2 scores for * this region (see Sean's notes below). * <do_null2>: TRUE if we will apply a null2 penalty eventually * to this domain * <do_aln>: TRUE if we need the OA alignment * * Notes (verbatim) from p7_domaindef.c::rescore_isolated_domain(): *~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * SRE, Fri Feb 8 09:18:33 2008 [Janelia] * * We have isolated a single domain's envelope from <i>..<j> in * sequence <sq>, and now we want to score it in isolation and obtain * an alignment display for it. * * (Later, we can add up all the individual domain scores from this * seq into a new per-seq score, to compare to the original per-seq * score). * * The caller provides model <om> configured in unilocal mode; by * using unilocal (as opposed to multilocal), we're going to force the * identification of a single domain in this envelope now. * * The alignment is an optimal accuracy alignment (sensu IH Holmes), * also obtained in unilocal mode. * * The caller provides DP matrices <ox1> and <ox2> with sufficient * space to hold Forward and Backward calculations for this domain * against the model. (The caller will typically already have matrices * sufficient for the complete sequence lying around, and can just use * those.) The caller also provides a <P7_DOMAINDEF> object which is * (efficiently, we trust) managing any necessary temporary working * space and heuristic thresholds. * * Returns <eslOK> if a domain was successfully identified, scored, * and aligned in the envelope; if so, the per-domain information is * registered in <ddef>, in <ddef->dcl>. * * And here's what's happened to our working memory: * * <ddef>: <ddef->tr> has been used, and possibly reallocated, for * the OA trace of the domain. Before exit, we called * <Reuse()> on it. * * <ox1> : happens to be holding OA score matrix for the domain * upon return, but that's not part of the spec; officially * its contents are "undefined". * * <ox2> : happens to be holding a posterior probability matrix * for the domain upon return, but we're not making that * part of the spec, so caller shouldn't rely on this; * spec just makes its contents "undefined". */ static int glocal_rescore_isolated_domain(P7_DOMAINDEF *ddef, const P7_PROFILE *gm, const ESL_SQ *sq, P7_GMX *gx1, P7_GMX *gx2, int i, int j, int null2_is_done, int do_null2, int do_aln) { P7_DOMAIN *dom = NULL; int Ld = j-i+1; float domcorrection = 0.0; float envsc, oasc; int z; int pos; float null2[p7_MAXCODE]; int status; p7_GForward (sq->dsq + i-1, Ld, gm, gx1, &envsc); oasc = 0.; if(do_null2 || do_aln) { p7_GBackward(sq->dsq + i-1, Ld, gm, gx2, NULL); status = p7_GDecoding(gm, gx1, gx2, gx2); /* <ox2> is now overwritten with post probabilities */ if (status == eslERANGE) return eslFAIL; /* rare: numeric overflow; domain is assumed to be repetitive garbage [J3/119-212] */ /* Is null2 set already for this i..j? (It is, if we're in a domain that * was defined by stochastic traceback clustering in a multidomain region; * it isn't yet, if we're in a simple one-domain region). If it isn't, * do it now, by the expectation (posterior decoding) method. */ if ((! null2_is_done) && do_null2) { p7_GNull2_ByExpectation(gm, gx2, null2); for (pos = i; pos <= j; pos++) ddef->n2sc[pos] = logf(null2[sq->dsq[pos]]); } if(do_null2) { for (pos = i; pos <= j; pos++) domcorrection += ddef->n2sc[pos]; /* domcorrection is in units of NATS */ } if(do_aln) { /* Find an optimal accuracy alignment */ p7_GOptimalAccuracy(gm, gx2, gx1, &oasc); /* <ox1> is now overwritten with OA scores */ p7_GOATrace (gm, gx2, gx1, ddef->tr); /* <tr>'s seq coords are offset by i-1, rel to orig dsq */ /* hack the trace's sq coords to be correct w.r.t. original dsq */ for (z = 0; z < ddef->tr->N; z++) if (ddef->tr->i[z] > 0) ddef->tr->i[z] += i-1; } /* get ptr to next empty domain structure in domaindef's results */ } if (ddef->ndom == ddef->nalloc) { void *p; ESL_RALLOC(ddef->dcl, p, sizeof(P7_DOMAIN) * (ddef->nalloc*2)); ddef->nalloc *= 2; } dom = &(ddef->dcl[ddef->ndom]); /* store the results in it */ dom->ienv = i; dom->jenv = j; dom->envsc = envsc; /* in units of NATS */ dom->domcorrection = domcorrection; /* in units of NATS, will be 0. if do_null2 == FALSE */ dom->oasc = oasc; /* in units of expected # of correctly aligned residues, will be 0. if do_aln == FALSE */ dom->dombias = 0.0; /* gets set later, using bg->omega and dombias */ dom->bitscore = 0.0; /* gets set later by caller, using envsc, null score, and dombias */ dom->lnP = 1.0; /* gets set later by caller, using bitscore */ dom->is_reported = FALSE; /* gets set later by caller */ dom->is_included = FALSE; /* gets set later by caller */ dom->ad = NULL; dom->iali = i; dom->jali = j; ddef->ndom++; if(do_aln) { p7_trace_Reuse(ddef->tr); } return eslOK; ERROR: if(do_aln) { p7_trace_Reuse(ddef->tr); } return status; }
/* Function: p7_domaindef_GlocalByPosteriorHeuristics() * Synopsis: Define glocal domains in a sequence using posterior probs. * Incept: EPN, Tue Oct 5 10:02:34 2010 * SRE, Sat Feb 23 08:17:44 2008 [Janelia] (p7_domaindef_ByPosteriorHeuristics()) * * Purpose: Given a sequence <sq> and model <gm> for which we have * already calculated a Forward and Backward parsing * matrices <gxf> and <gxb>; use posterior probability * heuristics to determine an annotated domain structure; * and for each domain found, score it (with null2 * calculations) and obtain an optimal accuracy alignment, * using <fwd> and <bck> matrices as workspace for the * necessary full-matrix DP calculations. Caller provides a * new or reused <ddef> object to hold these results. * * As a special case, if the profile is in unihit mode * upon entering, we don't ever modify its configuration. * This is especially important if this function is * being used within a search/scan pipeline with a * specially configured p7 profile in which N->N and/or * C->C transitions have been set to IMPOSSIBLE. (If * we were to call ReconfigLength() on such a profile * we would make those transitions possible.) * * One case in which profile reconfiguration is necessary * is when multiple domains are suspected. However, we * guard against this if the profile enters in unihit mode * by no allowing multiple domains (in fact, it should * never happen because J states are unreachable in unihit * profiles). If multiple domains are suspected in this case, * we return eslEINCONCEIVABLE. * * Upon return, <ddef> contains the definitions of all the * domains: their bounds, their null-corrected Forward * scores, and their optimal posterior accuracy alignments. * * <do_null2> is TRUE if we'll eventually apply a null2 * penalty FALSE if not. If FALSE, we can save time by * skipping Backward calls at some stages. * * Returns: <eslOK> on success. * * <eslERANGE> on numeric overflow in posterior * decoding. This should not be possible for multihit * models. * * <eslEINCONCEIVABLE> if profile enters as unihit but * multiple domains are suspected. */ int p7_domaindef_GlocalByPosteriorHeuristics(const ESL_SQ *sq, P7_PROFILE *gm, P7_GMX *gxf, P7_GMX *gxb, P7_GMX *fwd, P7_GMX *bck, P7_DOMAINDEF *ddef, int do_null2) { int i, j; int triggered; int d; int i2,j2; int last_j2; int nc; int saveL = gm->L; /* Save the length config of <om>; will restore upon return */ int save_mode = gm->mode; /* Likewise for the mode. */ int status; int save_mode_is_unihit; save_mode_is_unihit = (p7_IsMulti(save_mode)) ? FALSE : TRUE; /* if save_mode_is_unihit is TRUE, we never modify profile's configuration (length nor mode) */ if ((status = p7_domaindef_GrowTo(ddef, sq->n)) != eslOK) return status; /* ddef's btot,etot,mocc now ready for seq of length n */ /*printf("GDD P7 mode: %d\n", gm->mode);*/ if ((status = p7_GDomainDecoding(gm, gxf, gxb, ddef)) != eslOK) return status; /* ddef->{btot,etot,mocc} now made. */ /*printf("In p7_domaindef_GlocalByPosteriorHeuristics(): mode: %d rt1: %g rt2: %g rt3: %g nsamples: %d reseed: %d\n", save_mode, ddef->rt1, ddef->rt2, ddef->rt3, ddef->nsamples, ddef->do_reseeding);*/ esl_vec_FSet(ddef->n2sc, sq->n+1, 0.0); /* ddef->n2sc null2 scores are initialized */ ddef->nexpected = ddef->btot[sq->n]; /* posterior expectation for # of domains (same as etot[sq->n]) */ if(! save_mode_is_unihit) p7_ReconfigUnihit(gm, saveL); /* process each domain in unihit mode, regardless of gm->mode */ i = -1; triggered = FALSE; for (j = 1; j <= sq->n; j++) { /*printf("GDD j: %5d m: %.5f b: %8.3f e: %8.3f bhere: %8.3f ehere: %8.3f\n", j, ddef->mocc[j], ddef->btot[j], ddef->etot[j], ddef->btot[j] - ddef->btot[j-1], ddef->etot[j] - ddef->etot[j-1]); */ if (! triggered) { /* xref J2/101 for what the logic below is: */ if (ddef->mocc[j] - (ddef->btot[j] - ddef->btot[j-1]) < ddef->rt2) i = j; else if (i == -1) i = j; if (ddef->mocc[j] >= ddef->rt1) triggered = TRUE; } else if (ddef->mocc[j] - (ddef->etot[j] - ddef->etot[j-1]) < ddef->rt2) { /* We have a region i..j to evaluate. */ p7_gmx_GrowTo(fwd, gm->M, j-i+1); p7_gmx_GrowTo(bck, gm->M, j-i+1); ddef->nregions++; if (is_multidomain_region(ddef, i, j)) { if(save_mode_is_unihit) return eslEINCONCEIVABLE; /* This region appears to contain more than one domain, so we have to * resolve it by cluster analysis of posterior trace samples, to define * one or more domain envelopes. */ ddef->nclustered++; /* Resolve the region into domains by stochastic trace * clustering; assign position-specific null2 model by * stochastic trace clustering; there is redundancy * here; we will consolidate later if null2 strategy * works */ p7_ReconfigMultihit(gm, saveL); p7_GForward(sq->dsq+i-1, j-i+1, gm, fwd, NULL); glocal_region_trace_ensemble(ddef, gm, sq->dsq, i, j, fwd, bck, do_null2, &nc); p7_ReconfigUnihit(gm, saveL); /* ddef->n2sc is now set on i..j by the traceback-dependent method */ last_j2 = 0; for (d = 0; d < nc; d++) { p7_spensemble_GetClusterCoords(ddef->sp, d, &i2, &j2, NULL, NULL, NULL); if (i2 <= last_j2) ddef->noverlaps++; /* Note that k..m coords on model are available, but * we're currently ignoring them. This leads to a * rare clustering bug that we eventually need to fix * properly [xref J3/32]: two different regions in one * profile HMM might have hit same seq domain, and * when we now go to calculate an OA trace, nothing * constrains us to find the two different alignments * to the HMM; in fact, because OA is optimal, we'll * find one and the *same* alignment, leading to an * apparent duplicate alignment in the output. * * Registered as #h74, Dec 2009, after EBI finds and * reports it. #h74 is worked around in p7_tophits.c * by hiding all but one envelope with an identical * alignment, in the rare event that this * happens. [xref J5/130]. */ ddef->nenvelopes++; if (glocal_rescore_isolated_domain(ddef, gm, sq, fwd, bck, i2, j2, TRUE, do_null2, FALSE) == eslOK) last_j2 = j2; } p7_spensemble_Reuse(ddef->sp); p7_trace_Reuse(ddef->tr); } else { /* The region looks simple, single domain; convert the region to an envelope. */ ddef->nenvelopes++; glocal_rescore_isolated_domain(ddef, gm, sq, fwd, bck, i, j, FALSE, do_null2, FALSE); } i = -1; triggered = FALSE; } } /* If profile was unihit upon entrance, we didn't modify its configuration (length nor mode), * else restore it to its original multihit mode, and to its original length model */ if (! save_mode_is_unihit) { p7_ReconfigMultihit(gm, saveL); } return eslOK; }
int main(int argc, char **argv) { ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 2, argc, argv, banner, usage); char *hmmfile = esl_opt_GetArg(go, 1); char *seqfile = esl_opt_GetArg(go, 2); ESL_ALPHABET *abc = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; P7_PROFILE *gm = NULL; P7_GMX *fwd = NULL; P7_GMX *bck = NULL; ESL_SQ *sq = NULL; ESL_SQFILE *sqfp = NULL; int format = eslSQFILE_UNKNOWN; float fsc, bsc; int status; /* Read in one HMM */ if (p7_hmmfile_Open(hmmfile, NULL, &hfp) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile); if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM"); p7_hmmfile_Close(hfp); /* Read in one sequence */ sq = esl_sq_CreateDigital(abc); status = esl_sqfile_Open(seqfile, format, NULL, &sqfp); if (status == eslENOTFOUND) p7_Fail("No such file."); else if (status == eslEFORMAT) p7_Fail("Format unrecognized."); else if (status == eslEINVAL) p7_Fail("Can't autodetect stdin or .gz."); else if (status != eslOK) p7_Fail("Open failed, code %d.", status); if (esl_sqio_Read(sqfp, sq) != eslOK) p7_Fail("Failed to read sequence"); esl_sqfile_Close(sqfp); /* Configure a profile from the HMM */ bg = p7_bg_Create(abc); p7_bg_SetLength(bg, sq->n); gm = p7_profile_Create(hmm->M, abc); p7_ProfileConfig(hmm, bg, gm, sq->n, p7_UNILOCAL); /* Allocate matrices */ fwd = p7_gmx_Create(gm->M, sq->n); bck = p7_gmx_Create(gm->M, sq->n); /* Run Forward, Backward */ p7_GForward (sq->dsq, sq->n, gm, fwd, &fsc); p7_GBackward(sq->dsq, sq->n, gm, bck, &bsc); printf("fwd = %.4f nats\n", fsc); printf("bck = %.4f nats\n", bsc); /* Cleanup */ esl_sq_Destroy(sq); p7_gmx_Destroy(fwd); p7_gmx_Destroy(bck); p7_profile_Destroy(gm); p7_bg_Destroy(bg); p7_hmm_Destroy(hmm); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return 0; }
/* The "enumeration" test samples a random enumerable HMM (transitions to insert are 0, * so the generated seq space only includes seqs of L<=M). * * The test scores all seqs of length <=M by both Viterbi and Forward, verifies that * the two scores are identical, and verifies that the sum of all the probabilities is * 1.0. It also verifies that the score of a sequence of length M+1 is indeed -infinity. * * Because this function is going to work in unscaled probabilities, adding them up, * all P(seq) terms must be >> DBL_EPSILON. That means M must be small; on the order * of <= 10. */ static void utest_enumeration(ESL_GETOPTS *go, ESL_RANDOMNESS *r, ESL_ALPHABET *abc, int M) { char errbuf[eslERRBUFSIZE]; P7_HMM *hmm = NULL; P7_PROFILE *gm = NULL; P7_BG *bg = NULL; ESL_DSQ *dsq = NULL; P7_GMX *gx = NULL; float vsc, fsc; float bg_ll; /* log P(seq | bg) */ double vp, fp; /* P(seq,\pi | model) and P(seq | model) */ int L; int i; double total_p; char *seq; /* Sample an enumerable HMM & profile of length M. */ if (p7_hmm_SampleEnumerable(r, M, abc, &hmm) != eslOK) esl_fatal("failed to sample an enumerable HMM"); if ((bg = p7_bg_Create(abc)) == NULL) esl_fatal("failed to create null model"); if ((gm = p7_profile_Create(hmm->M, abc)) == NULL) esl_fatal("failed to create profile"); if (p7_ProfileConfig(hmm, bg, gm, 0, p7_UNILOCAL) != eslOK) esl_fatal("failed to config profile"); if (p7_hmm_Validate (hmm, errbuf, 0.0001) != eslOK) esl_fatal("whoops, HMM is bad!: %s", errbuf); if (p7_profile_Validate(gm, errbuf, 0.0001) != eslOK) esl_fatal("whoops, profile is bad!: %s", errbuf); if ( (dsq = malloc(sizeof(ESL_DSQ) * (M+3))) == NULL) esl_fatal("allocation failed"); if ( (seq = malloc(sizeof(char) * (M+2))) == NULL) esl_fatal("allocation failed"); if ((gx = p7_gmx_Create(hmm->M, M+3)) == NULL) esl_fatal("matrix creation failed"); /* Enumerate all sequences of length L <= M */ total_p = 0; for (L = 0; L <= M; L++) { /* Initialize dsq of length L at 0000... */ dsq[0] = dsq[L+1] = eslDSQ_SENTINEL; for (i = 1; i <= L; i++) dsq[i] = 0; while (1) /* enumeration of seqs of length L*/ { if (p7_GViterbi(dsq, L, gm, gx, &vsc) != eslOK) esl_fatal("viterbi failed"); if (p7_GForward(dsq, L, gm, gx, &fsc) != eslOK) esl_fatal("forward failed"); /* calculate bg log likelihood component of the scores */ for (bg_ll = 0., i = 1; i <= L; i++) bg_ll += log(bg->f[dsq[i]]); /* convert to probabilities, adding the bg LL back to the LLR */ vp = exp(vsc + bg_ll); fp = exp(fsc + bg_ll); if (esl_opt_GetBoolean(go, "--vv")) { esl_abc_Textize(abc, dsq, L, seq); printf("probability of sequence: %10s %16g (lod v=%8.4f f=%8.4f)\n", seq, fp, vsc, fsc); } total_p += fp; /* Increment dsq like a reversed odometer */ for (i = 1; i <= L; i++) if (dsq[i] < abc->K-1) { dsq[i]++; break; } else { dsq[i] = 0; } if (i > L) break; /* we're done enumerating sequences */ } } /* That sum is subject to significant numerical error because of * discretization error in FLogsum(); don't expect it to be too close. */ if (total_p < 0.999 || total_p > 1.001) esl_fatal("Enumeration unit test failed: total Forward p isn't near 1.0 (%g)", total_p); if (esl_opt_GetBoolean(go, "-v")) { printf("enumeration test: total p is %g\n", total_p); } p7_gmx_Destroy(gx); p7_bg_Destroy(bg); p7_profile_Destroy(gm); p7_hmm_Destroy(hmm); free(dsq); free(seq); }
int main(int argc, char **argv) { ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage); char *hmmfile = esl_opt_GetArg(go, 1); ESL_STOPWATCH *w = esl_stopwatch_Create(); ESL_RANDOMNESS *r = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s")); ESL_ALPHABET *abc = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; P7_PROFILE *gm = NULL; P7_GMX *fwd = NULL; P7_GMX *bck = NULL; int L = esl_opt_GetInteger(go, "-L"); int N = esl_opt_GetInteger(go, "-N"); ESL_DSQ *dsq = malloc(sizeof(ESL_DSQ) * (L+2)); int i; float sc; double base_time, bench_time, Mcs; if (p7_hmmfile_Open(hmmfile, NULL, &hfp) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile); if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM"); bg = p7_bg_Create(abc); p7_bg_SetLength(bg, L); gm = p7_profile_Create(hmm->M, abc); p7_ProfileConfig(hmm, bg, gm, L, p7_UNILOCAL); fwd = p7_gmx_Create(gm->M, L); bck = p7_gmx_Create(gm->M, L); /* Baseline time. */ esl_stopwatch_Start(w); for (i = 0; i < N; i++) esl_rsq_xfIID(r, bg->f, abc->K, L, dsq); esl_stopwatch_Stop(w); base_time = w->user; /* Benchmark time. */ esl_stopwatch_Start(w); for (i = 0; i < N; i++) { esl_rsq_xfIID(r, bg->f, abc->K, L, dsq); if (! esl_opt_GetBoolean(go, "-B")) p7_GForward (dsq, L, gm, fwd, &sc); if (! esl_opt_GetBoolean(go, "-F")) p7_GBackward(dsq, L, gm, bck, NULL); } esl_stopwatch_Stop(w); bench_time = w->user - base_time; Mcs = (double) N * (double) L * (double) gm->M * 1e-6 / (double) bench_time; esl_stopwatch_Display(stdout, w, "# CPU time: "); printf("# M = %d\n", gm->M); printf("# %.1f Mc/s\n", Mcs); free(dsq); p7_gmx_Destroy(bck); p7_gmx_Destroy(fwd); p7_profile_Destroy(gm); p7_bg_Destroy(bg); p7_hmm_Destroy(hmm); p7_hmmfile_Close(hfp); esl_alphabet_Destroy(abc); esl_stopwatch_Destroy(w); esl_randomness_Destroy(r); esl_getopts_Destroy(go); return 0; }