/* Example driver for generic Viterbi alignment and traceback.
 *
 * Takes two command line arguments, <hmmfile> and <seqfile>. Reads one
 * HMM and one sequence, configures a p7_LOCAL profile for the sequence's
 * length, runs p7_GViterbi() followed by p7_GTrace(), then dumps and
 * validates the trace and prints a one-line summary of the domains found
 * in it.
 *
 * Failures to open or read either input are reported with p7_Fail(); a
 * trace that does not validate is reported with p7_Die().
 */
int
main(int argc, char **argv)
{
  ESL_GETOPTS  *go      = esl_getopts_CreateDefaultApp(options, 2, argc, argv, banner, usage);
  char         *hmmfile = esl_opt_GetArg(go, 1);
  char         *seqfile = esl_opt_GetArg(go, 2);
  ESL_ALPHABET *abc     = NULL;
  P7_HMMFILE   *hfp     = NULL;
  P7_HMM       *hmm     = NULL;
  P7_BG        *bg      = NULL;
  P7_PROFILE   *gm      = NULL;
  P7_GMX       *vmx     = NULL;	/* DP matrix filled by Viterbi, consumed by the traceback */
  ESL_SQ       *sq      = NULL;
  ESL_SQFILE   *sqfp    = NULL;
  P7_TRACE     *tr      = NULL;
  int           format  = eslSQFILE_UNKNOWN;
  char          errbuf[eslERRBUFSIZE];
  float         vsc;
  int           dom;
  int           status;

  /* Input 1: a single HMM. The alphabet is set by the read. */
  if (p7_hmmfile_Open(hmmfile, NULL, &hfp) != eslOK)
    p7_Fail("Failed to open HMM file %s", hmmfile);
  if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK)
    p7_Fail("Failed to read HMM");
  p7_hmmfile_Close(hfp);

  /* Input 2: a single digital sequence in that alphabet. */
  sq     = esl_sq_CreateDigital(abc);
  status = esl_sqfile_Open(seqfile, format, NULL, &sqfp);
  switch (status)
    {
    case eslOK:                                                       break;
    case eslENOTFOUND: p7_Fail("No such file.");                      break;
    case eslEFORMAT:   p7_Fail("Format unrecognized.");               break;
    case eslEINVAL:    p7_Fail("Can't autodetect stdin or .gz.");     break;
    default:           p7_Fail("Open failed, code %d.", status);      break;
    }
  if (esl_sqio_Read(sqfp, sq) != eslOK)
    p7_Fail("Failed to read sequence");
  esl_sqfile_Close(sqfp);

  /* Build the null model and a local-mode profile, both set for length sq->n. */
  bg = p7_bg_Create(abc);
  p7_bg_SetLength(bg, sq->n);
  gm = p7_profile_Create(hmm->M, abc);
  p7_ProfileConfig(hmm, bg, gm, sq->n, p7_LOCAL);

  /* Working space: an M x L matrix and an empty trace. */
  vmx = p7_gmx_Create(gm->M, sq->n);
  tr  = p7_trace_Create();

  /* Fill the matrix with Viterbi, then trace back through it. */
  p7_GViterbi(sq->dsq, sq->n, gm, vmx, &vsc);
  p7_GTrace  (sq->dsq, sq->n, gm, vmx, tr);

  /* Show the trace, and make sure it is internally consistent. */
  p7_trace_Dump(stdout, tr, gm, sq->dsq);
  if (p7_trace_Validate(tr, abc, sq->dsq, errbuf) != eslOK)
    p7_Die("trace fails validation:\n%s\n", errbuf);

  /* Index the trace so per-domain coordinates are available, then list them:
   * sequence start/end and model start/end for each domain.
   */
  p7_trace_Index(tr);
  printf("# Viterbi: %d domains : ", tr->ndom);
  dom = 0;
  while (dom < tr->ndom)
    {
      printf("%6d %6d %6d %6d ",
	     tr->sqfrom[dom], tr->sqto[dom], tr->hmmfrom[dom], tr->hmmto[dom]);
      dom++;
    }
  printf("\n");

  /* Release everything we created. */
  p7_trace_Destroy(tr);
  p7_gmx_Destroy(vmx);
  p7_profile_Destroy(gm);
  p7_bg_Destroy(bg);
  p7_hmm_Destroy(hmm);
  esl_sq_Destroy(sq);
  esl_alphabet_Destroy(abc);
  esl_getopts_Destroy(go);
  return 0;
}
/* utest_generation()
 * The "generation" unit test.
 *
 * Samples a random profile of length <M>, then <N> times emits a
 * sequence from that profile and pushes the sequence through the
 * reference analysis steps called below: Viterbi, Forward, Backward,
 * Decoding, anchor determination, ASC Backward, ASC Decoding, and the
 * envelope calculation.
 *
 * Because the "true" envelopes are unknown, this is not a stringent
 * test. It mostly checks that nothing obviously bad happens, such as a
 * crash or obviously incorrect data. For every sampled sequence:
 *
 *   1. Sequence coordinates of each envelope are coherent:
 *        1 <= oa <= ia <= i0 <= ib <= ob <= L
 *
 *   2. Envelopes do not overlap (assuming the default threshold of 0.5
 *      when defining them):
 *        ia(d) > ib(d-1)
 *      Outer envelopes, in contrast, are allowed to overlap.
 *
 *   3. envsc(d) <= asc_sc <= fwdsc  (within tolerance).
 *
 *   4. If both the generated trace and the inferred envelopes have a
 *      single domain (D=1), and the generated domain's coordinates lie
 *      inside the outer envelope, then envsc(d) >= generated trace score.
 *
 * Any failed call or failed check ends the test via esl_fatal().
 */
static void
utest_generation(ESL_RANDOMNESS *rng, int M, const ESL_ALPHABET *abc, int N)
{
  char            msg[] = "reference_envelopes:: generation unit test failed";
  ESL_SQ         *sq    = esl_sq_CreateDigital(abc);
  P7_BG          *bg    = p7_bg_Create(abc);
  P7_HMM         *hmm   = NULL;
  P7_PROFILE     *gm    = p7_profile_Create(M, abc);
  P7_TRACE       *gtr   = p7_trace_Create();      /* trace that generated the sequence     */
  P7_TRACE       *vtr   = p7_trace_Create();      /* Viterbi trace                         */
  P7_REFMX       *rxf   = p7_refmx_Create(M, 20); /* Fwd, Vit; later reused as ASC Decode UP   */
  P7_REFMX       *rxd   = p7_refmx_Create(M, 20); /* Bck, Decode; later reused as ASC Decode DOWN */
  P7_REFMX       *afu   = p7_refmx_Create(M, 20); /* ASC Forward UP                        */
  P7_REFMX       *afd   = p7_refmx_Create(M, 20); /* ASC Forward DOWN                      */
  P7_REFMX       *apu   = rxf;                    /* second name for <rxf>, for readability */
  P7_REFMX       *apd   = rxd;                    /* second name for <rxd>, likewise        */
  float          *wrk   = NULL;
  P7_ANCHORS     *anch  = p7_anchors_Create();
  P7_ANCHORHASH  *ah    = p7_anchorhash_Create();
  P7_ENVELOPES   *env   = p7_envelopes_Create();
  float           tol   = 0.001;
  float           gsc, fsc, asc;
  int             iter;
  int             dom;
  int             coords_ok;
  int             scores_ok;

  if (p7_modelsample(rng, M, abc, &hmm) != eslOK) esl_fatal(msg);
  if (p7_profile_Config(gm, hmm, bg)    != eslOK) esl_fatal(msg);

  for (iter = 0; iter < N; iter++)
    {
      /* Sample a sequence from the model, with the length model set
       * (arbitrarily) to <M>. Resample until the sequence is no longer
       * than 6M, an arbitrary cap that keeps the test reasonably fast.
       */
      if (p7_profile_SetLength(gm, M) != eslOK) esl_fatal(msg);
      for (;;)
	{
	  esl_sq_Reuse(sq);
	  if (p7_ProfileEmit(rng, hmm, gm, bg, sq, gtr) != eslOK) esl_fatal(msg);
	  if (sq->n <= M * 6) break;
	}
      if (p7_trace_Index(gtr)                    != eslOK) esl_fatal(msg);
      if (p7_trace_Score(gtr, sq->dsq, gm, &gsc) != eslOK) esl_fatal(msg);

      /* From here on the length model matches the actual length sq->n. */
      if (p7_profile_SetLength(gm, sq->n) != eslOK) esl_fatal(msg);

      /* First pass: Viterbi and Forward share <rxf>; Backward fills <rxd>,
       * which Decoding then overwrites in place.
       */
      if (p7_ReferenceViterbi (sq->dsq, sq->n, gm, rxf, vtr, NULL) != eslOK) esl_fatal(msg);
      if (p7_ReferenceForward (sq->dsq, sq->n, gm, rxf, &fsc)      != eslOK) esl_fatal(msg);
      if (p7_ReferenceBackward(sq->dsq, sq->n, gm, rxd, NULL)      != eslOK) esl_fatal(msg);
      if (p7_ReferenceDecoding(sq->dsq, sq->n, gm, rxf, rxd, rxd)  != eslOK) esl_fatal(msg);

      /* Anchor determination (MPAS algorithm). */
      if (p7_reference_Anchors(rng, sq->dsq, sq->n, gm, rxf, rxd, vtr, &wrk, ah,
			       afu, afd, anch, &asc, NULL, NULL) != eslOK)
	esl_fatal(msg);

      /* <rxf>,<rxd> are recycled under the names <apu>,<apd> for the rest
       * of the ASC analysis: Backward, then Decoding.
       */
      p7_refmx_Reuse(apu);
      p7_refmx_Reuse(apd);
      if (p7_ReferenceASCBackward(sq->dsq, sq->n, gm, anch->a, anch->D,
				  apu, apd, NULL) != eslOK)
	esl_fatal(msg);
      if (p7_ReferenceASCDecoding(sq->dsq, sq->n, gm, anch->a, anch->D,
				  afu, afd, apu, apd, apu, apd) != eslOK)
	esl_fatal(msg);

      /* Envelope calculation. */
      if (p7_reference_Envelopes(sq->dsq, sq->n, gm, anch->a, anch->D,
				 apu, apd, afu, afd, env) != eslOK)
	esl_fatal(msg);

      /* There must be exactly one envelope per anchor. */
      if (env->D != anch->D) esl_fatal(msg);

      /* Per-domain checks. Every failure is fatal with the same message,
       * so tests 1-3 can be applied to each domain in a single pass.
       */
      for (dom = 1; dom <= anch->D; dom++)
	{
	  /* Test 1. Coordinates are coherent. */
	  coords_ok = (1                <= env->arr[dom].oa) &&
	              (env->arr[dom].oa <= env->arr[dom].ia) &&
	              (env->arr[dom].ia <= env->arr[dom].i0) &&
	              (env->arr[dom].i0 <= env->arr[dom].ib) &&
	              (env->arr[dom].ib <= env->arr[dom].ob) &&
	              (env->arr[dom].ob <= sq->n);
	  if (! coords_ok) esl_fatal(msg);

	  /* Test 2. No overlap with the preceding envelope. */
	  if (! (env->arr[dom].ia > env->arr[dom-1].ib)) esl_fatal(msg);

	  /* Test 3. envsc(d) <= asc_sc <= fwdsc. */
	  scores_ok = (env->arr[dom].env_sc <= asc+tol && asc <= fsc+tol);
	  if (! scores_ok) esl_fatal(msg);
	}

      /* Test 4. Only applies when both the generated trace and the
       * envelope set are single-domain, and the generated domain sits
       * inside the outer envelope. Note the indexing: domains are
       * 0..D-1 in <gtr>, but 1..D in <env>.
       */
      if (gtr->ndom == 1 && anch->D == 1)
	{
	  if (gtr->sqfrom[0] >= env->arr[1].oa && gtr->sqto[0] <= env->arr[1].ob)
	    {
	      if (! (env->arr[1].env_sc >= gsc)) esl_fatal(msg);
	    }
	}

      /* Recycle everything for the next sampled sequence. */
      p7_envelopes_Reuse(env);
      p7_anchors_Reuse(anch);
      p7_anchorhash_Reuse(ah);
      p7_refmx_Reuse(rxf);
      p7_refmx_Reuse(rxd);
      p7_refmx_Reuse(afu);
      p7_refmx_Reuse(afd);
      p7_trace_Reuse(gtr);
      p7_trace_Reuse(vtr);
      esl_sq_Reuse(sq);
    }

  free(wrk);			/* free(NULL) is a no-op, so no guard is needed */
  p7_envelopes_Destroy(env);
  p7_anchors_Destroy(anch);
  p7_anchorhash_Destroy(ah);
  p7_refmx_Destroy(afu);
  p7_refmx_Destroy(afd);
  p7_refmx_Destroy(rxf);
  p7_refmx_Destroy(rxd);
  p7_trace_Destroy(vtr);
  p7_trace_Destroy(gtr);
  p7_profile_Destroy(gm);
  p7_hmm_Destroy(hmm);
  p7_bg_Destroy(bg);
  esl_sq_Destroy(sq);
}
int main(int argc, char **argv) { ESL_GETOPTS *go = p7_CreateDefaultApp(options, 1, argc, argv, banner, usage); char *hmmfile = esl_opt_GetArg(go, 1); ESL_STOPWATCH *w = esl_stopwatch_Create(); ESL_RANDOMNESS *r = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s")); ESL_ALPHABET *abc = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; P7_PROFILE *gm = NULL; P7_OPROFILE *om = NULL; P7_OMX *ox1 = NULL; P7_OMX *ox2 = NULL; int L = esl_opt_GetInteger(go, "-L"); int N = esl_opt_GetInteger(go, "-N"); ESL_DSQ *dsq = malloc(sizeof(ESL_DSQ) * (L+2)); float null2[p7_MAXCODE]; int i,j,d,pos; int nsamples = 200; float fsc, bsc; double Mcs; if (p7_hmmfile_OpenE(hmmfile, NULL, &hfp, NULL) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile); if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM"); bg = p7_bg_Create(abc); p7_bg_SetLength(bg, L); gm = p7_profile_Create(hmm->M, abc); p7_ProfileConfig(hmm, bg, gm, L, p7_LOCAL); om = p7_oprofile_Create(gm->M, abc); p7_oprofile_Convert(gm, om); p7_oprofile_ReconfigLength(om, L); ox1 = p7_omx_Create(gm->M, L, L); ox2 = p7_omx_Create(gm->M, L, L); esl_rsq_xfIID(r, bg->f, abc->K, L, dsq); p7_Forward (dsq, L, om, ox1, &fsc); if (esl_opt_GetBoolean(go, "-t")) { P7_TRACE *tr = p7_trace_Create(); float *n2sc = malloc(sizeof(float) * (L+1)); esl_stopwatch_Start(w); for (i = 0; i < N; i++) { /* This is approximately what p7_domaindef.c::region_trace_ensemble() is doing: */ for (j = 0; j < nsamples; j++) { p7_StochasticTrace(r, dsq, L, om, ox1, tr); p7_trace_Index(tr); pos = 1; for (d = 0; d < tr->ndom; d++) { p7_Null2_ByTrace(om, tr, tr->tfrom[d], tr->tto[d], ox2, null2); for (; pos <= tr->sqfrom[d]; pos++) n2sc[pos] += 1.0; for (; pos < tr->sqto[d]; pos++) n2sc[pos] += null2[dsq[pos]]; } for (; pos <= L; pos++) n2sc[pos] += 1.0; p7_trace_Reuse(tr); } for (pos = 1; pos <= L; pos++) n2sc[pos] = logf(n2sc[pos] / nsamples); } esl_stopwatch_Stop(w); free(n2sc); p7_trace_Destroy(tr); } else { 
p7_Backward(dsq, L, om, ox1, ox2, &bsc); p7_Decoding(om, ox1, ox2, ox2); esl_stopwatch_Start(w); for (i = 0; i < N; i++) p7_Null2_ByExpectation(om, ox2, null2); esl_stopwatch_Stop(w); } Mcs = (double) N * (double) L * (double) gm->M * 1e-6 / (double) w->user; esl_stopwatch_Display(stdout, w, "# CPU time: "); printf("# M = %d\n", gm->M); printf("# %.1f Mc/s\n", Mcs); free(dsq); p7_omx_Destroy(ox1); p7_omx_Destroy(ox2); p7_oprofile_Destroy(om); p7_profile_Destroy(gm); p7_bg_Destroy(bg); p7_hmm_Destroy(hmm); p7_hmmfile_Close(hfp); esl_alphabet_Destroy(abc); esl_stopwatch_Destroy(w); esl_randomness_Destroy(r); esl_getopts_Destroy(go); return 0; }
/* glocal_region_trace_ensemble()
 * EPN, Tue Oct 5 10:13:25 2010
 *
 * A variant of p7_domaindef.c's region_trace_ensemble() that works on
 * generic DP matrices, so it can be used with glocally configured
 * models. It takes one extra argument, <do_null2>: when it is FALSE
 * (null2 has been turned off in the pipeline) all null2-related work is
 * skipped.
 *
 * The notes below are carried over from region_trace_ensemble()
 * (SRE, Fri Feb 8 11:49:44 2008 [Janelia]):
 *
 * Purpose:
 *   Region <ireg>..<jreg> of <dsq> has been judged to possibly contain
 *   more than one domain. Sample an ensemble of stochastic tracebacks
 *   for the region and cluster them to resolve it into domains.
 *
 * Args:
 *   ddef     - heuristic parameters controlling the clustering, plus
 *              working space for the calculation and its answers. The
 *              caller must have Reuse()'d <ddef->sp> specifically before
 *              calling (it has to be fresh), since the whole <ddef>
 *              cannot be reused while regions are still being analyzed.
 *   gm       - the profile, in multihit mode, with its target length
 *              distribution set to the total length of <dsq>; i.e. the
 *              same configuration used to score the complete sequence.
 *   dsq      - the complete digital sequence.
 *   ireg,jreg- start and end coordinates of the region in <dsq>.
 *   fwd      - filled Forward matrix for the region <dsq+ireg-1>,
 *              length <jreg-ireg+1>.
 *   wrk      - DP matrix with at least one row, used as temporary
 *              workspace (typically the caller's not-yet-used Backward
 *              matrix).
 *   do_null2 - TRUE to accumulate null2 scores; FALSE to skip them.
 *   ret_nc   - RETURN: number of clusters that remain defined.
 *
 * Returns:
 *   <eslOK>.
 *
 * On return:
 *   - <ddef->sp> holds the cluster definitions; the caller retrieves each
 *     one with <p7_spensemble_GetClusterCoords(ddef->sp...)>, and must
 *     Reuse() <ddef->sp> before calling this function again.
 *   - <ddef->n2sc[ireg..jreg]> holds log f'(x_i)/f(x_i) null2 scores for
 *     each residue if <do_null2> is set; otherwise it has been zeroed.
 *   - <ddef->tr> has been used as working memory for sampled traces.
 *   - per the original notes, <wrk> has had its zero row clobbered as
 *     working space for a null2 calculation.
 */
static int
glocal_region_trace_ensemble(P7_DOMAINDEF *ddef, const P7_PROFILE *gm, const ESL_DSQ *dsq, int ireg, int jreg,
			     const P7_GMX *fwd, P7_GMX *wrk, int do_null2, int *ret_nc)
{
  const ESL_DSQ *subseq = dsq + ireg - 1;   /* subseq[1..reglen] is dsq[ireg..jreg] */
  int            reglen = jreg - ireg + 1;
  float          null2[p7_MAXCODE];
  int            sample;		    /* index over sampled traces                 */
  int            dom;			    /* index over domains in one sampled trace   */
  int            rpos;			    /* position within the region, 1..reglen     */
  int            nclust;		    /* number of clusters found                  */
  int            a, b;			    /* indices over clusters                     */
  int            noverlap;		    /* overlap between two clusters, in residues */
  int            minlen;		    /* length of the shorter of two clusters     */
  int            loser;			    /* which of two overlapping clusters is dominated */
  int            nkept;			    /* number of clusters kept after pruning     */

  /* Start the region's null2 accumulators from zero. */
  esl_vec_FSet(ddef->n2sc+ireg, reglen, 0.0);

  /* Unless told otherwise, restore the RNG to its originally seeded
   * state so that results are reproducible.
   */
  if (ddef->do_reseeding)
    esl_randomness_Init(ddef->r, esl_randomness_GetSeed(ddef->r));

  /* Sample the ensemble. Every sampled domain is added to the
   * clustering; null2 odds ratios are accumulated only if requested.
   */
  sample = 0;
  while (sample < ddef->nsamples)
    {
      p7_GStochasticTrace(ddef->r, subseq, reglen, gm, fwd, ddef->tr);
      p7_trace_Index(ddef->tr);

      rpos = 1;
      for (dom = 0; dom < ddef->tr->ndom; dom++)
	{
	  /* trace coordinates are region-relative; shift to full-sequence coords */
	  p7_spensemble_Add(ddef->sp, sample,
			    ddef->tr->sqfrom[dom] + ireg - 1,
			    ddef->tr->sqto[dom]   + ireg - 1,
			    ddef->tr->hmmfrom[dom],
			    ddef->tr->hmmto[dom]);

	  if (! do_null2) continue;

	  p7_GNull2_ByTrace(gm, ddef->tr, ddef->tr->tfrom[dom], ddef->tr->tto[dom], wrk, null2);

	  /* Outside a domain f'(x) = f(x), so the ratio f'(x)/f(x) is 1:
	   * these residues are bumped by +1.
	   */
	  while (rpos <= ddef->tr->sqfrom[dom])
	    {
	      ddef->n2sc[ireg+rpos-1] += 1.0;
	      rpos++;
	    }

	  /* Residues in the domain are bumped by their null2 ratio. */
	  while (rpos <= ddef->tr->sqto[dom])
	    {
	      ddef->n2sc[ireg+rpos-1] += null2[subseq[rpos]];
	      rpos++;
	    }
	}

      /* Whatever is left of the region after the last domain also gets +1. */
      if (do_null2)
	{
	  while (rpos <= reglen)
	    {
	      ddef->n2sc[ireg+rpos-1] += 1.0;
	      rpos++;
	    }
	}

      p7_trace_Reuse(ddef->tr);
      sample++;
    }

  /* Turn the accumulated ratios into per-residue log odds null2 scores. */
  if (do_null2)
    {
      for (rpos = ireg; rpos <= jreg; rpos++)
	ddef->n2sc[rpos] = logf(ddef->n2sc[rpos] / (float) ddef->nsamples);
    }

  /* Cluster the sampled domains to break the region into envelopes. */
  p7_spensemble_Cluster(ddef->sp, ddef->min_overlap, ddef->of_smaller, ddef->max_diagdiff,
			ddef->min_posterior, ddef->min_endpointp, &nclust);

  /* Prune "dominated" clusters, judged on sequence coordinates. This is
   * a bit of a hack: <assignment> is overloaded as a per-cluster flag,
   * 1 meaning "dominated".
   */
  for (a = 0; a < nclust; a++)
    ddef->sp->assignment[a] = 0;

  /* For each pair that overlaps by at least 80% of the shorter one, the
   * cluster with the lower posterior probability is dominated (on a tie,
   * the earlier one is).
   */
  for (a = 0; a < nclust; a++)
    for (b = a+1; b < nclust; b++)
      {
	noverlap = ESL_MIN(ddef->sp->sigc[a].j, ddef->sp->sigc[b].j)
	         - ESL_MAX(ddef->sp->sigc[a].i, ddef->sp->sigc[b].i) + 1;
	if (noverlap == 0) break;

	minlen = ESL_MIN(ddef->sp->sigc[a].j - ddef->sp->sigc[a].i + 1,
			 ddef->sp->sigc[b].j - ddef->sp->sigc[b].i + 1);

	if ((float) noverlap / (float) minlen >= 0.8)
	  {
	    loser = (ddef->sp->sigc[a].prob > ddef->sp->sigc[b].prob) ? b : a;
	    ddef->sp->assignment[loser] = 1;
	  }
      }

  /* Compact the cluster list in place, keeping only undominated entries. */
  nkept = 0;
  for (a = 0; a < nclust; a++)
    {
      if (! ddef->sp->assignment[a])
	{
	  if (nkept != a)
	    memcpy(&ddef->sp->sigc[nkept], &ddef->sp->sigc[a], sizeof(struct p7_spcoord_s));
	  nkept++;
	}
    }

  ddef->sp->nc = nkept;
  *ret_nc      = nkept;
  return eslOK;
}