/* emit_alignment()
 *
 * Sample <N> sequences (option "-N") from the core model <hmm>, align
 * them to each other by their emitted traces, and write the resulting
 * alignment to <ofp> in format <outfmt>.
 *
 * Sampled sequences are named "<hmm->name>-sample<i>", i = 1..N.
 *
 * Any failure (allocation, emission, naming, alignment, output) is
 * fatal via esl_fatal(); the function returns nothing.
 */
static void
emit_alignment(ESL_GETOPTS *go, FILE *ofp, int outfmt, ESL_RANDOMNESS *r, P7_HMM *hmm)
{
  ESL_MSA   *msa      = NULL;
  ESL_SQ   **sq       = NULL;
  P7_TRACE **tr       = NULL;
  int        N        = esl_opt_GetInteger(go, "-N");
  int        optflags = p7_ALL_CONSENSUS_COLS;
  int        i;

  if ((tr = malloc(sizeof(P7_TRACE *) * N)) == NULL) esl_fatal("failed to allocate trace array");
  if ((sq = malloc(sizeof(ESL_SQ   *) * N)) == NULL) esl_fatal("failed to allocate seq array");

  for (i = 0; i < N; i++)
    {
      if ((sq[i] = esl_sq_CreateDigital(hmm->abc)) == NULL) esl_fatal("failed to allocate seq");
      if ((tr[i] = p7_trace_Create())              == NULL) esl_fatal("failed to allocate trace");
    }

  for (i = 0; i < N; i++)
    {
      if (p7_CoreEmit(r, hmm, sq[i], tr[i]) != eslOK)
	esl_fatal("Failed to emit sequence");
      if (esl_sq_FormatName(sq[i], "%s-sample%d", hmm->name, i+1) != eslOK)
	esl_fatal("Failed to set sequence name\n");
    }

  /* Do not hand a NULL/partial <msa> to the writer if trace alignment fails. */
  if (p7_tracealign_Seqs(sq, tr, N, hmm->M, optflags, hmm, &msa) != eslOK)
    esl_fatal("Failed to align sampled sequences");
  if (eslx_msafile_Write(ofp, msa, outfmt) != eslOK)
    esl_fatal("Failed to write alignment");

  for (i = 0; i < N; i++) p7_trace_Destroy(tr[i]);
  free(tr);
  for (i = 0; i < N; i++) esl_sq_Destroy(sq[i]);
  free(sq);
  esl_msa_Destroy(msa);
  return;
}
/* tests: * 1. each sampled trace must validate. * 2. each trace must be <= viterbi trace score * 3. in a large # of traces, one is "equal" to the viterbi trace score. * (this of course is stochastic; but it's true for the particular * choice of RNG seed used in tests here.) */ static void utest_stotrace(ESL_GETOPTS *go, ESL_RANDOMNESS *rng, ESL_ALPHABET *abc, P7_PROFILE *gm, P7_OPROFILE *om, ESL_DSQ *dsq, int L, int ntrace) { P7_GMX *gx = NULL; P7_OMX *ox = NULL; P7_TRACE *tr = NULL; char errbuf[eslERRBUFSIZE]; int idx; float maxsc = -eslINFINITY; float vsc, sc; if ((gx = p7_gmx_Create(gm->M, L)) == NULL) esl_fatal("generic DP matrix creation failed"); if ((ox = p7_omx_Create(gm->M, L, L)) == NULL) esl_fatal("optimized DP matrix create failed"); if ((tr = p7_trace_Create()) == NULL) esl_fatal("trace creation failed"); if (p7_GViterbi(dsq, L, gm, gx, &vsc) != eslOK) esl_fatal("viterbi failed"); if (p7_Forward (dsq, L, om, ox, NULL) != eslOK) esl_fatal("forward failed"); for (idx = 0; idx < ntrace; idx++) { if (p7_StochasticTrace(rng, dsq, L, om, ox, tr) != eslOK) esl_fatal("stochastic trace failed"); if (p7_trace_Validate(tr, abc, dsq, errbuf) != eslOK) esl_fatal("trace invalid:\n%s", errbuf); if (p7_trace_Score(tr, dsq, gm, &sc) != eslOK) esl_fatal("trace scoring failed"); maxsc = ESL_MAX(sc, maxsc); if (sc > vsc) esl_fatal("sampled trace has score > optimal Viterbi path; not possible"); p7_trace_Reuse(tr); } if (esl_FCompare(maxsc, vsc, 0.1) != eslOK) esl_fatal("stochastic trace failed to sample the Viterbi path"); p7_trace_Destroy(tr); p7_omx_Destroy(ox); p7_gmx_Destroy(gx); }
/* Example driver: read one HMM from <hmmfile>, configure a local-mode
 * profile of target length <L> (option "-L"), then emit <N> sequences
 * (option "-N") from the profile. Each sequence is written to stdout in
 * FASTA format and its trace is validated.
 *
 * Sequences are named "<hmm->name>-sample<i>" with i counting from 0.
 */
int
main(int argc, char **argv)
{
  ESL_GETOPTS    *go      = p7_CreateDefaultApp(options, 1, argc, argv, banner, usage);
  ESL_RANDOMNESS *rng     = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s"));
  char           *hmmfile = esl_opt_GetArg(go, 1);
  int             L       = esl_opt_GetInteger(go, "-L");
  int             N       = esl_opt_GetInteger(go, "-N");
  ESL_ALPHABET   *abc     = NULL;
  P7_HMMFILE     *hfp     = NULL;
  P7_HMM         *hmm     = NULL;
  P7_BG          *bg      = NULL;
  P7_PROFILE     *gm      = NULL;
  P7_TRACE       *tr      = p7_trace_Create();
  ESL_SQ         *sq      = NULL;
  char            errbuf[eslERRBUFSIZE];
  int             i;
  int             status;

  /* Open the HMM file, reporting the specific failure. */
  status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf);
  if      (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf);
  else if (status == eslEFORMAT)   p7_Fail("File format problem in trying to open HMM file %s.\n%s\n",                hmmfile, errbuf);
  else if (status != eslOK)        p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n",               status, hmmfile, errbuf);

  /* Read one HMM from it. */
  status = p7_hmmfile_Read(hfp, &abc, &hmm);
  if      (status == eslEFORMAT)   p7_Fail("Bad file format in HMM file %s:\n%s\n",          hfp->fname, hfp->errbuf);
  else if (status == eslEINCOMPAT) p7_Fail("HMM in %s is not in the expected %s alphabet\n", hfp->fname, esl_abc_DecodeType(abc->type));
  else if (status == eslEOF)       p7_Fail("Empty HMM file %s? No HMM data found.\n",        hfp->fname);
  else if (status != eslOK)        p7_Fail("Unexpected error in reading HMMs from %s\n",     hfp->fname);
  p7_hmmfile_Close(hfp);

  /* Configure null model and profile for target length L. */
  bg = p7_bg_Create(abc);
  p7_bg_SetLength(bg, L);
  gm = p7_profile_Create(hmm->M, abc);
  p7_ProfileConfig(hmm, bg, gm, L, p7_LOCAL);
  sq = esl_sq_CreateDigital(abc);

  for (i = 0; i < N; i++)
    {
      p7_ProfileEmit(rng, hmm, gm, bg, sq, tr);
      esl_sq_FormatName(sq, "%s-sample%d", hmm->name, i);
      esl_sqio_Write(stdout, sq, eslSQFILE_FASTA, FALSE);

      /* errbuf is data, not a format: never pass it as the format string. */
      if (p7_trace_Validate(tr, abc, sq->dsq, errbuf) != eslOK) esl_fatal("%s", errbuf);

      esl_sq_Reuse(sq);
      p7_trace_Reuse(tr);
    }

  esl_sq_Destroy(sq);
  p7_trace_Destroy(tr);
  p7_profile_Destroy(gm);
  p7_bg_Destroy(bg);
  p7_hmm_Destroy(hmm);
  esl_alphabet_Destroy(abc);
  esl_randomness_Destroy(rng);
  esl_getopts_Destroy(go);
  return 0;
}
/* The "basic" utest is a minimal driver for making a small DNA profile
 * and a small DNA sequence, then running Viterbi and Forward. It's
 * useful for dumping DP matrices and profiles for debugging.
 *
 * The model is built from a two-sequence GAATTC alignment and scored
 * against the target GAATTC in unilocal mode. The Viterbi score must
 * agree (within 1e-5) with the score of the Viterbi traceback.
 *
 * With option "-v", scores, DP matrices and the trace are dumped to
 * stdout. Any failure is fatal.
 */
static void
utest_basic(ESL_GETOPTS *go)
{
  char         *query = "# STOCKHOLM 1.0\n\nseq1 GAATTC\nseq2 GAATTC\n//\n";
  int           fmt   = eslMSAFILE_STOCKHOLM;
  char         *targ  = "GAATTC";
  ESL_ALPHABET *abc   = NULL;
  ESL_MSA      *msa   = NULL;
  P7_HMM       *hmm   = NULL;
  P7_PROFILE   *gm    = NULL;
  P7_BG        *bg    = NULL;
  P7_PRIOR     *pri   = NULL;
  ESL_DSQ      *dsq   = NULL;
  P7_GMX       *gx    = NULL;
  P7_TRACE     *tr    = NULL;
  int           L     = (int) strlen(targ);
  int           be_verbose;
  float         vsc, vsc2, fsc;

  /* Build and parameterize the GAATTC model. */
  if ((abc = esl_alphabet_Create(eslDNA))          == NULL)  esl_fatal("failed to create alphabet");
  if ((pri = p7_prior_CreateNucleic())             == NULL)  esl_fatal("failed to create prior");
  if ((msa = esl_msa_CreateFromString(query, fmt)) == NULL)  esl_fatal("failed to create MSA");
  if (esl_msa_Digitize(abc, msa, NULL)             != eslOK) esl_fatal("failed to digitize MSA");
  if (p7_Fastmodelmaker(msa, 0.5, NULL, &hmm, NULL) != eslOK) esl_fatal("failed to create GAATTC model");
  if (p7_ParameterEstimation(hmm, pri)             != eslOK) esl_fatal("failed to parameterize GAATTC model");
  if (p7_hmm_SetConsensus(hmm, NULL)               != eslOK) esl_fatal("failed to make consensus");

  /* Configure the profile and the target sequence. */
  if ((bg = p7_bg_Create(abc))                     == NULL)  esl_fatal("failed to create DNA null model");
  if ((gm = p7_profile_Create(hmm->M, abc))        == NULL)  esl_fatal("failed to create GAATTC profile");
  if (p7_ProfileConfig(hmm, bg, gm, L, p7_UNILOCAL) != eslOK) esl_fatal("failed to config profile");
  if (p7_profile_Validate(gm, NULL, 0.0001)        != eslOK) esl_fatal("whoops, profile is bad!");
  if (esl_abc_CreateDsq(abc, targ, &dsq)           != eslOK) esl_fatal("failed to create GAATTC digital sequence");
  if ((gx = p7_gmx_Create(gm->M, L))               == NULL)  esl_fatal("failed to create DP matrix");
  if ((tr = p7_trace_Create())                     == NULL)  esl_fatal("trace creation failed");

  be_verbose = esl_opt_GetBoolean(go, "-v");

  /* Viterbi, and its traceback. The statuses are checked so that the
   * score comparison below never reads an unset score.
   */
  if (p7_GViterbi(dsq, L, gm, gx, &vsc) != eslOK) esl_fatal("viterbi failed");
  if (be_verbose)
    {
      printf("Viterbi score: %.4f\n", vsc);
      p7_gmx_Dump(stdout, gx, p7_DEFAULT);
    }

  if (p7_GTrace(dsq, L, gm, gx, tr)      != eslOK) esl_fatal("trace failed");
  if (p7_trace_Score(tr, dsq, gm, &vsc2) != eslOK) esl_fatal("trace score failed");
  if (be_verbose) p7_trace_Dump(stdout, tr, gm, dsq);
  if (esl_FCompare(vsc, vsc2, 1e-5) != eslOK) esl_fatal("trace score and Viterbi score don't agree.");

  /* Forward; reuses (overwrites) the same DP matrix. */
  if (p7_GForward(dsq, L, gm, gx, &fsc) != eslOK) esl_fatal("forward failed");
  if (be_verbose)
    {
      printf("Forward score: %.4f\n", fsc);
      p7_gmx_Dump(stdout, gx, p7_DEFAULT);
    }

  p7_trace_Destroy(tr);
  p7_gmx_Destroy(gx);
  free(dsq);
  p7_profile_Destroy(gm);
  p7_bg_Destroy(bg);
  p7_hmm_Destroy(hmm);
  esl_msa_Destroy(msa);
  p7_prior_Destroy(pri);
  esl_alphabet_Destroy(abc);
  return;
}
int main(int argc, char **argv) { char *hmmfile = argv[1]; /* name of HMM file to read one HMM from */ ESL_ALPHABET *abc = NULL; /* sequence alphabet */ ESL_RANDOMNESS *r = NULL; /* source of randomness */ P7_HMMFILE *hfp = NULL; /* open hmmfile */ P7_HMM *hmm = NULL; /* HMM to emit from */ P7_PROFILE *gm = NULL; /* profile HMM (scores) */ P7_BG *bg = NULL; /* null model */ P7_TRACE *tr = NULL; /* sampled trace */ ESL_SQ *sq = NULL; /* sampled digital sequence */ int n = 1000; int counts[p7T_NSTATETYPES]; int i; float sc; float nullsc; double bitscore; r = esl_randomness_CreateFast(0); tr = p7_trace_Create(); if (p7_hmmfile_OpenE(hmmfile, NULL, &hfp, NULL) != eslOK) p7_Fail("failed to open %s", hmmfile); if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("failed to read HMM"); sq = esl_sq_CreateDigital(abc); bg = p7_bg_Create(abc); gm = p7_profile_Create(hmm->M, abc); p7_ProfileConfig(hmm, bg, gm, sq->n, p7_LOCAL); for (i = 0; i < n; i++) { p7_ProfileEmit(r, hmm, gm, bg, sq, tr); p7_trace_GetStateUseCounts(tr, counts); p7_ReconfigLength(gm, sq->n); p7_bg_SetLength(bg, sq->n); p7_trace_Score(tr, sq->dsq, gm, &sc); p7_bg_NullOne (bg, sq->dsq, sq->n, &nullsc); bitscore = (sc - nullsc)/ eslCONST_LOG2; printf("%d %8.4f\n", counts[p7T_M] + (counts[p7T_I] + counts[p7T_D])/2, bitscore); } p7_profile_Destroy(gm); esl_sq_Destroy(sq); p7_trace_Destroy(tr); esl_randomness_Destroy(r); esl_alphabet_Destroy(abc); p7_hmmfile_Close(hfp); p7_hmm_Destroy(hmm); return eslOK; }
/* emit_sequences()
 *
 * Emit <N> sequences (option "-N") and write each one to <ofp> in
 * sequence format <outfmt>, named "<hmm->name>-sample<i>", i = 1..N.
 *
 * With option "-p", sequences are sampled from a profile configured for
 * target length "-L" in the mode chosen by --local / --unilocal /
 * --glocal / --uniglocal (local if none is set); otherwise they are
 * sampled from the core model <hmm>. The profile and null model are
 * built and validated in either case.
 *
 * Any failure is fatal.
 */
static void
emit_sequences(ESL_GETOPTS *go, FILE *ofp, int outfmt, ESL_RANDOMNESS *r, P7_HMM *hmm)
{
  ESL_SQ     *seq         = NULL;
  P7_TRACE   *path        = NULL;
  P7_BG      *null_model  = NULL;
  P7_PROFILE *profile     = NULL;
  int         use_profile = esl_opt_GetBoolean(go, "-p");
  int         N           = esl_opt_GetInteger(go, "-N");
  int         L           = esl_opt_GetInteger(go, "-L");
  int         mode        = p7_LOCAL;
  int         idx;
  int         status;

  /* First alignment-mode option that is set wins. */
  if (esl_opt_GetBoolean(go, "--local")) {
    mode = p7_LOCAL;
  } else if (esl_opt_GetBoolean(go, "--unilocal")) {
    mode = p7_UNILOCAL;
  } else if (esl_opt_GetBoolean(go, "--glocal")) {
    mode = p7_GLOCAL;
  } else if (esl_opt_GetBoolean(go, "--uniglocal")) {
    mode = p7_UNIGLOCAL;
  }

  seq = esl_sq_CreateDigital(hmm->abc);
  if (seq == NULL) esl_fatal("failed to allocate sequence");

  path = p7_trace_Create();
  if (path == NULL) esl_fatal("failed to allocate trace");

  null_model = p7_bg_Create(hmm->abc);
  if (null_model == NULL) esl_fatal("failed to create null model");

  profile = p7_profile_Create(hmm->M, hmm->abc);
  if (profile == NULL) esl_fatal("failed to create profile");

  if (p7_ProfileConfig(hmm, null_model, profile, L, mode) != eslOK)
    esl_fatal("failed to configure profile");
  if (p7_bg_SetLength(null_model, L) != eslOK)
    esl_fatal("failed to reconfig null model length");
  if (p7_hmm_Validate(hmm, NULL, 0.0001) != eslOK)
    esl_fatal("whoops, HMM is bad!");
  if (p7_profile_Validate(profile, NULL, 0.0001) != eslOK)
    esl_fatal("whoops, profile is bad!");

  for (idx = 0; idx < N; idx++)
    {
      status = use_profile ? p7_ProfileEmit(r, hmm, profile, null_model, seq, path)
                           : p7_CoreEmit   (r, hmm, seq, path);
      if (status != 0) esl_fatal("Failed to emit sequence\n");

      /* sample names are numbered from 1 */
      status = esl_sq_FormatName(seq, "%s-sample%d", hmm->name, idx + 1);
      if (status != 0) esl_fatal("Failed to set sequence name\n");

      status = esl_sqio_Write(ofp, seq, outfmt, FALSE);
      if (status != eslOK) esl_fatal("Failed to write sequence\n");

      p7_trace_Reuse(path);
      esl_sq_Reuse(seq);
    }

  esl_sq_Destroy(seq);
  p7_trace_Destroy(path);
  p7_bg_Destroy(null_model);
  p7_profile_Destroy(profile);
}
/* Function: p7_SingleBuilder() * Synopsis: Build a new HMM from a single sequence. * * Purpose: Take the sequence <sq> and a build configuration <bld>, and * build a new HMM. * * The single sequence scoring system in the <bld> * configuration must have been previously initialized by * <p7_builder_SetScoreSystem()>. * * Args: bld - build configuration * sq - query sequence * bg - null model (needed to paramaterize insert emission probs) * opt_hmm - optRETURN: new HMM * opt_gm - optRETURN: profile corresponding to <hmm> * opt_om - optRETURN: optimized profile corresponding to <gm> * * Returns: <eslOK> on success. * * Throws: <eslEMEM> on allocation error. * <eslEINVAL> if <bld> isn't properly configured somehow. */ int p7_SingleBuilder(P7_BUILDER *bld, ESL_SQ *sq, P7_BG *bg, P7_HMM **opt_hmm, P7_TRACE **opt_tr, P7_PROFILE **opt_gm, P7_OPROFILE **opt_om) { P7_HMM *hmm = NULL; P7_TRACE *tr = NULL; int k; int status; bld->errbuf[0] = '\0'; if (! bld->Q) ESL_XEXCEPTION(eslEINVAL, "score system not initialized"); if ((status = p7_Seqmodel(bld->abc, sq->dsq, sq->n, sq->name, bld->Q, bg->f, bld->popen, bld->pextend, &hmm)) != eslOK) goto ERROR; if ((status = p7_hmm_SetComposition(hmm)) != eslOK) goto ERROR; if ((status = p7_hmm_SetConsensus(hmm, sq)) != eslOK) goto ERROR; if ((status = calibrate(bld, hmm, bg, opt_gm, opt_om)) != eslOK) goto ERROR; if ( bld->abc->type == eslDNA || bld->abc->type == eslRNA ) { if (bld->w_len > 0) hmm->max_length = bld->w_len; else if (bld->w_beta == 0.0) hmm->max_length = hmm->M *4; else if ( (status = p7_Builder_MaxLength(hmm, bld->w_beta)) != eslOK) goto ERROR; } /* build a faux trace: relative to core model (B->M_1..M_L->E) */ if (opt_tr != NULL) { if ((tr = p7_trace_Create()) == NULL) goto ERROR; if ((status = p7_trace_Append(tr, p7T_B, 0, 0)) != eslOK) goto ERROR; for (k = 1; k <= sq->n; k++) if ((status = p7_trace_Append(tr, p7T_M, k, k)) != eslOK) goto ERROR; if ((status = p7_trace_Append(tr, p7T_E, 0, 0)) != eslOK) goto ERROR; tr->M 
= sq->n; tr->L = sq->n; } /* note that <opt_gm> and <opt_om> were already set by calibrate() call above. */ if (opt_hmm != NULL) *opt_hmm = hmm; else p7_hmm_Destroy(hmm); if (opt_tr != NULL) *opt_tr = tr; return eslOK; ERROR: p7_hmm_Destroy(hmm); if (tr != NULL) p7_trace_Destroy(tr); if (opt_gm != NULL) p7_profile_Destroy(*opt_gm); if (opt_om != NULL) p7_oprofile_Destroy(*opt_om); return status; }
/* Viterbi validation is done by comparing the returned score * to the score of the optimal trace. Not foolproof, but catches * many kinds of errors. * * Another check is that the average score should be <= 0, * since the random sequences are drawn from the null model. */ static void utest_viterbi(ESL_GETOPTS *go, ESL_RANDOMNESS *r, ESL_ALPHABET *abc, P7_BG *bg, P7_PROFILE *gm, int nseq, int L) { float avg_sc = 0.; char errbuf[eslERRBUFSIZE]; ESL_DSQ *dsq = NULL; P7_GMX *gx = NULL; P7_TRACE *tr = NULL; int idx; float sc1, sc2; if ((dsq = malloc(sizeof(ESL_DSQ) *(L+2))) == NULL) esl_fatal("malloc failed"); if ((tr = p7_trace_Create()) == NULL) esl_fatal("trace creation failed"); if ((gx = p7_gmx_Create(gm->M, L)) == NULL) esl_fatal("matrix creation failed"); for (idx = 0; idx < nseq; idx++) { if (esl_rsq_xfIID(r, bg->f, abc->K, L, dsq) != eslOK) esl_fatal("seq generation failed"); if (p7_GViterbi(dsq, L, gm, gx, &sc1) != eslOK) esl_fatal("viterbi failed"); if (p7_GTrace (dsq, L, gm, gx, tr) != eslOK) esl_fatal("trace failed"); if (p7_trace_Validate(tr, abc, dsq, errbuf) != eslOK) esl_fatal("trace invalid:\n%s", errbuf); if (p7_trace_Score(tr, dsq, gm, &sc2) != eslOK) esl_fatal("trace score failed"); if (esl_FCompare(sc1, sc2, 1e-6) != eslOK) esl_fatal("Trace score != Viterbi score"); if (p7_bg_NullOne(bg, dsq, L, &sc2) != eslOK) esl_fatal("null score failed"); avg_sc += (sc1 - sc2); if (esl_opt_GetBoolean(go, "--vv")) printf("utest_viterbi: Viterbi score: %.4f (null %.4f) (total so far: %.4f)\n", sc1, sc2, avg_sc); p7_trace_Reuse(tr); } avg_sc /= (float) nseq; if (avg_sc > 0.) esl_fatal("Viterbi scores have positive expectation (%f nats)", avg_sc); p7_gmx_Destroy(gx); p7_trace_Destroy(tr); free(dsq); return; }
/* The "generation" test scores sequences generated by the same profile. * Each Viterbi and Forward score should be >= the trace score of the emitted seq. * The expectation of Forward scores should be positive. */ static void utest_generation(ESL_GETOPTS *go, ESL_RANDOMNESS *r, ESL_ALPHABET *abc, P7_PROFILE *gm, P7_HMM *hmm, P7_BG *bg, int nseq) { ESL_SQ *sq = esl_sq_CreateDigital(abc); P7_GMX *gx = p7_gmx_Create(gm->M, 100); P7_TRACE *tr = p7_trace_Create(); float vsc, fsc, nullsc, tracesc; float avg_fsc; int idx; avg_fsc = 0.0; for (idx = 0; idx < nseq; idx++) { if (p7_ProfileEmit(r, hmm, gm, bg, sq, tr) != eslOK) esl_fatal("profile emission failed"); if (p7_gmx_GrowTo(gx, gm->M, sq->n) != eslOK) esl_fatal("failed to reallocate gmx"); if (p7_GViterbi(sq->dsq, sq->n, gm, gx, &vsc) != eslOK) esl_fatal("viterbi failed"); if (p7_GForward(sq->dsq, sq->n, gm, gx, &fsc) != eslOK) esl_fatal("forward failed"); if (p7_trace_Score(tr, sq->dsq, gm, &tracesc) != eslOK) esl_fatal("trace score failed"); if (p7_bg_NullOne(bg, sq->dsq, sq->n, &nullsc) != eslOK) esl_fatal("null score failed"); if (vsc < tracesc) esl_fatal("viterbi score is less than trace"); if (fsc < tracesc) esl_fatal("forward score is less than trace"); if (vsc > fsc) esl_fatal("viterbi score is greater than forward"); if (esl_opt_GetBoolean(go, "--vv")) printf("generated: len=%d v=%8.4f f=%8.4f t=%8.4f\n", (int) sq->n, vsc, fsc, tracesc); avg_fsc += (fsc - nullsc); } avg_fsc /= (float) nseq; if (avg_fsc < 0.) esl_fatal("generation: Forward scores have negative expectation (%f nats)", avg_fsc); p7_gmx_Destroy(gx); p7_trace_Destroy(tr); esl_sq_Destroy(sq); }
/* Example driver: read one HMM and one sequence, run generic Viterbi
 * in local mode, dump and validate the traceback, then index the trace
 * and print the sequence/model coordinates of each domain.
 *
 * Usage: <prog> <hmmfile> <seqfile>
 */
int
main(int argc, char **argv)
{
  ESL_GETOPTS  *go        = esl_getopts_CreateDefaultApp(options, 2, argc, argv, banner, usage);
  char         *hmmfile   = esl_opt_GetArg(go, 1);
  char         *seqfile   = esl_opt_GetArg(go, 2);
  ESL_ALPHABET *abc       = NULL;
  P7_HMMFILE   *hmmfp     = NULL;
  P7_HMM       *hmm       = NULL;
  P7_BG        *null_mdl  = NULL;
  P7_PROFILE   *profile   = NULL;
  P7_GMX       *dpmx      = NULL;
  ESL_SQ       *target    = NULL;
  ESL_SQFILE   *seqfp     = NULL;
  P7_TRACE     *path      = NULL;
  int           format    = eslSQFILE_UNKNOWN;
  char          errbuf[eslERRBUFSIZE];
  float         vit_sc;
  int           dom;
  int           status;

  /* One HMM. */
  if (p7_hmmfile_Open(hmmfile, NULL, &hmmfp) != eslOK)
    p7_Fail("Failed to open HMM file %s", hmmfile);
  if (p7_hmmfile_Read(hmmfp, &abc, &hmm) != eslOK)
    p7_Fail("Failed to read HMM");
  p7_hmmfile_Close(hmmfp);

  /* One sequence. */
  target = esl_sq_CreateDigital(abc);
  status = esl_sqfile_Open(seqfile, format, NULL, &seqfp);
  switch (status)
    {
    case eslOK:                                                       break;
    case eslENOTFOUND: p7_Fail("No such file.");                      break;
    case eslEFORMAT:   p7_Fail("Format unrecognized.");               break;
    case eslEINVAL:    p7_Fail("Can't autodetect stdin or .gz.");     break;
    default:           p7_Fail("Open failed, code %d.", status);      break;
    }
  if (esl_sqio_Read(seqfp, target) != eslOK)
    p7_Fail("Failed to read sequence");
  esl_sqfile_Close(seqfp);

  /* Null model and local-mode profile, both set to the sequence length. */
  null_mdl = p7_bg_Create(abc);
  p7_bg_SetLength(null_mdl, target->n);
  profile = p7_profile_Create(hmm->M, abc);
  p7_ProfileConfig(hmm, null_mdl, profile, target->n, p7_LOCAL);

  /* DP matrix and trace. */
  dpmx = p7_gmx_Create(profile->M, target->n);
  path = p7_trace_Create();

  /* Viterbi fill, then traceback. */
  p7_GViterbi(target->dsq, target->n, profile, dpmx, &vit_sc);
  p7_GTrace  (target->dsq, target->n, profile, dpmx, path);

  /* Show the trace and make sure it is valid. */
  p7_trace_Dump(stdout, path, profile, target->dsq);
  if (p7_trace_Validate(path, abc, target->dsq, errbuf) != eslOK)
    p7_Die("trace fails validation:\n%s\n", errbuf);

  /* Per-domain coordinates from the indexed trace. */
  p7_trace_Index(path);
  printf("# Viterbi: %d domains : ", path->ndom);
  dom = 0;
  while (dom < path->ndom)
    {
      printf("%6d %6d %6d %6d  ",
	     path->sqfrom[dom], path->sqto[dom], path->hmmfrom[dom], path->hmmto[dom]);
      dom++;
    }
  printf("\n");

  /* Release everything. */
  p7_trace_Destroy(path);
  p7_gmx_Destroy(dpmx);
  p7_profile_Destroy(profile);
  p7_bg_Destroy(null_mdl);
  p7_hmm_Destroy(hmm);
  esl_sq_Destroy(target);
  esl_alphabet_Destroy(abc);
  esl_getopts_Destroy(go);
  return 0;
}
/* profile_local_endpoints() * * Purpose: Wrapper around <p7_ProfileEmit()>, sampling a local * alignment fragment from the profile's probabilistic model * (which may be the implicit model of HMMER3, or the * Plan7 model of HMMER2), and reporting coordinates * of the fragment w.r.t. both model and sequence. * * To simplify the implementation, the profile must be in * <p7_UNILOCAL> mode, not <p7_LOCAL> mode, so we know we * only have to deal with a single hit per sampled * sequence. * * We want <i1..i2> to be relative to the sequence coords * of a complete (global) sampled sequence that we could * have sampled this local alignment from; but the <i1..i2> * we initially get are relative to our profile-sampled * trace, so they are offset both by N-generated residues * that occur in the profile and by residues that the * profile's local entry skipped. To translate from * profile/sequence coords to core model/sequence coords, * we use rejection sampling: sample traces from the core * model until we find one that uses the same statetypes * at *initial* entry/exit points <k1>,<k2>, then use * that sample's sequence to determine offsets and correct * <i1..i2> reference frame. * * Local alignment endpoints are defined to be * match-delimited. However, an H3 model allows exit on * either a D or M state. Thus, the initially sampled end * point k2 may need to be rolled back to last M state, to * satisfy local alignment endpoint definition. Entries are * not a problem; both H2 and H3 profiles can only enter on * a M state. (This rollback has to occur after we've * matched a core trace to the profile trace to determine * i offsets.) * * Then, sampling from both the core model and the profile * in the same routine introduces a complication: * conceivably, profile configuration alters the transition * probabilities in the core model (by adding <M->E> * transitions and renormalizing the M transition * distributions, for example; H2 configuration does this, * though H3 does not). 
So you can't <CoreSample()> the * <gm->hmm> safely. To avoid such things, the caller * provides a clean copy of the core model in <core>. * * i endpoints are normalized/discretized to 1..<Lbins>, so * we can collate i statistics from sampled sequences of * varying L. Note this causes discretization artifacts, * leading to underrepresentation of j=M and * overrepresentation of i=1. * * Returns: <eslOK> on success; returns normalized sequence coords in * <*ret_i1> and <*ret_i2>, and the model entry/exit coords * in <*ret_k1> and <*ret_k2>. * * Xref: STL11/142-143 */ static int profile_local_endpoints(ESL_RANDOMNESS *r, P7_HMM *core, P7_PROFILE *gm, ESL_SQ *sq, P7_TRACE *tr, int Lbins, int *ret_i1, int *ret_i2, int *ret_k1, int *ret_k2) { int status; int i1,i2; int k1,k2; int t1,t2; /* entry/exit positions in local trace, tr */ int tg1, tg2; /* entry/exit positions in global trace, tr2 */ int tpos; int nterm, cterm; /* offsets at N, C terminus. */ int L; /* inferred length from 3-part patching */ ESL_SQ *sq2 = NULL; P7_TRACE *tr2 = NULL; int failsafe = 0; if (gm->mode != p7_UNILOCAL) ESL_XEXCEPTION(eslEINVAL, "profile must be unilocal"); if ((sq2 = esl_sq_CreateDigital(gm->abc)) == NULL) { status = eslEMEM; goto ERROR; } if ((tr = p7_trace_Create()) == NULL) { status = eslEMEM; goto ERROR; } /* sample local alignment from the implicit model */ if (gm->h2_mode) { if ((status = p7_H2_ProfileEmit(r, gm, sq, tr)) != eslOK) goto ERROR; } else { if ((status = p7_ProfileEmit(r, gm, sq, tr)) != eslOK) goto ERROR; } /* Get initial trace coords */ for (tpos = 0; tpos < tr->N; tpos++) if (tr->st[tpos] == p7T_B) { t1 = tpos+1; break; } for (tpos = tr->N-1; tpos >= 0; tpos--) if (tr->st[tpos] == p7T_E) { t2 = tpos-1; break; } /* Match a core trace to this local trace by rejection sampling; * this is to let us calculate sequence offsets; see comments above in preamble */ do { if (failsafe++ == 100000) ESL_XEXCEPTION(eslENOHALT, "failed to match core,local traces in %d tries\n", 
failsafe); if ((status = p7_CoreEmit(r, core, sq2, tr2)) != eslOK) goto ERROR; for (tpos = 0; tpos < tr2->N; tpos++) if (tr2->k[tpos] == tr->k[t1]) { tg1 = tpos; break; } for (tpos = tr2->N-1; tpos >= 0; tpos--) if (tr2->k[tpos] == tr->k[t2]) { tg2 = tpos; break; } } while (tr2->st[tg1] != tr->st[t1] && tr2->st[tg2] != tr->st[t2]); /* tg1..tg2 in core trace is now matched to t1..t2 in the profile trace. * Calculate # of residues preceding tg1 and following tg2 in the core trace. * A core trace can only generate residues from M or I states. */ for (nterm = 0, tpos = 0; tpos < tg1; tpos++) if (tr2->st[tpos] == p7T_M || tr2->st[tpos] == p7T_I) nterm++; for (cterm = 0, tpos = tr2->N-1; tpos > tg2; tpos--) if (tr2->st[tpos] == p7T_M || tr2->st[tpos] == p7T_I) cterm++; /* rectify the t2 endpoint, rolling back any trailing D path */ for (; t2 >= 0; t2--) if (tr->st[t2] == p7T_M) break; if (t2 < t1) ESL_XEXCEPTION(eslEINCONCEIVABLE, "this only happens on an all-D path through profile"); /* determine initial endpoint coords from t1 and t2 */ i1 = tr->i[t1]; i2 = tr->i[t2]; k1 = tr->k[t1]; k2 = tr->k[t2]; /* offset the i coords. */ L = (i2-i1+1) + nterm + cterm; i2 = (i2-i1+1) + nterm; i1 = nterm+1; /* normalize the i coords into range 1..Lbins, instead of 1..L */ i1 = ((i1-1) * Lbins / L) + 1; i2 = ((i2-1) * Lbins / L) + 1; *ret_i1 = i1; *ret_i2 = i2; *ret_k1 = k1; *ret_k2 = k2; p7_trace_Destroy(tr2); esl_sq_Destroy(sq2); return eslOK; ERROR: if (sq2 != NULL) esl_sq_Destroy(sq2); if (tr2 != NULL) p7_trace_Destroy(tr2); *ret_i1 = 0.; *ret_i2 = 0.; *ret_k1 = 0; *ret_k2 = 0; return status; }
/* Statistics driver: sample local alignment endpoints from a profile
 * (or, with "-i", from the ideal endpoint distribution) and collate
 * them into an i-endpoint matrix and a k-endpoint matrix. Both are
 * converted to log_2(observed/expected) and optionally written as
 * PostScript ("--ips", "--kps").
 *
 * The model is read from the file given by "-m" if set; otherwise one
 * of length "-M" is sampled (ungapped with "-u", uniform-transition
 * with "-s", fully random otherwise). "-2" selects H2-style profile
 * configuration. "-n" is the number of samples, "-L" the target length.
 *
 * Returns eslOK on success (also after "-h"), eslFAIL on a bad command
 * line; other failures are fatal.
 */
int
main(int argc, char **argv)
{
  ESL_ALPHABET    *abc     = NULL;     /* sequence alphabet                       */
  ESL_GETOPTS     *go      = NULL;     /* command line processing                 */
  ESL_RANDOMNESS  *r       = NULL;     /* source of randomness                    */
  P7_HMM          *hmm     = NULL;     /* sampled HMM to emit from                */
  P7_HMM          *core    = NULL;     /* safe copy of the HMM, before config     */
  P7_BG           *bg      = NULL;     /* null model                              */
  ESL_SQ          *sq      = NULL;     /* sampled sequence                        */
  P7_TRACE        *tr      = NULL;     /* sampled trace                           */
  P7_PROFILE      *gm      = NULL;     /* profile                                 */
  int              i,j;
  int              i1,i2;
  int              k1,k2;
  int              iseq;
  FILE            *fp      = NULL;
  double           expected;

  int              do_ilocal;
  char            *hmmfile = NULL;
  int              nseq;
  int              do_swlike;
  int              do_ungapped;
  int              L;
  int              M;
  int              do_h2;
  char            *ipsfile = NULL;
  char            *kpsfile = NULL;
  ESL_DMATRIX     *imx     = NULL;
  ESL_DMATRIX     *kmx     = NULL;
  ESL_DMATRIX     *iref    = NULL;     /* reference matrix: expected i distribution under ideality */
  int              Lbins;
  int              status;
  char             errbuf[eslERRBUFSIZE];

  /*****************************************************************
   * Parse the command line
   *****************************************************************/
  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) esl_fatal("Failed to parse command line: %s\n", go->errbuf);
  if (esl_opt_VerifyConfig(go)               != eslOK) esl_fatal("Failed to parse command line: %s\n", go->errbuf);
  if (esl_opt_GetBoolean(go, "-h") == TRUE) {
    puts(usage);
    puts("\n  where options are:\n");
    esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=all docgroups; 2 = indentation; 80=textwidth*/
    esl_getopts_Destroy(go);
    return eslOK;
  }
  do_ilocal   = esl_opt_GetBoolean(go, "-i");
  hmmfile     = esl_opt_GetString (go, "-m");
  nseq        = esl_opt_GetInteger(go, "-n");
  do_swlike   = esl_opt_GetBoolean(go, "-s");
  do_ungapped = esl_opt_GetBoolean(go, "-u");
  L           = esl_opt_GetInteger(go, "-L");
  M           = esl_opt_GetInteger(go, "-M");
  do_h2       = esl_opt_GetBoolean(go, "-2");
  ipsfile     = esl_opt_GetString (go, "--ips");
  kpsfile     = esl_opt_GetString (go, "--kps");

  if (esl_opt_ArgNumber(go) != 0) {
    puts("Incorrect number of command line arguments.");
    printf("Usage: %s [options]\n", argv[0]);
    esl_getopts_Destroy(go);
    return eslFAIL;
  }

  r = esl_randomness_CreateFast(0);

  if (hmmfile != NULL)
    {	/* Read the HMM (and get alphabet from it) */
      P7_HMMFILE *hfp = NULL;

      status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf);
      if      (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf);
      else if (status == eslEFORMAT)   p7_Fail("File format problem in trying to open HMM file %s.\n%s\n",                hmmfile, errbuf);
      else if (status != eslOK)        p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n",               status, hmmfile, errbuf);

      if ((status = p7_hmmfile_Read(hfp, &abc, &hmm)) != eslOK) {
	if      (status == eslEOD)       esl_fatal("read failed, HMM file %s may be truncated?", hmmfile);
	else if (status == eslEFORMAT)   esl_fatal("bad file format in HMM file %s", hmmfile);
	else if (status == eslEINCOMPAT) esl_fatal("HMM file %s contains different alphabets", hmmfile);
	else                             esl_fatal("Unexpected error in reading HMMs");
      }
      M = hmm->M;
      p7_hmmfile_Close(hfp);
    }
  else
    {	/* Or sample the HMM (create alphabet first) */
      abc = esl_alphabet_Create(eslAMINO);
      if      (do_ungapped) p7_hmm_SampleUngapped(r, M, abc, &hmm);
      else if (do_swlike)   p7_hmm_SampleUniform (r, M, abc, 0.05, 0.5, 0.05, 0.2, &hmm); /* tmi, tii, tmd, tdd */
      else                  p7_hmm_Sample        (r, M, abc, &hmm);
    }

  Lbins = M;
  imx  = esl_dmatrix_Create(Lbins, Lbins);
  iref = esl_dmatrix_Create(Lbins, Lbins);
  kmx  = esl_dmatrix_Create(M, M);
  esl_dmatrix_SetZero(imx);
  esl_dmatrix_SetZero(iref);
  esl_dmatrix_SetZero(kmx);
  tr   = p7_trace_Create();
  sq   = esl_sq_CreateDigital(abc);
  bg   = p7_bg_Create(abc);
  core = p7_hmm_Clone(hmm);

  if (do_h2) {
    gm = p7_profile_Create(hmm->M, abc);
    p7_H2_ProfileConfig(hmm, bg, gm, p7_UNILOCAL);
  } else {
    gm = p7_profile_Create(hmm->M, abc);
    p7_ProfileConfig(hmm, bg, gm, L, p7_UNILOCAL);
    if (p7_hmm_Validate    (hmm, NULL, 0.0001) != eslOK) esl_fatal("whoops, HMM is bad!");
    if (p7_profile_Validate(gm,  NULL, 0.0001) != eslOK) esl_fatal("whoops, profile is bad!");
  }

  /* Sample endpoints.
   * Also sample an ideal reference distribution for i endpoints.  i
   * endpoints are prone to discretization artifacts, when emitted
   * sequences have varying lengths. Taking log odds w.r.t. an ideal
   * reference that is subject to the same discretization artifacts
   * cancels out the effect.
   */
  for (iseq = 0; iseq < nseq; iseq++)
    {
      if (do_ilocal)
	ideal_local_endpoints(r, core, sq, tr, Lbins, &i1, &i2, &k1, &k2);
      else
	{
	  /* On failure the sampler returns all-zero coords, which would
	   * index mx[-1][-1] below; stop instead of corrupting memory.
	   */
	  status = profile_local_endpoints(r, core, gm, sq, tr, Lbins, &i1, &i2, &k1, &k2);
	  if (status != eslOK) esl_fatal("Failed to sample local endpoints from profile (code %d)", status);
	}

      imx->mx[i1-1][i2-1] += 1.;
      kmx->mx[k1-1][k2-1] += 1.;

      /* reference distribution for i */
      ideal_local_endpoints(r, core, sq, tr, Lbins, &i1, &i2, &k1, &k2);
      iref->mx[i1-1][i2-1] += 1.;
    }

  /* Adjust both mx's to log_2(obs/exp) ratio */
  printf("Before normalization/log-odds:\n");
  printf("   i matrix values range from %f to %f\n", dmx_upper_min(imx), dmx_upper_max(imx));
  printf("   k matrix values range from %f to %f\n", dmx_upper_min(kmx), dmx_upper_max(kmx));
  printf("iref matrix values range from %f to %f\n", dmx_upper_min(iref), dmx_upper_max(iref));

  /* k endpoints: expected count is uniform over the M(M+1)/2 upper-triangle cells */
  expected = (double) nseq * 2. / (double) (M*(M+1));
  for (i = 0; i < kmx->m; i++)
    for (j = i; j < kmx->n; j++)
      kmx->mx[i][j] = log(kmx->mx[i][j] / expected) / log(2.0);

  /* i endpoints: log-odds against the sampled ideal reference, with
   * explicit values where either count is zero.
   */
  for (i = 0; i < imx->m; i++)
    for (j = i; j < imx->m; j++)
      {
	if      (iref->mx[i][j] == 0. && imx->mx[i][j] == 0.) imx->mx[i][j] = 0.;
	else if (iref->mx[i][j] == 0.)                        imx->mx[i][j] = eslINFINITY;
	else if (imx->mx[i][j]  == 0.)                        imx->mx[i][j] = -eslINFINITY;
	else    imx->mx[i][j] = log(imx->mx[i][j] / iref->mx[i][j]) / log(2.0);
      }

  /* Print ps files */
  if (kpsfile != NULL) {
    if ((fp = fopen(kpsfile, "w")) == NULL) esl_fatal("Failed to open output postscript file %s", kpsfile);
    dmx_Visualize(fp, kmx, -4., 5.);
    fclose(fp);
  }
  if (ipsfile != NULL) {
    if ((fp = fopen(ipsfile, "w")) == NULL) esl_fatal("Failed to open output postscript file %s", ipsfile);
    dmx_Visualize(fp, imx, -4., 5.);
    /* dmx_Visualize(fp, imx, dmx_upper_min(imx), dmx_upper_max(imx)); */
    fclose(fp);
  }

  printf("After normalization/log-odds:\n");
  printf("i matrix values range from %f to %f\n", dmx_upper_min(imx), dmx_upper_max(imx));
  printf("k matrix values range from %f to %f\n", dmx_upper_min(kmx), dmx_upper_max(kmx));

  p7_profile_Destroy(gm);
  p7_bg_Destroy(bg);
  p7_hmm_Destroy(core);
  p7_hmm_Destroy(hmm);
  p7_trace_Destroy(tr);
  esl_sq_Destroy(sq);
  esl_dmatrix_Destroy(imx);
  esl_dmatrix_Destroy(iref);            /* was leaked by the original cleanup */
  esl_dmatrix_Destroy(kmx);
  esl_alphabet_Destroy(abc);
  esl_randomness_Destroy(r);
  esl_getopts_Destroy(go);
  return eslOK;
}
int main(int argc, char **argv) { ESL_GETOPTS *go = p7_CreateDefaultApp(options, 1, argc, argv, banner, usage); char *hmmfile = esl_opt_GetArg(go, 1); ESL_STOPWATCH *w = esl_stopwatch_Create(); ESL_RANDOMNESS *r = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s")); ESL_ALPHABET *abc = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; P7_PROFILE *gm = NULL; P7_OPROFILE *om = NULL; P7_OMX *ox1 = NULL; P7_OMX *ox2 = NULL; int L = esl_opt_GetInteger(go, "-L"); int N = esl_opt_GetInteger(go, "-N"); ESL_DSQ *dsq = malloc(sizeof(ESL_DSQ) * (L+2)); float null2[p7_MAXCODE]; int i,j,d,pos; int nsamples = 200; float fsc, bsc; double Mcs; if (p7_hmmfile_OpenE(hmmfile, NULL, &hfp, NULL) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile); if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM"); bg = p7_bg_Create(abc); p7_bg_SetLength(bg, L); gm = p7_profile_Create(hmm->M, abc); p7_ProfileConfig(hmm, bg, gm, L, p7_LOCAL); om = p7_oprofile_Create(gm->M, abc); p7_oprofile_Convert(gm, om); p7_oprofile_ReconfigLength(om, L); ox1 = p7_omx_Create(gm->M, L, L); ox2 = p7_omx_Create(gm->M, L, L); esl_rsq_xfIID(r, bg->f, abc->K, L, dsq); p7_Forward (dsq, L, om, ox1, &fsc); if (esl_opt_GetBoolean(go, "-t")) { P7_TRACE *tr = p7_trace_Create(); float *n2sc = malloc(sizeof(float) * (L+1)); esl_stopwatch_Start(w); for (i = 0; i < N; i++) { /* This is approximately what p7_domaindef.c::region_trace_ensemble() is doing: */ for (j = 0; j < nsamples; j++) { p7_StochasticTrace(r, dsq, L, om, ox1, tr); p7_trace_Index(tr); pos = 1; for (d = 0; d < tr->ndom; d++) { p7_Null2_ByTrace(om, tr, tr->tfrom[d], tr->tto[d], ox2, null2); for (; pos <= tr->sqfrom[d]; pos++) n2sc[pos] += 1.0; for (; pos < tr->sqto[d]; pos++) n2sc[pos] += null2[dsq[pos]]; } for (; pos <= L; pos++) n2sc[pos] += 1.0; p7_trace_Reuse(tr); } for (pos = 1; pos <= L; pos++) n2sc[pos] = logf(n2sc[pos] / nsamples); } esl_stopwatch_Stop(w); free(n2sc); p7_trace_Destroy(tr); } else { 
p7_Backward(dsq, L, om, ox1, ox2, &bsc); p7_Decoding(om, ox1, ox2, ox2); esl_stopwatch_Start(w); for (i = 0; i < N; i++) p7_Null2_ByExpectation(om, ox2, null2); esl_stopwatch_Stop(w); } Mcs = (double) N * (double) L * (double) gm->M * 1e-6 / (double) w->user; esl_stopwatch_Display(stdout, w, "# CPU time: "); printf("# M = %d\n", gm->M); printf("# %.1f Mc/s\n", Mcs); free(dsq); p7_omx_Destroy(ox1); p7_omx_Destroy(ox2); p7_oprofile_Destroy(om); p7_profile_Destroy(gm); p7_bg_Destroy(bg); p7_hmm_Destroy(hmm); p7_hmmfile_Close(hfp); esl_alphabet_Destroy(abc); esl_stopwatch_Destroy(w); esl_randomness_Destroy(r); esl_getopts_Destroy(go); return 0; }
/* Function:  hmmpgmd2msa()
 * Synopsis:  Convert an HMMPGMD-derived data stream to an MSA, based
 *            on the corresponding hmm
 *
 * Purpose:   Given a data stream from HMMPGMD of the form shown
 *            here, produce an MSA:
 *                 HMMD_SEARCH_STATS
 *                 P7_HIT array of size (nhits) from above
 *                 then repeats of P7_DOMAIN and P7_ALIDISPLAY data
 *                 for the hits, where each hit with d domains
 *                 produces
 *                   d P7_DOMAINs
 *                   then
 *                   d P7_ALIDISPLAYs, each followed by <memsize>
 *                     bytes of serialized alignment text
 *            ... optionally adding a sequence with length matching
 *            that of the hmm, which will be included in the alignment.
 *
 *            A further extension is the ability to include or exclude
 *            sequences from the list of hits.
 *
 *            This function's expected use is as a helper function for
 *            the hmmer website, which gets the above data stream from
 *            hmmpgmd.
 *
 * Args :     data: a pointer to binary data in the format given above
 *            hmm:  the HMM against which the alidisplay traces and
 *                  additional sequences/traces are threaded to reach
 *                  the returned msa.
 *            qsq : optional sequence to be included in the output msa;
 *                  must have the same number of residues as the hmm
 *                  has states, as each residue i will be aligned to
 *                  state i.
 *            incl: optional array of sequence names, in the case of
 *                  hmmpgmd a list of ints, which are excluded due
 *                  to the sequence threshold, but have been selected
 *                  to be included in the alignment.  This ties in
 *                  with the way jackhmmer is implemented on the
 *                  HMMER website.
 *            incl_size: required size of the incl array. zero if incl is null.
 *            excl: optional array of sequence names, in the case of
 *                  hmmpgmd a list of ints, which are included as they
 *                  score above threshold, but have been selected
 *                  to be excluded from the alignment.
 *            excl_size: required size of the excl array. zero if excl is null.
 *            ret_msa: RETURN: the completed MSA. This function keeps no
 *                  reference to it.
 *
 * Returns:   <eslOK> on success, with the alignment in <*ret_msa>.
 *
 *            <eslFAIL> if <qsq> does not have exactly <hmm->M> residues,
 *            or if the search stats or a hit record fail the sanity
 *            checks; otherwise the status of the call that failed
 *            (trace construction, allocation, alignment). On any
 *            failure <*ret_msa> is set to NULL.
 */
int
hmmpgmd2msa(void *data, P7_HMM *hmm, ESL_SQ *qsq, int *incl, int incl_size, int *excl, int excl_size, ESL_MSA **ret_msa)
{
  int i, j;
  int c;
  int status;
  int set_included;

  /* trace of the query sequence with N residues onto model with N match states */
  P7_TRACE *qtr         = NULL;
  int       extra_sqcnt = 0;

  /* vars used to read from the binary data */
  HMMD_SEARCH_STATS *stats = NULL;   /* pointer to a single stats object, at the beginning of data */
  P7_HIT            *hits  = NULL;   /* an array of hits, at the appropriate offset in data        */

  /* vars used in msa construction */
  P7_TOPHITS     th;
  P7_ALIDISPLAY *ad, *ad2;
  ESL_MSA       *msa   = NULL;
  P7_DOMAIN     *dom   = NULL;
  void          *admem = NULL;       /* alidisplay text buffer not yet owned by an alidisplay */

  char *p = (char*)data;             /* pointer used to walk along data, must be char* to allow pointer arithmetic */

  th.N     = 0;
  th.unsrt = NULL;
  th.hit   = NULL;

  /* optionally build a faux trace for the query sequence: relative to core model (B->M_1..M_L->E) */
  if (qsq != NULL)
    {
      if (qsq->n != hmm->M)                   { status = eslFAIL; goto ERROR; }
      if ((qtr = p7_trace_Create()) == NULL)  { status = eslFAIL; goto ERROR; }

      if ((status = p7_trace_Append(qtr, p7T_B, 0, 0)) != eslOK) goto ERROR;
      for (i = 1; i <= qsq->n; i++)
        if ((status = p7_trace_Append(qtr, p7T_M, i, i)) != eslOK) goto ERROR;
      if ((status = p7_trace_Append(qtr, p7T_E, 0, 0)) != eslOK) goto ERROR;

      qtr->M      = qsq->n;
      qtr->L      = qsq->n;
      extra_sqcnt = 1;
    }

  /* get search stats + hit info */
  stats = (HMMD_SEARCH_STATS*)p;

  /* sanity check */
  if ( ( stats->Z_setby    != p7_ZSETBY_NTARGETS && stats->Z_setby    != p7_ZSETBY_OPTION && stats->Z_setby    != p7_ZSETBY_FILEINFO ) ||
       ( stats->domZ_setby != p7_ZSETBY_NTARGETS && stats->domZ_setby != p7_ZSETBY_OPTION && stats->domZ_setby != p7_ZSETBY_FILEINFO ) ||
       stats->nhits   > 10000000 ||
       stats->elapsed > 1000000 )
    {
      status = eslFAIL;
      goto ERROR;
    }

  /* ok, it looks legitimate */
  p   += sizeof(HMMD_SEARCH_STATS);
  hits = (P7_HIT*)p;
  p   += sizeof(P7_HIT) * stats->nhits;

  /* create a tophits object, to be passed to p7_tophits_Alignment() */
  ESL_ALLOC(th.unsrt, sizeof(P7_HIT) * stats->nhits);
  memcpy(th.unsrt, hits, sizeof(P7_HIT) * stats->nhits);
  ESL_ALLOC(th.hit, sizeof(P7_HIT*) * stats->nhits);
  for (i = 0; i < stats->nhits; i++)
    {
      th.hit[i] = &(th.unsrt[i]);

      /* The <dcl> value copied out of the stream is not a pointer we own.
       * Clear it so the cleanup code can tell which hits have had their
       * domain array allocated.
       */
      th.hit[i]->dcl = NULL;

      if ( th.hit[i]->ndom > 10000 ||
           th.hit[i]->flags > p7_IS_INCLUDED + p7_IS_REPORTED + p7_IS_NEW + p7_IS_DROPPED + p7_IS_DUPLICATE )
        {
          status = eslFAIL;
          goto ERROR;
        }
    }

  th.N                    = stats->nhits;
  th.nreported            = 0;
  th.nincluded            = 0;
  th.is_sorted_by_sortkey = 0;
  th.is_sorted_by_seqidx  = 0;

  for (i = 0; i < th.N; i++)
    {
      ESL_ALLOC(th.hit[i]->dcl, sizeof(P7_DOMAIN) * th.hit[i]->ndom);

      /* Go through the hits and set to be excluded or included as necessary.
       * The hit's <name> field is compared as an integer id, not as a string.
       */
      set_included = 0;
      if (th.hit[i]->flags & p7_IS_INCLUDED)
        {
          if (excl_size > 0)
            {
              for (c = 0; c < excl_size; c++)
                {
                  if (excl[c] == (long)(th.hit[i]->name))
                    {
                      th.hit[i]->flags     = p7_IS_DROPPED;
                      th.hit[i]->nincluded = 0;
                      break;
                    }
                }
            }
        }
      else
        {
          if (incl_size > 0)
            {
              for (c = 0; c < incl_size; c++)
                {
                  if (incl[c] == (long)th.hit[i]->name)
                    {
                      th.hit[i]->flags = p7_IS_INCLUDED;
                      set_included     = 1;
                    }
                }
            }
        }

      /* first grab all the P7_DOMAINs for the hit */
      for (j = 0; j < th.hit[i]->ndom; j++)
        {
          dom = th.hit[i]->dcl + j;
          memcpy(dom, (P7_DOMAIN*)p, sizeof(P7_DOMAIN));

          /* <ad> as copied from the stream is not a usable pointer; it is
           * filled in by the next loop. NULL marks "nothing to destroy yet".
           */
          dom->ad = NULL;

          /* Possibly set domains to be included if being
           * externally set via incl list */
          if (set_included) th.hit[i]->dcl[j].is_included = 1;

          p += sizeof(P7_DOMAIN);
        }

      /* then grab the P7_ALIDISPLAYs for the hit */
      for (j = 0; j < th.hit[i]->ndom; j++)
        {
          ad = (P7_ALIDISPLAY*)p;

          /* Allocate the text buffer before the struct, so that a failed
           * allocation never leaves a half-initialized alidisplay attached
           * to the domain for the cleanup code to destroy.
           */
          ESL_ALLOC(admem, ad->memsize);
          ESL_ALLOC(th.hit[i]->dcl[j].ad, sizeof(P7_ALIDISPLAY));
          ad2      = th.hit[i]->dcl[j].ad;
          ad2->mem = admem;
          admem    = NULL;               /* ownership has passed to <ad2> */

          ad2->memsize = ad->memsize;
          ad2->rfline  = ad->rfline;
          ad2->mmline  = ad->mmline;
          ad2->csline  = ad->csline;
          ad2->model   = ad->model;
          ad2->mline   = ad->mline;
          ad2->aseq    = ad->aseq;
          ad2->ppline  = ad->ppline;
          ad2->N       = ad->N;

          ad2->hmmname = ad->hmmname;
          ad2->hmmacc  = ad->hmmacc;
          ad2->hmmdesc = ad->hmmdesc;
          ad2->hmmfrom = ad->hmmfrom;
          ad2->hmmto   = ad->hmmto;
          ad2->M       = ad->M;

          ad2->sqname  = ad->sqname;
          ad2->sqacc   = ad->sqacc;
          ad2->sqdesc  = ad->sqdesc;
          ad2->sqfrom  = ad->sqfrom;
          ad2->sqto    = ad->sqto;
          ad2->L       = ad->L;

          p += sizeof(P7_ALIDISPLAY);

          memcpy(ad2->mem, p, ad2->memsize);
          p += ad2->memsize;

          p7_alidisplay_Deserialize(ad2);
        }
    }

  /* use the tophits and trace info above to produce an alignment */
  if ((status = p7_tophits_Alignment(&th, hmm->abc, &qsq, &qtr, extra_sqcnt, p7_ALL_CONSENSUS_COLS, &msa)) != eslOK) goto ERROR;

  status = eslOK;   /* deliberate fall-through: success and failure share the cleanup below */

 ERROR:
  /* A trace owns internal arrays, so it needs its destructor, not free(). */
  if (qtr != NULL) p7_trace_Destroy(qtr);

  /* th.N is still 0 if we failed before the hit list was accepted. */
  for (i = 0; i < th.N; i++)
    {
      if (th.hit[i]->dcl == NULL) continue;
      for (j = 0; j < th.hit[i]->ndom; j++)
        if (th.hit[i]->dcl[j].ad != NULL) p7_alidisplay_Destroy(th.hit[i]->dcl[j].ad);
      free(th.hit[i]->dcl);
    }
  free(admem);
  free(th.unsrt);
  free(th.hit);

  *ret_msa = (status == eslOK) ? msa : NULL;
  return status;
}
int main(int argc, char **argv) { ESL_GETOPTS *go = p7_CreateDefaultApp(options, 2, argc, argv, banner, usage); ESL_RANDOMNESS *rng = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s")); ESL_ALPHABET *abc = NULL; char *ghmmfile = esl_opt_GetArg(go, 1); /* HMMs parameterized for sequence generation */ char *ahmmfile = esl_opt_GetArg(go, 2); /* HMMs parameterized for alignment */ int N = esl_opt_GetInteger(go, "-N"); P7_HMMFILE *ghfp = NULL; P7_HMMFILE *ahfp = NULL; P7_HMM *ghmm = NULL; P7_HMM *ahmm = NULL; P7_PROFILE *ggm = NULL; P7_PROFILE *agm = NULL; P7_OPROFILE *aom = NULL; P7_BG *bg = NULL; ESL_SQ *sq = NULL; P7_TRACE *reftr = p7_trace_Create(); P7_TRACE *testtr = p7_trace_Create(); P7_TRACE_METRICS *tmetrics = p7_trace_metrics_Create(); P7_REFMX *rmx = p7_refmx_Create(100,100); // P7_FILTERMX *ox = NULL; P7_HARDWARE *hw; if ((hw = p7_hardware_Create ()) == NULL) p7_Fail("Couldn't get HW information data structure"); P7_SPARSEMASK *sm = p7_sparsemask_Create(100, 100, hw->simd); P7_SPARSEMX *sxv = p7_sparsemx_Create(NULL); int idx; char errbuf[eslERRBUFSIZE]; int status; p7_Init(); /* open HMM file containing models parameterized for generation (sampling) of seqs */ status = p7_hmmfile_OpenE(ghmmfile, NULL, &ghfp, errbuf); if (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", ghmmfile, errbuf); else if (status == eslEFORMAT) p7_Fail("File format problem in trying to open HMM file %s.\n%s\n", ghmmfile, errbuf); else if (status != eslOK) p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n", status, ghmmfile, errbuf); /* open HMM file containing models parameterized for alignment (may be the same as ghmmfile) */ status = p7_hmmfile_OpenE(ahmmfile, NULL, &ahfp, errbuf); if (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", ahmmfile, errbuf); else if (status == eslEFORMAT) p7_Fail("File format problem in trying to open HMM file %s.\n%s\n", 
ahmmfile, errbuf); else if (status != eslOK) p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n", status, ahmmfile, errbuf); while ( (status = p7_hmmfile_Read(ghfp, &abc, &ghmm)) == eslOK) /* <abc> gets set on first read */ { /* read the counterpart HMM from <ahfp> */ status = p7_hmmfile_Read(ahfp, &abc, &ahmm); if (status == eslEFORMAT) p7_Fail("Bad file format in HMM file %s:\n%s\n", ahfp->fname, ahfp->errbuf); else if (status == eslEINCOMPAT) p7_Fail("HMM in %s is not in the expected %s alphabet\n", ahfp->fname, esl_abc_DecodeType(abc->type)); else if (status == eslEOF) p7_Fail("Empty HMM file %s? No HMM data found.\n", ahfp->fname); else if (status != eslOK) p7_Fail("Unexpected error in reading HMMs from %s\n", ahfp->fname); /* try to validate that they're the "same" */ if (ahmm->M != ghmm->M || strcmp(ahmm->name, ghmm->name) != 0) p7_Fail("<gen-hmmfile>, <ali-hmmfile> contain different set or order of models"); /* deferred one-time creation of structures that need to know the alphabet */ if (!bg) bg = p7_bg_Create(abc); if (!sq) sq = esl_sq_CreateDigital(abc); ggm = p7_profile_Create(ghmm->M, abc); agm = p7_profile_Create(ahmm->M, abc); aom = p7_oprofile_Create(ahmm->M, abc, hw->simd); p7_profile_ConfigCustom(ggm, ghmm, bg, esl_opt_GetInteger(go, "--gL"), esl_opt_GetReal(go, "--gnj"), esl_opt_GetReal(go, "--gpglocal")); p7_profile_ConfigCustom(agm, ahmm, bg, 100, esl_opt_GetReal(go, "--anj"), esl_opt_GetReal(go, "--apglocal")); p7_oprofile_Convert(agm, aom); for (idx = 1; idx <= N; idx++) { p7_ProfileEmit(rng, ghmm, ggm, bg, sq, reftr); if (esl_opt_GetBoolean(go, "--dumpseqs")) { esl_sq_FormatName(sq, "seq%d", idx); esl_sqio_Write(stdout, sq, eslSQFILE_FASTA, FALSE); } p7_bg_SetLength(bg, sq->n); p7_profile_SetLength(agm, sq->n); p7_sparsemask_Reinit(sm, agm->M, sq->n); p7_sparsemask_AddAll(sm); if (esl_opt_GetBoolean(go, "--vit")) p7_ReferenceViterbi(sq->dsq, sq->n, agm, rmx, testtr, /*opt_vsc=*/NULL); else p7_SparseViterbi (sq->dsq, sq->n, agm, sm, 
sxv, testtr, /*opt_vsc=*/NULL); p7_trace_metrics(reftr, testtr, tmetrics); p7_sparsemask_Reuse(sm); p7_sparsemx_Reuse(sxv); //p7_filtermx_Reuse(ox); p7_refmx_Reuse(rmx); esl_sq_Reuse(sq); p7_trace_Reuse(reftr); p7_trace_Reuse(testtr); } p7_oprofile_Destroy(aom); p7_profile_Destroy(ggm); p7_profile_Destroy(agm); p7_hmm_Destroy(ghmm); p7_hmm_Destroy(ahmm); } /* we leave the loop with <status> set by a p7_hmmfile_Read() on ghfp; if all is well, status=eslEOF */ if (status == eslEFORMAT) p7_Fail("Bad file format in HMM file %s:\n%s\n", ghfp->fname, ghfp->errbuf); else if (status == eslEINCOMPAT) p7_Fail("HMM in %s is not in the expected %s alphabet\n", ghfp->fname, esl_abc_DecodeType(abc->type)); else if (status != eslEOF) p7_Fail("Unexpected error in reading HMMs from %s\n", ghfp->fname); p7_trace_metrics_Dump(stdout, tmetrics); p7_hmmfile_Close(ghfp); p7_hmmfile_Close(ahfp); // p7_filtermx_Destroy(ox); p7_sparsemask_Destroy(sm); p7_sparsemx_Destroy(sxv); p7_refmx_Destroy(rmx); p7_trace_metrics_Destroy(tmetrics); p7_trace_Destroy(testtr); p7_trace_Destroy(reftr); p7_bg_Destroy(bg); esl_alphabet_Destroy(abc); esl_randomness_Destroy(rng); esl_getopts_Destroy(go); }
int main(int argc, char **argv) { ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage); char *hmmfile = esl_opt_GetArg(go, 1); int N = esl_opt_GetInteger(go, "-N"); ESL_STOPWATCH *w = esl_stopwatch_Create(); ESL_RANDOMNESS *r = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s")); ESL_ALPHABET *abc = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; P7_PROFILE *gm = NULL; P7_OPROFILE *om = NULL; P7_TRACE *tr = NULL; ESL_SQ *sq = NULL; P7_ALIDISPLAY *ad = NULL; int i,z; if (p7_hmmfile_Open(hmmfile, NULL, &hfp) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile); if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM"); p7_hmmfile_Close(hfp); bg = p7_bg_Create(abc); p7_bg_SetLength(bg, 0); gm = p7_profile_Create(hmm->M, abc); p7_ProfileConfig(hmm, bg, gm, 0, p7_UNIGLOCAL); /* that sets N,C,J to generate nothing */ om = p7_oprofile_Create(gm->M, abc); p7_oprofile_Convert(gm, om); if (esl_opt_GetBoolean(go, "-p")) tr = p7_trace_CreateWithPP(); else tr = p7_trace_Create(); sq = esl_sq_CreateDigital(abc); esl_stopwatch_Start(w); for (i = 0; i < N; i++) { p7_ProfileEmit(r, hmm, gm, bg, sq, tr); esl_sq_SetName(sq, "random"); if (! esl_opt_GetBoolean(go, "-b")) { if (esl_opt_GetBoolean(go, "-p")) for (z = 0; z < tr->N; z++) if (tr->i[z] > 0) tr->pp[z] = esl_random(r); ad = p7_alidisplay_Create(tr, 0, om, sq); p7_alidisplay_Print(stdout, ad, 40, 80, FALSE); p7_alidisplay_Destroy(ad); } p7_trace_Reuse(tr); esl_sq_Reuse(sq); } esl_stopwatch_Stop(w); esl_stopwatch_Display(stdout, w, "# CPU time: "); esl_sq_Destroy(sq); p7_trace_Destroy(tr); p7_oprofile_Destroy(om); p7_profile_Destroy(gm); p7_bg_Destroy(bg); p7_hmm_Destroy(hmm); esl_alphabet_Destroy(abc); esl_randomness_Destroy(r); esl_stopwatch_Destroy(w); esl_getopts_Destroy(go); return 0; }
/* Function: p7_alidisplay_Backconvert() * Synopsis: Convert an alidisplay to a faux trace and subsequence. * Incept: SRE, Wed Dec 10 09:49:28 2008 [Janelia] * * Purpose: Convert alignment display object <ad> to a faux subsequence * and faux subsequence trace, returning them in <ret_sq> and * <ret_tr>. * * The subsequence <*ret_sq> is digital; ascii residues in * <ad> are digitized using digital alphabet <abc>. * * The subsequence and trace are suitable for passing as * array elements to <p7_MultipleAlignment>. This is the * main purpose of backconversion. Results of a profile * search are stored in a hit list as a processed * <P7_ALIDISPLAY>, not as a <P7_TRACE> and <ESL_SQ>, to * reduce space and to reduce communication overhead in * parallelized search implementations. After reduction * to a final hit list, a master may want to construct a * multiple alignment of all the significant hits. * * Returns: <eslOK> on success. * * Throws: <eslEMEM> on allocation failures. <eslECORRUPT> on unexpected internal * data corruption. On any exception, <*ret_sq> and <*ret_tr> are * <NULL>. * * Xref: J4/29. */ int p7_alidisplay_Backconvert(const P7_ALIDISPLAY *ad, const ESL_ALPHABET *abc, ESL_SQ **ret_sq, P7_TRACE **ret_tr) { ESL_SQ *sq = NULL; /* RETURN: faux subsequence */ P7_TRACE *tr = NULL; /* RETURN: faux trace */ int subL = 0; /* subsequence length in the <ad> */ int a, i, k; /* coords for <ad>, <sq->dsq>, model */ char st; /* state type: MDI */ int status; /* Make a first pass over <ad> just to calculate subseq length */ for (a = 0; a < ad->N; a++) if (! esl_abc_CIsGap(abc, ad->aseq[a])) subL++; /* Allocations */ if ((sq = esl_sq_CreateDigital(abc)) == NULL) { status = eslEMEM; goto ERROR; } if ((status = esl_sq_GrowTo(sq, subL)) != eslOK) goto ERROR; if ((tr = (ad->ppline == NULL) ? 
p7_trace_Create() : p7_trace_CreateWithPP()) == NULL) { status = eslEMEM; goto ERROR; } if ((status = p7_trace_GrowTo(tr, subL+6)) != eslOK) goto ERROR; /* +6 is for SNB/ECT */ /* Construction of dsq, trace */ sq->dsq[0] = eslDSQ_SENTINEL; if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, p7T_S, 0, 0) : p7_trace_AppendWithPP(tr, p7T_S, 0, 0, 0.0))) != eslOK) goto ERROR; if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, p7T_N, 0, 0) : p7_trace_AppendWithPP(tr, p7T_N, 0, 0, 0.0))) != eslOK) goto ERROR; if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, p7T_B, 0, 0) : p7_trace_AppendWithPP(tr, p7T_B, 0, 0, 0.0))) != eslOK) goto ERROR; k = ad->hmmfrom; i = 1; for (a = 0; a < ad->N; a++) { if (esl_abc_CIsResidue(abc, ad->model[a])) { st = (esl_abc_CIsResidue(abc, ad->aseq[a]) ? p7T_M : p7T_D); } else st = p7T_I; if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, st, k, i) : p7_trace_AppendWithPP(tr, st, k, i, p7_alidisplay_DecodePostProb(ad->ppline[a])))) != eslOK) goto ERROR; switch (st) { case p7T_M: sq->dsq[i] = esl_abc_DigitizeSymbol(abc, ad->aseq[a]); k++; i++; break; case p7T_I: sq->dsq[i] = esl_abc_DigitizeSymbol(abc, ad->aseq[a]); i++; break; case p7T_D: k++; break; } } if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, p7T_E, 0, 0) : p7_trace_AppendWithPP(tr, p7T_E, 0, 0, 0.0))) != eslOK) goto ERROR; if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, p7T_C, 0, 0) : p7_trace_AppendWithPP(tr, p7T_C, 0, 0, 0.0))) != eslOK) goto ERROR; if ((status = ((ad->ppline == NULL) ? 
p7_trace_Append(tr, p7T_T, 0, 0) : p7_trace_AppendWithPP(tr, p7T_T, 0, 0, 0.0))) != eslOK) goto ERROR; sq->dsq[i] = eslDSQ_SENTINEL; /* some sanity checks */ if (tr->N != ad->N + 6) ESL_XEXCEPTION(eslECORRUPT, "backconverted trace ended up with unexpected size (%s/%s)", ad->sqname, ad->hmmname); if (k != ad->hmmto + 1) ESL_XEXCEPTION(eslECORRUPT, "backconverted trace didn't end at expected place on model (%s/%s)", ad->sqname, ad->hmmname); if (i != subL + 1) ESL_XEXCEPTION(eslECORRUPT, "backconverted subseq didn't end at expected length (%s/%s)", ad->sqname, ad->hmmname); /* Set up <sq> annotation as a subseq of a source sequence */ if ((status = esl_sq_FormatName(sq, "%s/%ld-%ld", ad->sqname, ad->sqfrom, ad->sqto)) != eslOK) goto ERROR; if ((status = esl_sq_FormatDesc(sq, "[subseq from] %s", ad->sqdesc[0] != '\0' ? ad->sqdesc : ad->sqname)) != eslOK) goto ERROR; if ((status = esl_sq_SetSource (sq, ad->sqname)) != eslOK) goto ERROR; if (ad->sqacc[0] != '\0') { if ((status = esl_sq_SetAccession (sq, ad->sqacc)) != eslOK) goto ERROR; } sq->n = subL; sq->start = ad->sqfrom; sq->end = ad->sqto; sq->C = 0; sq->W = subL; sq->L = ad->L; tr->M = ad->M; tr->L = ad->L; *ret_sq = sq; *ret_tr = tr; return eslOK; ERROR: if (sq != NULL) esl_sq_Destroy(sq); if (tr != NULL) p7_trace_Destroy(tr); *ret_sq = NULL; *ret_tr = NULL; return status; }
/* Example driver for stochastic traceback.
 *
 * Reads one HMM and one sequence, computes the Viterbi and Forward
 * scores, then samples <-N> stochastic traces from the Forward matrix.
 * Each sampled trace is scored, optionally dumped (-t), validated, and
 * its score printed. The Forward and Viterbi scores are printed last.
 * -p dumps the optimized profile; -m turns on matrix dumping.
 */
int
main(int argc, char **argv)
{
  ESL_GETOPTS    *go      = esl_getopts_CreateDefaultApp(options, 2, argc, argv, banner, usage);
  char           *hmmfile = esl_opt_GetArg(go, 1);
  char           *seqfile = esl_opt_GetArg(go, 2);
  ESL_ALPHABET   *abc     = NULL;
  ESL_RANDOMNESS *rng     = esl_randomness_CreateFast(0);
  P7_HMMFILE     *hfp     = NULL;
  P7_HMM         *hmm     = NULL;
  P7_BG          *bg      = NULL;
  P7_PROFILE     *gm      = NULL;
  P7_OPROFILE    *om      = NULL;
  P7_GMX         *vitmx   = NULL;      /* generic DP matrix, for Viterbi           */
  P7_OMX         *fwdmx   = NULL;      /* optimized DP matrix, for Forward         */
  P7_TRACE       *tr      = NULL;
  ESL_SQ         *sq      = NULL;
  ESL_SQFILE     *sqfp    = NULL;
  int             format  = eslSQFILE_UNKNOWN;
  int             N       = esl_opt_GetInteger(go, "-N");
  int             nsampled;
  float           vsc, fsc, tsc;
  char            errbuf[eslERRBUFSIZE];
  int             status;

  /* Read in one HMM */
  if (p7_hmmfile_Open(hmmfile, NULL, &hfp) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile);
  if (p7_hmmfile_Read(hfp, &abc, &hmm)     != eslOK) p7_Fail("Failed to read HMM");

  /* Read in one sequence */
  sq     = esl_sq_CreateDigital(abc);
  status = esl_sqfile_Open(seqfile, format, NULL, &sqfp);
  switch (status)
    {
    case eslOK:                                                          break;
    case eslENOTFOUND: p7_Fail("No such file.");                         break;
    case eslEFORMAT:   p7_Fail("Format unrecognized.");                  break;
    case eslEINVAL:    p7_Fail("Can't autodetect stdin or .gz.");        break;
    default:           p7_Fail("Open failed, code %d.", status);         break;
    }
  if (esl_sqio_Read(sqfp, sq) != eslOK) p7_Fail("Failed to read sequence");

  /* create default null model, then create and optimize profile */
  bg = p7_bg_Create(abc);
  p7_bg_SetLength(bg, sq->n);

  gm = p7_profile_Create(hmm->M, abc);
  p7_ProfileConfig(hmm, bg, gm, sq->n, p7_LOCAL);

  om = p7_oprofile_Create(gm->M, abc);
  p7_oprofile_Convert(gm, om);

  if (esl_opt_GetBoolean(go, "-p")) p7_oprofile_Dump(stdout, om);

  fwdmx = p7_omx_Create(gm->M, sq->n, sq->n);
  vitmx = p7_gmx_Create(gm->M, sq->n);
  tr    = p7_trace_Create();

  if (esl_opt_GetBoolean(go, "-m") == TRUE) p7_omx_SetDumpMode(stdout, fwdmx, TRUE);

  p7_GViterbi(sq->dsq, sq->n, gm, vitmx, &vsc);
  p7_Forward (sq->dsq, sq->n, om, fwdmx, &fsc);

  /* Sample, score and check traces one at a time, reusing <tr>. */
  nsampled = 0;
  while (nsampled < N)
    {
      p7_StochasticTrace(rng, sq->dsq, sq->n, om, fwdmx, tr);
      p7_trace_Score(tr, sq->dsq, gm, &tsc);

      if (esl_opt_GetBoolean(go, "-t") == TRUE)
        p7_trace_Dump(stdout, tr, gm, sq->dsq);

      if (p7_trace_Validate(tr, abc, sq->dsq, errbuf) != eslOK)
        p7_Die("trace %d fails validation:\n%s\n", nsampled, errbuf);

      printf("Sampled trace: %.4f nats\n", tsc);
      p7_trace_Reuse(tr);
      nsampled++;
    }
  printf("Forward score: %.4f nats\n", fsc);
  printf("Viterbi score: %.4f nats\n", vsc);

  /* cleanup */
  esl_sq_Destroy(sq);
  esl_sqfile_Close(sqfp);
  p7_trace_Destroy(tr);
  p7_omx_Destroy(fwdmx);
  p7_gmx_Destroy(vitmx);
  p7_oprofile_Destroy(om);
  p7_profile_Destroy(gm);
  p7_bg_Destroy(bg);
  p7_hmm_Destroy(hmm);
  p7_hmmfile_Close(hfp);
  esl_randomness_Destroy(rng);
  esl_alphabet_Destroy(abc);
  esl_getopts_Destroy(go);
  return 0;
}
int main(int argc, char **argv) { ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage); char *hmmfile = esl_opt_GetArg(go, 1); ESL_STOPWATCH *w = esl_stopwatch_Create(); ESL_RANDOMNESS *r = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s")); ESL_ALPHABET *abc = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; P7_PROFILE *gm = NULL; P7_OPROFILE *om = NULL; P7_GMX *gx = NULL; P7_OMX *fwd = NULL; P7_TRACE *tr = NULL; int L = esl_opt_GetInteger(go, "-L"); int N = esl_opt_GetInteger(go, "-N"); ESL_DSQ *dsq = malloc(sizeof(ESL_DSQ) * (L+2)); int i; float sc, fsc, vsc; float bestsc = -eslINFINITY; if (p7_hmmfile_Open(hmmfile, NULL, &hfp) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile); if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM"); bg = p7_bg_Create(abc); p7_bg_SetLength(bg, L); gm = p7_profile_Create(hmm->M, abc); p7_ProfileConfig(hmm, bg, gm, L, p7_UNILOCAL); om = p7_oprofile_Create(gm->M, abc); p7_oprofile_Convert(gm, om); fwd = p7_omx_Create(gm->M, L, L); gx = p7_gmx_Create(gm->M, L); tr = p7_trace_Create(); esl_rsq_xfIID(r, bg->f, abc->K, L, dsq); p7_GViterbi(dsq, L, gm, gx, &vsc); p7_Forward (dsq, L, om, fwd, &fsc); esl_stopwatch_Start(w); for (i = 0; i < N; i++) { p7_StochasticTrace(r, dsq, L, om, fwd, tr); p7_trace_Score(tr, dsq, gm, &sc); bestsc = ESL_MAX(bestsc, sc); p7_trace_Reuse(tr); } esl_stopwatch_Stop(w); esl_stopwatch_Display(stdout, w, "# CPU time: "); printf("forward sc = %.4f nats\n", fsc); printf("viterbi sc = %.4f nats\n", vsc); printf("max trace sc = %.4f nats\n", bestsc); free(dsq); p7_trace_Destroy(tr); p7_gmx_Destroy(gx); p7_omx_Destroy(fwd); p7_oprofile_Destroy(om); p7_profile_Destroy(gm); p7_bg_Destroy(bg); p7_hmm_Destroy(hmm); p7_hmmfile_Close(hfp); esl_alphabet_Destroy(abc); esl_stopwatch_Destroy(w); esl_randomness_Destroy(r); esl_getopts_Destroy(go); return 0; }
/* process_workunit()
 *
 * This is the routine that actually does the work.
 *
 * A work unit consists of one HMM, <hmm>.
 * The result is the <scores> array, which contains an array of N scores
 * (in bits: null-model-corrected and divided by log 2); caller provides
 * this memory. With -a, <alilens> is filled with one alignment length
 * per sequence as well; it is not touched otherwise.
 * How those scores are generated is controlled by the application
 * configuration in <cfg> and the options in <go>.
 *
 * Returns <eslOK> on success. If an allocation made with ESL_ALLOC fails,
 * returns its status, and "allocation failure" is written to <errbuf>
 * when that status is <eslEMEM>.
 */
static int
process_workunit(ESL_GETOPTS *go, struct cfg_s *cfg, char *errbuf, P7_HMM *hmm, double *scores, int *alilens)
{
  int           L   = esl_opt_GetInteger(go, "-L");
  P7_PROFILE   *gm  = NULL;
  P7_OPROFILE  *om  = NULL;
  P7_REFMX     *rmx = NULL;
  P7_CHECKPTMX *cx  = NULL;
  P7_FILTERMX  *fx  = NULL;
  P7_TRACE     *tr  = NULL;
  ESL_DSQ      *dsq = NULL;
  int           i;
  int           scounts[p7T_NSTATETYPES];   /* state usage counts from a trace */
  float         sc;
  float         nullsc;
  int           status;
  P7_HARDWARE  *hw;

  /* NOTE(review): <hw> is never released in this function; if the
   * hardware module provides a destructor it should be called at exit.
   */
  if ((hw = p7_hardware_Create ()) == NULL) p7_Fail("Couldn't get HW information data structure");

  /* Optionally set a custom background, determined by model composition;
   * an experimental hack.
   */
  if (esl_opt_GetBoolean(go, "--bgcomp"))
    {
      float *p = NULL;
      float  KL;

      p7_hmm_CompositionKLDist(hmm, cfg->bg, &KL, &p);
      esl_vec_FCopy(p, cfg->abc->K, cfg->bg->f);
      /* NOTE(review): <p> is not freed here; presumably the callee
       * allocates it for the caller - confirm, and free it if so.
       */
    }

  /* Create and configure our generic profile, as requested */
  gm = p7_profile_Create(hmm->M, cfg->abc);
  if (esl_opt_GetBoolean(go, "--multi"))
    {
      if      (esl_opt_GetBoolean(go, "--dual"))   { p7_profile_Config      (gm, hmm, cfg->bg);    }
      else if (esl_opt_GetBoolean(go, "--local"))  { p7_profile_ConfigLocal (gm, hmm, cfg->bg, L); }
      else if (esl_opt_GetBoolean(go, "--glocal")) { p7_profile_ConfigGlocal(gm, hmm, cfg->bg, L); }
    }
  else if (esl_opt_GetBoolean(go, "--uni"))
    {
      if      (esl_opt_GetBoolean(go, "--dual"))   { p7_profile_ConfigCustom   (gm, hmm, cfg->bg, L, 0.0, 0.5); }
      else if (esl_opt_GetBoolean(go, "--local"))  { p7_profile_ConfigUnilocal (gm, hmm, cfg->bg, L);           }
      else if (esl_opt_GetBoolean(go, "--glocal")) { p7_profile_ConfigUniglocal(gm, hmm, cfg->bg, L);           }
    }
  p7_profile_SetLength(gm, L);
  p7_bg_SetLength(cfg->bg, L);

  if (esl_opt_GetBoolean(go, "--x-no-lengthmodel")) elide_length_model(gm, cfg->bg);

  /* Allocate DP matrix for <gm>. */
  rmx = p7_refmx_Create(gm->M, L);

  /* Create and configure the vectorized profile, if needed;
   * and allocate its DP matrix
   */
  if (esl_opt_GetBoolean(go, "--vector"))
    {
      /* The SIMD flavor has to come from <hw>: <om> is still NULL here,
       * so it cannot supply its own <simd> field to its constructor.
       */
      om = p7_oprofile_Create(gm->M, cfg->abc, hw->simd);
      p7_oprofile_Convert(gm, om);
      cx = p7_checkptmx_Create(gm->M, L, ESL_MBYTES(32), om->simd);
      fx = p7_filtermx_Create(gm->M, om->simd);
    }

  /* Remaining allocation */
  ESL_ALLOC(dsq, sizeof(ESL_DSQ) * (L+2));
  tr = p7_trace_Create();

  /* Collect scores from N random sequences of length L */
  for (i = 0; i < cfg->N; i++)
    {
      esl_rsq_xfIID(cfg->r, cfg->bg->f, cfg->abc->K, L, dsq);
      sc = eslINFINITY;

      /* Vectorized implementations of Viterbi, MSV may overflow.
       * In this case, they'll leave sc=eslINFINITY.
       * Then we fail over to the nonvector "generic" implementation.
       * That's why this next block isn't an if/else.
       */
      if (esl_opt_GetBoolean(go, "--vector"))
        {
          if      (esl_opt_GetBoolean(go, "--vit")) p7_ViterbiFilter(dsq, L, om, fx, &sc);
          else if (esl_opt_GetBoolean(go, "--fwd")) p7_ForwardFilter(dsq, L, om, cx, &sc);
          else if (esl_opt_GetBoolean(go, "--msv")) p7_MSVFilter    (dsq, L, om, fx, &sc);
        }

      /* If we tried a vector calculation above but it overflowed,
       * or if we're to do --generic DP calculations, sc==eslINFINITY now;
       * hence the if condition here:
       */
      if (sc == eslINFINITY)
        {
          if      (esl_opt_GetBoolean(go, "--fwd")) p7_ReferenceForward(dsq, L, gm, rmx,     &sc);  /* any mode: dual,local,glocal; gm's config takes care of this */
          else if (esl_opt_GetBoolean(go, "--vit")) p7_ReferenceViterbi(dsq, L, gm, rmx, tr, &sc);  /* local-only mode. cmdline opts processing has already assured that --local set */
          else if (esl_opt_GetBoolean(go, "--msv")) p7_Die("We used to be able to do a generic MSV algorithm - but no longer");
        }

      /* Optional: get Viterbi alignment length too. */
      if (esl_opt_GetBoolean(go, "-a"))   /* -a only works with Viterbi; getopts has checked this already; <tr> must be valid */
        {
          p7_trace_GetStateUseCounts(tr, scounts);

          /* There are various ways we could count "alignment length".
           * Here it is the number of M, D and I states used, summed over
           * the local and glocal versions of each state.
           * (SRE was temporarily testing scounts[p7T_D] + scounts[p7T_I]
           * as an alternative definition.)
           */
          alilens[i] = scounts[p7T_ML] + scounts[p7T_DL] + scounts[p7T_IL]
                     + scounts[p7T_MG] + scounts[p7T_DG] + scounts[p7T_IG];
          p7_trace_Reuse(tr);
        }

      p7_bg_NullOne(cfg->bg, dsq, L, &nullsc);
      scores[i] = (sc - nullsc) / eslCONST_LOG2;

      if (cx) p7_checkptmx_Reuse(cx);
      if (fx) p7_filtermx_Reuse(fx);
      p7_refmx_Reuse(rmx);
    }
  status = eslOK;
  /* deliberate flowthru */
 ERROR:
  if (dsq != NULL) free(dsq);
  p7_checkptmx_Destroy(cx);
  p7_filtermx_Destroy(fx);
  p7_oprofile_Destroy(om);
  p7_profile_Destroy(gm);
  p7_refmx_Destroy(rmx);
  p7_trace_Destroy(tr);
  if (status == eslEMEM) sprintf(errbuf, "allocation failure");
  return status;
}
/* "generation" test
 * Compare a randomly sampled profile to sequences sampled
 * from that profile.
 *
 * This test is not very stringent, because we don't know the "true"
 * envelopes. Rather, this is more of a test that nothing obviously
 * bad happens, like a crash, or obviously incorrect data.
 *
 * We test:
 *    1. Seq coordinates of each envelope are coherent:
 *       1 <= oa <= ia <= i0 <= ib <= ob <= L
 *
 *    2. Envelopes do not overlap (assuming default threshold of
 *       0.5 when defining them):
 *         ia(d) > ib(d-1)  for d = 2..D
 *       (Outer envelopes, in contrast, can overlap.)
 *
 *    3. envsc(d) <= asc_sc <= fwdsc.
 *
 *    4. If D=1 (single domain) in both the generated trace
 *       and the inferred envelopes, and the domain coords in
 *       the trace are encompassed by the outer envelope,
 *       then envsc(d) >= generated trace score.
 *
 * Args:
 *    rng - random number generator; passed to model sampling,
 *          sequence emission, and anchor determination
 *    M   - length of the sampled model; emitted sequences are
 *          also capped at 6*M residues
 *    abc - alphabet for the sampled model and sequences
 *    N   - number of sequences to emit and analyze
 *
 * Returns nothing. Any failed step or failed test calls esl_fatal()
 * with a fixed message.
 */
static void
utest_generation(ESL_RANDOMNESS *rng, int M, const ESL_ALPHABET *abc, int N)
{
  char           msg[] = "reference_envelopes:: generation unit test failed";
  ESL_SQ        *sq    = esl_sq_CreateDigital(abc);
  P7_BG         *bg    = p7_bg_Create(abc);
  P7_HMM        *hmm   = NULL;
  P7_PROFILE    *gm    = p7_profile_Create(M, abc);
  P7_TRACE      *gtr   = p7_trace_Create();        // generated trace
  P7_TRACE      *vtr   = p7_trace_Create();        // Viterbi trace
  P7_REFMX      *rxf   = p7_refmx_Create(M, 20);   // Fwd, Vit ~~> ASC Decode UP
  P7_REFMX      *rxd   = p7_refmx_Create(M, 20);   // Bck, Decode ~~> ASC Decode DOWN
  P7_REFMX      *afu   = p7_refmx_Create(M, 20);   // ASC Fwd UP
  P7_REFMX      *afd   = p7_refmx_Create(M, 20);   // ASC Fwd DOWN
  P7_REFMX      *apu   = rxf;                      // for 'clarity' we use two names for this mx
  P7_REFMX      *apd   = rxd;                      // ... and this one too.
  float         *wrk   = NULL;
  P7_ANCHORS    *anch  = p7_anchors_Create();
  P7_ANCHORHASH *ah    = p7_anchorhash_Create();
  P7_ENVELOPES  *env   = p7_envelopes_Create();
  float          tol   = 0.001;
  float          gsc, fsc, asc;
  int            idx;
  int            d;

  /* A NULL result from any of the constructors above is fatal:
   * everything below dereferences these objects unconditionally.
   */
  if (sq   == NULL || bg  == NULL || gm  == NULL ||
      gtr  == NULL || vtr == NULL ||
      rxf  == NULL || rxd == NULL || afu == NULL || afd == NULL ||
      anch == NULL || ah  == NULL || env == NULL)
    esl_fatal(msg);

  if ( p7_modelsample(rng, M, abc, &hmm) != eslOK) esl_fatal(msg);
  if ( p7_profile_Config(gm, hmm, bg)    != eslOK) esl_fatal(msg);

  for (idx = 0; idx < N; idx++)
    {
      /* Emit sequence from model, using an arbitrary length model of <M>;
       * restrict the emitted sequence length to 6M, arbitrarily, to
       * keep it down to something reasonable.
       */
      if ( p7_profile_SetLength(gm, M) != eslOK) esl_fatal(msg);
      do {
        esl_sq_Reuse(sq);
        if (p7_ProfileEmit(rng, hmm, gm, bg, sq, gtr) != eslOK) esl_fatal(msg);
      } while (sq->n > M * 6);
      if (p7_trace_Index   (gtr)                    != eslOK) esl_fatal(msg);
      if (p7_trace_Score   (gtr, sq->dsq, gm, &gsc) != eslOK) esl_fatal(msg);

      /* Reset the length model to the actual length sq->n, then
       * put it through the domain postprocessing analysis pipeline
       */
      if ( p7_profile_SetLength(gm, sq->n) != eslOK) esl_fatal(msg);

      /* First pass analysis */
      if ( p7_ReferenceViterbi (sq->dsq, sq->n, gm, rxf, vtr, NULL) != eslOK) esl_fatal(msg);
      if ( p7_ReferenceForward (sq->dsq, sq->n, gm, rxf, &fsc)      != eslOK) esl_fatal(msg);
      if ( p7_ReferenceBackward(sq->dsq, sq->n, gm, rxd, NULL)      != eslOK) esl_fatal(msg);
      if ( p7_ReferenceDecoding(sq->dsq, sq->n, gm, rxf, rxd, rxd)  != eslOK) esl_fatal(msg);

      /* Anchor determination (MPAS algorithm) */
      if ( p7_reference_Anchors(rng, sq->dsq, sq->n, gm, rxf, rxd, vtr, &wrk, ah,
                                afu, afd, anch, &asc, NULL, NULL) != eslOK) esl_fatal(msg);

      /* Reuse rxf,rxd as apu, apd; finish ASC analysis with Backward, Decoding */
      p7_refmx_Reuse(apu);
      p7_refmx_Reuse(apd);
      if ( p7_ReferenceASCBackward(sq->dsq, sq->n, gm, anch->a, anch->D,
                                   apu, apd, NULL) != eslOK) esl_fatal(msg);
      if ( p7_ReferenceASCDecoding(sq->dsq, sq->n, gm, anch->a, anch->D,
                                   afu, afd, apu, apd, apu, apd) != eslOK) esl_fatal(msg);

      /* Envelope calculation */
      if ( p7_reference_Envelopes(sq->dsq, sq->n, gm, anch->a, anch->D,
                                  apu, apd, afu, afd, env) != eslOK) esl_fatal(msg);

      /* Test 1. Coords of each domain are coherent */
      if (anch->D != env->D) esl_fatal(msg);
      for (d = 1; d <= anch->D; d++)
        if (! (1               <= env->arr[d].oa &&
               env->arr[d].oa  <= env->arr[d].ia &&
               env->arr[d].ia  <= env->arr[d].i0 &&
               env->arr[d].i0  <= env->arr[d].ib &&
               env->arr[d].ib  <= env->arr[d].ob &&
               env->arr[d].ob  <= sq->n))
          esl_fatal(msg);

      /* Test 2. Envelopes do not overlap.
       * NOTE(review): the loop starts at d=1, so it reads env->arr[0],
       * whereas the header describes the check for d = 2..D. Presumably
       * arr[0] is a sentinel whose <ib> precedes every real envelope --
       * confirm against the P7_ENVELOPES implementation.
       */
      for (d = 1; d <= anch->D; d++)
        if (! (env->arr[d].ia > env->arr[d-1].ib)) esl_fatal(msg);

      /* Test 3. envsc(d) <= asc_sc <= fwdsc (each within <tol>) */
      for (d = 1; d <= anch->D; d++)
        if (! (env->arr[d].env_sc <= asc+tol &&
               asc                <= fsc+tol))
          esl_fatal(msg);

      /* Test 4, only on D=1 case with generated trace's domain
       * encompassed by the outer envelope
       */
      if (gtr->ndom == 1 && anch->D == 1      &&
          gtr->sqfrom[0] >= env->arr[1].oa    &&   // in <gtr>, domains are 0..D-1; in <env>, 1..D
          gtr->sqto[0]   <= env->arr[1].ob)
        if (! ( env->arr[1].env_sc >= gsc)) esl_fatal(msg);

      /* Recycle everything for the next sampled sequence */
      p7_envelopes_Reuse(env);
      p7_anchors_Reuse(anch);
      p7_anchorhash_Reuse(ah);
      p7_refmx_Reuse(rxf);
      p7_refmx_Reuse(rxd);
      p7_refmx_Reuse(afu);
      p7_refmx_Reuse(afd);
      p7_trace_Reuse(gtr);
      p7_trace_Reuse(vtr);
      esl_sq_Reuse(sq);
    }

  free(wrk);   /* still NULL if the loop never ran; free(NULL) is a no-op */
  p7_envelopes_Destroy(env);
  p7_anchors_Destroy(anch);
  p7_anchorhash_Destroy(ah);
  p7_refmx_Destroy(afu);
  p7_refmx_Destroy(afd);
  p7_refmx_Destroy(rxf);   /* apu is an alias of rxf: destroyed once, here */
  p7_refmx_Destroy(rxd);   /* apd is an alias of rxd: destroyed once, here */
  p7_trace_Destroy(vtr);
  p7_trace_Destroy(gtr);
  p7_profile_Destroy(gm);
  p7_hmm_Destroy(hmm);
  p7_bg_Destroy(bg);
  esl_sq_Destroy(sq);
}
int main(int argc, char **argv) { ESL_GETOPTS *go = p7_CreateDefaultApp(options, 2, argc, argv, banner, usage); ESL_RANDOMNESS *rng = esl_randomness_Create(esl_opt_GetInteger(go, "-s")); char *hmmfile = esl_opt_GetArg(go, 1); char *seqfile = esl_opt_GetArg(go, 2); ESL_ALPHABET *abc = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; P7_PROFILE *gm = NULL; ESL_SQ *sq = NULL; ESL_SQFILE *sqfp = NULL; int format = eslSQFILE_UNKNOWN; P7_ANCHORS *anch = p7_anchors_Create(); P7_ANCHORHASH *ah = p7_anchorhash_Create(); P7_ENVELOPES *env = p7_envelopes_Create(); P7_REFMX *rxf = NULL; P7_REFMX *rxd = NULL; P7_REFMX *afu = NULL; P7_REFMX *afd = NULL; P7_REFMX *apu = NULL; P7_REFMX *apd = NULL; P7_TRACE *tr = NULL; float *wrk = NULL; P7_MPAS_PARAMS prm; P7_MPAS_STATS stats; float fsc, vsc, asc, asc_b; int status; /* Read in one HMM */ if (p7_hmmfile_OpenE(hmmfile, NULL, &hfp, NULL) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile); if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM"); p7_hmmfile_Close(hfp); /* Open sequence file */ sq = esl_sq_CreateDigital(abc); status = esl_sqfile_Open(seqfile, format, NULL, &sqfp); if (status == eslENOTFOUND) p7_Fail("No such file."); else if (status == eslEFORMAT) p7_Fail("Format unrecognized."); else if (status == eslEINVAL) p7_Fail("Can't autodetect stdin or .gz."); else if (status != eslOK) p7_Fail("Open failed, code %d.", status); /* Read one sequence */ status = esl_sqio_Read(sqfp, sq); if (status == eslEFORMAT) p7_Fail("Parse failed (sequence file %s)\n%s\n", sqfp->filename, sqfp->get_error(sqfp)); else if (status != eslOK) p7_Fail("Unexpected error %d reading sequence file %s", status, sqfp->filename); esl_sqfile_Close(sqfp); /* Configure a profile from the HMM */ bg = p7_bg_Create(abc); gm = p7_profile_Create(hmm->M, abc); p7_profile_Config(gm, hmm, bg); /* Set the profile and null model's target length models */ p7_bg_SetLength (bg, sq->n); p7_profile_SetLength(gm, sq->n); /* 
Allocate DP matrices and tracebacks */ rxf = p7_refmx_Create(gm->M, sq->n); rxd = p7_refmx_Create(gm->M, sq->n); tr = p7_trace_Create(); afu = p7_refmx_Create(gm->M, sq->n); afd = p7_refmx_Create(gm->M, sq->n); /* First pass analysis */ p7_ReferenceViterbi (sq->dsq, sq->n, gm, rxf, tr, &vsc); p7_ReferenceForward (sq->dsq, sq->n, gm, rxf, &fsc); p7_ReferenceBackward(sq->dsq, sq->n, gm, rxd, NULL); p7_ReferenceDecoding(sq->dsq, sq->n, gm, rxf, rxd, rxd); /* Customize MPAS parameters if you want; these are the defaults. */ prm.max_iterations = 1000; prm.loss_threshold = 0.001; prm.nmax_sampling = FALSE; prm.be_verbose = FALSE; /* MPAS algorithm gets us an anchor set */ p7_reference_Anchors(rng, sq->dsq, sq->n, gm, rxf, rxd, tr, &wrk, ah, afu, afd, anch, &asc, &prm, &stats); //printf("# ASC Forward UP:\n"); p7_refmx_Dump(stdout, afu); //printf("# ASC Forward DOWN:\n"); p7_refmx_Dump(stdout, afd); /* We no longer need rxf and rxd. * Use their space for apu/apd pair, which will briefly * hold ASC Backward matrices, then get used for ASC Decoding. */ apu = rxf; p7_refmx_Reuse(apu); apd = rxd; p7_refmx_Reuse(apd); p7_ReferenceASCBackward(sq->dsq, sq->n, gm, anch->a, anch->D, apu, apd, &asc_b); //printf("# Backward score (raw, nats): %.2f\n", asc_b); //printf("# ASC Backward UP:\n"); p7_refmx_Dump(stdout, apu); //printf("# ASC Backward DOWN:\n"); p7_refmx_Dump(stdout, apd); /* ASC Decoding takes afu/afd and abu/abd as input; * overwrites abu/abd with decoding matrices */ p7_ReferenceASCDecoding(sq->dsq, sq->n, gm, anch->a, anch->D, afu, afd, apu, apd, apu, apd); //printf("# ASC Decoding UP matrix:\n"); p7_refmx_Dump(stdout, apu); //printf("# ASC Decoding DOWN:\n"); p7_refmx_Dump(stdout, apu); /* Envelope calculation needs to get four matrices: * ASC Decoding pair, apu/apd, and it will leave these constant; * ASC Forward pair, afu/afd, and it will overwrite these. 
*/ p7_reference_Envelopes(sq->dsq, sq->n, gm, anch->a, anch->D, apu, apd, afu, afd, env); p7_envelopes_Dump(stdout, env); p7_envelopes_Destroy(env); p7_anchorhash_Destroy(ah); p7_anchors_Destroy(anch); if (wrk) free(wrk); p7_trace_Destroy(tr); p7_refmx_Destroy(afd); p7_refmx_Destroy(afu); p7_refmx_Destroy(rxd); p7_refmx_Destroy(rxf); esl_sq_Destroy(sq); p7_profile_Destroy(gm); p7_bg_Destroy(bg); p7_hmm_Destroy(hmm); esl_alphabet_Destroy(abc); esl_randomness_Destroy(rng); esl_getopts_Destroy(go); return 0; }