/* Function: p7_SingleBuilder() * Synopsis: Build a new HMM from a single sequence. * * Purpose: Take the sequence <sq> and a build configuration <bld>, and * build a new HMM. * * The single sequence scoring system in the <bld> * configuration must have been previously initialized by * <p7_builder_SetScoreSystem()>. * * Args: bld - build configuration * sq - query sequence * bg - null model (needed to paramaterize insert emission probs) * opt_hmm - optRETURN: new HMM * opt_gm - optRETURN: profile corresponding to <hmm> * opt_om - optRETURN: optimized profile corresponding to <gm> * * Returns: <eslOK> on success. * * Throws: <eslEMEM> on allocation error. * <eslEINVAL> if <bld> isn't properly configured somehow. */ int p7_SingleBuilder(P7_BUILDER *bld, ESL_SQ *sq, P7_BG *bg, P7_HMM **opt_hmm, P7_TRACE **opt_tr, P7_PROFILE **opt_gm, P7_OPROFILE **opt_om) { P7_HMM *hmm = NULL; P7_TRACE *tr = NULL; int k; int status; bld->errbuf[0] = '\0'; if (! bld->Q) ESL_XEXCEPTION(eslEINVAL, "score system not initialized"); if ((status = p7_Seqmodel(bld->abc, sq->dsq, sq->n, sq->name, bld->Q, bg->f, bld->popen, bld->pextend, &hmm)) != eslOK) goto ERROR; if ((status = p7_hmm_SetComposition(hmm)) != eslOK) goto ERROR; if ((status = p7_hmm_SetConsensus(hmm, sq)) != eslOK) goto ERROR; if ((status = calibrate(bld, hmm, bg, opt_gm, opt_om)) != eslOK) goto ERROR; if ( bld->abc->type == eslDNA || bld->abc->type == eslRNA ) { if (bld->w_len > 0) hmm->max_length = bld->w_len; else if (bld->w_beta == 0.0) hmm->max_length = hmm->M *4; else if ( (status = p7_Builder_MaxLength(hmm, bld->w_beta)) != eslOK) goto ERROR; } /* build a faux trace: relative to core model (B->M_1..M_L->E) */ if (opt_tr != NULL) { if ((tr = p7_trace_Create()) == NULL) goto ERROR; if ((status = p7_trace_Append(tr, p7T_B, 0, 0)) != eslOK) goto ERROR; for (k = 1; k <= sq->n; k++) if ((status = p7_trace_Append(tr, p7T_M, k, k)) != eslOK) goto ERROR; if ((status = p7_trace_Append(tr, p7T_E, 0, 0)) != eslOK) goto ERROR; tr->M = sq->n; tr->L = sq->n; } /* note that <opt_gm> and <opt_om> were already set by calibrate() call above. */ if (opt_hmm != NULL) *opt_hmm = hmm; else p7_hmm_Destroy(hmm); if (opt_tr != NULL) *opt_tr = tr; return eslOK; ERROR: p7_hmm_Destroy(hmm); if (tr != NULL) p7_trace_Destroy(tr); if (opt_gm != NULL) p7_profile_Destroy(*opt_gm); if (opt_om != NULL) p7_oprofile_Destroy(*opt_om); return status; }
/* Function: p7_StochasticTrace() * Synopsis: Sample a traceback from a Forward matrix * Incept: SRE, Fri Aug 8 17:40:18 2008 [UA217, IAD-SFO] * * Purpose: Perform a stochastic traceback from Forward matrix <ox>, * using random number generator <r>, in order to sample an * alignment of model <om> to digital sequence <dsq> of * length <L>. * * The sampled traceback is returned in <tr>, which the * caller provides with at least an initial allocation; * the <tr> allocation will be grown as needed here. * * Args: r - source of random numbers * dsq - digital sequence being aligned, 1..L * L - length of dsq * om - profile * ox - Forward matrix to trace, LxM * tr - storage for the recovered traceback * * Returns: <eslOK> on success * * Throws: <eslEMEM> on allocation error. * <eslEINVAL> on several types of problems, including: * the trace isn't empty (wasn't Reuse()'d); */ int p7_StochasticTrace(ESL_RANDOMNESS *rng, const ESL_DSQ *dsq, int L, const P7_OPROFILE *om, const P7_OMX *ox, P7_TRACE *tr) { int i; /* position in sequence 1..L */ int k; /* position in model 1..M */ int s0, s1; /* choice of a state */ int status; if (tr->N != 0) ESL_EXCEPTION(eslEINVAL, "trace not empty; needs to be Reuse()'d?"); i = L; k = 0; if ((status = p7_trace_Append(tr, p7T_T, k, i)) != eslOK) return status; if ((status = p7_trace_Append(tr, p7T_C, k, i)) != eslOK) return status; s0 = tr->st[tr->N-1]; while (s0 != p7T_S) { switch (s0) { case p7T_M: s1 = select_m(rng, om, ox, i, k); k--; i--; break; case p7T_D: s1 = select_d(rng, om, ox, i, k); k--; break; case p7T_I: s1 = select_i(rng, om, ox, i, k); i--; break; case p7T_N: s1 = select_n(i); break; case p7T_C: s1 = select_c(rng, om, ox, i); break; case p7T_J: s1 = select_j(rng, om, ox, i); break; case p7T_E: s1 = select_e(rng, om, ox, i, &k); break; case p7T_B: s1 = select_b(rng, om, ox, i); break; default: ESL_EXCEPTION(eslEINVAL, "bogus state in traceback"); } if (s1 == -1) ESL_EXCEPTION(eslEINVAL, "Stochastic traceback choice failed"); if ((status = p7_trace_Append(tr, s1, k, i)) != eslOK) return status; if ( (s1 == p7T_N || s1 == p7T_J || s1 == p7T_C) && s1 == s0) i--; s0 = s1; } /* end traceback, at S state */ tr->M = om->M; tr->L = L; return p7_trace_Reverse(tr); }
/* Function: hmmpgmd2msa() * Synopsis: Convert an HMMPGMD-derived data stream to an MSA, based * on the corresponding hmm * * Purpose: Given a data stream from HMMPGMD of the form shown * here, produce an MSA: * HMMD_SEARCH_STATS * P7_HITS array of size (nhits) from above? * then repeats of P7_DOMAIN and P7_ALIDISPLAY data * for the hits, where each hit with d domains * produces * d P7_DOMAINs * then * d P7_ALIDISPLAYs * ... optionally adding a sequence with length matching * that of the hmm, which will be included in the alignment. * * A further extension has been the ability to include or exclude * sequences form the list of hits. * * This function's expected use is as a helper function for * the hmmer website, which gets the above data stream from * hmmpgmd. * * Args : data: a pointer to binary data in the format given above * hmm: the HMM against which the alidisplay traces and * additional sequences/traces are threaded to reach * the returned msa. * qsq : optional sequence to be included in the output msa; * must have the same number of residues as the hmm * has states, as each residue i will be aligned to * state i. * incl: optional array of sequence names, in the case of * hmmpgmd a list of ints, which are are excluded due * to the sequence threshold, but have been selected * to be included in the alignment. This ties in * with the way jackhmmer is implemented on the * HMMER website. * incl_size: required size of the incl array. zero if incl is null. * excl: optional array of sequence names, in the case of * hmmpgmd a list of ints, which are are included as they * score above threshold, but have been selected * to be excluded from the alignment. * excl_size: required size of the excl array. zero if excl is null. * * * Returns: Pointer to completed MSA object. NULL on error * */ int hmmpgmd2msa(void *data, P7_HMM *hmm, ESL_SQ *qsq, int *incl, int incl_size, int *excl, int excl_size, ESL_MSA **ret_msa) { int i, j; int c; int status; int set_included; /* trace of the query sequence with N residues onto model with N match states */ P7_TRACE *qtr = NULL; int extra_sqcnt = 0; /* vars used to read from the binary data */ HMMD_SEARCH_STATS *stats = NULL; /* pointer to a single stats object, at the beginning of data */ P7_HIT *hits = NULL; /* an array of hits, at the appropriate offset in data */ /* vars used in msa construction */ P7_TOPHITS th; P7_ALIDISPLAY *ad, *ad2; ESL_MSA *msa = NULL; P7_DOMAIN *dom = NULL; char *p = (char*)data; /*pointer used to walk along data, must be char* to allow pointer arithmetic */ th.N = 0; th.unsrt = NULL; th.hit = NULL; /* optionally build a faux trace for the query sequence: relative to core model (B->M_1..M_L->E) */ if (qsq != NULL) { if (qsq->n != hmm->M) { status = eslFAIL; goto ERROR; } if ((qtr = p7_trace_Create()) == NULL) {status = eslFAIL; goto ERROR; } if ((status = p7_trace_Append(qtr, p7T_B, 0, 0)) != eslOK) goto ERROR; for (i = 1; i <= qsq->n; i++) if ((status = p7_trace_Append(qtr, p7T_M, i, i)) != eslOK) goto ERROR; if ((status = p7_trace_Append(qtr, p7T_E, 0, 0)) != eslOK) goto ERROR; qtr->M = qsq->n; qtr->L = qsq->n; extra_sqcnt = 1; } /* get search stats + hit info */ stats = (HMMD_SEARCH_STATS*)p; /* sanity check */ if ( ( stats->Z_setby != p7_ZSETBY_NTARGETS && stats->Z_setby != p7_ZSETBY_OPTION && stats->Z_setby != p7_ZSETBY_FILEINFO ) || ( stats->domZ_setby != p7_ZSETBY_NTARGETS && stats->domZ_setby != p7_ZSETBY_OPTION && stats->domZ_setby != p7_ZSETBY_FILEINFO ) || stats->nhits > 10000000 || stats->elapsed > 1000000 ) { status = eslFAIL; goto ERROR; } /* ok, it looks legitimate */ p += sizeof(HMMD_SEARCH_STATS); hits = (P7_HIT*)p; p += sizeof(P7_HIT) * stats->nhits; /* create a tophits object, to be passed to p7_tophits_Alignment() */ ESL_ALLOC( th.unsrt, sizeof(P7_HIT) * stats->nhits); memcpy( th.unsrt, hits, sizeof(P7_HIT) * stats->nhits); ESL_ALLOC( th.hit, sizeof(P7_HIT*) * stats->nhits); for (i=0; i<stats->nhits; i++) { th.hit[i] = &(th.unsrt[i]); if ( th.hit[i]->ndom > 10000 || th.hit[i]->flags > p7_IS_INCLUDED + p7_IS_REPORTED + p7_IS_NEW + p7_IS_DROPPED + p7_IS_DUPLICATE ) { status = eslFAIL; goto ERROR; } } // th.unsrt = NULL; th.N = stats->nhits; th.nreported = 0; th.nincluded = 0; th.is_sorted_by_sortkey = 0; th.is_sorted_by_seqidx = 0; for (i = 0; i < th.N; i++) { ESL_ALLOC( th.hit[i]->dcl, sizeof(P7_DOMAIN) * th.hit[i]->ndom); /* Go through the hits and set to be excluded or included as necessary */ set_included = 0; if(th.hit[i]->flags & p7_IS_INCLUDED){ if(excl_size > 0){ for( c = 0; c < excl_size; c++){ if(excl[c] == (long)(th.hit[i]->name) ){ th.hit[i]->flags = p7_IS_DROPPED; th.hit[i]->nincluded = 0; break; } } } }else{ if(incl_size > 0){ for( c = 0; c < incl_size; c++){ if(incl[c] == (long)th.hit[i]->name ){ th.hit[i]->flags = p7_IS_INCLUDED; set_included = 1; } } } } /* first grab all the P7_DOMAINs for the hit */ for (j=0; j < th.hit[i]->ndom; j++) { dom = th.hit[i]->dcl + j; memcpy(dom , (P7_DOMAIN*)p, sizeof(P7_DOMAIN)); /* Possibly set domains to be include if being * externally set via incl list*/ if(set_included) th.hit[i]->dcl[j].is_included = 1; p += sizeof(P7_DOMAIN); } /* then grab the P7_ALIDISPLAYs for the hit */ for (j=0; j < th.hit[i]->ndom; j++) { ad = (P7_ALIDISPLAY*)p; ESL_ALLOC(th.hit[i]->dcl[j].ad, sizeof(P7_ALIDISPLAY)); ad2 = th.hit[i]->dcl[j].ad; ad2->memsize = ad->memsize; ad2->rfline = ad->rfline; ad2->mmline = ad->mmline; ad2->csline = ad->csline ; ad2->model = ad->model ; ad2->mline = ad->mline ; ad2->aseq = ad->aseq ; ad2->ppline = ad->ppline; ad2->N = ad->N; ad2->hmmname = ad->hmmname; ad2->hmmacc = ad->hmmacc ; ad2->hmmdesc = ad->hmmdesc; ad2->hmmfrom = ad->hmmfrom; ad2->hmmto = ad->hmmto; ad2->M = ad->M; ad2->sqname = ad->sqname; ad2->sqacc = ad->sqacc ; ad2->sqdesc = ad->sqdesc; ad2->sqfrom = ad->sqfrom; ad2->sqto = ad->sqto; ad2->L = ad->L; p += sizeof(P7_ALIDISPLAY); ESL_ALLOC(ad2->mem, ad2->memsize); memcpy(ad2->mem, p, ad->memsize); p += ad2->memsize; p7_alidisplay_Deserialize(ad2); } } /* use the tophits and trace info above to produce an alignment */ if ( (status = p7_tophits_Alignment(&th, hmm->abc, &qsq, &qtr, extra_sqcnt, p7_ALL_CONSENSUS_COLS, &msa)) != eslOK) goto ERROR; /* free memory */ if (qtr != NULL) free(qtr); for (i = 0; i < th.N; i++) { for (j=0; j < th.hit[i]->ndom; j++) p7_alidisplay_Destroy(th.hit[i]->dcl[j].ad); if (th.hit[i]->dcl != NULL) free (th.hit[i]->dcl); } if (th.unsrt != NULL) free (th.unsrt); if (th.hit != NULL) free (th.hit); *ret_msa = msa; return eslOK; ERROR: /* free memory */ if (qtr != NULL) free(qtr); for (i = 0; i < th.N; i++) { for (j=0; j < th.hit[i]->ndom; j++) p7_alidisplay_Destroy(th.hit[i]->dcl[j].ad); if (th.hit[i]->dcl != NULL) free (th.hit[i]->dcl); } if (th.unsrt != NULL) free (th.unsrt); if (th.hit != NULL) free (th.hit); return status; }
/* Function: p7_alidisplay_Backconvert() * Synopsis: Convert an alidisplay to a faux trace and subsequence. * Incept: SRE, Wed Dec 10 09:49:28 2008 [Janelia] * * Purpose: Convert alignment display object <ad> to a faux subsequence * and faux subsequence trace, returning them in <ret_sq> and * <ret_tr>. * * The subsequence <*ret_sq> is digital; ascii residues in * <ad> are digitized using digital alphabet <abc>. * * The subsequence and trace are suitable for passing as * array elements to <p7_MultipleAlignment>. This is the * main purpose of backconversion. Results of a profile * search are stored in a hit list as a processed * <P7_ALIDISPLAY>, not as a <P7_TRACE> and <ESL_SQ>, to * reduce space and to reduce communication overhead in * parallelized search implementations. After reduction * to a final hit list, a master may want to construct a * multiple alignment of all the significant hits. * * Returns: <eslOK> on success. * * Throws: <eslEMEM> on allocation failures. <eslECORRUPT> on unexpected internal * data corruption. On any exception, <*ret_sq> and <*ret_tr> are * <NULL>. * * Xref: J4/29. */ int p7_alidisplay_Backconvert(const P7_ALIDISPLAY *ad, const ESL_ALPHABET *abc, ESL_SQ **ret_sq, P7_TRACE **ret_tr) { ESL_SQ *sq = NULL; /* RETURN: faux subsequence */ P7_TRACE *tr = NULL; /* RETURN: faux trace */ int subL = 0; /* subsequence length in the <ad> */ int a, i, k; /* coords for <ad>, <sq->dsq>, model */ char st; /* state type: MDI */ int status; /* Make a first pass over <ad> just to calculate subseq length */ for (a = 0; a < ad->N; a++) if (! esl_abc_CIsGap(abc, ad->aseq[a])) subL++; /* Allocations */ if ((sq = esl_sq_CreateDigital(abc)) == NULL) { status = eslEMEM; goto ERROR; } if ((status = esl_sq_GrowTo(sq, subL)) != eslOK) goto ERROR; if ((tr = (ad->ppline == NULL) ? p7_trace_Create() : p7_trace_CreateWithPP()) == NULL) { status = eslEMEM; goto ERROR; } if ((status = p7_trace_GrowTo(tr, subL+6)) != eslOK) goto ERROR; /* +6 is for SNB/ECT */ /* Construction of dsq, trace */ sq->dsq[0] = eslDSQ_SENTINEL; if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, p7T_S, 0, 0) : p7_trace_AppendWithPP(tr, p7T_S, 0, 0, 0.0))) != eslOK) goto ERROR; if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, p7T_N, 0, 0) : p7_trace_AppendWithPP(tr, p7T_N, 0, 0, 0.0))) != eslOK) goto ERROR; if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, p7T_B, 0, 0) : p7_trace_AppendWithPP(tr, p7T_B, 0, 0, 0.0))) != eslOK) goto ERROR; k = ad->hmmfrom; i = 1; for (a = 0; a < ad->N; a++) { if (esl_abc_CIsResidue(abc, ad->model[a])) { st = (esl_abc_CIsResidue(abc, ad->aseq[a]) ? p7T_M : p7T_D); } else st = p7T_I; if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, st, k, i) : p7_trace_AppendWithPP(tr, st, k, i, p7_alidisplay_DecodePostProb(ad->ppline[a])))) != eslOK) goto ERROR; switch (st) { case p7T_M: sq->dsq[i] = esl_abc_DigitizeSymbol(abc, ad->aseq[a]); k++; i++; break; case p7T_I: sq->dsq[i] = esl_abc_DigitizeSymbol(abc, ad->aseq[a]); i++; break; case p7T_D: k++; break; } } if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, p7T_E, 0, 0) : p7_trace_AppendWithPP(tr, p7T_E, 0, 0, 0.0))) != eslOK) goto ERROR; if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, p7T_C, 0, 0) : p7_trace_AppendWithPP(tr, p7T_C, 0, 0, 0.0))) != eslOK) goto ERROR; if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, p7T_T, 0, 0) : p7_trace_AppendWithPP(tr, p7T_T, 0, 0, 0.0))) != eslOK) goto ERROR; sq->dsq[i] = eslDSQ_SENTINEL; /* some sanity checks */ if (tr->N != ad->N + 6) ESL_XEXCEPTION(eslECORRUPT, "backconverted trace ended up with unexpected size (%s/%s)", ad->sqname, ad->hmmname); if (k != ad->hmmto + 1) ESL_XEXCEPTION(eslECORRUPT, "backconverted trace didn't end at expected place on model (%s/%s)", ad->sqname, ad->hmmname); if (i != subL + 1) ESL_XEXCEPTION(eslECORRUPT, "backconverted subseq didn't end at expected length (%s/%s)", ad->sqname, ad->hmmname); /* Set up <sq> annotation as a subseq of a source sequence */ if ((status = esl_sq_FormatName(sq, "%s/%ld-%ld", ad->sqname, ad->sqfrom, ad->sqto)) != eslOK) goto ERROR; if ((status = esl_sq_FormatDesc(sq, "[subseq from] %s", ad->sqdesc[0] != '\0' ? ad->sqdesc : ad->sqname)) != eslOK) goto ERROR; if ((status = esl_sq_SetSource (sq, ad->sqname)) != eslOK) goto ERROR; if (ad->sqacc[0] != '\0') { if ((status = esl_sq_SetAccession (sq, ad->sqacc)) != eslOK) goto ERROR; } sq->n = subL; sq->start = ad->sqfrom; sq->end = ad->sqto; sq->C = 0; sq->W = subL; sq->L = ad->L; tr->M = ad->M; tr->L = ad->L; *ret_sq = sq; *ret_tr = tr; return eslOK; ERROR: if (sq != NULL) esl_sq_Destroy(sq); if (tr != NULL) p7_trace_Destroy(tr); *ret_sq = NULL; *ret_tr = NULL; return status; }
/* Function: p7_GStochasticTrace() * Synopsis: Stochastic traceback of a Forward matrix. * Incept: SRE, Thu Jan 3 15:39:20 2008 [Janelia] * * Purpose: Stochastic traceback of Forward matrix <gx> to * sample an alignment of digital sequence <dsq> * (of length <L>) to the profile <gm>. * * The sampled traceback is returned in <tr>, which the * caller must have at least made an initial allocation of * (the <tr> will be grown as needed here). * * Args: r - source of random numbers * dsq - digital sequence aligned to, 1..L * L - length of dsq * gm - profile * mx - Forward matrix to trace, L x M * tr - storage for the recovered traceback. * * Returns: <eslOK> on success. */ int p7_GStochasticTrace(ESL_RANDOMNESS *r, const ESL_DSQ *dsq, int L, const P7_PROFILE *gm, const P7_GMX *gx, P7_TRACE *tr) { int status; int i; /* position in seq (1..L) */ int k; /* position in model (1..M) */ int M = gm->M; float **dp = gx->dp; float *xmx = gx->xmx; float const *tsc = gm->tsc; float *sc; /* scores of possible choices: up to 2M-1, in the case of exits to E */ int scur, sprv; /* we'll index M states as 1..M, and D states as 2..M = M+2..2M: M0, D1 are impossibles. */ ESL_ALLOC(sc, sizeof(float) * (2*M+1)); k = 0; i = L; if ((status = p7_trace_Append(tr, p7T_T, k, i)) != eslOK) goto ERROR; if ((status = p7_trace_Append(tr, p7T_C, k, i)) != eslOK) goto ERROR; sprv = p7T_C; while (sprv != p7T_S) { switch (tr->st[tr->N-1]) { /* C(i) comes from C(i-1) or E(i) */ case p7T_C: if (XMX(i,p7G_C) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible C reached at i=%d", i); sc[0] = XMX(i-1, p7G_C) + gm->xsc[p7P_C][p7P_LOOP]; sc[1] = XMX(i, p7G_E) + gm->xsc[p7P_E][p7P_MOVE]; esl_vec_FLogNorm(sc, 2); scur = (esl_rnd_FChoose(r, sc, 2) == 0) ? p7T_C : p7T_E; break; /* E connects from any M or D state. k set here */ case p7T_E: if (XMX(i, p7G_E) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible E reached at i=%d", i); if (p7_profile_IsLocal(gm)) { /* local models come from any M, D */ sc[0] = sc[M+1] = -eslINFINITY; for (k = 1; k <= M; k++) sc[k] = MMX(i,k); for (k = 2; k <= M; k++) sc[k+M] = DMX(i,k); esl_vec_FLogNorm(sc, 2*M+1); /* now sc is a prob vector */ k = esl_rnd_FChoose(r, sc, 2*M+1); if (k <= M) scur = p7T_M; else { k -= M; scur = p7T_D; } } else { /* glocal models come from M_M or D_M */ k = M; sc[0] = MMX(i,M); sc[1] = DMX(i,M); esl_vec_FLogNorm(sc, 2); /* now sc is a prob vector */ scur = (esl_rnd_FChoose(r, sc, 2) == 0) ? p7T_M : p7T_D; } break; /* M connects from {MDI} i-1,k-1, or B */ case p7T_M: if (MMX(i,k) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible M reached at k=%d,i=%d", k,i); sc[0] = XMX(i-1,p7G_B) + TSC(p7P_BM, k-1); sc[1] = MMX(i-1,k-1) + TSC(p7P_MM, k-1); sc[2] = IMX(i-1,k-1) + TSC(p7P_IM, k-1); sc[3] = DMX(i-1,k-1) + TSC(p7P_DM, k-1); esl_vec_FLogNorm(sc, 4); switch (esl_rnd_FChoose(r, sc, 4)) { case 0: scur = p7T_B; break; case 1: scur = p7T_M; break; case 2: scur = p7T_I; break; case 3: scur = p7T_D; break; } k--; i--; break; /* D connects from M,D at i,k-1 */ case p7T_D: if (DMX(i, k) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible D reached at k=%d,i=%d", k,i); sc[0] = MMX(i, k-1) + TSC(p7P_MD, k-1); sc[1] = DMX(i, k-1) + TSC(p7P_DD, k-1); esl_vec_FLogNorm(sc, 2); scur = (esl_rnd_FChoose(r, sc, 2) == 0) ? p7T_M : p7T_D; k--; break; /* I connects from M,I at i-1,k */ case p7T_I: if (IMX(i,k) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible I reached at k=%d,i=%d", k,i); sc[0] = MMX(i-1,k) + TSC(p7P_MI, k); sc[1] = IMX(i-1,k) + TSC(p7P_II, k); esl_vec_FLogNorm(sc, 2); scur = (esl_rnd_FChoose(r, sc, 2) == 0) ? p7T_M : p7T_I; i--; break; /* N connects from S, N */ case p7T_N: if (XMX(i, p7G_N) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible N reached at i=%d", i); scur = (i == 0) ? p7T_S : p7T_N; break; /* B connects from N, J */ case p7T_B: if (XMX(i,p7G_B) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible B reached at i=%d", i); sc[0] = XMX(i, p7G_N) + gm->xsc[p7P_N][p7P_MOVE]; sc[1] = XMX(i, p7G_J) + gm->xsc[p7P_J][p7P_MOVE]; esl_vec_FLogNorm(sc, 2); scur = (esl_rnd_FChoose(r, sc, 2) == 0) ? p7T_N : p7T_J; break; /* J connects from E(i) or J(i-1) */ case p7T_J: if (XMX(i,p7G_J) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible J reached at i=%d", i); sc[0] = XMX(i-1,p7G_J) + gm->xsc[p7P_J][p7P_LOOP]; sc[1] = XMX(i, p7G_E) + gm->xsc[p7P_E][p7P_LOOP]; esl_vec_FLogNorm(sc, 2); scur = (esl_rnd_FChoose(r, sc, 2) == 0) ? p7T_J : p7T_E; break; default: ESL_XEXCEPTION(eslFAIL, "bogus state in traceback"); } /* end switch over statetype[tpos-1] */ /* Append this state and the current i,k to be explained to the growing trace */ if ((status = p7_trace_Append(tr, scur, k, i)) != eslOK) goto ERROR; /* For NCJ, we had to defer i decrement. */ if ( (scur == p7T_N || scur == p7T_J || scur == p7T_C) && scur == sprv) i--; sprv = scur; } /* end traceback, at S state */ if ((status = p7_trace_Reverse(tr)) != eslOK) goto ERROR; tr->M = gm->M; tr->L = L; free(sc); return eslOK; ERROR: if (sc != NULL) free(sc); return status; }
/* Function: p7_CoreEmit() * Incept: SRE, Tue Jan 9 10:20:51 2007 [Janelia] * * Purpose: Generate (sample) a sequence from a core HMM <hmm>. * * Optionally return the sequence and/or its trace in <sq> * and <tr>, respectively, which the caller has * allocated. Having the caller provide these reusable * objects allows re-use of both <sq> and <tr> in repeated * calls, saving malloc/free wastage. Either can be passed * as <NULL> if it isn't needed. * * This does not set any fields in the <sq> except for the * sequence itself. Caller must set the name, and any other * annotation it wants to add. * * Trace is relative to the core model: it may include * I_0 and I_M states, B->DD->M entry is explicit, and a * 0 length generated sequence is possible. * * Args: r - source of randomness * hmm - core HMM to generate from * sq - opt: digital sequence sampled (or NULL) * tr - opt: trace sampled (or NULL) * * Returns: <eslOK> on success; * optionally return the digital sequence through <ret_sq>, * and optionally return its trace in <ret_tr>. * * Throws: <eslECORRUPT> if emission gets us into an illegal state, * probably indicating that a probability that should have * been zero wasn't. * * Throws <eslEMEM> on a reallocation error. * * In these cases, the contents of <sq> and <tr> may be * corrupted. Caller should not trust their data, but may * safely reuse them. * * Xref: STL11/124. */ int p7_CoreEmit(ESL_RANDOMNESS *r, const P7_HMM *hmm, ESL_SQ *sq, P7_TRACE *tr) { int k = 0; /* position in model nodes 1..M */ int i = 0; /* position in sequence 1..L */ char st = p7T_B; /* state type */ int x; /* sampled residue */ int status; if (sq != NULL) esl_sq_Reuse(sq); if (tr != NULL) { if ((status = p7_trace_Reuse(tr)) != eslOK) goto ERROR; if ((status = p7_trace_Append(tr, st, k, i)) != eslOK) goto ERROR; } while (st != p7T_E) { /* Sample next state type, given current state type (and current k) */ switch (st) { case p7T_B: case p7T_M: switch (esl_rnd_FChoose(r, hmm->t[k], 3)) { case 0: st = p7T_M; break; case 1: st = p7T_I; break; case 2: st = p7T_D; break; default: ESL_XEXCEPTION(eslEINCONCEIVABLE, "impossible."); } break; case p7T_I: switch (esl_rnd_FChoose(r, hmm->t[k]+3, 2)) { case 0: st = p7T_M; break; case 1: st = p7T_I; break; default: ESL_XEXCEPTION(eslEINCONCEIVABLE, "impossible."); } break; case p7T_D: switch (esl_rnd_FChoose(r, hmm->t[k]+5, 2)) { case 0: st = p7T_M; break; case 1: st = p7T_D; break; default: ESL_XEXCEPTION(eslEINCONCEIVABLE, "impossible."); } break; default: ESL_XEXCEPTION(eslECORRUPT, "impossible state reached during emission"); } /* Bump k,i if needed, depending on new state type */ if (st == p7T_M || st == p7T_D) k++; if (st == p7T_M || st == p7T_I) i++; /* a transit to M_M+1 is a transit to the E state */ if (k == hmm->M+1) { if (st == p7T_M) { st = p7T_E; k = 0; } else ESL_XEXCEPTION(eslECORRUPT, "failed to reach E state properly"); } /* Sample new residue x if in match or insert */ if (st == p7T_M) x = esl_rnd_FChoose(r, hmm->mat[k], hmm->abc->K); else if (st == p7T_I) x = esl_rnd_FChoose(r, hmm->ins[k], hmm->abc->K); else x = eslDSQ_SENTINEL; /* Add state to trace */ if (tr != NULL) { if ((status = p7_trace_Append(tr, st, k, i)) != eslOK) goto ERROR; } /* Add x to sequence */ if (sq != NULL && x != eslDSQ_SENTINEL) if ((status = esl_sq_XAddResidue(sq, x)) != eslOK) goto ERROR; } /* Terminate the trace and sequence (both are optional, remember) */ if (tr != NULL) { tr->M = hmm->M; tr->L = i; } if (sq != NULL && (status = esl_sq_XAddResidue(sq, eslDSQ_SENTINEL)) != eslOK) goto ERROR; return eslOK; ERROR: return status; }
/* Function: p7_ProfileEmit() * Synopsis: Sample a sequence from the search form of the model. * Incept: SRE, Mon Jan 22 10:23:28 2007 [Janelia] * * Purpose: Sample a sequence from the implicit * probabilistic model of a Plan7 profile <gm>. This * requires also having the core probabilities of * the accompanying <hmm>, and the background * frequencies of null1 model <bg>. * * Optionally return the sequence and/or its trace in <sq> * and <tr>, respectively. Caller has allocated space for * both of these, though they may get reallocated/grown * here. Either can be passed as <NULL> if unneeded. * * Only the sequence field is set in the <sq>. Caller must * set the name, plus any other fields it wants to set. If * the <sq> was created in digital mode, this is the <sq->dsq>; * if the <sq> was created in text mode, this is <sq->seq>. * * <p7_ProfileEmit()> deliberately uses an <ESL_SQ> object * instead of a plain <ESL_DSQ *> or <char *> string, to * take advantage of the object's support for dynamic * reallocation of seq length, and to allow both digital and * text mode generation. * * Args: r - source of randomness * hmm - core probabilities of the profile * gm - configured search profile * sq - optRETURN: sampled sequence * tr - optRETURN: sampled trace * * Throws: (no abnormal error conditions) */ int p7_ProfileEmit(ESL_RANDOMNESS *r, const P7_HMM *hmm, const P7_PROFILE *gm, const P7_BG *bg, ESL_SQ *sq, P7_TRACE *tr) { char prv, st; /* prev, current state type */ int k = 0; /* position in model nodes 1..M */ int i = 0; /* position in sequence 1..L */ int x; /* sampled residue */ int kend = hmm->M; /* predestined end node */ int status; float xt[p7P_NXSTATES][p7P_NXTRANS]; /* Backcalculate the probabilities in the special states (loop and length model) */ for (i = 0; i < p7P_NXSTATES; i++) for (x = 0; x < p7P_NXTRANS; x++) xt[i][x] = exp(gm->xsc[i][x]); if (sq != NULL) esl_sq_Reuse(sq); if (tr != NULL) { if ((status = p7_trace_Reuse(tr)) != eslOK) goto ERROR; if ((status = p7_trace_Append(tr, p7T_S, k, i)) != eslOK) goto ERROR; if ((status = p7_trace_Append(tr, p7T_N, k, i)) != eslOK) goto ERROR; } st = p7T_N; i = 0; while (st != p7T_T) { /* Sample a state transition. After this section, prv and st (prev->current state) are set; * k also gets set if we make a B->Mk entry transition. */ prv = st; switch (st) { case p7T_B: if (p7_profile_IsLocal(gm)) { /* local mode: enter the implicit profile: choose our entry and our predestined exit */ if ((status = sample_endpoints(r, gm, &k, &kend)) != eslOK) goto ERROR; st = p7T_M; /* must be, because left wing is retracted */ } else { /* glocal mode: treat B as M_0, use its transitions to MID. */ /* FIXME: this is wrong. It should sample from B->Mk distribution! */ switch (esl_rnd_FChoose(r, P7H_TMAT(hmm, 0), p7H_NTMAT)) { case 0: st = p7T_M; k = 1; break; case 1: st = p7T_I; k = 0; break; case 2: st = p7T_D; k = 1; break; default: ESL_XEXCEPTION(eslEINCONCEIVABLE, "impossible."); } } break; case p7T_M: if (k == kend) st = p7T_E; /* check our preordained fate */ else { switch (esl_rnd_FChoose(r, P7H_TMAT(hmm, k), p7H_NTMAT)) { case 0: st = p7T_M; break; case 1: st = p7T_I; break; case 2: st = p7T_D; break; default: ESL_XEXCEPTION(eslEINCONCEIVABLE, "impossible."); } } break; case p7T_D: if (k == kend) st = p7T_E; else st = (esl_rnd_FChoose(r, P7H_TDEL(hmm, k), p7H_NTDEL) == 0) ? p7T_M : p7T_D; break; case p7T_I: st = (esl_rnd_FChoose(r, P7H_TINS(hmm, k), p7H_NTINS) == 0) ? p7T_M : p7T_I; break; case p7T_N: st = (esl_rnd_FChoose(r, xt[p7P_N], p7P_NXTRANS) == p7P_MOVE) ? p7T_B : p7T_N; break; case p7T_E: st = (esl_rnd_FChoose(r, xt[p7P_E], p7P_NXTRANS) == p7P_MOVE) ? p7T_C : p7T_J; break; case p7T_C: st = (esl_rnd_FChoose(r, xt[p7P_C], p7P_NXTRANS) == p7P_MOVE) ? p7T_T : p7T_C; break; case p7T_J: st = (esl_rnd_FChoose(r, xt[p7P_J], p7P_NXTRANS) == p7P_MOVE) ? p7T_B : p7T_J; break; default: ESL_XEXCEPTION(eslECORRUPT, "impossible state reached during emission"); } /* Based on the transition we just sampled, update k. */ if (st == p7T_E) k = 0; else if (st == p7T_M && prv != p7T_B) k++; /* be careful about B->Mk, where we already set k */ else if (st == p7T_D) k++; /* Based on the transition we just sampled, generate a residue. */ if (st == p7T_M) x = esl_rnd_FChoose(r, hmm->mat[k], hmm->abc->K); else if (st == p7T_I) x = esl_rnd_FChoose(r, hmm->ins[k], hmm->abc->K); else if ((st == p7T_N || st == p7T_C || st == p7T_J) && prv==st) x = esl_rnd_FChoose(r, bg->f, hmm->abc->K); else x = eslDSQ_SENTINEL; if (x != eslDSQ_SENTINEL) i++; /* Add residue (if any) to sequence */ if (sq != NULL && x != eslDSQ_SENTINEL && (status = esl_sq_XAddResidue(sq, x)) != eslOK) goto ERROR; /* Add state to trace. */ if (tr != NULL) { if ((status = p7_trace_Append(tr, st, k, i)) != eslOK) goto ERROR; } } /* Terminate the trace and sequence (both are optional, remember) */ if (tr != NULL) { tr->M = hmm->M; tr->L = i; } if (sq != NULL && (status = esl_sq_XAddResidue(sq, eslDSQ_SENTINEL)) != eslOK) goto ERROR; return eslOK; ERROR: return status; }
/* Function: p7_GTrace() * Incept: SRE, Thu Feb 1 10:25:56 2007 [UA 8018 St. Louis to Dulles] * * Purpose: Traceback of a Viterbi matrix: retrieval * of optimum alignment. * * This function is currently implemented as a * reconstruction traceback, rather than using a shadow * matrix. Because H3 uses floating point scores, and we * can't compare floats for equality, we have to compare * floats for near-equality and therefore, formally, we can * only guarantee a near-optimal traceback. However, even in * the unlikely event that a suboptimal is returned, the * score difference from true optimal will be negligible. * * Args: dsq - digital sequence aligned to, 1..L * L - length of <dsq> * gm - profile * mx - Viterbi matrix to trace, L x M * tr - storage for the recovered traceback. * * Return: <eslOK> on success. * <eslFAIL> if even the optimal path has zero probability; * in this case, the trace is set blank (<tr->N = 0>). * * Note: Care is taken to evaluate the prev+tsc+emission * calculations in exactly the same order that Viterbi did * them, lest you get numerical problems with * a+b+c = d; d-c != a+b because d,c are nearly equal. * (This bug appeared in dev: xref J1/121.) */ int p7_GTrace(const ESL_DSQ *dsq, int L, const P7_PROFILE *gm, const P7_GMX *gx, P7_TRACE *tr) { int i = L; /* position in seq (1..L) */ int k = 0; /* position in model (1..M) */ int M = gm->M; float **dp = gx->dp; /* so {MDI}MX() macros work */ float *xmx = gx->xmx; /* so XMX() macro works */ float tol = 1e-5; /* floating point "equality" test */ float const *tsc = gm->tsc; int sprv, scur; /* previous, current state in trace */ int status; #ifdef p7_DEBUGGING if (tr->N != 0) ESL_EXCEPTION(eslEINVAL, "trace isn't empty: forgot to Reuse()?"); #endif if ((status = p7_trace_Append(tr, p7T_T, k, i)) != eslOK) return status; if ((status = p7_trace_Append(tr, p7T_C, k, i)) != eslOK) return status; sprv = p7T_C; while (sprv != p7T_S) { float const *rsc = (i>0 ? gm->rsc[dsq[i]] : NULL); switch (sprv) { case p7T_C: /* C(i) comes from C(i-1) or E(i) */ if (XMX(i,p7G_C) == -eslINFINITY) ESL_EXCEPTION(eslFAIL, "impossible C reached at i=%d", i); if (esl_FCompare(XMX(i, p7G_C), XMX(i-1, p7G_C) + gm->xsc[p7P_C][p7P_LOOP], tol) == eslOK) scur = p7T_C; else if (esl_FCompare(XMX(i, p7G_C), XMX(i, p7G_E) + gm->xsc[p7P_E][p7P_MOVE], tol) == eslOK) scur = p7T_E; else ESL_EXCEPTION(eslFAIL, "C at i=%d couldn't be traced", i); break; case p7T_E: /* E connects from any M state. k set here */ if (XMX(i, p7G_E) == -eslINFINITY) ESL_EXCEPTION(eslFAIL, "impossible E reached at i=%d", i); if (p7_profile_IsLocal(gm)) { scur = p7T_M; /* can't come from D, in a *local* Viterbi trace. */ for (k = M; k >= 1; k--) if (esl_FCompare(XMX(i, p7G_E), MMX(i,k), tol) == eslOK) break; if (k == 0) ESL_EXCEPTION(eslFAIL, "E at i=%d couldn't be traced", i); } else /* glocal mode: we either come from D_M or M_M */ { if (esl_FCompare(XMX(i, p7G_E), MMX(i,M), tol) == eslOK) { scur = p7T_M; k = M; } else if (esl_FCompare(XMX(i, p7G_E), DMX(i,M), tol) == eslOK) { scur = p7T_D; k = M; } else ESL_EXCEPTION(eslFAIL, "E at i=%d couldn't be traced", i); } break; case p7T_M: /* M connects from i-1,k-1, or B */ if (MMX(i,k) == -eslINFINITY) ESL_EXCEPTION(eslFAIL, "impossible M reached at k=%d,i=%d", k,i); if (esl_FCompare(MMX(i,k), XMX(i-1,p7G_B) + TSC(p7P_BM, k-1) + MSC(k), tol) == eslOK) scur = p7T_B; else if (esl_FCompare(MMX(i,k), MMX(i-1,k-1) + TSC(p7P_MM, k-1) + MSC(k), tol) == eslOK) scur = p7T_M; else if (esl_FCompare(MMX(i,k), IMX(i-1,k-1) + TSC(p7P_IM, k-1) + MSC(k), tol) == eslOK) scur = p7T_I; else if (esl_FCompare(MMX(i,k), DMX(i-1,k-1) + TSC(p7P_DM, k-1) + MSC(k), tol) == eslOK) scur = p7T_D; else ESL_EXCEPTION(eslFAIL, "M at k=%d,i=%d couldn't be traced", k,i); k--; i--; break; case p7T_D: /* D connects from M,D at i,k-1 */ if (DMX(i, k) == -eslINFINITY) ESL_EXCEPTION(eslFAIL, "impossible D reached at k=%d,i=%d", k,i); if (esl_FCompare(DMX(i,k), MMX(i, k-1) + TSC(p7P_MD, k-1), tol) == eslOK) scur = p7T_M; else if (esl_FCompare(DMX(i,k), DMX(i, k-1) + TSC(p7P_DD, k-1), tol) == eslOK) scur = p7T_D; else ESL_EXCEPTION(eslFAIL, "D at k=%d,i=%d couldn't be traced", k,i); k--; break; case p7T_I: /* I connects from M,I at i-1,k*/ if (IMX(i,k) == -eslINFINITY) ESL_EXCEPTION(eslFAIL, "impossible I reached at k=%d,i=%d", k,i); if (esl_FCompare(IMX(i,k), MMX(i-1,k) + TSC(p7P_MI, k) + ISC(k), tol) == eslOK) scur = p7T_M; else if (esl_FCompare(IMX(i,k), IMX(i-1,k) + TSC(p7P_II, k) + ISC(k), tol) == eslOK) scur = p7T_I; else ESL_EXCEPTION(eslFAIL, "I at k=%d,i=%d couldn't be traced", k,i); i--; break; case p7T_N: /* N connects from S, N */ if (XMX(i, p7G_N) == -eslINFINITY) ESL_EXCEPTION(eslFAIL, "impossible N reached at i=%d", i); scur = ( (i == 0) ? p7T_S : p7T_N); break; case p7T_B: /* B connects from N, J */ if (XMX(i,p7G_B) == -eslINFINITY) ESL_EXCEPTION(eslFAIL, "impossible B reached at i=%d", i); if (esl_FCompare(XMX(i,p7G_B), XMX(i, p7G_N) + gm->xsc[p7P_N][p7P_MOVE], tol) == eslOK) scur = p7T_N; else if (esl_FCompare(XMX(i,p7G_B), XMX(i, p7G_J) + gm->xsc[p7P_J][p7P_MOVE], tol) == eslOK) scur = p7T_J; else ESL_EXCEPTION(eslFAIL, "B at i=%d couldn't be traced", i); break; case p7T_J: /* J connects from E(i) or J(i-1) */ if (XMX(i,p7G_J) == -eslINFINITY) ESL_EXCEPTION(eslFAIL, "impossible J reached at i=%d", i); if (esl_FCompare(XMX(i,p7G_J), XMX(i-1,p7G_J) + gm->xsc[p7P_J][p7P_LOOP], tol) == eslOK) scur = p7T_J; else if (esl_FCompare(XMX(i,p7G_J), XMX(i, p7G_E) + gm->xsc[p7P_E][p7P_LOOP], tol) == eslOK) scur = p7T_E; else ESL_EXCEPTION(eslFAIL, "J at i=%d couldn't be traced", i); break; default: ESL_EXCEPTION(eslFAIL, "bogus state in traceback"); } /* end switch over statetype[tpos-1] */ /* Append this state and the current i,k to be explained to the growing trace */ if ((status = p7_trace_Append(tr, scur, k, i)) != eslOK) return status; /* For NCJ, we had to defer i decrement. */ if ( (scur == p7T_N || scur == p7T_J || scur == p7T_C) && scur == sprv) i--; sprv = scur; } /* end traceback, at S state */ tr->M = gm->M; tr->L = L; return p7_trace_Reverse(tr); }