/* Function:  p7_GNull2_ByTrace()
 * Synopsis:  Assign null2 scores to an envelope by the sampling method.
 * Incept:    SRE, Thu May  1 10:00:43 2008 [Janelia]
 *
 * Purpose:   Given a traceback <tr> for an alignment of model <gm> to
 *            some target sequence; calculate null2 odds ratios $\frac{f'{x}}{f{x}}$
 *            as the state-usage-weighted emission probabilities,
 *            with state usages calculated by counting emissions used
 *            at positions <zstart..zend> in the trace.
 *
 *            Because we only need to collect state usages from the
 *            trace <tr>, the target sequence is irrelevant. Because
 *            we are only averaging emission odds ratios from model
 *            <gm>, the configuration of <gm> is irrelevant (uni
 *            vs. multihit, or length config).
 *
 *            M and I elements always count as one emission. An N, C,
 *            or J element counts as an emission only when the trace
 *            element immediately before it is the same state (i.e.
 *            the residue was emitted on an NN, CC, or JJ transition).
 *
 * Args:      gm     - model, in any configuration; only emission odds are used
 *            tr     - traceback for any region (or all) of a target sequence
 *            zstart - first elem in <tr> to collect from; use 0 for complete
 *            zend   - last elem in <tr> to collect from; use tr->N-1 for complete
 *            wrk    - DP matrix w/ at least one row, for workspace
 *            null2  - RESULT: odds ratios f'(x)/f(x); entries <0..Kp-1> are written
 *
 * Returns:   <eslOK> on success; <null2> contains the odds ratios, and
 *            row 0 of <wrk> has been overwritten with the (normalized)
 *            state usages. If <zstart..zend> contains no emitting
 *            position at all, there is nothing to average over: every
 *            entry of <null2> is set to the neutral odds ratio 1.0.
 *
 * Throws:    (no abnormal error conditions)
 */
int
p7_GNull2_ByTrace(const P7_PROFILE *gm, const P7_TRACE *tr, int zstart, int zend, P7_GMX *wrk, float *null2)
{
  float **dp   = wrk->dp;       /* so that {MDI}MX() macros work */
  float  *xmx  = wrk->xmx;      /* so that XMX() macro works     */
  int     Ld   = 0;             /* number of emitted residues counted in the segment */
  int     M    = gm->M;
  int     k;                    /* index over model position     */
  int     x;                    /* index over residues           */
  int     z;                    /* index over trace position     */
  float   xfactor;

  /* We'll use the i=0 row in wrk for working space: dp[0][] and xmx[0..4]. */
  esl_vec_FSet(wrk->dp[0], (M+1)*p7G_NSCELLS, 0.0);
  esl_vec_FSet(wrk->xmx,   p7G_NXCELLS,       0.0);

  /* Calculate emitting state usage in this particular trace segment.
   * The <z > 0> guards keep the look-behind at <st[z-1]> inside the trace.
   */
  for (z = zstart; z <= zend; z++)
    {
      switch (tr->st[z]) {
      case p7T_M:  Ld++; MMX(0,tr->k[z]) += 1.0; break;
      case p7T_I:  Ld++; IMX(0,tr->k[z]) += 1.0; break;
      case p7T_N:  if (z > 0 && tr->st[z-1] == p7T_N) { Ld++; XMX(0,p7G_N) += 1.0; } break;
      case p7T_C:  if (z > 0 && tr->st[z-1] == p7T_C) { Ld++; XMX(0,p7G_C) += 1.0; } break;
      case p7T_J:  if (z > 0 && tr->st[z-1] == p7T_J) { Ld++; XMX(0,p7G_J) += 1.0; } break;
      default:     break;       /* non-emitting states contribute nothing */
      }
    }

  /* No emissions counted: normalizing by Ld would divide by zero and
   * poison <null2> with NaNs. Report neutral odds ratios instead.
   */
  if (Ld == 0)
    {
      esl_vec_FSet(null2, gm->abc->Kp, 1.0);
      return eslOK;
    }

  esl_vec_FScale(wrk->dp[0], (M+1)*p7G_NSCELLS, (1.0 / (float) Ld));
  esl_vec_FScale(wrk->xmx,   p7G_NXCELLS,       (1.0 / (float) Ld));

  /* Calculate null2's odds ratio emission probabilities, by taking
   * posterior weighted sum over all emission vectors used in paths
   * explaining the domain.
   */
  esl_vec_FSet(null2, gm->abc->K, 0.0);
  xfactor = XMX(0,p7G_N) + XMX(0,p7G_C) + XMX(0,p7G_J);
  for (x = 0; x < gm->abc->K; x++)
    {
      for (k = 1; k < M; k++)
        {
          null2[x] += MMX(0,k) * expf(p7P_MSC(gm, k, x));
          null2[x] += IMX(0,k) * expf(p7P_ISC(gm, k, x));
        }
      null2[x] += MMX(0,M) * expf(p7P_MSC(gm, M, x));  /* node M has no insert state */
      null2[x] += xfactor;                             /* N/C/J usage is added with weight 1 */
    }
  /* now null2[x] = \frac{f_d(x)}{f_0(x)} odds ratios for all x in alphabet,
   * 0..K-1, where f_d(x) are the ad hoc "null2" residue frequencies
   * for this envelope.
   */

  /* make valid scores for all degeneracies, by averaging the odds ratios. */
  esl_abc_FAvgScVec(gm->abc, null2);
  null2[gm->abc->K]    = 1.0;        /* gap character    */
  null2[gm->abc->Kp-2] = 1.0;        /* nonresidue "*"   */
  null2[gm->abc->Kp-1] = 1.0;        /* missing data "~" */

  return eslOK;
}
/* Function:  p7_GDecoding()
 * Synopsis:  Posterior decoding of residue assignments.
 * Incept:    SRE, Fri Feb 29 10:16:21 2008 [Janelia]
 *
 * Purpose:   Fill <pp> with the posterior probability that each target
 *            residue <i> was emitted by each emitting state, given
 *            profile <gm> and the filled Forward and Backward matrices
 *            <fwd> and <bck> for that profile against the target.
 *
 *            A residue is emitted by one of: match state <1..M>,
 *            insert state <1..M-1>, or an NN, CC, or JJ loop
 *            transition. With <dp = pp->dp>, <xmx = pp->xmx>:
 *            <MMX(i,k)> and <IMX(i,k)> are the probabilities that
 *            match/insert <k> emitted residue <i>; <XMX(i,N)>,
 *            <XMX(i,C)>, <XMX(i,J)> are the probabilities that it was
 *            emitted on an NN, CC, or JJ transition. Each row is
 *            renormalized so these sum to 1.0.
 *
 *            Every other cell (all D cells, I_M, E, B, column 0, and
 *            the whole of row i=0) is set to zero. The null2
 *            calculation takes advantage of the all-zero row 0 by
 *            using it as workspace.
 *
 *            The caller may pass the Backward matrix <bck> as <pp>,
 *            in which case <bck> is overwritten with <pp>. The caller
 *            may \emph{not} pass <fwd> as <pp>: the NN, CC, JJ terms
 *            read row <(i-1)> of <fwd>.
 *
 * Args:      gm   - profile (must be the same that was used to fill <fwd>, <bck>).
 *            fwd  - filled Forward matrix
 *            bck  - filled Backward matrix
 *            pp   - RESULT: posterior decoding matrix.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    (no abnormal error conditions)
 *
 * Note:      Burns time renormalizing each row. If you don't do this,
 *            probabilities will have an error of +/- 0.001 or so, creeping
 *            in from error in FLogsum()'s table approximation and even
 *            in log() and exp() themselves; including "probabilities"
 *            up to ~1.001. Though this isn't going to break anything
 *            in normal use, it does drive the unit tests wild; the SSE
 *            implementation is more accurate, and unit tests that try
 *            to compare SSE and generic results will see differences,
 *            some sufficient to alter the choice of OA traceback.
 */
int
p7_GDecoding(const P7_PROFILE *gm, const P7_GMX *fwd, P7_GMX *bck, P7_GMX *pp)
{
  float **dp         = pp->dp;     /* so {MDI}MX() macros address <pp> */
  float  *xmx        = pp->xmx;    /* so XMX() macro addresses <pp>    */
  int     L          = fwd->L;
  int     M          = gm->M;
  float   overall_sc = fwd->xmx[p7G_NXCELLS*L + p7G_C] + gm->xsc[p7P_C][p7P_MOVE];
  float   rowsum;                  /* unnormalized total for the current row */
  float   scale;                   /* 1/rowsum                               */
  int     i, k;

  pp->M = M;
  pp->L = L;

  /* Row 0 accounts for no residue: zero everywhere. */
  XMX(0, p7G_E) = 0.0;
  XMX(0, p7G_N) = 0.0;
  XMX(0, p7G_J) = 0.0;
  XMX(0, p7G_B) = 0.0;
  XMX(0, p7G_C) = 0.0;
  for (k = 0; k <= M; k++)
    {
      DMX(0,k) = 0.0;
      IMX(0,k) = 0.0;
      MMX(0,k) = 0.0;
    }

  for (i = 1; i <= L; i++)
    {
      const float *frow  = fwd->dp[i];                       /* Forward main states, row i      */
      const float *brow  = bck->dp[i];                       /* Backward main states, row i     */
      const float *fxprv = fwd->xmx + p7G_NXCELLS*(i-1);     /* Forward specials, row i-1       */
      const float *bxcur = bck->xmx + p7G_NXCELLS*i;         /* Backward specials, row i        */

      rowsum = 0.0;

      DMX(i,0) = 0.0;
      IMX(i,0) = 0.0;
      MMX(i,0) = 0.0;

      /* Each Backward cell is read before the same <pp> cell is written,
       * which is what makes <pp == bck> legal.
       */
      for (k = 1; k < M; k++)
        {
          MMX(i,k) = expf(frow[k*p7G_NSCELLS + p7G_M] + brow[k*p7G_NSCELLS + p7G_M] - overall_sc);
          rowsum  += MMX(i,k);

          IMX(i,k) = expf(frow[k*p7G_NSCELLS + p7G_I] + brow[k*p7G_NSCELLS + p7G_I] - overall_sc);
          rowsum  += IMX(i,k);

          DMX(i,k) = 0.;
        }

      /* Node M: match only. */
      MMX(i,M) = expf(frow[M*p7G_NSCELLS + p7G_M] + brow[M*p7G_NSCELLS + p7G_M] - overall_sc);
      rowsum  += MMX(i,M);
      IMX(i,M) = 0.;
      DMX(i,M) = 0.;

      /* Special states: only the NN, JJ, CC loops emit. */
      XMX(i,p7G_E) = 0.;
      XMX(i,p7G_N) = expf(fxprv[p7G_N] + bxcur[p7G_N] + gm->xsc[p7P_N][p7P_LOOP] - overall_sc);
      XMX(i,p7G_J) = expf(fxprv[p7G_J] + bxcur[p7G_J] + gm->xsc[p7P_J][p7P_LOOP] - overall_sc);
      XMX(i,p7G_B) = 0.;
      XMX(i,p7G_C) = expf(fxprv[p7G_C] + bxcur[p7G_C] + gm->xsc[p7P_C][p7P_LOOP] - overall_sc);
      rowsum += XMX(i,p7G_N) + XMX(i,p7G_J) + XMX(i,p7G_C);

      /* Renormalize the row. */
      scale = 1.0 / rowsum;
      for (k = 1; k < M; k++)
        {
          MMX(i,k) *= scale;
          IMX(i,k) *= scale;
        }
      MMX(i,M)     *= scale;
      XMX(i,p7G_N) *= scale;
      XMX(i,p7G_J) *= scale;
      XMX(i,p7G_C) *= scale;
    }
  return eslOK;
}
/* Function:  p7_omx_FDeconvert()
 * Synopsis:  Convert an optimized DP matrix to generic one.
 * Incept:    SRE, Tue Aug 19 17:58:13 2008 [Janelia]
 *
 * Purpose:   Copy the 32-bit float values of optimized DP matrix <ox>
 *            into generic matrix <gx>. Caller provides <gx> with
 *            sufficient space to hold the <ox->M> by <ox->L> matrix.
 *
 *            Each 4-float vector <q> of a row holds model positions
 *            <q+1>, <Q+q+1>, <2Q+q+1>, <3Q+q+1>; positions beyond
 *            <ox->M> are padding and are skipped. Column 0 of every
 *            row is set to <-eslINFINITY>.
 *
 *            This function is used to gain access to the
 *            somewhat more powerful debugging and display
 *            tools available for generic DP matrices.
 *
 * Returns:   <eslOK>; <gx->L> and <gx->M> are set from <ox>.
 */
int
p7_omx_FDeconvert(P7_OMX *ox, P7_GMX *gx)
{
  union { __m128 v; float p[4]; } u;   /* lets us read the four lanes of a vector */
  float **dp  = gx->dp;                /* so {MDI}MX() macros work */
  float  *xmx = gx->xmx;               /* so XMX() macro works     */
  int     Q   = p7O_NQF(ox->M);        /* number of vectors per row */
  int     i;                           /* row                       */
  int     q;                           /* vector within the row     */
  int     r;                           /* lane within the vector    */
  int     k;                           /* model position for (q,r)  */

  for (i = 0; i <= ox->L; i++)
    {
      MMX(i,0) = DMX(i,0) = IMX(i,0) = -eslINFINITY;

      for (q = 0; q < Q; q++)
        {
          /* lane r of vector q is model position k = Q*r + q + 1 */
          u.v = MMO(ox->dpf[i],q);
          for (r = 0, k = q+1; r < 4; r++, k += Q)
            if (k <= ox->M) MMX(i,k) = u.p[r];

          u.v = DMO(ox->dpf[i],q);
          for (r = 0, k = q+1; r < 4; r++, k += Q)
            if (k <= ox->M) DMX(i,k) = u.p[r];

          u.v = IMO(ox->dpf[i],q);
          for (r = 0, k = q+1; r < 4; r++, k += Q)
            if (k <= ox->M) IMX(i,k) = u.p[r];
        }

      /* special states are stored unstriped */
      XMX(i,p7G_E) = ox->xmx[i*p7X_NXCELLS+p7X_E];
      XMX(i,p7G_N) = ox->xmx[i*p7X_NXCELLS+p7X_N];
      XMX(i,p7G_J) = ox->xmx[i*p7X_NXCELLS+p7X_J];
      XMX(i,p7G_B) = ox->xmx[i*p7X_NXCELLS+p7X_B];
      XMX(i,p7G_C) = ox->xmx[i*p7X_NXCELLS+p7X_C];
    }

  gx->L = ox->L;
  gx->M = ox->M;
  return eslOK;
}
/* select_i()
 * Choose the state that precedes I_k at row <i>: compares the
 * M(i-1,k) and I(i-1,k) cells of <gx>, each multiplied by the
 * TSCDELTA() factor for its transition. Returns <p7T_M> if the match
 * path is at least as good as the insert path, else <p7T_I>.
 */
static inline int
select_i(const P7_PROFILE *gm, const P7_GMX *gx, int i, int k)
{
  float      **dp    = gx->dp;    /* so {MDI}MX() macros work */
  float const *tsc   = gm->tsc;   /* so TSCDELTA() macro works */
  float        via_m = TSCDELTA(p7P_MI, k) * MMX(i-1,k);
  float        via_i = TSCDELTA(p7P_II, k) * IMX(i-1,k);

  if (via_m >= via_i) return p7T_M;
  return p7T_I;
}
static inline int select_m(const P7_PROFILE *gm, const P7_GMX *gx, int i, int k) { float **dp = gx->dp; /* so {MDI}MX() macros work */ float *xmx = gx->xmx; /* so XMX() macro works */ float const *tsc = gm->tsc; /* so TSCDELTA() macro works */ float path[4]; int state[4] = { p7T_M, p7T_I, p7T_D, p7T_B }; path[0] = TSCDELTA(p7P_MM, k-1) * MMX(i-1,k-1); path[1] = TSCDELTA(p7P_IM, k-1) * IMX(i-1,k-1); path[2] = TSCDELTA(p7P_DM, k-1) * DMX(i-1,k-1); path[3] = TSCDELTA(p7P_BM, k-1) * XMX(i-1,p7G_B); return state[esl_vec_FArgMax(path, 4)]; }
static inline float get_postprob(const P7_GMX *pp, int scur, int sprv, int k, int i) { float **dp = pp->dp; float *xmx = pp->xmx; switch (scur) { case p7T_M: return MMX(i,k); case p7T_I: return IMX(i,k); case p7T_N: if (sprv == scur) return XMX(i,p7G_N); case p7T_C: if (sprv == scur) return XMX(i,p7G_C); case p7T_J: if (sprv == scur) return XMX(i,p7G_J); default: return 0.0; } }
/* Function:  p7_GViterbi()
 * Synopsis:  The Viterbi algorithm.
 * Incept:    SRE, Tue Jan 30 10:50:53 2007 [Einstein's, St. Louis]
 *
 * Purpose:   The standard Viterbi dynamic programming algorithm.
 *
 *            Given a digital sequence <dsq> of length <L>, a profile
 *            <gm>, and DP matrix <gx> allocated for at least <L>
 *            by <gm->M> cells; calculate the maximum scoring path by
 *            Viterbi; return the Viterbi score in <opt_sc>, and the
 *            Viterbi matrix is in <gx>.
 *
 *            The caller may then retrieve the Viterbi path by calling
 *            <p7_GTrace()>.
 *
 *            The Viterbi lod score is returned in nats. The caller
 *            needs to subtract a null model lod score, then convert
 *            to bits.
 *
 * Args:      dsq    - sequence in digitized form, 1..L
 *            L      - length of dsq
 *            gm     - profile.
 *            gx     - DP matrix with room for an MxL alignment
 *            opt_sc - optRETURN: Viterbi lod score in nats; may be NULL,
 *                     in which case only the matrix is filled
 *
 * Return:    <eslOK> on success. <gx->M> and <gx->L> are set.
 *
 * Note:      I_M is never written here: the k-loop stops at M-1 and the
 *            unrolled node M only sets M_M and D_M.
 */
int
p7_GViterbi(const ESL_DSQ *dsq, int L, const P7_PROFILE *gm, P7_GMX *gx, float *opt_sc)
{
  float const *tsc  = gm->tsc;    /* so TSC() macro works     */
  float      **dp   = gx->dp;     /* so {MDI}MX() macros work */
  float       *xmx  = gx->xmx;    /* so XMX() macro works     */
  int          M    = gm->M;
  int          i,k;
  float        esc  = p7_profile_IsLocal(gm) ? 0 : -eslINFINITY;  /* score of an internal M_k->E exit: free in local mode, forbidden otherwise */

  /* Initialization of the zero row.  */
  XMX(0,p7G_N) = 0;                                           /* S->N, p=1            */
  XMX(0,p7G_B) = gm->xsc[p7P_N][p7P_MOVE];                    /* S->N->B, no N-tail   */
  XMX(0,p7G_E) = XMX(0,p7G_C) = XMX(0,p7G_J) = -eslINFINITY;  /* need seq to get here */
  for (k = 0; k <= gm->M; k++)
    MMX(0,k) = IMX(0,k) = DMX(0,k) = -eslINFINITY;            /* need seq to get here */

  /* DP recursion */
  for (i = 1; i <= L; i++)
    {
      float const *rsc = gm->rsc[dsq[i]];   /* emission scores for residue i; so MSC(), ISC() macros work */
      float sc;

      MMX(i,0) = IMX(i,0) = DMX(i,0) = -eslINFINITY;   /* column 0 is a boundary: unreachable */
      XMX(i,p7G_E) = -eslINFINITY;                     /* E is accumulated as a running max below */

      for (k = 1; k < gm->M; k++)
        {
          /* match state: best of M,I,D at (i-1,k-1) and B at i-1, then add the emission */
          sc       = ESL_MAX(    MMX(i-1,k-1)   + TSC(p7P_MM,k-1),
                                 IMX(i-1,k-1)   + TSC(p7P_IM,k-1));
          sc       = ESL_MAX(sc, DMX(i-1,k-1)   + TSC(p7P_DM,k-1));
          sc       = ESL_MAX(sc, XMX(i-1,p7G_B) + TSC(p7P_BM,k-1));
          MMX(i,k) = sc + MSC(k);

          /* E state update */
          XMX(i,p7G_E) = ESL_MAX(XMX(i,p7G_E), MMX(i,k) + esc);
          /* in Viterbi alignments, Dk->E can't win in local mode (and
           * isn't possible in glocal mode), so don't bother
           * looking.
           */

          /* insert state */
          sc = ESL_MAX(MMX(i-1,k) + TSC(p7P_MI,k),
                       IMX(i-1,k) + TSC(p7P_II,k));
          IMX(i,k) = sc + ISC(k);

          /* delete state: same row, previous column; no emission */
          DMX(i,k) =  ESL_MAX(MMX(i,k-1) + TSC(p7P_MD,k-1),
                              DMX(i,k-1) + TSC(p7P_DD,k-1));
        }

      /* Unrolled match state M. */
      sc       = ESL_MAX(    MMX(i-1,M-1)   + TSC(p7P_MM,M-1),
                             IMX(i-1,M-1)   + TSC(p7P_IM,M-1));
      sc       = ESL_MAX(sc, DMX(i-1,M-1 )  + TSC(p7P_DM,M-1));
      sc       = ESL_MAX(sc, XMX(i-1,p7G_B) + TSC(p7P_BM,M-1));
      MMX(i,M) = sc + MSC(M);

      /* Unrolled delete state D_M
       * (Unlike internal Dk->E transitions that can never appear in
       * Viterbi alignments, D_M->E is possible in glocal mode.)
       */
      DMX(i,M) = ESL_MAX(MMX(i,M-1) + TSC(p7P_MD,M-1),
                         DMX(i,M-1) + TSC(p7P_DD,M-1));

      /* E state update; transition from M_M scores 0 by def'n */
      sc  =          ESL_MAX(XMX(i,p7G_E), MMX(i,M));
      XMX(i,p7G_E) = ESL_MAX(sc,           DMX(i,M));

      /* Now the special states. E must already be done, and B must follow N,J.
       * remember, N, C and J emissions are zero score by definition.
       */
      /* J state */
      sc           =             XMX(i-1,p7G_J) + gm->xsc[p7P_J][p7P_LOOP];   /* J->J */
      XMX(i,p7G_J) = ESL_MAX(sc, XMX(i,  p7G_E) + gm->xsc[p7P_E][p7P_LOOP]);  /* E->J is E's "loop" */

      /* C state */
      sc           =             XMX(i-1,p7G_C) + gm->xsc[p7P_C][p7P_LOOP];   /* C->C */
      XMX(i,p7G_C) = ESL_MAX(sc, XMX(i,  p7G_E) + gm->xsc[p7P_E][p7P_MOVE]);  /* E->C is E's "move" */

      /* N state */
      XMX(i,p7G_N) = XMX(i-1,p7G_N) + gm->xsc[p7P_N][p7P_LOOP];               /* N->N is the only way in */

      /* B state */
      sc           =             XMX(i,p7G_N) + gm->xsc[p7P_N][p7P_MOVE];     /* N->B is N's move */
      XMX(i,p7G_B) = ESL_MAX(sc, XMX(i,p7G_J) + gm->xsc[p7P_J][p7P_MOVE]);    /* J->B is J's move */
    }

  /* T state (not stored): C(L) plus the C->T move is the final score */
  if (opt_sc != NULL) *opt_sc = XMX(L,p7G_C) + gm->xsc[p7P_C][p7P_MOVE];
  gx->M = gm->M;
  gx->L = L;
  return eslOK;
}
/* Function:  p7_GOptimalAccuracy()
 * Synopsis:  Optimal accuracy decoding: fill.
 * Incept:    SRE, Fri Feb 29 11:56:49 2008 [Janelia]
 *
 * Purpose:   Calculates the fill step of the optimal accuracy decoding
 *            algorithm \citep{Kall05}.
 *
 *            Caller provides the posterior decoding matrix <pp>,
 *            which was calculated by Forward/Backward on a target sequence
 *            of length <L> using the query model <gm>.
 *
 *            Caller also provides a DP matrix <gx>, allocated for the
 *            <gm->M> by <pp->L> comparison. The routine fills this in
 *            with OA scores.
 *
 *            Each cell is a sum of posterior probabilities along the
 *            best path into it. A transition contributes a factor of
 *            1.0 when its score in <gm> is finite and FLT_MIN when it
 *            is <-eslINFINITY> (the TSCDELTA() factor for main-model
 *            transitions, <t1>/<t2> for the special states), so paths
 *            through disallowed transitions are squashed toward zero
 *            rather than excluded outright.
 *
 * Args:      gm    - query profile
 *            pp    - posterior decoding matrix (presumably as filled by
 *                    <p7_GDecoding()>; only the M, I, N, J, C cells are read)
 *            gx    - RESULT: caller provided DP matrix for <gm->M> by <L>
 *            ret_e - RETURN: expected number of correctly decoded positions;
 *                    must not be NULL (it is written unconditionally)
 *
 * Returns:   <eslOK> on success, and <*ret_e> contains the final OA
 *            score, which is the expected number of correctly decoded
 *            positions in the target sequence (up to <L>).
 *
 * Throws:    (no abnormal error conditions)
 *
 * Note:      <gx->M> and <gx->L> are not set here, and I_M is never
 *            written (the k-loop stops at M-1; node M sets only M and D).
 */
int
p7_GOptimalAccuracy(const P7_PROFILE *gm, const P7_GMX *pp, P7_GMX *gx, float *ret_e)
{
  int          L    = pp->L;
  float      **dp   = gx->dp;     /* so {MDI}MX() macros work  */
  float       *xmx  = gx->xmx;    /* so XMX() macro works      */
  float const *tsc  = gm->tsc;    /* so TSCDELTA() macro works */
  int          i,k;
  int          M    = gm->M;
  float        esc  = p7_profile_IsLocal(gm) ? 1.0 : 0.0;   /* factor for an internal M_k->E exit: allowed only in local mode */
  float        t1, t2;                                      /* 1.0 / FLT_MIN factors for the two special-state transitions being compared */

  /* Initialization of the zero row (i=0; no residues to account for.  */
  XMX(0,p7G_N) = 0.;                                          /* S->N, p=1            */
  XMX(0,p7G_B) = 0.;                                          /* S->N->B, no N-tail   */
  XMX(0,p7G_E) = XMX(0,p7G_C) = XMX(0,p7G_J) = -eslINFINITY;  /* need seq to get here */
  for (k = 0; k <= M; k++)
    MMX(0,k) = IMX(0,k) = DMX(0,k) = -eslINFINITY;            /* need seq to get here */

  for (i = 1; i <= L; i++)
    {
      MMX(i,0) = IMX(i,0) = DMX(i,0) = XMX(i,p7G_E) = -eslINFINITY;

      for (k = 1; k < M; k++)
        {
          /* match: best predecessor among M,I,D at (i-1,k-1) and B at i-1, plus pp of M_k emitting residue i */
          MMX(i,k)     = ESL_MAX(ESL_MAX(TSCDELTA(p7P_MM, k-1) * (MMX(i-1,k-1)  + pp->dp[i][k*p7G_NSCELLS + p7G_M]),
                                         TSCDELTA(p7P_IM, k-1) * (IMX(i-1,k-1)  + pp->dp[i][k*p7G_NSCELLS + p7G_M])),
                                 ESL_MAX(TSCDELTA(p7P_DM, k-1) * (DMX(i-1,k-1)  + pp->dp[i][k*p7G_NSCELLS + p7G_M]),
                                         TSCDELTA(p7P_BM, k-1) * (XMX(i-1,p7G_B)+ pp->dp[i][k*p7G_NSCELLS + p7G_M])));

          XMX(i,p7G_E) = ESL_MAX(XMX(i,p7G_E),
                                 esc * MMX(i,k));

          /* insert: M or I at (i-1,k), plus pp of I_k emitting residue i */
          IMX(i,k)     = ESL_MAX(TSCDELTA(p7P_MI, k) * (MMX(i-1,k) + pp->dp[i][k*p7G_NSCELLS + p7G_I]),
                                 TSCDELTA(p7P_II, k) * (IMX(i-1,k) + pp->dp[i][k*p7G_NSCELLS + p7G_I]));

          /* delete: emits nothing, so it only carries the predecessor's value forward */
          DMX(i,k)     = ESL_MAX(TSCDELTA(p7P_MD, k-1) * MMX(i,k-1),
                                 TSCDELTA(p7P_DD, k-1) * DMX(i,k-1));
        }

      /* last node (k=M) is unrolled; it has no I state, and it has a p=1.0 {MD}->E transition even in local mode */
      MMX(i,M)     = ESL_MAX(ESL_MAX(TSCDELTA(p7P_MM, M-1) * (MMX(i-1,M-1)  + pp->dp[i][M*p7G_NSCELLS + p7G_M]),
                                     TSCDELTA(p7P_IM, M-1) * (IMX(i-1,M-1)  + pp->dp[i][M*p7G_NSCELLS + p7G_M])),
                             ESL_MAX(TSCDELTA(p7P_DM, M-1) * (DMX(i-1,M-1)  + pp->dp[i][M*p7G_NSCELLS + p7G_M]),
                                     TSCDELTA(p7P_BM, M-1) * (XMX(i-1,p7G_B)+ pp->dp[i][M*p7G_NSCELLS + p7G_M])));

      DMX(i,M)     = ESL_MAX(TSCDELTA(p7P_MD, M-1) * MMX(i,M-1),
                             TSCDELTA(p7P_DD, M-1) * DMX(i,M-1));

      /* note: we calculated XMX before DMX in the loop, because we probably had MMX(i,k) in a register.
       * but now we can't do that, because XMX depends on DMX
       */
      XMX(i,p7G_E) = ESL_MAX(XMX(i,p7G_E), ESL_MAX(MMX(i,M), DMX(i, M)));

      /* now the special states; it's important that E is already done, and B is done after N,J */
      /* J: either J->J (emits residue i, so add its pp) or E->J (no emission) */
      t1 = ( (gm->xsc[p7P_J][p7P_LOOP] == -eslINFINITY) ? FLT_MIN : 1.0);
      t2 = ( (gm->xsc[p7P_E][p7P_LOOP] == -eslINFINITY) ? FLT_MIN : 1.0);
      XMX(i, p7G_J) = ESL_MAX( t1 * (XMX(i-1,p7G_J) + pp->xmx[i*p7G_NXCELLS + p7G_J]),
                               t2 * XMX(i,   p7G_E));

      /* C: either C->C (emits residue i) or E->C (no emission) */
      t1 = ( (gm->xsc[p7P_C][p7P_LOOP] == -eslINFINITY) ? FLT_MIN : 1.0);
      t2 = ( (gm->xsc[p7P_E][p7P_MOVE] == -eslINFINITY) ? FLT_MIN : 1.0);
      XMX(i,p7G_C) = ESL_MAX( t1 * (XMX(i-1,p7G_C) + pp->xmx[i*p7G_NXCELLS + p7G_C]),
                              t2 * XMX(i,   p7G_E));

      /* N: only N->N, which emits residue i */
      t1 = ( (gm->xsc[p7P_N][p7P_LOOP] == -eslINFINITY) ? FLT_MIN : 1.0);
      XMX(i,p7G_N) = t1 * (XMX(i-1,p7G_N) + pp->xmx[i*p7G_NXCELLS + p7G_N]);

      /* B: from N or J on this row; no emission */
      t1 = ( (gm->xsc[p7P_N][p7P_MOVE] == -eslINFINITY) ? FLT_MIN : 1.0);
      t2 = ( (gm->xsc[p7P_J][p7P_MOVE] == -eslINFINITY) ? FLT_MIN : 1.0);
      XMX(i,p7G_B) = ESL_MAX( t1 * XMX(i,   p7G_N),
                              t2 * XMX(i,   p7G_J));
    }

  /* The OA score is C(L); the final C->T move adds nothing. */
  *ret_e = XMX(L,p7G_C);
  return eslOK;
}
/* Function: p7_GStochasticTrace() * Synopsis: Stochastic traceback of a Forward matrix. * Incept: SRE, Thu Jan 3 15:39:20 2008 [Janelia] * * Purpose: Stochastic traceback of Forward matrix <gx> to * sample an alignment of digital sequence <dsq> * (of length <L>) to the profile <gm>. * * The sampled traceback is returned in <tr>, which the * caller must have at least made an initial allocation of * (the <tr> will be grown as needed here). * * Args: r - source of random numbers * dsq - digital sequence aligned to, 1..L * L - length of dsq * gm - profile * mx - Forward matrix to trace, L x M * tr - storage for the recovered traceback. * * Returns: <eslOK> on success. */ int p7_GStochasticTrace(ESL_RANDOMNESS *r, const ESL_DSQ *dsq, int L, const P7_PROFILE *gm, const P7_GMX *gx, P7_TRACE *tr) { int status; int i; /* position in seq (1..L) */ int k; /* position in model (1..M) */ int M = gm->M; float **dp = gx->dp; float *xmx = gx->xmx; float const *tsc = gm->tsc; float *sc; /* scores of possible choices: up to 2M-1, in the case of exits to E */ int scur, sprv; /* we'll index M states as 1..M, and D states as 2..M = M+2..2M: M0, D1 are impossibles. */ ESL_ALLOC(sc, sizeof(float) * (2*M+1)); k = 0; i = L; if ((status = p7_trace_Append(tr, p7T_T, k, i)) != eslOK) goto ERROR; if ((status = p7_trace_Append(tr, p7T_C, k, i)) != eslOK) goto ERROR; sprv = p7T_C; while (sprv != p7T_S) { switch (tr->st[tr->N-1]) { /* C(i) comes from C(i-1) or E(i) */ case p7T_C: if (XMX(i,p7G_C) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible C reached at i=%d", i); sc[0] = XMX(i-1, p7G_C) + gm->xsc[p7P_C][p7P_LOOP]; sc[1] = XMX(i, p7G_E) + gm->xsc[p7P_E][p7P_MOVE]; esl_vec_FLogNorm(sc, 2); scur = (esl_rnd_FChoose(r, sc, 2) == 0) ? p7T_C : p7T_E; break; /* E connects from any M or D state. 
k set here */ case p7T_E: if (XMX(i, p7G_E) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible E reached at i=%d", i); if (p7_profile_IsLocal(gm)) { /* local models come from any M, D */ sc[0] = sc[M+1] = -eslINFINITY; for (k = 1; k <= M; k++) sc[k] = MMX(i,k); for (k = 2; k <= M; k++) sc[k+M] = DMX(i,k); esl_vec_FLogNorm(sc, 2*M+1); /* now sc is a prob vector */ k = esl_rnd_FChoose(r, sc, 2*M+1); if (k <= M) scur = p7T_M; else { k -= M; scur = p7T_D; } } else { /* glocal models come from M_M or D_M */ k = M; sc[0] = MMX(i,M); sc[1] = DMX(i,M); esl_vec_FLogNorm(sc, 2); /* now sc is a prob vector */ scur = (esl_rnd_FChoose(r, sc, 2) == 0) ? p7T_M : p7T_D; } break; /* M connects from {MDI} i-1,k-1, or B */ case p7T_M: if (MMX(i,k) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible M reached at k=%d,i=%d", k,i); sc[0] = XMX(i-1,p7G_B) + TSC(p7P_BM, k-1); sc[1] = MMX(i-1,k-1) + TSC(p7P_MM, k-1); sc[2] = IMX(i-1,k-1) + TSC(p7P_IM, k-1); sc[3] = DMX(i-1,k-1) + TSC(p7P_DM, k-1); esl_vec_FLogNorm(sc, 4); switch (esl_rnd_FChoose(r, sc, 4)) { case 0: scur = p7T_B; break; case 1: scur = p7T_M; break; case 2: scur = p7T_I; break; case 3: scur = p7T_D; break; } k--; i--; break; /* D connects from M,D at i,k-1 */ case p7T_D: if (DMX(i, k) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible D reached at k=%d,i=%d", k,i); sc[0] = MMX(i, k-1) + TSC(p7P_MD, k-1); sc[1] = DMX(i, k-1) + TSC(p7P_DD, k-1); esl_vec_FLogNorm(sc, 2); scur = (esl_rnd_FChoose(r, sc, 2) == 0) ? p7T_M : p7T_D; k--; break; /* I connects from M,I at i-1,k */ case p7T_I: if (IMX(i,k) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible I reached at k=%d,i=%d", k,i); sc[0] = MMX(i-1,k) + TSC(p7P_MI, k); sc[1] = IMX(i-1,k) + TSC(p7P_II, k); esl_vec_FLogNorm(sc, 2); scur = (esl_rnd_FChoose(r, sc, 2) == 0) ? p7T_M : p7T_I; i--; break; /* N connects from S, N */ case p7T_N: if (XMX(i, p7G_N) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible N reached at i=%d", i); scur = (i == 0) ? 
p7T_S : p7T_N; break; /* B connects from N, J */ case p7T_B: if (XMX(i,p7G_B) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible B reached at i=%d", i); sc[0] = XMX(i, p7G_N) + gm->xsc[p7P_N][p7P_MOVE]; sc[1] = XMX(i, p7G_J) + gm->xsc[p7P_J][p7P_MOVE]; esl_vec_FLogNorm(sc, 2); scur = (esl_rnd_FChoose(r, sc, 2) == 0) ? p7T_N : p7T_J; break; /* J connects from E(i) or J(i-1) */ case p7T_J: if (XMX(i,p7G_J) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible J reached at i=%d", i); sc[0] = XMX(i-1,p7G_J) + gm->xsc[p7P_J][p7P_LOOP]; sc[1] = XMX(i, p7G_E) + gm->xsc[p7P_E][p7P_LOOP]; esl_vec_FLogNorm(sc, 2); scur = (esl_rnd_FChoose(r, sc, 2) == 0) ? p7T_J : p7T_E; break; default: ESL_XEXCEPTION(eslFAIL, "bogus state in traceback"); } /* end switch over statetype[tpos-1] */ /* Append this state and the current i,k to be explained to the growing trace */ if ((status = p7_trace_Append(tr, scur, k, i)) != eslOK) goto ERROR; /* For NCJ, we had to defer i decrement. */ if ( (scur == p7T_N || scur == p7T_J || scur == p7T_C) && scur == sprv) i--; sprv = scur; } /* end traceback, at S state */ if ((status = p7_trace_Reverse(tr)) != eslOK) goto ERROR; tr->M = gm->M; tr->L = L; free(sc); return eslOK; ERROR: if (sc != NULL) free(sc); return status; }
/* Function:  p7_GViterbi_longtarget()
 * Synopsis:  Viterbi fill that records above-threshold positions as windows.
 * Incept:    SRE, Tue Jan 30 10:50:53 2007 [Einstein's, St. Louis]
 *
 * Purpose:   Given a digital sequence <dsq> of length <L>, a profile
 *            <gm>, and DP matrix <gx> allocated for at least <L>
 *            by <gm->M> cells; calculates the Viterbi score for
 *            regions of <dsq>, and captures the positions at which
 *            such regions exceed the score required to be
 *            significant in the eyes of the calling function
 *            (usually p=0.001).
 *
 *            The main-model recursion is the same as in
 *            <p7_GViterbi()>. After each row's E cell is known it is
 *            compared against a score threshold derived from <P>;
 *            on a hit, a window is added for every k whose M(i,k)
 *            equals E(i) exactly, and the special-state update for
 *            that row is skipped.
 *
 * Args:      dsq        - sequence in digitized form, 1..L
 *            L          - length of dsq
 *            gm         - profile.
 *            gx         - DP matrix with room for an MxL alignment
 *            filtersc   - null or bias correction, required for translating a P-value threshold into a score threshold
 *            P          - p-value below which a region is captured as being above threshold
 *            windowlist - RETURN: array of hit windows (start and end of diagonal) for the above-threshold areas
 *
 * Return:    <eslOK> on success. No score is returned; <gx->M> and
 *            <gx->L> are set.
 *
 * NOTE(review): on a threshold hit, XMX(i,J/C/N/B) are not assigned for
 *            that row, yet row i+1 reads XMX(i,p7G_B), XMX(i,p7G_J),
 *            XMX(i,p7G_C) and XMX(i,p7G_N). Whatever <gx> held there
 *            before the call is used. Confirm whether a reset of
 *            these cells was intended.
 */
int
p7_GViterbi_longtarget(const ESL_DSQ *dsq, int L, const P7_PROFILE *gm, P7_GMX *gx,
                       float filtersc, double P, P7_HMM_WINDOWLIST *windowlist)
{
  float const *tsc  = gm->tsc;    /* so TSC() macro works     */
  float      **dp   = gx->dp;     /* so {MDI}MX() macros work */
  float       *xmx  = gx->xmx;    /* so XMX() macro works     */
  int          M    = gm->M;
  int          i,k;
  float        esc  = p7_profile_IsLocal(gm) ? 0 : -eslINFINITY;  /* score of an internal M_k->E exit */

  int16_t sc_thresh;   /* E-state score at or above which a row counts as a hit */
  float invP;          /* inverse Gumbel survival of <P> */

  /* Initialization of the zero row.  */
  XMX(0,p7G_N) = 0;                                           /* S->N, p=1            */
  XMX(0,p7G_B) = gm->xsc[p7P_N][p7P_MOVE];                    /* S->N->B, no N-tail   */
  XMX(0,p7G_E) = XMX(0,p7G_C) = XMX(0,p7G_J) = -eslINFINITY;  /* need seq to get here */
  for (k = 0; k <= gm->M; k++)
    MMX(0,k) = IMX(0,k) = DMX(0,k) = -eslINFINITY;            /* need seq to get here */

  /*
   *  In p7_ViterbiFilter, converting from a scaled int Viterbi score
   *  S (aka xE the score getting to state E) to a probability
   *  goes like this:
   *    S   = XMX(i,p7G_E)
   *    vsc = S + gm->xsc[p7P_E][p7P_MOVE] + gm->xsc[p7P_C][p7P_MOVE];
   *    P   = esl_gumbel_surv((vsc - filtersc) / eslCONST_LOG2, gm->evparam[p7_VMU], gm->evparam[p7_VLAMBDA]);
   *  and we're computing the threshold vsc, so invert it:
   *    (vsc - filtersc) / eslCONST_LOG2 = esl_gumbel_invsurv( P, gm->evparam[p7_VMU], gm->evparam[p7_VLAMBDA])
   *    vsc = filtersc + eslCONST_LOG2 * esl_gumbel_invsurv( P, gm->evparam[p7_VMU], gm->evparam[p7_VLAMBDA])
   *    S   = vsc - gm->xsc[p7P_E][p7P_MOVE] - gm->xsc[p7P_C][p7P_MOVE]
   *
   *  NOTE(review): the result is rounded up, cast to int, then stored in
   *  an int16_t and compared against float cells; a threshold outside
   *  the int16_t range would not survive the narrowing. Confirm the
   *  expected range.
   */
  invP = esl_gumbel_invsurv(P, gm->evparam[p7_VMU],  gm->evparam[p7_VLAMBDA]);
  sc_thresh =   (int) ceil (filtersc + (eslCONST_LOG2 * invP)
                  - gm->xsc[p7P_E][p7P_MOVE] - gm->xsc[p7P_C][p7P_MOVE] );

  /* DP recursion */
  for (i = 1; i <= L; i++)
    {
      float const *rsc = gm->rsc[dsq[i]];   /* emission scores for residue i; so MSC(), ISC() macros work */
      float sc;

      MMX(i,0) = IMX(i,0) = DMX(i,0) = -eslINFINITY;
      XMX(i,p7G_E) = -eslINFINITY;

      for (k = 1; k < gm->M; k++)
        {
          /* match state */
          sc       = ESL_MAX(    MMX(i-1,k-1)   + TSC(p7P_MM,k-1),
                                 IMX(i-1,k-1)   + TSC(p7P_IM,k-1));
          sc       = ESL_MAX(sc, DMX(i-1,k-1)   + TSC(p7P_DM,k-1));
          sc       = ESL_MAX(sc, XMX(i-1,p7G_B) + TSC(p7P_BM,k-1));
          MMX(i,k) = sc + MSC(k);

          /* E state update */
          XMX(i,p7G_E) = ESL_MAX(XMX(i,p7G_E), MMX(i,k) + esc);
          /* in Viterbi alignments, Dk->E can't win in local mode (and
           * isn't possible in glocal mode), so don't bother
           * looking.
           */

          /* insert state */
          sc = ESL_MAX(MMX(i-1,k) + TSC(p7P_MI,k),
                       IMX(i-1,k) + TSC(p7P_II,k));
          IMX(i,k) = sc + ISC(k);

          /* delete state */
          DMX(i,k) =  ESL_MAX(MMX(i,k-1) + TSC(p7P_MD,k-1),
                              DMX(i,k-1) + TSC(p7P_DD,k-1));
        }

      /* Unrolled match state M. */
      sc       = ESL_MAX(    MMX(i-1,M-1)   + TSC(p7P_MM,M-1),
                             IMX(i-1,M-1)   + TSC(p7P_IM,M-1));
      sc       = ESL_MAX(sc, DMX(i-1,M-1 )  + TSC(p7P_DM,M-1));
      sc       = ESL_MAX(sc, XMX(i-1,p7G_B) + TSC(p7P_BM,M-1));
      MMX(i,M) = sc + MSC(M);

      /* Unrolled delete state D_M
       * (Unlike internal Dk->E transitions that can never appear in
       * Viterbi alignments, D_M->E is possible in glocal mode.)
       */
      DMX(i,M) = ESL_MAX(MMX(i,M-1) + TSC(p7P_MD,M-1),
                         DMX(i,M-1) + TSC(p7P_DD,M-1));

      /* E state update; transition from M_M scores 0 by def'n */
      sc  =          ESL_MAX(XMX(i,p7G_E), MMX(i,M));
      XMX(i,p7G_E) = ESL_MAX(sc,           DMX(i,M));

      if (XMX(i,p7G_E) >= sc_thresh) {
        /* hit score threshold. Add a window to the list, then reset scores.
         * A window is added for each k whose match score equals E(i)
         * exactly; if E(i) came from D_M, no k matches and no window is added.
         * The return value of p7_hmmwindow_new() is not checked.
         */
        for (k = 1; k <= gm->M; k++) {
          if (MMX(i,k) == XMX(i,p7G_E)) {
            p7_hmmwindow_new(windowlist, 0, i, 0, k, 1, 0.0, p7_NOCOMPLEMENT );
          }
          /* NOTE(review): this resets column 0, which already holds
           * -eslINFINITY for this row, so it changes nothing; a reset
           * of column k may have been intended - confirm.
           */
          MMX(i,0) = IMX(i,0) = DMX(i,0) = -eslINFINITY;
        }
      } else {

        /* Now the special states. E must already be done, and B must follow N,J.
         * remember, N, C and J emissions are zero score by definition.
         */
        /* J state */
        sc           =             XMX(i-1,p7G_J) + gm->xsc[p7P_J][p7P_LOOP];   /* J->J */
        XMX(i,p7G_J) = ESL_MAX(sc, XMX(i,  p7G_E) + gm->xsc[p7P_E][p7P_LOOP]);  /* E->J is E's "loop" */

        /* C state */
        sc           =             XMX(i-1,p7G_C) + gm->xsc[p7P_C][p7P_LOOP];
        XMX(i,p7G_C) = ESL_MAX(sc, XMX(i,  p7G_E) + gm->xsc[p7P_E][p7P_MOVE]);

        /* N state */
        XMX(i,p7G_N) = XMX(i-1,p7G_N) + gm->xsc[p7P_N][p7P_LOOP];

        /* B state */
        sc           =             XMX(i,p7G_N) + gm->xsc[p7P_N][p7P_MOVE];     /* N->B is N's move */
        XMX(i,p7G_B) = ESL_MAX(sc, XMX(i,p7G_J) + gm->xsc[p7P_J][p7P_MOVE]);    /* J->B is J's move */
      }
    }

  /* T state (not stored) */
  gx->M = gm->M;
  gx->L = L;
  return eslOK;
}
/* Function:  p7_GTrace()
 * Incept:    SRE, Thu Feb  1 10:25:56 2007 [UA 8018 St. Louis to Dulles]
 *
 * Purpose:   Traceback of a Viterbi matrix: retrieval
 *            of optimum alignment.
 *
 *            This function is currently implemented as a
 *            reconstruction traceback, rather than using a shadow
 *            matrix. Because H3 uses floating point scores, and we
 *            can't compare floats for equality, we have to compare
 *            floats for near-equality and therefore, formally, we can
 *            only guarantee a near-optimal traceback. However, even in
 *            the unlikely event that a suboptimal is returned, the
 *            score difference from true optimal will be negligible.
 *
 *            The trace is built backwards, starting from T and C at
 *            row <L>, and reversed at the end. For each state, the
 *            candidate predecessors are tested in a fixed order and
 *            the first one that reproduces the cell's score (within
 *            the tolerance) is taken.
 *
 * Args:      dsq    - digital sequence aligned to, 1..L
 *            L      - length of <dsq>
 *            gm     - profile
 *            gx     - Viterbi matrix to trace, L x M
 *            tr     - storage for the recovered traceback.
 *
 * Return:    <eslOK> on success; <tr->M> and <tr->L> are set.
 *
 * Throws:    <eslFAIL> (via ESL_EXCEPTION) if a cell on the path is
 *            <-eslINFINITY>, or if no predecessor reproduces a cell's
 *            score. A non-OK status from <p7_trace_Append()> or
 *            <p7_trace_Reverse()> is returned as is. With
 *            <p7_DEBUGGING> defined, <eslEINVAL> if <tr> is not empty.
 *            NOTE(review): earlier documentation said the trace is set
 *            blank (<tr->N = 0>) on failure; nothing in this function
 *            resets <tr>, so states appended before the failure
 *            remain in it - confirm what callers expect.
 *
 * Note:      Care is taken to evaluate the prev+tsc+emission
 *            calculations in exactly the same order that Viterbi did
 *            them, lest you get numerical problems with
 *            a+b+c = d; d-c != a+b because d,c are nearly equal.
 *            (This bug appeared in dev: xref J1/121.)
 */
int
p7_GTrace(const ESL_DSQ *dsq, int L, const P7_PROFILE *gm, const P7_GMX *gx, P7_TRACE *tr)
{
  int          i   = L;         /* position in seq (1..L)         */
  int          k   = 0;         /* position in model (1..M)       */
  int          M   = gm->M;
  float      **dp  = gx->dp;    /* so {MDI}MX() macros work       */
  float       *xmx = gx->xmx;   /* so XMX() macro works           */
  float        tol = 1e-5;      /* floating point "equality" test */
  float const *tsc = gm->tsc;   /* so TSC() macro works           */
  int     sprv, scur;           /* previous, current state in trace */
  int     status;

#ifdef p7_DEBUGGING
  if (tr->N != 0) ESL_EXCEPTION(eslEINVAL, "trace isn't empty: forgot to Reuse()?");
#endif

  /* The trace ends ...C,T at row L; these are the first two states appended. */
  if ((status = p7_trace_Append(tr, p7T_T, k, i)) != eslOK) return status;
  if ((status = p7_trace_Append(tr, p7T_C, k, i)) != eslOK) return status;
  sprv = p7T_C;
  while (sprv != p7T_S)
    {
      float const *rsc = (i>0 ? gm->rsc[dsq[i]] : NULL);   /* so MSC(), ISC() macros work; no residue at i=0 */

      switch (sprv) {
      case p7T_C:               /* C(i) comes from C(i-1) or E(i) */
        if   (XMX(i,p7G_C) == -eslINFINITY) ESL_EXCEPTION(eslFAIL, "impossible C reached at i=%d", i);

        if      (esl_FCompare(XMX(i, p7G_C), XMX(i-1, p7G_C) + gm->xsc[p7P_C][p7P_LOOP], tol) == eslOK)  scur = p7T_C;
        else if (esl_FCompare(XMX(i, p7G_C), XMX(i,   p7G_E) + gm->xsc[p7P_E][p7P_MOVE], tol) == eslOK)  scur = p7T_E;
        else ESL_EXCEPTION(eslFAIL, "C at i=%d couldn't be traced", i);
        break;

      case p7T_E:               /* E connects from any M state. k set here */
        if (XMX(i, p7G_E) == -eslINFINITY) ESL_EXCEPTION(eslFAIL, "impossible E reached at i=%d", i);

        if (p7_profile_IsLocal(gm))
          {
            scur = p7T_M;       /* can't come from D, in a *local* Viterbi trace. */
            /* scan from k=M downward; the first match state whose score equals E(i) wins */
            for (k = M; k >= 1; k--) if (esl_FCompare(XMX(i, p7G_E), MMX(i,k), tol) == eslOK) break;
            if (k == 0) ESL_EXCEPTION(eslFAIL, "E at i=%d couldn't be traced", i);
          }
        else                    /* glocal mode: we either come from D_M or M_M */
          {
            if      (esl_FCompare(XMX(i, p7G_E), MMX(i,M), tol) == eslOK) { scur = p7T_M; k = M; }
            else if (esl_FCompare(XMX(i, p7G_E), DMX(i,M), tol) == eslOK) { scur = p7T_D; k = M; }
            else    ESL_EXCEPTION(eslFAIL, "E at i=%d couldn't be traced", i);
          }
        break;

      case p7T_M:               /* M connects from i-1,k-1, or B */
        if (MMX(i,k) == -eslINFINITY) ESL_EXCEPTION(eslFAIL, "impossible M reached at k=%d,i=%d", k,i);

        if      (esl_FCompare(MMX(i,k), XMX(i-1,p7G_B) + TSC(p7P_BM, k-1) + MSC(k), tol) == eslOK) scur = p7T_B;
        else if (esl_FCompare(MMX(i,k), MMX(i-1,k-1)   + TSC(p7P_MM, k-1) + MSC(k), tol) == eslOK) scur = p7T_M;
        else if (esl_FCompare(MMX(i,k), IMX(i-1,k-1)   + TSC(p7P_IM, k-1) + MSC(k), tol) == eslOK) scur = p7T_I;
        else if (esl_FCompare(MMX(i,k), DMX(i-1,k-1)   + TSC(p7P_DM, k-1) + MSC(k), tol) == eslOK) scur = p7T_D;
        else ESL_EXCEPTION(eslFAIL, "M at k=%d,i=%d couldn't be traced", k,i);
        k--; i--;               /* M consumed one residue and one model node */
        break;

      case p7T_D:               /* D connects from M,D at i,k-1 */
        if (DMX(i, k) == -eslINFINITY) ESL_EXCEPTION(eslFAIL, "impossible D reached at k=%d,i=%d", k,i);

        if      (esl_FCompare(DMX(i,k), MMX(i, k-1) + TSC(p7P_MD, k-1), tol) == eslOK) scur = p7T_M;
        else if (esl_FCompare(DMX(i,k), DMX(i, k-1) + TSC(p7P_DD, k-1), tol) == eslOK) scur = p7T_D;
        else ESL_EXCEPTION(eslFAIL, "D at k=%d,i=%d couldn't be traced", k,i);
        k--;                    /* D consumed a model node but no residue */
        break;

      case p7T_I:               /* I connects from M,I at i-1,k*/
        if (IMX(i,k) == -eslINFINITY) ESL_EXCEPTION(eslFAIL, "impossible I reached at k=%d,i=%d", k,i);

        if      (esl_FCompare(IMX(i,k), MMX(i-1,k) + TSC(p7P_MI, k) + ISC(k), tol) == eslOK) scur = p7T_M;
        else if (esl_FCompare(IMX(i,k), IMX(i-1,k) + TSC(p7P_II, k) + ISC(k), tol) == eslOK) scur = p7T_I;
        else ESL_EXCEPTION(eslFAIL, "I at k=%d,i=%d couldn't be traced", k,i);
        i--;                    /* I consumed a residue but no model node */
        break;

      case p7T_N:               /* N connects from S, N */
        if (XMX(i, p7G_N) == -eslINFINITY) ESL_EXCEPTION(eslFAIL, "impossible N reached at i=%d", i);
        scur = ( (i == 0) ? p7T_S : p7T_N);
        break;

      case p7T_B:               /* B connects from N, J */
        if (XMX(i,p7G_B) == -eslINFINITY) ESL_EXCEPTION(eslFAIL, "impossible B reached at i=%d", i);

        if      (esl_FCompare(XMX(i,p7G_B), XMX(i, p7G_N) + gm->xsc[p7P_N][p7P_MOVE], tol) == eslOK) scur = p7T_N;
        else if (esl_FCompare(XMX(i,p7G_B), XMX(i, p7G_J) + gm->xsc[p7P_J][p7P_MOVE], tol) == eslOK) scur = p7T_J;
        else  ESL_EXCEPTION(eslFAIL, "B at i=%d couldn't be traced", i);
        break;

      case p7T_J:               /* J connects from E(i) or J(i-1) */
        if (XMX(i,p7G_J) == -eslINFINITY) ESL_EXCEPTION(eslFAIL, "impossible J reached at i=%d", i);

        if      (esl_FCompare(XMX(i,p7G_J), XMX(i-1,p7G_J) + gm->xsc[p7P_J][p7P_LOOP], tol) == eslOK) scur = p7T_J;
        else if (esl_FCompare(XMX(i,p7G_J), XMX(i,  p7G_E) + gm->xsc[p7P_E][p7P_LOOP], tol) == eslOK) scur = p7T_E;
        else  ESL_EXCEPTION(eslFAIL, "J at i=%d couldn't be traced", i);
        break;

      default: ESL_EXCEPTION(eslFAIL, "bogus state in traceback");
      } /* end switch over statetype[tpos-1] */

      /* Append this state and the current i,k to be explained to the growing trace */
      if ((status = p7_trace_Append(tr, scur, k, i)) != eslOK) return status;

      /* For NCJ, we had to defer i decrement: only a repeated N, J, or C
       * (a self-loop) accounts for a residue.
       */
      if ( (scur == p7T_N || scur == p7T_J || scur == p7T_C) && scur == sprv) i--;

      sprv = scur;
    } /* end traceback, at S state */

  tr->M = gm->M;
  tr->L = L;
  return p7_trace_Reverse(tr);   /* the trace was built back to front */
}
/* Function:  p7_GNull2_ByExpectation()
 * Synopsis:  Calculate null2 model from posterior probabilities.
 * Incept:    SRE, Thu Feb 28 09:52:28 2008 [Janelia]
 *
 * Purpose:   Build the ad hoc "null2" model for the envelope covered by
 *            posterior probability matrix <pp> of profile <gm>, and
 *            return its odds ratios $\frac{f'(x)}{f(x)}$ in <null2>.
 *            The caller supplies <null2> with room for at least
 *            <gm->abc->Kp> values.
 *
 *            The expectation method is meant for simple, well resolved
 *            regions: a single envelope, for which no stochastic
 *            traceback clustering was needed.
 *
 *            <pp> must have been computed for the envelope alone, so
 *            that its rows are numbered <1..Ld> for an envelope
 *            <ienv..jenv> of length <Ld = jenv-ienv+1>; <Ld> is read
 *            from <pp->L>.
 *
 *            Method: rows <1..Ld> of <pp> are summed into row 0 to get
 *            the expected number of uses of each M, I, N, C, J state;
 *            these are turned into log frequencies (log count minus
 *            log Ld); and for each residue <x> the emission scores of
 *            all M and I states are log-summed, weighted by those log
 *            frequencies. N, C and J contribute their log frequency
 *            only (no emission score is added for them).
 *
 * Args:      gm    - profile, in any mode, target length model set to <L>
 *            pp    - posterior prob matrix, for <gm> against the domain
 *                    envelope <dsq+i-1> (offset); row 0 is overwritten
 *            null2 - RETURN: null2 odds ratios per residue, <0..Kp-1>;
 *                    caller allocated space
 *
 * Returns:   <eslOK> on success; <null2> contains the null2 odds ratios.
 *            Row 0 of <pp> (both <dp[0]> and the first <p7G_NXCELLS>
 *            entries of <xmx>) has been used as scratch space and is left
 *            holding the log of the expected per-residue usage frequency
 *            of each M, I, N, C, J state.
 *
 * Throws:    (no abnormal error conditions)
 *
 * Notes:     NOTE(review): this routine calls p7_FLogsum() without calling
 *            p7_FLogsumInit(); presumably the caller has already done
 *            so - confirm.
 */
int
p7_GNull2_ByExpectation(const P7_PROFILE *gm, P7_GMX *pp, float *null2)
{
  float      **dp         = pp->dp;                 /* so that {MI}MX() macros work       */
  float       *xmx        = pp->xmx;                /* so that XMX() macro works          */
  const int    M          = gm->M;
  const int    Ld         = pp->L;
  const int    nsym       = gm->abc->K;             /* canonical residues 0..K-1          */
  const int    ncore      = (M+1) * p7G_NSCELLS;    /* width of one row of core cells     */
  const float  neg_log_Ld = -log((float) Ld);       /* turns log counts into log freqs    */
  float        special_term;                        /* log(freq_N + freq_C + freq_J)      */
  float        acc;                                 /* running log-sum for one residue    */
  int          sym;                                 /* over symbols 0..K-1                */
  int          row;                                 /* over envelope rows 2..Ld           */
  int          node;                                /* over M states 1..M, I states 1..M-1 */

  /* Sum rows 1..Ld into row 0: expected number of times each emitting
   * state generated one of the Ld residues. Row 1 is copied, then rows
   * 2..Ld are added on top of it.
   */
  esl_vec_FCopy(dp[1],             ncore,        dp[0]);
  esl_vec_FCopy(xmx + p7G_NXCELLS, p7G_NXCELLS,  xmx);
  for (row = 2; row <= Ld; row++)
    {
      esl_vec_FAdd(dp[0], dp[row],                   ncore);
      esl_vec_FAdd(xmx,   xmx + row * p7G_NXCELLS,   p7G_NXCELLS);
    }

  /* Expected counts -> log frequencies; these are the log posterior
   * weights used below.
   */
  esl_vec_FLog      (dp[0], ncore);
  esl_vec_FIncrement(dp[0], ncore, neg_log_Ld);
  esl_vec_FLog      (xmx,   p7G_NXCELLS);
  esl_vec_FIncrement(xmx,   p7G_NXCELLS, neg_log_Ld);

  /* N, C and J contribute the same term to every residue. */
  special_term = p7_FLogsum(p7_FLogsum(XMX(0,p7G_N), XMX(0,p7G_C)), XMX(0,p7G_J));

  /* Posterior-weighted log-sum over every emission vector used in paths
   * explaining the domain. This is slow; a candidate for optimization.
   */
  for (sym = 0; sym < nsym; sym++)
    {
      acc = -eslINFINITY;
      for (node = 1; node < M; node++)
	{
	  acc = p7_FLogsum(acc, MMX(0,node) + p7P_MSC(gm, node, sym));
	  acc = p7_FLogsum(acc, IMX(0,node) + p7P_ISC(gm, node, sym));
	}
      /* Last match state has no insert partner. <node> is left at M by
       * the loop above for any model with M >= 1.
       */
      acc        = p7_FLogsum(acc, MMX(0,M) + p7P_MSC(gm, node, sym));
      null2[sym] = p7_FLogsum(acc, special_term);
    }

  /* Back to probability space: null2[x] = f_d(x) / f_0(x) for x = 0..K-1,
   * where f_d(x) are the ad hoc "null2" residue frequencies of this
   * envelope.
   */
  esl_vec_FExp(null2, nsym);

  /* Degenerate residues get averaged odds ratios; this call does not set
   * the gap, nonresidue, or missing-data entries, so do that by hand.
   */
  esl_abc_FAvgScVec(gm->abc, null2);
  null2[gm->abc->K]    = 1.0;        /* gap character    */
  null2[gm->abc->Kp-2] = 1.0;        /* nonresidue "*"   */
  null2[gm->abc->Kp-1] = 1.0;        /* missing data "~" */

  return eslOK;
}
/* Function:  p7_GForward()
 * Synopsis:  The Forward algorithm.
 * Incept:    SRE, Mon Apr 16 13:57:35 2007 [Janelia]
 *
 * Purpose:   The Forward dynamic programming algorithm.
 *
 *            Given a digital sequence <dsq> of length <L>, a profile
 *            <gm>, and a DP matrix <gx> allocated for at least <gm->M>
 *            by <L> cells, fill <gx> with the Forward matrix (rows
 *            <0..L>, nodes <0..M>) and optionally return the Forward
 *            score in <opt_sc>.
 *
 *            The Forward score is in lod score form. To convert to a
 *            bitscore, the caller needs to subtract a null model lod
 *            score, then convert to bits.
 *
 * Args:      dsq    - sequence in digitized form, 1..L
 *            L      - length of dsq
 *            gm     - profile
 *            gx     - DP matrix with room for an MxL alignment
 *            opt_sc - optRETURN: Forward lod score in nats; may be NULL
 *
 * Return:    <eslOK> on success. <gx->M> and <gx->L> are set to the
 *            dimensions that were filled.
 */
int
p7_GForward(const ESL_DSQ *dsq, int L, const P7_PROFILE *gm, P7_GMX *gx, float *opt_sc)
{
  float const *tsc = gm->tsc;       /* transition scores; referenced through TSC()    */
  float      **dp  = gx->dp;        /* so that {MDI}MX() macros work                  */
  float       *xmx = gx->xmx;       /* so that XMX() macro works                      */
  const int    M   = gm->M;
  const float  esc = p7_profile_IsLocal(gm) ? 0 : -eslINFINITY;  /* M_k/D_k -> E score for k < M */
  int          i, k;

  /* Set up the lookup table of the log sum routine. */
  p7_FLogsumInit();

  /* Row 0: no residue has been emitted yet, so only N and B are reachable. */
  for (k = 0; k <= M; k++)
    {
      MMX(0,k) = -eslINFINITY;
      IMX(0,k) = -eslINFINITY;
      DMX(0,k) = -eslINFINITY;
    }
  XMX(0,p7G_N) = 0;                              /* S->N, p=1            */
  XMX(0,p7G_B) = gm->xsc[p7P_N][p7P_MOVE];       /* S->N->B, no N-tail   */
  XMX(0,p7G_E) = -eslINFINITY;                   /* need seq to get here */
  XMX(0,p7G_C) = -eslINFINITY;
  XMX(0,p7G_J) = -eslINFINITY;

  /* Recursion, done as a pull.
   * Some slightly wasteful boundary conditions are tolerated:
   *    tsc[0] = impossible for all eight transitions (no node 0)
   *    D_1 is wastefully calculated (doesn't exist)
   */
  for (i = 1; i <= L; i++)
    {
      float const *rsc = gm->rsc[dsq[i]];   /* emission scores for residue i; referenced through MSC()/ISC() */
      float        core_in;                 /* log-sum of the M and I predecessors  */
      float        other_in;                /* log-sum of the B and D predecessors  */
      float        sc;

      MMX(i,0)     = -eslINFINITY;
      IMX(i,0)     = -eslINFINITY;
      DMX(i,0)     = -eslINFINITY;
      XMX(i,p7G_E) = -eslINFINITY;

      for (k = 1; k < M; k++)
	{
	  /* match state: pulled from M, I, D at (i-1,k-1), or from B(i-1) */
	  core_in  = p7_FLogsum(MMX(i-1,k-1)   + TSC(p7P_MM,k-1),
				IMX(i-1,k-1)   + TSC(p7P_IM,k-1));
	  other_in = p7_FLogsum(XMX(i-1,p7G_B) + TSC(p7P_BM,k-1),
				DMX(i-1,k-1)   + TSC(p7P_DM,k-1));
	  sc       = p7_FLogsum(core_in, other_in);
	  MMX(i,k) = sc + MSC(k);

	  /* insert state: pulled from M, I at (i-1,k) */
	  sc       = p7_FLogsum(MMX(i-1,k) + TSC(p7P_MI,k),
				IMX(i-1,k) + TSC(p7P_II,k));
	  IMX(i,k) = sc + ISC(k);

	  /* delete state: pulled from M, D at (i,k-1); emits nothing */
	  DMX(i,k) = p7_FLogsum(MMX(i,k-1) + TSC(p7P_MD,k-1),
				DMX(i,k-1) + TSC(p7P_DD,k-1));

	  /* fold this node's exits into E */
	  sc           = p7_FLogsum(MMX(i,k) + esc,
				    DMX(i,k) + esc);
	  XMX(i,p7G_E) = p7_FLogsum(sc, XMX(i,p7G_E));
	}

      /* Node M, unrolled: there is no I_M, and M_M/D_M reach E without <esc>. */
      core_in  = p7_FLogsum(MMX(i-1,M-1)   + TSC(p7P_MM,M-1),
			    IMX(i-1,M-1)   + TSC(p7P_IM,M-1));
      other_in = p7_FLogsum(XMX(i-1,p7G_B) + TSC(p7P_BM,M-1),
			    DMX(i-1,M-1)   + TSC(p7P_DM,M-1));
      sc       = p7_FLogsum(core_in, other_in);
      MMX(i,M) = sc + MSC(M);

      IMX(i,M) = -eslINFINITY;

      DMX(i,M) = p7_FLogsum(MMX(i,M-1) + TSC(p7P_MD,M-1),
			    DMX(i,M-1) + TSC(p7P_DD,M-1));

      sc           = p7_FLogsum(MMX(i,M), DMX(i,M));
      XMX(i,p7G_E) = p7_FLogsum(sc, XMX(i,p7G_E));

      /* Special states. N needs only the previous row; J and C need E(i);
       * B needs N(i) and J(i), so it comes last.
       */
      XMX(i,p7G_N) = XMX(i-1,p7G_N) + gm->xsc[p7P_N][p7P_LOOP];

      XMX(i,p7G_J) = p7_FLogsum(XMX(i-1,p7G_J) + gm->xsc[p7P_J][p7P_LOOP],
				XMX(i,  p7G_E) + gm->xsc[p7P_E][p7P_LOOP]);

      XMX(i,p7G_C) = p7_FLogsum(XMX(i-1,p7G_C) + gm->xsc[p7P_C][p7P_LOOP],
				XMX(i,  p7G_E) + gm->xsc[p7P_E][p7P_MOVE]);

      XMX(i,p7G_B) = p7_FLogsum(XMX(i,  p7G_N) + gm->xsc[p7P_N][p7P_MOVE],
				XMX(i,  p7G_J) + gm->xsc[p7P_J][p7P_MOVE]);
    }

  /* The path ends C(L) -> T. */
  if (opt_sc != NULL)
    {
      *opt_sc = XMX(L,p7G_C) + gm->xsc[p7P_C][p7P_MOVE];
    }

  gx->M = M;
  gx->L = L;
  return eslOK;
}
/* Function:  p7_GBackward()
 * Synopsis:  The Backward algorithm.
 * Incept:    SRE, Fri Dec 28 14:31:58 2007 [Janelia]
 *
 * Purpose:   The Backward dynamic programming algorithm.
 *
 *            Given a digital sequence <dsq> of length <L>, a profile
 *            <gm>, and DP matrix <gx> allocated for at least <gm->M>
 *            by <L> cells; calculate the probability of the sequence
 *            given the model using the Backward algorithm; return the
 *            Backward matrix in <gx>, and the Backward score in <opt_sc>.
 *
 *            The Backward score is in lod score form. To convert to a
 *            bitscore, the caller needs to subtract a null model lod
 *            score, then convert to bits.
 *
 *            Each cell holds the score of getting *out* of that cell
 *            to the end of the sequence, exclusive of emitting residue
 *            x_i itself.
 *
 *            For <L == 0> only the row-<L> initialization is performed
 *            (there is no row 1 and no residue 1 to pull from), and the
 *            returned score is <XMX(0,p7G_N) = -eslINFINITY>.
 *
 * Args:      dsq    - sequence in digitized form, 1..L
 *            L      - length of dsq
 *            gm     - profile
 *            gx     - DP matrix with room for an MxL alignment
 *            opt_sc - optRETURN: Backward lod score in nats; may be NULL
 *
 * Return:    <eslOK> on success. <gx->M> and <gx->L> are set to the
 *            dimensions that were filled.
 */
int
p7_GBackward(const ESL_DSQ *dsq, int L, const P7_PROFILE *gm, P7_GMX *gx, float *opt_sc)
{
  float const *tsc = gm->tsc;       /* transition scores; referenced through TSC()            */
  float const *rsc = NULL;          /* emission scores of residue i+1; referenced through MSC()/ISC() */
  float      **dp  = gx->dp;        /* so that {MDI}MX() macros work                          */
  float       *xmx = gx->xmx;       /* so that XMX() macro works                              */
  int          M   = gm->M;
  int          i, k;
  float        esc = p7_profile_IsLocal(gm) ? 0 : -eslINFINITY;   /* M_k/D_k -> E score for k < M */

  /* Note: backward calculates the probability we can get *out* of
   * cell i,k; exclusive of emitting residue x_i.
   */
  p7_FLogsumInit();

  /* Initialize the L row. */
  XMX(L,p7G_J) = XMX(L,p7G_B) = XMX(L,p7G_N) = -eslINFINITY;
  XMX(L,p7G_C) = gm->xsc[p7P_C][p7P_MOVE];                    /* C<-T          */
  XMX(L,p7G_E) = XMX(L,p7G_C) + gm->xsc[p7P_E][p7P_MOVE];     /* E<-C, no tail */

  MMX(L,M) = DMX(L,M) = XMX(L,p7G_E);   /* {MD}_M <- E (prob 1.0) */
  IMX(L,M) = -eslINFINITY;              /* no I_M state           */
  for (k = M-1; k >= 1; k--)
    {
      MMX(L,k) = p7_FLogsum( XMX(L,p7G_E) + esc,
			     DMX(L, k+1)  + TSC(p7P_MD,k));
      DMX(L,k) = p7_FLogsum( XMX(L,p7G_E) + esc,
			     DMX(L, k+1)  + TSC(p7P_DD,k));
      IMX(L,k) = -eslINFINITY;          /* an insert would need another residue */
    }

  /* Main recursion */
  for (i = L-1; i >= 1; i--)
    {
      rsc = gm->rsc[dsq[i+1]];          /* the residue that the next row's M/I states emit */

      /* B(i): enter any M_k, which then emits residue i+1 */
      XMX(i,p7G_B) = MMX(i+1,1) + TSC(p7P_BM,0) + MSC(1);   /* t_BM index is 0 because it's stored off-by-one. */
      for (k = 2; k <= M; k++)
	XMX(i,p7G_B) = p7_FLogsum(XMX(i, p7G_B), MMX(i+1,k) + TSC(p7P_BM,k-1) + MSC(k));

      XMX(i,p7G_J) = p7_FLogsum( XMX(i+1,p7G_J) + gm->xsc[p7P_J][p7P_LOOP],
				 XMX(i,  p7G_B) + gm->xsc[p7P_J][p7P_MOVE]);

      XMX(i,p7G_C) = XMX(i+1,p7G_C) + gm->xsc[p7P_C][p7P_LOOP];

      XMX(i,p7G_E) = p7_FLogsum( XMX(i, p7G_J)  + gm->xsc[p7P_E][p7P_LOOP],
				 XMX(i, p7G_C)  + gm->xsc[p7P_E][p7P_MOVE]);

      XMX(i,p7G_N) = p7_FLogsum( XMX(i+1,p7G_N) + gm->xsc[p7P_N][p7P_LOOP],
				 XMX(i,  p7G_B) + gm->xsc[p7P_N][p7P_MOVE]);

      /* Node M can only exit to E; there is no I_M. */
      MMX(i,M) = DMX(i,M) = XMX(i,p7G_E);
      IMX(i,M) = -eslINFINITY;

      /* Nodes M-1..1; D(i,k) needs D(i,k+1), hence the descending order. */
      for (k = M-1; k >= 1; k--)
	{
	  MMX(i,k) = p7_FLogsum( p7_FLogsum(MMX(i+1,k+1) + TSC(p7P_MM,k) + MSC(k+1),
					    IMX(i+1,k)   + TSC(p7P_MI,k) + ISC(k)),
				 p7_FLogsum(XMX(i,p7G_E) + esc,
					    DMX(i,  k+1) + TSC(p7P_MD,k)));

	  IMX(i,k) = p7_FLogsum( MMX(i+1,k+1) + TSC(p7P_IM,k) + MSC(k+1),
				 IMX(i+1,k)   + TSC(p7P_II,k) + ISC(k));

	  DMX(i,k) = p7_FLogsum( MMX(i+1,k+1) + TSC(p7P_DM,k) + MSC(k+1),
				 p7_FLogsum( DMX(i,  k+1)  + TSC(p7P_DD,k),
					     XMX(i, p7G_E) + esc));
	}
    }

  /* At i=0, only N,B states are reachable.
   * Row 0 is addressed explicitly rather than through the leftover loop
   * index, and the block is skipped when L == 0: in that case row 0 *is*
   * the L row initialized above, and there is neither a residue 1 nor a
   * row 1 to read from.
   */
  if (L >= 1)
    {
      rsc = gm->rsc[dsq[1]];
      XMX(0,p7G_B) = MMX(1,1) + TSC(p7P_BM,0) + MSC(1);   /* t_BM index is 0 because it's stored off-by-one. */
      for (k = 2; k <= M; k++)
	XMX(0,p7G_B) = p7_FLogsum(XMX(0, p7G_B), MMX(1,k) + TSC(p7P_BM,k-1) + MSC(k));

      XMX(0,p7G_J) = -eslINFINITY;
      XMX(0,p7G_C) = -eslINFINITY;
      XMX(0,p7G_E) = -eslINFINITY;
      XMX(0,p7G_N) = p7_FLogsum( XMX(1, p7G_N) + gm->xsc[p7P_N][p7P_LOOP],
				 XMX(0, p7G_B) + gm->xsc[p7P_N][p7P_MOVE]);

      /* No core state is reachable before the first residue: mark every
       * node 1..M of row 0 as impossible (index by k, not by M).
       */
      for (k = M; k >= 1; k--)
	MMX(0,k) = IMX(0,k) = DMX(0,k) = -eslINFINITY;
    }

  if (opt_sc != NULL) *opt_sc = XMX(0,p7G_N);
  gx->M = M;
  gx->L = L;
  return eslOK;
}