static inline int select_e(const P7_PROFILE *gm, const P7_GMX *gx, int i, int *ret_k) { float **dp = gx->dp; /* so {MDI}MX() macros work */ float max = -eslINFINITY; int smax = -1; /* will be returned as "error code" if no max found */ int kmax = -1; int k; if (! p7_profile_IsLocal(gm)) /* glocal/global is easier */ { *ret_k = gm->M; return ((MMX(i,gm->M) >= DMX(i,gm->M)) ? p7T_M : p7T_D); } for (k = 1; k <= gm->M; k++) { if (MMX(i,k) >= max) { max = MMX(i,k); smax = p7T_M; kmax = k; } if (DMX(i,k) > max) { max = DMX(i,k); smax = p7T_D; kmax = k; } } *ret_k = kmax; return smax; }
/* Function: p7_GViterbi() * Synopsis: The Viterbi algorithm. * Incept: SRE, Tue Jan 30 10:50:53 2007 [Einstein's, St. Louis] * * Purpose: The standard Viterbi dynamic programming algorithm. * * Given a digital sequence <dsq> of length <L>, a profile * <gm>, and DP matrix <gx> allocated for at least <L> * by <gm->M> cells; calculate the maximum scoring path by * Viterbi; return the Viterbi score in <ret_sc>, and the * Viterbi matrix is in <gx>. * * The caller may then retrieve the Viterbi path by calling * <p7_GTrace()>. * * The Viterbi lod score is returned in nats. The caller * needs to subtract a null model lod score, then convert * to bits. * * Args: dsq - sequence in digitized form, 1..L * L - length of dsq * gm - profile. * gx - DP matrix with room for an MxL alignment * opt_sc - optRETURN: Viterbi lod score in nats * * Return: <eslOK> on success. */ int p7_GViterbi(const ESL_DSQ *dsq, int L, const P7_PROFILE *gm, P7_GMX *gx, float *opt_sc) { float const *tsc = gm->tsc; float **dp = gx->dp; float *xmx = gx->xmx; int M = gm->M; int i,k; float esc = p7_profile_IsLocal(gm) ? 0 : -eslINFINITY; /* Initialization of the zero row. */ XMX(0,p7G_N) = 0; /* S->N, p=1 */ XMX(0,p7G_B) = gm->xsc[p7P_N][p7P_MOVE]; /* S->N->B, no N-tail */ XMX(0,p7G_E) = XMX(0,p7G_C) = XMX(0,p7G_J) = -eslINFINITY; /* need seq to get here */ for (k = 0; k <= gm->M; k++) MMX(0,k) = IMX(0,k) = DMX(0,k) = -eslINFINITY; /* need seq to get here */ /* DP recursion */ for (i = 1; i <= L; i++) { float const *rsc = gm->rsc[dsq[i]]; float sc; MMX(i,0) = IMX(i,0) = DMX(i,0) = -eslINFINITY; XMX(i,p7G_E) = -eslINFINITY; for (k = 1; k < gm->M; k++) { /* match state */ sc = ESL_MAX( MMX(i-1,k-1) + TSC(p7P_MM,k-1), IMX(i-1,k-1) + TSC(p7P_IM,k-1)); sc = ESL_MAX(sc, DMX(i-1,k-1) + TSC(p7P_DM,k-1)); sc = ESL_MAX(sc, XMX(i-1,p7G_B) + TSC(p7P_BM,k-1)); MMX(i,k) = sc + MSC(k); /* E state update */ XMX(i,p7G_E) = ESL_MAX(XMX(i,p7G_E), MMX(i,k) + esc); /* in Viterbi alignments, Dk->E can't win in local mode (and * isn't possible in glocal mode), so don't bother * looking. */ /* insert state */ sc = ESL_MAX(MMX(i-1,k) + TSC(p7P_MI,k), IMX(i-1,k) + TSC(p7P_II,k)); IMX(i,k) = sc + ISC(k); /* delete state */ DMX(i,k) = ESL_MAX(MMX(i,k-1) + TSC(p7P_MD,k-1), DMX(i,k-1) + TSC(p7P_DD,k-1)); } /* Unrolled match state M. */ sc = ESL_MAX( MMX(i-1,M-1) + TSC(p7P_MM,M-1), IMX(i-1,M-1) + TSC(p7P_IM,M-1)); sc = ESL_MAX(sc, DMX(i-1,M-1 ) + TSC(p7P_DM,M-1)); sc = ESL_MAX(sc, XMX(i-1,p7G_B) + TSC(p7P_BM,M-1)); MMX(i,M) = sc + MSC(M); /* Unrolled delete state D_M * (Unlike internal Dk->E transitions that can never appear in * Viterbi alignments, D_M->E is possible in glocal mode.) */ DMX(i,M) = ESL_MAX(MMX(i,M-1) + TSC(p7P_MD,M-1), DMX(i,M-1) + TSC(p7P_DD,M-1)); /* E state update; transition from M_M scores 0 by def'n */ sc = ESL_MAX(XMX(i,p7G_E), MMX(i,M)); XMX(i,p7G_E) = ESL_MAX(sc, DMX(i,M)); /* Now the special states. E must already be done, and B must follow N,J. * remember, N, C and J emissions are zero score by definition. */ /* J state */ sc = XMX(i-1,p7G_J) + gm->xsc[p7P_J][p7P_LOOP]; /* J->J */ XMX(i,p7G_J) = ESL_MAX(sc, XMX(i, p7G_E) + gm->xsc[p7P_E][p7P_LOOP]); /* E->J is E's "loop" */ /* C state */ sc = XMX(i-1,p7G_C) + gm->xsc[p7P_C][p7P_LOOP]; XMX(i,p7G_C) = ESL_MAX(sc, XMX(i, p7G_E) + gm->xsc[p7P_E][p7P_MOVE]); /* N state */ XMX(i,p7G_N) = XMX(i-1,p7G_N) + gm->xsc[p7P_N][p7P_LOOP]; /* B state */ sc = XMX(i,p7G_N) + gm->xsc[p7P_N][p7P_MOVE]; /* N->B is N's move */ XMX(i,p7G_B) = ESL_MAX(sc, XMX(i,p7G_J) + gm->xsc[p7P_J][p7P_MOVE]); /* J->B is J's move */ } /* T state (not stored) */ if (opt_sc != NULL) *opt_sc = XMX(L,p7G_C) + gm->xsc[p7P_C][p7P_MOVE]; gx->M = gm->M; gx->L = L; return eslOK; }
/* Function: p7_oprofile_IsLocal() * Synopsis: Returns TRUE if profile is in local alignment mode. * Incept: MSF Tue Nov 3, 2009 [Janelia] */ int p7_oprofile_IsLocal(const P7_OPROFILE *om) { return p7_profile_IsLocal(om); }
/* Function: p7_ProfileConfig() * Synopsis: Configure a search profile. * * Purpose: Given a model <hmm> with core probabilities, the null1 * model <bg>, a desired search <mode> (one of <p7_LOCAL>, * <p7_GLOCAL>, <p7_UNILOCAL>, or <p7_UNIGLOCAL>), and an * expected target sequence length <L>; configure the * search model in <gm> with lod scores relative to the * background frequencies in <bg>. * * Returns: <eslOK> on success; the profile <gm> now contains * scores and is ready for searching target sequences. * * Throws: <eslEMEM> on allocation error. */ int p7_ProfileConfig(const P7_HMM *hmm, const P7_BG *bg, P7_PROFILE *gm, int L, int mode) { int k, x, z; /* counters over states, residues, annotation */ int status; float *occ = NULL; float *tp, *rp; float sc[p7_MAXCODE]; float Z; /* Contract checks */ if (gm->abc->type != hmm->abc->type) ESL_XEXCEPTION(eslEINVAL, "HMM and profile alphabet don't match"); if (hmm->M > gm->allocM) ESL_XEXCEPTION(eslEINVAL, "profile too small to hold HMM"); if (! (hmm->flags & p7H_CONS)) ESL_XEXCEPTION(eslEINVAL, "HMM must have a consensus to transfer to the profile"); /* Copy some pointer references and other info across from HMM */ gm->M = hmm->M; gm->max_length = hmm->max_length; gm->mode = mode; gm->roff = -1; gm->eoff = -1; gm->offs[p7_MOFFSET] = -1; gm->offs[p7_FOFFSET] = -1; gm->offs[p7_POFFSET] = -1; if (gm->name != NULL) free(gm->name); if (gm->acc != NULL) free(gm->acc); if (gm->desc != NULL) free(gm->desc); if ((status = esl_strdup(hmm->name, -1, &(gm->name))) != eslOK) goto ERROR; if ((status = esl_strdup(hmm->acc, -1, &(gm->acc))) != eslOK) goto ERROR; if ((status = esl_strdup(hmm->desc, -1, &(gm->desc))) != eslOK) goto ERROR; if (hmm->flags & p7H_RF) strcpy(gm->rf, hmm->rf); if (hmm->flags & p7H_MMASK) strcpy(gm->mm, hmm->mm); if (hmm->flags & p7H_CONS) strcpy(gm->consensus, hmm->consensus); /* must be present, actually, so the flag test is just for symmetry w/ other optional HMM fields */ if (hmm->flags & p7H_CS) strcpy(gm->cs, hmm->cs); for (z = 0; z < p7_NEVPARAM; z++) gm->evparam[z] = hmm->evparam[z]; for (z = 0; z < p7_NCUTOFFS; z++) gm->cutoff[z] = hmm->cutoff[z]; for (z = 0; z < p7_MAXABET; z++) gm->compo[z] = hmm->compo[z]; /* Entry scores. */ if (p7_profile_IsLocal(gm)) { /* Local mode entry: occ[k] /( \sum_i occ[i] * (M-i+1)) * (Reduces to uniform 2/(M(M+1)) for occupancies of 1.0) */ Z = 0.; ESL_ALLOC(occ, sizeof(float) * (hmm->M+1)); if ((status = p7_hmm_CalculateOccupancy(hmm, occ, NULL)) != eslOK) goto ERROR; for (k = 1; k <= hmm->M; k++) Z += occ[k] * (float) (hmm->M-k+1); for (k = 1; k <= hmm->M; k++) p7P_TSC(gm, k-1, p7P_BM) = log(occ[k] / Z); /* note off-by-one: entry at Mk stored as [k-1][BM] */ free(occ); } else /* glocal modes: left wing retraction; must be in log space for precision */ { Z = log(hmm->t[0][p7H_MD]); p7P_TSC(gm, 0, p7P_BM) = log(1.0 - hmm->t[0][p7H_MD]); for (k = 1; k < hmm->M; k++) { p7P_TSC(gm, k, p7P_BM) = Z + log(hmm->t[k][p7H_DM]); Z += log(hmm->t[k][p7H_DD]); } } /* E state loop/move probabilities: nonzero for MOVE allows loops/multihits * N,C,J transitions are set later by length config */ if (p7_profile_IsMultihit(gm)) { gm->xsc[p7P_E][p7P_MOVE] = -eslCONST_LOG2; gm->xsc[p7P_E][p7P_LOOP] = -eslCONST_LOG2; gm->nj = 1.0f; } else { gm->xsc[p7P_E][p7P_MOVE] = 0.0f; gm->xsc[p7P_E][p7P_LOOP] = -eslINFINITY; gm->nj = 0.0f; } /* Transition scores. */ for (k = 1; k < gm->M; k++) { tp = gm->tsc + k * p7P_NTRANS; tp[p7P_MM] = log(hmm->t[k][p7H_MM]); tp[p7P_MI] = log(hmm->t[k][p7H_MI]); tp[p7P_MD] = log(hmm->t[k][p7H_MD]); tp[p7P_IM] = log(hmm->t[k][p7H_IM]); tp[p7P_II] = log(hmm->t[k][p7H_II]); tp[p7P_DM] = log(hmm->t[k][p7H_DM]); tp[p7P_DD] = log(hmm->t[k][p7H_DD]); } /* Match emission scores. */ sc[hmm->abc->K] = -eslINFINITY; /* gap character */ sc[hmm->abc->Kp-2] = -eslINFINITY; /* nonresidue character */ sc[hmm->abc->Kp-1] = -eslINFINITY; /* missing data character */ for (k = 1; k <= hmm->M; k++) { for (x = 0; x < hmm->abc->K; x++) sc[x] = log((double)hmm->mat[k][x] / bg->f[x]); esl_abc_FExpectScVec(hmm->abc, sc, bg->f); for (x = 0; x < hmm->abc->Kp; x++) { rp = gm->rsc[x] + k * p7P_NR; rp[p7P_MSC] = sc[x]; } } /* Insert emission scores */ /* SRE, Fri Dec 5 08:41:08 2008: We currently hardwire insert scores * to 0, i.e. corresponding to the insertion emission probabilities * being equal to the background probabilities. Benchmarking shows * that setting inserts to informative emission distributions causes * more problems than it's worth: polar biased composition hits * driven by stretches of "insertion" occur, and are difficult to * correct for. */ for (x = 0; x < gm->abc->Kp; x++) { for (k = 1; k < hmm->M; k++) p7P_ISC(gm, k, x) = 0.0f; p7P_ISC(gm, hmm->M, x) = -eslINFINITY; /* init I_M to impossible. */ } for (k = 1; k <= hmm->M; k++) p7P_ISC(gm, k, gm->abc->K) = -eslINFINITY; /* gap symbol */ for (k = 1; k <= hmm->M; k++) p7P_ISC(gm, k, gm->abc->Kp-2) = -eslINFINITY; /* nonresidue symbol */ for (k = 1; k <= hmm->M; k++) p7P_ISC(gm, k, gm->abc->Kp-1) = -eslINFINITY; /* missing data symbol */ #if 0 /* original (informative) insert setting: relies on sc[K, Kp-1] initialization to -inf above */ for (k = 1; k < hmm->M; k++) { for (x = 0; x < hmm->abc->K; x++) sc[x] = log(hmm->ins[k][x] / bg->f[x]); esl_abc_FExpectScVec(hmm->abc, sc, bg->f); for (x = 0; x < hmm->abc->Kp; x++) { rp = gm->rsc[x] + k*p7P_NR; rp[p7P_ISC] = sc[x]; } } for (x = 0; x < hmm->abc->Kp; x++) p7P_ISC(gm, hmm->M, x) = -eslINFINITY; /* init I_M to impossible. */ #endif /* Remaining specials, [NCJ][MOVE | LOOP] are set by ReconfigLength() */ gm->L = 0; /* force ReconfigLength to reconfig */ if ((status = p7_ReconfigLength(gm, L)) != eslOK) goto ERROR; return eslOK; ERROR: if (occ != NULL) free(occ); return status; }
/* Function: p7_GOptimalAccuracy() * Synopsis: Optimal accuracy decoding: fill. * Incept: SRE, Fri Feb 29 11:56:49 2008 [Janelia] * * Purpose: Calculates the fill step of the optimal accuracy decoding * algorithm \citep{Kall05}. * * Caller provides the posterior decoding matrix <pp>, * which was calculated by Forward/Backward on a target sequence * of length <L> using the query model <gm>. * * Caller also provides a DP matrix <gx>, allocated for the * <gm->M> by <pp->L> comparison. The routine fills this in * with OA scores. * * Args: gm - query profile * pp - posterior decoding matrix created by <p7_GPosteriorDecoding()> * gx - RESULT: caller provided DP matrix for <gm->M> by <L> * ret_e - RETURN: expected number of correctly decoded positions * * Returns: <eslOK> on success, and <*ret_e> contains the final OA * score, which is the expected number of correctly decoded * positions in the target sequence (up to <L>). * * Throws: (no abnormal error conditions) */ int p7_GOptimalAccuracy(const P7_PROFILE *gm, const P7_GMX *pp, P7_GMX *gx, float *ret_e) { int L = pp->L; float **dp = gx->dp; float *xmx = gx->xmx; float const *tsc = gm->tsc; int i,k; int M = gm->M; float esc = p7_profile_IsLocal(gm) ? 1.0 : 0.0; float t1, t2; /* Initialization of the zero row (i=0; no residues to account for. */ XMX(0,p7G_N) = 0.; /* S->N, p=1 */ XMX(0,p7G_B) = 0.; /* S->N->B, no N-tail */ XMX(0,p7G_E) = XMX(0,p7G_C) = XMX(0,p7G_J) = -eslINFINITY; /* need seq to get here */ for (k = 0; k <= M; k++) MMX(0,k) = IMX(0,k) = DMX(0,k) = -eslINFINITY; /* need seq to get here */ for (i = 1; i <= L; i++) { MMX(i,0) = IMX(i,0) = DMX(i,0) = XMX(i,p7G_E) = -eslINFINITY; for (k = 1; k < M; k++) { MMX(i,k) = ESL_MAX(ESL_MAX(TSCDELTA(p7P_MM, k-1) * (MMX(i-1,k-1) + pp->dp[i][k*p7G_NSCELLS + p7G_M]), TSCDELTA(p7P_IM, k-1) * (IMX(i-1,k-1) + pp->dp[i][k*p7G_NSCELLS + p7G_M])), ESL_MAX(TSCDELTA(p7P_DM, k-1) * (DMX(i-1,k-1) + pp->dp[i][k*p7G_NSCELLS + p7G_M]), TSCDELTA(p7P_BM, k-1) * (XMX(i-1,p7G_B)+ pp->dp[i][k*p7G_NSCELLS + p7G_M]))); XMX(i,p7G_E) = ESL_MAX(XMX(i,p7G_E), esc * MMX(i,k)); IMX(i,k) = ESL_MAX(TSCDELTA(p7P_MI, k) * (MMX(i-1,k) + pp->dp[i][k*p7G_NSCELLS + p7G_I]), TSCDELTA(p7P_II, k) * (IMX(i-1,k) + pp->dp[i][k*p7G_NSCELLS + p7G_I])); DMX(i,k) = ESL_MAX(TSCDELTA(p7P_MD, k-1) * MMX(i,k-1), TSCDELTA(p7P_DD, k-1) * DMX(i,k-1)); } /* last node (k=M) is unrolled; it has no I state, and it has a p=1.0 {MD}->E transition even in local mode */ MMX(i,M) = ESL_MAX(ESL_MAX(TSCDELTA(p7P_MM, M-1) * (MMX(i-1,M-1) + pp->dp[i][M*p7G_NSCELLS + p7G_M]), TSCDELTA(p7P_IM, M-1) * (IMX(i-1,M-1) + pp->dp[i][M*p7G_NSCELLS + p7G_M])), ESL_MAX(TSCDELTA(p7P_DM, M-1) * (DMX(i-1,M-1) + pp->dp[i][M*p7G_NSCELLS + p7G_M]), TSCDELTA(p7P_BM, M-1) * (XMX(i-1,p7G_B)+ pp->dp[i][M*p7G_NSCELLS + p7G_M]))); DMX(i,M) = ESL_MAX(TSCDELTA(p7P_MD, M-1) * MMX(i,M-1), TSCDELTA(p7P_DD, M-1) * DMX(i,M-1)); /* note: we calculated XMX before DMX in the loop, because we probably had MMX(i,k) in a register. * but now we can't do that, because XMX depends on DMX */ XMX(i,p7G_E) = ESL_MAX(XMX(i,p7G_E), ESL_MAX(MMX(i,M), DMX(i, M))); /* now the special states; it's important that E is already done, and B is done after N,J */ t1 = ( (gm->xsc[p7P_J][p7P_LOOP] == -eslINFINITY) ? FLT_MIN : 1.0); t2 = ( (gm->xsc[p7P_E][p7P_LOOP] == -eslINFINITY) ? FLT_MIN : 1.0); XMX(i, p7G_J) = ESL_MAX( t1 * (XMX(i-1,p7G_J) + pp->xmx[i*p7G_NXCELLS + p7G_J]), t2 * XMX(i, p7G_E)); t1 = ( (gm->xsc[p7P_C][p7P_LOOP] == -eslINFINITY) ? FLT_MIN : 1.0); t2 = ( (gm->xsc[p7P_E][p7P_MOVE] == -eslINFINITY) ? FLT_MIN : 1.0); XMX(i,p7G_C) = ESL_MAX( t1 * (XMX(i-1,p7G_C) + pp->xmx[i*p7G_NXCELLS + p7G_C]), t2 * XMX(i, p7G_E)); t1 = ( (gm->xsc[p7P_N][p7P_LOOP] == -eslINFINITY) ? FLT_MIN : 1.0); XMX(i,p7G_N) = t1 * (XMX(i-1,p7G_N) + pp->xmx[i*p7G_NXCELLS + p7G_N]); t1 = ( (gm->xsc[p7P_N][p7P_MOVE] == -eslINFINITY) ? FLT_MIN : 1.0); t2 = ( (gm->xsc[p7P_J][p7P_MOVE] == -eslINFINITY) ? FLT_MIN : 1.0); XMX(i,p7G_B) = ESL_MAX( t1 * XMX(i, p7G_N), t2 * XMX(i, p7G_J)); } *ret_e = XMX(L,p7G_C); return eslOK; }
/* Function: p7_GStochasticTrace() * Synopsis: Stochastic traceback of a Forward matrix. * Incept: SRE, Thu Jan 3 15:39:20 2008 [Janelia] * * Purpose: Stochastic traceback of Forward matrix <gx> to * sample an alignment of digital sequence <dsq> * (of length <L>) to the profile <gm>. * * The sampled traceback is returned in <tr>, which the * caller must have at least made an initial allocation of * (the <tr> will be grown as needed here). * * Args: r - source of random numbers * dsq - digital sequence aligned to, 1..L * L - length of dsq * gm - profile * mx - Forward matrix to trace, L x M * tr - storage for the recovered traceback. * * Returns: <eslOK> on success. */ int p7_GStochasticTrace(ESL_RANDOMNESS *r, const ESL_DSQ *dsq, int L, const P7_PROFILE *gm, const P7_GMX *gx, P7_TRACE *tr) { int status; int i; /* position in seq (1..L) */ int k; /* position in model (1..M) */ int M = gm->M; float **dp = gx->dp; float *xmx = gx->xmx; float const *tsc = gm->tsc; float *sc; /* scores of possible choices: up to 2M-1, in the case of exits to E */ int scur, sprv; /* we'll index M states as 1..M, and D states as 2..M = M+2..2M: M0, D1 are impossibles. */ ESL_ALLOC(sc, sizeof(float) * (2*M+1)); k = 0; i = L; if ((status = p7_trace_Append(tr, p7T_T, k, i)) != eslOK) goto ERROR; if ((status = p7_trace_Append(tr, p7T_C, k, i)) != eslOK) goto ERROR; sprv = p7T_C; while (sprv != p7T_S) { switch (tr->st[tr->N-1]) { /* C(i) comes from C(i-1) or E(i) */ case p7T_C: if (XMX(i,p7G_C) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible C reached at i=%d", i); sc[0] = XMX(i-1, p7G_C) + gm->xsc[p7P_C][p7P_LOOP]; sc[1] = XMX(i, p7G_E) + gm->xsc[p7P_E][p7P_MOVE]; esl_vec_FLogNorm(sc, 2); scur = (esl_rnd_FChoose(r, sc, 2) == 0) ? p7T_C : p7T_E; break; /* E connects from any M or D state. k set here */ case p7T_E: if (XMX(i, p7G_E) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible E reached at i=%d", i); if (p7_profile_IsLocal(gm)) { /* local models come from any M, D */ sc[0] = sc[M+1] = -eslINFINITY; for (k = 1; k <= M; k++) sc[k] = MMX(i,k); for (k = 2; k <= M; k++) sc[k+M] = DMX(i,k); esl_vec_FLogNorm(sc, 2*M+1); /* now sc is a prob vector */ k = esl_rnd_FChoose(r, sc, 2*M+1); if (k <= M) scur = p7T_M; else { k -= M; scur = p7T_D; } } else { /* glocal models come from M_M or D_M */ k = M; sc[0] = MMX(i,M); sc[1] = DMX(i,M); esl_vec_FLogNorm(sc, 2); /* now sc is a prob vector */ scur = (esl_rnd_FChoose(r, sc, 2) == 0) ? p7T_M : p7T_D; } break; /* M connects from {MDI} i-1,k-1, or B */ case p7T_M: if (MMX(i,k) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible M reached at k=%d,i=%d", k,i); sc[0] = XMX(i-1,p7G_B) + TSC(p7P_BM, k-1); sc[1] = MMX(i-1,k-1) + TSC(p7P_MM, k-1); sc[2] = IMX(i-1,k-1) + TSC(p7P_IM, k-1); sc[3] = DMX(i-1,k-1) + TSC(p7P_DM, k-1); esl_vec_FLogNorm(sc, 4); switch (esl_rnd_FChoose(r, sc, 4)) { case 0: scur = p7T_B; break; case 1: scur = p7T_M; break; case 2: scur = p7T_I; break; case 3: scur = p7T_D; break; } k--; i--; break; /* D connects from M,D at i,k-1 */ case p7T_D: if (DMX(i, k) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible D reached at k=%d,i=%d", k,i); sc[0] = MMX(i, k-1) + TSC(p7P_MD, k-1); sc[1] = DMX(i, k-1) + TSC(p7P_DD, k-1); esl_vec_FLogNorm(sc, 2); scur = (esl_rnd_FChoose(r, sc, 2) == 0) ? p7T_M : p7T_D; k--; break; /* I connects from M,I at i-1,k */ case p7T_I: if (IMX(i,k) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible I reached at k=%d,i=%d", k,i); sc[0] = MMX(i-1,k) + TSC(p7P_MI, k); sc[1] = IMX(i-1,k) + TSC(p7P_II, k); esl_vec_FLogNorm(sc, 2); scur = (esl_rnd_FChoose(r, sc, 2) == 0) ? p7T_M : p7T_I; i--; break; /* N connects from S, N */ case p7T_N: if (XMX(i, p7G_N) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible N reached at i=%d", i); scur = (i == 0) ? p7T_S : p7T_N; break; /* B connects from N, J */ case p7T_B: if (XMX(i,p7G_B) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible B reached at i=%d", i); sc[0] = XMX(i, p7G_N) + gm->xsc[p7P_N][p7P_MOVE]; sc[1] = XMX(i, p7G_J) + gm->xsc[p7P_J][p7P_MOVE]; esl_vec_FLogNorm(sc, 2); scur = (esl_rnd_FChoose(r, sc, 2) == 0) ? p7T_N : p7T_J; break; /* J connects from E(i) or J(i-1) */ case p7T_J: if (XMX(i,p7G_J) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible J reached at i=%d", i); sc[0] = XMX(i-1,p7G_J) + gm->xsc[p7P_J][p7P_LOOP]; sc[1] = XMX(i, p7G_E) + gm->xsc[p7P_E][p7P_LOOP]; esl_vec_FLogNorm(sc, 2); scur = (esl_rnd_FChoose(r, sc, 2) == 0) ? p7T_J : p7T_E; break; default: ESL_XEXCEPTION(eslFAIL, "bogus state in traceback"); } /* end switch over statetype[tpos-1] */ /* Append this state and the current i,k to be explained to the growing trace */ if ((status = p7_trace_Append(tr, scur, k, i)) != eslOK) goto ERROR; /* For NCJ, we had to defer i decrement. */ if ( (scur == p7T_N || scur == p7T_J || scur == p7T_C) && scur == sprv) i--; sprv = scur; } /* end traceback, at S state */ if ((status = p7_trace_Reverse(tr)) != eslOK) goto ERROR; tr->M = gm->M; tr->L = L; free(sc); return eslOK; ERROR: if (sc != NULL) free(sc); return status; }
/* Function: p7_GViterbi_longtarget() * Synopsis: The Viterbi algorithm. * Incept: SRE, Tue Jan 30 10:50:53 2007 [Einstein's, St. Louis] * * Purpose: Given a digital sequence <dsq> of length <L>, a profile * <gm>, and DP matrix <gx> allocated for at least <L> * by <gm->M> cells; calculates the Viterbi score for * regions of <dsq>, and captures the positions at which * such regions exceed the score required to be * significant in the eyes of the calling function * (usually p=0.001). * * Args: dsq - sequence in digitized form, 1..L * L - length of dsq * gm - profile. * gx - DP matrix with room for an MxL alignment * filtersc - null or bias correction, required for translating a P-value threshold into a score threshold * P - p-value below which a region is captured as being above threshold * windowlist - RETURN: array of hit windows (start and end of diagonal) for the above-threshold areas * * Return: <eslOK> on success. */ int p7_GViterbi_longtarget(const ESL_DSQ *dsq, int L, const P7_PROFILE *gm, P7_GMX *gx, float filtersc, double P, P7_HMM_WINDOWLIST *windowlist) { float const *tsc = gm->tsc; float **dp = gx->dp; float *xmx = gx->xmx; int M = gm->M; int i,k; float esc = p7_profile_IsLocal(gm) ? 0 : -eslINFINITY; int16_t sc_thresh; float invP; /* Initialization of the zero row. */ XMX(0,p7G_N) = 0; /* S->N, p=1 */ XMX(0,p7G_B) = gm->xsc[p7P_N][p7P_MOVE]; /* S->N->B, no N-tail */ XMX(0,p7G_E) = XMX(0,p7G_C) = XMX(0,p7G_J) = -eslINFINITY; /* need seq to get here */ for (k = 0; k <= gm->M; k++) MMX(0,k) = IMX(0,k) = DMX(0,k) = -eslINFINITY; /* need seq to get here */ /* * In p7_ViterbiFilter, converting from a scaled int Viterbi score * S (aka xE the score getting to state E) to a probability * goes like this: * S = XMX(i,p7G_E) * vsc = S + gm->xsc[p7P_E][p7P_MOVE] + gm->xsc[p7P_C][p7P_MOVE]; * P = esl_gumbel_surv((vfsc - filtersc) / eslCONST_LOG2 , gm->evparam[p7_VMU], gm->evparam[p7_VLAMBDA]); * and we're computing the threshold vsc, so invert it: * (vsc - filtersc) / eslCONST_LOG2 = esl_gumbel_invsurv( P, gm->evparam[p7_VMU], gm->evparam[p7_VLAMBDA]) * vsc = filtersc + eslCONST_LOG2 * esl_gumbel_invsurv( P, gm->evparam[p7_VMU], gm->evparam[p7_VLAMBDA]) * S = vsc - gm->xsc[p7P_E][p7P_MOVE] - gm->xsc[p7P_C][p7P_MOVE] */ invP = esl_gumbel_invsurv(P, gm->evparam[p7_VMU], gm->evparam[p7_VLAMBDA]); sc_thresh = (int) ceil (filtersc + (eslCONST_LOG2 * invP) - gm->xsc[p7P_E][p7P_MOVE] - gm->xsc[p7P_C][p7P_MOVE] ); /* DP recursion */ for (i = 1; i <= L; i++) { float const *rsc = gm->rsc[dsq[i]]; float sc; MMX(i,0) = IMX(i,0) = DMX(i,0) = -eslINFINITY; XMX(i,p7G_E) = -eslINFINITY; for (k = 1; k < gm->M; k++) { /* match state */ sc = ESL_MAX( MMX(i-1,k-1) + TSC(p7P_MM,k-1), IMX(i-1,k-1) + TSC(p7P_IM,k-1)); sc = ESL_MAX(sc, DMX(i-1,k-1) + TSC(p7P_DM,k-1)); sc = ESL_MAX(sc, XMX(i-1,p7G_B) + TSC(p7P_BM,k-1)); MMX(i,k) = sc + MSC(k); /* E state update */ XMX(i,p7G_E) = ESL_MAX(XMX(i,p7G_E), MMX(i,k) + esc); /* in Viterbi alignments, Dk->E can't win in local mode (and * isn't possible in glocal mode), so don't bother * looking. */ /* insert state */ sc = ESL_MAX(MMX(i-1,k) + TSC(p7P_MI,k), IMX(i-1,k) + TSC(p7P_II,k)); IMX(i,k) = sc + ISC(k); /* delete state */ DMX(i,k) = ESL_MAX(MMX(i,k-1) + TSC(p7P_MD,k-1), DMX(i,k-1) + TSC(p7P_DD,k-1)); } /* Unrolled match state M. */ sc = ESL_MAX( MMX(i-1,M-1) + TSC(p7P_MM,M-1), IMX(i-1,M-1) + TSC(p7P_IM,M-1)); sc = ESL_MAX(sc, DMX(i-1,M-1 ) + TSC(p7P_DM,M-1)); sc = ESL_MAX(sc, XMX(i-1,p7G_B) + TSC(p7P_BM,M-1)); MMX(i,M) = sc + MSC(M); /* Unrolled delete state D_M * (Unlike internal Dk->E transitions that can never appear in * Viterbi alignments, D_M->E is possible in glocal mode.) */ DMX(i,M) = ESL_MAX(MMX(i,M-1) + TSC(p7P_MD,M-1), DMX(i,M-1) + TSC(p7P_DD,M-1)); /* E state update; transition from M_M scores 0 by def'n */ sc = ESL_MAX(XMX(i,p7G_E), MMX(i,M)); XMX(i,p7G_E) = ESL_MAX(sc, DMX(i,M)); if (XMX(i,p7G_E) >= sc_thresh) { //hit score threshold. Add a window to the list, then reset scores. for (k = 1; k <= gm->M; k++) { if (MMX(i,k) == XMX(i,p7G_E)) { p7_hmmwindow_new(windowlist, 0, i, 0, k, 1, 0.0, p7_NOCOMPLEMENT ); } MMX(i,0) = IMX(i,0) = DMX(i,0) = -eslINFINITY; } } else { /* Now the special states. E must already be done, and B must follow N,J. * remember, N, C and J emissions are zero score by definition. */ /* J state */ sc = XMX(i-1,p7G_J) + gm->xsc[p7P_J][p7P_LOOP]; /* J->J */ XMX(i,p7G_J) = ESL_MAX(sc, XMX(i, p7G_E) + gm->xsc[p7P_E][p7P_LOOP]); /* E->J is E's "loop" */ /* C state */ sc = XMX(i-1,p7G_C) + gm->xsc[p7P_C][p7P_LOOP]; XMX(i,p7G_C) = ESL_MAX(sc, XMX(i, p7G_E) + gm->xsc[p7P_E][p7P_MOVE]); /* N state */ XMX(i,p7G_N) = XMX(i-1,p7G_N) + gm->xsc[p7P_N][p7P_LOOP]; /* B state */ sc = XMX(i,p7G_N) + gm->xsc[p7P_N][p7P_MOVE]; /* N->B is N's move */ XMX(i,p7G_B) = ESL_MAX(sc, XMX(i,p7G_J) + gm->xsc[p7P_J][p7P_MOVE]); /* J->B is J's move */ } } /* T state (not stored) */ gx->M = gm->M; gx->L = L; return eslOK; }
/* Function: p7_ProfileEmit() * Synopsis: Sample a sequence from the search form of the model. * Incept: SRE, Mon Jan 22 10:23:28 2007 [Janelia] * * Purpose: Sample a sequence from the implicit * probabilistic model of a Plan7 profile <gm>. This * requires also having the core probabilities of * the accompanying <hmm>, and the background * frequencies of null1 model <bg>. * * Optionally return the sequence and/or its trace in <sq> * and <tr>, respectively. Caller has allocated space for * both of these, though they may get reallocated/grown * here. Either can be passed as <NULL> if unneeded. * * Only the sequence field is set in the <sq>. Caller must * set the name, plus any other fields it wants to set. If * the <sq> was created in digital mode, this is the <sq->dsq>; * if the <sq> was created in text mode, this is <sq->seq>. * * <p7_ProfileEmit()> deliberately uses an <ESL_SQ> object * instead of a plain <ESL_DSQ *> or <char *> string, to * take advantage of the object's support for dynamic * reallocation of seq length, and to allow both digital and * text mode generation. * * Args: r - source of randomness * hmm - core probabilities of the profile * gm - configured search profile * sq - optRETURN: sampled sequence * tr - optRETURN: sampled trace * * Throws: (no abnormal error conditions) */ int p7_ProfileEmit(ESL_RANDOMNESS *r, const P7_HMM *hmm, const P7_PROFILE *gm, const P7_BG *bg, ESL_SQ *sq, P7_TRACE *tr) { char prv, st; /* prev, current state type */ int k = 0; /* position in model nodes 1..M */ int i = 0; /* position in sequence 1..L */ int x; /* sampled residue */ int kend = hmm->M; /* predestined end node */ int status; float xt[p7P_NXSTATES][p7P_NXTRANS]; /* Backcalculate the probabilities in the special states (loop and length model) */ for (i = 0; i < p7P_NXSTATES; i++) for (x = 0; x < p7P_NXTRANS; x++) xt[i][x] = exp(gm->xsc[i][x]); if (sq != NULL) esl_sq_Reuse(sq); if (tr != NULL) { if ((status = p7_trace_Reuse(tr)) != eslOK) goto ERROR; if ((status = p7_trace_Append(tr, p7T_S, k, i)) != eslOK) goto ERROR; if ((status = p7_trace_Append(tr, p7T_N, k, i)) != eslOK) goto ERROR; } st = p7T_N; i = 0; while (st != p7T_T) { /* Sample a state transition. After this section, prv and st (prev->current state) are set; * k also gets set if we make a B->Mk entry transition. */ prv = st; switch (st) { case p7T_B: if (p7_profile_IsLocal(gm)) { /* local mode: enter the implicit profile: choose our entry and our predestined exit */ if ((status = sample_endpoints(r, gm, &k, &kend)) != eslOK) goto ERROR; st = p7T_M; /* must be, because left wing is retracted */ } else { /* glocal mode: treat B as M_0, use its transitions to MID. */ /* FIXME: this is wrong. It should sample from B->Mk distribution! */ switch (esl_rnd_FChoose(r, P7H_TMAT(hmm, 0), p7H_NTMAT)) { case 0: st = p7T_M; k = 1; break; case 1: st = p7T_I; k = 0; break; case 2: st = p7T_D; k = 1; break; default: ESL_XEXCEPTION(eslEINCONCEIVABLE, "impossible."); } } break; case p7T_M: if (k == kend) st = p7T_E; /* check our preordained fate */ else { switch (esl_rnd_FChoose(r, P7H_TMAT(hmm, k), p7H_NTMAT)) { case 0: st = p7T_M; break; case 1: st = p7T_I; break; case 2: st = p7T_D; break; default: ESL_XEXCEPTION(eslEINCONCEIVABLE, "impossible."); } } break; case p7T_D: if (k == kend) st = p7T_E; else st = (esl_rnd_FChoose(r, P7H_TDEL(hmm, k), p7H_NTDEL) == 0) ? p7T_M : p7T_D; break; case p7T_I: st = (esl_rnd_FChoose(r, P7H_TINS(hmm, k), p7H_NTINS) == 0) ? p7T_M : p7T_I; break; case p7T_N: st = (esl_rnd_FChoose(r, xt[p7P_N], p7P_NXTRANS) == p7P_MOVE) ? p7T_B : p7T_N; break; case p7T_E: st = (esl_rnd_FChoose(r, xt[p7P_E], p7P_NXTRANS) == p7P_MOVE) ? p7T_C : p7T_J; break; case p7T_C: st = (esl_rnd_FChoose(r, xt[p7P_C], p7P_NXTRANS) == p7P_MOVE) ? p7T_T : p7T_C; break; case p7T_J: st = (esl_rnd_FChoose(r, xt[p7P_J], p7P_NXTRANS) == p7P_MOVE) ? p7T_B : p7T_J; break; default: ESL_XEXCEPTION(eslECORRUPT, "impossible state reached during emission"); } /* Based on the transition we just sampled, update k. */ if (st == p7T_E) k = 0; else if (st == p7T_M && prv != p7T_B) k++; /* be careful about B->Mk, where we already set k */ else if (st == p7T_D) k++; /* Based on the transition we just sampled, generate a residue. */ if (st == p7T_M) x = esl_rnd_FChoose(r, hmm->mat[k], hmm->abc->K); else if (st == p7T_I) x = esl_rnd_FChoose(r, hmm->ins[k], hmm->abc->K); else if ((st == p7T_N || st == p7T_C || st == p7T_J) && prv==st) x = esl_rnd_FChoose(r, bg->f, hmm->abc->K); else x = eslDSQ_SENTINEL; if (x != eslDSQ_SENTINEL) i++; /* Add residue (if any) to sequence */ if (sq != NULL && x != eslDSQ_SENTINEL && (status = esl_sq_XAddResidue(sq, x)) != eslOK) goto ERROR; /* Add state to trace. */ if (tr != NULL) { if ((status = p7_trace_Append(tr, st, k, i)) != eslOK) goto ERROR; } } /* Terminate the trace and sequence (both are optional, remember) */ if (tr != NULL) { tr->M = hmm->M; tr->L = i; } if (sq != NULL && (status = esl_sq_XAddResidue(sq, eslDSQ_SENTINEL)) != eslOK) goto ERROR; return eslOK; ERROR: return status; }
/* Function: p7_GTrace() * Incept: SRE, Thu Feb 1 10:25:56 2007 [UA 8018 St. Louis to Dulles] * * Purpose: Traceback of a Viterbi matrix: retrieval * of optimum alignment. * * This function is currently implemented as a * reconstruction traceback, rather than using a shadow * matrix. Because H3 uses floating point scores, and we * can't compare floats for equality, we have to compare * floats for near-equality and therefore, formally, we can * only guarantee a near-optimal traceback. However, even in * the unlikely event that a suboptimal is returned, the * score difference from true optimal will be negligible. * * Args: dsq - digital sequence aligned to, 1..L * L - length of <dsq> * gm - profile * mx - Viterbi matrix to trace, L x M * tr - storage for the recovered traceback. * * Return: <eslOK> on success. * <eslFAIL> if even the optimal path has zero probability; * in this case, the trace is set blank (<tr->N = 0>). * * Note: Care is taken to evaluate the prev+tsc+emission * calculations in exactly the same order that Viterbi did * them, lest you get numerical problems with * a+b+c = d; d-c != a+b because d,c are nearly equal. * (This bug appeared in dev: xref J1/121.) */ int p7_GTrace(const ESL_DSQ *dsq, int L, const P7_PROFILE *gm, const P7_GMX *gx, P7_TRACE *tr) { int i = L; /* position in seq (1..L) */ int k = 0; /* position in model (1..M) */ int M = gm->M; float **dp = gx->dp; /* so {MDI}MX() macros work */ float *xmx = gx->xmx; /* so XMX() macro works */ float tol = 1e-5; /* floating point "equality" test */ float const *tsc = gm->tsc; int sprv, scur; /* previous, current state in trace */ int status; #ifdef p7_DEBUGGING if (tr->N != 0) ESL_EXCEPTION(eslEINVAL, "trace isn't empty: forgot to Reuse()?"); #endif if ((status = p7_trace_Append(tr, p7T_T, k, i)) != eslOK) return status; if ((status = p7_trace_Append(tr, p7T_C, k, i)) != eslOK) return status; sprv = p7T_C; while (sprv != p7T_S) { float const *rsc = (i>0 ? gm->rsc[dsq[i]] : NULL); switch (sprv) { case p7T_C: /* C(i) comes from C(i-1) or E(i) */ if (XMX(i,p7G_C) == -eslINFINITY) ESL_EXCEPTION(eslFAIL, "impossible C reached at i=%d", i); if (esl_FCompare(XMX(i, p7G_C), XMX(i-1, p7G_C) + gm->xsc[p7P_C][p7P_LOOP], tol) == eslOK) scur = p7T_C; else if (esl_FCompare(XMX(i, p7G_C), XMX(i, p7G_E) + gm->xsc[p7P_E][p7P_MOVE], tol) == eslOK) scur = p7T_E; else ESL_EXCEPTION(eslFAIL, "C at i=%d couldn't be traced", i); break; case p7T_E: /* E connects from any M state. k set here */ if (XMX(i, p7G_E) == -eslINFINITY) ESL_EXCEPTION(eslFAIL, "impossible E reached at i=%d", i); if (p7_profile_IsLocal(gm)) { scur = p7T_M; /* can't come from D, in a *local* Viterbi trace. */ for (k = M; k >= 1; k--) if (esl_FCompare(XMX(i, p7G_E), MMX(i,k), tol) == eslOK) break; if (k == 0) ESL_EXCEPTION(eslFAIL, "E at i=%d couldn't be traced", i); } else /* glocal mode: we either come from D_M or M_M */ { if (esl_FCompare(XMX(i, p7G_E), MMX(i,M), tol) == eslOK) { scur = p7T_M; k = M; } else if (esl_FCompare(XMX(i, p7G_E), DMX(i,M), tol) == eslOK) { scur = p7T_D; k = M; } else ESL_EXCEPTION(eslFAIL, "E at i=%d couldn't be traced", i); } break; case p7T_M: /* M connects from i-1,k-1, or B */ if (MMX(i,k) == -eslINFINITY) ESL_EXCEPTION(eslFAIL, "impossible M reached at k=%d,i=%d", k,i); if (esl_FCompare(MMX(i,k), XMX(i-1,p7G_B) + TSC(p7P_BM, k-1) + MSC(k), tol) == eslOK) scur = p7T_B; else if (esl_FCompare(MMX(i,k), MMX(i-1,k-1) + TSC(p7P_MM, k-1) + MSC(k), tol) == eslOK) scur = p7T_M; else if (esl_FCompare(MMX(i,k), IMX(i-1,k-1) + TSC(p7P_IM, k-1) + MSC(k), tol) == eslOK) scur = p7T_I; else if (esl_FCompare(MMX(i,k), DMX(i-1,k-1) + TSC(p7P_DM, k-1) + MSC(k), tol) == eslOK) scur = p7T_D; else ESL_EXCEPTION(eslFAIL, "M at k=%d,i=%d couldn't be traced", k,i); k--; i--; break; case p7T_D: /* D connects from M,D at i,k-1 */ if (DMX(i, k) == -eslINFINITY) ESL_EXCEPTION(eslFAIL, "impossible D reached at k=%d,i=%d", k,i); if (esl_FCompare(DMX(i,k), MMX(i, k-1) + TSC(p7P_MD, k-1), tol) == eslOK) scur = p7T_M; else if (esl_FCompare(DMX(i,k), DMX(i, k-1) + TSC(p7P_DD, k-1), tol) == eslOK) scur = p7T_D; else ESL_EXCEPTION(eslFAIL, "D at k=%d,i=%d couldn't be traced", k,i); k--; break; case p7T_I: /* I connects from M,I at i-1,k*/ if (IMX(i,k) == -eslINFINITY) ESL_EXCEPTION(eslFAIL, "impossible I reached at k=%d,i=%d", k,i); if (esl_FCompare(IMX(i,k), MMX(i-1,k) + TSC(p7P_MI, k) + ISC(k), tol) == eslOK) scur = p7T_M; else if (esl_FCompare(IMX(i,k), IMX(i-1,k) + TSC(p7P_II, k) + ISC(k), tol) == eslOK) scur = p7T_I; else ESL_EXCEPTION(eslFAIL, "I at k=%d,i=%d couldn't be traced", k,i); i--; break; case p7T_N: /* N connects from S, N */ if (XMX(i, p7G_N) == -eslINFINITY) ESL_EXCEPTION(eslFAIL, "impossible N reached at i=%d", i); scur = ( (i == 0) ? p7T_S : p7T_N); break; case p7T_B: /* B connects from N, J */ if (XMX(i,p7G_B) == -eslINFINITY) ESL_EXCEPTION(eslFAIL, "impossible B reached at i=%d", i); if (esl_FCompare(XMX(i,p7G_B), XMX(i, p7G_N) + gm->xsc[p7P_N][p7P_MOVE], tol) == eslOK) scur = p7T_N; else if (esl_FCompare(XMX(i,p7G_B), XMX(i, p7G_J) + gm->xsc[p7P_J][p7P_MOVE], tol) == eslOK) scur = p7T_J; else ESL_EXCEPTION(eslFAIL, "B at i=%d couldn't be traced", i); break; case p7T_J: /* J connects from E(i) or J(i-1) */ if (XMX(i,p7G_J) == -eslINFINITY) ESL_EXCEPTION(eslFAIL, "impossible J reached at i=%d", i); if (esl_FCompare(XMX(i,p7G_J), XMX(i-1,p7G_J) + gm->xsc[p7P_J][p7P_LOOP], tol) == eslOK) scur = p7T_J; else if (esl_FCompare(XMX(i,p7G_J), XMX(i, p7G_E) + gm->xsc[p7P_E][p7P_LOOP], tol) == eslOK) scur = p7T_E; else ESL_EXCEPTION(eslFAIL, "J at i=%d couldn't be traced", i); break; default: ESL_EXCEPTION(eslFAIL, "bogus state in traceback"); } /* end switch over statetype[tpos-1] */ /* Append this state and the current i,k to be explained to the growing trace */ if ((status = p7_trace_Append(tr, scur, k, i)) != eslOK) return status; /* For NCJ, we had to defer i decrement. */ if ( (scur == p7T_N || scur == p7T_J || scur == p7T_C) && scur == sprv) i--; sprv = scur; } /* end traceback, at S state */ tr->M = gm->M; tr->L = L; return p7_trace_Reverse(tr); }
/* Function: p7_GForward() * Synopsis: The Forward algorithm. * Incept: SRE, Mon Apr 16 13:57:35 2007 [Janelia] * * Purpose: The Forward dynamic programming algorithm. * * Given a digital sequence <dsq> of length <L>, a profile * <gm>, and DP matrix <gx> allocated for at least <gm->M> * by <L> cells; calculate the probability of the sequence * given the model using the Forward algorithm; return the * Forward matrix in <gx>, and the Forward score in <ret_sc>. * * The Forward score is in lod score form. To convert to a * bitscore, the caller needs to subtract a null model lod * score, then convert to bits. * * Args: dsq - sequence in digitized form, 1..L * L - length of dsq * gm - profile. * gx - DP matrix with room for an MxL alignment * opt_sc - optRETURN: Forward lod score in nats * * Return: <eslOK> on success. */ int p7_GForward(const ESL_DSQ *dsq, int L, const P7_PROFILE *gm, P7_GMX *gx, float *opt_sc) { float const *tsc = gm->tsc; float **dp = gx->dp; float *xmx = gx->xmx; int M = gm->M; int i, k; float esc = p7_profile_IsLocal(gm) ? 0 : -eslINFINITY; /* Initialization of the zero row, and the lookup table of the log * sum routine. */ XMX(0,p7G_N) = 0; /* S->N, p=1 */ XMX(0,p7G_B) = gm->xsc[p7P_N][p7P_MOVE]; /* S->N->B, no N-tail */ XMX(0,p7G_E) = XMX(0,p7G_C) = XMX(0,p7G_J) = -eslINFINITY; /* need seq to get here */ for (k = 0; k <= M; k++) MMX(0,k) = IMX(0,k) = DMX(0,k) = -eslINFINITY; /* need seq to get here */ p7_FLogsumInit(); /* Recursion. Done as a pull. * Note some slightly wasteful boundary conditions: * tsc[0] = impossible for all eight transitions (no node 0) * D_1 is wastefully calculated (doesn't exist) */ for (i = 1; i <= L; i++) { float const *rsc = gm->rsc[dsq[i]]; float sc; MMX(i,0) = IMX(i,0) = DMX(i,0) = -eslINFINITY; XMX(i, p7G_E) = -eslINFINITY; for (k = 1; k < M; k++) { /* match state */ sc = p7_FLogsum(p7_FLogsum(MMX(i-1,k-1) + TSC(p7P_MM,k-1), IMX(i-1,k-1) + TSC(p7P_IM,k-1)), p7_FLogsum(XMX(i-1,p7G_B) + TSC(p7P_BM,k-1), DMX(i-1,k-1) + TSC(p7P_DM,k-1))); MMX(i,k) = sc + MSC(k); /* insert state */ sc = p7_FLogsum(MMX(i-1,k) + TSC(p7P_MI,k), IMX(i-1,k) + TSC(p7P_II,k)); IMX(i,k) = sc + ISC(k); /* delete state */ DMX(i,k) = p7_FLogsum(MMX(i,k-1) + TSC(p7P_MD,k-1), DMX(i,k-1) + TSC(p7P_DD,k-1)); /* E state update */ XMX(i,p7G_E) = p7_FLogsum(p7_FLogsum(MMX(i,k) + esc, DMX(i,k) + esc), XMX(i,p7G_E)); } /* unrolled match state M_M */ sc = p7_FLogsum(p7_FLogsum(MMX(i-1,M-1) + TSC(p7P_MM,M-1), IMX(i-1,M-1) + TSC(p7P_IM,M-1)), p7_FLogsum(XMX(i-1,p7G_B) + TSC(p7P_BM,M-1), DMX(i-1,M-1) + TSC(p7P_DM,M-1))); MMX(i,M) = sc + MSC(M); IMX(i,M) = -eslINFINITY; /* unrolled delete state D_M */ DMX(i,M) = p7_FLogsum(MMX(i,M-1) + TSC(p7P_MD,M-1), DMX(i,M-1) + TSC(p7P_DD,M-1)); /* unrolled E state update */ XMX(i,p7G_E) = p7_FLogsum(p7_FLogsum(MMX(i,M), DMX(i,M)), XMX(i,p7G_E)); /* J state */ XMX(i,p7G_J) = p7_FLogsum(XMX(i-1,p7G_J) + gm->xsc[p7P_J][p7P_LOOP], XMX(i, p7G_E) + gm->xsc[p7P_E][p7P_LOOP]); /* C state */ XMX(i,p7G_C) = p7_FLogsum(XMX(i-1,p7G_C) + gm->xsc[p7P_C][p7P_LOOP], XMX(i, p7G_E) + gm->xsc[p7P_E][p7P_MOVE]); /* N state */ XMX(i,p7G_N) = XMX(i-1,p7G_N) + gm->xsc[p7P_N][p7P_LOOP]; /* B state */ XMX(i,p7G_B) = p7_FLogsum(XMX(i, p7G_N) + gm->xsc[p7P_N][p7P_MOVE], XMX(i, p7G_J) + gm->xsc[p7P_J][p7P_MOVE]); } if (opt_sc != NULL) *opt_sc = XMX(L,p7G_C) + gm->xsc[p7P_C][p7P_MOVE]; gx->M = M; gx->L = L; return eslOK; }
/* Function: p7_GBackward() * Synopsis: The Backward algorithm. * Incept: SRE, Fri Dec 28 14:31:58 2007 [Janelia] * * Purpose: The Backward dynamic programming algorithm. * * Given a digital sequence <dsq> of length <L>, a profile * <gm>, and DP matrix <gx> allocated for at least <gm->M> * by <L> cells; calculate the probability of the sequence * given the model using the Backward algorithm; return the * Backward matrix in <gx>, and the Backward score in <ret_sc>. * * The Backward score is in lod score form. To convert to a * bitscore, the caller needs to subtract a null model lod * score, then convert to bits. * * Args: dsq - sequence in digitized form, 1..L * L - length of dsq * gm - profile * gx - DP matrix with room for an MxL alignment * opt_sc - optRETURN: Backward lod score in nats * * Return: <eslOK> on success. */ int p7_GBackward(const ESL_DSQ *dsq, int L, const P7_PROFILE *gm, P7_GMX *gx, float *opt_sc) { float const *tsc = gm->tsc; float const *rsc = NULL; float **dp = gx->dp; float *xmx = gx->xmx; int M = gm->M; int i, k; float esc = p7_profile_IsLocal(gm) ? 0 : -eslINFINITY; /* Note: backward calculates the probability we can get *out* of * cell i,k; exclusive of emitting residue x_i. */ p7_FLogsumInit(); /* Initialize the L row. */ XMX(L,p7G_J) = XMX(L,p7G_B) = XMX(L,p7G_N) = -eslINFINITY; XMX(L,p7G_C) = gm->xsc[p7P_C][p7P_MOVE]; /* C<-T */ XMX(L,p7G_E) = XMX(L,p7G_C) + gm->xsc[p7P_E][p7P_MOVE]; /* E<-C, no tail */ MMX(L,M) = DMX(L,M) = XMX(L,p7G_E); /* {MD}_M <- E (prob 1.0) */ IMX(L,M) = -eslINFINITY; /* no I_M state */ for (k = M-1; k >= 1; k--) { MMX(L,k) = p7_FLogsum( XMX(L,p7G_E) + esc, DMX(L, k+1) + TSC(p7P_MD,k)); DMX(L,k) = p7_FLogsum( XMX(L,p7G_E) + esc, DMX(L, k+1) + TSC(p7P_DD,k)); IMX(L,k) = -eslINFINITY; } /* Main recursion */ for (i = L-1; i >= 1; i--) { rsc = gm->rsc[dsq[i+1]]; XMX(i,p7G_B) = MMX(i+1,1) + TSC(p7P_BM,0) + MSC(1); /* t_BM index is 0 because it's stored off-by-one. */ for (k = 2; k <= M; k++) XMX(i,p7G_B) = p7_FLogsum(XMX(i, p7G_B), MMX(i+1,k) + TSC(p7P_BM,k-1) + MSC(k)); XMX(i,p7G_J) = p7_FLogsum( XMX(i+1,p7G_J) + gm->xsc[p7P_J][p7P_LOOP], XMX(i, p7G_B) + gm->xsc[p7P_J][p7P_MOVE]); XMX(i,p7G_C) = XMX(i+1,p7G_C) + gm->xsc[p7P_C][p7P_LOOP]; XMX(i,p7G_E) = p7_FLogsum( XMX(i, p7G_J) + gm->xsc[p7P_E][p7P_LOOP], XMX(i, p7G_C) + gm->xsc[p7P_E][p7P_MOVE]); XMX(i,p7G_N) = p7_FLogsum( XMX(i+1,p7G_N) + gm->xsc[p7P_N][p7P_LOOP], XMX(i, p7G_B) + gm->xsc[p7P_N][p7P_MOVE]); MMX(i,M) = DMX(i,M) = XMX(i,p7G_E); IMX(i,M) = -eslINFINITY; for (k = M-1; k >= 1; k--) { MMX(i,k) = p7_FLogsum( p7_FLogsum(MMX(i+1,k+1) + TSC(p7P_MM,k) + MSC(k+1), IMX(i+1,k) + TSC(p7P_MI,k) + ISC(k)), p7_FLogsum(XMX(i,p7G_E) + esc, DMX(i, k+1) + TSC(p7P_MD,k))); IMX(i,k) = p7_FLogsum( MMX(i+1,k+1) + TSC(p7P_IM,k) + MSC(k+1), IMX(i+1,k) + TSC(p7P_II,k) + ISC(k)); DMX(i,k) = p7_FLogsum( MMX(i+1,k+1) + TSC(p7P_DM,k) + MSC(k+1), p7_FLogsum( DMX(i, k+1) + TSC(p7P_DD,k), XMX(i, p7G_E) + esc)); } } /* At i=0, only N,B states are reachable. */ rsc = gm->rsc[dsq[1]]; XMX(0,p7G_B) = MMX(1,1) + TSC(p7P_BM,0) + MSC(1); /* t_BM index is 0 because it's stored off-by-one. */ for (k = 2; k <= M; k++) XMX(0,p7G_B) = p7_FLogsum(XMX(0, p7G_B), MMX(1,k) + TSC(p7P_BM,k-1) + MSC(k)); XMX(i,p7G_J) = -eslINFINITY; XMX(i,p7G_C) = -eslINFINITY; XMX(i,p7G_E) = -eslINFINITY; XMX(i,p7G_N) = p7_FLogsum( XMX(1, p7G_N) + gm->xsc[p7P_N][p7P_LOOP], XMX(0, p7G_B) + gm->xsc[p7P_N][p7P_MOVE]); for (k = M; k >= 1; k--) MMX(i,M) = IMX(i,M) = DMX(i,M) = -eslINFINITY; if (opt_sc != NULL) *opt_sc = XMX(0,p7G_N); gx->M = M; gx->L = L; return eslOK; }