/* Function:  p7_GNull2_ByTrace()
 * Synopsis:  Assign null2 scores to an envelope by the sampling method.
 * Incept:    SRE, Thu May  1 10:00:43 2008 [Janelia]
 *
 * Purpose:   Given a traceback <tr> for an alignment of model <gm> to
 *            some target sequence, calculate the null2 odds ratios
 *            $\frac{f'(x)}{f(x)}$ as state-usage-weighted emission
 *            odds. State usages are obtained by counting the emitting
 *            trace elements at positions <zstart..zend> of <tr>.
 *
 *            Only state usages are collected from <tr>, so the target
 *            sequence itself is not needed. Only emission odds are
 *            read from <gm>, so the configuration of <gm> (uni
 *            vs. multihit, or length config) is irrelevant.
 *
 *            An N, C, or J trace element is counted as an emission
 *            only when the element before it is the same state; for
 *            those states <tr->st[z-1]> is read.
 *
 * Args:      gm     - model, in any configuration; only emission odds are used
 *            tr     - traceback for any region (or all) of a target sequence
 *            zstart - first elem in <tr> to collect from; use 0 for complete
 *            zend   - last elem in <tr> to collect from; use tr->N-1 for complete
 *            wrk    - DP matrix w/ at least one row, for workspace
 *            null2  - RESULT: odds ratios f'(x)/f(x) for all Kp residues
 *
 * Returns:   <eslOK> on success. <null2> holds the odds ratios, and
 *            row 0 of <wrk> (<dp[0]> and the first <p7G_NXCELLS>
 *            special cells) has been overwritten with the state usage
 *            frequencies.
 *
 * Throws:    (no abnormal error conditions)
 *
 * Note:      If <zstart..zend> contains no counted emission, the usage
 *            count is zero and the normalization below divides by
 *            zero; the caller is expected to pass a segment that emits
 *            at least one residue.
 */
int
p7_GNull2_ByTrace(const P7_PROFILE *gm, const P7_TRACE *tr, int zstart, int zend, P7_GMX *wrk, float *null2)
{
  float **dp     = wrk->dp;                 /* MMX(), IMX() macros need a local named <dp> */
  float  *xmx    = wrk->xmx;                /* XMX() macro needs a local named <xmx>       */
  int     M      = gm->M;
  int     K      = gm->abc->K;
  int     ncells = (M+1) * p7G_NSCELLS;     /* width of one main DP row                    */
  int     nemit  = 0;                       /* number of emissions counted in the segment  */
  float   scale;                            /* 1/nemit: turns counts into frequencies      */
  float   xfactor;                          /* summed usage of N, C, J                     */
  float   odds;                             /* running odds ratio for one residue          */
  int     k;                                /* index over model position                   */
  int     x;                                /* index over residues                         */
  int     z;                                /* index over trace position                   */

  /* Row 0 of the workspace collects the usage counts; clear it first. */
  esl_vec_FSet(dp[0], ncells,      0.0);
  esl_vec_FSet(xmx,   p7G_NXCELLS, 0.0);

  /* Count how often each emitting state is used in the trace segment. */
  for (z = zstart; z <= zend; z++)
    {
      const int st = tr->st[z];

      if (st == p7T_M)
        {
          nemit++;
          MMX(0, tr->k[z]) += 1.0;
        }
      else if (st == p7T_I)
        {
          nemit++;
          IMX(0, tr->k[z]) += 1.0;
        }
      else if (st == p7T_N)
        {
          if (tr->st[z-1] == p7T_N) { nemit++; XMX(0, p7G_N) += 1.0; }
        }
      else if (st == p7T_C)
        {
          if (tr->st[z-1] == p7T_C) { nemit++; XMX(0, p7G_C) += 1.0; }
        }
      else if (st == p7T_J)
        {
          if (tr->st[z-1] == p7T_J) { nemit++; XMX(0, p7G_J) += 1.0; }
        }
      /* any other state type is ignored */
    }

  /* Counts -> frequencies. */
  scale = 1.0 / (float) nemit;
  esl_vec_FScale(dp[0], ncells,      scale);
  esl_vec_FScale(xmx,   p7G_NXCELLS, scale);

  /* Usage-weighted sum of emission odds over every emitting state.
   * N, C, J contribute their usage directly (added as-is, i.e. with
   * an odds ratio of 1). Insert states exist for k=1..M-1 only, so
   * node M adds a match term alone.
   */
  xfactor = XMX(0, p7G_N) + XMX(0, p7G_C) + XMX(0, p7G_J);
  for (x = 0; x < K; x++)
    {
      odds = 0.0;
      for (k = 1; k < M; k++)
        {
          odds += MMX(0, k) * expf(p7P_MSC(gm, k, x));
          odds += IMX(0, k) * expf(p7P_ISC(gm, k, x));
        }
      odds += MMX(0, M) * expf(p7P_MSC(gm, M, x));
      odds += xfactor;
      null2[x] = odds;
    }
  /* now null2[x] = \frac{f_d(x)}{f_0(x)} odds ratios for all x in alphabet,
   * 0..K-1, where f_d(x) are the ad hoc "null2" residue frequencies
   * for this envelope.
   */

  /* Make valid scores for all degeneracies, by averaging the odds
   * ratios; then set the three special symbols explicitly.
   */
  esl_abc_FAvgScVec(gm->abc, null2);
  null2[gm->abc->K]    = 1.0;        /* gap character    */
  null2[gm->abc->Kp-2] = 1.0;        /* nonresidue "*"   */
  null2[gm->abc->Kp-1] = 1.0;        /* missing data "~" */
  return eslOK;
}
/* Function: p7_Null2_ByExpectation() * Synopsis: Calculate null2 model from posterior probabilities. * Incept: SRE, Mon Aug 18 08:32:55 2008 [Janelia] * * Purpose: Identical to <p7_GNull2_ByExpectation()> except that * <om>, <pp> are SSE optimized versions of the profile * and the residue posterior probability matrix. See * <p7_GNull2_ByExpectation()> documentation. * * Args: om - profile, in any mode, target length model set to <L> * pp - posterior prob matrix, for <om> against domain envelope <dsq+i-1> (offset) * null2 - RETURN: null2 log odds scores per residue; <0..Kp-1>; caller allocated space */ int p7_Null2_ByExpectation(const P7_OPROFILE *om, const P7_OMX *pp, float *null2) { int M = om->M; int Ld = pp->L; int Q = p7O_NQF(M); float *xmx = pp->xmx; /* enables use of XMXo(i,s) macro */ float norm; __m128 *rp; __m128 sv; float xfactor; int i,q,x; /* Calculate expected # of times that each emitting state was used * in generating the Ld residues in this domain. * The 0 row in <wrk> is used to hold these numbers. */ memcpy(pp->dpf[0], pp->dpf[1], sizeof(__m128) * 3 * Q); XMXo(0,p7X_N) = XMXo(1,p7X_N); XMXo(0,p7X_C) = XMXo(1,p7X_C); /* 0.0 */ XMXo(0,p7X_J) = XMXo(1,p7X_J); /* 0.0 */ for (i = 2; i <= Ld; i++) { for (q = 0; q < Q; q++) { pp->dpf[0][q*3 + p7X_M] = _mm_add_ps(pp->dpf[i][q*3 + p7X_M], pp->dpf[0][q*3 + p7X_M]); pp->dpf[0][q*3 + p7X_I] = _mm_add_ps(pp->dpf[i][q*3 + p7X_I], pp->dpf[0][q*3 + p7X_I]); } XMXo(0,p7X_N) += XMXo(i,p7X_N); XMXo(0,p7X_C) += XMXo(i,p7X_C); XMXo(0,p7X_J) += XMXo(i,p7X_J); } /* Convert those expected #'s to frequencies, to use as posterior weights. 
*/ norm = 1.0 / (float) Ld; sv = _mm_set1_ps(norm); for (q = 0; q < Q; q++) { pp->dpf[0][q*3 + p7X_M] = _mm_mul_ps(pp->dpf[0][q*3 + p7X_M], sv); pp->dpf[0][q*3 + p7X_I] = _mm_mul_ps(pp->dpf[0][q*3 + p7X_I], sv); } XMXo(0,p7X_N) *= norm; XMXo(0,p7X_C) *= norm; XMXo(0,p7X_J) *= norm; /* Calculate null2's emission odds, by taking posterior weighted sum * over all emission vectors used in paths explaining the domain. */ xfactor = XMXo(0, p7X_N) + XMXo(0, p7X_C) + XMXo(0, p7X_J); for (x = 0; x < om->abc->K; x++) { sv = _mm_setzero_ps(); rp = om->rfv[x]; for (q = 0; q < Q; q++) { sv = _mm_add_ps(sv, _mm_mul_ps(pp->dpf[0][q*3 + p7X_M], *rp)); rp++; sv = _mm_add_ps(sv, pp->dpf[0][q*3 + p7X_I]); /* insert odds implicitly 1.0 */ // sv = _mm_add_ps(sv, _mm_mul_ps(pp->dpf[0][q*3 + p7X_I], *rp)); rp++; } esl_sse_hsum_ps(sv, &(null2[x])); null2[x] += xfactor; } /* now null2[x] = \frac{f_d(x)}{f_0(x)} for all x in alphabet, * 0..K-1, where f_d(x) are the ad hoc "null2" residue frequencies * for this envelope. */ /* make valid scores for all degeneracies, by averaging the odds ratios. */ esl_abc_FAvgScVec(om->abc, null2); null2[om->abc->K] = 1.0; /* gap character */ null2[om->abc->Kp-2] = 1.0; /* nonresidue "*" */ null2[om->abc->Kp-1] = 1.0; /* missing data "~" */ return eslOK; }
/* Function:  p7_Null2_ByTrace()
 * Synopsis:  Assign null2 scores to an envelope by the sampling method.
 * Incept:    SRE, Mon Aug 18 10:22:49 2008 [Janelia]
 *
 * Purpose:   Identical to <p7_GNull2_ByTrace()> except that
 *            <om>, <wrk> are SSE optimized versions of the profile
 *            and the residue posterior probability matrix. See
 *            <p7_GNull2_ByTrace()> documentation.
 *
 *            Emitting trace elements in <zstart..zend> are counted
 *            into row 0 of <wrk>: match and insert emissions into
 *            their own striped cells, and N, C, J emissions into the
 *            special cells. Counts are normalized to frequencies and
 *            combined with the match emission odds in <om->rfv>;
 *            insert, N, C, and J usage is added as-is, i.e. with an
 *            emission odds ratio of 1.0.
 *
 * Args:      om     - profile; only the match emission odds <om->rfv> are read
 *            tr     - traceback for any region (or all) of a target sequence
 *            zstart - first elem in <tr> to collect from
 *            zend   - last elem in <tr> to collect from
 *            wrk    - optimized DP matrix; its row 0 is used as workspace
 *            null2  - RESULT: odds ratios f'(x)/f(x); caller allocated space
 *
 * Returns:   <eslOK> on success. <null2> holds the odds ratios, and
 *            row 0 of <wrk> has been overwritten with state usage
 *            frequencies.
 *
 * Note:      If <zstart..zend> contains no emitting element, the
 *            usage count is zero and the normalization divides by
 *            zero; the caller is expected to pass a segment that emits
 *            at least one residue.
 */
int
p7_Null2_ByTrace(const P7_OPROFILE *om, const P7_TRACE *tr, int zstart, int zend, P7_OMX *wrk, float *null2)
{
  union { __m128 v; float p[4]; } u;   /* gives access to one lane of a vector      */
  int      Q   = p7O_NQF(om->M);       /* number of striped vectors per row         */
  int      Ld  = 0;                    /* number of emissions seen in the segment   */
  float   *xmx = wrk->xmx;             /* enables use of XMXo macro                 */
  float    norm;
  float    xfactor;
  __m128   sv;
  __m128  *rp;
  int      q, r, s;
  int      x;
  int      z;

  /* We'll use the i=0 row in wrk for working space: dpf[0][] and the
   * row-0 N, C, J special cells.
   */
  for (q = 0; q < Q; q++)
    {
      wrk->dpf[0][q*3 + p7X_M] = _mm_setzero_ps();
      wrk->dpf[0][q*3 + p7X_I] = _mm_setzero_ps();
    }
  XMXo(0,p7X_N) = 0.0;
  XMXo(0,p7X_C) = 0.0;
  XMXo(0,p7X_J) = 0.0;

  /* Calculate emitting state usage in this particular trace segment */
  for (z = zstart; z <= zend; z++)
    {
      if (tr->i[z] == 0) continue;   /* quick test for whether this trace elem emitted or not */
      Ld++;

      if (tr->k[z] > 0)              /* must be an M or I */
        {
          /* The workspace is striped: model position k lives in
           * vector (k-1)%Q, lane (k-1)/Q. <s> selects the M or I cell
           * of that vector, so that insert emissions are tallied in
           * the insert cell and not mixed into the match counts.
           */
          s = ( (tr->st[z] == p7T_M) ? p7X_M : p7X_I);
          q = p7X_NSCELLS * ( (tr->k[z] - 1) % Q) + s;
          r = (tr->k[z] - 1) / Q;

          u.v            = wrk->dpf[0][q];
          u.p[r]        += 1.0;      /* all this to increment a count by one! */
          wrk->dpf[0][q] = u.v;
        }
      else                           /* emitted an x_i with no k; must be an N,C,J */
        {
          switch (tr->st[z]) {
          case p7T_N: XMXo(0,p7X_N) += 1.0; break;
          case p7T_C: XMXo(0,p7X_C) += 1.0; break;
          case p7T_J: XMXo(0,p7X_J) += 1.0; break;
          }
        }
    }

  /* Convert the counts to frequencies, to use as weights. */
  norm = 1.0 / (float) Ld;
  sv   = _mm_set1_ps(norm);
  for (q = 0; q < Q; q++)
    {
      wrk->dpf[0][q*3 + p7X_M] = _mm_mul_ps(wrk->dpf[0][q*3 + p7X_M], sv);
      wrk->dpf[0][q*3 + p7X_I] = _mm_mul_ps(wrk->dpf[0][q*3 + p7X_I], sv);
    }
  XMXo(0,p7X_N) *= norm;
  XMXo(0,p7X_C) *= norm;
  XMXo(0,p7X_J) *= norm;

  /* Calculate null2's emission odds, by taking the usage-weighted sum
   * over all emission vectors. <rp> advances once per q: only match
   * odds are read from <om->rfv[x]>, while the insert weight is added
   * unscaled (insert emission odds implicitly 1.0).
   */
  xfactor = XMXo(0,p7X_N) + XMXo(0,p7X_C) + XMXo(0,p7X_J);
  for (x = 0; x < om->abc->K; x++)
    {
      sv = _mm_setzero_ps();
      rp = om->rfv[x];
      for (q = 0; q < Q; q++)
        {
          sv = _mm_add_ps(sv, _mm_mul_ps(wrk->dpf[0][q*3 + p7X_M], *rp));
          rp++;
          sv = _mm_add_ps(sv, wrk->dpf[0][q*3 + p7X_I]);
        }
      esl_sse_hsum_ps(sv, &(null2[x]));
      null2[x] += xfactor;
    }
  /* now null2[x] = \frac{f_d(x)}{f_0(x)} for all x in alphabet,
   * 0..K-1, where f_d(x) are the ad hoc "null2" residue frequencies
   * for this envelope.
   */

  /* make valid scores for all degeneracies, by averaging the odds ratios. */
  esl_abc_FAvgScVec(om->abc, null2);
  null2[om->abc->K]    = 1.0;        /* gap character    */
  null2[om->abc->Kp-2] = 1.0;        /* nonresidue "*"   */
  null2[om->abc->Kp-1] = 1.0;        /* missing data "~" */
  return eslOK;
}
/* Function:  p7_GNull2_ByExpectation()
 * Synopsis:  Calculate null2 model from posterior probabilities.
 * Incept:    SRE, Thu Feb 28 09:52:28 2008 [Janelia]
 *
 * Purpose:   Calculate the "null2" model for the envelope encompassed
 *            by a posterior probability calculation <pp> for model
 *            <gm>. Return the null2 odds emission probabilities
 *            $\frac{f'(x)}{f(x)}$ in <null2>, which caller
 *            provides as space for at least <alphabet->Kp> residues.
 *
 *            The expectation method is applied to envelopes in
 *            simple, well resolved regions (regions containing just a
 *            single envelope, where no stochastic traceback
 *            clustering was required).
 *
 *            Make sure that the posterior probability matrix <pp> has
 *            been calculated by the caller for only the envelope; thus
 *            its rows are numbered <1..Ld>, for envelope <ienv..jenv>
 *            of length <Ld=jenv-ienv+1>.
 *
 * Args:      gm    - profile, in any mode, target length model set to <L>
 *            pp    - posterior prob matrix, for <gm> against domain envelope <dsq+i-1> (offset)
 *            null2 - RETURN: null2 odds ratios per residue; <0..Kp-1>; caller allocated space
 *
 * Returns:   <eslOK> on success; <null2> contains the null2 odds
 *            ratios. The 0 row of <pp> has been used as temp space,
 *            and on return holds the log of the expected frequency
 *            with which each M,I,N,C,J state is used in this <pp>
 *            matrix to generate residues.
 *
 * Throws:    (no abnormal error conditions)
 */
int
p7_GNull2_ByExpectation(const P7_PROFILE *gm, P7_GMX *pp, float *null2)
{
  float **dp     = pp->dp;                  /* MMX(), IMX() macros need a local named <dp> */
  float  *xmx    = pp->xmx;                 /* XMX() macro needs a local named <xmx>       */
  int     M      = gm->M;
  int     Ld     = pp->L;
  int     K      = gm->abc->K;
  int     ncells = (M+1) * p7G_NSCELLS;     /* width of one main DP row                    */
  float   xfactor;                          /* log of summed N, C, J usage                 */
  float   lsum;                             /* running log-sum for one residue             */
  int     x;                                /* over symbols 0..K-1                         */
  int     i;                                /* over offset envelope dsq positions 1..Ld    */
  int     k;                                /* over model M states 1..M, I states 1..M-1   */

  /* Expected number of times each emitting state was used in
   * generating the Ld residues of this domain. Row 0 of <pp> holds
   * the totals: it starts as a copy of row 1, then rows 2..Ld are
   * added to it.
   */
  esl_vec_FCopy(dp[1],             ncells,      dp[0]);
  esl_vec_FCopy(xmx + p7G_NXCELLS, p7G_NXCELLS, xmx);
  for (i = 2; i <= Ld; i++)
    {
      esl_vec_FAdd(dp[0], dp[i],                 ncells);
      esl_vec_FAdd(xmx,   xmx + i*p7G_NXCELLS,   p7G_NXCELLS);
    }

  /* Expected counts -> log frequencies (log count minus log Ld);
   * these are the log posterior weights.
   */
  esl_vec_FLog      (dp[0], ncells);
  esl_vec_FIncrement(dp[0], ncells,      -log((float)Ld));
  esl_vec_FLog      (xmx,   p7G_NXCELLS);
  esl_vec_FIncrement(xmx,   p7G_NXCELLS, -log((float)Ld));

  /* Log-space posterior-weighted sum over all emission vectors used
   * in paths explaining the domain. N, C, J contribute their log
   * usage directly. This is dog-slow; a point for future
   * optimization.
   */
  xfactor = p7_FLogsum(p7_FLogsum(XMX(0, p7G_N), XMX(0, p7G_C)), XMX(0, p7G_J));

  for (x = 0; x < K; x++)
    {
      lsum = -eslINFINITY;
      for (k = 1; k < M; k++)
        {
          lsum = p7_FLogsum(lsum, MMX(0, k) + p7P_MSC(gm, k, x));
          lsum = p7_FLogsum(lsum, IMX(0, k) + p7P_ISC(gm, k, x));
        }
      /* <k> is left at its loop-exit value, which is M for M >= 1,
       * so this adds the match emission of the final node.
       */
      lsum = p7_FLogsum(lsum, MMX(0, M) + p7P_MSC(gm, k, x));
      lsum = p7_FLogsum(lsum, xfactor);
      null2[x] = lsum;
    }

  esl_vec_FExp(null2, K);
  /* now null2[x] = \frac{f_d(x)}{f_0(x)} for all x in alphabet,
   * 0..K-1, where f_d(x) are the ad hoc "null2" residue frequencies
   * for this envelope.
   */

  /* make valid scores for all degeneracies, by averaging the odds ratios. */
  esl_abc_FAvgScVec(gm->abc, null2);   /* does not set gap, nonres, missing */
  null2[gm->abc->K]    = 1.0;          /* gap character    */
  null2[gm->abc->Kp-2] = 1.0;          /* nonresidue "*"   */
  null2[gm->abc->Kp-1] = 1.0;          /* missing data "~" */
  return eslOK;
}