/* Function:  p7_MSVFilter()
 * Synopsis:  Fast, limited-precision MSV score (SSE2 byte-vector version).
 * Incept:    SRE, Wed Dec 26 15:12:25 2007 [Janelia]
 *
 * Purpose:   Computes an approximate MSV score for digital sequence
 *            <dsq> (residues 1..<L>) against optimized profile <om>,
 *            using <ox> as scratch space, and stores the estimate
 *            (in nats) in <*ret_sc>.
 *
 *            The SSV filter is tried first; whatever it returns is
 *            passed straight back to the caller unless it reports
 *            <eslENORESULT>, in which case the MSV recursion below runs.
 *
 *            Scores are held as unsigned bytes in offset arithmetic:
 *            0 stands for -infinity and <om->base_b> stands for zero.
 *            High-scoring sequences can (and will) overflow this
 *            range; the score will not underflow.
 *
 *            Only the match emission scores of <om> are read, so the
 *            model may be configured in any mode. The filter itself
 *            assumes multihit local alignment and applies its own
 *            special-state costs (<tjb_b>, <tbm_b>, <tec_b>).
 *
 * Args:      dsq    - digital target sequence, 1..L
 *            L      - length of dsq in residues
 *            om     - optimized profile
 *            ox     - DP matrix
 *            ret_sc - RETURN: MSV score (in nats)
 *
 * Note:      Only <ox->dpb[0][0..Q-1]> is touched: a single row of M
 *            values, addressed directly rather than through the
 *            <{MDI}MX(q)> triplet macros. A matrix big enough for
 *            normal DP is therefore big enough here.
 *
 * Returns:   <eslOK> on success.
 *            <eslERANGE> if the score overflows the limited range;
 *            <*ret_sc> is then <eslINFINITY> and the sequence is a
 *            high-scoring hit.
 *
 * Throws:    <eslEINVAL> if <ox> allocation is too small.
 */
int
p7_MSVFilter(const ESL_DSQ *dsq, int L, const P7_OPROFILE *om, P7_OMX *ox, float *ret_sc)
{
  const int nseg   = p7O_NQB(om->M);  /* segment length: number of vectors per row       */
  __m128i  *row    = ox->dpb[0];      /* the one DP row we use, row[0..nseg-1]           */
  __m128i  *emit;                     /* om->rbv[x] for the current residue x            */
  __m128i   prev_m;                   /* M values of the previous row, shifted by one    */
  __m128i   cur;                      /* M value being computed for the current row      */
  __m128i   e_max;                    /* E state: running max over Mk->E                 */
  __m128i   b_vec;                    /* B state of the previous row, in every element   */
  __m128i   j_vec;                    /* J/C state, in every element                     */
  __m128i   bias_vec;                 /* emission bias                                   */
  __m128i   base_vec;                 /* score offset representing zero                  */
  __m128i   entry_cost;               /* cost of J or N -> B -> Mk                       */
  __m128i   exit_cost;                /* cost of E -> C                                  */
  __m128i   sat_vec;                  /* all-ones vector: the saturated byte value       */
  __m128i   scratch;                  /* work vector                                     */
  uint8_t   j_byte = 0;               /* scalar copy of the J score                      */
  int       pos;                      /* sequence position, 1..L                         */
  int       seg;                      /* vector index, 0..nseg-1                         */
  int       ovf_mask;                 /* nonzero when some cell saturated in this row    */
  int       status;

  /* The DP matrix must hold at least nseg byte vectors. */
  if (nseg > ox->allocQ16)
    {
      ESL_EXCEPTION(eslEINVAL, "DP matrix allocated too small");
    }
  ox->M = om->M;

  /* Give the SSV filter the first shot; fall through only on eslENORESULT. */
  status = p7_SSVFilter(dsq, L, om, ret_sc);
  if (status != eslENORESULT)
    {
      return status;
    }

  /* Row 0: every M cell is -infinity, which is 0 in offset arithmetic. */
  for (seg = 0; seg < nseg; seg++)
    {
      row[seg] = _mm_setzero_si128();
    }

  /* set1_epi8() of a value cast to int8_t fills an unsigned byte vector just as well. */
  bias_vec   = _mm_set1_epi8((int8_t) om->bias_b);
  base_vec   = _mm_set1_epi8((int8_t) om->base_b);
  entry_cost = _mm_set1_epi8((int8_t) om->tjb_b + (int8_t) om->tbm_b);
  exit_cost  = _mm_set1_epi8((int8_t) om->tec_b);

  /* Comparing a vector with itself sets every bit: the reference for the overflow test. */
  sat_vec = _mm_cmpeq_epi8(bias_vec, bias_vec);

  j_vec = _mm_setzero_si128();
  b_vec = _mm_subs_epu8(base_vec, entry_cost);

#if p7_DEBUGGING
  if (ox->debugging)
    {
      uint8_t dbg_b;

      dbg_b  = _mm_extract_epi16(b_vec, 0);
      j_byte = _mm_extract_epi16(j_vec, 0);
      p7_omx_DumpMFRow(ox, 0, 0, 0, j_byte, dbg_b, j_byte);
    }
#endif

  for (pos = 1; pos <= L; pos++)
    {
      emit  = om->rbv[dsq[pos]];
      e_max = _mm_setzero_si128();

      /* Move the last vector of the previous row up by one byte
       * (4,8,12,x becomes x,4,8,12). The byte that shifts in is zero,
       * i.e. -infinity.
       */
      prev_m = _mm_slli_si128(row[nseg - 1], 1);

      for (seg = 0; seg < nseg; seg++)
        {
          /* New M(pos,seg): best of diagonal M and B, plus the biased emission score. */
          cur   = _mm_adds_epu8(_mm_max_epu8(prev_m, b_vec), bias_vec);
          cur   = _mm_subs_epu8(cur, emit[seg]);
          e_max = _mm_max_epu8(e_max, cur);

          /* Pick up M(pos-1,seg) before overwriting it with the new value. */
          prev_m   = row[seg];
          row[seg] = cur;
        }

      /* Overflow test: any element whose value plus bias saturates sets a mask bit. */
      scratch  = _mm_cmpeq_epi8(_mm_adds_epu8(e_max, bias_vec), sat_vec);
      ovf_mask = _mm_movemask_epi8(scratch);

      /* Special states, starting from Mk->E (->C, ->J->B).
       * Fold the 16 bytes of e_max down with shuffles so that the low
       * four bytes all end up holding the maximum; the final shuffle
       * then copies that 32-bit lane into every lane.
       */
      e_max = _mm_max_epu8(e_max, _mm_shuffle_epi32(e_max, _MM_SHUFFLE(2, 3, 0, 1)));
      e_max = _mm_max_epu8(e_max, _mm_shuffle_epi32(e_max, _MM_SHUFFLE(0, 1, 2, 3)));
      e_max = _mm_max_epu8(e_max, _mm_shufflelo_epi16(e_max, _MM_SHUFFLE(2, 3, 0, 1)));
      e_max = _mm_max_epu8(e_max, _mm_srli_si128(e_max, 1));
      e_max = _mm_shuffle_epi32(e_max, _MM_SHUFFLE(0, 0, 0, 0));

      /* Bail out as soon as a row has overflowed. */
      if (ovf_mask != 0x0000)
        {
          *ret_sc = eslINFINITY;
          return eslERANGE;
        }

      e_max = _mm_subs_epu8(e_max, exit_cost);
      j_vec = _mm_max_epu8(j_vec, e_max);
      b_vec = _mm_subs_epu8(_mm_max_epu8(base_vec, j_vec), entry_cost);

#if p7_DEBUGGING
      if (ox->debugging)
        {
          uint8_t dbg_b, dbg_e;

          dbg_b  = _mm_extract_epi16(b_vec, 0);
          dbg_e  = _mm_extract_epi16(e_max, 0);
          j_byte = _mm_extract_epi16(j_vec, 0);
          p7_omx_DumpMFRow(ox, pos, dbg_e, 0, j_byte, dbg_b, j_byte);
        }
#endif
    } /* end loop over sequence residues 1..L */

  j_byte = (uint8_t) _mm_extract_epi16(j_vec, 0);

  /* Finally C->T: undo the offset and scaling, then add back the
   * precision we dropped on the NN,CC,JJ transitions.
   */
  *ret_sc  = ((float) (j_byte - om->tjb_b) - (float) om->base_b);
  *ret_sc /= om->scale_b;
  *ret_sc -= 3.0; /* that's ~ L \log \frac{L}{L+3}, for our NN,CC,JJ */

  return eslOK;
}
/* Function:  p7_MSVFilter()
 * Synopsis:  Fast, limited-precision MSV score (AltiVec/VMX byte-vector version).
 * Incept:    SRE, Wed Dec 26 15:12:25 2007 [Janelia]
 *
 * Purpose:   Computes an approximate MSV score for digital sequence
 *            <dsq> (residues 1..<L>) against optimized profile <om>,
 *            using <ox> as scratch space, and stores the estimate
 *            (in nats) in <*ret_sc>.
 *
 *            Scores are held as unsigned bytes in offset arithmetic:
 *            0 stands for -infinity and <om->base_b> stands for zero.
 *            High-scoring sequences can (and will) overflow this
 *            range; the score will not underflow.
 *
 *            Only the match emission scores of <om> are read, so the
 *            model may be configured in any mode. The filter itself
 *            assumes multihit local alignment and applies its own
 *            special-state costs (<tjb_b>, <tbm_b>, <tec_b>).
 *
 * Args:      dsq    - digital target sequence, 1..L
 *            L      - length of dsq in residues
 *            om     - optimized profile
 *            ox     - DP matrix
 *            ret_sc - RETURN: MSV score (in nats)
 *
 * Note:      Only <ox->dpb[0][0..Q-1]> is touched: a single row of M
 *            values, addressed directly rather than through the
 *            <{MDI}MX(q)> triplet macros. A matrix big enough for
 *            normal DP is therefore big enough here.
 *
 * Returns:   <eslOK> on success.
 *            <eslERANGE> if the score overflows the limited range;
 *            <*ret_sc> is then <eslINFINITY> and the sequence is a
 *            high-scoring hit.
 *
 * Throws:    <eslEINVAL> if <ox> allocation is too small.
 */
int
p7_MSVFilter(const ESL_DSQ *dsq, int L, const P7_OPROFILE *om, P7_OMX *ox, float *ret_sc)
{
  const int             nseg = p7O_NQB(om->M); /* segment length: number of vectors per row    */
  vector unsigned char *row;                   /* the one DP row we use, row[0..nseg-1]        */
  vector unsigned char *emit;                  /* om->rbv[x] for the current residue x         */
  vector unsigned char  prev_m;                /* M values of the previous row, shifted by one */
  vector unsigned char  cur;                   /* M value being computed for the current row   */
  vector unsigned char  e_max;                 /* E state: running max over Mk->E              */
  vector unsigned char  b_vec;                 /* B state of the previous row, every element   */
  vector unsigned char  j_vec;                 /* J/C state, every element                     */
  vector unsigned char  bias_vec;              /* emission bias                                */
  vector unsigned char  base_vec;              /* score offset representing zero               */
  vector unsigned char  entry_cost;            /* cost of J or N -> B -> Mk                    */
  vector unsigned char  exit_cost;             /* cost of E -> C                               */
  vector unsigned char  zero_vec;              /* all zeros                                    */
  vector unsigned char  ceil_vec;              /* largest E value that is not an overflow      */
  vector unsigned char  scratch;               /* work vector                                  */
  uint8_t               j_byte = 0;            /* scalar copy of the J score                   */
  int                   pos;                   /* sequence position, 1..L                      */
  int                   seg;                   /* vector index, 0..nseg-1                      */

  /* The DP matrix must hold at least nseg byte vectors. */
  if (nseg > ox->allocQ16)
    {
      ESL_EXCEPTION(eslEINVAL, "DP matrix allocated too small");
    }
  ox->M = om->M;

  /* Row 0: every M cell is -infinity, which is 0 in offset arithmetic. */
  row = ox->dpb[0];
  for (seg = 0; seg < nseg; seg++)
    {
      row[seg] = vec_splat_u8(0);
    }

  bias_vec = esl_vmx_set_u8(om->bias_b);
  zero_vec = vec_splat_u8(0);

  /* Overflow ceiling: start from all-ones (a vector compared with
   * itself), then take off the bias and one more, with saturating
   * subtraction. Anything above this is treated as an overflow.
   */
  ceil_vec = (vector unsigned char) vec_cmpeq(bias_vec, bias_vec);
  ceil_vec = vec_subs(ceil_vec, bias_vec);
  ceil_vec = vec_subs(ceil_vec, vec_splat_u8(1));

  base_vec   = esl_vmx_set_u8((int8_t) om->base_b);
  exit_cost  = esl_vmx_set_u8((int8_t) om->tec_b);
  entry_cost = esl_vmx_set_u8((int8_t) om->tjb_b + (int8_t) om->tbm_b);

  j_vec = vec_splat_u8(0);
  b_vec = vec_subs(base_vec, entry_cost);

#if p7_DEBUGGING
  if (ox->debugging)
    {
      unsigned char dbg_b;

      vec_ste(b_vec, 0, &dbg_b);
      vec_ste(j_vec, 0, &j_byte);
      p7_omx_DumpMFRow(ox, 0, 0, 0, j_byte, dbg_b, j_byte);
    }
#endif

  for (pos = 1; pos <= L; pos++)
    {
      emit  = om->rbv[dsq[pos]];
      e_max = vec_splat_u8(0);

      /* Move the last vector of the previous row along by one element
       * (4,8,12,x becomes x,4,8,12). vec_sld() takes the vacated
       * element from zero_vec, so it is 0, our -infinity.
       */
      prev_m = vec_sld(zero_vec, row[nseg - 1], 15);

      for (seg = 0; seg < nseg; seg++)
        {
          /* New M(pos,seg): best of diagonal M and B, plus the biased emission score. */
          cur   = vec_adds(vec_max(prev_m, b_vec), bias_vec);
          cur   = vec_subs(cur, emit[seg]);
          e_max = vec_max(e_max, cur);

          /* Pick up M(pos-1,seg) before overwriting it with the new value. */
          prev_m   = row[seg];
          row[seg] = cur;
        }

      /* Special states, starting from Mk->E (->C, ->J->B).
       * Rotating e_max against itself by 1, 2, 4 and 8 bytes and taking
       * the max each time leaves the overall maximum in every element.
       */
      scratch = vec_sld(e_max, e_max, 1);
      e_max   = vec_max(e_max, scratch);
      scratch = vec_sld(e_max, e_max, 2);
      e_max   = vec_max(e_max, scratch);
      scratch = vec_sld(e_max, e_max, 4);
      e_max   = vec_max(e_max, scratch);
      scratch = vec_sld(e_max, e_max, 8);
      e_max   = vec_max(e_max, scratch);

      /* Bail out as soon as a row has overflowed. */
      if (vec_any_gt(e_max, ceil_vec))
        {
          *ret_sc = eslINFINITY;
          return eslERANGE;
        }

      e_max = vec_subs(e_max, exit_cost);
      j_vec = vec_max(j_vec, e_max);
      b_vec = vec_subs(vec_max(base_vec, j_vec), entry_cost);

#if p7_DEBUGGING
      if (ox->debugging)
        {
          unsigned char dbg_b, dbg_e;

          vec_ste(b_vec, 0, &dbg_b);
          vec_ste(e_max, 0, &dbg_e);
          vec_ste(j_vec, 0, &j_byte);
          p7_omx_DumpMFRow(ox, pos, dbg_e, 0, j_byte, dbg_b, j_byte);
        }
#endif
    } /* end loop over sequence residues 1..L */

  /* Finally C->T: pull one J element out as a scalar, undo the offset
   * and scaling, then add back the precision we dropped on the
   * NN,CC,JJ transitions.
   */
  vec_ste(j_vec, 0, &j_byte);
  *ret_sc  = ((float) (j_byte - om->tjb_b) - (float) om->base_b);
  *ret_sc /= om->scale_b;
  *ret_sc -= 3.0; /* that's ~ L \log \frac{L}{L+3}, for our NN,CC,JJ */

  return eslOK;
}