/* M(i,k) is reached from B(i-1), M(i-1,k-1), D(i-1,k-1), or I(i-1,k-1). */
static inline int
select_m(const P7_OPROFILE *om, const P7_OMX *ox, int i, int k)
{
  int Q = p7O_NQF(ox->M);
  int q = (k-1) % Q;                    /* (q,r) is position of the current DP cell M(i,k) */
  int r = (k-1) / Q;
  vector float *tp = om->tfv + 7*q;     /* *tp now at start of transitions to cur cell M(i,k) */
  vector float  xBv;
  vector float  zerov;
  vector float  mpv, dpv, ipv;
  union { vector float v; float p[4]; } u, tv;
  float path[4];
  int   state[4] = { p7T_M, p7T_I, p7T_D, p7T_B };

  xBv   = esl_vmx_set_float(ox->xmx[(i-1)*p7X_NXCELLS+p7X_B]);
  zerov = (vector float) vec_splat_u32(0);

  if (q > 0) {
    mpv = ox->dpf[i-1][(q-1)*3 + p7X_M];
    dpv = ox->dpf[i-1][(q-1)*3 + p7X_D];
    ipv = ox->dpf[i-1][(q-1)*3 + p7X_I];
  } else {
    mpv = vec_sld(zerov, ox->dpf[i-1][(Q-1)*3 + p7X_M], 12);
    dpv = vec_sld(zerov, ox->dpf[i-1][(Q-1)*3 + p7X_D], 12);
    ipv = vec_sld(zerov, ox->dpf[i-1][(Q-1)*3 + p7X_I], 12);
  }

  /* paths are numbered so that most desirable choice in case of tie is first. */
  u.v = xBv;  tv.v = *tp;  path[3] = ((tv.p[r] == 0.0) ? -eslINFINITY : u.p[r]);  tp++;
  u.v = mpv;  tv.v = *tp;  path[0] = ((tv.p[r] == 0.0) ? -eslINFINITY : u.p[r]);  tp++;
  u.v = ipv;  tv.v = *tp;  path[1] = ((tv.p[r] == 0.0) ? -eslINFINITY : u.p[r]);  tp++;
  u.v = dpv;  tv.v = *tp;  path[2] = ((tv.p[r] == 0.0) ? -eslINFINITY : u.p[r]);
  return state[esl_vec_FArgMax(path, 4)];
}
/* M(i,k) is reached from B(i-1), M(i-1,k-1), D(i-1,k-1), or I(i-1,k-1). */
static inline int
select_m(ESL_RANDOMNESS *rng, const P7_OPROFILE *om, const P7_OMX *ox, int i, int k)
{
  int Q = p7O_NQF(ox->M);
  int q = (k-1) % Q;                    /* (q,r) is position of the current DP cell M(i,k) */
  int r = (k-1) / Q;
  vector float *tp = om->tfv + 7*q;     /* *tp now at start of transitions to cur cell M(i,k) */
  vector float  xBv;
  vector float  zerov;
  vector float  mpv, dpv, ipv;
  union { vector float v; float p[4]; } u;
  float path[4];
  int   state[4] = { p7T_B, p7T_M, p7T_I, p7T_D };

  xBv   = esl_vmx_set_float(ox->xmx[(i-1)*p7X_NXCELLS+p7X_B]);
  zerov = (vector float) vec_splat_u32(0);

  if (q > 0) {
    mpv = ox->dpf[i-1][(q-1)*3 + p7X_M];
    dpv = ox->dpf[i-1][(q-1)*3 + p7X_D];
    ipv = ox->dpf[i-1][(q-1)*3 + p7X_I];
  } else {
    mpv = vec_sld(zerov, ox->dpf[i-1][(Q-1)*3 + p7X_M], 12);
    dpv = vec_sld(zerov, ox->dpf[i-1][(Q-1)*3 + p7X_D], 12);
    ipv = vec_sld(zerov, ox->dpf[i-1][(Q-1)*3 + p7X_I], 12);
  }

  u.v = vec_madd(xBv, *tp, zerov);  tp++;  path[0] = u.p[r];
  u.v = vec_madd(mpv, *tp, zerov);  tp++;  path[1] = u.p[r];
  u.v = vec_madd(ipv, *tp, zerov);  tp++;  path[2] = u.p[r];
  u.v = vec_madd(dpv, *tp, zerov);         path[3] = u.p[r];

  esl_vec_FNorm(path, 4);
  return state[esl_rnd_FChoose(rng, path, 4)];
}
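/* The stochastic traceback above normalizes the four path weights and lets
 * esl_rnd_FChoose() pick an index in proportion to them. A generic sketch of
 * that weighted draw follows; it is not Easel's implementation, and the
 * rand()-based uniform variate is only a stand-in for the caller's RNG. */
#include <stdlib.h>

static int
weighted_choose(const float *p, int n)   /* p[] assumed normalized to sum to 1.0 */
{
  float roll = (float) rand() / ((float) RAND_MAX + 1.0f);   /* uniform in [0,1) */
  float sum  = 0.0f;
  int   i;

  for (i = 0; i < n; i++)
    {
      sum += p[i];
      if (roll < sum) return i;
    }
  return n - 1;   /* guard against floating-point rounding */
}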
/* D(i,k) is reached from M(i, k-1) or D(i,k-1). */
static inline int
select_d(const P7_OPROFILE *om, const P7_OMX *ox, int i, int k)
{
  int Q = p7O_NQF(ox->M);
  int q = (k-1) % Q;            /* (q,r) is position of the current DP cell D(i,k) */
  int r = (k-1) / Q;
  vector float zerov;
  union { vector float v; float p[4]; } mpv, dpv, tmdv, tddv;
  float path[2];

  zerov = (vector float) vec_splat_u32(0);

  if (q > 0) {
    mpv.v  = ox->dpf[i][(q-1)*3 + p7X_M];
    dpv.v  = ox->dpf[i][(q-1)*3 + p7X_D];
    tmdv.v = om->tfv[7*(q-1) + p7O_MD];
    tddv.v = om->tfv[7*Q + (q-1)];
  } else {
    mpv.v  = vec_sld(zerov, ox->dpf[i][(Q-1)*3 + p7X_M], 12);
    dpv.v  = vec_sld(zerov, ox->dpf[i][(Q-1)*3 + p7X_D], 12);
    tmdv.v = vec_sld(zerov, om->tfv[7*(Q-1) + p7O_MD],   12);
    tddv.v = vec_sld(zerov, om->tfv[8*Q-1],              12);
  }

  path[0] = ((tmdv.p[r] == 0.0) ? -eslINFINITY : mpv.p[r]);
  path[1] = ((tddv.p[r] == 0.0) ? -eslINFINITY : dpv.p[r]);
  return ((path[0] >= path[1]) ? p7T_M : p7T_D);
}
void x264_add4x4_idct_altivec( uint8_t *dst, int16_t dct[16] )
{
    vec_u16_t onev = vec_splat_u16(1);

    dct[0] += 32; // rounding for the >>6 at the end

    vec_s16_t s0, s1, s2, s3;
    s0 = vec_ld( 0x00, dct );
    s1 = vec_sld( s0, s0, 8 );
    s2 = vec_ld( 0x10, dct );
    s3 = vec_sld( s2, s2, 8 );

    vec_s16_t d0, d1, d2, d3;
    IDCT_1D_ALTIVEC( s0, s1, s2, s3, d0, d1, d2, d3 );

    vec_s16_t tr0, tr1, tr2, tr3;
    VEC_TRANSPOSE_4( d0, d1, d2, d3, tr0, tr1, tr2, tr3 );

    vec_s16_t idct0, idct1, idct2, idct3;
    IDCT_1D_ALTIVEC( tr0, tr1, tr2, tr3, idct0, idct1, idct2, idct3 );

    vec_u8_t perm_ldv = vec_lvsl( 0, dst );
    vec_u16_t sixv = vec_splat_u16(6);
    LOAD_ZERO;

    ALTIVEC_STORE4_SUM_CLIP( &dst[0*FDEC_STRIDE], idct0, perm_ldv );
    ALTIVEC_STORE4_SUM_CLIP( &dst[1*FDEC_STRIDE], idct1, perm_ldv );
    ALTIVEC_STORE4_SUM_CLIP( &dst[2*FDEC_STRIDE], idct2, perm_ldv );
    ALTIVEC_STORE4_SUM_CLIP( &dst[3*FDEC_STRIDE], idct3, perm_ldv );
}
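/* Scalar sketch (not the x264 macro) of what ALTIVEC_STORE4_SUM_CLIP does per
 * pixel after the two 1-D IDCT passes: the +32 added to dct[0] above supplies
 * the rounding bias for the >>6, and the shifted residual is added to the
 * destination sample and clipped to 8 bits. */
#include <stdint.h>

static inline uint8_t
add_residual_clip(uint8_t dst, int16_t residual)   /* residual already carries the +32 bias */
{
  int v = dst + (residual >> 6);

  if (v < 0)   v = 0;
  if (v > 255) v = 255;
  return (uint8_t) v;
}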
/* D(i,k) is reached from M(i, k-1) or D(i,k-1). */
static inline int
select_d(ESL_RANDOMNESS *rng, const P7_OPROFILE *om, const P7_OMX *ox, int i, int k)
{
  int Q = p7O_NQF(ox->M);
  int q = (k-1) % Q;            /* (q,r) is position of the current DP cell D(i,k) */
  int r = (k-1) / Q;
  vector float  zerov;
  vector float  mpv, dpv;
  vector float  tmdv, tddv;
  union { vector float v; float p[4]; } u;
  float path[2];
  int   state[2] = { p7T_M, p7T_D };

  zerov = (vector float) vec_splat_u32(0);

  if (q > 0) {
    mpv  = ox->dpf[i][(q-1)*3 + p7X_M];
    dpv  = ox->dpf[i][(q-1)*3 + p7X_D];
    tmdv = om->tfv[7*(q-1) + p7O_MD];
    tddv = om->tfv[7*Q + (q-1)];
  } else {
    mpv  = vec_sld(zerov, ox->dpf[i][(Q-1)*3 + p7X_M], 12);
    dpv  = vec_sld(zerov, ox->dpf[i][(Q-1)*3 + p7X_D], 12);
    tmdv = vec_sld(zerov, om->tfv[7*(Q-1) + p7O_MD],   12);
    tddv = vec_sld(zerov, om->tfv[8*Q-1],              12);
  }

  u.v = vec_madd(mpv, tmdv, zerov);  path[0] = u.p[r];
  u.v = vec_madd(dpv, tddv, zerov);  path[1] = u.p[r];

  esl_vec_FNorm(path, 2);
  return state[esl_rnd_FChoose(rng, path, 2)];
}
void foo (void)
{
  vector bool int   boolVec1 = (vector bool int)   vec_splat_u32(3);
  vector bool short boolVec2 = (vector bool short) vec_splat_u16(3);
  vector bool char  boolVec3 = (vector bool char)  vec_splat_u8(3);

  boolVec1 = vec_sld( boolVec1, boolVec1, 4 );
  boolVec2 = vec_sld( boolVec2, boolVec2, 2 );
  boolVec3 = vec_sld( boolVec3, boolVec3, 1 );
}
inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                 const v_float32x4& c, const v_float32x4& d)
{
    vec_float4 ac = vec_add(vec_mergel(a.val, c.val), vec_mergeh(a.val, c.val));
    ac = vec_add(ac, vec_sld(ac, ac, 8));

    vec_float4 bd = vec_add(vec_mergel(b.val, d.val), vec_mergeh(b.val, d.val));
    bd = vec_add(bd, vec_sld(bd, bd, 8));

    return v_float32x4(vec_mergeh(ac, bd));
}
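/* The merge + 8-byte vec_sld pairing above is easiest to see with two inputs:
 * after one interleaved add, each half of the register holds partial sums of
 * one input, and the 8-byte rotate finishes them. A plain AltiVec sketch,
 * illustration only, written against vector float rather than v_float32x4: */
#include <altivec.h>

static vector float
reduce_sum2_demo(vector float a, vector float b)
{
  vector float t = vec_add(vec_mergel(a, b), vec_mergeh(a, b));
  /* t = { a0+a2, b0+b2, a1+a3, b1+b3 } */
  t = vec_add(t, vec_sld(t, t, 8));
  /* t = { sum(a), sum(b), sum(a), sum(b) } */
  return t;
}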
template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b)
{
    enum { CV_SHIFT = 16 - imm * (sizeof(typename _Tpvec::lane_type)) };
    if (CV_SHIFT == 16)
        return a;
#ifdef __IBMCPP__
    return _Tpvec(vec_sld(b.val, a.val, CV_SHIFT & 15));
#else
    return _Tpvec(vec_sld(b.val, a.val, CV_SHIFT));
#endif
}
void test_integer(void)
{
  vf = vec_sld(vf, vf, idx);               // expected-error {{no matching function}}
                                           // expected-note@vecintrin.h:* 13 {{candidate function not viable}}
                                           // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 15}}
  vd = vec_sld(vd, vd, idx);               // expected-error {{no matching function}}
                                           // expected-note@vecintrin.h:* 13 {{candidate function not viable}}
                                           // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 15}}

  vuc = vec_msum_u128(vul, vul, vuc, idx); // expected-error {{must be a constant integer}}
  vuc = vec_msum_u128(vul, vul, vuc, -1);  // expected-error {{should be a value from 0 to 15}}
  vuc = vec_msum_u128(vul, vul, vuc, 16);  // expected-error {{should be a value from 0 to 15}}
}
void v_load_deinterleave_f32(float *ptr, vector float* a, vector float* b, vector float* c)
{
    vector float v1 = vec_xl( 0, ptr);
    vector float v2 = vec_xl(16, ptr);
    vector float v3 = vec_xl(32, ptr);

    static const vector unsigned char flp = {0, 1, 2, 3, 12, 13, 14, 15,
                                             16, 17, 18, 19, 28, 29, 30, 31};
    *a = vec_perm(v1, vec_sld(v3, v2, 8), flp);

    static const vector unsigned char flp2 = {28, 29, 30, 31, 0, 1, 2, 3,
                                              12, 13, 14, 15, 16, 17, 18, 19};
    *b = vec_perm(v2, vec_sld(v1, v3, 8), flp2);

    *c = vec_perm(vec_sld(v2, v1, 8), v3, flp);
}
template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b)
{
    enum { CV_SHIFT = imm * (sizeof(typename _Tpvec::lane_type)) };
    if (CV_SHIFT == 16)
        return b;
    return _Tpvec(vec_sld(a.val, b.val, CV_SHIFT));
}
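/* Stand-alone illustration of the byte arithmetic behind the two rotate
 * templates above, fixed to 32-bit lanes: rotating left by one element is a
 * 4-byte vec_sld of (a,b); rotating right by one element is a 12-byte
 * vec_sld of (b,a). */
#include <altivec.h>

static vector signed int
rotate_left_one(vector signed int a, vector signed int b)
{
  return vec_sld(a, b, 4);    /* { a1, a2, a3, b0 } */
}

static vector signed int
rotate_right_one(vector signed int a, vector signed int b)
{
  return vec_sld(b, a, 12);   /* { b3, a0, a1, a2 } */
}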
void v_store_interleave_f32(float *ptr, vector float a, vector float b, vector float c)
{
    vector float hbc = vec_mergeh(b, c);

    static const vector unsigned char ahbc = {0, 1, 2, 3, 16, 17, 18, 19,
                                              20, 21, 22, 23, 4, 5, 6, 7};
    vec_xst(vec_perm(a, hbc, ahbc), 0, ptr);

    vector float lab = vec_mergel(a, b);
    vec_xst(vec_sld(lab, hbc, 8), 16, ptr);

    static const vector unsigned char clab = {8, 9, 10, 11, 24, 25, 26, 27,
                                              28, 29, 30, 31, 12, 13, 14, 15};
    vec_xst(vec_perm(c, lab, clab), 32, ptr);
}
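/* Round-trip sketch (illustration only) for the two helpers above: pack three
 * planes of four floats into x0 y0 z0 x1 ... order, then split them back out.
 * Assumes a 16-byte-aligned buffer and GCC vector-literal syntax. */
#include <altivec.h>

void
interleave_roundtrip_demo(void)
{
  float buf[12] __attribute__((aligned(16)));
  vector float x = {   1.f,   2.f,   3.f,   4.f };
  vector float y = {  10.f,  20.f,  30.f,  40.f };
  vector float z = { 100.f, 200.f, 300.f, 400.f };
  vector float a, b, c;

  v_store_interleave_f32(buf, x, y, z);       /* buf = 1,10,100, 2,20,200, ... */
  v_load_deinterleave_f32(buf, &a, &b, &c);   /* expect a==x, b==y, c==z       */
}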
static void h264_idct_add_altivec(uint8_t *dst, int16_t *block, int stride)
{
    vec_s16 va0, va1, va2, va3;
    vec_s16 vz0, vz1, vz2, vz3;
    vec_s16 vtmp0, vtmp1, vtmp2, vtmp3;
    vec_u8 va_u8;
    vec_u32 va_u32;
    vec_s16 vdst_ss;
    const vec_u16 v6us = vec_splat_u16(6);
    vec_u8 vdst, vdst_orig;
    vec_u8 vdst_mask = vec_lvsl(0, dst);
    int element = ((unsigned long)dst & 0xf) >> 2;
    LOAD_ZERO;

    block[0] += 32; /* add 32 as a DC-level for rounding */

    vtmp0 = vec_ld(0, block);
    vtmp1 = vec_sld(vtmp0, vtmp0, 8);
    vtmp2 = vec_ld(16, block);
    vtmp3 = vec_sld(vtmp2, vtmp2, 8);
    memset(block, 0, 16 * sizeof(int16_t));

    VEC_1D_DCT(vtmp0, vtmp1, vtmp2, vtmp3, va0, va1, va2, va3);
    VEC_TRANSPOSE_4(va0, va1, va2, va3, vtmp0, vtmp1, vtmp2, vtmp3);
    VEC_1D_DCT(vtmp0, vtmp1, vtmp2, vtmp3, va0, va1, va2, va3);

    va0 = vec_sra(va0, v6us);
    va1 = vec_sra(va1, v6us);
    va2 = vec_sra(va2, v6us);
    va3 = vec_sra(va3, v6us);

    VEC_LOAD_U8_ADD_S16_STORE_U8(va0);
    dst += stride;
    VEC_LOAD_U8_ADD_S16_STORE_U8(va1);
    dst += stride;
    VEC_LOAD_U8_ADD_S16_STORE_U8(va2);
    dst += stride;
    VEC_LOAD_U8_ADD_S16_STORE_U8(va3);
}
static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, int16_t *block,
                                                       int stride, int size)
{
    vec_s16 dc16;
    vec_u8 dcplus, dcminus, v0, v1, v2, v3, aligner;
    LOAD_ZERO;
    DECLARE_ALIGNED(16, int, dc);
    int i;

    dc = (block[0] + 32) >> 6;
    block[0] = 0;
    dc16 = vec_splat((vec_s16) vec_lde(0, &dc), 1);

    if (size == 4)
        dc16 = vec_sld(dc16, zero_s16v, 8);
    dcplus  = vec_packsu(dc16, zero_s16v);
    dcminus = vec_packsu(vec_sub(zero_s16v, dc16), zero_s16v);

    aligner = vec_lvsr(0, dst);
    dcplus  = vec_perm(dcplus, dcplus, aligner);
    dcminus = vec_perm(dcminus, dcminus, aligner);

    for (i = 0; i < size; i += 4) {
        v0 = vec_ld(0, dst + 0*stride);
        v1 = vec_ld(0, dst + 1*stride);
        v2 = vec_ld(0, dst + 2*stride);
        v3 = vec_ld(0, dst + 3*stride);

        v0 = vec_adds(v0, dcplus);
        v1 = vec_adds(v1, dcplus);
        v2 = vec_adds(v2, dcplus);
        v3 = vec_adds(v3, dcplus);

        v0 = vec_subs(v0, dcminus);
        v1 = vec_subs(v1, dcminus);
        v2 = vec_subs(v2, dcminus);
        v3 = vec_subs(v3, dcminus);

        vec_st(v0, 0, dst + 0*stride);
        vec_st(v1, 0, dst + 1*stride);
        vec_st(v2, 0, dst + 2*stride);
        vec_st(v3, 0, dst + 3*stride);

        dst += 4*stride;
    }
}
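/* Scalar view (not the FFmpeg code) of the dcplus/dcminus trick above: a
 * signed DC offset is applied to unsigned pixels using only saturating
 * unsigned operations, by adding the positive part and then subtracting the
 * negative part. */
#include <stdint.h>

static inline uint8_t
dc_add_clip(uint8_t pix, int dc)
{
  int up   = dc > 0 ?  dc : 0;   /* role of dcplus  = vec_packsu(dc16, ...)  */
  int down = dc < 0 ? -dc : 0;   /* role of dcminus = vec_packsu(-dc16, ...) */
  int v    = pix + up;

  if (v > 255) v = 255;          /* vec_adds saturation */
  v -= down;
  if (v < 0)   v = 0;            /* vec_subs saturation */
  return (uint8_t) v;
}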
void foo(void)
{
  const unsigned char *buf;
  vector pixel       vp  = { 3, 4, 5, 6 };
  vector bool int    vbi = { 1, 0, 1, 0 };
  vector bool short  vbs = { 1, 0, 1, 0, 1, 0, 1, 0 };
  vector bool char   vbc = { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 };
  vector signed char vsc;
  int a = 3;

  vec_dst(buf, a, 1);
  vec_dstst(buf, a, 2);
  vec_dststt(buf, a, 3);
  vec_dststt(buf, a, 2);

  vp  = vec_sld(vp, vp, 5);
  vbc = vec_splat(vbc, 7);
  vbs = vec_splat(vbs, 12);
  vp  = vec_splat(vp, 17);
  vbi = vec_splat(vbi, 31);
}
/* Function: p7_ViterbiFilter() * Synopsis: Calculates Viterbi score, vewy vewy fast, in limited precision. * Incept: SRE, Tue Nov 27 09:15:24 2007 [Janelia] * * Purpose: Calculates an approximation of the Viterbi score for sequence * <dsq> of length <L> residues, using optimized profile <om>, * and a preallocated one-row DP matrix <ox>. Return the * estimated Viterbi score (in nats) in <ret_sc>. * * Score may overflow (and will, on high-scoring * sequences), but will not underflow. * * The model must be in a local alignment mode; other modes * cannot provide the necessary guarantee of no underflow. * * This is a striped SIMD Viterbi implementation using Intel * VMX integer intrinsics \citep{Farrar07}, in reduced * precision (signed words, 16 bits). * * Args: dsq - digital target sequence, 1..L * L - length of dsq in residues * om - optimized profile * ox - DP matrix * ret_sc - RETURN: Viterbi score (in nats) * * Returns: <eslOK> on success; * <eslERANGE> if the score overflows; in this case * <*ret_sc> is <eslINFINITY>, and the sequence can * be treated as a high-scoring hit. * * Throws: <eslEINVAL> if <ox> allocation is too small, or if * profile isn't in a local alignment mode. (Must be in local * alignment mode because that's what helps us guarantee * limited dynamic range.) * * Xref: [Farrar07] for ideas behind striped SIMD DP. * J2/46-47 for layout of HMMER's striped SIMD DP. * J2/50 for single row DP. * J2/60 for reduced precision (epu8) * J2/65 for initial benchmarking * J2/66 for precision maximization * J4/138-140 for reimplementation in 16-bit precision */ int p7_ViterbiFilter(const ESL_DSQ *dsq, int L, const P7_OPROFILE *om, P7_OMX *ox, float *ret_sc) { vector signed short mpv, dpv, ipv; /* previous row values */ vector signed short sv; /* temp storage of 1 curr row value in progress */ vector signed short dcv; /* delayed storage of D(i,q+1) */ vector signed short xEv; /* E state: keeps max for Mk->E as we go */ vector signed short xBv; /* B state: splatted vector of B[i-1] for B->Mk calculations */ vector signed short Dmaxv; /* keeps track of maximum D cell on row */ int16_t xE, xB, xC, xJ, xN; /* special states' scores */ int16_t Dmax; /* maximum D cell score on row */ int i; /* counter over sequence positions 1..L */ int q; /* counter over vectors 0..nq-1 */ int Q; /* segment length: # of vectors */ vector signed short *dp; /* using {MDI}MX(q) macro requires initialization of <dp> */ vector signed short *rsc; /* will point at om->ru[x] for residue x[i] */ vector signed short *tsc; /* will point into (and step thru) om->tu */ vector signed short negInfv; Q = p7O_NQW(om->M); dp = ox->dpw[0]; /* Check that the DP matrix is ok for us. */ if (Q > ox->allocQ8) ESL_EXCEPTION(eslEINVAL, "DP matrix allocated too small"); if (om->mode != p7_LOCAL && om->mode != p7_UNILOCAL) ESL_EXCEPTION(eslEINVAL, "Fast filter only works for local alignment"); ox->M = om->M; negInfv = esl_vmx_set_s16((signed short)-32768); /* Initialization. In unsigned arithmetic, -infinity is -32768 */ for (q = 0; q < Q; q++) MMXo(q) = IMXo(q) = DMXo(q) = negInfv; xN = om->base_w; xB = xN + om->xw[p7O_N][p7O_MOVE]; xJ = -32768; xC = -32768; xE = -32768; #if p7_DEBUGGING if (ox->debugging) p7_omx_DumpVFRow(ox, 0, xE, 0, xJ, xB, xC); /* first 0 is <rowi>: do header. second 0 is xN: always 0 here. */ #endif for (i = 1; i <= L; i++) { rsc = om->rwv[dsq[i]]; tsc = om->twv; dcv = negInfv; /* "-infinity" */ xEv = negInfv; Dmaxv = negInfv; xBv = esl_vmx_set_s16(xB); /* Right shifts by 1 value (2 bytes). 
4,8,12,x becomes x,4,8,12. * Because ia32 is littlendian, this means a left bit shift. * Zeros shift on automatically; replace it with -32768. */ mpv = MMXo(Q-1); mpv = vec_sld(negInfv, mpv, 14); dpv = DMXo(Q-1); dpv = vec_sld(negInfv, dpv, 14); ipv = IMXo(Q-1); ipv = vec_sld(negInfv, ipv, 14); for (q = 0; q < Q; q++) { /* Calculate new MMXo(i,q); don't store it yet, hold it in sv. */ sv = vec_adds(xBv, *tsc); tsc++; sv = vec_max (sv, vec_adds(mpv, *tsc)); tsc++; sv = vec_max (sv, vec_adds(ipv, *tsc)); tsc++; sv = vec_max (sv, vec_adds(dpv, *tsc)); tsc++; sv = vec_adds(sv, *rsc); rsc++; xEv = vec_max(xEv, sv); /* Load {MDI}(i-1,q) into mpv, dpv, ipv; * {MDI}MX(q) is then the current, not the prev row */ mpv = MMXo(q); dpv = DMXo(q); ipv = IMXo(q); /* Do the delayed stores of {MD}(i,q) now that memory is usable */ MMXo(q) = sv; DMXo(q) = dcv; /* Calculate the next D(i,q+1) partially: M->D only; * delay storage, holding it in dcv */ dcv = vec_adds(sv, *tsc); tsc++; Dmaxv = vec_max(dcv, Dmaxv); /* Calculate and store I(i,q) */ sv = vec_adds(mpv, *tsc); tsc++; IMXo(q)= vec_max(sv, vec_adds(ipv, *tsc)); tsc++; } /* Now the "special" states, which start from Mk->E (->C, ->J->B) */ xE = esl_vmx_hmax_s16(xEv); if (xE >= 32767) { *ret_sc = eslINFINITY; return eslERANGE; } /* immediately detect overflow */ xN = xN + om->xw[p7O_N][p7O_LOOP]; xC = ESL_MAX(xC + om->xw[p7O_C][p7O_LOOP], xE + om->xw[p7O_E][p7O_MOVE]); xJ = ESL_MAX(xJ + om->xw[p7O_J][p7O_LOOP], xE + om->xw[p7O_E][p7O_LOOP]); xB = ESL_MAX(xJ + om->xw[p7O_J][p7O_MOVE], xN + om->xw[p7O_N][p7O_MOVE]); /* and now xB will carry over into next i, and xC carries over after i=L */ /* Finally the "lazy F" loop (sensu [Farrar07]). We can often * prove that we don't need to evaluate any D->D paths at all. * * The observation is that if we can show that on the next row, * B->M(i+1,k) paths always dominate M->D->...->D->M(i+1,k) paths * for all k, then we don't need any D->D calculations. * * The test condition is: * max_k D(i,k) + max_k ( TDD(k-2) + TDM(k-1) - TBM(k) ) < xB(i) * So: * max_k (TDD(k-2) + TDM(k-1) - TBM(k)) is precalc'ed in om->dd_bound; * max_k D(i,k) is why we tracked Dmaxv; * xB(i) was just calculated above. */ Dmax = esl_vmx_hmax_s16(Dmaxv); if (Dmax + om->ddbound_w > xB) { /* Now we're obligated to do at least one complete DD path to be sure. */ /* dcv has carried through from end of q loop above */ dcv = vec_sld(negInfv, dcv, 14); tsc = om->twv + 7*Q; /* set tsc to start of the DD's */ for (q = 0; q < Q; q++) { DMXo(q) = vec_max(dcv, DMXo(q)); dcv = vec_adds(DMXo(q), *tsc); tsc++; } /* We may have to do up to three more passes; the check * is for whether crossing a segment boundary can improve * our score. */ do { dcv = vec_sld(negInfv, dcv, 14); tsc = om->twv + 7*Q; /* set tsc to start of the DD's */ for (q = 0; q < Q; q++) { if (! vec_any_gt(dcv, DMXo(q))) break; DMXo(q) = vec_max(dcv, DMXo(q)); dcv = vec_adds(DMXo(q), *tsc); tsc++; } } while (q == Q); } else /* not calculating DD? then just store the last M->D vector calc'ed.*/ DMXo(0) = vec_sld(negInfv, dcv, 14); #if p7_DEBUGGING if (ox->debugging) p7_omx_DumpVFRow(ox, i, xE, 0, xJ, xB, xC); #endif } /* end loop over sequence residues 1..L */ /* finally C->T */ if (xC > -32768) { *ret_sc = (float) xC + (float) om->xw[p7O_C][p7O_MOVE] - (float) om->base_w; /* *ret_sc += L * om->ncj_roundoff; see J4/150 for rationale: superceded by -3.0nat approximation*/ *ret_sc /= om->scale_w; *ret_sc -= 3.0; /* the NN/CC/JJ=0,-3nat approximation: see J5/36. 
That's ~ L \log \frac{L}{L+3}, for our NN,CC,JJ contrib */ } else *ret_sc = -eslINFINITY; return eslOK; }
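/* A minimal sketch (not part of HMMER) of the striped right-shift idiom used
 * throughout p7_ViterbiFilter() above: for eight signed 16-bit lanes,
 * vec_sld(neginf, v, 14) takes the last two bytes of <neginf> followed by the
 * first fourteen bytes of <v>, so every lane moves one position toward the
 * higher lane index and -32768 enters the vacated first lane -- the vector
 * analogue of the "4,8,12,x becomes x,4,8,12" shift described in the comments.
 * Assumes <altivec.h> and GCC vector-literal syntax. */
#include <altivec.h>

static vector signed short
shift_right_one_lane(vector signed short v)
{
  const vector signed short neginf =
    { -32768, -32768, -32768, -32768, -32768, -32768, -32768, -32768 };

  return vec_sld(neginf, v, 14);   /* { -32768, v0, v1, v2, v3, v4, v5, v6 } */
}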
float vsincos2f(float x) { // Load x into an aligned float array float __attribute__((aligned(16))) xa[4]; xa[0] = x; // We want to calculate these: // nom = 166320.0 * x - 22260.0 * POW3(x) + 551.0 * POW5(x); // denom = 166320.0 + 5460.0 * POW2(x) + 75.0 * POW4(x); // res = nom/denom; // // We first setup our constants: // vc1 = | a1 | a3 | b0 | b2 | // vc2 = | 0.0 | a5 | 0.0 | 0.0 | vector float vc1 = { 166320.0, -22260, 166320.0, 5460.0 }, vc2 = { 0.0, 551.0, 0.0, 75.0 }; vector float vx = vec_ld(0, xa); vector float vres, vdenom, vest1, vx2, vx02, vx13, vx24, v0 = (vector float)vec_splat_u32(0), v1 = vec_ctf(vec_splat_u32(1),0); // Load x into a vector and splat it all over vx = vec_splat(vx, 0); // get the vector with all elements: x^2 vx2 = vec_madd(vx, vx, v0); // We need a vector with | 1.0 | x^2 | 1.0 | x^2 | vx02 = vec_mergeh(v1, vx2); // Multiply with x -> | x | x^3 | x | x^3 | vx13 = vec_madd(vx, vx02, v0); // Now shift left and combine with vx02 -> | x | x^3 | 1.0 | x^2 | vx13 = vec_sld(vx13, vx02, 8); // Again with x^2 -> | x^3 | x^5 | x^2 | x^4 | vx24 = vec_madd(vx13, vx2, v0); // Multiply with the coefficients vectors: // First with vc1 -> | a1*x | a3*x^3 | b0*1.0 | b2*x^2 | vres = vec_madd(vx13, vc1, v0); // Now with vc2 (and add previous result) -> | a1*x + 0*x^3 | a3*x^3 + a5*x^5 | b0*1.0 + 0.0*x^2 | b2*x^2 + b4*x^4 | vres = vec_madd(vx24, vc2, vres); // Shift left by 4 and add the vectors -> | nom | .. | denom | .. | vres = vec_add(vres, vec_sld(vres, vres, 4)); // Now splat denom (we don't have to splat nom, we'll just take the first element after the division. vdenom = vec_splat(vres, 2); vest1 = vec_re(vdenom); //1st round of Newton-Raphson refinement vdenom = vec_madd( vest1, vec_nmsub( vest1, vdenom, v1 ), vest1 ); // 2nd round of Newton-Raphson refinement // vdenom = vec_madd( vest2, vec_nmsub( vest2, vdenom, v1 ), vest2 ); vres = vec_madd(vres, vdenom, v0); vec_st(vres, 0, xa); //printf("vres = %2.7f %2.7f %2.7f %2.7f\n", xa[0], xa[1], xa[2], xa[3]); /* float nom, denom, res; nom = 166320.0 * x - 22260.0 * POW3(x) + 551.0 * POW5(x); denom = 166320.0 + 5460.0 * POW2(x) + 75.0 * POW4(x); printf("nom = %2.7f, denom = %2.7f\n", nom, denom); res = nom/denom; printf("res = %2.7f\n", res);*/ printf("res = %2.7f\n", xa[0]); return xa[0]; }
static inline vector float vec_reduce( vector float v )
{
    v = vec_add( v, vec_sld( v, v, 8 ) );
    v = vec_add( v, vec_sld( v, v, 4 ) );
    return ( v );
}
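/* Usage sketch for vec_reduce() above (illustration only): after the two
 * shifted adds every lane holds the full horizontal sum, so the scalar result
 * can be pulled from any lane. */
#include <altivec.h>

static float
hsum_demo(vector float v)
{
  union { vector float v; float f[4]; } u;

  u.v = vec_reduce(v);   /* all four lanes now equal v0+v1+v2+v3 */
  return u.f[0];
}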
/* Function: p7_OptimalAccuracy() * Synopsis: DP fill of an optimal accuracy alignment calculation. * Incept: SRE, Mon Aug 18 11:04:48 2008 [Janelia] * * Purpose: Calculates the fill step of the optimal accuracy decoding * algorithm \citep{Kall05}. * * Caller provides the posterior decoding matrix <pp>, * which was calculated by Forward/Backward on a target sequence * of length <pp->L> using the query model <om>. * * Caller also provides a DP matrix <ox>, allocated for a full * <om->M> by <L> comparison. The routine fills this in * with OA scores. * * Args: gm - query profile * pp - posterior decoding matrix created by <p7_GPosteriorDecoding()> * gx - RESULT: caller provided DP matrix for <gm->M> by <L> * ret_e - RETURN: expected number of correctly decoded positions * * Returns: <eslOK> on success, and <*ret_e> contains the final OA * score, which is the expected number of correctly decoded * positions in the target sequence (up to <L>). * * Throws: (no abnormal error conditions) */ int p7_OptimalAccuracy(const P7_OPROFILE *om, const P7_OMX *pp, P7_OMX *ox, float *ret_e) { vector float mpv, dpv, ipv; /* previous row values */ vector float sv; /* temp storage of 1 curr row value in progress */ vector float xEv; /* E state: keeps max for Mk->E as we go */ vector float xBv; /* B state: splatted vector of B[i-1] for B->Mk calculations */ vector float dcv; float *xmx = ox->xmx; vector float *dpc = ox->dpf[0]; /* current row, for use in {MDI}MO(dpp,q) access macro */ vector float *dpp; /* previous row, for use in {MDI}MO(dpp,q) access macro */ vector float *ppp; /* quads in the <pp> posterior probability matrix */ vector float *tp; /* quads in the <om->tfv> transition scores */ vector float zerov; vector float infv; int M = om->M; int Q = p7O_NQF(M); int q; int j; int i; float t1, t2; zerov = (vector float) vec_splat_u32(0); infv = esl_vmx_set_float(-eslINFINITY); ox->M = om->M; ox->L = pp->L; for (q = 0; q < Q; q++) MMO(dpc, q) = IMO(dpc,q) = DMO(dpc,q) = infv; XMXo(0, p7X_E) = -eslINFINITY; XMXo(0, p7X_N) = 0.; XMXo(0, p7X_J) = -eslINFINITY; XMXo(0, p7X_B) = 0.; XMXo(0, p7X_C) = -eslINFINITY; for (i = 1; i <= pp->L; i++) { dpp = dpc; /* previous DP row in OA matrix */ dpc = ox->dpf[i]; /* current DP row in OA matrix */ ppp = pp->dpf[i]; /* current row in the posterior probabilities per position */ tp = om->tfv; /* transition probabilities */ dcv = infv; xEv = infv; xBv = esl_vmx_set_float(XMXo(i-1, p7X_B)); mpv = vec_sld(infv, MMO(dpp,Q-1), 12); /* Right shifts by 4 bytes. 4,8,12,x becomes x,4,8,12. 
*/ dpv = vec_sld(infv, DMO(dpp,Q-1), 12); ipv = vec_sld(infv, IMO(dpp,Q-1), 12); for (q = 0; q < Q; q++) { sv = vec_and(vec_cmpgt(*tp, zerov), xBv); tp++; sv = vec_max(sv, vec_and(vec_cmpgt(*tp, zerov), mpv)); tp++; sv = vec_max(sv, vec_and(vec_cmpgt(*tp, zerov), ipv)); tp++; sv = vec_max(sv, vec_and(vec_cmpgt(*tp, zerov), dpv)); tp++; sv = vec_add(sv, *ppp); ppp += 2; xEv = vec_max(xEv, sv); mpv = MMO(dpp,q); dpv = DMO(dpp,q); ipv = IMO(dpp,q); MMO(dpc,q) = sv; DMO(dpc,q) = dcv; dcv = vec_and(vec_cmpgt(*tp, zerov), sv); tp++; sv = vec_and(vec_cmpgt(*tp, zerov), mpv); tp++; sv = vec_max(sv, vec_and(vec_cmpgt(*tp, zerov), ipv)); tp++; IMO(dpc,q) = vec_add(sv, *ppp); ppp++; } /* dcv has carried through from end of q loop above; store it * in first pass, we add M->D and D->D path into DMX */ dcv = vec_sld(infv, dcv, 12); tp = om->tfv + 7*Q; /* set tp to start of the DD's */ for (q = 0; q < Q; q++) { DMO(dpc, q) = vec_max(dcv, DMO(dpc, q)); dcv = vec_and(vec_cmpgt(*tp, zerov), DMO(dpc,q)); tp++; } /* fully serialized D->D; can optimize later */ for (j = 1; j < 4; j++) { dcv = vec_sld(infv, dcv, 12); tp = om->tfv + 7*Q; for (q = 0; q < Q; q++) { DMO(dpc, q) = vec_max(dcv, DMO(dpc, q)); dcv = vec_and(vec_cmpgt(*tp, zerov), dcv); tp++; } } /* D->E paths */ for (q = 0; q < Q; q++) xEv = vec_max(xEv, DMO(dpc,q)); /* Specials */ XMXo(i,p7X_E) = esl_vmx_hmax_float(xEv); t1 = ( (om->xf[p7O_J][p7O_LOOP] == 0.0) ? 0.0 : ox->xmx[(i-1)*p7X_NXCELLS+p7X_J] + pp->xmx[i*p7X_NXCELLS+p7X_J]); t2 = ( (om->xf[p7O_E][p7O_LOOP] == 0.0) ? 0.0 : ox->xmx[ i *p7X_NXCELLS+p7X_E]); ox->xmx[i*p7X_NXCELLS+p7X_J] = ESL_MAX(t1, t2); t1 = ( (om->xf[p7O_C][p7O_LOOP] == 0.0) ? 0.0 : ox->xmx[(i-1)*p7X_NXCELLS+p7X_C] + pp->xmx[i*p7X_NXCELLS+p7X_C]); t2 = ( (om->xf[p7O_E][p7O_MOVE] == 0.0) ? 0.0 : ox->xmx[ i *p7X_NXCELLS+p7X_E]); ox->xmx[i*p7X_NXCELLS+p7X_C] = ESL_MAX(t1, t2); ox->xmx[i*p7X_NXCELLS+p7X_N] = ((om->xf[p7O_N][p7O_LOOP] == 0.0) ? 0.0 : ox->xmx[(i-1)*p7X_NXCELLS+p7X_N] + pp->xmx[i*p7X_NXCELLS+p7X_N]); t1 = ( (om->xf[p7O_N][p7O_MOVE] == 0.0) ? 0.0 : ox->xmx[i*p7X_NXCELLS+p7X_N]); t2 = ( (om->xf[p7O_J][p7O_MOVE] == 0.0) ? 0.0 : ox->xmx[i*p7X_NXCELLS+p7X_J]); ox->xmx[i*p7X_NXCELLS+p7X_B] = ESL_MAX(t1, t2); } *ret_e = ox->xmx[pp->L*p7X_NXCELLS+p7X_C]; return eslOK; }
/* Function: p7_MSVFilter() * Synopsis: Calculates MSV score, vewy vewy fast, in limited precision. * Incept: SRE, Wed Dec 26 15:12:25 2007 [Janelia] * * Purpose: Calculates an approximation of the MSV score for sequence * <dsq> of length <L> residues, using optimized profile <om>, * and a preallocated one-row DP matrix <ox>. Return the * estimated MSV score (in nats) in <ret_sc>. * * Score may overflow (and will, on high-scoring * sequences), but will not underflow. * * The model may be in any mode, because only its match * emission scores will be used. The MSV filter inherently * assumes a multihit local mode, and uses its own special * state transition scores, not the scores in the profile. * * Args: dsq - digital target sequence, 1..L * L - length of dsq in residues * om - optimized profile * ox - DP matrix * ret_sc - RETURN: MSV score (in nats) * * Note: We misuse the matrix <ox> here, using only a third of the * first dp row, accessing it as <dp[0..Q-1]> rather than * in triplets via <{MDI}MX(q)> macros, since we only need * to store M state values. We know that if <ox> was big * enough for normal DP calculations, it must be big enough * to hold the MSVFilter calculation. * * Returns: <eslOK> on success. * <eslERANGE> if the score overflows the limited range; in * this case, this is a high-scoring hit. * * Throws: <eslEINVAL> if <ox> allocation is too small. */ int p7_MSVFilter(const ESL_DSQ *dsq, int L, const P7_OPROFILE *om, P7_OMX *ox, float *ret_sc) { vector unsigned char mpv; /* previous row values */ vector unsigned char xEv; /* E state: keeps max for Mk->E as we go */ vector unsigned char xBv; /* B state: splatted vector of B[i-1] for B->Mk calculations */ vector unsigned char sv; /* temp storage of 1 curr row value in progress */ vector unsigned char biasv; /* emission bias in a vector */ uint8_t xJ; /* special states' scores */ int i; /* counter over sequence positions 1..L */ int q; /* counter over vectors 0..nq-1 */ int Q = p7O_NQB(om->M); /* segment length: # of vectors */ vector unsigned char *dp; /* we're going to use dp[0][0..q..Q-1], not {MDI}MX(q) macros*/ vector unsigned char *rsc; /* will point at om->rbv[x] for residue x[i] */ vector unsigned char zerov; /* vector of zeros */ vector unsigned char xJv; /* vector for states score */ vector unsigned char tjbmv; /* vector for B->Mk cost */ vector unsigned char tecv; /* vector for E->C cost */ vector unsigned char basev; /* offset for scores */ vector unsigned char ceilingv; /* saturateed simd value used to test for overflow */ vector unsigned char tempv; /* Check that the DP matrix is ok for us. */ if (Q > ox->allocQ16) ESL_EXCEPTION(eslEINVAL, "DP matrix allocated too small"); ox->M = om->M; /* Initialization. In offset unsigned arithmetic, -infinity is 0, and 0 is om->base. 
*/ dp = ox->dpb[0]; for (q = 0; q < Q; q++) dp[q] = vec_splat_u8(0); xJ = 0; biasv = esl_vmx_set_u8(om->bias_b); zerov = vec_splat_u8(0); /* saturate simd register for overflow test */ tempv = vec_splat_u8(1); ceilingv = (vector unsigned char)vec_cmpeq(biasv, biasv); ceilingv = vec_subs(ceilingv, biasv); ceilingv = vec_subs(ceilingv, tempv); basev = esl_vmx_set_u8((int8_t) om->base_b); tecv = esl_vmx_set_u8((int8_t) om->tec_b); tjbmv = esl_vmx_set_u8((int8_t) om->tjb_b + (int8_t) om->tbm_b); xJv = vec_subs(biasv, biasv); xBv = vec_subs(basev, tjbmv); #if p7_DEBUGGING if (ox->debugging) { unsigned char xB; vec_ste(xBv, 0, &xB); vec_ste(xJv, 0, &xJ); p7_omx_DumpMFRow(ox, 0, 0, 0, xJ, xB, xJ); } #endif for (i = 1; i <= L; i++) { rsc = om->rbv[dsq[i]]; xEv = vec_splat_u8(0); // xBv = vec_sub(xBv, tbmv); /* Right shifts by 1 byte. 4,8,12,x becomes x,4,8,12. * Because ia32 is littlendian, this means a left bit shift. * Zeros shift on automatically, which is our -infinity. */ mpv = vec_sld(zerov, dp[Q-1], 15); for (q = 0; q < Q; q++) { /* Calculate new MMXo(i,q); don't store it yet, hold it in sv. */ sv = vec_max(mpv, xBv); sv = vec_adds(sv, biasv); sv = vec_subs(sv, *rsc); rsc++; xEv = vec_max(xEv, sv); mpv = dp[q]; /* Load {MDI}(i-1,q) into mpv */ dp[q] = sv; /* Do delayed store of M(i,q) now that memory is usable */ } /* Now the "special" states, which start from Mk->E (->C, ->J->B) * Use rotates instead of shifts so when the last max has completed, * all elements of the simd register will contain the max value. */ tempv = vec_sld(xEv, xEv, 1); xEv = vec_max(xEv, tempv); tempv = vec_sld(xEv, xEv, 2); xEv = vec_max(xEv, tempv); tempv = vec_sld(xEv, xEv, 4); xEv = vec_max(xEv, tempv); tempv = vec_sld(xEv, xEv, 8); xEv = vec_max(xEv, tempv); /* immediately detect overflow */ if (vec_any_gt(xEv, ceilingv)) { *ret_sc = eslINFINITY; return eslERANGE; } xEv = vec_subs(xEv, tecv); xJv = vec_max(xJv,xEv); xBv = vec_max(basev, xJv); xBv = vec_subs(xBv, tjbmv); #if p7_DEBUGGING if (ox->debugging) { unsigned char xB, xE; vec_ste(xBv, 0, &xB); vec_ste(xEv, 0, &xE); vec_ste(xJv, 0, &xJ); p7_omx_DumpMFRow(ox, i, xE, 0, xJ, xB, xJ); } #endif } /* end loop over sequence residues 1..L */ /* finally C->T, and add our missing precision on the NN,CC,JJ back */ vec_ste(xJv, 0, &xJ); *ret_sc = ((float) (xJ - om->tjb_b) - (float) om->base_b); *ret_sc /= om->scale_b; *ret_sc -= 3.0; /* that's ~ L \log \frac{L}{L+3}, for our NN,CC,JJ */ return eslOK; }
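/* Stand-alone sketch (not HMMER code) of the rotate-and-max reduction used
 * for xEv in p7_MSVFilter() above: rotating by 1, 2, 4 and 8 bytes with
 * vec_sld and taking the element-wise maximum each time leaves the overall
 * maximum in every byte lane, so it can be read from any position. */
#include <altivec.h>

static unsigned char
hmax_u8_demo(vector unsigned char v)
{
  union { vector unsigned char v; unsigned char b[16]; } u;

  v   = vec_max(v, vec_sld(v, v, 1));
  v   = vec_max(v, vec_sld(v, v, 2));
  v   = vec_max(v, vec_sld(v, v, 4));
  v   = vec_max(v, vec_sld(v, v, 8));
  u.v = v;
  return u.b[0];
}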
/* Function: p7_ViterbiScore() * Synopsis: Calculates Viterbi score, correctly, and vewy vewy fast. * Incept: SRE, Tue Nov 27 09:15:24 2007 [Janelia] * * Purpose: Calculates the Viterbi score for sequence <dsq> of length <L> * residues, using optimized profile <om>, and a preallocated * one-row DP matrix <ox>. Return the Viterbi score (in nats) * in <ret_sc>. * * The model <om> must be configured specially to have * lspace float scores, not its usual pspace float scores for * <p7_ForwardFilter()>. * * As with all <*Score()> implementations, the score is * accurate (full range and precision) and can be * calculated on models in any mode, not only local modes. * * Args: dsq - digital target sequence, 1..L * L - length of dsq in residues * om - optimized profile * ox - DP matrix * ret_sc - RETURN: Viterbi score (in nats) * * Returns: <eslOK> on success. * * Throws: <eslEINVAL> if <ox> allocation is too small. */ int p7_ViterbiScore(const ESL_DSQ *dsq, int L, const P7_OPROFILE *om, P7_OMX *ox, float *ret_sc) { vector float mpv, dpv, ipv; /* previous row values */ vector float sv; /* temp storage of 1 curr row value in progress */ vector float dcv; /* delayed storage of D(i,q+1) */ vector float xEv; /* E state: keeps max for Mk->E as we go */ vector float xBv; /* B state: splatted vector of B[i-1] for B->Mk calculations */ vector float Dmaxv; /* keeps track of maximum D cell on row */ vector float infv; /* -eslINFINITY in a vector */ float xN, xE, xB, xC, xJ; /* special states' scores */ float Dmax; /* maximum D cell on row */ int i; /* counter over sequence positions 1..L */ int q; /* counter over vectors 0..nq-1 */ int Q = p7O_NQF(om->M); /* segment length: # of vectors */ vector float *dp = ox->dpf[0]; /* using {MDI}MX(q) macro requires initialization of <dp> */ vector float *rsc; /* will point at om->rf[x] for residue x[i] */ vector float *tsc; /* will point into (and step thru) om->tf */ /* Check that the DP matrix is ok for us. */ if (Q > ox->allocQ4) ESL_EXCEPTION(eslEINVAL, "DP matrix allocated too small"); ox->M = om->M; /* Initialization. */ infv = esl_vmx_set_float(-eslINFINITY); for (q = 0; q < Q; q++) MMXo(q) = IMXo(q) = DMXo(q) = infv; xN = 0.; xB = om->xf[p7O_N][p7O_MOVE]; xE = -eslINFINITY; xJ = -eslINFINITY; xC = -eslINFINITY; #if p7_DEBUGGING if (ox->debugging) p7_omx_DumpFloatRow(ox, FALSE, 0, 5, 2, xE, xN, xJ, xB, xC); /* logify=FALSE, <rowi>=0, width=5, precision=2*/ #endif for (i = 1; i <= L; i++) { rsc = om->rf[dsq[i]]; tsc = om->tf; dcv = infv; xEv = infv; Dmaxv = infv; xBv = esl_vmx_set_float(xB); mpv = vec_sld(infv, MMXo(Q-1), 12); /* Right shifts by 4 bytes. 4,8,12,x becomes x,4,8,12. */ dpv = vec_sld(infv, DMXo(Q-1), 12); ipv = vec_sld(infv, IMXo(Q-1), 12); for (q = 0; q < Q; q++) { /* Calculate new MMXo(i,q); don't store it yet, hold it in sv. 
*/ sv = vec_add(xBv, *tsc); tsc++; sv = vec_max(sv, vec_add(mpv, *tsc)); tsc++; sv = vec_max(sv, vec_add(ipv, *tsc)); tsc++; sv = vec_max(sv, vec_add(dpv, *tsc)); tsc++; sv = vec_add(sv, *rsc); rsc++; xEv = vec_max(xEv, sv); /* Load {MDI}(i-1,q) into mpv, dpv, ipv; * {MDI}MX(q) is then the current, not the prev row */ mpv = MMXo(q); dpv = DMXo(q); ipv = IMXo(q); /* Do the delayed stores of {MD}(i,q) now that memory is usable */ MMXo(q) = sv; DMXo(q) = dcv; /* Calculate the next D(i,q+1) partially: M->D only; * delay storage, holding it in dcv */ dcv = vec_add(sv, *tsc); tsc++; Dmaxv = vec_max(dcv, Dmaxv); /* Calculate and store I(i,q) */ sv = vec_add(mpv, *tsc); tsc++; sv = vec_max(sv, vec_add(ipv, *tsc)); tsc++; IMXo(q) = vec_add(sv, *rsc); rsc++; } /* Now the "special" states, which start from Mk->E (->C, ->J->B) */ xE = esl_vmx_hmax_float(xEv); xN = xN + om->xf[p7O_N][p7O_LOOP]; xC = ESL_MAX(xC + om->xf[p7O_C][p7O_LOOP], xE + om->xf[p7O_E][p7O_MOVE]); xJ = ESL_MAX(xJ + om->xf[p7O_J][p7O_LOOP], xE + om->xf[p7O_E][p7O_LOOP]); xB = ESL_MAX(xJ + om->xf[p7O_J][p7O_MOVE], xN + om->xf[p7O_N][p7O_MOVE]); /* and now xB will carry over into next i, and xC carries over after i=L */ /* Finally the "lazy F" loop (sensu [Farrar07]). We can often * prove that we don't need to evaluate any D->D paths at all. * * The observation is that if we can show that on the next row, * B->M(i+1,k) paths always dominate M->D->...->D->M(i+1,k) paths * for all k, then we don't need any D->D calculations. * * The test condition is: * max_k D(i,k) + max_k ( TDD(k-2) + TDM(k-1) - TBM(k) ) < xB(i) * So: * max_k (TDD(k-2) + TDM(k-1) - TBM(k)) is precalc'ed in om->dd_bound; * max_k D(i,k) is why we tracked Dmaxv; * xB(i) was just calculated above. */ Dmax = esl_vmx_hmax_float(Dmaxv); if (Dmax + om->ddbound_f > xB) { /* Now we're obligated to do at least one complete DD path to be sure. */ /* dcv has carried through from end of q loop above */ dcv = vec_sld(infv, dcv, 12); tsc = om->tf + 7*Q; /* set tsc to start of the DD's */ for (q = 0; q < Q; q++) { DMXo(q) = vec_max(dcv, DMXo(q)); dcv = vec_add(DMXo(q), *tsc); tsc++; } /* We may have to do up to three more passes; the check * is for whether crossing a segment boundary can improve * our score. */ do { dcv = vec_sld(infv, dcv, 12); tsc = om->tf + 7*Q; /* set tsc to start of the DD's */ for (q = 0; q < Q; q++) { if (! vec_any_gt(dcv, DMXo(q))) break; DMXo(q) = vec_max(dcv, DMXo(q)); dcv = vec_add(DMXo(q), *tsc); tsc++; } } while (q == Q); } else { /* not calculating DD? then just store that last MD vector we calc'ed. */ dcv = vec_sld(infv, dcv, 12); DMXo(0) = dcv; } #if p7_DEBUGGING if (ox->debugging) p7_omx_DumpFloatRow(ox, FALSE, i, 5, 2, xE, xN, xJ, xB, xC); /* logify=FALSE, <rowi>=i, width=5, precision=2*/ #endif } /* end loop over sequence residues 1..L */ /* finally C->T */ *ret_sc = xC + om->xf[p7O_C][p7O_MOVE]; return eslOK; }
    lsq = vec_perm(src, edges, align);  // misalign the data (lsq)
    vec_st(lsq, 15, target);            // Store the lsq part first
    vec_st(msq,  0, target);            // Store the msq part
}

/* create a rotation and translation matrix (columnwise matrix) */
#define CNV_ANGL   (16/3.1415927f)
#define V_CNV_ANGL (vector float){CNV_ANGL, CNV_ANGL, CNV_ANGL, CNV_ANGL}

void make_rotation(vector float rot[4], trans &t)
{
#ifdef USE_ALTIVEC
    vector float rotation    = load_unaligned(&t.a) * V_CNV_ANGL;
    vector float translation = load_unaligned(&t.x);

    vector float sin = _cos_sin18_v(rotation - (vector float){8.f, 8.f, 8.f, 8.f});
    vector float cos = _cos_sin18_v(rotation);

    vector float sin_a = sin;
    vector float sin_b = vec_sld(sin, sin, 4);
    vector float sin_c = vec_sld(sin, sin, 8);
    vector float cos_a = cos;
    vector float cos_b = vec_sld(cos, cos, 4);
    vector float cos_c = vec_sld(cos, cos, 8);
    //vector float zero = (vector float)vec_splat_s32(0);

    /* row 0 */
    vector float r00 =  cos_b * cos_c;
    vector float r10 = -cos_b * sin_c;
    vector float r20 =  sin_b;

    /* row 1 */
    vector float r01 =  sin_a * sin_b * cos_c + cos_a * sin_c;
    vector float r11 = -sin_a * sin_b * sin_c + cos_a * cos_c;
    vector float r21 = -sin_a * cos_b;

    /* row 2 */
/* Function: p7_SSVFilter_longtarget() * Synopsis: Finds windows with SSV scores above some threshold (vewy vewy fast, in limited precision) * * Purpose: Calculates an approximation of the SSV (single ungapped diagonal) * score for regions of sequence <dsq> of length <L> residues, using * optimized profile <om>, and a preallocated one-row DP matrix <ox>, * and captures the positions at which such regions exceed the score * required to be significant in the eyes of the calling function, * which depends on the <bg> and <p> (usually p=0.02 for nhmmer). * Note that this variant performs only SSV computations, never * passing through the J state - the score required to pass SSV at * the default threshold (or less restrictive) is sufficient to * pass MSV in essentially all DNA models we've tested. * * Above-threshold diagonals are captured into a preallocated list * <windowlist>. Rather than simply capturing positions at which a * score threshold is reached, this function establishes windows * around those high-scoring positions, using scores in <msvdata>. * These windows can be merged by the calling function. * * * Args: dsq - digital target sequence, 1..L * L - length of dsq in residues * om - optimized profile * ox - DP matrix * msvdata - compact representation of substitution scores, for backtracking diagonals * bg - the background model, required for translating a P-value threshold into a score threshold * P - p-value below which a region is captured as being above threshold * windowlist - preallocated container for all hits (resized if necessary) * * * Note: We misuse the matrix <ox> here, using only a third of the * first dp row, accessing it as <dp[0..Q-1]> rather than * in triplets via <{MDI}MX(q)> macros, since we only need * to store M state values. We know that if <ox> was big * enough for normal DP calculations, it must be big enough * to hold the MSVFilter calculation. * * Returns: <eslOK> on success. * * Throws: <eslEINVAL> if <ox> allocation is too small. 
*/ int p7_SSVFilter_longtarget(const ESL_DSQ *dsq, int L, P7_OPROFILE *om, P7_OMX *ox, const P7_SCOREDATA *ssvdata, P7_BG *bg, double P, P7_HMM_WINDOWLIST *windowlist) { vector unsigned char mpv; /* previous row values */ vector unsigned char xEv; /* E state: keeps max for Mk->E as we go */ vector unsigned char xBv; /* B state: splatted vector of B[i-1] for B->Mk calculations */ vector unsigned char sv; /* temp storage of 1 curr row value in progress */ vector unsigned char biasv; /* emission bias in a vector */ uint8_t xJ; /* special states' scores */ int i; /* counter over sequence positions 1..L */ int q; /* counter over vectors 0..nq-1 */ int Q = p7O_NQB(om->M); /* segment length: # of vectors */ vector unsigned char *dp = ox->dpb[0]; /* we're going to use dp[0][0..q..Q-1], not {MDI}MX(q) macros*/ vector unsigned char *rsc; /* will point at om->rbv[x] for residue x[i] */ vector unsigned char zerov; /* vector of zeros */ vector unsigned char tecv; /* vector for E->C cost */ vector unsigned char tjbmv; /* vector for [JN]->B->M move cost */ vector unsigned char basev; /* offset for scores */ int status; int k; int n; int end; int rem_sc; int start; int target_end; int target_start; int max_end; int max_sc; int sc; int pos_since_max; float ret_sc; union { vector unsigned char v; uint8_t b[16]; } u; /* * Computing the score required to let P meet the F1 prob threshold * In original code, converting from a scaled int MSV * score S (the score getting to state E) to a probability goes like this: * usc = S - om->tec_b - om->tjb_b - om->base_b; * usc /= om->scale_b; * usc -= 3.0; * P = f ( (usc - nullsc) / eslCONST_LOG2 , mu, lambda) * and we're computing the threshold usc, so reverse it: * (usc - nullsc) / eslCONST_LOG2 = inv_f( P, mu, lambda) * usc = nullsc + eslCONST_LOG2 * inv_f( P, mu, lambda) * usc += 3 * usc *= om->scale_b * S = usc + om->tec_b + om->tjb_b + om->base_b * * Here, I compute threshold with length model based on max_length. Doesn't * matter much - in any case, both the bg and om models will change with roughly * 1 bit for each doubling of the length model, so they offset. */ float nullsc; float invP = esl_gumbel_invsurv(P, om->evparam[p7_MMU], om->evparam[p7_MLAMBDA]); vector unsigned char sc_threshv; /* pushes value to saturation if it's above pthresh */ int sc_thresh; /* Check that the DP matrix is ok for us. */ if (Q > ox->allocQ16) ESL_EXCEPTION(eslEINVAL, "DP matrix allocated too small"); ox->M = om->M; p7_bg_SetLength(bg, om->max_length); p7_oprofile_ReconfigMSVLength(om, om->max_length); p7_bg_NullOne (bg, dsq, om->max_length, &nullsc); sc_thresh = (int) ceil( ( ( nullsc + (invP * eslCONST_LOG2) + 3.0 ) * om->scale_b ) + om->base_b + om->tec_b + om->tjb_b ); sc_threshv = esl_vmx_set_u8( (int8_t)sc_thresh - 1); /* Initialization. In offset unsigned arithmetic, -infinity is 0, and 0 is om->base. */ biasv = esl_vmx_set_u8(om->bias_b); for (q = 0; q < Q; q++) dp[q] = vec_splat_u8(0); xJ = 0; zerov = vec_splat_u8(0); basev = esl_vmx_set_u8((int8_t) om->base_b); tecv = esl_vmx_set_u8((int8_t) om->tec_b); tjbmv = esl_vmx_set_u8((int8_t) om->tjb_b + (int8_t) om->tbm_b); xBv = vec_subs(basev, tjbmv); for (i = 1; i <= L; i++) { rsc = om->rbv[dsq[i]]; xEv = vec_splat_u8(0); /* Right shifts by 1 byte. 4,8,12,x becomes x,4,8,12. * Because ia32 is littlendian, this means a left bit shift. * Zeros shift on automatically, which is our -infinity. */ mpv = vec_sld(zerov, dp[Q-1], 15); for (q = 0; q < Q; q++) { /* Calculate new MMXo(i,q); don't store it yet, hold it in sv. 
*/ sv = vec_max(mpv, xBv); sv = vec_adds(sv, biasv); sv = vec_subs(sv, *rsc); rsc++; xEv = vec_max(xEv, sv); mpv = dp[q]; /* Load {MDI}(i-1,q) into mpv */ dp[q] = sv; /* Do delayed store of M(i,q) now that memory is usable */ } if (vec_any_gt(xEv, sc_threshv) ) { //hit pthresh, so add position to list and reset values //figure out which model state hit threshold end = -1; rem_sc = -1; for (q = 0; q < Q; q++) { /// Unpack and unstripe, so we can find the state that exceeded pthresh u.v = dp[q]; for (k = 0; k < 16; k++) { // unstripe //(q+Q*k+1) is the model position k at which the xE score is found if (u.b[k] >= sc_thresh && u.b[k] > rem_sc && (q+Q*k+1) <= om->M) { end = (q+Q*k+1); rem_sc = u.b[k]; } } dp[q] = vec_splat_u8(0); // while we're here ... this will cause values to get reset to xB in next dp iteration } //recover the diagonal that hit threshold start = end; target_end = target_start = i; sc = rem_sc; while (rem_sc > om->base_b - om->tjb_b - om->tbm_b) { rem_sc -= om->bias_b - ssvdata->ssv_scores[start*om->abc->Kp + dsq[target_start]]; --start; --target_start; //if ( start == 0 || target_start==0) break; } start++; target_start++; //extend diagonal further with single diagonal extension k = end+1; n = target_end+1; max_end = target_end; max_sc = sc; pos_since_max = 0; while (k<om->M && n<=L) { sc += om->bias_b - ssvdata->ssv_scores[k*om->abc->Kp + dsq[n]]; if (sc >= max_sc) { max_sc = sc; max_end = n; pos_since_max=0; } else { pos_since_max++; if (pos_since_max == 5) break; } k++; n++; } end += (max_end - target_end); target_end = max_end; ret_sc = ((float) (max_sc - om->tjb_b) - (float) om->base_b); ret_sc /= om->scale_b; ret_sc -= 3.0; // that's ~ L \log \frac{L}{L+3}, for our NN,CC,JJ p7_hmmwindow_new( windowlist, 0, // sequence_id; used in the FM-based filter, but not here target_start, // position in the target at which the diagonal starts 0, // position in the target fm_index at which diagonal starts; not used here, just in FM-based filter end, // position in the model at which the diagonal ends end-start+1 , // length of diagonal ret_sc, // score of diagonal p7_NOCOMPLEMENT, // always p7_NOCOMPLEMENT here; varies in FM-based filter L ); i = target_end; // skip forward } } /* end loop over sequence residues 1..L */ return eslOK; ERROR: ESL_EXCEPTION(eslEMEM, "Error allocating memory for hit list\n"); }
void fluid_genPressure_black(fluid *in_f, int y, pvt_fluidMode *mode) { struct pressure *p = &mode->pressure; int w = fieldWidth(p->velX); int h = fieldHeight(p->velX); #ifdef __APPLE_ALTIVEC__ #elif defined __SSE3__ #else int sx = fieldStrideX(p->velX); #endif int sy = fieldStrideY(p->velY); float *velX = fieldData(p->velX); float *velY = fieldData(p->velY); float *pressure = fieldData(p->pressure); if (y == 0) { #ifdef X_SIMD x128f *vPressure = (x128f*)fluidFloatPointer(pressure, 0*sy); x128f *vPressureP = (x128f*)fluidFloatPointer(pressure, 1*sy); int x; w/=4; for (x=0; x<w; x++) { vPressure[x] = vPressureP[x]; } #else int x; for (x=0; x<w; x++) { fluidFloatPointer(pressure,x*sx)[0] = fluidFloatPointer(pressure,x*sx + sy)[0]; } #endif } else if (y == h-1) { #ifdef X_SIMD x128f *vPressure = (x128f*)fluidFloatPointer(pressure, y*sy); x128f *vPressureP = (x128f*)fluidFloatPointer(pressure, (y-1)*sy); int x; w/=4; for (x=0; x<w; x++) { vPressure[x] = vPressureP[x]; } #else int x; for (x=0; x<w; x++) { fluidFloatPointer(pressure,x*sx + y*sy)[0] = fluidFloatPointer(pressure,x*sx + (y-1)*sy)[0]; } #endif } else { #ifdef X_SIMD float *vPressureRow = fluidFloatPointer(pressure, y*sy); x128f *vPressure = (x128f*)vPressureRow; x128f *vVelX = (x128f*)fluidFloatPointer(velX, y*sy); x128f *vPressureN = (x128f*)fluidFloatPointer(pressure, (y+1)*sy); x128f *vVelYN = (x128f*)fluidFloatPointer(velY, (y+1)*sy); x128f *vPressureP = (x128f*)fluidFloatPointer(pressure, (y-1)*sy); x128f *vVelYP = (x128f*)fluidFloatPointer(velY, (y-1)*sy); x128f div4 = {0.0f, 1.0f/4.0f, 0.0f, 1.0f/4.0f}; x128f mask = {1.0f, 0.0f, 1.0f, 0.0f}; #endif #ifdef __APPLE_ALTIVEC__ //int myTempVariable = __mfspr( 1023 ); vector float vZero = {0,0,0,0}; vec_dstst(vPressure, 0x01000001, 0); vec_dst(vVelX, 0x01000001, 1); vec_dst(vVelYN, 0x01000001, 2); vec_dst(vVelYP, 0x01000001, 3); int x; { vector float tmp; //Compute shifts vector float sl_p = vec_sld(vPressure[0], vPressure[1],4); vector float sr_p = vec_sld(vZero, vPressure[0], 12); vector float sl_vx = vec_sld(vVelX[0], vVelX[1],4); vector float sr_vx = vec_sld(vZero, vVelX[0], 12); //Sum everything!!! tmp = vec_add(sl_p, sr_p); tmp = vec_add(tmp, vPressureN[0]); tmp = vec_add(tmp, vPressureP[0]); tmp = vec_sub(tmp, sl_vx); tmp = vec_add(tmp, sr_vx); tmp = vec_sub(tmp, vVelYN[0]); tmp = vec_add(tmp, vVelYP[0]); vPressure[0] = vec_madd(tmp, div4, vZero); vPressureRow[0] = vPressureRow[1]; } x=1; while (x<w/4-5) { PRESSURE_VEC_PRE(0) PRESSURE_VEC_PRE(1) PRESSURE_VEC_PRE(2) PRESSURE_VEC_PRE(3) PRESSURE_VEC_SHIFT(0) PRESSURE_VEC_SHIFT(1) PRESSURE_VEC_SHIFT(2) PRESSURE_VEC_SHIFT(3) PRESSURE_VEC_END(0) PRESSURE_VEC_END(1) PRESSURE_VEC_END(2) PRESSURE_VEC_END(3) x+=4; } while (x<w/4-1) { PRESSURE_VEC_PRE(0) PRESSURE_VEC_SHIFT(0) PRESSURE_VEC_END(0) x++; } { vector float tmp; //Compute shifts vector float sl_p = vec_sld(vPressure[x], vZero,4); vector float sr_p = vec_sld(vPressure[x-1], vPressure[x], 12); vector float sl_vx = vec_sld(vVelX[x], vZero,4); vector float sr_vx = vec_sld(vVelX[x-1], vVelX[x], 12); //Sum everything!!! 
tmp = vec_add(sl_p, sr_p); tmp = vec_add(tmp, vPressureN[x]); tmp = vec_add(tmp, vPressureP[x]); tmp = vec_sub(tmp, sl_vx); tmp = vec_add(tmp, sr_vx); tmp = vec_sub(tmp, vVelYN[x]); tmp = vec_add(tmp, vVelYP[x]); vPressure[x] = vec_madd(tmp, div4, vZero); vPressureRow[w-1] = vPressureRow[w-2]; } #elif defined __SSE3__ int x; { __m128 tmp; //Compute shifts __m128 sl_p = _mm_srli_sf128(vPressure[0],4); sl_p = _mm_add_ps(sl_p,_mm_slli_sf128(vPressure[1],12)); __m128 sr_p = _mm_slli_sf128(vPressure[0],4); __m128 sl_vx = _mm_srli_sf128(vVelX[0],4); sl_vx = _mm_add_ps(sl_vx,_mm_slli_sf128(vVelX[1],12)); __m128 sr_vx = _mm_slli_sf128(vVelX[0],4); //Sum everything!!! tmp = _mm_add_ps(sl_p, sr_p); tmp = _mm_add_ps(tmp, vPressureN[0]); tmp = _mm_add_ps(tmp, vPressureP[0]); tmp = _mm_sub_ps(tmp, sl_vx); tmp = _mm_add_ps(tmp, sr_vx); tmp = _mm_sub_ps(tmp, vVelYN[0]); tmp = _mm_add_ps(tmp, vVelYP[0]); vPressure[0] = _mm_mul_ps(tmp, div4); vPressureRow[0] = vPressureRow[1]; } x=1; while (x<w/4-9) { //Compute shifts (1) PRESSURE_SSE_PRE(0); PRESSURE_SSE_PRE(1); PRESSURE_SSE_PRE(2); //Sum everything!!! (1) PRESSURE_SSE_POST(0); PRESSURE_SSE_POST(1); PRESSURE_SSE_POST(2); x+=3; } while (x<w/4-1) { //Compute shifts PRESSURE_SSE_PRE(0); //Sum everything!!! PRESSURE_SSE_POST(0); x++; } { __m128 tmp; //Compute shifts __m128 sl_p = _mm_srli_sf128(vPressure[x],4); __m128 sr_p = _mm_slli_sf128(vPressure[x],4); sr_p = _mm_add_ps(sr_p,_mm_srli_sf128(vPressure[x-1],12)); __m128 sl_vx = _mm_srli_sf128(vVelX[x],4); __m128 sr_vx = _mm_slli_sf128(vVelX[x],4); sr_vx = _mm_add_ps(sr_vx,_mm_srli_sf128(vVelX[x-1],12)); //Sum everything!!! tmp = _mm_add_ps(sl_p, sr_p); tmp = _mm_add_ps(tmp, vPressureN[x]); tmp = _mm_add_ps(tmp, vPressureP[x]); tmp = _mm_sub_ps(tmp, sl_vx); tmp = _mm_add_ps(tmp, sr_vx); tmp = _mm_sub_ps(tmp, vVelYN[x]); tmp = _mm_add_ps(tmp, vVelYP[x]); vPressure[x] = _mm_mul_ps(tmp, div4); vPressureRow[w-1] = vPressureRow[w-2]; } #else float lastPressureX = fluidFloatPointer(pressure,sx + y*sy)[0]; float lastVelX = fluidFloatPointer(velX, y*sy)[0]; float curPressureX = lastPressureX; float curVelX = fluidFloatPointer(velX, sx + y*sy)[0]; fluidFloatPointer(pressure,y*sy)[0] = lastPressureX; int x; int curxy = sx + y*sy; for (x=1; x<w-1; x++) { float nextPressureX = fluidFloatPointer(pressure,curxy + sx)[0]; float nextVelX = fluidFloatPointer(velX,curxy + sx)[0]; fluidFloatPointer(pressure,curxy)[0] = ( lastPressureX + nextPressureX + fluidFloatPointer(pressure,curxy - sy)[0] + fluidFloatPointer(pressure,curxy + sy)[0] - ( nextVelX - lastVelX + fluidFloatPointer(velY,curxy + sy)[0] - fluidFloatPointer(velY,curxy - sy)[0])) / 4.0f; lastPressureX = curPressureX; curPressureX = nextPressureX; lastVelX = curVelX; curVelX = nextVelX; curxy += sx; } fluidFloatPointer(pressure,(w-1)*sx + y*sy)[0] = fluidFloatPointer(pressure,(w-2)*sx + y*sy)[0]; #endif } }
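/* Sketch (not part of the solver above) of the neighbour-shift idiom used in
 * the AltiVec pressure stencil: given the previous, current and next 4-float
 * groups of one row, vec_sld builds vectors holding each lane's left and
 * right row neighbours so the 4-wide stencil never reloads scalar data. */
#include <altivec.h>

static void
neighbour_shift_demo(vector float prev, vector float cur, vector float next,
                     vector float *left, vector float *right)
{
  *left  = vec_sld(prev, cur, 12);   /* { prev3, cur0, cur1, cur2 } */
  *right = vec_sld(cur, next, 4);    /* { cur1, cur2, cur3, next0 } */
}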
static int backward_engine(int do_full, const ESL_DSQ *dsq, int L, const P7_OPROFILE *om, const P7_OMX *fwd, P7_OMX *bck, float *opt_sc) { vector float mpv, ipv, dpv; /* previous row values */ vector float mcv, dcv; /* current row values */ vector float tmmv, timv, tdmv; /* tmp vars for accessing rotated transition scores */ vector float xBv; /* collects B->Mk components of B(i) */ vector float xEv; /* splatted E(i) */ vector float zerov; /* splatted 0.0's in a vector */ float xN, xE, xB, xC, xJ; /* special states' scores */ int i; /* counter over sequence positions 0,1..L */ int q; /* counter over quads 0..Q-1 */ int Q = p7O_NQF(om->M); /* segment length: # of vectors */ int j; /* DD segment iteration counter (4 = full serialization) */ vector float *dpc; /* current DP row */ vector float *dpp; /* next ("previous") DP row */ vector float *rp; /* will point into om->rfv[x] for residue x[i+1] */ vector float *tp; /* will point into (and step thru) om->tfv transition scores */ /* initialize the L row. */ bck->M = om->M; bck->L = L; bck->has_own_scales = FALSE; /* backwards scale factors are *usually* given by <fwd> */ dpc = bck->dpf[L * do_full]; xJ = 0.0; xB = 0.0; xN = 0.0; xC = om->xf[p7O_C][p7O_MOVE]; /* C<-T */ xE = xC * om->xf[p7O_E][p7O_MOVE]; /* E<-C, no tail */ xEv = esl_vmx_set_float(xE); zerov = (vector float) vec_splat_u32(0); dcv = (vector float) vec_splat_u32(0);; /* solely to silence a compiler warning */ for (q = 0; q < Q; q++) MMO(dpc,q) = DMO(dpc,q) = xEv; for (q = 0; q < Q; q++) IMO(dpc,q) = zerov; /* init row L's DD paths, 1) first segment includes xE, from DMO(q) */ tp = om->tfv + 8*Q - 1; /* <*tp> now the [4 8 12 x] TDD quad */ dpv = vec_sld(DMO(dpc,Q-1), zerov, 4); for (q = Q-1; q >= 1; q--) { DMO(dpc,q) = vec_madd(dpv, *tp, DMO(dpc,q)); tp--; dpv = DMO(dpc,q); } dcv = vec_madd(dpv, *tp, zerov); DMO(dpc,q) = vec_add(DMO(dpc,q), dcv); /* 2) three more passes, only extending DD component (dcv only; no xE contrib from DMO(q)) */ for (j = 1; j < 4; j++) { tp = om->tfv + 8*Q - 1; /* <*tp> now the [4 8 12 x] TDD quad */ dcv = vec_sld(dcv, zerov, 4); for (q = Q-1; q >= 0; q--) { dcv = vec_madd(dcv, *tp, zerov); tp--; DMO(dpc,q) = vec_add(DMO(dpc,q), dcv); } } /* now MD init */ tp = om->tfv + 7*Q - 3; /* <*tp> now the [4 8 12 x] Mk->Dk+1 quad */ dcv = vec_sld(DMO(dpc,0), zerov, 4); for (q = Q-1; q >= 0; q--) { MMO(dpc,q) = vec_madd(dcv, *tp, MMO(dpc,q)); tp -= 7; dcv = DMO(dpc,q); } /* Sparse rescaling: same scale factors as fwd matrix */ if (fwd->xmx[L*p7X_NXCELLS+p7X_SCALE] > 1.0) { xE = xE / fwd->xmx[L*p7X_NXCELLS+p7X_SCALE]; xN = xN / fwd->xmx[L*p7X_NXCELLS+p7X_SCALE]; xC = xC / fwd->xmx[L*p7X_NXCELLS+p7X_SCALE]; xJ = xJ / fwd->xmx[L*p7X_NXCELLS+p7X_SCALE]; xB = xB / fwd->xmx[L*p7X_NXCELLS+p7X_SCALE]; xEv = esl_vmx_set_float(1.0 / fwd->xmx[L*p7X_NXCELLS+p7X_SCALE]); for (q = 0; q < Q; q++) { MMO(dpc,q) = vec_madd(MMO(dpc,q), xEv, zerov); DMO(dpc,q) = vec_madd(DMO(dpc,q), xEv, zerov); IMO(dpc,q) = vec_madd(IMO(dpc,q), xEv, zerov); } } bck->xmx[L*p7X_NXCELLS+p7X_SCALE] = fwd->xmx[L*p7X_NXCELLS+p7X_SCALE]; bck->totscale = log(bck->xmx[L*p7X_NXCELLS+p7X_SCALE]); /* Stores */ bck->xmx[L*p7X_NXCELLS+p7X_E] = xE; bck->xmx[L*p7X_NXCELLS+p7X_N] = xN; bck->xmx[L*p7X_NXCELLS+p7X_J] = xJ; bck->xmx[L*p7X_NXCELLS+p7X_B] = xB; bck->xmx[L*p7X_NXCELLS+p7X_C] = xC; #if p7_DEBUGGING if (bck->debugging) p7_omx_DumpFBRow(bck, TRUE, L, 9, 4, xE, xN, xJ, xB, xC); /* logify=TRUE, <rowi>=L, width=9, precision=4*/ #endif /* main recursion */ for (i = L-1; i >= 1; i--) /* backwards stride */ { /* 
phase 1. B(i) collected. Old row destroyed, new row contains * complete I(i,k), partial {MD}(i,k) w/ no {MD}->{DE} paths yet. */ dpc = bck->dpf[i * do_full]; dpp = bck->dpf[(i+1) * do_full]; rp = om->rfv[dsq[i+1]] + Q-1; /* <*rp> is now the [4 8 12 x] match emission quad */ tp = om->tfv + 7*Q - 1; /* <*tp> is now the [4 8 12 x] TII transition quad */ /* leftshift the first transition quads */ tmmv = vec_sld(om->tfv[1], zerov, 4); timv = vec_sld(om->tfv[2], zerov, 4); tdmv = vec_sld(om->tfv[3], zerov, 4); mpv = vec_madd(MMO(dpp,0), om->rfv[dsq[i+1]][0], zerov); /* precalc M(i+1,k+1)*e(M_k+1,x_{i+1}) */ mpv = vec_sld(mpv, zerov, 4); xBv = zerov; for (q = Q-1; q >= 0; q--) /* backwards stride */ { vector float t1; ipv = IMO(dpp,q); /* assumes emission odds ratio of 1.0; i+1's IMO(q) now free */ t1 = vec_madd(mpv, timv, zerov); IMO(dpc,q) = vec_madd(ipv, *tp, t1); tp--; DMO(dpc,q) = vec_madd(mpv, tdmv, zerov); t1 = vec_madd(mpv, tmmv, zerov); mcv = vec_madd(ipv, *tp, t1); tp -= 2; /* obtain mpv for next q. i+1's MMO(q) is freed */ mpv = vec_madd(MMO(dpp,q), *rp, zerov); rp--; MMO(dpc,q) = mcv; tdmv = *tp; tp--; timv = *tp; tp--; tmmv = *tp; tp--; xBv = vec_madd(mpv, *tp, xBv); tp--; } /* phase 2: now that we have accumulated the B->Mk transitions in xBv, we can do the specials */ xB = esl_vmx_hsum_float(xBv); xC = xC * om->xf[p7O_C][p7O_LOOP]; xJ = (xB * om->xf[p7O_J][p7O_MOVE]) + (xJ * om->xf[p7O_J][p7O_LOOP]); /* must come after xB */ xN = (xB * om->xf[p7O_N][p7O_MOVE]) + (xN * om->xf[p7O_N][p7O_LOOP]); /* must come after xB */ xE = (xC * om->xf[p7O_E][p7O_MOVE]) + (xJ * om->xf[p7O_E][p7O_LOOP]); /* must come after xJ, xC */ xEv = esl_vmx_set_float(xE); /* splat */ /* phase 3: {MD}->E paths and one step of the D->D paths */ tp = om->tfv + 8*Q - 1; /* <*tp> now the [4 8 12 x] TDD quad */ dpv = vec_add(DMO(dpc,0), xEv); dpv = vec_sld(dpv, zerov, 4); for (q = Q-1; q >= 1; q--) { dcv = vec_madd(dpv, *tp, xEv); tp--; DMO(dpc,q) = vec_add(DMO(dpc,q), dcv); dpv = DMO(dpc,q); MMO(dpc,q) = vec_add(MMO(dpc,q), xEv); } dcv = vec_madd(dpv, *tp, zerov); DMO(dpc,q) = vec_add(DMO(dpc,q), vec_add(dcv, xEv)); MMO(dpc,q) = vec_add(MMO(dpc,q), xEv); /* phase 4: finish extending the DD paths */ /* fully serialized for now */ for (j = 1; j < 4; j++) /* three passes: we've already done 1 segment, we need 4 total */ { dcv = vec_sld(dcv, zerov, 4); tp = om->tfv + 8*Q - 1; /* <*tp> now the [4 8 12 x] TDD quad */ for (q = Q-1; q >= 0; q--) { dcv = vec_madd(dcv, *tp, zerov); tp--; DMO(dpc,q) = vec_add(DMO(dpc,q), dcv); } } /* phase 5: add M->D paths */ dcv = vec_sld(DMO(dpc,0), zerov, 4); tp = om->tfv + 7*Q - 3; /* <*tp> is now the [4 8 12 x] Mk->Dk+1 quad */ for (q = Q-1; q >= 0; q--) { MMO(dpc,q) = vec_madd(dcv, *tp, MMO(dpc,q)); tp -= 7; dcv = DMO(dpc,q); } /* Sparse rescaling */ /* In rare cases [J3/119] scale factors from <fwd> are * insufficient and backwards will overflow. In this case, we * switch on the fly to using our own scale factors, different * from those in <fwd>. This will complicate subsequent * posterior decoding routines. */ if (xB > 1.0e16) bck->has_own_scales = TRUE; if (bck->has_own_scales) bck->xmx[i*p7X_NXCELLS+p7X_SCALE] = (xB > 1.0e4) ? 
xB : 1.0; else bck->xmx[i*p7X_NXCELLS+p7X_SCALE] = fwd->xmx[i*p7X_NXCELLS+p7X_SCALE]; if (bck->xmx[i*p7X_NXCELLS+p7X_SCALE] > 1.0) { xE /= bck->xmx[i*p7X_NXCELLS+p7X_SCALE]; xN /= bck->xmx[i*p7X_NXCELLS+p7X_SCALE]; xJ /= bck->xmx[i*p7X_NXCELLS+p7X_SCALE]; xB /= bck->xmx[i*p7X_NXCELLS+p7X_SCALE]; xC /= bck->xmx[i*p7X_NXCELLS+p7X_SCALE]; xBv = esl_vmx_set_float(1.0 / bck->xmx[i*p7X_NXCELLS+p7X_SCALE]); for (q = 0; q < Q; q++) { MMO(dpc,q) = vec_madd(MMO(dpc,q), xBv, zerov); DMO(dpc,q) = vec_madd(DMO(dpc,q), xBv, zerov); IMO(dpc,q) = vec_madd(IMO(dpc,q), xBv, zerov); } bck->totscale += log(bck->xmx[i*p7X_NXCELLS+p7X_SCALE]); } /* Stores are separate only for pedagogical reasons: easy to * turn this into a more memory efficient version just by * deleting the stores. */ bck->xmx[i*p7X_NXCELLS+p7X_E] = xE; bck->xmx[i*p7X_NXCELLS+p7X_N] = xN; bck->xmx[i*p7X_NXCELLS+p7X_J] = xJ; bck->xmx[i*p7X_NXCELLS+p7X_B] = xB; bck->xmx[i*p7X_NXCELLS+p7X_C] = xC; #if p7_DEBUGGING if (bck->debugging) p7_omx_DumpFBRow(bck, TRUE, i, 9, 4, xE, xN, xJ, xB, xC); /* logify=TRUE, <rowi>=i, width=9, precision=4*/ #endif } /* thus ends the loop over sequence positions i */ /* Termination at i=0, where we can only reach N,B states. */ dpp = bck->dpf[1 * do_full]; tp = om->tfv; /* <*tp> is now the [1 5 9 13] TBMk transition quad */ rp = om->rfv[dsq[1]]; /* <*rp> is now the [1 5 9 13] match emission quad */ xBv = (vector float) vec_splat_u32(0); for (q = 0; q < Q; q++) { mpv = vec_madd(MMO(dpp,q), *rp, zerov); rp++; xBv = vec_madd(mpv, *tp, xBv); tp += 7; } /* horizontal sum of xBv */ xB = esl_vmx_hsum_float(xBv); xN = (xB * om->xf[p7O_N][p7O_MOVE]) + (xN * om->xf[p7O_N][p7O_LOOP]); bck->xmx[p7X_B] = xB; bck->xmx[p7X_C] = 0.0; bck->xmx[p7X_J] = 0.0; bck->xmx[p7X_N] = xN; bck->xmx[p7X_E] = 0.0; bck->xmx[p7X_SCALE] = 1.0; #if p7_DEBUGGING dpc = bck->dpf[0]; for (q = 0; q < Q; q++) /* Not strictly necessary, but if someone's looking at DP matrices, this is nice to do: */ MMO(dpc,q) = DMO(dpc,q) = IMO(dpc,q) = zerov; if (bck->debugging) p7_omx_DumpFBRow(bck, TRUE, 0, 9, 4, bck->xmx[p7X_E], bck->xmx[p7X_N], bck->xmx[p7X_J], bck->xmx[p7X_B], bck->xmx[p7X_C]); /* logify=TRUE, <rowi>=0, width=9, precision=4*/ #endif if (isnan(xN)) ESL_EXCEPTION(eslERANGE, "backward score is NaN"); else if (L>0 && xN == 0.0) ESL_EXCEPTION(eslERANGE, "backward score underflow (is 0.0)"); /* [J5/118] */ else if (isinf(xN) == 1) ESL_EXCEPTION(eslERANGE, "backward score overflow (is infinity)"); if (opt_sc != NULL) *opt_sc = bck->totscale + log(xN); return eslOK; }
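One idiom worth isolating from backward_engine above is the shift-by-one-element across a striped row: vec_sld against a splatted zero vector, with a byte count of 4, moves the four floats of a quad over by one lane and zero-fills the vacated lane. The toy helpers below are a minimal sketch of that idiom on its own (the function names are mine, not HMMER's); they assume an AltiVec target and the big-endian element ordering that vec_sld is defined in terms of.

#include <altivec.h>

/* Illustrative helpers only. vec_sld(a, b, n) returns bytes n..15 of a
 * followed by bytes 0..n-1 of b, i.e. a 16-byte window into (a || b). */

/* {v0,v1,v2,v3} -> {v1,v2,v3,0}: drop the first float, append a zero.
 * This is the dpv/dcv boundary shift used in the DD and MD passes above. */
static inline vector float shift_out_first(vector float v)
{
  vector float zerov = (vector float) vec_splat_u32(0);
  return vec_sld(v, zerov, 4);
}

/* {v0,v1,v2,v3} -> {0,v0,v1,v2}: the mirror image, prepending a zero. */
static inline vector float shift_in_zero(vector float v)
{
  vector float zerov = (vector float) vec_splat_u32(0);
  return vec_sld(zerov, v, 12);
}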
vector double test_shift_left_double (vector double x, vector double y) { return vec_sld (x, y, /* shift_by */ 10); }
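A note on the shift count in the snippet above: vec_sld's third argument is a byte offset into the 32-byte concatenation of its operands, not an element count, so a count of 10 lands mid-element for vector double and the result mixes bytes of two doubles. The variant below (my own scaffolding, assuming a VSX-capable compiler where vector double operands are supported) shows the whole-element case, a byte count of 8.

#include <altivec.h>

/* With big-endian byte ordering, bytes 8..23 of (x || y) are the second
 * element of x followed by the first element of y: { x[1], y[0] }. */
vector double test_shift_left_double_whole_element(vector double x, vector double y)
{
  return vec_sld(x, y, /* shift_by */ 8);
}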
void *mem_searchrn(void *s, size_t len) { vector unsigned char v_cr; vector unsigned char v_nl; vector unsigned char v0; vector unsigned char v_perm; vector unsigned char c; vector bool char rr, rn; vector bool char last_rr; char *p; ssize_t k; size_t block_num; unsigned f; if(unlikely(!s || !len)) return NULL; /* only do one prefetch, this covers nearly 128k */ block_num = DIV_ROUNDUP(len, 512); f = block_num >= 256 ? 0 : block_num << 16; f |= 512; vec_dst((const unsigned char *)s, f, 2); v_cr = vec_splat_u8('\r'); v_nl = vec_splat_u8('\n'); v0 = vec_splat_u8(0); last_rr = (vector bool char)v0; k = SOVUC - ALIGN_DOWN_DIFF(s, SOVUC) - (ssize_t)len; p = (char *)ALIGN_DOWN(s, SOVUC); c = vec_ldl(0, (const vector unsigned char *)p); if(unlikely(k > 0)) goto K_SHIFT; v_perm = vec_lvsl(0, (unsigned char *)s); c = vec_perm(c, v0, v_perm); v_perm = vec_lvsr(0, (unsigned char *)s); c = vec_perm(v0, c, v_perm); rr = vec_cmpeq(c, v_cr); rn = vec_cmpeq(c, v_nl); k = -k; goto START_LOOP; do { p += SOVUC; c = vec_ldl(0, (const vector unsigned char *)p); k -= SOVUC; if(k > 0) { rr = vec_cmpeq(c, v_cr); rn = vec_cmpeq(c, v_nl); if(vec_any_eq(last_rr, rn)) { vec_dss(2); return p - 1; } START_LOOP: last_rr = (vector bool char)vec_sld(v0, (vector unsigned char)rr, 1); rn = (vector bool char)vec_sld(v0, (vector unsigned char)rn, 15); rr = vec_and(rr, rn); /* get mask */ if(vec_any_ne(rr, v0)) { vec_dss(2); return p + vec_zpos(rr); } } } while(k > 0); k = -k; K_SHIFT: vec_dss(2); v_perm = vec_lvsr(0, (unsigned char *)k); c = vec_perm(v0, c, v_perm); v_perm = vec_lvsl(0, (unsigned char *)k); c = vec_perm(c, v0, v_perm); rr = vec_cmpeq(c, v_cr); rn = vec_cmpeq(c, v_nl); if(vec_any_eq(last_rr, rn)) return p - 1; rn = (vector bool char)vec_sld(v0, (vector unsigned char)rn, 15); rr = vec_and(rr, rn); /* get mask */ if(vec_any_ne(rr, v0)) return p + vec_zpos(rr); return NULL; }
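For reference, here is a scalar sketch of what mem_searchrn above appears intended to compute, judging from its name and comments: the address of the '\r' of the first "\r\n" pair in the buffer, or NULL if there is none. This is an illustrative cross-check only, not the project's API, and it ignores the prefetch and alignment handling that the vector version spends most of its code on.

#include <stddef.h>

/* Byte-at-a-time reference; mem_searchrn_ref is a hypothetical name. */
static void *mem_searchrn_ref(void *s, size_t len)
{
  unsigned char *p = s;
  size_t i;

  if (!s || !len)
    return NULL;
  for (i = 0; i + 1 < len; i++)
    if (p[i] == '\r' && p[i + 1] == '\n')
      return p + i;
  return NULL;
}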
static av_always_inline void put_vp8_epel_v_altivec_core(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int h, int my, int w, int is6tap) { LOAD_V_SUBPEL_FILTER(my-1); vec_u8 s0, s1, s2, s3, s4, s5, filt, align_vech, perm_vec, align_vecl; vec_s16 s0f, s1f, s2f, s3f, s4f, s5f, f16h, f16l; vec_s16 c64 = vec_sl(vec_splat_s16(1), vec_splat_u16(6)); vec_u16 c7 = vec_splat_u16(7); // we want pixels 0-7 to be in the even positions and 8-15 in the odd, // so combine this permute with the alignment permute vector align_vech = vec_lvsl(0, src); align_vecl = vec_sld(align_vech, align_vech, 8); if (w ==16) perm_vec = vec_mergeh(align_vech, align_vecl); else perm_vec = vec_mergeh(align_vech, align_vech); if (is6tap) s0 = load_with_perm_vec(-2*src_stride, src, perm_vec); s1 = load_with_perm_vec(-1*src_stride, src, perm_vec); s2 = load_with_perm_vec( 0*src_stride, src, perm_vec); s3 = load_with_perm_vec( 1*src_stride, src, perm_vec); if (is6tap) s4 = load_with_perm_vec( 2*src_stride, src, perm_vec); src += (2+is6tap)*src_stride; while (h --> 0) { if (is6tap) s5 = load_with_perm_vec(0, src, perm_vec); else s4 = load_with_perm_vec(0, src, perm_vec); FILTER_V(f16h, vec_mule); if (w == 16) { FILTER_V(f16l, vec_mulo); filt = vec_packsu(f16h, f16l); vec_st(filt, 0, dst); } else { filt = vec_packsu(f16h, f16h); if (w == 4) filt = (vec_u8)vec_splat((vec_u32)filt, 0); else vec_ste((vec_u32)filt, 4, (uint32_t*)dst); vec_ste((vec_u32)filt, 0, (uint32_t*)dst); } if (is6tap) s0 = s1; s1 = s2; s2 = s3; s3 = s4; if (is6tap) s4 = s5; dst += dst_stride; src += src_stride; } }
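The per-pixel arithmetic behind the vector code above is a plain FIR filter: four or six taps are summed, rounded by the +64 that c64 splats, shifted right by the 7 in c7, and clamped to a byte. The scalar sketch below is illustrative only (the function name and taps[] parameter are placeholders, not FFmpeg's filter tables), assuming the usual VP8 convention that each subpel filter's taps sum to 128.

#include <stdint.h>
#include <stddef.h>

static uint8_t clamp_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }

/* One output pixel of a 6-tap vertical filter; src points at the
 * center row, so the taps cover rows -2..+3 relative to the output. */
static uint8_t filter_v_6tap_pixel(const uint8_t *src, ptrdiff_t stride,
                                   const int8_t taps[6])
{
    int sum = 0, i;
    for (i = 0; i < 6; i++)
        sum += taps[i] * src[(i - 2) * stride];
    return clamp_u8((sum + 64) >> 7);   /* round, renormalize by 128, clamp */
}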
void foo() { vector bool int boolVector = (vector bool int) vec_splat_u32(3); boolVector = vec_sld( boolVector, boolVector, 1 ); /* { dg-bogus "no instance of overloaded" } */ }
inline ushort v_reduce_sum(const v_uint16x8& a) { const vec_int4 v4 = vec_int4_c(vec_unpackhu(vec_adds(a.val, vec_sld(a.val, a.val, 8)))); return saturate_cast<ushort>(vec_extract(vec_sums(v4, vec_int4_z), 3)); }
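The reduction above is a horizontal sum of eight 16-bit lanes: the vector is folded in half with a saturating add (vec_adds), the four surviving sums are zero-extended to 32 bits (vec_unpackhu), added across by vec_sums, and the total is clamped back to ushort. Below is a scalar sketch of the same computation, with a plain array standing in for the v_uint16x8 wrapper (the name is mine, not OpenCV's).

#include <stdint.h>

static uint16_t reduce_sum_u16x8_ref(const uint16_t v[8])
{
    uint32_t sum = 0;
    int i;
    for (i = 0; i < 4; i++) {
        uint32_t pair = (uint32_t)v[i] + v[i + 4];
        if (pair > 0xFFFF) pair = 0xFFFF;          /* vec_adds saturates each lane */
        sum += pair;
    }
    return sum > 0xFFFF ? 0xFFFF : (uint16_t)sum;  /* saturate_cast<ushort> */
}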