Example 1
/* M(i,k) is reached from B(i-1), M(i-1,k-1), D(i-1,k-1), or I(i-1,k-1). */
static inline int
select_m(const P7_OPROFILE *om, const P7_OMX *ox, int i, int k)
{
  int     Q     = p7O_NQF(ox->M);
  int     q     = (k-1) % Q;		/* (q,r) is position of the current DP cell M(i,k) */
  int     r     = (k-1) / Q;
  vector float *tp    = om->tfv + 7*q;       	/* *tp now at start of transitions to cur cell M(i,k) */
  vector float  xBv;
  vector float  zerov;
  vector float  mpv, dpv, ipv;
  union { vector float v; float p[4]; } u, tv;
  float   path[4];
  int     state[4] = { p7T_M, p7T_I, p7T_D, p7T_B };
  
  xBv   = esl_vmx_set_float(ox->xmx[(i-1)*p7X_NXCELLS+p7X_B]);
  zerov = (vector float) vec_splat_u32(0);

  if (q > 0) {
    mpv = ox->dpf[i-1][(q-1)*3 + p7X_M];
    dpv = ox->dpf[i-1][(q-1)*3 + p7X_D];
    ipv = ox->dpf[i-1][(q-1)*3 + p7X_I];
  } else {
    mpv = vec_sld(zerov, ox->dpf[i-1][(Q-1)*3 + p7X_M], 12);
    dpv = vec_sld(zerov, ox->dpf[i-1][(Q-1)*3 + p7X_D], 12);
    ipv = vec_sld(zerov, ox->dpf[i-1][(Q-1)*3 + p7X_I], 12);
  }	  

  /* paths are numbered so that most desirable choice in case of tie is first. */
  u.v = xBv;  tv.v = *tp;  path[3] = ((tv.p[r] == 0.0) ?  -eslINFINITY : u.p[r]);  tp++;
  u.v = mpv;  tv.v = *tp;  path[0] = ((tv.p[r] == 0.0) ?  -eslINFINITY : u.p[r]);  tp++;
  u.v = ipv;  tv.v = *tp;  path[1] = ((tv.p[r] == 0.0) ?  -eslINFINITY : u.p[r]);  tp++;
  u.v = dpv;  tv.v = *tp;  path[2] = ((tv.p[r] == 0.0) ?  -eslINFINITY : u.p[r]);  
  return state[esl_vec_FArgMax(path, 4)];
}
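The (q,r) indexing used above follows HMMER's striped SIMD layout: model position k (1..M) lives in element r of vector q of the DP row, with Q vectors per row. A minimal sketch of that mapping, mirroring the arithmetic in select_m() above (the helper name striped_coords is hypothetical):

/* Striped coordinates as used by select_m(): position k of the model maps
 * to element r of vector q, where Q is the number of 4-float vectors per
 * DP row. E.g. with Q = 4 (M up to 16), k = 7 gives q = 2, r = 1. */
static void striped_coords(int k, int Q, int *q, int *r)
{
  *q = (k-1) % Q;
  *r = (k-1) / Q;
}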
Example 2
/* M(i,k) is reached from B(i-1), M(i-1,k-1), D(i-1,k-1), or I(i-1,k-1). */
static inline int
select_m(ESL_RANDOMNESS *rng, const P7_OPROFILE *om, const P7_OMX *ox, int i, int k)
{
  int     Q     = p7O_NQF(ox->M);
  int     q     = (k-1) % Q;		/* (q,r) is position of the current DP cell M(i,k) */
  int     r     = (k-1) / Q;
  vector float *tp = om->tfv + 7*q;    	/* *tp now at start of transitions to cur cell M(i,k) */
  vector float  xBv;
  vector float  zerov;
  vector float  mpv, dpv, ipv;
  union { vector float v; float p[4]; } u;
  float   path[4];
  int     state[4] = { p7T_B, p7T_M, p7T_I, p7T_D };
  
  xBv   = esl_vmx_set_float(ox->xmx[(i-1)*p7X_NXCELLS+p7X_B]);
  zerov = (vector float) vec_splat_u32(0);

  if (q > 0) {
    mpv = ox->dpf[i-1][(q-1)*3 + p7X_M];
    dpv = ox->dpf[i-1][(q-1)*3 + p7X_D];
    ipv = ox->dpf[i-1][(q-1)*3 + p7X_I];
  } else {
    mpv = vec_sld(zerov, ox->dpf[i-1][(Q-1)*3 + p7X_M], 12);
    dpv = vec_sld(zerov, ox->dpf[i-1][(Q-1)*3 + p7X_D], 12);
    ipv = vec_sld(zerov, ox->dpf[i-1][(Q-1)*3 + p7X_I], 12);
  }	  
  
  u.v = vec_madd(xBv, *tp, zerov); tp++;  path[0] = u.p[r];
  u.v = vec_madd(mpv, *tp, zerov); tp++;  path[1] = u.p[r];
  u.v = vec_madd(ipv, *tp, zerov); tp++;  path[2] = u.p[r];
  u.v = vec_madd(dpv, *tp, zerov);        path[3] = u.p[r];
  esl_vec_FNorm(path, 4);
  return state[esl_rnd_FChoose(rng, path, 4)];
}
Example 3
/* D(i,k) is reached from M(i, k-1) or D(i,k-1). */
static inline int
select_d(const P7_OPROFILE *om, const P7_OMX *ox, int i, int k)
{
  int     Q     = p7O_NQF(ox->M);
  int     q     = (k-1) % Q;		/* (q,r) is position of the current DP cell D(i,k) */
  int     r     = (k-1) / Q;
  vector float zerov;
  union { vector float v; float p[4]; } mpv, dpv, tmdv, tddv;
  float   path[2];

  zerov = (vector float) vec_splat_u32(0);

  if (q > 0) {
    mpv.v  = ox->dpf[i][(q-1)*3 + p7X_M];
    dpv.v  = ox->dpf[i][(q-1)*3 + p7X_D];
    tmdv.v = om->tfv[7*(q-1) + p7O_MD];
    tddv.v = om->tfv[7*Q + (q-1)];
  } else {
    mpv.v  = vec_sld(zerov, ox->dpf[i][(Q-1)*3 + p7X_M], 12);
    dpv.v  = vec_sld(zerov, ox->dpf[i][(Q-1)*3 + p7X_D], 12);
    tmdv.v = vec_sld(zerov, om->tfv[7*(Q-1) + p7O_MD],   12);
    tddv.v = vec_sld(zerov, om->tfv[8*Q-1],              12);
  }	  

  path[0] = ((tmdv.p[r] == 0.0) ? -eslINFINITY : mpv.p[r]);
  path[1] = ((tddv.p[r] == 0.0) ? -eslINFINITY : dpv.p[r]);
  return  ((path[0] >= path[1]) ? p7T_M : p7T_D);
}
Example 4
void x264_add4x4_idct_altivec( uint8_t *dst, int16_t dct[16] )
{
    vec_u16_t onev = vec_splat_u16(1);

    dct[0] += 32; // rounding for the >>6 at the end

    vec_s16_t s0, s1, s2, s3;

    s0 = vec_ld( 0x00, dct );
    s1 = vec_sld( s0, s0, 8 );
    s2 = vec_ld( 0x10, dct );
    s3 = vec_sld( s2, s2, 8 );

    vec_s16_t d0, d1, d2, d3;
    IDCT_1D_ALTIVEC( s0, s1, s2, s3, d0, d1, d2, d3 );

    vec_s16_t tr0, tr1, tr2, tr3;

    VEC_TRANSPOSE_4( d0, d1, d2, d3, tr0, tr1, tr2, tr3 );

    vec_s16_t idct0, idct1, idct2, idct3;
    IDCT_1D_ALTIVEC( tr0, tr1, tr2, tr3, idct0, idct1, idct2, idct3 );

    vec_u8_t perm_ldv = vec_lvsl( 0, dst );
    vec_u16_t sixv = vec_splat_u16(6);
    LOAD_ZERO;

    ALTIVEC_STORE4_SUM_CLIP( &dst[0*FDEC_STRIDE], idct0, perm_ldv );
    ALTIVEC_STORE4_SUM_CLIP( &dst[1*FDEC_STRIDE], idct1, perm_ldv );
    ALTIVEC_STORE4_SUM_CLIP( &dst[2*FDEC_STRIDE], idct2, perm_ldv );
    ALTIVEC_STORE4_SUM_CLIP( &dst[3*FDEC_STRIDE], idct3, perm_ldv );
}
Example 5
/* D(i,k) is reached from M(i, k-1) or D(i,k-1). */
static inline int
select_d(ESL_RANDOMNESS *rng, const P7_OPROFILE *om, const P7_OMX *ox, int i, int k)
{
  int     Q     = p7O_NQF(ox->M);
  int     q     = (k-1) % Q;		/* (q,r) is position of the current DP cell D(i,k) */
  int     r     = (k-1) / Q;
  vector float  zerov;
  vector float  mpv, dpv;
  vector float  tmdv, tddv;
  union { vector float v; float p[4]; } u;
  float   path[2];
  int     state[2] = { p7T_M, p7T_D };

  zerov = (vector float) vec_splat_u32(0);

  if (q > 0) {
    mpv  = ox->dpf[i][(q-1)*3 + p7X_M];
    dpv  = ox->dpf[i][(q-1)*3 + p7X_D];
    tmdv = om->tfv[7*(q-1) + p7O_MD];
    tddv = om->tfv[7*Q + (q-1)];
  } else {
    mpv  = vec_sld(zerov, ox->dpf[i][(Q-1)*3 + p7X_M], 12);
    dpv  = vec_sld(zerov, ox->dpf[i][(Q-1)*3 + p7X_D], 12);
    tmdv = vec_sld(zerov, om->tfv[7*(Q-1) + p7O_MD],   12);
    tddv = vec_sld(zerov, om->tfv[8*Q-1],              12);
  }	  

  u.v = vec_madd(mpv, tmdv, zerov); path[0] = u.p[r];
  u.v = vec_madd(dpv, tddv, zerov); path[1] = u.p[r];
  esl_vec_FNorm(path, 2);
  return state[esl_rnd_FChoose(rng, path, 2)];
}
Example 6
void foo (void) 
{
  vector bool int boolVec1 = (vector bool int) vec_splat_u32(3);
  vector bool short boolVec2 = (vector bool short) vec_splat_u16(3);
  vector bool char boolVec3 = (vector bool char) vec_splat_u8(3);

  boolVec1 = vec_sld( boolVec1, boolVec1, 4 );
  boolVec2 = vec_sld( boolVec2, boolVec2, 2 );
  boolVec3 = vec_sld( boolVec3, boolVec3, 1 );
}
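All of these examples lean on vec_sld; as a quick reference for its byte-level semantics, here is a standalone sketch (not taken from any of the sources above; it assumes an AltiVec-enabled compiler and big-endian element order): vec_sld(a, b, n) returns the 16 bytes of the 32-byte concatenation a||b starting at byte offset n of a, where n must be a compile-time constant in 0..15.

#include <altivec.h>
#include <stdio.h>

int main(void)
{
  vector unsigned char a = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15};
  vector unsigned char b = {16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
  unsigned char out[16] __attribute__((aligned(16)));
  int i;

  /* Bytes 4..19 of a||b: the tail of a followed by the head of b. */
  vec_st(vec_sld(a, b, 4), 0, out);
  for (i = 0; i < 16; i++) printf("%d ", out[i]);  /* 4 5 6 ... 15 16 17 18 19 */
  printf("\n");
  return 0;
}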
Example 7
inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                 const v_float32x4& c, const v_float32x4& d)
{
    vec_float4 ac = vec_add(vec_mergel(a.val, c.val), vec_mergeh(a.val, c.val));
    ac = vec_add(ac, vec_sld(ac, ac, 8));

    vec_float4 bd = vec_add(vec_mergel(b.val, d.val), vec_mergeh(b.val, d.val));
    bd = vec_add(bd, vec_sld(bd, bd, 8));
    return v_float32x4(vec_mergeh(ac, bd));
}
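For orientation, a scalar sketch of what the merge/shift sequence above produces (not part of OpenCV; the function name is hypothetical): lane i of the result is the horizontal sum of the i-th input vector.

/* Scalar equivalent (sketch) of v_reduce_sum4:
 * out = { sum(a), sum(b), sum(c), sum(d) }. */
static void reduce_sum4_scalar(const float a[4], const float b[4],
                               const float c[4], const float d[4], float out[4])
{
  out[0] = a[0] + a[1] + a[2] + a[3];
  out[1] = b[0] + b[1] + b[2] + b[3];
  out[2] = c[0] + c[1] + c[2] + c[3];
  out[3] = d[0] + d[1] + d[2] + d[3];
}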
Example 8
template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b)
{
    enum { CV_SHIFT = 16 - imm * (sizeof(typename _Tpvec::lane_type)) };
    if (CV_SHIFT == 16)
        return a;
#ifdef __IBMCPP__
    return _Tpvec(vec_sld(b.val, a.val, CV_SHIFT & 15));
#else
    return _Tpvec(vec_sld(b.val, a.val, CV_SHIFT));
#endif
}
Example 9
void test_integer(void) {
  vf = vec_sld(vf, vf, idx);    // expected-error {{no matching function}}
                                // expected-note@vecintrin.h:* 13 {{candidate function not viable}}
                                // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 15}}
  vd = vec_sld(vd, vd, idx);    // expected-error {{no matching function}}
                                // expected-note@vecintrin.h:* 13 {{candidate function not viable}}
                                // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 15}}

  vuc = vec_msum_u128(vul, vul, vuc, idx);  // expected-error {{must be a constant integer}}
  vuc = vec_msum_u128(vul, vul, vuc, -1);   // expected-error {{should be a value from 0 to 15}}
  vuc = vec_msum_u128(vul, vul, vuc, 16);   // expected-error {{should be a value from 0 to 15}}
}
Example 10
void v_load_deinterleave_f32(float *ptr, vector float* a, vector float* b, vector float* c)
{
    vector float v1 = vec_xl( 0, ptr);
    vector float v2 = vec_xl(16, ptr);
    vector float v3 = vec_xl(32, ptr);

    static const vector unsigned char flp = {0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31};
    *a = vec_perm(v1, vec_sld(v3, v2, 8), flp);

    static const vector unsigned char flp2 = {28, 29, 30, 31, 0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19};
    *b = vec_perm(v2, vec_sld(v1, v3, 8), flp2);

    *c = vec_perm(vec_sld(v2, v1, 8), v3, flp);
}
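A scalar sketch of the intended result of the permutes above (the exact vec_perm/vec_sld byte indices depend on the target's endianness, so only the end result is shown; the helper name is hypothetical): twelve packed floats x0,y0,z0, x1,y1,z1, ... are split into three 4-lane planes.

/* Scalar equivalent (sketch) of v_load_deinterleave_f32 for 3 channels. */
static void deinterleave3_scalar(const float *ptr, float a[4], float b[4], float c[4])
{
  int i;
  for (i = 0; i < 4; i++) {
    a[i] = ptr[3*i + 0];
    b[i] = ptr[3*i + 1];
    c[i] = ptr[3*i + 2];
  }
}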
Example 11
template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b)
{
    enum { CV_SHIFT = imm * (sizeof(typename _Tpvec::lane_type)) };
    if (CV_SHIFT == 16)
        return b;
    return _Tpvec(vec_sld(a.val, b.val, CV_SHIFT));
}
Example 12
void v_store_interleave_f32(float *ptr, vector float a, vector float b, vector float c)
{
    vector float hbc = vec_mergeh(b, c);

    static const vector unsigned char ahbc = {0, 1, 2, 3, 16, 17, 18, 19, 20, 21, 22, 23, 4, 5, 6, 7};
    vec_xst(vec_perm(a, hbc, ahbc),  0, ptr);

    vector float lab = vec_mergel(a, b);
    vec_xst(vec_sld(lab, hbc, 8), 16, ptr);

    static const vector unsigned char clab = {8, 9, 10, 11, 24, 25, 26, 27, 28, 29, 30, 31, 12, 13, 14, 15};
    vec_xst(vec_perm(c, lab, clab), 32, ptr);
}
Example 13
static void h264_idct_add_altivec(uint8_t *dst, int16_t *block, int stride)
{
    vec_s16 va0, va1, va2, va3;
    vec_s16 vz0, vz1, vz2, vz3;
    vec_s16 vtmp0, vtmp1, vtmp2, vtmp3;
    vec_u8 va_u8;
    vec_u32 va_u32;
    vec_s16 vdst_ss;
    const vec_u16 v6us = vec_splat_u16(6);
    vec_u8 vdst, vdst_orig;
    vec_u8 vdst_mask = vec_lvsl(0, dst);
    int element = ((unsigned long)dst & 0xf) >> 2;
    LOAD_ZERO;

    block[0] += 32;  /* add 32 as a DC-level for rounding */

    vtmp0 = vec_ld(0,block);
    vtmp1 = vec_sld(vtmp0, vtmp0, 8);
    vtmp2 = vec_ld(16,block);
    vtmp3 = vec_sld(vtmp2, vtmp2, 8);
    memset(block, 0, 16 * sizeof(int16_t));

    VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);
    VEC_TRANSPOSE_4(va0,va1,va2,va3,vtmp0,vtmp1,vtmp2,vtmp3);
    VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);

    va0 = vec_sra(va0,v6us);
    va1 = vec_sra(va1,v6us);
    va2 = vec_sra(va2,v6us);
    va3 = vec_sra(va3,v6us);

    VEC_LOAD_U8_ADD_S16_STORE_U8(va0);
    dst += stride;
    VEC_LOAD_U8_ADD_S16_STORE_U8(va1);
    dst += stride;
    VEC_LOAD_U8_ADD_S16_STORE_U8(va2);
    dst += stride;
    VEC_LOAD_U8_ADD_S16_STORE_U8(va3);
}
Example 14
static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, int16_t *block, int stride, int size)
{
    vec_s16 dc16;
    vec_u8 dcplus, dcminus, v0, v1, v2, v3, aligner;
    LOAD_ZERO;
    DECLARE_ALIGNED(16, int, dc);
    int i;

    dc = (block[0] + 32) >> 6;
    block[0] = 0;
    dc16 = vec_splat((vec_s16) vec_lde(0, &dc), 1);

    if (size == 4)
        dc16 = vec_sld(dc16, zero_s16v, 8);
    dcplus = vec_packsu(dc16, zero_s16v);
    dcminus = vec_packsu(vec_sub(zero_s16v, dc16), zero_s16v);

    aligner = vec_lvsr(0, dst);
    dcplus = vec_perm(dcplus, dcplus, aligner);
    dcminus = vec_perm(dcminus, dcminus, aligner);

    for (i = 0; i < size; i += 4) {
        v0 = vec_ld(0, dst+0*stride);
        v1 = vec_ld(0, dst+1*stride);
        v2 = vec_ld(0, dst+2*stride);
        v3 = vec_ld(0, dst+3*stride);

        v0 = vec_adds(v0, dcplus);
        v1 = vec_adds(v1, dcplus);
        v2 = vec_adds(v2, dcplus);
        v3 = vec_adds(v3, dcplus);

        v0 = vec_subs(v0, dcminus);
        v1 = vec_subs(v1, dcminus);
        v2 = vec_subs(v2, dcminus);
        v3 = vec_subs(v3, dcminus);

        vec_st(v0, 0, dst+0*stride);
        vec_st(v1, 0, dst+1*stride);
        vec_st(v2, 0, dst+2*stride);
        vec_st(v3, 0, dst+3*stride);

        dst += 4*stride;
    }
}
Example 15
void foo(void) {
  const unsigned char *buf;
  vector pixel vp = { 3, 4, 5, 6 };
  vector bool int vbi = { 1, 0, 1, 0 };
  vector bool short vbs = { 1, 0, 1, 0, 1, 0, 1, 0 };
  vector bool char vbc = { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 };
  vector signed char vsc;
  int a = 3;
  
  vec_dst(buf, a, 1);
  vec_dstst(buf, a, 2);
  vec_dststt(buf, a, 3);
  vec_dststt(buf, a, 2);

  vp = vec_sld(vp, vp, 5);
  vbc = vec_splat(vbc, 7);
  vbs = vec_splat(vbs, 12);
  vp = vec_splat(vp, 17);
  vbi = vec_splat(vbi, 31);  
}
Example 16
/* Function:  p7_ViterbiFilter()
 * Synopsis:  Calculates Viterbi score, vewy vewy fast, in limited precision.
 * Incept:    SRE, Tue Nov 27 09:15:24 2007 [Janelia]
 *
 * Purpose:   Calculates an approximation of the Viterbi score for sequence
 *            <dsq> of length <L> residues, using optimized profile <om>,
 *            and a preallocated one-row DP matrix <ox>. Return the 
 *            estimated Viterbi score (in nats) in <ret_sc>.
 *            
 *            Score may overflow (and will, on high-scoring
 *            sequences), but will not underflow. 
 *            
 *            The model must be in a local alignment mode; other modes
 *            cannot provide the necessary guarantee of no underflow.
 *            
 *            This is a striped SIMD Viterbi implementation using PowerPC
 *            Altivec/VMX integer intrinsics \citep{Farrar07}, in reduced
 *            precision (signed words, 16 bits).
 *
 * Args:      dsq     - digital target sequence, 1..L
 *            L       - length of dsq in residues          
 *            om      - optimized profile
 *            ox      - DP matrix
 *            ret_sc  - RETURN: Viterbi score (in nats)          
 *
 * Returns:   <eslOK> on success;
 *            <eslERANGE> if the score overflows; in this case
 *            <*ret_sc> is <eslINFINITY>, and the sequence can 
 *            be treated as a high-scoring hit.
 *
 * Throws:    <eslEINVAL> if <ox> allocation is too small, or if
 *            profile isn't in a local alignment mode. (Must be in local
 *            alignment mode because that's what helps us guarantee 
 *            limited dynamic range.)
 *
 * Xref:      [Farrar07] for ideas behind striped SIMD DP.
 *            J2/46-47 for layout of HMMER's striped SIMD DP.
 *            J2/50 for single row DP.
 *            J2/60 for reduced precision (epu8)
 *            J2/65 for initial benchmarking
 *            J2/66 for precision maximization
 *            J4/138-140 for reimplementation in 16-bit precision
 */
int
p7_ViterbiFilter(const ESL_DSQ *dsq, int L, const P7_OPROFILE *om, P7_OMX *ox, float *ret_sc)
{
  vector signed short mpv, dpv, ipv; /* previous row values                                       */
  vector signed short sv;	     /* temp storage of 1 curr row value in progress              */
  vector signed short dcv;	     /* delayed storage of D(i,q+1)                               */
  vector signed short xEv;	     /* E state: keeps max for Mk->E as we go                     */
  vector signed short xBv;	     /* B state: splatted vector of B[i-1] for B->Mk calculations */
  vector signed short Dmaxv;         /* keeps track of maximum D cell on row                      */
  int16_t  xE, xB, xC, xJ, xN;	     /* special states' scores                                    */
  int16_t  Dmax;		     /* maximum D cell score on row                               */
  int i;			     /* counter over sequence positions 1..L                      */
  int q;			     /* counter over vectors 0..nq-1                              */
  int Q;                             /* segment length: # of vectors                              */
  vector signed short *dp;           /* using {MDI}MX(q) macro requires initialization of <dp>    */
  vector signed short *rsc;	     /* will point at om->ru[x] for residue x[i]                  */
  vector signed short *tsc;	     /* will point into (and step thru) om->tu                    */

  vector signed short negInfv;

  Q = p7O_NQW(om->M);
  dp = ox->dpw[0];

  /* Check that the DP matrix is ok for us. */
  if (Q > ox->allocQ8)                                 ESL_EXCEPTION(eslEINVAL, "DP matrix allocated too small");
  if (om->mode != p7_LOCAL && om->mode != p7_UNILOCAL) ESL_EXCEPTION(eslEINVAL, "Fast filter only works for local alignment");
  ox->M   = om->M;

  negInfv = esl_vmx_set_s16((signed short)-32768);
  
  /* Initialization. In signed 16-bit (int16_t) arithmetic, our -infinity is -32768.
   */
  for (q = 0; q < Q; q++)
    MMXo(q) = IMXo(q) = DMXo(q) = negInfv;
  xN   = om->base_w;
  xB   = xN + om->xw[p7O_N][p7O_MOVE];
  xJ   = -32768;
  xC   = -32768;
  xE   = -32768;

#if p7_DEBUGGING
  if (ox->debugging) p7_omx_DumpVFRow(ox, 0, xE, 0, xJ, xB, xC); /* first 0 is <rowi>: do header. second 0 is xN: always 0 here. */
#endif

  for (i = 1; i <= L; i++)
    {
      rsc   = om->rwv[dsq[i]];
      tsc   = om->twv;
      dcv   = negInfv;               /* "-infinity" */
      xEv   = negInfv;
      Dmaxv = negInfv;
      xBv   = esl_vmx_set_s16(xB);

      /* Right shifts by 1 value (2 bytes): 4,8,12,x becomes x,4,8,12.
       * vec_sld pulls the vacated element in from negInfv, so the new
       * slot holds -32768 ("-infinity") rather than zero.
       */
      mpv = MMXo(Q-1);  mpv = vec_sld(negInfv, mpv, 14);
      dpv = DMXo(Q-1);  dpv = vec_sld(negInfv, dpv, 14);
      ipv = IMXo(Q-1);  ipv = vec_sld(negInfv, ipv, 14);

      for (q = 0; q < Q; q++)
	{
	  /* Calculate new MMXo(i,q); don't store it yet, hold it in sv. */
	  sv   =              vec_adds(xBv, *tsc);  tsc++;
	  sv   = vec_max (sv, vec_adds(mpv, *tsc)); tsc++;
	  sv   = vec_max (sv, vec_adds(ipv, *tsc)); tsc++;
	  sv   = vec_max (sv, vec_adds(dpv, *tsc)); tsc++;
	  sv   = vec_adds(sv, *rsc);                rsc++;
	  xEv  = vec_max(xEv, sv);
	  
	  /* Load {MDI}(i-1,q) into mpv, dpv, ipv;
	   * {MDI}MX(q) is then the current, not the prev row
	   */
	  mpv = MMXo(q);
	  dpv = DMXo(q);
	  ipv = IMXo(q);

	  /* Do the delayed stores of {MD}(i,q) now that memory is usable */
	  MMXo(q) = sv;
	  DMXo(q) = dcv;

	  /* Calculate the next D(i,q+1) partially: M->D only;
           * delay storage, holding it in dcv
	   */
	  dcv   = vec_adds(sv, *tsc);  tsc++;
	  Dmaxv = vec_max(dcv, Dmaxv);

	  /* Calculate and store I(i,q) */
	  sv     =             vec_adds(mpv, *tsc);  tsc++;
	  IMXo(q)= vec_max(sv, vec_adds(ipv, *tsc)); tsc++;
	}	  

      /* Now the "special" states, which start from Mk->E (->C, ->J->B) */
      xE = esl_vmx_hmax_s16(xEv);
      if (xE >= 32767) { *ret_sc = eslINFINITY; return eslERANGE; }	/* immediately detect overflow */
      xN = xN + om->xw[p7O_N][p7O_LOOP];
      xC = ESL_MAX(xC + om->xw[p7O_C][p7O_LOOP], xE + om->xw[p7O_E][p7O_MOVE]);
      xJ = ESL_MAX(xJ + om->xw[p7O_J][p7O_LOOP], xE + om->xw[p7O_E][p7O_LOOP]);
      xB = ESL_MAX(xJ + om->xw[p7O_J][p7O_MOVE], xN + om->xw[p7O_N][p7O_MOVE]);
      /* and now xB will carry over into next i, and xC carries over after i=L */

      /* Finally the "lazy F" loop (sensu [Farrar07]). We can often
       * prove that we don't need to evaluate any D->D paths at all.
       *
       * The observation is that if we can show that on the next row,
       * B->M(i+1,k) paths always dominate M->D->...->D->M(i+1,k) paths
       * for all k, then we don't need any D->D calculations.
       * 
       * The test condition is:
       *      max_k D(i,k) + max_k ( TDD(k-2) + TDM(k-1) - TBM(k) ) < xB(i)
       * So:
       *   max_k (TDD(k-2) + TDM(k-1) - TBM(k)) is precalc'ed in om->dd_bound;
       *   max_k D(i,k) is why we tracked Dmaxv;
       *   xB(i) was just calculated above.
       */
      Dmax = esl_vmx_hmax_s16(Dmaxv);
      if (Dmax + om->ddbound_w > xB) 
	{
	  /* Now we're obligated to do at least one complete DD path to be sure. */
	  /* dcv has carried through from end of q loop above */
	  dcv = vec_sld(negInfv, dcv, 14); 
	  tsc = om->twv + 7*Q;	/* set tsc to start of the DD's */
	  for (q = 0; q < Q; q++) 
	    {
	      DMXo(q) = vec_max(dcv, DMXo(q));	
	      dcv     = vec_adds(DMXo(q), *tsc); tsc++;
	    }

	  /* We may have to do up to three more passes; the check
	   * is for whether crossing a segment boundary can improve
	   * our score. 
	   */
	  do {
	    dcv = vec_sld(negInfv, dcv, 14); 
	    tsc = om->twv + 7*Q;	/* set tsc to start of the DD's */
	    for (q = 0; q < Q; q++) 
	      {
		if (! vec_any_gt(dcv, DMXo(q))) break;
		DMXo(q) = vec_max(dcv, DMXo(q));	
		dcv     = vec_adds(DMXo(q), *tsc);   tsc++;
	      }	    
	  } while (q == Q);
	}
      else  /* not calculating DD? then just store the last M->D vector calc'ed.*/
	DMXo(0) = vec_sld(negInfv, dcv, 14);
	  
#if p7_DEBUGGING
      if (ox->debugging) p7_omx_DumpVFRow(ox, i, xE, 0, xJ, xB, xC);   
#endif
    } /* end loop over sequence residues 1..L */

  /* finally C->T */
  if (xC > -32768) 
    {
      *ret_sc = (float) xC + (float) om->xw[p7O_C][p7O_MOVE] - (float) om->base_w;
      /* *ret_sc += L * om->ncj_roundoff;  see J4/150 for rationale: superseded by -3.0nat approximation*/
      *ret_sc /= om->scale_w;
      *ret_sc -= 3.0; /* the NN/CC/JJ=0,-3nat approximation: see J5/36. That's ~ L \log \frac{L}{L+3}, for our NN,CC,JJ contrib */
    }
  else *ret_sc = -eslINFINITY;
  return eslOK;
}
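The "lazy F" skip test described in the comments above boils down to a single scalar comparison; a sketch using the same names (ddbound is the precomputed max_k(TDD(k-2) + TDM(k-1) - TBM(k)); the helper itself is hypothetical):

/* Lazy-F test (sketch): a D->D pass over the row is only needed when the
 * best D cell plus the precomputed DD/DM-vs-BM bound can still beat a
 * fresh B->M entry on the next row. */
static int needs_dd_pass(int16_t Dmax, int16_t ddbound, int16_t xB)
{
  return (Dmax + ddbound > xB);
}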
Example 17
float vsincos2f(float x) {

  // Load x into an aligned float array
  float __attribute__((aligned(16))) xa[4];
  xa[0] = x;

  // We want to calculate these:
  // nom   = 166320.0 * x - 22260.0 * POW3(x) + 551.0 * POW5(x);
  // denom = 166320.0 + 5460.0 * POW2(x) + 75.0 * POW4(x);
  // res = nom/denom;
  //
  // We first setup our constants:
  // vc1 = | a1  | a3 | b0  |  b2 |
  // vc2 = | 0.0 | a5 | 0.0 |  b4 |
  vector float vc1 = { 166320.0, -22260, 166320.0, 5460.0 },
	       vc2 = { 0.0, 551.0, 0.0, 75.0 };
  vector float vx = vec_ld(0, xa);
  vector float vres, vdenom, vest1, 
               vx2, vx02, vx13, vx24,
               v0 = (vector float)vec_splat_u32(0),
               v1 = vec_ctf(vec_splat_u32(1),0);

  // Load x into a vector and splat it all over
  vx = vec_splat(vx, 0);
  // get the vector with all elements: x^2 
  vx2 = vec_madd(vx, vx, v0);

  // We need a vector with | 1.0 | x^2 | 1.0 | x^2 |
  vx02 = vec_mergeh(v1, vx2);
  // Multiply with x -> | x | x^3 | x | x^3 |
  vx13 = vec_madd(vx, vx02, v0);
  // Now shift left and combine with vx02 -> | x | x^3 | 1.0 | x^2 |
  vx13 = vec_sld(vx13, vx02, 8);
  // Again with x^2 -> | x^3 | x^5 | x^2 | x^4 |
  vx24 = vec_madd(vx13, vx2, v0);

  // Multiply with the coefficients vectors:
  // First with vc1 -> | a1*x | a3*x^3 | b0*1.0 | b2*x^2 |
  vres = vec_madd(vx13, vc1, v0);
  // Now with vc2 (and add previous result) -> | a1*x + 0*x^3 | a3*x^3 + a5*x^5 | b0*1.0 + 0.0*x^2 | b2*x^2 + b4*x^4 |
  vres = vec_madd(vx24, vc2, vres);
  // Shift left by 4 and add the vectors -> | nom | .. | denom | .. |
  vres = vec_add(vres, vec_sld(vres, vres, 4));

  // Now splat denom (we don't have to splat nom, we'll just take the first element after the division).
  vdenom = vec_splat(vres, 2);

  vest1 = vec_re(vdenom);
  //1st round of Newton-Raphson refinement
  vdenom = vec_madd( vest1, vec_nmsub( vest1, vdenom, v1 ), vest1 );
  // 2nd round of Newton-Raphson refinement
  // vdenom = vec_madd( vest2, vec_nmsub( vest2, vdenom, v1 ), vest2 );
  vres = vec_madd(vres, vdenom, v0);
  vec_st(vres, 0, xa);
  //printf("vres = %2.7f %2.7f %2.7f %2.7f\n", xa[0], xa[1], xa[2], xa[3]);

/*  float nom, denom, res;
  nom   = 166320.0 * x - 22260.0 * POW3(x) + 551.0 * POW5(x);
  denom = 166320.0 + 5460.0 * POW2(x) + 75.0 * POW4(x);

  printf("nom = %2.7f, denom = %2.7f\n", nom, denom);

  res = nom/denom;
  printf("res = %2.7f\n", res);*/
  printf("res = %2.7f\n", xa[0]);
  return xa[0];
}
Example 18
static inline vector float vec_reduce( vector float v ) {
    v = vec_add( v, vec_sld( v, v, 8 ) );
    v = vec_add( v, vec_sld( v, v, 4 ) );
    return ( v );
}
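A short usage sketch for vec_reduce() above (assumptions: 16-byte aligned input; sum4 is a hypothetical helper). After the first rotate-by-8-bytes and add, every lane holds x0+x2 or x1+x3; after the second rotate-by-4-bytes and add, every lane holds the full sum, so any lane can be read back.

/* Horizontal sum of four packed floats via vec_reduce (sketch). */
static float sum4(const float x[4])   /* x must be 16-byte aligned */
{
  union { vector float v; float f[4]; } u;
  u.v = vec_reduce(vec_ld(0, x));
  return u.f[0];
}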
Example 19
/* Function:  p7_OptimalAccuracy()
 * Synopsis:  DP fill of an optimal accuracy alignment calculation.
 * Incept:    SRE, Mon Aug 18 11:04:48 2008 [Janelia]
 *
 * Purpose:   Calculates the fill step of the optimal accuracy decoding
 *            algorithm \citep{Kall05}.
 *            
 *            Caller provides the posterior decoding matrix <pp>,
 *            which was calculated by Forward/Backward on a target sequence
 *            of length <pp->L> using the query model <om>.
 *            
 *            Caller also provides a DP matrix <ox>, allocated for a full
 *            <om->M> by <L> comparison. The routine fills this in
 *            with OA scores.
 *  
 * Args:      om    - query profile      
 *            pp    - posterior decoding matrix created by <p7_GPosteriorDecoding()>
 *            ox    - RESULT: caller provided DP matrix for <om->M> by <L> 
 *            ret_e - RETURN: expected number of correctly decoded positions 
 *
 * Returns:   <eslOK> on success, and <*ret_e> contains the final OA
 *            score, which is the expected number of correctly decoded
 *            positions in the target sequence (up to <L>).
 *
 * Throws:    (no abnormal error conditions)
 */
int
p7_OptimalAccuracy(const P7_OPROFILE *om, const P7_OMX *pp, P7_OMX *ox, float *ret_e)
{
  vector float mpv, dpv, ipv;      /* previous row values                                       */
  vector float sv;		   /* temp storage of 1 curr row value in progress              */
  vector float xEv;		   /* E state: keeps max for Mk->E as we go                     */
  vector float xBv;		   /* B state: splatted vector of B[i-1] for B->Mk calculations */
  vector float dcv;
  float  *xmx = ox->xmx;
  vector float *dpc = ox->dpf[0];  /* current row, for use in {MDI}MO(dpp,q) access macro       */
  vector float *dpp;               /* previous row, for use in {MDI}MO(dpp,q) access macro      */
  vector float *ppp;		   /* quads in the <pp> posterior probability matrix            */
  vector float *tp;		   /* quads in the <om->tfv> transition scores                  */
  vector float zerov;
  vector float infv;
  int M = om->M;
  int Q = p7O_NQF(M);
  int q;
  int j;
  int i;
  float t1, t2;

  zerov = (vector float) vec_splat_u32(0);
  infv  = esl_vmx_set_float(-eslINFINITY);

  ox->M = om->M;
  ox->L = pp->L;
  for (q = 0; q < Q; q++) MMO(dpc, q) = IMO(dpc,q) = DMO(dpc,q) = infv;
  XMXo(0, p7X_E)    = -eslINFINITY;
  XMXo(0, p7X_N)    = 0.;
  XMXo(0, p7X_J)    = -eslINFINITY;
  XMXo(0, p7X_B)    = 0.;
  XMXo(0, p7X_C)    = -eslINFINITY;

  for (i = 1; i <= pp->L; i++)
    {
      dpp = dpc;		/* previous DP row in OA matrix */
      dpc = ox->dpf[i];   	/* current DP row in OA matrix  */
      ppp = pp->dpf[i];		/* current row in the posterior probabilities per position */
      tp  = om->tfv;		/* transition probabilities */
      dcv = infv;
      xEv = infv;
      xBv = esl_vmx_set_float(XMXo(i-1, p7X_B));

      mpv = vec_sld(infv, MMO(dpp,Q-1), 12);  /* Right shifts by 4 bytes. 4,8,12,x becomes x,4,8,12. */
      dpv = vec_sld(infv, DMO(dpp,Q-1), 12);
      ipv = vec_sld(infv, IMO(dpp,Q-1), 12);
      for (q = 0; q < Q; q++)
	{
	  sv  =             vec_and(vec_cmpgt(*tp, zerov), xBv);  tp++;
	  sv  = vec_max(sv, vec_and(vec_cmpgt(*tp, zerov), mpv)); tp++;
	  sv  = vec_max(sv, vec_and(vec_cmpgt(*tp, zerov), ipv)); tp++;
	  sv  = vec_max(sv, vec_and(vec_cmpgt(*tp, zerov), dpv)); tp++;
	  sv  = vec_add(sv, *ppp);                                ppp += 2;
	  xEv = vec_max(xEv, sv);
	  
	  mpv = MMO(dpp,q);
	  dpv = DMO(dpp,q);
	  ipv = IMO(dpp,q);

	  MMO(dpc,q) = sv;
	  DMO(dpc,q) = dcv;

	  dcv = vec_and(vec_cmpgt(*tp, zerov), sv); tp++;

	  sv         =             vec_and(vec_cmpgt(*tp, zerov), mpv);   tp++;
	  sv         = vec_max(sv, vec_and(vec_cmpgt(*tp, zerov), ipv));  tp++;
	  IMO(dpc,q) = vec_add(sv, *ppp);                                 ppp++;
	}
      
      /* dcv has carried through from end of q loop above; store it 
       * in first pass, we add M->D and D->D path into DMX
       */
      dcv = vec_sld(infv, dcv, 12);
      tp  = om->tfv + 7*Q;	/* set tp to start of the DD's */
      for (q = 0; q < Q; q++)
	{
	  DMO(dpc, q) = vec_max(dcv, DMO(dpc, q));
	  dcv         = vec_and(vec_cmpgt(*tp, zerov), DMO(dpc,q));   tp++;
	}

      /* fully serialized D->D; can optimize later */
      for (j = 1; j < 4; j++)
	{
	  dcv = vec_sld(infv, dcv, 12);
	  tp  = om->tfv + 7*Q;	
	  for (q = 0; q < Q; q++)
	    {
	      DMO(dpc, q) = vec_max(dcv, DMO(dpc, q));
	      dcv         = vec_and(vec_cmpgt(*tp, zerov), dcv);   tp++;
	    }
	}

      /* D->E paths */
      for (q = 0; q < Q; q++) xEv = vec_max(xEv, DMO(dpc,q));
      
      /* Specials */
      XMXo(i,p7X_E) = esl_vmx_hmax_float(xEv);
      
      t1 = ( (om->xf[p7O_J][p7O_LOOP] == 0.0) ? 0.0 : ox->xmx[(i-1)*p7X_NXCELLS+p7X_J] + pp->xmx[i*p7X_NXCELLS+p7X_J]);
      t2 = ( (om->xf[p7O_E][p7O_LOOP] == 0.0) ? 0.0 : ox->xmx[   i *p7X_NXCELLS+p7X_E]);
      ox->xmx[i*p7X_NXCELLS+p7X_J] = ESL_MAX(t1, t2);

      t1 = ( (om->xf[p7O_C][p7O_LOOP] == 0.0) ? 0.0 : ox->xmx[(i-1)*p7X_NXCELLS+p7X_C] + pp->xmx[i*p7X_NXCELLS+p7X_C]);
      t2 = ( (om->xf[p7O_E][p7O_MOVE] == 0.0) ? 0.0 : ox->xmx[   i *p7X_NXCELLS+p7X_E]);
      ox->xmx[i*p7X_NXCELLS+p7X_C] = ESL_MAX(t1, t2);
      
      ox->xmx[i*p7X_NXCELLS+p7X_N] = ((om->xf[p7O_N][p7O_LOOP] == 0.0) ? 0.0 : ox->xmx[(i-1)*p7X_NXCELLS+p7X_N] + pp->xmx[i*p7X_NXCELLS+p7X_N]);
      
      t1 = ( (om->xf[p7O_N][p7O_MOVE] == 0.0) ? 0.0 : ox->xmx[i*p7X_NXCELLS+p7X_N]);
      t2 = ( (om->xf[p7O_J][p7O_MOVE] == 0.0) ? 0.0 : ox->xmx[i*p7X_NXCELLS+p7X_J]);
      ox->xmx[i*p7X_NXCELLS+p7X_B] = ESL_MAX(t1, t2);
    }

  *ret_e = ox->xmx[pp->L*p7X_NXCELLS+p7X_C];
  return eslOK;
}
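The vec_and(vec_cmpgt(*tp, zerov), value) idiom used throughout the inner loop above keeps a candidate score only where the corresponding transition probability is nonzero; in scalar terms (a sketch, not from the source):

/* Scalar sketch of the mask-and-select idiom: vec_cmpgt yields an all-ones
 * mask where t > 0, and ANDing a float with all-ones/all-zeros keeps it
 * unchanged / turns it into +0.0. */
static float select_if_reachable(float t, float candidate)
{
  return (t > 0.0f) ? candidate : 0.0f;
}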
Example 20
/* Function:  p7_MSVFilter()
 * Synopsis:  Calculates MSV score, vewy vewy fast, in limited precision.
 * Incept:    SRE, Wed Dec 26 15:12:25 2007 [Janelia]
 *
 * Purpose:   Calculates an approximation of the MSV score for sequence
 *            <dsq> of length <L> residues, using optimized profile <om>,
 *            and a preallocated one-row DP matrix <ox>. Return the 
 *            estimated MSV score (in nats) in <ret_sc>.
 *            
 *            Score may overflow (and will, on high-scoring
 *            sequences), but will not underflow.
 *            
 *            The model may be in any mode, because only its match
 *            emission scores will be used. The MSV filter inherently
 *            assumes a multihit local mode, and uses its own special
 *            state transition scores, not the scores in the profile.
 *
 * Args:      dsq     - digital target sequence, 1..L
 *            L       - length of dsq in residues          
 *            om      - optimized profile
 *            ox      - DP matrix
 *            ret_sc  - RETURN: MSV score (in nats)          
 *                      
 * Note:      We misuse the matrix <ox> here, using only a third of the
 *            first dp row, accessing it as <dp[0..Q-1]> rather than
 *            in triplets via <{MDI}MX(q)> macros, since we only need
 *            to store M state values. We know that if <ox> was big
 *            enough for normal DP calculations, it must be big enough
 *            to hold the MSVFilter calculation.
 *
 * Returns:   <eslOK> on success.
 *            <eslERANGE> if the score overflows the limited range; in
 *            this case, this is a high-scoring hit.
 *
 * Throws:    <eslEINVAL> if <ox> allocation is too small.
 */
int
p7_MSVFilter(const ESL_DSQ *dsq, int L, const P7_OPROFILE *om, P7_OMX *ox, float *ret_sc)
{
  vector unsigned char mpv;        /* previous row values                                       */
  vector unsigned char xEv;	   /* E state: keeps max for Mk->E as we go                     */
  vector unsigned char xBv;	   /* B state: splatted vector of B[i-1] for B->Mk calculations */
  vector unsigned char sv;	   /* temp storage of 1 curr row value in progress              */
  vector unsigned char biasv;	   /* emission bias in a vector                                 */
  uint8_t xJ;                      /* special states' scores                                    */
  int i;			   /* counter over sequence positions 1..L                      */
  int q;			   /* counter over vectors 0..nq-1                              */
  int Q        = p7O_NQB(om->M);   /* segment length: # of vectors                              */
  vector unsigned char *dp;	   /* we're going to use dp[0][0..q..Q-1], not {MDI}MX(q) macros*/
  vector unsigned char *rsc;	   /* will point at om->rbv[x] for residue x[i]                 */

  vector unsigned char zerov;	   /* vector of zeros                                           */
  vector unsigned char xJv;        /* vector for states score                                   */
  vector unsigned char tjbmv;       /* vector for B->Mk cost                                     */
  vector unsigned char tecv;       /* vector for E->C  cost                                     */
  vector unsigned char basev;      /* offset for scores                                         */
  vector unsigned char ceilingv;   /* saturated SIMD value used to test for overflow            */
  vector unsigned char tempv;

  /* Check that the DP matrix is ok for us. */
  if (Q > ox->allocQ16)  ESL_EXCEPTION(eslEINVAL, "DP matrix allocated too small");
  ox->M   = om->M;

  /* Initialization. In offset unsigned arithmetic, -infinity is 0, and 0 is om->base.
   */
  dp  = ox->dpb[0];
  for (q = 0; q < Q; q++) dp[q] = vec_splat_u8(0);
  xJ   = 0;

  biasv = esl_vmx_set_u8(om->bias_b);
  zerov = vec_splat_u8(0);

  /* saturate simd register for overflow test */
  tempv = vec_splat_u8(1);
  ceilingv = (vector unsigned char)vec_cmpeq(biasv, biasv);
  ceilingv = vec_subs(ceilingv, biasv);
  ceilingv = vec_subs(ceilingv, tempv);

  basev = esl_vmx_set_u8((int8_t) om->base_b);

  tecv = esl_vmx_set_u8((int8_t) om->tec_b);
  tjbmv = esl_vmx_set_u8((int8_t) om->tjb_b + (int8_t) om->tbm_b);

  xJv = vec_subs(biasv, biasv);
  xBv = vec_subs(basev, tjbmv);

#if p7_DEBUGGING
  if (ox->debugging)
	{
	  unsigned char xB;
	  vec_ste(xBv, 0, &xB);
	  vec_ste(xJv, 0, &xJ);
	  p7_omx_DumpMFRow(ox, 0, 0, 0, xJ, xB, xJ);
	}
#endif

  for (i = 1; i <= L; i++)
  {
      rsc = om->rbv[dsq[i]];
      xEv = vec_splat_u8(0);
//      xBv = vec_sub(xBv, tbmv);

      /* Right shifts by 1 byte: 4,8,12,x becomes x,4,8,12.
       * vec_sld shifts a zero in from zerov, which is our -infinity
       * in this offset unsigned arithmetic.
       */
      mpv = vec_sld(zerov, dp[Q-1], 15);   
      for (q = 0; q < Q; q++)
      {
        /* Calculate new MMXo(i,q); don't store it yet, hold it in sv. */
        sv   = vec_max(mpv, xBv);
        sv   = vec_adds(sv, biasv);
        sv   = vec_subs(sv, *rsc);   rsc++;
        xEv  = vec_max(xEv, sv);

        mpv   = dp[q];   	  /* Load {MDI}(i-1,q) into mpv */
        dp[q] = sv;       	  /* Do delayed store of M(i,q) now that memory is usable */
      }

      /* Now the "special" states, which start from Mk->E (->C, ->J->B)
       * Use rotates instead of shifts so when the last max has completed,
       * all elements of the simd register will contain the max value.
       */
      tempv = vec_sld(xEv, xEv, 1);
      xEv = vec_max(xEv, tempv);
      tempv = vec_sld(xEv, xEv, 2);
      xEv = vec_max(xEv, tempv);
      tempv = vec_sld(xEv, xEv, 4);
      xEv = vec_max(xEv, tempv);
      tempv = vec_sld(xEv, xEv, 8);
      xEv = vec_max(xEv, tempv);

      /* immediately detect overflow */
      if (vec_any_gt(xEv, ceilingv))
      {
        *ret_sc = eslINFINITY;
        return eslERANGE;
      }

      xEv = vec_subs(xEv, tecv);
      xJv = vec_max(xJv,xEv);

      xBv = vec_max(basev, xJv);
      xBv = vec_subs(xBv, tjbmv);
	  
#if p7_DEBUGGING
      if (ox->debugging)
      {
        unsigned char xB, xE;
        vec_ste(xBv, 0, &xB);
        vec_ste(xEv, 0, &xE);
        vec_ste(xJv, 0, &xJ);
        p7_omx_DumpMFRow(ox, i, xE, 0, xJ, xB, xJ);
      }
#endif
  } /* end loop over sequence residues 1..L */

  /* finally C->T, and add our missing precision on the NN,CC,JJ back */
  vec_ste(xJv, 0, &xJ);
  *ret_sc = ((float) (xJ - om->tjb_b) - (float) om->base_b);
  *ret_sc /= om->scale_b;
  *ret_sc -= 3.0; /* that's ~ L \log \frac{L}{L+3}, for our NN,CC,JJ */

  return eslOK;
}
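The final score conversion at the end of p7_MSVFilter() can be read in isolation; a scalar sketch using the same field names (the helper itself is hypothetical, and includes are omitted as in the snippets above):

/* Sketch: convert the offset, scaled uint8_t MSV score back to nats, as in
 * the last lines of p7_MSVFilter(): subtract the J->B move cost and the
 * base offset, unscale, and apply the ~ -3 nat NN/CC/JJ approximation. */
static float msv_score_to_nats(uint8_t xJ, uint8_t tjb_b, uint8_t base_b, float scale_b)
{
  float sc = ((float) (xJ - tjb_b) - (float) base_b);
  return sc / scale_b - 3.0f;
}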
Example 21
/* Function:  p7_ViterbiScore()
 * Synopsis:  Calculates Viterbi score, correctly, and vewy vewy fast.
 * Incept:    SRE, Tue Nov 27 09:15:24 2007 [Janelia]
 *
 * Purpose:   Calculates the Viterbi score for sequence <dsq> of length <L> 
 *            residues, using optimized profile <om>, and a preallocated
 *            one-row DP matrix <ox>. Return the Viterbi score (in nats)
 *            in <ret_sc>.
 *            
 *            The model <om> must be configured specially to have
 *            lspace float scores, not its usual pspace float scores for
 *            <p7_ForwardFilter()>.
 *            
 *            As with all <*Score()> implementations, the score is
 *            accurate (full range and precision) and can be
 *            calculated on models in any mode, not only local modes.
 *            
 * Args:      dsq     - digital target sequence, 1..L
 *            L       - length of dsq in residues          
 *            om      - optimized profile
 *            ox      - DP matrix
 *            ret_sc  - RETURN: Viterbi score (in nats)          
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEINVAL> if <ox> allocation is too small.
 */
int
p7_ViterbiScore(const ESL_DSQ *dsq, int L, const P7_OPROFILE *om, P7_OMX *ox, float *ret_sc)
{
  vector float mpv, dpv, ipv;      /* previous row values                                       */
  vector float sv;		   /* temp storage of 1 curr row value in progress              */
  vector float dcv;		   /* delayed storage of D(i,q+1)                               */
  vector float xEv;		   /* E state: keeps max for Mk->E as we go                     */
  vector float xBv;		   /* B state: splatted vector of B[i-1] for B->Mk calculations */
  vector float Dmaxv;              /* keeps track of maximum D cell on row                      */
  vector float infv;		   /* -eslINFINITY in a vector                                  */
  float    xN, xE, xB, xC, xJ;	   /* special states' scores                                    */
  float    Dmax;		   /* maximum D cell on row                                     */
  int i;			   /* counter over sequence positions 1..L                      */
  int q;			   /* counter over vectors 0..nq-1                              */
  int Q       = p7O_NQF(om->M);	   /* segment length: # of vectors                              */
  vector float *dp  = ox->dpf[0];  /* using {MDI}MX(q) macro requires initialization of <dp>    */
  vector float *rsc;		   /* will point at om->rf[x] for residue x[i]                  */
  vector float *tsc;		   /* will point into (and step thru) om->tf                    */

  /* Check that the DP matrix is ok for us. */
  if (Q > ox->allocQ4) ESL_EXCEPTION(eslEINVAL, "DP matrix allocated too small");
  ox->M  = om->M;

  /* Initialization. */
  infv = esl_vmx_set_float(-eslINFINITY);
  for (q = 0; q < Q; q++)
    MMXo(q) = IMXo(q) = DMXo(q) = infv;
  xN   = 0.;
  xB   = om->xf[p7O_N][p7O_MOVE];
  xE   = -eslINFINITY;
  xJ   = -eslINFINITY;
  xC   = -eslINFINITY;

#if p7_DEBUGGING
  if (ox->debugging) p7_omx_DumpFloatRow(ox, FALSE, 0, 5, 2, xE, xN, xJ, xB, xC); /* logify=FALSE, <rowi>=0, width=5, precision=2*/
#endif

  for (i = 1; i <= L; i++)
    {
      rsc   = om->rf[dsq[i]];
      tsc   = om->tf;
      dcv   = infv;
      xEv   = infv;
      Dmaxv = infv;
      xBv   = esl_vmx_set_float(xB);

      mpv = vec_sld(infv, MMXo(Q-1), 12);  /* Right shifts by 4 bytes. 4,8,12,x becomes x,4,8,12. */
      dpv = vec_sld(infv, DMXo(Q-1), 12);
      ipv = vec_sld(infv, IMXo(Q-1), 12);
      for (q = 0; q < Q; q++)
	{
	  /* Calculate new MMXo(i,q); don't store it yet, hold it in sv. */
	  sv   =                vec_add(xBv, *tsc);  tsc++;
	  sv   = vec_max(sv, vec_add(mpv, *tsc));    tsc++;
	  sv   = vec_max(sv, vec_add(ipv, *tsc));    tsc++;
	  sv   = vec_max(sv, vec_add(dpv, *tsc));    tsc++;
	  sv   = vec_add(sv, *rsc);                  rsc++;
	  xEv  = vec_max(xEv, sv);

	  /* Load {MDI}(i-1,q) into mpv, dpv, ipv;
	   * {MDI}MX(q) is then the current, not the prev row
	   */
	  mpv = MMXo(q);
	  dpv = DMXo(q);
	  ipv = IMXo(q);

	  /* Do the delayed stores of {MD}(i,q) now that memory is usable */
	  MMXo(q) = sv;
	  DMXo(q) = dcv;

	  /* Calculate the next D(i,q+1) partially: M->D only;
           * delay storage, holding it in dcv
	   */
	  dcv   = vec_add(sv, *tsc); tsc++;
	  Dmaxv = vec_max(dcv, Dmaxv);

	  /* Calculate and store I(i,q) */
	  sv      =             vec_add(mpv, *tsc);  tsc++;
	  sv      = vec_max(sv, vec_add(ipv, *tsc)); tsc++;
	  IMXo(q) = vec_add(sv, *rsc);               rsc++;
	}	  

      /* Now the "special" states, which start from Mk->E (->C, ->J->B) */
      xE = esl_vmx_hmax_float(xEv);
      xN = xN +  om->xf[p7O_N][p7O_LOOP];
      xC = ESL_MAX(xC + om->xf[p7O_C][p7O_LOOP],  xE + om->xf[p7O_E][p7O_MOVE]);
      xJ = ESL_MAX(xJ + om->xf[p7O_J][p7O_LOOP],  xE + om->xf[p7O_E][p7O_LOOP]);
      xB = ESL_MAX(xJ + om->xf[p7O_J][p7O_MOVE],  xN + om->xf[p7O_N][p7O_MOVE]);
      /* and now xB will carry over into next i, and xC carries over after i=L */

      /* Finally the "lazy F" loop (sensu [Farrar07]). We can often
       * prove that we don't need to evaluate any D->D paths at all.
       *
       * The observation is that if we can show that on the next row,
       * B->M(i+1,k) paths always dominate M->D->...->D->M(i+1,k) paths
       * for all k, then we don't need any D->D calculations.
       * 
       * The test condition is:
       *      max_k D(i,k) + max_k ( TDD(k-2) + TDM(k-1) - TBM(k) ) < xB(i)
       * So:
       *   max_k (TDD(k-2) + TDM(k-1) - TBM(k)) is precalc'ed in om->dd_bound;
       *   max_k D(i,k) is why we tracked Dmaxv;
       *   xB(i) was just calculated above.
       */
      Dmax = esl_vmx_hmax_float(Dmaxv);
      if (Dmax + om->ddbound_f > xB) 
	{
	  /* Now we're obligated to do at least one complete DD path to be sure. */
	  /* dcv has carried through from end of q loop above */
	  dcv = vec_sld(infv, dcv, 12);
	  tsc = om->tf + 7*Q;	/* set tsc to start of the DD's */
	  for (q = 0; q < Q; q++) 
	    {
	      DMXo(q) = vec_max(dcv, DMXo(q));	
	      dcv     = vec_add(DMXo(q), *tsc); tsc++;
	    }

	  /* We may have to do up to three more passes; the check
	   * is for whether crossing a segment boundary can improve
	   * our score. 
	   */
	  do {
	    dcv = vec_sld(infv, dcv, 12);
	    tsc = om->tf + 7*Q;	/* set tsc to start of the DD's */
	    for (q = 0; q < Q; q++) 
	      {
		if (! vec_any_gt(dcv, DMXo(q))) break;
		DMXo(q) = vec_max(dcv, DMXo(q));	
		dcv     = vec_add(DMXo(q), *tsc);   tsc++;
	      }	    
	  } while (q == Q);
	}
      else
	{ /* not calculating DD? then just store that last MD vector we calc'ed. */
	  dcv     = vec_sld(infv, dcv, 12);
	  DMXo(0) = dcv;
	}

#if p7_DEBUGGING
      if (ox->debugging) p7_omx_DumpFloatRow(ox, FALSE, i, 5, 2, xE, xN, xJ, xB, xC); /* logify=FALSE, <rowi>=i, width=5, precision=2*/
#endif
    } /* end loop over sequence residues 1..L */

  /* finally C->T */
  *ret_sc = xC + om->xf[p7O_C][p7O_MOVE];
  return eslOK;
}
Example 22
  lsq = vec_perm(src, edges, align);     // misalign the data (lsq)
  vec_st(lsq, 15, target);               // Store the lsq part first
  vec_st(msq, 0, target);                // Store the msq part
}

/* create a rotation and translation matrix (columnwise matrix) */
#define CNV_ANGL (16/3.1415927f)
#define V_CNV_ANGL (vector float){CNV_ANGL, CNV_ANGL, CNV_ANGL, CNV_ANGL}
void make_rotation(vector float rot[4], trans &t) {
#ifdef USE_ALTIVEC
  vector float rotation = load_unaligned(&t.a) * V_CNV_ANGL;
  vector float translation = load_unaligned(&t.x);
  vector float sin = _cos_sin18_v(rotation - (vector float){8.f,8.f,8.f,8.f});
  vector float cos = _cos_sin18_v(rotation);
  vector float sin_a = sin;
  vector float sin_b = vec_sld(sin, sin, 4);
  vector float sin_c = vec_sld(sin, sin, 8);
  vector float cos_a = cos;
  vector float cos_b = vec_sld(cos, cos, 4);
  vector float cos_c = vec_sld(cos, cos, 8);
  //vector float zero = (vector float)vec_splat_s32(0);

  /* row 0 */
  vector float r00 = cos_b * cos_c;
  vector float r10 = -cos_b * sin_c;
  vector float r20 = sin_b;
  /* row 1 */
  vector float r01 = sin_a * sin_b * cos_c + cos_a * sin_c;
  vector float r11 = -sin_a * sin_b * sin_c + cos_a * cos_c;
  vector float r21 = -sin_a * cos_b;
  /* row 2 */
Example 23
/* Function:  p7_SSVFilter_longtarget()
 * Synopsis:  Finds windows with SSV scores above some threshold (vewy vewy fast, in limited precision)
 *
 * Purpose:   Calculates an approximation of the SSV (single ungapped diagonal)
 *            score for regions of sequence <dsq> of length <L> residues, using
 *            optimized profile <om>, and a preallocated one-row DP matrix <ox>,
 *            and captures the positions at which such regions exceed the score
 *            required to be significant in the eyes of the calling function,
 *            which depends on the <bg> and <P> (usually P=0.02 for nhmmer).
 *            Note that this variant performs only SSV computations, never
 *            passing through the J state - the score required to pass SSV at
 *            the default threshold (or less restrictive) is sufficient to
 *            pass MSV in essentially all DNA models we've tested.
 *
 *            Above-threshold diagonals are captured into a preallocated list
 *            <windowlist>. Rather than simply capturing positions at which a
 *            score threshold is reached, this function establishes windows
 *            around those high-scoring positions, using scores in <msvdata>.
 *            These windows can be merged by the calling function.
 *
 *
 * Args:      dsq     - digital target sequence, 1..L
 *            L       - length of dsq in residues
 *            om      - optimized profile
 *            ox      - DP matrix
 *            ssvdata    - compact representation of substitution scores, for backtracking diagonals
 *            bg         - the background model, required for translating a P-value threshold into a score threshold
 *            P          - p-value below which a region is captured as being above threshold
 *            windowlist - preallocated container for all hits (resized if necessary)
 *
 *
 * Note:      We misuse the matrix <ox> here, using only a third of the
 *            first dp row, accessing it as <dp[0..Q-1]> rather than
 *            in triplets via <{MDI}MX(q)> macros, since we only need
 *            to store M state values. We know that if <ox> was big
 *            enough for normal DP calculations, it must be big enough
 *            to hold the MSVFilter calculation.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEINVAL> if <ox> allocation is too small.
 */
int
p7_SSVFilter_longtarget(const ESL_DSQ *dsq, int L, P7_OPROFILE *om, P7_OMX *ox, const P7_SCOREDATA *ssvdata,
                        P7_BG *bg, double P, P7_HMM_WINDOWLIST *windowlist)
{

  vector unsigned char mpv;        /* previous row values                                       */
  vector unsigned char xEv;		   /* E state: keeps max for Mk->E as we go                     */
  vector unsigned char xBv;		   /* B state: splatted vector of B[i-1] for B->Mk calculations */
  vector unsigned char sv;		   /* temp storage of 1 curr row value in progress              */
  vector unsigned char biasv;	   /* emission bias in a vector                                 */
  uint8_t  xJ;                 /* special states' scores                                    */
  int i;			           /* counter over sequence positions 1..L                      */
  int q;			           /* counter over vectors 0..nq-1                              */
  int Q        = p7O_NQB(om->M);   /* segment length: # of vectors                              */
  vector unsigned char *dp  = ox->dpb[0];	   /* we're going to use dp[0][0..q..Q-1], not {MDI}MX(q) macros*/
  vector unsigned char *rsc;			        /* will point at om->rbv[x] for residue x[i]                 */

  vector unsigned char zerov;	   /* vector of zeros                                           */
  vector unsigned char tecv;                    /* vector for E->C  cost                                     */
  vector unsigned char tjbmv;                    /* vector for [JN]->B->M move cost                                  */
  vector unsigned char basev;                   /* offset for scores                                         */

  int status;

  int k;
  int n;
  int end;
  int rem_sc;
  int start;
  int target_end;
  int target_start;
  int max_end;
  int max_sc;
  int sc;
  int pos_since_max;
  float ret_sc;

  union { vector unsigned char v; uint8_t b[16]; } u;


  /*
   * Computing the score required to let P meet the F1 prob threshold
   * In original code, converting from a scaled int MSV
   * score S (the score getting to state E) to a probability goes like this:
   *  usc =  S - om->tec_b - om->tjb_b - om->base_b;
   *  usc /= om->scale_b;
   *  usc -= 3.0;
   *  P = f ( (usc - nullsc) / eslCONST_LOG2 , mu, lambda)
   * and we're computing the threshold usc, so reverse it:
   *  (usc - nullsc) /  eslCONST_LOG2 = inv_f( P, mu, lambda)
   *  usc = nullsc + eslCONST_LOG2 * inv_f( P, mu, lambda)
   *  usc += 3
   *  usc *= om->scale_b
   *  S = usc + om->tec_b + om->tjb_b + om->base_b
   *
   *  Here, I compute threshold with length model based on max_length.  Doesn't
   *  matter much - in any case, both the bg and om models will change with roughly
   *  1 bit for each doubling of the length model, so they offset.
   */
  float nullsc;
  float invP = esl_gumbel_invsurv(P, om->evparam[p7_MMU],  om->evparam[p7_MLAMBDA]);
  vector unsigned char sc_threshv;               /* pushes value to saturation if it's above pthresh  */
  int sc_thresh;

  /* Check that the DP matrix is ok for us. */
  if (Q > ox->allocQ16)  ESL_EXCEPTION(eslEINVAL, "DP matrix allocated too small");
  ox->M   = om->M;


  p7_bg_SetLength(bg, om->max_length);
  p7_oprofile_ReconfigMSVLength(om, om->max_length);
  p7_bg_NullOne  (bg, dsq, om->max_length, &nullsc);

  sc_thresh = (int) ceil( ( ( nullsc  + (invP * eslCONST_LOG2) + 3.0 )  * om->scale_b ) + om->base_b +  om->tec_b  + om->tjb_b  );
  sc_threshv = esl_vmx_set_u8( (int8_t)sc_thresh - 1);


  /* Initialization. In offset unsigned arithmetic, -infinity is 0, and 0 is om->base.
   */
  biasv = esl_vmx_set_u8(om->bias_b);
  for (q = 0; q < Q; q++) dp[q] = vec_splat_u8(0);
  xJ   = 0;
  zerov = vec_splat_u8(0);


  basev = esl_vmx_set_u8((int8_t) om->base_b);
  tecv = esl_vmx_set_u8((int8_t) om->tec_b);
  tjbmv = esl_vmx_set_u8((int8_t) om->tjb_b + (int8_t) om->tbm_b);

  xBv = vec_subs(basev, tjbmv);

  for (i = 1; i <= L; i++) {
	  rsc = om->rbv[dsq[i]];
    xEv = vec_splat_u8(0);

	  /* Right shifts by 1 byte: 4,8,12,x becomes x,4,8,12.
	   * vec_sld shifts a zero in from zerov, which is our -infinity
	   * in this offset unsigned arithmetic.
	   */
    mpv = vec_sld(zerov, dp[Q-1], 15);
	  for (q = 0; q < Q; q++)
	  {
		  /* Calculate new MMXo(i,q); don't store it yet, hold it in sv. */
		  sv   = vec_max(mpv, xBv);
		  sv   = vec_adds(sv, biasv);
		  sv   = vec_subs(sv, *rsc);   rsc++;
		  xEv  = vec_max(xEv, sv);

		  mpv   = dp[q];   	  /* Load {MDI}(i-1,q) into mpv */
		  dp[q] = sv;       	  /* Do delayed store of M(i,q) now that memory is usable */
	  }


	  if (vec_any_gt(xEv, sc_threshv) ) { //hit pthresh, so add position to list and reset values
      //figure out which model state hit threshold
      end = -1;
      rem_sc = -1;
      for (q = 0; q < Q; q++) {  /// Unpack and unstripe, so we can find the state that exceeded pthresh
          u.v = dp[q];
          for (k = 0; k < 16; k++) { // unstripe
            //(q+Q*k+1) is the model position k at which the xE score is found
            if (u.b[k] >= sc_thresh && u.b[k] > rem_sc && (q+Q*k+1) <= om->M) {
              end = (q+Q*k+1);
              rem_sc = u.b[k];
            }
          }
          dp[q] = vec_splat_u8(0); // while we're here ... this will cause values to get reset to xB in next dp iteration
      }

      //recover the diagonal that hit threshold
      start = end;
      target_end = target_start = i;
      sc = rem_sc;
      while (rem_sc > om->base_b - om->tjb_b - om->tbm_b) {
        rem_sc -= om->bias_b -  ssvdata->ssv_scores[start*om->abc->Kp + dsq[target_start]];
        --start;
        --target_start;
        //if ( start == 0 || target_start==0)    break;
      }
      start++;
      target_start++;



      //extend diagonal further with single diagonal extension
      k = end+1;
      n = target_end+1;
      max_end = target_end;
      max_sc = sc;
      pos_since_max = 0;
      while (k<om->M && n<=L) {
        sc += om->bias_b -  ssvdata->ssv_scores[k*om->abc->Kp + dsq[n]];
        if (sc >= max_sc) {
          max_sc = sc;
          max_end = n;
          pos_since_max=0;
        } else {
          pos_since_max++;
          if (pos_since_max == 5)
            break;
        }
        k++;
        n++;
      }

      end  +=  (max_end - target_end);
      target_end = max_end;

      ret_sc = ((float) (max_sc - om->tjb_b) - (float) om->base_b);
      ret_sc /= om->scale_b;
      ret_sc -= 3.0; // that's ~ L \log \frac{L}{L+3}, for our NN,CC,JJ

      p7_hmmwindow_new(  windowlist,
                         0,                  // sequence_id; used in the FM-based filter, but not here
                         target_start,       // position in the target at which the diagonal starts
                         0,                  // position in the target fm_index at which diagonal starts;  not used here, just in FM-based filter
                         end,                // position in the model at which the diagonal ends
                         end-start+1 ,       // length of diagonal
                         ret_sc,             // score of diagonal
                         p7_NOCOMPLEMENT,    // always p7_NOCOMPLEMENT here;  varies in FM-based filter
                         L
                       );



      i = target_end; // skip forward


	  }

  } /* end loop over sequence residues 1..L */

  return eslOK;


  ERROR:
  ESL_EXCEPTION(eslEMEM, "Error allocating memory for hit list\n");

}
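The threshold inversion spelled out in the long comment above reduces to one line of arithmetic; a self-contained sketch mirroring the sc_thresh computation (the function name is hypothetical, and eslCONST_LOG2 is written out numerically):

#include <math.h>

/* Sketch: minimum scaled uint8 score S that can reach P-value P, given the
 * inverse-survival value invP = inv_f(P, mu, lambda), the null model score,
 * and the model's scale/offset constants (see the derivation above). */
static int ssv_score_threshold(double invP, float nullsc, float scale_b,
                               int base_b, int tec_b, int tjb_b)
{
  const double log2c = 0.69314718055994529;   /* eslCONST_LOG2 */
  return (int) ceil(((nullsc + (invP * log2c) + 3.0) * scale_b)
                    + base_b + tec_b + tjb_b);
}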
Example 24
void fluid_genPressure_black(fluid *in_f, int y, pvt_fluidMode *mode)
{
	struct pressure *p = &mode->pressure;
	
	int w = fieldWidth(p->velX);
	int h = fieldHeight(p->velX);

#ifdef __APPLE_ALTIVEC__
#elif defined __SSE3__
#else
	int sx = fieldStrideX(p->velX);
#endif
	int sy = fieldStrideY(p->velY);
	
	float *velX = fieldData(p->velX);
	float *velY = fieldData(p->velY);
	
	float *pressure = fieldData(p->pressure);
	
	if (y == 0)
	{
#ifdef X_SIMD
		x128f *vPressure = (x128f*)fluidFloatPointer(pressure, 0*sy);
		x128f *vPressureP = (x128f*)fluidFloatPointer(pressure, 1*sy);
		
		int x;
		w/=4;
		for (x=0; x<w; x++)
		{
			vPressure[x] = vPressureP[x];
		}
#else
		int x;
		for (x=0; x<w; x++)
		{
			fluidFloatPointer(pressure,x*sx)[0] = fluidFloatPointer(pressure,x*sx + sy)[0];
		}
#endif
	}
	else if (y == h-1)
	{
#ifdef X_SIMD
		x128f *vPressure = (x128f*)fluidFloatPointer(pressure, y*sy);
		x128f *vPressureP = (x128f*)fluidFloatPointer(pressure, (y-1)*sy);
		
		int x;
		w/=4;
		for (x=0; x<w; x++)
		{
			vPressure[x] = vPressureP[x];
		}
#else
		int x;
		for (x=0; x<w; x++)
		{
			fluidFloatPointer(pressure,x*sx + y*sy)[0] =
					fluidFloatPointer(pressure,x*sx + (y-1)*sy)[0];
		}
#endif
	}
	else
	{
#ifdef X_SIMD
		float *vPressureRow = fluidFloatPointer(pressure, y*sy);
		
		x128f *vPressure = (x128f*)vPressureRow;
		x128f *vVelX = (x128f*)fluidFloatPointer(velX, y*sy);
		
		x128f *vPressureN = (x128f*)fluidFloatPointer(pressure, (y+1)*sy);
		x128f *vVelYN = (x128f*)fluidFloatPointer(velY, (y+1)*sy);
		
		x128f *vPressureP = (x128f*)fluidFloatPointer(pressure, (y-1)*sy);
		x128f *vVelYP = (x128f*)fluidFloatPointer(velY, (y-1)*sy);
		
		x128f div4 = {0.0f, 1.0f/4.0f, 0.0f, 1.0f/4.0f};
		x128f mask = {1.0f, 0.0f, 1.0f, 0.0f};
#endif
	
#ifdef __APPLE_ALTIVEC__
		//int myTempVariable = __mfspr( 1023 );
		
		vector float vZero = {0,0,0,0};
		
		vec_dstst(vPressure, 0x01000001, 0);
		vec_dst(vVelX, 0x01000001, 1);
		vec_dst(vVelYN, 0x01000001, 2);
		vec_dst(vVelYP, 0x01000001, 3);
		
		int x;
		{
			vector float tmp;
			
			//Compute shifts
			vector float sl_p = vec_sld(vPressure[0], vPressure[1],4);
			vector float sr_p = vec_sld(vZero, vPressure[0], 12);
			
			vector float sl_vx = vec_sld(vVelX[0], vVelX[1],4);
			vector float sr_vx = vec_sld(vZero, vVelX[0], 12);
			
			//Sum everything!!!
			tmp = vec_add(sl_p, sr_p);
			tmp = vec_add(tmp, vPressureN[0]);
			tmp = vec_add(tmp, vPressureP[0]);
			tmp = vec_sub(tmp, sl_vx);
			tmp = vec_add(tmp, sr_vx);
			tmp = vec_sub(tmp, vVelYN[0]);
			tmp = vec_add(tmp, vVelYP[0]);
			
			vPressure[0] = vec_madd(tmp, div4, vZero);
			vPressureRow[0] = vPressureRow[1];
		}
		x=1;
		
		while (x<w/4-5)
		{
			PRESSURE_VEC_PRE(0)
			PRESSURE_VEC_PRE(1)
			PRESSURE_VEC_PRE(2)
			PRESSURE_VEC_PRE(3)
			
			PRESSURE_VEC_SHIFT(0)
			PRESSURE_VEC_SHIFT(1)
			PRESSURE_VEC_SHIFT(2)
			PRESSURE_VEC_SHIFT(3)
			
			PRESSURE_VEC_END(0)
			PRESSURE_VEC_END(1)
			PRESSURE_VEC_END(2)
			PRESSURE_VEC_END(3)
			
			x+=4;
		}

		while (x<w/4-1)
		{			
			PRESSURE_VEC_PRE(0)
			PRESSURE_VEC_SHIFT(0)
			PRESSURE_VEC_END(0)
			x++;
		}
		{
			vector float tmp;
			
			//Compute shifts
			vector float sl_p = vec_sld(vPressure[x], vZero,4);
			vector float sr_p = vec_sld(vPressure[x-1], vPressure[x], 12);
			
			vector float sl_vx = vec_sld(vVelX[x], vZero,4);
			vector float sr_vx = vec_sld(vVelX[x-1], vVelX[x], 12);
			
			//Sum everything!!!
			tmp = vec_add(sl_p, sr_p);
			tmp = vec_add(tmp, vPressureN[x]);
			tmp = vec_add(tmp, vPressureP[x]);
			tmp = vec_sub(tmp, sl_vx);
			tmp = vec_add(tmp, sr_vx);
			tmp = vec_sub(tmp, vVelYN[x]);
			tmp = vec_add(tmp, vVelYP[x]);
			
			vPressure[x] = vec_madd(tmp, div4, vZero);
			
			vPressureRow[w-1] = vPressureRow[w-2];
		}
		
#elif defined __SSE3__
		
		int x;
		{
			__m128 tmp;
			
			//Compute shifts
			__m128 sl_p = _mm_srli_sf128(vPressure[0],4);
			sl_p = _mm_add_ps(sl_p,_mm_slli_sf128(vPressure[1],12));
			
			__m128 sr_p = _mm_slli_sf128(vPressure[0],4);
			
			__m128 sl_vx = _mm_srli_sf128(vVelX[0],4);
			sl_vx = _mm_add_ps(sl_vx,_mm_slli_sf128(vVelX[1],12));
			
			__m128 sr_vx = _mm_slli_sf128(vVelX[0],4);
			
			//Sum everything!!!
			tmp = _mm_add_ps(sl_p, sr_p);
			tmp = _mm_add_ps(tmp, vPressureN[0]);
			tmp = _mm_add_ps(tmp, vPressureP[0]);
			tmp = _mm_sub_ps(tmp, sl_vx);
			tmp = _mm_add_ps(tmp, sr_vx);
			tmp = _mm_sub_ps(tmp, vVelYN[0]);
			tmp = _mm_add_ps(tmp, vVelYP[0]);
			
			vPressure[0] = _mm_mul_ps(tmp, div4);
			vPressureRow[0] = vPressureRow[1];
		}
		x=1;
		while (x<w/4-9)
		{
			//Compute shifts (1)
			PRESSURE_SSE_PRE(0);
			PRESSURE_SSE_PRE(1);
			PRESSURE_SSE_PRE(2);
			
			//Sum everything!!! (1)
			PRESSURE_SSE_POST(0);
			PRESSURE_SSE_POST(1);
			PRESSURE_SSE_POST(2);
			
			x+=3;
		}
		while (x<w/4-1)
		{
			//Compute shifts
			PRESSURE_SSE_PRE(0);
			
			//Sum everything!!!
			PRESSURE_SSE_POST(0);
			
			x++;
		}
		{
			__m128 tmp;
			
			//Compute shifts
			__m128 sl_p = _mm_srli_sf128(vPressure[x],4);
			
			__m128 sr_p = _mm_slli_sf128(vPressure[x],4);
			sr_p = _mm_add_ps(sr_p,_mm_srli_sf128(vPressure[x-1],12));
			
			__m128 sl_vx = _mm_srli_sf128(vVelX[x],4);
			
			__m128 sr_vx = _mm_slli_sf128(vVelX[x],4);
			sr_vx = _mm_add_ps(sr_vx,_mm_srli_sf128(vVelX[x-1],12));
			
			//Sum everything!!!
			tmp = _mm_add_ps(sl_p, sr_p);
			tmp = _mm_add_ps(tmp, vPressureN[x]);
			tmp = _mm_add_ps(tmp, vPressureP[x]);
			tmp = _mm_sub_ps(tmp, sl_vx);
			tmp = _mm_add_ps(tmp, sr_vx);
			tmp = _mm_sub_ps(tmp, vVelYN[x]);
			tmp = _mm_add_ps(tmp, vVelYP[x]);
			
			vPressure[x] = _mm_mul_ps(tmp, div4);
			
			vPressureRow[w-1] = vPressureRow[w-2];
		}
		
#else
		float lastPressureX = fluidFloatPointer(pressure,sx + y*sy)[0];
		float lastVelX = fluidFloatPointer(velX, y*sy)[0];
		
		float curPressureX = lastPressureX;
		float curVelX = fluidFloatPointer(velX, sx + y*sy)[0];
		
		fluidFloatPointer(pressure,y*sy)[0] = lastPressureX;
		
		int x;
		int curxy = sx + y*sy;
		for (x=1; x<w-1; x++)
		{
			float nextPressureX = fluidFloatPointer(pressure,curxy + sx)[0];
			float nextVelX = fluidFloatPointer(velX,curxy + sx)[0];
			
			fluidFloatPointer(pressure,curxy)[0] =
				(	  lastPressureX
				 	+ nextPressureX
				 	+ fluidFloatPointer(pressure,curxy - sy)[0]
					+ fluidFloatPointer(pressure,curxy + sy)[0]
				 - 		(  nextVelX
						 - lastVelX
						 + fluidFloatPointer(velY,curxy + sy)[0]
						 - fluidFloatPointer(velY,curxy - sy)[0])) / 4.0f;
			
			lastPressureX = curPressureX;
			curPressureX = nextPressureX;
			
			lastVelX = curVelX;
			curVelX = nextVelX;
			
			curxy += sx;
		}
		
		fluidFloatPointer(pressure,(w-1)*sx + y*sy)[0]
			= fluidFloatPointer(pressure,(w-2)*sx + y*sy)[0];
#endif
	}
}
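In the AltiVec branch above, each cell's x-1 and x+1 neighbors are gathered with vec_sld over adjacent 4-float row vectors, with a zero vector substituted at the row ends. A minimal sketch of that shift pattern, assuming big-endian element order as in the code above (neighbors is an illustrative helper, not part of this codebase):

#ifdef __ALTIVEC__
#include <altivec.h>

/* Given three consecutive row vectors prev, cur, next, build the vectors
 * whose lane k holds the row value at x-1 and at x+1 respectively. */
static inline void
neighbors(vector float prev, vector float cur, vector float next,
          vector float *xm1, vector float *xp1)
{
	*xm1 = vec_sld(prev, cur, 12);  /* { prev[3], cur[0], cur[1], cur[2] } */
	*xp1 = vec_sld(cur, next, 4);   /* { cur[1],  cur[2], cur[3], next[0] } */
}
#endif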
Esempio n. 25
0
static int 
backward_engine(int do_full, const ESL_DSQ *dsq, int L, const P7_OPROFILE *om, const P7_OMX *fwd, P7_OMX *bck, float *opt_sc)
{
  vector float mpv, ipv, dpv;         /* previous row values                                       */
  vector float mcv, dcv;              /* current row values                                        */
  vector float tmmv, timv, tdmv;      /* tmp vars for accessing rotated transition scores          */
  vector float xBv;		      /* collects B->Mk components of B(i)                         */
  vector float xEv;	              /* splatted E(i)                                             */
  vector float zerov;		      /* splatted 0.0's in a vector                                */
  float    xN, xE, xB, xC, xJ;	      /* special states' scores                                    */
  int      i;			      /* counter over sequence positions 0,1..L                    */
  int      q;			      /* counter over quads 0..Q-1                                 */
  int      Q       = p7O_NQF(om->M);  /* segment length: # of vectors                              */
  int      j;			      /* DD segment iteration counter (4 = full serialization)     */
  vector float  *dpc;                 /* current DP row                                            */
  vector float  *dpp;	              /* next ("previous") DP row                                  */
  vector float  *rp;		      /* will point into om->rfv[x] for residue x[i+1]             */
  vector float  *tp;		      /* will point into (and step thru) om->tfv transition scores */

  /* initialize the L row. */
  bck->M = om->M;
  bck->L = L;
  bck->has_own_scales = FALSE;	/* backwards scale factors are *usually* given by <fwd> */
  dpc    = bck->dpf[L * do_full];
  xJ     = 0.0;
  xB     = 0.0;
  xN     = 0.0;
  xC     = om->xf[p7O_C][p7O_MOVE];      /* C<-T */
  xE     = xC * om->xf[p7O_E][p7O_MOVE]; /* E<-C, no tail */
  xEv    = esl_vmx_set_float(xE); 
  zerov  = (vector float) vec_splat_u32(0);
  dcv    = (vector float) vec_splat_u32(0);		/* solely to silence a compiler warning */
  for (q = 0; q < Q; q++) MMO(dpc,q) = DMO(dpc,q) = xEv;
  for (q = 0; q < Q; q++) IMO(dpc,q) = zerov;

  /* init row L's DD paths, 1) first segment includes xE, from DMO(q) */
  tp  = om->tfv + 8*Q - 1;	                        /* <*tp> now the [4 8 12 x] TDD quad         */
  dpv = vec_sld(DMO(dpc,Q-1), zerov, 4);
  for (q = Q-1; q >= 1; q--)
    {
      DMO(dpc,q) = vec_madd(dpv, *tp, DMO(dpc,q));      tp--;
      dpv        = DMO(dpc,q);
    }
  dcv        = vec_madd(dpv, *tp, zerov);
  DMO(dpc,q) = vec_add(DMO(dpc,q), dcv);

  /* 2) three more passes, only extending DD component (dcv only; no xE contrib from DMO(q)) */
  for (j = 1; j < 4; j++)
    {
      tp  = om->tfv + 8*Q - 1;	                        /* <*tp> now the [4 8 12 x] TDD quad         */
      dcv = vec_sld(dcv, zerov, 4);
      for (q = Q-1; q >= 0; q--)
	{
	  dcv        = vec_madd(dcv, *tp, zerov); tp--;
	  DMO(dpc,q) = vec_add(DMO(dpc,q), dcv);
	}
    }

  /* now MD init */
  tp  = om->tfv + 7*Q - 3;	                        /* <*tp> now the [4 8 12 x] Mk->Dk+1 quad    */
  dcv = vec_sld(DMO(dpc,0), zerov, 4);
  for (q = Q-1; q >= 0; q--)
    {
      MMO(dpc,q) = vec_madd(dcv, *tp, MMO(dpc,q)); tp -= 7;
      dcv        = DMO(dpc,q);
    }

  /* Sparse rescaling: same scale factors as fwd matrix */
  if (fwd->xmx[L*p7X_NXCELLS+p7X_SCALE] > 1.0)
    {
      xE  = xE / fwd->xmx[L*p7X_NXCELLS+p7X_SCALE];
      xN  = xN / fwd->xmx[L*p7X_NXCELLS+p7X_SCALE];
      xC  = xC / fwd->xmx[L*p7X_NXCELLS+p7X_SCALE];
      xJ  = xJ / fwd->xmx[L*p7X_NXCELLS+p7X_SCALE];
      xB  = xB / fwd->xmx[L*p7X_NXCELLS+p7X_SCALE];
      xEv = esl_vmx_set_float(1.0 / fwd->xmx[L*p7X_NXCELLS+p7X_SCALE]);
      for (q = 0; q < Q; q++) {
	MMO(dpc,q) = vec_madd(MMO(dpc,q), xEv, zerov);
	DMO(dpc,q) = vec_madd(DMO(dpc,q), xEv, zerov);
	IMO(dpc,q) = vec_madd(IMO(dpc,q), xEv, zerov);
      }
    }
  bck->xmx[L*p7X_NXCELLS+p7X_SCALE] = fwd->xmx[L*p7X_NXCELLS+p7X_SCALE];
  bck->totscale                     = log(bck->xmx[L*p7X_NXCELLS+p7X_SCALE]);

  /* Stores */
  bck->xmx[L*p7X_NXCELLS+p7X_E] = xE;
  bck->xmx[L*p7X_NXCELLS+p7X_N] = xN;
  bck->xmx[L*p7X_NXCELLS+p7X_J] = xJ;
  bck->xmx[L*p7X_NXCELLS+p7X_B] = xB;
  bck->xmx[L*p7X_NXCELLS+p7X_C] = xC;

#if p7_DEBUGGING
  if (bck->debugging) p7_omx_DumpFBRow(bck, TRUE, L, 9, 4, xE, xN, xJ, xB, xC);	/* logify=TRUE, <rowi>=L, width=9, precision=4*/
#endif

  /* main recursion */
  for (i = L-1; i >= 1; i--)	/* backwards stride */
    {
      /* phase 1. B(i) collected. Old row destroyed, new row contains
       *    complete I(i,k), partial {MD}(i,k) w/ no {MD}->{DE} paths yet.
       */
      dpc = bck->dpf[i     * do_full];
      dpp = bck->dpf[(i+1) * do_full];
      rp  = om->rfv[dsq[i+1]] + Q-1; /* <*rp> is now the [4 8 12 x] match emission quad */
      tp  = om->tfv + 7*Q - 1;	    /* <*tp> is now the [4 8 12 x] TII transition quad  */

      /* leftshift the first transition quads */
      tmmv = vec_sld(om->tfv[1], zerov, 4);
      timv = vec_sld(om->tfv[2], zerov, 4);
      tdmv = vec_sld(om->tfv[3], zerov, 4);

      mpv = vec_madd(MMO(dpp,0), om->rfv[dsq[i+1]][0], zerov); /* precalc M(i+1,k+1)*e(M_k+1,x_{i+1}) */
      mpv = vec_sld(mpv, zerov, 4);

      xBv = zerov;
      for (q = Q-1; q >= 0; q--)     /* backwards stride */
	{
	  vector float t1;

	  ipv = IMO(dpp,q); /* assumes emission odds ratio of 1.0; i+1's IMO(q) now free */
	  t1         = vec_madd(mpv, timv, zerov);
	  IMO(dpc,q) = vec_madd(ipv, *tp,  t1);            tp--;
	  DMO(dpc,q) = vec_madd(mpv, tdmv, zerov);
	  t1         = vec_madd(mpv, tmmv, zerov);
	  mcv        = vec_madd(ipv, *tp,  t1);            tp -= 2;
	  
	  /* obtain mpv for next q. i+1's MMO(q) is freed  */
	  mpv        = vec_madd(MMO(dpp,q), *rp, zerov);   rp--;
	  MMO(dpc,q) = mcv;

	  tdmv = *tp;   tp--;
	  timv = *tp;   tp--;
	  tmmv = *tp;   tp--;

	  xBv = vec_madd(mpv, *tp, xBv); tp--;
	}

      /* phase 2: now that we have accumulated the B->Mk transitions in xBv, we can do the specials */
      xB = esl_vmx_hsum_float(xBv);

      xC =  xC * om->xf[p7O_C][p7O_LOOP];
      xJ = (xB * om->xf[p7O_J][p7O_MOVE]) + (xJ * om->xf[p7O_J][p7O_LOOP]); /* must come after xB */
      xN = (xB * om->xf[p7O_N][p7O_MOVE]) + (xN * om->xf[p7O_N][p7O_LOOP]); /* must come after xB */
      xE = (xC * om->xf[p7O_E][p7O_MOVE]) + (xJ * om->xf[p7O_E][p7O_LOOP]); /* must come after xJ, xC */
      xEv = esl_vmx_set_float(xE);	/* splat */


      /* phase 3: {MD}->E paths and one step of the D->D paths */
      tp  = om->tfv + 8*Q - 1;	/* <*tp> now the [4 8 12 x] TDD quad */
      dpv = vec_add(DMO(dpc,0), xEv);
      dpv = vec_sld(dpv, zerov, 4);
      for (q = Q-1; q >= 1; q--)
	{
	  dcv        = vec_madd(dpv, *tp, xEv);    tp--;
	  DMO(dpc,q) = vec_add(DMO(dpc,q), dcv);
	  dpv        = DMO(dpc,q);
	  MMO(dpc,q) = vec_add(MMO(dpc,q), xEv);
	}
      dcv        = vec_madd(dpv, *tp, zerov);
      DMO(dpc,q) = vec_add(DMO(dpc,q), vec_add(dcv, xEv));
      MMO(dpc,q) = vec_add(MMO(dpc,q), xEv);
      
      /* phase 4: finish extending the DD paths */
      /* fully serialized for now */
      for (j = 1; j < 4; j++)	/* three passes: we've already done 1 segment, we need 4 total */
	{
	  dcv = vec_sld(dcv, zerov, 4);
	  tp  = om->tfv + 8*Q - 1;	/* <*tp> now the [4 8 12 x] TDD quad */
	  for (q = Q-1; q >= 0; q--)
	    {
	      dcv        = vec_madd(dcv, *tp, zerov); tp--;
	      DMO(dpc,q) = vec_add(DMO(dpc,q), dcv);
	    }
	}

      /* phase 5: add M->D paths */
      dcv = vec_sld(DMO(dpc,0), zerov, 4);
      tp  = om->tfv + 7*Q - 3;	/* <*tp> is now the [4 8 12 x] Mk->Dk+1 quad */
      for (q = Q-1; q >= 0; q--)
	{
	  MMO(dpc,q) = vec_madd(dcv, *tp, MMO(dpc,q)); tp -= 7;
	  dcv        = DMO(dpc,q);
	}

      /* Sparse rescaling  */

      /* In rare cases [J3/119] scale factors from <fwd> are
       * insufficient and backwards will overflow. In this case, we
       * switch on the fly to using our own scale factors, different
       * from those in <fwd>. This will complicate subsequent
       * posterior decoding routines.
       */
      if (xB > 1.0e16) bck->has_own_scales = TRUE;

      if      (bck->has_own_scales)  bck->xmx[i*p7X_NXCELLS+p7X_SCALE] = (xB > 1.0e4) ? xB : 1.0;
      else                           bck->xmx[i*p7X_NXCELLS+p7X_SCALE] = fwd->xmx[i*p7X_NXCELLS+p7X_SCALE];

      if (bck->xmx[i*p7X_NXCELLS+p7X_SCALE] > 1.0)
	{
	  xE /= bck->xmx[i*p7X_NXCELLS+p7X_SCALE];
	  xN /= bck->xmx[i*p7X_NXCELLS+p7X_SCALE];
	  xJ /= bck->xmx[i*p7X_NXCELLS+p7X_SCALE];
	  xB /= bck->xmx[i*p7X_NXCELLS+p7X_SCALE];
	  xC /= bck->xmx[i*p7X_NXCELLS+p7X_SCALE];
	  xBv = esl_vmx_set_float(1.0 / bck->xmx[i*p7X_NXCELLS+p7X_SCALE]);
	  for (q = 0; q < Q; q++) {
	    MMO(dpc,q) = vec_madd(MMO(dpc,q), xBv, zerov);
	    DMO(dpc,q) = vec_madd(DMO(dpc,q), xBv, zerov);
	    IMO(dpc,q) = vec_madd(IMO(dpc,q), xBv, zerov);
	  }
	  bck->totscale += log(bck->xmx[i*p7X_NXCELLS+p7X_SCALE]);
	}

      /* Stores are separate only for pedagogical reasons: easy to
       * turn this into a more memory efficient version just by
       * deleting the stores.
       */
      bck->xmx[i*p7X_NXCELLS+p7X_E] = xE;
      bck->xmx[i*p7X_NXCELLS+p7X_N] = xN;
      bck->xmx[i*p7X_NXCELLS+p7X_J] = xJ;
      bck->xmx[i*p7X_NXCELLS+p7X_B] = xB;
      bck->xmx[i*p7X_NXCELLS+p7X_C] = xC;

#if p7_DEBUGGING
      if (bck->debugging) p7_omx_DumpFBRow(bck, TRUE, i, 9, 4, xE, xN, xJ, xB, xC);	/* logify=TRUE, <rowi>=i, width=9, precision=4*/
#endif
    } /* thus ends the loop over sequence positions i */

  /* Termination at i=0, where we can only reach N,B states. */
  dpp = bck->dpf[1 * do_full];
  tp  = om->tfv;	        /* <*tp> is now the [1 5 9 13] TBMk transition quad  */
  rp  = om->rfv[dsq[1]];	/* <*rp> is now the [1 5 9 13] match emission quad   */
  xBv = (vector float) vec_splat_u32(0);
  for (q = 0; q < Q; q++)
    {
      mpv = vec_madd(MMO(dpp,q), *rp, zerov);  rp++;
      xBv = vec_madd(mpv,        *tp, xBv);    tp += 7;
    }
  /* horizontal sum of xBv */
  xB = esl_vmx_hsum_float(xBv);
 
  xN = (xB * om->xf[p7O_N][p7O_MOVE]) + (xN * om->xf[p7O_N][p7O_LOOP]);  

  bck->xmx[p7X_B]     = xB;
  bck->xmx[p7X_C]     = 0.0;
  bck->xmx[p7X_J]     = 0.0;
  bck->xmx[p7X_N]     = xN;
  bck->xmx[p7X_E]     = 0.0;
  bck->xmx[p7X_SCALE] = 1.0;

#if p7_DEBUGGING
  dpc = bck->dpf[0];
  for (q = 0; q < Q; q++) /* Not strictly necessary, but if someone's looking at DP matrices, this is nice to do: */
    MMO(dpc,q) = DMO(dpc,q) = IMO(dpc,q) = zerov;
  if (bck->debugging) p7_omx_DumpFBRow(bck, TRUE, 0, 9, 4, bck->xmx[p7X_E], bck->xmx[p7X_N],  bck->xmx[p7X_J], bck->xmx[p7X_B],  bck->xmx[p7X_C]);	/* logify=TRUE, <rowi>=0, width=9, precision=4*/
#endif

  if       (isnan(xN))         ESL_EXCEPTION(eslERANGE, "backward score is NaN");
  else if  (L>0 && xN == 0.0)  ESL_EXCEPTION(eslERANGE, "backward score underflow (is 0.0)");    /* [J5/118] */
  else if  (isinf(xN) == 1)    ESL_EXCEPTION(eslERANGE, "backward score overflow (is infinity)");

  if (opt_sc != NULL) *opt_sc = bck->totscale + log(xN);
  return eslOK;
}
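The rescaling block near the end of the main loop normally reuses the forward matrix's per-row scale factor, but switches to self-chosen scales once a backward value grows past ~1e16. A scalar sketch of that policy (row_max, fwd_scale and has_own_scales stand in for the matrix fields; this is not the P7_OMX API):

/* Hedged sketch: choose the scale factor for one backward row. */
static float
choose_row_scale(float row_max, float fwd_scale, int *has_own_scales)
{
  if (row_max > 1.0e16) *has_own_scales = 1;              /* fwd scales insufficient [J3/119] */
  if (*has_own_scales)  return (row_max > 1.0e4) ? row_max : 1.0;
  return fwd_scale;                                       /* usual case: reuse fwd scales */
}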
Esempio n. 26
0
vector double
test_shift_left_double (vector double x, vector double y)
{
	return vec_sld (x, y, /* shift_by */ 10);
}
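The third operand of vec_sld is a compile-time byte count in 0..15, applied to the 32-byte concatenation of the two inputs regardless of element type, so shifting two double vectors by 10 bytes splits an element. The same call is easier to read on byte lanes (a generic illustration in the style of the test above, assuming big-endian element order):

/* With a = {0,1,...,15} and b = {16,17,...,31},
   vec_sld (a, b, 10) yields {10,11,...,25}. */
vector unsigned char
test_shift_left_bytes (vector unsigned char a, vector unsigned char b)
{
	return vec_sld (a, b, /* shift_by */ 10);
}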
Esempio n. 27
0
void *mem_searchrn(void *s, size_t len)
{
	vector unsigned char v_cr;
	vector unsigned char v_nl;
	vector unsigned char v0;
	vector unsigned char v_perm;
	vector unsigned char c;
	vector bool char rr, rn;
	vector bool char last_rr;
	char *p;
	ssize_t k;
	size_t block_num;
	unsigned f;

	if(unlikely(!s || !len))
		return NULL;

	/* only do one prefetch, this covers nearly 128k */
	block_num = DIV_ROUNDUP(len, 512);
	f  = block_num >= 256 ? 0 : block_num << 16;
	f |= 512;
	vec_dst((const unsigned char *)s, f, 2);

	v_cr = vec_splat_u8('\r');
	v_nl = vec_splat_u8('\n');
	v0   = vec_splat_u8(0);
	last_rr = (vector bool char)v0;

	k = SOVUC - ALIGN_DOWN_DIFF(s, SOVUC) - (ssize_t)len;

	p = (char *)ALIGN_DOWN(s, SOVUC);
	c = vec_ldl(0, (const vector unsigned char *)p);
	if(unlikely(k > 0))
		goto K_SHIFT;
	v_perm = vec_lvsl(0, (unsigned char *)s);
	c = vec_perm(c, v0, v_perm);
	v_perm = vec_lvsr(0, (unsigned char *)s);
	c = vec_perm(v0, c, v_perm);
	rr = vec_cmpeq(c, v_cr);
	rn = vec_cmpeq(c, v_nl);

	k = -k;
	goto START_LOOP;

	do
	{
		p += SOVUC;
		c = vec_ldl(0, (const vector unsigned char *)p);
		k -= SOVUC;
		if(k > 0)
		{
			rr = vec_cmpeq(c, v_cr);
			rn = vec_cmpeq(c, v_nl);

			if(vec_any_eq(last_rr, rn)) {
				vec_dss(2);
				return p - 1;
			}
START_LOOP:
			last_rr = (vector bool char)vec_sld(v0, (vector unsigned char)rr, 1);
			rn = (vector bool char)vec_sld(v0, (vector unsigned char)rn, 15);
			rr = vec_and(rr, rn); /* get mask */
			if(vec_any_ne(rr, v0)) {
				vec_dss(2);
				return p + vec_zpos(rr);
			}
		}
	} while(k > 0);
	k = -k;
K_SHIFT:
	vec_dss(2);
	v_perm = vec_lvsr(0, (unsigned char *)k);
	c = vec_perm(v0, c, v_perm);
	v_perm = vec_lvsl(0, (unsigned char *)k);
	c = vec_perm(c, v0, v_perm);
	rr = vec_cmpeq(c, v_cr);
	rn = vec_cmpeq(c, v_nl);
	if(vec_any_eq(last_rr, rn))
		return p - 1;

	rn = (vector bool char)vec_sld(v0, (vector unsigned char)rn, 15);
	rr = vec_and(rr, rn); /* get mask */
	if(vec_any_ne(rr, v0))
		return p + vec_zpos(rr);

	return NULL;
}
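The search above compares every byte against '\r' and '\n', shifts the '\n' mask by one byte with vec_sld, and ANDs the two masks so that a set lane marks a complete pair. A self-contained sketch of that mask trick for one 16-byte block, assuming big-endian AltiVec element order as in the function above (a '\r' on byte 15 whose '\n' lies in the next block is not handled here; crlf_mask is an illustrative name):

#ifdef __ALTIVEC__
#include <altivec.h>

/* Hedged sketch: lane i of the result is set where byte i is '\r'
 * and byte i+1 is '\n'. */
static vector bool char
crlf_mask(vector unsigned char block)
{
	vector unsigned char v0     = vec_splat_u8(0);
	vector bool char     is_r   = vec_cmpeq(block, vec_splat_u8('\r'));
	vector bool char     is_n   = vec_cmpeq(block, vec_splat_u8('\n'));
	/* shift the '\n' mask one byte toward lane 0: lane i now says "byte i+1 is '\n'" */
	vector bool char     n_next = (vector bool char)vec_sld((vector unsigned char)is_n, v0, 1);
	return vec_and(is_r, n_next);
}
#endif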
Esempio n. 28
0
static av_always_inline
void put_vp8_epel_v_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
                                 uint8_t *src, ptrdiff_t src_stride,
                                 int h, int my, int w, int is6tap)
{
    LOAD_V_SUBPEL_FILTER(my-1);
    vec_u8 s0, s1, s2, s3, s4, s5, filt, align_vech, perm_vec, align_vecl;
    vec_s16 s0f, s1f, s2f, s3f, s4f, s5f, f16h, f16l;
    vec_s16 c64 = vec_sl(vec_splat_s16(1), vec_splat_u16(6));
    vec_u16 c7  = vec_splat_u16(7);

    // we want pixels 0-7 to be in the even positions and 8-15 in the odd,
    // so combine this permute with the alignment permute vector
    align_vech = vec_lvsl(0, src);
    align_vecl = vec_sld(align_vech, align_vech, 8);
    if (w == 16)
        perm_vec = vec_mergeh(align_vech, align_vecl);
    else
        perm_vec = vec_mergeh(align_vech, align_vech);

    if (is6tap)
        s0 = load_with_perm_vec(-2*src_stride, src, perm_vec);
    s1 = load_with_perm_vec(-1*src_stride, src, perm_vec);
    s2 = load_with_perm_vec( 0*src_stride, src, perm_vec);
    s3 = load_with_perm_vec( 1*src_stride, src, perm_vec);
    if (is6tap)
        s4 = load_with_perm_vec( 2*src_stride, src, perm_vec);

    src += (2+is6tap)*src_stride;

    while (h --> 0) {
        if (is6tap)
            s5 = load_with_perm_vec(0, src, perm_vec);
        else
            s4 = load_with_perm_vec(0, src, perm_vec);

        FILTER_V(f16h, vec_mule);

        if (w == 16) {
            FILTER_V(f16l, vec_mulo);
            filt = vec_packsu(f16h, f16l);
            vec_st(filt, 0, dst);
        } else {
            filt = vec_packsu(f16h, f16h);
            if (w == 4)
                filt = (vec_u8)vec_splat((vec_u32)filt, 0);
            else
                vec_ste((vec_u32)filt, 4, (uint32_t*)dst);
            vec_ste((vec_u32)filt, 0, (uint32_t*)dst);
        }

        if (is6tap)
            s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        if (is6tap)
            s4 = s5;

        dst += dst_stride;
        src += src_stride;
    }
}
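load_with_perm_vec above presumably wraps the classic AltiVec unaligned-load idiom: two aligned vec_ld loads spanning the address, combined through vec_perm with the permute built from vec_lvsl (here merged with the even/odd interleave). A plausible generic sketch of such a permuted load, not the actual FFmpeg macro:

#ifdef __ALTIVEC__
#include <altivec.h>
#include <stddef.h>
#include <stdint.h>

/* Hedged sketch: load 16 possibly-unaligned bytes from src + offset and
 * rearrange them through a caller-supplied permute vector. */
static inline vector unsigned char
load_with_perm(ptrdiff_t offset, const uint8_t *src, vector unsigned char perm)
{
    vector unsigned char hi = vec_ld(offset,      src);  /* aligned block holding the first byte */
    vector unsigned char lo = vec_ld(offset + 15, src);  /* aligned block holding the last byte  */
    return vec_perm(hi, lo, perm);
}
#endif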
Esempio n. 29
0
void foo() {
  vector bool int boolVector = (vector bool int) vec_splat_u32(3);
  boolVector = vec_sld( boolVector, boolVector, 
    1 );  /* { dg-bogus "no instance of overloaded" } */
}
Esempio n. 30
0
inline ushort v_reduce_sum(const v_uint16x8& a)
{
    const vec_int4 v4 = vec_int4_c(vec_unpackhu(vec_adds(a.val, vec_sld(a.val, a.val, 8))));
    return saturate_cast<ushort>(vec_extract(vec_sums(v4, vec_int4_z), 3));
}
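The reduction above folds the eight ushorts onto themselves with vec_sld(a.val, a.val, 8) before widening and finishing with vec_sums. The same rotate-and-add idea on a 4 x int32 vector, where saturation is not a concern (a generic hedged sketch, not part of the OpenCV API):

#ifdef __ALTIVEC__
#include <altivec.h>

/* Hedged sketch: horizontal sum of a 4 x int32 vector by rotate-and-add. */
static int
hsum_s32(vector signed int v)
{
    v = vec_add(v, vec_sld(v, v, 8));   /* {a+c, b+d, a+c, b+d} */
    v = vec_add(v, vec_sld(v, v, 4));   /* every lane now holds a+b+c+d */
    return vec_extract(v, 0);
}
#endif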