Example 1
static void vector_fmul_window_altivec(float *dst, const float *src0, const float *src1, const float *win, int len)
{
    vector float zero, t0, t1, s0, s1, wi, wj;
    const vector unsigned char reverse = vcprm(3,2,1,0);
    int i,j;

    dst += len;
    win += len;
    src0+= len;

    zero = (vector float)vec_splat_u32(0);

    for(i=-len*4, j=len*4-16; i<0; i+=16, j-=16) {
        s0 = vec_ld(i, src0);
        s1 = vec_ld(j, src1);
        wi = vec_ld(i, win);
        wj = vec_ld(j, win);

        s1 = vec_perm(s1, s1, reverse);
        wj = vec_perm(wj, wj, reverse);

        t0 = vec_madd(s0, wj, zero);
        t0 = vec_nmsub(s1, wi, t0);
        t1 = vec_madd(s0, wi, zero);
        t1 = vec_madd(s1, wj, t1);
        t1 = vec_perm(t1, t1, reverse);

        vec_st(t0, i, dst);
        vec_st(t1, j, dst);
    }
}
Example 2
void audio_convert_s16_to_float_altivec(float *out,
      const int16_t *in, size_t samples, float gain)
{
   const vector float gain_vec = vec_splats(gain);
   const vector float zero_vec = vec_splats(0.0f);
   // Unaligned loads/stores are a bit expensive, so we optimize for the good path (very likely).
   if (((uintptr_t)out & 15) + ((uintptr_t)in & 15) == 0)
   {
      size_t i;
      for (i = 0; i + 8 <= samples; i += 8, in += 8, out += 8)
      {
         vector signed short input = vec_ld(0, in);
         vector signed int hi = vec_unpackh(input);
         vector signed int lo = vec_unpackl(input);
         vector float out_hi = vec_madd(vec_ctf(hi, 15), gain_vec, zero_vec);
         vector float out_lo = vec_madd(vec_ctf(lo, 15), gain_vec, zero_vec);

         vec_st(out_hi,  0, out);
         vec_st(out_lo, 16, out);
      }

      audio_convert_s16_to_float_C(out, in, samples - i, gain);
   }
   else
      audio_convert_s16_to_float_C(out, in, samples, gain);
}
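For reference, a minimal scalar sketch of what the vectorized fast path above computes (an assumption about the `audio_convert_s16_to_float_C` fallback, not its actual source): each 16-bit sample is scaled to [-1, 1) (the vec_ctf(..., 15) call divides by 2^15) and then multiplied by the gain.

static void audio_convert_s16_to_float_scalar_sketch(float *out,
      const int16_t *in, size_t samples, float gain)
{
   size_t i;
   for (i = 0; i < samples; i++)
      out[i] = (float)in[i] * (1.0f / 32768.0f) * gain;  /* 2^-15 scaling, then gain */
}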
Example 3
/* Using FChoose() here would mean allocating tmp space for 2M-1 paths;
 * instead we use the fact that E(i) is itself the necessary normalization
 * factor, and implement FChoose's algorithm here for an on-the-fly 
 * calculation.
 */
static inline int
select_e(ESL_RANDOMNESS *rng, const P7_OPROFILE *om, const P7_OMX *ox, int i, int *ret_k)
{
  int          Q     = p7O_NQF(ox->M);
  double       sum   = 0.0;
  double       roll  = esl_random(rng);
  double       norm  = 1.0 / ox->xmx[i*p7X_NXCELLS+p7X_E];   /* all M, D already scaled exactly the same */
  vector float xEv   = esl_vmx_set_float(norm);
  vector float zerov = (vector float) vec_splat_u32(0);
  union { vector float v; float p[4]; } u;
  int    q,r;

  while (1) {
    for (q = 0; q < Q; q++)
      {
	u.v = vec_madd(ox->dpf[i][q*3 + p7X_M], xEv, zerov);
	for (r = 0; r < 4; r++) {
	  sum += u.p[r];
	  if (roll < sum) { *ret_k = r*Q + q + 1; return p7T_M;}
	}

	u.v = vec_madd(ox->dpf[i][q*3 + p7X_D], xEv, zerov);
	for (r = 0; r < 4; r++) {
	  sum += u.p[r];
	  if (roll < sum) { *ret_k = r*Q + q + 1; return p7T_D;}
	}
      }
    ESL_DASSERT1(sum > 0.99);
  }
  /*UNREACHED*/
  ESL_EXCEPTION(-1, "unreached code was reached. universe collapses.");
} 
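The header comment describes an on-the-fly weighted choice: E(i) already equals the sum of all the scaled M and D cells, so dividing by it normalizes the distribution without allocating a temporary array of probabilities. A minimal scalar sketch of that sampling idiom (a hypothetical helper, not HMMER's actual FChoose) looks like this:

/* Draw an index from weights w[0..n-1] whose total is `total`,
 * normalizing on the fly instead of building a normalized copy. */
static int choose_weighted_sketch(double roll, const float *w, int n, double total)
{
  double sum = 0.0;
  int    i;
  for (i = 0; i < n; i++) {
    sum += w[i] / total;        /* running normalized mass */
    if (roll < sum) return i;   /* roll is uniform in [0,1) */
  }
  return n - 1;                 /* guard against rounding error */
}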
Example 4
/* D(i,k) is reached from M(i, k-1) or D(i,k-1). */
static inline int
select_d(ESL_RANDOMNESS *rng, const P7_OPROFILE *om, const P7_OMX *ox, int i, int k)
{
  int     Q     = p7O_NQF(ox->M);
  int     q     = (k-1) % Q;		/* (q,r) is position of the current DP cell D(i,k) */
  int     r     = (k-1) / Q;
  vector float  zerov;
  vector float  mpv, dpv;
  vector float  tmdv, tddv;
  union { vector float v; float p[4]; } u;
  float   path[2];
  int     state[2] = { p7T_M, p7T_D };

  zerov = (vector float) vec_splat_u32(0);

  if (q > 0) {
    mpv  = ox->dpf[i][(q-1)*3 + p7X_M];
    dpv  = ox->dpf[i][(q-1)*3 + p7X_D];
    tmdv = om->tfv[7*(q-1) + p7O_MD];
    tddv = om->tfv[7*Q + (q-1)];
  } else {
    mpv  = vec_sld(zerov, ox->dpf[i][(Q-1)*3 + p7X_M], 12);
    dpv  = vec_sld(zerov, ox->dpf[i][(Q-1)*3 + p7X_D], 12);
    tmdv = vec_sld(zerov, om->tfv[7*(Q-1) + p7O_MD],   12);
    tddv = vec_sld(zerov, om->tfv[8*Q-1],              12);
  }	  

  u.v = vec_madd(mpv, tmdv, zerov); path[0] = u.p[r];
  u.v = vec_madd(dpv, tddv, zerov); path[1] = u.p[r];
  esl_vec_FNorm(path, 2);
  return state[esl_rnd_FChoose(rng, path, 2)];
}
Example 5
/* An accurate vector division routine using the reciprocal estimate and 
 * two Newton-Raphson iterations
 */
static inline vector float vec_div(vector float A, vector float B)
{
     vector float y0;
     vector float y1;
     vector float y2;
     vector float Q;
     vector float R;
     vector float one = (vector float) (1.0f);
     vector float zero = (vector float) (-0.0f);
     vector float mone = (vector float) (-1.0f);

     y0 = vec_re(B);            // approximate 1/B

     // y1 = y0*(-(y0*B - 1.0))+y0  i.e. y0+y0*(1.0 - y0*B)
     y1 = vec_madd(y0,vec_nmsub(y0, B, one),y0);
  
     // REPEAT the Newton-Raphson to get the required 24 bits
     y2 = vec_madd(y1, vec_nmsub(y1, B, one),y1);

     // y2 = y1*(-(y1*B - 1.0f))+y1  i.e. y1+y1*(1.0f - y1*B)
     // y2 is now the correctly rounded reciprocal, and the manual considers this
     // OK for use in computing the remainder: Q = A*y2, R = A - B*Q

     Q = vec_madd(A,y2,zero);  // -0.0 IEEE
     R = vec_nmsub(B,Q,A);      // -(B*Q-A) == (A-B*Q)

     // final rounding adjustment
     return(vec_madd(R, y2, Q));
}
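Written out in scalar form, the two refinement steps are easier to follow: starting from an estimate y that is close to 1/B, each Newton-Raphson step computes y = y + y*(1 - y*B), roughly doubling the number of correct bits. A minimal scalar model of the vector routine above (a sketch, assuming only that the initial estimate has a few correct bits, as vec_re() guarantees):

static inline float div_newton_sketch(float A, float B, float y0 /* ~ 1/B */)
{
    float y1 = y0 + y0 * (1.0f - y0 * B);  /* first Newton-Raphson step   */
    float y2 = y1 + y1 * (1.0f - y1 * B);  /* second step: ~24 good bits  */
    float Q  = A * y2;                     /* provisional quotient        */
    float R  = A - B * Q;                  /* remainder                   */
    return Q + R * y2;                     /* final rounding adjustment   */
}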
Example 6
/* M(i,k) is reached from B(i-1), M(i-1,k-1), D(i-1,k-1), or I(i-1,k-1). */
static inline int
select_m(ESL_RANDOMNESS *rng, const P7_OPROFILE *om, const P7_OMX *ox, int i, int k)
{
  int     Q     = p7O_NQF(ox->M);
  int     q     = (k-1) % Q;		/* (q,r) is position of the current DP cell M(i,k) */
  int     r     = (k-1) / Q;
  vector float *tp = om->tfv + 7*q;    	/* *tp now at start of transitions to cur cell M(i,k) */
  vector float  xBv;
  vector float  zerov;
  vector float  mpv, dpv, ipv;
  union { vector float v; float p[4]; } u;
  float   path[4];
  int     state[4] = { p7T_B, p7T_M, p7T_I, p7T_D };
  
  xBv   = esl_vmx_set_float(ox->xmx[(i-1)*p7X_NXCELLS+p7X_B]);
  zerov = (vector float) vec_splat_u32(0);

  if (q > 0) {
    mpv = ox->dpf[i-1][(q-1)*3 + p7X_M];
    dpv = ox->dpf[i-1][(q-1)*3 + p7X_D];
    ipv = ox->dpf[i-1][(q-1)*3 + p7X_I];
  } else {
    mpv = vec_sld(zerov, ox->dpf[i-1][(Q-1)*3 + p7X_M], 12);
    dpv = vec_sld(zerov, ox->dpf[i-1][(Q-1)*3 + p7X_D], 12);
    ipv = vec_sld(zerov, ox->dpf[i-1][(Q-1)*3 + p7X_I], 12);
  }	  
  
  u.v = vec_madd(xBv, *tp, zerov); tp++;  path[0] = u.p[r];
  u.v = vec_madd(mpv, *tp, zerov); tp++;  path[1] = u.p[r];
  u.v = vec_madd(ipv, *tp, zerov); tp++;  path[2] = u.p[r];
  u.v = vec_madd(dpv, *tp, zerov);        path[3] = u.p[r];
  esl_vec_FNorm(path, 4);
  return state[esl_rnd_FChoose(rng, path, 4)];
}
Example 7
static void vector_fmul_reverse_altivec(float *dst, const float *src0,
                                        const float *src1, int len)
{
    int i;
    vector float d, s0, s1, h0, l0,
                 s2, s3, zero = (vector float)vec_splat_u32(0);
    src1 += len-4;
    for(i=0; i<len-7; i+=8) {
        s1 = vec_ld(0, src1-i);              // [a,b,c,d]
        s0 = vec_ld(0, src0+i);
        l0 = vec_mergel(s1, s1);             // [c,c,d,d]
        s3 = vec_ld(-16, src1-i);
        h0 = vec_mergeh(s1, s1);             // [a,a,b,b]
        s2 = vec_ld(16, src0+i);
        s1 = vec_mergeh(vec_mergel(l0,h0),   // [d,b,d,b]
                        vec_mergeh(l0,h0));  // [c,a,c,a]
                                             // [d,c,b,a]
        l0 = vec_mergel(s3, s3);
        d = vec_madd(s0, s1, zero);
        h0 = vec_mergeh(s3, s3);
        vec_st(d, 0, dst+i);
        s3 = vec_mergeh(vec_mergel(l0,h0),
                        vec_mergeh(l0,h0));
        d = vec_madd(s2, s3, zero);
        vec_st(d, 16, dst+i);
    }
}
Example 8
inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
{
    const vec_float4 v0 = vec_splat(v.val, 0);
    const vec_float4 v1 = vec_splat(v.val, 1);
    const vec_float4 v2 = vec_splat(v.val, 2);
    return v_float32x4(vec_madd(v0, m0.val, vec_madd(v1, m1.val, vec_madd(v2, m2.val, a.val))));
}
Example 9
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
{
    const vec_float4 v0 = vec_splat(v.val, 0);
    const vec_float4 v1 = vec_splat(v.val, 1);
    const vec_float4 v2 = vec_splat(v.val, 2);
    VSX_UNUSED(const vec_float4) v3 = vec_splat(v.val, 3);
    return v_float32x4(vec_madd(v0, m0.val, vec_madd(v1, m1.val, vec_madd(v2, m2.val, vec_mul(v3, m3.val)))));
}
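Both helpers splat one lane of v at a time and chain vec_madd calls, which amounts to an ordinary row-weighted matrix-vector product: v_matmul computes v0*m0 + v1*m1 + v2*m2 + v3*m3, and v_matmuladd replaces the v3*m3 term with the additive vector a. A scalar sketch of the same computation (hypothetical helper, matrix laid out row by row):

static void matmul4_scalar_sketch(const float v[4], const float m[4][4], float r[4])
{
    int j;
    for (j = 0; j < 4; j++)
        r[j] = v[0]*m[0][j] + v[1]*m[1][j] + v[2]*m[2][j] + v[3]*m[3][j];
}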
Example 10
static void vector_fmul_altivec(float *dst, const float *src, int len)
{
    int i;
    vector float d0, d1, s, zero = (vector float)vec_splat_u32(0);
    for(i=0; i<len-7; i+=8) {
        d0 = vec_ld(0, dst+i);
        s = vec_ld(0, src+i);
        d1 = vec_ld(16, dst+i);
        d0 = vec_madd(d0, s, zero);
        d1 = vec_madd(d1, vec_ld(16,src+i), zero);
        vec_st(d0, 0, dst+i);
        vec_st(d1, 16, dst+i);
    }
}
Example 11
// execute vectorized structured inner dot product
void dotprod_crcf_execute(dotprod_crcf    _q,
                          float complex * _x,
                          float complex * _r)
{
    int al; // input data alignment

    vector float *ar,*d;
    vector float s0,s1,s2,s3;
    union { vector float v; float w[4];} s;
    unsigned int nblocks;

    ar = (vector float*)( (int)_x & ~15);
    al = ((int)_x & 15)/sizeof(float);

    d = (vector float*)_q->h[al];

    // number of blocks doubles because of complex type
    nblocks = (2*_q->n + al - 1)/4 + 1;

    // split into four vectors each with four 32-bit
    // partial sums.  Effectively each loop iteration
    // operates on 16 input samples at a time.
    s0 = s1 = s2 = s3 = (vector float)(0);
    while (nblocks >= 4) {
        s0 = vec_madd(ar[nblocks-1],d[nblocks-1],s0);
        s1 = vec_madd(ar[nblocks-2],d[nblocks-2],s1);
        s2 = vec_madd(ar[nblocks-3],d[nblocks-3],s2);
        s3 = vec_madd(ar[nblocks-4],d[nblocks-4],s3);
        nblocks -= 4;
    }

    // fold the resulting partial sums into vector s0
    s0 = vec_add(s0,s1);    // s0 = s0+s1
    s2 = vec_add(s2,s3);    // s2 = s2+s3
    s0 = vec_add(s0,s2);    // s0 = s0+s2

    // finish partial summing operations
    while (nblocks-- > 0)
        s0 = vec_madd(ar[nblocks],d[nblocks],s0);

    // move the result into the union s (effectively,
    // this loads the four 32-bit values in s0 into
    // the array w).
    s.v = vec_add(s0,(vector float)(0));

    // sum the resulting array
    //*_r = s.w[0] + s.w[1] + s.w[2] + s.w[3];
    *_r = (s.w[0] + s.w[2]) + (s.w[1] + s.w[3]) * _Complex_I;
}
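The four independent accumulators above are the usual way to hide the latency of the fused multiply-add: partial sums s0..s3 are built in parallel and only folded together at the end. A minimal scalar sketch of the same pattern (a hypothetical helper, not part of liquid-dsp):

static float dotprod4_sketch(const float *a, const float *b, unsigned int n)
{
    float s0 = 0.0f, s1 = 0.0f, s2 = 0.0f, s3 = 0.0f;
    unsigned int i = 0;
    for (; i + 4 <= n; i += 4) {
        s0 += a[i+0] * b[i+0];    /* four independent accumulators */
        s1 += a[i+1] * b[i+1];
        s2 += a[i+2] * b[i+2];
        s3 += a[i+3] * b[i+3];
    }
    for (; i < n; i++)            /* tail */
        s0 += a[i] * b[i];
    return (s0 + s1) + (s2 + s3); /* fold the partial sums */
}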
Example 12
static void OSX_AudioIOProc16Bit_Altivec(SInt16	*myInBuffer, float *myOutBuffer)
{
		register UInt32	i;

		float f = SOUND_BUFFER_SCALE_16BIT;
   		const vector float gain = vec_load_ps1(&f); // multiplier
		const vector float mix = vec_setzero();
		if (gBufferMono2Stereo)
		{
			int j=0;
			// TEST: OK
			for (i=0;i<SOUND_BUFFER_SIZE;i+=8, j+=16)
			{
				vector short int v0 = vec_ld(0, myInBuffer + i); // Load 8 shorts
				vector float v1 = vec_ctf((vector signed int)vec_unpackh(v0), 0); // convert to float
				vector float v2 = vec_ctf((vector signed int)vec_unpackl(v0), 0); // convert to float
				vector float v3 = vec_madd(v1, gain, mix); // scale
				vector float v4 = vec_madd(v2, gain, mix); // scale

				vector float v5 = vec_mergeh(v3, v3); // v3(0,0,1,1);
				vector float v6 = vec_mergel(v3, v3); // v3(2,2,3,3);
				vector float v7 = vec_mergeh(v4, v4); // v4(0,0,1,1);
				vector float v8 = vec_mergel(v4, v4); // v4(2,2,3,3);

				vec_st(v5, 0, myOutBuffer + j); // Store 4 floats
				vec_st(v6, 0, myOutBuffer + 4 + j); // Store 4 floats
				vec_st(v7, 0, myOutBuffer + 8 + j); // Store 4 floats
				vec_st(v8, 0, myOutBuffer + 12 + j); // Store 4 floats

			}
		}
		else
		{
			// TEST: OK
			for (i=0;i<SOUND_BUFFER_SIZE;i+=8)
			{
				vector short int v0 = vec_ld(0, myInBuffer + i); // Load 8 shorts
				vector float v1 = vec_ctf((vector signed int)vec_unpackh(v0), 0); // convert to float
				vector float v2 = vec_ctf((vector signed int)vec_unpackl(v0), 0); // convert to float
				vector float v3 = vec_madd(v1, gain, mix); // scale
				vector float v4 = vec_madd(v2, gain, mix); // scale
				vec_st(v3, 0, myOutBuffer + i); // Store 4 floats
				vec_st(v4, 0, myOutBuffer + 4 + i); // Store 4 floats
			}
		}
}
Example 13
__SIMDi _SIMD_madd_epi32(__SIMDi a, __SIMDi b, __SIMDi c)
{
#ifdef  USE_SSE
  return _SIMD_add_epi32(_SIMD_mul_epi32(a,b),c);
#elif defined USE_AVX
  return _mm256_add_epi32(_mm256_mullo_epi32(a,b),c); /* requires AVX2 */
#elif defined USE_IBM
  return vec_madd(a,b,c);
#endif
}
Example 14
__SIMDd _SIMD_madd_pd(__SIMDd a, __SIMDd b, __SIMDd c)
{
#ifdef  USE_SSE
  return _mm_add_pd(_mm_mul_pd(a,b),c);
#elif defined USE_AVX
  return _mm256_add_pd(_mm256_mul_pd(a,b),c);
#elif defined USE_IBM
  return vec_madd(a,b,c);
#endif
}
Example 15
/* I(i,k) is reached from M(i-1, k) or I(i-1,k). */
static inline int
select_i(ESL_RANDOMNESS *rng, const P7_OPROFILE *om, const P7_OMX *ox, int i, int k)
{
  int     Q     = p7O_NQF(ox->M);
  int     q    = (k-1) % Q;		/* (q,r) is position of the current DP cell I(i,k) */
  int     r    = (k-1) / Q;
  vector float  zerov;
  vector float  mpv  = ox->dpf[i-1][q*3 + p7X_M];
  vector float  ipv  = ox->dpf[i-1][q*3 + p7X_I];
  vector float *tp   = om->tfv + 7*q + p7O_MI;
  union { vector float v; float p[4]; } u;
  float   path[2];
  int     state[2] = { p7T_M, p7T_I };

  zerov = (vector float) vec_splat_u32(0);

  u.v = vec_madd(mpv, *tp, zerov); tp++;  path[0] = u.p[r];
  u.v = vec_madd(ipv, *tp, zerov);        path[1] = u.p[r];
  esl_vec_FNorm(path, 2);
  return state[esl_rnd_FChoose(rng, path, 2)];
}
Example 16
File: skin.c  Project: Lenbok/dormin
static vector float appbones (State *s,
                              struct skin *skin,
                              vector float x,
                              vector float y,
                              vector float z,
                              vector float nx,
                              vector float ny,
                              vector float nz,
                              vector float *np)
{
    int j;
    int num_bones;
    int bone_index;
    struct abone *b;
    vector float vz = (vector float) vec_splat_u32 (0);
    vector float v, w, n;
    vector unsigned char S = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4<<3};

    v = n = vz;
    w = vec_ld (0, skin->weights);

    num_bones = skin->boneinfo & 3;
    bone_index = skin->boneinfo >> 2;
    for (j = 0; j < num_bones; ++j) {
        vector float t0, t1, t2, t3, t4, t5, r0, r1, r2, r3, vw;

        b = &s->abones[bone_index & 0x3ff];
        bone_index >>= 10;
        vw = vec_splat (w, 0);
        w = vec_slo (w, S);

        r0 = vec_ld ( 0, b->cm);
        r1 = vec_ld (16, b->cm);
        r2 = vec_ld (32, b->cm);
        r3 = vec_ld (48, b->cm);

        t0 = vec_madd (r0, x, r3);
        t1 = vec_madd (r1, y, t0);
        t2 = vec_madd (r2, z, t1);
        v = vec_madd (t2, vw, v);

        t3 = vec_madd (r0, nx, vz);
        t4 = vec_madd (r1, ny, t3);
        t5 = vec_madd (r2, nz, t4);
        n = vec_madd (t5, vw, n);
    }

    *np = n;
    return v;
}
Example 17
static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src, float mul, int len)
{
    union {
        vector float v;
        float s[4];
    } mul_u;
    int i;
    vector float src1, src2, dst1, dst2, mul_v, zero;

    zero = (vector float)vec_splat_u32(0);
    mul_u.s[0] = mul;
    mul_v = vec_splat(mul_u.v, 0);

    for(i=0; i<len; i+=8) {
        src1 = vec_ctf(vec_ld(0,  src+i), 0);
        src2 = vec_ctf(vec_ld(16, src+i), 0);
        dst1 = vec_madd(src1, mul_v, zero);
        dst2 = vec_madd(src2, mul_v, zero);
        vec_st(dst1,  0, dst+i);
        vec_st(dst2, 16, dst+i);
    }
}
Example 18
void assign_add_mul_r_32(spinor32 * const R, spinor32 * const S, const float c, const int N) {
#ifdef TM_USE_OMP
#pragma omp parallel
  {
#endif
  vector4double x0, x1, x2, x3, x4, x5, y0, y1, y2, y3, y4, y5;
  vector4double z0, z1, z2, z3, z4, z5, k;
  float *s, *r;
  float ALIGN32 _c;
  _c = c;
  __prefetch_by_load(S);
  __prefetch_by_load(R);

  k = vec_splats((double)_c);
  __alignx(16, s);
  __alignx(16, r);
  __alignx(16, S);
  __alignx(16, R);

#ifdef TM_USE_OMP
#pragma omp for
#else
#pragma unroll(2)
#endif
  for(int i = 0; i < N; i++) {
    s=(float*)((spinor32 *) S + i);
    r=(float*)((spinor32 *) R + i);
    __prefetch_by_load(S + i + 1);
    __prefetch_by_stream(1, R + i + 1);
    x0 = vec_ld(0, r);
    x1 = vec_ld(0, r+4);
    x2 = vec_ld(0, r+8);
    x3 = vec_ld(0, r+12);
    x4 = vec_ld(0, r+16);
    x5 = vec_ld(0, r+20);
    y0 = vec_ld(0, s);
    y1 = vec_ld(0, s+4);
    y2 = vec_ld(0, s+8);
    y3 = vec_ld(0, s+12);
    y4 = vec_ld(0, s+16);
    y5 = vec_ld(0, s+20);
    z0 = vec_madd(k, y0, x0);
    z1 = vec_madd(k, y1, x1);
    z2 = vec_madd(k, y2, x2);
    z3 = vec_madd(k, y3, x3);
    z4 = vec_madd(k, y4, x4);
    z5 = vec_madd(k, y5, x5);
    vec_st(z0, 0, r);
    vec_st(z1, 0, r+4);
    vec_st(z2, 0, r+8);
    vec_st(z3, 0, r+12);
    vec_st(z4, 0, r+16);
    vec_st(z5, 0, r+20);
  }
#ifdef TM_USE_OMP
  } /* OpenMP closing brace */
#endif
  return;
}
Example 19
__vector float func(__vector float vx)
{
    __vector float vy;
    __vector float va = (__vector float) {   5.0f,   5.0f,   5.0f,   5.0f };
    __vector float vb = (__vector float) { -16.0f, -16.0f, -16.0f, -16.0f };
    __vector float vc = (__vector float) { -36.0f, -36.0f, -36.0f, -36.0f };
    __vector float vd = (__vector float) {  64.0f,  64.0f,  64.0f,  64.0f };
    __vector float ve = (__vector float) { 192.0f, 192.0f, 192.0f, 192.0f };

    vy = vec_madd(va, vx, vb);
    vy = vec_madd(vy, vx, vc);
    vy = vec_madd(vy, vx, vd);
    vy = vec_madd(vy, vx, ve);

    return vy;
}

float calc_integral(float start, float end, float delta)
{
    int i;
    float *sum;

    __vector float vx     = (__vector float) { start+delta*0, start+delta*1,
                                               start+delta*2, start+delta*3 };
    __vector float vsum   = (__vector float) { 0.0f, 0.0f, 0.0f, 0.0f };
    __vector float vdelta = (__vector float) { delta, delta, delta, delta };
    __vector float vstep  = (__vector float) { 4.0f, 4.0f, 4.0f, 4.0f };

    for (i = 0; i < (end-start)/delta; i += 4) {
        vsum = vec_madd(func(vx), vdelta, vsum);
        vx   = vec_madd(vdelta, vstep, vx);
    }

    sum = (float *) &vsum;

    return (sum[0] + sum[1] + sum[2] + sum[3]);
}

int main(int argc, char **argv)
{
    float start = 0.0f;
    float end   = 4.0f;
    float delta = 0.00001f;
    float result;

    printf("start = %f, end = %f\n", start, end);

    result = calc_integral(start, end, delta);

    printf("result = %f\n", result);

    return 0;
}
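func() evaluates the polynomial 5x^4 - 16x^3 - 36x^2 + 64x + 192 by Horner's scheme, one vec_madd per coefficient, and calc_integral() accumulates func(x)*delta for four abscissae per iteration. A scalar sketch of the same computation (assuming the same polynomial and step; loop bounds are simplified):

static float func_scalar_sketch(float x)
{
    /* Horner's scheme: ((((5*x - 16)*x - 36)*x + 64)*x + 192) */
    return (((5.0f*x - 16.0f)*x - 36.0f)*x + 64.0f)*x + 192.0f;
}

static float calc_integral_scalar_sketch(float start, float end, float delta)
{
    float sum = 0.0f;
    float x;
    for (x = start; x < end; x += delta)
        sum += func_scalar_sketch(x) * delta;   /* rectangle rule */
    return sum;
}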
Example 20
void CDSPToolsOSX::Mul32(tfloat32* pDest, tfloat32 Src)
{
#ifdef _Mac_PowerPC
	vector float vSrc = (vector float)(Src, Src, Src, Src);

	vector float vDest = vec_ldl(0, pDest);
	vDest = vec_madd(vDest, vSrc, (vector float)(0, 0, 0, 0));
	vec_st(vDest, 0, pDest);

	vDest = vec_ldl(4 * 4, pDest);
	vDest = vec_madd(vDest, vSrc, (vector float)(0, 0, 0, 0));
	vec_st(vDest, 4 * 4, pDest);

	vDest = vec_ldl(8 * 4, pDest);
	vDest = vec_madd(vDest, vSrc, (vector float)(0, 0, 0, 0));
	vec_st(vDest, 8 * 4, pDest);

	vDest = vec_ldl(12 * 4, pDest);
	vDest = vec_madd(vDest, vSrc, (vector float)(0, 0, 0, 0));
	vec_st(vDest, 12 * 4, pDest);

	vDest = vec_ldl(16 * 4, pDest);
	vDest = vec_madd(vDest, vSrc, (vector float)(0, 0, 0, 0));
	vec_st(vDest, 16 * 4, pDest);

	vDest = vec_ldl(20 * 4, pDest);
	vDest = vec_madd(vDest, vSrc, (vector float)(0, 0, 0, 0));
	vec_st(vDest, 20 * 4, pDest);

	vDest = vec_ldl(24 * 4, pDest);
	vDest = vec_madd(vDest, vSrc, (vector float)(0, 0, 0, 0));
	vec_st(vDest, 24 * 4, pDest);

	vDest = vec_ldl(28 * 4, pDest);
	vDest = vec_madd(vDest, vSrc, (vector float)(0, 0, 0, 0));
	vec_st(vDest, 28 * 4, pDest);
#else	// _Mac_PowerPC
	CDSPTools::Mul32(pDest, Src);
#endif	// _Mac_PowerPC
}
/* ************************************************************************* 

   NAME:  test_madd

   USAGE: 

   test_madd();

   returns: void

   DESCRIPTION:
                   see how the combination multiply/add operation works

   REFERENCES:

   Ian Ollmann's Altivec Tutorial
   
   LIMITATIONS:

   GLOBAL VARIABLES:

      accessed: none

      modified: none

   FUNCTIONS CALLED:
   
   fprintf
   vec_madd - multiply two float vectors and add a third float vector,
              all in one operation
 
   
   REVISION HISTORY:
        STR                  Description of Revision                 Author
     27-Feb-11               initial coding                           kaj

 ************************************************************************* */
void test_madd(void)
{
  vector float floatVector1 =
       { -17.777777, 0.0, 37.777777, 100.0};
  vector float addVector = 
       { 32.0, 32.0, 32.0, 32.0};
  vector float coeffVector = 
       { 1.8, 1.8, 1.8, 1.8};
  vector float fahrenheitVector;
  short printfloat[FLOAT_ARRAYSIZE] __attribute__ ((aligned (16)));

  /* print vectors performing madd on */
  fprintf(stderr,"-----------------------------------------------------------\n\n");
  printVecFloats("vec_madd input vector 1", floatVector1,FLOAT_ARRAYSIZE);
  printVecFloats("vec_madd input vector to add", addVector,FLOAT_ARRAYSIZE);  
  printVecFloats("vec_madd coeffvector to multiply", coeffVector,FLOAT_ARRAYSIZE); 
  
  /* calculate */
  fahrenheitVector = vec_madd(floatVector1,coeffVector,addVector);
  printVecFloats("vec_madd vector (Input*9/5+32)", fahrenheitVector,FLOAT_ARRAYSIZE);

} /* test_madd */
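As the routine's output illustrates, vec_madd works element by element: each lane of vec_madd(input, coeff, add) equals input[i]*coeff[i] + add[i], here mapping Celsius to Fahrenheit. A hypothetical scalar cross-check of the same conversion:

static void madd_scalar_check(const float in[4], const float coeff[4],
                              const float add[4], float out[4])
{
  int i;
  for (i = 0; i < 4; i++)
    out[i] = in[i] * coeff[i] + add[i];   /* e.g. Celsius * 1.8 + 32 */
}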
Example 22
static void vector_fmul_add_altivec(float *dst, const float *src0,
                                    const float *src1, const float *src2,
                                    int len)
{
    int i;
    vector float d, s0, s1, s2, t0, t1, edges;
    vector unsigned char align = vec_lvsr(0,dst),
                         mask = vec_lvsl(0, dst);

    for (i=0; i<len-3; i+=4) {
        t0 = vec_ld(0, dst+i);
        t1 = vec_ld(15, dst+i);
        s0 = vec_ld(0, src0+i);
        s1 = vec_ld(0, src1+i);
        s2 = vec_ld(0, src2+i);
        edges = vec_perm(t1 ,t0, mask);
        d = vec_madd(s0,s1,s2);
        t1 = vec_perm(d, edges, align);
        t0 = vec_perm(edges, d, align);
        vec_st(t1, 15, dst+i);
        vec_st(t0, 0, dst+i);
    }
}
Example 23
int main(int argc, char **argv)
{
    int i;

    __vector float *vin  = (__vector float *) in;
    __vector float *vout = (__vector float *) out;
    __vector float vin_negative;
    __vector unsigned int vpat;

    __vector float vzero  = (__vector float) {  0.0f,  0.0f,  0.0f,  0.0f };
    __vector float vminus = (__vector float) { -1.0f, -1.0f, -1.0f, -1.0f };

    for (i = 0; i < SIZE/4; i++) {
        vpat = vec_cmpgt(vin[i], vzero);
        vin_negative = vec_madd(vin[i], vminus, vzero);
        vout[i] = vec_sel(vin_negative, vin[i], vpat);
    }

    for (i = 0; i < SIZE; i++) {
        printf("out[%02d]=%0.0f\n", i, out[i]);
    }

    return 0;
}
Example 24
static void ProjectDlightTexture_altivec( void ) {
	int		i, l;
	vec_t	origin0, origin1, origin2;
	float   texCoords0, texCoords1;
	vector float floatColorVec0, floatColorVec1;
	vector float modulateVec, colorVec, zero;
	vector short colorShort;
	vector signed int colorInt;
	vector unsigned char floatColorVecPerm, modulatePerm, colorChar;
	vector unsigned char vSel = VECCONST_UINT8(0x00, 0x00, 0x00, 0xff,
                                               0x00, 0x00, 0x00, 0xff,
                                               0x00, 0x00, 0x00, 0xff,
                                               0x00, 0x00, 0x00, 0xff);
	float	*texCoords;
	byte	*colors;
	byte	clipBits[SHADER_MAX_VERTEXES];
	float	texCoordsArray[SHADER_MAX_VERTEXES][2];
	byte	colorArray[SHADER_MAX_VERTEXES][4];
	unsigned	hitIndexes[SHADER_MAX_INDEXES];
	int		numIndexes;
	float	scale;
	float	radius;
	vec3_t	floatColor;
	float	modulate = 0.0f;

	if ( !backEnd.refdef.num_dlights ) {
		return;
	}

	// There has to be a better way to do this so that floatColor
	// and/or modulate are already 16-byte aligned.
	floatColorVecPerm = vec_lvsl(0,(float *)floatColor);
	modulatePerm = vec_lvsl(0,(float *)&modulate);
	modulatePerm = (vector unsigned char)vec_splat((vector unsigned int)modulatePerm,0);
	zero = (vector float)vec_splat_s8(0);

	for ( l = 0 ; l < backEnd.refdef.num_dlights ; l++ ) {
		dlight_t	*dl;

		if ( !( tess.dlightBits & ( 1 << l ) ) ) {
			continue;	// this surface definitely doesn't have any of this light
		}
		texCoords = texCoordsArray[0];
		colors = colorArray[0];

		dl = &backEnd.refdef.dlights[l];
		origin0 = dl->transformed[0];
		origin1 = dl->transformed[1];
		origin2 = dl->transformed[2];
		radius = dl->radius;
		scale = 1.0f / radius;

		if(r_greyscale->integer)
		{
			float luminance;
			
			luminance = LUMA(dl->color[0], dl->color[1], dl->color[2]) * 255.0f;
			floatColor[0] = floatColor[1] = floatColor[2] = luminance;
		}
		else if(r_greyscale->value)
		{
			float luminance;
			
			luminance = LUMA(dl->color[0], dl->color[1], dl->color[2]) * 255.0f;
			floatColor[0] = LERP(dl->color[0] * 255.0f, luminance, r_greyscale->value);
			floatColor[1] = LERP(dl->color[1] * 255.0f, luminance, r_greyscale->value);
			floatColor[2] = LERP(dl->color[2] * 255.0f, luminance, r_greyscale->value);
		}
		else
		{
			floatColor[0] = dl->color[0] * 255.0f;
			floatColor[1] = dl->color[1] * 255.0f;
			floatColor[2] = dl->color[2] * 255.0f;
		}
		floatColorVec0 = vec_ld(0, floatColor);
		floatColorVec1 = vec_ld(11, floatColor);
		floatColorVec0 = vec_perm(floatColorVec0,floatColorVec0,floatColorVecPerm);
		for ( i = 0 ; i < tess.numVertexes ; i++, texCoords += 2, colors += 4 ) {
			int		clip = 0;
			vec_t dist0, dist1, dist2;
			
			dist0 = origin0 - tess.xyz[i][0];
			dist1 = origin1 - tess.xyz[i][1];
			dist2 = origin2 - tess.xyz[i][2];

			backEnd.pc.c_dlightVertexes++;

			texCoords0 = 0.5f + dist0 * scale;
			texCoords1 = 0.5f + dist1 * scale;

			if( !r_dlightBacks->integer &&
					// dist . tess.normal[i]
					( dist0 * tess.normal[i][0] +
					dist1 * tess.normal[i][1] +
					dist2 * tess.normal[i][2] ) < 0.0f ) {
				clip = 63;
			} else {
				if ( texCoords0 < 0.0f ) {
					clip |= 1;
				} else if ( texCoords0 > 1.0f ) {
					clip |= 2;
				}
				if ( texCoords1 < 0.0f ) {
					clip |= 4;
				} else if ( texCoords1 > 1.0f ) {
					clip |= 8;
				}
				texCoords[0] = texCoords0;
				texCoords[1] = texCoords1;

				// modulate the strength based on the height and color
				if ( dist2 > radius ) {
					clip |= 16;
					modulate = 0.0f;
				} else if ( dist2 < -radius ) {
					clip |= 32;
					modulate = 0.0f;
				} else {
					dist2 = Q_fabs(dist2);
					if ( dist2 < radius * 0.5f ) {
						modulate = 1.0f;
					} else {
						modulate = 2.0f * (radius - dist2) * scale;
					}
				}
			}
			clipBits[i] = clip;

			modulateVec = vec_ld(0,(float *)&modulate);
			modulateVec = vec_perm(modulateVec,modulateVec,modulatePerm);
			colorVec = vec_madd(floatColorVec0,modulateVec,zero);
			colorInt = vec_cts(colorVec,0);	// RGBx
			colorShort = vec_pack(colorInt,colorInt);		// RGBxRGBx
			colorChar = vec_packsu(colorShort,colorShort);	// RGBxRGBxRGBxRGBx
			colorChar = vec_sel(colorChar,vSel,vSel);		// RGBARGBARGBARGBA replace alpha with 255
			vec_ste((vector unsigned int)colorChar,0,(unsigned int *)colors);	// store color
		}

		// build a list of triangles that need light
		numIndexes = 0;
		for ( i = 0 ; i < tess.numIndexes ; i += 3 ) {
			int		a, b, c;

			a = tess.indexes[i];
			b = tess.indexes[i+1];
			c = tess.indexes[i+2];
			if ( clipBits[a] & clipBits[b] & clipBits[c] ) {
				continue;	// not lighted
			}
			hitIndexes[numIndexes] = a;
			hitIndexes[numIndexes+1] = b;
			hitIndexes[numIndexes+2] = c;
			numIndexes += 3;
		}

		if ( !numIndexes ) {
			continue;
		}

		qglEnableClientState( GL_TEXTURE_COORD_ARRAY );
		qglTexCoordPointer( 2, GL_FLOAT, 0, texCoordsArray[0] );

		qglEnableClientState( GL_COLOR_ARRAY );
		qglColorPointer( 4, GL_UNSIGNED_BYTE, 0, colorArray );

		GL_Bind( tr.dlightImage );
		// include GLS_DEPTHFUNC_EQUAL so alpha tested surfaces don't add light
		// where they aren't rendered
		if ( dl->additive ) {
			GL_State( GLS_SRCBLEND_ONE | GLS_DSTBLEND_ONE | GLS_DEPTHFUNC_EQUAL );
		}
		else {
			GL_State( GLS_SRCBLEND_DST_COLOR | GLS_DSTBLEND_ONE | GLS_DEPTHFUNC_EQUAL );
		}
		R_DrawElements( numIndexes, hitIndexes );
		backEnd.pc.c_totalIndexes += numIndexes;
		backEnd.pc.c_dlightIndexes += numIndexes;
	}
}
Example 25
File: fdctdsp.c  Project: 63n/FFmpeg
/* two dimensional discrete cosine transform */
void ff_fdct_altivec(int16_t *block)
{
    vector signed short *bp;
    const vector float *cp = fdctconsts;
    vector float b00, b10, b20, b30, b40, b50, b60, b70;
    vector float b01, b11, b21, b31, b41, b51, b61, b71;
    vector float mzero, cnst, cnsts0, cnsts1, cnsts2;
    vector float x0, x1, x2, x3, x4, x5, x6, x7, x8;

    /* setup constants {{{ */
    /* mzero = -0.0 */
    mzero  = ((vector float) vec_splat_u32(-1));
    mzero  = ((vector float) vec_sl(vu32(mzero), vu32(mzero)));
    cnsts0 = vec_ld(0, cp);
    cp++;
    cnsts1 = vec_ld(0, cp);
    cp++;
    cnsts2 = vec_ld(0, cp);
    /* }}} */

    /* 8x8 matrix transpose (vector short[8]) {{{ */
#define MERGE_S16(hl, a, b) vec_merge ## hl(vs16(a), vs16(b))

    bp  = (vector signed short *) block;
    b00 = ((vector float) vec_ld(0,      bp));
    b40 = ((vector float) vec_ld(16 * 4, bp));
    b01 = ((vector float) MERGE_S16(h, b00, b40));
    b11 = ((vector float) MERGE_S16(l, b00, b40));
    bp++;
    b10 = ((vector float) vec_ld(0,      bp));
    b50 = ((vector float) vec_ld(16 * 4, bp));
    b21 = ((vector float) MERGE_S16(h, b10, b50));
    b31 = ((vector float) MERGE_S16(l, b10, b50));
    bp++;
    b20 = ((vector float) vec_ld(0,      bp));
    b60 = ((vector float) vec_ld(16 * 4, bp));
    b41 = ((vector float) MERGE_S16(h, b20, b60));
    b51 = ((vector float) MERGE_S16(l, b20, b60));
    bp++;
    b30 = ((vector float) vec_ld(0,      bp));
    b70 = ((vector float) vec_ld(16 * 4, bp));
    b61 = ((vector float) MERGE_S16(h, b30, b70));
    b71 = ((vector float) MERGE_S16(l, b30, b70));

    x0 = ((vector float) MERGE_S16(h, b01, b41));
    x1 = ((vector float) MERGE_S16(l, b01, b41));
    x2 = ((vector float) MERGE_S16(h, b11, b51));
    x3 = ((vector float) MERGE_S16(l, b11, b51));
    x4 = ((vector float) MERGE_S16(h, b21, b61));
    x5 = ((vector float) MERGE_S16(l, b21, b61));
    x6 = ((vector float) MERGE_S16(h, b31, b71));
    x7 = ((vector float) MERGE_S16(l, b31, b71));

    b00 = ((vector float) MERGE_S16(h, x0, x4));
    b10 = ((vector float) MERGE_S16(l, x0, x4));
    b20 = ((vector float) MERGE_S16(h, x1, x5));
    b30 = ((vector float) MERGE_S16(l, x1, x5));
    b40 = ((vector float) MERGE_S16(h, x2, x6));
    b50 = ((vector float) MERGE_S16(l, x2, x6));
    b60 = ((vector float) MERGE_S16(h, x3, x7));
    b70 = ((vector float) MERGE_S16(l, x3, x7));

#undef MERGE_S16
    /* }}} */

    /* Some of the initial calculations can be done as vector short
     * before conversion to vector float.  The following code section
     * takes advantage of this. */

    /* fdct rows {{{ */
    x0 = ((vector float) vec_add(vs16(b00), vs16(b70)));
    x7 = ((vector float) vec_sub(vs16(b00), vs16(b70)));
    x1 = ((vector float) vec_add(vs16(b10), vs16(b60)));
    x6 = ((vector float) vec_sub(vs16(b10), vs16(b60)));
    x2 = ((vector float) vec_add(vs16(b20), vs16(b50)));
    x5 = ((vector float) vec_sub(vs16(b20), vs16(b50)));
    x3 = ((vector float) vec_add(vs16(b30), vs16(b40)));
    x4 = ((vector float) vec_sub(vs16(b30), vs16(b40)));

    b70 = ((vector float) vec_add(vs16(x0), vs16(x3)));
    b10 = ((vector float) vec_add(vs16(x1), vs16(x2)));

    b00 = ((vector float) vec_add(vs16(b70), vs16(b10)));
    b40 = ((vector float) vec_sub(vs16(b70), vs16(b10)));

#define CTF0(n)                                                    \
    b ## n ## 1 = ((vector float) vec_unpackl(vs16(b ## n ## 0))); \
    b ## n ## 0 = ((vector float) vec_unpackh(vs16(b ## n ## 0))); \
    b ## n ## 1 = vec_ctf(vs32(b ## n ## 1), 0);                   \
    b ## n ## 0 = vec_ctf(vs32(b ## n ## 0), 0)

    CTF0(0);
    CTF0(4);

    b20 = ((vector float) vec_sub(vs16(x0), vs16(x3)));
    b60 = ((vector float) vec_sub(vs16(x1), vs16(x2)));

    CTF0(2);
    CTF0(6);

#undef CTF0

    x0 = vec_add(b60, b20);
    x1 = vec_add(b61, b21);

    cnst = LD_W2;
    x0   = vec_madd(cnst, x0, mzero);
    x1   = vec_madd(cnst, x1, mzero);
    cnst = LD_W1;
    b20  = vec_madd(cnst, b20, x0);
    b21  = vec_madd(cnst, b21, x1);
    cnst = LD_W0;
    b60  = vec_madd(cnst, b60, x0);
    b61  = vec_madd(cnst, b61, x1);

#define CTFX(x, b)                                  \
    b ## 0 = ((vector float) vec_unpackh(vs16(x))); \
    b ## 1 = ((vector float) vec_unpackl(vs16(x))); \
    b ## 0 = vec_ctf(vs32(b ## 0), 0);              \
    b ## 1 = vec_ctf(vs32(b ## 1), 0)

    CTFX(x4, b7);
    CTFX(x5, b5);
    CTFX(x6, b3);
    CTFX(x7, b1);

#undef CTFX

    x0   = vec_add(b70, b10);
    x1   = vec_add(b50, b30);
    x2   = vec_add(b70, b30);
    x3   = vec_add(b50, b10);
    x8   = vec_add(x2, x3);
    cnst = LD_W3;
    x8   = vec_madd(cnst, x8, mzero);

    cnst = LD_W8;
    x0   = vec_madd(cnst, x0, mzero);
    cnst = LD_W9;
    x1   = vec_madd(cnst, x1, mzero);
    cnst = LD_WA;
    x2   = vec_madd(cnst, x2, x8);
    cnst = LD_WB;
    x3   = vec_madd(cnst, x3, x8);

    cnst = LD_W4;
    b70  = vec_madd(cnst, b70, x0);
    cnst = LD_W5;
    b50  = vec_madd(cnst, b50, x1);
    cnst = LD_W6;
    b30  = vec_madd(cnst, b30, x1);
    cnst = LD_W7;
    b10  = vec_madd(cnst, b10, x0);

    b70 = vec_add(b70, x2);
    b50 = vec_add(b50, x3);
    b30 = vec_add(b30, x2);
    b10 = vec_add(b10, x3);

    x0   = vec_add(b71, b11);
    x1   = vec_add(b51, b31);
    x2   = vec_add(b71, b31);
    x3   = vec_add(b51, b11);
    x8   = vec_add(x2, x3);
    cnst = LD_W3;
    x8   = vec_madd(cnst, x8, mzero);

    cnst = LD_W8;
    x0   = vec_madd(cnst, x0, mzero);
    cnst = LD_W9;
    x1   = vec_madd(cnst, x1, mzero);
    cnst = LD_WA;
    x2   = vec_madd(cnst, x2, x8);
    cnst = LD_WB;
    x3   = vec_madd(cnst, x3, x8);

    cnst = LD_W4;
    b71  = vec_madd(cnst, b71, x0);
    cnst = LD_W5;
    b51  = vec_madd(cnst, b51, x1);
    cnst = LD_W6;
    b31  = vec_madd(cnst, b31, x1);
    cnst = LD_W7;
    b11  = vec_madd(cnst, b11, x0);

    b71 = vec_add(b71, x2);
    b51 = vec_add(b51, x3);
    b31 = vec_add(b31, x2);
    b11 = vec_add(b11, x3);
    /* }}} */

    /* 8x8 matrix transpose (vector float[8][2]) {{{ */
    x0 = VEC_FMERGEL(b00, b20);
    x1 = VEC_FMERGEH(b00, b20);
    x2 = VEC_FMERGEL(b10, b30);
    x3 = VEC_FMERGEH(b10, b30);

    b00 = VEC_FMERGEH(x1, x3);
    b10 = VEC_FMERGEL(x1, x3);
    b20 = VEC_FMERGEH(x0, x2);
    b30 = VEC_FMERGEL(x0, x2);

    x4 = VEC_FMERGEL(b41, b61);
    x5 = VEC_FMERGEH(b41, b61);
    x6 = VEC_FMERGEL(b51, b71);
    x7 = VEC_FMERGEH(b51, b71);

    b41 = VEC_FMERGEH(x5, x7);
    b51 = VEC_FMERGEL(x5, x7);
    b61 = VEC_FMERGEH(x4, x6);
    b71 = VEC_FMERGEL(x4, x6);

    x0 = VEC_FMERGEL(b01, b21);
    x1 = VEC_FMERGEH(b01, b21);
    x2 = VEC_FMERGEL(b11, b31);
    x3 = VEC_FMERGEH(b11, b31);

    x4 = VEC_FMERGEL(b40, b60);
    x5 = VEC_FMERGEH(b40, b60);
    x6 = VEC_FMERGEL(b50, b70);
    x7 = VEC_FMERGEH(b50, b70);

    b40 = VEC_FMERGEH(x1, x3);
    b50 = VEC_FMERGEL(x1, x3);
    b60 = VEC_FMERGEH(x0, x2);
    b70 = VEC_FMERGEL(x0, x2);

    b01 = VEC_FMERGEH(x5, x7);
    b11 = VEC_FMERGEL(x5, x7);
    b21 = VEC_FMERGEH(x4, x6);
    b31 = VEC_FMERGEL(x4, x6);
    /* }}} */

    FDCTCOL(b00, b10, b20, b30, b40, b50, b60, b70);
    FDCTCOL(b01, b11, b21, b31, b41, b51, b61, b71);

    /* round, convert back to short {{{ */
#define CTS(n)                                                  \
    b ## n ## 0 = vec_round(b ## n ## 0);                       \
    b ## n ## 1 = vec_round(b ## n ## 1);                       \
    b ## n ## 0 = ((vector float) vec_cts(b ## n ## 0, 0));     \
    b ## n ## 1 = ((vector float) vec_cts(b ## n ## 1, 0));     \
    b ## n ## 0 = ((vector float) vec_pack(vs32(b ## n ## 0),   \
                                           vs32(b ## n ## 1))); \
    vec_st(vs16(b ## n ## 0), 0, bp)

    bp = (vector signed short *) block;
    CTS(0);
    bp++;
    CTS(1);
    bp++;
    CTS(2);
    bp++;
    CTS(3);
    bp++;
    CTS(4);
    bp++;
    CTS(5);
    bp++;
    CTS(6);
    bp++;
    CTS(7);

#undef CTS
    /* }}} */
}
Example 26
void cfft2(unsigned int n,float x[][2],float y[][2],float w[][2], float sign)
{

    /*
       altivec version of cfft2 from Petersen and Arbenz book, "Intro.
       to Parallel Computing", Oxford Univ. Press, 2003, Section 3.6
                                            wpp 14. Dec. 2003
    */

    int jb,jc,jd,jw,k,k2,k4,lj,m,j,mj,mj2,pass,tgle;
    float rp,up,wr[4] __attribute((aligned(16)));
    float wu[4] __attribute((aligned(16)));
    float *a,*b,*c,*d;
    const vector float vminus = (vector float) {
        -0.,0.,-0.,0.
    };
    const vector float vzero  = (vector float) {
        0.,0.,0.,0.
    };
    const vector unsigned char pv3201 =
    (vector unsigned char) {
        4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11
    };
    vector float V0,V1,V2,V3,V4,V5,V6,V7;
    vector float V8,V9,V10,V11,V12,V13,V14,V15;

    if(n<=1) {
        y[0][0] = x[0][0];
        y[0][1] = x[0][1];
        return;
    }
    m    = (int) (log((float) n)/log(1.99));
    mj   = 1;
    mj2  = 2;
    lj   = n/2;
    /* first pass thru data: x -> y */
    for(j=0; j<lj; j++) {
        jb = n/2+j;
        jc  = j*mj2;
        jd = jc + 1;
        rp = w[j][0];
        up = w[j][1];
        if(sign<0.0) up = -up;
        y[jd][0] = rp*(x[j][0] - x[jb][0]) - up*(x[j][1] - x[jb][1]);
        y[jd][1] = up*(x[j][0] - x[jb][0]) + rp*(x[j][1] - x[jb][1]);
        y[jc][0] = x[j][0] + x[jb][0];
        y[jc][1] = x[j][1] + x[jb][1];
    }
    if(n==2) return;
    /* next pass is mj = 2 */
    mj  = 2;
    mj2 = 4;
    lj  = n/4;
    a = (float *)&y[0][0];
    b = (float *)&y[n/2][0];
    c = (float *)&x[0][0];
    d = (float *)&x[mj][0];
    if(n==4) {
        c = (float *)&y[0][0];
        d = (float *)&y[mj][0];
    }
    for(j=0; j<lj; j++) {
        jw = j*mj;
        jc = j*mj2;
        jd = 2*jc;
        rp = w[jw][0];
        up = w[jw][1];
        if(sign<0.0) up = -up;
        wr[0] = rp;
        wr[1] = rp;
        wr[2] = rp;
        wr[3] = rp;
        wu[0] = up;
        wu[1] = up;
        wu[2] = up;
        wu[3] = up;
        V6 = vec_ld(0,wr);
        V7 = vec_ld(0,wu);
        V7 = vec_xor(V7,vminus);
        V0 = vec_ld(0,(vector float *) (a+jc));
        V1 = vec_ld(0,(vector float *) (b+jc));
        V2 = vec_add(V0,V1);                         /* a + b */
        vec_st(V2,0,(vector float *) (c+jd));     /* store c */
        V3 = vec_sub(V0,V1);                         /* a - b */
        V4 = vec_perm(V3,V3,pv3201);
        V0 = vec_madd(V6,V3,vzero);
        V1 = vec_madd(V7,V4,vzero);
        V2 = vec_add(V0,V1);                         /* w*(a - b) */
        vec_st(V2,0,(vector float*) (d+jd));         /* store d */
    }
    if(n==4) return;
    mj  *= 2;
    mj2  = 2*mj;
    lj   = n/mj2;
    tgle = 0;
    for(pass=2; pass<m-1; pass++) {
        if(tgle) {
            a = (float *)&y[0][0];
            b = (float *)&y[n/2][0];
            c = (float *)&x[0][0];
            d = (float *)&x[mj][0];
            tgle = 0;
        } else {
            a = (float *)&x[0][0];
            b = (float *)&x[n/2][0];
            c = (float *)&y[0][0];
            d = (float *)&y[mj][0];
            tgle = 1;
        }
        for(j=0; j<lj; j++) {
            jw = j*mj;
            jc = j*mj2;
            jd = 2*jc;
            rp = w[jw][0];
            up = w[jw][1];
            if(sign<0.0) up = -up;
            wr[0] = rp;
            wr[1] = rp;
            wr[2] = rp;
            wr[3] = rp;
            wu[0] = up;
            wu[1] = up;
            wu[2] = up;
            wu[3] = up;
            V6 = vec_ld(0,wr);
            V7 = vec_ld(0,wu);
            V7 = vec_xor(V7,vminus);
            for(k=0; k<mj; k+=4) {
                k2   = 2*k;
                k4 = k2+4;
                V0 = vec_ld(0,(vector float *) (a+jc+k2));
                V1 = vec_ld(0,(vector float *) (b+jc+k2));
                V2 = vec_add(V0,V1);                        /* a + b */
                vec_st(V2,0,(vector float*) (c+jd+k2));   /* store c */
                V3 = vec_sub(V0,V1);                        /* a - b */
                V4 = vec_perm(V3,V3,pv3201);
                V0 = vec_madd(V6,V3,vzero);
                V1 = vec_madd(V7,V4,vzero);
                V2 = vec_add(V0,V1);                        /* w*(a - b) */
                vec_st(V2,0,(vector float *) (d+jd+k2));    /* store d */
                V8 = vec_ld(0,(vector float *) (a+jc+k4));
                V9 = vec_ld(0,(vector float *) (b+jc+k4));
                V10 = vec_add(V8,V9);		        /* a + b */
                vec_st(V10,0,(vector float *) (c+jd+k4));   /* store c */
                V11 = vec_sub(V8,V9);		        /* a - b */
                V12 = vec_perm(V11,V11,pv3201);
                V8  = vec_madd(V6,V11,vzero);
                V9  = vec_madd(V7,V12,vzero);
                V10 = vec_add(V8,V9);		        /* w*(a - b) */
                vec_st(V10,0,(vector float *) (d+jd+k4));   /* store d */
            }
        }
        mj  *= 2;
        mj2  = 2*mj;
        lj   = n/mj2;
    }
    /* last pass thru data: in-place if previous in y */
    c = (float *)&y[0][0];
    d = (float *)&y[n/2][0];
    if(tgle) {
        a = (float *)&y[0][0];
        b = (float *)&y[n/2][0];
    } else {
        a = (float *)&x[0][0];
        b = (float *)&x[n/2][0];
    }
    for(k=0; k<(n/2); k+=4) {
        k2 = 2*k;
        k4 = k2+4;
        V0 = vec_ld(0,(vector float *) (a+k2));
        V1 = vec_ld(0,(vector float *) (b+k2));
        V2 = vec_add(V0,V1);                      /* a + b */
        vec_st(V2,0,(vector float*) (c+k2));      /* store c */
        V3 = vec_sub(V0,V1);                      /* a - b */
        vec_st(V3,0,(vector float *) (d+k2));     /* store d */
        V4 = vec_ld(0,(vector float *) (a+k4));
        V5 = vec_ld(0,(vector float *) (b+k4));
        V6 = vec_add(V4,V5);                      /* a + b */
        vec_st(V6,0,(vector float *) (c+k4));     /* store c */
        V7 = vec_sub(V4,V5);                      /* a - b */
        vec_st(V7,0,(vector float *) (d+k4));     /* store d */
    }
}

// LLVM LOCAL begin
// Implementations of sin() and cos() may vary slightly in the accuracy of
// their results, typically only in the least significant bit.  Round to make
// the results consistent across platforms.
typedef union {
    double d;
    unsigned long long ll;
} dbl_ll_union;
static double LLVMsin(double d) {
    dbl_ll_union u;
    u.d = sin(d);
    u.ll = (u.ll + 1) & ~1ULL;
    return u.d;
}
static double LLVMcos(double d) {
    dbl_ll_union u;
    u.d = cos(d);
    u.ll = (u.ll + 1) & ~1ULL;
    return u.d;
}
// LLVM LOCAL end

void cffti(int n, float w[][2])
{

    /* initialization routine for cfft2: computes
            cos(twopi*k),sin(twopi*k) for k=0..n/2-1
       - the "twiddle factors" for a binary radix FFT */

    int i,n2;
    float aw,arg,pi;
    pi = 3.141592653589793;
    n2 = n/2;
    aw = 2.0*pi/((float)n);
    for(i=0; i<n2; i++) {
        arg   = aw*((float)i);
        w[i][0] = LLVMcos(arg);
        w[i][1] = LLVMsin(arg);
    }
}
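A minimal usage sketch for the two routines above (a hypothetical driver, assuming the float[n][2] (re, im) layout and the 16-byte alignment that the vec_ld/vec_st calls require): cffti() fills the twiddle-factor table, then cfft2() transforms x, leaving the result in y.

void cfft2_usage_sketch(void)
{
    enum { N = 8 };
    float x[N][2]   __attribute__((aligned(16)));
    float y[N][2]   __attribute__((aligned(16)));
    float w[N/2][2] __attribute__((aligned(16)));
    int i;
    for (i = 0; i < N; i++) { x[i][0] = (float)i; x[i][1] = 0.0f; }
    cffti(N, w);               /* twiddle factors for a radix-2 FFT    */
    cfft2(N, x, y, w, 1.0f);   /* sign selects the transform direction */
}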
void 
nb_kernel310_ppc_altivec  (int *             p_nri,
                       int               iinr[],
                       int               jindex[],
                       int               jjnr[],
                       int               shift[],
                       float             shiftvec[],
                       float             fshift[],
                       int               gid[],
                       float             pos[],
                       float             faction[],
                       float             charge[],
                       float *           p_facel,
                       float *           p_krf,
                       float *           p_crf,
                       float             Vc[],
                       int               type[],
                       int *             p_ntype,
                       float             vdwparam[],
                       float             Vvdw[],
                       float *           p_tabscale,
                       float             VFtab[],
                       float             invsqrta[],
                       float             dvda[],
                       float *           p_gbtabscale,
                       float             GBtab[],
                       int *             p_nthreads,
                       int *             count,
                       void *            mtx,
                       int *             outeriter,
                       int *             inneriter,
					   float *           work)
{
	vector float ix,iy,iz,shvec;
	vector float vfacel,tsc,fs,fs2,nul;
	vector float dx,dy,dz;
	vector float Vvdwtot,vctot,qq,iq,c6,c12,VVc,FFc;
	vector float fix,fiy,fiz;
	vector float tmp1,tmp2,tmp3,tmp4;
	vector float rinv,r,rinvsq,rsq,rinvsix,Vvdw6,Vvdw12;

	int n,k,ii,is3,ii3,ntiA,nj0,nj1;
	int jnra,jnrb,jnrc,jnrd;
	int j3a,j3b,j3c,j3d;
	int nri, ntype, nouter, ninner;
	int tja,tjb,tjc,tjd;
#ifdef GMX_THREADS
	int nthreads, nn0, nn1;
#endif

    nouter   = 0;
    ninner   = 0;
    nri      = *p_nri;
    ntype    = *p_ntype;
	nul=vec_zero();
	vfacel=load_float_and_splat(p_facel);
	tsc=load_float_and_splat(p_tabscale);

#ifdef GMX_THREADS
    nthreads = *p_nthreads;
	do {
		gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx);
		nn0              = *count;
		nn1              = nn0+(nri-nn0)/(2*nthreads)+3;
		*count           = nn1;
		gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx);
		if(nn1>nri) nn1=nri;
		for(n=nn0; (n<nn1); n++) {
#if 0
		} /* maintain correct indentation even with conditional left braces */
#endif
#else /* without gmx_threads */
		for(n=0;n<nri;n++) {
#endif  
			is3        = 3*shift[n];
			shvec      = load_xyz(shiftvec+is3);
			ii         = iinr[n];
			ii3        = 3*ii;
			ix         = load_xyz(pos+ii3);
			Vvdwtot     = nul;
			vctot      = nul;
			fix        = nul;
			fiy        = nul;
			fiz        = nul;
			ix         = vec_add(ix,shvec);
			nj0        = jindex[n];
			nj1        = jindex[n+1];
			splat_xyz_to_vectors(ix,&ix,&iy,&iz);
			ntiA       = 2*ntype*type[ii];
			iq        = vec_madd(load_float_and_splat(charge+ii),vfacel,nul);

			for(k=nj0; k<(nj1-3); k+=4) {
				jnra            = jjnr[k];
				jnrb            = jjnr[k+1];
				jnrc            = jjnr[k+2];
				jnrd            = jjnr[k+3];
				j3a             = 3*jnra;
				j3b             = 3*jnrb;
				j3c             = 3*jnrc;
				j3d             = 3*jnrd;
				transpose_4_to_3(load_xyz(pos+j3a),
								 load_xyz(pos+j3b),
								 load_xyz(pos+j3c),
								 load_xyz(pos+j3d),&dx,&dy,&dz);
				dx              = vec_sub(ix,dx);
				dy              = vec_sub(iy,dy);
				dz              = vec_sub(iz,dz);
				rsq             = vec_madd(dx,dx,nul);
				rsq             = vec_madd(dy,dy,rsq);
				rsq             = vec_madd(dz,dz,rsq);
				rinv            = do_invsqrt(rsq);
				rinvsq          = vec_madd(rinv,rinv,nul);
				r               = vec_madd(rinv,rsq,nul);
				rinvsix         = vec_madd(rinvsq,rinvsq,nul);
				rinvsix         = vec_madd(rinvsix,rinvsq,nul);
				tja             = ntiA+2*type[jnra];
				tjb             = ntiA+2*type[jnrb];
				tjc             = ntiA+2*type[jnrc];
				tjd             = ntiA+2*type[jnrd];
				qq = vec_madd(load_4_float(charge+jnra,charge+jnrb,
										   charge+jnrc,charge+jnrd),iq,nul);
				load_4_pair(vdwparam+tja,vdwparam+tjb,vdwparam+tjc,vdwparam+tjd,&c6,&c12);
				do_4_ctable_coul(VFtab,vec_madd(r,tsc,nul),&VVc,&FFc);
				fs2             = vec_madd(qq,FFc,nul);   /* fijC */
				vctot           = vec_madd(qq,VVc,vctot);
				Vvdw6            = vec_madd(c6,rinvsix,nul);
				Vvdw12           = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),
										   nul);
				fs              = vec_madd(vec_twelve(),Vvdw12,nul);
				fs              = vec_nmsub(vec_six(),Vvdw6,fs);
				fs              = vec_madd(fs,rinv,nul);
				Vvdwtot          = vec_add(Vvdwtot,Vvdw12);
				fs              = vec_nmsub(fs2,tsc,fs);
				fs              = vec_madd(fs,rinv,nul);
				Vvdwtot          = vec_sub(Vvdwtot,Vvdw6);
				fix             = vec_madd(fs,dx,fix); /* +=fx */
				fiy             = vec_madd(fs,dy,fiy); /* +=fy */
				fiz             = vec_madd(fs,dz,fiz); /* +=fz */
				dx              = vec_nmsub(dx,fs,nul); /* -fx */
				dy              = vec_nmsub(dy,fs,nul); /* -fy */
				dz              = vec_nmsub(dz,fs,nul); /* -fz */
				transpose_3_to_4(dx,dy,dz,&tmp1,&tmp2,&tmp3,&tmp4);
				add_xyz_to_mem(faction+j3a,tmp1);
				add_xyz_to_mem(faction+j3b,tmp2);
				add_xyz_to_mem(faction+j3c,tmp3);
				add_xyz_to_mem(faction+j3d,tmp4);
			}
			if(k<(nj1-1)) {
				jnra            = jjnr[k];
				jnrb            = jjnr[k+1];
				j3a             = 3*jnra;
				j3b             = 3*jnrb;
				transpose_2_to_3(load_xyz(pos+j3a),
								 load_xyz(pos+j3b),&dx,&dy,&dz);
				dx              = vec_sub(ix,dx);
				dy              = vec_sub(iy,dy);
				dz              = vec_sub(iz,dz);
				rsq             = vec_madd(dx,dx,nul);
				rsq             = vec_madd(dy,dy,rsq);
				rsq             = vec_madd(dz,dz,rsq);
				zero_highest_2_elements_in_vector(&rsq);
				rinv            = do_invsqrt(rsq);
				zero_highest_2_elements_in_vector(&rinv);
				rinvsq          = vec_madd(rinv,rinv,nul);
				r               = vec_madd(rinv,rsq,nul);
				rinvsix         = vec_madd(rinvsq,rinvsq,nul);
				rinvsix         = vec_madd(rinvsix,rinvsq,nul);
				tja             = ntiA+2*type[jnra];
				tjb             = ntiA+2*type[jnrb];
				qq = vec_madd(load_2_float(charge+jnra,charge+jnrb),iq,nul);
				load_2_pair(vdwparam+tja,vdwparam+tjb,&c6,&c12);
				do_2_ctable_coul(VFtab,vec_madd(r,tsc,nul),&VVc,&FFc);
				fs2             = vec_madd(qq,FFc,nul);   /* fijC */
				vctot           = vec_madd(qq,VVc,vctot);
				Vvdw6            = vec_madd(c6,rinvsix,nul);
				Vvdw12           = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),
										   nul);
				fs              = vec_madd(vec_twelve(),Vvdw12,nul);
				fs              = vec_nmsub(vec_six(),Vvdw6,fs);
				Vvdwtot          = vec_add(Vvdwtot,Vvdw12);
				fs              = vec_madd(fs,rinv,nul);
				fs              = vec_nmsub(fs2,tsc,fs);
				fs              = vec_madd(fs,rinv,nul);
				Vvdwtot          = vec_sub(Vvdwtot,Vvdw6);
				fix             = vec_madd(fs,dx,fix); /* +=fx */
				fiy             = vec_madd(fs,dy,fiy); /* +=fy */
				fiz             = vec_madd(fs,dz,fiz); /* +=fz */
				dx              = vec_nmsub(dx,fs,nul); /* -fx */
				dy              = vec_nmsub(dy,fs,nul); /* -fy */
				dz              = vec_nmsub(dz,fs,nul); /* -fz */
				transpose_3_to_2(dx,dy,dz,&tmp1,&tmp2);
				add_xyz_to_mem(faction+j3a,tmp1);
				add_xyz_to_mem(faction+j3b,tmp2);
				k              += 2;
			}
			if((nj1-nj0) & 0x1) {
				jnra            = jjnr[k];
				j3a             = 3*jnra;
				transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz);
				dx              = vec_sub(ix,dx);
				dy              = vec_sub(iy,dy);
				dz              = vec_sub(iz,dz);
				rsq             = vec_madd(dx,dx,nul);
				rsq             = vec_madd(dy,dy,rsq);
				rsq             = vec_madd(dz,dz,rsq);
				zero_highest_3_elements_in_vector(&rsq);
				rinv            = do_invsqrt(rsq);
				zero_highest_3_elements_in_vector(&rinv);
				rinvsq          = vec_madd(rinv,rinv,nul);
				r               = vec_madd(rinv,rsq,nul);
				rinvsix         = vec_madd(rinvsq,rinvsq,nul);
				rinvsix         = vec_madd(rinvsix,rinvsq,nul);
				tja             = ntiA+2*type[jnra];
				qq = vec_madd(load_1_float(charge+jnra),iq,nul);
				load_1_pair(vdwparam+tja,&c6,&c12);
				do_1_ctable_coul(VFtab,vec_madd(r,tsc,nul),&VVc,&FFc);
				fs2             = vec_madd(qq,FFc,nul);   /* fijC */
				vctot           = vec_madd(qq,VVc,vctot);
				Vvdw6            = vec_madd(c6,rinvsix,nul);
				Vvdw12           = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),
										   nul);
				fs              = vec_madd(vec_twelve(),Vvdw12,nul);
				fs              = vec_nmsub(vec_six(),Vvdw6,fs);
				fs              = vec_madd(fs,rinv,nul);
				Vvdwtot          = vec_add(Vvdwtot,Vvdw12);
				fs              = vec_nmsub(fs2,tsc,fs);
				fs              = vec_madd(fs,rinv,nul);
				Vvdwtot          = vec_sub(Vvdwtot,Vvdw6);
				fix             = vec_madd(fs,dx,fix); /* +=fx */
				fiy             = vec_madd(fs,dy,fiy); /* +=fy */
				fiz             = vec_madd(fs,dz,fiz); /* +=fz */
				dx              = vec_nmsub(dx,fs,nul); /* -fx */
				dy              = vec_nmsub(dy,fs,nul); /* -fy */
				dz              = vec_nmsub(dz,fs,nul); /* -fz */
				transpose_3_to_1(dx,dy,dz,&tmp1);
				add_xyz_to_mem(faction+j3a,tmp1);
			}
			/* update outer data */
			transpose_3_to_4(fix,fiy,fiz,&tmp1,&tmp2,&tmp3,&tmp4);
			tmp1 = vec_add(tmp1,tmp3);
			tmp2 = vec_add(tmp2,tmp4);
			tmp1 = vec_add(tmp1,tmp2);

			add_xyz_to_mem(faction+ii3,tmp1);
			add_xyz_to_mem(fshift+is3,tmp1);

			add_vector_to_float(Vc+gid[n],vctot);
			add_vector_to_float(Vvdw+gid[n],Vvdwtot);
			ninner += nj1 - nj0;
		}
#ifdef GMX_THREADS
		nouter += nn1 - nn0;
	} while (nn1<nri);
#else
	nouter = nri;
#endif
	*outeriter = nouter;
	*inneriter = ninner;
}
Example 28
void test1() {
// CHECK-LABEL: define void @test1
// CHECK-LE-LABEL: define void @test1

  res_vf = vec_abs(vf);
// CHECK: call <4 x float> @llvm.fabs.v4f32(<4 x float> %{{[0-9]*}})
// CHECK-LE: call <4 x float> @llvm.fabs.v4f32(<4 x float> %{{[0-9]*}})

  dummy();
// CHECK: call void @dummy()
// CHECK-LE: call void @dummy()

  res_vd = vec_add(vd, vd);
// CHECK: fadd <2 x double>
// CHECK-LE: fadd <2 x double>

  res_vd = vec_and(vbll, vd);
// CHECK: and <2 x i64>
// CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>
// CHECK-LE: and <2 x i64>
// CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>

  res_vd = vec_and(vd, vbll);
// CHECK: and <2 x i64>
// CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>
// CHECK-LE: and <2 x i64>
// CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>

  res_vd = vec_and(vd, vd);
// CHECK: and <2 x i64>
// CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>
// CHECK-LE: and <2 x i64>
// CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>

  dummy();
// CHECK: call void @dummy()
// CHECK-LE: call void @dummy()

  res_vd = vec_andc(vbll, vd);
// CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64>
// CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1>
// CHECK: and <2 x i64>
// CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>
// CHECK-LE: bitcast <2 x double> %{{[0-9]*}} to <2 x i64>
// CHECK-LE: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1>
// CHECK-LE: and <2 x i64>
// CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>

  dummy();
// CHECK: call void @dummy()
// CHECK-LE: call void @dummy()

  res_vd = vec_andc(vd, vbll);
// CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64>
// CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1>
// CHECK: and <2 x i64>
// CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>
// CHECK-LE: bitcast <2 x double> %{{[0-9]*}} to <2 x i64>
// CHECK-LE: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1>
// CHECK-LE: and <2 x i64>
// CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>

  dummy();
// CHECK: call void @dummy()

  res_vd = vec_andc(vd, vd);
// CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64>
// CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1>
// CHECK: and <2 x i64>
// CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>

  dummy();
// CHECK: call void @dummy()
// CHECK-LE: call void @dummy()

  res_vd = vec_ceil(vd);
// CHECK: call <2 x double> @llvm.ceil.v2f64(<2 x double> %{{[0-9]*}})
// CHECK-LE: call <2 x double> @llvm.ceil.v2f64(<2 x double> %{{[0-9]*}})

  res_vf = vec_ceil(vf);
// CHECK: call <4 x float> @llvm.ceil.v4f32(<4 x float> %{{[0-9]*}})
// CHECK-LE: call <4 x float> @llvm.ceil.v4f32(<4 x float> %{{[0-9]*}})

  res_vbll = vec_cmpeq(vd, vd);
// CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpeqdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}})
// CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpeqdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}})

  res_vbi = vec_cmpeq(vf, vf);
// CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpeqsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}})
// CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpeqsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}})

  res_vbll = vec_cmpge(vd, vd);
// CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}})
// CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}})

  res_vbi = vec_cmpge(vf, vf);
// CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}})
// CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}})

  res_vbll = vec_cmpgt(vd, vd);
// CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}})
// CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}})

  res_vbi = vec_cmpgt(vf, vf);
// CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}})
// CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}})

  res_vbll = vec_cmple(vd, vd);
// CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}})
// CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}})

  res_vbi = vec_cmple(vf, vf);
// CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}})
// CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}})

  res_vbll = vec_cmplt(vd, vd);
// CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}})
// CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}})

  res_vbi = vec_cmplt(vf, vf);
// CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}})
// CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}})

  /* vec_cpsgn */
  res_vf = vec_cpsgn(vf, vf);
// CHECK: call <4 x float> @llvm.copysign.v4f32(<4 x float> %{{.+}}, <4 x float> %{{.+}})
// CHECK-LE: call <4 x float> @llvm.copysign.v4f32(<4 x float> %{{.+}}, <4 x float> %{{.+}})

  res_vd = vec_cpsgn(vd, vd);
// CHECK: call <2 x double> @llvm.copysign.v2f64(<2 x double> %{{.+}}, <2 x double> %{{.+}})
// CHECK-LE: call <2 x double> @llvm.copysign.v2f64(<2 x double> %{{.+}}, <2 x double> %{{.+}})

  /* vec_div */
  res_vsll = vec_div(vsll, vsll);
// CHECK: sdiv <2 x i64>
// CHECK-LE: sdiv <2 x i64>

  res_vull = vec_div(vull, vull);
// CHECK: udiv <2 x i64>
// CHECK-LE: udiv <2 x i64>

  res_vf = vec_div(vf, vf);
// CHECK: fdiv <4 x float>
// CHECK-LE: fdiv <4 x float>

  res_vd = vec_div(vd, vd);
// CHECK: fdiv <2 x double>
// CHECK-LE: fdiv <2 x double>

  /* vec_max */
  res_vf = vec_max(vf, vf);
// CHECK: @llvm.ppc.vsx.xvmaxsp
// CHECK-LE: @llvm.ppc.vsx.xvmaxsp

  res_vd = vec_max(vd, vd);
// CHECK: @llvm.ppc.vsx.xvmaxdp
// CHECK-LE: @llvm.ppc.vsx.xvmaxdp

  res_vf = vec_vmaxfp(vf, vf);
// CHECK: @llvm.ppc.vsx.xvmaxsp
// CHECK-LE: @llvm.ppc.vsx.xvmaxsp

  /* vec_min */
  res_vf = vec_min(vf, vf);
// CHECK: @llvm.ppc.vsx.xvminsp
// CHECK-LE: @llvm.ppc.vsx.xvminsp

  res_vd = vec_min(vd, vd);
// CHECK: @llvm.ppc.vsx.xvmindp
// CHECK-LE: @llvm.ppc.vsx.xvmindp

  res_vf = vec_vminfp(vf, vf);
// CHECK: @llvm.ppc.vsx.xvminsp
// CHECK-LE: @llvm.ppc.vsx.xvminsp

  res_d = __builtin_vsx_xsmaxdp(d, d);
// CHECK: @llvm.ppc.vsx.xsmaxdp
// CHECK-LE: @llvm.ppc.vsx.xsmaxdp

  res_d = __builtin_vsx_xsmindp(d, d);
// CHECK: @llvm.ppc.vsx.xsmindp
// CHECK-LE: @llvm.ppc.vsx.xsmindp

  /* vec_perm */
  res_vsll = vec_perm(vsll, vsll, vuc);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vull = vec_perm(vull, vull, vuc);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vbll = vec_perm(vbll, vbll, vuc);
// CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8>
// CHECK-LE: xor <16 x i8>
// CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8>

  res_vf = vec_round(vf);
// CHECK: call <4 x float> @llvm.round.v4f32(<4 x float>
// CHECK-LE: call <4 x float> @llvm.round.v4f32(<4 x float>

  res_vd = vec_round(vd);
// CHECK: call <2 x double> @llvm.round.v2f64(<2 x double>
// CHECK-LE: call <2 x double> @llvm.round.v2f64(<2 x double>

  res_vd = vec_perm(vd, vd, vuc);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vd = vec_splat(vd, 1);
// CHECK: [[T1:%.+]] = bitcast <2 x double> {{.+}} to <4 x i32>
// CHECK: [[T2:%.+]] = bitcast <2 x double> {{.+}} to <4 x i32>
// CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8>
// CHECK-LE: xor <16 x i8>
// CHECK-LE: [[T1:%.+]] = bitcast <2 x double> {{.+}} to <4 x i32>
// CHECK-LE: [[T2:%.+]] = bitcast <2 x double> {{.+}} to <4 x i32>
// CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8>

  res_vbll = vec_splat(vbll, 1);
// CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8>
// CHECK-LE: xor <16 x i8>
// CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8>

  res_vsll =  vec_splat(vsll, 1);
// CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8>
// CHECK-LE: xor <16 x i8>
// CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8>

  res_vull =  vec_splat(vull, 1);
// CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8>
// CHECK-LE: xor <16 x i8>
// CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8>

  res_vsi = vec_pack(vsll, vsll);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vui = vec_pack(vull, vull);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vbi = vec_pack(vbll, vbll);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vsll = vec_vperm(vsll, vsll, vuc);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vull = vec_vperm(vull, vull, vuc);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vd = vec_vperm(vd, vd, vuc);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  /* vec_vsx_ld */

  res_vsi = vec_vsx_ld(0, &vsi);
// CHECK: @llvm.ppc.vsx.lxvw4x
// CHECK-LE: @llvm.ppc.vsx.lxvw4x

  res_vui = vec_vsx_ld(0, &vui);
// CHECK: @llvm.ppc.vsx.lxvw4x
// CHECK-LE: @llvm.ppc.vsx.lxvw4x

  res_vf = vec_vsx_ld (0, &vf);
// CHECK: @llvm.ppc.vsx.lxvw4x
// CHECK-LE: @llvm.ppc.vsx.lxvw4x

  res_vsll = vec_vsx_ld(0, &vsll);
// CHECK: @llvm.ppc.vsx.lxvd2x
// CHECK-LE: @llvm.ppc.vsx.lxvd2x

  res_vull = vec_vsx_ld(0, &vull);
// CHECK: @llvm.ppc.vsx.lxvd2x
// CHECK-LE: @llvm.ppc.vsx.lxvd2x

  res_vd = vec_vsx_ld(0, &vd);
// CHECK: @llvm.ppc.vsx.lxvd2x
// CHECK-LE: @llvm.ppc.vsx.lxvd2x

  res_vull = vec_vsx_ld(0, &vull);
// CHECK: @llvm.ppc.vsx.lxvd2x
// CHECK-LE: @llvm.ppc.vsx.lxvd2x

  res_vd = vec_vsx_ld(0, &vd);
// CHECK: @llvm.ppc.vsx.lxvd2x
// CHECK-LE: @llvm.ppc.vsx.lxvd2x

  res_vss = vec_vsx_ld(0, &vss);
// CHECK: @llvm.ppc.vsx.lxvw4x
// CHECK-LE: @llvm.ppc.vsx.lxvw4x

  res_vss = vec_vsx_ld(0, &ss);
// CHECK: @llvm.ppc.vsx.lxvw4x
// CHECK-LE: @llvm.ppc.vsx.lxvw4x

  res_vus = vec_vsx_ld(0, &vus);
// CHECK: @llvm.ppc.vsx.lxvw4x
// CHECK-LE: @llvm.ppc.vsx.lxvw4x

  res_vus = vec_vsx_ld(0, &us);
// CHECK: @llvm.ppc.vsx.lxvw4x
// CHECK-LE: @llvm.ppc.vsx.lxvw4x

  res_vbc = vec_vsx_ld(0, &vbc);
// CHECK: @llvm.ppc.vsx.lxvw4x
// CHECK-LE: @llvm.ppc.vsx.lxvw4x

  res_vsc = vec_vsx_ld(0, &vsc);
// CHECK: @llvm.ppc.vsx.lxvw4x
// CHECK-LE: @llvm.ppc.vsx.lxvw4x

  res_vuc = vec_vsx_ld(0, &vuc);
// CHECK: @llvm.ppc.vsx.lxvw4x
// CHECK-LE: @llvm.ppc.vsx.lxvw4x

  res_vsc = vec_vsx_ld(0, &sc);
// CHECK: @llvm.ppc.vsx.lxvw4x
// CHECK-LE: @llvm.ppc.vsx.lxvw4x

  res_vuc = vec_vsx_ld(0, &uc);
// CHECK: @llvm.ppc.vsx.lxvw4x
// CHECK-LE: @llvm.ppc.vsx.lxvw4x

  /* vec_vsx_st */

  vec_vsx_st(vsi, 0, &res_vsi);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vsi, 0, &res_si);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vui, 0, &res_vui);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vui, 0, &res_ui);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vf, 0, &res_vf);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vsll, 0, &res_vsll);
// CHECK: @llvm.ppc.vsx.stxvd2x
// CHECK-LE: @llvm.ppc.vsx.stxvd2x

  vec_vsx_st(vull, 0, &res_vull);
// CHECK: @llvm.ppc.vsx.stxvd2x
// CHECK-LE: @llvm.ppc.vsx.stxvd2x

  vec_vsx_st(vd, 0, &res_vd);
// CHECK: @llvm.ppc.vsx.stxvd2x
// CHECK-LE: @llvm.ppc.vsx.stxvd2x

  vec_vsx_st(vss, 0, &res_vss);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vss, 0, &res_ss);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vus, 0, &res_vus);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vus, 0, &res_us);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vsc, 0, &res_vsc);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vsc, 0, &res_sc);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vuc, 0, &res_vuc);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vuc, 0, &res_uc);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vbc, 0, &res_vbc);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vbc, 0, &res_sc);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vbc, 0, &res_uc);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  /* vec_and */
  res_vsll = vec_and(vsll, vsll);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vsll = vec_and(vbll, vsll);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vsll = vec_and(vsll, vbll);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vull = vec_and(vull, vull);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vull = vec_and(vbll, vull);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vull = vec_and(vull, vbll);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vbll = vec_and(vbll, vbll);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  /* vec_vand */
  res_vsll = vec_vand(vsll, vsll);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vsll = vec_vand(vbll, vsll);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vsll = vec_vand(vsll, vbll);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vull = vec_vand(vull, vull);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vull = vec_vand(vbll, vull);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vull = vec_vand(vull, vbll);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vbll = vec_vand(vbll, vbll);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  /* vec_andc */
  res_vsll = vec_andc(vsll, vsll);
// CHECK: xor <2 x i64>
// CHECK: and <2 x i64>
// CHECK-LE: xor <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vsll = vec_andc(vbll, vsll);
// CHECK: xor <2 x i64>
// CHECK: and <2 x i64>
// CHECK-LE: xor <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vsll = vec_andc(vsll, vbll);
// CHECK: xor <2 x i64>
// CHECK: and <2 x i64>
// CHECK-LE: xor <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vull = vec_andc(vull, vull);
// CHECK: xor <2 x i64>
// CHECK: and <2 x i64>
// CHECK-LE: xor <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vull = vec_andc(vbll, vull);
// CHECK: xor <2 x i64>
// CHECK: and <2 x i64>
// CHECK-LE: xor <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vull = vec_andc(vull, vbll);
// CHECK: xor <2 x i64>
// CHECK: and <2 x i64>
// CHECK-LE: xor <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vbll = vec_andc(vbll, vbll);
// CHECK: xor <2 x i64>
// CHECK: and <2 x i64>
// CHECK-LE: xor <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vf = vec_floor(vf);
// CHECK: call <4 x float> @llvm.floor.v4f32(<4 x float> %{{[0-9]+}})
// CHECK-LE: call <4 x float> @llvm.floor.v4f32(<4 x float> %{{[0-9]+}})

  res_vd = vec_floor(vd);
// CHECK: call <2 x double> @llvm.floor.v2f64(<2 x double> %{{[0-9]+}})
// CHECK-LE: call <2 x double> @llvm.floor.v2f64(<2 x double> %{{[0-9]+}})

  res_vf = vec_madd(vf, vf, vf);
// CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}})
// CHECK-LE: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}})

  res_vd = vec_madd(vd, vd, vd);
// CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}})
// CHECK-LE: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}})

  /* vec_mergeh */
  res_vsll = vec_mergeh(vsll, vsll);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vsll = vec_mergeh(vsll, vbll);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vsll = vec_mergeh(vbll, vsll);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vull = vec_mergeh(vull, vull);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vull = vec_mergeh(vull, vbll);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vull = vec_mergeh(vbll, vull);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  /* vec_mergel */
  res_vsll = vec_mergel(vsll, vsll);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vsll = vec_mergel(vsll, vbll);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vsll = vec_mergel(vbll, vsll);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vull = vec_mergel(vull, vull);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vull = vec_mergel(vull, vbll);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vull = vec_mergel(vbll, vull);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  /* vec_msub */
  res_vf = vec_msub(vf, vf, vf);
// CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}}
// CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float>
// CHECK-LE: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}}
// CHECK-LE-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float>

  res_vd = vec_msub(vd, vd, vd);
// CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}}
// CHECK-NEXT: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double>
// CHECK-LE: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}}
// CHECK-LE-NEXT: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double>

  res_vsll = vec_mul(vsll, vsll);
// CHECK: mul <2 x i64>
// CHECK-LE: mul <2 x i64>

  res_vull = vec_mul(vull, vull);
// CHECK: mul <2 x i64>
// CHECK-LE: mul <2 x i64>

  res_vf = vec_mul(vf, vf);
// CHECK: fmul <4 x float> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK-LE: fmul <4 x float> %{{[0-9]+}}, %{{[0-9]+}}

  res_vd = vec_mul(vd, vd);
// CHECK: fmul <2 x double> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK-LE: fmul <2 x double> %{{[0-9]+}}, %{{[0-9]+}}

  res_vf = vec_nearbyint(vf);
// CHECK: call <4 x float> @llvm.round.v4f32(<4 x float> %{{[0-9]+}})
// CHECK-LE: call <4 x float> @llvm.round.v4f32(<4 x float> %{{[0-9]+}})

  res_vd = vec_nearbyint(vd);
// CHECK: call <2 x double> @llvm.round.v2f64(<2 x double> %{{[0-9]+}})
// CHECK-LE: call <2 x double> @llvm.round.v2f64(<2 x double> %{{[0-9]+}})

  res_vf = vec_nmadd(vf, vf, vf);
// CHECK: [[FM:[0-9]+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}})
// CHECK-NEXT: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %[[FM]]
// CHECK-LE: [[FM:[0-9]+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}})
// CHECK-LE-NEXT: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %[[FM]]

  res_vd = vec_nmadd(vd, vd, vd);
// CHECK: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}})
// CHECK-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]]
// CHECK-LE: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}})
// CHECK-LE-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]]

  res_vf = vec_nmsub(vf, vf, vf);
// CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}}
// CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float>
// CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}}
// CHECK-LE: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}}
// CHECK-LE-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float>
// CHECK-LE: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}}

  res_vd = vec_nmsub(vd, vd, vd);
// CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}}
// CHECK-NEXT: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double>
// CHECK-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]]
// CHECK-LE: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}}
// CHECK-LE-NEXT: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double>
// CHECK-LE-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]]

  /* vec_nor */
  res_vsll = vec_nor(vsll, vsll);
// CHECK: or <2 x i64>
// CHECK: xor <2 x i64>
// CHECK-LE: or <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vull = vec_nor(vull, vull);
// CHECK: or <2 x i64>
// CHECK: xor <2 x i64>
// CHECK-LE: or <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vull = vec_nor(vbll, vbll);
// CHECK: or <2 x i64>
// CHECK: xor <2 x i64>
// CHECK-LE: or <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vd = vec_nor(vd, vd);
// CHECK: bitcast <2 x double> %{{[0-9]+}} to <2 x i64>
// CHECK: [[OR:%.+]] = or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK-NEXT: xor <2 x i64> [[OR]], <i64 -1, i64 -1>
// CHECK-LE: bitcast <2 x double> %{{[0-9]+}} to <2 x i64>
// CHECK-LE: [[OR:%.+]] = or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK-LE-NEXT: xor <2 x i64> [[OR]], <i64 -1, i64 -1>

  /* vec_or */
  res_vsll = vec_or(vsll, vsll);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  res_vsll = vec_or(vbll, vsll);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  res_vsll = vec_or(vsll, vbll);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  res_vull = vec_or(vull, vull);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  res_vull = vec_or(vbll, vull);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  res_vull = vec_or(vull, vbll);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  res_vbll = vec_or(vbll, vbll);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  res_vd = vec_or(vd, vd);
// CHECK: bitcast <2 x double> %{{[0-9]+}} to <2 x i64>
// CHECK: or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK-LE: bitcast <2 x double> %{{[0-9]+}} to <2 x i64>
// CHECK-LE: or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}

  res_vd = vec_or(vbll, vd);
// CHECK: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64>
// CHECK: [[T2:%.+]] = or <2 x i64> %{{[0-9]+}}, [[T1]]
// CHECK: bitcast <2 x i64> [[T2]] to <2 x double>
// CHECK-LE: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64>
// CHECK-LE: [[T2:%.+]] = or <2 x i64> %{{[0-9]+}}, [[T1]]
// CHECK-LE: bitcast <2 x i64> [[T2]] to <2 x double>

  res_vd = vec_or(vd, vbll);
// CHECK: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64>
// CHECK: [[T2:%.+]] = or <2 x i64> [[T1]], %{{[0-9]+}}
// CHECK: bitcast <2 x i64> [[T2]] to <2 x double>
// CHECK-LE: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64>
// CHECK-LE: [[T2:%.+]] = or <2 x i64> [[T1]], %{{[0-9]+}}
// CHECK-LE: bitcast <2 x i64> [[T2]] to <2 x double>

  res_vf = vec_re(vf);
// CHECK: call <4 x float> @llvm.ppc.vsx.xvresp(<4 x float>
// CHECK-LE: call <4 x float> @llvm.ppc.vsx.xvresp(<4 x float>

  res_vd = vec_re(vd);
// CHECK: call <2 x double> @llvm.ppc.vsx.xvredp(<2 x double>
// CHECK-LE: call <2 x double> @llvm.ppc.vsx.xvredp(<2 x double>

  res_vf = vec_rint(vf);
// CHECK: call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %{{[0-9]+}})
// CHECK-LE: call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %{{[0-9]+}})

  res_vd = vec_rint(vd);
// CHECK: call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %{{[0-9]+}})
// CHECK-LE: call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %{{[0-9]+}})

  res_vf = vec_rsqrte(vf);
// CHECK: call <4 x float> @llvm.ppc.vsx.xvrsqrtesp(<4 x float> %{{[0-9]+}})
// CHECK-LE: call <4 x float> @llvm.ppc.vsx.xvrsqrtesp(<4 x float> %{{[0-9]+}})

  res_vd = vec_rsqrte(vd);
// CHECK: call <2 x double> @llvm.ppc.vsx.xvrsqrtedp(<2 x double> %{{[0-9]+}})
// CHECK-LE: call <2 x double> @llvm.ppc.vsx.xvrsqrtedp(<2 x double> %{{[0-9]+}})

  dummy();
// CHECK: call void @dummy()
// CHECK-LE: call void @dummy()

  res_vd = vec_sel(vd, vd, vbll);
// CHECK: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1>
// CHECK: and <2 x i64> %{{[0-9]+}},
// CHECK: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK: or <2 x i64>
// CHECK: bitcast <2 x i64> %{{[0-9]+}} to <2 x double>
// CHECK-LE: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1>
// CHECK-LE: and <2 x i64> %{{[0-9]+}},
// CHECK-LE: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK-LE: or <2 x i64>
// CHECK-LE: bitcast <2 x i64> %{{[0-9]+}} to <2 x double>

  dummy();
// CHECK: call void @dummy()
// CHECK-LE: call void @dummy()

  res_vd = vec_sel(vd, vd, vull);
// CHECK: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1>
// CHECK: and <2 x i64> %{{[0-9]+}},
// CHECK: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK: or <2 x i64>
// CHECK: bitcast <2 x i64> %{{[0-9]+}} to <2 x double>
// CHECK-LE: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1>
// CHECK-LE: and <2 x i64> %{{[0-9]+}},
// CHECK-LE: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK-LE: or <2 x i64>
// CHECK-LE: bitcast <2 x i64> %{{[0-9]+}} to <2 x double>

  res_vf = vec_sqrt(vf);
// CHECK: call <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{[0-9]+}})
// CHECK-LE: call <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{[0-9]+}})

  res_vd = vec_sqrt(vd);
// CHECK: call <2 x double> @llvm.sqrt.v2f64(<2 x double> %{{[0-9]+}})
// CHECK-LE: call <2 x double> @llvm.sqrt.v2f64(<2 x double> %{{[0-9]+}})

  res_vd = vec_sub(vd, vd);
// CHECK: fsub <2 x double> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK-LE: fsub <2 x double> %{{[0-9]+}}, %{{[0-9]+}}

  res_vf = vec_trunc(vf);
// CHECK: call <4 x float> @llvm.trunc.v4f32(<4 x float> %{{[0-9]+}})
// CHECK-LE: call <4 x float> @llvm.trunc.v4f32(<4 x float> %{{[0-9]+}})

  res_vd = vec_trunc(vd);
// CHECK: call <2 x double> @llvm.trunc.v2f64(<2 x double> %{{[0-9]+}})
// CHECK-LE: call <2 x double> @llvm.trunc.v2f64(<2 x double> %{{[0-9]+}})

  /* vec_vor */
  res_vsll = vec_vor(vsll, vsll);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  res_vsll = vec_vor(vbll, vsll);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  res_vsll = vec_vor(vsll, vbll);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  res_vull = vec_vor(vull, vull);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  res_vull = vec_vor(vbll, vull);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  res_vull = vec_vor(vull, vbll);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  res_vbll = vec_vor(vbll, vbll);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  /* vec_xor */
  res_vsll = vec_xor(vsll, vsll);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vsll = vec_xor(vbll, vsll);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vsll = vec_xor(vsll, vbll);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vull = vec_xor(vull, vull);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vull = vec_xor(vbll, vull);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vull = vec_xor(vull, vbll);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vbll = vec_xor(vbll, vbll);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  dummy();
// CHECK: call void @dummy()
// CHECK-LE: call void @dummy()

  res_vd = vec_xor(vd, vd);
// CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK: bitcast <2 x i64> [[X1]] to <2 x double>
// CHECK-LE: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK-LE: bitcast <2 x i64> [[X1]] to <2 x double>

  dummy();
// CHECK: call void @dummy()
// CHECK-LE: call void @dummy()

  res_vd = vec_xor(vd, vbll);
// CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK: bitcast <2 x i64> [[X1]] to <2 x double>
// CHECK-LE: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK-LE: bitcast <2 x i64> [[X1]] to <2 x double>

  dummy();
// CHECK: call void @dummy()
// CHECK-LE: call void @dummy()

  res_vd = vec_xor(vbll, vd);
// CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK: bitcast <2 x i64> [[X1]] to <2 x double>
// CHECK-LE: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK-LE: bitcast <2 x i64> [[X1]] to <2 x double>

  /* vec_vxor */
  res_vsll = vec_vxor(vsll, vsll);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vsll = vec_vxor(vbll, vsll);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vsll = vec_vxor(vsll, vbll);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vull = vec_vxor(vull, vull);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vull = vec_vxor(vbll, vull);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vull = vec_vxor(vull, vbll);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vbll = vec_vxor(vbll, vbll);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vsll = vec_cts(vd, 0);
// CHECK: fmul <2 x double>
// CHECK: fptosi <2 x double> %{{.*}} to <2 x i64>
// CHECK-LE: fmul <2 x double>
// CHECK-LE: fptosi <2 x double> %{{.*}} to <2 x i64>

  res_vsll = vec_cts(vd, 31);
// CHECK: fmul <2 x double>
// CHECK: fptosi <2 x double> %{{.*}} to <2 x i64>
// CHECK-LE: fmul <2 x double>
// CHECK-LE: fptosi <2 x double> %{{.*}} to <2 x i64>

  res_vsll = vec_ctu(vd, 0);
// CHECK: fmul <2 x double>
// CHECK: fptoui <2 x double> %{{.*}} to <2 x i64>
// CHECK-LE: fmul <2 x double>
// CHECK-LE: fptoui <2 x double> %{{.*}} to <2 x i64>

  res_vsll = vec_ctu(vd, 31);
// CHECK: fmul <2 x double>
// CHECK: fptoui <2 x double> %{{.*}} to <2 x i64>
// CHECK-LE: fmul <2 x double>
// CHECK-LE: fptoui <2 x double> %{{.*}} to <2 x i64>

  res_vd = vec_ctf(vsll, 0);
// CHECK: sitofp <2 x i64> %{{.*}} to <2 x double>
// CHECK: fmul <2 x double>
// CHECK-LE: sitofp <2 x i64> %{{.*}} to <2 x double>
// CHECK-LE: fmul <2 x double>

  res_vd = vec_ctf(vsll, 31);
// CHECK: sitofp <2 x i64> %{{.*}} to <2 x double>
// CHECK: fmul <2 x double>
// CHECK-LE: sitofp <2 x i64> %{{.*}} to <2 x double>
// CHECK-LE: fmul <2 x double>

  res_vd = vec_ctf(vull, 0);
// CHECK: uitofp <2 x i64> %{{.*}} to <2 x double>
// CHECK: fmul <2 x double>
// CHECK-LE: uitofp <2 x i64> %{{.*}} to <2 x double>
// CHECK-LE: fmul <2 x double>

  res_vd = vec_ctf(vull, 31);
// CHECK: uitofp <2 x i64> %{{.*}} to <2 x double>
// CHECK: fmul <2 x double>
// CHECK-LE: uitofp <2 x i64> %{{.*}} to <2 x double>
// CHECK-LE: fmul <2 x double>
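
  /* Note on the conversions above: the second operand of vec_cts/vec_ctu/vec_ctf
   * is a 5-bit scale factor b. vec_cts(vd, b) computes (long long)(vd * 2^b) and
   * vec_ctf(v, b) computes (double)v * 2^-b, which is why every conversion is
   * paired with an fmul in the generated IR. */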
}
void 
nb_kernel010nf_ppc_altivec(int *             p_nri,
                       int               iinr[],
                       int               jindex[],
                       int               jjnr[],
                       int               shift[],
                       float             shiftvec[],
                       float             fshift[],
                       int               gid[],
                       float             pos[],
                       float             faction[],
                       float             charge[],
                       float *           p_facel,
                       float *           p_krf,
                       float *           p_crf,
                       float             Vc[],
                       int               type[],
                       int *             p_ntype,
                       float             vdwparam[],
                       float             Vvdw[],
                       float *           p_tabscale,
                       float             VFtab[],
                       float             invsqrta[],
                       float             dvda[],
                       float *           p_gbtabscale,
                       float             GBtab[],
                       int *             p_nthreads,
                       int *             count,
                       void *            mtx,
                       int *             outeriter,
                       int *             inneriter,
                       float *           work)
{
	vector float ix,iy,iz,shvec;
	vector float nul;
	vector float dx,dy,dz;
	vector float Vvdwtot,c6,c12;
	vector float rinvsq,rsq,rinvsix;
  
	int n,k,ii,is3,ii3,nj0,nj1;
	int jnra,jnrb,jnrc,jnrd;
	int j3a,j3b,j3c,j3d;
	int nri, ntype, nouter, ninner;
	int ntiA,tja,tjb,tjc,tjd;
#ifdef GMX_THREAD_SHM_FDECOMP
	int nthreads, nn0, nn1;
#endif
  
    nouter   = 0;
    ninner   = 0;
    nri      = *p_nri;
    ntype    = *p_ntype;
	nul=vec_zero();

#ifdef GMX_THREAD_SHM_FDECOMP
    nthreads = *p_nthreads;
	do {
		tMPI_Thread_mutex_lock((tMPI_Thread_mutex_t *)mtx);
		nn0              = *count;
		nn1              = nn0+(nri-nn0)/(2*nthreads)+3;
		*count           = nn1;
		tMPI_Thread_mutex_unlock((tMPI_Thread_mutex_t *)mtx);
		if(nn1>nri) nn1=nri;
		for(n=nn0; (n<nn1); n++) {
#if 0
		} /* maintain correct indentation even with conditional left braces */
#endif
#else /* without tMPI_Threads */
		for(n=0;n<nri;n++) {
#endif
			is3        = 3*shift[n];
			shvec      = load_xyz(shiftvec+is3);
			ii         = iinr[n];
			ii3        = 3*ii;
			ix         = load_xyz(pos+ii3);
			Vvdwtot     = nul;
			ix         = vec_add(ix,shvec);    
			nj0        = jindex[n];
			nj1        = jindex[n+1];
			splat_xyz_to_vectors(ix,&ix,&iy,&iz);
			ntiA       = 2*ntype*type[ii];
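			/* Inner loop over the neighbour list of particle i, unrolled four
			 * j particles at a time so each SIMD lane handles one i-j pair. */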
			for(k=nj0; k<(nj1-3); k+=4) {
				jnra            = jjnr[k];
				jnrb            = jjnr[k+1];
				jnrc            = jjnr[k+2];
				jnrd            = jjnr[k+3];
				j3a             = 3*jnra;
				j3b             = 3*jnrb;
				j3c             = 3*jnrc;
				j3d             = 3*jnrd;
				transpose_4_to_3(load_xyz(pos+j3a),
								 load_xyz(pos+j3b),
								 load_xyz(pos+j3c),
								 load_xyz(pos+j3d),&dx,&dy,&dz);
				dx              = vec_sub(ix,dx);
				dy              = vec_sub(iy,dy);
				dz              = vec_sub(iz,dz);
				rsq             = vec_madd(dx,dx,nul);
				rsq             = vec_madd(dy,dy,rsq);
				rsq             = vec_madd(dz,dz,rsq);
				rinvsq          = do_recip(rsq);
				rinvsix         = vec_madd(rinvsq,rinvsq,nul);
				rinvsix         = vec_madd(rinvsix,rinvsq,nul);
				tja             = ntiA+2*type[jnra];
				tjb             = ntiA+2*type[jnrb];
				tjc             = ntiA+2*type[jnrc];
				tjd             = ntiA+2*type[jnrd];
				load_4_pair(vdwparam+tja,vdwparam+tjb,vdwparam+tjc,vdwparam+tjd,&c6,&c12);
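				/* Lennard-Jones energy only (this is the "nf" = no-force kernel):
				 * Vvdw += c12/r^12 - c6/r^6, with 1/r^6 built as (1/r^2)^3. */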
				Vvdwtot          = vec_nmsub(c6,rinvsix,Vvdwtot);
				Vvdwtot          = vec_madd(c12,
										   vec_madd(rinvsix,rinvsix,nul),
										   Vvdwtot);
			}
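			/* Epilogue: process the one to three leftover j particles, zeroing
			 * the unused SIMD lanes before they enter the accumulation. */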
			if(k<(nj1-1)) {
				jnra            = jjnr[k];
				jnrb            = jjnr[k+1];
				j3a             = 3*jnra;
				j3b             = 3*jnrb;
				transpose_2_to_3(load_xyz(pos+j3a),
								 load_xyz(pos+j3b),&dx,&dy,&dz);
				dx              = vec_sub(ix,dx);
				dy              = vec_sub(iy,dy);
				dz              = vec_sub(iz,dz);
				rsq             = vec_madd(dx,dx,nul);
				rsq             = vec_madd(dy,dy,rsq);
				rsq             = vec_madd(dz,dz,rsq);
				rinvsq          = do_recip(rsq);
				zero_highest_2_elements_in_vector(&rinvsq);
				rinvsix         = vec_madd(rinvsq,rinvsq,nul);
				rinvsix         = vec_madd(rinvsix,rinvsq,nul);
				tja             = ntiA+2*type[jnra];
				tjb             = ntiA+2*type[jnrb];
				load_2_pair(vdwparam+tja,vdwparam+tjb,&c6,&c12);
				Vvdwtot          = vec_nmsub(c6,rinvsix,Vvdwtot);
				Vvdwtot          = vec_madd(c12,
										   vec_madd(rinvsix,rinvsix,nul),
										   Vvdwtot);
				k              += 2;
			}
			if((nj1-nj0) & 0x1) {
				jnra            = jjnr[k];
				j3a             = 3*jnra;
				transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz);
				dx              = vec_sub(ix,dx);
				dy              = vec_sub(iy,dy);
				dz              = vec_sub(iz,dz);
				rsq             = vec_madd(dx,dx,nul);
				rsq             = vec_madd(dy,dy,rsq);
				rsq             = vec_madd(dz,dz,rsq);
				rinvsq          = do_recip(rsq);
				zero_highest_3_elements_in_vector(&rinvsq);
				rinvsix         = vec_madd(rinvsq,rinvsq,nul);
				rinvsix         = vec_madd(rinvsix,rinvsq,nul);
				tja             = ntiA+2*type[jnra];
				load_1_pair(vdwparam+tja,&c6,&c12);
				Vvdwtot          = vec_nmsub(c6,rinvsix,Vvdwtot);
				Vvdwtot          = vec_madd(c12,
										   vec_madd(rinvsix,rinvsix,nul),
										   Vvdwtot);
			}
			/* update outer data */
			add_vector_to_float(Vvdw+gid[n],Vvdwtot);
			ninner += nj1 - nj0;
		}
#ifdef GMX_THREAD_SHM_FDECOMP
		nouter += nn1 - nn0;
	} while (nn1<nri);
#else
	nouter = nri;
#endif
	*outeriter = nouter;
	*inneriter = ninner;
}
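/* A minimal scalar sketch of the energy-only Lennard-Jones loop that
 * nb_kernel010nf_ppc_altivec() above vectorizes four pairs at a time
 * (illustrative only; the _ref name is not part of GROMACS):
 */
static float lj_energy_ref(int nj, const int jjnr[], const float pos[],
                           float ix, float iy, float iz,
                           const int type[], int ntiA, const float vdwparam[])
{
    float Vvdw = 0.0f;
    int   k;

    for (k = 0; k < nj; k++)
    {
        int   j3      = 3*jjnr[k];
        float dx      = ix - pos[j3];
        float dy      = iy - pos[j3+1];
        float dz      = iz - pos[j3+2];
        float rinvsq  = 1.0f/(dx*dx + dy*dy + dz*dz);
        float rinvsix = rinvsq*rinvsq*rinvsq;          /* 1/r^6 */
        int   tj      = ntiA + 2*type[jjnr[k]];
        float c6      = vdwparam[tj];
        float c12     = vdwparam[tj+1];

        Vvdw         += c12*rinvsix*rinvsix - c6*rinvsix;
    }
    return Vvdw;
}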
	int nri, ntype, nouter, ninner;
	int tja,tjb,tjc,tjd;
#ifdef GMX_THREADS
	int nthreads, nn0, nn1;
#endif

    nouter   = 0;
    ninner   = 0;
    nri      = *p_nri;
    ntype    = *p_ntype;
	nul=vec_zero();
	vfacel=load_float_and_splat(p_facel);
	vkrf=load_float_and_splat(p_krf);
	vcrf=load_float_and_splat(p_crf);
	ii         = iinr[0];
	iqO        = vec_madd(load_float_and_splat(charge+ii),vfacel,nul);
	iqH        = vec_madd(load_float_and_splat(charge+ii+1),vfacel,nul);
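	/* Prologue of what appears to be a water-type kernel: the outer-particle
	 * layout is assumed identical for every list entry, so the oxygen charge
	 * (charge[ii]) and hydrogen charge (charge[ii+1]) are pre-scaled by facel
	 * once, outside the loops. */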
	ntiA       = 2*ntype*type[ii];
  
#ifdef GMX_THREADS
    nthreads = *p_nthreads;
	do {
		gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx);
		nn0              = *count;
		nn1              = nn0+(nri-nn0)/(2*nthreads)+3;
		*count           = nn1;
		gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx);
		if(nn1>nri) nn1=nri;
		for(n=nn0; (n<nn1); n++) {
#if 0
		} /* maintain correct indentation even with conditional left braces */