static void vector_fmul_window_altivec(float *dst, const float *src0, const float *src1, const float *win, int len) { vector float zero, t0, t1, s0, s1, wi, wj; const vector unsigned char reverse = vcprm(3,2,1,0); int i,j; dst += len; win += len; src0+= len; zero = (vector float)vec_splat_u32(0); for(i=-len*4, j=len*4-16; i<0; i+=16, j-=16) { s0 = vec_ld(i, src0); s1 = vec_ld(j, src1); wi = vec_ld(i, win); wj = vec_ld(j, win); s1 = vec_perm(s1, s1, reverse); wj = vec_perm(wj, wj, reverse); t0 = vec_madd(s0, wj, zero); t0 = vec_nmsub(s1, wi, t0); t1 = vec_madd(s0, wi, zero); t1 = vec_madd(s1, wj, t1); t1 = vec_perm(t1, t1, reverse); vec_st(t0, i, dst); vec_st(t1, j, dst); } }
void audio_convert_s16_to_float_altivec(float *out, const int16_t *in, size_t samples, float gain) { const vector float gain_vec = vec_splats(gain); const vector float zero_vec = vec_splats(0.0f); // Unaligned loads/store is a bit expensive, so we optimize for the good path (very likely). if (((uintptr_t)out & 15) + ((uintptr_t)in & 15) == 0) { size_t i; for (i = 0; i + 8 <= samples; i += 8, in += 8, out += 8) { vector signed short input = vec_ld(0, in); vector signed int hi = vec_unpackh(input); vector signed int lo = vec_unpackl(input); vector float out_hi = vec_madd(vec_ctf(hi, 15), gain_vec, zero_vec); vector float out_lo = vec_madd(vec_ctf(lo, 15), gain_vec, zero_vec); vec_st(out_hi, 0, out); vec_st(out_lo, 16, out); } audio_convert_s16_to_float_C(out, in, samples - i, gain); } else audio_convert_s16_to_float_C(out, in, samples, gain); }
/* Using FChoose() here would mean allocating tmp space for 2M-1 paths; * instead we use the fact that E(i) is itself the necessary normalization * factor, and implement FChoose's algorithm here for an on-the-fly * calculation. */ static inline int select_e(ESL_RANDOMNESS *rng, const P7_OPROFILE *om, const P7_OMX *ox, int i, int *ret_k) { int Q = p7O_NQF(ox->M); double sum = 0.0; double roll = esl_random(rng); double norm = 1.0 / ox->xmx[i*p7X_NXCELLS+p7X_E]; /* all M, D already scaled exactly the same */ vector float xEv = esl_vmx_set_float(norm); vector float zerov = (vector float) vec_splat_u32(0); union { vector float v; float p[4]; } u; int q,r; while (1) { for (q = 0; q < Q; q++) { u.v = vec_madd(ox->dpf[i][q*3 + p7X_M], xEv, zerov); for (r = 0; r < 4; r++) { sum += u.p[r]; if (roll < sum) { *ret_k = r*Q + q + 1; return p7T_M;} } u.v = vec_madd(ox->dpf[i][q*3 + p7X_D], xEv, zerov); for (r = 0; r < 4; r++) { sum += u.p[r]; if (roll < sum) { *ret_k = r*Q + q + 1; return p7T_D;} } } ESL_DASSERT1(sum > 0.99); } /*UNREACHED*/ ESL_EXCEPTION(-1, "unreached code was reached. universe collapses."); }
/* D(i,k) is reached from M(i, k-1) or D(i,k-1). */ static inline int select_d(ESL_RANDOMNESS *rng, const P7_OPROFILE *om, const P7_OMX *ox, int i, int k) { int Q = p7O_NQF(ox->M); int q = (k-1) % Q; /* (q,r) is position of the current DP cell D(i,k) */ int r = (k-1) / Q; vector float zerov; vector float mpv, dpv; vector float tmdv, tddv; union { vector float v; float p[4]; } u; float path[2]; int state[2] = { p7T_M, p7T_D }; zerov = (vector float) vec_splat_u32(0); if (q > 0) { mpv = ox->dpf[i][(q-1)*3 + p7X_M]; dpv = ox->dpf[i][(q-1)*3 + p7X_D]; tmdv = om->tfv[7*(q-1) + p7O_MD]; tddv = om->tfv[7*Q + (q-1)]; } else { mpv = vec_sld(zerov, ox->dpf[i][(Q-1)*3 + p7X_M], 12); dpv = vec_sld(zerov, ox->dpf[i][(Q-1)*3 + p7X_D], 12); tmdv = vec_sld(zerov, om->tfv[7*(Q-1) + p7O_MD], 12); tddv = vec_sld(zerov, om->tfv[8*Q-1], 12); } u.v = vec_madd(mpv, tmdv, zerov); path[0] = u.p[r]; u.v = vec_madd(dpv, tddv, zerov); path[1] = u.p[r]; esl_vec_FNorm(path, 2); return state[esl_rnd_FChoose(rng, path, 2)]; }
/* An accurate vector division routine using the reciprocal estimate and * two Newton-Raphson iterations */ static inline vector float vec_div(vector float A, vector float B) { vector float y0; vector float y1; vector float y2; vector float Q; vector float R; vector float one = (vector float) (1.0f); vector float zero = (vector float) (-0.0f); vector float mone = (vector float) (-1.0f); y0 = vec_re(B); // approximate 1/B // y1 = y0*(-(y0*B - 1.0))+y0 i.e. y0+y0*(1.0 - y0*B) y1 = vec_madd(y0,vec_nmsub(y0, B, one),y0); // REPEAT the Newton-Raphson to get the required 24 bits y2 = vec_madd(y1, vec_nmsub(y1, B, one),y1); // y2 = y1*(-(y1*B - 1.0f))+y1 i.e. y1+y1*(1.0f - y1*B) // y2 is now the correctly rounded reciprocal, and the manual considers this // OK for use in computing the remainder: Q = A*y2, R = A - B*Q Q = vec_madd(A,y2,zero); // -0.0 IEEE R = vec_nmsub(B,Q,A); // -(B*Q-A) == (A-B*Q) // final rouding adjustment return(vec_madd(R, y2, Q)); }
/* M(i,k) is reached from B(i-1), M(i-1,k-1), D(i-1,k-1), or I(i-1,k-1). */ static inline int select_m(ESL_RANDOMNESS *rng, const P7_OPROFILE *om, const P7_OMX *ox, int i, int k) { int Q = p7O_NQF(ox->M); int q = (k-1) % Q; /* (q,r) is position of the current DP cell M(i,k) */ int r = (k-1) / Q; vector float *tp = om->tfv + 7*q; /* *tp now at start of transitions to cur cell M(i,k) */ vector float xBv; vector float zerov; vector float mpv, dpv, ipv; union { vector float v; float p[4]; } u; float path[4]; int state[4] = { p7T_B, p7T_M, p7T_I, p7T_D }; xBv = esl_vmx_set_float(ox->xmx[(i-1)*p7X_NXCELLS+p7X_B]); zerov = (vector float) vec_splat_u32(0); if (q > 0) { mpv = ox->dpf[i-1][(q-1)*3 + p7X_M]; dpv = ox->dpf[i-1][(q-1)*3 + p7X_D]; ipv = ox->dpf[i-1][(q-1)*3 + p7X_I]; } else { mpv = vec_sld(zerov, ox->dpf[i-1][(Q-1)*3 + p7X_M], 12); dpv = vec_sld(zerov, ox->dpf[i-1][(Q-1)*3 + p7X_D], 12); ipv = vec_sld(zerov, ox->dpf[i-1][(Q-1)*3 + p7X_I], 12); } u.v = vec_madd(xBv, *tp, zerov); tp++; path[0] = u.p[r]; u.v = vec_madd(mpv, *tp, zerov); tp++; path[1] = u.p[r]; u.v = vec_madd(ipv, *tp, zerov); tp++; path[2] = u.p[r]; u.v = vec_madd(dpv, *tp, zerov); path[3] = u.p[r]; esl_vec_FNorm(path, 4); return state[esl_rnd_FChoose(rng, path, 4)]; }
static void vector_fmul_reverse_altivec(float *dst, const float *src0, const float *src1, int len) { int i; vector float d, s0, s1, h0, l0, s2, s3, zero = (vector float)vec_splat_u32(0); src1 += len-4; for(i=0; i<len-7; i+=8) { s1 = vec_ld(0, src1-i); // [a,b,c,d] s0 = vec_ld(0, src0+i); l0 = vec_mergel(s1, s1); // [c,c,d,d] s3 = vec_ld(-16, src1-i); h0 = vec_mergeh(s1, s1); // [a,a,b,b] s2 = vec_ld(16, src0+i); s1 = vec_mergeh(vec_mergel(l0,h0), // [d,b,d,b] vec_mergeh(l0,h0)); // [c,a,c,a] // [d,c,b,a] l0 = vec_mergel(s3, s3); d = vec_madd(s0, s1, zero); h0 = vec_mergeh(s3, s3); vec_st(d, 0, dst+i); s3 = vec_mergeh(vec_mergel(l0,h0), vec_mergeh(l0,h0)); d = vec_madd(s2, s3, zero); vec_st(d, 16, dst+i); } }
inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& a) { const vec_float4 v0 = vec_splat(v.val, 0); const vec_float4 v1 = vec_splat(v.val, 1); const vec_float4 v2 = vec_splat(v.val, 2); return v_float32x4(vec_madd(v0, m0.val, vec_madd(v1, m1.val, vec_madd(v2, m2.val, a.val)))); }
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& m3) { const vec_float4 v0 = vec_splat(v.val, 0); const vec_float4 v1 = vec_splat(v.val, 1); const vec_float4 v2 = vec_splat(v.val, 2); VSX_UNUSED(const vec_float4) v3 = vec_splat(v.val, 3); return v_float32x4(vec_madd(v0, m0.val, vec_madd(v1, m1.val, vec_madd(v2, m2.val, vec_mul(v3, m3.val))))); }
static void vector_fmul_altivec(float *dst, const float *src, int len) { int i; vector float d0, d1, s, zero = (vector float)vec_splat_u32(0); for(i=0; i<len-7; i+=8) { d0 = vec_ld(0, dst+i); s = vec_ld(0, src+i); d1 = vec_ld(16, dst+i); d0 = vec_madd(d0, s, zero); d1 = vec_madd(d1, vec_ld(16,src+i), zero); vec_st(d0, 0, dst+i); vec_st(d1, 16, dst+i); } }
// exectue vectorized structured inner dot product void dotprod_crcf_execute(dotprod_crcf _q, float complex * _x, float complex * _r) { int al; // input data alignment vector float *ar,*d; vector float s0,s1,s2,s3; union { vector float v; float w[4];} s; unsigned int nblocks; ar = (vector float*)( (int)_x & ~15); al = ((int)_x & 15)/sizeof(float); d = (vector float*)_q->h[al]; // number of blocks doubles because of complex type nblocks = (2*_q->n + al - 1)/4 + 1; // split into four vectors each with four 32-bit // partial sums. Effectively each loop iteration // operates on 16 input samples at a time. s0 = s1 = s2 = s3 = (vector float)(0); while (nblocks >= 4) { s0 = vec_madd(ar[nblocks-1],d[nblocks-1],s0); s1 = vec_madd(ar[nblocks-2],d[nblocks-2],s1); s2 = vec_madd(ar[nblocks-3],d[nblocks-3],s2); s3 = vec_madd(ar[nblocks-4],d[nblocks-4],s3); nblocks -= 4; } // fold the resulting partial sums into vector s0 s0 = vec_add(s0,s1); // s0 = s0+s1 s2 = vec_add(s2,s3); // s2 = s2+s3 s0 = vec_add(s0,s2); // s0 = s0+s2 // finish partial summing operations while (nblocks-- > 0) s0 = vec_madd(ar[nblocks],d[nblocks],s0); // move the result into the union s (effetively, // this loads the four 32-bit values in s0 into // the array w). s.v = vec_add(s0,(vector float)(0)); // sum the resulting array //*_r = s.w[0] + s.w[1] + s.w[2] + s.w[3]; *_r = (s.w[0] + s.w[2]) + (s.w[1] + s.w[3]) * _Complex_I; }
static void OSX_AudioIOProc16Bit_Altivec(SInt16 *myInBuffer, float *myOutBuffer) { register UInt32 i; float f = SOUND_BUFFER_SCALE_16BIT; const vector float gain = vec_load_ps1(&f); // multiplier const vector float mix = vec_setzero(); if (gBufferMono2Stereo) { int j=0; // TEST: OK for (i=0;i<SOUND_BUFFER_SIZE;i+=8, j+=16) { vector short int v0 = vec_ld(0, myInBuffer + i); // Load 8 shorts vector float v1 = vec_ctf((vector signed int)vec_unpackh(v0), 0); // convert to float vector float v2 = vec_ctf((vector signed int)vec_unpackl(v0), 0); // convert to float vector float v3 = vec_madd(v1, gain, mix); // scale vector float v4 = vec_madd(v2, gain, mix); // scale vector float v5 = vec_mergel(v3, v3); // v3(0,0,1,1); vector float v6 = vec_mergeh(v3, v3); // v3(2,2,3,3); vector float v7 = vec_mergel(v4, v4); // v4(0,0,1,1); vector float v8 = vec_mergeh(v4, v4); // v4(2,2,3,3); vec_st(v5, 0, myOutBuffer + j); // Store 4 floats vec_st(v6, 0, myOutBuffer + 4 + j); // Store 4 floats vec_st(v7, 0, myOutBuffer + 8 + j); // Store 4 floats vec_st(v8, 0, myOutBuffer + 12 + j); // Store 4 floats } } else { // TEST: OK for (i=0;i<SOUND_BUFFER_SIZE;i+=8) { vector short int v0 = vec_ld(0, myInBuffer + i); // Load 8 shorts vector float v1 = vec_ctf((vector signed int)vec_unpackh(v0), 0); // convert to float vector float v2 = vec_ctf((vector signed int)vec_unpackl(v0), 0); // convert to float vector float v3 = vec_madd(v1, gain, mix); // scale vector float v4 = vec_madd(v2, gain, mix); // scale vec_st(v3, 0, myOutBuffer + i); // Store 4 floats vec_st(v4, 0, myOutBuffer + 4 + i); // Store 4 floats } } }
__SIMDi _SIMD_madd_epi32(__SIMDi a, __SIMDi b, __SIMDi c) { #ifdef USE_SSE return _SIMD_add_epi32(_SIMD_mul_epi32(a,b),c); #elif defined USE_AVX return _m256_madd_ps(a,b); #elif defined USE_IBM return vec_madd(a,b); #endif }
__SIMDd _SIMD_madd_pd(__SIMDd a, __SIMDd b, __SIMDd c) { #ifdef USE_SSE return _mm_add_pd(_mm_mul_pd(a,b),c); #elif defined USE_AVX return _m256_madd_ps(a,b); #elif defined USE_IBM return vec_madd(a,b); #endif }
/* I(i,k) is reached from M(i-1, k) or I(i-1,k). */ static inline int select_i(ESL_RANDOMNESS *rng, const P7_OPROFILE *om, const P7_OMX *ox, int i, int k) { int Q = p7O_NQF(ox->M); int q = (k-1) % Q; /* (q,r) is position of the current DP cell D(i,k) */ int r = (k-1) / Q; vector float zerov; vector float mpv = ox->dpf[i-1][q*3 + p7X_M]; vector float ipv = ox->dpf[i-1][q*3 + p7X_I]; vector float *tp = om->tfv + 7*q + p7O_MI; union { vector float v; float p[4]; } u; float path[2]; int state[2] = { p7T_M, p7T_I }; zerov = (vector float) vec_splat_u32(0); u.v = vec_madd(mpv, *tp, zerov); tp++; path[0] = u.p[r]; u.v = vec_madd(ipv, *tp, zerov); path[1] = u.p[r]; esl_vec_FNorm(path, 2); return state[esl_rnd_FChoose(rng, path, 2)]; }
static vector float appbones (State *s, struct skin *skin, vector float x, vector float y, vector float z, vector float nx, vector float ny, vector float nz, vector float *np) { int j; int num_bones; int bone_index; struct abone *b; vector float vz = (vector float) vec_splat_u32 (0); vector float v, w, n; vector unsigned char S = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4<<3}; v = n = vz; w = vec_ld (0, skin->weights); num_bones = skin->boneinfo & 3; bone_index = skin->boneinfo >> 2; for (j = 0; j < num_bones; ++j) { vector float t0, t1, t2, t3, t4, t5, r0, r1, r2, r3, vw; b = &s->abones[bone_index & 0x3ff]; bone_index >>= 10; vw = vec_splat (w, 0); w = vec_slo (w, S); r0 = vec_ld ( 0, b->cm); r1 = vec_ld (16, b->cm); r2 = vec_ld (32, b->cm); r3 = vec_ld (48, b->cm); t0 = vec_madd (r0, x, r3); t1 = vec_madd (r1, y, t0); t2 = vec_madd (r2, z, t1); v = vec_madd (t2, vw, v); t3 = vec_madd (r0, nx, vz); t4 = vec_madd (r1, ny, t3); t5 = vec_madd (r2, nz, t4); n = vec_madd (t5, vw, n); } *np = n; return v; }
static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src, float mul, int len) { union { vector float v; float s[4]; } mul_u; int i; vector float src1, src2, dst1, dst2, mul_v, zero; zero = (vector float)vec_splat_u32(0); mul_u.s[0] = mul; mul_v = vec_splat(mul_u.v, 0); for(i=0; i<len; i+=8) { src1 = vec_ctf(vec_ld(0, src+i), 0); src2 = vec_ctf(vec_ld(16, src+i), 0); dst1 = vec_madd(src1, mul_v, zero); dst2 = vec_madd(src2, mul_v, zero); vec_st(dst1, 0, dst+i); vec_st(dst2, 16, dst+i); } }
void assign_add_mul_r_32(spinor32 * const R, spinor32 * const S, const float c, const int N) { #ifdef TM_USE_OMP #pragma omp parallel { #endif vector4double x0, x1, x2, x3, x4, x5, y0, y1, y2, y3, y4, y5; vector4double z0, z1, z2, z3, z4, z5, k; float *s, *r; float ALIGN32 _c; _c = c; __prefetch_by_load(S); __prefetch_by_load(R); k = vec_splats((double)_c); __alignx(16, s); __alignx(16, r); __alignx(16, S); __alignx(16, R); #ifdef TM_USE_OMP #pragma omp for #else #pragma unroll(2) #endif for(int i = 0; i < N; i++) { s=(float*)((spinor32 *) S + i); r=(float*)((spinor32 *) R + i); __prefetch_by_load(S + i + 1); __prefetch_by_stream(1, R + i + 1); x0 = vec_ld(0, r); x1 = vec_ld(0, r+4); x2 = vec_ld(0, r+8); x3 = vec_ld(0, r+12); x4 = vec_ld(0, r+16); x5 = vec_ld(0, r+20); y0 = vec_ld(0, s); y1 = vec_ld(0, s+4); y2 = vec_ld(0, s+8); y3 = vec_ld(0, s+12); y4 = vec_ld(0, s+16); y5 = vec_ld(0, s+20); z0 = vec_madd(k, y0, x0); z1 = vec_madd(k, y1, x1); z2 = vec_madd(k, y2, x2); z3 = vec_madd(k, y3, x3); z4 = vec_madd(k, y4, x4); z5 = vec_madd(k, y5, x5); vec_st(z0, 0, r); vec_st(z1, 0, r+4); vec_st(z2, 0, r+8); vec_st(z3, 0, r+12); vec_st(z4, 0, r+16); vec_st(z5, 0, r+20); } #ifdef TM_USE_OMP } /* OpenMP closing brace */ #endif return; }
__vector float func(__vector float vx) { __vector float vy; __vector float va = (__vector float) { 5.0f, 5.0f, 5.0f, 5.0f }; __vector float vb = (__vector float) { -16.0f, -16.0f, -16.0f, -16.0f }; __vector float vc = (__vector float) { -36.0f, -36.0f, -36.0f, -36.0f }; __vector float vd = (__vector float) { 64.0f, 64.0f, 64.0f, 64.0f }; __vector float ve = (__vector float) { 192.0f, 192.0f, 192.0f, 192.0f }; vy = vec_madd(va, vx, vb); vy = vec_madd(vy, vx, vc); vy = vec_madd(vy, vx, vd); vy = vec_madd(vy, vx, ve); return vy; } float calc_integral(float start, float end, float delta) { int i; float *sum; __vector float vx = (__vector float) { start+delta*0, start+delta*1, start+delta*2, start+delta*3 }; __vector float vsum = (__vector float) { 0.0f, 0.0f, 0.0f, 0.0f }; __vector float vdelta = (__vector float) { delta, delta, delta, delta }; __vector float vstep = (__vector float) { 4.0f, 4.0f, 4.0f, 4.0f }; for (i = 0; i < (end-start)/delta; i += 4) { vsum = vec_madd(func(vx), vdelta, vsum); vx = vec_madd(vdelta, vstep, vx); } sum = (float *) &vsum; return (sum[0] + sum[1] + sum[2] + sum[3]); } int main(int argc, char **argv) { float start = 0.0f; float end = 4.0f; float delta = 0.00001f; float result; printf("start = %f, end = %f\n", start, end); result = calc_integral(start, end, delta); printf("result = %f\n", result); return 0; }
void CDSPToolsOSX::Mul32(tfloat32* pDest, tfloat32 Src) { #ifdef _Mac_PowerPC vector float vSrc = (vector float)(Src, Src, Src, Src); vector float vDest = vec_ldl(0, pDest); vDest = vec_madd(vDest, vSrc, (vector float)(0, 0, 0, 0)); vec_st(vDest, 0, pDest); vDest = vec_ldl(4 * 4, pDest); vDest = vec_madd(vDest, vSrc, (vector float)(0, 0, 0, 0)); vec_st(vDest, 4 * 4, pDest); vDest = vec_ldl(8 * 4, pDest); vDest = vec_madd(vDest, vSrc, (vector float)(0, 0, 0, 0)); vec_st(vDest, 8 * 4, pDest); vDest = vec_ldl(12 * 4, pDest); vDest = vec_madd(vDest, vSrc, (vector float)(0, 0, 0, 0)); vec_st(vDest, 12 * 4, pDest); vDest = vec_ldl(16 * 4, pDest); vDest = vec_madd(vDest, vSrc, (vector float)(0, 0, 0, 0)); vec_st(vDest, 16 * 4, pDest); vDest = vec_ldl(20 * 4, pDest); vDest = vec_madd(vDest, vSrc, (vector float)(0, 0, 0, 0)); vec_st(vDest, 20 * 4, pDest); vDest = vec_ldl(24 * 4, pDest); vDest = vec_madd(vDest, vSrc, (vector float)(0, 0, 0, 0)); vec_st(vDest, 24 * 4, pDest); vDest = vec_ldl(28 * 4, pDest); vDest = vec_madd(vDest, vSrc, (vector float)(0, 0, 0, 0)); vec_st(vDest, 28 * 4, pDest); #else // _Mac_PowerPC CDSPTools::Mul32(pDest, Src); #endif // _Mac_PowerPC }
/* ************************************************************************* NAME: test_madd USAGE: test_madd(); returns: void DESCRIPTION: see how the combination multiply/add operation works REFERENCES: Ian Ollmann's Altivec Tutorial LIMITATIONS: GLOBAL VARIABLES: accessed: none modified: none FUNCTIONS CALLED: fprintf vec_madd - multiply two float vectors and add to the sum a float value all in one operation REVISION HISTORY: STR Description of Revision Author 27-Feb-11 initial coding kaj ************************************************************************* */ void test_madd(void) { vector float floatVector1 = { -17.777777, 0.0, 37.777777, 100.0}; vector float addVector = { 32.0, 32.0, 32.0, 32.0}; vector float coeffVector = { 1.8, 1.8, 1.8, 1.8}; vector float fahrenheitVector; short printfloat[FLOAT_ARRAYSIZE] __attribute__ ((aligned (16))); /* print vectors performing madd on */ fprintf(stderr,"-----------------------------------------------------------\n\n"); printVecFloats("vec_madd input vector 1", floatVector1,FLOAT_ARRAYSIZE); printVecFloats("vec_madd input vector to add", addVector,FLOAT_ARRAYSIZE); printVecFloats("vec_madd coeffvector to multiply", coeffVector,FLOAT_ARRAYSIZE); /* calculate */ fahrenheitVector = vec_madd(floatVector1,coeffVector,addVector); printVecFloats("vec_madd vector (Input*9/5+32)", fahrenheitVector,FLOAT_ARRAYSIZE); } /* test_madd */
static void vector_fmul_add_altivec(float *dst, const float *src0, const float *src1, const float *src2, int len) { int i; vector float d, s0, s1, s2, t0, t1, edges; vector unsigned char align = vec_lvsr(0,dst), mask = vec_lvsl(0, dst); for (i=0; i<len-3; i+=4) { t0 = vec_ld(0, dst+i); t1 = vec_ld(15, dst+i); s0 = vec_ld(0, src0+i); s1 = vec_ld(0, src1+i); s2 = vec_ld(0, src2+i); edges = vec_perm(t1 ,t0, mask); d = vec_madd(s0,s1,s2); t1 = vec_perm(d, edges, align); t0 = vec_perm(edges, d, align); vec_st(t1, 15, dst+i); vec_st(t0, 0, dst+i); } }
int main(int argc, char **argv) { int i; __vector float *vin = (__vector float *) in; __vector float *vout = (__vector float *) out; __vector float vin_negative; __vector unsigned int vpat; __vector float vzero = (__vector float) { 0.0f, 0.0f, 0.0f, 0.0f }; __vector float vminus = (__vector float) { -1.0f, -1.0f, -1.0f, -1.0f }; for (i = 0; i < SIZE/4; i++) { vpat = vec_cmpgt(vin[i], vzero); vin_negative = vec_madd(vin[i], vminus, vzero); vout[i] = vec_sel(vin_negative, vin[i], vpat); } for (i = 0; i < SIZE; i++) { printf("out[%02d]=%0.0f\n", i, out[i]); } return 0; }
static void ProjectDlightTexture_altivec( void ) { int i, l; vec_t origin0, origin1, origin2; float texCoords0, texCoords1; vector float floatColorVec0, floatColorVec1; vector float modulateVec, colorVec, zero; vector short colorShort; vector signed int colorInt; vector unsigned char floatColorVecPerm, modulatePerm, colorChar; vector unsigned char vSel = VECCONST_UINT8(0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff); float *texCoords; byte *colors; byte clipBits[SHADER_MAX_VERTEXES]; float texCoordsArray[SHADER_MAX_VERTEXES][2]; byte colorArray[SHADER_MAX_VERTEXES][4]; unsigned hitIndexes[SHADER_MAX_INDEXES]; int numIndexes; float scale; float radius; vec3_t floatColor; float modulate = 0.0f; if ( !backEnd.refdef.num_dlights ) { return; } // There has to be a better way to do this so that floatColor // and/or modulate are already 16-byte aligned. floatColorVecPerm = vec_lvsl(0,(float *)floatColor); modulatePerm = vec_lvsl(0,(float *)&modulate); modulatePerm = (vector unsigned char)vec_splat((vector unsigned int)modulatePerm,0); zero = (vector float)vec_splat_s8(0); for ( l = 0 ; l < backEnd.refdef.num_dlights ; l++ ) { dlight_t *dl; if ( !( tess.dlightBits & ( 1 << l ) ) ) { continue; // this surface definately doesn't have any of this light } texCoords = texCoordsArray[0]; colors = colorArray[0]; dl = &backEnd.refdef.dlights[l]; origin0 = dl->transformed[0]; origin1 = dl->transformed[1]; origin2 = dl->transformed[2]; radius = dl->radius; scale = 1.0f / radius; if(r_greyscale->integer) { float luminance; luminance = LUMA(dl->color[0], dl->color[1], dl->color[2]) * 255.0f; floatColor[0] = floatColor[1] = floatColor[2] = luminance; } else if(r_greyscale->value) { float luminance; luminance = LUMA(dl->color[0], dl->color[1], dl->color[2]) * 255.0f; floatColor[0] = LERP(dl->color[0] * 255.0f, luminance, r_greyscale->value); floatColor[1] = LERP(dl->color[1] * 255.0f, luminance, r_greyscale->value); floatColor[2] = LERP(dl->color[2] * 255.0f, luminance, r_greyscale->value); } else { floatColor[0] = dl->color[0] * 255.0f; floatColor[1] = dl->color[1] * 255.0f; floatColor[2] = dl->color[2] * 255.0f; } floatColorVec0 = vec_ld(0, floatColor); floatColorVec1 = vec_ld(11, floatColor); floatColorVec0 = vec_perm(floatColorVec0,floatColorVec0,floatColorVecPerm); for ( i = 0 ; i < tess.numVertexes ; i++, texCoords += 2, colors += 4 ) { int clip = 0; vec_t dist0, dist1, dist2; dist0 = origin0 - tess.xyz[i][0]; dist1 = origin1 - tess.xyz[i][1]; dist2 = origin2 - tess.xyz[i][2]; backEnd.pc.c_dlightVertexes++; texCoords0 = 0.5f + dist0 * scale; texCoords1 = 0.5f + dist1 * scale; if( !r_dlightBacks->integer && // dist . tess.normal[i] ( dist0 * tess.normal[i][0] + dist1 * tess.normal[i][1] + dist2 * tess.normal[i][2] ) < 0.0f ) { clip = 63; } else { if ( texCoords0 < 0.0f ) { clip |= 1; } else if ( texCoords0 > 1.0f ) { clip |= 2; } if ( texCoords1 < 0.0f ) { clip |= 4; } else if ( texCoords1 > 1.0f ) { clip |= 8; } texCoords[0] = texCoords0; texCoords[1] = texCoords1; // modulate the strength based on the height and color if ( dist2 > radius ) { clip |= 16; modulate = 0.0f; } else if ( dist2 < -radius ) { clip |= 32; modulate = 0.0f; } else { dist2 = Q_fabs(dist2); if ( dist2 < radius * 0.5f ) { modulate = 1.0f; } else { modulate = 2.0f * (radius - dist2) * scale; } } } clipBits[i] = clip; modulateVec = vec_ld(0,(float *)&modulate); modulateVec = vec_perm(modulateVec,modulateVec,modulatePerm); colorVec = vec_madd(floatColorVec0,modulateVec,zero); colorInt = vec_cts(colorVec,0); // RGBx colorShort = vec_pack(colorInt,colorInt); // RGBxRGBx colorChar = vec_packsu(colorShort,colorShort); // RGBxRGBxRGBxRGBx colorChar = vec_sel(colorChar,vSel,vSel); // RGBARGBARGBARGBA replace alpha with 255 vec_ste((vector unsigned int)colorChar,0,(unsigned int *)colors); // store color } // build a list of triangles that need light numIndexes = 0; for ( i = 0 ; i < tess.numIndexes ; i += 3 ) { int a, b, c; a = tess.indexes[i]; b = tess.indexes[i+1]; c = tess.indexes[i+2]; if ( clipBits[a] & clipBits[b] & clipBits[c] ) { continue; // not lighted } hitIndexes[numIndexes] = a; hitIndexes[numIndexes+1] = b; hitIndexes[numIndexes+2] = c; numIndexes += 3; } if ( !numIndexes ) { continue; } qglEnableClientState( GL_TEXTURE_COORD_ARRAY ); qglTexCoordPointer( 2, GL_FLOAT, 0, texCoordsArray[0] ); qglEnableClientState( GL_COLOR_ARRAY ); qglColorPointer( 4, GL_UNSIGNED_BYTE, 0, colorArray ); GL_Bind( tr.dlightImage ); // include GLS_DEPTHFUNC_EQUAL so alpha tested surfaces don't add light // where they aren't rendered if ( dl->additive ) { GL_State( GLS_SRCBLEND_ONE | GLS_DSTBLEND_ONE | GLS_DEPTHFUNC_EQUAL ); } else { GL_State( GLS_SRCBLEND_DST_COLOR | GLS_DSTBLEND_ONE | GLS_DEPTHFUNC_EQUAL ); } R_DrawElements( numIndexes, hitIndexes ); backEnd.pc.c_totalIndexes += numIndexes; backEnd.pc.c_dlightIndexes += numIndexes; } }
/* two dimensional discrete cosine transform */ void ff_fdct_altivec(int16_t *block) { vector signed short *bp; const vector float *cp = fdctconsts; vector float b00, b10, b20, b30, b40, b50, b60, b70; vector float b01, b11, b21, b31, b41, b51, b61, b71; vector float mzero, cnst, cnsts0, cnsts1, cnsts2; vector float x0, x1, x2, x3, x4, x5, x6, x7, x8; /* setup constants {{{ */ /* mzero = -0.0 */ mzero = ((vector float) vec_splat_u32(-1)); mzero = ((vector float) vec_sl(vu32(mzero), vu32(mzero))); cnsts0 = vec_ld(0, cp); cp++; cnsts1 = vec_ld(0, cp); cp++; cnsts2 = vec_ld(0, cp); /* }}} */ /* 8x8 matrix transpose (vector short[8]) {{{ */ #define MERGE_S16(hl, a, b) vec_merge ## hl(vs16(a), vs16(b)) bp = (vector signed short *) block; b00 = ((vector float) vec_ld(0, bp)); b40 = ((vector float) vec_ld(16 * 4, bp)); b01 = ((vector float) MERGE_S16(h, b00, b40)); b11 = ((vector float) MERGE_S16(l, b00, b40)); bp++; b10 = ((vector float) vec_ld(0, bp)); b50 = ((vector float) vec_ld(16 * 4, bp)); b21 = ((vector float) MERGE_S16(h, b10, b50)); b31 = ((vector float) MERGE_S16(l, b10, b50)); bp++; b20 = ((vector float) vec_ld(0, bp)); b60 = ((vector float) vec_ld(16 * 4, bp)); b41 = ((vector float) MERGE_S16(h, b20, b60)); b51 = ((vector float) MERGE_S16(l, b20, b60)); bp++; b30 = ((vector float) vec_ld(0, bp)); b70 = ((vector float) vec_ld(16 * 4, bp)); b61 = ((vector float) MERGE_S16(h, b30, b70)); b71 = ((vector float) MERGE_S16(l, b30, b70)); x0 = ((vector float) MERGE_S16(h, b01, b41)); x1 = ((vector float) MERGE_S16(l, b01, b41)); x2 = ((vector float) MERGE_S16(h, b11, b51)); x3 = ((vector float) MERGE_S16(l, b11, b51)); x4 = ((vector float) MERGE_S16(h, b21, b61)); x5 = ((vector float) MERGE_S16(l, b21, b61)); x6 = ((vector float) MERGE_S16(h, b31, b71)); x7 = ((vector float) MERGE_S16(l, b31, b71)); b00 = ((vector float) MERGE_S16(h, x0, x4)); b10 = ((vector float) MERGE_S16(l, x0, x4)); b20 = ((vector float) MERGE_S16(h, x1, x5)); b30 = ((vector float) MERGE_S16(l, x1, x5)); b40 = ((vector float) MERGE_S16(h, x2, x6)); b50 = ((vector float) MERGE_S16(l, x2, x6)); b60 = ((vector float) MERGE_S16(h, x3, x7)); b70 = ((vector float) MERGE_S16(l, x3, x7)); #undef MERGE_S16 /* }}} */ /* Some of the initial calculations can be done as vector short * before conversion to vector float. The following code section * takes advantage of this. */ /* fdct rows {{{ */ x0 = ((vector float) vec_add(vs16(b00), vs16(b70))); x7 = ((vector float) vec_sub(vs16(b00), vs16(b70))); x1 = ((vector float) vec_add(vs16(b10), vs16(b60))); x6 = ((vector float) vec_sub(vs16(b10), vs16(b60))); x2 = ((vector float) vec_add(vs16(b20), vs16(b50))); x5 = ((vector float) vec_sub(vs16(b20), vs16(b50))); x3 = ((vector float) vec_add(vs16(b30), vs16(b40))); x4 = ((vector float) vec_sub(vs16(b30), vs16(b40))); b70 = ((vector float) vec_add(vs16(x0), vs16(x3))); b10 = ((vector float) vec_add(vs16(x1), vs16(x2))); b00 = ((vector float) vec_add(vs16(b70), vs16(b10))); b40 = ((vector float) vec_sub(vs16(b70), vs16(b10))); #define CTF0(n) \ b ## n ## 1 = ((vector float) vec_unpackl(vs16(b ## n ## 0))); \ b ## n ## 0 = ((vector float) vec_unpackh(vs16(b ## n ## 0))); \ b ## n ## 1 = vec_ctf(vs32(b ## n ## 1), 0); \ b ## n ## 0 = vec_ctf(vs32(b ## n ## 0), 0) CTF0(0); CTF0(4); b20 = ((vector float) vec_sub(vs16(x0), vs16(x3))); b60 = ((vector float) vec_sub(vs16(x1), vs16(x2))); CTF0(2); CTF0(6); #undef CTF0 x0 = vec_add(b60, b20); x1 = vec_add(b61, b21); cnst = LD_W2; x0 = vec_madd(cnst, x0, mzero); x1 = vec_madd(cnst, x1, mzero); cnst = LD_W1; b20 = vec_madd(cnst, b20, x0); b21 = vec_madd(cnst, b21, x1); cnst = LD_W0; b60 = vec_madd(cnst, b60, x0); b61 = vec_madd(cnst, b61, x1); #define CTFX(x, b) \ b ## 0 = ((vector float) vec_unpackh(vs16(x))); \ b ## 1 = ((vector float) vec_unpackl(vs16(x))); \ b ## 0 = vec_ctf(vs32(b ## 0), 0); \ b ## 1 = vec_ctf(vs32(b ## 1), 0) CTFX(x4, b7); CTFX(x5, b5); CTFX(x6, b3); CTFX(x7, b1); #undef CTFX x0 = vec_add(b70, b10); x1 = vec_add(b50, b30); x2 = vec_add(b70, b30); x3 = vec_add(b50, b10); x8 = vec_add(x2, x3); cnst = LD_W3; x8 = vec_madd(cnst, x8, mzero); cnst = LD_W8; x0 = vec_madd(cnst, x0, mzero); cnst = LD_W9; x1 = vec_madd(cnst, x1, mzero); cnst = LD_WA; x2 = vec_madd(cnst, x2, x8); cnst = LD_WB; x3 = vec_madd(cnst, x3, x8); cnst = LD_W4; b70 = vec_madd(cnst, b70, x0); cnst = LD_W5; b50 = vec_madd(cnst, b50, x1); cnst = LD_W6; b30 = vec_madd(cnst, b30, x1); cnst = LD_W7; b10 = vec_madd(cnst, b10, x0); b70 = vec_add(b70, x2); b50 = vec_add(b50, x3); b30 = vec_add(b30, x2); b10 = vec_add(b10, x3); x0 = vec_add(b71, b11); x1 = vec_add(b51, b31); x2 = vec_add(b71, b31); x3 = vec_add(b51, b11); x8 = vec_add(x2, x3); cnst = LD_W3; x8 = vec_madd(cnst, x8, mzero); cnst = LD_W8; x0 = vec_madd(cnst, x0, mzero); cnst = LD_W9; x1 = vec_madd(cnst, x1, mzero); cnst = LD_WA; x2 = vec_madd(cnst, x2, x8); cnst = LD_WB; x3 = vec_madd(cnst, x3, x8); cnst = LD_W4; b71 = vec_madd(cnst, b71, x0); cnst = LD_W5; b51 = vec_madd(cnst, b51, x1); cnst = LD_W6; b31 = vec_madd(cnst, b31, x1); cnst = LD_W7; b11 = vec_madd(cnst, b11, x0); b71 = vec_add(b71, x2); b51 = vec_add(b51, x3); b31 = vec_add(b31, x2); b11 = vec_add(b11, x3); /* }}} */ /* 8x8 matrix transpose (vector float[8][2]) {{{ */ x0 = VEC_FMERGEL(b00, b20); x1 = VEC_FMERGEH(b00, b20); x2 = VEC_FMERGEL(b10, b30); x3 = VEC_FMERGEH(b10, b30); b00 = VEC_FMERGEH(x1, x3); b10 = VEC_FMERGEL(x1, x3); b20 = VEC_FMERGEH(x0, x2); b30 = VEC_FMERGEL(x0, x2); x4 = VEC_FMERGEL(b41, b61); x5 = VEC_FMERGEH(b41, b61); x6 = VEC_FMERGEL(b51, b71); x7 = VEC_FMERGEH(b51, b71); b41 = VEC_FMERGEH(x5, x7); b51 = VEC_FMERGEL(x5, x7); b61 = VEC_FMERGEH(x4, x6); b71 = VEC_FMERGEL(x4, x6); x0 = VEC_FMERGEL(b01, b21); x1 = VEC_FMERGEH(b01, b21); x2 = VEC_FMERGEL(b11, b31); x3 = VEC_FMERGEH(b11, b31); x4 = VEC_FMERGEL(b40, b60); x5 = VEC_FMERGEH(b40, b60); x6 = VEC_FMERGEL(b50, b70); x7 = VEC_FMERGEH(b50, b70); b40 = VEC_FMERGEH(x1, x3); b50 = VEC_FMERGEL(x1, x3); b60 = VEC_FMERGEH(x0, x2); b70 = VEC_FMERGEL(x0, x2); b01 = VEC_FMERGEH(x5, x7); b11 = VEC_FMERGEL(x5, x7); b21 = VEC_FMERGEH(x4, x6); b31 = VEC_FMERGEL(x4, x6); /* }}} */ FDCTCOL(b00, b10, b20, b30, b40, b50, b60, b70); FDCTCOL(b01, b11, b21, b31, b41, b51, b61, b71); /* round, convert back to short {{{ */ #define CTS(n) \ b ## n ## 0 = vec_round(b ## n ## 0); \ b ## n ## 1 = vec_round(b ## n ## 1); \ b ## n ## 0 = ((vector float) vec_cts(b ## n ## 0, 0)); \ b ## n ## 1 = ((vector float) vec_cts(b ## n ## 1, 0)); \ b ## n ## 0 = ((vector float) vec_pack(vs32(b ## n ## 0), \ vs32(b ## n ## 1))); \ vec_st(vs16(b ## n ## 0), 0, bp) bp = (vector signed short *) block; CTS(0); bp++; CTS(1); bp++; CTS(2); bp++; CTS(3); bp++; CTS(4); bp++; CTS(5); bp++; CTS(6); bp++; CTS(7); #undef CTS /* }}} */ }
void cfft2(unsigned int n,float x[][2],float y[][2],float w[][2], float sign) { /* altivec version of cfft2 from Petersen and Arbenz book, "Intro. to Parallel Computing", Oxford Univ. Press, 2003, Section 3.6 wpp 14. Dec. 2003 */ int jb,jc,jd,jw,k,k2,k4,lj,m,j,mj,mj2,pass,tgle; float rp,up,wr[4] __attribute((aligned(16))); float wu[4] __attribute((aligned(16))); float *a,*b,*c,*d; const vector float vminus = (vector float) { -0.,0.,-0.,0. }; const vector float vzero = (vector float) { 0.,0.,0.,0. }; const vector unsigned char pv3201 = (vector unsigned char) { 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11 }; vector float V0,V1,V2,V3,V4,V5,V6,V7; vector float V8,V9,V10,V11,V12,V13,V14,V15; if(n<=1) { y[0][0] = x[0][0]; y[0][1] = x[0][1]; return; } m = (int) (log((float) n)/log(1.99)); mj = 1; mj2 = 2; lj = n/2; /* first pass thru data: x -> y */ for(j=0; j<lj; j++) { jb = n/2+j; jc = j*mj2; jd = jc + 1; rp = w[j][0]; up = w[j][1]; if(sign<0.0) up = -up; y[jd][0] = rp*(x[j][0] - x[jb][0]) - up*(x[j][1] - x[jb][1]); y[jd][1] = up*(x[j][0] - x[jb][0]) + rp*(x[j][1] - x[jb][1]); y[jc][0] = x[j][0] + x[jb][0]; y[jc][1] = x[j][1] + x[jb][1]; } if(n==2) return; /* next pass is mj = 2 */ mj = 2; mj2 = 4; lj = n/4; a = (float *)&y[0][0]; b = (float *)&y[n/2][0]; c = (float *)&x[0][0]; d = (float *)&x[mj][0]; if(n==4) { c = (float *)&y[0][0]; d = (float *)&y[mj][0]; } for(j=0; j<lj; j++) { jw = j*mj; jc = j*mj2; jd = 2*jc; rp = w[jw][0]; up = w[jw][1]; if(sign<0.0) up = -up; wr[0] = rp; wr[1] = rp; wr[2] = rp; wr[3] = rp; wu[0] = up; wu[1] = up; wu[2] = up; wu[3] = up; V6 = vec_ld(0,wr); V7 = vec_ld(0,wu); V7 = vec_xor(V7,vminus); V0 = vec_ld(0,(vector float *) (a+jc)); V1 = vec_ld(0,(vector float *) (b+jc)); V2 = vec_add(V0,V1); /* a + b */ vec_st(V2,0,(vector float *) (c+jd)); /* store c */ V3 = vec_sub(V0,V1); /* a - b */ V4 = vec_perm(V3,V3,pv3201); V0 = vec_madd(V6,V3,vzero); V1 = vec_madd(V7,V4,vzero); V2 = vec_add(V0,V1); /* w*(a - b) */ vec_st(V2,0,(vector float*) (d+jd)); /* store d */ } if(n==4) return; mj *= 2; mj2 = 2*mj; lj = n/mj2; tgle = 0; for(pass=2; pass<m-1; pass++) { if(tgle) { a = (float *)&y[0][0]; b = (float *)&y[n/2][0]; c = (float *)&x[0][0]; d = (float *)&x[mj][0]; tgle = 0; } else { a = (float *)&x[0][0]; b = (float *)&x[n/2][0]; c = (float *)&y[0][0]; d = (float *)&y[mj][0]; tgle = 1; } for(j=0; j<lj; j++) { jw = j*mj; jc = j*mj2; jd = 2*jc; rp = w[jw][0]; up = w[jw][1]; if(sign<0.0) up = -up; wr[0] = rp; wr[1] = rp; wr[2] = rp; wr[3] = rp; wu[0] = up; wu[1] = up; wu[2] = up; wu[3] = up; V6 = vec_ld(0,wr); V7 = vec_ld(0,wu); V7 = vec_xor(V7,vminus); for(k=0; k<mj; k+=4) { k2 = 2*k; k4 = k2+4; V0 = vec_ld(0,(vector float *) (a+jc+k2)); V1 = vec_ld(0,(vector float *) (b+jc+k2)); V2 = vec_add(V0,V1); /* a + b */ vec_st(V2,0,(vector float*) (c+jd+k2)); /* store c */ V3 = vec_sub(V0,V1); /* a - b */ V4 = vec_perm(V3,V3,pv3201); V0 = vec_madd(V6,V3,vzero); V1 = vec_madd(V7,V4,vzero); V2 = vec_add(V0,V1); /* w*(a - b) */ vec_st(V2,0,(vector float *) (d+jd+k2)); /* store d */ V8 = vec_ld(0,(vector float *) (a+jc+k4)); V9 = vec_ld(0,(vector float *) (b+jc+k4)); V10 = vec_add(V8,V9); /* a + b */ vec_st(V10,0,(vector float *) (c+jd+k4)); /* store c */ V11 = vec_sub(V8,V9); /* a - b */ V12 = vec_perm(V11,V11,pv3201); V8 = vec_madd(V6,V11,vzero); V9 = vec_madd(V7,V12,vzero); V10 = vec_add(V8,V9); /* w*(a - b) */ vec_st(V10,0,(vector float *) (d+jd+k4)); /* store d */ } } mj *= 2; mj2 = 2*mj; lj = n/mj2; } /* last pass thru data: in-place if previous in y */ c = (float *)&y[0][0]; d = (float *)&y[n/2][0]; if(tgle) { a = (float *)&y[0][0]; b = (float *)&y[n/2][0]; } else { a = (float *)&x[0][0]; b = (float *)&x[n/2][0]; } for(k=0; k<(n/2); k+=4) { k2 = 2*k; k4 = k2+4; V0 = vec_ld(0,(vector float *) (a+k2)); V1 = vec_ld(0,(vector float *) (b+k2)); V2 = vec_add(V0,V1); /* a + b */ vec_st(V2,0,(vector float*) (c+k2)); /* store c */ V3 = vec_sub(V0,V1); /* a - b */ vec_st(V3,0,(vector float *) (d+k2)); /* store d */ V4 = vec_ld(0,(vector float *) (a+k4)); V5 = vec_ld(0,(vector float *) (b+k4)); V6 = vec_add(V4,V5); /* a + b */ vec_st(V6,0,(vector float *) (c+k4)); /* store c */ V7 = vec_sub(V4,V5); /* a - b */ vec_st(V7,0,(vector float *) (d+k4)); /* store d */ } } // LLVM LOCAL begin // Implementations of sin() and cos() may vary slightly in the accuracy of // their results, typically only in the least significant bit. Round to make // the results consistent across platforms. typedef union { double d; unsigned long long ll; } dbl_ll_union; static double LLVMsin(double d) { dbl_ll_union u; u.d = sin(d); u.ll = (u.ll + 1) & ~1ULL; return u.d; } static double LLVMcos(double d) { dbl_ll_union u; u.d = cos(d); u.ll = (u.ll + 1) & ~1ULL; return u.d; } // LLVM LOCAL end void cffti(int n, float w[][2]) { /* initialization routine for cfft2: computes cos(twopi*k),sin(twopi*k) for k=0..n/2-1 - the "twiddle factors" for a binary radix FFT */ int i,n2; float aw,arg,pi; pi = 3.141592653589793; n2 = n/2; aw = 2.0*pi/((float)n); for(i=0; i<n2; i++) { arg = aw*((float)i); w[i][0] = LLVMcos(arg); w[i][1] = LLVMsin(arg); } }
void nb_kernel310_ppc_altivec (int * p_nri, int iinr[], int jindex[], int jjnr[], int shift[], float shiftvec[], float fshift[], int gid[], float pos[], float faction[], float charge[], float * p_facel, float * p_krf, float * p_crf, float Vc[], int type[], int * p_ntype, float vdwparam[], float Vvdw[], float * p_tabscale, float VFtab[], float invsqrta[], float dvda[], float * p_gbtabscale, float GBtab[], int * p_nthreads, int * count, void * mtx, int * outeriter, int * inneriter, float * work) { vector float ix,iy,iz,shvec; vector float vfacel,tsc,fs,fs2,nul; vector float dx,dy,dz; vector float Vvdwtot,vctot,qq,iq,c6,c12,VVc,FFc; vector float fix,fiy,fiz; vector float tmp1,tmp2,tmp3,tmp4; vector float rinv,r,rinvsq,rsq,rinvsix,Vvdw6,Vvdw12; int n,k,ii,is3,ii3,ntiA,nj0,nj1; int jnra,jnrb,jnrc,jnrd; int j3a,j3b,j3c,j3d; int nri, ntype, nouter, ninner; int tja,tjb,tjc,tjd; #ifdef GMX_THREADS int nn0, nn1; #endif nouter = 0; ninner = 0; nri = *p_nri; ntype = *p_ntype; nul=vec_zero(); vfacel=load_float_and_splat(p_facel); tsc=load_float_and_splat(p_tabscale); #ifdef GMX_THREADS nthreads = *p_nthreads; do { gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx); nn0 = *count; nn1 = nn0+(nri-nn0)/(2*nthreads)+3; *count = nn1; gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx); if(nn1>nri) nn1=nri; for(n=nn0; (n<nn1); n++) { #if 0 } /* maintain correct indentation even with conditional left braces */ #endif #else /* without gmx_threads */ for(n=0;n<nri;n++) { #endif is3 = 3*shift[n]; shvec = load_xyz(shiftvec+is3); ii = iinr[n]; ii3 = 3*ii; ix = load_xyz(pos+ii3); Vvdwtot = nul; vctot = nul; fix = nul; fiy = nul; fiz = nul; ix = vec_add(ix,shvec); nj0 = jindex[n]; nj1 = jindex[n+1]; splat_xyz_to_vectors(ix,&ix,&iy,&iz); ntiA = 2*ntype*type[ii]; iq = vec_madd(load_float_and_splat(charge+ii),vfacel,nul); for(k=nj0; k<(nj1-3); k+=4) { jnra = jjnr[k]; jnrb = jjnr[k+1]; jnrc = jjnr[k+2]; jnrd = jjnr[k+3]; j3a = 3*jnra; j3b = 3*jnrb; j3c = 3*jnrc; j3d = 3*jnrd; transpose_4_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b), load_xyz(pos+j3c), load_xyz(pos+j3d),&dx,&dy,&dz); dx = vec_sub(ix,dx); dy = vec_sub(iy,dy); dz = vec_sub(iz,dz); rsq = vec_madd(dx,dx,nul); rsq = vec_madd(dy,dy,rsq); rsq = vec_madd(dz,dz,rsq); rinv = do_invsqrt(rsq); rinvsq = vec_madd(rinv,rinv,nul); r = vec_madd(rinv,rsq,nul); rinvsix = vec_madd(rinvsq,rinvsq,nul); rinvsix = vec_madd(rinvsix,rinvsq,nul); tja = ntiA+2*type[jnra]; tjb = ntiA+2*type[jnrb]; tjc = ntiA+2*type[jnrc]; tjd = ntiA+2*type[jnrd]; qq = vec_madd(load_4_float(charge+jnra,charge+jnrb, charge+jnrc,charge+jnrd),iq,nul); load_4_pair(vdwparam+tja,vdwparam+tjb,vdwparam+tjc,vdwparam+tjd,&c6,&c12); do_4_ctable_coul(VFtab,vec_madd(r,tsc,nul),&VVc,&FFc); fs2 = vec_madd(qq,FFc,nul); /* fijC */ vctot = vec_madd(qq,VVc,vctot); Vvdw6 = vec_madd(c6,rinvsix,nul); Vvdw12 = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul), nul); fs = vec_madd(vec_twelve(),Vvdw12,nul); fs = vec_nmsub(vec_six(),Vvdw6,fs); fs = vec_madd(fs,rinv,nul); Vvdwtot = vec_add(Vvdwtot,Vvdw12); fs = vec_nmsub(fs2,tsc,fs); fs = vec_madd(fs,rinv,nul); Vvdwtot = vec_sub(Vvdwtot,Vvdw6); fix = vec_madd(fs,dx,fix); /* +=fx */ fiy = vec_madd(fs,dy,fiy); /* +=fy */ fiz = vec_madd(fs,dz,fiz); /* +=fz */ dx = vec_nmsub(dx,fs,nul); /* -fx */ dy = vec_nmsub(dy,fs,nul); /* -fy */ dz = vec_nmsub(dz,fs,nul); /* -fz */ transpose_3_to_4(dx,dy,dz,&tmp1,&tmp2,&tmp3,&tmp4); add_xyz_to_mem(faction+j3a,tmp1); add_xyz_to_mem(faction+j3b,tmp2); add_xyz_to_mem(faction+j3c,tmp3); add_xyz_to_mem(faction+j3d,tmp4); } if(k<(nj1-1)) { jnra = jjnr[k]; jnrb = jjnr[k+1]; j3a = 3*jnra; j3b = 3*jnrb; transpose_2_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b),&dx,&dy,&dz); dx = vec_sub(ix,dx); dy = vec_sub(iy,dy); dz = vec_sub(iz,dz); rsq = vec_madd(dx,dx,nul); rsq = vec_madd(dy,dy,rsq); rsq = vec_madd(dz,dz,rsq); zero_highest_2_elements_in_vector(&rsq); rinv = do_invsqrt(rsq); zero_highest_2_elements_in_vector(&rinv); rinvsq = vec_madd(rinv,rinv,nul); r = vec_madd(rinv,rsq,nul); rinvsix = vec_madd(rinvsq,rinvsq,nul); rinvsix = vec_madd(rinvsix,rinvsq,nul); tja = ntiA+2*type[jnra]; tjb = ntiA+2*type[jnrb]; qq = vec_madd(load_2_float(charge+jnra,charge+jnrb),iq,nul); load_2_pair(vdwparam+tja,vdwparam+tjb,&c6,&c12); do_2_ctable_coul(VFtab,vec_madd(r,tsc,nul),&VVc,&FFc); fs2 = vec_madd(qq,FFc,nul); /* fijC */ vctot = vec_madd(qq,VVc,vctot); Vvdw6 = vec_madd(c6,rinvsix,nul); Vvdw12 = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul), nul); fs = vec_madd(vec_twelve(),Vvdw12,nul); fs = vec_nmsub(vec_six(),Vvdw6,fs); Vvdwtot = vec_add(Vvdwtot,Vvdw12); fs = vec_madd(fs,rinv,nul); fs = vec_nmsub(fs2,tsc,fs); fs = vec_madd(fs,rinv,nul); Vvdwtot = vec_sub(Vvdwtot,Vvdw6); fix = vec_madd(fs,dx,fix); /* +=fx */ fiy = vec_madd(fs,dy,fiy); /* +=fy */ fiz = vec_madd(fs,dz,fiz); /* +=fz */ dx = vec_nmsub(dx,fs,nul); /* -fx */ dy = vec_nmsub(dy,fs,nul); /* -fy */ dz = vec_nmsub(dz,fs,nul); /* -fz */ transpose_3_to_2(dx,dy,dz,&tmp1,&tmp2); add_xyz_to_mem(faction+j3a,tmp1); add_xyz_to_mem(faction+j3b,tmp2); k += 2; } if((nj1-nj0) & 0x1) { jnra = jjnr[k]; j3a = 3*jnra; transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz); dx = vec_sub(ix,dx); dy = vec_sub(iy,dy); dz = vec_sub(iz,dz); rsq = vec_madd(dx,dx,nul); rsq = vec_madd(dy,dy,rsq); rsq = vec_madd(dz,dz,rsq); zero_highest_3_elements_in_vector(&rsq); rinv = do_invsqrt(rsq); zero_highest_3_elements_in_vector(&rinv); rinvsq = vec_madd(rinv,rinv,nul); r = vec_madd(rinv,rsq,nul); rinvsix = vec_madd(rinvsq,rinvsq,nul); rinvsix = vec_madd(rinvsix,rinvsq,nul); tja = ntiA+2*type[jnra]; qq = vec_madd(load_1_float(charge+jnra),iq,nul); load_1_pair(vdwparam+tja,&c6,&c12); do_1_ctable_coul(VFtab,vec_madd(r,tsc,nul),&VVc,&FFc); fs2 = vec_madd(qq,FFc,nul); /* fijC */ vctot = vec_madd(qq,VVc,vctot); Vvdw6 = vec_madd(c6,rinvsix,nul); Vvdw12 = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul), nul); fs = vec_madd(vec_twelve(),Vvdw12,nul); fs = vec_nmsub(vec_six(),Vvdw6,fs); fs = vec_madd(fs,rinv,nul); Vvdwtot = vec_add(Vvdwtot,Vvdw12); fs = vec_nmsub(fs2,tsc,fs); fs = vec_madd(fs,rinv,nul); Vvdwtot = vec_sub(Vvdwtot,Vvdw6); fix = vec_madd(fs,dx,fix); /* +=fx */ fiy = vec_madd(fs,dy,fiy); /* +=fy */ fiz = vec_madd(fs,dz,fiz); /* +=fz */ dx = vec_nmsub(dx,fs,nul); /* -fx */ dy = vec_nmsub(dy,fs,nul); /* -fy */ dz = vec_nmsub(dz,fs,nul); /* -fz */ transpose_3_to_1(dx,dy,dz,&tmp1); add_xyz_to_mem(faction+j3a,tmp1); } /* update outer data */ transpose_3_to_4(fix,fiy,fiz,&tmp1,&tmp2,&tmp3,&tmp4); tmp1 = vec_add(tmp1,tmp3); tmp2 = vec_add(tmp2,tmp4); tmp1 = vec_add(tmp1,tmp2); add_xyz_to_mem(faction+ii3,tmp1); add_xyz_to_mem(fshift+is3,tmp1); add_vector_to_float(Vc+gid[n],vctot); add_vector_to_float(Vvdw+gid[n],Vvdwtot); ninner += nj1 - nj0; } #ifdef GMX_THREADS nouter += nn1 - nn0; } while (nn1<nri); #else nouter = nri; #endif *outeriter = nouter; *inneriter = ninner; }
void test1() { // CHECK-LABEL: define void @test1 // CHECK-LE-LABEL: define void @test1 res_vf = vec_abs(vf); // CHECK: call <4 x float> @llvm.fabs.v4f32(<4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x float> @llvm.fabs.v4f32(<4 x float> %{{[0-9]*}}) dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_add(vd, vd); // CHECK: fadd <2 x double> // CHECK-LE: fadd <2 x double> res_vd = vec_and(vbll, vd); // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> // CHECK-LE: and <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> res_vd = vec_and(vd, vbll); // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> // CHECK-LE: and <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> res_vd = vec_and(vd, vd); // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> // CHECK-LE: and <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_andc(vbll, vd); // CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> // CHECK-LE: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK-LE: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK-LE: and <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_andc(vd, vbll); // CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> // CHECK-LE: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK-LE: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK-LE: and <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() res_vd = vec_andc(vd, vd); // CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_ceil(vd); // CHECK: call <2 x double> @llvm.ceil.v2f64(<2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x double> @llvm.ceil.v2f64(<2 x double> %{{[0-9]*}}) res_vf = vec_ceil(vf); // CHECK: call <4 x float> @llvm.ceil.v4f32(<4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x float> @llvm.ceil.v4f32(<4 x float> %{{[0-9]*}}) res_vbll = vec_cmpeq(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpeqdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpeqdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmpeq(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpeqsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpeqsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) res_vbll = vec_cmpge(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmpge(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) res_vbll = vec_cmpgt(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmpgt(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) res_vbll = vec_cmple(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmple(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) res_vbll = vec_cmplt(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmplt(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) /* vec_cpsgn */ res_vf = vec_cpsgn(vf, vf); // CHECK: call <4 x float> @llvm.copysign.v4f32(<4 x float> %{{.+}}, <4 x float> %{{.+}}) // CHECK-LE: call <4 x float> @llvm.copysign.v4f32(<4 x float> %{{.+}}, <4 x float> %{{.+}}) res_vd = vec_cpsgn(vd, vd); // CHECK: call <2 x double> @llvm.copysign.v2f64(<2 x double> %{{.+}}, <2 x double> %{{.+}}) // CHECK-LE: call <2 x double> @llvm.copysign.v2f64(<2 x double> %{{.+}}, <2 x double> %{{.+}}) /* vec_div */ res_vsll = vec_div(vsll, vsll); // CHECK: sdiv <2 x i64> // CHECK-LE: sdiv <2 x i64> res_vull = vec_div(vull, vull); // CHECK: udiv <2 x i64> // CHECK-LE: udiv <2 x i64> res_vf = vec_div(vf, vf); // CHECK: fdiv <4 x float> // CHECK-LE: fdiv <4 x float> res_vd = vec_div(vd, vd); // CHECK: fdiv <2 x double> // CHECK-LE: fdiv <2 x double> /* vec_max */ res_vf = vec_max(vf, vf); // CHECK: @llvm.ppc.vsx.xvmaxsp // CHECK-LE: @llvm.ppc.vsx.xvmaxsp res_vd = vec_max(vd, vd); // CHECK: @llvm.ppc.vsx.xvmaxdp // CHECK-LE: @llvm.ppc.vsx.xvmaxdp res_vf = vec_vmaxfp(vf, vf); // CHECK: @llvm.ppc.vsx.xvmaxsp // CHECK-LE: @llvm.ppc.vsx.xvmaxsp /* vec_min */ res_vf = vec_min(vf, vf); // CHECK: @llvm.ppc.vsx.xvminsp // CHECK-LE: @llvm.ppc.vsx.xvminsp res_vd = vec_min(vd, vd); // CHECK: @llvm.ppc.vsx.xvmindp // CHECK-LE: @llvm.ppc.vsx.xvmindp res_vf = vec_vminfp(vf, vf); // CHECK: @llvm.ppc.vsx.xvminsp // CHECK-LE: @llvm.ppc.vsx.xvminsp res_d = __builtin_vsx_xsmaxdp(d, d); // CHECK: @llvm.ppc.vsx.xsmaxdp // CHECK-LE: @llvm.ppc.vsx.xsmaxdp res_d = __builtin_vsx_xsmindp(d, d); // CHECK: @llvm.ppc.vsx.xsmindp // CHECK-LE: @llvm.ppc.vsx.xsmindp /* vec_perm */ res_vsll = vec_perm(vsll, vsll, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_perm(vull, vull, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vbll = vec_perm(vbll, vbll, vuc); // CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> // CHECK-LE: xor <16 x i8> // CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> res_vf = vec_round(vf); // CHECK: call <4 x float> @llvm.round.v4f32(<4 x float> // CHECK-LE: call <4 x float> @llvm.round.v4f32(<4 x float> res_vd = vec_round(vd); // CHECK: call <2 x double> @llvm.round.v2f64(<2 x double> // CHECK-LE: call <2 x double> @llvm.round.v2f64(<2 x double> res_vd = vec_perm(vd, vd, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vd = vec_splat(vd, 1); // CHECK: [[T1:%.+]] = bitcast <2 x double> {{.+}} to <4 x i32> // CHECK: [[T2:%.+]] = bitcast <2 x double> {{.+}} to <4 x i32> // CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> // CHECK-LE: xor <16 x i8> // CHECK-LE: [[T1:%.+]] = bitcast <2 x double> {{.+}} to <4 x i32> // CHECK-LE: [[T2:%.+]] = bitcast <2 x double> {{.+}} to <4 x i32> // CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> res_vbll = vec_splat(vbll, 1); // CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> // CHECK-LE: xor <16 x i8> // CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> res_vsll = vec_splat(vsll, 1); // CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> // CHECK-LE: xor <16 x i8> // CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> res_vull = vec_splat(vull, 1); // CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> // CHECK-LE: xor <16 x i8> // CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> res_vsi = vec_pack(vsll, vsll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vui = vec_pack(vull, vull); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vbi = vec_pack(vbll, vbll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vsll = vec_vperm(vsll, vsll, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_vperm(vull, vull, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vd = vec_vperm(vd, vd, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm /* vec_vsx_ld */ res_vsi = vec_vsx_ld(0, &vsi); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vui = vec_vsx_ld(0, &vui); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vf = vec_vsx_ld (0, &vf); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vsll = vec_vsx_ld(0, &vsll); // CHECK: @llvm.ppc.vsx.lxvd2x // CHECK-LE: @llvm.ppc.vsx.lxvd2x res_vull = vec_vsx_ld(0, &vull); // CHECK: @llvm.ppc.vsx.lxvd2x // CHECK-LE: @llvm.ppc.vsx.lxvd2x res_vd = vec_vsx_ld(0, &vd); // CHECK: @llvm.ppc.vsx.lxvd2x // CHECK-LE: @llvm.ppc.vsx.lxvd2x res_vull = vec_vsx_ld(0, &vull); // CHECK: @llvm.ppc.vsx.lxvd2x // CHECK-LE: @llvm.ppc.vsx.lxvd2x res_vd = vec_vsx_ld(0, &vd); // CHECK: @llvm.ppc.vsx.lxvd2x // CHECK-LE: @llvm.ppc.vsx.lxvd2x res_vss = vec_vsx_ld(0, &vss); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vss = vec_vsx_ld(0, &ss); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vus = vec_vsx_ld(0, &vus); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vus = vec_vsx_ld(0, &us); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vbc = vec_vsx_ld(0, &vbc); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vsc = vec_vsx_ld(0, &vsc); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vuc = vec_vsx_ld(0, &vuc); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vsc = vec_vsx_ld(0, &sc); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vuc = vec_vsx_ld(0, &uc); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x /* vec_vsx_st */ vec_vsx_st(vsi, 0, &res_vsi); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vsi, 0, &res_si); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vui, 0, &res_vui); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vui, 0, &res_ui); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vf, 0, &res_vf); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vsll, 0, &res_vsll); // CHECK: @llvm.ppc.vsx.stxvd2x // CHECK-LE: @llvm.ppc.vsx.stxvd2x vec_vsx_st(vull, 0, &res_vull); // CHECK: @llvm.ppc.vsx.stxvd2x // CHECK-LE: @llvm.ppc.vsx.stxvd2x vec_vsx_st(vd, 0, &res_vd); // CHECK: @llvm.ppc.vsx.stxvd2x // CHECK-LE: @llvm.ppc.vsx.stxvd2x vec_vsx_st(vss, 0, &res_vss); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vss, 0, &res_ss); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vus, 0, &res_vus); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vus, 0, &res_us); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vsc, 0, &res_vsc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vsc, 0, &res_sc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vuc, 0, &res_vuc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vuc, 0, &res_uc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vbc, 0, &res_vbc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vbc, 0, &res_sc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vbc, 0, &res_uc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x /* vec_and */ res_vsll = vec_and(vsll, vsll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_and(vbll, vsll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_and(vsll, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_and(vull, vull); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_and(vbll, vull); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_and(vull, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vbll = vec_and(vbll, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> /* vec_vand */ res_vsll = vec_vand(vsll, vsll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_vand(vbll, vsll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_vand(vsll, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_vand(vull, vull); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_vand(vbll, vull); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_vand(vull, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vbll = vec_vand(vbll, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> /* vec_andc */ res_vsll = vec_andc(vsll, vsll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_andc(vbll, vsll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_andc(vsll, vbll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_andc(vull, vull); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_andc(vbll, vull); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_andc(vull, vbll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vbll = vec_andc(vbll, vbll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vf = vec_floor(vf); // CHECK: call <4 x float> @llvm.floor.v4f32(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.floor.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_floor(vd); // CHECK: call <2 x double> @llvm.floor.v2f64(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.floor.v2f64(<2 x double> %{{[0-9]+}}) res_vf = vec_madd(vf, vf, vf); // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}) res_vd = vec_madd(vd, vd, vd); // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}) /* vec_mergeh */ res_vsll = vec_mergeh(vsll, vsll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vsll = vec_mergeh(vsll, vbll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vsll = vec_mergeh(vbll, vsll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergeh(vull, vull); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergeh(vull, vbll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergeh(vbll, vull); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm /* vec_mergel */ res_vsll = vec_mergel(vsll, vsll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vsll = vec_mergel(vsll, vbll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vsll = vec_mergel(vbll, vsll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergel(vull, vull); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergel(vull, vbll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergel(vbll, vull); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm /* vec_msub */ res_vf = vec_msub(vf, vf, vf); // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> // CHECK-LE: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-LE-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> res_vd = vec_msub(vd, vd, vd); // CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> // CHECK-LE: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}} // CHECK-LE-NEXT: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> res_vsll = vec_mul(vsll, vsll); // CHECK: mul <2 x i64> // CHECK-LE: mul <2 x i64> res_vull = vec_mul(vull, vull); // CHECK: mul <2 x i64> // CHECK-LE: mul <2 x i64> res_vf = vec_mul(vf, vf); // CHECK: fmul <4 x float> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: fmul <4 x float> %{{[0-9]+}}, %{{[0-9]+}} res_vd = vec_mul(vd, vd); // CHECK: fmul <2 x double> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: fmul <2 x double> %{{[0-9]+}}, %{{[0-9]+}} res_vf = vec_nearbyint(vf); // CHECK: call <4 x float> @llvm.round.v4f32(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.round.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_nearbyint(vd); // CHECK: call <2 x double> @llvm.round.v2f64(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.round.v2f64(<2 x double> %{{[0-9]+}}) res_vf = vec_nmadd(vf, vf, vf); // CHECK: [[FM:[0-9]+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}) // CHECK-NEXT: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %[[FM]] // CHECK-LE: [[FM:[0-9]+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}) // CHECK-LE-NEXT: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %[[FM]] res_vd = vec_nmadd(vd, vd, vd); // CHECK: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}) // CHECK-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]] // CHECK-LE: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}) // CHECK-LE-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]] res_vf = vec_nmsub(vf, vf, vf); // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-LE: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-LE-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> // CHECK-LE: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} res_vd = vec_nmsub(vd, vd, vd); // CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> // CHECK-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]] // CHECK-LE: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}} // CHECK-LE-NEXT: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> // CHECK-LE-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]] /* vec_nor */ res_vsll = vec_nor(vsll, vsll); // CHECK: or <2 x i64> // CHECK: xor <2 x i64> // CHECK-LE: or <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_nor(vull, vull); // CHECK: or <2 x i64> // CHECK: xor <2 x i64> // CHECK-LE: or <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_nor(vbll, vbll); // CHECK: or <2 x i64> // CHECK: xor <2 x i64> // CHECK-LE: or <2 x i64> // CHECK-LE: xor <2 x i64> res_vd = vec_nor(vd, vd); // CHECK: bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK: [[OR:%.+]] = or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-NEXT: xor <2 x i64> [[OR]], <i64 -1, i64 -1> // CHECK-LE: bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK-LE: [[OR:%.+]] = or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE-NEXT: xor <2 x i64> [[OR]], <i64 -1, i64 -1> /* vec_or */ res_vsll = vec_or(vsll, vsll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vsll = vec_or(vbll, vsll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vsll = vec_or(vsll, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_or(vull, vull); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_or(vbll, vull); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_or(vull, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vbll = vec_or(vbll, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vd = vec_or(vd, vd); // CHECK: bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK: or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK-LE: or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} res_vd = vec_or(vbll, vd); // CHECK: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK: [[T2:%.+]] = or <2 x i64> %{{[0-9]+}}, [[T1]] // CHECK: bitcast <2 x i64> [[T2]] to <2 x double> // CHECK-LE: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK-LE: [[T2:%.+]] = or <2 x i64> %{{[0-9]+}}, [[T1]] // CHECK-LE: bitcast <2 x i64> [[T2]] to <2 x double> res_vd = vec_or(vd, vbll); // CHECK: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK: [[T2:%.+]] = or <2 x i64> [[T1]], %{{[0-9]+}} // CHECK: bitcast <2 x i64> [[T2]] to <2 x double> // CHECK-LE: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK-LE: [[T2:%.+]] = or <2 x i64> [[T1]], %{{[0-9]+}} // CHECK-LE: bitcast <2 x i64> [[T2]] to <2 x double> res_vf = vec_re(vf); // CHECK: call <4 x float> @llvm.ppc.vsx.xvresp(<4 x float> // CHECK-LE: call <4 x float> @llvm.ppc.vsx.xvresp(<4 x float> res_vd = vec_re(vd); // CHECK: call <2 x double> @llvm.ppc.vsx.xvredp(<2 x double> // CHECK-LE: call <2 x double> @llvm.ppc.vsx.xvredp(<2 x double> res_vf = vec_rint(vf); // CHECK: call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_rint(vd); // CHECK: call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %{{[0-9]+}}) res_vf = vec_rsqrte(vf); // CHECK: call <4 x float> @llvm.ppc.vsx.xvrsqrtesp(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.ppc.vsx.xvrsqrtesp(<4 x float> %{{[0-9]+}}) res_vd = vec_rsqrte(vd); // CHECK: call <2 x double> @llvm.ppc.vsx.xvrsqrtedp(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.ppc.vsx.xvrsqrtedp(<2 x double> %{{[0-9]+}}) dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vf = vec_sel(vd, vd, vbll); // CHECK: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> %{{[0-9]+}}, // CHECK: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: or <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]+}} to <2 x double> // CHECK-LE: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1> // CHECK-LE: and <2 x i64> %{{[0-9]+}}, // CHECK-LE: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: or <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]+}} to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_sel(vd, vd, vull); // CHECK: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> %{{[0-9]+}}, // CHECK: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: or <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]+}} to <2 x double> // CHECK-LE: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1> // CHECK-LE: and <2 x i64> %{{[0-9]+}}, // CHECK-LE: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: or <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]+}} to <2 x double> res_vf = vec_sqrt(vf); // CHECK: call <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_sqrt(vd); // CHECK: call <2 x double> @llvm.sqrt.v2f64(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.sqrt.v2f64(<2 x double> %{{[0-9]+}}) res_vd = vec_sub(vd, vd); // CHECK: fsub <2 x double> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: fsub <2 x double> %{{[0-9]+}}, %{{[0-9]+}} res_vf = vec_trunc(vf); // CHECK: call <4 x float> @llvm.trunc.v4f32(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.trunc.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_trunc(vd); // CHECK: call <2 x double> @llvm.trunc.v2f64(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.trunc.v2f64(<2 x double> %{{[0-9]+}}) /* vec_vor */ res_vsll = vec_vor(vsll, vsll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vsll = vec_vor(vbll, vsll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vsll = vec_vor(vsll, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_vor(vull, vull); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_vor(vbll, vull); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_vor(vull, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vbll = vec_vor(vbll, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> /* vec_xor */ res_vsll = vec_xor(vsll, vsll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vsll = vec_xor(vbll, vsll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vsll = vec_xor(vsll, vbll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_xor(vull, vull); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_xor(vbll, vull); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_xor(vull, vbll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vbll = vec_xor(vbll, vbll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_xor(vd, vd); // CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: bitcast <2 x i64> [[X1]] to <2 x double> // CHECK-LE: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: bitcast <2 x i64> [[X1]] to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_xor(vd, vbll); // CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: bitcast <2 x i64> [[X1]] to <2 x double> // CHECK-LE: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: bitcast <2 x i64> [[X1]] to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_xor(vbll, vd); // CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: bitcast <2 x i64> [[X1]] to <2 x double> // CHECK-LE: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: bitcast <2 x i64> [[X1]] to <2 x double> /* vec_vxor */ res_vsll = vec_vxor(vsll, vsll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vsll = vec_vxor(vbll, vsll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vsll = vec_vxor(vsll, vbll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_vxor(vull, vull); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_vxor(vbll, vull); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_vxor(vull, vbll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vbll = vec_vxor(vbll, vbll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vsll = vec_cts(vd, 0); // CHECK: fmul <2 x double> // CHECK: fptosi <2 x double> %{{.*}} to <2 x i64> // CHECK-LE: fmul <2 x double> // CHECK-LE: fptosi <2 x double> %{{.*}} to <2 x i64> res_vsll = vec_cts(vd, 31); // CHECK: fmul <2 x double> // CHECK: fptosi <2 x double> %{{.*}} to <2 x i64> // CHECK-LE: fmul <2 x double> // CHECK-LE: fptosi <2 x double> %{{.*}} to <2 x i64> res_vsll = vec_ctu(vd, 0); // CHECK: fmul <2 x double> // CHECK: fptoui <2 x double> %{{.*}} to <2 x i64> // CHECK-LE: fmul <2 x double> // CHECK-LE: fptoui <2 x double> %{{.*}} to <2 x i64> res_vsll = vec_ctu(vd, 31); // CHECK: fmul <2 x double> // CHECK: fptoui <2 x double> %{{.*}} to <2 x i64> // CHECK-LE: fmul <2 x double> // CHECK-LE: fptoui <2 x double> %{{.*}} to <2 x i64> res_vd = vec_ctf(vsll, 0); // CHECK: sitofp <2 x i64> %{{.*}} to <2 x double> // CHECK: fmul <2 x double> // CHECK-LE: sitofp <2 x i64> %{{.*}} to <2 x double> // CHECK-LE: fmul <2 x double> res_vd = vec_ctf(vsll, 31); // CHECK: sitofp <2 x i64> %{{.*}} to <2 x double> // CHECK: fmul <2 x double> // CHECK-LE: sitofp <2 x i64> %{{.*}} to <2 x double> // CHECK-LE: fmul <2 x double> res_vd = vec_ctf(vull, 0); // CHECK: uitofp <2 x i64> %{{.*}} to <2 x double> // CHECK: fmul <2 x double> // CHECK-LE: uitofp <2 x i64> %{{.*}} to <2 x double> // CHECK-LE: fmul <2 x double> res_vd = vec_ctf(vull, 31); // CHECK: uitofp <2 x i64> %{{.*}} to <2 x double> // CHECK: fmul <2 x double> // CHECK-LE: uitofp <2 x i64> %{{.*}} to <2 x double> // CHECK-LE: fmul <2 x double> }
void nb_kernel010nf_ppc_altivec(int * p_nri, int iinr[], int jindex[], int jjnr[], int shift[], float shiftvec[], float fshift[], int gid[], float pos[], float faction[], float charge[], float * p_facel, float * p_krf, float * p_crf, float Vc[], int type[], int * p_ntype, float vdwparam[], float Vvdw[], float * p_tabscale, float VFtab[], float invsqrta[], float dvda[], float * p_gbtabscale, float GBtab[], int * p_nthreads, int * count, void * mtx, int * outeriter, int * inneriter, float * work) { vector float ix,iy,iz,shvec; vector float nul; vector float dx,dy,dz; vector float Vvdwtot,c6,c12; vector float rinvsq,rsq,rinvsix; int n,k,ii,is3,ii3,nj0,nj1; int jnra,jnrb,jnrc,jnrd; int j3a,j3b,j3c,j3d; int nri, ntype, nouter, ninner; int ntiA,tja,tjb,tjc,tjd; #ifdef GMX_THREAD_SHM_FDECOMP int nn0, nn1; #endif nouter = 0; ninner = 0; nri = *p_nri; ntype = *p_ntype; nul=vec_zero(); #ifdef GMX_THREAD_SHM_FDECOMP nthreads = *p_nthreads; do { tMPI_Thread_mutex_lock((tMPI_Thread_mutex_t *)mtx); nn0 = *count; nn1 = nn0+(nri-nn0)/(2*nthreads)+3; *count = nn1; tMPI_Thread_mutex_unlock((tMPI_Thread_mutex_t *)mtx); if(nn1>nri) nn1=nri; for(n=nn0; (n<nn1); n++) { #if 0 } /* maintain correct indentation even with conditional left braces */ #endif #else /* without tMPI_Threads */ for(n=0;n<nri;n++) { #endif is3 = 3*shift[n]; shvec = load_xyz(shiftvec+is3); ii = iinr[n]; ii3 = 3*ii; ix = load_xyz(pos+ii3); Vvdwtot = nul; ix = vec_add(ix,shvec); nj0 = jindex[n]; nj1 = jindex[n+1]; splat_xyz_to_vectors(ix,&ix,&iy,&iz); ntiA = 2*ntype*type[ii]; for(k=nj0; k<(nj1-3); k+=4) { jnra = jjnr[k]; jnrb = jjnr[k+1]; jnrc = jjnr[k+2]; jnrd = jjnr[k+3]; j3a = 3*jnra; j3b = 3*jnrb; j3c = 3*jnrc; j3d = 3*jnrd; transpose_4_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b), load_xyz(pos+j3c), load_xyz(pos+j3d),&dx,&dy,&dz); dx = vec_sub(ix,dx); dy = vec_sub(iy,dy); dz = vec_sub(iz,dz); rsq = vec_madd(dx,dx,nul); rsq = vec_madd(dy,dy,rsq); rsq = vec_madd(dz,dz,rsq); rinvsq = do_recip(rsq); rinvsix = vec_madd(rinvsq,rinvsq,nul); rinvsix = vec_madd(rinvsix,rinvsq,nul); tja = ntiA+2*type[jnra]; tjb = ntiA+2*type[jnrb]; tjc = ntiA+2*type[jnrc]; tjd = ntiA+2*type[jnrd]; load_4_pair(vdwparam+tja,vdwparam+tjb,vdwparam+tjc,vdwparam+tjd,&c6,&c12); Vvdwtot = vec_nmsub(c6,rinvsix,Vvdwtot); Vvdwtot = vec_madd(c12, vec_madd(rinvsix,rinvsix,nul), Vvdwtot); } if(k<(nj1-1)) { jnra = jjnr[k]; jnrb = jjnr[k+1]; j3a = 3*jnra; j3b = 3*jnrb; transpose_2_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b),&dx,&dy,&dz); dx = vec_sub(ix,dx); dy = vec_sub(iy,dy); dz = vec_sub(iz,dz); rsq = vec_madd(dx,dx,nul); rsq = vec_madd(dy,dy,rsq); rsq = vec_madd(dz,dz,rsq); rinvsq = do_recip(rsq); zero_highest_2_elements_in_vector(&rinvsq); rinvsix = vec_madd(rinvsq,rinvsq,nul); rinvsix = vec_madd(rinvsix,rinvsq,nul); tja = ntiA+2*type[jnra]; tjb = ntiA+2*type[jnrb]; load_2_pair(vdwparam+tja,vdwparam+tjb,&c6,&c12); Vvdwtot = vec_nmsub(c6,rinvsix,Vvdwtot); Vvdwtot = vec_madd(c12, vec_madd(rinvsix,rinvsix,nul), Vvdwtot); k += 2; } if((nj1-nj0) & 0x1) { jnra = jjnr[k]; j3a = 3*jnra; transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz); dx = vec_sub(ix,dx); dy = vec_sub(iy,dy); dz = vec_sub(iz,dz); rsq = vec_madd(dx,dx,nul); rsq = vec_madd(dy,dy,rsq); rsq = vec_madd(dz,dz,rsq); rinvsq = do_recip(rsq); zero_highest_3_elements_in_vector(&rinvsq); rinvsix = vec_madd(rinvsq,rinvsq,nul); rinvsix = vec_madd(rinvsix,rinvsq,nul); tja = ntiA+2*type[jnra]; load_1_pair(vdwparam+tja,&c6,&c12); Vvdwtot = vec_nmsub(c6,rinvsix,Vvdwtot); Vvdwtot = vec_madd(c12, vec_madd(rinvsix,rinvsix,nul), Vvdwtot); } /* update outer data */ add_vector_to_float(Vvdw+gid[n],Vvdwtot); ninner += nj1 - nj0; } #ifdef GMX_THREAD_SHM_FDECOMP nouter += nn1 - nn0; } while (nn1<nri); #else nouter = nri; #endif *outeriter = nouter; *inneriter = ninner; }
int nri, ntype, nouter, ninner; int tja,tjb,tjc,tjd; #ifdef GMX_THREADS int nn0, nn1; #endif nouter = 0; ninner = 0; nri = *p_nri; ntype = *p_ntype; nul=vec_zero(); vfacel=load_float_and_splat(p_facel); vkrf=load_float_and_splat(p_krf); vcrf=load_float_and_splat(p_crf); ii = iinr[0]; iqO = vec_madd(load_float_and_splat(charge+ii),vfacel,nul); iqH = vec_madd(load_float_and_splat(charge+ii+1),vfacel,nul); ntiA = 2*ntype*type[ii]; #ifdef GMX_THREADS nthreads = *p_nthreads; do { gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx); nn0 = *count; nn1 = nn0+(nri-nn0)/(2*nthreads)+3; *count = nn1; gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx); if(nn1>nri) nn1=nri; for(n=nn0; (n<nn1); n++) { #if 0 } /* maintain correct indentation even with conditional left braces */