/* Store a vector to an unaligned location in memory */
static inline void
StoreUnaligned (vector unsigned char v, const guchar *where)
{
  if ((unsigned long)where & 0x0f)
    {
      /* Load the surrounding area */
      vector unsigned char low = vec_ld(0, where);
      vector unsigned char high = vec_ld(16, where);
      /* Prepare the constants that we need */
      vector unsigned char permuteVector = vec_lvsr(0, where);
      vector signed char oxFF = vec_splat_s8(-1);
      vector signed char ox00 = vec_splat_s8(0);
      /* Make a mask for which parts of the vectors to swap out */
      vector unsigned char mask = (vector unsigned char)vec_perm(ox00, oxFF, permuteVector);
      v = vec_perm(v, v, permuteVector);
      /* Insert our data into the low and high vectors */
      low = vec_sel(low, v, mask);
      high = vec_sel(v, high, mask);
      /* Store the two aligned result vectors */
      vec_st(low, 0, CONST_BUFFER(where));
      vec_st(high, 16, CONST_BUFFER(where));
    }
  else
    {
      /* prevent overflow */
      vec_st(v, 0, CONST_BUFFER(where));
    }
}
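The complementary unaligned load needs no read-modify-write: the two aligned blocks that cover the wanted bytes are fetched and shifted together with vec_perm. A minimal sketch under the same conventions (the helper name is illustrative, not from the original source):

/* Load a vector from an unaligned location in memory (illustrative sketch) */
static inline vector unsigned char
LoadUnaligned (const guchar *where)
{
  /* vec_ld ignores the low 4 address bits, so these two loads fetch the
     aligned blocks containing the first and the last byte we need */
  vector unsigned char low  = vec_ld(0, where);
  vector unsigned char high = vec_ld(15, where);
  /* vec_lvsl builds the permute that shifts the wanted 16 bytes together */
  return vec_perm(low, high, vec_lvsl(0, where));
}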
/* Exercises the vector bool long long / vector double overloads; the operands
   (vbla, vblb, vblc, vulc, vsla, vula) are assumed to be declared elsewhere. */
void foo (vector bool long long *vblr, vector double *vdr)
{
  *vblr++ = vec_andc (vbla, vblb);
  *vdr++ = vec_double (vsla);
  *vdr++ = vec_double (vula);
  *vblr++ = vec_mergeh (vbla, vblb);
  *vblr++ = vec_mergel (vbla, vblb);
  *vblr++ = vec_nor (vbla, vblb);
  *vblr++ = vec_or (vbla, vblb);
  *vblr++ = vec_sel (vbla, vblb, vblc);
  *vblr++ = vec_sel (vbla, vblb, vulc);
  *vblr++ = vec_xor (vbla, vblb);
}
void test_select(void)
{
  vector float firstvector = {1.0, 2.0, 3.0, 4.0};
  vector float secondvector = {100.0, 200.0, 300.0, 400.0};
  vector unsigned int selector = {0, 0, 0, 0xffffffff};
  vector float outputvector;
  /* vec_st stores to a 16-byte-aligned address, so keep the scratch array aligned */
  float printfloat[FLOAT_ARRAYSIZE] __attribute__((aligned(16)));
  int i;

  fprintf(stderr, "--- function %s ------\n", __FUNCTION__);

  /* I'm expecting the output here to be {1.0, 2.0, 3.0, 400.0}.       */
  /* The selector has zeros for all bits of the first three elements,  */
  /* so those elements get their bits from firstvector.  Since the     */
  /* selector is all ones for the fourth element, the fourth element   */
  /* of outputvector gets the fourth element of secondvector.          */
  outputvector = vec_sel(firstvector, secondvector, selector);

  vec_st(outputvector, 0, printfloat);
  fprintf(stderr, "selected vector: ");
  for(i = 0; i < FLOAT_ARRAYSIZE; i++) {
    fprintf(stderr, "%f ", printfloat[i]);
  }
  fprintf(stderr, "\n");
} /* test_select */
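For reference, vec_sel is a pure bitwise select: each result bit comes from the first operand where the mask bit is 0 and from the second operand where it is 1. A scalar sketch of one 32-bit lane (illustrative, not part of the original test):

/* Scalar model of one 32-bit lane of vec_sel(a, b, mask) */
static unsigned int sel_u32(unsigned int a, unsigned int b, unsigned int mask)
{
    return (a & ~mask) | (b & mask);
}

With the selector {0, 0, 0, 0xffffffff} above, the first three lanes are taken entirely from firstvector and the last lane entirely from secondvector, giving {1.0, 2.0, 3.0, 400.0}.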
__SIMDd _SIMD_sel_pd(__SIMDd a, __SIMDd b, void** resultPtr)
{
#ifdef USE_SSE
  __SIMDd* result = (__SIMDd*) (*resultPtr);
  /* bitwise select: take a where the mask is 0, b where the mask is 1 */
  return _mm_or_pd(_mm_andnot_pd(*result,a),_mm_and_pd(*result,b));
#elif defined USE_AVX
  /* note: this branch casts resultPtr itself, unlike the SSE branch, which
     dereferences it first; the two cannot both match the caller's contract */
  __SIMDd* result = (__SIMDd*) resultPtr;
  return _mm256_or_pd(_mm256_andnot_pd(*result,a),_mm256_and_pd(*result,b));
#elif defined USE_IBM
  /* mirror the SSE branch: the select mask arrives via resultPtr (the original
     referenced an undeclared 'c' here); the cast assumes __SIMDd is vector double */
  __SIMDd* result = (__SIMDd*) (*resultPtr);
  return vec_sel(a, b, (vector unsigned long long)*result);
#endif
}
/* Fragment: width, x, destRow, selectMask and store are assumed to be set up
   by surrounding code that is not shown. */
void foo( float scalar)
{
  unsigned long width;
  unsigned long x;
  vector float vColor;
  vector unsigned int selectMask;

  /* Pull the (possibly unaligned) scalar into element 0 of vColor; a
     vec_splat( vColor, 0 ) would replicate it across all four elements. */
  vColor = vec_perm( vec_ld( 0, &scalar), vec_ld( 3, &scalar), vec_lvsl( 0, &scalar) );

  float *destRow;
  vector float store, load0;

  for( ; x < width; x++)
  {
    /* blend the color into the previously loaded data ... */
    load0 = vec_sel( vColor, load0, selectMask );
    /* ... and store the result carried over from the last iteration */
    vec_st( store, 0, destRow );
    store = load0;
  }
}
/* Computes out[i] = |in[i]| four floats at a time; in, out and SIZE are
   assumed to be defined elsewhere with 16-byte-aligned storage. */
int main(int argc, char **argv)
{
  int i;
  __vector float *vin = (__vector float *) in;
  __vector float *vout = (__vector float *) out;
  __vector float vin_negative;
  __vector unsigned int vpat;
  __vector float vzero = (__vector float) { 0.0f, 0.0f, 0.0f, 0.0f };
  __vector float vminus = (__vector float) { -1.0f, -1.0f, -1.0f, -1.0f };

  for (i = 0; i < SIZE/4; i++) {
    vpat = vec_cmpgt(vin[i], vzero);                 /* element-wise in[i] > 0    */
    vin_negative = vec_madd(vin[i], vminus, vzero);  /* -in[i]                    */
    vout[i] = vec_sel(vin_negative, vin[i], vpat);   /* keep in[i] where positive */
  }

  for (i = 0; i < SIZE; i++) {
    printf("out[%02d]=%0.0f\n", i, out[i]);
  }
  return 0;
}
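Since the loop is just an element-wise absolute value, the same result is available from the vec_abs intrinsic; a sketch of the alternative loop body (not part of the original program):

for (i = 0; i < SIZE/4; i++)
    vout[i] = vec_abs(vin[i]);   /* |in[i]| directly, no compare/select needed */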
/* Builds a 128-bit bit permutation out of byte permutes, per-element rotates
   and selects; the controlN, rotateN and selectN vectors are precomputed
   constants defined elsewhere. */
static vector unsigned char permute_128(vector unsigned char input)
{
    vector unsigned char result, new_bit;

    /* and now the code */
    result = vec_vperm(input, input, control1);
    result = vec_rl(result, rotate1);

    new_bit = vec_vperm(input, input, control2);
    new_bit = vec_rl(new_bit, rotate2);
    result = vec_sel(result, new_bit, select2);

    new_bit = vec_vperm(input, input, control3);
    new_bit = vec_rl(new_bit, rotate3);
    result = vec_sel(result, new_bit, select3);

    new_bit = vec_vperm(input, input, control4);
    new_bit = vec_rl(new_bit, rotate4);
    result = vec_sel(result, new_bit, select4);

    new_bit = vec_vperm(input, input, control5);
    new_bit = vec_rl(new_bit, rotate5);
    result = vec_sel(result, new_bit, select5);

    new_bit = vec_vperm(input, input, control6);
    new_bit = vec_rl(new_bit, rotate6);
    result = vec_sel(result, new_bit, select6);

    new_bit = vec_vperm(input, input, control7);
    new_bit = vec_rl(new_bit, rotate7);
    result = vec_sel(result, new_bit, select7);

    new_bit = vec_vperm(input, input, control8);
    new_bit = vec_rl(new_bit, rotate8);
    result = vec_sel(result, new_bit, select8);

    return result;
}
/* this code assume stride % 16 == 0 */ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1); POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); register int i; const vector signed int vzero = vec_splat_s32(0); const vector unsigned char perm = vec_lvsl(0, src); const vector signed short v20ss = (const vector signed short)AVV(20); const vector unsigned short v5us = vec_splat_u16(5); const vector signed short v5ss = vec_splat_s16(5); const vector signed short v16ss = (const vector signed short)AVV(16); const vector unsigned char dstperm = vec_lvsr(0, dst); const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1); const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm); uint8_t *srcbis = src - (srcStride * 2); const vector unsigned char srcM2a = vec_ld(0, srcbis); const vector unsigned char srcM2b = vec_ld(16, srcbis); const vector unsigned char srcM2 = vec_perm(srcM2a, srcM2b, perm); srcbis += srcStride; const vector unsigned char srcM1a = vec_ld(0, srcbis); const vector unsigned char srcM1b = vec_ld(16, srcbis); const vector unsigned char srcM1 = vec_perm(srcM1a, srcM1b, perm); srcbis += srcStride; const vector unsigned char srcP0a = vec_ld(0, srcbis); const vector unsigned char srcP0b = vec_ld(16, srcbis); const vector unsigned char srcP0 = vec_perm(srcP0a, srcP0b, perm); srcbis += srcStride; const vector unsigned char srcP1a = vec_ld(0, srcbis); const vector unsigned char srcP1b = vec_ld(16, srcbis); const vector unsigned char srcP1 = vec_perm(srcP1a, srcP1b, perm); srcbis += srcStride; const vector unsigned char srcP2a = vec_ld(0, srcbis); const vector unsigned char srcP2b = vec_ld(16, srcbis); const vector unsigned char srcP2 = vec_perm(srcP2a, srcP2b, perm); srcbis += srcStride; vector signed short srcM2ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2); vector signed short srcM2ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2); vector signed short srcM1ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1); vector signed short srcM1ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1); vector signed short srcP0ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0); vector signed short srcP0ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0); vector signed short srcP1ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1); vector signed short srcP1ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1); vector signed short srcP2ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2); vector signed short srcP2ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP2); for (i = 0 ; i < 16 ; i++) { const vector unsigned char srcP3a = vec_ld(0, srcbis); const vector unsigned char srcP3b = vec_ld(16, srcbis); const vector unsigned char srcP3 = vec_perm(srcP3a, srcP3b, perm); const vector signed short srcP3ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3); const vector signed short srcP3ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3); srcbis += srcStride; const vector signed short sum1A = vec_adds(srcP0ssA, srcP1ssA); const vector signed short sum1B = vec_adds(srcP0ssB, srcP1ssB); const vector signed short sum2A = vec_adds(srcM1ssA, srcP2ssA); const vector signed 
short sum2B = vec_adds(srcM1ssB, srcP2ssB); const vector signed short sum3A = vec_adds(srcM2ssA, srcP3ssA); const vector signed short sum3B = vec_adds(srcM2ssB, srcP3ssB); srcM2ssA = srcM1ssA; srcM2ssB = srcM1ssB; srcM1ssA = srcP0ssA; srcM1ssB = srcP0ssB; srcP0ssA = srcP1ssA; srcP0ssB = srcP1ssB; srcP1ssA = srcP2ssA; srcP1ssB = srcP2ssB; srcP2ssA = srcP3ssA; srcP2ssB = srcP3ssB; const vector signed short pp1A = vec_mladd(sum1A, v20ss, v16ss); const vector signed short pp1B = vec_mladd(sum1B, v20ss, v16ss); const vector signed short pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero); const vector signed short pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero); const vector signed short pp3A = vec_add(sum3A, pp1A); const vector signed short pp3B = vec_add(sum3B, pp1B); const vector signed short psumA = vec_sub(pp3A, pp2A); const vector signed short psumB = vec_sub(pp3B, pp2B); const vector signed short sumA = vec_sra(psumA, v5us); const vector signed short sumB = vec_sra(psumB, v5us); const vector unsigned char sum = vec_packsu(sumA, sumB); const vector unsigned char dst1 = vec_ld(0, dst); const vector unsigned char dst2 = vec_ld(16, dst); const vector unsigned char vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst)); vector unsigned char fsum; OP_U8_ALTIVEC(fsum, sum, vdst); const vector unsigned char rsum = vec_perm(fsum, fsum, dstperm); const vector unsigned char fdst1 = vec_sel(dst1, rsum, dstmask); const vector unsigned char fdst2 = vec_sel(rsum, dst2, dstmask); vec_st(fdst1, 0, dst); vec_st(fdst2, 16, dst); dst += dstStride; } POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); }
/* this code assume stride % 16 == 0 */ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1); POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); register int i; const vector signed int vzero = vec_splat_s32(0); const vector unsigned char permM2 = vec_lvsl(-2, src); const vector unsigned char permM1 = vec_lvsl(-1, src); const vector unsigned char permP0 = vec_lvsl(+0, src); const vector unsigned char permP1 = vec_lvsl(+1, src); const vector unsigned char permP2 = vec_lvsl(+2, src); const vector unsigned char permP3 = vec_lvsl(+3, src); const vector signed short v20ss = (const vector signed short)AVV(20); const vector unsigned short v5us = vec_splat_u16(5); const vector signed short v5ss = vec_splat_s16(5); const vector signed short v16ss = (const vector signed short)AVV(16); const vector unsigned char dstperm = vec_lvsr(0, dst); const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1); const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm); register int align = ((((unsigned long)src) - 2) % 16); for (i = 0 ; i < 16 ; i ++) { vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; vector unsigned char srcR1 = vec_ld(-2, src); vector unsigned char srcR2 = vec_ld(14, src); switch (align) { default: { srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = vec_perm(srcR1, srcR2, permM1); srcP0 = vec_perm(srcR1, srcR2, permP0); srcP1 = vec_perm(srcR1, srcR2, permP1); srcP2 = vec_perm(srcR1, srcR2, permP2); srcP3 = vec_perm(srcR1, srcR2, permP3); } break; case 11: { srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = vec_perm(srcR1, srcR2, permM1); srcP0 = vec_perm(srcR1, srcR2, permP0); srcP1 = vec_perm(srcR1, srcR2, permP1); srcP2 = vec_perm(srcR1, srcR2, permP2); srcP3 = srcR2; } break; case 12: { vector unsigned char srcR3 = vec_ld(30, src); srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = vec_perm(srcR1, srcR2, permM1); srcP0 = vec_perm(srcR1, srcR2, permP0); srcP1 = vec_perm(srcR1, srcR2, permP1); srcP2 = srcR2; srcP3 = vec_perm(srcR2, srcR3, permP3); } break; case 13: { vector unsigned char srcR3 = vec_ld(30, src); srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = vec_perm(srcR1, srcR2, permM1); srcP0 = vec_perm(srcR1, srcR2, permP0); srcP1 = srcR2; srcP2 = vec_perm(srcR2, srcR3, permP2); srcP3 = vec_perm(srcR2, srcR3, permP3); } break; case 14: { vector unsigned char srcR3 = vec_ld(30, src); srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = vec_perm(srcR1, srcR2, permM1); srcP0 = srcR2; srcP1 = vec_perm(srcR2, srcR3, permP1); srcP2 = vec_perm(srcR2, srcR3, permP2); srcP3 = vec_perm(srcR2, srcR3, permP3); } break; case 15: { vector unsigned char srcR3 = vec_ld(30, src); srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = srcR2; srcP0 = vec_perm(srcR2, srcR3, permP0); srcP1 = vec_perm(srcR2, srcR3, permP1); srcP2 = vec_perm(srcR2, srcR3, permP2); srcP3 = vec_perm(srcR2, srcR3, permP3); } break; } const vector signed short srcP0A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0); const vector signed short srcP0B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0); const vector signed short srcP1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1); const vector signed short srcP1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1); const vector signed short srcP2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2); const 
vector signed short srcP2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP2); const vector signed short srcP3A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3); const vector signed short srcP3B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3); const vector signed short srcM1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1); const vector signed short srcM1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1); const vector signed short srcM2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2); const vector signed short srcM2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2); const vector signed short sum1A = vec_adds(srcP0A, srcP1A); const vector signed short sum1B = vec_adds(srcP0B, srcP1B); const vector signed short sum2A = vec_adds(srcM1A, srcP2A); const vector signed short sum2B = vec_adds(srcM1B, srcP2B); const vector signed short sum3A = vec_adds(srcM2A, srcP3A); const vector signed short sum3B = vec_adds(srcM2B, srcP3B); const vector signed short pp1A = vec_mladd(sum1A, v20ss, v16ss); const vector signed short pp1B = vec_mladd(sum1B, v20ss, v16ss); const vector signed short pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero); const vector signed short pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero); const vector signed short pp3A = vec_add(sum3A, pp1A); const vector signed short pp3B = vec_add(sum3B, pp1B); const vector signed short psumA = vec_sub(pp3A, pp2A); const vector signed short psumB = vec_sub(pp3B, pp2B); const vector signed short sumA = vec_sra(psumA, v5us); const vector signed short sumB = vec_sra(psumB, v5us); const vector unsigned char sum = vec_packsu(sumA, sumB); const vector unsigned char dst1 = vec_ld(0, dst); const vector unsigned char dst2 = vec_ld(16, dst); const vector unsigned char vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst)); vector unsigned char fsum; OP_U8_ALTIVEC(fsum, sum, vdst); const vector unsigned char rsum = vec_perm(fsum, fsum, dstperm); const vector unsigned char fdst1 = vec_sel(dst1, rsum, dstmask); const vector unsigned char fdst2 = vec_sel(rsum, dst2, dstmask); vec_st(fdst1, 0, dst); vec_st(fdst2, 16, dst); src += srcStride; dst += dstStride; } POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); }
/* GEMV with the following assumptions: * 1) alpha = 1 * 2) beta = 0 or 1 (controlled by BETA0/BETA1) * 3) incX = 1 and incY = 1 * 4) Column-major storage of A * 4) A is stored column major but is transposed. * * y = [0,1]*y + A*x, len(X) = M, len(Y) = N * At is MxN and is the transpose of A (NxM) */ void ATL_UGEMV( const int M, const int N, const TYPE *At, const int lda, const TYPE *X, TYPE *Y) { int i, j; const long int mu = VEC_STEP*3; /* Unrolling used for the X vector */ const long int nu = 8; /* Unrolling used for the Y vector */ vector TYPE vzero = vec_splats( ((TYPE) 0.0) ); long int M1 = (M/mu)*mu; long int N1 = (N/nu)*nu; TYPE *py = &Y[0]; for (i=0; i < N1; i+=nu) { vector TYPE vy0, vy1, vy2, vy3; vector TYPE vy4, vy5, vy6, vy7; { #if defined BETA0 vy0 = vzero; vy1 = vzero; vy2 = vzero; vy3 = vzero; vy4 = vzero; vy5 = vzero; vy6 = vzero; vy7 = vzero; #else /* BETA1 */ vector TYPE vt0, vt1; vt0 = *((vector TYPE *)(py+0*VEC_STEP)); vt1 = *((vector TYPE *)(py+1*VEC_STEP)); #ifdef DREAL vector TYPE vt2, vt3; vt2 = *((vector TYPE *)(py+2*VEC_STEP)); vt3 = *((vector TYPE *)(py+3*VEC_STEP)); #endif #ifdef DREAL vy0 = vec_mergeh(vt0, vzero); vy1 = vec_mergel(vt0, vzero); vy2 = vec_mergeh(vt1, vzero); vy3 = vec_mergel(vt1, vzero); vy4 = vec_mergeh(vt2, vzero); vy5 = vec_mergel(vt2, vzero); vy6 = vec_mergeh(vt3, vzero); vy7 = vec_mergel(vt3, vzero); #else /* SREAL */ vy0 = vec_sel(vt0, vzero, ELEM0); vy1 = vec_sel(vt0, vzero, ELEM1); vy2 = vec_sel(vt0, vzero, ELEM2); vy3 = vec_sel(vt0, vzero, ELEM3); vy4 = vec_sel(vt1, vzero, ELEM0); vy5 = vec_sel(vt1, vzero, ELEM1); vy6 = vec_sel(vt1, vzero, ELEM2); vy7 = vec_sel(vt1, vzero, ELEM3); #endif #endif } const TYPE *pa0 = &At[(i+0)*lda]; const TYPE *pa1 = &At[(i+1)*lda]; const TYPE *pa2 = &At[(i+2)*lda]; const TYPE *pa3 = &At[(i+3)*lda]; const TYPE *pa4 = &At[(i+4)*lda]; const TYPE *pa5 = &At[(i+5)*lda]; const TYPE *pa6 = &At[(i+6)*lda]; const TYPE *pa7 = &At[(i+7)*lda]; const TYPE *px = &X[0]; for (j=0; j < M1; j+=mu) { vector TYPE va00, va01, va02, va03; vector TYPE va04, va05, va06, va07; vector TYPE va10, va11, va12, va13; vector TYPE va14, va15, va16, va17; vector TYPE va20, va21, va22, va23; vector TYPE va24, va25, va26, va27; vector TYPE vx0, vx1, vx2; vx0 = *((vector TYPE *)( px+0*VEC_STEP )); vx1 = *((vector TYPE *)( px+1*VEC_STEP )); vx2 = *((vector TYPE *)( px+2*VEC_STEP )); px += mu; va00 = *((vector TYPE *)( pa0+0*VEC_STEP )); va01 = *((vector TYPE *)( pa1+0*VEC_STEP )); va02 = *((vector TYPE *)( pa2+0*VEC_STEP )); va03 = *((vector TYPE *)( pa3+0*VEC_STEP )); va04 = *((vector TYPE *)( pa4+0*VEC_STEP )); va05 = *((vector TYPE *)( pa5+0*VEC_STEP )); va06 = *((vector TYPE *)( pa6+0*VEC_STEP )); va07 = *((vector TYPE *)( pa7+0*VEC_STEP )); vy0 = vec_madd(va00, vx0, vy0); vy1 = vec_madd(va01, vx0, vy1); vy2 = vec_madd(va02, vx0, vy2); vy3 = vec_madd(va03, vx0, vy3); vy4 = vec_madd(va04, vx0, vy4); vy5 = vec_madd(va05, vx0, vy5); vy6 = vec_madd(va06, vx0, vy6); vy7 = vec_madd(va07, vx0, vy7); va10 = *((vector TYPE *)( pa0+1*VEC_STEP )); va11 = *((vector TYPE *)( pa1+1*VEC_STEP )); va12 = *((vector TYPE *)( pa2+1*VEC_STEP )); va13 = *((vector TYPE *)( pa3+1*VEC_STEP )); va14 = *((vector TYPE *)( pa4+1*VEC_STEP )); va15 = *((vector TYPE *)( pa5+1*VEC_STEP )); va16 = *((vector TYPE *)( pa6+1*VEC_STEP )); va17 = *((vector TYPE *)( pa7+1*VEC_STEP )); vy0 = vec_madd(va10, vx1, vy0); vy1 = vec_madd(va11, vx1, vy1); vy2 = vec_madd(va12, vx1, vy2); vy3 = vec_madd(va13, vx1, vy3); vy4 = vec_madd(va14, vx1, vy4); vy5 = vec_madd(va15, vx1, vy5); vy6 
= vec_madd(va16, vx1, vy6); vy7 = vec_madd(va17, vx1, vy7); va20 = *((vector TYPE *)( pa0+2*VEC_STEP )); va21 = *((vector TYPE *)( pa1+2*VEC_STEP )); va22 = *((vector TYPE *)( pa2+2*VEC_STEP )); va23 = *((vector TYPE *)( pa3+2*VEC_STEP )); va24 = *((vector TYPE *)( pa4+2*VEC_STEP )); va25 = *((vector TYPE *)( pa5+2*VEC_STEP )); va26 = *((vector TYPE *)( pa6+2*VEC_STEP )); va27 = *((vector TYPE *)( pa7+2*VEC_STEP )); vy0 = vec_madd(va20, vx2, vy0); vy1 = vec_madd(va21, vx2, vy1); vy2 = vec_madd(va22, vx2, vy2); vy3 = vec_madd(va23, vx2, vy3); vy4 = vec_madd(va24, vx2, vy4); vy5 = vec_madd(va25, vx2, vy5); vy6 = vec_madd(va26, vx2, vy6); vy7 = vec_madd(va27, vx2, vy7); pa0 += mu; pa1 += mu; pa2 += mu; pa3 += mu; pa4 += mu; pa5 += mu; pa6 += mu; pa7 += mu; } { #ifdef DREAL vector TYPE vt0, vt1, vt2, vt3; vector TYPE vt4, vt5, vt6, vt7; vt0 = vec_mergeh(vy0, vy1); vt1 = vec_mergel(vy0, vy1); vt2 = vec_mergeh(vy2, vy3); vt3 = vec_mergel(vy2, vy3); vt4 = vec_mergeh(vy4, vy5); vt5 = vec_mergel(vy4, vy5); vt6 = vec_mergeh(vy6, vy7); vt7 = vec_mergel(vy6, vy7); vy0 = vec_add(vt0, vt1); vy2 = vec_add(vt2, vt3); vy4 = vec_add(vt4, vt5); vy6 = vec_add(vt6, vt7); *((vector TYPE *)(py+0*VEC_STEP)) = vy0; *((vector TYPE *)(py+1*VEC_STEP)) = vy2; *((vector TYPE *)(py+2*VEC_STEP)) = vy4; *((vector TYPE *)(py+3*VEC_STEP)) = vy6; #else /* SREAL */ vy0 = vec_reduce(vy0); vy1 = vec_reduce(vy1); vy2 = vec_reduce(vy2); vy3 = vec_reduce(vy3); vy4 = vec_reduce(vy4); vy5 = vec_reduce(vy5); vy6 = vec_reduce(vy6); vy7 = vec_reduce(vy7); vec_ste(vy0,0,py+0); vec_ste(vy1,0,py+1); vec_ste(vy2,0,py+2); vec_ste(vy3,0,py+3); vec_ste(vy4,0,py+4); vec_ste(vy5,0,py+5); vec_ste(vy6,0,py+6); vec_ste(vy7,0,py+7); #endif } { register TYPE y0, y1, y2, y3; register TYPE y4, y5, y6, y7; y0 = *(py+0); y1 = *(py+1); y2 = *(py+2); y3 = *(py+3); y4 = *(py+4); y5 = *(py+5); y6 = *(py+6); y7 = *(py+7); for (j=M1; j < M; j++) { y0 += *pa0 * X[j]; y1 += *pa1 * X[j]; y2 += *pa2 * X[j]; y3 += *pa3 * X[j]; y4 += *pa4 * X[j]; y5 += *pa5 * X[j]; y6 += *pa6 * X[j]; y7 += *pa7 * X[j]; pa0++; pa1++; pa2++; pa3++; pa4++; pa5++; pa6++; pa7++; } *(py+0) = y0; *(py+1) = y1; *(py+2) = y2; *(py+3) = y3; *(py+4) = y4; *(py+5) = y5; *(py+6) = y6; *(py+7) = y7; } py += nu; } for (i=N1; i < N; i++) { register TYPE y0; #if defined BETA0 y0 = 0.0; #else /* BETA1 */ y0 = Y[i]; #endif for (j=0; j < M; j++) y0 += At[j+i*lda] * X[j]; Y[i] = y0; } }
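As a readability aid, here is a scalar reference of the computation the kernel above performs (a sketch, not part of ATLAS; the _ref name is illustrative and TYPE/BETA0/BETA1 follow the same conventions as the kernel):

void ATL_UGEMV_ref(const int M, const int N, const TYPE *At, const int lda,
                   const TYPE *X, TYPE *Y)
{
   int i, j;
   for (i=0; i < N; i++)
   {
#if defined BETA0
      TYPE y0 = 0.0;                    /* beta == 0: discard the old Y */
#else /* BETA1 */
      TYPE y0 = Y[i];                   /* beta == 1: accumulate onto Y */
#endif
      for (j=0; j < M; j++)
         y0 += At[j+i*lda] * X[j];      /* column i of At is row i of A */
      Y[i] = y0;
   }
}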
/* SHA-2 majority function, Maj(b,c,d) = (b&c) ^ (b&d) ^ (c&d), done with a
   single select: where b and d agree the majority is b (mask bit 0 keeps b);
   where they disagree, c casts the deciding vote (mask bit 1 picks c). */
static inline vector unsigned int Maj(const vector unsigned int b,
                                      const vector unsigned int c,
                                      const vector unsigned int d)
{
    return vec_sel(b, c, vec_xor(b, d));
}
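The other SHA-2 chooser, Ch, collapses to a single select in the same way; a sketch with the same operand types (not taken from the original source):

/* Ch(e,f,g) = (e & f) ^ (~e & g): take f where e has 1-bits, g elsewhere */
static inline vector unsigned int Ch(const vector unsigned int e,
                                     const vector unsigned int f,
                                     const vector unsigned int g)
{
    return vec_sel(g, f, e);
}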
/* a vectorized version of the Voigt function using Altivec / VMX instructions */ void my_voigt(const float *damping, const float *frequency_offset, float *voigt_value, int N) { // coefficients of the rational approximation formula // to the complementary error function const vector float A0 = (vector float) (122.607931777104326f); const vector float A1 = (vector float) (214.382388694706425f); const vector float A2 = (vector float) (181.928533092181549f); const vector float A3 = (vector float) (93.155580458138441f); const vector float A4 = (vector float) (30.180142196210589f); const vector float A5 = (vector float) (5.912626209773153f); const vector float A6 = (vector float) (0.564189583562615f); const vector float B0 = (vector float) (122.60793177387535f); const vector float B1 = (vector float) (352.730625110963558f); const vector float B2 = (vector float) (457.334478783897737f); const vector float B3 = (vector float) (348.703917719495792f); const vector float B4 = (vector float) (170.354001821091472f); const vector float B5 = (vector float) (53.992906912940207f); const vector float B6 = (vector float) (10.479857114260399f); vector float ivsigno; vector float V; vector float Z1_real; vector float Z1_imag; vector float Z2_real; vector float Z2_imag; vector float Z3_real; vector float Z3_imag; vector float Z4_real; vector float Z4_imag; vector float Z5_real; vector float Z5_imag; vector float Z6_real; vector float Z6_imag; vector float ZZ1_real; vector float ZZ1_imag; vector float ZZ2_real; vector float ZZ2_imag; vector float ZZ3_real; vector float ZZ3_imag; vector float ZZ4_real; vector float ZZ4_imag; vector float ZZ5_real; vector float ZZ5_imag; vector float ZZ6_real; vector float ZZ6_imag; vector float ZZ7_real; vector float ZZ7_imag; vector float division_factor; vector float ZZZ_real; vector bool int mask; const vector float one = (vector float) (1.0f); const vector float zero = (vector float) (-0.0f); const vector float mone = (vector float) (-1.0f); vector float damp; vector float offs; for(int i=0; i<N; i+=4){ damp = vec_ld(0,(float *) &damping[i]); offs = vec_ld(0,(float *) &frequency_offset[i]); mask = vec_cmplt(offs,zero); ivsigno = vec_sel(mone, one, mask); //ivsigno = (vector float) (1.0f); V = vec_madd(ivsigno, offs, zero); Z1_real = vec_madd(A6, damp, A5); Z1_imag = vec_nmsub(A6, V, zero); Z2_real = vec_add(vec_madd(Z1_real,damp,zero),vec_madd(Z1_imag,V,A4)); Z2_imag = vec_add(vec_nmsub(Z1_real,V,zero),vec_madd(Z1_imag,damp,zero)); Z3_real = vec_add(vec_madd(Z2_real,damp,zero),vec_madd(Z2_imag,V,A3)); Z3_imag = vec_add(vec_nmsub(Z2_real,V,zero),vec_madd(Z2_imag,damp,zero)); Z4_real = vec_add(vec_madd(Z3_real,damp,zero),vec_madd(Z3_imag,V,A2)); Z4_imag = vec_add(vec_nmsub(Z3_real,V,zero),vec_madd(Z3_imag,damp,zero)); Z5_real = vec_add(vec_madd(Z4_real,damp,zero),vec_madd(Z4_imag,V,A1)); Z5_imag = vec_add(vec_nmsub(Z4_real,V,zero),vec_madd(Z4_imag,damp,zero)); Z6_real = vec_add(vec_madd(Z5_real,damp,zero),vec_madd(Z5_imag,V,A0)); Z6_imag = vec_add(vec_nmsub(Z5_real,V,zero),vec_madd(Z5_imag,damp,zero)); ZZ1_real = vec_add(damp,B6); ZZ1_imag = vec_madd(mone,V,zero); ZZ2_real = vec_add(vec_madd(ZZ1_real,damp,zero),vec_madd(ZZ1_imag,V,B5)); ZZ2_imag = vec_add(vec_nmsub(ZZ1_real,V,zero),vec_madd(ZZ1_imag,damp,zero)); ZZ3_real = vec_add(vec_madd(ZZ2_real,damp,zero),vec_madd(ZZ2_imag,V,B4)); ZZ3_imag = vec_add(vec_nmsub(ZZ2_real,V,zero),vec_madd(ZZ2_imag,damp,zero)); ZZ4_real = vec_add(vec_madd(ZZ3_real,damp,zero),vec_madd(ZZ3_imag,V,B3)); ZZ4_imag = 
vec_add(vec_nmsub(ZZ3_real,V,zero),vec_madd(ZZ3_imag,damp,zero)); ZZ5_real = vec_add(vec_madd(ZZ4_real,damp,zero),vec_madd(ZZ4_imag,V,B2)); ZZ5_imag = vec_add(vec_nmsub(ZZ4_real,V,zero),vec_madd(ZZ4_imag,damp,zero)); ZZ6_real = vec_add(vec_madd(ZZ5_real,damp,zero),vec_madd(ZZ5_imag,V,B1)); ZZ6_imag = vec_add(vec_nmsub(ZZ5_real,V,zero),vec_madd(ZZ5_imag,damp,zero)); ZZ7_real = vec_add(vec_madd(ZZ6_real,damp,zero),vec_madd(ZZ6_imag,V,B0)); ZZ7_imag = vec_add(vec_nmsub(ZZ6_real,V,zero),vec_madd(ZZ6_imag,damp,zero)); division_factor = vec_div(one,vec_madd(ZZ7_real,ZZ7_real,vec_madd(ZZ7_imag,ZZ7_imag,zero))); ZZZ_real = vec_madd(vec_madd(Z6_real,ZZ7_real,vec_madd(Z6_imag,ZZ7_imag,zero)),division_factor,zero); vec_st(ZZZ_real,0,(float *)&voigt_value[i]); } }
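For reference, each loop iteration above evaluates a degree-6 numerator and a monic degree-7 denominator by complex Horner steps in z = damping - i*V and returns the real part of their quotient. A scalar transcription of one element (an illustrative sketch only, assuming C99 <complex.h>; not part of the original routine):

#include <complex.h>

static float voigt_scalar_ref(float damping, float offs)
{
    static const float A[7] = { 122.607931777104326f, 214.382388694706425f,
                                181.928533092181549f,  93.155580458138441f,
                                 30.180142196210589f,   5.912626209773153f,
                                  0.564189583562615f };
    static const float B[7] = { 122.60793177387535f,  352.730625110963558f,
                                457.334478783897737f, 348.703917719495792f,
                                170.354001821091472f,  53.992906912940207f,
                                 10.479857114260399f };
    int k;
    float ivsigno = (offs < 0.0f) ? 1.0f : -1.0f;   /* same sign trick as the vector code */
    float V = ivsigno * offs;
    float complex z = damping - V * I;

    float complex num = A[6];                        /* A6*z^6 + ... + A0 */
    for (k = 5; k >= 0; k--) num = num * z + A[k];

    float complex den = z + B[6];                    /* z^7 + B6*z^6 + ... + B0 */
    for (k = 5; k >= 0; k--) den = den * z + B[k];

    return crealf(num / den);
}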
static int dct_quantize_altivec(MpegEncContext* s, DCTELEM* data, int n, int qscale, int* overflow) { int lastNonZero; vector float row0, row1, row2, row3, row4, row5, row6, row7; vector float alt0, alt1, alt2, alt3, alt4, alt5, alt6, alt7; const vector float zero = (const vector float)FOUROF(0.); // used after quantize step int oldBaseValue = 0; // Load the data into the row/alt vectors { vector signed short data0, data1, data2, data3, data4, data5, data6, data7; data0 = vec_ld(0, data); data1 = vec_ld(16, data); data2 = vec_ld(32, data); data3 = vec_ld(48, data); data4 = vec_ld(64, data); data5 = vec_ld(80, data); data6 = vec_ld(96, data); data7 = vec_ld(112, data); // Transpose the data before we start TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7); // load the data into floating point vectors. We load // the high half of each row into the main row vectors // and the low half into the alt vectors. row0 = vec_ctf(vec_unpackh(data0), 0); alt0 = vec_ctf(vec_unpackl(data0), 0); row1 = vec_ctf(vec_unpackh(data1), 0); alt1 = vec_ctf(vec_unpackl(data1), 0); row2 = vec_ctf(vec_unpackh(data2), 0); alt2 = vec_ctf(vec_unpackl(data2), 0); row3 = vec_ctf(vec_unpackh(data3), 0); alt3 = vec_ctf(vec_unpackl(data3), 0); row4 = vec_ctf(vec_unpackh(data4), 0); alt4 = vec_ctf(vec_unpackl(data4), 0); row5 = vec_ctf(vec_unpackh(data5), 0); alt5 = vec_ctf(vec_unpackl(data5), 0); row6 = vec_ctf(vec_unpackh(data6), 0); alt6 = vec_ctf(vec_unpackl(data6), 0); row7 = vec_ctf(vec_unpackh(data7), 0); alt7 = vec_ctf(vec_unpackl(data7), 0); } // The following block could exist as a separate an altivec dct // function. However, if we put it inline, the DCT data can remain // in the vector local variables, as floats, which we'll use during the // quantize step... 
{ const vector float vec_0_298631336 = (vector float)FOUROF(0.298631336f); const vector float vec_0_390180644 = (vector float)FOUROF(-0.390180644f); const vector float vec_0_541196100 = (vector float)FOUROF(0.541196100f); const vector float vec_0_765366865 = (vector float)FOUROF(0.765366865f); const vector float vec_0_899976223 = (vector float)FOUROF(-0.899976223f); const vector float vec_1_175875602 = (vector float)FOUROF(1.175875602f); const vector float vec_1_501321110 = (vector float)FOUROF(1.501321110f); const vector float vec_1_847759065 = (vector float)FOUROF(-1.847759065f); const vector float vec_1_961570560 = (vector float)FOUROF(-1.961570560f); const vector float vec_2_053119869 = (vector float)FOUROF(2.053119869f); const vector float vec_2_562915447 = (vector float)FOUROF(-2.562915447f); const vector float vec_3_072711026 = (vector float)FOUROF(3.072711026f); int whichPass, whichHalf; for(whichPass = 1; whichPass<=2; whichPass++) { for(whichHalf = 1; whichHalf<=2; whichHalf++) { vector float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; vector float tmp10, tmp11, tmp12, tmp13; vector float z1, z2, z3, z4, z5; tmp0 = vec_add(row0, row7); // tmp0 = dataptr[0] + dataptr[7]; tmp7 = vec_sub(row0, row7); // tmp7 = dataptr[0] - dataptr[7]; tmp3 = vec_add(row3, row4); // tmp3 = dataptr[3] + dataptr[4]; tmp4 = vec_sub(row3, row4); // tmp4 = dataptr[3] - dataptr[4]; tmp1 = vec_add(row1, row6); // tmp1 = dataptr[1] + dataptr[6]; tmp6 = vec_sub(row1, row6); // tmp6 = dataptr[1] - dataptr[6]; tmp2 = vec_add(row2, row5); // tmp2 = dataptr[2] + dataptr[5]; tmp5 = vec_sub(row2, row5); // tmp5 = dataptr[2] - dataptr[5]; tmp10 = vec_add(tmp0, tmp3); // tmp10 = tmp0 + tmp3; tmp13 = vec_sub(tmp0, tmp3); // tmp13 = tmp0 - tmp3; tmp11 = vec_add(tmp1, tmp2); // tmp11 = tmp1 + tmp2; tmp12 = vec_sub(tmp1, tmp2); // tmp12 = tmp1 - tmp2; // dataptr[0] = (DCTELEM) ((tmp10 + tmp11) << PASS1_BITS); row0 = vec_add(tmp10, tmp11); // dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS); row4 = vec_sub(tmp10, tmp11); // z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); z1 = vec_madd(vec_add(tmp12, tmp13), vec_0_541196100, (vector float)zero); // dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865), // CONST_BITS-PASS1_BITS); row2 = vec_madd(tmp13, vec_0_765366865, z1); // dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065), // CONST_BITS-PASS1_BITS); row6 = vec_madd(tmp12, vec_1_847759065, z1); z1 = vec_add(tmp4, tmp7); // z1 = tmp4 + tmp7; z2 = vec_add(tmp5, tmp6); // z2 = tmp5 + tmp6; z3 = vec_add(tmp4, tmp6); // z3 = tmp4 + tmp6; z4 = vec_add(tmp5, tmp7); // z4 = tmp5 + tmp7; // z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */ z5 = vec_madd(vec_add(z3, z4), vec_1_175875602, (vector float)zero); // z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ z3 = vec_madd(z3, vec_1_961570560, z5); // z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ z4 = vec_madd(z4, vec_0_390180644, z5); // The following adds are rolled into the multiplies above // z3 = vec_add(z3, z5); // z3 += z5; // z4 = vec_add(z4, z5); // z4 += z5; // z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ // Wow! It's actually more efficient to roll this multiply // into the adds below, even thought the multiply gets done twice! // z2 = vec_madd(z2, vec_2_562915447, (vector float)zero); // z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ // Same with this one... 
// z1 = vec_madd(z1, vec_0_899976223, (vector float)zero); // tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ // dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); row7 = vec_madd(tmp4, vec_0_298631336, vec_madd(z1, vec_0_899976223, z3)); // tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ // dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); row5 = vec_madd(tmp5, vec_2_053119869, vec_madd(z2, vec_2_562915447, z4)); // tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ // dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); row3 = vec_madd(tmp6, vec_3_072711026, vec_madd(z2, vec_2_562915447, z3)); // tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ // dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); row1 = vec_madd(z1, vec_0_899976223, vec_madd(tmp7, vec_1_501321110, z4)); // Swap the row values with the alts. If this is the first half, // this sets up the low values to be acted on in the second half. // If this is the second half, it puts the high values back in // the row values where they are expected to be when we're done. SWAP(row0, alt0); SWAP(row1, alt1); SWAP(row2, alt2); SWAP(row3, alt3); SWAP(row4, alt4); SWAP(row5, alt5); SWAP(row6, alt6); SWAP(row7, alt7); } if (whichPass == 1) { // transpose the data for the second pass // First, block transpose the upper right with lower left. SWAP(row4, alt0); SWAP(row5, alt1); SWAP(row6, alt2); SWAP(row7, alt3); // Now, transpose each block of four TRANSPOSE4(row0, row1, row2, row3); TRANSPOSE4(row4, row5, row6, row7); TRANSPOSE4(alt0, alt1, alt2, alt3); TRANSPOSE4(alt4, alt5, alt6, alt7); } } } // perform the quantize step, using the floating point data // still in the row/alt registers { const int* biasAddr; const vector signed int* qmat; vector float bias, negBias; if (s->mb_intra) { vector signed int baseVector; // We must cache element 0 in the intra case // (it needs special handling). baseVector = vec_cts(vec_splat(row0, 0), 0); vec_ste(baseVector, 0, &oldBaseValue); qmat = (vector signed int*)s->q_intra_matrix[qscale]; biasAddr = &(s->intra_quant_bias); } else { qmat = (vector signed int*)s->q_inter_matrix[qscale]; biasAddr = &(s->inter_quant_bias); } // Load the bias vector (We add 0.5 to the bias so that we're // rounding when we convert to int, instead of flooring.) 
{ vector signed int biasInt; const vector float negOneFloat = (vector float)FOUROF(-1.0f); LOAD4(biasInt, biasAddr); bias = vec_ctf(biasInt, QUANT_BIAS_SHIFT); negBias = vec_madd(bias, negOneFloat, zero); } { vector float q0, q1, q2, q3, q4, q5, q6, q7; q0 = vec_ctf(qmat[0], QMAT_SHIFT); q1 = vec_ctf(qmat[2], QMAT_SHIFT); q2 = vec_ctf(qmat[4], QMAT_SHIFT); q3 = vec_ctf(qmat[6], QMAT_SHIFT); q4 = vec_ctf(qmat[8], QMAT_SHIFT); q5 = vec_ctf(qmat[10], QMAT_SHIFT); q6 = vec_ctf(qmat[12], QMAT_SHIFT); q7 = vec_ctf(qmat[14], QMAT_SHIFT); row0 = vec_sel(vec_madd(row0, q0, negBias), vec_madd(row0, q0, bias), vec_cmpgt(row0, zero)); row1 = vec_sel(vec_madd(row1, q1, negBias), vec_madd(row1, q1, bias), vec_cmpgt(row1, zero)); row2 = vec_sel(vec_madd(row2, q2, negBias), vec_madd(row2, q2, bias), vec_cmpgt(row2, zero)); row3 = vec_sel(vec_madd(row3, q3, negBias), vec_madd(row3, q3, bias), vec_cmpgt(row3, zero)); row4 = vec_sel(vec_madd(row4, q4, negBias), vec_madd(row4, q4, bias), vec_cmpgt(row4, zero)); row5 = vec_sel(vec_madd(row5, q5, negBias), vec_madd(row5, q5, bias), vec_cmpgt(row5, zero)); row6 = vec_sel(vec_madd(row6, q6, negBias), vec_madd(row6, q6, bias), vec_cmpgt(row6, zero)); row7 = vec_sel(vec_madd(row7, q7, negBias), vec_madd(row7, q7, bias), vec_cmpgt(row7, zero)); q0 = vec_ctf(qmat[1], QMAT_SHIFT); q1 = vec_ctf(qmat[3], QMAT_SHIFT); q2 = vec_ctf(qmat[5], QMAT_SHIFT); q3 = vec_ctf(qmat[7], QMAT_SHIFT); q4 = vec_ctf(qmat[9], QMAT_SHIFT); q5 = vec_ctf(qmat[11], QMAT_SHIFT); q6 = vec_ctf(qmat[13], QMAT_SHIFT); q7 = vec_ctf(qmat[15], QMAT_SHIFT); alt0 = vec_sel(vec_madd(alt0, q0, negBias), vec_madd(alt0, q0, bias), vec_cmpgt(alt0, zero)); alt1 = vec_sel(vec_madd(alt1, q1, negBias), vec_madd(alt1, q1, bias), vec_cmpgt(alt1, zero)); alt2 = vec_sel(vec_madd(alt2, q2, negBias), vec_madd(alt2, q2, bias), vec_cmpgt(alt2, zero)); alt3 = vec_sel(vec_madd(alt3, q3, negBias), vec_madd(alt3, q3, bias), vec_cmpgt(alt3, zero)); alt4 = vec_sel(vec_madd(alt4, q4, negBias), vec_madd(alt4, q4, bias), vec_cmpgt(alt4, zero)); alt5 = vec_sel(vec_madd(alt5, q5, negBias), vec_madd(alt5, q5, bias), vec_cmpgt(alt5, zero)); alt6 = vec_sel(vec_madd(alt6, q6, negBias), vec_madd(alt6, q6, bias), vec_cmpgt(alt6, zero)); alt7 = vec_sel(vec_madd(alt7, q7, negBias), vec_madd(alt7, q7, bias), vec_cmpgt(alt7, zero)); } } // Store the data back into the original block { vector signed short data0, data1, data2, data3, data4, data5, data6, data7; data0 = vec_pack(vec_cts(row0, 0), vec_cts(alt0, 0)); data1 = vec_pack(vec_cts(row1, 0), vec_cts(alt1, 0)); data2 = vec_pack(vec_cts(row2, 0), vec_cts(alt2, 0)); data3 = vec_pack(vec_cts(row3, 0), vec_cts(alt3, 0)); data4 = vec_pack(vec_cts(row4, 0), vec_cts(alt4, 0)); data5 = vec_pack(vec_cts(row5, 0), vec_cts(alt5, 0)); data6 = vec_pack(vec_cts(row6, 0), vec_cts(alt6, 0)); data7 = vec_pack(vec_cts(row7, 0), vec_cts(alt7, 0)); { // Clamp for overflow vector signed int max_q_int, min_q_int; vector signed short max_q, min_q; LOAD4(max_q_int, &(s->max_qcoeff)); LOAD4(min_q_int, &(s->min_qcoeff)); max_q = vec_pack(max_q_int, max_q_int); min_q = vec_pack(min_q_int, min_q_int); data0 = vec_max(vec_min(data0, max_q), min_q); data1 = vec_max(vec_min(data1, max_q), min_q); data2 = vec_max(vec_min(data2, max_q), min_q); data4 = vec_max(vec_min(data4, max_q), min_q); data5 = vec_max(vec_min(data5, max_q), min_q); data6 = vec_max(vec_min(data6, max_q), min_q); data7 = vec_max(vec_min(data7, max_q), min_q); } { vector bool char zero_01, zero_23, zero_45, zero_67; vector signed char 
scanIndexes_01, scanIndexes_23, scanIndexes_45, scanIndexes_67; vector signed char negOne = vec_splat_s8(-1); vector signed char* scanPtr = (vector signed char*)(s->intra_scantable.inverse); signed char lastNonZeroChar; // Determine the largest non-zero index. zero_01 = vec_pack(vec_cmpeq(data0, (vector signed short)zero), vec_cmpeq(data1, (vector signed short)zero)); zero_23 = vec_pack(vec_cmpeq(data2, (vector signed short)zero), vec_cmpeq(data3, (vector signed short)zero)); zero_45 = vec_pack(vec_cmpeq(data4, (vector signed short)zero), vec_cmpeq(data5, (vector signed short)zero)); zero_67 = vec_pack(vec_cmpeq(data6, (vector signed short)zero), vec_cmpeq(data7, (vector signed short)zero)); // 64 biggest values scanIndexes_01 = vec_sel(scanPtr[0], negOne, zero_01); scanIndexes_23 = vec_sel(scanPtr[1], negOne, zero_23); scanIndexes_45 = vec_sel(scanPtr[2], negOne, zero_45); scanIndexes_67 = vec_sel(scanPtr[3], negOne, zero_67); // 32 largest values scanIndexes_01 = vec_max(scanIndexes_01, scanIndexes_23); scanIndexes_45 = vec_max(scanIndexes_45, scanIndexes_67); // 16 largest values scanIndexes_01 = vec_max(scanIndexes_01, scanIndexes_45); // 8 largest values scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne), vec_mergel(scanIndexes_01, negOne)); // 4 largest values scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne), vec_mergel(scanIndexes_01, negOne)); // 2 largest values scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne), vec_mergel(scanIndexes_01, negOne)); // largest value scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne), vec_mergel(scanIndexes_01, negOne)); scanIndexes_01 = vec_splat(scanIndexes_01, 0); vec_ste(scanIndexes_01, 0, &lastNonZeroChar); lastNonZero = lastNonZeroChar; // While the data is still in vectors we check for the transpose IDCT permute // and handle it using the vector unit if we can. This is the permute used // by the altivec idct, so it is common when using the altivec dct. if ((lastNonZero > 0) && (s->dsp.idct_permutation_type == FF_TRANSPOSE_IDCT_PERM)) { TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7); } vec_st(data0, 0, data); vec_st(data1, 16, data); vec_st(data2, 32, data); vec_st(data3, 48, data); vec_st(data4, 64, data); vec_st(data5, 80, data); vec_st(data6, 96, data); vec_st(data7, 112, data); } } // special handling of block[0] if (s->mb_intra) { if (!s->h263_aic) { if (n < 4) oldBaseValue /= s->y_dc_scale; else oldBaseValue /= s->c_dc_scale; } // Divide by 8, rounding the result data[0] = (oldBaseValue + 4) >> 3; } // We handled the transpose permutation above and we don't // need to permute the "no" permutation case. if ((lastNonZero > 0) && (s->dsp.idct_permutation_type != FF_TRANSPOSE_IDCT_PERM) && (s->dsp.idct_permutation_type != FF_NO_IDCT_PERM)) { ff_block_permute(data, s->dsp.idct_permutation, s->intra_scantable.scantable, lastNonZero); } return lastNonZero; }
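The sign-dependent bias in the quantize step above (vec_sel between the +bias and -bias products on vec_cmpgt(row, zero)) is there so that the truncating vec_cts conversion rounds instead of flooring toward zero, symmetrically for negative coefficients. A scalar sketch of one coefficient (illustrative only; the names are not from the original):

/* One coefficient of the biased quantize-and-truncate above */
static inline int quantize_one(float coeff, float q, float bias)
{
    float scaled = coeff * q;
    /* add the bias for positive values, subtract it otherwise, then let the
       float->int conversion truncate toward zero */
    return (int)(coeff > 0.0f ? scaled + bias : scaled - bias);
}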
void test1() { // CHECK-LABEL: define void @test1 // CHECK-LE-LABEL: define void @test1 res_vf = vec_abs(vf); // CHECK: call <4 x float> @llvm.fabs.v4f32(<4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x float> @llvm.fabs.v4f32(<4 x float> %{{[0-9]*}}) dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_add(vd, vd); // CHECK: fadd <2 x double> // CHECK-LE: fadd <2 x double> res_vd = vec_and(vbll, vd); // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> // CHECK-LE: and <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> res_vd = vec_and(vd, vbll); // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> // CHECK-LE: and <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> res_vd = vec_and(vd, vd); // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> // CHECK-LE: and <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_andc(vbll, vd); // CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> // CHECK-LE: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK-LE: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK-LE: and <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_andc(vd, vbll); // CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> // CHECK-LE: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK-LE: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK-LE: and <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() res_vd = vec_andc(vd, vd); // CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_ceil(vd); // CHECK: call <2 x double> @llvm.ceil.v2f64(<2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x double> @llvm.ceil.v2f64(<2 x double> %{{[0-9]*}}) res_vf = vec_ceil(vf); // CHECK: call <4 x float> @llvm.ceil.v4f32(<4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x float> @llvm.ceil.v4f32(<4 x float> %{{[0-9]*}}) res_vbll = vec_cmpeq(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpeqdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpeqdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmpeq(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpeqsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpeqsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) res_vbll = vec_cmpge(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmpge(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) 
res_vbll = vec_cmpgt(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmpgt(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) res_vbll = vec_cmple(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmple(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) res_vbll = vec_cmplt(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmplt(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) /* vec_cpsgn */ res_vf = vec_cpsgn(vf, vf); // CHECK: call <4 x float> @llvm.copysign.v4f32(<4 x float> %{{.+}}, <4 x float> %{{.+}}) // CHECK-LE: call <4 x float> @llvm.copysign.v4f32(<4 x float> %{{.+}}, <4 x float> %{{.+}}) res_vd = vec_cpsgn(vd, vd); // CHECK: call <2 x double> @llvm.copysign.v2f64(<2 x double> %{{.+}}, <2 x double> %{{.+}}) // CHECK-LE: call <2 x double> @llvm.copysign.v2f64(<2 x double> %{{.+}}, <2 x double> %{{.+}}) /* vec_div */ res_vsll = vec_div(vsll, vsll); // CHECK: sdiv <2 x i64> // CHECK-LE: sdiv <2 x i64> res_vull = vec_div(vull, vull); // CHECK: udiv <2 x i64> // CHECK-LE: udiv <2 x i64> res_vf = vec_div(vf, vf); // CHECK: fdiv <4 x float> // CHECK-LE: fdiv <4 x float> res_vd = vec_div(vd, vd); // CHECK: fdiv <2 x double> // CHECK-LE: fdiv <2 x double> /* vec_max */ res_vf = vec_max(vf, vf); // CHECK: @llvm.ppc.vsx.xvmaxsp // CHECK-LE: @llvm.ppc.vsx.xvmaxsp res_vd = vec_max(vd, vd); // CHECK: @llvm.ppc.vsx.xvmaxdp // CHECK-LE: @llvm.ppc.vsx.xvmaxdp res_vf = vec_vmaxfp(vf, vf); // CHECK: @llvm.ppc.vsx.xvmaxsp // CHECK-LE: @llvm.ppc.vsx.xvmaxsp /* vec_min */ res_vf = vec_min(vf, vf); // CHECK: @llvm.ppc.vsx.xvminsp // CHECK-LE: @llvm.ppc.vsx.xvminsp res_vd = vec_min(vd, vd); // CHECK: @llvm.ppc.vsx.xvmindp // CHECK-LE: @llvm.ppc.vsx.xvmindp res_vf = vec_vminfp(vf, vf); // CHECK: @llvm.ppc.vsx.xvminsp // CHECK-LE: @llvm.ppc.vsx.xvminsp res_d = __builtin_vsx_xsmaxdp(d, d); // CHECK: @llvm.ppc.vsx.xsmaxdp // CHECK-LE: @llvm.ppc.vsx.xsmaxdp res_d = __builtin_vsx_xsmindp(d, d); // CHECK: @llvm.ppc.vsx.xsmindp // CHECK-LE: @llvm.ppc.vsx.xsmindp /* vec_perm */ res_vsll = vec_perm(vsll, vsll, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_perm(vull, vull, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vbll = vec_perm(vbll, vbll, vuc); // CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> // CHECK-LE: xor <16 x i8> // CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> 
{{.+}} to <4 x i32> // CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> res_vf = vec_round(vf); // CHECK: call <4 x float> @llvm.round.v4f32(<4 x float> // CHECK-LE: call <4 x float> @llvm.round.v4f32(<4 x float> res_vd = vec_round(vd); // CHECK: call <2 x double> @llvm.round.v2f64(<2 x double> // CHECK-LE: call <2 x double> @llvm.round.v2f64(<2 x double> res_vd = vec_perm(vd, vd, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vd = vec_splat(vd, 1); // CHECK: [[T1:%.+]] = bitcast <2 x double> {{.+}} to <4 x i32> // CHECK: [[T2:%.+]] = bitcast <2 x double> {{.+}} to <4 x i32> // CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> // CHECK-LE: xor <16 x i8> // CHECK-LE: [[T1:%.+]] = bitcast <2 x double> {{.+}} to <4 x i32> // CHECK-LE: [[T2:%.+]] = bitcast <2 x double> {{.+}} to <4 x i32> // CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> res_vbll = vec_splat(vbll, 1); // CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> // CHECK-LE: xor <16 x i8> // CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> res_vsll = vec_splat(vsll, 1); // CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> // CHECK-LE: xor <16 x i8> // CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> res_vull = vec_splat(vull, 1); // CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> // CHECK-LE: xor <16 x i8> // CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> res_vsi = vec_pack(vsll, vsll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vui = vec_pack(vull, vull); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vbi = vec_pack(vbll, vbll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vsll = vec_vperm(vsll, vsll, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_vperm(vull, vull, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vd = vec_vperm(vd, vd, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm /* vec_vsx_ld */ res_vsi = vec_vsx_ld(0, &vsi); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vui = vec_vsx_ld(0, &vui); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vf = vec_vsx_ld (0, &vf); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vsll = vec_vsx_ld(0, &vsll); 
// CHECK: @llvm.ppc.vsx.lxvd2x // CHECK-LE: @llvm.ppc.vsx.lxvd2x res_vull = vec_vsx_ld(0, &vull); // CHECK: @llvm.ppc.vsx.lxvd2x // CHECK-LE: @llvm.ppc.vsx.lxvd2x res_vd = vec_vsx_ld(0, &vd); // CHECK: @llvm.ppc.vsx.lxvd2x // CHECK-LE: @llvm.ppc.vsx.lxvd2x res_vull = vec_vsx_ld(0, &vull); // CHECK: @llvm.ppc.vsx.lxvd2x // CHECK-LE: @llvm.ppc.vsx.lxvd2x res_vd = vec_vsx_ld(0, &vd); // CHECK: @llvm.ppc.vsx.lxvd2x // CHECK-LE: @llvm.ppc.vsx.lxvd2x res_vss = vec_vsx_ld(0, &vss); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vss = vec_vsx_ld(0, &ss); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vus = vec_vsx_ld(0, &vus); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vus = vec_vsx_ld(0, &us); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vbc = vec_vsx_ld(0, &vbc); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vsc = vec_vsx_ld(0, &vsc); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vuc = vec_vsx_ld(0, &vuc); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vsc = vec_vsx_ld(0, &sc); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vuc = vec_vsx_ld(0, &uc); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x /* vec_vsx_st */ vec_vsx_st(vsi, 0, &res_vsi); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vsi, 0, &res_si); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vui, 0, &res_vui); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vui, 0, &res_ui); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vf, 0, &res_vf); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vsll, 0, &res_vsll); // CHECK: @llvm.ppc.vsx.stxvd2x // CHECK-LE: @llvm.ppc.vsx.stxvd2x vec_vsx_st(vull, 0, &res_vull); // CHECK: @llvm.ppc.vsx.stxvd2x // CHECK-LE: @llvm.ppc.vsx.stxvd2x vec_vsx_st(vd, 0, &res_vd); // CHECK: @llvm.ppc.vsx.stxvd2x // CHECK-LE: @llvm.ppc.vsx.stxvd2x vec_vsx_st(vss, 0, &res_vss); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vss, 0, &res_ss); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vus, 0, &res_vus); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vus, 0, &res_us); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vsc, 0, &res_vsc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vsc, 0, &res_sc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vuc, 0, &res_vuc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vuc, 0, &res_uc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vbc, 0, &res_vbc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vbc, 0, &res_sc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vbc, 0, &res_uc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x /* vec_and */ res_vsll = vec_and(vsll, vsll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_and(vbll, vsll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_and(vsll, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_and(vull, vull); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_and(vbll, vull); // CHECK: and 
<2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_and(vull, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vbll = vec_and(vbll, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> /* vec_vand */ res_vsll = vec_vand(vsll, vsll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_vand(vbll, vsll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_vand(vsll, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_vand(vull, vull); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_vand(vbll, vull); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_vand(vull, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vbll = vec_vand(vbll, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> /* vec_andc */ res_vsll = vec_andc(vsll, vsll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_andc(vbll, vsll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_andc(vsll, vbll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_andc(vull, vull); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_andc(vbll, vull); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_andc(vull, vbll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vbll = vec_andc(vbll, vbll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vf = vec_floor(vf); // CHECK: call <4 x float> @llvm.floor.v4f32(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.floor.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_floor(vd); // CHECK: call <2 x double> @llvm.floor.v2f64(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.floor.v2f64(<2 x double> %{{[0-9]+}}) res_vf = vec_madd(vf, vf, vf); // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}) res_vd = vec_madd(vd, vd, vd); // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}) /* vec_mergeh */ res_vsll = vec_mergeh(vsll, vsll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vsll = vec_mergeh(vsll, vbll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vsll = vec_mergeh(vbll, vsll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergeh(vull, vull); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergeh(vull, vbll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergeh(vbll, vull); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm /* vec_mergel */ res_vsll = vec_mergel(vsll, vsll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vsll = vec_mergel(vsll, vbll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vsll = 
vec_mergel(vbll, vsll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergel(vull, vull); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergel(vull, vbll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergel(vbll, vull); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm /* vec_msub */ res_vf = vec_msub(vf, vf, vf); // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> // CHECK-LE: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-LE-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> res_vd = vec_msub(vd, vd, vd); // CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> // CHECK-LE: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}} // CHECK-LE-NEXT: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> res_vsll = vec_mul(vsll, vsll); // CHECK: mul <2 x i64> // CHECK-LE: mul <2 x i64> res_vull = vec_mul(vull, vull); // CHECK: mul <2 x i64> // CHECK-LE: mul <2 x i64> res_vf = vec_mul(vf, vf); // CHECK: fmul <4 x float> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: fmul <4 x float> %{{[0-9]+}}, %{{[0-9]+}} res_vd = vec_mul(vd, vd); // CHECK: fmul <2 x double> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: fmul <2 x double> %{{[0-9]+}}, %{{[0-9]+}} res_vf = vec_nearbyint(vf); // CHECK: call <4 x float> @llvm.round.v4f32(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.round.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_nearbyint(vd); // CHECK: call <2 x double> @llvm.round.v2f64(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.round.v2f64(<2 x double> %{{[0-9]+}}) res_vf = vec_nmadd(vf, vf, vf); // CHECK: [[FM:[0-9]+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}) // CHECK-NEXT: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %[[FM]] // CHECK-LE: [[FM:[0-9]+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}) // CHECK-LE-NEXT: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %[[FM]] res_vd = vec_nmadd(vd, vd, vd); // CHECK: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}) // CHECK-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]] // CHECK-LE: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}) // CHECK-LE-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]] res_vf = vec_nmsub(vf, vf, vf); // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float 
-0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-LE: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-LE-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> // CHECK-LE: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} res_vd = vec_nmsub(vd, vd, vd); // CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> // CHECK-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]] // CHECK-LE: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}} // CHECK-LE-NEXT: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> // CHECK-LE-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]] /* vec_nor */ res_vsll = vec_nor(vsll, vsll); // CHECK: or <2 x i64> // CHECK: xor <2 x i64> // CHECK-LE: or <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_nor(vull, vull); // CHECK: or <2 x i64> // CHECK: xor <2 x i64> // CHECK-LE: or <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_nor(vbll, vbll); // CHECK: or <2 x i64> // CHECK: xor <2 x i64> // CHECK-LE: or <2 x i64> // CHECK-LE: xor <2 x i64> res_vd = vec_nor(vd, vd); // CHECK: bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK: [[OR:%.+]] = or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-NEXT: xor <2 x i64> [[OR]], <i64 -1, i64 -1> // CHECK-LE: bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK-LE: [[OR:%.+]] = or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE-NEXT: xor <2 x i64> [[OR]], <i64 -1, i64 -1> /* vec_or */ res_vsll = vec_or(vsll, vsll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vsll = vec_or(vbll, vsll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vsll = vec_or(vsll, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_or(vull, vull); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_or(vbll, vull); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_or(vull, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vbll = vec_or(vbll, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vd = vec_or(vd, vd); // CHECK: bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK: or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK-LE: or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} res_vd = vec_or(vbll, vd); // CHECK: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK: [[T2:%.+]] = or <2 x i64> %{{[0-9]+}}, [[T1]] // CHECK: bitcast <2 x i64> [[T2]] to <2 x double> // CHECK-LE: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK-LE: [[T2:%.+]] = or <2 x i64> %{{[0-9]+}}, [[T1]] // CHECK-LE: bitcast <2 x i64> [[T2]] to <2 x double> res_vd = vec_or(vd, vbll); // CHECK: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK: [[T2:%.+]] = or <2 x i64> [[T1]], %{{[0-9]+}} // CHECK: bitcast <2 x i64> [[T2]] to <2 x double> // CHECK-LE: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK-LE: [[T2:%.+]] = or <2 x i64> [[T1]], %{{[0-9]+}} // CHECK-LE: bitcast <2 x i64> [[T2]] to <2 x double> res_vf = vec_re(vf); // CHECK: call <4 x float> @llvm.ppc.vsx.xvresp(<4 x float> // CHECK-LE: call <4 x float> 
@llvm.ppc.vsx.xvresp(<4 x float> res_vd = vec_re(vd); // CHECK: call <2 x double> @llvm.ppc.vsx.xvredp(<2 x double> // CHECK-LE: call <2 x double> @llvm.ppc.vsx.xvredp(<2 x double> res_vf = vec_rint(vf); // CHECK: call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_rint(vd); // CHECK: call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %{{[0-9]+}}) res_vf = vec_rsqrte(vf); // CHECK: call <4 x float> @llvm.ppc.vsx.xvrsqrtesp(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.ppc.vsx.xvrsqrtesp(<4 x float> %{{[0-9]+}}) res_vd = vec_rsqrte(vd); // CHECK: call <2 x double> @llvm.ppc.vsx.xvrsqrtedp(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.ppc.vsx.xvrsqrtedp(<2 x double> %{{[0-9]+}}) dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vf = vec_sel(vd, vd, vbll); // CHECK: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> %{{[0-9]+}}, // CHECK: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: or <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]+}} to <2 x double> // CHECK-LE: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1> // CHECK-LE: and <2 x i64> %{{[0-9]+}}, // CHECK-LE: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: or <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]+}} to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_sel(vd, vd, vull); // CHECK: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> %{{[0-9]+}}, // CHECK: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: or <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]+}} to <2 x double> // CHECK-LE: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1> // CHECK-LE: and <2 x i64> %{{[0-9]+}}, // CHECK-LE: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: or <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]+}} to <2 x double> res_vf = vec_sqrt(vf); // CHECK: call <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_sqrt(vd); // CHECK: call <2 x double> @llvm.sqrt.v2f64(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.sqrt.v2f64(<2 x double> %{{[0-9]+}}) res_vd = vec_sub(vd, vd); // CHECK: fsub <2 x double> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: fsub <2 x double> %{{[0-9]+}}, %{{[0-9]+}} res_vf = vec_trunc(vf); // CHECK: call <4 x float> @llvm.trunc.v4f32(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.trunc.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_trunc(vd); // CHECK: call <2 x double> @llvm.trunc.v2f64(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.trunc.v2f64(<2 x double> %{{[0-9]+}}) /* vec_vor */ res_vsll = vec_vor(vsll, vsll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vsll = vec_vor(vbll, vsll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vsll = vec_vor(vsll, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_vor(vull, vull); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_vor(vbll, vull); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_vor(vull, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vbll = vec_vor(vbll, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> /* vec_xor */ res_vsll = vec_xor(vsll, vsll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vsll = vec_xor(vbll, vsll); // CHECK: xor <2 x i64> // 
CHECK-LE: xor <2 x i64> res_vsll = vec_xor(vsll, vbll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_xor(vull, vull); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_xor(vbll, vull); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_xor(vull, vbll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vbll = vec_xor(vbll, vbll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_xor(vd, vd); // CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: bitcast <2 x i64> [[X1]] to <2 x double> // CHECK-LE: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: bitcast <2 x i64> [[X1]] to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_xor(vd, vbll); // CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: bitcast <2 x i64> [[X1]] to <2 x double> // CHECK-LE: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: bitcast <2 x i64> [[X1]] to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_xor(vbll, vd); // CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: bitcast <2 x i64> [[X1]] to <2 x double> // CHECK-LE: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: bitcast <2 x i64> [[X1]] to <2 x double> /* vec_vxor */ res_vsll = vec_vxor(vsll, vsll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vsll = vec_vxor(vbll, vsll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vsll = vec_vxor(vsll, vbll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_vxor(vull, vull); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_vxor(vbll, vull); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_vxor(vull, vbll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vbll = vec_vxor(vbll, vbll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vsll = vec_cts(vd, 0); // CHECK: fmul <2 x double> // CHECK: fptosi <2 x double> %{{.*}} to <2 x i64> // CHECK-LE: fmul <2 x double> // CHECK-LE: fptosi <2 x double> %{{.*}} to <2 x i64> res_vsll = vec_cts(vd, 31); // CHECK: fmul <2 x double> // CHECK: fptosi <2 x double> %{{.*}} to <2 x i64> // CHECK-LE: fmul <2 x double> // CHECK-LE: fptosi <2 x double> %{{.*}} to <2 x i64> res_vsll = vec_ctu(vd, 0); // CHECK: fmul <2 x double> // CHECK: fptoui <2 x double> %{{.*}} to <2 x i64> // CHECK-LE: fmul <2 x double> // CHECK-LE: fptoui <2 x double> %{{.*}} to <2 x i64> res_vsll = vec_ctu(vd, 31); // CHECK: fmul <2 x double> // CHECK: fptoui <2 x double> %{{.*}} to <2 x i64> // CHECK-LE: fmul <2 x double> // CHECK-LE: fptoui <2 x double> %{{.*}} to <2 x i64> res_vd = vec_ctf(vsll, 0); // CHECK: sitofp <2 x i64> %{{.*}} to <2 x double> // CHECK: fmul <2 x double> // CHECK-LE: sitofp <2 x i64> %{{.*}} to <2 x double> // CHECK-LE: fmul <2 x double> res_vd = vec_ctf(vsll, 31); // CHECK: sitofp <2 x i64> %{{.*}} to <2 x double> // CHECK: fmul <2 x double> // CHECK-LE: sitofp <2 x i64> %{{.*}} to <2 x double> // CHECK-LE: fmul <2 x double> res_vd = vec_ctf(vull, 0); // CHECK: uitofp <2 x i64> %{{.*}} to <2 x double> // CHECK: fmul <2 x double> // CHECK-LE: uitofp <2 x i64> %{{.*}} to <2 x double> // CHECK-LE: fmul <2 x double> res_vd = vec_ctf(vull, 31); // CHECK: uitofp <2 x i64> %{{.*}} to <2 x double> // CHECK: fmul <2 x double> // CHECK-LE: uitofp <2 x i64> %{{.*}} to <2 x double> // CHECK-LE: fmul <2 x 
double> }
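The vec_sel CHECK lines in the test above match an xor/and/and/or sequence because vec_sel is a pure bitwise select: each result bit is taken from the first operand where the mask bit is 0 and from the second where it is 1. A minimal scalar model of that expansion (illustrative only; the names below are not from the test):

#include <stdint.h>
#include <stdio.h>

/* Scalar model of the expansion the CHECK lines verify:
 * result = (a & ~m) | (b & m). */
static uint64_t sel64(uint64_t a, uint64_t b, uint64_t m)
{
    return (a & ~m) | (b & m);
}

int main(void)
{
    uint64_t a = 0x1111111111111111ULL;
    uint64_t b = 0xFFFFFFFFFFFFFFFFULL;
    uint64_t m = 0x00000000FFFFFFFFULL;   /* low half from b, high half from a */
    printf("%016llx\n", (unsigned long long)sel64(a, b, m));
    return 0;
}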
static inline vec_uint4 vec_Ch(vec_uint4 x, vec_uint4 y, vec_uint4 z) { return vec_sel(z, y, x); }
static inline vec_uint4 vec_Maj(vec_uint4 x, vec_uint4 y, vec_uint4 z) { return vec_sel(x, y, vec_xor(x, z)); }
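The two helpers above rely on the standard SHA-2 identities Ch(x,y,z) = (x AND y) XOR (NOT x AND z) and Maj(x,y,z) = (x AND y) XOR (x AND z) XOR (y AND z); both reduce to a single bitwise select, which is exactly what vec_sel provides. A scalar check of those identities (sketch; the function names are illustrative, not from the source):

#include <stdint.h>
#include <assert.h>

/* Ch(x,y,z): take bits of y where x is 1 and bits of z where x is 0,
 * i.e. vec_sel(z, y, x). */
static uint32_t ch(uint32_t x, uint32_t y, uint32_t z)
{
    return (x & y) ^ (~x & z);
}

/* Maj(x,y,z): per-bit majority vote.  Selecting between x and y with the
 * mask (x ^ z) gives the same result, matching vec_sel(x, y, x ^ z). */
static uint32_t maj(uint32_t x, uint32_t y, uint32_t z)
{
    return (x & y) ^ (x & z) ^ (y & z);
}

int main(void)
{
    uint32_t x = 0xDEADBEEF, y = 0x01234567, z = 0x89ABCDEF;
    assert(ch(x, y, z)  == ((z & ~x) | (y & x)));              /* sel(z, y, x)     */
    assert(maj(x, y, z) == ((x & ~(x ^ z)) | (y & (x ^ z))));  /* sel(x, y, x ^ z) */
    return 0;
}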
void pix_compare :: processYUV_Altivec(imageStruct &image, imageStruct &right) { register int h,w,i,j,width; h = image.ysize; w = image.xsize/8; width = image.xsize/8; //check to see if the buffer isn't 16byte aligned (highly unlikely) if (image.ysize*image.xsize % 16 != 0){ error("image not properly aligned for Altivec"); return; } register vector unsigned short UVres1, Yres1, UVres2, Yres2;//interleave; register vector unsigned short hiImage, loImage; register vector bool short Ymask1; register vector unsigned char one = vec_splat_u8(1); vector unsigned char *inData = (vector unsigned char*) image.data; vector unsigned char *rightData = (vector unsigned char*) right.data; #ifndef PPC970 //setup the cache prefetch -- A MUST!!! UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 ); vec_dst( inData, prefetchSize, 0 ); vec_dst( rightData, prefetchSize, 1 ); #endif if (m_direction) { for ( i=0; i<h; i++){ for (j=0; j<w; j++) { #ifndef PPC970 //this function is probably memory bound on most G4's -- what else is new? vec_dst( inData, prefetchSize, 0 ); vec_dst( rightData, prefetchSize, 1 ); #endif //separate the U and V from Y UVres1 = (vector unsigned short)vec_mule(one,inData[0]); UVres2 = (vector unsigned short)vec_mule(one,rightData[0]); //vec_mulo Y * 1 to short vector Y Y Y Y shorts Yres1 = (vector unsigned short)vec_mulo(one,inData[0]); Yres2 = (vector unsigned short)vec_mulo(one,rightData[0]); //compare the Y values Ymask1 = vec_cmpgt(Yres1,Yres2); //bitwise comparison and move using the result of the comparison as a mask Yres1 = vec_sel(Yres2,Yres1,Ymask1); UVres1 = vec_sel(UVres2,UVres1,Ymask1); //merge the Y and UV back together hiImage = vec_mergeh(UVres1,Yres1); loImage = vec_mergel(UVres1,Yres1); //pack it back down to unsigned char to store inData[0] = vec_packsu(hiImage,loImage); inData++; rightData++; } #ifndef PPC970 vec_dss(1); vec_dss(0); #endif } }else{ for ( i=0; i<h; i++){ for (j=0; j<w; j++) { #ifndef PPC970 vec_dst( inData, prefetchSize, 0 ); vec_dst( rightData, prefetchSize, 1 ); #endif UVres1 = (vector unsigned short)vec_mule(one,inData[0]); UVres2 = (vector unsigned short)vec_mule(one,rightData[0]); //vec_mulo Y * 1 to short vector Y Y Y Y shorts Yres1 = (vector unsigned short)vec_mulo(one,inData[0]); Yres2 = (vector unsigned short)vec_mulo(one,rightData[0]); Ymask1 = vec_cmplt(Yres1,Yres2); Yres1 = vec_sel(Yres2,Yres1,Ymask1); UVres1 = vec_sel(UVres2,UVres1,Ymask1); hiImage = vec_mergeh(UVres1,Yres1); loImage = vec_mergel(UVres1,Yres1); inData[0] = vec_packsu(hiImage,loImage); inData++; rightData++; } #ifndef PPC970 vec_dss(1); vec_dss(0); #endif } } }
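Per pixel pair, the m_direction branch above keeps whichever image has the greater luma and carries that pixel's chroma along with it (the other branch does the same with a less-than compare). A scalar sketch of that per-pixel decision, assuming the de-interleaved Y and UV shorts produced by vec_mule/vec_mulo (names are illustrative):

#include <stdint.h>

/* For one Y/UV pair from each image, keep the pixel with the brighter luma,
 * mirroring the vec_cmpgt + vec_sel pair in the vector loop above. */
static void keep_brighter(uint16_t y_a, uint16_t uv_a,
                          uint16_t y_b, uint16_t uv_b,
                          uint16_t *y_out, uint16_t *uv_out)
{
    if (y_a > y_b) {        /* vec_cmpgt(Yres1, Yres2)        */
        *y_out  = y_a;      /* vec_sel(Yres2, Yres1, Ymask1)  */
        *uv_out = uv_a;     /* vec_sel(UVres2, UVres1, Ymask1) */
    } else {
        *y_out  = y_b;
        *uv_out = uv_b;
    }
}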
SIMD_INLINE void ConditionalFill(const uint8_t * src, size_t offset, const v128_u8 & threshold, const v128_u8 & value, uint8_t * dst) { v128_u8 _src = Load<align>(src + offset); v128_u8 _dst = Load<align>(dst + offset); Store<align>(dst + offset, vec_sel(_dst, value, Compare8u<compareType>(_src, threshold))); }
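ConditionalFill is the compare-then-select idiom in one statement: build a mask from the comparison, then overwrite only the lanes that pass. A scalar per-byte sketch, assuming a greater-than compare (the actual predicate is chosen by compareType):

#include <stdint.h>
#include <stddef.h>

/* For each byte: if src[i] compares greater than threshold, write value,
 * otherwise leave dst[i] untouched -- the scalar shape of
 * Store(dst, vec_sel(_dst, value, Compare8u(_src, threshold))). */
static void conditional_fill_scalar(const uint8_t *src, size_t n,
                                    uint8_t threshold, uint8_t value,
                                    uint8_t *dst)
{
    for (size_t i = 0; i < n; ++i)
        if (src[i] > threshold)
            dst[i] = value;
}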
void init_particles(particle* system){ /* * particles will be placed randomly 4 quadrants of 3d space * q1: x>0, y>0 * q2: x<0, y>0 * q3: x<0, y<0 * q4: x>0, y<0 * */ int i = 0; srand(time(NULL)); // q1 x>0, y>0 for(; i < NO_OF_PARTICLES/4; i++){ system[i].position = (__vector float){rand()%INIT_BOUNDING_BOX, rand()%INIT_BOUNDING_BOX, (rand()%INIT_BOUNDING_BOX*2)-INIT_BOUNDING_BOX, 0}; system[i].velocity = VECZERO; system[i].acceleration = VECZERO; } // q2 x<0, y>0 for(; i < NO_OF_PARTICLES/2; i++){ system[i].position = (__vector float){-rand()%INIT_BOUNDING_BOX, rand()%INIT_BOUNDING_BOX, (rand()%INIT_BOUNDING_BOX*2)-INIT_BOUNDING_BOX, 0}; system[i].velocity = VECZERO; system[i].acceleration = VECZERO; } // q3 x<0, y<0 for(; i < 3*NO_OF_PARTICLES/4; i++){ system[i].position = (__vector float){-rand()%INIT_BOUNDING_BOX, -rand()%INIT_BOUNDING_BOX, (rand()%INIT_BOUNDING_BOX*2)-INIT_BOUNDING_BOX, 0}; system[i].velocity = VECZERO; system[i].acceleration = VECZERO; } // q4 x>0, y<0 for(; i < NO_OF_PARTICLES; i++){ system[i].position = (__vector float){rand()%INIT_BOUNDING_BOX, -rand()%INIT_BOUNDING_BOX, (rand()%INIT_BOUNDING_BOX*2)-INIT_BOUNDING_BOX, 0}; system[i].velocity = VECZERO; system[i].acceleration = VECZERO; } } void compute_interaction(particle* i , particle* j){ __vector float radius, radius_sqr, s_vector, displ, accel, distSqr, distSixth, invDistCube; /*compute acceleration of particle i*/ radius = vec_sub(j->position,i->position); radius_sqr = vec_madd(radius,radius, VECZERO); distSqr = vec_add(vec_splat(radius_sqr,0),vec_splat(radius_sqr,1)); distSqr = vec_add(vec_splat(radius_sqr,2),distSqr); distSqr = vec_add(EPS2_VECTOR,distSqr); distSixth = vec_madd(distSqr,distSqr,VECZERO); distSixth = vec_madd(distSixth,distSqr,VEC3ZERO); invDistCube = vec_rsqrte(distSixth); s_vector = vec_madd(MASS_VECTOR,invDistCube,VECZERO); i->acceleration = vec_madd(radius,s_vector,i->acceleration); /*compute new position & velocity of particle i*/ displ = vec_madd(i->velocity,TIME_STEP_VECTOR,i->position); accel = vec_madd(VECHALF,i->acceleration, VECZERO); i->position = vec_madd(accel,TIME_SQUARED, displ); i->velocity = vec_madd(i->acceleration,TIME_STEP_VECTOR, i->velocity); } void update_particles(particle* system){ int i, j; //Thread 1 ? for(i = 0;i<NO_OF_PARTICLES/4;i++){ for(j = 0;j<NO_OF_PARTICLES;j++){ compute_interaction(&system[i],&system[j]); } } //Thread 2 ? for(i = NO_OF_PARTICLES/4;i<NO_OF_PARTICLES/2;i++){ for(j = 0;j<NO_OF_PARTICLES;j++){ compute_interaction(&system[i],&system[j]); } } //Thread 3 ? for(i = NO_OF_PARTICLES/2;i<3*NO_OF_PARTICLES/4;i++){ for(j = 0;j<NO_OF_PARTICLES;j++){ compute_interaction(&system[i],&system[j]); } } //Thread 4 ? 
for(i = 3*NO_OF_PARTICLES/4;i<NO_OF_PARTICLES;i++){ for(j = 0;j<NO_OF_PARTICLES;j++){ compute_interaction(&system[i],&system[j]); } } } __vector int get_quadrant_count(particle* system){ __vector int quad_count = VECINTZERO; __vector int quad_mask = VECINTZERO; int i; for(i = 0; i < NO_OF_PARTICLES ; i++){ __vector int top2 = (__vector int){1,1,0,0}; __vector int bottom2 = (__vector int){0,0,1,1}; __vector int left2 = (__vector int){0,1,0,1}; __vector int right2 = (__vector int){1,0,1,0}; __vector int mask1, mask2; __vector float vx = vec_splat(system[i].position,0); __vector float vy = vec_splat(system[i].position,1); mask1 = vec_sel(right2, left2, vec_cmpgt(vx,VECZERO)); mask2 = vec_sel(top2, bottom2, vec_cmpgt(vy,VECZERO)); quad_mask = vec_and(mask1,mask2); quad_count = vec_add(quad_count,quad_mask); } return quad_count; } void render(particle* system){ int i = 0; for(; i < NO_OF_PARTICLES; i++){ float *pos = (float*) &system[i].position; float *vel = (float*) &system[i].velocity; float *acc = (float*) &system[i].acceleration; printf("position : %f %f %f ", pos[0], pos[1], pos[2]); printf("velocity : %f %f %f ", vel[0], vel[1], vel[2]); printf("acceleration : %f %f %f \n", acc[0], acc[1], acc[2]); } } int main () { /* create particle system --> array of particles */ particle particle_system[NO_OF_PARTICLES] __attribute__((aligned(64))); /* vector that tracks the no. of particles in each quadrant */ __vector int quad_count; int * qc; /* place particles in 4 quadrants */ init_particles(particle_system); /* run simulation */ float simulationTime = 0.0; int iterations = COMPUTE_ITERATIONS; printf("----------------------------------------------"); printf("----------------------------------------------\n"); printf("Running Simulation with %d particles & %d iterations with %f seconds time steps\n", NO_OF_PARTICLES, COMPUTE_ITERATIONS, TIME_STEP); printf("----------------------------------------------"); printf("----------------------------------------------\n"); while(iterations > 0){ /* Compute */ update_particles(particle_system); /* Display */ //render(particle_system); /* Update Time */ simulationTime = simulationTime + TIME_STEP; printf("----------------------------------"); printf("Simulation Time: %f |",simulationTime); quad_count = get_quadrant_count(particle_system); qc = (int*)&quad_count; printf(" q1:%d q2:%d q3:%d q4:%d",qc[0], qc[1], qc[2], qc[3]); printf("----------------------------------\n"); iterations --; } return 0; }
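get_quadrant_count above builds a one-element-hot quadrant mask per particle with two selects (the sign of x picks one mask, the sign of y the other), ANDs them, and accumulates. A scalar equivalent that simply counts by sign; it does not try to reproduce the exact lane ordering of the vector constants above (sketch, hypothetical names):

/* Count how many particles fall in each XY quadrant by the signs of x and y.
 * Scalar counterpart of the vec_cmpgt/vec_sel mask trick; the mapping of
 * quadrants to struct fields here is illustrative. */
struct quad_counts { int q1, q2, q3, q4; };

static struct quad_counts count_quadrants(const float (*pos)[4], int n)
{
    struct quad_counts c = {0, 0, 0, 0};
    for (int i = 0; i < n; ++i) {
        float x = pos[i][0], y = pos[i][1];
        if (x > 0 && y > 0)       c.q1++;
        else if (x <= 0 && y > 0) c.q2++;
        else if (x <= 0)          c.q3++;
        else                      c.q4++;
    }
    return c;
}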
/* this code assume stride % 16 == 0 *and* tmp is properly aligned */ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) { POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1); POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); register int i; const vector signed int vzero = vec_splat_s32(0); const vector unsigned char permM2 = vec_lvsl(-2, src); const vector unsigned char permM1 = vec_lvsl(-1, src); const vector unsigned char permP0 = vec_lvsl(+0, src); const vector unsigned char permP1 = vec_lvsl(+1, src); const vector unsigned char permP2 = vec_lvsl(+2, src); const vector unsigned char permP3 = vec_lvsl(+3, src); const vector signed short v20ss = (const vector signed short)AVV(20); const vector unsigned int v10ui = vec_splat_u32(10); const vector signed short v5ss = vec_splat_s16(5); const vector signed short v1ss = vec_splat_s16(1); const vector signed int v512si = (const vector signed int)AVV(512); const vector unsigned int v16ui = (const vector unsigned int)AVV(16); register int align = ((((unsigned long)src) - 2) % 16); src -= (2 * srcStride); for (i = 0 ; i < 21 ; i ++) { vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; vector unsigned char srcR1 = vec_ld(-2, src); vector unsigned char srcR2 = vec_ld(14, src); switch (align) { default: { srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = vec_perm(srcR1, srcR2, permM1); srcP0 = vec_perm(srcR1, srcR2, permP0); srcP1 = vec_perm(srcR1, srcR2, permP1); srcP2 = vec_perm(srcR1, srcR2, permP2); srcP3 = vec_perm(srcR1, srcR2, permP3); } break; case 11: { srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = vec_perm(srcR1, srcR2, permM1); srcP0 = vec_perm(srcR1, srcR2, permP0); srcP1 = vec_perm(srcR1, srcR2, permP1); srcP2 = vec_perm(srcR1, srcR2, permP2); srcP3 = srcR2; } break; case 12: { vector unsigned char srcR3 = vec_ld(30, src); srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = vec_perm(srcR1, srcR2, permM1); srcP0 = vec_perm(srcR1, srcR2, permP0); srcP1 = vec_perm(srcR1, srcR2, permP1); srcP2 = srcR2; srcP3 = vec_perm(srcR2, srcR3, permP3); } break; case 13: { vector unsigned char srcR3 = vec_ld(30, src); srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = vec_perm(srcR1, srcR2, permM1); srcP0 = vec_perm(srcR1, srcR2, permP0); srcP1 = srcR2; srcP2 = vec_perm(srcR2, srcR3, permP2); srcP3 = vec_perm(srcR2, srcR3, permP3); } break; case 14: { vector unsigned char srcR3 = vec_ld(30, src); srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = vec_perm(srcR1, srcR2, permM1); srcP0 = srcR2; srcP1 = vec_perm(srcR2, srcR3, permP1); srcP2 = vec_perm(srcR2, srcR3, permP2); srcP3 = vec_perm(srcR2, srcR3, permP3); } break; case 15: { vector unsigned char srcR3 = vec_ld(30, src); srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = srcR2; srcP0 = vec_perm(srcR2, srcR3, permP0); srcP1 = vec_perm(srcR2, srcR3, permP1); srcP2 = vec_perm(srcR2, srcR3, permP2); srcP3 = vec_perm(srcR2, srcR3, permP3); } break; } const vector signed short srcP0A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0); const vector signed short srcP0B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0); const vector signed short srcP1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1); const vector signed short srcP1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1); const vector signed short srcP2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2); const vector signed short srcP2B = 
(vector signed short)vec_mergel((vector unsigned char)vzero, srcP2); const vector signed short srcP3A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3); const vector signed short srcP3B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3); const vector signed short srcM1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1); const vector signed short srcM1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1); const vector signed short srcM2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2); const vector signed short srcM2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2); const vector signed short sum1A = vec_adds(srcP0A, srcP1A); const vector signed short sum1B = vec_adds(srcP0B, srcP1B); const vector signed short sum2A = vec_adds(srcM1A, srcP2A); const vector signed short sum2B = vec_adds(srcM1B, srcP2B); const vector signed short sum3A = vec_adds(srcM2A, srcP3A); const vector signed short sum3B = vec_adds(srcM2B, srcP3B); const vector signed short pp1A = vec_mladd(sum1A, v20ss, sum3A); const vector signed short pp1B = vec_mladd(sum1B, v20ss, sum3B); const vector signed short pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero); const vector signed short pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero); const vector signed short psumA = vec_sub(pp1A, pp2A); const vector signed short psumB = vec_sub(pp1B, pp2B); vec_st(psumA, 0, tmp); vec_st(psumB, 16, tmp); src += srcStride; tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */ } const vector unsigned char dstperm = vec_lvsr(0, dst); const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1); const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm); const vector unsigned char mperm = (const vector unsigned char) AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B, 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F); int16_t *tmpbis = tmp - (tmpStride * 21); vector signed short tmpM2ssA = vec_ld(0, tmpbis); vector signed short tmpM2ssB = vec_ld(16, tmpbis); tmpbis += tmpStride; vector signed short tmpM1ssA = vec_ld(0, tmpbis); vector signed short tmpM1ssB = vec_ld(16, tmpbis); tmpbis += tmpStride; vector signed short tmpP0ssA = vec_ld(0, tmpbis); vector signed short tmpP0ssB = vec_ld(16, tmpbis); tmpbis += tmpStride; vector signed short tmpP1ssA = vec_ld(0, tmpbis); vector signed short tmpP1ssB = vec_ld(16, tmpbis); tmpbis += tmpStride; vector signed short tmpP2ssA = vec_ld(0, tmpbis); vector signed short tmpP2ssB = vec_ld(16, tmpbis); tmpbis += tmpStride; for (i = 0 ; i < 16 ; i++) { const vector signed short tmpP3ssA = vec_ld(0, tmpbis); const vector signed short tmpP3ssB = vec_ld(16, tmpbis); tmpbis += tmpStride; const vector signed short sum1A = vec_adds(tmpP0ssA, tmpP1ssA); const vector signed short sum1B = vec_adds(tmpP0ssB, tmpP1ssB); const vector signed short sum2A = vec_adds(tmpM1ssA, tmpP2ssA); const vector signed short sum2B = vec_adds(tmpM1ssB, tmpP2ssB); const vector signed short sum3A = vec_adds(tmpM2ssA, tmpP3ssA); const vector signed short sum3B = vec_adds(tmpM2ssB, tmpP3ssB); tmpM2ssA = tmpM1ssA; tmpM2ssB = tmpM1ssB; tmpM1ssA = tmpP0ssA; tmpM1ssB = tmpP0ssB; tmpP0ssA = tmpP1ssA; tmpP0ssB = tmpP1ssB; tmpP1ssA = tmpP2ssA; tmpP1ssB = tmpP2ssB; tmpP2ssA = tmpP3ssA; tmpP2ssB = tmpP3ssB; const vector signed int pp1Ae = vec_mule(sum1A, v20ss); const vector signed int pp1Ao = vec_mulo(sum1A, v20ss); const vector signed int pp1Be = 
vec_mule(sum1B, v20ss); const vector signed int pp1Bo = vec_mulo(sum1B, v20ss); const vector signed int pp2Ae = vec_mule(sum2A, v5ss); const vector signed int pp2Ao = vec_mulo(sum2A, v5ss); const vector signed int pp2Be = vec_mule(sum2B, v5ss); const vector signed int pp2Bo = vec_mulo(sum2B, v5ss); const vector signed int pp3Ae = vec_sra((vector signed int)sum3A, v16ui); const vector signed int pp3Ao = vec_mulo(sum3A, v1ss); const vector signed int pp3Be = vec_sra((vector signed int)sum3B, v16ui); const vector signed int pp3Bo = vec_mulo(sum3B, v1ss); const vector signed int pp1cAe = vec_add(pp1Ae, v512si); const vector signed int pp1cAo = vec_add(pp1Ao, v512si); const vector signed int pp1cBe = vec_add(pp1Be, v512si); const vector signed int pp1cBo = vec_add(pp1Bo, v512si); const vector signed int pp32Ae = vec_sub(pp3Ae, pp2Ae); const vector signed int pp32Ao = vec_sub(pp3Ao, pp2Ao); const vector signed int pp32Be = vec_sub(pp3Be, pp2Be); const vector signed int pp32Bo = vec_sub(pp3Bo, pp2Bo); const vector signed int sumAe = vec_add(pp1cAe, pp32Ae); const vector signed int sumAo = vec_add(pp1cAo, pp32Ao); const vector signed int sumBe = vec_add(pp1cBe, pp32Be); const vector signed int sumBo = vec_add(pp1cBo, pp32Bo); const vector signed int ssumAe = vec_sra(sumAe, v10ui); const vector signed int ssumAo = vec_sra(sumAo, v10ui); const vector signed int ssumBe = vec_sra(sumBe, v10ui); const vector signed int ssumBo = vec_sra(sumBo, v10ui); const vector signed short ssume = vec_packs(ssumAe, ssumBe); const vector signed short ssumo = vec_packs(ssumAo, ssumBo); const vector unsigned char sumv = vec_packsu(ssume, ssumo); const vector unsigned char sum = vec_perm(sumv, sumv, mperm); const vector unsigned char dst1 = vec_ld(0, dst); const vector unsigned char dst2 = vec_ld(16, dst); const vector unsigned char vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst)); vector unsigned char fsum; OP_U8_ALTIVEC(fsum, sum, vdst); const vector unsigned char rsum = vec_perm(fsum, fsum, dstperm); const vector unsigned char fdst1 = vec_sel(dst1, rsum, dstmask); const vector unsigned char fdst2 = vec_sel(rsum, dst2, dstmask); vec_st(fdst1, 0, dst); vec_st(fdst2, 16, dst); dst += dstStride; } POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); }
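The final stores above use the familiar select-based read-modify-write for an unaligned destination: load the two aligned blocks that straddle dst, keep the original bytes outside the 16-byte result range via vec_sel with dstmask, and store both blocks back. A scalar model of that step (sketch; like the vector code, it assumes the whole 32-byte window may be read and written):

#include <stdint.h>
#include <string.h>

/* Write 16 result bytes to an arbitrarily aligned dst by rewriting the two
 * aligned 16-byte blocks that cover it -- the role of dstmask, the two
 * vec_sel calls and the two vec_st calls at the end of the loop above. */
static void store16_unaligned(uint8_t *dst, const uint8_t result[16])
{
    uint8_t *base = (uint8_t *)((uintptr_t)dst & ~(uintptr_t)15);
    size_t   off  = (size_t)(dst - base);
    uint8_t  block[32];
    memcpy(block, base, 32);          /* dst1 / dst2                       */
    memcpy(block + off, result, 16);  /* bytes the two vec_sel calls keep  */
    memcpy(base, block, 32);          /* vec_st(fdst1) / vec_st(fdst2)     */
}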
/* AltiVec version of dct_unquantize_h263 this code assumes `block' is 16 bytes-aligned */ static void dct_unquantize_h263_altivec(MpegEncContext *s, DCTELEM *block, int n, int qscale) { int i, level, qmul, qadd; int nCoeffs; assert(s->block_last_index[n]>=0); qadd = (qscale - 1) | 1; qmul = qscale << 1; if (s->mb_intra) { if (!s->h263_aic) { if (n < 4) block[0] = block[0] * s->y_dc_scale; else block[0] = block[0] * s->c_dc_scale; }else qadd = 0; i = 1; nCoeffs= 63; //does not always use zigzag table } else { i = 0; nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; } { register const vector signed short vczero = (const vector signed short)vec_splat_s16(0); DECLARE_ALIGNED(16, short, qmul8) = qmul; DECLARE_ALIGNED(16, short, qadd8) = qadd; register vector signed short blockv, qmulv, qaddv, nqaddv, temp1; register vector bool short blockv_null, blockv_neg; register short backup_0 = block[0]; register int j = 0; qmulv = vec_splat((vec_s16)vec_lde(0, &qmul8), 0); qaddv = vec_splat((vec_s16)vec_lde(0, &qadd8), 0); nqaddv = vec_sub(vczero, qaddv); #if 0 // block *is* 16 bytes-aligned, it seems. // first make sure block[j] is 16 bytes-aligned for(j = 0; (j <= nCoeffs) && ((((unsigned long)block) + (j << 1)) & 0x0000000F) ; j++) { level = block[j]; if (level) { if (level < 0) { level = level * qmul - qadd; } else { level = level * qmul + qadd; } block[j] = level; } } #endif // vectorize all the 16 bytes-aligned blocks // of 8 elements for(; (j + 7) <= nCoeffs ; j+=8) { blockv = vec_ld(j << 1, block); blockv_neg = vec_cmplt(blockv, vczero); blockv_null = vec_cmpeq(blockv, vczero); // choose between +qadd or -qadd as the third operand temp1 = vec_sel(qaddv, nqaddv, blockv_neg); // multiply & add (block{i,i+7} * qmul [+-] qadd) temp1 = vec_mladd(blockv, qmulv, temp1); // put 0 where block[{i,i+7} used to have 0 blockv = vec_sel(temp1, blockv, blockv_null); vec_st(blockv, j << 1, block); } // if nCoeffs isn't a multiple of 8, finish the job // using good old scalar units. // (we could do it using a truncated vector, // but I'm not sure it's worth the hassle) for(; j <= nCoeffs ; j++) { level = block[j]; if (level) { if (level < 0) { level = level * qmul - qadd; } else { level = level * qmul + qadd; } block[j] = level; } } if (i == 1) { // cheat. this avoid special-casing the first iteration block[0] = backup_0; } } }
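The vector loop above picks +qadd or -qadd per coefficient with vec_sel on the sign mask, multiplies by qmul with vec_mladd, and then restores zeros with a second select; the scalar tail loop in the same function spells the rule out. As a compact reference, the per-coefficient rule is (sketch, not the ffmpeg code itself):

#include <stdint.h>

/* Per-coefficient H.263 dequantisation rule that the vector loop implements
 * with vec_cmplt/vec_cmpeq masks and two vec_sel calls. */
static int16_t dequant_coeff(int16_t level, int qmul, int qadd)
{
    if (level == 0)
        return 0;                               /* vec_sel(temp1, blockv, blockv_null) */
    if (level < 0)
        return (int16_t)(level * qmul - qadd);  /* blockv_neg lane: -qadd selected */
    return (int16_t)(level * qmul + qadd);      /* otherwise: +qadd selected       */
}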
tmp0 = vec_sl(index0, (vector unsigned short)((((vector signed short){1,1,1,1,1,1,1,1})) )); PerIndex = (vector unsigned char)vec_packs(tmp0, vec_add(tmp0, (((vector signed short){1,1,1,1,1,1,1,1})) )); PerIndex = vec_perm(PerIndex, PerIndex, perm1); tmp0 = vec_perm(table[0], table[1], PerIndex); stmp0 = vec_perm(slope_cos[0], slope_cos[1], PerIndex); tmpIndex = vec_sub(PerIndex, (((vector unsigned char){32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32})) ); tmp1 = vec_perm(table[2], table[3], tmpIndex); stmp1 = vec_perm(slope_cos[2], slope_cos[3], tmpIndex); select = (vector unsigned short)vec_cmpgt(PerIndex, (((vector unsigned char){31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31})) ); tmp2 = vec_sel(tmp0, tmp1, select); stmp2 = vec_sel(stmp0, stmp1, select); tmpIndex = vec_sub(tmpIndex, (((vector unsigned char){32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32})) ); tmp0 = vec_perm(table[4], table[5], tmpIndex); stmp0 = vec_perm(slope_cos[4], slope_cos[5], tmpIndex); tmpIndex = vec_sub(tmpIndex, (((vector unsigned char){32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32})) ); tmp1 = vec_perm(table[6], table[7], tmpIndex); stmp1 = vec_perm(slope_cos[6], slope_cos[7], tmpIndex); select = (vector unsigned short)vec_cmpgt(PerIndex, (((vector unsigned char){95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95})) ); tmp3 = vec_sel(tmp0, tmp1, select); stmp3 = vec_sel(stmp0, stmp1, select); select = (vector unsigned short)vec_cmpgt(PerIndex, (((vector unsigned char){63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63})) );
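This fragment emulates a wide table lookup: vec_perm can only index 32 bytes at a time, so the code performs the lookup in each 32-byte window of the table (and of slope_cos) and then uses vec_cmpgt on the index plus vec_sel to keep the candidate from the window the index actually falls in. A scalar sketch of that selection logic, with hypothetical names (the real tables and indices come from surrounding code not shown here):

#include <stdint.h>

/* Look up one entry in a 128-entry table the way the vector code above does:
 * fetch a candidate from every 32-entry window, then keep the candidate whose
 * window contains the index (the role of vec_cmpgt + vec_sel). */
static uint8_t wide_lookup(const uint8_t table[128], uint8_t idx)
{
    uint8_t c0 = table[       idx & 31];   /* window  0..31   */
    uint8_t c1 = table[32  + (idx & 31)];  /* window 32..63   */
    uint8_t c2 = table[64  + (idx & 31)];  /* window 64..95   */
    uint8_t c3 = table[96  + (idx & 31)];  /* window 96..127  */

    uint8_t r01 = (idx > 31) ? c1 : c0;    /* select = cmpgt(idx, 31) */
    uint8_t r23 = (idx > 95) ? c3 : c2;    /* select = cmpgt(idx, 95) */
    return (idx > 63) ? r23 : r01;         /* select = cmpgt(idx, 63) */
}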
static void vector_fmul_add_add_altivec(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step) { int i; vector float d, s0, s1, s2, t0, t1, edges; vector unsigned char align = vec_lvsr(0,dst), mask = vec_lvsl(0, dst); #if 0 //FIXME: there is still something wrong if (step == 2) { int y; vector float d0, d1, s3, t2; vector unsigned int sel = vec_mergeh(vec_splat_u32(-1), vec_splat_u32(0)); t1 = vec_ld(16, dst); for (i=0,y=0; i<len-3; i+=4,y+=8) { s0 = vec_ld(0,src0+i); s1 = vec_ld(0,src1+i); s2 = vec_ld(0,src2+i); // t0 = vec_ld(0, dst+y); //[x x x|a] // t1 = vec_ld(16, dst+y); //[b c d|e] t2 = vec_ld(31, dst+y); //[f g h|x] d = vec_madd(s0,s1,s2); // [A B C D] // [A A B B] // [C C D D] d0 = vec_perm(t0, t1, mask); // [a b c d] d0 = vec_sel(vec_mergeh(d, d), d0, sel); // [A b B d] edges = vec_perm(t1, t0, mask); t0 = vec_perm(edges, d0, align); // [x x x|A] t1 = vec_perm(d0, edges, align); // [b B d|e] vec_stl(t0, 0, dst+y); d1 = vec_perm(t1, t2, mask); // [e f g h] d1 = vec_sel(vec_mergel(d, d), d1, sel); // [C f D h] edges = vec_perm(t2, t1, mask); t1 = vec_perm(edges, d1, align); // [b B d|C] t2 = vec_perm(d1, edges, align); // [f D h|x] vec_stl(t1, 16, dst+y); t0 = t1; vec_stl(t2, 31, dst+y); t1 = t2; } } else #endif if (step == 1 && src3 == 0) for (i=0; i<len-3; i+=4) { t0 = vec_ld(0, dst+i); t1 = vec_ld(15, dst+i); s0 = vec_ld(0, src0+i); s1 = vec_ld(0, src1+i); s2 = vec_ld(0, src2+i); edges = vec_perm(t1 ,t0, mask); d = vec_madd(s0,s1,s2); t1 = vec_perm(d, edges, align); t0 = vec_perm(edges, d, align); vec_st(t1, 15, dst+i); vec_st(t0, 0, dst+i); } else ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step); }
static void ProjectDlightTexture_altivec( void ) { int i, l; vec_t origin0, origin1, origin2; float texCoords0, texCoords1; vector float floatColorVec0, floatColorVec1; vector float modulateVec, colorVec, zero; vector short colorShort; vector signed int colorInt; vector unsigned char floatColorVecPerm, modulatePerm, colorChar; vector unsigned char vSel = VECCONST_UINT8(0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff); float *texCoords; byte *colors; int *intColors; byte clipBits[SHADER_MAX_VERTEXES]; float texCoordsArray[SHADER_MAX_VERTEXES][2]; byte colorArray[SHADER_MAX_VERTEXES][4]; glIndex_t hitIndexes[SHADER_MAX_INDEXES]; int numIndexes; float scale; float radius; float radiusInverseCubed; float intensity, remainder; vec3_t floatColor; float modulate = 0.0f; qboolean vertexLight; if ( !backEnd.refdef.num_dlights ) { return; } // There has to be a better way to do this so that floatColor // and/or modulate are already 16-byte aligned. floatColorVecPerm = vec_lvsl(0,(float *)floatColor); modulatePerm = vec_lvsl(0,(float *)&modulate); modulatePerm = (vector unsigned char)vec_splat((vector unsigned int)modulatePerm,0); zero = (vector float)vec_splat_s8(0); for ( l = 0 ; l < backEnd.refdef.num_dlights ; l++ ) { dlight_t *dl; if ( !( tess.dlightBits & ( 1 << l ) ) ) { continue; // this surface definately doesn't have any of this light } // clear colors Com_Memset( colorArray, 0, sizeof( colorArray ) ); texCoords = texCoordsArray[0]; colors = colorArray[0]; dl = &backEnd.refdef.dlights[l]; origin0 = dl->transformed[0]; origin1 = dl->transformed[1]; origin2 = dl->transformed[2]; radius = dl->radius; scale = 1.0f / radius; radiusInverseCubed = dl->radiusInverseCubed; intensity = dl->intensity; vertexLight = ( ( dl->flags & REF_DIRECTED_DLIGHT ) || ( dl->flags & REF_VERTEX_DLIGHT ) ); // directional lights have max intensity and washout remainder intensity if ( dl->flags & REF_DIRECTED_DLIGHT ) { remainder = intensity * 0.125; } else { remainder = 0.0f; } if(r_greyscale->integer) { float luminance; luminance = LUMA(dl->color[0], dl->color[1], dl->color[2]) * 255.0f; floatColor[0] = floatColor[1] = floatColor[2] = luminance; } else if(r_greyscale->value) { float luminance; luminance = LUMA(dl->color[0], dl->color[1], dl->color[2]) * 255.0f; floatColor[0] = LERP(dl->color[0] * 255.0f, luminance, r_greyscale->value); floatColor[1] = LERP(dl->color[1] * 255.0f, luminance, r_greyscale->value); floatColor[2] = LERP(dl->color[2] * 255.0f, luminance, r_greyscale->value); } else { floatColor[0] = dl->color[0] * 255.0f; floatColor[1] = dl->color[1] * 255.0f; floatColor[2] = dl->color[2] * 255.0f; } floatColorVec0 = vec_ld(0, floatColor); floatColorVec1 = vec_ld(11, floatColor); floatColorVec0 = vec_perm(floatColorVec0,floatColorVec0,floatColorVecPerm); for ( i = 0 ; i < tess.numVertexes ; i++, texCoords += 2, colors += 4 ) { int clip = 0; vec_t dist0, dist1, dist2; dist0 = origin0 - tess.xyz[i][0]; dist1 = origin1 - tess.xyz[i][1]; dist2 = origin2 - tess.xyz[i][2]; backEnd.pc.c_dlightVertexes++; // directional dlight, origin is a directional normal if ( dl->flags & REF_DIRECTED_DLIGHT ) { // twosided surfaces use absolute value of the calculated lighting modulate = intensity * DotProduct( dl->origin, tess.normal[ i ] ); if ( tess.shader->cullType == CT_TWO_SIDED ) { modulate = fabs( modulate ); } modulate += remainder; } // spherical vertex lit dlight else if ( dl->flags & REF_VERTEX_DLIGHT ) { vec3_t dir; dir[ 0 ] = radius - fabs( dist0 ); if ( dir[ 0 ] <= 
0.0f ) { continue; } dir[ 1 ] = radius - fabs( dist1 ); if ( dir[ 1 ] <= 0.0f ) { continue; } dir[ 2 ] = radius - fabs( dist2 ); if ( dir[ 2 ] <= 0.0f ) { continue; } modulate = intensity * dir[ 0 ] * dir[ 1 ] * dir[ 2 ] * radiusInverseCubed; } // vertical cylinder dlight else { texCoords0 = 0.5f + dist0 * scale; texCoords1 = 0.5f + dist1 * scale; if( !r_dlightBacks->integer && // dist . tess.normal[i] ( dist0 * tess.normal[i][0] + dist1 * tess.normal[i][1] + dist2 * tess.normal[i][2] ) < 0.0f ) { clip = 63; } else { if ( texCoords0 < 0.0f ) { clip |= 1; } else if ( texCoords0 > 1.0f ) { clip |= 2; } if ( texCoords1 < 0.0f ) { clip |= 4; } else if ( texCoords1 > 1.0f ) { clip |= 8; } texCoords[0] = texCoords0; texCoords[1] = texCoords1; // modulate the strength based on the height and color if ( dist2 > radius ) { clip |= 16; modulate = 0.0f; } else if ( dist2 < -radius ) { clip |= 32; modulate = 0.0f; } else { dist2 = Q_fabs(dist2); if ( dist2 < radius * 0.5f ) { modulate = intensity; } else { modulate = intensity * 2.0f * (radius - dist2) * scale; } } } } clipBits[i] = clip; // optimizations if ( vertexLight && modulate < ( 1.0f / 128.0f ) ) { continue; } else if ( modulate > 1.0f ) { modulate = 1.0f; } // ZTM: FIXME: should probably clamp to 0-255 range before converting to char, // but I don't know how to do altvec stuff or if it's even used anymore modulateVec = vec_ld(0,(float *)&modulate); modulateVec = vec_perm(modulateVec,modulateVec,modulatePerm); colorVec = vec_madd(floatColorVec0,modulateVec,zero); colorInt = vec_cts(colorVec,0); // RGBx colorShort = vec_pack(colorInt,colorInt); // RGBxRGBx colorChar = vec_packsu(colorShort,colorShort); // RGBxRGBxRGBxRGBx colorChar = vec_sel(colorChar,vSel,vSel); // RGBARGBARGBARGBA replace alpha with 255 vec_ste((vector unsigned int)colorChar,0,(unsigned int *)colors); // store color } // build a list of triangles that need light intColors = (int*) colorArray; numIndexes = 0; for ( i = 0 ; i < tess.numIndexes ; i += 3 ) { int a, b, c; a = tess.indexes[i]; b = tess.indexes[i+1]; c = tess.indexes[i+2]; if ( vertexLight ) { if ( !( intColors[ a ] | intColors[ b ] | intColors[ c ] ) ) { continue; } } else { if ( clipBits[a] & clipBits[b] & clipBits[c] ) { continue; // not lighted } } hitIndexes[numIndexes] = a; hitIndexes[numIndexes+1] = b; hitIndexes[numIndexes+2] = c; numIndexes += 3; } if ( !numIndexes ) { continue; } if ( !vertexLight ) { qglEnableClientState( GL_TEXTURE_COORD_ARRAY ); qglTexCoordPointer( 2, GL_FLOAT, 0, texCoordsArray[0] ); } else { qglDisableClientState( GL_TEXTURE_COORD_ARRAY ); } qglEnableClientState( GL_COLOR_ARRAY ); qglColorPointer( 4, GL_UNSIGNED_BYTE, 0, colorArray ); if ( dl->dlshader ) { shader_t *dls = dl->dlshader; for ( i = 0; i < dls->numUnfoggedPasses; i++ ) { shaderStage_t *stage = dls->stages[i]; R_BindAnimatedImage( &dls->stages[i]->bundle[0] ); GL_State( stage->stateBits | GLS_DEPTHFUNC_EQUAL ); R_DrawElements( numIndexes, hitIndexes ); backEnd.pc.c_totalIndexes += numIndexes; backEnd.pc.c_dlightIndexes += numIndexes; } } else { R_FogOff(); if ( !vertexLight ) { GL_Bind( tr.dlightImage ); } else { GL_Bind( tr.whiteImage ); } // include GLS_DEPTHFUNC_EQUAL so alpha tested surfaces don't add light // where they aren't rendered if ( dl->flags & REF_ADDITIVE_DLIGHT ) { GL_State( GLS_SRCBLEND_ONE | GLS_DSTBLEND_ONE | GLS_DEPTHFUNC_EQUAL ); } else { GL_State( GLS_SRCBLEND_DST_COLOR | GLS_DSTBLEND_ONE | GLS_DEPTHFUNC_EQUAL ); } R_DrawElements( numIndexes, hitIndexes ); backEnd.pc.c_totalIndexes += numIndexes; 
backEnd.pc.c_dlightIndexes += numIndexes; RB_FogOn(); } } }
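In the colour path above, the modulated RGB floats are packed down to bytes and vec_sel with the constant vSel mask then forces every fourth byte (alpha) to 0xFF before the store. The scalar shape of that last step (sketch; assumes the RGBA byte order described in the code's own comments):

#include <stdint.h>

/* Force the alpha byte of one packed RGBA pixel to 255, leaving RGB alone --
 * what vec_sel(colorChar, vSel, vSel) does four pixels at a time above. */
static void set_alpha_opaque(uint8_t rgba[4])
{
    rgba[3] = 0xFF;   /* vSel has 0xff only in the fourth byte of each pixel */
}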
int main () { vector float fa = {1.0, 2.0, 3.0, -4.0}; vector float fb = {-2.0, -3.0, -4.0, -5.0}; vector float fc = vec_cpsgn (fa, fb); vector long long la = {5L, 14L}; vector long long lb = {3L, 86L}; vector long long lc = vec_and (la, lb); vector bool long long ld = {0, -1}; vector long long le = vec_and (la, ld); vector long long lf = vec_and (ld, lb); vector unsigned long long ua = {5L, 14L}; vector unsigned long long ub = {3L, 86L}; vector unsigned long long uc = vec_and (ua, ub); vector bool long long ud = {0, -1}; vector unsigned long long ue = vec_and (ua, ud); vector unsigned long long uf = vec_and (ud, ub); vector long long lg = vec_andc (la, lb); vector long long lh = vec_andc (la, ld); vector long long li = vec_andc (ld, lb); vector unsigned long long ug = vec_andc (ua, ub); vector unsigned long long uh = vec_andc (ua, ud); vector unsigned long long ui = vec_andc (ud, ub); vector double da = {1.0, -4.0}; vector double db = {-2.0, 5.0}; vector double dc = vec_cpsgn (da, db); vector long long lj = vec_mergeh (la, lb); vector long long lk = vec_mergeh (la, ld); vector long long ll = vec_mergeh (ld, la); vector unsigned long long uj = vec_mergeh (ua, ub); vector unsigned long long uk = vec_mergeh (ua, ud); vector unsigned long long ul = vec_mergeh (ud, ua); vector long long lm = vec_mergel (la, lb); vector long long ln = vec_mergel (la, ld); vector long long lo = vec_mergel (ld, la); vector unsigned long long um = vec_mergel (ua, ub); vector unsigned long long un = vec_mergel (ua, ud); vector unsigned long long uo = vec_mergel (ud, ua); vector long long lp = vec_nor (la, lb); vector long long lq = vec_nor (la, ld); vector long long lr = vec_nor (ld, la); vector unsigned long long up = vec_nor (ua, ub); vector unsigned long long uq = vec_nor (ua, ud); vector unsigned long long ur = vec_nor (ud, ua); vector long long ls = vec_or (la, lb); vector long long lt = vec_or (la, ld); vector long long lu = vec_or (ld, la); vector unsigned long long us = vec_or (ua, ub); vector unsigned long long ut = vec_or (ua, ud); vector unsigned long long uu = vec_or (ud, ua); vector unsigned char ca = {0,4,8,1,5,9,2,6,10,3,7,11,15,12,14,13}; vector long long lv = vec_perm (la, lb, ca); vector unsigned long long uv = vec_perm (ua, ub, ca); vector long long lw = vec_sel (la, lb, lc); vector long long lx = vec_sel (la, lb, uc); vector long long ly = vec_sel (la, lb, ld); vector unsigned long long uw = vec_sel (ua, ub, lc); vector unsigned long long ux = vec_sel (ua, ub, uc); vector unsigned long long uy = vec_sel (ua, ub, ld); vector long long lz = vec_xor (la, lb); vector long long l0 = vec_xor (la, ld); vector long long l1 = vec_xor (ld, la); vector unsigned long long uz = vec_xor (ua, ub); vector unsigned long long u0 = vec_xor (ua, ud); vector unsigned long long u1 = vec_xor (ud, ua); int ia = vec_all_eq (ua, ub); int ib = vec_all_ge (ua, ub); int ic = vec_all_gt (ua, ub); int id = vec_all_le (ua, ub); int ie = vec_all_lt (ua, ub); int ig = vec_all_ne (ua, ub); int ih = vec_any_eq (ua, ub); int ii = vec_any_ge (ua, ub); int ij = vec_any_gt (ua, ub); int ik = vec_any_le (ua, ub); int il = vec_any_lt (ua, ub); int im = vec_any_ne (ua, ub); vector int sia = {9, 16, 25, 36}; vector int sib = {-8, -27, -64, -125}; vector int sic = vec_mergee (sia, sib); vector int sid = vec_mergeo (sia, sib); vector unsigned int uia = {9, 16, 25, 36}; vector unsigned int uib = {8, 27, 64, 125}; vector unsigned int uic = vec_mergee (uia, uib); vector unsigned int uid = vec_mergeo (uia, uib); vector bool int bia = {0, -1, 
-1, 0}; vector bool int bib = {-1, -1, 0, -1}; vector bool int bic = vec_mergee (bia, bib); vector bool int bid = vec_mergeo (bia, bib); vector unsigned int uie = vec_packsu (ua, ub); vector long long l2 = vec_cntlz (la); vector unsigned long long u2 = vec_cntlz (ua); vector int sie = vec_cntlz (sia); vector unsigned int uif = vec_cntlz (uia); vector short ssa = {20, -40, -60, 80, 100, -120, -140, 160}; vector short ssb = vec_cntlz (ssa); vector unsigned short usa = {81, 72, 63, 54, 45, 36, 27, 18}; vector unsigned short usb = vec_cntlz (usa); vector signed char sca = {-4, 3, -9, 15, -31, 31, 0, 0, 1, 117, -36, 99, 98, 97, 96, 95}; vector signed char scb = vec_cntlz (sca); vector unsigned char cb = vec_cntlz (ca); vector double dd = vec_xl (0, &y); vec_xst (dd, 0, &z); vector double de = vec_round (dd); vector double df = vec_splat (de, 0); vector double dg = vec_splat (de, 1); vector long long l3 = vec_splat (l2, 0); vector long long l4 = vec_splat (l2, 1); vector unsigned long long u3 = vec_splat (u2, 0); vector unsigned long long u4 = vec_splat (u2, 1); vector bool long long l5 = vec_splat (ld, 0); vector bool long long l6 = vec_splat (ld, 1); vector long long l7 = vec_div (l3, l4); vector unsigned long long u5 = vec_div (u3, u4); vector long long l8 = vec_mul (l3, l4); vector unsigned long long u6 = vec_mul (u3, u4); vector double dh = vec_ctf (la, -2); vector double di = vec_ctf (ua, 2); vector long long l9 = vec_cts (dh, -2); vector unsigned long long u7 = vec_ctu (di, 2); return 0; }
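In the block above vec_sel is called with signed (lc), unsigned (uc) and bool (ld) long long selectors; all three are treated identically, as a raw bit mask. A small scalar illustration using the test's own initial values la = {5,14}, lb = {3,86}, ld = {0,-1} (the loop and variable names are mine):

#include <stdint.h>
#include <stdio.h>

/* With a selector of {0, ~0}, element 0 comes from the first operand and
 * element 1 from the second, regardless of the selector's element type. */
int main(void)
{
    uint64_t a[2]   = {5, 14};
    uint64_t b[2]   = {3, 86};
    uint64_t sel[2] = {0, ~0ULL};   /* like ld = {0, -1} above */
    uint64_t r[2];
    for (int i = 0; i < 2; ++i)
        r[i] = (a[i] & ~sel[i]) | (b[i] & sel[i]);
    printf("%llu %llu\n", (unsigned long long)r[0], (unsigned long long)r[1]); /* 5 86 */
    return 0;
}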
void test1() { // CHECK-LABEL: define void @test1 res_vd = vec_add(vd, vd); // CHECK: fadd <2 x double> res_vd = vec_and(vbll, vd); // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> res_vd = vec_and(vd, vbll); // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> res_vd = vec_and(vd, vd); // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() res_vd = vec_andc(vbll, vd); // CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() res_vd = vec_andc(vd, vbll); // CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() res_vd = vec_andc(vd, vd); // CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() res_vd = vec_ceil(vd); // CHECK: call <2 x double> @llvm.ceil.v2f64(<2 x double> %{{[0-9]*}}) res_vf = vec_ceil(vf); // CHECK: call <4 x float> @llvm.ceil.v4f32(<4 x float> %{{[0-9]*}}) res_vbll = vec_cmpeq(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpeqdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmpeq(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpeqsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) res_vbll = vec_cmpge(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmpge(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) res_vbll = vec_cmpgt(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmpgt(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) res_vbll = vec_cmple(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmple(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) res_vbll = vec_cmplt(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmplt(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) /* vec_div */ res_vf = vec_div(vf, vf); // CHECK: @llvm.ppc.vsx.xvdivsp res_vd = vec_div(vd, vd); // CHECK: @llvm.ppc.vsx.xvdivdp /* vec_max */ res_vf = vec_max(vf, vf); // CHECK: @llvm.ppc.vsx.xvmaxsp res_vd = vec_max(vd, vd); // CHECK: @llvm.ppc.vsx.xvmaxdp res_vf = vec_vmaxfp(vf, vf); // CHECK: @llvm.ppc.vsx.xvmaxsp /* vec_min */ res_vf = vec_min(vf, vf); // CHECK: @llvm.ppc.vsx.xvminsp res_vd = vec_min(vd, vd); // CHECK: @llvm.ppc.vsx.xvmindp res_vf = vec_vminfp(vf, vf); // CHECK: @llvm.ppc.vsx.xvminsp res_d = __builtin_vsx_xsmaxdp(d, d); // CHECK: @llvm.ppc.vsx.xsmaxdp res_d = __builtin_vsx_xsmindp(d, d); // CHECK: @llvm.ppc.vsx.xsmindp /* vec_perm */ res_vsll = vec_perm(vsll, vsll, vuc); // CHECK: @llvm.ppc.altivec.vperm res_vull = vec_perm(vull, vull, vuc); // CHECK: @llvm.ppc.altivec.vperm res_vd = vec_perm(vd, vd, vuc); 
// CHECK: @llvm.ppc.altivec.vperm res_vsll = vec_vperm(vsll, vsll, vuc); // CHECK: @llvm.ppc.altivec.vperm res_vull = vec_vperm(vull, vull, vuc); // CHECK: @llvm.ppc.altivec.vperm res_vd = vec_vperm(vd, vd, vuc); // CHECK: @llvm.ppc.altivec.vperm /* vec_vsx_ld */ res_vsi = vec_vsx_ld(0, &vsi); // CHECK: @llvm.ppc.vsx.lxvw4x res_vui = vec_vsx_ld(0, &vui); // CHECK: @llvm.ppc.vsx.lxvw4x res_vf = vec_vsx_ld (0, &vf); // CHECK: @llvm.ppc.vsx.lxvw4x res_vsll = vec_vsx_ld(0, &vsll); // CHECK: @llvm.ppc.vsx.lxvd2x res_vull = vec_vsx_ld(0, &vull); // CHECK: @llvm.ppc.vsx.lxvd2x res_vd = vec_vsx_ld(0, &vd); // CHECK: @llvm.ppc.vsx.lxvd2x /* vec_vsx_st */ vec_vsx_st(vsi, 0, &res_vsi); // CHECK: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vui, 0, &res_vui); // CHECK: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vf, 0, &res_vf); // CHECK: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vsll, 0, &res_vsll); // CHECK: @llvm.ppc.vsx.stxvd2x vec_vsx_st(vull, 0, &res_vull); // CHECK: @llvm.ppc.vsx.stxvd2x vec_vsx_st(vd, 0, &res_vd); // CHECK: @llvm.ppc.vsx.stxvd2x /* vec_and */ res_vsll = vec_and(vsll, vsll); // CHECK: and <2 x i64> res_vsll = vec_and(vbll, vsll); // CHECK: and <2 x i64> res_vsll = vec_and(vsll, vbll); // CHECK: and <2 x i64> res_vull = vec_and(vull, vull); // CHECK: and <2 x i64> res_vull = vec_and(vbll, vull); // CHECK: and <2 x i64> res_vull = vec_and(vull, vbll); // CHECK: and <2 x i64> res_vbll = vec_and(vbll, vbll); // CHECK: and <2 x i64> /* vec_vand */ res_vsll = vec_vand(vsll, vsll); // CHECK: and <2 x i64> res_vsll = vec_vand(vbll, vsll); // CHECK: and <2 x i64> res_vsll = vec_vand(vsll, vbll); // CHECK: and <2 x i64> res_vull = vec_vand(vull, vull); // CHECK: and <2 x i64> res_vull = vec_vand(vbll, vull); // CHECK: and <2 x i64> res_vull = vec_vand(vull, vbll); // CHECK: and <2 x i64> res_vbll = vec_vand(vbll, vbll); // CHECK: and <2 x i64> /* vec_andc */ res_vsll = vec_andc(vsll, vsll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> res_vsll = vec_andc(vbll, vsll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> res_vsll = vec_andc(vsll, vbll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> res_vull = vec_andc(vull, vull); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> res_vull = vec_andc(vbll, vull); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> res_vull = vec_andc(vull, vbll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> res_vbll = vec_andc(vbll, vbll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> res_vf = vec_floor(vf); // CHECK: call <4 x float> @llvm.floor.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_floor(vd); // CHECK: call <2 x double> @llvm.floor.v2f64(<2 x double> %{{[0-9]+}}) res_vf = vec_madd(vf, vf, vf); // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}) res_vd = vec_madd(vd, vd, vd); // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}) res_vf = vec_msub(vf, vf, vf); // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> res_vd = vec_msub(vd, vd, vd); // CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> res_vf = vec_mul(vf, vf); // CHECK: fmul <4 x float> %{{[0-9]+}}, %{{[0-9]+}} res_vd = vec_mul(vd, vd); // CHECK: fmul <2 x double> 
%{{[0-9]+}}, %{{[0-9]+}} res_vf = vec_nearbyint(vf); // CHECK: call <4 x float> @llvm.round.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_nearbyint(vd); // CHECK: call <2 x double> @llvm.round.v2f64(<2 x double> %{{[0-9]+}}) res_vf = vec_nmadd(vf, vf, vf); // CHECK: [[FM:[0-9]+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}) // CHECK-NEXT: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %[[FM]] res_vd = vec_nmadd(vd, vd, vd); // CHECK: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}) // CHECK-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]] res_vf = vec_nmsub(vf, vf, vf); // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} res_vd = vec_nmsub(vd, vd, vd); // CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> // CHECK-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]] /* vec_nor */ res_vsll = vec_nor(vsll, vsll); // CHECK: or <2 x i64> // CHECK: xor <2 x i64> res_vull = vec_nor(vull, vull); // CHECK: or <2 x i64> // CHECK: xor <2 x i64> res_vull = vec_nor(vbll, vbll); // CHECK: or <2 x i64> // CHECK: xor <2 x i64> res_vd = vec_nor(vd, vd); // CHECK: bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK: [[OR:%.+]] = or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-NEXT: xor <2 x i64> [[OR]], <i64 -1, i64 -1> /* vec_or */ res_vsll = vec_or(vsll, vsll); // CHECK: or <2 x i64> res_vsll = vec_or(vbll, vsll); // CHECK: or <2 x i64> res_vsll = vec_or(vsll, vbll); // CHECK: or <2 x i64> res_vull = vec_or(vull, vull); // CHECK: or <2 x i64> res_vull = vec_or(vbll, vull); // CHECK: or <2 x i64> res_vull = vec_or(vull, vbll); // CHECK: or <2 x i64> res_vbll = vec_or(vbll, vbll); // CHECK: or <2 x i64> res_vd = vec_or(vd, vd); // CHECK: bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK: or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} res_vf = vec_rint(vf); // CHECK: call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_rint(vd); // CHECK: call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %{{[0-9]+}}) res_vf = vec_rsqrte(vf); // CHECK: call <4 x float> @llvm.ppc.vsx.xvrsqrtesp(<4 x float> %{{[0-9]+}}) res_vd = vec_rsqrte(vd); // CHECK: call <2 x double> @llvm.ppc.vsx.xvrsqrtedp(<2 x double> %{{[0-9]+}}) dummy(); // CHECK: call void @dummy() res_vf = vec_sel(vd, vd, vbll); // CHECK: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> %{{[0-9]+}}, // CHECK: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: or <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]+}} to <2 x double> dummy(); // CHECK: call void @dummy() res_vd = vec_sel(vd, vd, vull); // CHECK: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> %{{[0-9]+}}, // CHECK: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: or <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]+}} to <2 x double> res_vf = vec_sqrt(vf); // CHECK: call <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{[0-9]+}}) res_vd = 
vec_sqrt(vd); // CHECK: call <2 x double> @llvm.sqrt.v2f64(<2 x double> %{{[0-9]+}}) res_vd = vec_sub(vd, vd); // CHECK: fsub <2 x double> %{{[0-9]+}}, %{{[0-9]+}} res_vf = vec_trunc(vf); // CHECK: call <4 x float> @llvm.trunc.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_trunc(vd); // CHECK: call <2 x double> @llvm.trunc.v2f64(<2 x double> %{{[0-9]+}}) /* vec_vor */ res_vsll = vec_vor(vsll, vsll); // CHECK: or <2 x i64> res_vsll = vec_vor(vbll, vsll); // CHECK: or <2 x i64> res_vsll = vec_vor(vsll, vbll); // CHECK: or <2 x i64> res_vull = vec_vor(vull, vull); // CHECK: or <2 x i64> res_vull = vec_vor(vbll, vull); // CHECK: or <2 x i64> res_vull = vec_vor(vull, vbll); // CHECK: or <2 x i64> res_vbll = vec_vor(vbll, vbll); // CHECK: or <2 x i64> /* vec_xor */ res_vsll = vec_xor(vsll, vsll); // CHECK: xor <2 x i64> res_vsll = vec_xor(vbll, vsll); // CHECK: xor <2 x i64> res_vsll = vec_xor(vsll, vbll); // CHECK: xor <2 x i64> res_vull = vec_xor(vull, vull); // CHECK: xor <2 x i64> res_vull = vec_xor(vbll, vull); // CHECK: xor <2 x i64> res_vull = vec_xor(vull, vbll); // CHECK: xor <2 x i64> res_vbll = vec_xor(vbll, vbll); // CHECK: xor <2 x i64> dummy(); // CHECK: call void @dummy() res_vd = vec_xor(vd, vd); // CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: bitcast <2 x i64> [[X1]] to <2 x double> dummy(); // CHECK: call void @dummy() res_vd = vec_xor(vd, vbll); // CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: bitcast <2 x i64> [[X1]] to <2 x double> dummy(); // CHECK: call void @dummy() res_vd = vec_xor(vbll, vd); // CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: bitcast <2 x i64> [[X1]] to <2 x double> /* vec_vxor */ res_vsll = vec_vxor(vsll, vsll); // CHECK: xor <2 x i64> res_vsll = vec_vxor(vbll, vsll); // CHECK: xor <2 x i64> res_vsll = vec_vxor(vsll, vbll); // CHECK: xor <2 x i64> res_vull = vec_vxor(vull, vull); // CHECK: xor <2 x i64> res_vull = vec_vxor(vbll, vull); // CHECK: xor <2 x i64> res_vull = vec_vxor(vull, vbll); // CHECK: xor <2 x i64> res_vbll = vec_vxor(vbll, vbll); // CHECK: xor <2 x i64> }
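The CHECK lines above expect vec_sel on vector double to lower to an xor/and/and/or sequence over the bitcast 64-bit lanes. The following is a minimal standalone sketch (not part of the test file) that demonstrates that bit-select semantics directly; variable names and the sample values are illustrative only, and it assumes a GCC/Clang VSX target (compile with -maltivec -mvsx).

#include <altivec.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    vector double a = {1.0, 2.0};
    vector double b = {100.0, 200.0};
    /* zero mask -> keep the lane from a; all-ones mask -> take the lane from b */
    vector unsigned long long mask = {0ULL, ~0ULL};

    vector double sel = vec_sel(a, b, mask);

    /* hand-rolled equivalent of the lowering the CHECK lines describe:
       bitcast, (a & ~mask) | (b & mask), bitcast back */
    vector unsigned long long ai = (vector unsigned long long)a;
    vector unsigned long long bi = (vector unsigned long long)b;
    vector double ref =
        (vector double)vec_or(vec_andc(ai, mask), vec_and(bi, mask));

    double s[2], r[2];
    memcpy(s, &sel, sizeof s);
    memcpy(r, &ref, sizeof r);
    printf("vec_sel: %f %f\n", s[0], s[1]);   /* expect 1.0 200.0 */
    printf("manual : %f %f\n", r[0], r[1]);   /* expect the same result */
    return 0;
}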
void pix_background :: processYUVAltivec(imageStruct &image) { register int h,w,i,j,width; int pixsize = image.xsize * image.ysize * image.csize; h = image.ysize; w = image.xsize/8; width = image.xsize/8; //check to see if the buffer isn't 16byte aligned (highly unlikely) if (image.ysize*image.xsize % 16 != 0){ error("image not properly aligned for Altivec - try something SD or HD maybe?"); return; } union{ unsigned short s[8]; vector unsigned short v; }shortBuffer; if(m_savedImage.xsize!=image.xsize || m_savedImage.ysize!=image.ysize || m_savedImage.format!=image.format)m_reset=1; m_savedImage.xsize=image.xsize; m_savedImage.ysize=image.ysize; m_savedImage.setCsizeByFormat(image.format); m_savedImage.reallocate(); if (m_reset){ memcpy(m_savedImage.data,image.data,pixsize); m_reset = 0; } register vector unsigned short UVres1, Yres1, UVres2, Yres2;//interleave; register vector unsigned short hiImage, loImage; register vector unsigned short Yrange, UVrange, Yblank,UVblank,blank; register vector bool short Ymasklo,Ymaskhi, UVmaskhi; register vector unsigned short Yhi,Ylo,UVhi,UVlo; register vector unsigned char one = vec_splat_u8(1); register vector unsigned short sone = vec_splat_u16(1); register vector unsigned int Uhi, Ulo, Vhi, Vlo,Ures,Vres; register vector bool int Umasklo, Umaskhi, Vmaskhi, Vmasklo; vector unsigned char *inData = (vector unsigned char*) image.data; vector unsigned char *rightData = (vector unsigned char*) m_savedImage.data; shortBuffer.s[0] = m_Yrange; Yrange = shortBuffer.v; Yrange = vec_splat(Yrange,0); shortBuffer.s[0] = 128; shortBuffer.s[1] = 0; shortBuffer.s[2] = 128; shortBuffer.s[3] = 0; shortBuffer.s[4] = 128; shortBuffer.s[5] = 0; shortBuffer.s[6] = 128; shortBuffer.s[7] = 0; blank = shortBuffer.v; shortBuffer.s[0] = 0; Yblank = shortBuffer.v; Yblank = vec_splat(Yblank,0); shortBuffer.s[0] = 128; UVblank = shortBuffer.v; UVblank = vec_splat(UVblank,0); shortBuffer.s[0] = m_Urange; shortBuffer.s[1] = m_Vrange; shortBuffer.s[2] = m_Urange; shortBuffer.s[3] = m_Vrange; shortBuffer.s[4] = m_Urange; shortBuffer.s[5] = m_Vrange; shortBuffer.s[6] = m_Urange; shortBuffer.s[7] = m_Vrange; UVrange = shortBuffer.v; //setup the cache prefetch -- A MUST!!! UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 ); #ifndef PPC970 vec_dst( inData, prefetchSize, 0 ); vec_dst( rightData, prefetchSize, 1 ); vec_dst( inData+32, prefetchSize, 2 ); vec_dst( rightData+32, prefetchSize, 3 ); #endif //PPC970 for ( i=0; i<h; i++){ for (j=0; j<w; j++) { #ifndef PPC970 //this function is probably memory bound on most G4's -- what else is new? 
vec_dst( inData, prefetchSize, 0 ); vec_dst( rightData, prefetchSize, 1 ); vec_dst( inData+32, prefetchSize, 2 ); vec_dst( rightData+32, prefetchSize, 3 ); #endif //separate the U and V from Y UVres1 = (vector unsigned short)vec_mule(one,inData[0]); UVres2 = (vector unsigned short)vec_mule(one,rightData[0]); //vec_mulo Y * 1 to short vector Y Y Y Y shorts Yres1 = (vector unsigned short)vec_mulo(one,inData[0]); Yres2 = (vector unsigned short)vec_mulo(one,rightData[0]); Yhi = vec_adds(Yres2,Yrange); Ylo = vec_subs(Yres2,Yrange); //go to ints for comparison UVhi = vec_adds(UVres2,UVrange); UVlo = vec_subs(UVres2,UVrange); Uhi = vec_mule(sone,UVhi); Ulo = vec_mule(sone,UVlo); Vhi = vec_mulo(sone,UVhi); Vlo = vec_mulo(sone,UVlo); Ures = vec_mule(sone,UVres1); Vres = vec_mulo(sone,UVres1); Umasklo = vec_cmpgt(Ures,Ulo); Umaskhi = vec_cmplt(Ures,Uhi); Vmasklo = vec_cmpgt(Vres,Vlo); Vmaskhi = vec_cmplt(Vres,Vhi); Umaskhi = vec_and(Umaskhi,Umasklo); Vmaskhi = vec_and(Vmaskhi,Vmasklo); Umasklo = vec_and(Umaskhi,Vmaskhi); Vmasklo = vec_and(Umaskhi,Vmaskhi); hiImage = (vector unsigned short)vec_mergeh(Umasklo,Vmasklo); loImage = (vector unsigned short)vec_mergel(Umasklo,Vmasklo); //pack it back down to bool short UVmaskhi = (vector bool short)vec_packsu(hiImage,loImage); Ymasklo = vec_cmpgt(Yres1,Ylo); Ymaskhi = vec_cmplt(Yres1,Yhi); Ymaskhi = vec_and(Ymaskhi,Ymasklo); Ymaskhi = vec_and(Ymaskhi,UVmaskhi); UVmaskhi = vec_and(Ymaskhi,UVmaskhi); //bitwise comparison and move using the result of the comparison as a mask Yres1 = vec_sel(Yres1,Yblank,Ymaskhi); //UVres1 = vec_sel(UVres1,UVres2,UVmaskhi); UVres1 = vec_sel(UVres1,UVblank,UVmaskhi); //merge the Y and UV back together hiImage = vec_mergeh(UVres1,Yres1); loImage = vec_mergel(UVres1,Yres1); //pack it back down to unsigned char to store inData[0] = vec_packsu(hiImage,loImage); inData++; rightData++; } #ifndef PPC970 vec_dss(0); vec_dss(1); vec_dss(2); vec_dss(3); #endif } }
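The background-keying loop above builds "within range" masks with vec_cmpgt/vec_cmplt, combines them with vec_and, and then uses vec_sel to blank out pixels that match the saved frame. Below is a much reduced, self-contained sketch of that compare-then-select idiom on one vector of unsigned shorts; the reference values and the ±8 range are made up for illustration (compile with -maltivec).

#include <altivec.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    vector unsigned short cur   = {10, 120, 130, 200, 90, 140, 5, 250};
    vector unsigned short ref   = {12, 128, 128, 128, 128, 128, 128, 128};
    vector unsigned short range = vec_splat_u16(8);   /* +/- 8 around ref */
    vector unsigned short blank = vec_splat_u16(0);

    /* saturating bounds around the reference frame */
    vector unsigned short hi = vec_adds(ref, range);
    vector unsigned short lo = vec_subs(ref, range);

    /* "background" where lo < cur and cur < hi */
    vector bool short within = vec_and(vec_cmpgt(cur, lo), vec_cmplt(cur, hi));

    /* blank out the in-range elements, keep everything else unchanged */
    vector unsigned short out = vec_sel(cur, blank, within);

    unsigned short o[8];
    memcpy(o, &out, sizeof o);
    for (int i = 0; i < 8; i++)
        printf("%d ", o[i]);
    printf("\n");
    return 0;
}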
void iquant_intra_m1_altivec(IQUANT_INTRA_PDECL) { int i; vector signed short vsrc; uint16_t *qmat; vector unsigned short vqmat; vector unsigned short vmquant; vector bool short eqzero, ltzero; vector signed short val, t0; vector signed short zero, one; vector unsigned int four; vector signed short min, max; int offset, offset2; int16_t dst0; union { vector unsigned short vu16; unsigned short mquant; vector signed int vs32; struct { signed int pad[3]; signed int sum; } s; } vu; #ifdef ALTIVEC_DST DataStreamControl dsc; #endif #ifdef ALTIVEC_VERIFY /* {{{ */ if (NOT_VECTOR_ALIGNED(wsp->intra_q_mat)) mjpeg_error_exit1("iquant_intra_m1: wsp->intra_q_mat %% 16 != 0, (%d)", wsp->intra_q_mat); if (NOT_VECTOR_ALIGNED(src)) mjpeg_error_exit1("iquant_intra_m1: src %% 16 != 0, (%d)", src); if (NOT_VECTOR_ALIGNED(dst)) mjpeg_error_exit1("iquant_intra_m1: dst %% 16 != 0, (%d)", dst); for (i = 0; i < 64; i++) if (src[i] < -256 || src[i] > 255) mjpeg_error_exit1("iquant_intra_m2: -256 > src[%i] > 255, (%d)", i, src[i]); #endif /* }}} */ AMBER_START; dst0 = src[0] << (3 - dc_prec); qmat = (uint16_t*)wsp->intra_q_mat; #ifdef ALTIVEC_DST dsc.control = DATA_STREAM_CONTROL(64/8,1,0); vec_dst(src, dsc.control, 0); vec_dst(qmat, dsc.control, 1); #endif /* vmquant = (vector unsigned short)(mquant); */ vu.mquant = (unsigned short)mquant; vmquant = vec_splat(vu.vu16, 0); zero = vec_splat_s16(0); one = vec_splat_s16(1); four = vec_splat_u32(4); /* max = (2047); min = (-2048); {{{ */ vu8(max) = vec_splat_u8(0x7); t0 = vec_splat_s16(-1); /* 0xffff */ vu8(max) = vec_mergeh(vu8(max), vu8(t0)); /* 0x07ff == 2047 */ min = vec_sub(t0, max); /* }}} */ offset = 0; #if 1 vsrc = vec_ld(offset, (signed short*)src); vqmat = vec_ld(offset, (unsigned short*)qmat); i = (64/8) - 1; do { /* intra_q[i] * mquant */ vu16(vqmat) = vec_mulo(vu8(vqmat), vu8(vmquant)); /* save sign */ ltzero = vec_cmplt(vsrc, zero); eqzero = vec_cmpeq(vsrc, zero); /* val = abs(src) */ t0 = vec_sub(zero, vsrc); val = vec_max(t0, vsrc); /* val = (src * quant) >> 4 */ vs32(t0) = vec_mule(val, vs16(vqmat)); vs32(val) = vec_mulo(val, vs16(vqmat)); vs32(t0) = vec_sra(vs32(t0), four); vs16(t0) = vec_pack(vs32(t0), vs32(t0)); vs32(val) = vec_sra(vs32(val), four); vs16(val) = vec_pack(vs32(val), vs32(val)); val = vec_mergeh(vs16(t0), vs16(val)); offset2 = offset; offset += 8*sizeof(int16_t); vsrc = vec_ld(offset, (signed short*)src); vqmat = vec_ld(offset, (unsigned short*)qmat); /* val = val - 1&~(val|val==0) */ t0 = vec_or(val, eqzero); t0 = vec_andc(one, t0); val = vec_sub(val, t0); /* restore sign */ t0 = vec_sub(zero, val); val = vec_sel(val, t0, ltzero); /* val = (val > 2047) ? ((val < -2048) ? -2048 : val); */ val = vec_min(val, max); val = vec_max(val, min); vec_st(val, offset2, dst); } while (--i); /* intra_q[i] * mquant */ vu16(vqmat) = vec_mulo(vu8(vqmat), vu8(vmquant)); /* save sign */ ltzero = vec_cmplt(vsrc, zero); eqzero = vec_cmpeq(vsrc, zero); /* val = abs(src) */ t0 = vec_sub(zero, vsrc); val = vec_max(t0, vsrc); /* val = (src * quant) >> 4 */ vs32(t0) = vec_mule(val, vs16(vqmat)); vs32(val) = vec_mulo(val, vs16(vqmat)); vs32(t0) = vec_sra(vs32(t0), four); vs16(t0) = vec_pack(vs32(t0), vs32(t0)); vs32(val) = vec_sra(vs32(val), four); vs16(val) = vec_pack(vs32(val), vs32(val)); val = vec_mergeh(vs16(t0), vs16(val)); /* val = val - 1&~(val|val==0) */ t0 = vec_or(val, eqzero); t0 = vec_andc(one, t0); val = vec_sub(val, t0); /* restore sign */ t0 = vec_sub(zero, val); val = vec_sel(val, t0, ltzero); /* val = (val > 2047) ? ((val < -2048) ? 
-2048 : val); */ val = vec_min(val, max); val = vec_max(val, min); vec_st(val, offset, dst); #else /* {{{ */ i = (64/8); do { vsrc = vec_ld(offset, (signed short*)src); vqmat = vec_ld(offset, (unsigned short*)qmat); /* intra_q[i] * mquant */ vu16(vqmat) = vec_mulo(vu8(vqmat), vu8(vmquant)); /* save sign */ ltzero = vec_cmplt(vsrc, zero); eqzero = vec_cmpeq(vsrc, zero); /* val = abs(src) */ t0 = vec_sub(zero, vsrc); val = vec_max(t0, vsrc); /* val = (src * quant) >> 4 */ vs32(t0) = vec_mule(val, vs16(vqmat)); vs32(val) = vec_mulo(val, vs16(vqmat)); vs32(t0) = vec_sra(vs32(t0), four); vs16(t0) = vec_pack(vs32(t0), vs32(t0)); vs32(val) = vec_sra(vs32(val), four); vs16(val) = vec_pack(vs32(val), vs32(val)); val = vec_mergeh(vs16(t0), vs16(val)); /* val = val - 1&~(val|val==0) */ t0 = vec_or(val, eqzero); t0 = vec_andc(one, t0); val = vec_sub(val, t0); /* restore sign */ t0 = vec_sub(zero, val); val = vec_sel(val, t0, ltzero); /* val = (val > 2047) ? ((val < -2048) ? -2048 : val); */ val = vec_min(val, max); val = vec_max(val, min); vec_st(val, offset, dst); offset += 8*sizeof(int16_t); } while (--i); /* }}} */ #endif dst[0] = dst0; AMBER_STOP; }
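A recurring pattern in the quantiser above is "save the sign, work on absolute values, restore the sign with vec_sel". The sketch below isolates just that idiom with a trivial placeholder operation (an arithmetic shift) standing in for the real dequantisation; sample values are invented and it assumes plain AltiVec (compile with -maltivec).

#include <altivec.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    vector signed short src  = {-6, 7, -100, 0, 33, -1, 2047, -2048};
    vector signed short zero = vec_splat_s16(0);

    /* save the sign, then take the absolute value */
    vector bool short ltzero = vec_cmplt(src, zero);
    vector signed short val  = vec_max(vec_sub(zero, src), src);

    /* placeholder for the real per-lane work (here simply val >>= 1) */
    val = vec_sra(val, vec_splat_u16(1));

    /* restore the sign: select the negated value in lanes that started negative */
    val = vec_sel(val, vec_sub(zero, val), ltzero);

    signed short out[8];
    memcpy(out, &val, sizeof out);
    for (int i = 0; i < 8; i++)
        printf("%d ", out[i]);
    printf("\n");
    return 0;
}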
static void ProjectDlightTexture_altivec( void ) { int i, l; vec_t origin0, origin1, origin2; float texCoords0, texCoords1; vector float floatColorVec0, floatColorVec1; vector float modulateVec, colorVec, zero; vector short colorShort; vector signed int colorInt; vector unsigned char floatColorVecPerm, modulatePerm, colorChar; vector unsigned char vSel = VECCONST_UINT8(0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff); float *texCoords; byte *colors; byte clipBits[SHADER_MAX_VERTEXES]; float texCoordsArray[SHADER_MAX_VERTEXES][2]; byte colorArray[SHADER_MAX_VERTEXES][4]; unsigned hitIndexes[SHADER_MAX_INDEXES]; int numIndexes; float scale; float radius; vec3_t floatColor; float modulate = 0.0f; if ( !backEnd.refdef.num_dlights ) { return; } // There has to be a better way to do this so that floatColor // and/or modulate are already 16-byte aligned. floatColorVecPerm = vec_lvsl(0,(float *)floatColor); modulatePerm = vec_lvsl(0,(float *)&modulate); modulatePerm = (vector unsigned char)vec_splat((vector unsigned int)modulatePerm,0); zero = (vector float)vec_splat_s8(0); for ( l = 0 ; l < backEnd.refdef.num_dlights ; l++ ) { dlight_t *dl; if ( !( tess.dlightBits & ( 1 << l ) ) ) { continue; // this surface definately doesn't have any of this light } texCoords = texCoordsArray[0]; colors = colorArray[0]; dl = &backEnd.refdef.dlights[l]; origin0 = dl->transformed[0]; origin1 = dl->transformed[1]; origin2 = dl->transformed[2]; radius = dl->radius; scale = 1.0f / radius; if(r_greyscale->integer) { float luminance; luminance = LUMA(dl->color[0], dl->color[1], dl->color[2]) * 255.0f; floatColor[0] = floatColor[1] = floatColor[2] = luminance; } else if(r_greyscale->value) { float luminance; luminance = LUMA(dl->color[0], dl->color[1], dl->color[2]) * 255.0f; floatColor[0] = LERP(dl->color[0] * 255.0f, luminance, r_greyscale->value); floatColor[1] = LERP(dl->color[1] * 255.0f, luminance, r_greyscale->value); floatColor[2] = LERP(dl->color[2] * 255.0f, luminance, r_greyscale->value); } else { floatColor[0] = dl->color[0] * 255.0f; floatColor[1] = dl->color[1] * 255.0f; floatColor[2] = dl->color[2] * 255.0f; } floatColorVec0 = vec_ld(0, floatColor); floatColorVec1 = vec_ld(11, floatColor); floatColorVec0 = vec_perm(floatColorVec0,floatColorVec0,floatColorVecPerm); for ( i = 0 ; i < tess.numVertexes ; i++, texCoords += 2, colors += 4 ) { int clip = 0; vec_t dist0, dist1, dist2; dist0 = origin0 - tess.xyz[i][0]; dist1 = origin1 - tess.xyz[i][1]; dist2 = origin2 - tess.xyz[i][2]; backEnd.pc.c_dlightVertexes++; texCoords0 = 0.5f + dist0 * scale; texCoords1 = 0.5f + dist1 * scale; if( !r_dlightBacks->integer && // dist . 
tess.normal[i] ( dist0 * tess.normal[i][0] + dist1 * tess.normal[i][1] + dist2 * tess.normal[i][2] ) < 0.0f ) { clip = 63; } else { if ( texCoords0 < 0.0f ) { clip |= 1; } else if ( texCoords0 > 1.0f ) { clip |= 2; } if ( texCoords1 < 0.0f ) { clip |= 4; } else if ( texCoords1 > 1.0f ) { clip |= 8; } texCoords[0] = texCoords0; texCoords[1] = texCoords1; // modulate the strength based on the height and color if ( dist2 > radius ) { clip |= 16; modulate = 0.0f; } else if ( dist2 < -radius ) { clip |= 32; modulate = 0.0f; } else { dist2 = Q_fabs(dist2); if ( dist2 < radius * 0.5f ) { modulate = 1.0f; } else { modulate = 2.0f * (radius - dist2) * scale; } } } clipBits[i] = clip; modulateVec = vec_ld(0,(float *)&modulate); modulateVec = vec_perm(modulateVec,modulateVec,modulatePerm); colorVec = vec_madd(floatColorVec0,modulateVec,zero); colorInt = vec_cts(colorVec,0); // RGBx colorShort = vec_pack(colorInt,colorInt); // RGBxRGBx colorChar = vec_packsu(colorShort,colorShort); // RGBxRGBxRGBxRGBx colorChar = vec_sel(colorChar,vSel,vSel); // RGBARGBARGBARGBA replace alpha with 255 vec_ste((vector unsigned int)colorChar,0,(unsigned int *)colors); // store color } // build a list of triangles that need light numIndexes = 0; for ( i = 0 ; i < tess.numIndexes ; i += 3 ) { int a, b, c; a = tess.indexes[i]; b = tess.indexes[i+1]; c = tess.indexes[i+2]; if ( clipBits[a] & clipBits[b] & clipBits[c] ) { continue; // not lighted } hitIndexes[numIndexes] = a; hitIndexes[numIndexes+1] = b; hitIndexes[numIndexes+2] = c; numIndexes += 3; } if ( !numIndexes ) { continue; } qglEnableClientState( GL_TEXTURE_COORD_ARRAY ); qglTexCoordPointer( 2, GL_FLOAT, 0, texCoordsArray[0] ); qglEnableClientState( GL_COLOR_ARRAY ); qglColorPointer( 4, GL_UNSIGNED_BYTE, 0, colorArray ); GL_Bind( tr.dlightImage ); // include GLS_DEPTHFUNC_EQUAL so alpha tested surfaces don't add light // where they aren't rendered if ( dl->additive ) { GL_State( GLS_SRCBLEND_ONE | GLS_DSTBLEND_ONE | GLS_DEPTHFUNC_EQUAL ); } else { GL_State( GLS_SRCBLEND_DST_COLOR | GLS_DSTBLEND_ONE | GLS_DEPTHFUNC_EQUAL ); } R_DrawElements( numIndexes, hitIndexes ); backEnd.pc.c_totalIndexes += numIndexes; backEnd.pc.c_dlightIndexes += numIndexes; } }
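The colour path above forces the alpha byte to 255 with `vec_sel(colorChar, vSel, vSel)`: because vSel is 0x00 in the RGB bytes and 0xff in the alpha bytes, passing it as both the replacement value and the mask overwrites only the alpha bytes. Here is a small standalone sketch of that trick with made-up pixel values (compile with -maltivec).

#include <altivec.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    /* four RGBx pixels with arbitrary garbage in the x (alpha) byte */
    vector unsigned char rgba = {
        10, 20, 30, 77,    40, 50, 60, 3,
        70, 80, 90, 250,   100, 110, 120, 0
    };
    vector unsigned char vSel = {
        0x00, 0x00, 0x00, 0xff,  0x00, 0x00, 0x00, 0xff,
        0x00, 0x00, 0x00, 0xff,  0x00, 0x00, 0x00, 0xff
    };

    /* alpha bytes become 0xff, RGB bytes are preserved */
    rgba = vec_sel(rgba, vSel, vSel);

    unsigned char out[16];
    memcpy(out, &rgba, sizeof out);
    for (int i = 0; i < 16; i++)
        printf("%d ", out[i]);
    printf("\n");
    return 0;
}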
static inline vector unsigned int Ch(const vector unsigned int b, const vector unsigned int c, const vector unsigned int d) { return vec_sel(d,c,b); }
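As a companion to the Ch() helper above (this addition is not from the original source), the SHA-2 majority function Maj(b,c,d) = (b&c)^(b&d)^(c&d) can also be phrased as a single bit-select: wherever b and c agree the majority is that common bit, and wherever they disagree the majority is decided by d.

#include <altivec.h>

/* Maj via vec_sel: take c where b == c (per bit), take d where b != c. */
static inline vector unsigned int Maj(const vector unsigned int b,
                                      const vector unsigned int c,
                                      const vector unsigned int d)
{
    return vec_sel(c, d, vec_xor(b, c));
}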