/* dst[i] = src0[i] * src1[len-1-i]: multiply src0 by src1 read backwards.
 * Processes 8 floats (two vectors) per iteration, so len is expected to be
 * a multiple of 8 and all pointers 16-byte aligned (vec_ld/vec_st truncate
 * addresses to 16-byte boundaries). */
static void vector_fmul_reverse_altivec(float *dst, const float *src0,
                                        const float *src1, int len)
{
    int i;
    vector float d, s0, s1, h0, l0, s2, s3,
                 zero = (vector float)vec_splat_u32(0);
    /* Point at the last whole vector of src1 so that src1-i walks backwards. */
    src1 += len-4;
    for(i=0; i<len-7; i+=8) {
        /* Reverse the four lanes of a src1 vector using only merges:
         * [a,b,c,d] -> duplicate-merge -> cross-merge -> [d,c,b,a]. */
        s1 = vec_ld(0, src1-i);              // [a,b,c,d]
        s0 = vec_ld(0, src0+i);
        l0 = vec_mergel(s1, s1);             // [c,c,d,d]
        s3 = vec_ld(-16, src1-i);
        h0 = vec_mergeh(s1, s1);             // [a,a,b,b]
        s2 = vec_ld(16, src0+i);
        s1 = vec_mergeh(vec_mergel(l0,h0),   // [d,b,d,b]
                        vec_mergeh(l0,h0));  // [c,a,c,a]
                                             // [d,c,b,a]
        /* Same lane reversal for the second src1 vector, interleaved with
         * the first multiply/store to hide latency. */
        l0 = vec_mergel(s3, s3);
        d = vec_madd(s0, s1, zero);          // madd with zero addend == multiply
        h0 = vec_mergeh(s3, s3);
        vec_st(d, 0, dst+i);
        s3 = vec_mergeh(vec_mergel(l0,h0),
                        vec_mergeh(l0,h0));
        d = vec_madd(s2, s3, zero);
        vec_st(d, 16, dst+i);
    }
}
/* Testcase for vec_mergeh/vec_mergel on 64-bit lanes (vector long and
 * vector double).  The expected vectors differ by target endianness:
 * the "high"/"low" halves selected by mergeh/mergel are defined in
 * big-endian element order, so the little-endian expectations are the
 * mirrored ones.  check()/vec_long_eq()/vec_double_eq() are test-harness
 * helpers defined elsewhere. */
static void test()
{
    /* Input vectors. */
    vector long vla = {-2,-1};
    vector long vlb = {0,1};
    vector double vda = {-2.0,-1.0};
    vector double vdb = {0.0,1.0};

    /* Result vectors. */
    vector long vlh, vll;
    vector double vdh, vdl;

    /* Expected result vectors. */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
    vector long vlrh = {1,-1};
    vector long vlrl = {0,-2};
    vector double vdrh = {1.0,-1.0};
    vector double vdrl = {0.0,-2.0};
#else
    vector long vlrh = {-2,0};
    vector long vlrl = {-1,1};
    vector double vdrh = {-2.0,0.0};
    vector double vdrl = {-1.0,1.0};
#endif

    vlh = vec_mergeh (vla, vlb);
    vll = vec_mergel (vla, vlb);
    vdh = vec_mergeh (vda, vdb);
    vdl = vec_mergel (vda, vdb);

    check (vec_long_eq (vlh, vlrh), "vlh");
    check (vec_long_eq (vll, vlrl), "vll");
    check (vec_double_eq (vdh, vdrh), "vdh" );
    check (vec_double_eq (vdl, vdrl), "vdl" );
}
/* Convert float samples to interleaved int16 output.
 * channels==1: plain conversion; channels==2: SIMD interleave, with a
 * vec_lvsl/vec_lvsr fix-up path when dst is not 16-byte aligned;
 * otherwise: convert each channel into an aligned scratch buffer and
 * scatter scalars with stride == channels. */
static void float_to_int16_interleave_altivec(int16_t *dst, const float **src,
                                              long len, int channels)
{
    int i;
    vector signed short d0, d1, d2, c0, c1, t0, t1;
    vector unsigned char align;
    if(channels == 1)
        float_to_int16_altivec(dst, src[0], len);
    else
        if (channels == 2) {
        if(((long)dst) & 15)
            /* Unaligned dst: read the two edge vectors that the 32-byte
             * store window only partially covers, rotate the interleaved
             * data into place with vec_perm, and write three vectors.
             * Note dst advances by 8 per pass while i also advances by 8,
             * so dst + i moves 16 shorts (8 stereo frames = 32 bytes) per
             * iteration. */
            for(i = 0; i < len - 7; i += 8) {
                d0 = vec_ld(0, dst + i);
                t0 = float_to_int16_one_altivec(src[0] + i);
                d1 = vec_ld(31, dst + i);
                t1 = float_to_int16_one_altivec(src[1] + i);
                c0 = vec_mergeh(t0, t1);   // L/R interleave, first 4 frames
                c1 = vec_mergel(t0, t1);   // L/R interleave, last 4 frames
                d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i));
                align = vec_lvsr(0, dst + i);
                d0 = vec_perm(d2, c0, align);
                d1 = vec_perm(c0, c1, align);
                vec_st(d0, 0, dst + i);
                d0 = vec_perm(c1, d2, align);
                vec_st(d1, 15, dst + i);
                vec_st(d0, 31, dst + i);
                dst += 8;
            }
        else
            /* Aligned dst: straight merge + store. */
            for(i = 0; i < len - 7; i += 8) {
                t0 = float_to_int16_one_altivec(src[0] + i);
                t1 = float_to_int16_one_altivec(src[1] + i);
                d0 = vec_mergeh(t0, t1);
                d1 = vec_mergel(t0, t1);
                vec_st(d0, 0, dst + i);
                vec_st(d1, 16, dst + i);
                dst += 8;
            }
    } else {
        /* Generic channel count.  NOTE(review): the VLA is len shorts on
         * the stack -- assumes len is modest; confirm callers bound it. */
        DECLARE_ALIGNED(16, int16_t, tmp)[len];
        int c, j;
        for (c = 0; c < channels; c++) {
            float_to_int16_altivec(tmp, src[c], len);
            for (i = 0, j = c; i < len; i++, j += channels) {
                dst[j] = tmp[i];
            }
        }
    }
}
/* Horizontal reduction of four vectors: lane i of the result is the sum of
 * all four lanes of the i-th argument.  Each merge+add pass folds pairs of
 * lanes; the 8-byte vec_sld shift-add finishes each two-vector reduction,
 * and the final mergeh interleaves the (a,c) and (b,d) partial sums. */
inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                 const v_float32x4& c, const v_float32x4& d)
{
    vec_float4 sum_ac = vec_add(vec_mergel(a.val, c.val),
                                vec_mergeh(a.val, c.val));
    sum_ac = vec_add(sum_ac, vec_sld(sum_ac, sum_ac, 8));

    vec_float4 sum_bd = vec_add(vec_mergel(b.val, d.val),
                                vec_mergeh(b.val, d.val));
    sum_bd = vec_add(sum_bd, vec_sld(sum_bd, sum_bd, 8));

    return v_float32x4(vec_mergeh(sum_ac, sum_bd));
}
/* Testcase for vec_mergeh/vec_mergel across element widths: 64-bit
 * (long long, double) and 32-bit (unsigned/signed int, float) lanes.
 * Expected values are written in big-endian element order (no endian
 * #if here, unlike the 2-type variant of this test).  check() and the
 * vec_*_eq helpers come from the test harness. */
static void test()
{
    /* Input vectors. */
    vector long long vla = {-2,-1};
    vector long long vlb = {0,1};
    vector double vda = {-2.0,-1.0};
    vector double vdb = {0.0,1.0};
    vector unsigned int vuia = {0,1,2,3};
    vector unsigned int vuib = {4,5,6,7};
    vector signed int vsia = {-4,-3,-2,-1};
    vector signed int vsib = {0,1,2,3};
    vector float vfa = {-4.0,-3.0,-2.0,-1.0};
    vector float vfb = {0.0,1.0,2.0,3.0};

    /* Result vectors. */
    vector long long vlh, vll;
    vector double vdh, vdl;
    vector unsigned int vuih, vuil;
    vector signed int vsih, vsil;
    vector float vfh, vfl;

    /* Expected result vectors. */
    vector long long vlrh = {-2,0};
    vector long long vlrl = {-1,1};
    vector double vdrh = {-2.0,0.0};
    vector double vdrl = {-1.0,1.0};
    vector unsigned int vuirh = {0,4,1,5};
    vector unsigned int vuirl = {2,6,3,7};
    vector signed int vsirh = {-4,0,-3,1};
    vector signed int vsirl = {-2,2,-1,3};
    vector float vfrh = {-4.0,0.0,-3.0,1.0};
    vector float vfrl = {-2.0,2.0,-1.0,3.0};

    vlh = vec_mergeh (vla, vlb);
    vll = vec_mergel (vla, vlb);
    vdh = vec_mergeh (vda, vdb);
    vdl = vec_mergel (vda, vdb);
    vuih = vec_mergeh (vuia, vuib);
    vuil = vec_mergel (vuia, vuib);
    vsih = vec_mergeh (vsia, vsib);
    vsil = vec_mergel (vsia, vsib);
    vfh = vec_mergeh (vfa, vfb );
    vfl = vec_mergel (vfa, vfb );

    check (vec_long_long_eq (vlh, vlrh), "vlh");
    check (vec_long_long_eq (vll, vlrl), "vll");
    check (vec_double_eq (vdh, vdrh), "vdh" );
    check (vec_double_eq (vdl, vdrl), "vdl" );
    check (vec_all_eq (vuih, vuirh), "vuih");
    check (vec_all_eq (vuil, vuirl), "vuil");
    check (vec_all_eq (vsih, vsirh), "vsih");
    check (vec_all_eq (vsil, vsirl), "vsil");
    check (vec_all_eq (vfh, vfrh), "vfh");
    check (vec_all_eq (vfl, vfrl), "vfl");
}
/* Per-byte absolute difference of two RGBA images, written back into
 * `image`: widen each byte to signed short (merge with zero), saturating
 * subtract, take |x|, pack back to unsigned chars.  Processes 16 bytes
 * (4 RGBA pixels) per iteration; assumes xsize*ysize is a multiple of 4
 * and both buffers are 16-byte aligned.  datasize must be non-zero
 * (do/while). */
void pix_diff :: processRGBA_Altivec(imageStruct &image, imageStruct &right)
{
    int datasize = image.xsize * image.ysize / 4;

    vector signed short hiImage, loImage, hiRight, loRight;
    vector unsigned char zero = vec_splat_u8(0);
    vector unsigned char *inData = (vector unsigned char *)image.data;
    vector unsigned char *rightData = (vector unsigned char *)right.data;

#ifndef PPC970
    /* G4 needs explicit data-stream prefetch; the 970 prefetches itself. */
    UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 );
    vec_dst( inData, prefetchSize, 0 );
    vec_dst( rightData, prefetchSize, 1 );
    vec_dst( inData+256, prefetchSize, 2 );
    vec_dst( rightData+256, prefetchSize, 3 );
#endif
    do {
#ifndef PPC970
        /* Re-issue the prefetch streams from the current position. */
        vec_dst( inData, prefetchSize, 0 );
        vec_dst( rightData, prefetchSize, 1 );
        vec_dst( inData+256, prefetchSize, 2 );
        vec_dst( rightData+256, prefetchSize, 3 );
#endif
        /* Zero-extend bytes to shorts (zero in the even/high byte). */
        hiImage = (vector signed short)vec_mergeh(zero,inData[0]);
        loImage = (vector signed short)vec_mergel(zero,inData[0]);
        hiRight = (vector signed short)vec_mergeh(zero,rightData[0]);
        loRight = (vector signed short)vec_mergel(zero,rightData[0]);

        hiImage = vec_subs(hiImage,hiRight);
        loImage = vec_subs(loImage,loRight);

        hiImage = vec_abs(hiImage);
        loImage = vec_abs(loImage);

        /* Saturating pack back to 16 unsigned bytes, stored in place. */
        inData[0] = vec_packsu(hiImage,loImage);

        inData++;
        rightData++;
    } while (--datasize);
#ifndef PPC970
    vec_dss( 0 );
    vec_dss( 1 );
    vec_dss( 2 );
    vec_dss( 3 );
#endif
}
/* Compiler testcase: exercises overloads of vec_andc/vec_double/
 * vec_mergeh/vec_mergel/vec_nor/vec_or/vec_sel/vec_xor across bool,
 * signed and unsigned element types.  All operands (vbla, vslla, vda,
 * ...) are file-scope globals declared elsewhere in the test; results
 * are stored through the pointer parameters only so the calls are not
 * optimized away. */
void foo (vector bool long long *vblr, vector double *vdr,
          vector unsigned long long *vullz, vector double *vdz,
          vector bool char *vbcz, vector signed char *vscz,
          vector unsigned char *vucz, vector bool int *vbiz,
          vector int *viz, vector unsigned int *vuiz,
          vector signed long long int *vslliz, vector bool short int *vbsiz,
          vector signed short int *vssiz, vector unsigned short int *vusiz,
          vector float *vfz)
{
  *vblr++ = vec_andc (vbla, vblb);
  *vdr++ = vec_double (vslla);
  *vdr++ = vec_double (vulla);
  *vblr++ = vec_mergeh (vbla, vblb);
  *vblr++ = vec_mergel (vbla, vblb);
  *vblr++ = vec_nor (vbla, vblb);
  *vblr++ = vec_or (vbla, vblb);
  *vblr++ = vec_sel (vbla, vblb, vblc);
  *vblr++ = vec_sel (vbla, vblb, vullc);
  *vblr++ = vec_xor (vbla, vblb);
  *vullz++ = vec_sel (vulla, vullb, vbllc);
  *vullz++ = vec_sel (vulla, vullb, vullc);
  *vdz++ = vec_sel(vda, vdb, vullc);
  *vbcz++ = vec_sel (vbca, vbcb, vbcc);
  *vbcz++ = vec_sel (vbca, vbcb, vucc);
  *vbcz++ = vec_xor (vbca, vbcb);
  *vscz++ = vec_sel (vsca, vscb, vbcc);
  *vscz++ = vec_sel (vsca, vscb, vucc);
  *vucz++ = vec_sel (vuca, vucb, vbcc);
  *vucz++ = vec_sel (vuca, vucb, vucc);
  *vbiz++ = vec_sel (vbia, vbib, vbic);
  *vbiz++ = vec_sel (vbia, vbib, vuic);
  *vbiz++ = vec_xor (vbia, vbib);
  *viz++ = vec_sel (vsia, vsib, vbic);
  *viz++ = vec_sel (vsia, vsib, vuic);
  *vuiz++ = vec_sel (vuia, vuib, vbic);
  *vuiz++ = vec_sel (vuia, vuib, vuic);
  *vslliz++ = vec_sel(vslla, vsllb, vbllc);
  *vslliz++ = vec_sel(vslla, vsllb, vullc);
  *vssiz++ = vec_sel(vssia, vssib, vbsic);
  *vssiz++ = vec_sel(vssia, vssib, vusic);
  *vusiz++ = vec_sel(vusia, vusib, vbsic);
  *vusiz++ = vec_sel(vusia, vusib, vusic);
  *vbsiz++ = vec_sel (vbsia, vbsib, vbsic);
  *vbsiz++ = vec_sel (vbsia, vbsib, vusic);
  *vbsiz++ = vec_xor (vbsia, vbsib);
  *vdz++ = vec_sel (vda, vdb, vbllc);
  *vfz++ = vec_sel (vfa, vfb, vbic);
  *vfz++ = vec_sel (vfa, vfb, vuic);
  *vfz++ = vec_xor (vfa, vfb);
}
/* Convert float samples to interleaved int16 output (braced variant of
 * the same routine found earlier in this file).
 * channels==1: plain conversion; channels==2: SIMD interleave with an
 * unaligned-dst fix-up path; otherwise: delegate each channel to a
 * strided converter. */
static void float_to_int16_interleave_altivec(int16_t *dst, const float **src,
                                              long len, int channels)
{
    int i;
    vector signed short d0, d1, d2, c0, c1, t0, t1;
    vector unsigned char align;

    if (channels == 1)
        float_to_int16_altivec(dst, src[0], len);
    else {
        if (channels == 2) {
            if (((long)dst) & 15) {
                /* Unaligned dst: load the two edge vectors the 32-byte
                 * window only partially covers, rotate the interleaved
                 * samples into place with vec_perm, store three vectors.
                 * dst += 8 plus i += 8 moves dst+i by 16 shorts/pass. */
                for (i = 0; i < len - 7; i += 8) {
                    d0 = vec_ld(0, dst + i);
                    t0 = float_to_int16_one_altivec(src[0] + i);
                    d1 = vec_ld(31, dst + i);
                    t1 = float_to_int16_one_altivec(src[1] + i);
                    c0 = vec_mergeh(t0, t1);   // L/R, first 4 frames
                    c1 = vec_mergel(t0, t1);   // L/R, last 4 frames
                    d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i));
                    align = vec_lvsr(0, dst + i);
                    d0 = vec_perm(d2, c0, align);
                    d1 = vec_perm(c0, c1, align);
                    vec_st(d0, 0, dst + i);
                    d0 = vec_perm(c1, d2, align);
                    vec_st(d1, 15, dst + i);
                    vec_st(d0, 31, dst + i);
                    dst += 8;
                }
            } else {
                /* Aligned dst: straight merge + store. */
                for (i = 0; i < len - 7; i += 8) {
                    t0 = float_to_int16_one_altivec(src[0] + i);
                    t1 = float_to_int16_one_altivec(src[1] + i);
                    d0 = vec_mergeh(t0, t1);
                    d1 = vec_mergel(t0, t1);
                    vec_st(d0, 0, dst + i);
                    vec_st(d1, 16, dst + i);
                    dst += 8;
                }
            }
        } else {
            /* Generic channel count: one strided pass per channel. */
            for (i = 0; i < channels; i++)
                float_to_int16_stride_altivec(dst + i, src[i], len, channels);
        }
    }
}
/* Convert 16-bit integer samples to scaled floats for the Core Audio
 * output buffer.  When gBufferMono2Stereo is set, each mono sample is
 * duplicated into an L/R pair (8 input samples -> 16 output floats). */
static void OSX_AudioIOProc16Bit_Altivec(SInt16 *myInBuffer, float *myOutBuffer)
{
    register UInt32 i;
    float f = SOUND_BUFFER_SCALE_16BIT;
    const vector float gain = vec_load_ps1(&f); // multiplier
    const vector float mix = vec_setzero();
    if (gBufferMono2Stereo) {
        int j = 0;
        // TEST: OK
        for (i = 0; i < SOUND_BUFFER_SIZE; i += 8, j += 16) {
            vector short int v0 = vec_ld(0, myInBuffer + i);                    // Load 8 shorts
            vector float v1 = vec_ctf((vector signed int)vec_unpackh(v0), 0);   // convert to float
            vector float v2 = vec_ctf((vector signed int)vec_unpackl(v0), 0);   // convert to float
            vector float v3 = vec_madd(v1, gain, mix);                          // scale
            vector float v4 = vec_madd(v2, gain, mix);                          // scale
            /* BUGFIX: on big-endian AltiVec, vec_mergeh(v,v) duplicates the
             * FIRST two lanes (0,0,1,1) and vec_mergel the LAST two (2,2,3,3).
             * The original assigned mergel to the vector stored first, so
             * frames were emitted as 2,2,3,3,0,0,1,1; the comments already
             * described the intended (sequential) order implemented here. */
            vector float v5 = vec_mergeh(v3, v3);                               // v3(0,0,1,1)
            vector float v6 = vec_mergel(v3, v3);                               // v3(2,2,3,3)
            vector float v7 = vec_mergeh(v4, v4);                               // v4(0,0,1,1)
            vector float v8 = vec_mergel(v4, v4);                               // v4(2,2,3,3)
            vec_st(v5, 0, myOutBuffer + j);                                     // Store 4 floats
            vec_st(v6, 0, myOutBuffer + 4 + j);                                 // Store 4 floats
            vec_st(v7, 0, myOutBuffer + 8 + j);                                 // Store 4 floats
            vec_st(v8, 0, myOutBuffer + 12 + j);                                // Store 4 floats
        }
    } else {
        // TEST: OK
        for (i = 0; i < SOUND_BUFFER_SIZE; i += 8) {
            vector short int v0 = vec_ld(0, myInBuffer + i);                    // Load 8 shorts
            vector float v1 = vec_ctf((vector signed int)vec_unpackh(v0), 0);   // convert to float
            vector float v2 = vec_ctf((vector signed int)vec_unpackl(v0), 0);   // convert to float
            vector float v3 = vec_madd(v1, gain, mix);                          // scale
            vector float v4 = vec_madd(v2, gain, mix);                          // scale
            vec_st(v3, 0, myOutBuffer + i);                                     // Store 4 floats
            vec_st(v4, 0, myOutBuffer + 4 + i);                                 // Store 4 floats
        }
    }
}
/* Per-byte multiply of two packed-byte vectors with 255 as the unit value:
 * each byte becomes round(p*a/255), approximated as
 *   t = p*a + 0x80;  result = (t + (t >> 8)) >> 8
 * computed in two 8-lane short halves and packed back to bytes. */
static force_inline vector unsigned int
pix_multiply (vector unsigned int p, vector unsigned int a)
{
    vector unsigned short hi, lo, mod;

    /* unpack to short (zero-extend the high 8 bytes of each input) */
    hi = (vector unsigned short)
	vec_mergeh ((vector unsigned char)AVV (0),
		    (vector unsigned char)p);

    mod = (vector unsigned short)
	vec_mergeh ((vector unsigned char)AVV (0),
		    (vector unsigned char)a);

    /* p*a + 0x80, then the shift-add-shift divide-by-255 approximation */
    hi = vec_mladd (hi, mod, (vector unsigned short)
                    AVV (0x0080, 0x0080, 0x0080, 0x0080,
                         0x0080, 0x0080, 0x0080, 0x0080));

    hi = vec_adds (hi, vec_sr (hi, vec_splat_u16 (8)));

    hi = vec_sr (hi, vec_splat_u16 (8));

    /* unpack to short (same treatment for the low 8 bytes) */
    lo = (vector unsigned short)
	vec_mergel ((vector unsigned char)AVV (0),
		    (vector unsigned char)p);
    mod = (vector unsigned short)
	vec_mergel ((vector unsigned char)AVV (0),
		    (vector unsigned char)a);

    lo = vec_mladd (lo, mod, (vector unsigned short)
                    AVV (0x0080, 0x0080, 0x0080, 0x0080,
                         0x0080, 0x0080, 0x0080, 0x0080));

    lo = vec_adds (lo, vec_sr (lo, vec_splat_u16 (8)));

    lo = vec_sr (lo, vec_splat_u16 (8));

    /* saturating pack of both halves back to 16 bytes */
    return (vector unsigned int)vec_packsu (hi, lo);
}
/* Store three 4-lane float vectors interleaved as
 *   a0 b0 c0 a1 | b1 c1 a2 b2 | c2 a3 b3 c3
 * i.e. 12 floats of 3-channel data starting at ptr.  The byte-index
 * permute constants select from a 32-byte (two-vector) input: indices
 * 0-15 pick from the first operand, 16-31 from the second. */
void v_store_interleave_f32(float *ptr, vector float a, vector float b, vector float c)
{
    /* hbc = b0 c0 b1 c1 */
    vector float hbc = vec_mergeh(b, c);
    /* a0 | b0 c0 b1(->replaced) | a1 : bytes {a0, hbc[0], hbc[1], a1} */
    static const vector unsigned char ahbc = {0, 1, 2, 3, 16, 17, 18, 19,
                                              20, 21, 22, 23, 4, 5, 6, 7};
    vec_xst(vec_perm(a, hbc, ahbc), 0, ptr);

    /* lab = a2 b2 a3 b3; shift-concat gives the middle vector b1 c1 a2 b2 */
    vector float lab = vec_mergel(a, b);
    vec_xst(vec_sld(lab, hbc, 8), 16, ptr);

    /* last vector: c2 a3 b3 c3 */
    static const vector unsigned char clab = {8, 9, 10, 11, 24, 25, 26, 27,
                                              28, 29, 30, 31, 12, 13, 14, 15};
    vec_xst(vec_perm(c, lab, clab), 32, ptr);
}
/* 16x16 planar intra prediction: fit a plane through the top/left border
 * pixels (H = horizontal gradient, V = vertical gradient, a = corner DC
 * term), then fill the block with clip((a' + b*x + c*y + 16) >> 5) one
 * row per iteration.  src points at the block; FDEC_STRIDE is the frame
 * decode stride, border pixels live at negative offsets. */
static void predict_16x16_p_altivec( uint8_t *src )
{
    int16_t a, b, c, i;
    int H = 0;
    int V = 0;
    int16_t i00;

    /* Weighted sums of border differences give the plane gradients. */
    for( i = 1; i <= 8; i++ )
    {
        H += i * ( src[7+i - FDEC_STRIDE ]  - src[7-i - FDEC_STRIDE ] );
        V += i * ( src[(7+i)*FDEC_STRIDE -1] - src[(7-i)*FDEC_STRIDE -1] );
    }

    a = 16 * ( src[15*FDEC_STRIDE -1] + src[15 - FDEC_STRIDE] );
    b = ( 5 * H + 32 ) >> 6;
    c = ( 5 * V + 32 ) >> 6;
    i00 = a - b * 7 - c * 7 + 16;   // plane value at (0,0), pre-rounded

    /* Move the scalars into vector lanes via the union transfer buffers. */
    vect_sshort_u i00_u, b_u, c_u;
    i00_u.s[0] = i00;
    b_u.s[0]   = b;
    c_u.s[0]   = c;

    vec_u16_t val5_v = vec_splat_u16(5);
    vec_s16_t i00_v, b_v, c_v;
    i00_v = vec_splat(i00_u.v, 0);
    b_v = vec_splat(b_u.v, 0);
    c_v = vec_splat(c_u.v, 0);

    /* mul_b_induc0_v = b * {0..7}; the even/odd 32-bit products are
     * re-ordered by mergeh/mergel before packing back to shorts. */
    vec_s16_t induc_v  = (vec_s16_t) CV(0, 1, 2, 3, 4, 5, 6, 7);
    vec_s16_t b8_v = vec_sl(b_v, vec_splat_u16(3));   // b * 8
    vec_s32_t mule_b_v = vec_mule(induc_v, b_v);
    vec_s32_t mulo_b_v = vec_mulo(induc_v, b_v);
    vec_s16_t mul_b_induc0_v = vec_pack(vec_mergeh(mule_b_v, mulo_b_v),
                                        vec_mergel(mule_b_v, mulo_b_v));
    vec_s16_t add_i0_b_0v = vec_adds(i00_v, mul_b_induc0_v);  // cols 0-7
    vec_s16_t add_i0_b_8v = vec_adds(b8_v, add_i0_b_0v);      // cols 8-15

    int y;
    for( y = 0; y < 16; y++ )
    {
        /* >>5 with saturating pack-to-unsigned does the final clip. */
        vec_s16_t shift_0_v = vec_sra(add_i0_b_0v, val5_v);
        vec_s16_t shift_8_v = vec_sra(add_i0_b_8v, val5_v);
        vec_u8_t com_sat_v = vec_packsu(shift_0_v, shift_8_v);
        vec_st( com_sat_v, 0, &src[0]);
        src += FDEC_STRIDE;
        i00 += c;
        /* advance one row: add the vertical gradient c */
        add_i0_b_0v = vec_adds(add_i0_b_0v, c_v);
        add_i0_b_8v = vec_adds(add_i0_b_8v, c_v);
    }
}
/* Compiler testcase: exercises the vector-bool-long-long overloads of
 * vec_andc/vec_mergeh/vec_mergel/vec_nor/vec_or/vec_sel/vec_xor plus the
 * vec_double conversions.  Operands (vbla, vsla, vula, ...) are globals
 * declared elsewhere in the test; stores through the pointer parameters
 * keep the calls live. */
void foo (vector bool long long *vblr, vector double *vdr)
{
  *vblr++ = vec_andc (vbla, vblb);
  *vdr++ = vec_double (vsla);
  *vdr++ = vec_double (vula);
  *vblr++ = vec_mergeh (vbla, vblb);
  *vblr++ = vec_mergel (vbla, vblb);
  *vblr++ = vec_nor (vbla, vblb);
  *vblr++ = vec_or (vbla, vblb);
  *vblr++ = vec_sel (vbla, vblb, vblc);
  *vblr++ = vec_sel (vbla, vblb, vulc);
  *vblr++ = vec_xor (vbla, vblb);
}
/* Place the content of the array of structures in vectors x_vec, y_vec,
   z_vec, and t_vec: a classic AoS -> SoA 4x4 transpose done with two
   rounds of mergeh/mergel.  p_motion is an array of 4-float structures
   defined elsewhere; each element is assumed 16-byte aligned for vec_ld. */
int main(int argc, char **argv)
{
    vector float x_vec, y_vec, z_vec, t_vec, hold[4], tmp[4];

    /* Load structures into vectors (one structure per vector) */
    hold[0] = vec_ld(0, (float*)p_motion);
    hold[1] = vec_ld(0, (float*)&p_motion[1]);
    hold[2] = vec_ld(0, (float*)&p_motion[2]);
    hold[3] = vec_ld(0, (float*)&p_motion[3]);

    /* Perform first step of the swizzle (transpose pass 1) */
    tmp[0] = vec_mergeh(hold[0], hold[2]);
    tmp[1] = vec_mergeh(hold[1], hold[3]);
    tmp[2] = vec_mergel(hold[0], hold[2]);
    tmp[3] = vec_mergel(hold[1], hold[3]);

    /* Perform second step of the swizzle (transpose pass 2:
       each result vector now holds one field from all 4 structures) */
    x_vec = vec_mergeh(tmp[0], tmp[1]);
    y_vec = vec_mergel(tmp[0], tmp[1]);
    z_vec = vec_mergeh(tmp[2], tmp[3]);
    t_vec = vec_mergel(tmp[2], tmp[3]);

    return 0;
}
/* Interleave two planar 256-sample channels (_f[0..255] = left,
 * _f[256..511] = right) into s16 stereo frames and convert to int16.
 * The #if 0 branch is the scalar reference kept for documentation; the
 * AltiVec path handles 8 frames per iteration.  convert16_altivec and
 * unaligned_store are helpers defined elsewhere.  Always returns 512. */
static int a52_resample_STEREO_to_2_altivec(float * _f, int16_t * s16){
#if 0
    int i;
    int32_t * f = (int32_t *) _f;
    for (i = 0; i < 256; i++) {
        s16[2*i] = convert (f[i]);
        s16[2*i+1] = convert (f[i+256]);
    }
    return 2*256;
#else
    int i = 0;
    int32_t * f = (int32_t *) _f;
    register vector signed int f0, f4, f256, f260;
    register vector signed short reven, rodd, r0, r1;
    for (i = 0; i < 256; i+= 8) {
        /* left channel at f, right channel 256 floats (1024 bytes) later */
        f0 = vec_ld(0, f);
        f4 = vec_ld(16, f);
        f256 = vec_ld(1024, f);
        f260 = vec_ld(1040, f);
        reven = convert16_altivec(f0, f4);
        rodd = convert16_altivec(f256, f260);
        /* merge L/R shorts into interleaved stereo frames */
        r0 = vec_mergeh(reven, rodd);
        r1 = vec_mergel(reven, rodd);
        // FIXME can be merged to spare some I/O
        unaligned_store(r0, 0, s16);
        unaligned_store(r1, 16, s16);
        f += 8;
        s16 += 16;
    }
    return(2*256);
#endif
}
/* H.264 quarter-pel 16x16 vertical 6-tap lowpass filter:
 * out = clip((src[-2] - 5*src[-1] + 20*src[0] + 20*src[1] - 5*src[2]
 *             + src[3] + 16) >> 5), applied down each column.
 * Five source rows are pre-loaded and widened to shorts; the loop loads
 * one new row per output row and rotates the sliding window.
 * OP_U8_ALTIVEC supplies the put/avg store operation (template macro). */
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src,
                                                 int dstStride, int srcStride)
{
    register int i;

    LOAD_ZERO;
    const vec_u8 perm = vec_lvsl(0, src);
    /* filter constants: 20, 5 and the +16 rounding term */
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);

    /* Each row needs two aligned loads + a perm because src may be
     * unaligned; rows M2..P2 are the initial filter window. */
    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
    //srcbis += srcStride;
    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
    //srcbis += srcStride;

    /* Widen each 16-byte row to two vectors of shorts (A = first 8
     * columns, B = last 8). */
    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, fsum, srcP3a, srcP3b, srcP3;

    for (i = 0 ; i < 16 ; i++) {
        /* Bring in row P3 and widen it. */
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
        //srcbis += srcStride;

        /* Symmetric tap pairs: 20*(P0+P1), -5*(M1+P2), 1*(M2+P3). */
        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        /* Slide the 5-row window down by one row. */
        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);      // 20*sum1 + 16
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);   // 5*sum2
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);                // >> 5
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);               // saturating clip to u8

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
tmp1 = vec_perm(table[6], table[7], tmpIndex); stmp1 = vec_perm(slope_cos[6], slope_cos[7], tmpIndex); select = (vector unsigned short)vec_cmpgt(PerIndex, (((vector unsigned char){95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95})) ); tmp3 = vec_sel(tmp0, tmp1, select); stmp3 = vec_sel(stmp0, stmp1, select); select = (vector unsigned short)vec_cmpgt(PerIndex, (((vector unsigned char){63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63})) ); table1 = vec_sel(tmp2, tmp3, select); slope1 = vec_sel(stmp2, stmp3, select); L_tmp0 = vec_sra(vec_mule(slope0, offset0), (((vector unsigned int){12,12,12,12})) ); L_tmp1 = vec_sra(vec_mulo(slope0, offset0), (((vector unsigned int){12,12,12,12})) ); L_tmp2 = vec_sra(vec_mule(slope1, offset1), (((vector unsigned int){12,12,12,12})) ); L_tmp3 = vec_sra(vec_mulo(slope1, offset1), (((vector unsigned int){12,12,12,12})) ); tmp0 = vec_packs(L_tmp0, L_tmp2); tmp1 = vec_packs(L_tmp1, L_tmp3); tmp2 = vec_mergeh(tmp0, tmp1); tmp3 = vec_mergel(tmp0, tmp1); lspq[0] = vec_adds(table0, tmp2); lspq[1] = vec_adds(table1, tmp3); return; }
/* YUV422 background removal: compares the live image against a saved
 * reference frame and, where Y, U and V all fall inside the configured
 * tolerance windows (m_Yrange/m_Urange/m_Vrange), blanks the pixel
 * (Y -> 0, U/V -> 128).  Bytes are interleaved U Y V Y; vec_mule/vec_mulo
 * with a splat-1 vector split even (chroma) and odd (luma) bytes into
 * shorts.  Processes 8 pixels (16 bytes) per inner iteration. */
void pix_background :: processYUVAltivec(imageStruct &image)
{
    register int h,w,i,j,width;
    int pixsize = image.xsize * image.ysize * image.csize;
    h = image.ysize;
    w = image.xsize/8;
    width = image.xsize/8;

    //check to see if the buffer isn't 16byte aligned (highly unlikely)
    if (image.ysize*image.xsize % 16 != 0){
        error("image not properly aligned for Altivec - try something SD or HD maybe?");
        return;
    }

    //transfer buffer: lets scalar values be loaded into vector lanes
    union{
        unsigned short s[8];
        vector unsigned short v;
    }shortBuffer;

    //force a reference-frame recapture whenever the format changes
    if(m_savedImage.xsize!=image.xsize ||
       m_savedImage.ysize!=image.ysize ||
       m_savedImage.format!=image.format)m_reset=1;

    m_savedImage.xsize=image.xsize;
    m_savedImage.ysize=image.ysize;
    m_savedImage.setCsizeByFormat(image.format);
    m_savedImage.reallocate();

    if (m_reset){
        memcpy(m_savedImage.data,image.data,pixsize);
        m_reset = 0;
    }

    register vector unsigned short UVres1, Yres1, UVres2, Yres2;//interleave;
    register vector unsigned short hiImage, loImage;
    register vector unsigned short Yrange, UVrange, Yblank,UVblank,blank;
    register vector bool short Ymasklo,Ymaskhi, UVmaskhi;
    register vector unsigned short Yhi,Ylo,UVhi,UVlo;
    register vector unsigned char one = vec_splat_u8(1);
    register vector unsigned short sone = vec_splat_u16(1);
    register vector unsigned int Uhi, Ulo, Vhi, Vlo,Ures,Vres;
    register vector bool int Umasklo, Umaskhi, Vmaskhi, Vmasklo;

    vector unsigned char *inData = (vector unsigned char*) image.data;
    vector unsigned char *rightData = (vector unsigned char*) m_savedImage.data;

    //splat the Y tolerance across all lanes
    shortBuffer.s[0] = m_Yrange;
    Yrange = shortBuffer.v;
    Yrange = vec_splat(Yrange,0);

    //NOTE(review): `blank` is built here but never used below -- presumably
    //left over from an earlier version; confirm before removing.
    shortBuffer.s[0] = 128;
    shortBuffer.s[1] = 0;
    shortBuffer.s[2] = 128;
    shortBuffer.s[3] = 0;
    shortBuffer.s[4] = 128;
    shortBuffer.s[5] = 0;
    shortBuffer.s[6] = 128;
    shortBuffer.s[7] = 0;
    blank = shortBuffer.v;

    //replacement values for matched pixels: Y = 0 ...
    shortBuffer.s[0] = 0;
    Yblank = shortBuffer.v;
    Yblank = vec_splat(Yblank,0);

    //... and U/V = 128 (neutral chroma)
    shortBuffer.s[0] = 128;
    UVblank = shortBuffer.v;
    UVblank = vec_splat(UVblank,0);

    //alternating U/V tolerances, matching the U V U V lane layout
    shortBuffer.s[0] = m_Urange;
    shortBuffer.s[1] = m_Vrange;
    shortBuffer.s[2] = m_Urange;
    shortBuffer.s[3] = m_Vrange;
    shortBuffer.s[4] = m_Urange;
    shortBuffer.s[5] = m_Vrange;
    shortBuffer.s[6] = m_Urange;
    shortBuffer.s[7] = m_Vrange;
    UVrange = shortBuffer.v;

    //setup the cache prefetch -- A MUST!!!
    UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 );
#ifndef PPC970
    vec_dst( inData, prefetchSize, 0 );
    vec_dst( rightData, prefetchSize, 1 );
    vec_dst( inData+32, prefetchSize, 2 );
    vec_dst( rightData+32, prefetchSize, 3 );
#endif //PPC970

    for ( i=0; i<h; i++){
        for (j=0; j<w; j++)
        {
#ifndef PPC970
            //this function is probably memory bound on most G4's -- what else is new?
            vec_dst( inData, prefetchSize, 0 );
            vec_dst( rightData, prefetchSize, 1 );
            vec_dst( inData+32, prefetchSize, 2 );
            vec_dst( rightData+32, prefetchSize, 3 );
#endif
            //separate the U and V from Y (even bytes * 1 -> shorts)
            UVres1 = (vector unsigned short)vec_mule(one,inData[0]);
            UVres2 = (vector unsigned short)vec_mule(one,rightData[0]);

            //vec_mulo Y * 1 to short vector Y Y Y Y shorts
            Yres1 = (vector unsigned short)vec_mulo(one,inData[0]);
            Yres2 = (vector unsigned short)vec_mulo(one,rightData[0]);

            //tolerance window around the reference frame's values
            Yhi = vec_adds(Yres2,Yrange);
            Ylo = vec_subs(Yres2,Yrange);

            //go to ints for comparison
            UVhi = vec_adds(UVres2,UVrange);
            UVlo = vec_subs(UVres2,UVrange);

            Uhi = vec_mule(sone,UVhi);
            Ulo = vec_mule(sone,UVlo);

            Vhi = vec_mulo(sone,UVhi);
            Vlo = vec_mulo(sone,UVlo);

            Ures = vec_mule(sone,UVres1);
            Vres = vec_mulo(sone,UVres1);

            //in-window tests: lo < value < hi, per chroma component
            Umasklo = vec_cmpgt(Ures,Ulo);
            Umaskhi = vec_cmplt(Ures,Uhi);

            Vmasklo = vec_cmpgt(Vres,Vlo);
            Vmaskhi = vec_cmplt(Vres,Vhi);

            Umaskhi = vec_and(Umaskhi,Umasklo);

            Vmaskhi = vec_and(Vmaskhi,Vmasklo);

            //NOTE(review): both lanes get the same U&V combination here,
            //so Umasklo and Vmasklo are identical after these two lines.
            Umasklo = vec_and(Umaskhi,Vmaskhi);
            Vmasklo = vec_and(Umaskhi,Vmaskhi);

            hiImage = (vector unsigned short)vec_mergeh(Umasklo,Vmasklo);
            loImage = (vector unsigned short)vec_mergel(Umasklo,Vmasklo);

            //pack it back down to bool short
            UVmaskhi = (vector bool short)vec_packsu(hiImage,loImage);

            Ymasklo = vec_cmpgt(Yres1,Ylo);
            Ymaskhi = vec_cmplt(Yres1,Yhi);

            Ymaskhi = vec_and(Ymaskhi,Ymasklo);

            //a pixel is "background" only if Y AND chroma both match
            Ymaskhi = vec_and(Ymaskhi,UVmaskhi);
            UVmaskhi = vec_and(Ymaskhi,UVmaskhi);

            //bitwise comparison and move using the result of the comparison as a mask
            Yres1 = vec_sel(Yres1,Yblank,Ymaskhi);

            //UVres1 = vec_sel(UVres1,UVres2,UVmaskhi);
            UVres1 = vec_sel(UVres1,UVblank,UVmaskhi);

            //merge the Y and UV back together
            hiImage = vec_mergeh(UVres1,Yres1);
            loImage = vec_mergel(UVres1,Yres1);

            //pack it back down to unsigned char to store
            inData[0] = vec_packsu(hiImage,loImage);

            inData++;
            rightData++;
        }
#ifndef PPC970
        vec_dss(0);
        vec_dss(1);
        vec_dss(2);
        vec_dss(3);
#endif
    }
}
/* Saturating add of two YUV422 images, result stored back into `image`.
 * Bytes are interleaved U Y V Y; vec_mule widens the even (chroma) bytes
 * and vec_mulo the odd (luma) bytes into shorts.
 * NOTE(review): the right image's chroma is widened with multiplier 2
 * (vector c = {2,1,...}) and then 255 is subtracted, while the live
 * image's chroma uses multiplier 1 -- presumably an approximation of the
 * chroma-bias add (U and V carry a 128 offset, so a plain add would
 * double it); confirm against the scalar version of this object. */
void pix_add :: processYUV_Altivec(imageStruct &image, imageStruct &right)
{
    int h,w,width;
    width = image.xsize/8;
    //format is U Y V Y

    //transfer unions for moving scalar constants into vector registers
    union
    {
        //unsigned int i;
        short elements[8];
        //vector signed char v;
        vector signed short v;
    }shortBuffer;

    union
    {
        //unsigned int i;
        unsigned char elements[16];
        //vector signed char v;
        vector unsigned char v;
    }charBuffer;

    //vector unsigned char c;
    register vector signed short d, hiImage, loImage, YRight, UVRight, YImage, UVImage, UVTemp, YTemp;
    // vector unsigned char zero = vec_splat_u8(0);
    register vector unsigned char c,one;
    // vector signed short zshort = vec_splat_s16(0);
    vector unsigned char *inData = (vector unsigned char*) image.data;
    vector unsigned char *rightData = (vector unsigned char*) right.data;

    //Write the pixel (pair) to the transfer buffer
    //(multipliers: 2 for even/chroma bytes, 1 for odd/luma bytes)
    charBuffer.elements[0] = 2;
    charBuffer.elements[1] = 1;
    charBuffer.elements[2] = 2;
    charBuffer.elements[3] = 1;
    charBuffer.elements[4] = 2;
    charBuffer.elements[5] = 1;
    charBuffer.elements[6] = 2;
    charBuffer.elements[7] = 1;
    charBuffer.elements[8] = 2;
    charBuffer.elements[9] = 1;
    charBuffer.elements[10] = 2;
    charBuffer.elements[11] = 1;
    charBuffer.elements[12] = 2;
    charBuffer.elements[13] = 1;
    charBuffer.elements[14] = 2;
    charBuffer.elements[15] = 1;

    //Load it into the vector unit
    c = charBuffer.v;
    one = vec_splat_u8( 1 );

    shortBuffer.elements[0] = 255;

    //Load it into the vector unit
    d = shortBuffer.v;
    d = static_cast<vector signed short>(vec_splat(static_cast<vector signed short>(d),0));
#ifndef PPC970
    UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 );
    vec_dst( inData, prefetchSize, 0 );
    vec_dst( rightData, prefetchSize, 1 );
#endif
    for ( h=0; h<image.ysize; h++){
        for (w=0; w<width; w++)
        {
#ifndef PPC970
            vec_dst( inData, prefetchSize, 0 );
            vec_dst( rightData, prefetchSize, 1 );
#endif
            //interleaved U Y V Y chars

            //vec_mule UV * 2 to short vector U V U V shorts
            UVImage = static_cast<vector signed short>(vec_mule(one,inData[0]));
            UVRight = static_cast<vector signed short>(vec_mule(c,rightData[0]));

            //vec_mulo Y * 1 to short vector Y Y Y Y shorts
            YImage = static_cast<vector signed short>(vec_mulo(c,inData[0]));
            YRight = static_cast<vector signed short>(vec_mulo(c,rightData[0]));

            //vel_subs UV - 255
            UVRight = static_cast<vector signed short>(vec_subs(UVRight, d));

            //vec_adds UV
            UVTemp = vec_adds(UVImage,UVRight);

            //vec_adds Y
            YTemp = vec_adds(YImage,YRight);

            //re-interleave U Y V Y and pack back to bytes with saturation
            hiImage = vec_mergeh(UVTemp,YTemp);
            loImage = vec_mergel(UVTemp,YTemp);

            //vec_mergel + vec_mergeh Y and UV
            inData[0] = vec_packsu(hiImage, loImage);

            inData++;
            rightData++;
        }
#ifndef PPC970
        vec_dss( 0 );
        vec_dss( 1 );
#endif
    }  /*end of working altivec function */
}
/* 8x8 forward DCT of an int16 block, in place.  The block is transposed
 * as shorts, the first butterfly stage is done in 16-bit integer math,
 * then everything is converted to float for the weighted stages
 * (constants loaded from the external fdctconsts table via the LD_W*
 * macros), transposed again, run through FDCTCOL per column pair, and
 * finally rounded and packed back to shorts. */
void ff_fdct_altivec(int16_t *block)
{
    vector signed short *bp;
    vector float *cp;
    vector float b00, b10, b20, b30, b40, b50, b60, b70;
    vector float b01, b11, b21, b31, b41, b51, b61, b71;
    vector float mzero, cnst, cnsts0, cnsts1, cnsts2;
    vector float x0, x1, x2, x3, x4, x5, x6, x7, x8;

    /* setup constants {{{ */
    /* mzero = -0.0: all-ones shifted left by 31 leaves only the sign bit */
    mzero = ((vector float)vec_splat_u32(-1));
    mzero = ((vector float)vec_sl(vu32(mzero), vu32(mzero)));
    cp = fdctconsts;
    cnsts0 = vec_ld(0, cp);
    cp++;
    cnsts1 = vec_ld(0, cp);
    cp++;
    cnsts2 = vec_ld(0, cp);
    /* }}} */

    /* 8x8 matrix transpose (vector short[8]) {{{ */
#define MERGE_S16(hl,a,b) vec_merge##hl(vs16(a), vs16(b))

    bp = (vector signed short*)block;
    b00 = ((vector float)vec_ld(0, bp));
    b40 = ((vector float)vec_ld(16*4, bp));
    b01 = ((vector float)MERGE_S16(h, b00, b40));
    b11 = ((vector float)MERGE_S16(l, b00, b40));
    bp++;
    b10 = ((vector float)vec_ld(0, bp));
    b50 = ((vector float)vec_ld(16*4, bp));
    b21 = ((vector float)MERGE_S16(h, b10, b50));
    b31 = ((vector float)MERGE_S16(l, b10, b50));
    bp++;
    b20 = ((vector float)vec_ld(0, bp));
    b60 = ((vector float)vec_ld(16*4, bp));
    b41 = ((vector float)MERGE_S16(h, b20, b60));
    b51 = ((vector float)MERGE_S16(l, b20, b60));
    bp++;
    b30 = ((vector float)vec_ld(0, bp));
    b70 = ((vector float)vec_ld(16*4, bp));
    b61 = ((vector float)MERGE_S16(h, b30, b70));
    b71 = ((vector float)MERGE_S16(l, b30, b70));

    x0 = ((vector float)MERGE_S16(h, b01, b41));
    x1 = ((vector float)MERGE_S16(l, b01, b41));
    x2 = ((vector float)MERGE_S16(h, b11, b51));
    x3 = ((vector float)MERGE_S16(l, b11, b51));
    x4 = ((vector float)MERGE_S16(h, b21, b61));
    x5 = ((vector float)MERGE_S16(l, b21, b61));
    x6 = ((vector float)MERGE_S16(h, b31, b71));
    x7 = ((vector float)MERGE_S16(l, b31, b71));

    b00 = ((vector float)MERGE_S16(h, x0, x4));
    b10 = ((vector float)MERGE_S16(l, x0, x4));
    b20 = ((vector float)MERGE_S16(h, x1, x5));
    b30 = ((vector float)MERGE_S16(l, x1, x5));
    b40 = ((vector float)MERGE_S16(h, x2, x6));
    b50 = ((vector float)MERGE_S16(l, x2, x6));
    b60 = ((vector float)MERGE_S16(h, x3, x7));
    b70 = ((vector float)MERGE_S16(l, x3, x7));

#undef MERGE_S16
    /* }}} */

    /* Some of the initial calculations can be done as vector short before
     * conversion to vector float. The following code section takes advantage
     * of this. */

    /* fdct rows {{{ */
    /* first butterfly stage, still in 16-bit integer arithmetic */
    x0 = ((vector float)vec_add(vs16(b00), vs16(b70)));
    x7 = ((vector float)vec_sub(vs16(b00), vs16(b70)));
    x1 = ((vector float)vec_add(vs16(b10), vs16(b60)));
    x6 = ((vector float)vec_sub(vs16(b10), vs16(b60)));
    x2 = ((vector float)vec_add(vs16(b20), vs16(b50)));
    x5 = ((vector float)vec_sub(vs16(b20), vs16(b50)));
    x3 = ((vector float)vec_add(vs16(b30), vs16(b40)));
    x4 = ((vector float)vec_sub(vs16(b30), vs16(b40)));

    b70 = ((vector float)vec_add(vs16(x0), vs16(x3)));
    b10 = ((vector float)vec_add(vs16(x1), vs16(x2)));

    b00 = ((vector float)vec_add(vs16(b70), vs16(b10)));
    b40 = ((vector float)vec_sub(vs16(b70), vs16(b10)));

    /* CTF0: unpack one row's shorts to two int vectors, convert to float */
#define CTF0(n) \
    b##n##1 = ((vector float)vec_unpackl(vs16(b##n##0))); \
    b##n##0 = ((vector float)vec_unpackh(vs16(b##n##0))); \
    b##n##1 = vec_ctf(vs32(b##n##1), 0); \
    b##n##0 = vec_ctf(vs32(b##n##0), 0);

    CTF0(0);
    CTF0(4);

    b20 = ((vector float)vec_sub(vs16(x0), vs16(x3)));
    b60 = ((vector float)vec_sub(vs16(x1), vs16(x2)));

    CTF0(2);
    CTF0(6);

#undef CTF0

    x0 = vec_add(b60, b20);
    x1 = vec_add(b61, b21);

    cnst = LD_W2;
    x0 = vec_madd(cnst, x0, mzero);
    x1 = vec_madd(cnst, x1, mzero);
    cnst = LD_W1;
    b20 = vec_madd(cnst, b20, x0);
    b21 = vec_madd(cnst, b21, x1);
    cnst = LD_W0;
    b60 = vec_madd(cnst, b60, x0);
    b61 = vec_madd(cnst, b61, x1);

    /* CTFX: widen x's shorts into the b pair and convert to float */
#define CTFX(x,b) \
    b##0 = ((vector float)vec_unpackh(vs16(x))); \
    b##1 = ((vector float)vec_unpackl(vs16(x))); \
    b##0 = vec_ctf(vs32(b##0), 0); \
    b##1 = vec_ctf(vs32(b##1), 0); \

    CTFX(x4, b7);
    CTFX(x5, b5);
    CTFX(x6, b3);
    CTFX(x7, b1);

#undef CTFX

    /* odd-part rotation for the first float half */
    x0 = vec_add(b70, b10);
    x1 = vec_add(b50, b30);
    x2 = vec_add(b70, b30);
    x3 = vec_add(b50, b10);
    x8 = vec_add(x2, x3);
    cnst = LD_W3;
    x8 = vec_madd(cnst, x8, mzero);

    cnst = LD_W8;
    x0 = vec_madd(cnst, x0, mzero);
    cnst = LD_W9;
    x1 = vec_madd(cnst, x1, mzero);
    cnst = LD_WA;
    x2 = vec_madd(cnst, x2, x8);
    cnst = LD_WB;
    x3 = vec_madd(cnst, x3, x8);

    cnst = LD_W4;
    b70 = vec_madd(cnst, b70, x0);
    cnst = LD_W5;
    b50 = vec_madd(cnst, b50, x1);
    cnst = LD_W6;
    b30 = vec_madd(cnst, b30, x1);
    cnst = LD_W7;
    b10 = vec_madd(cnst, b10, x0);

    b70 = vec_add(b70, x2);
    b50 = vec_add(b50, x3);
    b30 = vec_add(b30, x2);
    b10 = vec_add(b10, x3);

    /* same odd-part rotation for the second float half */
    x0 = vec_add(b71, b11);
    x1 = vec_add(b51, b31);
    x2 = vec_add(b71, b31);
    x3 = vec_add(b51, b11);
    x8 = vec_add(x2, x3);
    cnst = LD_W3;
    x8 = vec_madd(cnst, x8, mzero);

    cnst = LD_W8;
    x0 = vec_madd(cnst, x0, mzero);
    cnst = LD_W9;
    x1 = vec_madd(cnst, x1, mzero);
    cnst = LD_WA;
    x2 = vec_madd(cnst, x2, x8);
    cnst = LD_WB;
    x3 = vec_madd(cnst, x3, x8);

    cnst = LD_W4;
    b71 = vec_madd(cnst, b71, x0);
    cnst = LD_W5;
    b51 = vec_madd(cnst, b51, x1);
    cnst = LD_W6;
    b31 = vec_madd(cnst, b31, x1);
    cnst = LD_W7;
    b11 = vec_madd(cnst, b11, x0);

    b71 = vec_add(b71, x2);
    b51 = vec_add(b51, x3);
    b31 = vec_add(b31, x2);
    b11 = vec_add(b11, x3);
    /* }}} */

    /* 8x8 matrix transpose (vector float[8][2]) {{{ */
    x0 = vec_mergel(b00, b20);
    x1 = vec_mergeh(b00, b20);
    x2 = vec_mergel(b10, b30);
    x3 = vec_mergeh(b10, b30);

    b00 = vec_mergeh(x1, x3);
    b10 = vec_mergel(x1, x3);
    b20 = vec_mergeh(x0, x2);
    b30 = vec_mergel(x0, x2);

    x4 = vec_mergel(b41, b61);
    x5 = vec_mergeh(b41, b61);
    x6 = vec_mergel(b51, b71);
    x7 = vec_mergeh(b51, b71);

    b41 = vec_mergeh(x5, x7);
    b51 = vec_mergel(x5, x7);
    b61 = vec_mergeh(x4, x6);
    b71 = vec_mergel(x4, x6);

    x0 = vec_mergel(b01, b21);
    x1 = vec_mergeh(b01, b21);
    x2 = vec_mergel(b11, b31);
    x3 = vec_mergeh(b11, b31);

    x4 = vec_mergel(b40, b60);
    x5 = vec_mergeh(b40, b60);
    x6 = vec_mergel(b50, b70);
    x7 = vec_mergeh(b50, b70);

    b40 = vec_mergeh(x1, x3);
    b50 = vec_mergel(x1, x3);
    b60 = vec_mergeh(x0, x2);
    b70 = vec_mergel(x0, x2);

    b01 = vec_mergeh(x5, x7);
    b11 = vec_mergel(x5, x7);
    b21 = vec_mergeh(x4, x6);
    b31 = vec_mergel(x4, x6);
    /* }}} */

    FDCTCOL(b00, b10, b20, b30, b40, b50, b60, b70);
    FDCTCOL(b01, b11, b21, b31, b41, b51, b61, b71);

    /* round, convert back to short {{{ */
#define CTS(n) \
    b##n##0 = vec_round(b##n##0); \
    b##n##1 = vec_round(b##n##1); \
    b##n##0 = ((vector float)vec_cts(b##n##0, 0)); \
    b##n##1 = ((vector float)vec_cts(b##n##1, 0)); \
    b##n##0 = ((vector float)vec_pack(vs32(b##n##0), vs32(b##n##1))); \
    vec_st(vs16(b##n##0), 0, bp);

    bp = (vector signed short*)block;
    CTS(0);
    bp++;
    CTS(1);
    bp++;
    CTS(2);
    bp++;
    CTS(3);
    bp++;
    CTS(4);
    bp++;
    CTS(5);
    bp++;
    CTS(6);
    bp++;
    CTS(7);

#undef CTS
    /* }}} */
}
int main () {
  /* Compile/smoke test for PowerPC VSX / POWER8 vector built-ins on
     doubleword (long long / double) and other element types.  Each
     statement instantiates one overload of a vec_* intrinsic so the
     compiler must resolve and code-generate that form; the computed
     values are intentionally never checked here.
     NOTE(review): `y' and `z' (used by vec_xl/vec_xst below) are
     presumably file-scope doubles declared outside this excerpt --
     confirm against the full file.  */

  /* vec_cpsgn: copy-sign, 4 x float form.  */
  vector float fa = {1.0, 2.0, 3.0, -4.0};
  vector float fb = {-2.0, -3.0, -4.0, -5.0};
  vector float fc = vec_cpsgn (fa, fb);

  /* vec_and over all signed/unsigned/bool long long operand mixes.  */
  vector long long la = {5L, 14L};
  vector long long lb = {3L, 86L};
  vector long long lc = vec_and (la, lb);
  vector bool long long ld = {0, -1};
  vector long long le = vec_and (la, ld);
  vector long long lf = vec_and (ld, lb);
  vector unsigned long long ua = {5L, 14L};
  vector unsigned long long ub = {3L, 86L};
  vector unsigned long long uc = vec_and (ua, ub);
  vector bool long long ud = {0, -1};
  vector unsigned long long ue = vec_and (ua, ud);
  vector unsigned long long uf = vec_and (ud, ub);

  /* vec_andc (AND with complement), same operand mixes.  */
  vector long long lg = vec_andc (la, lb);
  vector long long lh = vec_andc (la, ld);
  vector long long li = vec_andc (ld, lb);
  vector unsigned long long ug = vec_andc (ua, ub);
  vector unsigned long long uh = vec_andc (ua, ud);
  vector unsigned long long ui = vec_andc (ud, ub);

  /* vec_cpsgn: 2 x double form.  */
  vector double da = {1.0, -4.0};
  vector double db = {-2.0, 5.0};
  vector double dc = vec_cpsgn (da, db);

  /* vec_mergeh (high halves interleave) on doubleword vectors.  */
  vector long long lj = vec_mergeh (la, lb);
  vector long long lk = vec_mergeh (la, ld);
  vector long long ll = vec_mergeh (ld, la);
  vector unsigned long long uj = vec_mergeh (ua, ub);
  vector unsigned long long uk = vec_mergeh (ua, ud);
  vector unsigned long long ul = vec_mergeh (ud, ua);

  /* vec_mergel (low halves interleave) on doubleword vectors.  */
  vector long long lm = vec_mergel (la, lb);
  vector long long ln = vec_mergel (la, ld);
  vector long long lo = vec_mergel (ld, la);
  vector unsigned long long um = vec_mergel (ua, ub);
  vector unsigned long long un = vec_mergel (ua, ud);
  vector unsigned long long uo = vec_mergel (ud, ua);

  /* vec_nor on doubleword vectors.  */
  vector long long lp = vec_nor (la, lb);
  vector long long lq = vec_nor (la, ld);
  vector long long lr = vec_nor (ld, la);
  vector unsigned long long up = vec_nor (ua, ub);
  vector unsigned long long uq = vec_nor (ua, ud);
  vector unsigned long long ur = vec_nor (ud, ua);

  /* vec_or on doubleword vectors.  */
  vector long long ls = vec_or (la, lb);
  vector long long lt = vec_or (la, ld);
  vector long long lu = vec_or (ld, la);
  vector unsigned long long us = vec_or (ua, ub);
  vector unsigned long long ut = vec_or (ua, ud);
  vector unsigned long long uu = vec_or (ud, ua);

  /* vec_perm with an arbitrary byte-permute control vector.  */
  vector unsigned char ca = {0,4,8,1,5,9,2,6,10,3,7,11,15,12,14,13};
  vector long long lv = vec_perm (la, lb, ca);
  vector unsigned long long uv = vec_perm (ua, ub, ca);

  /* vec_sel with signed, unsigned and bool doubleword selectors.  */
  vector long long lw = vec_sel (la, lb, lc);
  vector long long lx = vec_sel (la, lb, uc);
  vector long long ly = vec_sel (la, lb, ld);
  vector unsigned long long uw = vec_sel (ua, ub, lc);
  vector unsigned long long ux = vec_sel (ua, ub, uc);
  vector unsigned long long uy = vec_sel (ua, ub, ld);

  /* vec_xor on doubleword vectors.  */
  vector long long lz = vec_xor (la, lb);
  vector long long l0 = vec_xor (la, ld);
  vector long long l1 = vec_xor (ld, la);
  vector unsigned long long uz = vec_xor (ua, ub);
  vector unsigned long long u0 = vec_xor (ua, ud);
  vector unsigned long long u1 = vec_xor (ud, ua);

  /* All/any comparison predicates on unsigned doublewords.  */
  int ia = vec_all_eq (ua, ub);
  int ib = vec_all_ge (ua, ub);
  int ic = vec_all_gt (ua, ub);
  int id = vec_all_le (ua, ub);
  int ie = vec_all_lt (ua, ub);
  int ig = vec_all_ne (ua, ub);
  int ih = vec_any_eq (ua, ub);
  int ii = vec_any_ge (ua, ub);
  int ij = vec_any_gt (ua, ub);
  int ik = vec_any_le (ua, ub);
  int il = vec_any_lt (ua, ub);
  int im = vec_any_ne (ua, ub);

  /* vec_mergee / vec_mergeo (even/odd word merge) on 4 x 32-bit types.  */
  vector int sia = {9, 16, 25, 36};
  vector int sib = {-8, -27, -64, -125};
  vector int sic = vec_mergee (sia, sib);
  vector int sid = vec_mergeo (sia, sib);
  vector unsigned int uia = {9, 16, 25, 36};
  vector unsigned int uib = {8, 27, 64, 125};
  vector unsigned int uic = vec_mergee (uia, uib);
  vector unsigned int uid = vec_mergeo (uia, uib);
  vector bool int bia = {0, -1, -1, 0};
  vector bool int bib = {-1, -1, 0, -1};
  vector bool int bic = vec_mergee (bia, bib);
  vector bool int bid = vec_mergeo (bia, bib);

  /* vec_packsu: doubleword -> word saturating pack.  */
  vector unsigned int uie = vec_packsu (ua, ub);

  /* vec_cntlz (count leading zeros) across element widths.  */
  vector long long l2 = vec_cntlz (la);
  vector unsigned long long u2 = vec_cntlz (ua);
  vector int sie = vec_cntlz (sia);
  vector unsigned int uif = vec_cntlz (uia);
  vector short ssa = {20, -40, -60, 80, 100, -120, -140, 160};
  vector short ssb = vec_cntlz (ssa);
  vector unsigned short usa = {81, 72, 63, 54, 45, 36, 27, 18};
  vector unsigned short usb = vec_cntlz (usa);
  vector signed char sca = {-4, 3, -9, 15, -31, 31, 0, 0, 1, 117, -36, 99, 98, 97, 96, 95};
  vector signed char scb = vec_cntlz (sca);
  vector unsigned char cb = vec_cntlz (ca);

  /* vec_xl / vec_xst: unaligned VSX load and store of a double vector.  */
  vector double dd = vec_xl (0, &y);
  vec_xst (dd, 0, &z);

  /* vec_round and vec_splat on doubles.  */
  vector double de = vec_round (dd);
  vector double df = vec_splat (de, 0);
  vector double dg = vec_splat (de, 1);

  /* vec_splat on signed, unsigned and bool doublewords.  */
  vector long long l3 = vec_splat (l2, 0);
  vector long long l4 = vec_splat (l2, 1);
  vector unsigned long long u3 = vec_splat (u2, 0);
  vector unsigned long long u4 = vec_splat (u2, 1);
  vector bool long long l5 = vec_splat (ld, 0);
  vector bool long long l6 = vec_splat (ld, 1);

  /* vec_div and vec_mul on doublewords.  */
  vector long long l7 = vec_div (l3, l4);
  vector unsigned long long u5 = vec_div (u3, u4);
  vector long long l8 = vec_mul (l3, l4);
  vector unsigned long long u6 = vec_mul (u3, u4);

  /* Fixed-point <-> floating conversions with scale factors
     (negative scale exercises the non-standard extension form).  */
  vector double dh = vec_ctf (la, -2);
  vector double di = vec_ctf (ua, 2);
  vector long long l9 = vec_cts (dh, -2);
  vector unsigned long long u7 = vec_ctu (di, 2);

  return 0;
}
void test1() { // CHECK-LABEL: define void @test1 // CHECK-LE-LABEL: define void @test1 res_vf = vec_abs(vf); // CHECK: call <4 x float> @llvm.fabs.v4f32(<4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x float> @llvm.fabs.v4f32(<4 x float> %{{[0-9]*}}) dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_add(vd, vd); // CHECK: fadd <2 x double> // CHECK-LE: fadd <2 x double> res_vd = vec_and(vbll, vd); // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> // CHECK-LE: and <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> res_vd = vec_and(vd, vbll); // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> // CHECK-LE: and <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> res_vd = vec_and(vd, vd); // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> // CHECK-LE: and <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_andc(vbll, vd); // CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> // CHECK-LE: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK-LE: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK-LE: and <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_andc(vd, vbll); // CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> // CHECK-LE: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK-LE: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK-LE: and <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void 
@dummy() res_vd = vec_andc(vd, vd); // CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_ceil(vd); // CHECK: call <2 x double> @llvm.ceil.v2f64(<2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x double> @llvm.ceil.v2f64(<2 x double> %{{[0-9]*}}) res_vf = vec_ceil(vf); // CHECK: call <4 x float> @llvm.ceil.v4f32(<4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x float> @llvm.ceil.v4f32(<4 x float> %{{[0-9]*}}) res_vbll = vec_cmpeq(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpeqdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpeqdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmpeq(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpeqsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpeqsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) res_vbll = vec_cmpge(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmpge(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) res_vbll = vec_cmpgt(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmpgt(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, 
<4 x float> %{{[0-9]*}}) res_vbll = vec_cmple(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmple(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) res_vbll = vec_cmplt(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmplt(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) /* vec_cpsgn */ res_vf = vec_cpsgn(vf, vf); // CHECK: call <4 x float> @llvm.copysign.v4f32(<4 x float> %{{.+}}, <4 x float> %{{.+}}) // CHECK-LE: call <4 x float> @llvm.copysign.v4f32(<4 x float> %{{.+}}, <4 x float> %{{.+}}) res_vd = vec_cpsgn(vd, vd); // CHECK: call <2 x double> @llvm.copysign.v2f64(<2 x double> %{{.+}}, <2 x double> %{{.+}}) // CHECK-LE: call <2 x double> @llvm.copysign.v2f64(<2 x double> %{{.+}}, <2 x double> %{{.+}}) /* vec_div */ res_vsll = vec_div(vsll, vsll); // CHECK: sdiv <2 x i64> // CHECK-LE: sdiv <2 x i64> res_vull = vec_div(vull, vull); // CHECK: udiv <2 x i64> // CHECK-LE: udiv <2 x i64> res_vf = vec_div(vf, vf); // CHECK: fdiv <4 x float> // CHECK-LE: fdiv <4 x float> res_vd = vec_div(vd, vd); // CHECK: fdiv <2 x double> // CHECK-LE: fdiv <2 x double> /* vec_max */ res_vf = vec_max(vf, vf); // CHECK: @llvm.ppc.vsx.xvmaxsp // CHECK-LE: @llvm.ppc.vsx.xvmaxsp res_vd = vec_max(vd, vd); // CHECK: @llvm.ppc.vsx.xvmaxdp // CHECK-LE: @llvm.ppc.vsx.xvmaxdp res_vf = vec_vmaxfp(vf, vf); // CHECK: 
@llvm.ppc.vsx.xvmaxsp // CHECK-LE: @llvm.ppc.vsx.xvmaxsp /* vec_min */ res_vf = vec_min(vf, vf); // CHECK: @llvm.ppc.vsx.xvminsp // CHECK-LE: @llvm.ppc.vsx.xvminsp res_vd = vec_min(vd, vd); // CHECK: @llvm.ppc.vsx.xvmindp // CHECK-LE: @llvm.ppc.vsx.xvmindp res_vf = vec_vminfp(vf, vf); // CHECK: @llvm.ppc.vsx.xvminsp // CHECK-LE: @llvm.ppc.vsx.xvminsp res_d = __builtin_vsx_xsmaxdp(d, d); // CHECK: @llvm.ppc.vsx.xsmaxdp // CHECK-LE: @llvm.ppc.vsx.xsmaxdp res_d = __builtin_vsx_xsmindp(d, d); // CHECK: @llvm.ppc.vsx.xsmindp // CHECK-LE: @llvm.ppc.vsx.xsmindp /* vec_perm */ res_vsll = vec_perm(vsll, vsll, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_perm(vull, vull, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vbll = vec_perm(vbll, vbll, vuc); // CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> // CHECK-LE: xor <16 x i8> // CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> res_vf = vec_round(vf); // CHECK: call <4 x float> @llvm.round.v4f32(<4 x float> // CHECK-LE: call <4 x float> @llvm.round.v4f32(<4 x float> res_vd = vec_round(vd); // CHECK: call <2 x double> @llvm.round.v2f64(<2 x double> // CHECK-LE: call <2 x double> @llvm.round.v2f64(<2 x double> res_vd = vec_perm(vd, vd, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vd = vec_splat(vd, 1); // CHECK: [[T1:%.+]] = bitcast <2 x double> {{.+}} to <4 x i32> // CHECK: [[T2:%.+]] = bitcast <2 x double> {{.+}} to <4 x i32> // CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> // CHECK-LE: xor <16 x i8> // CHECK-LE: [[T1:%.+]] = 
bitcast <2 x double> {{.+}} to <4 x i32> // CHECK-LE: [[T2:%.+]] = bitcast <2 x double> {{.+}} to <4 x i32> // CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> res_vbll = vec_splat(vbll, 1); // CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> // CHECK-LE: xor <16 x i8> // CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> res_vsll = vec_splat(vsll, 1); // CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> // CHECK-LE: xor <16 x i8> // CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> res_vull = vec_splat(vull, 1); // CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> // CHECK-LE: xor <16 x i8> // CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> res_vsi = vec_pack(vsll, vsll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vui = vec_pack(vull, vull); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vbi = vec_pack(vbll, vbll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: 
@llvm.ppc.altivec.vperm res_vsll = vec_vperm(vsll, vsll, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_vperm(vull, vull, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vd = vec_vperm(vd, vd, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm /* vec_vsx_ld */ res_vsi = vec_vsx_ld(0, &vsi); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vui = vec_vsx_ld(0, &vui); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vf = vec_vsx_ld (0, &vf); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vsll = vec_vsx_ld(0, &vsll); // CHECK: @llvm.ppc.vsx.lxvd2x // CHECK-LE: @llvm.ppc.vsx.lxvd2x res_vull = vec_vsx_ld(0, &vull); // CHECK: @llvm.ppc.vsx.lxvd2x // CHECK-LE: @llvm.ppc.vsx.lxvd2x res_vd = vec_vsx_ld(0, &vd); // CHECK: @llvm.ppc.vsx.lxvd2x // CHECK-LE: @llvm.ppc.vsx.lxvd2x res_vull = vec_vsx_ld(0, &vull); // CHECK: @llvm.ppc.vsx.lxvd2x // CHECK-LE: @llvm.ppc.vsx.lxvd2x res_vd = vec_vsx_ld(0, &vd); // CHECK: @llvm.ppc.vsx.lxvd2x // CHECK-LE: @llvm.ppc.vsx.lxvd2x res_vss = vec_vsx_ld(0, &vss); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vss = vec_vsx_ld(0, &ss); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vus = vec_vsx_ld(0, &vus); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vus = vec_vsx_ld(0, &us); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vbc = vec_vsx_ld(0, &vbc); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vsc = vec_vsx_ld(0, &vsc); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vuc = vec_vsx_ld(0, &vuc); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vsc = vec_vsx_ld(0, &sc); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vuc = vec_vsx_ld(0, &uc); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x /* 
vec_vsx_st */ vec_vsx_st(vsi, 0, &res_vsi); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vsi, 0, &res_si); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vui, 0, &res_vui); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vui, 0, &res_ui); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vf, 0, &res_vf); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vsll, 0, &res_vsll); // CHECK: @llvm.ppc.vsx.stxvd2x // CHECK-LE: @llvm.ppc.vsx.stxvd2x vec_vsx_st(vull, 0, &res_vull); // CHECK: @llvm.ppc.vsx.stxvd2x // CHECK-LE: @llvm.ppc.vsx.stxvd2x vec_vsx_st(vd, 0, &res_vd); // CHECK: @llvm.ppc.vsx.stxvd2x // CHECK-LE: @llvm.ppc.vsx.stxvd2x vec_vsx_st(vss, 0, &res_vss); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vss, 0, &res_ss); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vus, 0, &res_vus); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vus, 0, &res_us); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vsc, 0, &res_vsc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vsc, 0, &res_sc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vuc, 0, &res_vuc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vuc, 0, &res_uc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vbc, 0, &res_vbc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vbc, 0, &res_sc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vbc, 0, &res_uc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x /* vec_and */ res_vsll = vec_and(vsll, vsll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_and(vbll, vsll); // CHECK: and <2 x 
i64> // CHECK-LE: and <2 x i64> res_vsll = vec_and(vsll, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_and(vull, vull); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_and(vbll, vull); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_and(vull, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vbll = vec_and(vbll, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> /* vec_vand */ res_vsll = vec_vand(vsll, vsll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_vand(vbll, vsll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_vand(vsll, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_vand(vull, vull); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_vand(vbll, vull); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_vand(vull, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vbll = vec_vand(vbll, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> /* vec_andc */ res_vsll = vec_andc(vsll, vsll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_andc(vbll, vsll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_andc(vsll, vbll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_andc(vull, vull); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_andc(vbll, vull); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_andc(vull, vbll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vbll = vec_andc(vbll, vbll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and 
<2 x i64> res_vf = vec_floor(vf); // CHECK: call <4 x float> @llvm.floor.v4f32(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.floor.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_floor(vd); // CHECK: call <2 x double> @llvm.floor.v2f64(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.floor.v2f64(<2 x double> %{{[0-9]+}}) res_vf = vec_madd(vf, vf, vf); // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}) res_vd = vec_madd(vd, vd, vd); // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}) /* vec_mergeh */ res_vsll = vec_mergeh(vsll, vsll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vsll = vec_mergeh(vsll, vbll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vsll = vec_mergeh(vbll, vsll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergeh(vull, vull); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergeh(vull, vbll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergeh(vbll, vull); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm /* vec_mergel */ res_vsll = vec_mergel(vsll, vsll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vsll = vec_mergel(vsll, vbll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vsll = vec_mergel(vbll, vsll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergel(vull, vull); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull 
= vec_mergel(vull, vbll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergel(vbll, vull); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm /* vec_msub */ res_vf = vec_msub(vf, vf, vf); // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> // CHECK-LE: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-LE-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> res_vd = vec_msub(vd, vd, vd); // CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> // CHECK-LE: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}} // CHECK-LE-NEXT: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> res_vsll = vec_mul(vsll, vsll); // CHECK: mul <2 x i64> // CHECK-LE: mul <2 x i64> res_vull = vec_mul(vull, vull); // CHECK: mul <2 x i64> // CHECK-LE: mul <2 x i64> res_vf = vec_mul(vf, vf); // CHECK: fmul <4 x float> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: fmul <4 x float> %{{[0-9]+}}, %{{[0-9]+}} res_vd = vec_mul(vd, vd); // CHECK: fmul <2 x double> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: fmul <2 x double> %{{[0-9]+}}, %{{[0-9]+}} res_vf = vec_nearbyint(vf); // CHECK: call <4 x float> @llvm.round.v4f32(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.round.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_nearbyint(vd); // CHECK: call <2 x double> @llvm.round.v2f64(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.round.v2f64(<2 x double> %{{[0-9]+}}) res_vf = vec_nmadd(vf, vf, vf); // CHECK: [[FM:[0-9]+]] = 
call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}) // CHECK-NEXT: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %[[FM]] // CHECK-LE: [[FM:[0-9]+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}) // CHECK-LE-NEXT: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %[[FM]] res_vd = vec_nmadd(vd, vd, vd); // CHECK: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}) // CHECK-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]] // CHECK-LE: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}) // CHECK-LE-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]] res_vf = vec_nmsub(vf, vf, vf); // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-LE: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-LE-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> // CHECK-LE: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} res_vd = vec_nmsub(vd, vd, vd); // CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> // 
CHECK-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]] // CHECK-LE: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}} // CHECK-LE-NEXT: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> // CHECK-LE-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]] /* vec_nor */ res_vsll = vec_nor(vsll, vsll); // CHECK: or <2 x i64> // CHECK: xor <2 x i64> // CHECK-LE: or <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_nor(vull, vull); // CHECK: or <2 x i64> // CHECK: xor <2 x i64> // CHECK-LE: or <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_nor(vbll, vbll); // CHECK: or <2 x i64> // CHECK: xor <2 x i64> // CHECK-LE: or <2 x i64> // CHECK-LE: xor <2 x i64> res_vd = vec_nor(vd, vd); // CHECK: bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK: [[OR:%.+]] = or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-NEXT: xor <2 x i64> [[OR]], <i64 -1, i64 -1> // CHECK-LE: bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK-LE: [[OR:%.+]] = or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE-NEXT: xor <2 x i64> [[OR]], <i64 -1, i64 -1> /* vec_or */ res_vsll = vec_or(vsll, vsll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vsll = vec_or(vbll, vsll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vsll = vec_or(vsll, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_or(vull, vull); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_or(vbll, vull); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_or(vull, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vbll = vec_or(vbll, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vd = vec_or(vd, vd); // CHECK: bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK: or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK-LE: or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} 
res_vd = vec_or(vbll, vd); // CHECK: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK: [[T2:%.+]] = or <2 x i64> %{{[0-9]+}}, [[T1]] // CHECK: bitcast <2 x i64> [[T2]] to <2 x double> // CHECK-LE: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK-LE: [[T2:%.+]] = or <2 x i64> %{{[0-9]+}}, [[T1]] // CHECK-LE: bitcast <2 x i64> [[T2]] to <2 x double> res_vd = vec_or(vd, vbll); // CHECK: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK: [[T2:%.+]] = or <2 x i64> [[T1]], %{{[0-9]+}} // CHECK: bitcast <2 x i64> [[T2]] to <2 x double> // CHECK-LE: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK-LE: [[T2:%.+]] = or <2 x i64> [[T1]], %{{[0-9]+}} // CHECK-LE: bitcast <2 x i64> [[T2]] to <2 x double> res_vf = vec_re(vf); // CHECK: call <4 x float> @llvm.ppc.vsx.xvresp(<4 x float> // CHECK-LE: call <4 x float> @llvm.ppc.vsx.xvresp(<4 x float> res_vd = vec_re(vd); // CHECK: call <2 x double> @llvm.ppc.vsx.xvredp(<2 x double> // CHECK-LE: call <2 x double> @llvm.ppc.vsx.xvredp(<2 x double> res_vf = vec_rint(vf); // CHECK: call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_rint(vd); // CHECK: call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %{{[0-9]+}}) res_vf = vec_rsqrte(vf); // CHECK: call <4 x float> @llvm.ppc.vsx.xvrsqrtesp(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.ppc.vsx.xvrsqrtesp(<4 x float> %{{[0-9]+}}) res_vd = vec_rsqrte(vd); // CHECK: call <2 x double> @llvm.ppc.vsx.xvrsqrtedp(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.ppc.vsx.xvrsqrtedp(<2 x double> %{{[0-9]+}}) dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vf = vec_sel(vd, vd, vbll); // CHECK: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> %{{[0-9]+}}, // CHECK: and <2 x 
i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: or <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]+}} to <2 x double> // CHECK-LE: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1> // CHECK-LE: and <2 x i64> %{{[0-9]+}}, // CHECK-LE: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: or <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]+}} to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_sel(vd, vd, vull); // CHECK: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> %{{[0-9]+}}, // CHECK: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: or <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]+}} to <2 x double> // CHECK-LE: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1> // CHECK-LE: and <2 x i64> %{{[0-9]+}}, // CHECK-LE: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: or <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]+}} to <2 x double> res_vf = vec_sqrt(vf); // CHECK: call <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_sqrt(vd); // CHECK: call <2 x double> @llvm.sqrt.v2f64(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.sqrt.v2f64(<2 x double> %{{[0-9]+}}) res_vd = vec_sub(vd, vd); // CHECK: fsub <2 x double> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: fsub <2 x double> %{{[0-9]+}}, %{{[0-9]+}} res_vf = vec_trunc(vf); // CHECK: call <4 x float> @llvm.trunc.v4f32(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.trunc.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_trunc(vd); // CHECK: call <2 x double> @llvm.trunc.v2f64(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.trunc.v2f64(<2 x double> %{{[0-9]+}}) /* vec_vor */ res_vsll = vec_vor(vsll, vsll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vsll = vec_vor(vbll, vsll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vsll = vec_vor(vsll, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_vor(vull, vull); // 
CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_vor(vbll, vull); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_vor(vull, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vbll = vec_vor(vbll, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> /* vec_xor */ res_vsll = vec_xor(vsll, vsll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vsll = vec_xor(vbll, vsll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vsll = vec_xor(vsll, vbll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_xor(vull, vull); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_xor(vbll, vull); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_xor(vull, vbll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vbll = vec_xor(vbll, vbll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_xor(vd, vd); // CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: bitcast <2 x i64> [[X1]] to <2 x double> // CHECK-LE: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: bitcast <2 x i64> [[X1]] to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_xor(vd, vbll); // CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: bitcast <2 x i64> [[X1]] to <2 x double> // CHECK-LE: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: bitcast <2 x i64> [[X1]] to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_xor(vbll, vd); // CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: bitcast <2 x i64> [[X1]] to <2 x double> // CHECK-LE: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: bitcast <2 x i64> [[X1]] to <2 x double> /* vec_vxor */ res_vsll = vec_vxor(vsll, vsll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vsll = vec_vxor(vbll, 
vsll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vsll = vec_vxor(vsll, vbll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_vxor(vull, vull); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_vxor(vbll, vull); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_vxor(vull, vbll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vbll = vec_vxor(vbll, vbll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vsll = vec_cts(vd, 0); // CHECK: fmul <2 x double> // CHECK: fptosi <2 x double> %{{.*}} to <2 x i64> // CHECK-LE: fmul <2 x double> // CHECK-LE: fptosi <2 x double> %{{.*}} to <2 x i64> res_vsll = vec_cts(vd, 31); // CHECK: fmul <2 x double> // CHECK: fptosi <2 x double> %{{.*}} to <2 x i64> // CHECK-LE: fmul <2 x double> // CHECK-LE: fptosi <2 x double> %{{.*}} to <2 x i64> res_vsll = vec_ctu(vd, 0); // CHECK: fmul <2 x double> // CHECK: fptoui <2 x double> %{{.*}} to <2 x i64> // CHECK-LE: fmul <2 x double> // CHECK-LE: fptoui <2 x double> %{{.*}} to <2 x i64> res_vsll = vec_ctu(vd, 31); // CHECK: fmul <2 x double> // CHECK: fptoui <2 x double> %{{.*}} to <2 x i64> // CHECK-LE: fmul <2 x double> // CHECK-LE: fptoui <2 x double> %{{.*}} to <2 x i64> res_vd = vec_ctf(vsll, 0); // CHECK: sitofp <2 x i64> %{{.*}} to <2 x double> // CHECK: fmul <2 x double> // CHECK-LE: sitofp <2 x i64> %{{.*}} to <2 x double> // CHECK-LE: fmul <2 x double> res_vd = vec_ctf(vsll, 31); // CHECK: sitofp <2 x i64> %{{.*}} to <2 x double> // CHECK: fmul <2 x double> // CHECK-LE: sitofp <2 x i64> %{{.*}} to <2 x double> // CHECK-LE: fmul <2 x double> res_vd = vec_ctf(vull, 0); // CHECK: uitofp <2 x i64> %{{.*}} to <2 x double> // CHECK: fmul <2 x double> // CHECK-LE: uitofp <2 x i64> %{{.*}} to <2 x double> // CHECK-LE: fmul <2 x double> res_vd = vec_ctf(vull, 31); // CHECK: uitofp <2 x i64> %{{.*}} to <2 x double> // CHECK: fmul <2 x double> // CHECK-LE: uitofp <2 x i64> %{{.*}} to <2 x double> // CHECK-LE: fmul <2 
x double> }
/* more optimized version - unrolled and load-hoisted */
void pix_offset :: processYUVAltivec(imageStruct &image)
{
  // Add the scalar offsets U, Y, V (class members) to every pixel of a
  // packed UYVY image, with unsigned saturation. The image is processed
  // 32 bytes (one pair of 16-byte vectors) at a time, with the loads for
  // the next pair hoisted into the current iteration (software pipelining).
  //
  // Fix vs. previous revision: the old code ran the pipelined loop over
  // ALL width*height pairs and then ran the epilogue as well, so the last
  // loop iteration pre-loaded inData[2]/inData[3] (up to 64 bytes past the
  // end of the image) and the epilogue stored one extra pair (32 bytes
  // past the end). The loop now runs pairs-1 iterations and the epilogue
  // flushes the final, already-loaded pair, keeping every access in bounds.
  register int pairs;   // 32-byte vector pairs remaining; each pair = 16 UYVY-packed pixels

  //format is U Y V Y

  // Build the per-component offset vector via a union: fill it with
  // scalar stores, read it back as a single AltiVec register.
  union
  {
    short elements[8];
    vector signed short v;
  } transferBuffer;

  register vector signed short c, hi, lo;
  register vector signed short hi1, lo1;
  register vector signed short loadhi, loadhi1, loadlo, loadlo1;
  register vector unsigned char zero = vec_splat_u8(0);
  register vector unsigned char *inData = (vector unsigned char*) image.data;

  //Write the pixel (pair) to the transfer buffer
  //transferBuffer.i = (U << 24) | (Y << 16) | (V << 8 ) | Y;
  transferBuffer.elements[0] = U;
  transferBuffer.elements[1] = Y;
  transferBuffer.elements[2] = V;
  transferBuffer.elements[3] = Y;
  transferBuffer.elements[4] = U;
  transferBuffer.elements[5] = Y;
  transferBuffer.elements[6] = V;
  transferBuffer.elements[7] = Y;

  //Load it into the vector unit
  c = transferBuffer.v;

#ifndef PPC970
  UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 );
  vec_dst( inData, prefetchSize, 0 );
  vec_dst( inData+16, prefetchSize, 1 );
  vec_dst( inData+32, prefetchSize, 2 );
  vec_dst( inData+64, prefetchSize, 3 );
#endif

  // Total pairs: (xsize/16) vector pairs per row (UYVY = 2 bytes/pixel,
  // so 32 bytes cover 16 pixels) times ysize rows.
  pairs = (image.xsize / 16) * image.ysize;
  if (pairs < 1) {
    // Degenerate image: nothing to process (also avoids the priming loads).
#ifndef PPC970
    vec_dss( 0 ); vec_dss( 1 ); vec_dss( 2 ); vec_dss( 3 );
#endif
    return;
  }

  // Prime the pipeline: expand the uint8's of the first pair to shorts.
  loadhi  = (vector signed short) vec_mergeh( zero, inData[0] );
  loadlo  = (vector signed short) vec_mergel( zero, inData[0] );
  loadhi1 = (vector signed short) vec_mergeh( zero, inData[1] );
  loadlo1 = (vector signed short) vec_mergel( zero, inData[1] );

  // Each iteration: add the offset to the previously loaded pair,
  // pre-load the next pair, store the result. Stopping one pair early
  // keeps the look-ahead loads inside the buffer.
  for ( ; pairs > 1; pairs-- ) {
#ifndef PPC970
    vec_dst( inData, prefetchSize, 0 );
    vec_dst( inData+16, prefetchSize, 1 );
    vec_dst( inData+32, prefetchSize, 2 );
    vec_dst( inData+64, prefetchSize, 3 );
#endif

    //add the constant to it
    hi  = vec_add( loadhi,  c );
    lo  = vec_add( loadlo,  c );
    hi1 = vec_add( loadhi1, c );
    lo1 = vec_add( loadlo1, c );

    //expand the UInt8's of the NEXT pair to short's
    loadhi  = (vector signed short) vec_mergeh( zero, inData[2] );
    loadlo  = (vector signed short) vec_mergel( zero, inData[2] );
    loadhi1 = (vector signed short) vec_mergeh( zero, inData[3] );
    loadlo1 = (vector signed short) vec_mergel( zero, inData[3] );

    //pack the result back down, with saturation
    inData[0] = vec_packsu( hi, lo );
    inData++;
    inData[0] = vec_packsu( hi1, lo1 );
    inData++;
  }

  //
  // finish the last (already pre-loaded) pair after the loop
  //
  hi  = vec_add( loadhi,  c );
  lo  = vec_add( loadlo,  c );
  hi1 = vec_add( loadhi1, c );
  lo1 = vec_add( loadlo1, c );

  //pack the result back down, with saturation
  inData[0] = vec_packsu( hi, lo );
  inData++;
  inData[0] = vec_packsu( hi1, lo1 );
  inData++;

#ifndef PPC970
  vec_dss( 0 );
  vec_dss( 1 );
  vec_dss( 2 );
  vec_dss( 3 );
  //end of working altivec function
#endif
}
/**
 * AltiVec forward DCT + quantization of one 8x8 block of DCTELEMs.
 *
 * The DCT (an AAN/LLM-style float implementation, constants rolled into
 * vec_madd calls) keeps the data in vector float registers through both
 * passes so the quantize step can run on it directly. Results are clamped
 * to [s->min_qcoeff, s->max_qcoeff], scanned for the last non-zero
 * coefficient while still in vector registers, and stored back to 'data'.
 *
 * @param s      codec context (quant matrices, biases, scan tables)
 * @param data   8x8 coefficient block, 16-byte aligned (vec_ld/vec_st)
 * @param n      block index; selects luma vs chroma DC scale when intra
 * @param qscale quantizer scale, indexes s->q_{intra,inter}_matrix
 * @param overflow NOTE(review): never written by this implementation —
 *                 presumably callers tolerate that; TODO confirm against
 *                 the scalar dct_quantize contract.
 * @return index of the last non-zero coefficient (scan order), as the
 *         scalar dct_quantize does.
 *
 * Fix vs. previous revision: data3 was missing from the overflow-clamp
 * sequence, so row 3 coefficients were never clamped to
 * [min_qcoeff, max_qcoeff].
 */
static int dct_quantize_altivec(MpegEncContext* s, DCTELEM* data, int n, int qscale, int* overflow)
{
    int lastNonZero;
    vector float row0, row1, row2, row3, row4, row5, row6, row7;
    vector float alt0, alt1, alt2, alt3, alt4, alt5, alt6, alt7;
    const vector float zero = (const vector float)FOUROF(0.);
    // used after quantize step
    int oldBaseValue = 0;

    // Load the data into the row/alt vectors
    {
        vector signed short data0, data1, data2, data3, data4, data5, data6, data7;

        data0 = vec_ld(0, data);
        data1 = vec_ld(16, data);
        data2 = vec_ld(32, data);
        data3 = vec_ld(48, data);
        data4 = vec_ld(64, data);
        data5 = vec_ld(80, data);
        data6 = vec_ld(96, data);
        data7 = vec_ld(112, data);

        // Transpose the data before we start
        TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7);

        // load the data into floating point vectors. We load
        // the high half of each row into the main row vectors
        // and the low half into the alt vectors.
        row0 = vec_ctf(vec_unpackh(data0), 0);
        alt0 = vec_ctf(vec_unpackl(data0), 0);
        row1 = vec_ctf(vec_unpackh(data1), 0);
        alt1 = vec_ctf(vec_unpackl(data1), 0);
        row2 = vec_ctf(vec_unpackh(data2), 0);
        alt2 = vec_ctf(vec_unpackl(data2), 0);
        row3 = vec_ctf(vec_unpackh(data3), 0);
        alt3 = vec_ctf(vec_unpackl(data3), 0);
        row4 = vec_ctf(vec_unpackh(data4), 0);
        alt4 = vec_ctf(vec_unpackl(data4), 0);
        row5 = vec_ctf(vec_unpackh(data5), 0);
        alt5 = vec_ctf(vec_unpackl(data5), 0);
        row6 = vec_ctf(vec_unpackh(data6), 0);
        alt6 = vec_ctf(vec_unpackl(data6), 0);
        row7 = vec_ctf(vec_unpackh(data7), 0);
        alt7 = vec_ctf(vec_unpackl(data7), 0);
    }

    // The following block could exist as a separate an altivec dct
    // function. However, if we put it inline, the DCT data can remain
    // in the vector local variables, as floats, which we'll use during the
    // quantize step...
    {
        const vector float vec_0_298631336 = (vector float)FOUROF(0.298631336f);
        const vector float vec_0_390180644 = (vector float)FOUROF(-0.390180644f);
        const vector float vec_0_541196100 = (vector float)FOUROF(0.541196100f);
        const vector float vec_0_765366865 = (vector float)FOUROF(0.765366865f);
        const vector float vec_0_899976223 = (vector float)FOUROF(-0.899976223f);
        const vector float vec_1_175875602 = (vector float)FOUROF(1.175875602f);
        const vector float vec_1_501321110 = (vector float)FOUROF(1.501321110f);
        const vector float vec_1_847759065 = (vector float)FOUROF(-1.847759065f);
        const vector float vec_1_961570560 = (vector float)FOUROF(-1.961570560f);
        const vector float vec_2_053119869 = (vector float)FOUROF(2.053119869f);
        const vector float vec_2_562915447 = (vector float)FOUROF(-2.562915447f);
        const vector float vec_3_072711026 = (vector float)FOUROF(3.072711026f);

        int whichPass, whichHalf;

        // Pass 1 works on rows, pass 2 (after the transpose below) on
        // columns; within each pass the two halves (row*/alt*) are
        // processed by swapping them into the row registers.
        for(whichPass = 1; whichPass<=2; whichPass++) {
            for(whichHalf = 1; whichHalf<=2; whichHalf++) {
                vector float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
                vector float tmp10, tmp11, tmp12, tmp13;
                vector float z1, z2, z3, z4, z5;

                tmp0 = vec_add(row0, row7); // tmp0 = dataptr[0] + dataptr[7];
                tmp7 = vec_sub(row0, row7); // tmp7 = dataptr[0] - dataptr[7];
                tmp3 = vec_add(row3, row4); // tmp3 = dataptr[3] + dataptr[4];
                tmp4 = vec_sub(row3, row4); // tmp4 = dataptr[3] - dataptr[4];
                tmp1 = vec_add(row1, row6); // tmp1 = dataptr[1] + dataptr[6];
                tmp6 = vec_sub(row1, row6); // tmp6 = dataptr[1] - dataptr[6];
                tmp2 = vec_add(row2, row5); // tmp2 = dataptr[2] + dataptr[5];
                tmp5 = vec_sub(row2, row5); // tmp5 = dataptr[2] - dataptr[5];

                tmp10 = vec_add(tmp0, tmp3); // tmp10 = tmp0 + tmp3;
                tmp13 = vec_sub(tmp0, tmp3); // tmp13 = tmp0 - tmp3;
                tmp11 = vec_add(tmp1, tmp2); // tmp11 = tmp1 + tmp2;
                tmp12 = vec_sub(tmp1, tmp2); // tmp12 = tmp1 - tmp2;

                // dataptr[0] = (DCTELEM) ((tmp10 + tmp11) << PASS1_BITS);
                row0 = vec_add(tmp10, tmp11);

                // dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
                row4 = vec_sub(tmp10, tmp11);

                // z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
                z1 = vec_madd(vec_add(tmp12, tmp13), vec_0_541196100, (vector float)zero);

                // dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
                //                                CONST_BITS-PASS1_BITS);
                row2 = vec_madd(tmp13, vec_0_765366865, z1);

                // dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
                //                                CONST_BITS-PASS1_BITS);
                row6 = vec_madd(tmp12, vec_1_847759065, z1);

                z1 = vec_add(tmp4, tmp7); // z1 = tmp4 + tmp7;
                z2 = vec_add(tmp5, tmp6); // z2 = tmp5 + tmp6;
                z3 = vec_add(tmp4, tmp6); // z3 = tmp4 + tmp6;
                z4 = vec_add(tmp5, tmp7); // z4 = tmp5 + tmp7;

                // z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
                z5 = vec_madd(vec_add(z3, z4), vec_1_175875602, (vector float)zero);

                // z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
                z3 = vec_madd(z3, vec_1_961570560, z5);

                // z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
                z4 = vec_madd(z4, vec_0_390180644, z5);

                // The following adds are rolled into the multiplies above
                // z3 = vec_add(z3, z5);  // z3 += z5;
                // z4 = vec_add(z4, z5);  // z4 += z5;

                // z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
                // Wow!  It's actually more efficient to roll this multiply
                // into the adds below, even thought the multiply gets done twice!
                // z2 = vec_madd(z2, vec_2_562915447, (vector float)zero);

                // z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
                // Same with this one...
                // z1 = vec_madd(z1, vec_0_899976223, (vector float)zero);

                // tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
                // dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS);
                row7 = vec_madd(tmp4, vec_0_298631336, vec_madd(z1, vec_0_899976223, z3));

                // tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
                // dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS);
                row5 = vec_madd(tmp5, vec_2_053119869, vec_madd(z2, vec_2_562915447, z4));

                // tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
                // dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS);
                row3 = vec_madd(tmp6, vec_3_072711026, vec_madd(z2, vec_2_562915447, z3));

                // tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
                // dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS);
                row1 = vec_madd(z1, vec_0_899976223, vec_madd(tmp7, vec_1_501321110, z4));

                // Swap the row values with the alts.  If this is the first half,
                // this sets up the low values to be acted on in the second half.
                // If this is the second half, it puts the high values back in
                // the row values where they are expected to be when we're done.
                SWAP(row0, alt0);
                SWAP(row1, alt1);
                SWAP(row2, alt2);
                SWAP(row3, alt3);
                SWAP(row4, alt4);
                SWAP(row5, alt5);
                SWAP(row6, alt6);
                SWAP(row7, alt7);
            }

            if (whichPass == 1) {
                // transpose the data for the second pass

                // First, block transpose the upper right with lower left.
                SWAP(row4, alt0);
                SWAP(row5, alt1);
                SWAP(row6, alt2);
                SWAP(row7, alt3);

                // Now, transpose each block of four
                TRANSPOSE4(row0, row1, row2, row3);
                TRANSPOSE4(row4, row5, row6, row7);
                TRANSPOSE4(alt0, alt1, alt2, alt3);
                TRANSPOSE4(alt4, alt5, alt6, alt7);
            }
        }
    }

    // perform the quantize step, using the floating point data
    // still in the row/alt registers
    {
        const int* biasAddr;
        const vector signed int* qmat;
        vector float bias, negBias;

        if (s->mb_intra) {
            vector signed int baseVector;

            // We must cache element 0 in the intra case
            // (it needs special handling).
            baseVector = vec_cts(vec_splat(row0, 0), 0);
            vec_ste(baseVector, 0, &oldBaseValue);

            qmat = (vector signed int*)s->q_intra_matrix[qscale];
            biasAddr = &(s->intra_quant_bias);
        } else {
            qmat = (vector signed int*)s->q_inter_matrix[qscale];
            biasAddr = &(s->inter_quant_bias);
        }

        // Load the bias vector (We add 0.5 to the bias so that we're
        // rounding when we convert to int, instead of flooring.)
        {
            vector signed int biasInt;
            const vector float negOneFloat = (vector float)FOUROF(-1.0f);
            LOAD4(biasInt, biasAddr);
            bias = vec_ctf(biasInt, QUANT_BIAS_SHIFT);
            negBias = vec_madd(bias, negOneFloat, zero);
        }

        {
            vector float q0, q1, q2, q3, q4, q5, q6, q7;

            // Even qmat entries scale the high halves (row*); the
            // vec_cmpgt selects +bias for positive coefficients and
            // -bias for negative ones, i.e. rounding away from zero.
            q0 = vec_ctf(qmat[0], QMAT_SHIFT);
            q1 = vec_ctf(qmat[2], QMAT_SHIFT);
            q2 = vec_ctf(qmat[4], QMAT_SHIFT);
            q3 = vec_ctf(qmat[6], QMAT_SHIFT);
            q4 = vec_ctf(qmat[8], QMAT_SHIFT);
            q5 = vec_ctf(qmat[10], QMAT_SHIFT);
            q6 = vec_ctf(qmat[12], QMAT_SHIFT);
            q7 = vec_ctf(qmat[14], QMAT_SHIFT);

            row0 = vec_sel(vec_madd(row0, q0, negBias), vec_madd(row0, q0, bias),
                    vec_cmpgt(row0, zero));
            row1 = vec_sel(vec_madd(row1, q1, negBias), vec_madd(row1, q1, bias),
                    vec_cmpgt(row1, zero));
            row2 = vec_sel(vec_madd(row2, q2, negBias), vec_madd(row2, q2, bias),
                    vec_cmpgt(row2, zero));
            row3 = vec_sel(vec_madd(row3, q3, negBias), vec_madd(row3, q3, bias),
                    vec_cmpgt(row3, zero));
            row4 = vec_sel(vec_madd(row4, q4, negBias), vec_madd(row4, q4, bias),
                    vec_cmpgt(row4, zero));
            row5 = vec_sel(vec_madd(row5, q5, negBias), vec_madd(row5, q5, bias),
                    vec_cmpgt(row5, zero));
            row6 = vec_sel(vec_madd(row6, q6, negBias), vec_madd(row6, q6, bias),
                    vec_cmpgt(row6, zero));
            row7 = vec_sel(vec_madd(row7, q7, negBias), vec_madd(row7, q7, bias),
                    vec_cmpgt(row7, zero));

            // Odd qmat entries scale the low halves (alt*).
            q0 = vec_ctf(qmat[1], QMAT_SHIFT);
            q1 = vec_ctf(qmat[3], QMAT_SHIFT);
            q2 = vec_ctf(qmat[5], QMAT_SHIFT);
            q3 = vec_ctf(qmat[7], QMAT_SHIFT);
            q4 = vec_ctf(qmat[9], QMAT_SHIFT);
            q5 = vec_ctf(qmat[11], QMAT_SHIFT);
            q6 = vec_ctf(qmat[13], QMAT_SHIFT);
            q7 = vec_ctf(qmat[15], QMAT_SHIFT);

            alt0 = vec_sel(vec_madd(alt0, q0, negBias), vec_madd(alt0, q0, bias),
                    vec_cmpgt(alt0, zero));
            alt1 = vec_sel(vec_madd(alt1, q1, negBias), vec_madd(alt1, q1, bias),
                    vec_cmpgt(alt1, zero));
            alt2 = vec_sel(vec_madd(alt2, q2, negBias), vec_madd(alt2, q2, bias),
                    vec_cmpgt(alt2, zero));
            alt3 = vec_sel(vec_madd(alt3, q3, negBias), vec_madd(alt3, q3, bias),
                    vec_cmpgt(alt3, zero));
            alt4 = vec_sel(vec_madd(alt4, q4, negBias), vec_madd(alt4, q4, bias),
                    vec_cmpgt(alt4, zero));
            alt5 = vec_sel(vec_madd(alt5, q5, negBias), vec_madd(alt5, q5, bias),
                    vec_cmpgt(alt5, zero));
            alt6 = vec_sel(vec_madd(alt6, q6, negBias), vec_madd(alt6, q6, bias),
                    vec_cmpgt(alt6, zero));
            alt7 = vec_sel(vec_madd(alt7, q7, negBias), vec_madd(alt7, q7, bias),
                    vec_cmpgt(alt7, zero));
        }
    }

    // Store the data back into the original block
    {
        vector signed short data0, data1, data2, data3, data4, data5, data6, data7;

        data0 = vec_pack(vec_cts(row0, 0), vec_cts(alt0, 0));
        data1 = vec_pack(vec_cts(row1, 0), vec_cts(alt1, 0));
        data2 = vec_pack(vec_cts(row2, 0), vec_cts(alt2, 0));
        data3 = vec_pack(vec_cts(row3, 0), vec_cts(alt3, 0));
        data4 = vec_pack(vec_cts(row4, 0), vec_cts(alt4, 0));
        data5 = vec_pack(vec_cts(row5, 0), vec_cts(alt5, 0));
        data6 = vec_pack(vec_cts(row6, 0), vec_cts(alt6, 0));
        data7 = vec_pack(vec_cts(row7, 0), vec_cts(alt7, 0));

        {
            // Clamp for overflow
            vector signed int max_q_int, min_q_int;
            vector signed short max_q, min_q;
            LOAD4(max_q_int, &(s->max_qcoeff));
            LOAD4(min_q_int, &(s->min_qcoeff));

            max_q = vec_pack(max_q_int, max_q_int);
            min_q = vec_pack(min_q_int, min_q_int);

            data0 = vec_max(vec_min(data0, max_q), min_q);
            data1 = vec_max(vec_min(data1, max_q), min_q);
            data2 = vec_max(vec_min(data2, max_q), min_q);
            data3 = vec_max(vec_min(data3, max_q), min_q); // was missing: data3 escaped the clamp
            data4 = vec_max(vec_min(data4, max_q), min_q);
            data5 = vec_max(vec_min(data5, max_q), min_q);
            data6 = vec_max(vec_min(data6, max_q), min_q);
            data7 = vec_max(vec_min(data7, max_q), min_q);
        }

        {
            vector bool char zero_01, zero_23, zero_45, zero_67;
            vector signed char scanIndexes_01, scanIndexes_23, scanIndexes_45, scanIndexes_67;
            vector signed char negOne = vec_splat_s8(-1);
            vector signed char* scanPtr =
                    (vector signed char*)(s->intra_scantable.inverse);
            signed char lastNonZeroChar;

            // Determine the largest non-zero index.
            zero_01 = vec_pack(vec_cmpeq(data0, (vector signed short)zero),
                    vec_cmpeq(data1, (vector signed short)zero));
            zero_23 = vec_pack(vec_cmpeq(data2, (vector signed short)zero),
                    vec_cmpeq(data3, (vector signed short)zero));
            zero_45 = vec_pack(vec_cmpeq(data4, (vector signed short)zero),
                    vec_cmpeq(data5, (vector signed short)zero));
            zero_67 = vec_pack(vec_cmpeq(data6, (vector signed short)zero),
                    vec_cmpeq(data7, (vector signed short)zero));

            // 64 biggest values: zero coefficients get scan index -1.
            scanIndexes_01 = vec_sel(scanPtr[0], negOne, zero_01);
            scanIndexes_23 = vec_sel(scanPtr[1], negOne, zero_23);
            scanIndexes_45 = vec_sel(scanPtr[2], negOne, zero_45);
            scanIndexes_67 = vec_sel(scanPtr[3], negOne, zero_67);

            // 32 largest values
            scanIndexes_01 = vec_max(scanIndexes_01, scanIndexes_23);
            scanIndexes_45 = vec_max(scanIndexes_45, scanIndexes_67);

            // 16 largest values
            scanIndexes_01 = vec_max(scanIndexes_01, scanIndexes_45);

            // 8 largest values
            scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne),
                    vec_mergel(scanIndexes_01, negOne));

            // 4 largest values
            scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne),
                    vec_mergel(scanIndexes_01, negOne));

            // 2 largest values
            scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne),
                    vec_mergel(scanIndexes_01, negOne));

            // largest value
            scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne),
                    vec_mergel(scanIndexes_01, negOne));

            scanIndexes_01 = vec_splat(scanIndexes_01, 0);

            vec_ste(scanIndexes_01, 0, &lastNonZeroChar);

            lastNonZero = lastNonZeroChar;

            // While the data is still in vectors we check for the transpose IDCT permute
            // and handle it using the vector unit if we can.  This is the permute used
            // by the altivec idct, so it is common when using the altivec dct.

            if ((lastNonZero > 0) && (s->dsp.idct_permutation_type == FF_TRANSPOSE_IDCT_PERM)) {
                TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7);
            }

            vec_st(data0, 0, data);
            vec_st(data1, 16, data);
            vec_st(data2, 32, data);
            vec_st(data3, 48, data);
            vec_st(data4, 64, data);
            vec_st(data5, 80, data);
            vec_st(data6, 96, data);
            vec_st(data7, 112, data);
        }
    }

    // special handling of block[0]
    if (s->mb_intra) {
        if (!s->h263_aic) {
            if (n < 4)
                oldBaseValue /= s->y_dc_scale;
            else
                oldBaseValue /= s->c_dc_scale;
        }

        // Divide by 8, rounding the result
        data[0] = (oldBaseValue + 4) >> 3;
    }

    // We handled the transpose permutation above and we don't
    // need to permute the "no" permutation case.
    if ((lastNonZero > 0) &&
        (s->dsp.idct_permutation_type != FF_TRANSPOSE_IDCT_PERM) &&
        (s->dsp.idct_permutation_type != FF_NO_IDCT_PERM)) {
        ff_block_permute(data, s->dsp.idct_permutation,
                s->intra_scantable.scantable, lastNonZero);
    }

    return lastNonZero;
}
/* this code assume stride % 16 == 0 *and* tmp is properly aligned */
/*
 * H.264 16x16 quarter-pel "center" (horizontal+vertical) 6-tap lowpass.
 * Pass 1 filters 21 source rows horizontally into the int16_t scratch
 * buffer 'tmp' (unscaled intermediates). Pass 2 filters 'tmp' vertically
 * with 32-bit precision, rounds (+512, >>10), saturates to unsigned 8-bit
 * and writes/combines into 'dst' via OP_U8_ALTIVEC (put or avg, chosen by
 * the file that #includes this template).
 */
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src,
                                                  int dstStride, int tmpStride, int srcStride) {
  POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
  register int i;
  const vector signed int vzero = vec_splat_s32(0);
  // lvsl permute vectors to extract the six unaligned taps src[-2..+3]
  const vector unsigned char permM2 = vec_lvsl(-2, src);
  const vector unsigned char permM1 = vec_lvsl(-1, src);
  const vector unsigned char permP0 = vec_lvsl(+0, src);
  const vector unsigned char permP1 = vec_lvsl(+1, src);
  const vector unsigned char permP2 = vec_lvsl(+2, src);
  const vector unsigned char permP3 = vec_lvsl(+3, src);
  // filter constants: (20,-5,1) taps; 512 = rounding bias for the >>10
  const vector signed short v20ss = (const vector signed short)AVV(20);
  const vector unsigned int v10ui = vec_splat_u32(10);
  const vector signed short v5ss = vec_splat_s16(5);
  const vector signed short v1ss = vec_splat_s16(1);
  const vector signed int v512si = (const vector signed int)AVV(512);
  const vector unsigned int v16ui = (const vector unsigned int)AVV(16);

  // alignment of src-2 within its 16-byte line decides how many vec_ld's
  // per row are needed in the switch below
  register int align = ((((unsigned long)src) - 2) % 16);

  src -= (2 * srcStride);   // start two rows above the block

  /* Pass 1: horizontal filter, 21 rows (16 + 5 for the vertical taps). */
  for (i = 0 ; i < 21 ; i ++) {
    vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
    vector unsigned char srcR1 = vec_ld(-2, src);
    vector unsigned char srcR2 = vec_ld(14, src);

    // For align >= 11 some taps cross into a third 16-byte line; each case
    // loads srcR3 only when needed and reuses srcR2 directly where the
    // permute would be an identity.
    switch (align) {
    default: {
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = vec_perm(srcR1, srcR2, permP1);
      srcP2 = vec_perm(srcR1, srcR2, permP2);
      srcP3 = vec_perm(srcR1, srcR2, permP3);
    } break;
    case 11: {
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = vec_perm(srcR1, srcR2, permP1);
      srcP2 = vec_perm(srcR1, srcR2, permP2);
      srcP3 = srcR2;
    } break;
    case 12: {
      vector unsigned char srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = vec_perm(srcR1, srcR2, permP1);
      srcP2 = srcR2;
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    case 13: {
      vector unsigned char srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = srcR2;
      srcP2 = vec_perm(srcR2, srcR3, permP2);
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    case 14: {
      vector unsigned char srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = srcR2;
      srcP1 = vec_perm(srcR2, srcR3, permP1);
      srcP2 = vec_perm(srcR2, srcR3, permP2);
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    case 15: {
      vector unsigned char srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = srcR2;
      srcP0 = vec_perm(srcR2, srcR3, permP0);
      srcP1 = vec_perm(srcR2, srcR3, permP1);
      srcP2 = vec_perm(srcR2, srcR3, permP2);
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    }

    // zero-extend each tap's 16 bytes into two vectors of 8 shorts
    // (A = high half, B = low half)
    const vector signed short srcP0A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0);
    const vector signed short srcP0B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0);
    const vector signed short srcP1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1);
    const vector signed short srcP1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1);
    const vector signed short srcP2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2);
    const vector signed short srcP2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP2);
    const vector signed short srcP3A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3);
    const vector signed short srcP3B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3);
    const vector signed short srcM1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1);
    const vector signed short srcM1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1);
    const vector signed short srcM2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2);
    const vector signed short srcM2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2);

    // 6-tap: 20*(p0+p1) - 5*(m1+p2) + 1*(m2+p3), kept unscaled in tmp
    const vector signed short sum1A = vec_adds(srcP0A, srcP1A);
    const vector signed short sum1B = vec_adds(srcP0B, srcP1B);
    const vector signed short sum2A = vec_adds(srcM1A, srcP2A);
    const vector signed short sum2B = vec_adds(srcM1B, srcP2B);
    const vector signed short sum3A = vec_adds(srcM2A, srcP3A);
    const vector signed short sum3B = vec_adds(srcM2B, srcP3B);

    const vector signed short pp1A = vec_mladd(sum1A, v20ss, sum3A);
    const vector signed short pp1B = vec_mladd(sum1B, v20ss, sum3B);

    const vector signed short pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
    const vector signed short pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);

    const vector signed short psumA = vec_sub(pp1A, pp2A);
    const vector signed short psumB = vec_sub(pp1B, pp2B);

    vec_st(psumA, 0, tmp);
    vec_st(psumB, 16, tmp);

    src += srcStride;
    tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
  }

  /* Pass 2: vertical filter over the 21 rows just written to tmp. */
  // masks for the unaligned read-modify-write of dst
  const vector unsigned char dstperm = vec_lvsr(0, dst);
  const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1);
  const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);
  // mperm re-interleaves the even/odd lanes produced by mule/mulo below
  const vector unsigned char mperm = (const vector unsigned char)
    AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
        0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F);
  int16_t *tmpbis = tmp - (tmpStride * 21);   // rewind to the first filtered row

  // pre-load the first five rows of the 6-tap vertical window
  vector signed short tmpM2ssA = vec_ld(0, tmpbis);
  vector signed short tmpM2ssB = vec_ld(16, tmpbis);
  tmpbis += tmpStride;
  vector signed short tmpM1ssA = vec_ld(0, tmpbis);
  vector signed short tmpM1ssB = vec_ld(16, tmpbis);
  tmpbis += tmpStride;
  vector signed short tmpP0ssA = vec_ld(0, tmpbis);
  vector signed short tmpP0ssB = vec_ld(16, tmpbis);
  tmpbis += tmpStride;
  vector signed short tmpP1ssA = vec_ld(0, tmpbis);
  vector signed short tmpP1ssB = vec_ld(16, tmpbis);
  tmpbis += tmpStride;
  vector signed short tmpP2ssA = vec_ld(0, tmpbis);
  vector signed short tmpP2ssB = vec_ld(16, tmpbis);
  tmpbis += tmpStride;

  for (i = 0 ; i < 16 ; i++) {
    const vector signed short tmpP3ssA = vec_ld(0, tmpbis);
    const vector signed short tmpP3ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    const vector signed short sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
    const vector signed short sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
    const vector signed short sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
    const vector signed short sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
    const vector signed short sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
    const vector signed short sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

    // rotate the 6-row window down by one
    tmpM2ssA = tmpM1ssA;
    tmpM2ssB = tmpM1ssB;
    tmpM1ssA = tmpP0ssA;
    tmpM1ssB = tmpP0ssB;
    tmpP0ssA = tmpP1ssA;
    tmpP0ssB = tmpP1ssB;
    tmpP1ssA = tmpP2ssA;
    tmpP1ssB = tmpP2ssB;
    tmpP2ssA = tmpP3ssA;
    tmpP2ssB = tmpP3ssB;

    // 32-bit products, split into even/odd 16-bit lanes via mule/mulo
    const vector signed int pp1Ae = vec_mule(sum1A, v20ss);
    const vector signed int pp1Ao = vec_mulo(sum1A, v20ss);
    const vector signed int pp1Be = vec_mule(sum1B, v20ss);
    const vector signed int pp1Bo = vec_mulo(sum1B, v20ss);

    const vector signed int pp2Ae = vec_mule(sum2A, v5ss);
    const vector signed int pp2Ao = vec_mulo(sum2A, v5ss);
    const vector signed int pp2Be = vec_mule(sum2B, v5ss);
    const vector signed int pp2Bo = vec_mulo(sum2B, v5ss);

    // sum3 * 1: even lanes via arithmetic >>16 on the reinterpreted
    // vector (sign-extends each even-indexed short), odd lanes via mulo
    const vector signed int pp3Ae = vec_sra((vector signed int)sum3A, v16ui);
    const vector signed int pp3Ao = vec_mulo(sum3A, v1ss);
    const vector signed int pp3Be = vec_sra((vector signed int)sum3B, v16ui);
    const vector signed int pp3Bo = vec_mulo(sum3B, v1ss);

    // add the +512 rounding bias before the final >>10
    const vector signed int pp1cAe = vec_add(pp1Ae, v512si);
    const vector signed int pp1cAo = vec_add(pp1Ao, v512si);
    const vector signed int pp1cBe = vec_add(pp1Be, v512si);
    const vector signed int pp1cBo = vec_add(pp1Bo, v512si);

    const vector signed int pp32Ae = vec_sub(pp3Ae, pp2Ae);
    const vector signed int pp32Ao = vec_sub(pp3Ao, pp2Ao);
    const vector signed int pp32Be = vec_sub(pp3Be, pp2Be);
    const vector signed int pp32Bo = vec_sub(pp3Bo, pp2Bo);

    const vector signed int sumAe = vec_add(pp1cAe, pp32Ae);
    const vector signed int sumAo = vec_add(pp1cAo, pp32Ao);
    const vector signed int sumBe = vec_add(pp1cBe, pp32Be);
    const vector signed int sumBo = vec_add(pp1cBo, pp32Bo);

    const vector signed int ssumAe = vec_sra(sumAe, v10ui);
    const vector signed int ssumAo = vec_sra(sumAo, v10ui);
    const vector signed int ssumBe = vec_sra(sumBe, v10ui);
    const vector signed int ssumBo = vec_sra(sumBo, v10ui);

    // pack back to shorts, then to saturated unsigned bytes, and restore
    // the natural pixel order with mperm
    const vector signed short ssume = vec_packs(ssumAe, ssumBe);
    const vector signed short ssumo = vec_packs(ssumAo, ssumBo);

    const vector unsigned char sumv = vec_packsu(ssume, ssumo);
    const vector unsigned char sum = vec_perm(sumv, sumv, mperm);

    // unaligned read-modify-write of the 16 destination bytes
    const vector unsigned char dst1 = vec_ld(0, dst);
    const vector unsigned char dst2 = vec_ld(16, dst);
    const vector unsigned char vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));

    vector unsigned char fsum;
    OP_U8_ALTIVEC(fsum, sum, vdst);   // put or avg, per including file

    const vector unsigned char rsum = vec_perm(fsum, fsum, dstperm);
    const vector unsigned char fdst1 = vec_sel(dst1, rsum, dstmask);
    const vector unsigned char fdst2 = vec_sel(rsum, dst2, dstmask);

    vec_st(fdst1, 0, dst);
    vec_st(fdst2, 16, dst);

    dst += dstStride;
  }
  POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
}
/* this code assume stride % 16 == 0 */
/*
 * H.264 16x16 quarter-pel vertical 6-tap lowpass: for each output row,
 * out = clip8((20*(p0+p1) - 5*(m1+p2) + (m2+p3) + 16) >> 5), where
 * m2..p3 are the six source rows around the output row. The result is
 * written/combined into 'dst' via OP_U8_ALTIVEC (put or avg, chosen by
 * the file that #includes this template). Rows are pre-loaded once and
 * rotated through registers, so each iteration loads only one new row.
 */
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src,
                                                 int dstStride, int srcStride) {
  POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);
  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);

  register int i;

  const vector signed int vzero = vec_splat_s32(0);
  const vector unsigned char perm = vec_lvsl(0, src);   // src alignment permute
  // filter constants: taps (20,-5,1), +16 rounding bias, >>5 scale
  const vector signed short v20ss = (const vector signed short)AVV(20);
  const vector unsigned short v5us = vec_splat_u16(5);
  const vector signed short v5ss = vec_splat_s16(5);
  const vector signed short v16ss = (const vector signed short)AVV(16);
  // masks for the unaligned read-modify-write of dst
  const vector unsigned char dstperm = vec_lvsr(0, dst);
  const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1);
  const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);

  uint8_t *srcbis = src - (srcStride * 2);   // start two rows above the block

  // pre-load the first five rows of the 6-tap window (two 16-byte loads
  // plus a permute per row, since src may be unaligned)
  const vector unsigned char srcM2a = vec_ld(0, srcbis);
  const vector unsigned char srcM2b = vec_ld(16, srcbis);
  const vector unsigned char srcM2 = vec_perm(srcM2a, srcM2b, perm);
  srcbis += srcStride;
  const vector unsigned char srcM1a = vec_ld(0, srcbis);
  const vector unsigned char srcM1b = vec_ld(16, srcbis);
  const vector unsigned char srcM1 = vec_perm(srcM1a, srcM1b, perm);
  srcbis += srcStride;
  const vector unsigned char srcP0a = vec_ld(0, srcbis);
  const vector unsigned char srcP0b = vec_ld(16, srcbis);
  const vector unsigned char srcP0 = vec_perm(srcP0a, srcP0b, perm);
  srcbis += srcStride;
  const vector unsigned char srcP1a = vec_ld(0, srcbis);
  const vector unsigned char srcP1b = vec_ld(16, srcbis);
  const vector unsigned char srcP1 = vec_perm(srcP1a, srcP1b, perm);
  srcbis += srcStride;
  const vector unsigned char srcP2a = vec_ld(0, srcbis);
  const vector unsigned char srcP2b = vec_ld(16, srcbis);
  const vector unsigned char srcP2 = vec_perm(srcP2a, srcP2b, perm);
  srcbis += srcStride;

  // zero-extend each row's 16 bytes into two vectors of 8 shorts
  // (A = high half, B = low half)
  vector signed short srcM2ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2);
  vector signed short srcM2ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2);
  vector signed short srcM1ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1);
  vector signed short srcM1ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1);
  vector signed short srcP0ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0);
  vector signed short srcP0ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0);
  vector signed short srcP1ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1);
  vector signed short srcP1ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1);
  vector signed short srcP2ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2);
  vector signed short srcP2ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP2);

  for (i = 0 ; i < 16 ; i++) {
    // load the one new row needed this iteration (p3)
    const vector unsigned char srcP3a = vec_ld(0, srcbis);
    const vector unsigned char srcP3b = vec_ld(16, srcbis);
    const vector unsigned char srcP3 = vec_perm(srcP3a, srcP3b, perm);
    const vector signed short srcP3ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3);
    const vector signed short srcP3ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3);
    srcbis += srcStride;

    // tap pair sums: (p0+p1), (m1+p2), (m2+p3)
    const vector signed short sum1A = vec_adds(srcP0ssA, srcP1ssA);
    const vector signed short sum1B = vec_adds(srcP0ssB, srcP1ssB);
    const vector signed short sum2A = vec_adds(srcM1ssA, srcP2ssA);
    const vector signed short sum2B = vec_adds(srcM1ssB, srcP2ssB);
    const vector signed short sum3A = vec_adds(srcM2ssA, srcP3ssA);
    const vector signed short sum3B = vec_adds(srcM2ssB, srcP3ssB);

    // rotate the 6-row window down by one for the next iteration
    srcM2ssA = srcM1ssA;
    srcM2ssB = srcM1ssB;
    srcM1ssA = srcP0ssA;
    srcM1ssB = srcP0ssB;
    srcP0ssA = srcP1ssA;
    srcP0ssB = srcP1ssB;
    srcP1ssA = srcP2ssA;
    srcP1ssB = srcP2ssB;
    srcP2ssA = srcP3ssA;
    srcP2ssB = srcP3ssB;

    // 20*sum1 + 16 (rounding bias folded into the multiply-add)
    const vector signed short pp1A = vec_mladd(sum1A, v20ss, v16ss);
    const vector signed short pp1B = vec_mladd(sum1B, v20ss, v16ss);
    // 5*sum2
    const vector signed short pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
    const vector signed short pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);

    const vector signed short pp3A = vec_add(sum3A, pp1A);
    const vector signed short pp3B = vec_add(sum3B, pp1B);

    const vector signed short psumA = vec_sub(pp3A, pp2A);
    const vector signed short psumB = vec_sub(pp3B, pp2B);

    // >>5 and saturate to unsigned bytes
    const vector signed short sumA = vec_sra(psumA, v5us);
    const vector signed short sumB = vec_sra(psumB, v5us);

    const vector unsigned char sum = vec_packsu(sumA, sumB);

    // unaligned read-modify-write of the 16 destination bytes
    const vector unsigned char dst1 = vec_ld(0, dst);
    const vector unsigned char dst2 = vec_ld(16, dst);
    const vector unsigned char vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));

    vector unsigned char fsum;
    OP_U8_ALTIVEC(fsum, sum, vdst);   // put or avg, per including file

    const vector unsigned char rsum = vec_perm(fsum, fsum, dstperm);
    const vector unsigned char fdst1 = vec_sel(dst1, rsum, dstmask);
    const vector unsigned char fdst2 = vec_sel(rsum, dst2, dstmask);

    vec_st(fdst1, 0, dst);
    vec_st(fdst2, 16, dst);

    dst += dstStride;
  }
  POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
}
/* H.264 quarter-pel: combined horizontal+vertical 6-tap lowpass over a
 * 16x16 block (AltiVec).  Pass 1 filters 21 source rows horizontally with
 * taps (1,-5,20,20,-5,1) and stores the intermediate 16-bit sums into tmp
 * (tmpStride assumed 16 so vec_st offsets 0/16 cover one row).  Pass 2
 * filters those intermediates vertically in 32-bit precision, adds the
 * rounding bias 512, shifts right by 10, saturates to u8 and writes the
 * result through OP_U8_ALTIVEC (put or avg, chosen by the including file).
 * dst must be 16-byte aligned (ASSERT_ALIGNED); src may be unaligned.
 */
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    register int i;
    LOAD_ZERO;
    /* Permute masks selecting the six horizontally shifted views of src
     * (offsets -2..+3) out of two aligned 16-byte loads. */
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));  /* 5 << 2 == 20 (tap) */
    const vec_u32 v10ui = vec_splat_u32(10);                           /* final >> 10 */
    const vec_s16 v5ss = vec_splat_s16(5);                             /* tap 5 */
    const vec_s16 v1ss = vec_splat_s16(1);                             /* tap 1 (odd lanes) */
    const vec_s32 v512si = vec_sl(vec_splat_s32(1), vec_splat_u32(9)); /* 1 << 9 == 512 rounding bias */
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1), vec_splat_u32(4));  /* 1 << 4 == 16 (shift count) */
    register int align = ((((unsigned long)src) - 2) % 16);
    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;
    /* Interleave mask re-merging even/odd filtered lanes back into order. */
    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;
    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;
    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum;
    vec_s16 ssume, ssumo;

    src -= (2 * srcStride);  /* vertical filter needs two rows above the block */

    /* Pass 1: horizontal filter of 21 rows (16 + 5 for the vertical taps). */
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);
        /* For align >= 11 one of the shifted views would read a third
         * 16-byte line; special-case each so the extra load (and any
         * out-of-bounds touch) happens only when actually needed. */
        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;  /* +3 view is exactly the second aligned load */
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }
        /* Zero-extend bytes to 16-bit: A = high 8 pixels, B = low 8. */
        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);
        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);
        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);
        /* Symmetric tap pairs: (p0+p1)*20 - (m1+p2)*5 + (m2+p3)*1 */
        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);
        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);
        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);
        /* Intermediate kept at full 16-bit precision; no rounding yet. */
        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);
        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

    /* Pass 2: prime the 5-row history window from the intermediate buffer. */
    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);
        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
        tmpbis += tmpStride;
        /* Slide the history window down one row. */
        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;
        /* Full 32-bit products, split into even/odd 16-bit lanes. */
        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);
        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);
        /* Even lanes * 1: arithmetic >> 16 of the reinterpreted s32 vector
         * sign-extends the even s16 elements — cheaper than vec_mule by 1. */
        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);
        /* Add the rounding bias 512 before the final >> 10. */
        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);
        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);
        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);
        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);
        /* Saturate, then restore pixel order from the even/odd split. */
        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);
        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);
        ASSERT_ALIGNED(dst);
        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));
        vec_st(fsum, 0, dst);
        dst += dstStride;
    }
}
/* Half-pel (x+1/2, y+1/2) interpolation of a 16-wide block without rounding:
 * dst = (p[x,y] + p[x+1,y] + p[x,y+1] + p[x+1,y+1] + 1) >> 2.
 * Assumes (line_size % 16) == 0 so the output store is an aligned vec_st;
 * the source rows may be unaligned. */
static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
{
    int line;
    vector unsigned char pa, pb, pa_lo, pb_lo, ld0, ld1, out;
    vector unsigned short row_hi, row_lo, new_hi, new_lo, avg_hi, avg_lo;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    const vector unsigned short one = (const vector unsigned short)vec_splat_u16(1);
    const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2);

    /* Prime the pipeline with the row above the first output row:
     * build p[x] + p[x+1] sums (plus the no-round bias of 1) for both
     * 8-pixel halves, widened to 16 bits. */
    ld0 = vec_ld(0, pixels);
    ld1 = vec_ld(16, pixels);
    pa = vec_perm(ld0, ld1, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        /* pixels+1 lands on a 16-byte boundary: the +1 view IS ld1. */
        pb = ld1;
    } else {
        pb = vec_perm(ld0, ld1, vec_lvsl(1, pixels));
    }
    pa_lo = vec_mergel(zero, pa);
    pb_lo = vec_mergel(zero, pb);
    pa = vec_mergeh(zero, pa);
    pb = vec_mergeh(zero, pb);
    row_lo = vec_add((vector unsigned short)pa_lo, (vector unsigned short)pb_lo);
    row_lo = vec_add(row_lo, one);
    row_hi = vec_add((vector unsigned short)pa, (vector unsigned short)pb);
    row_hi = vec_add(row_hi, one);

    for (line = 0; line < h; line++) {
        out = vec_ld(0, block);
        /* Horizontal pair-sums for the row below the current output row. */
        ld0 = vec_ld(line_size, pixels);
        ld1 = vec_ld(line_size + 16, pixels);
        pa = vec_perm(ld0, ld1, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pb = ld1;
        } else {
            pb = vec_perm(ld0, ld1, vec_lvsl(line_size + 1, pixels));
        }
        pa_lo = vec_mergel(zero, pa);
        pb_lo = vec_mergel(zero, pb);
        pa = vec_mergeh(zero, pa);
        pb = vec_mergeh(zero, pb);
        new_lo = vec_add((vector unsigned short)pa_lo, (vector unsigned short)pb_lo);
        new_hi = vec_add((vector unsigned short)pa, (vector unsigned short)pb);
        /* (biased row above + row below) >> 2, per half. */
        avg_lo = vec_add(row_lo, new_lo);
        avg_lo = vec_sra(avg_lo, two);
        avg_hi = vec_add(row_hi, new_hi);
        avg_hi = vec_sra(avg_hi, two);
        /* Carry this row's sums, re-biased with +1, into the next line. */
        row_lo = vec_add(new_lo, one);
        row_hi = vec_add(new_hi, one);
        out = vec_packsu(avg_hi, avg_lo);
        vec_st(out, 0, block);
        block += line_size;
        pixels += line_size;
    }
}
/* H.264 quarter-pel: horizontal 6-tap lowpass over a 16x16 block (AltiVec).
 * Each output pixel is clip((src[-2] - 5*src[-1] + 20*src[0] + 20*src[+1]
 * - 5*src[+2] + src[+3] + 16) >> 5), written through OP_U8_ALTIVEC
 * (put or avg, chosen by the including file).  dst must be 16-byte
 * aligned (ASSERT_ALIGNED); src may be unaligned. */
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    register int row;
    LOAD_ZERO;
    /* Permute masks for the six horizontally shifted views of src. */
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 five    = vec_splat_s16(5);
    const vec_u16 shift5  = vec_splat_u16(5);
    const vec_s16 twenty  = vec_sl(vec_splat_s16(5), vec_splat_u16(2)); /* 5 << 2 == 20 */
    const vec_s16 sixteen = vec_sl(vec_splat_s16(1), vec_splat_u16(4)); /* 1 << 4 == 16 (rounding bias) */
    vec_u8 m2, m1, p0, p1, p2, p3;
    register int align = ((((unsigned long)src) - 2) % 16);
    vec_s16 p0h, p0l, p1h, p1l, p2h, p2l, p3h, p3l, m1h, m1l, m2h, m2l,
            pairAh, pairAl, pairBh, pairBl, pairCh, pairCl,
            t20h, t20l, t5h, t5l, acc1h, acc1l, acch, accl, resh, resl;
    vec_u8 packed, filtered;

    for (row = 0; row < 16; row++) {
        vec_u8 loA = vec_ld(-2, src);
        vec_u8 loB = vec_ld(14, src);
        /* When align >= 11 one of the shifted views crosses into a third
         * aligned 16-byte line; handle each case so that extra load only
         * happens when genuinely required. */
        switch (align) {
        default: {
            m2 = vec_perm(loA, loB, permM2);
            m1 = vec_perm(loA, loB, permM1);
            p0 = vec_perm(loA, loB, permP0);
            p1 = vec_perm(loA, loB, permP1);
            p2 = vec_perm(loA, loB, permP2);
            p3 = vec_perm(loA, loB, permP3);
        } break;
        case 11: {
            m2 = vec_perm(loA, loB, permM2);
            m1 = vec_perm(loA, loB, permM1);
            p0 = vec_perm(loA, loB, permP0);
            p1 = vec_perm(loA, loB, permP1);
            p2 = vec_perm(loA, loB, permP2);
            p3 = loB;   /* +3 view is exactly the second aligned load */
        } break;
        case 12: {
            vec_u8 loC = vec_ld(30, src);
            m2 = vec_perm(loA, loB, permM2);
            m1 = vec_perm(loA, loB, permM1);
            p0 = vec_perm(loA, loB, permP0);
            p1 = vec_perm(loA, loB, permP1);
            p2 = loB;
            p3 = vec_perm(loB, loC, permP3);
        } break;
        case 13: {
            vec_u8 loC = vec_ld(30, src);
            m2 = vec_perm(loA, loB, permM2);
            m1 = vec_perm(loA, loB, permM1);
            p0 = vec_perm(loA, loB, permP0);
            p1 = loB;
            p2 = vec_perm(loB, loC, permP2);
            p3 = vec_perm(loB, loC, permP3);
        } break;
        case 14: {
            vec_u8 loC = vec_ld(30, src);
            m2 = vec_perm(loA, loB, permM2);
            m1 = vec_perm(loA, loB, permM1);
            p0 = loB;
            p1 = vec_perm(loB, loC, permP1);
            p2 = vec_perm(loB, loC, permP2);
            p3 = vec_perm(loB, loC, permP3);
        } break;
        case 15: {
            vec_u8 loC = vec_ld(30, src);
            m2 = vec_perm(loA, loB, permM2);
            m1 = loB;
            p0 = vec_perm(loB, loC, permP0);
            p1 = vec_perm(loB, loC, permP1);
            p2 = vec_perm(loB, loC, permP2);
            p3 = vec_perm(loB, loC, permP3);
        } break;
        }
        /* Zero-extend bytes to 16-bit: *h = high 8 pixels, *l = low 8. */
        p0h = (vec_s16) vec_mergeh(zero_u8v, p0);
        p0l = (vec_s16) vec_mergel(zero_u8v, p0);
        p1h = (vec_s16) vec_mergeh(zero_u8v, p1);
        p1l = (vec_s16) vec_mergel(zero_u8v, p1);
        p2h = (vec_s16) vec_mergeh(zero_u8v, p2);
        p2l = (vec_s16) vec_mergel(zero_u8v, p2);
        p3h = (vec_s16) vec_mergeh(zero_u8v, p3);
        p3l = (vec_s16) vec_mergel(zero_u8v, p3);
        m1h = (vec_s16) vec_mergeh(zero_u8v, m1);
        m1l = (vec_s16) vec_mergel(zero_u8v, m1);
        m2h = (vec_s16) vec_mergeh(zero_u8v, m2);
        m2l = (vec_s16) vec_mergel(zero_u8v, m2);
        /* Exploit tap symmetry: A=(p0+p1)*20, B=(m1+p2)*5, C=(m2+p3)*1. */
        pairAh = vec_adds(p0h, p1h);
        pairAl = vec_adds(p0l, p1l);
        pairBh = vec_adds(m1h, p2h);
        pairBl = vec_adds(m1l, p2l);
        pairCh = vec_adds(m2h, p3h);
        pairCl = vec_adds(m2l, p3l);
        t20h = vec_mladd(pairAh, twenty, sixteen);
        t20l = vec_mladd(pairAl, twenty, sixteen);
        t5h = vec_mladd(pairBh, five, zero_s16v);
        t5l = vec_mladd(pairBl, five, zero_s16v);
        acc1h = vec_add(pairCh, t20h);
        acc1l = vec_add(pairCl, t20l);
        acch = vec_sub(acc1h, t5h);
        accl = vec_sub(acc1l, t5l);
        resh = vec_sra(acch, shift5);
        resl = vec_sra(accl, shift5);
        packed = vec_packsu(resh, resl);
        ASSERT_ALIGNED(dst);
        OP_U8_ALTIVEC(filtered, packed, vec_ld(0, dst));
        vec_st(filtered, 0, dst);
        src += srcStride;
        dst += dstStride;
    }
}
/* this code assumes stride % 16 == 0 */
/* H.264 quarter-pel: horizontal 6-tap lowpass over a 16x16 block (AltiVec),
 * older variant with perf counters and support for an unaligned dst: the
 * result is rotated with vec_lvsr and blended into the two straddled
 * 16-byte destination lines via vec_sel masks.
 * Each output pixel is clip((src[-2] - 5*src[-1] + 20*src[0] + 20*src[+1]
 * - 5*src[+2] + src[+3] + 16) >> 5), combined with the previous dst
 * contents by OP_U8_ALTIVEC (put or avg, chosen by the including file). */
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
    register int i;
    const vector signed int vzero = vec_splat_s32(0);
    /* Permute masks for the six horizontally shifted views of src. */
    const vector unsigned char permM2 = vec_lvsl(-2, src);
    const vector unsigned char permM1 = vec_lvsl(-1, src);
    const vector unsigned char permP0 = vec_lvsl(+0, src);
    const vector unsigned char permP1 = vec_lvsl(+1, src);
    const vector unsigned char permP2 = vec_lvsl(+2, src);
    const vector unsigned char permP3 = vec_lvsl(+3, src);
    const vector signed short v20ss = (const vector signed short)AVV(20);  /* filter tap */
    const vector unsigned short v5us = vec_splat_u16(5);                   /* final >> 5 */
    const vector signed short v5ss = vec_splat_s16(5);                     /* filter tap */
    const vector signed short v16ss = (const vector signed short)AVV(16);  /* rounding bias */
    /* Rotation + select masks for storing 16 bytes at an unaligned dst. */
    const vector unsigned char dstperm = vec_lvsr(0, dst);
    const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1);
    const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);
    register int align = ((((unsigned long)src) - 2) % 16);

    for (i = 0 ; i < 16 ; i ++) {
        vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vector unsigned char srcR1 = vec_ld(-2, src);
        vector unsigned char srcR2 = vec_ld(14, src);
        /* For align >= 11 one shifted view crosses into a third aligned
         * line; special-case so the extra load only happens when needed. */
        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;  /* +3 view is exactly the second aligned load */
        } break;
        case 12: {
            vector unsigned char srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vector unsigned char srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vector unsigned char srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vector unsigned char srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }
        /* Zero-extend bytes to 16-bit: A = high 8 pixels, B = low 8. */
        const vector signed short srcP0A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0);
        const vector signed short srcP0B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0);
        const vector signed short srcP1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1);
        const vector signed short srcP1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1);
        const vector signed short srcP2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2);
        const vector signed short srcP2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP2);
        const vector signed short srcP3A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3);
        const vector signed short srcP3B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3);
        const vector signed short srcM1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1);
        const vector signed short srcM1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1);
        const vector signed short srcM2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2);
        const vector signed short srcM2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2);
        /* Symmetric tap pairs: (p0+p1)*20 - (m1+p2)*5 + (m2+p3)*1 + 16 */
        const vector signed short sum1A = vec_adds(srcP0A, srcP1A);
        const vector signed short sum1B = vec_adds(srcP0B, srcP1B);
        const vector signed short sum2A = vec_adds(srcM1A, srcP2A);
        const vector signed short sum2B = vec_adds(srcM1B, srcP2B);
        const vector signed short sum3A = vec_adds(srcM2A, srcP3A);
        const vector signed short sum3B = vec_adds(srcM2B, srcP3B);
        const vector signed short pp1A = vec_mladd(sum1A, v20ss, v16ss);
        const vector signed short pp1B = vec_mladd(sum1B, v20ss, v16ss);
        const vector signed short pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
        const vector signed short pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
        const vector signed short pp3A = vec_add(sum3A, pp1A);
        const vector signed short pp3B = vec_add(sum3B, pp1B);
        const vector signed short psumA = vec_sub(pp3A, pp2A);
        const vector signed short psumB = vec_sub(pp3B, pp2B);
        const vector signed short sumA = vec_sra(psumA, v5us);
        const vector signed short sumB = vec_sra(psumB, v5us);
        const vector unsigned char sum = vec_packsu(sumA, sumB);
        /* Unaligned-dst path: read both straddled lines, realign for the op,
         * rotate the result back and blend it in with the select mask. */
        const vector unsigned char dst1 = vec_ld(0, dst);
        const vector unsigned char dst2 = vec_ld(16, dst);
        const vector unsigned char vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));
        vector unsigned char fsum;
        OP_U8_ALTIVEC(fsum, sum, vdst);
        const vector unsigned char rsum = vec_perm(fsum, fsum, dstperm);
        const vector unsigned char fdst1 = vec_sel(dst1, rsum, dstmask);
        const vector unsigned char fdst2 = vec_sel(rsum, dst2, dstmask);
        vec_st(fdst1, 0, dst);
        vec_st(fdst2, 16, dst);
        src += srcStride;
        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
}