static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, int16_t *block, int stride, int size)
{
    vec_s16 dc16;
    vec_u8 dcplus, dcminus, v0, v1, v2, v3, aligner;
    LOAD_ZERO;
    DECLARE_ALIGNED(16, int, dc);
    int i;

    dc = (block[0] + 32) >> 6;
    block[0] = 0;
    dc16 = vec_splat((vec_s16) vec_lde(0, &dc), 1);

    if (size == 4)
        dc16 = vec_sld(dc16, zero_s16v, 8);
    dcplus  = vec_packsu(dc16, zero_s16v);
    dcminus = vec_packsu(vec_sub(zero_s16v, dc16), zero_s16v);

    aligner = vec_lvsr(0, dst);
    dcplus  = vec_perm(dcplus, dcplus, aligner);
    dcminus = vec_perm(dcminus, dcminus, aligner);

    for (i = 0; i < size; i += 4) {
        v0 = vec_ld(0, dst + 0*stride);
        v1 = vec_ld(0, dst + 1*stride);
        v2 = vec_ld(0, dst + 2*stride);
        v3 = vec_ld(0, dst + 3*stride);

        v0 = vec_adds(v0, dcplus);
        v1 = vec_adds(v1, dcplus);
        v2 = vec_adds(v2, dcplus);
        v3 = vec_adds(v3, dcplus);

        v0 = vec_subs(v0, dcminus);
        v1 = vec_subs(v1, dcminus);
        v2 = vec_subs(v2, dcminus);
        v3 = vec_subs(v3, dcminus);

        vec_st(v0, 0, dst + 0*stride);
        vec_st(v1, 0, dst + 1*stride);
        vec_st(v2, 0, dst + 2*stride);
        vec_st(v3, 0, dst + 3*stride);

        dst += 4*stride;
    }
}

void foo( float scalar )
{
    unsigned long width;
    unsigned long x;
    vector float vColor;
    vector unsigned int selectMask;

    /* Load the (possibly unaligned) scalar and rotate it into element 0 of vColor. */
    vColor = vec_perm( vec_ld( 0, &scalar ), vec_ld( 3, &scalar ), vec_lvsl( 0, &scalar ) );

    float *destRow;
    vector float store, load0;

    /* width, x, destRow, store, load0 and selectMask are assumed to be set up
       by the surrounding code; they are not initialized in this fragment. */
    for( ; x < width; x++ )
    {
        load0 = vec_sel( vColor, load0, selectMask );
        vec_st( store, 0, destRow );
        store = load0;
    }
}

static int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char perm1 = vec_lvsl(0, pix2);
    vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
    vector unsigned char pix2l, pix2r;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);
    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v:  pix1[0]-pix1[15]
           pix2v:  pix2[0]-pix2[15]
           pix2iv: pix2[1]-pix2[16] */
        pix1v  = vec_ld( 0, pix1);
        pix2l  = vec_ld( 0, pix2);
        pix2r  = vec_ld(16, pix2);
        pix2v  = vec_perm(pix2l, pix2r, perm1);
        pix2iv = vec_perm(pix2l, pix2r, perm2);

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

static void predict_16x16_h_altivec( uint8_t *src )
{
    for( int i = 0; i < 16; i++ )
    {
        vec_u8_t v = vec_ld(-1, src);
        vec_u8_t v_v = vec_splat(v, 15);
        vec_st(v_v, 0, src);

        src += FDEC_STRIDE;
    }
}

static void yuv2planeX_16_altivec(const int16_t *filter, int filterSize,
                                  const int16_t **src, uint8_t *dest,
                                  const uint8_t *dither, int offset, int x)
{
    register int i, j;
    LOCAL_ALIGNED(16, int, val, [16]);
    vector signed int vo1, vo2, vo3, vo4;
    vector unsigned short vs1, vs2;
    vector unsigned char vf;
    vector unsigned int altivec_vectorShiftInt19 =
        vec_add(vec_splat_u32(10), vec_splat_u32(9));

    for (i = 0; i < 16; i++)
        val[i] = dither[(x + i + offset) & 7] << 12;

    vo1 = vec_ld(0,  val);
    vo2 = vec_ld(16, val);
    vo3 = vec_ld(32, val);
    vo4 = vec_ld(48, val);

    for (j = 0; j < filterSize; j++) {
        unsigned int joffset = j << 1;
        unsigned int xoffset = x << 1;
        vector unsigned char perm;
        vector signed short l1, vLumFilter;
        LOAD_FILTER(vLumFilter, filter);
        vLumFilter = vec_splat(vLumFilter, 0);
        LOAD_L1(l1, src[j], perm);
        yuv2planeX_8(vo1, vo2, l1, src[j], x,     perm, vLumFilter);
        yuv2planeX_8(vo3, vo4, l1, src[j], x + 8, perm, vLumFilter);
    }

    vo1 = vec_sra(vo1, altivec_vectorShiftInt19);
    vo2 = vec_sra(vo2, altivec_vectorShiftInt19);
    vo3 = vec_sra(vo3, altivec_vectorShiftInt19);
    vo4 = vec_sra(vo4, altivec_vectorShiftInt19);
    vs1 = vec_packsu(vo1, vo2);
    vs2 = vec_packsu(vo3, vo4);
    vf  = vec_packsu(vs1, vs2);
    VEC_ST(vf, 0, dest);
}

void audio_convert_float_to_s16_altivec(int16_t *out, const float *in, size_t samples)
{
   // Unaligned loads/stores are a bit expensive, so we optimize for the good path (very likely).
   if (((uintptr_t)out & 15) + ((uintptr_t)in & 15) == 0)
   {
      size_t i;
      for (i = 0; i + 8 <= samples; i += 8, in += 8, out += 8)
      {
         vector float input0 = vec_ld( 0, in);
         vector float input1 = vec_ld(16, in);
         vector signed int result0 = vec_cts(input0, 15);
         vector signed int result1 = vec_cts(input1, 15);
         vec_st(vec_packs(result0, result1), 0, out);
      }

      audio_convert_float_to_s16_C(out, in, samples - i);
   }
   else
      audio_convert_float_to_s16_C(out, in, samples);
}

static inline void altivec_packIntArrayToCharArray(int *val, uint8_t *dest, int dstW)
{
    register int i;
    vector unsigned int altivec_vectorShiftInt19 =
        vec_add(vec_splat_u32(10), vec_splat_u32(9));
    if ((uintptr_t)dest % 16) {
        /* badly aligned store, we force store alignment */
        /* and will handle load misalignment on val w/ vec_perm */
        vector unsigned char perm1;
        vector signed int v1;
        for (i = 0 ; (i < dstW) && (((uintptr_t)dest + i) % 16) ; i++) {
            int t = val[i] >> 19;
            dest[i] = (t < 0) ? 0 : ((t > 255) ? 255 : t);
        }
        perm1 = vec_lvsl(i << 2, val);
        v1 = vec_ld(i << 2, val);
        for ( ; i < (dstW - 15); i += 16) {
            int offset = i << 2;
            vector signed int v2 = vec_ld(offset + 16, val);
            vector signed int v3 = vec_ld(offset + 32, val);
            vector signed int v4 = vec_ld(offset + 48, val);
            vector signed int v5 = vec_ld(offset + 64, val);
            vector signed int v12 = vec_perm(v1, v2, perm1);
            vector signed int v23 = vec_perm(v2, v3, perm1);
            vector signed int v34 = vec_perm(v3, v4, perm1);
            vector signed int v45 = vec_perm(v4, v5, perm1);
            vector signed int vA = vec_sra(v12, altivec_vectorShiftInt19);
            vector signed int vB = vec_sra(v23, altivec_vectorShiftInt19);
            vector signed int vC = vec_sra(v34, altivec_vectorShiftInt19);
            vector signed int vD = vec_sra(v45, altivec_vectorShiftInt19);
            vector unsigned short vs1 = vec_packsu(vA, vB);
            vector unsigned short vs2 = vec_packsu(vC, vD);
            vector unsigned char vf = vec_packsu(vs1, vs2);
            vec_st(vf, i, dest);
            v1 = v5;
        }
    } else { // dest is properly aligned, great
        /* aligned path: same >>19 shift, saturating pack and store,
           without the extra permute for load alignment */
        for (i = 0; i < (dstW - 15); i += 16) {
            int offset = i << 2;
            vector signed int v1 = vec_ld(offset,      val);
            vector signed int v2 = vec_ld(offset + 16, val);
            vector signed int v3 = vec_ld(offset + 32, val);
            vector signed int v4 = vec_ld(offset + 48, val);
            vector signed int v5 = vec_sra(v1, altivec_vectorShiftInt19);
            vector signed int v6 = vec_sra(v2, altivec_vectorShiftInt19);
            vector signed int v7 = vec_sra(v3, altivec_vectorShiftInt19);
            vector signed int v8 = vec_sra(v4, altivec_vectorShiftInt19);
            vector unsigned short vs1 = vec_packsu(v5, v6);
            vector unsigned short vs2 = vec_packsu(v7, v8);
            vector unsigned char vf = vec_packsu(vs1, vs2);
            vec_st(vf, i, dest);
        }
    }
    /* handle the remaining values with scalar code */
    for ( ; i < dstW ; i++) {
        int t = val[i] >> 19;
        dest[i] = (t < 0) ? 0 : ((t > 255) ? 255 : t);
    }
}

static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    LOAD_ZERO;
    vec_s16 *pv1 = (vec_s16*)v1;
    register vec_s16 muls = {mul,mul,mul,mul,mul,mul,mul,mul};
    register vec_s16 t0, t1, i0, i1, i4;
    register vec_s16 i2 = vec_ld(0, v2), i3 = vec_ld(0, v3);
    register vec_s32 res = zero_s32v;
    register vec_u8 align = vec_lvsl(0, v2);
    int32_t ires;

    order >>= 4;  /* 16 samples per iteration */
    do {
        i1 = vec_ld(16, v2);
        t0 = vec_perm(i2, i1, align);
        i2 = vec_ld(32, v2);
        t1 = vec_perm(i1, i2, align);
        i0 = pv1[0];
        i1 = pv1[1];
        res = vec_msum(t0, i0, res);
        res = vec_msum(t1, i1, res);
        i4 = vec_ld(16, v3);
        t0 = vec_perm(i3, i4, align);
        i3 = vec_ld(32, v3);
        t1 = vec_perm(i4, i3, align);
        pv1[0] = vec_mladd(t0, muls, i0);
        pv1[1] = vec_mladd(t1, muls, i1);
        pv1 += 2;
        /* advance by the 16 samples consumed per iteration */
        v2  += 16;
        v3  += 16;
    } while(--order);
    res = vec_splat(vec_sums(res, zero_s32v), 3);
    vec_ste(res, 0, &ires);
    return ires;
}

void float_to_int16_altivec(int16_t *dst, const float *src, int len)
{
    int i;
    vector float s0, s1;
    vector signed int t0, t1;
    vector signed short d0, d1, d;
    vector unsigned char align;

    if(((long)dst)&15) //FIXME
        for(i=0; i<len-7; i+=8) {
            s0 = vec_ld(0, src+i);
            s1 = vec_ld(16, src+i);
            t0 = vec_cts(s0, 0);
            d0 = vec_ld(0, dst+i);
            t1 = vec_cts(s1, 0);
            d1 = vec_ld(15, dst+i);
            d  = vec_packs(t0,t1);
            d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i));
            align = vec_lvsr(0, dst+i);
            d0 = vec_perm(d1, d, align);
            d1 = vec_perm(d, d1, align);
            vec_st(d0, 0, dst+i);
            vec_st(d1,15, dst+i);
        }
    else
        for(i=0; i<len-7; i+=8) {
            s0 = vec_ld(0, src+i);
            s1 = vec_ld(16, src+i);
            t0 = vec_cts(s0, 0);
            t1 = vec_cts(s1, 0);
            d  = vec_packs(t0,t1);
            vec_st(d, 0, dst+i);
        }
}

static void h264_idct_add_altivec(uint8_t *dst, int16_t *block, int stride)
{
    vec_s16 va0, va1, va2, va3;
    vec_s16 vz0, vz1, vz2, vz3;
    vec_s16 vtmp0, vtmp1, vtmp2, vtmp3;
    vec_u8 va_u8;
    vec_u32 va_u32;
    vec_s16 vdst_ss;
    const vec_u16 v6us = vec_splat_u16(6);
    vec_u8 vdst, vdst_orig;
    vec_u8 vdst_mask = vec_lvsl(0, dst);
    int element = ((unsigned long)dst & 0xf) >> 2;
    LOAD_ZERO;

    block[0] += 32;  /* add 32 as a DC-level for rounding */

    vtmp0 = vec_ld(0,block);
    vtmp1 = vec_sld(vtmp0, vtmp0, 8);
    vtmp2 = vec_ld(16,block);
    vtmp3 = vec_sld(vtmp2, vtmp2, 8);
    memset(block, 0, 16 * sizeof(int16_t));

    VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);
    VEC_TRANSPOSE_4(va0,va1,va2,va3,vtmp0,vtmp1,vtmp2,vtmp3);
    VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);

    va0 = vec_sra(va0,v6us);
    va1 = vec_sra(va1,v6us);
    va2 = vec_sra(va2,v6us);
    va3 = vec_sra(va3,v6us);

    VEC_LOAD_U8_ADD_S16_STORE_U8(va0);
    dst += stride;
    VEC_LOAD_U8_ADD_S16_STORE_U8(va1);
    dst += stride;
    VEC_LOAD_U8_ADD_S16_STORE_U8(va2);
    dst += stride;
    VEC_LOAD_U8_ADD_S16_STORE_U8(va3);
}

static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src, float mul, int len)
{
    union {
        vector float v;
        float s[4];
    } mul_u;
    int i;
    vector float src1, src2, dst1, dst2, mul_v, zero;
    zero = (vector float)vec_splat_u32(0);
    mul_u.s[0] = mul;
    mul_v = vec_splat(mul_u.v, 0);

    for(i=0; i<len; i+=8) {
        src1 = vec_ctf(vec_ld(0,  src+i), 0);
        src2 = vec_ctf(vec_ld(16, src+i), 0);
        dst1 = vec_madd(src1, mul_v, zero);
        dst2 = vec_madd(src2, mul_v, zero);
        vec_st(dst1,  0, dst+i);
        vec_st(dst2, 16, dst+i);
    }
}

static void sub_int16_altivec(int16_t * v1, int16_t * v2, int order)
{
    int i;
    register vec_s16_t vec, *pv;

    for(i = 0; i < order; i += 8){
        pv = (vec_s16_t*)v2;
        vec = vec_perm(pv[0], pv[1], vec_lvsl(0, v2));
        vec_st(vec_sub(vec_ld(0, v1), vec), 0, v1);
        v1 += 8;
        v2 += 8;
    }
}

static void vorbis_inverse_coupling_altivec(float *mag, float *ang,
                                            intptr_t blocksize)
{
    int i;
    vector float m, a;
    vector bool int t0, t1;
    const vector unsigned int v_31 = //XXX
        vec_add(vec_add(vec_splat_u32(15),vec_splat_u32(15)),vec_splat_u32(1));
    for (i = 0; i < blocksize; i += 4) {
        m = vec_ld(0, mag+i);
        a = vec_ld(0, ang+i);
        t0 = vec_cmple(m, (vector float)vec_splat_u32(0));
        t1 = vec_cmple(a, (vector float)vec_splat_u32(0));
        a = vec_xor(a, (vector float) vec_sl((vector unsigned int)t0, v_31));
        t0 = (vector bool int)vec_and(a, t1);
        t1 = (vector bool int)vec_andc(a, t1);
        a = vec_sub(m, (vector float)t1);
        m = vec_add(m, (vector float)t0);
        vec_stl(a, 0, ang+i);
        vec_stl(m, 0, mag+i);
    }
}

static int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix2);
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        vector unsigned char pix2l = vec_ld( 0, pix2);
        vector unsigned char pix2r = vec_ld(15, pix2);
        t1 = vec_ld(0, pix1);
        t2 = vec_perm(pix2l, pix2r, perm);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

static void vector_fmul_add_altivec(float *dst, const float *src0,
                                    const float *src1, const float *src2,
                                    int len)
{
    int i;
    vector float d, s0, s1, s2, t0, t1, edges;
    vector unsigned char align = vec_lvsr(0,dst),
                         mask  = vec_lvsl(0, dst);

    for (i=0; i<len-3; i+=4) {
        t0 = vec_ld(0, dst+i);
        t1 = vec_ld(15, dst+i);
        s0 = vec_ld(0, src0+i);
        s1 = vec_ld(0, src1+i);
        s2 = vec_ld(0, src2+i);
        edges = vec_perm(t1, t0, mask);
        d = vec_madd(s0,s1,s2);
        t1 = vec_perm(d, edges, align);
        t0 = vec_perm(edges, d, align);
        vec_st(t1, 15, dst+i);
        vec_st(t0, 0, dst+i);
    }
}

static void float_to_int16_altivec(int16_t *dst, const float *src, long len)
{
    int i;
    vector signed short d0, d1, d;
    vector unsigned char align;
    if(((long)dst)&15) //FIXME
        for(i=0; i<len-7; i+=8) {
            d0 = vec_ld(0, dst+i);
            d  = float_to_int16_one_altivec(src+i);
            d1 = vec_ld(15, dst+i);
            d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i));
            align = vec_lvsr(0, dst+i);
            d0 = vec_perm(d1, d, align);
            d1 = vec_perm(d, d1, align);
            vec_st(d0, 0, dst+i);
            vec_st(d1,15, dst+i);
        }
    else
        for(i=0; i<len-7; i+=8) {
            d = float_to_int16_one_altivec(src+i);
            vec_st(d, 0, dst+i);
        }
}

int main (int argc, const char * argv[])
{
    int i = 0;
    const float cf = 1.0;
    vector float v;
    const vector float cv = (vector float){1.0, 2.0, 3.0, 4.0};

    vec_dst(&cv, i, 0);
    v = vec_ld(0, &cv);
    v = vec_lde(0, &cf);
    vec_lvsl(0, &cf);

    return 0;
}

static void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride )
{
    vec_s16 s0, s1, s2, s3, s4, s5, s6, s7;
    vec_s16 d0, d1, d2, d3, d4, d5, d6, d7;
    vec_s16 idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7;

    vec_u8 perm_ldv = vec_lvsl(0, dst);
    vec_u8 perm_stv = vec_lvsr(8, dst);

    const vec_u16 onev = vec_splat_u16(1);
    const vec_u16 twov = vec_splat_u16(2);
    const vec_u16 sixv = vec_splat_u16(6);

    const vec_u8 sel = (vec_u8) { 0, 0, 0, 0, 0, 0, 0, 0,
                                  -1, -1, -1, -1, -1, -1, -1, -1 };
    LOAD_ZERO;

    dct[0] += 32; // rounding for the >>6 at the end

    s0 = vec_ld(0x00, (int16_t *)dct);
    s1 = vec_ld(0x10, (int16_t *)dct);
    s2 = vec_ld(0x20, (int16_t *)dct);
    s3 = vec_ld(0x30, (int16_t *)dct);
    s4 = vec_ld(0x40, (int16_t *)dct);
    s5 = vec_ld(0x50, (int16_t *)dct);
    s6 = vec_ld(0x60, (int16_t *)dct);
    s7 = vec_ld(0x70, (int16_t *)dct);

    IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7,
                     d0, d1, d2, d3, d4, d5, d6, d7);

    TRANSPOSE8( d0, d1, d2, d3, d4, d5, d6, d7 );

    IDCT8_1D_ALTIVEC(d0, d1, d2, d3, d4, d5, d6, d7,
                     idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7);

    ALTIVEC_STORE_SUM_CLIP(&dst[0*stride], idct0, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[1*stride], idct1, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[2*stride], idct2, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[3*stride], idct3, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[4*stride], idct4, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[5*stride], idct5, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[6*stride], idct6, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel);
}

static int a52_resample_STEREO_to_2_altivec(float * _f, int16_t * s16)
{
#if 0
    int i;
    int32_t * f = (int32_t *) _f;
    for (i = 0; i < 256; i++) {
        s16[2*i] = convert (f[i]);
        s16[2*i+1] = convert (f[i+256]);
    }
    return 2*256;
#else
    int i = 0;
    int32_t * f = (int32_t *) _f;
    register vector signed int f0, f4, f256, f260;
    register vector signed short reven, rodd, r0, r1;

    for (i = 0; i < 256; i+= 8) {
        f0   = vec_ld(0,    f);
        f4   = vec_ld(16,   f);
        f256 = vec_ld(1024, f);
        f260 = vec_ld(1040, f);

        reven = convert16_altivec(f0, f4);
        rodd  = convert16_altivec(f256, f260);

        r0 = vec_mergeh(reven, rodd);
        r1 = vec_mergel(reven, rodd);
        // FIXME can be merged to spare some I/O
        unaligned_store(r0, 0, s16);
        unaligned_store(r1, 16, s16);

        f += 8;
        s16 += 16;
    }
    return(2*256);
#endif
}

static void vector_fmul_window_altivec(float *dst, const float *src0,
                                       const float *src1, const float *win,
                                       float add_bias, int len)
{
    union {
        vector float v;
        float s[4];
    } vadd;
    vector float vadd_bias, zero, t0, t1, s0, s1, wi, wj;
    const vector unsigned char reverse = vcprm(3,2,1,0);
    int i, j;

    dst += len;
    win += len;
    src0+= len;

    vadd.s[0] = add_bias;
    vadd_bias = vec_splat(vadd.v, 0);
    zero = (vector float)vec_splat_u32(0);

    for(i=-len*4, j=len*4-16; i<0; i+=16, j-=16) {
        s0 = vec_ld(i, src0);
        s1 = vec_ld(j, src1);
        wi = vec_ld(i, win);
        wj = vec_ld(j, win);

        s1 = vec_perm(s1, s1, reverse);
        wj = vec_perm(wj, wj, reverse);

        t0 = vec_madd(s0, wj, vadd_bias);
        t0 = vec_nmsub(s1, wi, t0);
        t1 = vec_madd(s0, wi, vadd_bias);
        t1 = vec_madd(s1, wj, t1);
        t1 = vec_perm(t1, t1, reverse);

        vec_st(t0, i, dst);
        vec_st(t1, j, dst);
    }
}

float expf(float x) {
#else
float vexpf(float x) {
#endif
    vector float vexpa, va;//, vx, vn, va, vb, // v0, vlog2, vln2;
    register float exp, a;//, b, b2, b3, b4;//, b6, b8, b10, R0, R;
    float __attribute__((aligned(16))) xa[4];

    xa[0] = x;

    /*
    // set up a few constants
    vlog2 = vec_ld(0, &C_EXPF[0]);
    v0 = (vector float) vec_splat_u32(0);
    vln2 = vec_splat(vlog2, 1);
    vlog2 = vec_splat(vlog2, 0);

    // Load x into a vector float
    vx = vec_ld(0, xa);
    vx = vec_splat(vx, 0);

    // Split x = n*log2e + b
    vn = vec_madd(vx, vlog2e, v0);
    vn = vec_floor(vn);*/

    xa[0] = truncf(x*M_LOG2E);
    va = vec_ld(0, xa);
    vexpa = vec_expte(va);
    a = xa[0] * M_LN2;
    vec_st(vexpa, 0, xa);

    /*
    b = x - a;
    b2 = b*b;
    b3 = b2*b;
    b4 = b2*b2;
    b6 = b4*b2;
    b8 = b6*b2;
    b10 = b8*b2;
    R0 = 0.1666666666666666019037 *b2 - 0.00277777777770155933842 *b4 +
         6.61375632143793436117e-05 *b6 - 1.65339022054652515390e-06 *b8 +
         4.13813679705723846039e-08 *b10;
    R = b - R0;
    //exp = 1.0 + 2.0*b/(2.0 - R);
    exp = (1680.0 + 840*b + 180*b2 + 20*b3 + b4)/(1680 - 840*b + 180*b2 - 20*b3 + b4);
    */

    exp = xa[0];

    return exp;
}

void imageFilterSubFrom_Altivec(unsigned char *dst, unsigned char *src, int length)
{
    int n = length;

    // Compute first few values so we're on a 16-byte boundary in dst
    while( (((long)dst & 0xF) > 0) && (n > 0) ) {
        SUBFROM_PIXEL();
        --n; ++dst; ++src;
    }

    // Do bulk of processing using Altivec (sub 16 8-bit unsigned integers, with saturation)
    while(n >= 16) {
        vector unsigned char s = vec_ld(0,src);
        vector unsigned char d = vec_ld(0,dst);
        vector unsigned char r = vec_subs(d, s);
        vec_st(r,0,dst);
        n -= 16; src += 16; dst += 16;
    }

    // If any bytes are left over, deal with them individually
    ++n;
    BASIC_SUBFROM();
}

static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
                                                    const int16_t *v2,
                                                    const int16_t *v3,
                                                    int order, int mul)
{
    LOAD_ZERO;
    vec_s16 *pv1 = (vec_s16 *) v1;
    register vec_s16 muls = { mul, mul, mul, mul, mul, mul, mul, mul };
    register vec_s16 t0, t1, i0, i1, i4, i2, i3;
    register vec_s32 res = zero_s32v;
#if HAVE_BIGENDIAN
    register vec_u8 align = vec_lvsl(0, v2);
    i2 = vec_ld(0, v2);
    i3 = vec_ld(0, v3);
#endif
    int32_t ires;

    order >>= 4;
    do {
        GET_T(t0,t1,v2,i1,i2);
        i0     = pv1[0];
        i1     = pv1[1];
        res    = vec_msum(t0, i0, res);
        res    = vec_msum(t1, i1, res);
        GET_T(t0,t1,v3,i4,i3);
        pv1[0] = vec_mladd(t0, muls, i0);
        pv1[1] = vec_mladd(t1, muls, i1);
        pv1   += 2;
        v2    += 16;
        v3    += 16;
    } while (--order);
    res = vec_splat(vec_sums(res, zero_s32v), 3);
    vec_ste(res, 0, &ires);

    return ires;
}

/* Place the content of the array of structures in vectors x_vec, y_vec, z_vec, and t_vec */

/* p_motion: assumed to be a 16-byte-aligned array of structures, each holding
   four floats (x, y, z, t); it is declared outside this fragment. */

int main(int argc, char **argv)
{
    vector float x_vec, y_vec, z_vec, t_vec, hold[4], tmp[4];

    /* Load structures into vectors */
    hold[0] = vec_ld(0, (float*)p_motion);
    hold[1] = vec_ld(0, (float*)&p_motion[1]);
    hold[2] = vec_ld(0, (float*)&p_motion[2]);
    hold[3] = vec_ld(0, (float*)&p_motion[3]);

    /* Perform first step of the swizzle */
    tmp[0] = vec_mergeh(hold[0], hold[2]);
    tmp[1] = vec_mergeh(hold[1], hold[3]);
    tmp[2] = vec_mergel(hold[0], hold[2]);
    tmp[3] = vec_mergel(hold[1], hold[3]);

    /* Perform second step of the swizzle */
    x_vec = vec_mergeh(tmp[0], tmp[1]);
    y_vec = vec_mergel(tmp[0], tmp[1]);
    z_vec = vec_mergeh(tmp[2], tmp[3]);
    t_vec = vec_mergel(tmp[2], tmp[3]);

    return 0;
}

void x264_add8x8_idct8_altivec( uint8_t *dst, int16_t dct[64] )
{
    vec_u16_t onev = vec_splat_u16(1);
    vec_u16_t twov = vec_splat_u16(2);

    dct[0] += 32; // rounding for the >>6 at the end

    vec_s16_t s0, s1, s2, s3, s4, s5, s6, s7;
    s0 = vec_ld(0x00, dct);
    s1 = vec_ld(0x10, dct);
    s2 = vec_ld(0x20, dct);
    s3 = vec_ld(0x30, dct);
    s4 = vec_ld(0x40, dct);
    s5 = vec_ld(0x50, dct);
    s6 = vec_ld(0x60, dct);
    s7 = vec_ld(0x70, dct);

    vec_s16_t d0, d1, d2, d3, d4, d5, d6, d7;
    IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7,
                     d0, d1, d2, d3, d4, d5, d6, d7);

    vec_s16_t tr0, tr1, tr2, tr3, tr4, tr5, tr6, tr7;
    VEC_TRANSPOSE_8( d0, d1, d2, d3, d4, d5, d6, d7,
                     tr0, tr1, tr2, tr3, tr4, tr5, tr6, tr7);

    vec_s16_t idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7;
    IDCT8_1D_ALTIVEC(tr0, tr1, tr2, tr3, tr4, tr5, tr6, tr7,
                     idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7);

    vec_u8_t perm_ldv = vec_lvsl(0, dst);
    vec_u8_t perm_stv = vec_lvsr(8, dst);
    vec_u16_t sixv = vec_splat_u16(6);
    const vec_u8_t sel = (vec_u8_t) CV(0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1);
    LOAD_ZERO;

    ALTIVEC_STORE_SUM_CLIP(&dst[0*FDEC_STRIDE], idct0, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[1*FDEC_STRIDE], idct1, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[2*FDEC_STRIDE], idct2, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[3*FDEC_STRIDE], idct3, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[4*FDEC_STRIDE], idct4, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[5*FDEC_STRIDE], idct5, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[6*FDEC_STRIDE], idct6, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[7*FDEC_STRIDE], idct7, perm_ldv, perm_stv, sel);
}

void b()
{
    z = vec_add (x, y);

    /* Make sure the predicates accept correct argument types.  */

    int1 = vec_all_in (f, g);
    int1 = vec_all_ge (f, g);
    int1 = vec_all_eq (c, d);
    int1 = vec_all_ne (s, t);
    int1 = vec_any_eq (i, j);
    int1 = vec_any_ge (f, g);
    int1 = vec_all_ngt (f, g);
    int1 = vec_any_ge (c, d);
    int1 = vec_any_ge (s, t);
    int1 = vec_any_ge (i, j);
    int1 = vec_any_ge (c, d);
    int1 = vec_any_ge (s, t);
    int1 = vec_any_ge (i, j);

    vec_mtvscr (i);
    vec_dssall ();
    s = (vector signed short) vec_mfvscr ();
    vec_dss (3);

    vec_dst (pi, int1 + int2, 3);
    vec_dstst (pi, int1 + int2, 3);
    vec_dststt (pi, int1 + int2, 3);
    vec_dstt (pi, int1 + int2, 3);

    uc = (vector unsigned char) vec_lvsl (int1 + 69, (signed int *) pi);
    uc = (vector unsigned char) vec_lvsr (int1 + 69, (signed int *) pi);

    c = vec_lde (int1, (signed char *) pi);
    s = vec_lde (int1, (signed short *) pi);
    i = vec_lde (int1, (signed int *) pi);
    i = vec_ldl (int1, pi);
    i = vec_ld (int1, pi);

    vec_st (i, int2, pi);
    vec_ste (c, int2, (signed char *) pi);
    vec_ste (s, int2, (signed short *) pi);
    vec_ste (i, int2, (signed int *) pi);
    vec_stl (i, int2, pi);
}

static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
                                           int order)
{
    int i;
    LOAD_ZERO;
    register vec_s16 vec1;
    register vec_s32 res = vec_splat_s32(0), t;
    int32_t ires;

    for(i = 0; i < order; i += 8){
        vec1 = vec_unaligned_load(v1);
        t    = vec_msum(vec1, vec_ld(0, v2), zero_s32v);
        res  = vec_sums(t, res);
        v1  += 8;
        v2  += 8;
    }
    res = vec_splat(res, 3);
    vec_ste(res, 0, &ires);

    return ires;
}

/* next one assumes that ((line_size % 16) == 0) */
void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    register vector unsigned char pixelsv1, pixelsv2;
    register vector unsigned char pixelsv1B, pixelsv2B;
    register vector unsigned char pixelsv1C, pixelsv2C;
    register vector unsigned char pixelsv1D, pixelsv2D;

    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;
    register ptrdiff_t line_size_2 = line_size << 1;
    register ptrdiff_t line_size_3 = line_size + line_size_2;
    register ptrdiff_t line_size_4 = line_size << 2;

    // hand-unrolling the loop by 4 gains about 15%
    // minimum execution time goes from 74 to 60 cycles
    // it's faster than -funroll-loops, but using
    // -funroll-loops w/ this is bad - 74 cycles again.
    // all this is on a 7450, tuning for the 7450
    for (i = 0; i < h; i += 4) {
        pixelsv1  = vec_ld( 0, pixels);
        pixelsv2  = vec_ld(15, pixels);
        pixelsv1B = vec_ld(line_size, pixels);
        pixelsv2B = vec_ld(15 + line_size, pixels);
        pixelsv1C = vec_ld(line_size_2, pixels);
        pixelsv2C = vec_ld(15 + line_size_2, pixels);
        pixelsv1D = vec_ld(line_size_3, pixels);
        pixelsv2D = vec_ld(15 + line_size_3, pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
               line_size, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
               line_size_2, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
               line_size_3, (unsigned char*)block);
        pixels += line_size_4;
        block  += line_size_4;
    }
}

static void put_vp8_pixels16_altivec(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
                                     ptrdiff_t sstride, int h, int mx, int my)
{
    register vector unsigned char pixelsv1, pixelsv2;
    register vector unsigned char pixelsv1B, pixelsv2B;
    register vector unsigned char pixelsv1C, pixelsv2C;
    register vector unsigned char pixelsv1D, pixelsv2D;

    register vector unsigned char perm = vec_lvsl(0, src);
    int i;
    register ptrdiff_t dstride2 = dstride << 1,       sstride2 = sstride << 1;
    register ptrdiff_t dstride3 = dstride2 + dstride, sstride3 = sstride + sstride2;
    register ptrdiff_t dstride4 = dstride << 2,       sstride4 = sstride << 2;

    // hand-unrolling the loop by 4 gains about 15%
    // minimum execution time goes from 74 to 60 cycles
    // it's faster than -funroll-loops, but using
    // -funroll-loops w/ this is bad - 74 cycles again.
    // all this is on a 7450, tuning for the 7450
    for (i = 0; i < h; i += 4) {
        pixelsv1  = vec_ld( 0, src);
        pixelsv2  = vec_ld(15, src);
        pixelsv1B = vec_ld(sstride, src);
        pixelsv2B = vec_ld(15 + sstride, src);
        pixelsv1C = vec_ld(sstride2, src);
        pixelsv2C = vec_ld(15 + sstride2, src);
        pixelsv1D = vec_ld(sstride3, src);
        pixelsv2D = vec_ld(15 + sstride3, src);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)dst);
        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
               dstride, (unsigned char*)dst);
        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
               dstride2, (unsigned char*)dst);
        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
               dstride3, (unsigned char*)dst);
        src += sstride4;
        dst += dstride4;
    }
}

static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) {
        register vector unsigned char p2 = vec_ld(-3*stride, pix);
        register vector unsigned char p1 = vec_ld(-2*stride, pix);
        register vector unsigned char p0 = vec_ld(-1*stride, pix);
        register vector unsigned char q0 = vec_ld(0, pix);
        register vector unsigned char q1 = vec_ld(stride, pix);
        register vector unsigned char q2 = vec_ld(2*stride, pix);
        h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0);
        vec_st(p1, -2*stride, pix);
        vec_st(p0, -1*stride, pix);
        vec_st(q0, 0, pix);
        vec_st(q1, stride, pix);
    }
}