int main ()
{
  *vecfloat++ = vec_andc(vecint[0], vecfloat[1]);
  *vecfloat++ = vec_andc(vecfloat[0], vecint[1]);
  *vecfloat++ = vec_vxor(vecint[0], vecfloat[1]);
  *vecfloat++ = vec_vxor(vecfloat[0], vecint[1]);
  *varpixel++ = vec_packpx(vecuint[0], vecuint[1]);
  *varpixel++ = vec_vpkpx(vecuint[0], vecuint[1]);
  *vecshort++ = vec_vmulosb(vecchar[0], vecchar[1]);
  *vecint++ = vec_ld(var_int[0], longp[1]);
  *vecint++ = vec_lde(var_int[0], longp[1]);
  *vecint++ = vec_ldl(var_int[0], longp[1]);
  *vecint++ = vec_lvewx(var_int[0], longp[1]);
  *vecint++ = vec_unpackh(vecshort[0]);
  *vecint++ = vec_unpackl(vecshort[0]);
  *vecushort++ = vec_andc(vecshort[0], vecushort[1]);
  *vecushort++ = vec_andc(vecushort[0], vecshort[1]);
  *vecushort++ = vec_vxor(vecshort[0], vecushort[1]);
  *vecushort++ = vec_vxor(vecushort[0], vecshort[1]);
  *vecuint++ = vec_ld(var_int[0], ulongp[1]);
  *vecuint++ = vec_lvx(var_int[0], ulongp[1]);
  *vecuint++ = vec_vmsumubm(vecuchar[0], vecuchar[1], vecuint[2]);
  *vecuchar++ = vec_xor(vecuchar[0], vecchar[1]);
  return 0;
}
void b()
{
  z = vec_add (x, y);

  /* Make sure the predicates accept correct argument types.  */
  int1 = vec_all_in (f, g);
  int1 = vec_all_ge (f, g);
  int1 = vec_all_eq (c, d);
  int1 = vec_all_ne (s, t);
  int1 = vec_any_eq (i, j);
  int1 = vec_any_ge (f, g);
  int1 = vec_all_ngt (f, g);
  int1 = vec_any_ge (c, d);
  int1 = vec_any_ge (s, t);
  int1 = vec_any_ge (i, j);

  vec_mtvscr (i);
  vec_dssall ();
  s = (vector signed short) vec_mfvscr ();
  vec_dss (3);

  vec_dst (pi, int1 + int2, 3);
  vec_dstst (pi, int1 + int2, 3);
  vec_dststt (pi, int1 + int2, 3);
  vec_dstt (pi, int1 + int2, 3);

  uc = (vector unsigned char) vec_lvsl (int1 + 69, (signed int *) pi);
  uc = (vector unsigned char) vec_lvsr (int1 + 69, (signed int *) pi);

  c = vec_lde (int1, (signed char *) pi);
  s = vec_lde (int1, (signed short *) pi);
  i = vec_lde (int1, (signed int *) pi);
  i = vec_ldl (int1, pi);
  i = vec_ld (int1, pi);

  vec_st (i, int2, pi);
  vec_ste (c, int2, (signed char *) pi);
  vec_ste (s, int2, (signed short *) pi);
  vec_ste (i, int2, (signed int *) pi);
  vec_stl (i, int2, pi);
}
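/* A minimal usage sketch, assuming only <altivec.h>: the vec_all_* and
 * vec_any_* predicates exercised above return a plain int, so they can
 * drive ordinary control flow. The helper name compare_all_vs_any is
 * hypothetical. */
#include <altivec.h>

static int compare_all_vs_any(vector float f, vector float g)
{
    if (vec_all_ge(f, g))
        return 2;           /* every f[i] >= g[i]        */
    if (vec_any_ge(f, g))
        return 1;           /* at least one f[i] >= g[i] */
    return 0;               /* every f[i] < g[i]         */
}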
void f22()
{
  *var_vec_u32++ = vec_ld(var_int[0], var_unsigned_long_ptr[1]);
  *var_vec_u32++ = vec_lde(var_int[0], var_unsigned_long_ptr[1]);
  *var_vec_u32++ = vec_ldl(var_int[0], var_unsigned_long_ptr[1]);
  *var_vec_u32++ = vec_lvewx(var_int[0], var_unsigned_long_ptr[1]);
  *var_vec_u32++ = vec_lvx(var_int[0], var_unsigned_long_ptr[1]);
  *var_vec_u32++ = vec_lvxl(var_int[0], var_unsigned_long_ptr[1]);
}
void f13()
{
  *var_vec_s32++ = vec_ld(var_int[0], var_long_ptr[1]);
  *var_vec_s32++ = vec_lde(var_int[0], var_long_ptr[1]);
  *var_vec_s32++ = vec_ldl(var_int[0], var_long_ptr[1]);
  *var_vec_s32++ = vec_lvewx(var_int[0], var_long_ptr[1]);
  *var_vec_s32++ = vec_lvx(var_int[0], var_long_ptr[1]);
  *var_vec_s32++ = vec_lvxl(var_int[0], var_long_ptr[1]);
}
static void test ()
{
  vector unsigned char vuc;
  vector signed char vsc;
  vector unsigned short vus;
  vector signed short vss;
  vector unsigned int vui;
  vector signed int vsi;
  vector float vf;

  init ();

  vuc = vec_lde (9*1, (unsigned char *)svuc);
  vsc = vec_lde (14*1, (signed char *)svsc);
  vus = vec_lde (7*2, (unsigned short *)svus);
  vss = vec_lde (1*2, (signed short *)svss);
  vui = vec_lde (3*4, (unsigned int *)svui);
  vsi = vec_lde (2*4, (signed int *)svsi);
  vf  = vec_lde (0*4, (float *)svf);

  check (vec_extract (vuc, 9) == 9, "vuc");
  check (vec_extract (vsc, 14) == 6, "vsc");
  check (vec_extract (vus, 7) == 7, "vus");
  check (vec_extract (vss, 1) == -3, "vss");
  check (vec_extract (vui, 3) == 3, "vui");
  check (vec_extract (vsi, 2) == 0, "vsi");
  check (vec_extract (vf, 0) == 0.0, "vf");
}
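/* A minimal sketch (hypothetical helper, assuming only <altivec.h>) of the
 * vec_lde contract the checks above rely on: vec_lde loads one element into
 * the vector slot that matches the element's address modulo 16 and leaves
 * the other slots undefined, so vec_extract must use that same slot index. */
#include <altivec.h>

static unsigned int load_third_word(void)
{
    static unsigned int data[4] __attribute__((aligned(16))) = { 10, 11, 12, 13 };

    /* data[2] sits at byte offset 8 of its 16-byte block, so the loaded
     * word lands in element 2 of the vector. */
    vector unsigned int v = vec_lde(2 * 4, data);
    return vec_extract(v, 2);   /* 12 */
}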
int main (int argc, const char * argv[])
{
  int i = 0;  /* initialized: it is passed to vec_dst below */
  const float cf = 1.0;
  vector float v;
  const vector float cv = (vector float){1.0, 2.0, 3.0, 4.0};

  vec_dst(&cv, i, 0);
  v = vec_ld(0, &cv);
  v = vec_lde(0, &cf);
  vec_lvsl(0, &cf);
  return 0;
}
static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, int16_t *block,
                                                       int stride, int size)
{
    vec_s16 dc16;
    vec_u8 dcplus, dcminus, v0, v1, v2, v3, aligner;
    vec_s32 v_dc32;
    LOAD_ZERO;
    DECLARE_ALIGNED(16, int, dc);
    int i;

    dc = (block[0] + 32) >> 6;
    block[0] = 0;
    v_dc32 = vec_lde(0, &dc);
    dc16 = VEC_SPLAT16((vec_s16) v_dc32, 1);

    if (size == 4)
        dc16 = VEC_SLD16(dc16, zero_s16v, 8);
    dcplus  = vec_packsu(dc16, zero_s16v);
    dcminus = vec_packsu(vec_sub(zero_s16v, dc16), zero_s16v);

    aligner = vec_lvsr(0, dst);
#if !HAVE_BIGENDIAN
    aligner = vec_perm(aligner, zero_u8v, vcswapc());
#endif
    dcplus  = vec_perm(dcplus, dcplus, aligner);
    dcminus = vec_perm(dcminus, dcminus, aligner);

    for (i = 0; i < size; i += 4) {
        v0 = vec_ld(0, dst + 0 * stride);
        v1 = vec_ld(0, dst + 1 * stride);
        v2 = vec_ld(0, dst + 2 * stride);
        v3 = vec_ld(0, dst + 3 * stride);

        v0 = vec_adds(v0, dcplus);
        v1 = vec_adds(v1, dcplus);
        v2 = vec_adds(v2, dcplus);
        v3 = vec_adds(v3, dcplus);

        v0 = vec_subs(v0, dcminus);
        v1 = vec_subs(v1, dcminus);
        v2 = vec_subs(v2, dcminus);
        v3 = vec_subs(v3, dcminus);

        vec_st(v0, 0, dst + 0 * stride);
        vec_st(v1, 0, dst + 1 * stride);
        vec_st(v2, 0, dst + 2 * stride);
        vec_st(v3, 0, dst + 3 * stride);

        dst += 4 * stride;
    }
}
static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, int16_t *block,
                                                       int stride, int size)
{
    vec_s16 dc16;
    vec_u8 dcplus, dcminus, v0, v1, v2, v3, aligner;
    LOAD_ZERO;
    DECLARE_ALIGNED(16, int, dc);
    int i;

    dc = (block[0] + 32) >> 6;
    block[0] = 0;
    dc16 = vec_splat((vec_s16) vec_lde(0, &dc), 1);

    if (size == 4)
        dc16 = vec_sld(dc16, zero_s16v, 8);
    dcplus  = vec_packsu(dc16, zero_s16v);
    dcminus = vec_packsu(vec_sub(zero_s16v, dc16), zero_s16v);

    aligner = vec_lvsr(0, dst);
    dcplus  = vec_perm(dcplus, dcplus, aligner);
    dcminus = vec_perm(dcminus, dcminus, aligner);

    for (i = 0; i < size; i += 4) {
        v0 = vec_ld(0, dst + 0 * stride);
        v1 = vec_ld(0, dst + 1 * stride);
        v2 = vec_ld(0, dst + 2 * stride);
        v3 = vec_ld(0, dst + 3 * stride);

        v0 = vec_adds(v0, dcplus);
        v1 = vec_adds(v1, dcplus);
        v2 = vec_adds(v2, dcplus);
        v3 = vec_adds(v3, dcplus);

        v0 = vec_subs(v0, dcminus);
        v1 = vec_subs(v1, dcminus);
        v2 = vec_subs(v2, dcminus);
        v3 = vec_subs(v3, dcminus);

        vec_st(v0, 0, dst + 0 * stride);
        vec_st(v1, 0, dst + 1 * stride);
        vec_st(v2, 0, dst + 2 * stride);
        vec_st(v3, 0, dst + 3 * stride);

        dst += 4 * stride;
    }
}
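/* A scalar reference for the saturation trick in the two
 * h264_idct_dc_add_internal variants above — a sketch, not FFmpeg code:
 * the signed dc is split into non-negative "plus" and "minus" parts so that
 * unsigned saturating add/sub clamp each pixel to [0, 255] without branches.
 * (vec_packsu also saturates the 16-bit dc to that range while packing.) */
#include <stdint.h>

static uint8_t add_dc_scalar(uint8_t pixel, int dc)
{
    int plus  = dc > 0 ?  dc : 0;   /* dcplus  = vec_packsu(dc16, 0)     */
    int minus = dc < 0 ? -dc : 0;   /* dcminus = vec_packsu(0 - dc16, 0) */
    int v = pixel + plus;           /* vec_adds: saturating add          */
    if (v > 255)
        v = 255;
    v -= minus;                     /* vec_subs: saturating subtract     */
    if (v < 0)
        v = 0;
    return (uint8_t) v;
}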
/* AltiVec-enhanced gmc1. ATM this code assumes stride is a multiple of 8
 * to preserve proper dst alignment. */
void ff_gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align 1 */,
                     int stride, int h, int x16, int y16, int rounder)
{
    int i;
    const DECLARE_ALIGNED(16, unsigned short, rounder_a) = rounder;
    const DECLARE_ALIGNED(16, unsigned short, ABCD)[8] = {
        (16 - x16) * (16 - y16), /* A */
        (x16)      * (16 - y16), /* B */
        (16 - x16) * (y16),      /* C */
        (x16)      * (y16),      /* D */
        0, 0, 0, 0               /* padding */
    };
    register const vector unsigned char vczero =
        (const vector unsigned char) vec_splat_u8(0);
    register const vector unsigned short vcsr8 =
        (const vector unsigned short) vec_splat_u16(8);
    register vector unsigned char dstv, dstv2, srcvB, srcvC, srcvD;
    register vector unsigned short tempB, tempC, tempD;
    unsigned long dst_odd = (unsigned long) dst & 0x0000000F;
    unsigned long src_really_odd = (unsigned long) src & 0x0000000F;
    register vector unsigned short tempA =
        vec_ld(0, (const unsigned short *) ABCD);
    register vector unsigned short Av = vec_splat(tempA, 0);
    register vector unsigned short Bv = vec_splat(tempA, 1);
    register vector unsigned short Cv = vec_splat(tempA, 2);
    register vector unsigned short Dv = vec_splat(tempA, 3);
    register vector unsigned short rounderV =
        vec_splat((vec_u16) vec_lde(0, &rounder_a), 0);

    /* We'll be able to pick up our 9 char elements at src from those
     * 32 bytes. We load the first batch here, as inside the loop we can
     * reuse 'src + stride' from one iteration as the 'src' of the next. */
    register vector unsigned char src_0 = vec_ld(0, src);
    register vector unsigned char src_1 = vec_ld(16, src);
    register vector unsigned char srcvA = vec_perm(src_0, src_1, vec_lvsl(0, src));

    if (src_really_odd != 0x0000000F)
        /* If (src & 0xF) == 0xF, then (src + 1) is properly aligned
         * on the second vector. */
        srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src));
    else
        srcvB = src_1;
    srcvA = vec_mergeh(vczero, srcvA);
    srcvB = vec_mergeh(vczero, srcvB);

    for (i = 0; i < h; i++) {
        dst_odd        = (unsigned long) dst & 0x0000000F;
        src_really_odd = (((unsigned long) src) + stride) & 0x0000000F;
        dstv = vec_ld(0, dst);

        /* We'll be able to pick up our 9 char elements at src + stride from
         * those 32 bytes, then reuse the resulting 2 vectors srcvC and srcvD
         * as the next srcvA and srcvB. */
        src_0 = vec_ld(stride +  0, src);
        src_1 = vec_ld(stride + 16, src);
        srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src));

        if (src_really_odd != 0x0000000F)
            /* If (src & 0xF) == 0xF, then (src + 1) is properly aligned
             * on the second vector. */
            srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src));
        else
            srcvD = src_1;

        srcvC = vec_mergeh(vczero, srcvC);
        srcvD = vec_mergeh(vczero, srcvD);

        /* OK, now we (finally) do the math :-)
         * Those four instructions replace 32 int muls & 32 int adds.
         * Isn't AltiVec nice? */
        tempA = vec_mladd((vector unsigned short) srcvA, Av, rounderV);
        tempB = vec_mladd((vector unsigned short) srcvB, Bv, tempA);
        tempC = vec_mladd((vector unsigned short) srcvC, Cv, tempB);
        tempD = vec_mladd((vector unsigned short) srcvD, Dv, tempC);

        srcvA = srcvC;
        srcvB = srcvD;

        tempD = vec_sr(tempD, vcsr8);

        dstv2 = vec_pack(tempD, (vector unsigned short) vczero);

        if (dst_odd)
            dstv2 = vec_perm(dstv, dstv2, vcprm(0, 1, s0, s1));
        else
            dstv2 = vec_perm(dstv, dstv2, vcprm(s0, s1, 2, 3));

        vec_st(dstv2, 0, dst);

        dst += stride;
        src += stride;
    }
}
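/* A minimal sketch (hypothetical helper, assuming only <altivec.h>) of the
 * unaligned-load idiom ff_gmc1_altivec builds on: vec_ld silently rounds the
 * effective address down to 16 bytes, so two aligned loads plus a
 * vec_lvsl-generated permute recover the 16 bytes starting at p. Note it can
 * read up to 31 bytes past p, so the buffer needs padding; the code above
 * also special-cases (src & 0xF) == 0xF, where src + 1 is itself aligned and
 * the permute index would wrap back to the first block. */
#include <altivec.h>

static vector unsigned char load_unaligned(const unsigned char *p)
{
    vector unsigned char lo   = vec_ld(0,  p);  /* block containing p        */
    vector unsigned char hi   = vec_ld(16, p);  /* the following block       */
    vector unsigned char mask = vec_lvsl(0, p); /* shift pattern from p & 15 */
    return vec_perm(lo, hi, mask);
}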
/* AltiVec version of dct_unquantize_h263.
   This code assumes `block' is 16-byte aligned. */
static void dct_unquantize_h263_altivec(MpegEncContext *s,
                                        DCTELEM *block, int n, int qscale)
{
    int i, level, qmul, qadd;
    int nCoeffs;

    assert(s->block_last_index[n] >= 0);

    qadd = (qscale - 1) | 1;
    qmul = qscale << 1;

    if (s->mb_intra) {
        if (!s->h263_aic) {
            if (n < 4)
                block[0] = block[0] * s->y_dc_scale;
            else
                block[0] = block[0] * s->c_dc_scale;
        } else {
            qadd = 0;
        }
        i = 1;
        nCoeffs = 63; // does not always use the zigzag table
    } else {
        i = 0;
        nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]];
    }

    {
        register const vector signed short vczero =
            (const vector signed short) vec_splat_s16(0);
        DECLARE_ALIGNED(16, short, qmul8) = qmul;
        DECLARE_ALIGNED(16, short, qadd8) = qadd;
        register vector signed short blockv, qmulv, qaddv, nqaddv, temp1;
        register vector bool short blockv_null, blockv_neg;
        register short backup_0 = block[0];
        register int j = 0;

        qmulv  = vec_splat((vec_s16) vec_lde(0, &qmul8), 0);
        qaddv  = vec_splat((vec_s16) vec_lde(0, &qadd8), 0);
        nqaddv = vec_sub(vczero, qaddv);

#if 0 // block *is* 16-byte aligned, it seems.
        // First make sure block[j] is 16-byte aligned.
        for (j = 0; (j <= nCoeffs) &&
             ((((unsigned long) block) + (j << 1)) & 0x0000000F); j++) {
            level = block[j];
            if (level) {
                if (level < 0) {
                    level = level * qmul - qadd;
                } else {
                    level = level * qmul + qadd;
                }
                block[j] = level;
            }
        }
#endif

        // Vectorize all the 16-byte aligned blocks of 8 elements.
        for (; (j + 7) <= nCoeffs; j += 8) {
            blockv = vec_ld(j << 1, block);
            blockv_neg  = vec_cmplt(blockv, vczero);
            blockv_null = vec_cmpeq(blockv, vczero);
            // Choose between +qadd and -qadd as the third operand.
            temp1 = vec_sel(qaddv, nqaddv, blockv_neg);
            // Multiply & add (block[j..j+7] * qmul [+-] qadd).
            temp1 = vec_mladd(blockv, qmulv, temp1);
            // Put 0 where block[j..j+7] used to have 0.
            blockv = vec_sel(temp1, blockv, blockv_null);
            vec_st(blockv, j << 1, block);
        }

        // If nCoeffs isn't a multiple of 8, finish the job using good old
        // scalar units. (We could do it with a truncated vector, but I'm
        // not sure it's worth the hassle.)
        for (; j <= nCoeffs; j++) {
            level = block[j];
            if (level) {
                if (level < 0) {
                    level = level * qmul - qadd;
                } else {
                    level = level * qmul + qadd;
                }
                block[j] = level;
            }
        }

        if (i == 1) {
            // Cheat: this avoids special-casing the first iteration.
            block[0] = backup_0;
        }
    }
}
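/* A scalar sketch (hypothetical helper, not FFmpeg code) of the branch-free
 * update the vectorized loop above performs on eight coefficients at once
 * with vec_cmplt / vec_cmpeq / vec_sel / vec_mladd. */
static short unquantize_one(short level, short qmul, short qadd)
{
    short addend = (level < 0) ? -qadd : qadd; /* vec_sel(qaddv, nqaddv, blockv_neg)  */
    short v = level * qmul + addend;           /* vec_mladd(blockv, qmulv, temp1)     */
    return level ? v : 0;                      /* vec_sel(temp1, blockv, blockv_null) */
}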
void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align 1 */,
                  int stride, int h, int x16, int y16, int rounder)
{
    POWERPC_PERF_DECLARE(altivec_gmc1_num, GMC1_PERF_COND);
    const DECLARE_ALIGNED_16(unsigned short, rounder_a) = rounder;
    const DECLARE_ALIGNED_16(unsigned short, ABCD)[8] = {
        (16-x16)*(16-y16), /* A */
        (   x16)*(16-y16), /* B */
        (16-x16)*(   y16), /* C */
        (   x16)*(   y16), /* D */
        0, 0, 0, 0         /* padding */
    };
    register const vector unsigned char vczero =
        (const vector unsigned char) vec_splat_u8(0);
    register const vector unsigned short vcsr8 =
        (const vector unsigned short) vec_splat_u16(8);
    register vector unsigned char dstv, dstv2, src_0, src_1,
                                  srcvA, srcvB, srcvC, srcvD;
    register vector unsigned short Av, Bv, Cv, Dv, rounderV,
                                   tempA, tempB, tempC, tempD;
    int i;
    unsigned long dst_odd = (unsigned long) dst & 0x0000000F;
    unsigned long src_really_odd = (unsigned long) src & 0x0000000F;

    POWERPC_PERF_START_COUNT(altivec_gmc1_num, GMC1_PERF_COND);

    tempA = vec_ld(0, (unsigned short *) ABCD);
    Av = vec_splat(tempA, 0);
    Bv = vec_splat(tempA, 1);
    Cv = vec_splat(tempA, 2);
    Dv = vec_splat(tempA, 3);

    rounderV = vec_splat((vec_u16) vec_lde(0, &rounder_a), 0);

    // We'll be able to pick up our 9 char elements at src from those
    // 32 bytes. We load the first batch here, as inside the loop we can
    // reuse 'src + stride' from one iteration as the 'src' of the next.
    src_0 = vec_ld(0, src);
    src_1 = vec_ld(16, src);
    srcvA = vec_perm(src_0, src_1, vec_lvsl(0, src));

    if (src_really_odd != 0x0000000F) {
        // If (src & 0xF) == 0xF, then (src + 1) is properly aligned
        // on the second vector.
        srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src));
    } else {
        srcvB = src_1;
    }
    srcvA = vec_mergeh(vczero, srcvA);
    srcvB = vec_mergeh(vczero, srcvB);

    for (i = 0; i < h; i++) {
        dst_odd        = (unsigned long) dst & 0x0000000F;
        src_really_odd = (((unsigned long) src) + stride) & 0x0000000F;

        dstv = vec_ld(0, dst);

        // We'll be able to pick up our 9 char elements at src + stride
        // from those 32 bytes, then reuse the resulting 2 vectors srcvC
        // and srcvD as the next srcvA and srcvB.
        src_0 = vec_ld(stride +  0, src);
        src_1 = vec_ld(stride + 16, src);
        srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src));

        if (src_really_odd != 0x0000000F) {
            // If (src & 0xF) == 0xF, then (src + 1) is properly aligned
            // on the second vector.
            srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src));
        } else {
            srcvD = src_1;
        }

        srcvC = vec_mergeh(vczero, srcvC);
        srcvD = vec_mergeh(vczero, srcvD);

        // OK, now we (finally) do the math :-)
        // Those four instructions replace 32 int muls & 32 int adds.
        // Isn't AltiVec nice?
        tempA = vec_mladd((vector unsigned short) srcvA, Av, rounderV);
        tempB = vec_mladd((vector unsigned short) srcvB, Bv, tempA);
        tempC = vec_mladd((vector unsigned short) srcvC, Cv, tempB);
        tempD = vec_mladd((vector unsigned short) srcvD, Dv, tempC);

        srcvA = srcvC;
        srcvB = srcvD;

        tempD = vec_sr(tempD, vcsr8);

        dstv2 = vec_pack(tempD, (vector unsigned short) vczero);

        if (dst_odd) {
            dstv2 = vec_perm(dstv, dstv2, vcprm(0, 1, s0, s1));
        } else {
            dstv2 = vec_perm(dstv, dstv2, vcprm(s0, s1, 2, 3));
        }

        vec_st(dstv2, 0, dst);

        dst += stride;
        src += stride;
    }

    POWERPC_PERF_STOP_COUNT(altivec_gmc1_num, GMC1_PERF_COND);
}