static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, DCTELEM *block,
                                                       int stride, int size)
{
    vec_s16 dc16;
    vec_u8 dcplus, dcminus, v0, v1, v2, v3, aligner;
    LOAD_ZERO;
    DECLARE_ALIGNED(16, int, dc);
    int i;

    /* Reconstruct the DC coefficient and splat it across a halfword vector. */
    dc = (block[0] + 32) >> 6;
    dc16 = __vsplth(__lvewx(&dc, 0), 1);

    /* For 4x4 blocks only the low half of the vector carries the DC. */
    if (size == 4)
        dc16 = __vsldoi(dc16, zero_s16v, 8);

    /* Saturating packs: dcplus holds max(dc, 0), dcminus holds max(-dc, 0),
     * so the DC can be applied with unsigned saturating add/subtract below. */
    dcplus  = __vpkshus(dc16, zero_s16v);
    dcminus = __vpkshus(__vsubuhm(zero_s16v, dc16), zero_s16v);

    /* Align the packed DC bytes with the (possibly unaligned) destination;
     * arguments are (base, offset), consistent with the other intrinsics here. */
    aligner = __lvsr(dst, 0);
    dcplus  = __vperm(dcplus, dcplus, aligner);
    dcminus = __vperm(dcminus, dcminus, aligner);

    for (i = 0; i < size; i += 4) {
        v0 = __lvx(dst + 0*stride, 0);
        v1 = __lvx(dst + 1*stride, 0);
        v2 = __lvx(dst + 2*stride, 0);
        v3 = __lvx(dst + 3*stride, 0);

        v0 = __vaddubs(v0, dcplus);
        v1 = __vaddubs(v1, dcplus);
        v2 = __vaddubs(v2, dcplus);
        v3 = __vaddubs(v3, dcplus);

        v0 = __vsububs(v0, dcminus);
        v1 = __vsububs(v1, dcminus);
        v2 = __vsububs(v2, dcminus);
        v3 = __vsububs(v3, dcminus);

        __stvx(v0, dst + 0*stride, 0);
        __stvx(v1, dst + 1*stride, 0);
        __stvx(v2, dst + 2*stride, 0);
        __stvx(v3, dst + 3*stride, 0);

        dst += 4*stride;
    }
}
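/* The size parameter selects the 4x4 or 8x8 DC-only path. A minimal sketch
 * of how this helper is typically specialized, mirroring the thin wrappers
 * in FFmpeg's AltiVec version; the _vmx names below are assumptions, not
 * part of the original code. With size constant-folded, each wrapper
 * compiles down to a dedicated DC-only idct_add kernel. */
static void h264_idct_dc_add_vmx(uint8_t *dst, DCTELEM *block, int stride)
{
    h264_idct_dc_add_internal(dst, block, stride, 4);
}

static void h264_idct8_dc_add_vmx(uint8_t *dst, DCTELEM *block, int stride)
{
    h264_idct_dc_add_internal(dst, block, stride, 8);
}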
/* dest*~srca + src */
static force_inline __vector4
over (__vector4 src, __vector4 srca, __vector4 dest)
{
    __vector4 tmp = pix_multiply (dest, negate (srca));

    tmp = __vaddubs (src, tmp);
    return tmp;
}
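/* over() depends on two helpers not shown above: pix_multiply() (assumed to
 * perform a per-channel 8-bit multiply with rounding, as in pixman) and
 * negate(). A minimal sketch of negate(), assuming a __vnor intrinsic and
 * pixman's conventional definition (vec_nor (src, src) in AltiVec terms);
 * treat this as an assumption, not the original code. In a real source file
 * it would be defined before its first use in over(). */
static force_inline __vector4
negate (__vector4 src)
{
    return __vnor (src, src);
}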
static force_inline __vector4
pix_add (__vector4 a, __vector4 b)
{
    /* Per-byte unsigned saturating add. */
    return __vaddubs (a, b);
}
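/* Note: both over() and pix_add() lean on the saturating semantics of
 * __vaddubs: per-byte sums clamp at 255 instead of wrapping, which is what
 * keeps premultiplied compositing results in range without extra clipping. */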