/* Store the per-pixel average of src1 and src2 into a 16-pixel-wide,
 * h-row destination block. src2 is read with a fixed 16-byte row stride. */
static inline void put_pixels16_l2_altivec(uint8_t *dst, const uint8_t *src1,
                                           const uint8_t *src2, int dst_stride,
                                           int src_stride1, int h)
{
    int i;
    vec_u8 a, b, d, mask_;
#if HAVE_BIGENDIAN
    vec_u8 tmp1, tmp2, mask, edges, align;
    mask_ = vec_lvsl(0, src2);
#endif

    for (i = 0; i < h; i++) {
        /* src1 rows may land on different alignments, so recompute the permute
         * per load; src2 rows are 16 bytes apart, so one permute vector suffices. */
        a = unaligned_load(i * src_stride1, src1);
        b = load_with_perm_vec(i * 16, src2, mask_);
        d = vec_avg(a, b);
        put_unligned_store(d, dst);
        dst += dst_stride;
    }
}
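/* Same as put_pixels16_l2_altivec() above, except that the averaged result is
 * combined with the pixels already in dst via the avg_unligned_store() helper
 * (assumed to be defined earlier in this file) rather than stored directly. */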
static inline void avg_pixels16_l2_altivec(uint8_t *dst, const uint8_t *src1,
                                           const uint8_t *src2, int dst_stride,
                                           int src_stride1, int h)
{
    int i;
    vec_u8 a, b, d, mask_;
#if HAVE_BIGENDIAN
    vec_u8 tmp1, tmp2, mask, edges, align;
    mask_ = vec_lvsl(0, src2);
#endif

    for (i = 0; i < h; i++) {
        a = unaligned_load(i * src_stride1, src1);
        b = load_with_perm_vec(i * 16, src2, mask_);
        d = vec_avg(a, b);
        avg_unligned_store(d, dst);
        dst += dst_stride;
    }
}
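/* Core of the vertical VP8 sub-pixel (epel) filters: applies the 4- or 6-tap
 * filter selected by 'my' down each column for block widths of 4, 8 or 16.
 * LOAD_V_SUBPEL_FILTER and FILTER_V are assumed to be helper macros defined
 * earlier in this file. */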
static av_always_inline
void put_vp8_epel_v_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
                                 uint8_t *src, ptrdiff_t src_stride,
                                 int h, int my, int w, int is6tap)
{
    LOAD_V_SUBPEL_FILTER(my-1);
    vec_u8 s0, s1, s2, s3, s4, s5, filt, align_vech, perm_vec, align_vecl;
    vec_s16 s0f, s1f, s2f, s3f, s4f, s5f, f16h, f16l;
    vec_s16 c64 = vec_sl(vec_splat_s16(1), vec_splat_u16(6));
    vec_u16 c7  = vec_splat_u16(7);

    // we want pixels 0-7 to be in the even positions and 8-15 in the odd,
    // so combine this permute with the alignment permute vector
    align_vech = vec_lvsl(0, src);
    align_vecl = vec_sld(align_vech, align_vech, 8);
    if (w == 16)
        perm_vec = vec_mergeh(align_vech, align_vecl);
    else
        perm_vec = vec_mergeh(align_vech, align_vech);

    if (is6tap)
        s0 = load_with_perm_vec(-2*src_stride, src, perm_vec);
    s1 = load_with_perm_vec(-1*src_stride, src, perm_vec);
    s2 = load_with_perm_vec( 0*src_stride, src, perm_vec);
    s3 = load_with_perm_vec( 1*src_stride, src, perm_vec);
    if (is6tap)
        s4 = load_with_perm_vec( 2*src_stride, src, perm_vec);

    src += (2+is6tap)*src_stride;

    while (h --> 0) {
        if (is6tap)
            s5 = load_with_perm_vec(0, src, perm_vec);
        else
            s4 = load_with_perm_vec(0, src, perm_vec);

        FILTER_V(f16h, vec_mule);

        if (w == 16) {
            FILTER_V(f16l, vec_mulo);
            filt = vec_packsu(f16h, f16l);
            vec_st(filt, 0, dst);
        } else {
            filt = vec_packsu(f16h, f16h);

            if (w == 4)
                filt = (vec_u8)vec_splat((vec_u32)filt, 0);
            else
                vec_ste((vec_u32)filt, 4, (uint32_t*)dst);
            vec_ste((vec_u32)filt, 0, (uint32_t*)dst);
        }

        if (is6tap)
            s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        if (is6tap)
            s4 = s5;

        dst += dst_stride;
        src += src_stride;
    }
}
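/* H.264 quarter-pel vertical lowpass for a 16x16 block: each output row is
 * the 6-tap filter (1, -5, 20, 20, -5, 1) applied down a sliding window of
 * six source rows, rounded ((x + 16) >> 5), clamped, and then written with
 * OP_U8_ALTIVEC (put or avg, depending on the PREFIX_ instantiation). */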
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride)
{
    register int i;

    LOAD_ZERO;
    vec_u8 perm;
#if HAVE_BIGENDIAN
    perm = vec_lvsl(0, src);
#endif
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vec_u16 v5us  = vec_splat_u16(5);
    const vec_s16 v5ss  = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1), vec_splat_u16(4));

    const uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcM1 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP0 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP1 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP2 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;

    vec_s16 srcM2ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, fsum, srcP3;

    for (i = 0 ; i < 16 ; i++) {
        srcP3 = load_with_perm_vec(0, srcbis, perm);
        srcbis += srcStride;

        srcP3ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}