static WEBP_INLINE uint32_t Average3(const uint32_t* const a,
                                     const uint32_t* const b,
                                     const uint32_t* const c) {
  const uint8x8_t a0 = vreinterpret_u8_u64(vcreate_u64(*a));
  const uint8x8_t b0 = vreinterpret_u8_u64(vcreate_u64(*b));
  const uint8x8_t c0 = vreinterpret_u8_u64(vcreate_u64(*c));
  const uint8x8_t avg1 = vhadd_u8(a0, c0);
  const uint8x8_t avg2 = vhadd_u8(avg1, b0);
  return vget_lane_u32(vreinterpret_u32_u8(avg2), 0);
}
static WEBP_INLINE uint32_t Average4(const uint32_t* const a,
                                     const uint32_t* const b,
                                     const uint32_t* const c,
                                     const uint32_t* const d) {
  const uint8x8_t a0 = vreinterpret_u8_u64(vcreate_u64(*a));
  const uint8x8_t b0 = vreinterpret_u8_u64(vcreate_u64(*b));
  const uint8x8_t c0 = vreinterpret_u8_u64(vcreate_u64(*c));
  const uint8x8_t d0 = vreinterpret_u8_u64(vcreate_u64(*d));
  const uint8x8_t avg1 = vhadd_u8(a0, b0);
  const uint8x8_t avg2 = vhadd_u8(c0, d0);
  const uint8x8_t avg3 = vhadd_u8(avg1, avg2);
  return vget_lane_u32(vreinterpret_u32_u8(avg3), 0);
}
static WEBP_INLINE uint32_t Average3(const uint32_t* const a,
                                     const uint32_t* const b,
                                     const uint32_t* const c) {
  const uint64x1_t a0 = { *a }, b0 = { *b }, c0 = { *c };
  const uint8x8_t a1 = vreinterpret_u8_u64(a0);
  const uint8x8_t b1 = vreinterpret_u8_u64(b0);
  const uint8x8_t c1 = vreinterpret_u8_u64(c0);
  const uint8x8_t avg1 = vhadd_u8(a1, c1);
  const uint8x8_t avg2 = vhadd_u8(avg1, b1);
  uint32_t ret;
  vst1_lane_u32(&ret, vreinterpret_u32_u8(avg2), 0);
  return ret;
}
static WEBP_INLINE uint32_t Average2(const uint32_t* const a,
                                     const uint32_t* const b) {
  const uint8x8_t a0 = vreinterpret_u8_u64(vcreate_u64(*a));
  const uint8x8_t b0 = vreinterpret_u8_u64(vcreate_u64(*b));
  const uint8x8_t avg = vhadd_u8(a0, b0);
  return vget_lane_u32(vreinterpret_u32_u8(avg), 0);
}
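// For reference (not part of the original sources): a plain-C sketch of what
// the NEON Average2/3/4 helpers above compute. Each 32-bit value is a packed
// four-channel pixel and vhadd_u8 is a per-byte truncating average, so the
// scalar equivalent averages every byte lane independently. The helper names
// (ScalarAverage2 etc.) are illustrative, not taken from libwebp.
static uint32_t ScalarAverage2(uint32_t a, uint32_t b) {
  uint32_t out = 0;
  int shift;
  for (shift = 0; shift < 32; shift += 8) {
    const uint32_t pa = (a >> shift) & 0xff;
    const uint32_t pb = (b >> shift) & 0xff;
    out |= ((pa + pb) >> 1) << shift;   // truncating per-channel average
  }
  return out;
}

// Average3/Average4 chain the same per-byte halving, mirroring the nested
// vhadd_u8 calls above (note the repeated truncation, i.e. these are not
// exact rounded 3- or 4-way means).
static uint32_t ScalarAverage3(uint32_t a, uint32_t b, uint32_t c) {
  return ScalarAverage2(ScalarAverage2(a, c), b);
}
static uint32_t ScalarAverage4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
  return ScalarAverage2(ScalarAverage2(a, b), ScalarAverage2(c, d));
}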
void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x8_t XABCD_u8 = vld1_u8(above - 1);
  const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
  const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
  const uint32x2_t zero = vdup_n_u32(0);
  const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0);
  const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL);
  const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8));
  const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
  const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
  const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
  const uint8_t D = vget_lane_u8(XABCD_u8, 4);
  const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
  const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
  const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
  const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
  const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
  const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
  vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
  vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
  vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
  vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
}
static WEBP_INLINE uint32_t Average2(const uint32_t* const a,
                                     const uint32_t* const b) {
  const uint64x1_t a0 = { *a }, b0 = { *b };
  const uint8x8_t a1 = vreinterpret_u8_u64(a0);
  const uint8x8_t b1 = vreinterpret_u8_u64(b0);
  const uint8x8_t avg = vhadd_u8(a1, b1);
  uint32_t ret;
  vst1_lane_u32(&ret, vreinterpret_u32_u8(avg), 0);
  return ret;
}
static WEBP_INLINE uint32_t ClampedAddSubtractHalf(const uint32_t* const c0,
                                                   const uint32_t* const c1,
                                                   const uint32_t* const c2) {
  const uint8x8_t p0 = vreinterpret_u8_u64(vcreate_u64(*c0));
  const uint8x8_t p1 = vreinterpret_u8_u64(vcreate_u64(*c1));
  const uint8x8_t p2 = vreinterpret_u8_u64(vcreate_u64(*c2));
  const uint8x8_t avg = vhadd_u8(p0, p1);                // Average(c0,c1)
  const uint8x8_t ab = vshr_n_u8(vqsub_u8(avg, p2), 1);  // (a-b)>>1 saturated
  const uint8x8_t ba = vshr_n_u8(vqsub_u8(p2, avg), 1);  // (b-a)>>1 saturated
  const uint8x8_t out = vqsub_u8(vqadd_u8(avg, ab), ba);
  return vget_lane_u32(vreinterpret_u32_u8(out), 0);
}
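// For reference (not part of libwebp): an illustrative per-byte transcription
// of the saturating NEON sequence in ClampedAddSubtractHalf above. The
// function name is hypothetical; it shows the arithmetic applied to one
// channel, where c0/c1 are averaged and the result is nudged away from c2
// with saturation at 0 and 255.
static uint8_t ClampedAddSubtractHalfByte(uint8_t p0, uint8_t p1, uint8_t p2) {
  const int avg = (p0 + p1) >> 1;                    // vhadd_u8
  const int ab = (avg > p2) ? (avg - p2) >> 1 : 0;   // vqsub_u8 then vshr_n_u8
  const int ba = (p2 > avg) ? (p2 - avg) >> 1 : 0;   // vqsub_u8 then vshr_n_u8
  int out = avg + ab;                                // vqadd_u8 (saturating)
  if (out > 255) out = 255;
  out -= ba;                                         // vqsub_u8 (saturating)
  return (uint8_t)(out < 0 ? 0 : out);
}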
void png_read_filter_row_avg4_neon(png_row_infop row_info, png_bytep row,
                                   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;
   png_const_bytep pp = prev_row;

   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);

   png_debug(1, "in png_read_filter_row_avg4_neon");

   for (; rp < rp_stop; rp += 16, pp += 16)
   {
      uint32x2x4_t vtmp;
      uint8x8x4_t *vrpt, *vppt;
      uint8x8x4_t vrp, vpp;
      uint32x2x4_t *temp_pointer;

      vtmp = vld4_u32(png_ptr(uint32_t,rp));
      vrpt = png_ptr(uint8x8x4_t,&vtmp);
      vrp = *vrpt;
      vtmp = vld4_u32(png_ptrc(uint32_t,pp));
      vppt = png_ptr(uint8x8x4_t,&vtmp);
      vpp = *vppt;

      vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
      vdest.val[1] = vhadd_u8(vdest.val[0], vpp.val[1]);
      vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
      vdest.val[2] = vhadd_u8(vdest.val[1], vpp.val[2]);
      vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]);
      vdest.val[3] = vhadd_u8(vdest.val[2], vpp.val[3]);
      vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]);

      vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
   }
}
static WEBP_INLINE uint32_t ClampedAddSubtractHalf(const uint32_t* const c0,
                                                   const uint32_t* const c1,
                                                   const uint32_t* const c2) {
  const uint64x1_t C0 = { *c0 }, C1 = { *c1 }, C2 = { *c2 };
  const uint8x8_t p0 = vreinterpret_u8_u64(C0);
  const uint8x8_t p1 = vreinterpret_u8_u64(C1);
  const uint8x8_t p2 = vreinterpret_u8_u64(C2);
  const uint8x8_t avg = vhadd_u8(p0, p1);                // Average(c0,c1)
  const uint8x8_t ab = vshr_n_u8(vqsub_u8(avg, p2), 1);  // (a-b)>>1 saturated
  const uint8x8_t ba = vshr_n_u8(vqsub_u8(p2, avg), 1);  // (b-a)>>1 saturated
  const uint8x8_t out = vqsub_u8(vqadd_u8(avg, ab), ba);
  uint32_t ret;
  vst1_lane_u32(&ret, vreinterpret_u32_u8(out), 0);
  return ret;
}
void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  static const uint8_t shuffle1[8] = { 1, 2, 3, 4, 5, 6, 7, 7 };
  static const uint8_t shuffle2[8] = { 2, 3, 4, 5, 6, 7, 7, 7 };
  const uint8x8_t sh_12345677 = vld1_u8(shuffle1);
  const uint8x8_t sh_23456777 = vld1_u8(shuffle2);
  const uint8x8_t A0 = vld1_u8(above);  // top row
  const uint8x8_t A1 = vtbl1_u8(A0, sh_12345677);
  const uint8x8_t A2 = vtbl1_u8(A0, sh_23456777);
  const uint8x8_t avg1 = vhadd_u8(A0, A2);
  uint8x8_t row = vrhadd_u8(avg1, A1);
  int i;
  (void)left;
  for (i = 0; i < 7; ++i) {
    vst1_u8(dst + i * stride, row);
    row = vtbl1_u8(row, sh_12345677);
  }
  vst1_u8(dst + i * stride, row);
}
void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(above));  // top row
  const uint64x1_t A1 = vshr_n_u64(A0, 8);
  const uint64x1_t A2 = vshr_n_u64(A0, 16);
  const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
  const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
  const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
  const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00);
  const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
  const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
  const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
  (void)left;
  vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
  vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
  vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
  vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
  dst[3 * stride + 3] = above[7];
}
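// For reference (not from libvpx): the vhadd_u8/vrhadd_u8 pair used by the
// d45/d135 predictors above computes, per byte, (((a + c) >> 1) + b + 1) >> 1,
// which is bit-exact with the usual 3-tap rounded average. A scalar sketch:
static uint8_t Avg3(uint8_t a, uint8_t b, uint8_t c) {
  return (uint8_t)((a + 2 * b + c + 2) >> 2);
}
// For the 4x4 d45 block this yields
// dst[r * stride + c] = Avg3(above[r + c], above[r + c + 1], above[r + c + 2]),
// with the bottom-right pixel then overwritten by above[7].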
void png_read_filter_row_avg3_neon(png_row_infop row_info, png_bytep row,
                                   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_const_bytep pp = prev_row;
   png_bytep rp_stop = row + row_info->rowbytes;

   uint8x16_t vtmp;
   uint8x8x2_t *vrpt;
   uint8x8x2_t vrp;
   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);

   vtmp = vld1q_u8(rp);
   vrpt = png_ptr(uint8x8x2_t,&vtmp);
   vrp = *vrpt;

   png_debug(1, "in png_read_filter_row_avg3_neon");

   for (; rp < rp_stop; pp += 12)
   {
      uint8x8_t vtmp1, vtmp2, vtmp3;
      uint8x8x2_t *vppt;
      uint8x8x2_t vpp;
      uint32x2_t *temp_pointer;

      vtmp = vld1q_u8(pp);
      vppt = png_ptr(uint8x8x2_t,&vtmp);
      vpp = *vppt;

      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
      vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);

      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
      vtmp3 = vext_u8(vrp.val[0], vrp.val[1], 6);
      vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);

      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 6);
      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);

      vtmp = vld1q_u8(rp + 12);
      vrpt = png_ptr(uint8x8x2_t,&vtmp);
      vrp = *vrpt;

      vdest.val[2] = vhadd_u8(vdest.val[1], vtmp2);
      vdest.val[2] = vadd_u8(vdest.val[2], vtmp3);

      vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);
      vdest.val[3] = vhadd_u8(vdest.val[2], vtmp2);
      vdest.val[3] = vadd_u8(vdest.val[3], vtmp1);

      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
      rp += 3;
   }
}
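/* For reference (not part of libpng): a plain-C sketch of the PNG "Average"
 * filter reconstruction that both NEON routines above vectorize. For each
 * byte, the decoder adds the filtered byte to the truncated mean of the
 * reconstructed byte `bpp` positions to the left (0 for the first pixel) and
 * the byte directly above; `bpp` is 3 for the avg3 routine and 4 for avg4.
 * The function name is illustrative only. */
static void png_read_filter_row_avg_scalar(size_t rowbytes, unsigned int bpp,
                                           unsigned char *row,
                                           const unsigned char *prev_row) {
  size_t i;
  for (i = 0; i < rowbytes; ++i) {
    const unsigned int left = (i >= bpp) ? row[i - bpp] : 0;  /* Recon(a) */
    const unsigned int up = prev_row[i];                      /* Recon(b) */
    row[i] = (unsigned char)(row[i] + ((left + up) >> 1));    /* mod 256 */
  }
}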