static WEBP_INLINE uint32_t Select(const uint32_t* const c0, const uint32_t* const c1, const uint32_t* const c2) { const uint8x8_t p0 = vreinterpret_u8_u64(vcreate_u64(*c0)); const uint8x8_t p1 = vreinterpret_u8_u64(vcreate_u64(*c1)); const uint8x8_t p2 = vreinterpret_u8_u64(vcreate_u64(*c2)); const uint8x8_t bc = vabd_u8(p1, p2); // |b-c| const uint8x8_t ac = vabd_u8(p0, p2); // |a-c| const int16x4_t sum_bc = vreinterpret_s16_u16(vpaddl_u8(bc)); const int16x4_t sum_ac = vreinterpret_s16_u16(vpaddl_u8(ac)); const int32x2_t diff = vpaddl_s16(vsub_s16(sum_bc, sum_ac)); const int32_t pa_minus_pb = vget_lane_s32(diff, 0); return (pa_minus_pb <= 0) ? *c0 : *c1; }
static WEBP_INLINE uint32_t Select(const uint32_t* const c0, const uint32_t* const c1, const uint32_t* const c2) { const uint64x1_t C0 = { *c0, 0 }, C1 = { *c1, 0 }, C2 = { *c2, 0 }; const uint8x8_t p0 = vreinterpret_u8_u64(C0); const uint8x8_t p1 = vreinterpret_u8_u64(C1); const uint8x8_t p2 = vreinterpret_u8_u64(C2); const uint8x8_t bc = vabd_u8(p1, p2); // |b-c| const uint8x8_t ac = vabd_u8(p0, p2); // |a-c| const int16x4_t sum_bc = vreinterpret_s16_u16(vpaddl_u8(bc)); const int16x4_t sum_ac = vreinterpret_s16_u16(vpaddl_u8(ac)); const int32x2_t diff = vpaddl_s16(vsub_s16(sum_bc, sum_ac)); int32_t pa_minus_pb; vst1_lane_s32(&pa_minus_pb, diff, 0); return (pa_minus_pb <= 0) ? *c0 : *c1; }
static void inline ff_dct_unquantize_h263_neon(int qscale, int qadd, int nCoeffs, int16_t *block) { int16x8_t q0s16, q2s16, q3s16, q8s16, q10s16, q11s16, q13s16; int16x8_t q14s16, q15s16, qzs16; int16x4_t d0s16, d2s16, d3s16, dzs16; uint16x8_t q1u16, q9u16; uint16x4_t d1u16; dzs16 = vdup_n_s16(0); qzs16 = vdupq_n_s16(0); q15s16 = vdupq_n_s16(qscale << 1); q14s16 = vdupq_n_s16(qadd); q13s16 = vnegq_s16(q14s16); if (nCoeffs > 4) { for (; nCoeffs > 8; nCoeffs -= 16, block += 16) { q0s16 = vld1q_s16(block); q3s16 = vreinterpretq_s16_u16(vcltq_s16(q0s16, qzs16)); q8s16 = vld1q_s16(block + 8); q1u16 = vceqq_s16(q0s16, qzs16); q2s16 = vmulq_s16(q0s16, q15s16); q11s16 = vreinterpretq_s16_u16(vcltq_s16(q8s16, qzs16)); q10s16 = vmulq_s16(q8s16, q15s16); q3s16 = vbslq_s16(vreinterpretq_u16_s16(q3s16), q13s16, q14s16); q11s16 = vbslq_s16(vreinterpretq_u16_s16(q11s16), q13s16, q14s16); q2s16 = vaddq_s16(q2s16, q3s16); q9u16 = vceqq_s16(q8s16, qzs16); q10s16 = vaddq_s16(q10s16, q11s16); q0s16 = vbslq_s16(q1u16, q0s16, q2s16); q8s16 = vbslq_s16(q9u16, q8s16, q10s16); vst1q_s16(block, q0s16); vst1q_s16(block + 8, q8s16); } } if (nCoeffs <= 0) return; d0s16 = vld1_s16(block); d3s16 = vreinterpret_s16_u16(vclt_s16(d0s16, dzs16)); d1u16 = vceq_s16(d0s16, dzs16); d2s16 = vmul_s16(d0s16, vget_high_s16(q15s16)); d3s16 = vbsl_s16(vreinterpret_u16_s16(d3s16), vget_high_s16(q13s16), vget_high_s16(q14s16)); d2s16 = vadd_s16(d2s16, d3s16); d0s16 = vbsl_s16(d1u16, d0s16, d2s16); vst1_s16(block, d0s16); }
unsigned int vp8_variance_halfpixvar16x16_hv_neon( const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int recon_stride, unsigned int *sse) { int i; uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; int16x4_t d0s16, d1s16, d2s16, d3s16, d10s16, d11s16, d12s16, d13s16; int16x4_t d18s16, d19s16, d20s16, d21s16, d22s16, d23s16, d24s16, d25s16; uint32x2_t d0u32, d10u32; int64x1_t d0s64, d1s64, d2s64, d3s64; uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8; uint16x8_t q0u16, q1u16, q5u16, q6u16, q9u16, q10u16, q11u16, q12u16; int32x4_t q13s32, q14s32, q15s32; int64x2_t q0s64, q1s64, q5s64; q13s32 = vdupq_n_s32(0); q14s32 = vdupq_n_s32(0); q15s32 = vdupq_n_s32(0); q0u8 = vld1q_u8(src_ptr); q1u8 = vld1q_u8(src_ptr + 16); src_ptr += source_stride; q1u8 = vextq_u8(q0u8, q1u8, 1); q0u8 = vrhaddq_u8(q0u8, q1u8); for (i = 0; i < 4; i++) { // vp8_filt_fpo16x16s_4_0_loop_neon q2u8 = vld1q_u8(src_ptr); q3u8 = vld1q_u8(src_ptr + 16); src_ptr += source_stride; q4u8 = vld1q_u8(src_ptr); q5u8 = vld1q_u8(src_ptr + 16); src_ptr += source_stride; q6u8 = vld1q_u8(src_ptr); q7u8 = vld1q_u8(src_ptr + 16); src_ptr += source_stride; q8u8 = vld1q_u8(src_ptr); q9u8 = vld1q_u8(src_ptr + 16); src_ptr += source_stride; q3u8 = vextq_u8(q2u8, q3u8, 1); q5u8 = vextq_u8(q4u8, q5u8, 1); q7u8 = vextq_u8(q6u8, q7u8, 1); q9u8 = vextq_u8(q8u8, q9u8, 1); q1u8 = vrhaddq_u8(q2u8, q3u8); q2u8 = vrhaddq_u8(q4u8, q5u8); q3u8 = vrhaddq_u8(q6u8, q7u8); q4u8 = vrhaddq_u8(q8u8, q9u8); q0u8 = vrhaddq_u8(q0u8, q1u8); q1u8 = vrhaddq_u8(q1u8, q2u8); q2u8 = vrhaddq_u8(q2u8, q3u8); q3u8 = vrhaddq_u8(q3u8, q4u8); q5u8 = vld1q_u8(ref_ptr); ref_ptr += recon_stride; q6u8 = vld1q_u8(ref_ptr); ref_ptr += recon_stride; q7u8 = vld1q_u8(ref_ptr); ref_ptr += recon_stride; q8u8 = vld1q_u8(ref_ptr); ref_ptr += recon_stride; d0u8 = vget_low_u8(q0u8); d1u8 = vget_high_u8(q0u8); d2u8 = vget_low_u8(q1u8); d3u8 = vget_high_u8(q1u8); d4u8 = vget_low_u8(q2u8); d5u8 = vget_high_u8(q2u8); d6u8 = vget_low_u8(q3u8); d7u8 = vget_high_u8(q3u8); q9u16 = vsubl_u8(d0u8, vget_low_u8(q5u8)); q10u16 = vsubl_u8(d1u8, vget_high_u8(q5u8)); q11u16 = vsubl_u8(d2u8, vget_low_u8(q6u8)); q12u16 = vsubl_u8(d3u8, vget_high_u8(q6u8)); q0u16 = vsubl_u8(d4u8, vget_low_u8(q7u8)); q1u16 = vsubl_u8(d5u8, vget_high_u8(q7u8)); q5u16 = vsubl_u8(d6u8, vget_low_u8(q8u8)); q6u16 = vsubl_u8(d7u8, vget_high_u8(q8u8)); d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q9u16)); q14s32 = vmlal_s16(q14s32, d18s16, d18s16); q15s32 = vmlal_s16(q15s32, d19s16, d19s16); d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q10u16)); q14s32 = vmlal_s16(q14s32, d20s16, d20s16); q15s32 = vmlal_s16(q15s32, d21s16, d21s16); d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q11u16)); q14s32 = vmlal_s16(q14s32, d22s16, d22s16); q15s32 = vmlal_s16(q15s32, d23s16, d23s16); d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q12u16)); q14s32 = vmlal_s16(q14s32, d24s16, d24s16); q15s32 = vmlal_s16(q15s32, d25s16, d25s16); d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16)); d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16)); q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q0u16)); q14s32 = vmlal_s16(q14s32, d0s16, d0s16); q15s32 = vmlal_s16(q15s32, d1s16, d1s16); d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16)); d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16)); q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q1u16)); q14s32 = vmlal_s16(q14s32, d2s16, d2s16); q15s32 = vmlal_s16(q15s32, d3s16, d3s16); d10s16 = vreinterpret_s16_u16(vget_low_u16(q5u16)); d11s16 = vreinterpret_s16_u16(vget_high_u16(q5u16)); q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q5u16)); q14s32 = vmlal_s16(q14s32, d10s16, d10s16); q15s32 = vmlal_s16(q15s32, d11s16, d11s16); d12s16 = vreinterpret_s16_u16(vget_low_u16(q6u16)); d13s16 = vreinterpret_s16_u16(vget_high_u16(q6u16)); q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q6u16)); q14s32 = vmlal_s16(q14s32, d12s16, d12s16); q15s32 = vmlal_s16(q15s32, d13s16, d13s16); q0u8 = q4u8; } q15s32 = vaddq_s32(q14s32, q15s32); q0s64 = vpaddlq_s32(q13s32); q1s64 = vpaddlq_s32(q15s32); d0s64 = vget_low_s64(q0s64); d1s64 = vget_high_s64(q0s64); d2s64 = vget_low_s64(q1s64); d3s64 = vget_high_s64(q1s64); d0s64 = vadd_s64(d0s64, d1s64); d1s64 = vadd_s64(d2s64, d3s64); q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64)); vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8); d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); return vget_lane_u32(d0u32, 0); }
void vp8_short_fdct4x4_neon( int16_t *input, int16_t *output, int pitch) { int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; int16x4_t d16s16, d17s16, d26s16, dEmptys16; uint16x4_t d4u16; int16x8_t q0s16, q1s16; int32x4_t q9s32, q10s32, q11s32, q12s32; int16x4x2_t v2tmp0, v2tmp1; int32x2x2_t v2tmp2, v2tmp3; d16s16 = vdup_n_s16(5352); d17s16 = vdup_n_s16(2217); q9s32 = vdupq_n_s32(14500); q10s32 = vdupq_n_s32(7500); q11s32 = vdupq_n_s32(12000); q12s32 = vdupq_n_s32(51000); // Part one pitch >>= 1; d0s16 = vld1_s16(input); input += pitch; d1s16 = vld1_s16(input); input += pitch; d2s16 = vld1_s16(input); input += pitch; d3s16 = vld1_s16(input); v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16), vreinterpret_s32_s16(d2s16)); v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16), vreinterpret_s32_s16(d3s16)); v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]), // d0 vreinterpret_s16_s32(v2tmp3.val[0])); // d1 v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]), // d2 vreinterpret_s16_s32(v2tmp3.val[1])); // d3 d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]); d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]); d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]); d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]); d4s16 = vshl_n_s16(d4s16, 3); d5s16 = vshl_n_s16(d5s16, 3); d6s16 = vshl_n_s16(d6s16, 3); d7s16 = vshl_n_s16(d7s16, 3); d0s16 = vadd_s16(d4s16, d5s16); d2s16 = vsub_s16(d4s16, d5s16); q9s32 = vmlal_s16(q9s32, d7s16, d16s16); q10s32 = vmlal_s16(q10s32, d7s16, d17s16); q9s32 = vmlal_s16(q9s32, d6s16, d17s16); q10s32 = vmlsl_s16(q10s32, d6s16, d16s16); d1s16 = vshrn_n_s32(q9s32, 12); d3s16 = vshrn_n_s32(q10s32, 12); // Part two v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16), vreinterpret_s32_s16(d2s16)); v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16), vreinterpret_s32_s16(d3s16)); v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]), // d0 vreinterpret_s16_s32(v2tmp3.val[0])); // d1 v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]), // d2 vreinterpret_s16_s32(v2tmp3.val[1])); // d3 d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]); d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]); d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]); d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]); d26s16 = vdup_n_s16(7); d4s16 = vadd_s16(d4s16, d26s16); d0s16 = vadd_s16(d4s16, d5s16); d2s16 = vsub_s16(d4s16, d5s16); q11s32 = vmlal_s16(q11s32, d7s16, d16s16); q12s32 = vmlal_s16(q12s32, d7s16, d17s16); dEmptys16 = vdup_n_s16(0); d4u16 = vceq_s16(d7s16, dEmptys16); d0s16 = vshr_n_s16(d0s16, 4); d2s16 = vshr_n_s16(d2s16, 4); q11s32 = vmlal_s16(q11s32, d6s16, d17s16); q12s32 = vmlsl_s16(q12s32, d6s16, d16s16); d4u16 = vmvn_u16(d4u16); d1s16 = vshrn_n_s32(q11s32, 16); d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d4u16)); d3s16 = vshrn_n_s32(q12s32, 16); q0s16 = vcombine_s16(d0s16, d1s16); q1s16 = vcombine_s16(d2s16, d3s16); vst1q_s16(output, q0s16); vst1q_s16(output + 8, q1s16); return; }
unsigned int vp8_sub_pixel_variance16x16_neon_func( const unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const unsigned char *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { int i; DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 528); unsigned char *tmpp; unsigned char *tmpp2; uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8; uint8x8_t d19u8, d20u8, d21u8; int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; uint32x2_t d0u32, d10u32; int64x1_t d0s64, d1s64, d2s64, d3s64; uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8; uint8x16_t q10u8, q11u8, q12u8, q13u8, q14u8, q15u8; uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16; uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16; int32x4_t q8s32, q9s32, q10s32; int64x2_t q0s64, q1s64, q5s64; tmpp2 = tmp + 272; tmpp = tmp; if (xoffset == 0) { // secondpass_bfilter16x16_only d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]); d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]); q11u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; for (i = 4; i > 0; i--) { q12u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; q13u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; q14u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; q15u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; __builtin_prefetch(src_ptr); __builtin_prefetch(src_ptr + src_pixels_per_line); __builtin_prefetch(src_ptr + src_pixels_per_line * 2); q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8); q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8); q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8); q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8); q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8); q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8); q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8); q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8); q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8); q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8); q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8); q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8); q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8); q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8); q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8); q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8); d2u8 = vqrshrn_n_u16(q1u16, 7); d3u8 = vqrshrn_n_u16(q2u16, 7); d4u8 = vqrshrn_n_u16(q3u16, 7); d5u8 = vqrshrn_n_u16(q4u16, 7); d6u8 = vqrshrn_n_u16(q5u16, 7); d7u8 = vqrshrn_n_u16(q6u16, 7); d8u8 = vqrshrn_n_u16(q7u16, 7); d9u8 = vqrshrn_n_u16(q8u16, 7); q1u8 = vcombine_u8(d2u8, d3u8); q2u8 = vcombine_u8(d4u8, d5u8); q3u8 = vcombine_u8(d6u8, d7u8); q4u8 = vcombine_u8(d8u8, d9u8); q11u8 = q15u8; vst1q_u8((uint8_t *)tmpp2, q1u8); tmpp2 += 16; vst1q_u8((uint8_t *)tmpp2, q2u8); tmpp2 += 16; vst1q_u8((uint8_t *)tmpp2, q3u8); tmpp2 += 16; vst1q_u8((uint8_t *)tmpp2, q4u8); tmpp2 += 16; } } else if (yoffset == 0) { // firstpass_bfilter16x16_only d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]); d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]); for (i = 4; i > 0 ; i--) { d2u8 = vld1_u8(src_ptr); d3u8 = vld1_u8(src_ptr + 8); d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; d5u8 = vld1_u8(src_ptr); d6u8 = vld1_u8(src_ptr + 8); d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; d8u8 = vld1_u8(src_ptr); d9u8 = vld1_u8(src_ptr + 8); d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; d11u8 = vld1_u8(src_ptr); d12u8 = vld1_u8(src_ptr + 8); d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; __builtin_prefetch(src_ptr); __builtin_prefetch(src_ptr + src_pixels_per_line); __builtin_prefetch(src_ptr + src_pixels_per_line * 2); q7u16 = vmull_u8(d2u8, d0u8); q8u16 = vmull_u8(d3u8, d0u8); q9u16 = vmull_u8(d5u8, d0u8); q10u16 = vmull_u8(d6u8, d0u8); q11u16 = vmull_u8(d8u8, d0u8); q12u16 = vmull_u8(d9u8, d0u8); q13u16 = vmull_u8(d11u8, d0u8); q14u16 = vmull_u8(d12u8, d0u8); d2u8 = vext_u8(d2u8, d3u8, 1); d5u8 = vext_u8(d5u8, d6u8, 1); d8u8 = vext_u8(d8u8, d9u8, 1); d11u8 = vext_u8(d11u8, d12u8, 1); q7u16 = vmlal_u8(q7u16, d2u8, d1u8); q9u16 = vmlal_u8(q9u16, d5u8, d1u8); q11u16 = vmlal_u8(q11u16, d8u8, d1u8); q13u16 = vmlal_u8(q13u16, d11u8, d1u8); d3u8 = vext_u8(d3u8, d4u8, 1); d6u8 = vext_u8(d6u8, d7u8, 1); d9u8 = vext_u8(d9u8, d10u8, 1); d12u8 = vext_u8(d12u8, d13u8, 1); q8u16 = vmlal_u8(q8u16, d3u8, d1u8); q10u16 = vmlal_u8(q10u16, d6u8, d1u8); q12u16 = vmlal_u8(q12u16, d9u8, d1u8); q14u16 = vmlal_u8(q14u16, d12u8, d1u8); d14u8 = vqrshrn_n_u16(q7u16, 7); d15u8 = vqrshrn_n_u16(q8u16, 7); d16u8 = vqrshrn_n_u16(q9u16, 7); d17u8 = vqrshrn_n_u16(q10u16, 7); d18u8 = vqrshrn_n_u16(q11u16, 7); d19u8 = vqrshrn_n_u16(q12u16, 7); d20u8 = vqrshrn_n_u16(q13u16, 7); d21u8 = vqrshrn_n_u16(q14u16, 7); q7u8 = vcombine_u8(d14u8, d15u8); q8u8 = vcombine_u8(d16u8, d17u8); q9u8 = vcombine_u8(d18u8, d19u8); q10u8 = vcombine_u8(d20u8, d21u8); vst1q_u8((uint8_t *)tmpp2, q7u8); tmpp2 += 16; vst1q_u8((uint8_t *)tmpp2, q8u8); tmpp2 += 16; vst1q_u8((uint8_t *)tmpp2, q9u8); tmpp2 += 16; vst1q_u8((uint8_t *)tmpp2, q10u8); tmpp2 += 16; } } else { d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]); d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]); d2u8 = vld1_u8(src_ptr); d3u8 = vld1_u8(src_ptr + 8); d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; d5u8 = vld1_u8(src_ptr); d6u8 = vld1_u8(src_ptr + 8); d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; d8u8 = vld1_u8(src_ptr); d9u8 = vld1_u8(src_ptr + 8); d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; d11u8 = vld1_u8(src_ptr); d12u8 = vld1_u8(src_ptr + 8); d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; // First Pass: output_height lines x output_width columns (17x16) for (i = 3; i > 0; i--) { q7u16 = vmull_u8(d2u8, d0u8); q8u16 = vmull_u8(d3u8, d0u8); q9u16 = vmull_u8(d5u8, d0u8); q10u16 = vmull_u8(d6u8, d0u8); q11u16 = vmull_u8(d8u8, d0u8); q12u16 = vmull_u8(d9u8, d0u8); q13u16 = vmull_u8(d11u8, d0u8); q14u16 = vmull_u8(d12u8, d0u8); d2u8 = vext_u8(d2u8, d3u8, 1); d5u8 = vext_u8(d5u8, d6u8, 1); d8u8 = vext_u8(d8u8, d9u8, 1); d11u8 = vext_u8(d11u8, d12u8, 1); q7u16 = vmlal_u8(q7u16, d2u8, d1u8); q9u16 = vmlal_u8(q9u16, d5u8, d1u8); q11u16 = vmlal_u8(q11u16, d8u8, d1u8); q13u16 = vmlal_u8(q13u16, d11u8, d1u8); d3u8 = vext_u8(d3u8, d4u8, 1); d6u8 = vext_u8(d6u8, d7u8, 1); d9u8 = vext_u8(d9u8, d10u8, 1); d12u8 = vext_u8(d12u8, d13u8, 1); q8u16 = vmlal_u8(q8u16, d3u8, d1u8); q10u16 = vmlal_u8(q10u16, d6u8, d1u8); q12u16 = vmlal_u8(q12u16, d9u8, d1u8); q14u16 = vmlal_u8(q14u16, d12u8, d1u8); d14u8 = vqrshrn_n_u16(q7u16, 7); d15u8 = vqrshrn_n_u16(q8u16, 7); d16u8 = vqrshrn_n_u16(q9u16, 7); d17u8 = vqrshrn_n_u16(q10u16, 7); d18u8 = vqrshrn_n_u16(q11u16, 7); d19u8 = vqrshrn_n_u16(q12u16, 7); d20u8 = vqrshrn_n_u16(q13u16, 7); d21u8 = vqrshrn_n_u16(q14u16, 7); d2u8 = vld1_u8(src_ptr); d3u8 = vld1_u8(src_ptr + 8); d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; d5u8 = vld1_u8(src_ptr); d6u8 = vld1_u8(src_ptr + 8); d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; d8u8 = vld1_u8(src_ptr); d9u8 = vld1_u8(src_ptr + 8); d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; d11u8 = vld1_u8(src_ptr); d12u8 = vld1_u8(src_ptr + 8); d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; q7u8 = vcombine_u8(d14u8, d15u8); q8u8 = vcombine_u8(d16u8, d17u8); q9u8 = vcombine_u8(d18u8, d19u8); q10u8 = vcombine_u8(d20u8, d21u8); vst1q_u8((uint8_t *)tmpp, q7u8); tmpp += 16; vst1q_u8((uint8_t *)tmpp, q8u8); tmpp += 16; vst1q_u8((uint8_t *)tmpp, q9u8); tmpp += 16; vst1q_u8((uint8_t *)tmpp, q10u8); tmpp += 16; } // First-pass filtering for rest 5 lines d14u8 = vld1_u8(src_ptr); d15u8 = vld1_u8(src_ptr + 8); d16u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; q9u16 = vmull_u8(d2u8, d0u8); q10u16 = vmull_u8(d3u8, d0u8); q11u16 = vmull_u8(d5u8, d0u8); q12u16 = vmull_u8(d6u8, d0u8); q13u16 = vmull_u8(d8u8, d0u8); q14u16 = vmull_u8(d9u8, d0u8); d2u8 = vext_u8(d2u8, d3u8, 1); d5u8 = vext_u8(d5u8, d6u8, 1); d8u8 = vext_u8(d8u8, d9u8, 1); q9u16 = vmlal_u8(q9u16, d2u8, d1u8); q11u16 = vmlal_u8(q11u16, d5u8, d1u8); q13u16 = vmlal_u8(q13u16, d8u8, d1u8); d3u8 = vext_u8(d3u8, d4u8, 1); d6u8 = vext_u8(d6u8, d7u8, 1); d9u8 = vext_u8(d9u8, d10u8, 1); q10u16 = vmlal_u8(q10u16, d3u8, d1u8); q12u16 = vmlal_u8(q12u16, d6u8, d1u8); q14u16 = vmlal_u8(q14u16, d9u8, d1u8); q1u16 = vmull_u8(d11u8, d0u8); q2u16 = vmull_u8(d12u8, d0u8); q3u16 = vmull_u8(d14u8, d0u8); q4u16 = vmull_u8(d15u8, d0u8); d11u8 = vext_u8(d11u8, d12u8, 1); d14u8 = vext_u8(d14u8, d15u8, 1); q1u16 = vmlal_u8(q1u16, d11u8, d1u8); q3u16 = vmlal_u8(q3u16, d14u8, d1u8); d12u8 = vext_u8(d12u8, d13u8, 1); d15u8 = vext_u8(d15u8, d16u8, 1); q2u16 = vmlal_u8(q2u16, d12u8, d1u8); q4u16 = vmlal_u8(q4u16, d15u8, d1u8); d10u8 = vqrshrn_n_u16(q9u16, 7); d11u8 = vqrshrn_n_u16(q10u16, 7); d12u8 = vqrshrn_n_u16(q11u16, 7); d13u8 = vqrshrn_n_u16(q12u16, 7); d14u8 = vqrshrn_n_u16(q13u16, 7); d15u8 = vqrshrn_n_u16(q14u16, 7); d16u8 = vqrshrn_n_u16(q1u16, 7); d17u8 = vqrshrn_n_u16(q2u16, 7); d18u8 = vqrshrn_n_u16(q3u16, 7); d19u8 = vqrshrn_n_u16(q4u16, 7); q5u8 = vcombine_u8(d10u8, d11u8); q6u8 = vcombine_u8(d12u8, d13u8); q7u8 = vcombine_u8(d14u8, d15u8); q8u8 = vcombine_u8(d16u8, d17u8); q9u8 = vcombine_u8(d18u8, d19u8); vst1q_u8((uint8_t *)tmpp, q5u8); tmpp += 16; vst1q_u8((uint8_t *)tmpp, q6u8); tmpp += 16; vst1q_u8((uint8_t *)tmpp, q7u8); tmpp += 16; vst1q_u8((uint8_t *)tmpp, q8u8); tmpp += 16; vst1q_u8((uint8_t *)tmpp, q9u8); // secondpass_filter d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]); d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]); tmpp = tmp; tmpp2 = tmpp + 272; q11u8 = vld1q_u8(tmpp); tmpp += 16; for (i = 4; i > 0; i--) { q12u8 = vld1q_u8(tmpp); tmpp += 16; q13u8 = vld1q_u8(tmpp); tmpp += 16; q14u8 = vld1q_u8(tmpp); tmpp += 16; q15u8 = vld1q_u8(tmpp); tmpp += 16; q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8); q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8); q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8); q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8); q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8); q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8); q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8); q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8); q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8); q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8); q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8); q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8); q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8); q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8); q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8); q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8); d2u8 = vqrshrn_n_u16(q1u16, 7); d3u8 = vqrshrn_n_u16(q2u16, 7); d4u8 = vqrshrn_n_u16(q3u16, 7); d5u8 = vqrshrn_n_u16(q4u16, 7); d6u8 = vqrshrn_n_u16(q5u16, 7); d7u8 = vqrshrn_n_u16(q6u16, 7); d8u8 = vqrshrn_n_u16(q7u16, 7); d9u8 = vqrshrn_n_u16(q8u16, 7); q1u8 = vcombine_u8(d2u8, d3u8); q2u8 = vcombine_u8(d4u8, d5u8); q3u8 = vcombine_u8(d6u8, d7u8); q4u8 = vcombine_u8(d8u8, d9u8); q11u8 = q15u8; vst1q_u8((uint8_t *)tmpp2, q1u8); tmpp2 += 16; vst1q_u8((uint8_t *)tmpp2, q2u8); tmpp2 += 16; vst1q_u8((uint8_t *)tmpp2, q3u8); tmpp2 += 16; vst1q_u8((uint8_t *)tmpp2, q4u8); tmpp2 += 16; } } // sub_pixel_variance16x16_neon q8s32 = vdupq_n_s32(0); q9s32 = vdupq_n_s32(0); q10s32 = vdupq_n_s32(0); tmpp = tmp + 272; for (i = 0; i < 8; i++) { // sub_pixel_variance16x16_neon_loop q0u8 = vld1q_u8(tmpp); tmpp += 16; q1u8 = vld1q_u8(tmpp); tmpp += 16; q2u8 = vld1q_u8(dst_ptr); dst_ptr += dst_pixels_per_line; q3u8 = vld1q_u8(dst_ptr); dst_ptr += dst_pixels_per_line; d0u8 = vget_low_u8(q0u8); d1u8 = vget_high_u8(q0u8); d2u8 = vget_low_u8(q1u8); d3u8 = vget_high_u8(q1u8); q11u16 = vsubl_u8(d0u8, vget_low_u8(q2u8)); q12u16 = vsubl_u8(d1u8, vget_high_u8(q2u8)); q13u16 = vsubl_u8(d2u8, vget_low_u8(q3u8)); q14u16 = vsubl_u8(d3u8, vget_high_u8(q3u8)); d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); q9s32 = vmlal_s16(q9s32, d22s16, d22s16); q10s32 = vmlal_s16(q10s32, d23s16, d23s16); d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); q9s32 = vmlal_s16(q9s32, d24s16, d24s16); q10s32 = vmlal_s16(q10s32, d25s16, d25s16); d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); q9s32 = vmlal_s16(q9s32, d26s16, d26s16); q10s32 = vmlal_s16(q10s32, d27s16, d27s16); d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); q9s32 = vmlal_s16(q9s32, d28s16, d28s16); q10s32 = vmlal_s16(q10s32, d29s16, d29s16); } q10s32 = vaddq_s32(q10s32, q9s32); q0s64 = vpaddlq_s32(q8s32); q1s64 = vpaddlq_s32(q10s32); d0s64 = vget_low_s64(q0s64); d1s64 = vget_high_s64(q0s64); d2s64 = vget_low_s64(q1s64); d3s64 = vget_high_s64(q1s64); d0s64 = vadd_s64(d0s64, d1s64); d1s64 = vadd_s64(d2s64, d3s64); q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64)); vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8); d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); return vget_lane_u32(d0u32, 0); }
void vp8_short_fdct8x4_neon( int16_t *input, int16_t *output, int pitch) { int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; int16x4_t d16s16, d17s16, d26s16, d27s16, d28s16, d29s16; uint16x4_t d28u16, d29u16; uint16x8_t q14u16; int16x8_t q0s16, q1s16, q2s16, q3s16; int16x8_t q11s16, q12s16, q13s16, q14s16, q15s16, qEmptys16; int32x4_t q9s32, q10s32, q11s32, q12s32; int16x8x2_t v2tmp0, v2tmp1; int32x4x2_t v2tmp2, v2tmp3; d16s16 = vdup_n_s16(5352); d17s16 = vdup_n_s16(2217); q9s32 = vdupq_n_s32(14500); q10s32 = vdupq_n_s32(7500); // Part one pitch >>= 1; q0s16 = vld1q_s16(input); input += pitch; q1s16 = vld1q_s16(input); input += pitch; q2s16 = vld1q_s16(input); input += pitch; q3s16 = vld1q_s16(input); v2tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q0s16), vreinterpretq_s32_s16(q2s16)); v2tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q1s16), vreinterpretq_s32_s16(q3s16)); v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]), // q0 vreinterpretq_s16_s32(v2tmp3.val[0])); // q1 v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]), // q2 vreinterpretq_s16_s32(v2tmp3.val[1])); // q3 q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]); q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]); q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]); q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]); q11s16 = vshlq_n_s16(q11s16, 3); q12s16 = vshlq_n_s16(q12s16, 3); q13s16 = vshlq_n_s16(q13s16, 3); q14s16 = vshlq_n_s16(q14s16, 3); q0s16 = vaddq_s16(q11s16, q12s16); q2s16 = vsubq_s16(q11s16, q12s16); q11s32 = q9s32; q12s32 = q10s32; d26s16 = vget_low_s16(q13s16); d27s16 = vget_high_s16(q13s16); d28s16 = vget_low_s16(q14s16); d29s16 = vget_high_s16(q14s16); q9s32 = vmlal_s16(q9s32, d28s16, d16s16); q10s32 = vmlal_s16(q10s32, d28s16, d17s16); q11s32 = vmlal_s16(q11s32, d29s16, d16s16); q12s32 = vmlal_s16(q12s32, d29s16, d17s16); q9s32 = vmlal_s16(q9s32, d26s16, d17s16); q10s32 = vmlsl_s16(q10s32, d26s16, d16s16); q11s32 = vmlal_s16(q11s32, d27s16, d17s16); q12s32 = vmlsl_s16(q12s32, d27s16, d16s16); d2s16 = vshrn_n_s32(q9s32, 12); d6s16 = vshrn_n_s32(q10s32, 12); d3s16 = vshrn_n_s32(q11s32, 12); d7s16 = vshrn_n_s32(q12s32, 12); q1s16 = vcombine_s16(d2s16, d3s16); q3s16 = vcombine_s16(d6s16, d7s16); // Part two q9s32 = vdupq_n_s32(12000); q10s32 = vdupq_n_s32(51000); v2tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q0s16), vreinterpretq_s32_s16(q2s16)); v2tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q1s16), vreinterpretq_s32_s16(q3s16)); v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]), // q0 vreinterpretq_s16_s32(v2tmp3.val[0])); // q1 v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]), // q2 vreinterpretq_s16_s32(v2tmp3.val[1])); // q3 q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]); q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]); q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]); q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]); q15s16 = vdupq_n_s16(7); q11s16 = vaddq_s16(q11s16, q15s16); q0s16 = vaddq_s16(q11s16, q12s16); q1s16 = vsubq_s16(q11s16, q12s16); q11s32 = q9s32; q12s32 = q10s32; d0s16 = vget_low_s16(q0s16); d1s16 = vget_high_s16(q0s16); d2s16 = vget_low_s16(q1s16); d3s16 = vget_high_s16(q1s16); d0s16 = vshr_n_s16(d0s16, 4); d4s16 = vshr_n_s16(d1s16, 4); d2s16 = vshr_n_s16(d2s16, 4); d6s16 = vshr_n_s16(d3s16, 4); d26s16 = vget_low_s16(q13s16); d27s16 = vget_high_s16(q13s16); d28s16 = vget_low_s16(q14s16); d29s16 = vget_high_s16(q14s16); q9s32 = vmlal_s16(q9s32, d28s16, d16s16); q10s32 = vmlal_s16(q10s32, d28s16, d17s16); q11s32 = vmlal_s16(q11s32, d29s16, d16s16); q12s32 = vmlal_s16(q12s32, d29s16, d17s16); q9s32 = vmlal_s16(q9s32, d26s16, d17s16); q10s32 = vmlsl_s16(q10s32, d26s16, d16s16); q11s32 = vmlal_s16(q11s32, d27s16, d17s16); q12s32 = vmlsl_s16(q12s32, d27s16, d16s16); d1s16 = vshrn_n_s32(q9s32, 16); d3s16 = vshrn_n_s32(q10s32, 16); d5s16 = vshrn_n_s32(q11s32, 16); d7s16 = vshrn_n_s32(q12s32, 16); qEmptys16 = vdupq_n_s16(0); q14u16 = vceqq_s16(q14s16, qEmptys16); q14u16 = vmvnq_u16(q14u16); d28u16 = vget_low_u16(q14u16); d29u16 = vget_high_u16(q14u16); d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d28u16)); d5s16 = vsub_s16(d5s16, vreinterpret_s16_u16(d29u16)); q0s16 = vcombine_s16(d0s16, d1s16); q1s16 = vcombine_s16(d2s16, d3s16); q2s16 = vcombine_s16(d4s16, d5s16); q3s16 = vcombine_s16(d6s16, d7s16); vst1q_s16(output, q0s16); vst1q_s16(output + 8, q1s16); vst1q_s16(output + 16, q2s16); vst1q_s16(output + 24, q3s16); return; }
unsigned int vp8_variance16x8_neon( const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int recon_stride, unsigned int *sse) { int i; int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; uint32x2_t d0u32, d10u32; int64x1_t d0s64, d1s64; uint8x16_t q0u8, q1u8, q2u8, q3u8; uint16x8_t q11u16, q12u16, q13u16, q14u16; int32x4_t q8s32, q9s32, q10s32; int64x2_t q0s64, q1s64, q5s64; q8s32 = vdupq_n_s32(0); q9s32 = vdupq_n_s32(0); q10s32 = vdupq_n_s32(0); for (i = 0; i < 4; i++) { // variance16x8_neon_loop q0u8 = vld1q_u8(src_ptr); src_ptr += source_stride; q1u8 = vld1q_u8(src_ptr); src_ptr += source_stride; __builtin_prefetch(src_ptr); q2u8 = vld1q_u8(ref_ptr); ref_ptr += recon_stride; q3u8 = vld1q_u8(ref_ptr); ref_ptr += recon_stride; __builtin_prefetch(ref_ptr); q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); q9s32 = vmlal_s16(q9s32, d22s16, d22s16); q10s32 = vmlal_s16(q10s32, d23s16, d23s16); d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); q9s32 = vmlal_s16(q9s32, d24s16, d24s16); q10s32 = vmlal_s16(q10s32, d25s16, d25s16); d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); q9s32 = vmlal_s16(q9s32, d26s16, d26s16); q10s32 = vmlal_s16(q10s32, d27s16, d27s16); d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); q9s32 = vmlal_s16(q9s32, d28s16, d28s16); q10s32 = vmlal_s16(q10s32, d29s16, d29s16); } q10s32 = vaddq_s32(q10s32, q9s32); q0s64 = vpaddlq_s32(q8s32); q1s64 = vpaddlq_s32(q10s32); d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64)); vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); return vget_lane_u32(d0u32, 0); }
unsigned int vpx_mse16x16_neon( const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int recon_stride, unsigned int *sse) { int i; int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; int64x1_t d0s64; uint8x16_t q0u8, q1u8, q2u8, q3u8; int32x4_t q7s32, q8s32, q9s32, q10s32; uint16x8_t q11u16, q12u16, q13u16, q14u16; int64x2_t q1s64; q7s32 = vdupq_n_s32(0); q8s32 = vdupq_n_s32(0); q9s32 = vdupq_n_s32(0); q10s32 = vdupq_n_s32(0); for (i = 0; i < 8; i++) { // mse16x16_neon_loop q0u8 = vld1q_u8(src_ptr); src_ptr += source_stride; q1u8 = vld1q_u8(src_ptr); src_ptr += source_stride; q2u8 = vld1q_u8(ref_ptr); ref_ptr += recon_stride; q3u8 = vld1q_u8(ref_ptr); ref_ptr += recon_stride; q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); q7s32 = vmlal_s16(q7s32, d22s16, d22s16); q8s32 = vmlal_s16(q8s32, d23s16, d23s16); d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); q9s32 = vmlal_s16(q9s32, d24s16, d24s16); q10s32 = vmlal_s16(q10s32, d25s16, d25s16); d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); q7s32 = vmlal_s16(q7s32, d26s16, d26s16); q8s32 = vmlal_s16(q8s32, d27s16, d27s16); d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); q9s32 = vmlal_s16(q9s32, d28s16, d28s16); q10s32 = vmlal_s16(q10s32, d29s16, d29s16); } q7s32 = vaddq_s32(q7s32, q8s32); q9s32 = vaddq_s32(q9s32, q10s32); q10s32 = vaddq_s32(q7s32, q9s32); q1s64 = vpaddlq_s32(q10s32); d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0); return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); }
void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, // unused int y_step_q4, // unused int w, int h) { int width; const uint8_t *s, *psrc; uint8_t *d, *pdst; uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8; uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32; uint8x16_t q12u8, q13u8, q14u8, q15u8; int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16; int16x4_t d24s16, d25s16, d26s16, d27s16; uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16; int16x8_t q0s16; uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; int32x4_t q1s32, q2s32, q14s32, q15s32; uint16x8x2_t q0x2u16; uint8x8x2_t d0x2u8, d1x2u8; uint32x2x2_t d0x2u32; uint16x4x2_t d0x2u16, d1x2u16; uint32x4x2_t q0x2u32; assert(x_step_q4 == 16); (void)x_step_q4; (void)y_step_q4; (void)filter_y; q0s16 = vld1q_s16(filter_x); src -= 3; // adjust for taps for (; h > 0; h -= 4, src += src_stride * 4, dst += dst_stride * 4) { // loop_horiz_v s = src; d24u8 = vld1_u8(s); s += src_stride; d25u8 = vld1_u8(s); s += src_stride; d26u8 = vld1_u8(s); s += src_stride; d27u8 = vld1_u8(s); q12u8 = vcombine_u8(d24u8, d25u8); q13u8 = vcombine_u8(d26u8, d27u8); q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8)); d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0])); d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0])); d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1])); d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1])); d0x2u8 = vtrn_u8(d24u8, d25u8); d1x2u8 = vtrn_u8(d26u8, d27u8); __builtin_prefetch(src + src_stride * 4); __builtin_prefetch(src + src_stride * 5); __builtin_prefetch(src + src_stride * 6); q8u16 = vmovl_u8(d0x2u8.val[0]); q9u16 = vmovl_u8(d0x2u8.val[1]); q10u16 = vmovl_u8(d1x2u8.val[0]); q11u16 = vmovl_u8(d1x2u8.val[1]); d16u16 = vget_low_u16(q8u16); d17u16 = vget_high_u16(q8u16); d18u16 = vget_low_u16(q9u16); d19u16 = vget_high_u16(q9u16); q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18 q9u16 = vcombine_u16(d17u16, d19u16); d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21 for (width = w, psrc = src + 7, pdst = dst; width > 0; width -= 4, psrc += 4, pdst += 4) { // loop_horiz s = psrc; d28u32 = vld1_dup_u32((const uint32_t *)s); s += src_stride; d29u32 = vld1_dup_u32((const uint32_t *)s); s += src_stride; d31u32 = vld1_dup_u32((const uint32_t *)s); s += src_stride; d30u32 = vld1_dup_u32((const uint32_t *)s); __builtin_prefetch(psrc + 64); d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32)); d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32)); d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28 vreinterpret_u8_u16(d1x2u16.val[0])); // d29 d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31 vreinterpret_u8_u16(d1x2u16.val[1])); // d30 __builtin_prefetch(psrc + 64 + src_stride); q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]); q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8)); d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0])); d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0])); q12u16 = vmovl_u8(d28u8); q13u16 = vmovl_u8(d29u8); __builtin_prefetch(psrc + 64 + src_stride * 2); d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16, d23s16, d24s16, q0s16); q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16, d24s16, d26s16, q0s16); q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16, d26s16, d27s16, q0s16); q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16, d27s16, d25s16, q0s16); __builtin_prefetch(psrc + 60 + src_stride * 3); d2u16 = vqrshrun_n_s32(q1s32, 7); d3u16 = vqrshrun_n_s32(q2s32, 7); d4u16 = vqrshrun_n_s32(q14s32, 7); d5u16 = vqrshrun_n_s32(q15s32, 7); q1u16 = vcombine_u16(d2u16, d3u16); q2u16 = vcombine_u16(d4u16, d5u16); d2u8 = vqmovn_u16(q1u16); d3u8 = vqmovn_u16(q2u16); d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8)); d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]), vreinterpret_u32_u16(d0x2u16.val[1])); d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]), vreinterpret_u8_u32(d0x2u32.val[1])); d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]); d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]); d = pdst; vst1_lane_u32((uint32_t *)d, d2u32, 0); d += dst_stride; vst1_lane_u32((uint32_t *)d, d3u32, 0); d += dst_stride; vst1_lane_u32((uint32_t *)d, d2u32, 1); d += dst_stride; vst1_lane_u32((uint32_t *)d, d3u32, 1); q8u16 = q9u16; d20s16 = d23s16; q11u16 = q12u16; q9u16 = q13u16; d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); } } return; }
void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, // unused int x_step_q4, // unused const int16_t *filter_y, int y_step_q4, int w, int h) { int height; const uint8_t *s; uint8_t *d; uint32x2_t d2u32, d3u32; uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32; int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16; int16x4_t d24s16, d25s16, d26s16, d27s16; uint16x4_t d2u16, d3u16, d4u16, d5u16; int16x8_t q0s16; uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; int32x4_t q1s32, q2s32, q14s32, q15s32; assert(y_step_q4 == 16); (void)x_step_q4; (void)y_step_q4; (void)filter_x; src -= src_stride * 3; q0s16 = vld1q_s16(filter_y); for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h s = src; d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0); s += src_stride; d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1); s += src_stride; d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0); s += src_stride; d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1); s += src_stride; d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0); s += src_stride; d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1); s += src_stride; d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0); s += src_stride; q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32)); q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32)); q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32)); q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32)); d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); d = dst; for (height = h; height > 0; height -= 4) { // loop_vert d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0); s += src_stride; d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0); s += src_stride; d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1); s += src_stride; d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1); s += src_stride; q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32)); q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32)); d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); __builtin_prefetch(d); __builtin_prefetch(d + dst_stride); q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d24s16, q0s16); __builtin_prefetch(d + dst_stride * 2); __builtin_prefetch(d + dst_stride * 3); q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d24s16, d26s16, q0s16); __builtin_prefetch(s); __builtin_prefetch(s + src_stride); q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, d22s16, d24s16, d26s16, d27s16, q0s16); __builtin_prefetch(s + src_stride * 2); __builtin_prefetch(s + src_stride * 3); q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, d24s16, d26s16, d27s16, d25s16, q0s16); d2u16 = vqrshrun_n_s32(q1s32, 7); d3u16 = vqrshrun_n_s32(q2s32, 7); d4u16 = vqrshrun_n_s32(q14s32, 7); d5u16 = vqrshrun_n_s32(q15s32, 7); q1u16 = vcombine_u16(d2u16, d3u16); q2u16 = vcombine_u16(d4u16, d5u16); d2u32 = vreinterpret_u32_u8(vqmovn_u16(q1u16)); d3u32 = vreinterpret_u32_u8(vqmovn_u16(q2u16)); vst1_lane_u32((uint32_t *)d, d2u32, 0); d += dst_stride; vst1_lane_u32((uint32_t *)d, d2u32, 1); d += dst_stride; vst1_lane_u32((uint32_t *)d, d3u32, 0); d += dst_stride; vst1_lane_u32((uint32_t *)d, d3u32, 1); d += dst_stride; q8u16 = q10u16; d18s16 = d22s16; d19s16 = d24s16; q10u16 = q13u16; d22s16 = d25s16; } } return; }