static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, uint8_t *output_ptr, unsigned int src_pixels_per_line, int pixel_step, unsigned int output_height, unsigned int output_width, const uint8_t *filter) { const uint8x8_t f0 = vmov_n_u8(filter[0]); const uint8x8_t f1 = vmov_n_u8(filter[1]); unsigned int i, j; for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; j += 16) { const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]); const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]); const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0); const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1); const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS); const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0); const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1); const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS); vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi)); } // Next row... src_ptr += src_pixels_per_line; output_ptr += output_width; } }
inline unsigned int GetByteMask2(uint8x16_t a, uint8x16_t b) { uint8x16_t am = vandq_u8(a, compaction_mask); uint8x16_t bm = vandq_u8(b, compaction_mask); uint8x8_t a_sum = vpadd_u8(vget_high_u8(am), vget_low_u8(am)); uint8x8_t b_sum = vpadd_u8(vget_high_u8(bm), vget_low_u8(bm)); a_sum = vpadd_u8(b_sum, a_sum); a_sum = vpadd_u8(a_sum, a_sum); return vget_lane_u32(vreinterpret_u32_u8(a_sum), 0); }
void test_vget_lowu8 (void) { register uint8x8_t out_uint8x8_t asm ("d18"); uint8x16_t arg0_uint8x16_t; out_uint8x8_t = vget_low_u8 (arg0_uint8x16_t); }
void test_vget_lowu8 (void) { uint8x8_t out_uint8x8_t; uint8x16_t arg0_uint8x16_t; out_uint8x8_t = vget_low_u8 (arg0_uint8x16_t); }
SIMD_INLINE uint32x4_t SquaredDifferenceSumMasked(const uint8x16_t & a, const uint8x16_t & b, const uint8x16_t & mask) { uint8x16_t ad = vandq_u8(vabdq_u8(a, b), mask); uint16x8_t lo = Square(vget_low_u8(ad)); uint16x8_t hi = Square(vget_high_u8(ad)); return vaddq_u32(vpaddlq_u16(lo), vpaddlq_u16(hi)); }
static v16 mulby(uint8_t x, v16 v) { #ifdef LIBRS_USE_NEON #define uint8x16_to_8x8x2(v) ((uint8x8x2_t) { vget_low_u8(v), vget_high_u8(v) }) v16 lo, hi; lo = v & VEC16(0x0f); hi = vshrq_n_u8(v, 4); lo = vcombine_u8( vtbl2_u8(uint8x16_to_8x8x2(rs_nibmul[x].lo), vget_low_u8(lo)), vtbl2_u8(uint8x16_to_8x8x2(rs_nibmul[x].lo), vget_high_u8(lo))); hi = vcombine_u8( vtbl2_u8(uint8x16_to_8x8x2(rs_nibmul[x].hi), vget_low_u8(hi)), vtbl2_u8(uint8x16_to_8x8x2(rs_nibmul[x].hi), vget_high_u8(hi))); return lo ^ hi; #elif defined(LIBRS_USE_SSSE3) v16 lo, hi; lo = v & VEC16(0x0f); hi = __builtin_ia32_psrawi128(v, 4); hi &= VEC16(0x0f); lo = __builtin_ia32_pshufb128(rs_nibmul[x].lo, lo); hi = __builtin_ia32_pshufb128(rs_nibmul[x].hi, hi); return lo ^ hi; #else v16 vv = VEC16(0); while (x != 0) { if (x & 1) vv ^= v; x >>= 1; v = mul2(v); } return vv; #endif }
void vp9_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref, const int ref_stride, const int height) { int i; uint16x8_t vec_sum_lo = vdupq_n_u16(0); uint16x8_t vec_sum_hi = vdupq_n_u16(0); const int shift_factor = ((height >> 5) + 3) * -1; const int16x8_t vec_shift = vdupq_n_s16(shift_factor); for (i = 0; i < height; i += 8) { const uint8x16_t vec_row1 = vld1q_u8(ref); const uint8x16_t vec_row2 = vld1q_u8(ref + ref_stride); const uint8x16_t vec_row3 = vld1q_u8(ref + ref_stride * 2); const uint8x16_t vec_row4 = vld1q_u8(ref + ref_stride * 3); const uint8x16_t vec_row5 = vld1q_u8(ref + ref_stride * 4); const uint8x16_t vec_row6 = vld1q_u8(ref + ref_stride * 5); const uint8x16_t vec_row7 = vld1q_u8(ref + ref_stride * 6); const uint8x16_t vec_row8 = vld1q_u8(ref + ref_stride * 7); vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row1)); vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row1)); vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row2)); vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row2)); vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row3)); vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row3)); vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row4)); vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row4)); vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row5)); vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row5)); vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row6)); vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row6)); vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row7)); vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row7)); vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row8)); vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row8)); ref += ref_stride * 8; } vec_sum_lo = vshlq_u16(vec_sum_lo, vec_shift); vec_sum_hi = vshlq_u16(vec_sum_hi, vec_shift); vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_lo)); hbuf += 8; vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi)); }
static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) { const uint32_t* const end = argb_data + (num_pixels & ~3); const uint8x8_t shuffle = vld1_u8(kGreenShuffle); for (; argb_data < end; argb_data += 4) { const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data); const uint8x16_t greens = vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle), vtbl1_u8(vget_high_u8(argb), shuffle)); vst1q_u8((uint8_t*)argb_data, vaddq_u8(argb, greens)); } // fallthrough and finish off with plain-C VP8LAddGreenToBlueAndRed_C(argb_data, num_pixels & 3); }
static INLINE void scaledconvolve_vert_w16( const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, const ptrdiff_t dst_stride, const InterpKernel *const y_filters, const int y0_q4, const int y_step_q4, const int w, const int h) { int x, y; int y_q4 = y0_q4; src -= src_stride * (SUBPEL_TAPS / 2 - 1); y = h; do { const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; if (y_q4 & SUBPEL_MASK) { x = 0; do { const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); uint8x16_t ss[8]; uint8x8_t s[8], d[2]; load_u8_16x8(src_y, src_stride, &ss[0], &ss[1], &ss[2], &ss[3], &ss[4], &ss[5], &ss[6], &ss[7]); s[0] = vget_low_u8(ss[0]); s[1] = vget_low_u8(ss[1]); s[2] = vget_low_u8(ss[2]); s[3] = vget_low_u8(ss[3]); s[4] = vget_low_u8(ss[4]); s[5] = vget_low_u8(ss[5]); s[6] = vget_low_u8(ss[6]); s[7] = vget_low_u8(ss[7]); d[0] = scale_filter_8(s, filters); s[0] = vget_high_u8(ss[0]); s[1] = vget_high_u8(ss[1]); s[2] = vget_high_u8(ss[2]); s[3] = vget_high_u8(ss[3]); s[4] = vget_high_u8(ss[4]); s[5] = vget_high_u8(ss[5]); s[6] = vget_high_u8(ss[6]); s[7] = vget_high_u8(ss[7]); d[1] = scale_filter_8(s, filters); vst1q_u8(&dst[x], vcombine_u8(d[0], d[1])); src_y += 16; x += 16; } while (x < w); } else { memcpy(dst, &src_y[3 * src_stride], w); } dst += dst_stride; y_q4 += y_step_q4; } while (--y); }
int16_t vp9_int_pro_col_neon(uint8_t const *ref, const int width) { int i; uint16x8_t vec_sum = vdupq_n_u16(0); for (i = 0; i < width; i += 16) { const uint8x16_t vec_row = vld1q_u8(ref); vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row)); vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row)); ref += 16; } return horizontal_add_u16x8(vec_sum); }
void vp9_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { int j, k; uint8x8_t d2u8 = vdup_n_u8(0); uint8x16_t q0u8 = vdupq_n_u8(0); uint8x16_t q1u8 = vdupq_n_u8(0); (void)above; for (k = 0; k < 2; k++, left += 16) { q1u8 = vld1q_u8(left); d2u8 = vget_low_u8(q1u8); for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { q0u8 = vdupq_lane_u8(d2u8, 0); vst1q_u8(dst, q0u8); vst1q_u8(dst + 16, q0u8); dst += stride; q0u8 = vdupq_lane_u8(d2u8, 1); vst1q_u8(dst, q0u8); vst1q_u8(dst + 16, q0u8); dst += stride; q0u8 = vdupq_lane_u8(d2u8, 2); vst1q_u8(dst, q0u8); vst1q_u8(dst + 16, q0u8); dst += stride; q0u8 = vdupq_lane_u8(d2u8, 3); vst1q_u8(dst, q0u8); vst1q_u8(dst + 16, q0u8); dst += stride; q0u8 = vdupq_lane_u8(d2u8, 4); vst1q_u8(dst, q0u8); vst1q_u8(dst + 16, q0u8); dst += stride; q0u8 = vdupq_lane_u8(d2u8, 5); vst1q_u8(dst, q0u8); vst1q_u8(dst + 16, q0u8); dst += stride; q0u8 = vdupq_lane_u8(d2u8, 6); vst1q_u8(dst, q0u8); vst1q_u8(dst + 16, q0u8); dst += stride; q0u8 = vdupq_lane_u8(d2u8, 7); vst1q_u8(dst, q0u8); vst1q_u8(dst + 16, q0u8); dst += stride; } } }
int normL1_(const uchar* a, const uchar* b, int n) { int j = 0, d = 0; #if CV_SSE __m128i d0 = _mm_setzero_si128(); for( ; j <= n - 16; j += 16 ) { __m128i t0 = _mm_loadu_si128((const __m128i*)(a + j)); __m128i t1 = _mm_loadu_si128((const __m128i*)(b + j)); d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1)); } for( ; j <= n - 4; j += 4 ) { __m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j)); __m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j)); d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1)); } d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0))); #elif CV_NEON uint32x4_t v_sum = vdupq_n_u32(0.0f); for ( ; j <= n - 16; j += 16) { uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j)); uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst)); v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high))); v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high))); } uint CV_DECL_ALIGNED(16) buf[4]; vst1q_u32(buf, v_sum); d = buf[0] + buf[1] + buf[2] + buf[3]; #endif { for( ; j <= n - 4; j += 4 ) { d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) + std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]); } } for( ; j < n; j++ ) d += std::abs(a[j] - b[j]); return d; }
void vp9_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { int j, k; uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16; uint8x16_t q0u8, q1u8; int16x8_t q0s16, q1s16, q8s16, q11s16; uint16x4_t d20u16; uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8; q0u8 = vld1q_dup_u8(above - 1); q1u8 = vld1q_u8(above); q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8)); q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8)); for (k = 0; k < 2; k++, left += 8) { d18u8 = vld1_u8(left); q10u16 = vmovl_u8(d18u8); d20u16 = vget_low_u16(q10u16); for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { q0u16 = vdupq_lane_u16(d20u16, 0); q8u16 = vdupq_lane_u16(d20u16, 1); q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16)); q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16)); q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16)); q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16)); d2u8 = vqmovun_s16(q1s16); d3u8 = vqmovun_s16(q0s16); d22u8 = vqmovun_s16(q11s16); d23u8 = vqmovun_s16(q8s16); vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); dst += stride; vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); dst += stride; q0u16 = vdupq_lane_u16(d20u16, 2); q8u16 = vdupq_lane_u16(d20u16, 3); q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16)); q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16)); q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16)); q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16)); d2u8 = vqmovun_s16(q1s16); d3u8 = vqmovun_s16(q0s16); d22u8 = vqmovun_s16(q11s16); d23u8 = vqmovun_s16(q8s16); vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); dst += stride; vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); dst += stride; } } }
static INLINE void vp8_loop_filter_simple_vertical_edge_neon( unsigned char *s, int p, const unsigned char *blimit) { unsigned char *src1; uint8x16_t qblimit, q0u8; uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q11u8, q12u8, q14u8, q15u8; int16x8_t q2s16, q13s16, q11s16; int8x8_t d28s8, d29s8; int8x16_t q2s8, q3s8, q10s8, q11s8, q14s8; uint8x8x4_t d0u8x4; // d6, d7, d8, d9 uint8x8x4_t d1u8x4; // d10, d11, d12, d13 uint8x8x2_t d2u8x2; // d12, d13 uint8x8x2_t d3u8x2; // d14, d15 qblimit = vdupq_n_u8(*blimit); src1 = s - 2; d0u8x4 = read_4x8(src1, p); src1 += p * 8; d1u8x4 = read_4x8(src1, p); q3u8 = vcombine_u8(d0u8x4.val[0], d1u8x4.val[0]); // d6 d10 q4u8 = vcombine_u8(d0u8x4.val[2], d1u8x4.val[2]); // d8 d12 q5u8 = vcombine_u8(d0u8x4.val[1], d1u8x4.val[1]); // d7 d11 q6u8 = vcombine_u8(d0u8x4.val[3], d1u8x4.val[3]); // d9 d13 q15u8 = vabdq_u8(q5u8, q4u8); q14u8 = vabdq_u8(q3u8, q6u8); q15u8 = vqaddq_u8(q15u8, q15u8); q14u8 = vshrq_n_u8(q14u8, 1); q0u8 = vdupq_n_u8(0x80); q11s16 = vdupq_n_s16(3); q15u8 = vqaddq_u8(q15u8, q14u8); q3u8 = veorq_u8(q3u8, q0u8); q4u8 = veorq_u8(q4u8, q0u8); q5u8 = veorq_u8(q5u8, q0u8); q6u8 = veorq_u8(q6u8, q0u8); q15u8 = vcgeq_u8(qblimit, q15u8); q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q4u8)), vget_low_s8(vreinterpretq_s8_u8(q5u8))); q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q4u8)), vget_high_s8(vreinterpretq_s8_u8(q5u8))); q14s8 = vqsubq_s8(vreinterpretq_s8_u8(q3u8), vreinterpretq_s8_u8(q6u8)); q2s16 = vmulq_s16(q2s16, q11s16); q13s16 = vmulq_s16(q13s16, q11s16); q11u8 = vdupq_n_u8(3); q12u8 = vdupq_n_u8(4); q2s16 = vaddw_s8(q2s16, vget_low_s8(q14s8)); q13s16 = vaddw_s8(q13s16, vget_high_s8(q14s8)); d28s8 = vqmovn_s16(q2s16); d29s8 = vqmovn_s16(q13s16); q14s8 = vcombine_s8(d28s8, d29s8); q14s8 = vandq_s8(q14s8, vreinterpretq_s8_u8(q15u8)); q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q11u8)); q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q12u8)); q2s8 = vshrq_n_s8(q2s8, 3); q14s8 = vshrq_n_s8(q3s8, 3); q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q5u8), q2s8); q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q4u8), q14s8); q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8); q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8); d2u8x2.val[0] = vget_low_u8(q6u8); // d12 d2u8x2.val[1] = vget_low_u8(q7u8); // d14 d3u8x2.val[0] = vget_high_u8(q6u8); // d13 d3u8x2.val[1] = vget_high_u8(q7u8); // d15 src1 = s - 1; write_2x8(src1, p, d2u8x2, d3u8x2); }
inline bool isFound(uint8x16_t x) { uint8x8_t xx = vorr_u8(vget_low_u8(x), vget_high_u8(x)); return vget_lane_u64(vreinterpret_u64_u8(xx), 0); }
void vp9_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { int j, k; uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16; uint8x16_t q0u8, q1u8, q2u8; int16x8_t q12s16, q13s16, q14s16, q15s16; uint16x4_t d6u16; uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8; q0u8 = vld1q_dup_u8(above - 1); q1u8 = vld1q_u8(above); q2u8 = vld1q_u8(above + 16); q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8)); q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8)); q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8)); q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8)); for (k = 0; k < 4; k++, left += 8) { d26u8 = vld1_u8(left); q3u16 = vmovl_u8(d26u8); d6u16 = vget_low_u16(q3u16); for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) { q0u16 = vdupq_lane_u16(d6u16, 0); q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q10u16)); q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q11u16)); d0u8 = vqmovun_s16(q12s16); d1u8 = vqmovun_s16(q13s16); d2u8 = vqmovun_s16(q14s16); d3u8 = vqmovun_s16(q15s16); q0u8 = vcombine_u8(d0u8, d1u8); q1u8 = vcombine_u8(d2u8, d3u8); vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); dst += stride; q0u16 = vdupq_lane_u16(d6u16, 1); q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q10u16)); q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q11u16)); d0u8 = vqmovun_s16(q12s16); d1u8 = vqmovun_s16(q13s16); d2u8 = vqmovun_s16(q14s16); d3u8 = vqmovun_s16(q15s16); q0u8 = vcombine_u8(d0u8, d1u8); q1u8 = vcombine_u8(d2u8, d3u8); vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); dst += stride; q0u16 = vdupq_lane_u16(d6u16, 2); q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q10u16)); q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q11u16)); d0u8 = vqmovun_s16(q12s16); d1u8 = vqmovun_s16(q13s16); d2u8 = vqmovun_s16(q14s16); d3u8 = vqmovun_s16(q15s16); q0u8 = vcombine_u8(d0u8, d1u8); q1u8 = vcombine_u8(d2u8, d3u8); vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); dst += stride; q0u16 = vdupq_lane_u16(d6u16, 3); q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q10u16)); q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q11u16)); d0u8 = vqmovun_s16(q12s16); d1u8 = vqmovun_s16(q13s16); d2u8 = vqmovun_s16(q14s16); d3u8 = vqmovun_s16(q15s16); q0u8 = vcombine_u8(d0u8, d1u8); q1u8 = vcombine_u8(d2u8, d3u8); vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); dst += stride; } } }
void qt_blend_argb32_on_argb32_neon(uchar *destPixels, int dbpl, const uchar *srcPixels, int sbpl, int w, int h, int const_alpha) { const uint *src = (const uint *) srcPixels; uint *dst = (uint *) destPixels; int16x8_t half = vdupq_n_s16(0x80); int16x8_t full = vdupq_n_s16(0xff); if (const_alpha == 256) { for (int y = 0; y < h; ++y) { int x = 0; for (; x < w-3; x += 4) { int32x4_t src32 = vld1q_s32((int32_t *)&src[x]); if ((src[x] & src[x+1] & src[x+2] & src[x+3]) >= 0xff000000) { // all opaque vst1q_s32((int32_t *)&dst[x], src32); } else if (src[x] | src[x+1] | src[x+2] | src[x+3]) { int32x4_t dst32 = vld1q_s32((int32_t *)&dst[x]); const uint8x16_t src8 = vreinterpretq_u8_s32(src32); const uint8x16_t dst8 = vreinterpretq_u8_s32(dst32); const uint8x8_t src8_low = vget_low_u8(src8); const uint8x8_t dst8_low = vget_low_u8(dst8); const uint8x8_t src8_high = vget_high_u8(src8); const uint8x8_t dst8_high = vget_high_u8(dst8); const int16x8_t src16_low = vreinterpretq_s16_u16(vmovl_u8(src8_low)); const int16x8_t dst16_low = vreinterpretq_s16_u16(vmovl_u8(dst8_low)); const int16x8_t src16_high = vreinterpretq_s16_u16(vmovl_u8(src8_high)); const int16x8_t dst16_high = vreinterpretq_s16_u16(vmovl_u8(dst8_high)); const int16x8_t result16_low = qvsource_over_s16(src16_low, dst16_low, half, full); const int16x8_t result16_high = qvsource_over_s16(src16_high, dst16_high, half, full); const int32x2_t result32_low = vreinterpret_s32_s8(vmovn_s16(result16_low)); const int32x2_t result32_high = vreinterpret_s32_s8(vmovn_s16(result16_high)); vst1q_s32((int32_t *)&dst[x], vcombine_s32(result32_low, result32_high)); } } for (; x<w; ++x) { uint s = src[x]; if (s >= 0xff000000) dst[x] = s; else if (s != 0) dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s)); } dst = (quint32 *)(((uchar *) dst) + dbpl); src = (const quint32 *)(((const uchar *) src) + sbpl); } } else if (const_alpha != 0) { const_alpha = (const_alpha * 255) >> 8; int16x8_t const_alpha16 = vdupq_n_s16(const_alpha); for (int y = 0; y < h; ++y) { int x = 0; for (; x < w-3; x += 4) { if (src[x] | src[x+1] | src[x+2] | src[x+3]) { int32x4_t src32 = vld1q_s32((int32_t *)&src[x]); int32x4_t dst32 = vld1q_s32((int32_t *)&dst[x]); const uint8x16_t src8 = vreinterpretq_u8_s32(src32); const uint8x16_t dst8 = vreinterpretq_u8_s32(dst32); const uint8x8_t src8_low = vget_low_u8(src8); const uint8x8_t dst8_low = vget_low_u8(dst8); const uint8x8_t src8_high = vget_high_u8(src8); const uint8x8_t dst8_high = vget_high_u8(dst8); const int16x8_t src16_low = vreinterpretq_s16_u16(vmovl_u8(src8_low)); const int16x8_t dst16_low = vreinterpretq_s16_u16(vmovl_u8(dst8_low)); const int16x8_t src16_high = vreinterpretq_s16_u16(vmovl_u8(src8_high)); const int16x8_t dst16_high = vreinterpretq_s16_u16(vmovl_u8(dst8_high)); const int16x8_t srcalpha16_low = qvbyte_mul_s16(src16_low, const_alpha16, half); const int16x8_t srcalpha16_high = qvbyte_mul_s16(src16_high, const_alpha16, half); const int16x8_t result16_low = qvsource_over_s16(srcalpha16_low, dst16_low, half, full); const int16x8_t result16_high = qvsource_over_s16(srcalpha16_high, dst16_high, half, full); const int32x2_t result32_low = vreinterpret_s32_s8(vmovn_s16(result16_low)); const int32x2_t result32_high = vreinterpret_s32_s8(vmovn_s16(result16_high)); vst1q_s32((int32_t *)&dst[x], vcombine_s32(result32_low, result32_high)); } } for (; x<w; ++x) { uint s = src[x]; if (s != 0) { s = BYTE_MUL(s, const_alpha); dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s)); } } dst = (quint32 *)(((uchar *) dst) + dbpl); src = (const quint32 *)(((const uchar *) src) + sbpl); } }
static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb, const uint8x8_t shuffle) { return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle), vtbl1_u8(vget_high_u8(argb), shuffle)); }
inline uint8x8_t vget_low(const uint8x16_t & v) { return vget_low_u8 (v); }
void vp8_mbloop_filter_vertical_edge_uv_neon( unsigned char *u, int pitch, unsigned char blimit, unsigned char limit, unsigned char thresh, unsigned char *v) { unsigned char *us, *ud; unsigned char *vs, *vd; uint8x16_t qblimit, qlimit, qthresh, q3, q4; uint8x16_t q5, q6, q7, q8, q9, q10; uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; uint8x8_t d15, d16, d17, d18, d19, d20, d21; uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3; uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7; uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11; qblimit = vdupq_n_u8(blimit); qlimit = vdupq_n_u8(limit); qthresh = vdupq_n_u8(thresh); us = u - 4; vs = v - 4; d6 = vld1_u8(us); us += pitch; d7 = vld1_u8(vs); vs += pitch; d8 = vld1_u8(us); us += pitch; d9 = vld1_u8(vs); vs += pitch; d10 = vld1_u8(us); us += pitch; d11 = vld1_u8(vs); vs += pitch; d12 = vld1_u8(us); us += pitch; d13 = vld1_u8(vs); vs += pitch; d14 = vld1_u8(us); us += pitch; d15 = vld1_u8(vs); vs += pitch; d16 = vld1_u8(us); us += pitch; d17 = vld1_u8(vs); vs += pitch; d18 = vld1_u8(us); us += pitch; d19 = vld1_u8(vs); vs += pitch; d20 = vld1_u8(us); d21 = vld1_u8(vs); q3 = vcombine_u8(d6, d7); q4 = vcombine_u8(d8, d9); q5 = vcombine_u8(d10, d11); q6 = vcombine_u8(d12, d13); q7 = vcombine_u8(d14, d15); q8 = vcombine_u8(d16, d17); q9 = vcombine_u8(d18, d19); q10 = vcombine_u8(d20, d21); q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), vreinterpretq_u16_u32(q2tmp2.val[0])); q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), vreinterpretq_u16_u32(q2tmp3.val[0])); q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), vreinterpretq_u16_u32(q2tmp2.val[1])); q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), vreinterpretq_u16_u32(q2tmp3.val[1])); q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), vreinterpretq_u8_u16(q2tmp5.val[0])); q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), vreinterpretq_u8_u16(q2tmp5.val[1])); q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), vreinterpretq_u8_u16(q2tmp7.val[0])); q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), vreinterpretq_u8_u16(q2tmp7.val[1])); q3 = q2tmp8.val[0]; q4 = q2tmp8.val[1]; q5 = q2tmp9.val[0]; q6 = q2tmp9.val[1]; q7 = q2tmp10.val[0]; q8 = q2tmp10.val[1]; q9 = q2tmp11.val[0]; q10 = q2tmp11.val[1]; vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9, q10, &q4, &q5, &q6, &q7, &q8, &q9); q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), vreinterpretq_u16_u32(q2tmp2.val[0])); q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), vreinterpretq_u16_u32(q2tmp3.val[0])); q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), vreinterpretq_u16_u32(q2tmp2.val[1])); q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), vreinterpretq_u16_u32(q2tmp3.val[1])); q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), vreinterpretq_u8_u16(q2tmp5.val[0])); q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), vreinterpretq_u8_u16(q2tmp5.val[1])); q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), vreinterpretq_u8_u16(q2tmp7.val[0])); q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), vreinterpretq_u8_u16(q2tmp7.val[1])); q3 = q2tmp8.val[0]; q4 = q2tmp8.val[1]; q5 = q2tmp9.val[0]; q6 = q2tmp9.val[1]; q7 = q2tmp10.val[0]; q8 = q2tmp10.val[1]; q9 = q2tmp11.val[0]; q10 = q2tmp11.val[1]; ud = u - 4; vst1_u8(ud, vget_low_u8(q3)); ud += pitch; vst1_u8(ud, vget_low_u8(q4)); ud += pitch; vst1_u8(ud, vget_low_u8(q5)); ud += pitch; vst1_u8(ud, vget_low_u8(q6)); ud += pitch; vst1_u8(ud, vget_low_u8(q7)); ud += pitch; vst1_u8(ud, vget_low_u8(q8)); ud += pitch; vst1_u8(ud, vget_low_u8(q9)); ud += pitch; vst1_u8(ud, vget_low_u8(q10)); vd = v - 4; vst1_u8(vd, vget_high_u8(q3)); vd += pitch; vst1_u8(vd, vget_high_u8(q4)); vd += pitch; vst1_u8(vd, vget_high_u8(q5)); vd += pitch; vst1_u8(vd, vget_high_u8(q6)); vd += pitch; vst1_u8(vd, vget_high_u8(q7)); vd += pitch; vst1_u8(vd, vget_high_u8(q8)); vd += pitch; vst1_u8(vd, vget_high_u8(q9)); vd += pitch; vst1_u8(vd, vget_high_u8(q10)); return; }
unsigned int vp8_variance_halfpixvar16x16_hv_neon( const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int recon_stride, unsigned int *sse) { int i; uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; int16x4_t d0s16, d1s16, d2s16, d3s16, d10s16, d11s16, d12s16, d13s16; int16x4_t d18s16, d19s16, d20s16, d21s16, d22s16, d23s16, d24s16, d25s16; uint32x2_t d0u32, d10u32; int64x1_t d0s64, d1s64, d2s64, d3s64; uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8; uint16x8_t q0u16, q1u16, q5u16, q6u16, q9u16, q10u16, q11u16, q12u16; int32x4_t q13s32, q14s32, q15s32; int64x2_t q0s64, q1s64, q5s64; q13s32 = vdupq_n_s32(0); q14s32 = vdupq_n_s32(0); q15s32 = vdupq_n_s32(0); q0u8 = vld1q_u8(src_ptr); q1u8 = vld1q_u8(src_ptr + 16); src_ptr += source_stride; q1u8 = vextq_u8(q0u8, q1u8, 1); q0u8 = vrhaddq_u8(q0u8, q1u8); for (i = 0; i < 4; i++) { // vp8_filt_fpo16x16s_4_0_loop_neon q2u8 = vld1q_u8(src_ptr); q3u8 = vld1q_u8(src_ptr + 16); src_ptr += source_stride; q4u8 = vld1q_u8(src_ptr); q5u8 = vld1q_u8(src_ptr + 16); src_ptr += source_stride; q6u8 = vld1q_u8(src_ptr); q7u8 = vld1q_u8(src_ptr + 16); src_ptr += source_stride; q8u8 = vld1q_u8(src_ptr); q9u8 = vld1q_u8(src_ptr + 16); src_ptr += source_stride; q3u8 = vextq_u8(q2u8, q3u8, 1); q5u8 = vextq_u8(q4u8, q5u8, 1); q7u8 = vextq_u8(q6u8, q7u8, 1); q9u8 = vextq_u8(q8u8, q9u8, 1); q1u8 = vrhaddq_u8(q2u8, q3u8); q2u8 = vrhaddq_u8(q4u8, q5u8); q3u8 = vrhaddq_u8(q6u8, q7u8); q4u8 = vrhaddq_u8(q8u8, q9u8); q0u8 = vrhaddq_u8(q0u8, q1u8); q1u8 = vrhaddq_u8(q1u8, q2u8); q2u8 = vrhaddq_u8(q2u8, q3u8); q3u8 = vrhaddq_u8(q3u8, q4u8); q5u8 = vld1q_u8(ref_ptr); ref_ptr += recon_stride; q6u8 = vld1q_u8(ref_ptr); ref_ptr += recon_stride; q7u8 = vld1q_u8(ref_ptr); ref_ptr += recon_stride; q8u8 = vld1q_u8(ref_ptr); ref_ptr += recon_stride; d0u8 = vget_low_u8(q0u8); d1u8 = vget_high_u8(q0u8); d2u8 = vget_low_u8(q1u8); d3u8 = vget_high_u8(q1u8); d4u8 = vget_low_u8(q2u8); d5u8 = vget_high_u8(q2u8); d6u8 = vget_low_u8(q3u8); d7u8 = vget_high_u8(q3u8); q9u16 = vsubl_u8(d0u8, vget_low_u8(q5u8)); q10u16 = vsubl_u8(d1u8, vget_high_u8(q5u8)); q11u16 = vsubl_u8(d2u8, vget_low_u8(q6u8)); q12u16 = vsubl_u8(d3u8, vget_high_u8(q6u8)); q0u16 = vsubl_u8(d4u8, vget_low_u8(q7u8)); q1u16 = vsubl_u8(d5u8, vget_high_u8(q7u8)); q5u16 = vsubl_u8(d6u8, vget_low_u8(q8u8)); q6u16 = vsubl_u8(d7u8, vget_high_u8(q8u8)); d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q9u16)); q14s32 = vmlal_s16(q14s32, d18s16, d18s16); q15s32 = vmlal_s16(q15s32, d19s16, d19s16); d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q10u16)); q14s32 = vmlal_s16(q14s32, d20s16, d20s16); q15s32 = vmlal_s16(q15s32, d21s16, d21s16); d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q11u16)); q14s32 = vmlal_s16(q14s32, d22s16, d22s16); q15s32 = vmlal_s16(q15s32, d23s16, d23s16); d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q12u16)); q14s32 = vmlal_s16(q14s32, d24s16, d24s16); q15s32 = vmlal_s16(q15s32, d25s16, d25s16); d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16)); d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16)); q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q0u16)); q14s32 = vmlal_s16(q14s32, d0s16, d0s16); q15s32 = vmlal_s16(q15s32, d1s16, d1s16); d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16)); d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16)); q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q1u16)); q14s32 = vmlal_s16(q14s32, d2s16, d2s16); q15s32 = vmlal_s16(q15s32, d3s16, d3s16); d10s16 = vreinterpret_s16_u16(vget_low_u16(q5u16)); d11s16 = vreinterpret_s16_u16(vget_high_u16(q5u16)); q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q5u16)); q14s32 = vmlal_s16(q14s32, d10s16, d10s16); q15s32 = vmlal_s16(q15s32, d11s16, d11s16); d12s16 = vreinterpret_s16_u16(vget_low_u16(q6u16)); d13s16 = vreinterpret_s16_u16(vget_high_u16(q6u16)); q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q6u16)); q14s32 = vmlal_s16(q14s32, d12s16, d12s16); q15s32 = vmlal_s16(q15s32, d13s16, d13s16); q0u8 = q4u8; } q15s32 = vaddq_s32(q14s32, q15s32); q0s64 = vpaddlq_s32(q13s32); q1s64 = vpaddlq_s32(q15s32); d0s64 = vget_low_s64(q0s64); d1s64 = vget_high_s64(q0s64); d2s64 = vget_low_s64(q1s64); d3s64 = vget_high_s64(q1s64); d0s64 = vadd_s64(d0s64, d1s64); d1s64 = vadd_s64(d2s64, d3s64); q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64)); vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8); d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); return vget_lane_u32(d0u32, 0); }
unsigned int vp8_sub_pixel_variance16x16_neon_func( const unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const unsigned char *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { int i; DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 528); unsigned char *tmpp; unsigned char *tmpp2; uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8; uint8x8_t d19u8, d20u8, d21u8; int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; uint32x2_t d0u32, d10u32; int64x1_t d0s64, d1s64, d2s64, d3s64; uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8; uint8x16_t q10u8, q11u8, q12u8, q13u8, q14u8, q15u8; uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16; uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16; int32x4_t q8s32, q9s32, q10s32; int64x2_t q0s64, q1s64, q5s64; tmpp2 = tmp + 272; tmpp = tmp; if (xoffset == 0) { // secondpass_bfilter16x16_only d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]); d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]); q11u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; for (i = 4; i > 0; i--) { q12u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; q13u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; q14u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; q15u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; __builtin_prefetch(src_ptr); __builtin_prefetch(src_ptr + src_pixels_per_line); __builtin_prefetch(src_ptr + src_pixels_per_line * 2); q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8); q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8); q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8); q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8); q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8); q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8); q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8); q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8); q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8); q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8); q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8); q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8); q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8); q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8); q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8); q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8); d2u8 = vqrshrn_n_u16(q1u16, 7); d3u8 = vqrshrn_n_u16(q2u16, 7); d4u8 = vqrshrn_n_u16(q3u16, 7); d5u8 = vqrshrn_n_u16(q4u16, 7); d6u8 = vqrshrn_n_u16(q5u16, 7); d7u8 = vqrshrn_n_u16(q6u16, 7); d8u8 = vqrshrn_n_u16(q7u16, 7); d9u8 = vqrshrn_n_u16(q8u16, 7); q1u8 = vcombine_u8(d2u8, d3u8); q2u8 = vcombine_u8(d4u8, d5u8); q3u8 = vcombine_u8(d6u8, d7u8); q4u8 = vcombine_u8(d8u8, d9u8); q11u8 = q15u8; vst1q_u8((uint8_t *)tmpp2, q1u8); tmpp2 += 16; vst1q_u8((uint8_t *)tmpp2, q2u8); tmpp2 += 16; vst1q_u8((uint8_t *)tmpp2, q3u8); tmpp2 += 16; vst1q_u8((uint8_t *)tmpp2, q4u8); tmpp2 += 16; } } else if (yoffset == 0) { // firstpass_bfilter16x16_only d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]); d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]); for (i = 4; i > 0 ; i--) { d2u8 = vld1_u8(src_ptr); d3u8 = vld1_u8(src_ptr + 8); d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; d5u8 = vld1_u8(src_ptr); d6u8 = vld1_u8(src_ptr + 8); d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; d8u8 = vld1_u8(src_ptr); d9u8 = vld1_u8(src_ptr + 8); d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; d11u8 = vld1_u8(src_ptr); d12u8 = vld1_u8(src_ptr + 8); d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; __builtin_prefetch(src_ptr); __builtin_prefetch(src_ptr + src_pixels_per_line); __builtin_prefetch(src_ptr + src_pixels_per_line * 2); q7u16 = vmull_u8(d2u8, d0u8); q8u16 = vmull_u8(d3u8, d0u8); q9u16 = vmull_u8(d5u8, d0u8); q10u16 = vmull_u8(d6u8, d0u8); q11u16 = vmull_u8(d8u8, d0u8); q12u16 = vmull_u8(d9u8, d0u8); q13u16 = vmull_u8(d11u8, d0u8); q14u16 = vmull_u8(d12u8, d0u8); d2u8 = vext_u8(d2u8, d3u8, 1); d5u8 = vext_u8(d5u8, d6u8, 1); d8u8 = vext_u8(d8u8, d9u8, 1); d11u8 = vext_u8(d11u8, d12u8, 1); q7u16 = vmlal_u8(q7u16, d2u8, d1u8); q9u16 = vmlal_u8(q9u16, d5u8, d1u8); q11u16 = vmlal_u8(q11u16, d8u8, d1u8); q13u16 = vmlal_u8(q13u16, d11u8, d1u8); d3u8 = vext_u8(d3u8, d4u8, 1); d6u8 = vext_u8(d6u8, d7u8, 1); d9u8 = vext_u8(d9u8, d10u8, 1); d12u8 = vext_u8(d12u8, d13u8, 1); q8u16 = vmlal_u8(q8u16, d3u8, d1u8); q10u16 = vmlal_u8(q10u16, d6u8, d1u8); q12u16 = vmlal_u8(q12u16, d9u8, d1u8); q14u16 = vmlal_u8(q14u16, d12u8, d1u8); d14u8 = vqrshrn_n_u16(q7u16, 7); d15u8 = vqrshrn_n_u16(q8u16, 7); d16u8 = vqrshrn_n_u16(q9u16, 7); d17u8 = vqrshrn_n_u16(q10u16, 7); d18u8 = vqrshrn_n_u16(q11u16, 7); d19u8 = vqrshrn_n_u16(q12u16, 7); d20u8 = vqrshrn_n_u16(q13u16, 7); d21u8 = vqrshrn_n_u16(q14u16, 7); q7u8 = vcombine_u8(d14u8, d15u8); q8u8 = vcombine_u8(d16u8, d17u8); q9u8 = vcombine_u8(d18u8, d19u8); q10u8 = vcombine_u8(d20u8, d21u8); vst1q_u8((uint8_t *)tmpp2, q7u8); tmpp2 += 16; vst1q_u8((uint8_t *)tmpp2, q8u8); tmpp2 += 16; vst1q_u8((uint8_t *)tmpp2, q9u8); tmpp2 += 16; vst1q_u8((uint8_t *)tmpp2, q10u8); tmpp2 += 16; } } else { d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]); d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]); d2u8 = vld1_u8(src_ptr); d3u8 = vld1_u8(src_ptr + 8); d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; d5u8 = vld1_u8(src_ptr); d6u8 = vld1_u8(src_ptr + 8); d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; d8u8 = vld1_u8(src_ptr); d9u8 = vld1_u8(src_ptr + 8); d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; d11u8 = vld1_u8(src_ptr); d12u8 = vld1_u8(src_ptr + 8); d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; // First Pass: output_height lines x output_width columns (17x16) for (i = 3; i > 0; i--) { q7u16 = vmull_u8(d2u8, d0u8); q8u16 = vmull_u8(d3u8, d0u8); q9u16 = vmull_u8(d5u8, d0u8); q10u16 = vmull_u8(d6u8, d0u8); q11u16 = vmull_u8(d8u8, d0u8); q12u16 = vmull_u8(d9u8, d0u8); q13u16 = vmull_u8(d11u8, d0u8); q14u16 = vmull_u8(d12u8, d0u8); d2u8 = vext_u8(d2u8, d3u8, 1); d5u8 = vext_u8(d5u8, d6u8, 1); d8u8 = vext_u8(d8u8, d9u8, 1); d11u8 = vext_u8(d11u8, d12u8, 1); q7u16 = vmlal_u8(q7u16, d2u8, d1u8); q9u16 = vmlal_u8(q9u16, d5u8, d1u8); q11u16 = vmlal_u8(q11u16, d8u8, d1u8); q13u16 = vmlal_u8(q13u16, d11u8, d1u8); d3u8 = vext_u8(d3u8, d4u8, 1); d6u8 = vext_u8(d6u8, d7u8, 1); d9u8 = vext_u8(d9u8, d10u8, 1); d12u8 = vext_u8(d12u8, d13u8, 1); q8u16 = vmlal_u8(q8u16, d3u8, d1u8); q10u16 = vmlal_u8(q10u16, d6u8, d1u8); q12u16 = vmlal_u8(q12u16, d9u8, d1u8); q14u16 = vmlal_u8(q14u16, d12u8, d1u8); d14u8 = vqrshrn_n_u16(q7u16, 7); d15u8 = vqrshrn_n_u16(q8u16, 7); d16u8 = vqrshrn_n_u16(q9u16, 7); d17u8 = vqrshrn_n_u16(q10u16, 7); d18u8 = vqrshrn_n_u16(q11u16, 7); d19u8 = vqrshrn_n_u16(q12u16, 7); d20u8 = vqrshrn_n_u16(q13u16, 7); d21u8 = vqrshrn_n_u16(q14u16, 7); d2u8 = vld1_u8(src_ptr); d3u8 = vld1_u8(src_ptr + 8); d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; d5u8 = vld1_u8(src_ptr); d6u8 = vld1_u8(src_ptr + 8); d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; d8u8 = vld1_u8(src_ptr); d9u8 = vld1_u8(src_ptr + 8); d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; d11u8 = vld1_u8(src_ptr); d12u8 = vld1_u8(src_ptr + 8); d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; q7u8 = vcombine_u8(d14u8, d15u8); q8u8 = vcombine_u8(d16u8, d17u8); q9u8 = vcombine_u8(d18u8, d19u8); q10u8 = vcombine_u8(d20u8, d21u8); vst1q_u8((uint8_t *)tmpp, q7u8); tmpp += 16; vst1q_u8((uint8_t *)tmpp, q8u8); tmpp += 16; vst1q_u8((uint8_t *)tmpp, q9u8); tmpp += 16; vst1q_u8((uint8_t *)tmpp, q10u8); tmpp += 16; } // First-pass filtering for rest 5 lines d14u8 = vld1_u8(src_ptr); d15u8 = vld1_u8(src_ptr + 8); d16u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; q9u16 = vmull_u8(d2u8, d0u8); q10u16 = vmull_u8(d3u8, d0u8); q11u16 = vmull_u8(d5u8, d0u8); q12u16 = vmull_u8(d6u8, d0u8); q13u16 = vmull_u8(d8u8, d0u8); q14u16 = vmull_u8(d9u8, d0u8); d2u8 = vext_u8(d2u8, d3u8, 1); d5u8 = vext_u8(d5u8, d6u8, 1); d8u8 = vext_u8(d8u8, d9u8, 1); q9u16 = vmlal_u8(q9u16, d2u8, d1u8); q11u16 = vmlal_u8(q11u16, d5u8, d1u8); q13u16 = vmlal_u8(q13u16, d8u8, d1u8); d3u8 = vext_u8(d3u8, d4u8, 1); d6u8 = vext_u8(d6u8, d7u8, 1); d9u8 = vext_u8(d9u8, d10u8, 1); q10u16 = vmlal_u8(q10u16, d3u8, d1u8); q12u16 = vmlal_u8(q12u16, d6u8, d1u8); q14u16 = vmlal_u8(q14u16, d9u8, d1u8); q1u16 = vmull_u8(d11u8, d0u8); q2u16 = vmull_u8(d12u8, d0u8); q3u16 = vmull_u8(d14u8, d0u8); q4u16 = vmull_u8(d15u8, d0u8); d11u8 = vext_u8(d11u8, d12u8, 1); d14u8 = vext_u8(d14u8, d15u8, 1); q1u16 = vmlal_u8(q1u16, d11u8, d1u8); q3u16 = vmlal_u8(q3u16, d14u8, d1u8); d12u8 = vext_u8(d12u8, d13u8, 1); d15u8 = vext_u8(d15u8, d16u8, 1); q2u16 = vmlal_u8(q2u16, d12u8, d1u8); q4u16 = vmlal_u8(q4u16, d15u8, d1u8); d10u8 = vqrshrn_n_u16(q9u16, 7); d11u8 = vqrshrn_n_u16(q10u16, 7); d12u8 = vqrshrn_n_u16(q11u16, 7); d13u8 = vqrshrn_n_u16(q12u16, 7); d14u8 = vqrshrn_n_u16(q13u16, 7); d15u8 = vqrshrn_n_u16(q14u16, 7); d16u8 = vqrshrn_n_u16(q1u16, 7); d17u8 = vqrshrn_n_u16(q2u16, 7); d18u8 = vqrshrn_n_u16(q3u16, 7); d19u8 = vqrshrn_n_u16(q4u16, 7); q5u8 = vcombine_u8(d10u8, d11u8); q6u8 = vcombine_u8(d12u8, d13u8); q7u8 = vcombine_u8(d14u8, d15u8); q8u8 = vcombine_u8(d16u8, d17u8); q9u8 = vcombine_u8(d18u8, d19u8); vst1q_u8((uint8_t *)tmpp, q5u8); tmpp += 16; vst1q_u8((uint8_t *)tmpp, q6u8); tmpp += 16; vst1q_u8((uint8_t *)tmpp, q7u8); tmpp += 16; vst1q_u8((uint8_t *)tmpp, q8u8); tmpp += 16; vst1q_u8((uint8_t *)tmpp, q9u8); // secondpass_filter d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]); d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]); tmpp = tmp; tmpp2 = tmpp + 272; q11u8 = vld1q_u8(tmpp); tmpp += 16; for (i = 4; i > 0; i--) { q12u8 = vld1q_u8(tmpp); tmpp += 16; q13u8 = vld1q_u8(tmpp); tmpp += 16; q14u8 = vld1q_u8(tmpp); tmpp += 16; q15u8 = vld1q_u8(tmpp); tmpp += 16; q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8); q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8); q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8); q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8); q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8); q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8); q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8); q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8); q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8); q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8); q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8); q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8); q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8); q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8); q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8); q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8); d2u8 = vqrshrn_n_u16(q1u16, 7); d3u8 = vqrshrn_n_u16(q2u16, 7); d4u8 = vqrshrn_n_u16(q3u16, 7); d5u8 = vqrshrn_n_u16(q4u16, 7); d6u8 = vqrshrn_n_u16(q5u16, 7); d7u8 = vqrshrn_n_u16(q6u16, 7); d8u8 = vqrshrn_n_u16(q7u16, 7); d9u8 = vqrshrn_n_u16(q8u16, 7); q1u8 = vcombine_u8(d2u8, d3u8); q2u8 = vcombine_u8(d4u8, d5u8); q3u8 = vcombine_u8(d6u8, d7u8); q4u8 = vcombine_u8(d8u8, d9u8); q11u8 = q15u8; vst1q_u8((uint8_t *)tmpp2, q1u8); tmpp2 += 16; vst1q_u8((uint8_t *)tmpp2, q2u8); tmpp2 += 16; vst1q_u8((uint8_t *)tmpp2, q3u8); tmpp2 += 16; vst1q_u8((uint8_t *)tmpp2, q4u8); tmpp2 += 16; } } // sub_pixel_variance16x16_neon q8s32 = vdupq_n_s32(0); q9s32 = vdupq_n_s32(0); q10s32 = vdupq_n_s32(0); tmpp = tmp + 272; for (i = 0; i < 8; i++) { // sub_pixel_variance16x16_neon_loop q0u8 = vld1q_u8(tmpp); tmpp += 16; q1u8 = vld1q_u8(tmpp); tmpp += 16; q2u8 = vld1q_u8(dst_ptr); dst_ptr += dst_pixels_per_line; q3u8 = vld1q_u8(dst_ptr); dst_ptr += dst_pixels_per_line; d0u8 = vget_low_u8(q0u8); d1u8 = vget_high_u8(q0u8); d2u8 = vget_low_u8(q1u8); d3u8 = vget_high_u8(q1u8); q11u16 = vsubl_u8(d0u8, vget_low_u8(q2u8)); q12u16 = vsubl_u8(d1u8, vget_high_u8(q2u8)); q13u16 = vsubl_u8(d2u8, vget_low_u8(q3u8)); q14u16 = vsubl_u8(d3u8, vget_high_u8(q3u8)); d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); q9s32 = vmlal_s16(q9s32, d22s16, d22s16); q10s32 = vmlal_s16(q10s32, d23s16, d23s16); d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); q9s32 = vmlal_s16(q9s32, d24s16, d24s16); q10s32 = vmlal_s16(q10s32, d25s16, d25s16); d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); q9s32 = vmlal_s16(q9s32, d26s16, d26s16); q10s32 = vmlal_s16(q10s32, d27s16, d27s16); d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); q9s32 = vmlal_s16(q9s32, d28s16, d28s16); q10s32 = vmlal_s16(q10s32, d29s16, d29s16); } q10s32 = vaddq_s32(q10s32, q9s32); q0s64 = vpaddlq_s32(q8s32); q1s64 = vpaddlq_s32(q10s32); d0s64 = vget_low_s64(q0s64); d1s64 = vget_high_s64(q0s64); d2s64 = vget_low_s64(q1s64); d3s64 = vget_high_s64(q1s64); d0s64 = vadd_s64(d0s64, d1s64); d1s64 = vadd_s64(d2s64, d3s64); q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64)); vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8); d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); return vget_lane_u32(d0u32, 0); }
void vpx_subtract_block_neon(int rows, int cols, int16_t *diff, ptrdiff_t diff_stride, const uint8_t *src, ptrdiff_t src_stride, const uint8_t *pred, ptrdiff_t pred_stride) { int r, c; if (cols > 16) { for (r = 0; r < rows; ++r) { for (c = 0; c < cols; c += 32) { const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]); const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]); const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]); const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]); const uint16x8_t v_diff_lo_00 = vsubl_u8(vget_low_u8(v_src_00), vget_low_u8(v_pred_00)); const uint16x8_t v_diff_hi_00 = vsubl_u8(vget_high_u8(v_src_00), vget_high_u8(v_pred_00)); const uint16x8_t v_diff_lo_16 = vsubl_u8(vget_low_u8(v_src_16), vget_low_u8(v_pred_16)); const uint16x8_t v_diff_hi_16 = vsubl_u8(vget_high_u8(v_src_16), vget_high_u8(v_pred_16)); vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00)); vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00)); vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16)); vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16)); } diff += diff_stride; pred += pred_stride; src += src_stride; } } else if (cols > 8) { for (r = 0; r < rows; ++r) { const uint8x16_t v_src = vld1q_u8(&src[0]); const uint8x16_t v_pred = vld1q_u8(&pred[0]); const uint16x8_t v_diff_lo = vsubl_u8(vget_low_u8(v_src), vget_low_u8(v_pred)); const uint16x8_t v_diff_hi = vsubl_u8(vget_high_u8(v_src), vget_high_u8(v_pred)); vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo)); vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi)); diff += diff_stride; pred += pred_stride; src += src_stride; } } else if (cols > 4) { for (r = 0; r < rows; ++r) { const uint8x8_t v_src = vld1_u8(&src[0]); const uint8x8_t v_pred = vld1_u8(&pred[0]); const uint16x8_t v_diff = vsubl_u8(v_src, v_pred); vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff)); diff += diff_stride; pred += pred_stride; src += src_stride; } } else { for (r = 0; r < rows; ++r) { for (c = 0; c < cols; ++c) diff[c] = src[c] - pred[c]; diff += diff_stride; pred += pred_stride; src += src_stride; } } }
void vp8_sixtap_predict8x8_neon( unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch) { unsigned char *src, *tmpp; unsigned char tmp[64]; int i; uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; uint8x8_t d18u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8, d25u8; uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8, d31u8; int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8; uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16; uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16; int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16; int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16; uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q9u8, q10u8, q11u8, q12u8; if (xoffset == 0) { // secondpass_filter8x8_only // load second_pass filter dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); d0s8 = vdup_lane_s8(dtmps8, 0); d1s8 = vdup_lane_s8(dtmps8, 1); d2s8 = vdup_lane_s8(dtmps8, 2); d3s8 = vdup_lane_s8(dtmps8, 3); d4s8 = vdup_lane_s8(dtmps8, 4); d5s8 = vdup_lane_s8(dtmps8, 5); d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); // load src data src = src_ptr - src_pixels_per_line * 2; d18u8 = vld1_u8(src); src += src_pixels_per_line; d19u8 = vld1_u8(src); src += src_pixels_per_line; d20u8 = vld1_u8(src); src += src_pixels_per_line; d21u8 = vld1_u8(src); src += src_pixels_per_line; d22u8 = vld1_u8(src); src += src_pixels_per_line; d23u8 = vld1_u8(src); src += src_pixels_per_line; d24u8 = vld1_u8(src); src += src_pixels_per_line; d25u8 = vld1_u8(src); src += src_pixels_per_line; d26u8 = vld1_u8(src); src += src_pixels_per_line; d27u8 = vld1_u8(src); src += src_pixels_per_line; d28u8 = vld1_u8(src); src += src_pixels_per_line; d29u8 = vld1_u8(src); src += src_pixels_per_line; d30u8 = vld1_u8(src); for (i = 2; i > 0; i--) { q3u16 = vmull_u8(d18u8, d0u8); q4u16 = vmull_u8(d19u8, d0u8); q5u16 = vmull_u8(d20u8, d0u8); q6u16 = vmull_u8(d21u8, d0u8); q3u16 = vmlsl_u8(q3u16, d19u8, d1u8); q4u16 = vmlsl_u8(q4u16, d20u8, d1u8); q5u16 = vmlsl_u8(q5u16, d21u8, d1u8); q6u16 = vmlsl_u8(q6u16, d22u8, d1u8); q3u16 = vmlsl_u8(q3u16, d22u8, d4u8); q4u16 = vmlsl_u8(q4u16, d23u8, d4u8); q5u16 = vmlsl_u8(q5u16, d24u8, d4u8); q6u16 = vmlsl_u8(q6u16, d25u8, d4u8); q3u16 = vmlal_u8(q3u16, d20u8, d2u8); q4u16 = vmlal_u8(q4u16, d21u8, d2u8); q5u16 = vmlal_u8(q5u16, d22u8, d2u8); q6u16 = vmlal_u8(q6u16, d23u8, d2u8); q3u16 = vmlal_u8(q3u16, d23u8, d5u8); q4u16 = vmlal_u8(q4u16, d24u8, d5u8); q5u16 = vmlal_u8(q5u16, d25u8, d5u8); q6u16 = vmlal_u8(q6u16, d26u8, d5u8); q7u16 = vmull_u8(d21u8, d3u8); q8u16 = vmull_u8(d22u8, d3u8); q9u16 = vmull_u8(d23u8, d3u8); q10u16 = vmull_u8(d24u8, d3u8); q3s16 = vreinterpretq_s16_u16(q3u16); q4s16 = vreinterpretq_s16_u16(q4u16); q5s16 = vreinterpretq_s16_u16(q5u16); q6s16 = vreinterpretq_s16_u16(q6u16); q7s16 = vreinterpretq_s16_u16(q7u16); q8s16 = vreinterpretq_s16_u16(q8u16); q9s16 = vreinterpretq_s16_u16(q9u16); q10s16 = vreinterpretq_s16_u16(q10u16); q7s16 = vqaddq_s16(q7s16, q3s16); q8s16 = vqaddq_s16(q8s16, q4s16); q9s16 = vqaddq_s16(q9s16, q5s16); q10s16 = vqaddq_s16(q10s16, q6s16); d6u8 = vqrshrun_n_s16(q7s16, 7); d7u8 = vqrshrun_n_s16(q8s16, 7); d8u8 = vqrshrun_n_s16(q9s16, 7); d9u8 = vqrshrun_n_s16(q10s16, 7); d18u8 = d22u8; d19u8 = d23u8; d20u8 = d24u8; d21u8 = d25u8; d22u8 = d26u8; d23u8 = d27u8; d24u8 = d28u8; d25u8 = d29u8; d26u8 = d30u8; vst1_u8(dst_ptr, d6u8); dst_ptr += dst_pitch; vst1_u8(dst_ptr, d7u8); dst_ptr += dst_pitch; vst1_u8(dst_ptr, d8u8); dst_ptr += dst_pitch; vst1_u8(dst_ptr, d9u8); dst_ptr += dst_pitch; } return; } // load first_pass filter dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]); d0s8 = vdup_lane_s8(dtmps8, 0); d1s8 = vdup_lane_s8(dtmps8, 1); d2s8 = vdup_lane_s8(dtmps8, 2); d3s8 = vdup_lane_s8(dtmps8, 3); d4s8 = vdup_lane_s8(dtmps8, 4); d5s8 = vdup_lane_s8(dtmps8, 5); d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); // First pass: output_height lines x output_width columns (9x4) if (yoffset == 0) // firstpass_filter4x4_only src = src_ptr - 2; else src = src_ptr - 2 - (src_pixels_per_line * 2); tmpp = tmp; for (i = 2; i > 0; i--) { q3u8 = vld1q_u8(src); src += src_pixels_per_line; q4u8 = vld1q_u8(src); src += src_pixels_per_line; q5u8 = vld1q_u8(src); src += src_pixels_per_line; q6u8 = vld1q_u8(src); src += src_pixels_per_line; __builtin_prefetch(src); __builtin_prefetch(src + src_pixels_per_line); __builtin_prefetch(src + src_pixels_per_line * 2); q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8); q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8); q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8); q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8); d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1); q7u16 = vmlsl_u8(q7u16, d28u8, d1u8); q8u16 = vmlsl_u8(q8u16, d29u8, d1u8); q9u16 = vmlsl_u8(q9u16, d30u8, d1u8); q10u16 = vmlsl_u8(q10u16, d31u8, d1u8); d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4); d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4); d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4); d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4); q7u16 = vmlsl_u8(q7u16, d28u8, d4u8); q8u16 = vmlsl_u8(q8u16, d29u8, d4u8); q9u16 = vmlsl_u8(q9u16, d30u8, d4u8); q10u16 = vmlsl_u8(q10u16, d31u8, d4u8); d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2); d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2); d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2); d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2); q7u16 = vmlal_u8(q7u16, d28u8, d2u8); q8u16 = vmlal_u8(q8u16, d29u8, d2u8); q9u16 = vmlal_u8(q9u16, d30u8, d2u8); q10u16 = vmlal_u8(q10u16, d31u8, d2u8); d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); q7u16 = vmlal_u8(q7u16, d28u8, d5u8); q8u16 = vmlal_u8(q8u16, d29u8, d5u8); q9u16 = vmlal_u8(q9u16, d30u8, d5u8); q10u16 = vmlal_u8(q10u16, d31u8, d5u8); d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3); d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3); d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3); d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3); q3u16 = vmull_u8(d28u8, d3u8); q4u16 = vmull_u8(d29u8, d3u8); q5u16 = vmull_u8(d30u8, d3u8); q6u16 = vmull_u8(d31u8, d3u8); q3s16 = vreinterpretq_s16_u16(q3u16); q4s16 = vreinterpretq_s16_u16(q4u16); q5s16 = vreinterpretq_s16_u16(q5u16); q6s16 = vreinterpretq_s16_u16(q6u16); q7s16 = vreinterpretq_s16_u16(q7u16); q8s16 = vreinterpretq_s16_u16(q8u16); q9s16 = vreinterpretq_s16_u16(q9u16); q10s16 = vreinterpretq_s16_u16(q10u16); q7s16 = vqaddq_s16(q7s16, q3s16); q8s16 = vqaddq_s16(q8s16, q4s16); q9s16 = vqaddq_s16(q9s16, q5s16); q10s16 = vqaddq_s16(q10s16, q6s16); d22u8 = vqrshrun_n_s16(q7s16, 7); d23u8 = vqrshrun_n_s16(q8s16, 7); d24u8 = vqrshrun_n_s16(q9s16, 7); d25u8 = vqrshrun_n_s16(q10s16, 7); if (yoffset == 0) { // firstpass_filter8x4_only vst1_u8(dst_ptr, d22u8); dst_ptr += dst_pitch; vst1_u8(dst_ptr, d23u8); dst_ptr += dst_pitch; vst1_u8(dst_ptr, d24u8); dst_ptr += dst_pitch; vst1_u8(dst_ptr, d25u8); dst_ptr += dst_pitch; } else { vst1_u8(tmpp, d22u8); tmpp += 8; vst1_u8(tmpp, d23u8); tmpp += 8; vst1_u8(tmpp, d24u8); tmpp += 8; vst1_u8(tmpp, d25u8); tmpp += 8; } } if (yoffset == 0) return; // First Pass on rest 5-line data q3u8 = vld1q_u8(src); src += src_pixels_per_line; q4u8 = vld1q_u8(src); src += src_pixels_per_line; q5u8 = vld1q_u8(src); src += src_pixels_per_line; q6u8 = vld1q_u8(src); src += src_pixels_per_line; q7u8 = vld1q_u8(src); q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8); q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8); q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8); d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1); d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1); q8u16 = vmlsl_u8(q8u16, d27u8, d1u8); q9u16 = vmlsl_u8(q9u16, d28u8, d1u8); q10u16 = vmlsl_u8(q10u16, d29u8, d1u8); q11u16 = vmlsl_u8(q11u16, d30u8, d1u8); q12u16 = vmlsl_u8(q12u16, d31u8, d1u8); d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4); d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4); d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4); d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4); d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4); q8u16 = vmlsl_u8(q8u16, d27u8, d4u8); q9u16 = vmlsl_u8(q9u16, d28u8, d4u8); q10u16 = vmlsl_u8(q10u16, d29u8, d4u8); q11u16 = vmlsl_u8(q11u16, d30u8, d4u8); q12u16 = vmlsl_u8(q12u16, d31u8, d4u8); d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2); d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2); d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2); d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2); d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2); q8u16 = vmlal_u8(q8u16, d27u8, d2u8); q9u16 = vmlal_u8(q9u16, d28u8, d2u8); q10u16 = vmlal_u8(q10u16, d29u8, d2u8); q11u16 = vmlal_u8(q11u16, d30u8, d2u8); q12u16 = vmlal_u8(q12u16, d31u8, d2u8); d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5); q8u16 = vmlal_u8(q8u16, d27u8, d5u8); q9u16 = vmlal_u8(q9u16, d28u8, d5u8); q10u16 = vmlal_u8(q10u16, d29u8, d5u8); q11u16 = vmlal_u8(q11u16, d30u8, d5u8); q12u16 = vmlal_u8(q12u16, d31u8, d5u8); d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3); d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3); d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3); d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3); d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3); q3u16 = vmull_u8(d27u8, d3u8); q4u16 = vmull_u8(d28u8, d3u8); q5u16 = vmull_u8(d29u8, d3u8); q6u16 = vmull_u8(d30u8, d3u8); q7u16 = vmull_u8(d31u8, d3u8); q3s16 = vreinterpretq_s16_u16(q3u16); q4s16 = vreinterpretq_s16_u16(q4u16); q5s16 = vreinterpretq_s16_u16(q5u16); q6s16 = vreinterpretq_s16_u16(q6u16); q7s16 = vreinterpretq_s16_u16(q7u16); q8s16 = vreinterpretq_s16_u16(q8u16); q9s16 = vreinterpretq_s16_u16(q9u16); q10s16 = vreinterpretq_s16_u16(q10u16); q11s16 = vreinterpretq_s16_u16(q11u16); q12s16 = vreinterpretq_s16_u16(q12u16); q8s16 = vqaddq_s16(q8s16, q3s16); q9s16 = vqaddq_s16(q9s16, q4s16); q10s16 = vqaddq_s16(q10s16, q5s16); q11s16 = vqaddq_s16(q11s16, q6s16); q12s16 = vqaddq_s16(q12s16, q7s16); d26u8 = vqrshrun_n_s16(q8s16, 7); d27u8 = vqrshrun_n_s16(q9s16, 7); d28u8 = vqrshrun_n_s16(q10s16, 7); d29u8 = vqrshrun_n_s16(q11s16, 7); d30u8 = vqrshrun_n_s16(q12s16, 7); // Second pass: 8x8 dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); d0s8 = vdup_lane_s8(dtmps8, 0); d1s8 = vdup_lane_s8(dtmps8, 1); d2s8 = vdup_lane_s8(dtmps8, 2); d3s8 = vdup_lane_s8(dtmps8, 3); d4s8 = vdup_lane_s8(dtmps8, 4); d5s8 = vdup_lane_s8(dtmps8, 5); d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); tmpp = tmp; q9u8 = vld1q_u8(tmpp); tmpp += 16; q10u8 = vld1q_u8(tmpp); tmpp += 16; q11u8 = vld1q_u8(tmpp); tmpp += 16; q12u8 = vld1q_u8(tmpp); d18u8 = vget_low_u8(q9u8); d19u8 = vget_high_u8(q9u8); d20u8 = vget_low_u8(q10u8); d21u8 = vget_high_u8(q10u8); d22u8 = vget_low_u8(q11u8); d23u8 = vget_high_u8(q11u8); d24u8 = vget_low_u8(q12u8); d25u8 = vget_high_u8(q12u8); for (i = 2; i > 0; i--) { q3u16 = vmull_u8(d18u8, d0u8); q4u16 = vmull_u8(d19u8, d0u8); q5u16 = vmull_u8(d20u8, d0u8); q6u16 = vmull_u8(d21u8, d0u8); q3u16 = vmlsl_u8(q3u16, d19u8, d1u8); q4u16 = vmlsl_u8(q4u16, d20u8, d1u8); q5u16 = vmlsl_u8(q5u16, d21u8, d1u8); q6u16 = vmlsl_u8(q6u16, d22u8, d1u8); q3u16 = vmlsl_u8(q3u16, d22u8, d4u8); q4u16 = vmlsl_u8(q4u16, d23u8, d4u8); q5u16 = vmlsl_u8(q5u16, d24u8, d4u8); q6u16 = vmlsl_u8(q6u16, d25u8, d4u8); q3u16 = vmlal_u8(q3u16, d20u8, d2u8); q4u16 = vmlal_u8(q4u16, d21u8, d2u8); q5u16 = vmlal_u8(q5u16, d22u8, d2u8); q6u16 = vmlal_u8(q6u16, d23u8, d2u8); q3u16 = vmlal_u8(q3u16, d23u8, d5u8); q4u16 = vmlal_u8(q4u16, d24u8, d5u8); q5u16 = vmlal_u8(q5u16, d25u8, d5u8); q6u16 = vmlal_u8(q6u16, d26u8, d5u8); q7u16 = vmull_u8(d21u8, d3u8); q8u16 = vmull_u8(d22u8, d3u8); q9u16 = vmull_u8(d23u8, d3u8); q10u16 = vmull_u8(d24u8, d3u8); q3s16 = vreinterpretq_s16_u16(q3u16); q4s16 = vreinterpretq_s16_u16(q4u16); q5s16 = vreinterpretq_s16_u16(q5u16); q6s16 = vreinterpretq_s16_u16(q6u16); q7s16 = vreinterpretq_s16_u16(q7u16); q8s16 = vreinterpretq_s16_u16(q8u16); q9s16 = vreinterpretq_s16_u16(q9u16); q10s16 = vreinterpretq_s16_u16(q10u16); q7s16 = vqaddq_s16(q7s16, q3s16); q8s16 = vqaddq_s16(q8s16, q4s16); q9s16 = vqaddq_s16(q9s16, q5s16); q10s16 = vqaddq_s16(q10s16, q6s16); d6u8 = vqrshrun_n_s16(q7s16, 7); d7u8 = vqrshrun_n_s16(q8s16, 7); d8u8 = vqrshrun_n_s16(q9s16, 7); d9u8 = vqrshrun_n_s16(q10s16, 7); d18u8 = d22u8; d19u8 = d23u8; d20u8 = d24u8; d21u8 = d25u8; d22u8 = d26u8; d23u8 = d27u8; d24u8 = d28u8; d25u8 = d29u8; d26u8 = d30u8; vst1_u8(dst_ptr, d6u8); dst_ptr += dst_pitch; vst1_u8(dst_ptr, d7u8); dst_ptr += dst_pitch; vst1_u8(dst_ptr, d8u8); dst_ptr += dst_pitch; vst1_u8(dst_ptr, d9u8); dst_ptr += dst_pitch; } return; }
uint8x8_t test_vget_low_u8(uint8x16_t a) { // CHECK-LABEL: test_vget_low_u8: return vget_low_u8(a); // CHECK-NEXT: ret }
unsigned int vpx_mse16x16_neon( const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int recon_stride, unsigned int *sse) { int i; int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; int64x1_t d0s64; uint8x16_t q0u8, q1u8, q2u8, q3u8; int32x4_t q7s32, q8s32, q9s32, q10s32; uint16x8_t q11u16, q12u16, q13u16, q14u16; int64x2_t q1s64; q7s32 = vdupq_n_s32(0); q8s32 = vdupq_n_s32(0); q9s32 = vdupq_n_s32(0); q10s32 = vdupq_n_s32(0); for (i = 0; i < 8; i++) { // mse16x16_neon_loop q0u8 = vld1q_u8(src_ptr); src_ptr += source_stride; q1u8 = vld1q_u8(src_ptr); src_ptr += source_stride; q2u8 = vld1q_u8(ref_ptr); ref_ptr += recon_stride; q3u8 = vld1q_u8(ref_ptr); ref_ptr += recon_stride; q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); q7s32 = vmlal_s16(q7s32, d22s16, d22s16); q8s32 = vmlal_s16(q8s32, d23s16, d23s16); d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); q9s32 = vmlal_s16(q9s32, d24s16, d24s16); q10s32 = vmlal_s16(q10s32, d25s16, d25s16); d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); q7s32 = vmlal_s16(q7s32, d26s16, d26s16); q8s32 = vmlal_s16(q8s32, d27s16, d27s16); d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); q9s32 = vmlal_s16(q9s32, d28s16, d28s16); q10s32 = vmlal_s16(q10s32, d29s16, d29s16); } q7s32 = vaddq_s32(q7s32, q8s32); q9s32 = vaddq_s32(q9s32, q10s32); q10s32 = vaddq_s32(q7s32, q9s32); q1s64 = vpaddlq_s32(q10s32); d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0); return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); }
f64 dotProduct(const Size2D &_size, const u8 * src0Base, ptrdiff_t src0Stride, const u8 * src1Base, ptrdiff_t src1Stride) { internal::assertSupportedConfiguration(); #ifdef CAROTENE_NEON Size2D size(_size); if (src0Stride == src1Stride && src0Stride == (ptrdiff_t)(size.width)) { size.width *= size.height; size.height = 1; } // It is possible to accumulate up to 66051 uchar multiplication results in uint32 without overflow // We process 16 elements and accumulate two new elements per step. So we could handle 66051/2*16 elements #define DOT_UINT_BLOCKSIZE 66050*8 f64 result = 0.0; for (size_t row = 0; row < size.height; ++row) { const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, row); const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, row); size_t i = 0; uint64x2_t ws = vmovq_n_u64(0); while(i + 16 <= size.width) { size_t lim = std::min(i + DOT_UINT_BLOCKSIZE, size.width) - 16; uint32x4_t s1 = vmovq_n_u32(0); uint32x4_t s2 = vmovq_n_u32(0); for (; i <= lim; i += 16) { internal::prefetch(src0 + i); internal::prefetch(src1 + i); uint8x16_t vs1 = vld1q_u8(src0 + i); uint8x16_t vs2 = vld1q_u8(src1 + i); uint16x8_t vdot1 = vmull_u8(vget_low_u8(vs1), vget_low_u8(vs2)); uint16x8_t vdot2 = vmull_u8(vget_high_u8(vs1), vget_high_u8(vs2)); s1 = vpadalq_u16(s1, vdot1); s2 = vpadalq_u16(s2, vdot2); } ws = vpadalq_u32(ws, s1); ws = vpadalq_u32(ws, s2); } if(i + 8 <= size.width) { uint8x8_t vs1 = vld1_u8(src0 + i); uint8x8_t vs2 = vld1_u8(src1 + i); ws = vpadalq_u32(ws, vpaddlq_u16(vmull_u8(vs1, vs2))); i += 8; } result += (double)vget_lane_u64(vadd_u64(vget_low_u64(ws), vget_high_u64(ws)), 0); for (; i < size.width; ++i) result += s32(src0[i]) * s32(src1[i]); } return result; #else (void)_size; (void)src0Base; (void)src0Stride; (void)src1Base; (void)src1Stride; return 0; #endif }
// CHECK-LABEL: define <8 x i8> @test_vget_low_u8(<16 x i8> %a) #0 { // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> // CHECK: ret <8 x i8> [[SHUFFLE_I]] uint8x8_t test_vget_low_u8(uint8x16_t a) { return vget_low_u8(a); }
void vp8_mbloop_filter_horizontal_edge_uv_neon( unsigned char *u, int pitch, unsigned char blimit, unsigned char limit, unsigned char thresh, unsigned char *v) { uint8x16_t qblimit, qlimit, qthresh, q3, q4; uint8x16_t q5, q6, q7, q8, q9, q10; uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; uint8x8_t d15, d16, d17, d18, d19, d20, d21; qblimit = vdupq_n_u8(blimit); qlimit = vdupq_n_u8(limit); qthresh = vdupq_n_u8(thresh); u -= (pitch << 2); v -= (pitch << 2); d6 = vld1_u8(u); u += pitch; d7 = vld1_u8(v); v += pitch; d8 = vld1_u8(u); u += pitch; d9 = vld1_u8(v); v += pitch; d10 = vld1_u8(u); u += pitch; d11 = vld1_u8(v); v += pitch; d12 = vld1_u8(u); u += pitch; d13 = vld1_u8(v); v += pitch; d14 = vld1_u8(u); u += pitch; d15 = vld1_u8(v); v += pitch; d16 = vld1_u8(u); u += pitch; d17 = vld1_u8(v); v += pitch; d18 = vld1_u8(u); u += pitch; d19 = vld1_u8(v); v += pitch; d20 = vld1_u8(u); d21 = vld1_u8(v); q3 = vcombine_u8(d6, d7); q4 = vcombine_u8(d8, d9); q5 = vcombine_u8(d10, d11); q6 = vcombine_u8(d12, d13); q7 = vcombine_u8(d14, d15); q8 = vcombine_u8(d16, d17); q9 = vcombine_u8(d18, d19); q10 = vcombine_u8(d20, d21); vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9, q10, &q4, &q5, &q6, &q7, &q8, &q9); u -= (pitch * 6); v -= (pitch * 6); vst1_u8(u, vget_low_u8(q4)); u += pitch; vst1_u8(v, vget_high_u8(q4)); v += pitch; vst1_u8(u, vget_low_u8(q5)); u += pitch; vst1_u8(v, vget_high_u8(q5)); v += pitch; vst1_u8(u, vget_low_u8(q6)); u += pitch; vst1_u8(v, vget_high_u8(q6)); v += pitch; vst1_u8(u, vget_low_u8(q7)); u += pitch; vst1_u8(v, vget_high_u8(q7)); v += pitch; vst1_u8(u, vget_low_u8(q8)); u += pitch; vst1_u8(v, vget_high_u8(q8)); v += pitch; vst1_u8(u, vget_low_u8(q9)); vst1_u8(v, vget_high_u8(q9)); return; }
unsigned int vp8_variance16x8_neon( const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int recon_stride, unsigned int *sse) { int i; int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; uint32x2_t d0u32, d10u32; int64x1_t d0s64, d1s64; uint8x16_t q0u8, q1u8, q2u8, q3u8; uint16x8_t q11u16, q12u16, q13u16, q14u16; int32x4_t q8s32, q9s32, q10s32; int64x2_t q0s64, q1s64, q5s64; q8s32 = vdupq_n_s32(0); q9s32 = vdupq_n_s32(0); q10s32 = vdupq_n_s32(0); for (i = 0; i < 4; i++) { // variance16x8_neon_loop q0u8 = vld1q_u8(src_ptr); src_ptr += source_stride; q1u8 = vld1q_u8(src_ptr); src_ptr += source_stride; __builtin_prefetch(src_ptr); q2u8 = vld1q_u8(ref_ptr); ref_ptr += recon_stride; q3u8 = vld1q_u8(ref_ptr); ref_ptr += recon_stride; __builtin_prefetch(ref_ptr); q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); q9s32 = vmlal_s16(q9s32, d22s16, d22s16); q10s32 = vmlal_s16(q10s32, d23s16, d23s16); d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); q9s32 = vmlal_s16(q9s32, d24s16, d24s16); q10s32 = vmlal_s16(q10s32, d25s16, d25s16); d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); q9s32 = vmlal_s16(q9s32, d26s16, d26s16); q10s32 = vmlal_s16(q10s32, d27s16, d27s16); d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); q9s32 = vmlal_s16(q9s32, d28s16, d28s16); q10s32 = vmlal_s16(q10s32, d29s16, d29s16); } q10s32 = vaddq_s32(q10s32, q9s32); q0s64 = vpaddlq_s32(q8s32); q1s64 = vpaddlq_s32(q10s32); d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64)); vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); return vget_lane_u32(d0u32, 0); }