/*
 * Population count of `size` bytes at `data` using the Harley-Seal
 * carry-save-adder scheme.  Each iteration folds 16 vectors (256 bytes)
 * through CSA() so only the `sixteens` accumulator needs a full popcount
 * (popcnt_neon_qreg) per block; the leftover ones/twos/fours/eights
 * accumulators are weighted in after the loop (x1/x2/x4/x8, with the
 * sixteens total shifted left by 4), and the trailing size % 256 bytes
 * are counted with the scalar lookup8bit table.
 */
uint64_t popcnt_neon_harley_seal(const uint8_t* data, const size_t size) {
    uint32x2_t total = vdup_n_u32(0);
    uint8x16_t ones, twos, fours, eights, sixteens;
    uint8x16_t twosA, twosB, foursA, foursB, eightsA, eightsB;
    uint64_t limit = size - size % (16*16);
    uint64_t i = 0;

    ones = twos = fours = eights = sixteens = vdupq_n_u8(0);

    /* Fix: the data is only ever read, so walk it through a const
     * pointer instead of casting away const (the original const_cast
     * was unnecessary and C++-only). */
    const uint8_t* ptr = data;

    for(; i < limit; i += 16*16) {
        CSA(twosA, ones, ones, vld1q_u8(ptr + 16*0), vld1q_u8(ptr + 16*1));
        CSA(twosB, ones, ones, vld1q_u8(ptr + 16*2), vld1q_u8(ptr + 16*3));
        CSA(foursA, twos, twos, twosA, twosB);
        CSA(twosA, ones, ones, vld1q_u8(ptr + 16*4), vld1q_u8(ptr + 16*5));
        CSA(twosB, ones, ones, vld1q_u8(ptr + 16*6), vld1q_u8(ptr + 16*7));
        CSA(foursB, twos, twos, twosA, twosB);
        CSA(eightsA, fours, fours, foursA, foursB);
        CSA(twosA, ones, ones, vld1q_u8(ptr + 16*8), vld1q_u8(ptr + 16*9));
        CSA(twosB, ones, ones, vld1q_u8(ptr + 16*10), vld1q_u8(ptr + 16*11));
        CSA(foursA, twos, twos, twosA, twosB);
        CSA(twosA, ones, ones, vld1q_u8(ptr + 16*12), vld1q_u8(ptr + 16*13));
        CSA(twosB, ones, ones, vld1q_u8(ptr + 16*14), vld1q_u8(ptr + 16*15));
        CSA(foursB, twos, twos, twosA, twosB);
        CSA(eightsB, fours, fours, foursA, foursB);
        CSA(sixteens, eights, eights, eightsA, eightsB);

        total = vadd_u32(total, popcnt_neon_qreg(sixteens));
        ptr += 16*16;
    }

    /* Weight the partial CSA accumulators: bits counted in `sixteens`
     * represent 16 original bits each, `eights` 8, and so on. */
    total = vshl_n_u32(total, 4);
    total = vadd_u32(total, vshl_n_u32(popcnt_neon_qreg(eights), 3));
    total = vadd_u32(total, vshl_n_u32(popcnt_neon_qreg(fours), 2));
    total = vadd_u32(total, vshl_n_u32(popcnt_neon_qreg(twos), 1));
    total = vadd_u32(total, popcnt_neon_qreg(ones));

    uint32_t scalar = 0;
    scalar += vget_lane_u32(total, 0);
    scalar += vget_lane_u32(total, 1);

    /* Scalar tail for the last size % 256 bytes. */
    for(; i < size; i++) {
        scalar += lookup8bit[*ptr++];
    }

    return scalar;
}
/* Intrinsic compile/codegen test for vget_lane_u32 with lane index 1.
 * The argument vector is deliberately left uninitialized and the result
 * is unused: only successful compilation (and instruction selection) of
 * the intrinsic matters, not runtime values. */
void test_vget_laneu32 (void) { uint32_t out_uint32_t; uint32x2_t arg0_uint32x2_t; out_uint32_t = vget_lane_u32 (arg0_uint32x2_t, 1); }
// Per-channel average of two packed 32-bit pixels: each byte of the
// result is (byte_a + byte_b) >> 1 (truncating halving add).
static WEBP_INLINE uint32_t Average2(const uint32_t* const a,
                                     const uint32_t* const b) {
  // Place each 32-bit pixel in the low half of a 64-bit D register.
  const uint8x8_t pa = vreinterpret_u8_u64(vcreate_u64(*a));
  const uint8x8_t pb = vreinterpret_u8_u64(vcreate_u64(*b));
  const uint8x8_t mean = vhadd_u8(pa, pb);
  return vget_lane_u32(vreinterpret_u32_u8(mean), 0);
}
// Horizontal sum of all eight 16-bit lanes of v_16x8.
static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {
  // Widen progressively (8x16 -> 4x32 -> 2x64), then fold the two
  // 64-bit halves together viewed as 32-bit lanes.
  const uint32x4_t sum32 = vpaddlq_u16(v_16x8);
  const uint64x2_t sum64 = vpaddlq_u32(sum32);
  const uint32x2_t lo = vreinterpret_u32_u64(vget_low_u64(sum64));
  const uint32x2_t hi = vreinterpret_u32_u64(vget_high_u64(sum64));
  return vget_lane_u32(vadd_u32(lo, hi), 0);
}
// Folds the 16 bytes of `a`, masked by the file-level compaction_mask,
// into a single 32-bit value via three rounds of pairwise byte adds.
inline unsigned int GetByteMask(uint8x16_t a) {
  const uint8x16_t masked = vandq_u8(a, compaction_mask);
  // High half is paired before the low half, preserving the original
  // byte ordering of the folded result.
  uint8x8_t folded = vpadd_u8(vget_high_u8(masked), vget_low_u8(masked));
  folded = vpadd_u8(folded, folded);
  folded = vpadd_u8(folded, folded);
  return vget_lane_u32(vreinterpret_u32_u8(folded), 0);
}
// Writes the horizontal sum of the four 32-bit lanes of `v` to *ptr.
static inline void AddLanes(uint32_t* ptr, uint32x4_t v) {
#if defined(WEBRTC_ARCH_ARM64)
  // AArch64 has a single across-vector add.
  *(ptr) = vaddvq_u32(v);
#else
  // ARMv7: add the halves, then one pairwise add collapses to lane 0.
  uint32x2_t halves = vadd_u32(vget_low_u32(v), vget_high_u32(v));
  halves = vpadd_u32(halves, halves);
  *(ptr) = vget_lane_u32(halves, 0);
#endif
}
// Per-channel blend of three packed pixels: each byte of the result is
// (((a + c) >> 1) + b) >> 1, computed with two truncating halving adds.
static WEBP_INLINE uint32_t Average3(const uint32_t* const a,
                                     const uint32_t* const b,
                                     const uint32_t* const c) {
  const uint8x8_t pa = vreinterpret_u8_u64(vcreate_u64(*a));
  const uint8x8_t pb = vreinterpret_u8_u64(vcreate_u64(*b));
  const uint8x8_t pc = vreinterpret_u8_u64(vcreate_u64(*c));
  // First halve a+c, then average that with b.
  const uint8x8_t ac = vhadd_u8(pa, pc);
  const uint8x8_t result = vhadd_u8(ac, pb);
  return vget_lane_u32(vreinterpret_u32_u8(result), 0);
}
// Per-channel clamp(c0 + c1 - c2) for packed 32-bit pixels: widen to
// 16 bits for the add, saturating-subtract, then saturating-narrow so
// every byte stays in [0, 255].
static WEBP_INLINE uint32_t ClampedAddSubtractFull(const uint32_t* const c0,
                                                   const uint32_t* const c1,
                                                   const uint32_t* const c2) {
  const uint8x8_t v0 = vreinterpret_u8_u64(vcreate_u64(*c0));
  const uint8x8_t v1 = vreinterpret_u8_u64(vcreate_u64(*c1));
  const uint8x8_t v2 = vreinterpret_u8_u64(vcreate_u64(*c2));
  const uint16x8_t widened_sum = vaddl_u8(v0, v1);           // c0 + c1, 16-bit
  const uint16x8_t diff = vqsubq_u16(widened_sum, vmovl_u8(v2));  // - c2, clamped at 0
  const uint8x8_t narrowed = vqmovn_u16(diff);               // clamp at 255
  return vget_lane_u32(vreinterpret_u32_u8(narrowed), 0);
}
/*
 * Sum of squared differences (SSE) of a 4x4 block between src and ref.
 * Each row is loaded as 8 bytes, but only the low four 16-bit lanes of
 * the widened difference are squared (vget_low_s16), so exactly 4 pixels
 * per row contribute.  vsubl_u8 wraps modulo 2^16 when src < ref; the
 * reinterpretation to signed before vmull_s16 makes the squares correct.
 */
unsigned int vpx_get4x4sse_cs_neon( const unsigned char *src_ptr, int source_stride,
                                    const unsigned char *ref_ptr, int recon_stride) {
    int16x4_t d22s16, d24s16, d26s16, d28s16;
    int64x1_t d0s64;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
    int32x4_t q7s32, q8s32, q9s32, q10s32;
    uint16x8_t q11u16, q12u16, q13u16, q14u16;
    int64x2_t q1s64;

    /* Load the four rows of src (d0..d3) and ref (d4..d7). */
    d0u8 = vld1_u8(src_ptr);
    src_ptr += source_stride;
    d4u8 = vld1_u8(ref_ptr);
    ref_ptr += recon_stride;
    d1u8 = vld1_u8(src_ptr);
    src_ptr += source_stride;
    d5u8 = vld1_u8(ref_ptr);
    ref_ptr += recon_stride;
    d2u8 = vld1_u8(src_ptr);
    src_ptr += source_stride;
    d6u8 = vld1_u8(ref_ptr);
    ref_ptr += recon_stride;
    d3u8 = vld1_u8(src_ptr);
    src_ptr += source_stride;
    d7u8 = vld1_u8(ref_ptr);
    ref_ptr += recon_stride;

    /* Widened byte differences (wrap-around encodes negatives). */
    q11u16 = vsubl_u8(d0u8, d4u8);
    q12u16 = vsubl_u8(d1u8, d5u8);
    q13u16 = vsubl_u8(d2u8, d6u8);
    q14u16 = vsubl_u8(d3u8, d7u8);

    /* Keep only the first 4 lanes of each row, as signed values. */
    d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
    d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
    d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
    d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));

    /* Square per row, then reduce to a single scalar. */
    q7s32 = vmull_s16(d22s16, d22s16);
    q8s32 = vmull_s16(d24s16, d24s16);
    q9s32 = vmull_s16(d26s16, d26s16);
    q10s32 = vmull_s16(d28s16, d28s16);

    q7s32 = vaddq_s32(q7s32, q8s32);
    q9s32 = vaddq_s32(q9s32, q10s32);
    q9s32 = vaddq_s32(q7s32, q9s32);

    q1s64 = vpaddlq_s32(q9s32);
    d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    /* Low 32 bits of the 64-bit total. */
    return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
}
// Per-channel clamp(avg + (avg - c2) / 2) where avg = (c0 + c1) >> 1,
// done entirely with saturating byte ops so no channel under/overflows.
static WEBP_INLINE uint32_t ClampedAddSubtractHalf(const uint32_t* const c0,
                                                   const uint32_t* const c1,
                                                   const uint32_t* const c2) {
  const uint8x8_t v0 = vreinterpret_u8_u64(vcreate_u64(*c0));
  const uint8x8_t v1 = vreinterpret_u8_u64(vcreate_u64(*c1));
  const uint8x8_t v2 = vreinterpret_u8_u64(vcreate_u64(*c2));
  const uint8x8_t avg = vhadd_u8(v0, v1);                    // (c0 + c1) >> 1
  // One of these two halves is always zero thanks to saturation.
  const uint8x8_t pos_half = vshr_n_u8(vqsub_u8(avg, v2), 1);  // max(avg-c2,0)>>1
  const uint8x8_t neg_half = vshr_n_u8(vqsub_u8(v2, avg), 1);  // max(c2-avg,0)>>1
  const uint8x8_t result = vqsub_u8(vqadd_u8(avg, pos_half), neg_half);
  return vget_lane_u32(vreinterpret_u32_u8(result), 0);
}
// Per-channel average of four packed pixels via a tree of truncating
// halving adds: ((a+b)>>1 and (c+d)>>1 first, then those two halved).
static WEBP_INLINE uint32_t Average4(const uint32_t* const a,
                                     const uint32_t* const b,
                                     const uint32_t* const c,
                                     const uint32_t* const d) {
  const uint8x8_t pa = vreinterpret_u8_u64(vcreate_u64(*a));
  const uint8x8_t pb = vreinterpret_u8_u64(vcreate_u64(*b));
  const uint8x8_t pc = vreinterpret_u8_u64(vcreate_u64(*c));
  const uint8x8_t pd = vreinterpret_u8_u64(vcreate_u64(*d));
  const uint8x8_t ab = vhadd_u8(pa, pb);
  const uint8x8_t cd = vhadd_u8(pc, pd);
  const uint8x8_t result = vhadd_u8(ab, cd);
  return vget_lane_u32(vreinterpret_u32_u8(result), 0);
}
// True iff every 32-bit lane of `a` is all-ones (0xFFFFFFFF).
KFR_SINTRIN bool bittestall(const u32neon& a)
{
    // AND the two halves together, then pairwise-min; lane 0 of the
    // result is 0xFFFFFFFF only if all four original lanes were.
    const uint32x2_t folded = vand_u32(vget_low_u32(*a), vget_high_u32(*a));
    return vget_lane_u32(vpmin_u32(folded, folded), 0) == 0xFFFFFFFFu;
}
/*
 * Counts the set bits in `len` bytes starting at `s` using vcntq_u8.
 * The pointer is aligned down to a vector boundary; bytes outside
 * [s, s+len) are zeroed via neon_simple_alignq() shifts, with separate
 * big-/little-endian paths.  The inner loop caps itself at r <= 15
 * double-vector passes so the 8-bit per-lane counters in v_sumb cannot
 * overflow (15 * 2 * 8 bits max per lane) before being widened into
 * v_sum.  NOTE(review): assumes SOVUCQ == sizeof(uint8x16_t) as defined
 * elsewhere; the final count is read from a single 32-bit lane, so it
 * presumably assumes totals below 2^32 — verify for very large buffers.
 */
size_t mempopcnt(const void *s, size_t len)
{
	uint8x16_t v_0;
	uint8x16_t c;
	uint32x4_t v_sum;
	uint32x2_t v_tsum;
	unsigned char *p;
	size_t r;
	unsigned shift;

	prefetch(s);

	// TODO: do this in 64 bit? the mem model seems more that way...
	v_0   = (uint8x16_t){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
	v_sum = (uint32x4_t)v_0;

	/* Align down and mask off the leading bytes before s. */
	p = (unsigned char *)ALIGN_DOWN(s, SOVUCQ);
	shift = ALIGN_DOWN_DIFF(s, SOVUCQ);
	c = *(const uint8x16_t *)p;
	if(HOST_IS_BIGENDIAN)
		c = neon_simple_alignq(v_0, c, SOVUCQ - shift);
	else
		c = neon_simple_alignq(c, v_0, shift);
	if(len >= SOVUCQ || len + shift >= SOVUCQ)
	{
		p   += SOVUCQ;
		len -= SOVUCQ - shift;
		v_sum = vpadalq_u16(v_sum, vpaddlq_u8(vcntq_u8(c)));

		while(len >= SOVUCQ * 2)
		{
			uint8x16_t v_sumb = v_0;

			/* Bound the run so byte counters can't overflow. */
			r    = len / (SOVUCQ * 2);
			r    = r > 15 ? 15 : r;
			len -= r * SOVUCQ * 2;
			/*
			 * NEON has a vector popcnt instruction, so no compression.
			 * We trust the speed given in the handbook (adding more
			 * instructions would not make it faster), 1-2 cycles.
			 */
			for(; r; r--, p += SOVUCQ * 2) {
				c      = *(const uint8x16_t *)p;
				v_sumb = vaddq_u8(v_sumb, vcntq_u8(c));
				c      = *((const uint8x16_t *)(p + SOVUCQ));
				v_sumb = vaddq_u8(v_sumb, vcntq_u8(c));
			}
			/* Widen the byte counters into the 32-bit accumulator. */
			v_sum = vpadalq_u16(v_sum, vpaddlq_u8(v_sumb));
		}
		if(len >= SOVUCQ) {
			c      = *(const uint8x16_t *)p;
			p     += SOVUCQ;
			v_sum  = vpadalq_u16(v_sum, vpaddlq_u8(vcntq_u8(c)));
			len   -= SOVUCQ;
		}

		if(len)
			c = *(const uint8x16_t *)p;
	}
	if(len) {
		/* Mask off the trailing bytes past s + len. */
		if(HOST_IS_BIGENDIAN)
			c = neon_simple_alignq(c, v_0, SOVUCQ - len);
		else
			c = neon_simple_alignq(v_0, c, len);
		v_sum = vpadalq_u16(v_sum, vpaddlq_u8(vcntq_u8(c)));
	}

	/* Horizontal reduction of the four 32-bit partial sums. */
	v_tsum = vpadd_u32(vget_high_u32(v_sum), vget_low_u32(v_sum));
	v_tsum = vpadd_u32(v_tsum, v_tsum);
	return vget_lane_u32(v_tsum, 0);
}
/*
 * Sum of squared differences of a 16x16 block between src and ref.
 * Eight iterations process two rows each; byte differences are widened
 * with vsubl_u8 (wrap-around encodes negatives) and reinterpreted as
 * signed before squaring with vmlal_s16, which makes the squares
 * correct.  The total is stored to *sse and also returned.
 */
unsigned int vpx_mse16x16_neon( const unsigned char *src_ptr, int source_stride,
                                const unsigned char *ref_ptr, int recon_stride,
                                unsigned int *sse) {
    int i;
    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
    int64x1_t d0s64;
    uint8x16_t q0u8, q1u8, q2u8, q3u8;
    int32x4_t q7s32, q8s32, q9s32, q10s32;
    uint16x8_t q11u16, q12u16, q13u16, q14u16;
    int64x2_t q1s64;

    /* Four independent 32-bit accumulators for the squared terms. */
    q7s32 = vdupq_n_s32(0);
    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

    for (i = 0; i < 8; i++) { // mse16x16_neon_loop
        q0u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        q1u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        q2u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q3u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;

        /* Widened differences for two 16-pixel rows. */
        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));

        /* Square-and-accumulate each half. */
        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q7s32 = vmlal_s16(q7s32, d22s16, d22s16);
        q8s32 = vmlal_s16(q8s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
        q7s32 = vmlal_s16(q7s32, d26s16, d26s16);
        q8s32 = vmlal_s16(q8s32, d27s16, d27s16);

        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
    }

    /* Combine the four accumulators and reduce to a scalar. */
    q7s32 = vaddq_s32(q7s32, q8s32);
    q9s32 = vaddq_s32(q9s32, q10s32);
    q10s32 = vaddq_s32(q7s32, q9s32);

    q1s64 = vpaddlq_s32(q10s32);
    d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    /* Store the SSE to *sse and return the same value. */
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0);
    return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
}
// Clang/LLVM FileCheck codegen test: verifies that vget_lane_u32 on
// lane 1 lowers to a umov from vector lane s[1].  The CHECK comments
// are FileCheck directives and must not be altered.
uint32_t test_vget_lane_u32(uint32x2_t v1) { // CHECK: test_vget_lane_u32 return vget_lane_u32(v1, 1); // CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.s[1] }
/*
 * Variance of a 16x8 block: accumulates both the sum of differences
 * (q8s32, via vpadalq_s16) and the sum of squared differences
 * (q9s32/q10s32, via vmlal_s16) over four two-row iterations.  Stores
 * the squared-difference total to *sse and returns
 * sse - (sum * sum) / 128 (the >> 7 shift), i.e. the block variance.
 */
unsigned int vp8_variance16x8_neon( const unsigned char *src_ptr, int source_stride,
                                    const unsigned char *ref_ptr, int recon_stride,
                                    unsigned int *sse) {
    int i;
    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64;
    uint8x16_t q0u8, q1u8, q2u8, q3u8;
    uint16x8_t q11u16, q12u16, q13u16, q14u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

    for (i = 0; i < 4; i++) { // variance16x8_neon_loop
        q0u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        q1u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        __builtin_prefetch(src_ptr);

        q2u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q3u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        __builtin_prefetch(ref_ptr);

        /* Widened differences; wrap-around encodes negatives, fixed up
         * by the signed reinterpretation below. */
        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));

        /* For each half: fold the raw differences into the sum (q8s32)
         * and the squared differences into q9s32/q10s32. */
        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);

        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
    }

    /* Reduce sum (d0s64) and sum-of-squares (d1s64) to scalars. */
    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    /* variance = sse - sum^2 / (16*8). */
    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}
/*
 * Macroblock loop filter for one 8-pixel-wide segment.  Inputs are the
 * pixel rows p3..q3 plus the blimit/limit/thresh comparison vectors;
 * outputs *d0ru8..*d5ru8 receive the filtered p2..q2 rows.
 *
 * Stage 1 builds the filter mask (d19u8/d20u8) and high-edge-variance
 * mask (d23u8) from neighbouring-pixel absolute differences.  The
 * per-pixel "flat" decision is collapsed into a 32-bit scalar `flat`:
 *  - flat == 0xffffffff: every pixel is flat, run only the wide
 *    (7-tap rounded-average) filter;
 *  - flat == 0: no pixel is flat, run only the standard 4-tap filter
 *    and return early;
 *  - otherwise: compute both and blend per pixel with vbsl_u8(d20u8,...).
 */
static INLINE void mbloop_filter_neon(uint8x8_t dblimit,   // mblimit
                                      uint8x8_t dlimit,    // limit
                                      uint8x8_t dthresh,   // thresh
                                      uint8x8_t d3u8,      // p3
                                      uint8x8_t d4u8,      // p2
                                      uint8x8_t d5u8,      // p1
                                      uint8x8_t d6u8,      // p0
                                      uint8x8_t d7u8,      // q0
                                      uint8x8_t d16u8,     // q1
                                      uint8x8_t d17u8,     // q2
                                      uint8x8_t d18u8,     // q3
                                      uint8x8_t *d0ru8,    // p2
                                      uint8x8_t *d1ru8,    // p1
                                      uint8x8_t *d2ru8,    // p0
                                      uint8x8_t *d3ru8,    // q0
                                      uint8x8_t *d4ru8,    // q1
                                      uint8x8_t *d5ru8) {  // q2
    uint32_t flat;
    uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8;
    uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
    int16x8_t q15s16;
    uint16x8_t q10u16, q14u16;
    int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8;

    /* Absolute differences between neighbouring rows feed the masks. */
    d19u8 = vabd_u8(d3u8, d4u8);
    d20u8 = vabd_u8(d4u8, d5u8);
    d21u8 = vabd_u8(d5u8, d6u8);
    d22u8 = vabd_u8(d16u8, d7u8);
    d23u8 = vabd_u8(d17u8, d16u8);
    d24u8 = vabd_u8(d18u8, d17u8);

    d19u8 = vmax_u8(d19u8, d20u8);
    d20u8 = vmax_u8(d21u8, d22u8);
    d25u8 = vabd_u8(d6u8, d4u8);
    d23u8 = vmax_u8(d23u8, d24u8);
    d26u8 = vabd_u8(d7u8, d17u8);
    d19u8 = vmax_u8(d19u8, d20u8);
    d24u8 = vabd_u8(d6u8, d7u8);
    d27u8 = vabd_u8(d3u8, d6u8);
    d28u8 = vabd_u8(d18u8, d7u8);
    d19u8 = vmax_u8(d19u8, d23u8);
    d23u8 = vabd_u8(d5u8, d16u8);
    d24u8 = vqadd_u8(d24u8, d24u8);

    /* Inner-edge mask: max neighbour difference <= limit. */
    d19u8 = vcge_u8(dlimit, d19u8);

    d25u8 = vmax_u8(d25u8, d26u8);
    d26u8 = vmax_u8(d27u8, d28u8);
    d23u8 = vshr_n_u8(d23u8, 1);
    d25u8 = vmax_u8(d25u8, d26u8);
    d24u8 = vqadd_u8(d24u8, d23u8);
    d20u8 = vmax_u8(d20u8, d25u8);
    d23u8 = vdup_n_u8(1);

    /* Edge mask: |p0-q0|*2 + |p1-q1|/2 <= mblimit. */
    d24u8 = vcge_u8(dblimit, d24u8);
    d21u8 = vcgt_u8(d21u8, dthresh);

    /* Flat mask: all second-neighbour differences <= 1. */
    d20u8 = vcge_u8(d23u8, d20u8);
    d19u8 = vand_u8(d19u8, d24u8);
    d23u8 = vcgt_u8(d22u8, dthresh);
    d20u8 = vand_u8(d20u8, d19u8);

    d22u8 = vdup_n_u8(0x80);  /* sign-flip constant for the 4-tap path */
    d23u8 = vorr_u8(d21u8, d23u8);

    /* Narrow the flat mask into a 32-bit scalar decision. */
    q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8), vreinterpret_u16_u8(d21u8));
    d30u8 = vshrn_n_u16(q10u16, 4);
    flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0);

    if (flat == 0xffffffff) { // Check for all 1's, power_branch_only
        /* Wide filter only: 7-tap rounded averages, updating the
         * running sum q14u16 by sliding the tap window each output. */
        d27u8 = vdup_n_u8(3);
        d21u8 = vdup_n_u8(2);
        q14u16 = vaddl_u8(d6u8, d7u8);
        q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
        q14u16 = vmlal_u8(q14u16, d4u8, d21u8);
        q14u16 = vaddw_u8(q14u16, d5u8);
        *d0ru8 = vqrshrn_n_u16(q14u16, 3);

        q14u16 = vsubw_u8(q14u16, d3u8);
        q14u16 = vsubw_u8(q14u16, d4u8);
        q14u16 = vaddw_u8(q14u16, d5u8);
        q14u16 = vaddw_u8(q14u16, d16u8);
        *d1ru8 = vqrshrn_n_u16(q14u16, 3);

        q14u16 = vsubw_u8(q14u16, d3u8);
        q14u16 = vsubw_u8(q14u16, d5u8);
        q14u16 = vaddw_u8(q14u16, d6u8);
        q14u16 = vaddw_u8(q14u16, d17u8);
        *d2ru8 = vqrshrn_n_u16(q14u16, 3);

        q14u16 = vsubw_u8(q14u16, d3u8);
        q14u16 = vsubw_u8(q14u16, d6u8);
        q14u16 = vaddw_u8(q14u16, d7u8);
        q14u16 = vaddw_u8(q14u16, d18u8);
        *d3ru8 = vqrshrn_n_u16(q14u16, 3);

        q14u16 = vsubw_u8(q14u16, d4u8);
        q14u16 = vsubw_u8(q14u16, d7u8);
        q14u16 = vaddw_u8(q14u16, d16u8);
        q14u16 = vaddw_u8(q14u16, d18u8);
        *d4ru8 = vqrshrn_n_u16(q14u16, 3);

        q14u16 = vsubw_u8(q14u16, d5u8);
        q14u16 = vsubw_u8(q14u16, d16u8);
        q14u16 = vaddw_u8(q14u16, d17u8);
        q14u16 = vaddw_u8(q14u16, d18u8);
        *d5ru8 = vqrshrn_n_u16(q14u16, 3);
    } else {
        /* Standard 4-tap filter in signed (0x80-offset) domain. */
        d21u8 = veor_u8(d7u8, d22u8);
        d24u8 = veor_u8(d6u8, d22u8);
        d25u8 = veor_u8(d5u8, d22u8);
        d26u8 = veor_u8(d16u8, d22u8);

        d27u8 = vdup_n_u8(3);
        d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8));
        d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8));
        q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8));
        d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8));
        q15s16 = vaddw_s8(q15s16, d29s8);

        d29u8 = vdup_n_u8(4);
        d28s8 = vqmovn_s16(q15s16);
        d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8));

        /* filter1 = (F + 3) >> 3, filter2 = (F + 4) >> 3. */
        d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8));
        d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8));
        d30s8 = vshr_n_s8(d30s8, 3);
        d29s8 = vshr_n_s8(d29s8, 3);

        d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8);
        d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8);

        /* Outer taps get half the adjustment, suppressed on hev. */
        d29s8 = vrshr_n_s8(d29s8, 1);
        d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8));

        d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8);
        d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8);

        if (flat == 0) { // filter_branch_only
            /* Only the 4-tap results; p2/q2 pass through unchanged. */
            *d0ru8 = d4u8;
            *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
            *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
            *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
            *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
            *d5ru8 = d17u8;
            return;
        }

        /* Mixed case: compute the wide filter too, then blend each
         * output between wide (flat lanes) and 4-tap results. */
        d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
        d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
        d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
        d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);

        d23u8 = vdup_n_u8(2);
        q14u16 = vaddl_u8(d6u8, d7u8);
        q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
        q14u16 = vmlal_u8(q14u16, d4u8, d23u8);

        /* NOTE(review): dblimit/dlimit/dthresh are reused here purely as
         * scratch registers holding prior values — verify against the
         * original assembly before changing. */
        d0u8 = vbsl_u8(d20u8, dblimit, d4u8);

        q14u16 = vaddw_u8(q14u16, d5u8);

        d1u8 = vbsl_u8(d20u8, dlimit, d25u8);

        d30u8 = vqrshrn_n_u16(q14u16, 3);

        q14u16 = vsubw_u8(q14u16, d3u8);
        q14u16 = vsubw_u8(q14u16, d4u8);
        q14u16 = vaddw_u8(q14u16, d5u8);
        q14u16 = vaddw_u8(q14u16, d16u8);

        d2u8 = vbsl_u8(d20u8, dthresh, d24u8);

        d31u8 = vqrshrn_n_u16(q14u16, 3);

        q14u16 = vsubw_u8(q14u16, d3u8);
        q14u16 = vsubw_u8(q14u16, d5u8);
        q14u16 = vaddw_u8(q14u16, d6u8);
        q14u16 = vaddw_u8(q14u16, d17u8);

        *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8);

        d23u8 = vqrshrn_n_u16(q14u16, 3);

        q14u16 = vsubw_u8(q14u16, d3u8);
        q14u16 = vsubw_u8(q14u16, d6u8);
        q14u16 = vaddw_u8(q14u16, d7u8);

        *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8);

        q14u16 = vaddw_u8(q14u16, d18u8);

        *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8);

        d22u8 = vqrshrn_n_u16(q14u16, 3);

        q14u16 = vsubw_u8(q14u16, d4u8);
        q14u16 = vsubw_u8(q14u16, d7u8);
        q14u16 = vaddw_u8(q14u16, d16u8);

        d3u8 = vbsl_u8(d20u8, d3u8, d21u8);

        q14u16 = vaddw_u8(q14u16, d18u8);

        d4u8 = vbsl_u8(d20u8, d4u8, d26u8);

        d6u8 = vqrshrn_n_u16(q14u16, 3);

        q14u16 = vsubw_u8(q14u16, d5u8);
        q14u16 = vsubw_u8(q14u16, d16u8);
        q14u16 = vaddw_u8(q14u16, d17u8);
        q14u16 = vaddw_u8(q14u16, d18u8);

        d5u8 = vbsl_u8(d20u8, d5u8, d17u8);

        d7u8 = vqrshrn_n_u16(q14u16, 3);

        *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8);
        *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8);
        *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8);
    }
    return;
}
/*
 * Complex multiplication of (inre1Q9, inre2Q9) by the cos/sin tables,
 * scaled by 0.5/sqrt(240): real = (cos*re1 + sin*re2) * factor,
 * imag = (cos*re2 - sin*re1) * factor.  Writes the Q16 results to
 * outreQ16/outimQ16 and returns the maximum absolute output value.
 * The maxima are tracked in u32 registers so that 0x80000000 (which
 * vabs leaves unchanged) is not lost to signed comparison.
 */
static inline int32_t ComplexMulAndFindMaxNeon(int16_t* inre1Q9,
                                               int16_t* inre2Q9,
                                               int32_t* outreQ16,
                                               int32_t* outimQ16) {
  int k;
  const int16_t* kCosTab = &WebRtcIsacfix_kCosTab1[0];
  const int16_t* kSinTab = &WebRtcIsacfix_kSinTab1[0];
  // 0.5 / sqrt(240) in Q19 is round((.5 / sqrt(240)) * (2^19)) = 16921.
  // Use "16921 << 5" and vqdmulh, instead of ">> 26" as in the C code.
  int32_t fact = 16921 << 5;
  int32x4_t factq = vdupq_n_s32(fact);
  uint32x4_t max_r = vdupq_n_u32(0);
  uint32x4_t max_i = vdupq_n_u32(0);

  for (k = 0; k < FRAMESAMPLES/2; k += 8) {
    int16x8_t tmpr = vld1q_s16(kCosTab);
    int16x8_t tmpi = vld1q_s16(kSinTab);
    int16x8_t inre1 = vld1q_s16(inre1Q9);
    int16x8_t inre2 = vld1q_s16(inre2Q9);
    kCosTab += 8;
    kSinTab += 8;
    inre1Q9 += 8;
    inre2Q9 += 8;

    // Use ">> 26", instead of ">> 7", ">> 16" and then ">> 3" as in the C code.
    int32x4_t tmp0 = vmull_s16(vget_low_s16(tmpr), vget_low_s16(inre1));
    int32x4_t tmp1 = vmull_s16(vget_low_s16(tmpr), vget_low_s16(inre2));
    tmp0 = vmlal_s16(tmp0, vget_low_s16(tmpi), vget_low_s16(inre2));
    tmp1 = vmlsl_s16(tmp1, vget_low_s16(tmpi), vget_low_s16(inre1));
#if defined(WEBRTC_ARCH_ARM64)
    /* AArch64: widening multiply of high halves in one instruction. */
    int32x4_t tmp2 = vmull_high_s16(tmpr, inre1);
    int32x4_t tmp3 = vmull_high_s16(tmpr, inre2);
    tmp2 = vmlal_high_s16(tmp2, tmpi, inre2);
    tmp3 = vmlsl_high_s16(tmp3, tmpi, inre1);
#else
    int32x4_t tmp2 = vmull_s16(vget_high_s16(tmpr), vget_high_s16(inre1));
    int32x4_t tmp3 = vmull_s16(vget_high_s16(tmpr), vget_high_s16(inre2));
    tmp2 = vmlal_s16(tmp2, vget_high_s16(tmpi), vget_high_s16(inre2));
    tmp3 = vmlsl_s16(tmp3, vget_high_s16(tmpi), vget_high_s16(inre1));
#endif
    /* Apply the 0.5/sqrt(240) scale via saturating doubling high mul. */
    int32x4_t outr_0 = vqdmulhq_s32(tmp0, factq);
    int32x4_t outr_1 = vqdmulhq_s32(tmp2, factq);
    int32x4_t outi_0 = vqdmulhq_s32(tmp1, factq);
    int32x4_t outi_1 = vqdmulhq_s32(tmp3, factq);
    vst1q_s32(outreQ16, outr_0);
    outreQ16 += 4;
    vst1q_s32(outreQ16, outr_1);
    outreQ16 += 4;
    vst1q_s32(outimQ16, outi_0);
    outimQ16 += 4;
    vst1q_s32(outimQ16, outi_1);
    outimQ16 += 4;

    // Find the absolute maximum in the vectors.
    tmp0 = vabsq_s32(outr_0);
    tmp1 = vabsq_s32(outr_1);
    tmp2 = vabsq_s32(outi_0);
    tmp3 = vabsq_s32(outi_1);
    // vabs doesn't change the value of 0x80000000.
    // Use u32 so we don't lose the value 0x80000000.
    max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp0));
    max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp2));
    max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp1));
    max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp3));
  }

  max_r = vmaxq_u32(max_r, max_i);
#if defined(WEBRTC_ARCH_ARM64)
  uint32_t maximum = vmaxvq_u32(max_r);
#else
  uint32x2_t max32x2_r = vmax_u32(vget_low_u32(max_r), vget_high_u32(max_r));
  max32x2_r = vpmax_u32(max32x2_r, max32x2_r);
  uint32_t maximum = vget_lane_u32(max32x2_r, 0);
#endif

  return (int32_t)maximum;
}
/*
 * Pre-FFT twiddle of (inre, inim): processes four samples from the front
 * and four mirrored samples from the back of each half-frame per
 * iteration, using the shared sine table read forwards (tmpi) and
 * backwards-negated (tmpr).  Writes the transformed values to
 * outre/outim (front blocks ascending, back blocks descending, with the
 * back vectors lane-reversed) and returns the maximum absolute output.
 * The maxima are tracked in u32 so 0x80000000 (unchanged by vabs) is
 * not lost to signed comparison.
 */
static inline int32_t TransformAndFindMaxNeon(int16_t* inre,
                                              int16_t* inim,
                                              int32_t* outre,
                                              int32_t* outim) {
  int k;
  int16_t* inre1 = inre;
  int16_t* inre2 = &inre[FRAMESAMPLES/2 - 4];
  int16_t* inim1 = inim;
  int16_t* inim2 = &inim[FRAMESAMPLES/2 - 4];
  int32_t* outre1 = outre;
  int32_t* outre2 = &outre[FRAMESAMPLES/2 - 4];
  int32_t* outim1 = outim;
  int32_t* outim2 = &outim[FRAMESAMPLES/2 - 4];
  const int16_t* kSinTab1 = &WebRtcIsacfix_kSinTab2[0];
  const int16_t* kSinTab2 = &WebRtcIsacfix_kSinTab2[FRAMESAMPLES/4 - 4];
  uint32x4_t max_r = vdupq_n_u32(0);
  uint32x4_t max_i = vdupq_n_u32(0);

  // Use ">> 5", instead of "<< 9" and then ">> 14" as in the C code.
  for (k = 0; k < FRAMESAMPLES/4; k += 4) {
    int16x4_t tmpi = vld1_s16(kSinTab1);
    kSinTab1 += 4;
    int16x4_t tmpr = vld1_s16(kSinTab2);
    kSinTab2 -= 4;
    int16x4_t inre_0 = vld1_s16(inre1);
    inre1 += 4;
    int16x4_t inre_1 = vld1_s16(inre2);
    inre2 -= 4;
    int16x4_t inim_0 = vld1_s16(inim1);
    inim1 += 4;
    int16x4_t inim_1 = vld1_s16(inim2);
    inim2 -= 4;

    /* The backwards-read vectors must be negated and lane-reversed to
     * line up with the forwards-read ones. */
    tmpr = vneg_s16(tmpr);
    inre_1 = vrev64_s16(inre_1);
    inim_1 = vrev64_s16(inim_1);
    tmpr = vrev64_s16(tmpr);

    int32x4_t xr = vmull_s16(tmpr, inre_0);
    int32x4_t xi = vmull_s16(tmpr, inim_0);
    int32x4_t yr = vmull_s16(tmpr, inim_1);
    int32x4_t yi = vmull_s16(tmpi, inim_1);
    xr = vmlal_s16(xr, tmpi, inim_0);
    xi = vmlsl_s16(xi, tmpi, inre_0);
    yr = vmlal_s16(yr, tmpi, inre_1);
    yi = vmlsl_s16(yi, tmpr, inre_1);
    yr = vnegq_s32(yr);

    xr = vshrq_n_s32(xr, 5);
    xi = vshrq_n_s32(xi, 5);
    yr = vshrq_n_s32(yr, 5);
    yi = vshrq_n_s32(yi, 5);

    int32x4_t outr0 = vsubq_s32(xr, yi);
    int32x4_t outr1 = vaddq_s32(xr, yi);
    int32x4_t outi0 = vaddq_s32(xi, yr);
    int32x4_t outi1 = vsubq_s32(yr, xi);

    // Find the absolute maximum in the vectors.
    int32x4_t tmp0 = vabsq_s32(outr0);
    int32x4_t tmp1 = vabsq_s32(outr1);
    int32x4_t tmp2 = vabsq_s32(outi0);
    int32x4_t tmp3 = vabsq_s32(outi1);
    // vabs doesn't change the value of 0x80000000.
    // Use u32 so we don't lose the value 0x80000000.
    max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp0));
    max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp2));
    max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp1));
    max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp3));

    // Store the vectors.
    outr1 = vrev64q_s32(outr1);
    outi1 = vrev64q_s32(outi1);
    int32x4_t outr_1 = vcombine_s32(vget_high_s32(outr1), vget_low_s32(outr1));
    int32x4_t outi_1 = vcombine_s32(vget_high_s32(outi1), vget_low_s32(outi1));

    vst1q_s32(outre1, outr0);
    outre1 += 4;
    vst1q_s32(outim1, outi0);
    outim1 += 4;
    vst1q_s32(outre2, outr_1);
    outre2 -= 4;
    vst1q_s32(outim2, outi_1);
    outim2 -= 4;
  }

  max_r = vmaxq_u32(max_r, max_i);
#if defined(WEBRTC_ARCH_ARM64)
  uint32_t maximum = vmaxvq_u32(max_r);
#else
  uint32x2_t max32x2_r = vmax_u32(vget_low_u32(max_r), vget_high_u32(max_r));
  max32x2_r = vpmax_u32(max32x2_r, max32x2_r);
  uint32_t maximum = vget_lane_u32(max32x2_r, 0);
#endif

  return (int32_t)maximum;
}
// Computes the variance between a sub-pixel-interpolated 16x16 block of
// src_ptr and a 16x16 block of dst_ptr.
// xoffset/yoffset select the horizontal/vertical bilinear filter taps from
// bilinear_taps_coeff; either may be 0, in which case the corresponding
// filter pass is skipped.  The filtered block is staged in the on-stack
// `tmp` buffer: first-pass output at tmp[0..], final (second-pass) output
// at tmp + 272 (17 rows * 16 columns for the intermediate pass).
// *sse receives the sum of squared differences; the return value is
// sse - (sum*sum >> 8), i.e. the variance over the 256 pixels.
unsigned int vp8_sub_pixel_variance16x16_neon_func(
        const unsigned char *src_ptr,
        int src_pixels_per_line,
        int xoffset,
        int yoffset,
        const unsigned char *dst_ptr,
        int dst_pixels_per_line,
        unsigned int *sse) {
  int i;
  DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 528);
  unsigned char *tmpp;
  unsigned char *tmpp2;
  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
  uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8;
  uint8x8_t d19u8, d20u8, d21u8;
  int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
  uint32x2_t d0u32, d10u32;
  int64x1_t d0s64, d1s64, d2s64, d3s64;
  uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8;
  uint8x16_t q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
  uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16;
  uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16;
  int32x4_t q8s32, q9s32, q10s32;
  int64x2_t q0s64, q1s64, q5s64;

  tmpp2 = tmp + 272;
  tmpp = tmp;
  if (xoffset == 0) {  // secondpass_bfilter16x16_only
    // Vertical-only bilinear filter: each output row is a weighted average
    // of two consecutive source rows (taps d0u8/d1u8), rounded by
    // vqrshrn_n_u16(..., 7).  Results go straight to tmp + 272.
    d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]);
    d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]);

    q11u8 = vld1q_u8(src_ptr);
    src_ptr += src_pixels_per_line;
    for (i = 4; i > 0; i--) {  // 4 iterations x 4 rows = 16 output rows
      q12u8 = vld1q_u8(src_ptr);
      src_ptr += src_pixels_per_line;
      q13u8 = vld1q_u8(src_ptr);
      src_ptr += src_pixels_per_line;
      q14u8 = vld1q_u8(src_ptr);
      src_ptr += src_pixels_per_line;
      q15u8 = vld1q_u8(src_ptr);
      src_ptr += src_pixels_per_line;

      __builtin_prefetch(src_ptr);
      __builtin_prefetch(src_ptr + src_pixels_per_line);
      __builtin_prefetch(src_ptr + src_pixels_per_line * 2);

      q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
      q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
      q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
      q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
      q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
      q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
      q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
      q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);

      q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
      q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
      q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
      q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
      q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
      q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
      q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
      q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);

      // Round and narrow back to 8 bits (>> 7 with rounding).
      d2u8 = vqrshrn_n_u16(q1u16, 7);
      d3u8 = vqrshrn_n_u16(q2u16, 7);
      d4u8 = vqrshrn_n_u16(q3u16, 7);
      d5u8 = vqrshrn_n_u16(q4u16, 7);
      d6u8 = vqrshrn_n_u16(q5u16, 7);
      d7u8 = vqrshrn_n_u16(q6u16, 7);
      d8u8 = vqrshrn_n_u16(q7u16, 7);
      d9u8 = vqrshrn_n_u16(q8u16, 7);

      q1u8 = vcombine_u8(d2u8, d3u8);
      q2u8 = vcombine_u8(d4u8, d5u8);
      q3u8 = vcombine_u8(d6u8, d7u8);
      q4u8 = vcombine_u8(d8u8, d9u8);

      q11u8 = q15u8;  // last loaded row becomes "previous row" next iteration

      vst1q_u8((uint8_t *)tmpp2, q1u8);
      tmpp2 += 16;
      vst1q_u8((uint8_t *)tmpp2, q2u8);
      tmpp2 += 16;
      vst1q_u8((uint8_t *)tmpp2, q3u8);
      tmpp2 += 16;
      vst1q_u8((uint8_t *)tmpp2, q4u8);
      tmpp2 += 16;
    }
  } else if (yoffset == 0) {  // firstpass_bfilter16x16_only
    // Horizontal-only bilinear filter: each output pixel is a weighted
    // average of two neighboring pixels (vext_u8 shifts by one byte to get
    // the right-hand neighbor).  17 bytes per row are loaded (d*u8 triplets).
    d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]);
    d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]);

    for (i = 4; i > 0 ; i--) {  // 4 iterations x 4 rows = 16 output rows
      d2u8 = vld1_u8(src_ptr);
      d3u8 = vld1_u8(src_ptr + 8);
      d4u8 = vld1_u8(src_ptr + 16);
      src_ptr += src_pixels_per_line;
      d5u8 = vld1_u8(src_ptr);
      d6u8 = vld1_u8(src_ptr + 8);
      d7u8 = vld1_u8(src_ptr + 16);
      src_ptr += src_pixels_per_line;
      d8u8 = vld1_u8(src_ptr);
      d9u8 = vld1_u8(src_ptr + 8);
      d10u8 = vld1_u8(src_ptr + 16);
      src_ptr += src_pixels_per_line;
      d11u8 = vld1_u8(src_ptr);
      d12u8 = vld1_u8(src_ptr + 8);
      d13u8 = vld1_u8(src_ptr + 16);
      src_ptr += src_pixels_per_line;

      __builtin_prefetch(src_ptr);
      __builtin_prefetch(src_ptr + src_pixels_per_line);
      __builtin_prefetch(src_ptr + src_pixels_per_line * 2);

      q7u16  = vmull_u8(d2u8, d0u8);
      q8u16  = vmull_u8(d3u8, d0u8);
      q9u16  = vmull_u8(d5u8, d0u8);
      q10u16 = vmull_u8(d6u8, d0u8);
      q11u16 = vmull_u8(d8u8, d0u8);
      q12u16 = vmull_u8(d9u8, d0u8);
      q13u16 = vmull_u8(d11u8, d0u8);
      q14u16 = vmull_u8(d12u8, d0u8);

      // Shift each row left by one pixel to get the right-hand neighbors.
      d2u8  = vext_u8(d2u8, d3u8, 1);
      d5u8  = vext_u8(d5u8, d6u8, 1);
      d8u8  = vext_u8(d8u8, d9u8, 1);
      d11u8 = vext_u8(d11u8, d12u8, 1);

      q7u16  = vmlal_u8(q7u16, d2u8, d1u8);
      q9u16  = vmlal_u8(q9u16, d5u8, d1u8);
      q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
      q13u16 = vmlal_u8(q13u16, d11u8, d1u8);

      d3u8  = vext_u8(d3u8, d4u8, 1);
      d6u8  = vext_u8(d6u8, d7u8, 1);
      d9u8  = vext_u8(d9u8, d10u8, 1);
      d12u8 = vext_u8(d12u8, d13u8, 1);

      q8u16  = vmlal_u8(q8u16, d3u8, d1u8);
      q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
      q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
      q14u16 = vmlal_u8(q14u16, d12u8, d1u8);

      d14u8 = vqrshrn_n_u16(q7u16, 7);
      d15u8 = vqrshrn_n_u16(q8u16, 7);
      d16u8 = vqrshrn_n_u16(q9u16, 7);
      d17u8 = vqrshrn_n_u16(q10u16, 7);
      d18u8 = vqrshrn_n_u16(q11u16, 7);
      d19u8 = vqrshrn_n_u16(q12u16, 7);
      d20u8 = vqrshrn_n_u16(q13u16, 7);
      d21u8 = vqrshrn_n_u16(q14u16, 7);

      q7u8  = vcombine_u8(d14u8, d15u8);
      q8u8  = vcombine_u8(d16u8, d17u8);
      q9u8  = vcombine_u8(d18u8, d19u8);
      q10u8 = vcombine_u8(d20u8, d21u8);

      vst1q_u8((uint8_t *)tmpp2, q7u8);
      tmpp2 += 16;
      vst1q_u8((uint8_t *)tmpp2, q8u8);
      tmpp2 += 16;
      vst1q_u8((uint8_t *)tmpp2, q9u8);
      tmpp2 += 16;
      vst1q_u8((uint8_t *)tmpp2, q10u8);
      tmpp2 += 16;
    }
  } else {
    // Both offsets non-zero: horizontal first pass into tmp (17 rows, one
    // extra for the vertical filter), then vertical second pass into
    // tmp + 272.
    d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]);
    d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]);

    d2u8 = vld1_u8(src_ptr);
    d3u8 = vld1_u8(src_ptr + 8);
    d4u8 = vld1_u8(src_ptr + 16);
    src_ptr += src_pixels_per_line;
    d5u8 = vld1_u8(src_ptr);
    d6u8 = vld1_u8(src_ptr + 8);
    d7u8 = vld1_u8(src_ptr + 16);
    src_ptr += src_pixels_per_line;
    d8u8 = vld1_u8(src_ptr);
    d9u8 = vld1_u8(src_ptr + 8);
    d10u8 = vld1_u8(src_ptr + 16);
    src_ptr += src_pixels_per_line;
    d11u8 = vld1_u8(src_ptr);
    d12u8 = vld1_u8(src_ptr + 8);
    d13u8 = vld1_u8(src_ptr + 16);
    src_ptr += src_pixels_per_line;

    // First Pass: output_height lines x output_width columns (17x16)
    for (i = 3; i > 0; i--) {  // 3 iterations x 4 rows = first 12 rows
      q7u16  = vmull_u8(d2u8, d0u8);
      q8u16  = vmull_u8(d3u8, d0u8);
      q9u16  = vmull_u8(d5u8, d0u8);
      q10u16 = vmull_u8(d6u8, d0u8);
      q11u16 = vmull_u8(d8u8, d0u8);
      q12u16 = vmull_u8(d9u8, d0u8);
      q13u16 = vmull_u8(d11u8, d0u8);
      q14u16 = vmull_u8(d12u8, d0u8);

      d2u8  = vext_u8(d2u8, d3u8, 1);
      d5u8  = vext_u8(d5u8, d6u8, 1);
      d8u8  = vext_u8(d8u8, d9u8, 1);
      d11u8 = vext_u8(d11u8, d12u8, 1);

      q7u16  = vmlal_u8(q7u16, d2u8, d1u8);
      q9u16  = vmlal_u8(q9u16, d5u8, d1u8);
      q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
      q13u16 = vmlal_u8(q13u16, d11u8, d1u8);

      d3u8  = vext_u8(d3u8, d4u8, 1);
      d6u8  = vext_u8(d6u8, d7u8, 1);
      d9u8  = vext_u8(d9u8, d10u8, 1);
      d12u8 = vext_u8(d12u8, d13u8, 1);

      q8u16  = vmlal_u8(q8u16, d3u8, d1u8);
      q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
      q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
      q14u16 = vmlal_u8(q14u16, d12u8, d1u8);

      d14u8 = vqrshrn_n_u16(q7u16, 7);
      d15u8 = vqrshrn_n_u16(q8u16, 7);
      d16u8 = vqrshrn_n_u16(q9u16, 7);
      d17u8 = vqrshrn_n_u16(q10u16, 7);
      d18u8 = vqrshrn_n_u16(q11u16, 7);
      d19u8 = vqrshrn_n_u16(q12u16, 7);
      d20u8 = vqrshrn_n_u16(q13u16, 7);
      d21u8 = vqrshrn_n_u16(q14u16, 7);

      // Load the next 4 input rows before storing, overlapping latency.
      d2u8 = vld1_u8(src_ptr);
      d3u8 = vld1_u8(src_ptr + 8);
      d4u8 = vld1_u8(src_ptr + 16);
      src_ptr += src_pixels_per_line;
      d5u8 = vld1_u8(src_ptr);
      d6u8 = vld1_u8(src_ptr + 8);
      d7u8 = vld1_u8(src_ptr + 16);
      src_ptr += src_pixels_per_line;
      d8u8 = vld1_u8(src_ptr);
      d9u8 = vld1_u8(src_ptr + 8);
      d10u8 = vld1_u8(src_ptr + 16);
      src_ptr += src_pixels_per_line;
      d11u8 = vld1_u8(src_ptr);
      d12u8 = vld1_u8(src_ptr + 8);
      d13u8 = vld1_u8(src_ptr + 16);
      src_ptr += src_pixels_per_line;

      q7u8  = vcombine_u8(d14u8, d15u8);
      q8u8  = vcombine_u8(d16u8, d17u8);
      q9u8  = vcombine_u8(d18u8, d19u8);
      q10u8 = vcombine_u8(d20u8, d21u8);

      vst1q_u8((uint8_t *)tmpp, q7u8);
      tmpp += 16;
      vst1q_u8((uint8_t *)tmpp, q8u8);
      tmpp += 16;
      vst1q_u8((uint8_t *)tmpp, q9u8);
      tmpp += 16;
      vst1q_u8((uint8_t *)tmpp, q10u8);
      tmpp += 16;
    }

    // First-pass filtering for rest 5 lines
    d14u8 = vld1_u8(src_ptr);
    d15u8 = vld1_u8(src_ptr + 8);
    d16u8 = vld1_u8(src_ptr + 16);
    src_ptr += src_pixels_per_line;

    q9u16  = vmull_u8(d2u8, d0u8);
    q10u16 = vmull_u8(d3u8, d0u8);
    q11u16 = vmull_u8(d5u8, d0u8);
    q12u16 = vmull_u8(d6u8, d0u8);
    q13u16 = vmull_u8(d8u8, d0u8);
    q14u16 = vmull_u8(d9u8, d0u8);

    d2u8 = vext_u8(d2u8, d3u8, 1);
    d5u8 = vext_u8(d5u8, d6u8, 1);
    d8u8 = vext_u8(d8u8, d9u8, 1);

    q9u16  = vmlal_u8(q9u16, d2u8, d1u8);
    q11u16 = vmlal_u8(q11u16, d5u8, d1u8);
    q13u16 = vmlal_u8(q13u16, d8u8, d1u8);

    d3u8 = vext_u8(d3u8, d4u8, 1);
    d6u8 = vext_u8(d6u8, d7u8, 1);
    d9u8 = vext_u8(d9u8, d10u8, 1);

    q10u16 = vmlal_u8(q10u16, d3u8, d1u8);
    q12u16 = vmlal_u8(q12u16, d6u8, d1u8);
    q14u16 = vmlal_u8(q14u16, d9u8, d1u8);

    q1u16 = vmull_u8(d11u8, d0u8);
    q2u16 = vmull_u8(d12u8, d0u8);
    q3u16 = vmull_u8(d14u8, d0u8);
    q4u16 = vmull_u8(d15u8, d0u8);

    d11u8 = vext_u8(d11u8, d12u8, 1);
    d14u8 = vext_u8(d14u8, d15u8, 1);

    q1u16 = vmlal_u8(q1u16, d11u8, d1u8);
    q3u16 = vmlal_u8(q3u16, d14u8, d1u8);

    d12u8 = vext_u8(d12u8, d13u8, 1);
    d15u8 = vext_u8(d15u8, d16u8, 1);

    q2u16 = vmlal_u8(q2u16, d12u8, d1u8);
    q4u16 = vmlal_u8(q4u16, d15u8, d1u8);

    d10u8 = vqrshrn_n_u16(q9u16, 7);
    d11u8 = vqrshrn_n_u16(q10u16, 7);
    d12u8 = vqrshrn_n_u16(q11u16, 7);
    d13u8 = vqrshrn_n_u16(q12u16, 7);
    d14u8 = vqrshrn_n_u16(q13u16, 7);
    d15u8 = vqrshrn_n_u16(q14u16, 7);
    d16u8 = vqrshrn_n_u16(q1u16, 7);
    d17u8 = vqrshrn_n_u16(q2u16, 7);
    d18u8 = vqrshrn_n_u16(q3u16, 7);
    d19u8 = vqrshrn_n_u16(q4u16, 7);

    q5u8 = vcombine_u8(d10u8, d11u8);
    q6u8 = vcombine_u8(d12u8, d13u8);
    q7u8 = vcombine_u8(d14u8, d15u8);
    q8u8 = vcombine_u8(d16u8, d17u8);
    q9u8 = vcombine_u8(d18u8, d19u8);

    vst1q_u8((uint8_t *)tmpp, q5u8);
    tmpp += 16;
    vst1q_u8((uint8_t *)tmpp, q6u8);
    tmpp += 16;
    vst1q_u8((uint8_t *)tmpp, q7u8);
    tmpp += 16;
    vst1q_u8((uint8_t *)tmpp, q8u8);
    tmpp += 16;
    vst1q_u8((uint8_t *)tmpp, q9u8);

    // secondpass_filter
    // Vertical pass over the 17-row intermediate buffer, writing the final
    // 16 rows to tmp + 272 (same structure as the xoffset == 0 branch).
    d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]);
    d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]);

    tmpp = tmp;
    tmpp2 = tmpp + 272;
    q11u8 = vld1q_u8(tmpp);
    tmpp += 16;
    for (i = 4; i > 0; i--) {
      q12u8 = vld1q_u8(tmpp);
      tmpp += 16;
      q13u8 = vld1q_u8(tmpp);
      tmpp += 16;
      q14u8 = vld1q_u8(tmpp);
      tmpp += 16;
      q15u8 = vld1q_u8(tmpp);
      tmpp += 16;

      q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
      q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
      q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
      q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
      q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
      q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
      q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
      q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);

      q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
      q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
      q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
      q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
      q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
      q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
      q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
      q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);

      d2u8 = vqrshrn_n_u16(q1u16, 7);
      d3u8 = vqrshrn_n_u16(q2u16, 7);
      d4u8 = vqrshrn_n_u16(q3u16, 7);
      d5u8 = vqrshrn_n_u16(q4u16, 7);
      d6u8 = vqrshrn_n_u16(q5u16, 7);
      d7u8 = vqrshrn_n_u16(q6u16, 7);
      d8u8 = vqrshrn_n_u16(q7u16, 7);
      d9u8 = vqrshrn_n_u16(q8u16, 7);

      q1u8 = vcombine_u8(d2u8, d3u8);
      q2u8 = vcombine_u8(d4u8, d5u8);
      q3u8 = vcombine_u8(d6u8, d7u8);
      q4u8 = vcombine_u8(d8u8, d9u8);

      q11u8 = q15u8;

      vst1q_u8((uint8_t *)tmpp2, q1u8);
      tmpp2 += 16;
      vst1q_u8((uint8_t *)tmpp2, q2u8);
      tmpp2 += 16;
      vst1q_u8((uint8_t *)tmpp2, q3u8);
      tmpp2 += 16;
      vst1q_u8((uint8_t *)tmpp2, q4u8);
      tmpp2 += 16;
    }
  }

  // sub_pixel_variance16x16_neon
  // Accumulate sum (q8s32) and sum of squares (q9s32/q10s32) of the
  // differences between the filtered block (at tmp + 272) and dst_ptr.
  // vsubl_u8 wraps modulo 2^16; the vreinterpret*s16 casts recover the
  // signed differences.
  q8s32 = vdupq_n_s32(0);
  q9s32 = vdupq_n_s32(0);
  q10s32 = vdupq_n_s32(0);

  tmpp = tmp + 272;
  for (i = 0; i < 8; i++) {  // sub_pixel_variance16x16_neon_loop
    q0u8 = vld1q_u8(tmpp);
    tmpp += 16;
    q1u8 = vld1q_u8(tmpp);
    tmpp += 16;
    q2u8 = vld1q_u8(dst_ptr);
    dst_ptr += dst_pixels_per_line;
    q3u8 = vld1q_u8(dst_ptr);
    dst_ptr += dst_pixels_per_line;

    d0u8 = vget_low_u8(q0u8);
    d1u8 = vget_high_u8(q0u8);
    d2u8 = vget_low_u8(q1u8);
    d3u8 = vget_high_u8(q1u8);

    q11u16 = vsubl_u8(d0u8, vget_low_u8(q2u8));
    q12u16 = vsubl_u8(d1u8, vget_high_u8(q2u8));
    q13u16 = vsubl_u8(d2u8, vget_low_u8(q3u8));
    q14u16 = vsubl_u8(d3u8, vget_high_u8(q3u8));

    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
    d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
    q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
    q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

    d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
    d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
    q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
    q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

    d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
    d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
    q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
    q10s32 = vmlal_s16(q10s32, d27s16, d27s16);

    d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
    d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
    q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
    q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
  }

  // Reduce vectors to scalar sum (d0s64) and sse (d1s64).
  q10s32 = vaddq_s32(q10s32, q9s32);
  q0s64 = vpaddlq_s32(q8s32);
  q1s64 = vpaddlq_s32(q10s32);

  d0s64 = vget_low_s64(q0s64);
  d1s64 = vget_high_s64(q0s64);
  d2s64 = vget_low_s64(q1s64);
  d3s64 = vget_high_s64(q1s64);
  d0s64 = vadd_s64(d0s64, d1s64);
  d1s64 = vadd_s64(d2s64, d3s64);

  // variance = sse - (sum * sum) >> 8  (256 = 16x16 pixels).
  q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                    vreinterpret_s32_s64(d0s64));
  vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

  d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
  d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

  return vget_lane_u32(d0u32, 0);
}
uint32_t test_vget_lane_u32(uint32x2_t a) { // CHECK-LABEL: test_vget_lane_u32: // CHECK-NEXT: mov.s w0, v0[1] // CHECK-NEXT: ret return vget_lane_u32(a, 1); }
// CHECK-LABEL: define i32 @test_vget_lane_u32(<2 x i32> %a) #0 { // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> // CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 // CHECK: ret i32 [[VGET_LANE]] uint32_t test_vget_lane_u32(uint32x2_t a) { return vget_lane_u32(a, 1); }
// Computes the variance between a 16x16 block of src_ptr, interpolated at
// half-pixel offsets both horizontally and vertically, and a 16x16 block of
// ref_ptr.  Horizontal half-pel is done with vextq_u8 (shift by one pixel) +
// vrhaddq_u8 (rounding average); vertical half-pel averages consecutive
// filtered rows, carrying the previous row in q0u8 across iterations.
// *sse receives the sum of squared differences; the return value is
// sse - (sum*sum) >> 8, i.e. the variance over the 256 pixels.
unsigned int vp8_variance_halfpixvar16x16_hv_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
  int i;
  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
  int16x4_t d0s16, d1s16, d2s16, d3s16, d10s16, d11s16, d12s16, d13s16;
  int16x4_t d18s16, d19s16, d20s16, d21s16, d22s16, d23s16, d24s16, d25s16;
  uint32x2_t d0u32, d10u32;
  int64x1_t d0s64, d1s64, d2s64, d3s64;
  uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8;
  uint16x8_t q0u16, q1u16, q5u16, q6u16, q9u16, q10u16, q11u16, q12u16;
  int32x4_t q13s32, q14s32, q15s32;
  int64x2_t q0s64, q1s64, q5s64;

  // q13s32 accumulates the sum of differences; q14s32/q15s32 the squares.
  q13s32 = vdupq_n_s32(0);
  q14s32 = vdupq_n_s32(0);
  q15s32 = vdupq_n_s32(0);

  // Prime the pipeline: horizontally filter row 0 into q0u8.
  q0u8 = vld1q_u8(src_ptr);
  q1u8 = vld1q_u8(src_ptr + 16);
  src_ptr += source_stride;
  q1u8 = vextq_u8(q0u8, q1u8, 1);
  q0u8 = vrhaddq_u8(q0u8, q1u8);

  for (i = 0; i < 4; i++) {  // vp8_filt_fpo16x16s_4_0_loop_neon
    // Load and horizontally half-pel filter the next 4 source rows.
    q2u8 = vld1q_u8(src_ptr);
    q3u8 = vld1q_u8(src_ptr + 16);
    src_ptr += source_stride;
    q4u8 = vld1q_u8(src_ptr);
    q5u8 = vld1q_u8(src_ptr + 16);
    src_ptr += source_stride;
    q6u8 = vld1q_u8(src_ptr);
    q7u8 = vld1q_u8(src_ptr + 16);
    src_ptr += source_stride;
    q8u8 = vld1q_u8(src_ptr);
    q9u8 = vld1q_u8(src_ptr + 16);
    src_ptr += source_stride;

    q3u8 = vextq_u8(q2u8, q3u8, 1);
    q5u8 = vextq_u8(q4u8, q5u8, 1);
    q7u8 = vextq_u8(q6u8, q7u8, 1);
    q9u8 = vextq_u8(q8u8, q9u8, 1);

    q1u8 = vrhaddq_u8(q2u8, q3u8);
    q2u8 = vrhaddq_u8(q4u8, q5u8);
    q3u8 = vrhaddq_u8(q6u8, q7u8);
    q4u8 = vrhaddq_u8(q8u8, q9u8);

    // Vertical half-pel: average each filtered row with the previous one
    // (q0u8 carries the last filtered row from the prior iteration).
    q0u8 = vrhaddq_u8(q0u8, q1u8);
    q1u8 = vrhaddq_u8(q1u8, q2u8);
    q2u8 = vrhaddq_u8(q2u8, q3u8);
    q3u8 = vrhaddq_u8(q3u8, q4u8);

    q5u8 = vld1q_u8(ref_ptr);
    ref_ptr += recon_stride;
    q6u8 = vld1q_u8(ref_ptr);
    ref_ptr += recon_stride;
    q7u8 = vld1q_u8(ref_ptr);
    ref_ptr += recon_stride;
    q8u8 = vld1q_u8(ref_ptr);
    ref_ptr += recon_stride;

    d0u8 = vget_low_u8(q0u8);
    d1u8 = vget_high_u8(q0u8);
    d2u8 = vget_low_u8(q1u8);
    d3u8 = vget_high_u8(q1u8);
    d4u8 = vget_low_u8(q2u8);
    d5u8 = vget_high_u8(q2u8);
    d6u8 = vget_low_u8(q3u8);
    d7u8 = vget_high_u8(q3u8);

    // Differences filtered - reference; vsubl_u8 wraps modulo 2^16 and the
    // vreinterpret*s16 casts below recover the signed values.
    q9u16  = vsubl_u8(d0u8, vget_low_u8(q5u8));
    q10u16 = vsubl_u8(d1u8, vget_high_u8(q5u8));
    q11u16 = vsubl_u8(d2u8, vget_low_u8(q6u8));
    q12u16 = vsubl_u8(d3u8, vget_high_u8(q6u8));
    q0u16  = vsubl_u8(d4u8, vget_low_u8(q7u8));
    q1u16  = vsubl_u8(d5u8, vget_high_u8(q7u8));
    q5u16  = vsubl_u8(d6u8, vget_low_u8(q8u8));
    q6u16  = vsubl_u8(d7u8, vget_high_u8(q8u8));

    d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
    d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
    q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q9u16));
    q14s32 = vmlal_s16(q14s32, d18s16, d18s16);
    q15s32 = vmlal_s16(q15s32, d19s16, d19s16);

    d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
    d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
    q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q10u16));
    q14s32 = vmlal_s16(q14s32, d20s16, d20s16);
    q15s32 = vmlal_s16(q15s32, d21s16, d21s16);

    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
    d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
    q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q11u16));
    q14s32 = vmlal_s16(q14s32, d22s16, d22s16);
    q15s32 = vmlal_s16(q15s32, d23s16, d23s16);

    d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
    d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
    q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q12u16));
    q14s32 = vmlal_s16(q14s32, d24s16, d24s16);
    q15s32 = vmlal_s16(q15s32, d25s16, d25s16);

    d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16));
    d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16));
    q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q0u16));
    q14s32 = vmlal_s16(q14s32, d0s16, d0s16);
    q15s32 = vmlal_s16(q15s32, d1s16, d1s16);

    d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16));
    d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16));
    q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q1u16));
    q14s32 = vmlal_s16(q14s32, d2s16, d2s16);
    q15s32 = vmlal_s16(q15s32, d3s16, d3s16);

    d10s16 = vreinterpret_s16_u16(vget_low_u16(q5u16));
    d11s16 = vreinterpret_s16_u16(vget_high_u16(q5u16));
    q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q5u16));
    q14s32 = vmlal_s16(q14s32, d10s16, d10s16);
    q15s32 = vmlal_s16(q15s32, d11s16, d11s16);

    d12s16 = vreinterpret_s16_u16(vget_low_u16(q6u16));
    d13s16 = vreinterpret_s16_u16(vget_high_u16(q6u16));
    q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q6u16));
    q14s32 = vmlal_s16(q14s32, d12s16, d12s16);
    q15s32 = vmlal_s16(q15s32, d13s16, d13s16);

    q0u8 = q4u8;  // carry last filtered row into the next iteration
  }

  // Reduce the vectors to scalar sum (d0s64) and sse (d1s64).
  q15s32 = vaddq_s32(q14s32, q15s32);
  q0s64 = vpaddlq_s32(q13s32);
  q1s64 = vpaddlq_s32(q15s32);

  d0s64 = vget_low_s64(q0s64);
  d1s64 = vget_high_s64(q0s64);
  d2s64 = vget_low_s64(q1s64);
  d3s64 = vget_high_s64(q1s64);
  d0s64 = vadd_s64(d0s64, d1s64);
  d1s64 = vadd_s64(d2s64, d3s64);

  // variance = sse - (sum * sum) >> 8  (256 = 16x16 pixels).
  q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                    vreinterpret_s32_s64(d0s64));
  vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

  d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
  d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

  return vget_lane_u32(d0u32, 0);
}
KFR_SINTRIN bool bittestany(const u32neon& a) { const uint32x2_t tmp = vorr_u32(vget_low_u32(*a), vget_high_u32(*a)); return vget_lane_u32(vpmax_u32(tmp, tmp), 0) != 0; }
int search_line_fast (uint32x2_t t) { return vget_lane_u32 (t, 0); }