static INLINE int32x4_t add_dct_const_round_shift_low_8_bd12( const int64x2_t *const in0, const int64x2_t *const in1) { const int64x2_t sum_lo = vaddq_s64(in0[0], in1[0]); const int64x2_t sum_hi = vaddq_s64(in0[1], in1[1]); const int32x2_t out_lo = vrshrn_n_s64(sum_lo, DCT_CONST_BITS); const int32x2_t out_hi = vrshrn_n_s64(sum_hi, DCT_CONST_BITS); return vcombine_s32(out_lo, out_hi); }
int64_t av1_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff, int block_size) { int64x2_t error = vdupq_n_s64(0); assert(block_size >= 8); assert((block_size % 8) == 0); do { const int16x8_t c = vld1q_s16(coeff); const int16x8_t d = vld1q_s16(dqcoeff); const int16x8_t diff = vsubq_s16(c, d); const int16x4_t diff_lo = vget_low_s16(diff); const int16x4_t diff_hi = vget_high_s16(diff); // diff is 15-bits, the squares 30, so we can store 2 in 31-bits before // accumulating them in 64-bits. const int32x4_t err0 = vmull_s16(diff_lo, diff_lo); const int32x4_t err1 = vmlal_s16(err0, diff_hi, diff_hi); const int64x2_t err2 = vaddl_s32(vget_low_s32(err1), vget_high_s32(err1)); error = vaddq_s64(error, err2); coeff += 8; dqcoeff += 8; block_size -= 8; } while (block_size != 0); return vgetq_lane_s64(error, 0) + vgetq_lane_s64(error, 1); }
void test_vaddQs64 (void) { int64x2_t out_int64x2_t; int64x2_t arg0_int64x2_t; int64x2_t arg1_int64x2_t; out_int64x2_t = vaddq_s64 (arg0_int64x2_t, arg1_int64x2_t); }
static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis, int32x4_t *const a0, int32x4_t *const a1, int32x4_t *const a2, int32x4_t *const a3) { int32x4_t b0, b1, b2, b3; int64x2_t c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11; transpose_s32_4x4(a0, a1, a2, a3); b0 = vaddq_s32(*a0, *a2); b1 = vsubq_s32(*a0, *a2); c0 = vmull_lane_s32(vget_low_s32(b0), vget_high_s32(cospis), 0); c1 = vmull_lane_s32(vget_high_s32(b0), vget_high_s32(cospis), 0); c2 = vmull_lane_s32(vget_low_s32(b1), vget_high_s32(cospis), 0); c3 = vmull_lane_s32(vget_high_s32(b1), vget_high_s32(cospis), 0); c4 = vmull_lane_s32(vget_low_s32(*a1), vget_high_s32(cospis), 1); c5 = vmull_lane_s32(vget_high_s32(*a1), vget_high_s32(cospis), 1); c6 = vmull_lane_s32(vget_low_s32(*a1), vget_low_s32(cospis), 1); c7 = vmull_lane_s32(vget_high_s32(*a1), vget_low_s32(cospis), 1); c8 = vmull_lane_s32(vget_low_s32(*a3), vget_low_s32(cospis), 1); c9 = vmull_lane_s32(vget_high_s32(*a3), vget_low_s32(cospis), 1); c10 = vmull_lane_s32(vget_low_s32(*a3), vget_high_s32(cospis), 1); c11 = vmull_lane_s32(vget_high_s32(*a3), vget_high_s32(cospis), 1); c4 = vsubq_s64(c4, c8); c5 = vsubq_s64(c5, c9); c6 = vaddq_s64(c6, c10); c7 = vaddq_s64(c7, c11); b0 = vcombine_s32(vrshrn_n_s64(c0, DCT_CONST_BITS), vrshrn_n_s64(c1, DCT_CONST_BITS)); b1 = vcombine_s32(vrshrn_n_s64(c2, DCT_CONST_BITS), vrshrn_n_s64(c3, DCT_CONST_BITS)); b2 = vcombine_s32(vrshrn_n_s64(c4, DCT_CONST_BITS), vrshrn_n_s64(c5, DCT_CONST_BITS)); b3 = vcombine_s32(vrshrn_n_s64(c6, DCT_CONST_BITS), vrshrn_n_s64(c7, DCT_CONST_BITS)); *a0 = vaddq_s32(b0, b3); *a1 = vaddq_s32(b1, b2); *a2 = vsubq_s32(b1, b2); *a3 = vsubq_s32(b0, b3); }
static inline void DotProductWithScaleNeon(int32_t* cross_correlation, const int16_t* vector1, const int16_t* vector2, size_t length, int scaling) { size_t i = 0; size_t len1 = length >> 3; size_t len2 = length & 7; int64x2_t sum0 = vdupq_n_s64(0); int64x2_t sum1 = vdupq_n_s64(0); for (i = len1; i > 0; i -= 1) { int16x8_t seq1_16x8 = vld1q_s16(vector1); int16x8_t seq2_16x8 = vld1q_s16(vector2); #if defined(WEBRTC_ARCH_ARM64) int32x4_t tmp0 = vmull_s16(vget_low_s16(seq1_16x8), vget_low_s16(seq2_16x8)); int32x4_t tmp1 = vmull_high_s16(seq1_16x8, seq2_16x8); #else int32x4_t tmp0 = vmull_s16(vget_low_s16(seq1_16x8), vget_low_s16(seq2_16x8)); int32x4_t tmp1 = vmull_s16(vget_high_s16(seq1_16x8), vget_high_s16(seq2_16x8)); #endif sum0 = vpadalq_s32(sum0, tmp0); sum1 = vpadalq_s32(sum1, tmp1); vector1 += 8; vector2 += 8; } // Calculate the rest of the samples. int64_t sum_res = 0; for (i = len2; i > 0; i -= 1) { sum_res += WEBRTC_SPL_MUL_16_16(*vector1, *vector2); vector1++; vector2++; } sum0 = vaddq_s64(sum0, sum1); #if defined(WEBRTC_ARCH_ARM64) int64_t sum2 = vaddvq_s64(sum0); *cross_correlation = (int32_t)((sum2 + sum_res) >> scaling); #else int64x1_t shift = vdup_n_s64(-scaling); int64x1_t sum2 = vadd_s64(vget_low_s64(sum0), vget_high_s64(sum0)); sum2 = vadd_s64(sum2, vdup_n_s64(sum_res)); sum2 = vshl_s64(sum2, shift); vst1_lane_s32(cross_correlation, vreinterpret_s32_s64(sum2), 0); #endif }