// Add two pairs of 64-bit lanes, apply the DCT rounding shift and narrow the
// results back into a single vector of four 32-bit lanes.
static INLINE int32x4_t add_dct_const_round_shift_low_8_bd12(
    const int64x2_t *const in0, const int64x2_t *const in1) {
  const int64x2_t sum_lo = vaddq_s64(in0[0], in1[0]);
  const int64x2_t sum_hi = vaddq_s64(in0[1], in1[1]);
  const int32x2_t out_lo = vrshrn_n_s64(sum_lo, DCT_CONST_BITS);
  const int32x2_t out_hi = vrshrn_n_s64(sum_hi, DCT_CONST_BITS);
  return vcombine_s32(out_lo, out_hi);
}
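
For reference, a scalar model of one output lane of the helper above. This is an illustrative sketch, assuming DCT_CONST_BITS == 14 (as in libvpx) and an arithmetic right shift for negative values; the function name is made up.

// vrshrn_n_s64 adds the rounding constant, shifts right by DCT_CONST_BITS and
// keeps the low 32 bits of each 64-bit lane.
static int32_t add_dct_const_round_shift_scalar(int64_t a, int64_t b) {
  const int64_t sum = a + b;
  return (int32_t)((sum + (1 << (14 - 1))) >> 14);  // 14 == DCT_CONST_BITS
}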
Example #2
int64_t av1_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff,
                                int block_size) {
  int64x2_t error = vdupq_n_s64(0);

  assert(block_size >= 8);
  assert((block_size % 8) == 0);

  do {
    const int16x8_t c = vld1q_s16(coeff);
    const int16x8_t d = vld1q_s16(dqcoeff);
    const int16x8_t diff = vsubq_s16(c, d);
    const int16x4_t diff_lo = vget_low_s16(diff);
    const int16x4_t diff_hi = vget_high_s16(diff);
    // diff is at most 15 bits, so each square is at most 30 bits and two
    // squares fit in 31 bits before being widened into the 64-bit accumulator.
    const int32x4_t err0 = vmull_s16(diff_lo, diff_lo);
    const int32x4_t err1 = vmlal_s16(err0, diff_hi, diff_hi);
    const int64x2_t err2 = vaddl_s32(vget_low_s32(err1), vget_high_s32(err1));
    error = vaddq_s64(error, err2);
    coeff += 8;
    dqcoeff += 8;
    block_size -= 8;
  } while (block_size != 0);

  return vgetq_lane_s64(error, 0) + vgetq_lane_s64(error, 1);
}
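
For reference, the same computation in plain C: a sum of squared differences between the quantized and dequantized coefficients. This scalar sketch is illustrative and not taken from the library.

static int64_t block_error_fp_scalar(const int16_t *coeff,
                                     const int16_t *dqcoeff, int block_size) {
  int64_t error = 0;
  for (int i = 0; i < block_size; ++i) {
    // Same per-element work as the NEON loop: widen, square, accumulate.
    const int32_t diff = coeff[i] - dqcoeff[i];
    error += (int64_t)diff * diff;
  }
  return error;
}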
Example #3
/* Compile-only check: the arguments are intentionally left uninitialized,
   since only code generation for vaddq_s64 is being exercised.  */
void test_vaddQs64 (void)
{
  int64x2_t out_int64x2_t;
  int64x2_t arg0_int64x2_t;
  int64x2_t arg1_int64x2_t;

  out_int64x2_t = vaddq_s64 (arg0_int64x2_t, arg1_int64x2_t);
}
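
The test above only verifies that the intrinsic compiles. A self-contained sketch that also runs it and prints the per-lane sums (the main function and the sample values are illustrative additions, not part of the test):

#include <arm_neon.h>
#include <stdio.h>

int main(void) {
  const int64_t a_vals[2] = { 1, -2 };
  const int64_t b_vals[2] = { 10, 20 };
  const int64x2_t a = vld1q_s64(a_vals);
  const int64x2_t b = vld1q_s64(b_vals);
  // vaddq_s64 adds the two 64-bit lanes independently (no saturation).
  const int64x2_t sum = vaddq_s64(a, b);
  printf("%lld %lld\n", (long long)vgetq_lane_s64(sum, 0),
         (long long)vgetq_lane_s64(sum, 1));  // prints: 11 18
  return 0;
}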
Example #4
static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis,
                                          int32x4_t *const a0,
                                          int32x4_t *const a1,
                                          int32x4_t *const a2,
                                          int32x4_t *const a3) {
  int32x4_t b0, b1, b2, b3;
  int64x2_t c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11;

  transpose_s32_4x4(a0, a1, a2, a3);
  b0 = vaddq_s32(*a0, *a2);
  b1 = vsubq_s32(*a0, *a2);
  c0 = vmull_lane_s32(vget_low_s32(b0), vget_high_s32(cospis), 0);
  c1 = vmull_lane_s32(vget_high_s32(b0), vget_high_s32(cospis), 0);
  c2 = vmull_lane_s32(vget_low_s32(b1), vget_high_s32(cospis), 0);
  c3 = vmull_lane_s32(vget_high_s32(b1), vget_high_s32(cospis), 0);
  c4 = vmull_lane_s32(vget_low_s32(*a1), vget_high_s32(cospis), 1);
  c5 = vmull_lane_s32(vget_high_s32(*a1), vget_high_s32(cospis), 1);
  c6 = vmull_lane_s32(vget_low_s32(*a1), vget_low_s32(cospis), 1);
  c7 = vmull_lane_s32(vget_high_s32(*a1), vget_low_s32(cospis), 1);
  c8 = vmull_lane_s32(vget_low_s32(*a3), vget_low_s32(cospis), 1);
  c9 = vmull_lane_s32(vget_high_s32(*a3), vget_low_s32(cospis), 1);
  c10 = vmull_lane_s32(vget_low_s32(*a3), vget_high_s32(cospis), 1);
  c11 = vmull_lane_s32(vget_high_s32(*a3), vget_high_s32(cospis), 1);
  c4 = vsubq_s64(c4, c8);
  c5 = vsubq_s64(c5, c9);
  c6 = vaddq_s64(c6, c10);
  c7 = vaddq_s64(c7, c11);
  b0 = vcombine_s32(vrshrn_n_s64(c0, DCT_CONST_BITS),
                    vrshrn_n_s64(c1, DCT_CONST_BITS));
  b1 = vcombine_s32(vrshrn_n_s64(c2, DCT_CONST_BITS),
                    vrshrn_n_s64(c3, DCT_CONST_BITS));
  b2 = vcombine_s32(vrshrn_n_s64(c4, DCT_CONST_BITS),
                    vrshrn_n_s64(c5, DCT_CONST_BITS));
  b3 = vcombine_s32(vrshrn_n_s64(c6, DCT_CONST_BITS),
                    vrshrn_n_s64(c7, DCT_CONST_BITS));
  *a0 = vaddq_s32(b0, b3);
  *a1 = vaddq_s32(b1, b2);
  *a2 = vsubq_s32(b1, b2);
  *a3 = vsubq_s32(b0, b3);
}
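
For orientation, the kernel above is a vectorized 4-point inverse DCT butterfly applied to transposed columns. A scalar sketch follows; the cosine constants and the assumed lane layout of cospis follow libvpx conventions, and the helper names are illustrative.

#include <stdint.h>

#define DCT_CONST_BITS 14
static const int64_t cospi_8_64 = 15137;
static const int64_t cospi_16_64 = 11585;
static const int64_t cospi_24_64 = 6270;

static int64_t dct_const_round_shift(int64_t x) {
  return (x + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS;
}

// One column of the 4-point IDCT: two butterflies, a rounding shift and the
// final add/subtract stage, mirroring b0..b3 and the *a0..*a3 outputs above.
static void idct4_column_sketch(const int32_t in[4], int32_t out[4]) {
  const int64_t s0 = ((int64_t)in[0] + in[2]) * cospi_16_64;
  const int64_t s1 = ((int64_t)in[0] - in[2]) * cospi_16_64;
  const int64_t s2 = (int64_t)in[1] * cospi_24_64 - (int64_t)in[3] * cospi_8_64;
  const int64_t s3 = (int64_t)in[1] * cospi_8_64 + (int64_t)in[3] * cospi_24_64;
  const int32_t t0 = (int32_t)dct_const_round_shift(s0);
  const int32_t t1 = (int32_t)dct_const_round_shift(s1);
  const int32_t t2 = (int32_t)dct_const_round_shift(s2);
  const int32_t t3 = (int32_t)dct_const_round_shift(s3);
  out[0] = t0 + t3;
  out[1] = t1 + t2;
  out[2] = t1 - t2;
  out[3] = t0 - t3;
}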
static inline void DotProductWithScaleNeon(int32_t* cross_correlation,
                                           const int16_t* vector1,
                                           const int16_t* vector2,
                                           size_t length,
                                           int scaling) {
  size_t i = 0;
  size_t len1 = length >> 3;
  size_t len2 = length & 7;
  int64x2_t sum0 = vdupq_n_s64(0);
  int64x2_t sum1 = vdupq_n_s64(0);

  for (i = len1; i > 0; i -= 1) {
    int16x8_t seq1_16x8 = vld1q_s16(vector1);
    int16x8_t seq2_16x8 = vld1q_s16(vector2);
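    // AArch64 provides vmull_high_s16 to multiply the high halves directly;
    // the ARMv7 path extracts them with vget_high_s16 first.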
#if defined(WEBRTC_ARCH_ARM64)
    int32x4_t tmp0 = vmull_s16(vget_low_s16(seq1_16x8),
                               vget_low_s16(seq2_16x8));
    int32x4_t tmp1 = vmull_high_s16(seq1_16x8, seq2_16x8);
#else
    int32x4_t tmp0 = vmull_s16(vget_low_s16(seq1_16x8),
                               vget_low_s16(seq2_16x8));
    int32x4_t tmp1 = vmull_s16(vget_high_s16(seq1_16x8),
                               vget_high_s16(seq2_16x8));
#endif
    sum0 = vpadalq_s32(sum0, tmp0);
    sum1 = vpadalq_s32(sum1, tmp1);
    vector1 += 8;
    vector2 += 8;
  }

  // Calculate the rest of the samples.
  int64_t sum_res = 0;
  for (i = len2; i > 0; i -= 1) {
    sum_res += WEBRTC_SPL_MUL_16_16(*vector1, *vector2);
    vector1++;
    vector2++;
  }

  sum0 = vaddq_s64(sum0, sum1);
#if defined(WEBRTC_ARCH_ARM64)
  int64_t sum2 = vaddvq_s64(sum0);
  *cross_correlation = (int32_t)((sum2 + sum_res) >> scaling);
#else
  int64x1_t shift = vdup_n_s64(-scaling);
  int64x1_t sum2 = vadd_s64(vget_low_s64(sum0), vget_high_s64(sum0));
  sum2 = vadd_s64(sum2, vdup_n_s64(sum_res));
  sum2 = vshl_s64(sum2, shift);
  vst1_lane_s32(cross_correlation, vreinterpret_s32_s64(sum2), 0);
#endif
}
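
For comparison, the scalar computation this routine accelerates is a plain dot product followed by a right shift. A minimal sketch, assuming the same argument semantics; the function name is illustrative.

#include <stddef.h>
#include <stdint.h>

static void DotProductWithScaleC(int32_t* cross_correlation,
                                 const int16_t* vector1,
                                 const int16_t* vector2,
                                 size_t length,
                                 int scaling) {
  int64_t sum = 0;
  for (size_t i = 0; i < length; ++i) {
    // Same product as WEBRTC_SPL_MUL_16_16 in the tail loop above.
    sum += (int32_t)vector1[i] * vector2[i];
  }
  *cross_correlation = (int32_t)(sum >> scaling);
}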