static inline void DotProductWithScaleNeon(int32_t* cross_correlation,
                                           const int16_t* vector1,
                                           const int16_t* vector2,
                                           size_t length,
                                           int scaling) {
  size_t i = 0;
  size_t len1 = length >> 3;
  size_t len2 = length & 7;
  int64x2_t sum0 = vdupq_n_s64(0);
  int64x2_t sum1 = vdupq_n_s64(0);

  for (i = len1; i > 0; i -= 1) {
    int16x8_t seq1_16x8 = vld1q_s16(vector1);
    int16x8_t seq2_16x8 = vld1q_s16(vector2);
#if defined(WEBRTC_ARCH_ARM64)
    int32x4_t tmp0 = vmull_s16(vget_low_s16(seq1_16x8),
                               vget_low_s16(seq2_16x8));
    int32x4_t tmp1 = vmull_high_s16(seq1_16x8, seq2_16x8);
#else
    int32x4_t tmp0 = vmull_s16(vget_low_s16(seq1_16x8),
                               vget_low_s16(seq2_16x8));
    int32x4_t tmp1 = vmull_s16(vget_high_s16(seq1_16x8),
                               vget_high_s16(seq2_16x8));
#endif
    sum0 = vpadalq_s32(sum0, tmp0);
    sum1 = vpadalq_s32(sum1, tmp1);
    vector1 += 8;
    vector2 += 8;
  }

  // Calculate the rest of the samples.
  int64_t sum_res = 0;
  for (i = len2; i > 0; i -= 1) {
    sum_res += WEBRTC_SPL_MUL_16_16(*vector1, *vector2);
    vector1++;
    vector2++;
  }

  sum0 = vaddq_s64(sum0, sum1);
#if defined(WEBRTC_ARCH_ARM64)
  int64_t sum2 = vaddvq_s64(sum0);
  *cross_correlation = (int32_t)((sum2 + sum_res) >> scaling);
#else
  int64x1_t shift = vdup_n_s64(-scaling);
  int64x1_t sum2 = vadd_s64(vget_low_s64(sum0), vget_high_s64(sum0));
  sum2 = vadd_s64(sum2, vdup_n_s64(sum_res));
  sum2 = vshl_s64(sum2, shift);
  vst1_lane_s32(cross_correlation, vreinterpret_s32_s64(sum2), 0);
#endif
}
void test_vpadalQs32 (void)
{
  int64x2_t out_int64x2_t;
  int64x2_t arg0_int64x2_t;
  int32x4_t arg1_int32x4_t;

  out_int64x2_t = vpadalq_s32 (arg0_int64x2_t, arg1_int32x4_t);
}
예제 #3
0
f64 dotProduct(const Size2D &_size,
               const s8 * src0Base, ptrdiff_t src0Stride,
               const s8 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }

// It is possible to accumulate up to 131071 schar multiplication results in sint32 without overflow
// We process 16 elements and accumulate two new elements per step. So we could handle 131071/2*16 elements
#define DOT_INT_BLOCKSIZE 131070*8
    f64 result = 0.0;
    for (size_t row = 0; row < size.height; ++row)
    {
        const s8 * src0 = internal::getRowPtr(src0Base, src0Stride, row);
        const s8 * src1 = internal::getRowPtr(src1Base, src1Stride, row);

        size_t i = 0;
        int64x2_t ws = vmovq_n_s64(0);

        while(i + 16 <= size.width)
        {
            size_t lim = std::min(i + DOT_UINT_BLOCKSIZE, size.width) - 16;

            int32x4_t s1 = vmovq_n_s32(0);
            int32x4_t s2 = vmovq_n_s32(0);

            for (; i <= lim; i += 16)
            {
                internal::prefetch(src0 + i);
                internal::prefetch(src1 + i);

                int8x16_t vs1 = vld1q_s8(src0 + i);
                int8x16_t vs2 = vld1q_s8(src1 + i);

                int16x8_t vdot1 = vmull_s8(vget_low_s8(vs1), vget_low_s8(vs2));
                int16x8_t vdot2 = vmull_s8(vget_high_s8(vs1), vget_high_s8(vs2));

                s1 = vpadalq_s16(s1, vdot1);
                s2 = vpadalq_s16(s2, vdot2);
            }

            ws = vpadalq_s32(ws, s1);
            ws = vpadalq_s32(ws, s2);
        }

        if(i + 8 <= size.width)
        {
            int8x8_t vs1 = vld1_s8(src0 + i);
            int8x8_t vs2 = vld1_s8(src1 + i);

            ws = vpadalq_s32(ws, vpaddlq_s16(vmull_s8(vs1, vs2)));
            i += 8;
        }

        result += (double)vget_lane_s64(vadd_s64(vget_low_s64(ws), vget_high_s64(ws)), 0);

        for (; i < size.width; ++i)
            result += s32(src0[i]) * s32(src1[i]);
    }
    return result;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0;
#endif
}