static uint64_t SharpYUVUpdateY_NEON(const uint16_t* ref, const uint16_t* src, uint16_t* dst, int len) { int i; const int16x8_t zero = vdupq_n_s16(0); const int16x8_t max = vdupq_n_s16(MAX_Y); uint64x2_t sum = vdupq_n_u64(0); uint64_t diff; for (i = 0; i + 8 <= len; i += 8) { const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i)); const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i)); const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i)); const int16x8_t D = vsubq_s16(A, B); // diff_y const int16x8_t F = vaddq_s16(C, D); // new_y const uint16x8_t H = vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero)); const int16x8_t I = vabsq_s16(D); // abs(diff_y) vst1q_u16(dst + i, H); sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I))); } diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1); for (; i < len; ++i) { const int diff_y = ref[i] - src[i]; const int new_y = (int)(dst[i]) + diff_y; dst[i] = clip_y(new_y); diff += (uint64_t)(abs(diff_y)); } return diff; }
void test_vpadalQu32 (void) { uint64x2_t out_uint64x2_t; uint64x2_t arg0_uint64x2_t; uint32x4_t arg1_uint32x4_t; out_uint64x2_t = vpadalq_u32 (arg0_uint64x2_t, arg1_uint32x4_t); }
/*
 * Dot product of two u8 images: the sum over every row and column of
 * src0[row][col] * src1[row][col], returned as f64.
 *
 * _size       width/height of both images, in elements
 * src0Base    first image; rows are located via internal::getRowPtr
 * src0Stride  stride passed to internal::getRowPtr for the first image
 * src1Base    second image
 * src1Stride  stride passed to internal::getRowPtr for the second image
 *
 * internal::assertSupportedConfiguration() is always called first. When
 * CAROTENE_NEON is not defined the arguments are ignored and 0 is returned.
 */
f64 dotProduct(const Size2D &_size,
               const u8 * src0Base, ptrdiff_t src0Stride,
               const u8 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(size.width))
    {
        // Both strides equal the width, so the whole image is walked as a
        // single long row.
        size.width *= size.height;
        size.height = 1;
    }

// It is possible to accumulate up to 66051 uchar multiplication results in uint32 without overflow
// (each product is at most 255*255 = 65025).
// We process 16 elements and accumulate two new elements per uint32 lane per step,
// so one block of 66050*8 elements is 33025 steps, i.e. at most 66050 products per lane.
// The replacement list is parenthesized so the macro expands as a single
// value regardless of the operators around it.
#define DOT_UINT_BLOCKSIZE (66050*8)
    f64 result = 0.0;
    for (size_t row = 0; row < size.height; ++row)
    {
        const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, row);
        const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, row);

        size_t i = 0;
        // 64-bit accumulator for the whole row.
        uint64x2_t ws = vmovq_n_u64(0);

        while(i + 16 <= size.width)
        {
            // Last start index of a full 16-element step inside this block;
            // the enclosing condition guarantees the min() is at least 16.
            size_t lim = std::min(i + DOT_UINT_BLOCKSIZE, size.width) - 16;

            // 32-bit partial sums, restarted for every block so they stay
            // within the overflow bound described above.
            uint32x4_t s1 = vmovq_n_u32(0);
            uint32x4_t s2 = vmovq_n_u32(0);

            for (; i <= lim; i += 16)
            {
                internal::prefetch(src0 + i);
                internal::prefetch(src1 + i);

                uint8x16_t vs1 = vld1q_u8(src0 + i);
                uint8x16_t vs2 = vld1q_u8(src1 + i);

                // Widening 8x8 -> 16 bit products of the low and high halves.
                uint16x8_t vdot1 = vmull_u8(vget_low_u8(vs1), vget_low_u8(vs2));
                uint16x8_t vdot2 = vmull_u8(vget_high_u8(vs1), vget_high_u8(vs2));

                // Pairwise add the 16-bit products into the 32-bit lanes.
                s1 = vpadalq_u16(s1, vdot1);
                s2 = vpadalq_u16(s2, vdot2);
            }

            // Flush the block's 32-bit sums into the 64-bit row accumulator.
            ws = vpadalq_u32(ws, s1);
            ws = vpadalq_u32(ws, s2);
        }

        // At most one remaining group of 8 elements.
        if(i + 8 <= size.width)
        {
            uint8x8_t vs1 = vld1_u8(src0 + i);
            uint8x8_t vs2 = vld1_u8(src1 + i);

            ws = vpadalq_u32(ws, vpaddlq_u16(vmull_u8(vs1, vs2)));
            i += 8;
        }

        // Add the two 64-bit lanes together and fold them into the result.
        result += (double)vget_lane_u64(vadd_u64(vget_low_u64(ws), vget_high_u64(ws)), 0);

        // Scalar tail: fewer than 8 elements left in the row.
        for (; i < size.width; ++i)
            result += s32(src0[i]) * s32(src1[i]);
    }
    return result;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0;
#endif
}