/* Compile-only intrinsics test for vget_high_u64: verifies that the
   intrinsic accepts a uint64x2_t and yields a uint64x1_t.  The argument
   is deliberately left uninitialized -- only type checking and code
   generation are exercised, the result is never inspected. */
void test_vget_highu64 (void)
{
  uint64x1_t out_uint64x1_t;
  uint64x2_t arg0_uint64x2_t;

  out_uint64x1_t = vget_high_u64 (arg0_uint64x2_t);
}
// Sum all eight u16 lanes of a vector and return the total as a scalar.
// Widening pairwise adds (u16->u32->u64) keep every intermediate sum
// overflow-free; the final vadd_u32 folds the two 64-bit halves together
// (their sum fits in 32 bits), and lane 0 holds the result.
static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {
  const uint64x2_t sum64 = vpaddlq_u32(vpaddlq_u16(v_16x8));
  const uint32x2_t folded = vadd_u32(vreinterpret_u32_u64(vget_low_u64(sum64)),
                                     vreinterpret_u32_u64(vget_high_u64(sum64)));
  return vget_lane_u32(folded, 0);
}
// FileCheck codegen test: vget_high_u64 on AArch64 should lower to a
// single lane-duplicating move of the high doubleword.
uint64x1_t test_vget_high_u64(uint64x2_t a) {
  // CHECK-LABEL: test_vget_high_u64:
  return vget_high_u64(a);
  // CHECK: dup d0, {{v[0-9]+}}.d[1]
}
// FileCheck codegen test with per-target prefixes: the same source is
// checked against two backends, which select different but equivalent
// instructions for extracting the high 64-bit half.
uint64x1_t test_vget_high_u64(uint64x2_t a) {
  // CHECK-COMMON-LABEL: test_vget_high_u64:
  return vget_high_u64(a);
  // CHECK-AARCH64: dup d0, {{v[0-9]+}}.d[1]
  // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
}
/* Regroup the 64-bit halves of two vectors:
   out.val[0] = { low(a),  low(b)  }
   out.val[1] = { high(a), high(b) }
   i.e. a column-to-row transpose of the 2x2 matrix of u64 lanes. */
OD_SIMD_INLINE uint64x2x2_t od_vswpq_u64(uint64x2_t a, uint64x2_t b) {
  uint64x2x2_t out;
  out.val[0] = vcombine_u64(vget_low_u64(a), vget_low_u64(b));
  out.val[1] = vcombine_u64(vget_high_u64(a), vget_high_u64(b));
  return out;
}
/*
 * Dot product of two u8 images: sum over all pixels of src0[i] * src1[i],
 * accumulated into a double.
 *
 * Parameters:
 *   _size       - image dimensions (width x height)
 *   src0Base/src1Base   - base pointers of the two operands
 *   src0Stride/src1Stride - row strides in bytes
 *
 * Returns the accumulated dot product, or 0 when built without NEON.
 */
f64 dotProduct(const Size2D &_size,
               const u8 * src0Base, ptrdiff_t src0Stride,
               const u8 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // When both operands are stored contiguously (stride == width) the
    // whole image can be treated as a single long row.
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }

    // It is possible to accumulate up to 66051 uchar multiplication results in uint32 without overflow
    // We process 16 elements and accumulate two new elements per step. So we could handle 66051/2*16 elements
#define DOT_UINT_BLOCKSIZE 66050*8
    f64 result = 0.0;
    for (size_t row = 0; row < size.height; ++row)
    {
        const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, row);
        const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, row);

        size_t i = 0;
        // 64-bit accumulator for the whole row (widened from the u32 block sums).
        uint64x2_t ws = vmovq_n_u64(0);

        // Main loop: 16 pixels per iteration, in blocks small enough that
        // the u32 accumulators s1/s2 cannot overflow (see note above).
        while(i + 16 <= size.width)
        {
            size_t lim = std::min(i + DOT_UINT_BLOCKSIZE, size.width) - 16;

            uint32x4_t s1 = vmovq_n_u32(0);
            uint32x4_t s2 = vmovq_n_u32(0);

            for (; i <= lim; i += 16)
            {
                internal::prefetch(src0 + i);
                internal::prefetch(src1 + i);

                uint8x16_t vs1 = vld1q_u8(src0 + i);
                uint8x16_t vs2 = vld1q_u8(src1 + i);

                // u8 * u8 -> u16 products for the low and high 8 lanes.
                uint16x8_t vdot1 = vmull_u8(vget_low_u8(vs1), vget_low_u8(vs2));
                uint16x8_t vdot2 = vmull_u8(vget_high_u8(vs1), vget_high_u8(vs2));

                // Pairwise widen-accumulate the u16 products into u32 sums.
                s1 = vpadalq_u16(s1, vdot1);
                s2 = vpadalq_u16(s2, vdot2);
            }

            // Drain the block sums into the overflow-safe u64 accumulator.
            ws = vpadalq_u32(ws, s1);
            ws = vpadalq_u32(ws, s2);
        }

        // Tail: one extra vector of 8 pixels, if available.
        if(i + 8 <= size.width)
        {
            uint8x8_t vs1 = vld1_u8(src0 + i);
            uint8x8_t vs2 = vld1_u8(src1 + i);

            ws = vpadalq_u32(ws, vpaddlq_u16(vmull_u8(vs1, vs2)));
            i += 8;
        }

        // Fold the two u64 lanes together and add the row total to the result.
        result += (double)vget_lane_u64(vadd_u64(vget_low_u64(ws), vget_high_u64(ws)), 0);

        // Scalar tail for the remaining (< 8) pixels.
        for (; i < size.width; ++i)
            result += s32(src0[i]) * s32(src1[i]);
    }
    return result;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0;
#endif
}
// CHECK-LABEL: define <1 x i64> @test_vget_high_u64(<2 x i64> %a) #0 { // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1> // CHECK: ret <1 x i64> [[SHUFFLE_I]] uint64x1_t test_vget_high_u64(uint64x2_t a) { return vget_high_u64(a); }