/* Compile-only coverage for the vget_lane_u64 intrinsic: extract lane 0 of
   a one-element uint64x1_t vector.  The input is deliberately left
   uninitialized and the result is discarded -- only instruction selection
   is being exercised, not runtime behavior.  */
void test_vget_laneu64 (void)
{
  uint64_t extracted;
  uint64x1_t vec;

  extracted = vget_lane_u64 (vec, 0);
}
/* Execution test: vget_lane_u64 on lane 0 of a uint64x1_t must return the
   vector's single 64-bit element unchanged.  */
int
main (void)
{
  uint64_t extracted = 0;
  uint64x1_t vec = (uint64x1_t) 0xdeadbeefbadf00dLL;

  extracted = vget_lane_u64 (vec, 0);

  /* The scalar cast of the one-element vector yields the same 64-bit
     payload, so any mismatch means the lane extraction is broken.  */
  if (extracted != (uint64_t) vec)
    abort ();

  return 0;
}
int main (void) { uint64_t expected; uint64_t actual; float64x1_t arg1, arg2; int i, j; for (i = 0; i < SIZE; ++i) for (j = 0; j < SIZE; ++j) { expected = __builtin_fabs (in[i]) >= __builtin_fabs (in[j]) ? -1 : 0; arg1 = (float64x1_t) { in[i] }; arg2 = (float64x1_t) { in[j] }; actual = vget_lane_u64 (vcage_f64 (arg1, arg2), 0); if (actual != expected) abort (); } return 0; }
// Extract the single 64-bit lane of a uint64x1_t.  FileCheck verifies that
// the extraction lowers to one fmov from a D (vector) register to an X
// (general-purpose) register.
uint64_t test_vget_lane_u64(uint64x1_t v1) {
  // CHECK: test_vget_lane_u64
  return vget_lane_u64(v1, 0);
  // CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}}
}
// Returns true when any byte of the 16-byte vector is non-zero.
// The two 8-byte halves are OR-ed together first; the folded 64-bit word
// is then non-zero if and only if at least one input byte was non-zero.
inline bool isFound(uint8x16_t x)
{
    const uint8x8_t folded = vorr_u8(vget_low_u8(x), vget_high_u8(x));
    const uint64_t bits = vget_lane_u64(vreinterpret_u64_u8(folded), 0);
    return bits != 0;
}
// Assembly-level test: extracting lane 0 of a uint64x1_t must compile to a
// single fmov moving d0 (the vector argument) into x0 (the integer return
// register), immediately followed by ret -- no extra moves or spills.
uint64_t test_vget_lane_u64(uint64x1_t a) {
  // CHECK-LABEL: test_vget_lane_u64:
  // CHECK-NEXT: fmov x0, d0
  // CHECK-NEXT: ret
  return vget_lane_u64(a, 0);
}
/* Negative test: lane index -1 is below the valid range [0, 0] for a
   one-element vector, so the compiler must reject the call at compile time
   with the exact diagnostic matched by the dg-error directive below.
   The out-of-range index is intentional -- do not "fix" it.  */
uint64_t
test_vget_lane_u64_before (uint64x1_t in)
{
  /* { dg-error "lane -1 out of range 0 - 0" "" {target *-*-*} 0 } */
  return vget_lane_u64 (in, -1);
}
// CHECK-LABEL: define i64 @test_vget_lane_u64(<1 x i64> %a) #0 { // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> // CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0 // CHECK: ret i64 [[VGET_LANE]] uint64_t test_vget_lane_u64(uint64x1_t a) { return vget_lane_u64(a, 0); }
/* Computes the dot product of two 8-bit unsigned images as a double:
   sum over all pixels of src0[p] * src1[p].
   NEON path: widening multiplies accumulated through u16 -> u32 -> u64,
   with a blocking limit so the 32-bit accumulators cannot overflow.
   Falls back to returning 0 when CAROTENE_NEON is not defined.  */
f64 dotProduct(const Size2D &_size,
               const u8 * src0Base, ptrdiff_t src0Stride,
               const u8 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    /* When both images are dense (stride == row width in bytes), treat the
       whole buffer as a single long row to avoid per-row loop overhead.  */
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }

    // It is possible to accumulate up to 66051 uchar multiplication results in uint32 without overflow
    // We process 16 elements and accumulate two new elements per step. So we could handle 66051/2*16 elements
#define DOT_UINT_BLOCKSIZE 66050*8
    f64 result = 0.0;
    for (size_t row = 0; row < size.height; ++row)
    {
        const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, row);
        const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, row);

        size_t i = 0;
        /* 2 x 64-bit running total for this row; drained from the 32-bit
           accumulators at every block boundary.  */
        uint64x2_t ws = vmovq_n_u64(0);

        while(i + 16 <= size.width)
        {
            /* End of the current overflow-safe block (inclusive start of
               the last full 16-byte step inside it).  */
            size_t lim = std::min(i + DOT_UINT_BLOCKSIZE, size.width) - 16;

            uint32x4_t s1 = vmovq_n_u32(0);
            uint32x4_t s2 = vmovq_n_u32(0);

            for (; i <= lim; i += 16)
            {
                internal::prefetch(src0 + i);
                internal::prefetch(src1 + i);

                uint8x16_t vs1 = vld1q_u8(src0 + i);
                uint8x16_t vs2 = vld1q_u8(src1 + i);

                /* u8 x u8 -> u16 widening multiply of low and high halves,
                   then pairwise-add-accumulate into the u32 lanes.  */
                uint16x8_t vdot1 = vmull_u8(vget_low_u8(vs1), vget_low_u8(vs2));
                uint16x8_t vdot2 = vmull_u8(vget_high_u8(vs1), vget_high_u8(vs2));

                s1 = vpadalq_u16(s1, vdot1);
                s2 = vpadalq_u16(s2, vdot2);
            }

            /* Fold the block's u32 partial sums into the u64 total before
               they can overflow in the next block.  */
            ws = vpadalq_u32(ws, s1);
            ws = vpadalq_u32(ws, s2);
        }

        /* One 8-byte tail step, if at least 8 elements remain.  */
        if(i + 8 <= size.width)
        {
            uint8x8_t vs1 = vld1_u8(src0 + i);
            uint8x8_t vs2 = vld1_u8(src1 + i);

            ws = vpadalq_u32(ws, vpaddlq_u16(vmull_u8(vs1, vs2)));
            i += 8;
        }

        /* Reduce the two u64 lanes to one scalar and add to the result.
           NOTE(review): the u64 -> double conversion can lose precision for
           totals above 2^53 -- presumably acceptable for this API; confirm.  */
        result += (double)vget_lane_u64(vadd_u64(vget_low_u64(ws), vget_high_u64(ws)), 0);

        /* Scalar cleanup for the final 0-7 elements of the row.  */
        for (; i < size.width; ++i)
            result += s32(src0[i]) * s32(src1[i]);
    }
    return result;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0;
#endif
}