Example 1
/* Compile-only smoke test: the argument is deliberately left
   uninitialized and the result is unused, so only the intrinsic's
   types and instruction selection are exercised. */
void test_vget_highu64 (void)
{
  uint64x1_t out_uint64x1_t;
  uint64x2_t arg0_uint64x2_t;

  out_uint64x1_t = vget_high_u64 (arg0_uint64x2_t);
}
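
The test above only verifies that the call compiles. As a minimal runnable sketch of the intrinsic's semantics (the main function below is ours, assuming an ARM/AArch64 toolchain providing arm_neon.h):

#include <arm_neon.h>
#include <assert.h>
#include <stdint.h>

int main(void)
{
  uint64_t data[2] = { 1u, 2u };
  uint64x2_t v = vld1q_u64(data);    /* v = { 1, 2 } */
  uint64x1_t hi = vget_high_u64(v);  /* upper 64-bit lane */
  assert(vget_lane_u64(hi, 0) == 2u);
  return 0;
}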
Example 2
static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {
  const uint32x4_t a = vpaddlq_u16(v_16x8); // pairwise widening add: u16 -> u32
  const uint64x2_t b = vpaddlq_u32(a);      // pairwise widening add: u32 -> u64
  // Each u64 lane holds a sum of four u16 values, so it fits in 32 bits;
  // adding the two lanes as reinterpreted u32 pairs leaves the total in lane 0.
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                vreinterpret_u32_u64(vget_high_u64(b)));
  return vget_lane_u32(c, 0);
}
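
A side note on the design: the two-step pairwise widening is what 32-bit ARM requires. On AArch64 the same widening reduction exists as a single intrinsic; a sketch under that assumption (the _a64 name is invented here):

#include <arm_neon.h>

/* AArch64 only: vaddlvq_u16 widens each u16 lane and sums all eight
 * at once, matching the vpaddlq_u16/vpaddlq_u32 chain above. */
static inline unsigned int horizontal_add_u16x8_a64(const uint16x8_t v) {
  return vaddlvq_u16(v);
}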
Example 3
uint64x1_t test_vget_high_u64(uint64x2_t a) {
  // CHECK-LABEL: test_vget_high_u64:
  return vget_high_u64(a);
  // CHECK: dup d0, {{v[0-9]+}}.d[1]
}
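
This is a compiler codegen test: the CHECK lines are LLVM FileCheck directives, asserting that the call lowers to a dup that copies lane 1 (the upper 64 bits) of the source vector into d0.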
Example 4
uint64x1_t test_vget_high_u64(uint64x2_t a) {
  // CHECK-COMMON-LABEL: test_vget_high_u64:
  return vget_high_u64(a);
  // CHECK-AARCH64: dup d0, {{v[0-9]+}}.d[1]
  // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
}
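
The same test with per-target FileCheck prefixes. Historically clang carried two 64-bit ARM backends, and they emit different but equivalent code: a dup of lane 1, or an ext that rotates the 16-byte vector by 8 bytes; either way the upper half ends up in the low 64 bits of v0.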
Example 5
OD_SIMD_INLINE uint64x2x2_t od_vswpq_u64(uint64x2_t a, uint64x2_t b) {
  uint64x2x2_t x;
  /* Despite the "swap" name, this is a 2x2 transpose of 64-bit lanes:
     val[0] = { a.lo, b.lo } and val[1] = { a.hi, b.hi }. */
  x.val[0] = vcombine_u64(vget_low_u64(a), vget_low_u64(b));
  x.val[1] = vcombine_u64(vget_high_u64(a), vget_high_u64(b));
  return x;
}
Example 6
f64 dotProduct(const Size2D &_size,
               const u8 * src0Base, ptrdiff_t src0Stride,
               const u8 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }

// Each u8*u8 product is at most 255*255 = 65025, and 66051*65025 still fits
// in a uint32, so up to 66051 products can be accumulated per u32 lane
// without overflow. Each iteration processes 16 elements and adds 2 products
// to each u32 lane, so one block may cover 66051/2 = 33025 iterations,
// i.e. 33025*16 = 66050*8 elements.
#define DOT_UINT_BLOCKSIZE 66050*8
    f64 result = 0.0;
    for (size_t row = 0; row < size.height; ++row)
    {
        const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, row);
        const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, row);

        size_t i = 0;
        uint64x2_t ws = vmovq_n_u64(0);

        while(i + 16 <= size.width)
        {
            size_t lim = std::min(i + DOT_UINT_BLOCKSIZE, size.width) - 16;

            uint32x4_t s1 = vmovq_n_u32(0);
            uint32x4_t s2 = vmovq_n_u32(0);

            for (; i <= lim; i += 16)
            {
                internal::prefetch(src0 + i);
                internal::prefetch(src1 + i);

                uint8x16_t vs1 = vld1q_u8(src0 + i);
                uint8x16_t vs2 = vld1q_u8(src1 + i);

                uint16x8_t vdot1 = vmull_u8(vget_low_u8(vs1), vget_low_u8(vs2));
                uint16x8_t vdot2 = vmull_u8(vget_high_u8(vs1), vget_high_u8(vs2));

                s1 = vpadalq_u16(s1, vdot1);
                s2 = vpadalq_u16(s2, vdot2);
            }

            ws = vpadalq_u32(ws, s1);
            ws = vpadalq_u32(ws, s2);
        }

        // tail: handle one remaining group of 8 with a single widening multiply
        if(i + 8 <= size.width)
        {
            uint8x8_t vs1 = vld1_u8(src0 + i);
            uint8x8_t vs2 = vld1_u8(src1 + i);

            ws = vpadalq_u32(ws, vpaddlq_u16(vmull_u8(vs1, vs2)));
            i += 8;
        }

        // fold the two u64 accumulator lanes and flush them into the result
        result += (double)vget_lane_u64(vadd_u64(vget_low_u64(ws), vget_high_u64(ws)), 0);

        for (; i < size.width; ++i)
            result += s32(src0[i]) * s32(src1[i]);
    }
    return result;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0;
#endif
}
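
Stripped of the carotene scaffolding, the core pattern above is short. A hedged, self-contained sketch (dot_u8 is our own name; it assumes n stays within one overflow block, i.e. n <= 66050*8, which the real code guarantees by re-blocking):

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

static uint64_t dot_u8(const uint8_t *a, const uint8_t *b, size_t n)
{
    uint32x4_t s1 = vmovq_n_u32(0);   /* u32 partial sums, low half  */
    uint32x4_t s2 = vmovq_n_u32(0);   /* u32 partial sums, high half */
    uint64x2_t ws = vmovq_n_u64(0);
    size_t i = 0;
    for (; i + 16 <= n; i += 16) {
        uint8x16_t va = vld1q_u8(a + i);
        uint8x16_t vb = vld1q_u8(b + i);
        /* widening multiply u8*u8 -> u16, then pairwise-accumulate into u32 */
        s1 = vpadalq_u16(s1, vmull_u8(vget_low_u8(va), vget_low_u8(vb)));
        s2 = vpadalq_u16(s2, vmull_u8(vget_high_u8(va), vget_high_u8(vb)));
    }
    ws = vpadalq_u32(ws, s1);         /* flush u32 partials into u64 */
    ws = vpadalq_u32(ws, s2);
    uint64_t sum =
        vget_lane_u64(vadd_u64(vget_low_u64(ws), vget_high_u64(ws)), 0);
    for (; i < n; ++i)                /* scalar tail */
        sum += (uint64_t)a[i] * b[i];
    return sum;
}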
Example 7
// CHECK-LABEL: define <1 x i64> @test_vget_high_u64(<2 x i64> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
// CHECK:   ret <1 x i64> [[SHUFFLE_I]]
uint64x1_t test_vget_high_u64(uint64x2_t a) {
  return vget_high_u64(a);
}
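
At the IR level the intrinsic is nothing but a shufflevector whose one-element mask selects lane 1; the dup and ext instructions seen in Examples 3 and 4 are the backends' lowerings of exactly this shuffle.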