Example 1
0
void test_vmovQ_nu64 (void)
{
  uint64x2_t out_uint64x2_t;
  uint64_t arg0_uint64_t;

  out_uint64x2_t = vmovq_n_u64 (arg0_uint64_t);
}
Example 2
0
// Broadcasts the 64-bit scalar v1 into both lanes of a uint64x2_t.
// NOTE: the "// CHECK:" lines below are FileCheck directives consumed by
// the test harness — they pin the generated code to a "dup Vd.2d, Xn"
// instruction. Do not edit, reorder, or reflow them.
uint64x2_t test_vmovq_n_u64(uint64_t v1) {
  // CHECK: test_vmovq_n_u64
  return vmovq_n_u64(v1);
  // CHECK: dup {{v[0-9]+}}.2d, {{x[0-9]+}}
}
Example 3
0
/// Dot product of two u8 images: sum over all pixels of src0[i] * src1[i],
/// returned as f64. On NEON builds the products are accumulated in widening
/// SIMD lanes (u8*u8 -> u16 -> u32 -> u64); without CAROTENE_NEON the
/// function returns 0 after the configuration assertion.
f64 dotProduct(const Size2D &_size,
               const u8 * src0Base, ptrdiff_t src0Stride,
               const u8 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // If both images are dense (stride == row width in bytes), flatten to a
    // single long row so the SIMD loops run without per-row setup overhead.
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }

// It is possible to accumulate up to 66051 uchar multiplication results in uint32 without overflow
// We process 16 elements and accumulate two new elements per step. So we could handle 66051/2*16 elements
#define DOT_UINT_BLOCKSIZE 66050*8
    f64 result = 0.0;
    for (size_t row = 0; row < size.height; ++row)
    {
        const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, row);
        const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, row);

        size_t i = 0;
        uint64x2_t ws = vmovq_n_u64(0);  // two 64-bit running sums for this row

        while(i + 16 <= size.width)
        {
            // End of the current overflow-safe block: the u32 accumulators
            // are flushed into ws before they can overflow (see the
            // DOT_UINT_BLOCKSIZE bound above). lim is the start index of the
            // last full 16-byte chunk in this block.
            size_t lim = std::min(i + DOT_UINT_BLOCKSIZE, size.width) - 16;

            uint32x4_t s1 = vmovq_n_u32(0);
            uint32x4_t s2 = vmovq_n_u32(0);

            for (; i <= lim; i += 16)
            {
                internal::prefetch(src0 + i);
                internal::prefetch(src1 + i);

                uint8x16_t vs1 = vld1q_u8(src0 + i);
                uint8x16_t vs2 = vld1q_u8(src1 + i);

                // Widening u8*u8 -> u16 multiplies for the low and high
                // 8 bytes of each 16-byte chunk.
                uint16x8_t vdot1 = vmull_u8(vget_low_u8(vs1), vget_low_u8(vs2));
                uint16x8_t vdot2 = vmull_u8(vget_high_u8(vs1), vget_high_u8(vs2));

                // Pairwise-accumulate the u16 products into u32 lanes.
                s1 = vpadalq_u16(s1, vdot1);
                s2 = vpadalq_u16(s2, vdot2);
            }

            // Flush the u32 block sums into the 64-bit row accumulators.
            ws = vpadalq_u32(ws, s1);
            ws = vpadalq_u32(ws, s2);
        }

        // SIMD tail: one 8-byte step if at least 8 elements remain.
        if(i + 8 <= size.width)
        {
            uint8x8_t vs1 = vld1_u8(src0 + i);
            uint8x8_t vs2 = vld1_u8(src1 + i);

            ws = vpadalq_u32(ws, vpaddlq_u16(vmull_u8(vs1, vs2)));
            i += 8;
        }

        // Reduce the two 64-bit lanes of ws and fold into the f64 total.
        result += (double)vget_lane_u64(vadd_u64(vget_low_u64(ws), vget_high_u64(ws)), 0);

        // Scalar tail for the remaining (< 8) elements of the row.
        for (; i < size.width; ++i)
            result += s32(src0[i]) * s32(src1[i]);
    }
    return result;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0;
#endif
}