コード例 #1
0
uint64_t popcnt_neon_vcnt(const uint8_t* data, const size_t size)
{
    const size_t chunk_size = 16 * 4 * 2;

    uint8_t* ptr = const_cast<uint8_t*>(data);

    const size_t n = size / chunk_size;
    const size_t k = size % chunk_size;

    uint32x4_t sum = vcombine_u32(vcreate_u32(0), vcreate_u32(0));

    for (size_t i=0; i < n; i++, ptr += chunk_size) {

        uint8x16x4_t input0 = vld4q_u8(ptr + 0 * 16 * 4);
        uint8x16x4_t input1 = vld4q_u8(ptr + 1 * 16 * 4);

        uint8x16_t t0   = vcntq_u8(input0.val[0]);
        t0 = vaddq_u8(t0, vcntq_u8(input0.val[1]));
        t0 = vaddq_u8(t0, vcntq_u8(input0.val[2]));
        t0 = vaddq_u8(t0, vcntq_u8(input0.val[3]));

        t0 = vaddq_u8(t0, vcntq_u8(input1.val[0]));
        t0 = vaddq_u8(t0, vcntq_u8(input1.val[1]));
        t0 = vaddq_u8(t0, vcntq_u8(input1.val[2]));
        t0 = vaddq_u8(t0, vcntq_u8(input1.val[3]));

        const uint16x8_t t1 = vpaddlq_u8(t0);

        sum = vpadalq_u16(sum, t1);
    }

    uint32_t scalar = 0;
    uint32_t tmp[4];

    vst1q_u32(tmp, sum);
    for (int i=0; i < 4; i++) {
        scalar += tmp[i];
    }

    for (size_t j=0; j < k; j++) {
        scalar += lookup8bit[ptr[j]];
    }

    return scalar;
}
コード例 #2
0
ファイル: movdi_1.c プロジェクト: 0day-ci/gcc
void
foo1 (uint64_t *a)
{
  uint64x1_t val18;
  uint32x2_t val19;
  uint64x1_t val20;
  val19 = vcreate_u32 (0x800000004cf3dffbUL);
  val20 = vrsra_n_u64 (val18, vreinterpret_u64_u32 (val19), 34);
  vst1_u64 (a, val20);
}
コード例 #3
0
uint32x2_t test_vcreate_u32(uint64_t v1) {
  // CHECK: test_vcreate_u32
  return vcreate_u32(v1);
  // CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
}