// Counts bits over an array of 512-bit vectors using a tree of CSA steps.
//
// data: pointer to `size` consecutive __m512i values (indexed data[0..size-1]).
// size: number of __m512i elements, not bytes.
// Returns the value of avx2_sum_epu64() applied to the 256-bit accumulator.
//
// NOTE(review): CSA, popcount and avx2_sum_epu64 are defined elsewhere.
// Presumably CSA(&hi, &lo, a, b, c) is a bitwise carry-save adder
// (lo = sum bits, hi = carry bits), popcount() yields per-64-bit-lane bit
// counts, and avx2_sum_epu64() adds the four 64-bit lanes -- confirm against
// their definitions.
static uint64_t popcnt_harley_seal(const __m512i* data, const uint64_t size)
{
  // 256-bit running total; popcount() results are added to it with
  // _mm256_add_epi64, so popcount(__m512i) must produce a __m256i.
  __m256i total     = _mm256_setzero_si256();
  // Residual counters carried across iterations. Their weights (1, 2, 4, 8)
  // are applied after the main loop; `sixteens` is consumed every iteration.
  __m512i ones      = _mm512_setzero_si512();
  __m512i twos      = _mm512_setzero_si512();
  __m512i fours     = _mm512_setzero_si512();
  __m512i eights    = _mm512_setzero_si512();
  __m512i sixteens  = _mm512_setzero_si512();
  // Per-iteration temporaries holding the two halves that feed the next level.
  __m512i twosA, twosB, foursA, foursB, eightsA, eightsB;

  // Largest multiple of 16 not exceeding size: the main loop consumes
  // 16 vectors per iteration.
  const uint64_t limit = size - size % 16;
  uint64_t i = 0;

  for(; i < limit; i += 16)
  {
    // Statement order matters: each level reuses twosA/twosB (and
    // foursA/foursB) as scratch, so every pair must be consumed by the next
    // level before it is overwritten.
    CSA(&twosA, &ones, ones, data[i+0], data[i+1]);
    CSA(&twosB, &ones, ones, data[i+2], data[i+3]);
    CSA(&foursA, &twos, twos, twosA, twosB);
    CSA(&twosA, &ones, ones, data[i+4], data[i+5]);
    CSA(&twosB, &ones, ones, data[i+6], data[i+7]);
    CSA(&foursB, &twos, twos, twosA, twosB);
    CSA(&eightsA,&fours, fours, foursA, foursB);
    CSA(&twosA, &ones, ones, data[i+8], data[i+9]);
    CSA(&twosB, &ones, ones, data[i+10], data[i+11]);
    CSA(&foursA, &twos, twos, twosA, twosB);
    CSA(&twosA, &ones, ones, data[i+12], data[i+13]);
    CSA(&twosB, &ones, ones, data[i+14], data[i+15]);
    CSA(&foursB, &twos, twos, twosA, twosB);
    CSA(&eightsB, &fours, fours, foursA, foursB);
    CSA(&sixteens, &eights, eights, eightsA, eightsB);

    // Only the top-level output is counted inside the loop; it is scaled
    // by 16 once, after the loop.
    total = _mm256_add_epi64(total, popcount(sixteens));
  }

  // Apply the weights: everything accumulated so far came from `sixteens`.
  total = _mm256_slli_epi64(total, 4);     // * 16
  total = _mm256_add_epi64(total, _mm256_slli_epi64(popcount(eights), 3)); // += 8 * ...
  total = _mm256_add_epi64(total, _mm256_slli_epi64(popcount(fours),  2)); // += 4 * ...
  total = _mm256_add_epi64(total, _mm256_slli_epi64(popcount(twos),   1)); // += 2 * ...
  total = _mm256_add_epi64(total, popcount(ones));

  // Tail: the remaining size % 16 vectors are counted one at a time,
  // with weight 1 (added after the scaling above).
  for(; i < size; i++) {
    total = _mm256_add_epi64(total, popcount(data[i]));
  }


  // Reduce the 256-bit accumulator to the scalar result.
  return avx2_sum_epu64(total);
}
std::uint64_t popcnt_AVX512BW_lookup_original(const uint8_t* data, const size_t n) {

    size_t i = 0;

    const __m512i lookup = _mm512_setr_epi64(
        0x0302020102010100llu, 0x0403030203020201llu,
        0x0302020102010100llu, 0x0403030203020201llu,
        0x0302020102010100llu, 0x0403030203020201llu,
        0x0302020102010100llu, 0x0403030203020201llu
    );

    const __m512i low_mask = _mm512_set1_epi8(0x0f);

    __m512i acc = _mm512_setzero_si512();

    while (i + 64 < n) {

        __m512i local = _mm512_setzero_si512(); 

        for (int k=0; k < 255/8 && i + 64 < n; k++, i += 64) {
            const __m512i vec = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(data + i));
            const __m512i lo  = _mm512_and_si512(vec, low_mask);
            const __m512i hi  = _mm512_and_si512(_mm512_srli_epi32(vec, 4), low_mask);

            const __m512i popcnt1 = _mm512_shuffle_epi8(lookup, lo);
            const __m512i popcnt2 = _mm512_shuffle_epi8(lookup, hi);

            local = _mm512_add_epi8(local, popcnt1);
            local = _mm512_add_epi8(local, popcnt2);
        }

        acc = _mm512_add_epi64(acc, _mm512_sad_epu8(local, _mm512_setzero_si512()));
    }


    uint64_t result = custom::_mm512_hsum_epi64(acc);

    for (/**/; i < n; i++) {
        result += lookup8bit[data[i]];
    }

    return result;
}
 // Builds one 8-bit code per byte position and stores it at dst.
 // The centre value is loaded from src; each of the eight neighbours (column
 // offset -1/0/+1 combined with row offset -stride/0/+stride) sets one bit
 // when neighbour >= centre (unsigned byte compare):
 //   0x01 (-1,-stride)  0x02 (0,-stride)  0x04 (+1,-stride)  0x08 (+1,0)
 //   0x10 (+1,+stride)  0x20 (0,+stride)  0x40 (-1,+stride)  0x80 (-1,0)
 // `align` is forwarded only to the three loads at column offset -1; every
 // other load and the store use <false>.
 // NOTE(review): Load/Store and the meaning of `tail` are defined elsewhere;
 // presumably `tail` masks the active bytes when `mask` is true -- confirm.
 template <bool align, bool mask> void LbpEstimate(const uint8_t * src, ptrdiff_t stride, uint8_t * dst, __mmask64 tail = -1)
 {
     const uint8_t * const prevRow = src - stride;
     const uint8_t * const nextRow = src + stride;

     const __m512i center = Load<false, mask>(src, tail);

     // Neighbours, listed in the same order as their bits.
     const __m512i n01 = Load<align, mask>(prevRow - 1, tail);
     const __m512i n02 = Load<false, mask>(prevRow, tail);
     const __m512i n04 = Load<false, mask>(prevRow + 1, tail);
     const __m512i n08 = Load<false, mask>(src + 1, tail);
     const __m512i n10 = Load<false, mask>(nextRow + 1, tail);
     const __m512i n20 = Load<false, mask>(nextRow, tail);
     const __m512i n40 = Load<align, mask>(nextRow - 1, tail);
     const __m512i n80 = Load<align, mask>(src - 1, tail);

     // Turn every comparison mask into its bit value and merge the results.
     __m512i code = _mm512_maskz_set1_epi8(_mm512_cmpge_epu8_mask(n01, center), (char)0x01);
     code = _mm512_or_si512(code, _mm512_maskz_set1_epi8(_mm512_cmpge_epu8_mask(n02, center), (char)0x02));
     code = _mm512_or_si512(code, _mm512_maskz_set1_epi8(_mm512_cmpge_epu8_mask(n04, center), (char)0x04));
     code = _mm512_or_si512(code, _mm512_maskz_set1_epi8(_mm512_cmpge_epu8_mask(n08, center), (char)0x08));
     code = _mm512_or_si512(code, _mm512_maskz_set1_epi8(_mm512_cmpge_epu8_mask(n10, center), (char)0x10));
     code = _mm512_or_si512(code, _mm512_maskz_set1_epi8(_mm512_cmpge_epu8_mask(n20, center), (char)0x20));
     code = _mm512_or_si512(code, _mm512_maskz_set1_epi8(_mm512_cmpge_epu8_mask(n40, center), (char)0x40));
     code = _mm512_or_si512(code, _mm512_maskz_set1_epi8(_mm512_cmpge_epu8_mask(n80, center), (char)0x80));

     Store<false, mask>(dst, code, tail);
 }
/* Checks that _mm512_setzero_si512 () produces eight zero 64-bit lanes.
   Aborts when check_union512i_q reports a mismatch against the all-zero
   reference array.  */
static void
avx512f_test (void)
{
  long long expected[8] = { 0 };
  union512i_q actual;

  actual.x = _mm512_setzero_si512 ();

  if (check_union512i_q (actual, expected) != 0)
    abort ();
}
Exemple #5
0
/* Runs foo and foo_r on the same eight 64-bit values and aborts unless
   check_union512i_q accepts each result against those values.  The result
   register is zeroed between the two runs so the second check cannot pass
   on leftovers from the first.  */
static void
avx512f_test (void)
{
  long long expected[8] = { 0x12e9e94645ad8LL, 0x851c0b39446LL, 2134, 6678,
			    0x786784645245LL, 0x9487731234LL, 41124, 86530 };
  union512i_q actual;

  /* First variant.  */
  actual.x = foo (expected);
  if (check_union512i_q (actual, expected) != 0)
    abort ();

  /* Second variant, starting from a cleared result.  */
  actual.x = _mm512_setzero_si512 ();
  actual.x = foo_r (expected);
  if (check_union512i_q (actual, expected) != 0)
    abort ();
}
/* Calls audit_test with eight all-zero 512-bit arguments and aborts unless
   the returned vector holds 0x12349876 in every 64-bit lane.  Returns 0 on
   success.  When the translation unit is built without __AVX512F__ nothing
   is called and 77 is returned.
   NOTE(review): 77 is presumably the test harness's "unsupported" status;
   confirm against the harness.  */
int
tst_audit10_aux (void)
{
#ifndef __AVX512F__
  return 77;
#else /* __AVX512F__ */
  extern __m512i audit_test (__m512i, __m512i, __m512i, __m512i,
                             __m512i, __m512i, __m512i, __m512i);

  const __m512i zero = _mm512_setzero_si512 ();
  __m512i ret = audit_test (zero, zero, zero, zero, zero, zero, zero, zero);

  /* The expected value is built only after the call, as in the original
     statement order.  */
  const __m512i expected = _mm512_set1_epi64 (0x12349876);

  if (memcmp (&expected, &ret, sizeof (ret)) != 0)
    abort ();
  return 0;
#endif /* __AVX512F__ */
}
Exemple #7
0
/* Verifies that argument number k (counting from 1) holds the value k in
   every 64-bit lane, for all eight arguments in order, and aborts on the
   first mismatch.  Returns an all-zero vector.  */
__m512i
audit_test (__m512i x0, __m512i x1, __m512i x2, __m512i x3,
	    __m512i x4, __m512i x5, __m512i x6, __m512i x7)
{
  /* Collect the arguments so they can be checked in a loop.  */
  const __m512i args[8] = { x0, x1, x2, x3, x4, x5, x6, x7 };
  int k;

  for (k = 0; k < 8; k++)
    {
      const __m512i expected = _mm512_set1_epi64 (k + 1);

      if (memcmp (&expected, &args[k], sizeof (expected)) != 0)
	abort ();
    }

  return _mm512_setzero_si512 ();
}
Exemple #8
0
/* Runs foo and foo_r on one 32-bit value and aborts unless
   check_union512i_d accepts each result against an array holding that value
   in all sixteen slots.  The result register is zeroed between the two runs
   so the second check cannot pass on leftovers from the first.  */
static void
avx512f_test (void)
{
  int e = 0xabadbeef;
  int expected[16];
  union512i_d actual;
  int lane = 0;

  /* Reference: the scalar replicated into every 32-bit lane.  */
  while (lane < 16)
    {
      expected[lane] = e;
      lane++;
    }

  /* First variant.  */
  actual.x = foo (e);
  if (check_union512i_d (actual, expected) != 0)
    abort ();

  /* Second variant, starting from a cleared result.  */
  actual.x = _mm512_setzero_si512 ();
  actual.x = foo_r (e);
  if (check_union512i_d (actual, expected) != 0)
    abort ();
}
Exemple #9
0
/* Out-of-line wrapper around _mm512_setzero_si512 ().  The target attribute
   enables AVX-512F for this function only, so it can be compiled in a
   translation unit that is not built with AVX-512F enabled globally.
   Returns a 512-bit vector with all bits cleared.  */
__m512i __attribute__((__target__("avx512f")))
mm512_setzero_si512_wrap(void)
{
  const __m512i zero = _mm512_setzero_si512();

  return zero;
}