static uint64_t popcnt_harley_seal(const __m512i* data, const uint64_t size) { __m256i total = _mm256_setzero_si256(); __m512i ones = _mm512_setzero_si512(); __m512i twos = _mm512_setzero_si512(); __m512i fours = _mm512_setzero_si512(); __m512i eights = _mm512_setzero_si512(); __m512i sixteens = _mm512_setzero_si512(); __m512i twosA, twosB, foursA, foursB, eightsA, eightsB; const uint64_t limit = size - size % 16; uint64_t i = 0; for(; i < limit; i += 16) { CSA(&twosA, &ones, ones, data[i+0], data[i+1]); CSA(&twosB, &ones, ones, data[i+2], data[i+3]); CSA(&foursA, &twos, twos, twosA, twosB); CSA(&twosA, &ones, ones, data[i+4], data[i+5]); CSA(&twosB, &ones, ones, data[i+6], data[i+7]); CSA(&foursB, &twos, twos, twosA, twosB); CSA(&eightsA,&fours, fours, foursA, foursB); CSA(&twosA, &ones, ones, data[i+8], data[i+9]); CSA(&twosB, &ones, ones, data[i+10], data[i+11]); CSA(&foursA, &twos, twos, twosA, twosB); CSA(&twosA, &ones, ones, data[i+12], data[i+13]); CSA(&twosB, &ones, ones, data[i+14], data[i+15]); CSA(&foursB, &twos, twos, twosA, twosB); CSA(&eightsB, &fours, fours, foursA, foursB); CSA(&sixteens, &eights, eights, eightsA, eightsB); total = _mm256_add_epi64(total, popcount(sixteens)); } total = _mm256_slli_epi64(total, 4); // * 16 total = _mm256_add_epi64(total, _mm256_slli_epi64(popcount(eights), 3)); // += 8 * ... total = _mm256_add_epi64(total, _mm256_slli_epi64(popcount(fours), 2)); // += 4 * ... total = _mm256_add_epi64(total, _mm256_slli_epi64(popcount(twos), 1)); // += 2 * ... total = _mm256_add_epi64(total, popcount(ones)); for(; i < size; i++) { total = _mm256_add_epi64(total, popcount(data[i])); } return avx2_sum_epu64(total); }
std::uint64_t popcnt_AVX512BW_lookup_original(const uint8_t* data, const size_t n) { size_t i = 0; const __m512i lookup = _mm512_setr_epi64( 0x0302020102010100llu, 0x0403030203020201llu, 0x0302020102010100llu, 0x0403030203020201llu, 0x0302020102010100llu, 0x0403030203020201llu, 0x0302020102010100llu, 0x0403030203020201llu ); const __m512i low_mask = _mm512_set1_epi8(0x0f); __m512i acc = _mm512_setzero_si512(); while (i + 64 < n) { __m512i local = _mm512_setzero_si512(); for (int k=0; k < 255/8 && i + 64 < n; k++, i += 64) { const __m512i vec = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(data + i)); const __m512i lo = _mm512_and_si512(vec, low_mask); const __m512i hi = _mm512_and_si512(_mm512_srli_epi32(vec, 4), low_mask); const __m512i popcnt1 = _mm512_shuffle_epi8(lookup, lo); const __m512i popcnt2 = _mm512_shuffle_epi8(lookup, hi); local = _mm512_add_epi8(local, popcnt1); local = _mm512_add_epi8(local, popcnt2); } acc = _mm512_add_epi64(acc, _mm512_sad_epu8(local, _mm512_setzero_si512())); } uint64_t result = custom::_mm512_hsum_epi64(acc); for (/**/; i < n; i++) { result += lookup8bit[data[i]]; } return result; }
/*
 * Computes an 8-bit neighbourhood code for 64 consecutive bytes and stores
 * the 64 result bytes at `dst`.
 *
 * For every byte position the centre value loaded from `src` is compared
 * (unsigned, neighbour >= centre) with eight neighbours at offsets of +/-1
 * byte and +/-stride bytes; each comparison that holds sets one bit:
 *
 *   0x01: src - 1 - stride    0x02: src - stride    0x04: src + 1 - stride
 *   0x80: src - 1                                   0x08: src + 1
 *   0x40: src - 1 + stride    0x20: src + stride    0x10: src + 1 + stride
 *
 * align: forwarded to Load only for the three `src - 1 ...` neighbours; the
 *        centre, the remaining neighbours and the Store always use <false>.
 * mask / tail: forwarded unchanged to the project's Load/Store helpers
 *        (presumably a lane mask for partial vectors - confirm against their
 *        definitions). The default tail of -1 has all 64 bits set.
 */
template <bool align, bool mask> void LbpEstimate(const uint8_t * src, ptrdiff_t stride, uint8_t * dst, __mmask64 tail = -1)
{
    const __m512i center = Load<false, mask>(src, tail);

    // One comparison mask per neighbour, in ascending bit order.
    const __mmask64 bit0 = _mm512_cmpge_epu8_mask((Load<align, mask>(src - 1 - stride, tail)), center);
    const __mmask64 bit1 = _mm512_cmpge_epu8_mask((Load<false, mask>(src - stride, tail)), center);
    const __mmask64 bit2 = _mm512_cmpge_epu8_mask((Load<false, mask>(src + 1 - stride, tail)), center);
    const __mmask64 bit3 = _mm512_cmpge_epu8_mask((Load<false, mask>(src + 1, tail)), center);
    const __mmask64 bit4 = _mm512_cmpge_epu8_mask((Load<false, mask>(src + 1 + stride, tail)), center);
    const __mmask64 bit5 = _mm512_cmpge_epu8_mask((Load<false, mask>(src + stride, tail)), center);
    const __mmask64 bit6 = _mm512_cmpge_epu8_mask((Load<align, mask>(src - 1 + stride, tail)), center);
    const __mmask64 bit7 = _mm512_cmpge_epu8_mask((Load<align, mask>(src - 1, tail)), center);

    // Expand every mask to its bit value in the selected lanes and merge.
    __m512i code = _mm512_maskz_set1_epi8(bit0, (char)0x01);
    code = _mm512_or_si512(code, _mm512_maskz_set1_epi8(bit1, (char)0x02));
    code = _mm512_or_si512(code, _mm512_maskz_set1_epi8(bit2, (char)0x04));
    code = _mm512_or_si512(code, _mm512_maskz_set1_epi8(bit3, (char)0x08));
    code = _mm512_or_si512(code, _mm512_maskz_set1_epi8(bit4, (char)0x10));
    code = _mm512_or_si512(code, _mm512_maskz_set1_epi8(bit5, (char)0x20));
    code = _mm512_or_si512(code, _mm512_maskz_set1_epi8(bit6, (char)0x40));
    code = _mm512_or_si512(code, _mm512_maskz_set1_epi8(bit7, (char)0x80));

    Store<false, mask>(dst, code, tail);
}
/* Checks that _mm512_setzero_si512 () yields a vector that
   check_union512i_q accepts as eight zero 64-bit values; aborts if the
   check returns non-zero.  */
static void
avx512f_test (void)
{
  long long expected[8] = { 0 };
  union512i_q actual;

  actual.x = _mm512_setzero_si512 ();

  if (check_union512i_q (actual, expected))
    abort ();
}
/* Runs foo and foo_r (both defined elsewhere) on the same eight 64-bit
   inputs and aborts if check_union512i_q reports a non-zero result when the
   returned vector is compared against those inputs.  */
static void
avx512f_test (void)
{
  long long input[8] =
    {
      0x12e9e94645ad8LL, 0x851c0b39446LL, 2134, 6678,
      0x786784645245LL, 0x9487731234LL, 41124, 86530
    };
  union512i_q out;

  out.x = foo (input);
  if (check_union512i_q (out, input))
    abort ();

  /* Clear the previous result so the second check cannot pass on stale
     data.  */
  out.x = _mm512_setzero_si512 ();

  out.x = foo_r (input);
  if (check_union512i_q (out, input))
    abort ();
}
/* Calls the external audit_test with eight all-zero 512-bit arguments and
   aborts unless the returned vector has every 64-bit lane equal to
   0x12349876.  Returns 0 on success.  When the translation unit is built
   without __AVX512F__ the body reduces to `return 77' (presumably the test
   harness's "unsupported" status - confirm against the caller).  */
int tst_audit10_aux (void)
{
#ifdef __AVX512F__
  extern __m512i audit_test (__m512i, __m512i, __m512i, __m512i,
                             __m512i, __m512i, __m512i, __m512i);

  const __m512i zeros = _mm512_setzero_si512 ();
  const __m512i got = audit_test (zeros, zeros, zeros, zeros,
                                  zeros, zeros, zeros, zeros);
  const __m512i want = _mm512_set1_epi64 (0x12349876);

  if (memcmp (&want, &got, sizeof (got)) != 0)
    abort ();

  return 0;
#else /* __AVX512F__ */
  return 77;
#endif /* __AVX512F__ */
}
/* Verifies that argument xN arrives with every 64-bit lane equal to N + 1
   (x0 == 1 ... x7 == 8), checking in argument order and aborting on the
   first mismatch.  Returns an all-zero vector.  */
__m512i audit_test (__m512i x0, __m512i x1, __m512i x2, __m512i x3, __m512i x4, __m512i x5, __m512i x6, __m512i x7)
{
  const __m512i *const args[8] = { &x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7 };
  int k;

  for (k = 0; k < 8; k++)
    {
      const __m512i want = _mm512_set1_epi64 (k + 1);

      if (memcmp (&want, args[k], sizeof (want)) != 0)
        abort ();
    }

  return _mm512_setzero_si512 ();
}
/* Runs foo and foo_r (both defined elsewhere) on one 32-bit value and aborts
   if check_union512i_d reports a non-zero result when the returned vector is
   compared against sixteen copies of that value.  */
static void
avx512f_test (void)
{
  int value = 0xabadbeef;
  int expected[16];
  union512i_d out;
  int lane = 0;

  while (lane < 16)
    expected[lane++] = value;

  out.x = foo (value);
  if (check_union512i_d (out, expected))
    abort ();

  /* Clear the previous result so the second check cannot pass on stale
     data.  */
  out.x = _mm512_setzero_si512 ();

  out.x = foo_r (value);
  if (check_union512i_d (out, expected))
    abort ();
}
/* Out-of-line wrapper around _mm512_setzero_si512 ().  The function carries
   the "avx512f" target attribute, so the intrinsic is enabled for this
   function regardless of the translation unit's default target flags.
   Returns the all-zero 512-bit integer vector.  */
__m512i __attribute__((__target__("avx512f")))
mm512_setzero_si512_wrap(void)
{
    const __m512i zero = _mm512_setzero_si512();
    return zero;
}