uint64_t popcnt_neon_harley_seal(const uint8_t* data, const size_t size) { uint32x2_t total = vdup_n_u32(0); uint8x16_t ones, twos, fours, eights, sixteens; uint8x16_t twosA, twosB, foursA, foursB, eightsA, eightsB; uint64_t limit = size - size % (16*16); uint64_t i = 0; ones = twos = fours = eights = sixteens = vdupq_n_u8(0); uint8_t* ptr = const_cast<uint8_t*>(data); for(; i < limit; i += 16*16) { CSA(twosA, ones, ones, vld1q_u8(ptr + 16*0), vld1q_u8(ptr + 16*1)); CSA(twosB, ones, ones, vld1q_u8(ptr + 16*2), vld1q_u8(ptr + 16*3)); CSA(foursA, twos, twos, twosA, twosB); CSA(twosA, ones, ones, vld1q_u8(ptr + 16*4), vld1q_u8(ptr + 16*5)); CSA(twosB, ones, ones, vld1q_u8(ptr + 16*6), vld1q_u8(ptr + 16*7)); CSA(foursB, twos, twos, twosA, twosB); CSA(eightsA,fours, fours, foursA, foursB); CSA(twosA, ones, ones, vld1q_u8(ptr + 16*8), vld1q_u8(ptr + 16*9)); CSA(twosB, ones, ones, vld1q_u8(ptr + 16*10), vld1q_u8(ptr + 16*11)); CSA(foursA, twos, twos, twosA, twosB); CSA(twosA, ones, ones, vld1q_u8(ptr + 16*12), vld1q_u8(ptr + 16*13)); CSA(twosB, ones, ones, vld1q_u8(ptr + 16*14), vld1q_u8(ptr + 16*15)); CSA(foursB, twos, twos, twosA, twosB); CSA(eightsB, fours, fours, foursA, foursB); CSA(sixteens, eights, eights, eightsA, eightsB); total = vadd_u32(total, popcnt_neon_qreg(sixteens)); ptr += 16*16; } total = vshl_n_u32(total, 4); total = vadd_u32(total, vshl_n_u32(popcnt_neon_qreg(eights), 3)); total = vadd_u32(total, vshl_n_u32(popcnt_neon_qreg(fours), 2)); total = vadd_u32(total, vshl_n_u32(popcnt_neon_qreg(twos), 1)); total = vadd_u32(total, popcnt_neon_qreg(ones)); uint32_t scalar = 0; scalar += vget_lane_u32(total, 0); scalar += vget_lane_u32(total, 1); for(; i < size; i++) { scalar += lookup8bit[*ptr++]; } return scalar; }
int64_t BitUtil::pop_xor(const int64_t* A, const int64_t* B, int32_t wordOffset, int32_t numWords) { int32_t n = wordOffset + numWords; int64_t tot = 0; int64_t tot8 = 0; int64_t ones = 0; int64_t twos = 0; int64_t fours = 0; int32_t i = wordOffset; for (; i <= n - 8; i += 8) { int64_t twosA; CSA(twosA, ones, ones, (A[i] ^ B[i]), (A[i + 1] ^ B[i + 1])); int64_t twosB; CSA(twosB, ones, ones, (A[i + 2] ^ B[i + 2]), (A[i + 3] ^ B[i + 3])); int64_t foursA; CSA(foursA, twos, twos, twosA, twosB); CSA(twosA, ones, ones, (A[i + 4] ^ B[i + 4]), (A[i + 5] ^ B[i + 5])); CSA(twosB, ones, ones, (A[i + 6] ^ B[i + 6]), (A[i + 7] ^ B[i + 7])); int64_t foursB; CSA(foursB, twos, twos, twosA, twosB); int64_t eights; CSA(eights, fours, fours, foursA, foursB); tot8 += pop(eights); } if (i <= n - 4) { int64_t twosA; CSA(twosA, ones, ones, (A[i] ^ B[i]), (A[i + 1] ^ B[i + 1])); int64_t twosB; CSA(twosB, ones, ones, (A[i + 2] ^ B[i + 2]), (A[i + 3] ^ B[i + 3])); int64_t foursA; CSA(foursA, twos, twos, twosA, twosB); int64_t eights = fours & foursA; fours = fours ^ foursA; tot8 += pop(eights); i += 4; } if (i <= n - 2) { int64_t twosA; CSA(twosA, ones, ones, (A[i] ^ B[i]), (A[i + 1] ^ B[i + 1])); int64_t foursA = twos & twosA; twos = twos ^ twosA; int64_t eights = fours & foursA; fours = fours ^ foursA; tot8 += pop(eights); i += 2; } if (i < n) tot += pop((A[i] ^ B[i])); tot += (pop(fours) << 2) + (pop(twos) << 1) + pop(ones) + (tot8 << 3); return tot; }
int64_t BitUtil::pop_array(const int64_t* A, int32_t wordOffset, int32_t numWords) { int32_t n = wordOffset + numWords; int64_t tot = 0; int64_t tot8 = 0; int64_t ones = 0; int64_t twos = 0; int64_t fours = 0; int32_t i = wordOffset; for (; i <= n - 8; i += 8) { int64_t twosA; CSA(twosA, ones, ones, A[i], A[i + 1]); int64_t twosB; CSA(twosB, ones, ones, A[i + 2], A[i + 3]); int64_t foursA; CSA(foursA, twos, twos, twosA, twosB); CSA(twosA, ones, ones, A[i + 4], A[i + 5]); CSA(twosB, ones, ones, A[i + 6], A[i + 7]); int64_t foursB; CSA(foursB, twos, twos, twosA, twosB); int64_t eights; CSA(eights, fours, fours, foursA, foursB); tot8 += pop(eights); } // Handle trailing words in a binary-search manner. // Derived from the loop above by setting specific elements to 0. if (i <= n - 4) { int64_t twosA; CSA(twosA, ones, ones, A[i], A[i + 1]); int64_t twosB; CSA(twosB, ones, ones, A[i + 2], A[i + 3]); int64_t foursA; CSA(foursA, twos, twos, twosA, twosB); int64_t eights = fours & foursA; fours = fours ^ foursA; tot8 += pop(eights); i += 4; } if (i <= n - 2) { int64_t twosA; CSA(twosA, ones, ones, A[i], A[i + 1]); int64_t foursA = twos & twosA; twos = twos ^ twosA; int64_t eights = fours & foursA; fours = fours ^ foursA; tot8 += pop(eights); i += 2; } if (i < n) tot += pop(A[i]); tot += (pop(fours) << 2) + (pop(twos) << 1) + pop(ones) + (tot8 << 3); return tot; }
static uint64_t popcnt_harley_seal(const __m512i* data, const uint64_t size) { __m256i total = _mm256_setzero_si256(); __m512i ones = _mm512_setzero_si512(); __m512i twos = _mm512_setzero_si512(); __m512i fours = _mm512_setzero_si512(); __m512i eights = _mm512_setzero_si512(); __m512i sixteens = _mm512_setzero_si512(); __m512i twosA, twosB, foursA, foursB, eightsA, eightsB; const uint64_t limit = size - size % 16; uint64_t i = 0; for(; i < limit; i += 16) { CSA(&twosA, &ones, ones, data[i+0], data[i+1]); CSA(&twosB, &ones, ones, data[i+2], data[i+3]); CSA(&foursA, &twos, twos, twosA, twosB); CSA(&twosA, &ones, ones, data[i+4], data[i+5]); CSA(&twosB, &ones, ones, data[i+6], data[i+7]); CSA(&foursB, &twos, twos, twosA, twosB); CSA(&eightsA,&fours, fours, foursA, foursB); CSA(&twosA, &ones, ones, data[i+8], data[i+9]); CSA(&twosB, &ones, ones, data[i+10], data[i+11]); CSA(&foursA, &twos, twos, twosA, twosB); CSA(&twosA, &ones, ones, data[i+12], data[i+13]); CSA(&twosB, &ones, ones, data[i+14], data[i+15]); CSA(&foursB, &twos, twos, twosA, twosB); CSA(&eightsB, &fours, fours, foursA, foursB); CSA(&sixteens, &eights, eights, eightsA, eightsB); total = _mm256_add_epi64(total, popcount(sixteens)); } total = _mm256_slli_epi64(total, 4); // * 16 total = _mm256_add_epi64(total, _mm256_slli_epi64(popcount(eights), 3)); // += 8 * ... total = _mm256_add_epi64(total, _mm256_slli_epi64(popcount(fours), 2)); // += 4 * ... total = _mm256_add_epi64(total, _mm256_slli_epi64(popcount(twos), 1)); // += 2 * ... total = _mm256_add_epi64(total, popcount(ones)); for(; i < size; i++) { total = _mm256_add_epi64(total, popcount(data[i])); } return avx2_sum_epu64(total); }