示例#1
0
uint64_t popcnt_neon_harley_seal(const uint8_t* data, const size_t size)
{
  uint32x2_t total = vdup_n_u32(0);
  uint8x16_t ones, twos, fours, eights, sixteens;
  uint8x16_t twosA, twosB, foursA, foursB, eightsA, eightsB;
  uint64_t limit = size - size % (16*16);
  uint64_t i = 0;

  ones = twos = fours = eights = sixteens = vdupq_n_u8(0);

  uint8_t* ptr = const_cast<uint8_t*>(data);

  for(; i < limit; i += 16*16)
  {
    CSA(twosA, ones, ones, vld1q_u8(ptr + 16*0), vld1q_u8(ptr + 16*1));
    CSA(twosB, ones, ones, vld1q_u8(ptr + 16*2), vld1q_u8(ptr + 16*3));
    CSA(foursA, twos, twos, twosA, twosB);
    CSA(twosA, ones, ones, vld1q_u8(ptr + 16*4), vld1q_u8(ptr + 16*5));
    CSA(twosB, ones, ones, vld1q_u8(ptr + 16*6), vld1q_u8(ptr + 16*7));
    CSA(foursB, twos, twos, twosA, twosB);
    CSA(eightsA,fours, fours, foursA, foursB);
    CSA(twosA, ones, ones, vld1q_u8(ptr + 16*8), vld1q_u8(ptr + 16*9));
    CSA(twosB, ones, ones, vld1q_u8(ptr + 16*10), vld1q_u8(ptr + 16*11));
    CSA(foursA, twos, twos, twosA, twosB);
    CSA(twosA, ones, ones, vld1q_u8(ptr + 16*12), vld1q_u8(ptr + 16*13));
    CSA(twosB, ones, ones, vld1q_u8(ptr + 16*14), vld1q_u8(ptr + 16*15));
    CSA(foursB, twos, twos, twosA, twosB);
    CSA(eightsB, fours, fours, foursA, foursB);
    CSA(sixteens, eights, eights, eightsA, eightsB);

    total = vadd_u32(total, popcnt_neon_qreg(sixteens));

    ptr += 16*16;
  }

  total  = vshl_n_u32(total, 4);
  total  = vadd_u32(total, vshl_n_u32(popcnt_neon_qreg(eights), 3));
  total  = vadd_u32(total, vshl_n_u32(popcnt_neon_qreg(fours), 2));
  total  = vadd_u32(total, vshl_n_u32(popcnt_neon_qreg(twos), 1));
  total  = vadd_u32(total, popcnt_neon_qreg(ones));

  uint32_t scalar = 0;
  
  scalar += vget_lane_u32(total, 0);
  scalar += vget_lane_u32(total, 1);

  for(; i < size; i++) {
    scalar += lookup8bit[*ptr++];
  }

  return scalar;
}
示例#2
0
 int64_t BitUtil::pop_xor(const int64_t* A, const int64_t* B, int32_t wordOffset, int32_t numWords)
 {
     int32_t n = wordOffset + numWords;
     int64_t tot = 0;
     int64_t tot8 = 0;
     int64_t ones = 0;
     int64_t twos = 0;
     int64_t fours = 0;
     
     int32_t i = wordOffset;
     for (; i <= n - 8; i += 8)
     {
         int64_t twosA;
         CSA(twosA, ones, ones, (A[i] ^ B[i]), (A[i + 1] ^ B[i + 1]));
         
         int64_t twosB;
         CSA(twosB, ones, ones, (A[i + 2] ^ B[i + 2]), (A[i + 3] ^ B[i + 3]));
         
         int64_t foursA;
         CSA(foursA, twos, twos, twosA, twosB);
         
         CSA(twosA, ones, ones, (A[i + 4] ^ B[i + 4]), (A[i + 5] ^ B[i + 5]));
         
         CSA(twosB, ones, ones, (A[i + 6] ^ B[i + 6]), (A[i + 7] ^ B[i + 7]));
         
         int64_t foursB;
         CSA(foursB, twos, twos, twosA, twosB);
         
         int64_t eights;
         CSA(eights, fours, fours, foursA, foursB);
         
         tot8 += pop(eights);
     }
     
     if (i <= n - 4)
     {
         int64_t twosA;
         CSA(twosA, ones, ones, (A[i] ^ B[i]), (A[i + 1] ^ B[i + 1]));
         
         int64_t twosB;
         CSA(twosB, ones, ones, (A[i + 2] ^ B[i + 2]), (A[i + 3] ^ B[i + 3]));
         
         int64_t foursA;
         CSA(foursA, twos, twos, twosA, twosB);
         
         int64_t eights = fours & foursA;
         fours = fours ^ foursA;
         
         tot8 += pop(eights);
         i += 4;
     }
     
     if (i <= n - 2)
     {
         int64_t twosA;
         CSA(twosA, ones, ones, (A[i] ^ B[i]), (A[i + 1] ^ B[i + 1]));
         
         int64_t foursA = twos & twosA;
         twos = twos ^ twosA;
         
         int64_t eights = fours & foursA;
         fours = fours ^ foursA;
         
         tot8 += pop(eights);
         i += 2;
     }
     
     if (i < n)
         tot += pop((A[i] ^ B[i]));
     
     tot += (pop(fours) << 2) + (pop(twos) << 1) + pop(ones) + (tot8 << 3);
     
     return tot;
 }
示例#3
0
 // Returns the combined pop() count of the words A[wordOffset] up to, but not
 // including, A[wordOffset + numWords].
 //
 // Eight words per pass are reduced through a tree of CSA steps. low1, low2
 // and low4 are the running partial sums weighted by 1, 2 and 4 in the final
 // total; each "eights" word coming out of the tree is counted with pop()
 // immediately and the accumulated count is weighted by 8 at the end.
 //
 // CSA and pop are defined outside this block -- presumably a carry-save adder
 // (high, low, a, b, c) and a 64-bit population count; confirm against their
 // definitions.
 int64_t BitUtil::pop_array(const int64_t* A, int32_t wordOffset, int32_t numWords)
 {
     const int32_t stop = wordOffset + numWords;
     int32_t idx = wordOffset;

     int64_t low1 = 0;
     int64_t low2 = 0;
     int64_t low4 = 0;
     int64_t eightsSeen = 0;

     while (idx <= stop - 8)
     {
         int64_t twoFirst;
         int64_t twoSecond;
         int64_t fourFirst;
         int64_t fourSecond;
         int64_t eight;

         CSA(twoFirst, low1, low1, A[idx], A[idx + 1]);
         CSA(twoSecond, low1, low1, A[idx + 2], A[idx + 3]);
         CSA(fourFirst, low2, low2, twoFirst, twoSecond);

         CSA(twoFirst, low1, low1, A[idx + 4], A[idx + 5]);
         CSA(twoSecond, low1, low1, A[idx + 6], A[idx + 7]);
         CSA(fourSecond, low2, low2, twoFirst, twoSecond);

         CSA(eight, low4, low4, fourFirst, fourSecond);
         eightsSeen += pop(eight);

         idx += 8;
     }

     // The remaining (fewer than eight) words are consumed in steps of four,
     // two and one. Each step is the loop body above with the absent inputs
     // treated as zero.

     if (idx <= stop - 4)
     {
         int64_t twoFirst;
         int64_t twoSecond;
         int64_t fourFirst;

         CSA(twoFirst, low1, low1, A[idx], A[idx + 1]);
         CSA(twoSecond, low1, low1, A[idx + 2], A[idx + 3]);
         CSA(fourFirst, low2, low2, twoFirst, twoSecond);

         const int64_t eight = low4 & fourFirst;
         low4 ^= fourFirst;

         eightsSeen += pop(eight);
         idx += 4;
     }

     if (idx <= stop - 2)
     {
         int64_t twoFirst;
         CSA(twoFirst, low1, low1, A[idx], A[idx + 1]);

         const int64_t fourFirst = low2 & twoFirst;
         low2 ^= twoFirst;

         const int64_t eight = low4 & fourFirst;
         low4 ^= fourFirst;

         eightsSeen += pop(eight);
         idx += 2;
     }

     // One word may still be left; count it directly.
     int64_t result = 0;
     if (idx < stop)
     {
         result += pop(A[idx]);
     }

     // Fold in the partial sums with their weights (4, 2, 1 and 8).
     result += (pop(low4) << 2) + (pop(low2) << 1) + pop(low1) + (eightsSeen << 3);

     return result;
 }
static uint64_t popcnt_harley_seal(const __m512i* data, const uint64_t size)
{
  __m256i total     = _mm256_setzero_si256();
  __m512i ones      = _mm512_setzero_si512();
  __m512i twos      = _mm512_setzero_si512();
  __m512i fours     = _mm512_setzero_si512();
  __m512i eights    = _mm512_setzero_si512();
  __m512i sixteens  = _mm512_setzero_si512();
  __m512i twosA, twosB, foursA, foursB, eightsA, eightsB;

  const uint64_t limit = size - size % 16;
  uint64_t i = 0;

  for(; i < limit; i += 16)
  {
    CSA(&twosA, &ones, ones, data[i+0], data[i+1]);
    CSA(&twosB, &ones, ones, data[i+2], data[i+3]);
    CSA(&foursA, &twos, twos, twosA, twosB);
    CSA(&twosA, &ones, ones, data[i+4], data[i+5]);
    CSA(&twosB, &ones, ones, data[i+6], data[i+7]);
    CSA(&foursB, &twos, twos, twosA, twosB);
    CSA(&eightsA,&fours, fours, foursA, foursB);
    CSA(&twosA, &ones, ones, data[i+8], data[i+9]);
    CSA(&twosB, &ones, ones, data[i+10], data[i+11]);
    CSA(&foursA, &twos, twos, twosA, twosB);
    CSA(&twosA, &ones, ones, data[i+12], data[i+13]);
    CSA(&twosB, &ones, ones, data[i+14], data[i+15]);
    CSA(&foursB, &twos, twos, twosA, twosB);
    CSA(&eightsB, &fours, fours, foursA, foursB);
    CSA(&sixteens, &eights, eights, eightsA, eightsB);

    total = _mm256_add_epi64(total, popcount(sixteens));
  }

  total = _mm256_slli_epi64(total, 4);     // * 16
  total = _mm256_add_epi64(total, _mm256_slli_epi64(popcount(eights), 3)); // += 8 * ...
  total = _mm256_add_epi64(total, _mm256_slli_epi64(popcount(fours),  2)); // += 4 * ...
  total = _mm256_add_epi64(total, _mm256_slli_epi64(popcount(twos),   1)); // += 2 * ...
  total = _mm256_add_epi64(total, popcount(ones));

  for(; i < size; i++) {
    total = _mm256_add_epi64(total, popcount(data[i]));
  }


  return avx2_sum_epu64(total);
}