/**
 * Returns the number of set bits in `v` (population count).
 *
 * Only the `const` attribute is kept: the result depends solely on the
 * argument and no memory is read. GCC documents `const` and `pure` on the
 * same function as a diagnosed conflict, so the previous `pure, const` pair
 * caused one of the two attributes to be ignored.
 *
 * `__builtin_popcount` is used instead of `_popcnt32` so the helper builds
 * without an intrinsics header or a popcnt-enabled target; the compiler still
 * emits the hardware `popcnt` instruction when the target supports it.
 */
static inline __attribute__((const)) uint32_t xpopcnt32(uint32_t v) {
    return static_cast<uint32_t>(__builtin_popcount(v));
}
/**
 * Scans a partition of grouped, SIMD-packed codes and pushes every code whose
 * exact distance beats the current heap bound into `bh`.
 *
 * Layout read from `partition`:
 *   - a sequence of groups, each starting with a `group_header`;
 *   - a header whose `size` field has all bits set terminates the scan;
 *   - after each header come ceil(size / 16) SIMD blocks of 96 bytes each:
 *       bytes  0..63  four 16-byte blocks; the low nibble of each byte indexes
 *                     `min4[0..3]`,
 *       bytes 64..79  low nibble / high nibble of each byte index the two
 *                     group tables selected by `hdr->values[0]` / `[1]`,
 *       bytes 80..95  same for the tables selected by `hdr->values[2]` / `[3]`.
 *     A full 96 bytes are loaded and skipped even for a partial last block,
 *     so the buffer must hold whole blocks.
 *
 * For each block the eight table lookups are combined with signed saturating
 * 8-bit additions and compared against the quantized heap bound. Only lanes
 * that pass this filter (presumably a quantized lower bound on the distance --
 * confirm against the table construction) are re-evaluated exactly with
 * `scan_pqcode_in_simd_block_1`.
 *
 * @param partition          start of the first group header.
 * @param labels             label array, indexed by the running code index.
 * @param dists              forwarded unchanged to `scan_pqcode_in_simd_block_1`.
 * @param min4               four 16-entry byte tables for components 0..3.
 * @param ft4                per component 4..7, sixteen 16-entry byte tables;
 *                           the table used for a group is picked by
 *                           `hdr->values[i] >> 4`.
 * @param qmin, qmax         range passed to `Q127` when quantizing the bound.
 * @param bh                 result heap; `push()` and `max()` are used.
 * @param scan_pqcode_count  index in `labels` of the first code of the first
 *                           block; advanced locally as blocks are consumed.
 */
FORCE_INLINE static inline void fast_scan_1(const std::uint8_t* partition, const unsigned* labels,
        const float* dists, const __m128i (&min4)[4], __m128i (&ft4)[4][16],
        const float qmin, const float qmax, binheap* bh, unsigned scan_pqcode_count) {

    const unsigned simd_pqcode_count = 16; // codes per SIMD block, one per byte lane
    const int comp_block_size = 16;        // bytes covered by one 128-bit load
    // Four byte-per-code blocks plus two blocks packing two 4-bit components
    // per byte: 6 * 16 = 96 bytes per SIMD block.
    const unsigned simd_block_size = simd_pqcode_count * (4 + 2);

    // Selects the low nibble of every byte lane; loop-invariant.
    const __m128i low_bits_mask = _mm_set1_epi8(0x0f);

    float bh_bound = qmax;
    // bh_bound equals qmax here, so this quantizes the top of [qmin, qmax].
    __m128i bh_bound_quant = _mm_set1_epi8(Q127(bh_bound, qmin, qmax));

    for (;;) {
        // Parse group header.
        const group_header* const hdr = reinterpret_cast<const group_header*>(partition);

        // An all-ones size marks the end of the partition.
        if (hdr->size == std::numeric_limits<decltype(hdr->size)>::max()) {
            return;
        }
        partition += sizeof(*hdr);

        unsigned simd_block_count =
                (static_cast<unsigned>(hdr->size) + simd_pqcode_count - 1) / simd_pqcode_count;

        // Pick the group-specific tables for components 4..7.
        __m128i ft4_group[4];
        for (int comp_i = 0; comp_i < 4; ++comp_i) {
            ft4_group[comp_i] = ft4[comp_i][hdr->values[comp_i] >> 4];
        }

        // Scan SIMD blocks.
        while (simd_block_count--) {
            // Component 0 seeds the accumulator.
            const __m128i comps_0 = _mm_loadu_si128(
                    reinterpret_cast<const __m128i*>(partition));
            __m128i candidates = _mm_shuffle_epi8(
                    min4[0], _mm_and_si128(comps_0, low_bits_mask));

            // Components 1..3: one byte per code, low nibble is the index.
            for (int comp_i = 1; comp_i < 4; ++comp_i) {
                const __m128i comps = _mm_loadu_si128(
                        reinterpret_cast<const __m128i*>(partition + comp_i * comp_block_size));
                const __m128i partial = _mm_shuffle_epi8(
                        min4[comp_i], _mm_and_si128(comps, low_bits_mask));
                candidates = _mm_adds_epi8(candidates, partial);
            }

            // Components 4..7: two components per byte (low nibble, then high
            // nibble). The accumulation order 4, 5, 6, 7 is kept because
            // saturating addition is not associative.
            for (int pair_i = 0; pair_i < 2; ++pair_i) {
                const __m128i packed = _mm_loadu_si128(
                        reinterpret_cast<const __m128i*>(
                                partition + (4 + pair_i) * comp_block_size));

                const __m128i low_nibbles = _mm_and_si128(packed, low_bits_mask);
                candidates = _mm_adds_epi8(
                        candidates, _mm_shuffle_epi8(ft4_group[2 * pair_i], low_nibbles));

                // The 64-bit shift drags bits across byte boundaries; the mask
                // removes them, leaving each byte's original high nibble.
                const __m128i high_nibbles =
                        _mm_and_si128(_mm_srli_epi64(packed, 4), low_bits_mask);
                candidates = _mm_adds_epi8(
                        candidates, _mm_shuffle_epi8(ft4_group[2 * pair_i + 1], high_nibbles));
            }

            // One bit per lane whose accumulated value is below the quantized bound.
            int cmp = _mm_movemask_epi8(_mm_cmplt_epi8(candidates, bh_bound_quant));

            // Only the last block of a group may be partial; discard the
            // padding lanes so they can never be reported as matches.
            int current_block_actual_size = static_cast<int>(simd_pqcode_count);
            if (simd_block_count == 0) {
                const int tail_size = static_cast<int>(hdr->size % simd_pqcode_count);
                if (tail_size != 0) {
                    current_block_actual_size = tail_size;
                    cmp = cmp & BITMASK(tail_size);
                }
            }

            if (cmp) {
                // Handle lanes 0..7 (half 0), then lanes 8..15 (half 1).
                for (unsigned half = 0; half < 2; ++half) {
                    const std::uint8_t half_mask = static_cast<std::uint8_t>(cmp >> (8 * half));
                    if (half_mask == 0) {
                        continue;
                    }
                    const int match_count = _popcnt32(half_mask);
                    // masktable packs the set-bit positions of the 8-bit mask,
                    // one per byte starting at the least significant byte;
                    // the bias moves them into lanes 8..15 for the high half.
                    std::uint64_t match_pos =
                            masktable[half_mask] + half * 0x0808080808080808ULL;
                    for (int i = 0; i < match_count; ++i) {
                        const std::uint8_t pos = match_pos & 0xff;
                        match_pos >>= 8;
                        const float candidate = scan_pqcode_in_simd_block_1(
                                pos, partition, hdr->values, dists);
                        if (candidate < bh_bound) {
                            bh->push(labels[scan_pqcode_count + pos], candidate);
                            // Tighten both the exact and the quantized bound.
                            bh_bound = bh->max();
                            bh_bound_quant = _mm_set1_epi8(Q127(bh_bound, qmin, qmax));
                        }
                    }
                }
            }

            partition += simd_block_size;
            scan_pqcode_count += current_block_actual_size;
        }
    }
}