Example #1
0
/*
 * Return the number of set bits in the 32-bit word v (a value in 0..32).
 *
 * Only the `const` attribute is applied: the result depends solely on the
 * argument and no memory is read. `const` is stricter than `pure`, and the
 * two attributes are mutually exclusive, so listing both is diagnosed by
 * GCC rather than being treated as "extra strong".
 *
 * The compiler builtin is used instead of the `_popcnt32` intrinsic so the
 * function builds without `-mpopcnt` and without <immintrin.h>; when the
 * POPCNT instruction is enabled for the target the builtin still lowers to
 * that single instruction.
 */
static inline __attribute__((const)) uint32_t
xpopcnt32(uint32_t v)
{
	/* The builtin returns a non-negative int, converted to uint32_t here. */
	return __builtin_popcount(v);
}
Example #2
0
/**
 * Scans a partition made of consecutive groups and pushes every code whose
 * distance is below the current heap bound into the heap `bh`.
 *
 * Layout as consumed by this function:
 *  - each group starts with a `group_header`; a header whose `size` has all
 *    bits set terminates the partition and makes the function return;
 *  - the header is followed by ceil(size / 16) blocks of 96 bytes, each
 *    block describing up to 16 codes: four 16-byte runs whose low nibbles
 *    index `min4[0..3]`, then two 16-byte runs whose low and high nibbles
 *    index the four tables selected for the group from `ft4`.
 *
 * For every block the eight 8-bit table lookups are summed with signed
 * saturating adds and compared against the quantized heap bound. Codes that
 * pass this filter are re-evaluated with `scan_pqcode_in_simd_block_1` and
 * pushed when the returned value is below the current (float) bound, after
 * which the bound and its quantized copy are refreshed from `bh->max()`.
 *
 * @param partition          first byte of the first group header
 * @param labels             label array indexed by running code position
 * @param dists              forwarded untouched to scan_pqcode_in_simd_block_1
 * @param min4               four 16-entry 8-bit tables for components 0..3
 * @param ft4                per-component tables; one per component is
 *                           selected per group via `hdr->values[i] >> 4`
 * @param qmin, qmax         range handed to Q127 when quantizing the bound
 * @param bh                 heap receiving (label, distance) pairs
 * @param scan_pqcode_count  running position of the first code of the next
 *                           block inside `labels`
 *
 * NOTE(review): blocks are always read in full (96 bytes), including the
 * last, possibly partial block of a group — presumably the storage is
 * padded accordingly; confirm against the code that builds the partition.
 */
FORCE_INLINE
static inline void fast_scan_1(const std::uint8_t* partition, const unsigned* labels,
		const float* dists, const __m128i (&min4)[4], __m128i (&ft4)[4][16],
		const float qmin, const float qmax, binheap* bh, unsigned scan_pqcode_count) {
	// Geometry of one SIMD block.
	const unsigned codes_per_block = 16;
	const int bytes_per_comp = 16;
	// 4 runs with one byte per code + 2 runs holding two nibbles per code.
	const unsigned bytes_per_block = codes_per_block * 6;
	// Every byte set to 0x0f: keeps the low nibble of each lane.
	const __m128i nibble_mask = _mm_set1_epi8(0x0f);

	// Pruning threshold, kept both as float and as a broadcast 8-bit value.
	// threshold == qmax at this point, so the value and the upper range
	// limit given to Q127 coincide (original note: "CHK. Is 127").
	float threshold = qmax;
	__m128i threshold_q = _mm_set1_epi8(Q127(threshold, qmin, threshold));

	for (;;) {
		// Interpret the current position as a group header.
		const group_header* const group =
				reinterpret_cast<const group_header*>(partition);
		// A size with all bits set marks the end of the partition.
		if (group->size == std::numeric_limits<decltype(group->size)>::max()) {
			return;
		}
		partition += sizeof(*group);

		// Number of 16-code blocks needed for this group (rounded up).
		unsigned blocks_left = (static_cast<unsigned>(group->size)
				+ codes_per_block - 1) / codes_per_block;

		// Pick the table of each of the four packed components for this group.
		__m128i group_tables[4];
		for (int t = 0; t < 4; ++t) {
			group_tables[t] = ft4[t][group->values[t] >> 4];
		}

		while (blocks_left != 0) {
			--blocks_left;

			// Component 0 seeds the accumulator ...
			const __m128i first_run = _mm_loadu_si128(
					reinterpret_cast<const __m128i *>(partition));
			__m128i acc = _mm_shuffle_epi8(min4[0],
					_mm_and_si128(first_run, nibble_mask));

			// ... components 1..3 are added with signed saturation.
			for (int c = 1; c < 4; ++c) {
				const __m128i run = _mm_loadu_si128(
						reinterpret_cast<const __m128i *>(partition
								+ c * bytes_per_comp));
				acc = _mm_adds_epi8(acc, _mm_shuffle_epi8(min4[c],
						_mm_and_si128(run, nibble_mask)));
			}

			// Components 4-5 and 6-7: each run carries two nibbles per code,
			// low nibble first, then the high nibble shifted down.
			for (int p = 0; p < 2; ++p) {
				const __m128i packed = _mm_loadu_si128(
						reinterpret_cast<const __m128i *>(partition
								+ (4 + p) * bytes_per_comp));

				const __m128i low_idx = _mm_and_si128(packed, nibble_mask);
				acc = _mm_adds_epi8(acc,
						_mm_shuffle_epi8(group_tables[2 * p], low_idx));

				const __m128i high_idx = _mm_and_si128(
						_mm_srli_epi64(packed, 4), nibble_mask);
				acc = _mm_adds_epi8(acc,
						_mm_shuffle_epi8(group_tables[2 * p + 1], high_idx));
			}

			// One bit per lane whose sum is below the quantized threshold.
			int hits = _mm_movemask_epi8(_mm_cmplt_epi8(acc, threshold_q));

			// Only the last block of a group may hold fewer than 16 codes;
			// in that case the bits of the unused lanes are cleared.
			int codes_in_block = codes_per_block;
			if (blocks_left == 0) {
				const int tail = group->size % codes_per_block;
				if (tail != 0) {
					codes_in_block = tail;
					hits = hits & BITMASK(tail);
				}
			}

			if (hits) {
				// half 0 covers lanes 0..7, half 1 covers lanes 8..15.
				for (int half = 0; half < 2; ++half) {
					const std::uint8_t half_bits =
							static_cast<std::uint8_t>(hits >> (8 * half));
					if (!half_bits) {
						continue;
					}

					// masktable is defined elsewhere; it is consumed here as
					// one lane index per byte, lowest byte first, with as
					// many valid bytes as there are set bits in half_bits.
					std::uint64_t positions = masktable[half_bits];
					if (half == 1) {
						// Shift every lane index into the 8..15 range.
						positions += 0x0808080808080808;
					}

					for (int remaining = _popcnt32(half_bits); remaining > 0;
							--remaining) {
						const std::uint8_t pos = positions & 0xff;
						positions >>= 8;

						const float dist = scan_pqcode_in_simd_block_1(pos,
								partition, group->values, dists);
						if (dist < threshold) {
							bh->push(labels[scan_pqcode_count + pos], dist);
							// Tighten both forms of the threshold right away
							// so later candidates are checked against it.
							threshold = bh->max();
							threshold_q = _mm_set1_epi8(
									Q127(threshold, qmin, qmax));
						}
					}
				}
			}

			// Advance to the next block and account for the codes just seen.
			partition += bytes_per_block;
			scan_pqcode_count += codes_in_block;
		}
	}
}