int32_t sse_sadbw_unrolled4_sumsignedbytes(int8_t* array, size_t size) {

    const __m128i zero = _mm_setzero_si128();
    __m128i positive = zero;
    __m128i negative = zero;

    for (size_t i=0; i < size; i += 16*4) {
        const __m128i v0 = _mm_loadu_si128((__m128i*)(array + i + 0*16));
        const __m128i v1 = _mm_loadu_si128((__m128i*)(array + i + 1*16));
        const __m128i v2 = _mm_loadu_si128((__m128i*)(array + i + 2*16));
        const __m128i v3 = _mm_loadu_si128((__m128i*)(array + i + 3*16));

        {
            const __m128i v  = v0;
            const __m128i m  = _mm_cmplt_epi8(v, zero);
            const __m128i t0 = _mm_sad_epu8(_mm_andnot_si128(m, v), zero);
            const __m128i va = _mm_abs_epi8(v);
            const __m128i t1 = _mm_sad_epu8(_mm_and_si128(m, va), zero);

            positive = _mm_add_epi32(positive, t0);
            negative = _mm_sub_epi32(negative, t1);
        }

        {
            const __m128i v  = v1;
            const __m128i m  = _mm_cmplt_epi8(v, zero);
            const __m128i t0 = _mm_sad_epu8(_mm_andnot_si128(m, v), zero);
            const __m128i va = _mm_abs_epi8(v);
            const __m128i t1 = _mm_sad_epu8(_mm_and_si128(m, va), zero);

            positive = _mm_add_epi32(positive, t0);
            negative = _mm_sub_epi32(negative, t1);
        }

        {
            const __m128i v  = v2;
            const __m128i m  = _mm_cmplt_epi8(v, zero);
            const __m128i t0 = _mm_sad_epu8(_mm_andnot_si128(m, v), zero);
            const __m128i va = _mm_abs_epi8(v);
            const __m128i t1 = _mm_sad_epu8(_mm_and_si128(m, va), zero);

            positive = _mm_add_epi32(positive, t0);
            negative = _mm_sub_epi32(negative, t1);
        }

        {
            const __m128i v  = v3;
            const __m128i m  = _mm_cmplt_epi8(v, zero);
            const __m128i t0 = _mm_sad_epu8(_mm_andnot_si128(m, v), zero);
            const __m128i va = _mm_abs_epi8(v);
            const __m128i t1 = _mm_sad_epu8(_mm_and_si128(m, va), zero);

            positive = _mm_add_epi32(positive, t0);
            negative = _mm_sub_epi32(negative, t1);
        }
    }

    const __m128i accumulator = _mm_add_epi32(positive, negative);

    return int32_t(_mm_extract_epi32(accumulator, 0)) +
           int32_t(_mm_extract_epi32(accumulator, 2));
}
int32_t sse_sadbw_sumsignedbytes(int8_t* array, size_t size) {

    const __m128i zero = _mm_setzero_si128();
    __m128i positive = zero;
    __m128i negative = zero;

    for (size_t i=0; i < size; i += 16) {
        const __m128i v  = _mm_loadu_si128((__m128i*)(array + i));
        const __m128i m  = _mm_cmplt_epi8(v, zero);
        const __m128i va = _mm_abs_epi8(v);

        // sum just positive numbers
        const __m128i t0 = _mm_sad_epu8(_mm_andnot_si128(m, v), zero);

        // sum just negative numbers
        const __m128i t1 = _mm_sad_epu8(_mm_and_si128(m, va), zero);

        positive = _mm_add_epi32(positive, t0);
        negative = _mm_sub_epi32(negative, t1);
    }

    const __m128i accumulator = _mm_add_epi32(positive, negative);

    return int32_t(_mm_extract_epi32(accumulator, 0)) +
           int32_t(_mm_extract_epi32(accumulator, 2));
}
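A minimal sketch of how the SADBW-based routine above might be exercised; it is not from the original source. It assumes `sse_sadbw_sumsignedbytes()` from the snippet above is in scope, that the build enables SSSE3/SSE4.1 (for `_mm_abs_epi8` and `_mm_extract_epi32`), and that `size` is a multiple of 16, as the SIMD loop itself requires. The scalar helper name is illustrative.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

// Hypothetical scalar reference, used only to cross-check the SIMD result.
static int32_t scalar_sumsignedbytes(const int8_t* array, size_t size) {
    int32_t sum = 0;
    for (size_t i = 0; i < size; i++)
        sum += array[i];
    return sum;
}

int main(void) {
    enum { N = 1024 };              // multiple of 16, as the SIMD loop assumes
    static int8_t data[N];
    for (size_t i = 0; i < N; i++)
        data[i] = (int8_t)rand();   // arbitrary signed byte values

    int32_t expected = scalar_sumsignedbytes(data, N);
    int32_t actual   = sse_sadbw_sumsignedbytes(data, N);
    printf("scalar=%d simd=%d %s\n", expected, actual,
           expected == actual ? "OK" : "MISMATCH");
    return expected != actual;
}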
__m128i test_mm_cmplt_epi8(__m128i A, __m128i B) {
  // DAG-LABEL: test_mm_cmplt_epi8
  // DAG: icmp sgt <16 x i8>
  //
  // ASM-LABEL: test_mm_cmplt_epi8
  // ASM: pcmpgtb
  return _mm_cmplt_epi8(A, B);
}
SIMDValue SIMDInt8x16Operation::OpLessThan(const SIMDValue& aValue, const SIMDValue& bValue)
{
    X86SIMDValue x86Result;
    X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
    X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);

    x86Result.m128i_value = _mm_cmplt_epi8(tmpaValue.m128i_value, tmpbValue.m128i_value); // compare a < b?

    return X86SIMDValue::ToSIMDValue(x86Result);
}
inline __m128i load_aligned_int32(const int8_t* src)
{
    __m128i tmp = _mm_loadl_epi64((const __m128i*)src);
#if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION
    __m128i res = _mm_cvtepi8_epi32(tmp);
#else
    __m128i mask = _mm_cmplt_epi8(tmp, _mm_set1_epi8(0));
    __m128i tmp1 = _mm_unpacklo_epi8(tmp, mask);
    mask = _mm_cmplt_epi16(tmp1, _mm_set1_epi16(0));
    __m128i res = _mm_unpacklo_epi16(tmp1, mask);
#endif
    return res;
}
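The pre-SSE4.1 branch above widens signed bytes without `pmovsxbd` by interleaving each value with its comparison mask: `_mm_cmplt_epi8(v, 0)` yields 0xFF exactly for negative bytes, which is the high byte a sign extension needs. A hedged, standalone sketch of the same idiom (not from xsimd), widening the low 8 bytes to 16-bit lanes on plain SSE2:

#include <emmintrin.h>

// Sign-extend the low 8 signed bytes of v to 8 signed 16-bit lanes (SSE2 only).
static inline __m128i widen_i8_to_i16_lo_sse2(__m128i v)
{
    __m128i sign = _mm_cmplt_epi8(v, _mm_setzero_si128()); // 0xFF for negative bytes
    return _mm_unpacklo_epi8(v, sign);                     // value byte low, sign byte high
}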
int main()
{
    ssereg a(0xffffffff);
    ssereg b(0x00000000);
    ssereg c = _mm_cmplt_epi8(a, b);

    a.print();
    b.print();
    c.print();

    return 0;
}
SIMDValue SIMDUint8x16Operation::OpLessThan(const SIMDValue& aValue, const SIMDValue& bValue)
{
    X86SIMDValue x86Result;
    X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
    X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);
    X86SIMDValue signBits = { { 0x80808080, 0x80808080, 0x80808080, 0x80808080 } };

    // A signed comparison gives the unsigned ordering once the sign bit of each byte is flipped (XORed with 0x80)
    tmpaValue.m128i_value = _mm_xor_si128(tmpaValue.m128i_value, signBits.m128i_value);
    tmpbValue.m128i_value = _mm_xor_si128(tmpbValue.m128i_value, signBits.m128i_value);
    x86Result.m128i_value = _mm_cmplt_epi8(tmpaValue.m128i_value, tmpbValue.m128i_value); // compare a < b?

    return X86SIMDValue::ToSIMDValue(x86Result);
}
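A hedged sketch of the sign-bit-flip trick used above, in isolation: SSE2 has no unsigned byte compare, but XORing both operands with 0x80 maps the unsigned range [0, 255] onto the signed range [-128, 127] while preserving order, so a signed compare then produces the unsigned result. The helper name is illustrative.

#include <emmintrin.h>

// Unsigned byte compare a < b, built from the signed _mm_cmplt_epi8.
static inline __m128i cmplt_epu8(__m128i a, __m128i b)
{
    const __m128i bias = _mm_set1_epi8((char)0x80);
    return _mm_cmplt_epi8(_mm_xor_si128(a, bias), _mm_xor_si128(b, bias));
}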
/// any (*p > 3) is set to be 3
COREARRAY_DLL_DEFAULT void vec_u8_geno_valid(C_UInt8 *p, size_t n)
{
#if defined(COREARRAY_SIMD_SSE2)

    // header 1, 16-byte aligned
    size_t h = (16 - ((size_t)p & 0x0F)) & 0x0F;
    for (; (n > 0) && (h > 0); n--, h--, p++)
        if (*p > 3) *p = 3;

    const __m128i zero = _mm_setzero_si128();
    const __m128i three = _mm_set1_epi8(3);
    for (; n >= 16; n-=16, p+=16)
    {
        __m128i v = _mm_load_si128((__m128i*)p);
        __m128i mask = _mm_or_si128(_mm_cmplt_epi8(v, zero), _mm_cmplt_epi8(three, v));
        if (_mm_movemask_epi8(mask) > 0)
            _mm_store_si128((__m128i*)p, _mm_min_epu8(v, three));
    }

#endif

    for (; n > 0; n--, p++)
        if (*p > 3) *p = 3;
}
//
// If we assumed AVX512, this routine could be implemented more efficiently
// and straightforwardly with _mm_mask_storeu_epi8().
//
inline void _assembler_store_partial(__m128i *dst, int s, int p,
                                     __m128i x0, __m128i x1, __m128i x2, __m128i x3,
                                     __m128i x4, __m128i x5, __m128i x6, __m128i x7)
{
    static const __m128i r = _mm_set_epi8(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
    __m128i mask = _mm_cmplt_epi8(r, _mm_set1_epi8(p));

    x0 = _mm_and_si128(mask, x0);
    x1 = _mm_and_si128(mask, x1);
    x2 = _mm_and_si128(mask, x2);
    x3 = _mm_and_si128(mask, x3);
    x4 = _mm_and_si128(mask, x4);
    x5 = _mm_and_si128(mask, x5);
    x6 = _mm_and_si128(mask, x6);
    x7 = _mm_and_si128(mask, x7);

    __m128i y0 = _mm_loadu_si128(dst);
    __m128i y1 = _mm_loadu_si128(dst+s);
    __m128i y2 = _mm_loadu_si128(dst+2*s);
    __m128i y3 = _mm_loadu_si128(dst+3*s);
    __m128i y4 = _mm_loadu_si128(dst+4*s);
    __m128i y5 = _mm_loadu_si128(dst+5*s);
    __m128i y6 = _mm_loadu_si128(dst+6*s);
    __m128i y7 = _mm_loadu_si128(dst+7*s);

    y0 = _mm_andnot_si128(mask, y0);
    y1 = _mm_andnot_si128(mask, y1);
    y2 = _mm_andnot_si128(mask, y2);
    y3 = _mm_andnot_si128(mask, y3);
    y4 = _mm_andnot_si128(mask, y4);
    y5 = _mm_andnot_si128(mask, y5);
    y6 = _mm_andnot_si128(mask, y6);
    y7 = _mm_andnot_si128(mask, y7);

    y0 = _mm_or_si128(x0, y0);
    y1 = _mm_or_si128(x1, y1);
    y2 = _mm_or_si128(x2, y2);
    y3 = _mm_or_si128(x3, y3);
    y4 = _mm_or_si128(x4, y4);
    y5 = _mm_or_si128(x5, y5);
    y6 = _mm_or_si128(x6, y6);
    y7 = _mm_or_si128(x7, y7);

    _assembler_store_full(dst, s, y0, y1, y2, y3, y4, y5, y6, y7);
}
int exponent_sum_square_error_sse2(uint8_t *exp0, uint8_t *exp1, int ncoefs)
{
    int i, err;
    int exp_error = 0;
    union {
        __m128i v;
        int32_t res[4];
    } ures;
    __m128i vzero = _mm_setzero_si128();
    __m128i vres = vzero;

    for (i = 0; i < (ncoefs & ~15); i+=16) {
        __m128i vexp = _mm_loadu_si128((__m128i*)&exp0[i]);
        __m128i vexp2 = _mm_loadu_si128((__m128i*)&exp1[i]);
#if 0 //safer but needed?
        __m128i vexphi = _mm_unpackhi_epi8(vexp, vzero);
        __m128i vexp2hi = _mm_unpackhi_epi8(vexp2, vzero);
        __m128i vexplo = _mm_unpacklo_epi8(vexp, vzero);
        __m128i vexp2lo = _mm_unpacklo_epi8(vexp2, vzero);
        __m128i verrhi = _mm_sub_epi16(vexphi, vexp2hi);
        __m128i verrlo = _mm_sub_epi16(vexplo, vexp2lo);
#else
        __m128i verr = _mm_sub_epi8(vexp, vexp2);
        __m128i vsign = _mm_cmplt_epi8(verr, vzero);
        __m128i verrhi = _mm_unpackhi_epi8(verr, vsign);
        __m128i verrlo = _mm_unpacklo_epi8(verr, vsign);
#endif
        verrhi = _mm_madd_epi16(verrhi, verrhi);
        verrlo = _mm_madd_epi16(verrlo, verrlo);
        verrhi = _mm_add_epi32(verrhi, verrlo);
        vres = _mm_add_epi32(vres, verrhi);
    }
    _mm_store_si128(&ures.v, vres);
    ures.res[0]+=ures.res[1];
    ures.res[2]+=ures.res[3];
    exp_error += ures.res[0]+ures.res[2];

    for (; i < ncoefs; ++i) {
        err = exp0[i] - exp1[i];
        exp_error += (err * err);
    }
    return exp_error;
}
void filterScanlinesSSE( unsigned char* filtered, unsigned char* image,
                         unsigned int WIDTH, unsigned int HEIGHT )
{
    int blocks = 3*WIDTH/16;

    // Create move-mask for last block of each scanline
    __m128i mask = _mm_cmplt_epi8( _mm_set_epi8( 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ),
                                   _mm_set1_epi8( 3*WIDTH-16*blocks ) );
    {
        const unsigned char* in = image;
        unsigned char* out = filtered;
        *out++ = 0;
        for(int b=0; b<blocks; b++ ) {
            _mm_storeu_si128( (__m128i*)out, _mm_lddqu_si128( (__m128i const*)in ) );
            in += 16;
            out += 16;
        }
        _mm_maskmoveu_si128( _mm_lddqu_si128( (__m128i const*)in ), mask, (char*)out );
    }
    for( unsigned int j=1; j<HEIGHT; j++ ) {
        const unsigned char* in = image + 3*WIDTH*(j-1);
        unsigned char* out = filtered + (3*WIDTH+1)*j;
        *out++ = 2;
        for(int b=0; b<blocks; b++ ) {
            __m128i _t0 = _mm_lddqu_si128( (__m128i const*)in );
            __m128i _t1 = _mm_lddqu_si128( (__m128i const*)(in + 3*WIDTH ) );
            _mm_storeu_si128( (__m128i*)out, _mm_sub_epi8( _t1, _t0 ) );
            in += 16;
            out += 16;
        }
        _mm_maskmoveu_si128( _mm_lddqu_si128( (__m128i const*)in ), mask, (char*)out );
    }
}
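The scanline filter above (and the partial store in _assembler_store_partial earlier) builds its tail mask the same way: compare a 0..15 lane-index vector against the leftover byte count, so lanes below the count become 0xFF and the rest 0x00. A hedged, standalone sketch of that idiom (names are illustrative, not from the original); it assumes 0 <= n <= 16, which also keeps the signed compare safe:

#include <emmintrin.h>

// Store only the first n bytes of v to dst, leaving the remaining bytes untouched.
static inline void store_first_n_bytes(unsigned char* dst, __m128i v, int n)
{
    const __m128i index = _mm_set_epi8(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
    __m128i mask = _mm_cmplt_epi8(index, _mm_set1_epi8((char)n)); // 0xFF where lane index < n
    _mm_maskmoveu_si128(v, mask, (char*)dst);                     // byte-masked store
}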
static void add_child16(art_node16 *n, art_node **ref, unsigned char c, void *child) {
    if (n->n.num_children < 16) {
        __m128i cmp;

        // Compare the key to all 16 stored keys
        cmp = _mm_cmplt_epi8(_mm_set1_epi8(c),
                             _mm_loadu_si128((__m128i*)n->keys));

        // Use a mask to ignore children that don't exist
        unsigned mask = (1 << n->n.num_children) - 1;
        unsigned bitfield = _mm_movemask_epi8(cmp) & mask;

        // Check if less than any
        unsigned idx;
        if (bitfield) {
            idx = __builtin_ctz(bitfield);
            memmove(n->keys+idx+1,n->keys+idx,n->n.num_children-idx);
            memmove(n->children+idx+1,n->children+idx,
                    (n->n.num_children-idx)*sizeof(void*));
        } else
            idx = n->n.num_children;

        // Set the child
        n->keys[idx] = c;
        n->children[idx] = (art_node*)child;
        n->n.num_children++;

    } else {
        art_node48 *new_node = (art_node48*)alloc_node(NODE48);

        // Copy the child pointers and populate the key map
        memcpy(new_node->children, n->children,
                sizeof(void*)*n->n.num_children);
        for (int i=0;i<n->n.num_children;i++) {
            new_node->keys[n->keys[i]] = i + 1;
        }
        copy_header((art_node*)new_node, (art_node*)n);
        *ref = (art_node*)new_node;
        free(n);
        add_child48(new_node, ref, c, child);
    }
}
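The branch that finds the insertion slot above is a compact SIMD lower-bound: compare the new key against all 16 stored keys at once, mask off lanes beyond num_children, and take the first set bit. A hedged sketch of that search in isolation (function and parameter names are illustrative, not from the ART implementation). Note that, like the snippet above, it relies on a signed byte compare, so it only orders keys correctly while they stay within 0..127.

#include <emmintrin.h>

// Index of the first key greater than c among the first `count` sorted bytes
// of keys[16]; returns count if c is greater than or equal to all of them.
static unsigned lower_bound16(const unsigned char keys[16], unsigned count,
                              unsigned char c)
{
    __m128i cmp = _mm_cmplt_epi8(_mm_set1_epi8((char)c),
                                 _mm_loadu_si128((const __m128i*)keys));
    unsigned bitfield = _mm_movemask_epi8(cmp) & ((1u << count) - 1);
    return bitfield ? (unsigned)__builtin_ctz(bitfield) : count;
}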
static inline void calc_lbp_16_strip(IplImage * src, IplImage * dst, unsigned base)
{
    const signed char* src_data = (signed char*)(src->imageData + base);
    unsigned char * dst_data = (unsigned char*)(dst->imageData + base);
    const signed char* const src_end = (signed char*)src->imageData + (src->height-1) * src->widthStep;

    __m128i pixels[3];

    // Load first two rows
    //pixels[0] = *(__m128i*)src_data;//_mm_set_epi64(*(__m64*)(src_data+8), *(__m64*)(src_data));
    pixels[0] = _mm_set_epi64(*(__m64*)(src_data+8), *(__m64*)(src_data));
    //pixels[0] = _mm_xor_si128(pixels[0], sign_bit.q); // conversion from unsigned to signed - invert sign bit
    src_data += src->widthStep;

    //pixels[1] = *(__m128i*)src_data;//_mm_set_epi64(*(__m64*)(src_data+8), *(__m64*)(src_data));
    pixels[1] = _mm_set_epi64(*(__m64*)(src_data+8), *(__m64*)(src_data));
    //pixels[1] = _mm_xor_si128(pixels[1], sign_bit.q);
    src_data += src->widthStep;

    int phase = 2;

    __m128i * phase_map[3][3] = {
        {pixels+1, pixels+2, pixels},
        {pixels+2, pixels, pixels+1},
        {pixels, pixels+1, pixels+2},
    };

    while (src_data < src_end)
    {
        register __m128i weight = ones.q;
        register __m128i code = _mm_setzero_si128();

        //pixels[phase] = _mm_set_epi64(*(__m64*)(src_data+8), *(__m64*)(src_data));
        //pixels[phase] = _mm_xor_si128(pixels[phase], sign_bit.q);
        //pixels[phase] = _mm_xor_si128(_mm_lddqu_si128((__m128i*)src_data), sign_bit.q);
        pixels[phase] = _mm_lddqu_si128((__m128i*)src_data);

        src_data += src->widthStep;
        dst_data += dst->widthStep;

        _mm_prefetch(src_data, _MM_HINT_T0);

        register __m128i a = *(phase_map[phase][0]);
        register __m128i b = *(phase_map[phase][1]);
        register __m128i c = *(phase_map[phase][2]);

        phase++;
        phase = (phase == 3) ? 0 : phase;

        // X . .    A
        // . o .    B
        // . . .    C
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_slli_si128(a, 1)), weight));
        weight = _mm_slli_epi64(weight, 1);

        // . X .
        // .   .
        // . . .
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, a), weight));
        weight = _mm_slli_epi64(weight, 1);

        // . . X
        // .   .
        // . . .
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_srli_si128(a, 1)), weight));
        weight = _mm_slli_epi64(weight, 1);

        // . . .
        // .   X
        // . . .
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_srli_si128(b, 1)), weight));
        weight = _mm_slli_epi64(weight, 1);

        // . . .
        // .   .
        // . . X
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_srli_si128(c, 1)), weight));
        weight = _mm_slli_epi64(weight, 1);

        // . . .
        // .   .
        // . X .
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, c), weight));
        weight = _mm_slli_epi64(weight, 1);

        // . . .
        // .   .
        // X . .
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_slli_si128(c, 1)), weight));
        weight = _mm_slli_epi64(weight, 1);

        // . . .
        // X   .
        // . . .
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_slli_si128(b, 1)), weight));

        _mm_maskmoveu_si128(code, lbp_valid_mask.q, (char*)dst_data); // store the results - unaligned write
    }
}
bool validate_utf8_sse(const char *src, size_t len) {
  const char *end = src + len;
  while (src + 16 < end) {
    __m128i chunk = _mm_loadu_si128((const __m128i *)(src));

    int asciiMask = _mm_movemask_epi8(chunk);
    if (!asciiMask) {
      src += 16;
      continue;
    }

    __m128i chunk_signed = _mm_add_epi8(chunk, _mm_set1_epi8(0x80));
    __m128i cond2 =
        _mm_cmplt_epi8(_mm_set1_epi8(0xc2 - 1 - 0x80), chunk_signed);
    __m128i state = _mm_set1_epi8((char)(0x0 | 0x80));

    state = _mm_blendv_epi8(state, _mm_set1_epi8((char)(0x2 | 0xc0)), cond2);

    __m128i cond3 =
        _mm_cmplt_epi8(_mm_set1_epi8(0xe0 - 1 - 0x80), chunk_signed);

    state = _mm_blendv_epi8(state, _mm_set1_epi8((char)(0x3 | 0xe0)), cond3);
    __m128i mask3 = _mm_slli_si128(cond3, 1);

    __m128i cond4 =
        _mm_cmplt_epi8(_mm_set1_epi8(0xf0 - 1 - 0x80), chunk_signed);

    // Fall back to the scalar processing
    if (_mm_movemask_epi8(cond4)) {
      break;
    }

    __m128i count = _mm_and_si128(state, _mm_set1_epi8(0x7));

    __m128i count_sub1 = _mm_subs_epu8(count, _mm_set1_epi8(0x1));

    __m128i counts = _mm_add_epi8(count, _mm_slli_si128(count_sub1, 1));

    __m128i shifts = count_sub1;
    shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 1));
    counts = _mm_add_epi8(
        counts, _mm_slli_si128(_mm_subs_epu8(counts, _mm_set1_epi8(0x2)), 2));
    shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 2));

    if (asciiMask ^
        _mm_movemask_epi8(_mm_cmpgt_epi8(counts, _mm_set1_epi8(0))))
      return false; // error
    shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 4));

    if (_mm_movemask_epi8(_mm_cmpgt_epi8(
            _mm_sub_epi8(_mm_slli_si128(counts, 1), counts),
            _mm_set1_epi8(1))))
      return false; // error

    shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 8));

    __m128i mask = _mm_and_si128(state, _mm_set1_epi8(0xf8));
    shifts =
        _mm_and_si128(shifts, _mm_cmplt_epi8(counts, _mm_set1_epi8(2))); // <=1

    chunk =
        _mm_andnot_si128(mask, chunk); // from now on, we only have usefull bits

    shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 1),
                             _mm_srli_si128(_mm_slli_epi16(shifts, 7), 1));

    __m128i chunk_right = _mm_slli_si128(chunk, 1);

    __m128i chunk_low = _mm_blendv_epi8(
        chunk,
        _mm_or_si128(chunk, _mm_and_si128(_mm_slli_epi16(chunk_right, 6),
                                          _mm_set1_epi8(0xc0))),
        _mm_cmpeq_epi8(counts, _mm_set1_epi8(1)));

    __m128i chunk_high =
        _mm_and_si128(chunk, _mm_cmpeq_epi8(counts, _mm_set1_epi8(2)));

    shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 2),
                             _mm_srli_si128(_mm_slli_epi16(shifts, 6), 2));
    chunk_high = _mm_srli_epi32(chunk_high, 2);

    shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 4),
                             _mm_srli_si128(_mm_slli_epi16(shifts, 5), 4));
    chunk_high = _mm_or_si128(
        chunk_high, _mm_and_si128(_mm_and_si128(_mm_slli_epi32(chunk_right, 4),
                                                _mm_set1_epi8(0xf0)),
                                  mask3));

    int c = _mm_extract_epi16(counts, 7);
    int source_advance = !(c & 0x0200) ? 16 : !(c & 0x02) ? 15 : 14;

    __m128i high_bits = _mm_and_si128(chunk_high, _mm_set1_epi8(0xf8));
    if (!_mm_testz_si128(
            mask3,
            _mm_or_si128(_mm_cmpeq_epi8(high_bits, _mm_set1_epi8(0x00)),
                         _mm_cmpeq_epi8(high_bits, _mm_set1_epi8(0xd8)))))
      return false;

    shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 8),
                             _mm_srli_si128(_mm_slli_epi16(shifts, 4), 8));

    chunk_high = _mm_slli_si128(chunk_high, 1);

    __m128i shuf =
        _mm_add_epi8(shifts, _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
                                          5, 4, 3, 2, 1, 0));

    chunk_low = _mm_shuffle_epi8(chunk_low, shuf);
    chunk_high = _mm_shuffle_epi8(chunk_high, shuf);

    __m128i utf16_low = _mm_unpacklo_epi8(chunk_low, chunk_high);
    __m128i utf16_high = _mm_unpackhi_epi8(chunk_low, chunk_high);

    if (_mm_cmpestrc(_mm_cvtsi64_si128(0xfdeffdd0fffffffe), 4, utf16_high, 8,
                     _SIDD_UWORD_OPS | _SIDD_CMP_RANGES) |
        _mm_cmpestrc(_mm_cvtsi64_si128(0xfdeffdd0fffffffe), 4, utf16_low, 8,
                     _SIDD_UWORD_OPS | _SIDD_CMP_RANGES)) {
      return false;
    }

    src += source_advance;
  }
  return validate_utf8(src, end - src);
}
FORCE_INLINE
static inline void fast_scan_1(const std::uint8_t* partition,
        const unsigned* labels, const float* dists, const __m128i (&min4)[4],
        __m128i (&ft4)[4][16], const float qmin, const float qmax, binheap* bh,
        unsigned scan_pqcode_count) {

    const unsigned simd_pqcode_count = 16;
    const int comp_block_size = 16;
    const unsigned simd_block_size = simd_pqcode_count * (4 * 1 + 4 * 0.5);
    const group_header* hdr;

    float bh_bound = qmax;
    __m128i bh_bound_quant = _mm_set1_epi8(Q127(bh_bound, qmin, bh_bound)); // CHK. Is 127

    for (;;) {
        // Parse group header
        hdr = reinterpret_cast<const group_header*>(partition);
        // Check if last group (All bits of size set to 1)
        if (hdr->size == std::numeric_limits<decltype(hdr->size)>::max()) {
            return;
        }
        partition += sizeof(*hdr);
        unsigned simd_block_count = (static_cast<unsigned>(hdr->size)
                + simd_pqcode_count - 1) / simd_pqcode_count;

        // Load tables
        __m128i ft4_group[4];
        ft4_group[0] = ft4[0][hdr->values[0] >> 4];
        ft4_group[1] = ft4[1][hdr->values[1] >> 4];
        ft4_group[2] = ft4[2][hdr->values[2] >> 4];
        ft4_group[3] = ft4[3][hdr->values[3] >> 4];

        // Scan SIMD Blocks
        while (simd_block_count--) {
            const __m128i low_bits_mask = _mm_set_epi64x(0x0f0f0f0f0f0f0f0f,
                    0x0f0f0f0f0f0f0f0f);

            // Component 0
            const __m128i comps_0 = _mm_loadu_si128(
                    reinterpret_cast<const __m128i *>(partition));
            const __m128i masked_comps_0 = _mm_and_si128(comps_0, low_bits_mask);
            __m128i candidates = _mm_shuffle_epi8(min4[0], masked_comps_0);

            // Components 1..3
            for (int comp_i = 1; comp_i < 4; ++comp_i) {
                const __m128i comps = _mm_loadu_si128(
                        reinterpret_cast<const __m128i *>(partition
                                + comp_i * comp_block_size));
                const __m128i masked_comps = _mm_and_si128(comps, low_bits_mask);
                const __m128i partial = _mm_shuffle_epi8(min4[comp_i], masked_comps);
                candidates = _mm_adds_epi8(candidates, partial);
            }

            // Components 4-5
            __m128i comps_45 = _mm_loadu_si128(
                    reinterpret_cast<const __m128i *>(partition
                            + 4 * comp_block_size));
            const __m128i masked_comps_4 = _mm_and_si128(comps_45, low_bits_mask);
            const __m128i partial_4 = _mm_shuffle_epi8(ft4_group[0], masked_comps_4);
            candidates = _mm_adds_epi8(candidates, partial_4);
            comps_45 = _mm_srli_epi64(comps_45, 4);
            const __m128i masked_comps_5 = _mm_and_si128(comps_45, low_bits_mask);
            const __m128i partial_5 = _mm_shuffle_epi8(ft4_group[1], masked_comps_5);
            candidates = _mm_adds_epi8(candidates, partial_5);

            // Components 6-7
            __m128i comps_67 = _mm_loadu_si128(
                    reinterpret_cast<const __m128i *>(partition
                            + 5 * comp_block_size));
            const __m128i masked_comps_6 = _mm_and_si128(comps_67, low_bits_mask);
            const __m128i partial_6 = _mm_shuffle_epi8(ft4_group[2], masked_comps_6);
            candidates = _mm_adds_epi8(candidates, partial_6);
            const __m128i comps_7 = _mm_srli_epi64(comps_67, 4);
            const __m128i masked_comp_7 = _mm_and_si128(comps_7, low_bits_mask);
            const __m128i partial_7 = _mm_shuffle_epi8(ft4_group[3], masked_comp_7);
            candidates = _mm_adds_epi8(candidates, partial_7);

            // Compare
            const __m128i compare = _mm_cmplt_epi8(candidates, bh_bound_quant);
            int cmp = _mm_movemask_epi8(compare);
            //std::uint64_t cmp_low = (_mm_cvtsi128_si64(compare));
            //std::uint64_t cmp_high = (_mm_extract_epi64(compare, 1));

            // Compute current block size
            int current_block_actual_size = 0;
            if(simd_block_count == 0) {
                current_block_actual_size = hdr->size % simd_pqcode_count;
                if(current_block_actual_size == 0) {
                    current_block_actual_size = simd_pqcode_count;
                } else {
                    /*__m128i mask;
                    compute_simd_mask(current_block_actual_size, mask);
                    compare = _mm_and_si128(compare, mask);*/
                    /* std::uint64_t low_mask;
                    std::uint64_t high_mask;
                    compute_high_low_mask(current_block_actual_size, low_mask, high_mask);
                    cmp_low = cmp_low & low_mask;
                    cmp_high = cmp_high & high_mask; */
                    cmp = cmp & BITMASK(current_block_actual_size);
                }
            } else {
                current_block_actual_size = simd_pqcode_count;
            }

            if(cmp) {
                // Check low quadword
                const std::uint8_t cmp_low = cmp & 0xff;
                if (cmp_low) {
                    /*const std::uint64_t low_possible_positions = 0x0706050403020100;
                    const std::uint64_t match_positions = _pext_u64(
                            low_possible_positions, cmp_low);*/
                    const int match_count = _popcnt32(cmp_low);
                    std::uint64_t match_pos = masktable[cmp_low];
                    for (int i = 0; i < match_count; ++i) {
                        const std::uint8_t pos = match_pos & 0xff;
                        match_pos >>= 8;
                        const float candidate = scan_pqcode_in_simd_block_1(pos,
                                partition, hdr->values, dists);
                        if (candidate < bh_bound) {
                            bh->push(labels[scan_pqcode_count + pos], candidate);
                            bh_bound = bh->max();
                            bh_bound_quant = _mm_set1_epi8(
                                    Q127(bh_bound, qmin, qmax));
                        }
                    }
                }
                // Check high quadword
                const std::uint8_t cmp_high = (cmp >> 8);
                if (cmp_high) {
                    /*const std::uint64_t high_possible_positions = 0x0f0e0d0c0b0a0908;
                    const std::uint64_t match_positions = _pext_u64(
                            high_possible_positions, cmp_high);*/
                    const int match_count = _popcnt32(cmp_high);
                    std::uint64_t match_pos = masktable[cmp_high] + 0x0808080808080808;
                    for (int i = 0; i < match_count; ++i) {
                        const std::uint8_t pos = match_pos & 0xff;
                        match_pos >>= 8;
                        const float candidate = scan_pqcode_in_simd_block_1(pos,
                                partition, hdr->values, dists);
                        if (candidate < bh_bound) {
                            bh->push(labels[scan_pqcode_count + pos], candidate);
                            bh_bound = bh->max();
                            bh_bound_quant = _mm_set1_epi8(
                                    Q127(bh_bound, qmin, qmax));
                        }
                    }
                }
            }
            partition += simd_block_size;
            scan_pqcode_count += current_block_actual_size;
        }
    }
}