/*
 * Probe the hash set for `target` using 256-bit vector operations.
 *
 * The target is broadcast to all four 64-bit lanes, run through avxhash()
 * together with set->vmultiplier, and masked with set->sizemask to obtain
 * four indices. The 64-bit entries of set->data at those indices are gathered
 * and compared against the target.
 *
 * Returns true when at least one gathered entry equals `target`.
 *
 * NOTE(review): presumably set->vmultiplier carries a different multiplier
 * per lane so that the four probes hit different slots -- confirm against
 * the hashset_t definition.
 */
static inline bool avxcontains(hashset_t * set, uint64_t target)
{
	const __m256i needle = _mm256_set1_epi64x(target);

	/* Per-lane slot index: hash, then clamp with the size mask. */
	const __m256i hashed = avxhash(needle, set->vmultiplier);
	const __m256i slots = _mm256_and_si256(hashed, set->sizemask);

	/* Fetch the four candidate entries (scale 8 = one 64-bit element per index). */
	const __m256i found = _mm256_i64gather_epi64((const long long int *) set->data, slots, 8);

	/* Matching lanes become all-ones; testz yields 1 only if no bit is set. */
	const __m256i hits = _mm256_cmpeq_epi64(needle, found);
	return !_mm256_testz_si256(hits, hits);
}
// Thin wrapper that returns _mm256_cmpeq_epi64(a, b) unchanged.
// NOTE(review): the pattern comment inside the body looks like an LLVM
// FileCheck directive expecting a 4 x i64 equality compare in the emitted IR;
// keep its text exactly as written -- confirm against the test's RUN lines.
__m256i test_mm256_cmpeq_epi64(__m256i a, __m256i b) {
  // CHECK: icmp eq <4 x i64>
  return _mm256_cmpeq_epi64(a, b);
}
/**
 * Replace, in place, every element of p[0..n) that equals `val` with
 * `substitute`.
 *
 * When COREARRAY_SIMD_SSE2 is defined the bulk of the buffer is processed in
 * 16-byte blocks (and in 32-byte blocks when COREARRAY_SIMD_AVX2 is also
 * defined); leading bytes up to the first aligned address and any trailing
 * bytes are handled one at a time. Without those macros the whole buffer is
 * processed by the scalar loop at the end.
 *
 * @param p           buffer to modify
 * @param n           number of elements in p
 * @param val         value to search for
 * @param substitute  value written over each match
 */
void vec_i8_replace(int8_t *p, size_t n, int8_t val, int8_t substitute)
{
#ifdef COREARRAY_SIMD_SSE2

	// header 1: scalar steps until p is 16-byte aligned (or n is exhausted)
	size_t h = (16 - ((size_t)p & 0x0F)) & 0x0F;
	for (; (n > 0) && (h > 0); n--, h--, p++)
	{
		if (*p == val)
			*p = substitute;
	}

	// body, SSE2
	const __m128i mask = _mm_set1_epi8(val);
	const __m128i sub = _mm_set1_epi8(substitute);

#   ifdef COREARRAY_SIMD_AVX2

	// header 2: one 16-byte block, if needed, so that p becomes 32-byte aligned
	if ((n >= 16) && ((size_t)p & 0x10))
	{
		__m128i v = _mm_load_si128((__m128i const*)p);
		__m128i c = _mm_cmpeq_epi8(v, mask);
		if (_mm_movemask_epi8(c))
		{
			// blend: substitute where c is all-ones, original byte elsewhere
			_mm_store_si128((__m128i *)p,
				_mm_or_si128(_mm_and_si128(c, sub), _mm_andnot_si128(c, v)));
		}
		n -= 16; p += 16;
	}

	const __m256i mask2 = _mm256_set1_epi8(val);
	const __m256i sub32 = _mm256_set1_epi8(substitute);

	for (; n >= 32; n -= 32, p += 32)
	{
		__m256i v = _mm256_load_si256((__m256i const*)p);
		__m256i c = _mm256_cmpeq_epi8(v, mask2);
		// skip the store entirely when no byte in the block matches
		if (_mm256_movemask_epi8(c))
		{
			_mm256_store_si256((__m256i *)p,
				_mm256_or_si256(_mm256_and_si256(c, sub32), _mm256_andnot_si256(c, v)));
		}
	}

#   endif

	// remaining 16-byte blocks; p is 16-byte aligned here, so use the same
	// compare/blend/aligned-store sequence as the blocks above
	for (; n >= 16; n -= 16, p += 16)
	{
		__m128i v = _mm_load_si128((__m128i const*)p);
		__m128i c = _mm_cmpeq_epi8(v, mask);
		if (_mm_movemask_epi8(c))
		{
			_mm_store_si128((__m128i *)p,
				_mm_or_si128(_mm_and_si128(c, sub), _mm_andnot_si128(c, v)));
		}
	}

#endif

	// tail: whatever is left (or everything, when SIMD is disabled)
	for (; n > 0; n--, p++)
	{
		if (*p == val)
			*p = substitute;
	}
}