uint64_t avx2_count_byte(const uint8_t* data, size_t size, uint8_t byte) {

    const __m256i v = _mm256_set1_epi8(byte);

    const uint8_t* end = data + size;
    const uint8_t* ptr = data;

    __m256i global_sum = _mm256_setzero_si256();
    __m256i local_sum;

    // 1. blocks of 256 registers
    while (ptr + 255*32 < end) {
        local_sum = _mm256_setzero_si256();

        // update 32 x 8-bit counter
        for (int i=0; i < 255; i++, ptr += 32) {
            const __m256i in = _mm256_loadu_si256((const __m256i*)ptr);
            const __m256i eq = _mm256_cmpeq_epi8(in, v); // 0 or -1

            local_sum = _mm256_sub_epi8(local_sum, eq);
        }

        // update the global accumulator 2 x 64-bit
        const __m256i tmp = _mm256_sad_epu8(local_sum, _mm256_setzero_si256());
        global_sum = _mm256_add_epi64(global_sum, tmp);
    }

    // 2. tail of < 256 registers
    local_sum = _mm256_setzero_si256();
    while (ptr + 32 < end) {
        const __m256i in = _mm256_loadu_si256((const __m256i*)ptr);
        const __m256i eq = _mm256_cmpeq_epi8(in, v);

        local_sum = _mm256_sub_epi8(local_sum, eq);

        ptr += 32;
    }

    const __m256i tmp = _mm256_sad_epu8(local_sum, _mm256_setzero_si256());
    global_sum = _mm256_add_epi64(global_sum, tmp);

    // 3. process tail < 32 bytes
    uint64_t result = _mm256_extract_epi64(global_sum, 0)
                    + _mm256_extract_epi64(global_sum, 1)
                    + _mm256_extract_epi64(global_sum, 2)
                    + _mm256_extract_epi64(global_sum, 3);

    return result + scalar_count_bytes(ptr, end - ptr, byte);
}
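The helper scalar_count_bytes() used for the tail here (and in avx2_count_byte_popcount further down) is not part of the excerpt. A minimal sketch of what it is assumed to do, counting the remaining bytes one at a time:

/* Assumed fallback; the original helper is not shown, so this is only a sketch. */
static uint64_t scalar_count_bytes(const uint8_t* data, size_t size, uint8_t byte) {
    uint64_t result = 0;
    for (size_t i = 0; i < size; i++)
        result += (data[i] == byte);
    return result;
}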
size_t __FASTCALL strlen_fast_v2_avx(const char * str)
{
    size_t len;
    register __m256i zero32, src32_low, src32_high;
    register size_t zero_mask_low, zero_mask_high;
    register uint64_t zero_mask;
    unsigned long zero_index;
    register const char * cur = str;

    // Set the zero masks (32 bytes).
    INIT_ZERO_32(zero32);
    zero32 = _mm256_xor_si256(zero32, zero32);

    // Get the misalignment: the last 6 bits of the address.
    size_t misalignment = (size_t)cur & 0x3F;
    // Is the misalignment less than 32 bytes?
    if (misalignment < 0x20) {
        if (misalignment == 0) {
            // If the misalignment is 0, skip this step.
            goto main_loop;
        }
        // Align the address to 64 bytes for the main loop.
        cur = (const char *)((size_t)cur & ((size_t)~(size_t)0x3F));

        // Load 32 + 32 bytes from the target string into YMM registers.
        src32_low  = _mm256_load_si256((__m256i *)(cur));
        src32_high = _mm256_load_si256((__m256i *)(cur + 32));

        // Compare with the zero32 mask per byte.
        src32_low  = _mm256_cmpeq_epi8(src32_low, zero32);
        src32_high = _mm256_cmpeq_epi8(src32_high, zero32);

        // Pack each compare result (32 bytes) into 32 bits.
        zero_mask_low  = (size_t)_mm256_movemask_epi8(src32_low);
        zero_mask_high = (size_t)_mm256_movemask_epi8(src32_high);

        // Clear the low bits that belong to the misaligned prefix.
        zero_mask_low >>= misalignment;
        zero_mask_low <<= misalignment;

        if (zero_mask_low != 0) {
            // Get the index of the first bit set to 1.
            __BitScanForward(zero_index, zero_mask_low);
            goto strlen_exit;
        }
        else if (zero_mask_high != 0) {
            // Get the index of the first bit set to 1.
            __BitScanForward(zero_index, zero_mask_high);
            zero_index += 32;
            goto strlen_exit;
        }

        // Align the address to the next 64 bytes for the main loop.
        cur += 64;
    }
    else {
void* xmemchr(const void* src, int c, size_t n)
{
    if (n < 32) {
        return xmemchr_tiny(src, c, n);
    }

    __m256i ymm0 = _mm256_set1_epi8((char)c), ymm1;
    int mask;
    size_t rem = n % 32;
    n /= 32;

    for (size_t i = 0; i < n; i++) {
        ymm1 = _mm256_loadu_si256((const __m256i*)src + i);
        ymm1 = _mm256_cmpeq_epi8(ymm1, ymm0);
        mask = _mm256_movemask_epi8(ymm1);
        if (mask) {
            __asm__("bsfl %0, %0\n\t"
                    : "=r"(mask)
                    : "0"(mask));
            return (void*)((unsigned long)((const __m256i*)src + i) + mask);
        }
    }
    // Tail: advance past the n full 32-byte blocks scanned above (the original
    // added only n bytes here, which would rescan the wrong range).
    return xmemchr_tiny((const void*)((const __m256i*)src + n), c, rem);
}
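xmemchr_tiny() is referenced above but not shown; presumably it is a plain byte-by-byte search used for short inputs and the tail. A hedged sketch under that assumption:

/* Hypothetical scalar fallback, assumed to behave like memchr on small ranges. */
static void* xmemchr_tiny(const void* src, int c, size_t n)
{
    const unsigned char* p = (const unsigned char*)src;
    for (size_t i = 0; i < n; i++)
        if (p[i] == (unsigned char)c)
            return (void*)(p + i);
    return NULL;
}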
int norx_aead_decrypt(
    unsigned char *m, size_t *mlen,
    const unsigned char *a, size_t alen,
    const unsigned char *c, size_t clen,
    const unsigned char *z, size_t zlen,
    const unsigned char *nonce,
    const unsigned char *key
)
{
    const __m256i K = LOADU(key);
    __m256i A, B, C, D;

    if (clen < BYTES(NORX_T)) {
        return -1;
    }

    *mlen = clen - BYTES(NORX_T);

    INITIALISE(A, B, C, D, nonce, K);
    ABSORB_DATA(A, B, C, D, a, alen, HEADER_TAG);
    DECRYPT_DATA(A, B, C, D, m, c, clen - BYTES(NORX_T));
    ABSORB_DATA(A, B, C, D, z, zlen, TRAILER_TAG);
    FINALISE(A, B, C, D, K);

    /* Verify tag */
    D = _mm256_cmpeq_epi8(D, LOADU(c + clen - BYTES(NORX_T)));
    return (((_mm256_movemask_epi8(D) & 0xFFFFFFFFULL) + 1) >> 32) - 1;
}
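The final line compresses the 32-byte tag comparison into a branch-free verdict: _mm256_cmpeq_epi8 sets each matching byte to 0xFF, _mm256_movemask_epi8 packs those bytes into 32 bits, and the add/shift maps "all 32 bits set" to 0 and everything else to -1. A standalone sketch of the same idiom (verify_tag32 is a hypothetical name, not part of NORX):

/* Sketch only: returns 0 if the two 32-byte tags match, -1 otherwise. */
static int verify_tag32(const unsigned char *tag, const unsigned char *expected)
{
    const __m256i eq = _mm256_cmpeq_epi8(
        _mm256_loadu_si256((const __m256i*)tag),
        _mm256_loadu_si256((const __m256i*)expected));
    const uint64_t mask = (uint32_t)_mm256_movemask_epi8(eq);
    return (int)((mask + 1) >> 32) - 1;
}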
SIMD_INLINE void MaskSrc(const uint8_t * src, const uint8_t * mask,
    const __m256i & index, ptrdiff_t offset, uint16_t * dst)
{
    const __m256i _src = Load<srcAlign>((__m256i*)(src + offset));
    const __m256i _mask = _mm256_and_si256(
        _mm256_cmpeq_epi8(Load<srcAlign>((__m256i*)(mask + offset)), index), K8_01);
    __m256i lo = _mm256_mullo_epi16(_mm256_add_epi16(K16_0008, UnpackU8<0>(_src)), UnpackU8<0>(_mask));
    __m256i hi = _mm256_mullo_epi16(_mm256_add_epi16(K16_0008, UnpackU8<1>(_src)), UnpackU8<1>(_mask));
    Store<dstAlign>((__m256i*)(dst + offset) + 0, _mm256_permute2x128_si256(lo, hi, 0x20));
    Store<dstAlign>((__m256i*)(dst + offset) + 1, _mm256_permute2x128_si256(lo, hi, 0x31));
}
uint64_t avx2_count_byte_popcount(const uint8_t* data, size_t size, uint8_t byte) {

    const __m256i v = _mm256_set1_epi8(byte);

    const uint8_t* end = data + size;
    const uint8_t* ptr = data;

    uint64_t result = 0;

    // 1. blocks of 8 registers
    while (ptr + 8*32 < end) {
        const __m256i eq0 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 0*32)));
        const __m256i eq1 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 1*32)));
        const __m256i eq2 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 2*32)));
        const __m256i eq3 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 3*32)));
        const __m256i eq4 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 4*32)));
        const __m256i eq5 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 5*32)));
        const __m256i eq6 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 6*32)));
        const __m256i eq7 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 7*32)));

        const __m256i eq0bit = _mm256_and_si256(eq0, _mm256_set1_epi8(0x01));
        const __m256i eq1bit = _mm256_and_si256(eq1, _mm256_set1_epi8(0x02));
        const __m256i eq2bit = _mm256_and_si256(eq2, _mm256_set1_epi8(0x04));
        const __m256i eq3bit = _mm256_and_si256(eq3, _mm256_set1_epi8(0x08));
        const __m256i eq4bit = _mm256_and_si256(eq4, _mm256_set1_epi8(0x10));
        const __m256i eq5bit = _mm256_and_si256(eq5, _mm256_set1_epi8(0x20));
        const __m256i eq6bit = _mm256_and_si256(eq6, _mm256_set1_epi8(0x40));
        const __m256i eq7bit = _mm256_and_si256(eq7, _mm256_set1_epi8(int8_t(0x80)));

        const __m256i m01    = _mm256_or_si256(eq0bit, eq1bit);
        const __m256i m23    = _mm256_or_si256(eq2bit, eq3bit);
        const __m256i m45    = _mm256_or_si256(eq4bit, eq5bit);
        const __m256i m67    = _mm256_or_si256(eq6bit, eq7bit);
        const __m256i m0123  = _mm256_or_si256(m01, m23);
        const __m256i m4567  = _mm256_or_si256(m45, m67);
        const __m256i merged = _mm256_or_si256(m0123, m4567);

        result += __builtin_popcountll(_mm256_extract_epi64(merged, 0));
        result += __builtin_popcountll(_mm256_extract_epi64(merged, 1));
        result += __builtin_popcountll(_mm256_extract_epi64(merged, 2));
        result += __builtin_popcountll(_mm256_extract_epi64(merged, 3));

        ptr += 8 * 32;
    }

    return result + scalar_count_bytes(ptr, end - ptr, byte);
}
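A hypothetical usage sketch (not part of the original sources): both counters should agree with a naive scalar count on any input.

#include <assert.h>
#include <string.h>

int main(void)
{
    static uint8_t buf[100000];
    memset(buf, 'a', sizeof(buf));
    buf[12345] = 'x';
    buf[67890] = 'x';
    assert(avx2_count_byte(buf, sizeof(buf), 'x') == 2);
    assert(avx2_count_byte_popcount(buf, sizeof(buf), 'x') == 2);
    return 0;
}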
size_t xstrlen(const char* src)
{
    __m256i m0 = _mm256_setzero_si256();
    __m256i m1;
    int mask;

    for (size_t count = 0;; count += 32) {
        m1 = _mm256_loadu_si256((const __m256i*)((unsigned long)src + count));
        m1 = _mm256_cmpeq_epi8(m1, m0);
        mask = _mm256_movemask_epi8(m1);
        if (mask != 0) {
            // Tie the input to the output register ("0" constraint);
            // with a plain "r" input, bsf could read an unrelated register.
            __asm__("bsfl %0, %0\n\t"
                    : "=r"(mask)
                    : "0"(mask));
            return count + (size_t)mask;
        }
    }
}
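On GCC and Clang the bit scan can also be written with __builtin_ctz instead of inline assembly. A hedged sketch of that variant (xstrlen_ctz is a hypothetical name; like the original, it assumes the bytes past the terminator up to the next 32-byte step are readable):

size_t xstrlen_ctz(const char* src)
{
    const __m256i zero = _mm256_setzero_si256();
    for (size_t count = 0;; count += 32) {
        const __m256i chunk = _mm256_loadu_si256((const __m256i*)(src + count));
        const int mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk, zero));
        if (mask != 0)
            return count + (size_t)__builtin_ctz((unsigned)mask);
    }
}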
/*
 * Do or undo the 'E8' preprocessing used in LZX. Before compression, the
 * uncompressed data is preprocessed by changing the targets of x86 CALL
 * instructions from relative offsets to absolute offsets. After decompression,
 * the translation is undone by changing the targets of x86 CALL instructions
 * from absolute offsets to relative offsets.
 *
 * Note that despite its intent, E8 preprocessing can be done on any data even
 * if it is not actually x86 machine code. In fact, E8 preprocessing appears to
 * always be used in LZX-compressed resources in WIM files; there is no bit to
 * indicate whether it is used or not, unlike in the LZX compressed format as
 * used in cabinet files, where a bit is reserved for that purpose.
 *
 * E8 preprocessing is disabled in the last 6 bytes of the uncompressed data,
 * which really means the 5-byte call instruction cannot start in the last 10
 * bytes of the uncompressed data. This is one of the errors in the LZX
 * documentation.
 *
 * E8 preprocessing does not appear to be disabled after the 32768th chunk of a
 * WIM resource, which apparently is another difference from the LZX compression
 * used in cabinet files.
 *
 * E8 processing is supposed to take the file size as a parameter, as it is used
 * in calculating the translated jump targets. But in WIM files, this file size
 * is always the same (LZX_WIM_MAGIC_FILESIZE == 12000000).
 */
static void
lzx_e8_filter(u8 *data, u32 size, void (*process_target)(void *, s32))
{
#if !defined(__SSE2__) && !defined(__AVX2__)
  /*
   * A worthwhile optimization is to push the end-of-buffer check into the
   * relatively rare E8 case. This is possible if we replace the last six
   * bytes of data with E8 bytes; then we are guaranteed to hit an E8 byte
   * before reaching end-of-buffer. In addition, this scheme guarantees
   * that no translation can begin following an E8 byte in the last 10
   * bytes because a 4-byte offset containing E8 as its high byte is a
   * large negative number that is not valid for translation. That is
   * exactly what we need.
   */
  u8 *tail;
  u8 saved_bytes[6];
  u8 *p;

  if (size <= 10)
    return;

  tail = &data[size - 6];
  memcpy(saved_bytes, tail, 6);
  memset(tail, 0xE8, 6);
  p = data;
  for (;;) {
    while (*p != 0xE8)
      p++;
    if (p >= tail)
      break;
    (*process_target)(p + 1, p - data);
    p += 5;
  }
  memcpy(tail, saved_bytes, 6);
#else
  /* SSE2 or AVX-2 optimized version for x86_64 */

  u8 *p = data;
  u64 valid_mask = ~0;

  if (size <= 10)
    return;

#ifdef __AVX2__
#  define ALIGNMENT_REQUIRED 32
#else
#  define ALIGNMENT_REQUIRED 16
#endif

  /* Process one byte at a time until the pointer is properly aligned. */
  while ((uintptr_t)p % ALIGNMENT_REQUIRED != 0) {
    if (p >= data + size - 10)
      return;
    if (*p == 0xE8 && (valid_mask & 1)) {
      (*process_target)(p + 1, p - data);
      valid_mask &= ~0x1F;
    }
    p++;
    valid_mask >>= 1;
    valid_mask |= (u64)1 << 63;
  }

  if (data + size - p >= 64) {
    /* Vectorized processing */

    /* Note: we use a "trap" E8 byte to eliminate the need to check
     * for end-of-buffer in the inner loop. This byte is carefully
     * positioned so that it will never be changed by a previous
     * translation before it is detected. */
    u8 *trap = p + ((data + size - p) & ~31) - 32 + 4;
    u8 saved_byte = *trap;
    *trap = 0xE8;

    for (;;) {
      u32 e8_mask;
      u8 *orig_p = p;
#ifdef __AVX2__
      const __m256i e8_bytes = _mm256_set1_epi8(0xE8);
      for (;;) {
        __m256i bytes = *(const __m256i *)p;
        __m256i cmpresult = _mm256_cmpeq_epi8(bytes, e8_bytes);
        e8_mask = _mm256_movemask_epi8(cmpresult);
        if (e8_mask)
          break;
        p += 32;
      }
#else
      const __m128i e8_bytes = _mm_set1_epi8(0xE8);
      for (;;) {
        /* Read the next 32 bytes of data and test them
         * for E8 bytes. */
        __m128i bytes1 = *(const __m128i *)p;
        __m128i bytes2 = *(const __m128i *)(p + 16);
        __m128i cmpresult1 = _mm_cmpeq_epi8(bytes1, e8_bytes);
        __m128i cmpresult2 = _mm_cmpeq_epi8(bytes2, e8_bytes);
        u32 mask1 = _mm_movemask_epi8(cmpresult1);
        u32 mask2 = _mm_movemask_epi8(cmpresult2);
        /* The masks have a bit set for each E8 byte.
         * We stay in this fast inner loop as long as
         * there are no E8 bytes. */
        if (mask1 | mask2) {
          e8_mask = mask1 | (mask2 << 16);
          break;
        }
        p += 32;
      }
#endif

      /* Did we pass over data with no E8 bytes? */
      if (p != orig_p)
        valid_mask = ~0;

      /* Are we nearing end-of-buffer? */
      if (p == trap - 4)
        break;

      /* Process the E8 bytes. However, the AND with
       * 'valid_mask' ensures we never process an E8 byte that
       * was itself part of a translation target. */
      while ((e8_mask &= valid_mask)) {
        unsigned bit = bsf32(e8_mask);
        (*process_target)(p + bit + 1, p + bit - data);
        valid_mask &= ~((u64)0x1F << bit);
      }

      valid_mask >>= 32;
      valid_mask |= 0xFFFFFFFF00000000;
      p += 32;
    }

    *trap = saved_byte;
  }
    _mm256_setr_epi8(
         3,  2,  1,  0,  7,  6,  5,  4,
        11, 10,  9,  8, 15, 14, 13, 12,
         3,  2,  1,  0,  7,  6,  5,  4,
        11, 10,  9,  8, 15, 14, 13, 12));

/* The bits have now been shifted to the right locations;
 * translate their values 0..63 to the Base64 alphabet.
 * Because AVX2 can only compare 'greater than', start from end of alphabet: */

/* set 5: 63, "/" */
s5mask = _mm256_cmpeq_epi8(res, _mm256_set1_epi8(63));
blockmask = s5mask;

/* set 4: 62, "+" */
s4mask = _mm256_cmpeq_epi8(res, _mm256_set1_epi8(62));
blockmask = _mm256_or_si256(blockmask, s4mask);

/* set 3: 52..61, "0123456789" */
s3mask = _mm256_andnot_si256(blockmask,
    _mm256_cmpgt_epi8(res, _mm256_set1_epi8(51)));
blockmask = _mm256_or_si256(blockmask, s3mask);

/* set 2: 26..51, "abcdefghijklmnopqrstuvwxyz" */
s2mask = _mm256_andnot_si256(blockmask,
    _mm256_cmpgt_epi8(res, _mm256_set1_epi8(25)));
blockmask = _mm256_or_si256(blockmask, s2mask);

/* set 1: 0..25, "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
void vec_i8_cnt_dosage2(const int8_t *p, int8_t *out, size_t n, int8_t val,
    int8_t missing, int8_t missing_substitute)
{
#ifdef COREARRAY_SIMD_SSE2
    // header 1, 16-byte aligned
    size_t h = (16 - ((size_t)out & 0x0F)) & 0x0F;
    for (; (n > 0) && (h > 0); n--, h--, p+=2) {
        *out ++ = ((p[0] == missing) || (p[1] == missing)) ?
            missing_substitute :
            (p[0]==val ? 1 : 0) + (p[1]==val ? 1 : 0);
    }

    // body, SSE2
    const __m128i val16  = _mm_set1_epi8(val);
    const __m128i miss16 = _mm_set1_epi8(missing);
    const __m128i sub16  = _mm_set1_epi8(missing_substitute);
    const __m128i mask   = _mm_set1_epi16(0x00FF);

#   ifdef COREARRAY_SIMD_AVX2
    // header 2, 32-byte aligned
    if ((n >= 16) && ((size_t)out & 0x10)) {
        __m128i w1 = MM_LOADU_128((__m128i const*)p); p += 16;
        __m128i w2 = MM_LOADU_128((__m128i const*)p); p += 16;
        __m128i v1 = _mm_packus_epi16(_mm_and_si128(w1, mask), _mm_and_si128(w2, mask));
        __m128i v2 = _mm_packus_epi16(_mm_srli_epi16(w1, 8), _mm_srli_epi16(w2, 8));
        __m128i c = _mm_setzero_si128();
        c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v1, val16));
        c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v2, val16));
        w1 = _mm_cmpeq_epi8(v1, miss16);
        w2 = _mm_cmpeq_epi8(v2, miss16);
        __m128i w = _mm_or_si128(w1, w2);
        c = _mm_or_si128(_mm_and_si128(w, sub16), _mm_andnot_si128(w, c));
        _mm_store_si128((__m128i *)out, c);
        n -= 16; out += 16;
    }

    const __m256i val32  = _mm256_set1_epi8(val);
    const __m256i miss32 = _mm256_set1_epi8(missing);
    const __m256i sub32  = _mm256_set1_epi8(missing_substitute);
    const __m256i mask2  = _mm256_set1_epi16(0x00FF);

    for (; n >= 32; n-=32) {
        __m256i w1 = MM_LOADU_256((__m256i const*)p); p += 32;
        __m256i w2 = MM_LOADU_256((__m256i const*)p); p += 32;
        __m256i v1 = _mm256_packus_epi16(_mm256_and_si256(w1, mask2), _mm256_and_si256(w2, mask2));
        __m256i v2 = _mm256_packus_epi16(_mm256_srli_epi16(w1, 8), _mm256_srli_epi16(w2, 8));
        __m256i c = _mm256_setzero_si256();
        c = _mm256_sub_epi8(c, _mm256_cmpeq_epi8(v1, val32));
        c = _mm256_sub_epi8(c, _mm256_cmpeq_epi8(v2, val32));
        w1 = _mm256_cmpeq_epi8(v1, miss32);
        w2 = _mm256_cmpeq_epi8(v2, miss32);
        __m256i w = _mm256_or_si256(w1, w2);
        c = _mm256_or_si256(_mm256_and_si256(w, sub32), _mm256_andnot_si256(w, c));
        c = _mm256_permute4x64_epi64(c, 0xD8);
        _mm256_store_si256((__m256i *)out, c);
        out += 32;
    }
#   endif

    // SSE2 only
    for (; n >= 16; n-=16) {
        __m128i w1 = MM_LOADU_128((__m128i const*)p); p += 16;
        __m128i w2 = MM_LOADU_128((__m128i const*)p); p += 16;
        __m128i v1 = _mm_packus_epi16(_mm_and_si128(w1, mask), _mm_and_si128(w2, mask));
        __m128i v2 = _mm_packus_epi16(_mm_srli_epi16(w1, 8), _mm_srli_epi16(w2, 8));
        __m128i c = _mm_setzero_si128();
        c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v1, val16));
        c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v2, val16));
        w1 = _mm_cmpeq_epi8(v1, miss16);
        w2 = _mm_cmpeq_epi8(v2, miss16);
        __m128i w = _mm_or_si128(w1, w2);
        c = _mm_or_si128(_mm_and_si128(w, sub16), _mm_andnot_si128(w, c));
        _mm_store_si128((__m128i *)out, c);
        out += 16;
    }
#endif
    // tail
    for (; n > 0; n--, p+=2) {
        *out ++ = ((p[0] == missing) || (p[1] == missing)) ?
            missing_substitute :
            (p[0]==val ? 1 : 0) + (p[1]==val ? 1 : 0);
    }
}
void vec_i8_count3(const char *p, size_t n, char val1, char val2, char val3,
    size_t *out_n1, size_t *out_n2, size_t *out_n3)
{
    size_t n1 = 0, n2 = 0, n3 = 0;
#ifdef COREARRAY_SIMD_SSE2
    // header 1, 16-byte aligned
    size_t h = (16 - ((size_t)p & 0x0F)) & 0x0F;
    for (; (n > 0) && (h > 0); n--, h--) {
        char v = *p++;
        if (v == val1) n1++;
        if (v == val2) n2++;
        if (v == val3) n3++;
    }

#   ifdef COREARRAY_SIMD_AVX2
    // body, AVX2
    const __m128i zeros = _mm_setzero_si128();
    const __m256i mask1 = _mm256_set1_epi8(val1);
    const __m256i mask2 = _mm256_set1_epi8(val2);
    const __m256i mask3 = _mm256_set1_epi8(val3);
    __m256i sum1, sum2, sum3;
    sum1 = sum2 = sum3 = _mm256_setzero_si256();
    size_t offset = 0;

    // header 2, 32-byte aligned
    if ((n >= 16) && ((size_t)p & 0x10)) {
        __m128i v = _mm_load_si128((__m128i const*)p);
        __m128i c1 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask1));
        sum1 = MM_SET_M128(_mm_sub_epi8(zeros, c1), zeros);
        __m128i c2 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask2));
        sum2 = MM_SET_M128(_mm_sub_epi8(zeros, c2), zeros);
        __m128i c3 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask3));
        sum3 = MM_SET_M128(_mm_sub_epi8(zeros, c3), zeros);
        n -= 16; p += 16;
    }

    for (; n >= 32; n-=32, p+=32) {
        __m256i v = _mm256_load_si256((__m256i const*)p);
        sum1 = _mm256_sub_epi8(sum1, _mm256_cmpeq_epi8(v, mask1));
        sum2 = _mm256_sub_epi8(sum2, _mm256_cmpeq_epi8(v, mask2));
        sum3 = _mm256_sub_epi8(sum3, _mm256_cmpeq_epi8(v, mask3));
        if ((++offset) >= 252) {
            n1 += vec_avx_sum_u8(sum1);
            n2 += vec_avx_sum_u8(sum2);
            n3 += vec_avx_sum_u8(sum3);
            sum1 = sum2 = sum3 = _mm256_setzero_si256();
            offset = 0;
        }
    }

    if (n >= 16) {
        __m128i v = _mm_load_si128((__m128i const*)p);
        __m128i c1 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask1));
        sum1 = _mm256_sub_epi8(sum1, MM_SET_M128(c1, zeros));
        __m128i c2 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask2));
        sum2 = _mm256_sub_epi8(sum2, MM_SET_M128(c2, zeros));
        __m128i c3 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask3));
        sum3 = _mm256_sub_epi8(sum3, MM_SET_M128(c3, zeros));
        n -= 16; p += 16;
    }

    if (offset > 0) {
        n1 += vec_avx_sum_u8(sum1);
        n2 += vec_avx_sum_u8(sum2);
        n3 += vec_avx_sum_u8(sum3);
    }
#   else
    // body, SSE2
    const __m128i mask1 = _mm_set1_epi8(val1);
    const __m128i mask2 = _mm_set1_epi8(val2);
    const __m128i mask3 = _mm_set1_epi8(val3);
    __m128i sum1, sum2, sum3;
    sum1 = sum2 = sum3 = _mm_setzero_si128();
    size_t offset = 0;

    for (; n >= 16; n-=16, p+=16) {
        __m128i v = _mm_load_si128((__m128i const*)p);
        sum1 = _mm_sub_epi8(sum1, _mm_cmpeq_epi8(v, mask1));
        sum2 = _mm_sub_epi8(sum2, _mm_cmpeq_epi8(v, mask2));
        sum3 = _mm_sub_epi8(sum3, _mm_cmpeq_epi8(v, mask3));
        if ((++offset) >= 252) {
            n1 += vec_sum_u8(sum1);
            n2 += vec_sum_u8(sum2);
            n3 += vec_sum_u8(sum3);
            sum1 = sum2 = sum3 = _mm_setzero_si128();
            offset = 0;
        }
    }

    if (offset > 0) {
        n1 += vec_sum_u8(sum1);
        n2 += vec_sum_u8(sum2);
        n3 += vec_sum_u8(sum3);
    }
#   endif
#endif
    // tail
    for (; n > 0; n--) {
        char v = *p++;
        if (v == val1) n1++;
        if (v == val2) n2++;
        if (v == val3) n3++;
    }
    if (out_n1) *out_n1 = n1;
    if (out_n2) *out_n2 = n2;
    if (out_n3) *out_n3 = n3;
}
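vec_sum_u8 / vec_avx_sum_u8 (horizontal sums of the per-byte counters) and MM_SET_M128 are library helpers that are not shown in these excerpts. A minimal sketch of what the two sums are assumed to do, using the psadbw trick to add the 8-bit lanes; the real CoreArray helpers may differ in name or signature:

/* Sketch only, under the assumptions stated above. */
static size_t vec_sum_u8(__m128i s)
{
    const __m128i sad = _mm_sad_epu8(s, _mm_setzero_si128());
    return (size_t)(_mm_cvtsi128_si64(sad) +
                    _mm_cvtsi128_si64(_mm_srli_si128(sad, 8)));
}

static size_t vec_avx_sum_u8(__m256i s)
{
    const __m256i sad = _mm256_sad_epu8(s, _mm256_setzero_si256());
    return (size_t)(_mm256_extract_epi64(sad, 0) + _mm256_extract_epi64(sad, 1) +
                    _mm256_extract_epi64(sad, 2) + _mm256_extract_epi64(sad, 3));
}

MM_SET_M128 is assumed to combine two __m128i halves into one __m256i. The 252-iteration cap in the loops above exists because each 8-bit counter lane can absorb at most 255 matches before it would wrap around.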
/// get the number of non-zero
size_t vec_i8_cnt_nonzero(const int8_t *p, size_t n)
{
    size_t ans = 0;
#ifdef COREARRAY_SIMD_SSE2
    const __m128i ZERO = { 0LL, 0LL };
    const __m128i ONES = { 0x0101010101010101LL, 0x0101010101010101LL };
    const __m128i ONE  = { 1LL, 1LL };

    // header 1, 16-byte aligned
    size_t h = (16 - ((size_t)p & 0x0F)) & 0x0F;
    for (; (n > 0) && (h > 0); n--, h--) ans += (*p++) ? 1 : 0;

#   ifdef COREARRAY_SIMD_AVX2
    // header 2, 32-byte aligned
    if ((n >= 16) && ((size_t)p & 0x10)) {
        __m128i c = _mm_cmpeq_epi8(_mm_load_si128((__m128i const*)p), ZERO);
        __m128i bit = _mm_and_si128(c, ONES);
        p += 16; n -= 16;
        uint64_t array[2] __attribute__((aligned(16)));
        *((__m128i*)array) = bit;
        ans += 16 - POPCNT_U64(array[0]) - POPCNT_U64(array[1]);
    }

    const __m256i ZERO2 = { 0LL, 0LL, 0LL, 0LL };
    const __m256i ONES2 = { 0x0101010101010101LL, 0x0101010101010101LL,
        0x0101010101010101LL, 0x0101010101010101LL };

    // body, AVX2
    for (; n >= 256; n -= 256) {
        __m256i c = _mm256_cmpeq_epi8(_mm256_load_si256((__m256i const*)p), ZERO2);
        __m256i bit = _mm256_and_si256(c, ONES2);
        p += 32;
        c = _mm256_cmpeq_epi8(_mm256_load_si256((__m256i const*)p), ZERO2);
        bit = _mm256_or_si256(_mm256_sll_epi64(bit, ONE), _mm256_and_si256(c, ONES2));
        p += 32;
        c = _mm256_cmpeq_epi8(_mm256_load_si256((__m256i const*)p), ZERO2);
        bit = _mm256_or_si256(_mm256_sll_epi64(bit, ONE), _mm256_and_si256(c, ONES2));
        p += 32;
        c = _mm256_cmpeq_epi8(_mm256_load_si256((__m256i const*)p), ZERO2);
        bit = _mm256_or_si256(_mm256_sll_epi64(bit, ONE), _mm256_and_si256(c, ONES2));
        p += 32;
        c = _mm256_cmpeq_epi8(_mm256_load_si256((__m256i const*)p), ZERO2);
        bit = _mm256_or_si256(_mm256_sll_epi64(bit, ONE), _mm256_and_si256(c, ONES2));
        p += 32;
        c = _mm256_cmpeq_epi8(_mm256_load_si256((__m256i const*)p), ZERO2);
        bit = _mm256_or_si256(_mm256_sll_epi64(bit, ONE), _mm256_and_si256(c, ONES2));
        p += 32;
        c = _mm256_cmpeq_epi8(_mm256_load_si256((__m256i const*)p), ZERO2);
        bit = _mm256_or_si256(_mm256_sll_epi64(bit, ONE), _mm256_and_si256(c, ONES2));
        p += 32;
        c = _mm256_cmpeq_epi8(_mm256_load_si256((__m256i const*)p), ZERO2);
        bit = _mm256_or_si256(_mm256_sll_epi64(bit, ONE), _mm256_and_si256(c, ONES2));
        p += 32;

        uint64_t array[4] __attribute__((aligned(32)));
        *((__m256i*)array) = bit;
        ans += 256 - POPCNT_U64(array[0]) - POPCNT_U64(array[1]) -
            POPCNT_U64(array[2]) - POPCNT_U64(array[3]);
    }
#   endif

    // body, SSE2
    for (; n >= 128; n -= 128) {
        __m128i c = _mm_cmpeq_epi8(_mm_load_si128((__m128i const*)p), ZERO);
        __m128i bit = _mm_and_si128(c, ONES);
        p += 16;
        c = _mm_cmpeq_epi8(_mm_load_si128((__m128i const*)p), ZERO);
        bit = _mm_or_si128(_mm_sll_epi64(bit, ONE), _mm_and_si128(c, ONES));
        p += 16;
        c = _mm_cmpeq_epi8(_mm_load_si128((__m128i const*)p), ZERO);
        bit = _mm_or_si128(_mm_sll_epi64(bit, ONE), _mm_and_si128(c, ONES));
        p += 16;
        c = _mm_cmpeq_epi8(_mm_load_si128((__m128i const*)p), ZERO);
        bit = _mm_or_si128(_mm_sll_epi64(bit, ONE), _mm_and_si128(c, ONES));
        p += 16;
        c = _mm_cmpeq_epi8(_mm_load_si128((__m128i const*)p), ZERO);
        bit = _mm_or_si128(_mm_sll_epi64(bit, ONE), _mm_and_si128(c, ONES));
        p += 16;
        c = _mm_cmpeq_epi8(_mm_load_si128((__m128i const*)p), ZERO);
        bit = _mm_or_si128(_mm_sll_epi64(bit, ONE), _mm_and_si128(c, ONES));
        p += 16;
        c = _mm_cmpeq_epi8(_mm_load_si128((__m128i const*)p), ZERO);
        bit = _mm_or_si128(_mm_sll_epi64(bit, ONE), _mm_and_si128(c, ONES));
        p += 16;
        c = _mm_cmpeq_epi8(_mm_load_si128((__m128i const*)p), ZERO);
        bit = _mm_or_si128(_mm_sll_epi64(bit, ONE), _mm_and_si128(c, ONES));
        p += 16;

        uint64_t array[2] __attribute__((aligned(16)));
        *((__m128i*)array) = bit;
        ans += 128 - POPCNT_U64(array[0]) - POPCNT_U64(array[1]);
    }

    for (; n >= 16; n -= 16) {
        __m128i c = _mm_cmpeq_epi8(_mm_load_si128((__m128i const*)p), ZERO);
        __m128i bit = _mm_and_si128(c, ONES);
        p += 16;
        uint64_t array[2] __attribute__((aligned(16)));
        *((__m128i*)array) = bit;
        ans += 16 - POPCNT_U64(array[0]) - POPCNT_U64(array[1]);
    }
#else
    // header, 8-byte aligned
    size_t h = (8 - ((size_t)p & 0x07)) & 0x07;
    for (; (n > 0) && (h > 0); n--, h--) ans += (*p++) ? 1 : 0;
    // body, unroll
    for (; n >= 8; n -= 8) {
        ans += (p[0] ? 1 : 0) + (p[1] ? 1 : 0) + (p[2] ? 1 : 0) + (p[3] ? 1 : 0) +
            (p[4] ? 1 : 0) + (p[5] ? 1 : 0) + (p[6] ? 1 : 0) + (p[7] ? 1 : 0);
        p += 8;
    }
#endif
    // tail
    for (; n > 0; n--) ans += (*p++) ? 1 : 0;
    return ans;
}
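POPCNT_U64 is another library macro that is not shown in the excerpt; a sketch of the assumed mapping on GCC/Clang:

/* Assumed definition; an MSVC build would presumably use __popcnt64 instead. */
#define POPCNT_U64(x)    ((size_t)__builtin_popcountll(x))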
size_t vec_i8_count(const char *p, size_t n, char val)
{
    size_t num = 0;
#ifdef COREARRAY_SIMD_SSE2
    // header 1, 16-byte aligned
    size_t h = (16 - ((size_t)p & 0x0F)) & 0x0F;
    for (; (n > 0) && (h > 0); n--, h--)
        if (*p++ == val) num++;

#   ifdef COREARRAY_SIMD_AVX2
    // body, AVX2
    const __m128i zeros = _mm_setzero_si128();
    const __m256i mask = _mm256_set1_epi8(val);
    __m256i sum = _mm256_setzero_si256();
    size_t offset = 0;

    // header 2, 32-byte aligned
    if ((n >= 16) && ((size_t)p & 0x10)) {
        __m128i v = _mm_load_si128((__m128i const*)p);
        __m128i c1 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask));
        sum = MM_SET_M128(_mm_sub_epi8(zeros, c1), zeros);
        n -= 16; p += 16;
    }

    for (; n >= 128; n-=128) {
        __m256i v = _mm256_load_si256((__m256i const*)p); p += 32;
        sum = _mm256_sub_epi8(sum, _mm256_cmpeq_epi8(v, mask));
        v = _mm256_load_si256((__m256i const*)p); p += 32;
        sum = _mm256_sub_epi8(sum, _mm256_cmpeq_epi8(v, mask));
        v = _mm256_load_si256((__m256i const*)p); p += 32;
        sum = _mm256_sub_epi8(sum, _mm256_cmpeq_epi8(v, mask));
        v = _mm256_load_si256((__m256i const*)p); p += 32;
        sum = _mm256_sub_epi8(sum, _mm256_cmpeq_epi8(v, mask));
        offset += 4;
        if (offset >= 252) {
            num += vec_avx_sum_u8(sum);
            sum = _mm256_setzero_si256();
            offset = 0;
        }
    }

    for (; n >= 32; n-=32) {
        __m256i v = _mm256_load_si256((__m256i const*)p); p += 32;
        sum = _mm256_sub_epi8(sum, _mm256_cmpeq_epi8(v, mask));
        if ((++offset) >= 252) {
            num += vec_avx_sum_u8(sum);
            sum = _mm256_setzero_si256();
            offset = 0;
        }
    }

    if (n >= 16) {
        __m128i v = _mm_load_si128((__m128i const*)p);
        __m128i c1 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask));
        sum = _mm256_sub_epi8(sum, MM_SET_M128(zeros, c1));
        n -= 16; p += 16;
    }

    if (offset > 0) num += vec_avx_sum_u8(sum);
#   else
    // body, SSE2
    const __m128i mask = _mm_set1_epi8(val);
    __m128i sum = _mm_setzero_si128();
    size_t offset = 0;

    for (; n >= 64; n-=64) {
        __m128i v = _mm_load_si128((__m128i const*)p); p += 16;
        sum = _mm_sub_epi8(sum, _mm_cmpeq_epi8(v, mask));
        v = _mm_load_si128((__m128i const*)p); p += 16;
        sum = _mm_sub_epi8(sum, _mm_cmpeq_epi8(v, mask));
        v = _mm_load_si128((__m128i const*)p); p += 16;
        sum = _mm_sub_epi8(sum, _mm_cmpeq_epi8(v, mask));
        v = _mm_load_si128((__m128i const*)p); p += 16;
        sum = _mm_sub_epi8(sum, _mm_cmpeq_epi8(v, mask));
        offset += 4;
        if (offset >= 252) {
            num += vec_sum_u8(sum);
            sum = _mm_setzero_si128();
            offset = 0;
        }
    }

    for (; n >= 16; n-=16, p+=16) {
        __m128i v = _mm_load_si128((__m128i const*)p);
        sum = _mm_sub_epi8(sum, _mm_cmpeq_epi8(v, mask));
        if ((++offset) >= 252) {
            num += vec_sum_u8(sum);
            sum = _mm_setzero_si128();
            offset = 0;
        }
    }

    if (offset > 0) num += vec_sum_u8(sum);
#   endif
#endif
    // tail
    for (; n > 0; n--)
        if (*p++ == val) num++;
    return num;
}
/* Function:  p7_MSVFilter()
 * Synopsis:  Calculates MSV score, vewy vewy fast, in limited precision.
 *
 * Purpose:   Calculates an approximation of the MSV score for sequence
 *            <dsq> of length <L> residues, using optimized profile <om>,
 *            and the one-row DP matrix <ox>. Return the
 *            estimated MSV score (in nats) in <ret_sc>.
 *
 *            Score may overflow (and will, on high-scoring
 *            sequences), but will not underflow.
 *
 *            <ox> will be resized if needed. It's fine if it was
 *            just <_Reuse()'d> from a previous, smaller profile.
 *
 *            The model may be in any mode, because only its match
 *            emission scores will be used. The MSV filter inherently
 *            assumes a multihit local mode, and uses its own special
 *            state transition scores, not the scores in the profile.
 *
 * Args:      dsq     - digital target sequence, 1..L
 *            L       - length of dsq in residues
 *            om      - optimized profile
 *            ox      - filter DP matrix (one row)
 *            ret_sc  - RETURN: MSV score (in nats)
 *
 * Returns:   <eslOK> on success.
 *            <eslERANGE> if the score overflows the limited range; in
 *            this case, this is a high-scoring hit.
 *            <ox> may have been resized.
 *
 * Throws:    <eslEMEM> if <ox> reallocation fails.
 */
int
p7_MSVFilter_avx(const ESL_DSQ *dsq, int L, const P7_OPROFILE *om, P7_FILTERMX *ox, float *ret_sc)
{
#ifdef HAVE_AVX2
  uint8_t  xJ;                    /* special states' scores */
  register __m256i mpv_AVX;       /* previous row values */
  register __m256i xEv_AVX;       /* E state: keeps max for Mk->E as we go */
  register __m256i xBv_AVX;       /* B state: splatted vector of B[i-1] for B->Mk calculations */
  register __m256i sv_AVX;        /* temp storage of 1 curr row value in progress */
  register __m256i biasv_AVX;     /* emission bias in a vector */
  __m256i *dp_AVX;                /* the dp row memory */
  __m256i *rsc_AVX;               /* will point at om->rbv[x] for residue x[i] */
  __m256i xJv_AVX;                /* vector for states score */
  __m256i tjbmv_AVX;              /* vector for cost of moving {JN}->B->M */
  __m256i tecv_AVX;               /* vector for E->C cost */
  __m256i basev_AVX;              /* offset for scores */
  __m256i ceilingv_AVX;           /* saturated simd value used to test for overflow */
  __m256i tempv_AVX;              /* work vector */
  int Q_AVX = P7_NVB_AVX(om->M);  /* segment length: # of vectors */
  int q_AVX;                      /* counter over vectors 0..nq-1 */
  int i;                          /* counter over sequence positions 1..L */
  int cmp;
  int status;

  //printf("Starting MSVFilter\n");

  /* Contract checks */
  ESL_DASSERT1(( om->mode == p7_LOCAL )); /* Production code assumes multilocal mode w/ length model <L> */
  ESL_DASSERT1(( om->L    == L ));        /* ... and it's easy to forget to set <om> that way */
  ESL_DASSERT1(( om->nj   == 1.0f ));     /* ... hence the check */
                                          /* ... which you can disable, if you're playing w/ config */
  /* note however that it makes no sense to run MSV w/ a model in glocal mode */

  /* Try highly optimized Knudsen SSV filter first.
   * Note that SSV doesn't use any main memory (from <ox>) at all! */
  //extern uint64_t SSV_time;
  uint64_t filter_start_time = __rdtsc();
  status = p7_SSVFilter_avx(dsq, L, om, ret_sc);
  uint64_t filter_end_time = __rdtsc();
  //SSV_time += (filter_end_time - filter_start_time);
  if (status != eslENORESULT) return status;

  extern uint64_t full_MSV_calls;
  full_MSV_calls++;

  /* Resize the filter mx as needed */
  if (( status = p7_filtermx_GrowTo(ox, om->M)) != eslOK)
    ESL_EXCEPTION(status, "Reallocation of MSV filter matrix failed");

  dp_AVX = ox->dp_AVX;  /* ditto this */

  /* Matrix type and size must be set early, not late: debugging dump functions need this information. */
  ox->M    = om->M;
  ox->type = p7F_MSVFILTER;

  /* Initialization. In offset unsigned arithmetic, -infinity is 0, and 0 is om->base. */
  biasv_AVX = _mm256_set1_epi8((int8_t) om->bias_b); /* yes, you can set1() an unsigned char vector this way */
  for (q_AVX = 0; q_AVX < Q_AVX; q_AVX++) dp_AVX[q_AVX] = _mm256_setzero_si256();

  /* saturate simd register for overflow test */
  ceilingv_AVX = _mm256_cmpeq_epi8(biasv_AVX, biasv_AVX);
  basev_AVX    = _mm256_set1_epi8((int8_t) om->base_b);
  tjbmv_AVX    = _mm256_set1_epi8((int8_t) om->tjb_b + (int8_t) om->tbm_b);
  tecv_AVX     = _mm256_set1_epi8((int8_t) om->tec_b);
  xJv_AVX      = _mm256_subs_epu8(biasv_AVX, biasv_AVX);
  xBv_AVX      = _mm256_subs_epu8(basev_AVX, tjbmv_AVX);

#ifdef p7_DEBUGGING
  if (ox->do_dumping) {
    uint8_t xB;
    xB = _mm_extract_epi16(xBv, 0);
    xJ = _mm_extract_epi16(xJv, 0);
    p7_filtermx_DumpMFRow(ox, 0, 0, 0, xJ, xB, xJ);
  }
#endif

  for (i = 1; i <= L; i++) {  /* Outer loop over residues */
      rsc_AVX = om->rbv_AVX[dsq[i]];
      xEv_AVX = _mm256_setzero_si256();

      /* Right shifts by 1 byte. 4,8,12,x becomes x,4,8,12.
       * Because ia32 is little-endian, this means a left bit shift.
       * Zeros shift on automatically, which is our -infinity. */
      __m256i dp_temp_AVX = dp_AVX[Q_AVX - 1];
      mpv_AVX = esl_avx_leftshift_one(dp_temp_AVX);

      for (q_AVX = 0; q_AVX < Q_AVX; q_AVX++) {
          /* Calculate new MMXo(i,q); don't store it yet, hold it in sv. */
          sv_AVX  = _mm256_max_epu8(mpv_AVX, xBv_AVX);
          sv_AVX  = _mm256_adds_epu8(sv_AVX, biasv_AVX);
          sv_AVX  = _mm256_subs_epu8(sv_AVX, *rsc_AVX);  rsc_AVX++;
          xEv_AVX = _mm256_max_epu8(xEv_AVX, sv_AVX);

          mpv_AVX       = dp_AVX[q_AVX];  /* Load {MDI}(i-1,q) into mpv */
          dp_AVX[q_AVX] = sv_AVX;         /* Do delayed store of M(i,q) now that memory is usable */
      }

      /* test for the overflow condition */
      tempv_AVX = _mm256_adds_epu8(xEv_AVX, biasv_AVX);
      tempv_AVX = _mm256_cmpeq_epi8(tempv_AVX, ceilingv_AVX);
      cmp       = _mm256_movemask_epi8(tempv_AVX);

      /* Now the "special" states, which start from Mk->E (->C, ->J->B)
       * Use shuffles instead of shifts so when the last max has completed,
       * the last four elements of the simd register will contain the
       * max value. Then the last shuffle will broadcast the max value
       * to all simd elements. */
      xEv_AVX = _mm256_set1_epi8(esl_avx_hmax_epu8(xEv_AVX));  // broadcast the max byte from original xEv_AVX
                                                               // to all bytes of xEv_AVX

      /* immediately detect overflow */
      if (cmp != 0x0000) {
        // MSV_end_time = __rdtsc();
        // MSV_time += (MSV_end_time - MSV_start_time);
        *ret_sc = eslINFINITY;
        return eslERANGE;
      }

      xEv_AVX = _mm256_subs_epu8(xEv_AVX, tecv_AVX);
      xJv_AVX = _mm256_max_epu8(xJv_AVX, xEv_AVX);
      xBv_AVX = _mm256_max_epu8(basev_AVX, xJv_AVX);
      xBv_AVX = _mm256_subs_epu8(xBv_AVX, tjbmv_AVX);

#ifdef p7_DEBUGGING
      if (ox->do_dumping) {
        uint8_t xB, xE;
        xB = _mm_extract_epi16(xBv, 0);
        xE = _mm_extract_epi16(xEv, 0);
        xJ = _mm_extract_epi16(xJv, 0);
        p7_filtermx_DumpMFRow(ox, i, xE, 0, xJ, xB, xJ);
      }
#endif
  } /* end loop over sequence residues 1..L */

  /* finally C->T, and add our missing precision on the NN,CC,JJ back */
  xJ = _mm256_extract_epi8(xJv_AVX, 0);

  *ret_sc = ((float) (xJ - om->tjb_b) - (float) om->base_b);
  *ret_sc /= om->scale_b;
  *ret_sc -= 3.0; /* that's ~ L \log \frac{L}{L+3}, for our NN,CC,JJ */
  /*
  MSV_end_time = __rdtsc();
  MSV_time += (MSV_end_time - MSV_start_time); */
  return eslOK;
#endif
#ifndef HAVE_AVX2
  return eslENORESULT;  // Stub so we have something to link if we build without AVX2 support
#endif
}
size_t __FASTCALL strlen_fast_v1b_avx(const char * str)
{
    size_t len;
    register __m256i zero32, src32_low, src32_high;
    register size_t zero_mask_low, zero_mask_high;
    register uint64_t zero_mask;
    unsigned long zero_index;
    register const char * cur = str;

    // Get the misalignment: the last 6 bits of the address.
    size_t misalignment = (size_t)str & 0x3F;
    if (misalignment != 0) {
        misalignment = (size_t)str & 0x1F;
        // Scan for the null terminator in the first misaligned bytes.
        register const char * end = cur + ((size_t)16UL - misalignment);
        while (cur < end) {
            // Look for the null terminator.
            if (*cur == '\0') {
                return (size_t)(cur - str);
            }
            cur++;
        }

        // Align the address to 64 bytes for the main loop.
        end = (const char *)((size_t)str & ((size_t)~(size_t)0x3F)) + 64;

        register __m128i zero16, src16;
        register uint32_t zero_mask16;

        // Set the zero masks (16 bytes).
        INIT_ZERO_16(zero16);
        zero16 = _mm_xor_si128(zero16, zero16);

        // Minor 16-byte loop
        while (cur < end) {
            // Load 16 bytes of the string into an XMM register.
            src16 = _mm_load_si128((__m128i *)(cur));
            // Compare with the zero16 mask per byte.
            src16 = _mm_cmpeq_epi8(src16, zero16);
            // Pack the compare result (16 bytes) into 16 bits.
            zero_mask16 = (uint32_t)_mm_movemask_epi8(src16);
            // If any bit is set to 1, there is a null terminator
            // inside this scanned chunk (16 bytes per step).
            if (zero_mask16 != 0) {
                // Get the index of the first bit set to 1.
                __BitScanForward(zero_index, zero_mask16);
                goto strlen_exit;
            }
            // One minor loop scans 16 bytes.
            cur += 16;
        }
    }

    // Set the zero masks (32 bytes).
    INIT_ZERO_32(zero32);
    zero32 = _mm256_xor_si256(zero32, zero32);

    // Main loop
    do {
        // Load 32 + 32 bytes of the string into YMM registers.
        src32_low  = _mm256_load_si256((__m256i *)(cur));
        src32_high = _mm256_load_si256((__m256i *)(cur + 32));
        // Compare with the zero32 mask per byte.
        src32_low  = _mm256_cmpeq_epi8(src32_low, zero32);
        src32_high = _mm256_cmpeq_epi8(src32_high, zero32);
        // Pack each compare result (32 bytes) into 32 bits.
        zero_mask_low  = (size_t)_mm256_movemask_epi8(src32_low);
        zero_mask_high = (size_t)_mm256_movemask_epi8(src32_high);

#if defined(_WIN64) || defined(WIN64) || defined(_M_X64) || defined(_M_AMD64) \
 || defined(_M_IA64) || defined(__amd64__) || defined(__x86_64__)
        // Combine the masks of the low 32 bits and high 32 bits.
        zero_mask = (zero_mask_high << 32) | zero_mask_low;

        // If any bit is set to 1, there is a null terminator
        // inside this scanned chunk (64 bytes per step).
        if (zero_mask != 0) {
            // Get the index of the first bit set to 1.
            __BitScanForward64(zero_index, zero_mask);
            break;
        }
#else
        (void)zero_mask;
        // If any bit is set to 1, there is a null terminator
        // inside this scanned chunk (64 bytes per step).
        if (zero_mask_low != 0) {
            // Get the index of the first bit set to 1.
            __BitScanForward(zero_index, zero_mask_low);
            break;
        }
        else if (zero_mask_high != 0) {
            // Get the index of the first bit set to 1.
            __BitScanForward(zero_index, zero_mask_high);
            zero_index += 32;
            break;
        }
#endif // _M_X64 || __x86_64__

        // One loop scans 64 bytes.
        cur += 64;
    } while (1);

strlen_exit:
    len = cur - str;
    len += zero_index;

    return len;
}
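__BitScanForward/__BitScanForward64 are MSVC intrinsics; the strlen examples here appear to rely on macros of the same name, together with INIT_ZERO_16/INIT_ZERO_32, none of which are shown. A hedged sketch of how they might be mapped on GCC/Clang (assumptions, not the original definitions):

#if !defined(_MSC_VER)
#define __BitScanForward(idx, mask)   ((idx) = (unsigned long)__builtin_ctz((unsigned)(mask)))
#define __BitScanForward64(idx, mask) ((idx) = (unsigned long)__builtin_ctzll((unsigned long long)(mask)))
#endif
/* The INIT_ZERO_* macros only need to give the register a defined value,
 * since it is XOR'ed with itself immediately afterwards. */
#define INIT_ZERO_16(v)  ((v) = _mm_setzero_si128())
#define INIT_ZERO_32(v)  ((v) = _mm256_setzero_si256())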
void vec_i8_replace(int8_t *p, size_t n, int8_t val, int8_t substitute)
{
#ifdef COREARRAY_SIMD_SSE2
    // header 1, 16-byte aligned
    size_t h = (16 - ((size_t)p & 0x0F)) & 0x0F;
    for (; (n > 0) && (h > 0); n--, h--, p++)
        if (*p == val) *p = substitute;

    // body, SSE2
    const __m128i mask = _mm_set1_epi8(val);
    const __m128i sub  = _mm_set1_epi8(substitute);

#   ifdef COREARRAY_SIMD_AVX2
    // header 2, 32-byte aligned
    if ((n >= 16) && ((size_t)p & 0x10)) {
        __m128i v = _mm_load_si128((__m128i const*)p);
        __m128i c = _mm_cmpeq_epi8(v, mask);
        if (_mm_movemask_epi8(c)) {
            _mm_store_si128((__m128i *)p,
                _mm_or_si128(_mm_and_si128(c, sub), _mm_andnot_si128(c, v)));
        }
        n -= 16; p += 16;
    }

    const __m256i mask2 = _mm256_set1_epi8(val);
    const __m256i sub32 = _mm256_set1_epi8(substitute);
    const __m256i zero  = _mm256_setzero_si256();
    const __m256i ones  = _mm256_cmpeq_epi64(zero, zero);

    for (; n >= 32; n-=32, p+=32) {
        __m256i v = _mm256_load_si256((__m256i const*)p);
        __m256i c = _mm256_cmpeq_epi8(v, mask2);
        if (_mm256_movemask_epi8(c)) {
            // TODO
            _mm256_store_si256((__m256i *)p,
                _mm256_or_si256(_mm256_and_si256(c, sub32), _mm256_andnot_si256(c, v)));
        }
    }
#   endif

    for (; n >= 16; n-=16, p+=16) {
        __m128i v = _mm_load_si128((__m128i const*)p);
        __m128i c = _mm_cmpeq_epi8(v, mask);
        if (_mm_movemask_epi8(c))
            _mm_maskmoveu_si128(sub, c, (char*)p);
    }
#endif
    // tail
    for (; n > 0; n--, p++)
        if (*p == val) *p = substitute;
}
__m256i test_mm256_cmpeq_epi8(__m256i a, __m256i b) {
  // CHECK: icmp eq <32 x i8>
  return _mm256_cmpeq_epi8(a, b);
}
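The compare produces 0x00 or 0xFF per byte lane (LLVM models it as the icmp eq on <32 x i8> checked above, followed by a sign extension); a typical consumer packs the lanes with _mm256_movemask_epi8 and scans the resulting bits. A small hedged usage sketch:

/* Sketch: index of the first equal byte lane, or -1 if none match. */
static int first_equal_lane(__m256i a, __m256i b)
{
    const unsigned mask = (unsigned)_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b));
    return mask ? (int)__builtin_ctz(mask) : -1;
}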
        // Get the index of the first bit set to 1.
        __BitScanForward(zero_index, zero_mask_high);
        zero_index += 32;
        goto strlen_exit;
    }

    // Align the address to the next 64 bytes for the main loop.
    cur += 64;
}
else {
    // Align the address to 64 bytes, and offset 32 bytes for the misalignment.
    cur = (const char *)((size_t)cur & ((size_t)~(size_t)0x3F));

    // Load 32 bytes from the target string into a YMM register.
    src32_high = _mm256_load_si256((__m256i *)(cur + 32));

    // Compare with the zero32 mask per byte.
    src32_high = _mm256_cmpeq_epi8(src32_high, zero32);

    // Pack the compare result (32 bytes) into 32 bits.
    zero_mask_high = (size_t)_mm256_movemask_epi8(src32_high);

    // Skip 32 bytes.
    misalignment -= 32;

    // Clear the low bits that belong to the misaligned prefix.
    zero_mask_high >>= misalignment;
    zero_mask_high <<= misalignment;

    // If any bit is set to 1, there is a null terminator
    // inside this scanned chunk (per 64 bytes).
    if (zero_mask_high != 0) {
        // Get the index of the first bit set to 1.
        __BitScanForward(zero_index, zero_mask_high);
        zero_index += 32;
        goto strlen_exit;