#include <immintrin.h>
#include <iomanip>
#include <iostream>
#include <sys/types.h>  // ssize_t

// ext_euklidean() (extended Euclidean algorithm, defined elsewhere) supplies
// the multiplicative inverse of A modulo 2^Cwidth.

int main() {
    const ssize_t A = 3;
    const size_t Awidth = 2;
    const size_t Dwidth = 4;
    const ssize_t Dmin = (-1) * (1ll << (Dwidth - 1));
    const ssize_t Dmax = (1ll << (Dwidth - 1)) - 1;
    const ssize_t Cwidth = Awidth + Dwidth;
    const ssize_t AInv = ext_euklidean(A, Cwidth) & ((1ll << Cwidth) - 1);
    const size_t numCodewords = (1ull << Cwidth);
    std::cout << "numCodewords: " << numCodewords << std::endl;

    // How many masks will we generate? Each 32-bit mask covers 16 codewords
    // (two mask bits per 16-bit lane).
    const size_t numMasks = numCodewords / (sizeof(int) * 4);
    int * pNonCodewordMasks = new int[numMasks];
    const int16_t c = ~((1ll << (Cwidth - 1)) - 1);
    std::cout << "c = 0x" << std::hex << c << std::dec << std::endl;

    for (ssize_t i = 0, cw = c, posMask = 0; i < numCodewords; ++posMask) {
        int tmpMask = 0;
        for (ssize_t k = 0; k < 16; ++k, ++cw, ++i) {
            if ((cw % A) != 0) { // we want the non-codewords
                // std::cout << "cw % A != 0: " << cw << std::endl;
                // expand to 32 bits, because AVX2 cannot movemask across lanes to 16 bits
                tmpMask |= (1ll << (k * 2)) | (1ll << (k * 2 + 1));
            }
        }
        pNonCodewordMasks[posMask] = tmpMask;
    }
    std::cout << "numMasks: " << numMasks << std::endl;
    std::cout << "non-codeword-masks: 0x" << std::hex << std::setfill('0');
    for (size_t posMask = 0; posMask < numMasks; ++posMask) {
        std::cout << std::setw(8) << pNonCodewordMasks[posMask] << ':';
    }
    std::cout << std::dec << std::endl << std::setfill(' ');

    auto mmCodewords = _mm256_set_epi16(c+15, c+14, c+13, c+12, c+11, c+10, c+9, c+8,
                                        c+7, c+6, c+5, c+4, c+3, c+2, c+1, c);
    auto mmAddUp = _mm256_set1_epi16(16);
    auto mmAinv = _mm256_set1_epi16(AInv);
    auto mmDmin = _mm256_set1_epi16(Dmin);
    auto mmDmax = _mm256_set1_epi16(Dmax);
    const size_t posEnd = (1ull << Cwidth);
    // fill up all non-codeword bits with 1's if necessary
    __m256i mmFillUp[] = {_mm256_set1_epi16(0), _mm256_set1_epi16(~((1ll << Cwidth) - 1))};
    std::cout << "posEnd = 0x" << std::hex << posEnd << std::dec << std::endl;
    std::cout << std::setfill('0') << std::hex;

    for (size_t pos = 15, posMask = 0; pos < posEnd; pos += 16, ++posMask) {
        auto isNeg = 0x1 & _mm256_movemask_epi8(_mm256_cmpgt_epi16(mmFillUp[0], mmCodewords));
        auto mm1 = _mm256_or_si256(_mm256_mullo_epi16(mmCodewords, mmAinv), mmFillUp[isNeg]);
        auto mm2 = _mm256_cmpgt_epi16(mm1, mmDmin);
        auto mm3 = _mm256_cmpgt_epi16(mmDmax, mm1);
        auto mm4 = _mm256_cmpeq_epi16(mmDmax, mm1);
        auto mm5 = _mm256_or_si256(mm3, mm4);
        auto mm6 = _mm256_and_si256(mm2, mm5);
        auto mask = _mm256_movemask_epi8(mm6);
        if (mask & pNonCodewordMasks[posMask]) {
            std::cout << "BAD @0x" << std::setw((Cwidth + 7) / 8) << pos << ": 0x" << mask
                      << " & 0x" << pNonCodewordMasks[posMask]
                      << " = 0x" << (mask & pNonCodewordMasks[posMask]) << std::endl;
        } else {
            std::cout << "OK @0x" << std::setw((Cwidth + 7) / 8) << pos << ": 0x" << mask
                      << " & 0x" << pNonCodewordMasks[posMask]
                      << " = 0x" << (mask & pNonCodewordMasks[posMask]) << std::endl;
        }
        mmCodewords = _mm256_add_epi16(mmCodewords, mmAddUp);
    }
    std::cout << std::setfill(' ') << std::dec;
}
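AVX2 has no 16-bit movemask, which is why the mask-building loop above sets two bits per codeword: `_mm256_movemask_epi8` extracts one bit per byte, so each 16-bit compare result shows up as two adjacent mask bits. A minimal sketch of that effect (hypothetical values; compile with AVX2 enabled, e.g. -mavx2):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    /* Element 3 (and only element 3) matches the key 42. */
    __m256i v   = _mm256_setr_epi16(0, 1, 2, 42, 4, 5, 6, 7,
                                    8, 9, 10, 11, 12, 13, 14, 15);
    __m256i key = _mm256_set1_epi16(42);
    int mask = _mm256_movemask_epi8(_mm256_cmpeq_epi16(v, key));
    /* Each 16-bit lane contributes two adjacent mask bits (2k and 2k+1),
     * so the match in lane 3 sets bits 6 and 7: prints 0x000000c0. */
    printf("0x%08x\n", mask);
    return 0;
}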
size_t __FASTCALL strlen_fast_v2_avx(const char * str)
{
    size_t len;
    register __m256i zero32, src32_low, src32_high;
    register size_t zero_mask_low, zero_mask_high;
    register uint64_t zero_mask;
    unsigned long zero_index;
    register const char * cur = str;
    // Set the zero masks (32 bytes).
    INIT_ZERO_32(zero32);
    zero32 = _mm256_xor_si256(zero32, zero32);

    // Get the misalignment from the address's last 6 bits.
    size_t misalignment = (size_t)cur & 0x3F;
    // Is the misalignment less than 32 bytes?
    if (misalignment < 0x20) {
        if (misalignment == 0) {
            // If the misalignment is 0, skip this step.
            goto main_loop;
        }
        // Align the address down to 64 bytes for the main loop.
        cur = (const char *)((size_t)cur & ((size_t)~(size_t)0x3F));
        // Load 64 bytes of the string into two YMM registers.
        src32_low = _mm256_load_si256((__m256i *)(cur));
        src32_high = _mm256_load_si256((__m256i *)(cur + 32));
        // Compare with the zero32 mask per byte.
        src32_low = _mm256_cmpeq_epi8(src32_low, zero32);
        src32_high = _mm256_cmpeq_epi8(src32_high, zero32);
        // Pack each 32-byte compare result into 32 bits.
        zero_mask_low = (size_t)_mm256_movemask_epi8(src32_low);
        zero_mask_high = (size_t)_mm256_movemask_epi8(src32_high);
        // Clear the low misalignment bits, which precede the string.
        zero_mask_low >>= misalignment;
        zero_mask_low <<= misalignment;
        if (zero_mask_low != 0) {
            // Get the index of the first bit set to 1.
            __BitScanForward(zero_index, zero_mask_low);
            goto strlen_exit;
        }
        else if (zero_mask_high != 0) {
            // Get the index of the first bit set to 1.
            __BitScanForward(zero_index, zero_mask_high);
            zero_index += 32;
            goto strlen_exit;
        }
        // Advance to the next 64-byte-aligned address for the main loop.
        cur += 64;
    }
    else {
int norx_aead_decrypt(
    unsigned char *m, size_t *mlen,
    const unsigned char *a, size_t alen,
    const unsigned char *c, size_t clen,
    const unsigned char *z, size_t zlen,
    const unsigned char *nonce,
    const unsigned char *key
)
{
    const __m256i K = LOADU(key);
    __m256i A, B, C, D;

    if (clen < BYTES(NORX_T)) {
        return -1;
    }

    *mlen = clen - BYTES(NORX_T);

    INITIALISE(A, B, C, D, nonce, K);
    ABSORB_DATA(A, B, C, D, a, alen, HEADER_TAG);
    DECRYPT_DATA(A, B, C, D, m, c, clen - BYTES(NORX_T));
    ABSORB_DATA(A, B, C, D, z, zlen, TRAILER_TAG);
    FINALISE(A, B, C, D, K);

    /* Verify tag: the movemask is 0xFFFFFFFF iff all 32 tag bytes match,
     * so the expression below returns 0 on success and -1 on failure
     * without branching on the comparison result. */
    D = _mm256_cmpeq_epi8(D, LOADU(c + clen - BYTES(NORX_T)));
    return (((_mm256_movemask_epi8(D) & 0xFFFFFFFFULL) + 1) >> 32) - 1;
}
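The arithmetic in that return statement is worth spelling out: adding 1 to the all-ones mask carries into bit 32, and anything less does not. A minimal scalar sketch of the same mapping (hypothetical mask values):

#include <stdint.h>
#include <stdio.h>

/* The movemask of a 32-byte compare is 0xFFFFFFFF iff every byte matched. */
static int tag_ok_to_retval(uint32_t movemask)
{
    /* 0xFFFFFFFF + 1 carries into bit 32, so the shift yields 1 and the
     * result is 0; any other mask yields 0 - 1, i.e. -1.  No branch on the
     * (secret) comparison result is needed. */
    return (int)((((uint64_t)movemask + 1) >> 32) - 1);
}

int main(void)
{
    printf("%d\n", tag_ok_to_retval(0xFFFFFFFFu)); /* 0  (tag valid)   */
    printf("%d\n", tag_ok_to_retval(0xFFFF00FFu)); /* -1 (tag invalid) */
    return 0;
}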
/* Shuffle bits within the bytes of eight element blocks. */
int64_t bshuf_shuffle_bit_eightelem_AVX(void* in, void* out, const size_t size,
         const size_t elem_size) {

    CHECK_MULT_EIGHT(size);

    // With a bit of care, this could be written such that it is
    // in_buf == out_buf safe.
    char* in_b = (char*) in;
    char* out_b = (char*) out;

    size_t ii, jj, kk;
    size_t nbyte = elem_size * size;

    __m256i ymm;
    int32_t bt;

    if (elem_size % 4) {
        return bshuf_shuffle_bit_eightelem_SSE(in, out, size, elem_size);
    } else {
        for (jj = 0; jj + 31 < 8 * elem_size; jj += 32) {
            for (ii = 0; ii + 8 * elem_size - 1 < nbyte; ii += 8 * elem_size) {
                ymm = _mm256_loadu_si256((__m256i *) &in_b[ii + jj]);
                for (kk = 0; kk < 8; kk++) {
                    bt = _mm256_movemask_epi8(ymm);
                    ymm = _mm256_slli_epi16(ymm, 1);
                    size_t ind = (ii + jj / 8 + (7 - kk) * elem_size);
                    * (int32_t *) &out_b[ind] = bt;
                }
            }
        }
    }
    return size * elem_size;
}
/* Transpose bits within bytes. */
int64_t bshuf_trans_bit_byte_AVX(void* in, void* out, const size_t size,
         const size_t elem_size) {

    size_t ii, kk;
    char* in_b = (char*) in;
    char* out_b = (char*) out;
    int32_t* out_i32;

    size_t nbyte = elem_size * size;

    int64_t count;

    __m256i ymm;
    int32_t bt;

    for (ii = 0; ii + 31 < nbyte; ii += 32) {
        ymm = _mm256_loadu_si256((__m256i *) &in_b[ii]);
        for (kk = 0; kk < 8; kk++) {
            bt = _mm256_movemask_epi8(ymm);
            ymm = _mm256_slli_epi16(ymm, 1);
            out_i32 = (int32_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
            *out_i32 = bt;
        }
    }
    count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
             nbyte - nbyte % 32);
    return count;
}
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

/* Union assumed by this snippet: the same 256-bit value viewed either as a
 * vector register (.m) or as eight 32-bit lanes (.u). */
typedef union {
    __m256i m;
    uint32_t u[8];
} m256u_t;

/* Searches for nine-digit numbers that equal the sum of the cubes of their
 * three-digit groups, e.g. 166500333 = 166^3 + 500^3 + 333^3. */
int main(void)
{
    for (int a = 0; a < 1000; a++) {
        for (int b = 0; b < 1000; b++) {
            uint32_t lhs_ab = 1000 * 1000 * a + 1000 * b;
            m256u_t lhs_ab_v = {.u = {lhs_ab, lhs_ab, lhs_ab, lhs_ab,
                                      lhs_ab, lhs_ab, lhs_ab, lhs_ab}};
            uint32_t rhs_ab = a * a * a + b * b * b;
            m256u_t rhs_ab_v = {.u = {rhs_ab, rhs_ab, rhs_ab, rhs_ab,
                                      rhs_ab, rhs_ab, rhs_ab, rhs_ab}};
            m256u_t c_v = {.u = {0, 1, 2, 3, 4, 5, 6, 7}};
            m256u_t c_inc_v = {.u = {8, 8, 8, 8, 8, 8, 8, 8}};
            m256u_t lhs_v, rhs_v, cmp_v;
            for (int c = 0; c < 1000; c += 8) {
                lhs_v.m = _mm256_add_epi32(lhs_ab_v.m, c_v.m);
                rhs_v.m = _mm256_mullo_epi32(c_v.m, c_v.m);
                rhs_v.m = _mm256_mullo_epi32(rhs_v.m, c_v.m);
                rhs_v.m = _mm256_add_epi32(rhs_v.m, rhs_ab_v.m);
                cmp_v.m = _mm256_cmpeq_epi32(lhs_v.m, rhs_v.m);
                if (_mm256_movemask_epi8(cmp_v.m)) {
                    for (int i = 0; i < 8; i++)
                        if (cmp_v.u[i] != 0)
                            printf("%09u\n", lhs_v.u[i]);
                }
                c_v.m = _mm256_add_epi32(c_v.m, c_inc_v.m);
            }
        }
    }
    return 0;
}
void* xmemchr(const void* src, int c, size_t n)
{
    if (n < 32) {
        return xmemchr_tiny(src, c, n);
    }

    __m256i ymm0 = _mm256_set1_epi8((char)c), ymm1;
    int mask;

    size_t rem = n % 32;
    n /= 32;

    for (size_t i = 0; i < n; i++) {
        ymm1 = _mm256_loadu_si256((const __m256i*)src + i);
        ymm1 = _mm256_cmpeq_epi8(ymm1, ymm0);
        mask = _mm256_movemask_epi8(ymm1);
        if (mask) {
            // bsf finds the index of the lowest set bit, i.e. the first match.
            __asm__("bsfl %0, %0\n\t"
                    :"=r"(mask)
                    :"0"(mask)
            );
            return (void*)((unsigned long)((const __m256i*)src + i) + mask);
        }
    }
    // Scan the remaining (n % 32) bytes; the offset is in 32-byte units.
    return xmemchr_tiny((const void*)((const __m256i*)src + n), c, rem);
}
inline void matrix32x8::transpose(square128& output, int x, int y)
{
    for (int j = 0; j < 8; j++)
    {
        int row = _mm256_movemask_epi8(whole);
        whole = _mm256_slli_epi64(whole, 1);
        // _mm256_movemask_epi8 reads the most significant bit of each byte,
        // hence +7-j: bit 7 comes out first.
        output.words[8*x+7-j][y] = row;
    }
}
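This transpose, like the two bitshuffle kernels above, leans on the same movemask-and-shift idiom: each iteration extracts the current top bit of all 32 bytes as one 32-bit row, then promotes the next bit into position. A minimal standalone sketch of that extraction (hypothetical input; requires AVX2):

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Every byte has only bit 7 set, so only the first bit-plane is nonzero. */
    __m256i v = _mm256_set1_epi8((int8_t)0x80);
    uint32_t planes[8];
    for (int j = 0; j < 8; j++) {
        planes[j] = (uint32_t)_mm256_movemask_epi8(v); /* current top bit-plane */
        v = _mm256_slli_epi64(v, 1);                   /* promote the next bit */
    }
    /* prints: plane0=0xffffffff plane1=0x00000000 */
    printf("plane0=0x%08x plane1=0x%08x\n", planes[0], planes[1]);
    return 0;
}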
void bitmask_avx2(uint32_t* ptr, size_t n, uint32_t key, uint8_t* out)
{
    uint32_t* output = (uint32_t*)out;

    const size_t N = 8*4; // unrolled 4 times
    const size_t chunks = n / N;
    const size_t tail = n % N;

    const __m256i vkey = _mm256_set1_epi32(key);

    for (size_t i=0; i < chunks; i++) {
        const __m256i in0 = _mm256_loadu_si256((const __m256i*)(ptr + i*N + 0*8));
        const __m256i in1 = _mm256_loadu_si256((const __m256i*)(ptr + i*N + 1*8));
        const __m256i in2 = _mm256_loadu_si256((const __m256i*)(ptr + i*N + 2*8));
        const __m256i in3 = _mm256_loadu_si256((const __m256i*)(ptr + i*N + 3*8));

        const __m256i eq0 = _mm256_cmpeq_epi32(in0, vkey);
        const __m256i eq1 = _mm256_cmpeq_epi32(in1, vkey);
        const __m256i eq2 = _mm256_cmpeq_epi32(in2, vkey);
        const __m256i eq3 = _mm256_cmpeq_epi32(in3, vkey);

        // eq0 = [a0 a1 a2 a3 a4 a5 a6 a7] (packed dword)
        // eq1 = [b0 b1 b2 b3 b4 b5 b6 b7] (packed dword)
        // eq2 = [c0 c1 c2 c3 c4 c5 c6 c7] (packed dword)
        // eq3 = [d0 d1 d2 d3 d4 d5 d6 d7] (packed dword)

        // t0  = [a0 a1 a2 a3 c0 c1 c2 c3 a4 a5 a6 a7 c4 c5 c6 c7] (packed word)
        const __m256i t0 = _mm256_packs_epi32(eq0, eq2);
        // m02 = [a0 a1 a2 a3 a4 a5 a6 a7 c0 c1 c2 c3 c4 c5 c6 c7] (packed word)
        const __m256i m02 = _mm256_permutevar8x32_epi32(t0,
                                _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7));

        // t1  = [b0 b1 b2 b3 d0 d1 d2 d3 b4 b5 b6 b7 d4 d5 d6 d7] (packed word)
        const __m256i t1 = _mm256_packs_epi32(eq1, eq3);
        // m13 = [b0 b1 b2 b3 b4 b5 b6 b7 d0 d1 d2 d3 d4 d5 d6 d7] (packed word)
        const __m256i m13 = _mm256_permutevar8x32_epi32(t1,
                                _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7));

        // m = [a0..7 b0..7 c0..7 d0..7] (packed byte)
        const __m256i m = _mm256_packs_epi16(m02, m13);

        *output++ = _mm256_movemask_epi8(m);
    }

    if (tail > 0) {
        // The tail starts chunks*N elements in; its output starts at the
        // matching bit offset, i.e. chunks*N / 8 bytes in.
        bitmask_better_2(ptr + chunks*N, tail, key, out + chunks*N/8);
    }
}
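A minimal driver for the function above (an assumed setup: AVX2 target, n kept a multiple of 32 so the scalar bitmask_better_2 tail path is never taken; the stub below exists only to satisfy the linker):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Link stub for the tail helper referenced above (never called here,
 * since n is a multiple of 32). */
void bitmask_better_2(uint32_t* ptr, size_t n, uint32_t key, uint8_t* out)
{
    (void)ptr; (void)n; (void)key; (void)out;
}

int main(void)
{
    uint32_t data[32];
    uint8_t out[4];
    for (uint32_t i = 0; i < 32; i++) data[i] = i;
    data[5] = 7;                      /* plant a second copy of the key */
    bitmask_avx2(data, 32, 7, out);   /* data[5] and data[7] match */
    uint32_t mask;
    memcpy(&mask, out, sizeof mask);
    printf("0x%08x\n", mask);         /* bits 5 and 7 set: 0x000000a0 */
    return 0;
}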
size_t xstrlen(const char* src)
{
    __m256i m0 = _mm256_setzero_si256();
    __m256i m1;
    int mask;
    for (size_t count = 0;; count += 32) {
        m1 = _mm256_loadu_si256((const __m256i*)((unsigned long)src + count));
        m1 = _mm256_cmpeq_epi8(m1, m0);
        mask = _mm256_movemask_epi8(m1);
        if (mask != 0) {
            // bsf finds the index of the lowest set bit, i.e. the first NUL.
            __asm__("bsfl %0, %0\n\t"
                    :"=r"(mask)
                    :"0"(mask)
            );
            return count + (size_t)mask;
        }
    }
}
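The bsf inline assembly in xmemchr and xstrlen above is x86-specific. A portable equivalent using a GCC/Clang builtin (undefined for a zero mask, which both call sites already exclude):

#include <stdint.h>

/* Portable alternative to the "bsfl" inline assembly: count trailing
 * zeros, i.e. the index of the lowest set bit of a nonzero mask. */
static inline int first_set_bit(int mask)
{
    return __builtin_ctz((unsigned)mask);
}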
/*
 * Do or undo the 'E8' preprocessing used in LZX.  Before compression, the
 * uncompressed data is preprocessed by changing the targets of x86 CALL
 * instructions from relative offsets to absolute offsets.  After
 * decompression, the translation is undone by changing the targets of x86
 * CALL instructions from absolute offsets to relative offsets.
 *
 * Note that despite its intent, E8 preprocessing can be done on any data
 * even if it is not actually x86 machine code.  In fact, E8 preprocessing
 * appears to always be used in LZX-compressed resources in WIM files; there
 * is no bit to indicate whether it is used or not, unlike in the LZX
 * compressed format as used in cabinet files, where a bit is reserved for
 * that purpose.
 *
 * E8 preprocessing is disabled in the last 6 bytes of the uncompressed data,
 * which really means the 5-byte call instruction cannot start in the last 10
 * bytes of the uncompressed data.  This is one of the errors in the LZX
 * documentation.
 *
 * E8 preprocessing does not appear to be disabled after the 32768th chunk of
 * a WIM resource, which apparently is another difference from the LZX
 * compression used in cabinet files.
 *
 * E8 processing is supposed to take the file size as a parameter, as it is
 * used in calculating the translated jump targets.  But in WIM files, this
 * file size is always the same (LZX_WIM_MAGIC_FILESIZE == 12000000).
 */
static void
lzx_e8_filter(u8 *data, u32 size, void (*process_target)(void *, s32))
{
#if !defined(__SSE2__) && !defined(__AVX2__)
	/*
	 * A worthwhile optimization is to push the end-of-buffer check into
	 * the relatively rare E8 case.  This is possible if we replace the
	 * last six bytes of data with E8 bytes; then we are guaranteed to hit
	 * an E8 byte before reaching end-of-buffer.  In addition, this scheme
	 * guarantees that no translation can begin following an E8 byte in
	 * the last 10 bytes because a 4-byte offset containing E8 as its high
	 * byte is a large negative number that is not valid for translation.
	 * That is exactly what we need.
	 */
	u8 *tail;
	u8 saved_bytes[6];
	u8 *p;

	if (size <= 10)
		return;

	tail = &data[size - 6];
	memcpy(saved_bytes, tail, 6);
	memset(tail, 0xE8, 6);
	p = data;
	for (;;) {
		while (*p != 0xE8)
			p++;
		if (p >= tail)
			break;
		(*process_target)(p + 1, p - data);
		p += 5;
	}
	memcpy(tail, saved_bytes, 6);
#else
	/* SSE2 or AVX-2 optimized version for x86_64  */

	u8 *p = data;
	u64 valid_mask = ~0;

	if (size <= 10)
		return;
#ifdef __AVX2__
#  define ALIGNMENT_REQUIRED 32
#else
#  define ALIGNMENT_REQUIRED 16
#endif

	/* Process one byte at a time until the pointer is properly aligned. */
	while ((uintptr_t)p % ALIGNMENT_REQUIRED != 0) {
		if (p >= data + size - 10)
			return;
		if (*p == 0xE8 && (valid_mask & 1)) {
			(*process_target)(p + 1, p - data);
			valid_mask &= ~0x1F;
		}
		p++;
		valid_mask >>= 1;
		valid_mask |= (u64)1 << 63;
	}

	if (data + size - p >= 64) {
		/* Vectorized processing  */

		/* Note: we use a "trap" E8 byte to eliminate the need to check
		 * for end-of-buffer in the inner loop.  This byte is carefully
		 * positioned so that it will never be changed by a previous
		 * translation before it is detected.
		 */
		u8 *trap = p + ((data + size - p) & ~31) - 32 + 4;
		u8 saved_byte = *trap;
		*trap = 0xE8;

		for (;;) {
			u32 e8_mask;
			u8 *orig_p = p;
#ifdef __AVX2__
			const __m256i e8_bytes = _mm256_set1_epi8(0xE8);
			for (;;) {
				__m256i bytes = *(const __m256i *)p;
				__m256i cmpresult = _mm256_cmpeq_epi8(bytes, e8_bytes);
				e8_mask = _mm256_movemask_epi8(cmpresult);
				if (e8_mask)
					break;
				p += 32;
			}
#else
			const __m128i e8_bytes = _mm_set1_epi8(0xE8);
			for (;;) {
				/* Read the next 32 bytes of data and test them
				 * for E8 bytes.  */
				__m128i bytes1 = *(const __m128i *)p;
				__m128i bytes2 = *(const __m128i *)(p + 16);
				__m128i cmpresult1 = _mm_cmpeq_epi8(bytes1, e8_bytes);
				__m128i cmpresult2 = _mm_cmpeq_epi8(bytes2, e8_bytes);
				u32 mask1 = _mm_movemask_epi8(cmpresult1);
				u32 mask2 = _mm_movemask_epi8(cmpresult2);
				/* The masks have a bit set for each E8 byte.
				 * We stay in this fast inner loop as long as
				 * there are no E8 bytes.  */
				if (mask1 | mask2) {
					e8_mask = mask1 | (mask2 << 16);
					break;
				}
				p += 32;
			}
#endif

			/* Did we pass over data with no E8 bytes?  */
			if (p != orig_p)
				valid_mask = ~0;

			/* Are we nearing end-of-buffer?  */
			if (p == trap - 4)
				break;

			/* Process the E8 bytes.  However, the AND with
			 * 'valid_mask' ensures we never process an E8 byte that
			 * was itself part of a translation target.  */
			while ((e8_mask &= valid_mask)) {
				unsigned bit = bsf32(e8_mask);
				(*process_target)(p + bit + 1, p + bit - data);
				valid_mask &= ~((u64)0x1F << bit);
			}

			valid_mask >>= 32;
			valid_mask |= 0xFFFFFFFF00000000;
			p += 32;
		}

		*trap = saved_byte;
	}
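lzx_e8_filter only locates the E8 opcodes; the actual offset rewriting happens in the process_target callback it is given. Below is a simplified sketch of what a preprocessing-direction callback might look like, based on the description in the comment above. It is not the original callback: the helper names are hypothetical, and the "compensating translation" case described in the LZX documentation is omitted.

#include <stdint.h>
#include <string.h>

#define LZX_WIM_MAGIC_FILESIZE 12000000

/* Hypothetical unaligned little-endian 32-bit accessors. */
static int32_t get_le32(const void *p) { int32_t v; memcpy(&v, p, 4); return v; }
static void    put_le32(void *p, int32_t v) { memcpy(p, &v, 4); }

/* Sketch: turn the relative CALL target stored at 'target' (the 4 bytes
 * after the E8 opcode, which sits at input position 'input_pos') into an
 * absolute offset.  Only targets landing inside the magic file size are
 * translated. */
static void translate_target(void *target, int32_t input_pos)
{
    int32_t rel = get_le32(target);
    if (rel >= -input_pos && rel < LZX_WIM_MAGIC_FILESIZE)
        put_le32(target, rel + input_pos);
}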
void vec_i8_replace(int8_t *p, size_t n, int8_t val, int8_t substitute)
{
#ifdef COREARRAY_SIMD_SSE2

	// header 1, 16-byte aligned
	size_t h = (16 - ((size_t)p & 0x0F)) & 0x0F;
	for (; (n > 0) && (h > 0); n--, h--, p++)
		if (*p == val) *p = substitute;

	// body, SSE2
	const __m128i mask = _mm_set1_epi8(val);
	const __m128i sub = _mm_set1_epi8(substitute);

#   ifdef COREARRAY_SIMD_AVX2

	// header 2, 32-byte aligned
	if ((n >= 16) && ((size_t)p & 0x10))
	{
		__m128i v = _mm_load_si128((__m128i const*)p);
		__m128i c = _mm_cmpeq_epi8(v, mask);
		if (_mm_movemask_epi8(c))
		{
			_mm_store_si128((__m128i *)p,
				_mm_or_si128(_mm_and_si128(c, sub), _mm_andnot_si128(c, v)));
		}
		n -= 16; p += 16;
	}

	const __m256i mask2 = _mm256_set1_epi8(val);
	const __m256i sub32 = _mm256_set1_epi8(substitute);
	const __m256i zero = _mm256_setzero_si256();
	const __m256i ones = _mm256_cmpeq_epi64(zero, zero);

	for (; n >= 32; n-=32, p+=32)
	{
		__m256i v = _mm256_load_si256((__m256i const*)p);
		__m256i c = _mm256_cmpeq_epi8(v, mask2);
		if (_mm256_movemask_epi8(c))
		{
			// TODO
			_mm256_store_si256((__m256i *)p,
				_mm256_or_si256(_mm256_and_si256(c, sub32),
				_mm256_andnot_si256(c, v)));
		}
	}

#   endif

	for (; n >= 16; n-=16, p+=16)
	{
		__m128i v = _mm_load_si128((__m128i const*)p);
		__m128i c = _mm_cmpeq_epi8(v, mask);
		if (_mm_movemask_epi8(c))
			_mm_maskmoveu_si128(sub, c, (char*)p);
	}

#endif

	// tail
	for (; n > 0; n--, p++)
		if (*p == val) *p = substitute;
}
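The store in the AVX2 loop above uses the classic branch-free select: where the compare mask is all-ones, take the substitute; elsewhere keep the original byte. A scalar model of that (c & sub) | (~c & v) blend:

#include <stdint.h>
#include <stdio.h>

/* Scalar model of the SIMD select used above: c is 0xFF where the lane
 * matched and 0x00 where it did not. */
static uint8_t select_byte(uint8_t c, uint8_t sub, uint8_t v)
{
    return (uint8_t)((c & sub) | (~c & v));
}

int main(void)
{
    printf("%u\n", select_byte(0xFF, 7, 42)); /* 7:  matched lane replaced */
    printf("%u\n", select_byte(0x00, 7, 42)); /* 42: unmatched lane kept   */
    return 0;
}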
/* Function:  p7_MSVFilter()
 * Synopsis:  Calculates MSV score, vewy vewy fast, in limited precision.
 *
 * Purpose:   Calculates an approximation of the MSV score for sequence
 *            <dsq> of length <L> residues, using optimized profile <om>,
 *            and the one-row DP matrix <ox>. Returns the estimated MSV
 *            score (in nats) in <ret_sc>.
 *
 *            Score may overflow (and will, on high-scoring
 *            sequences), but will not underflow.
 *
 *            <ox> will be resized if needed. It's fine if it was
 *            just <_Reuse()'d> from a previous, smaller profile.
 *
 *            The model may be in any mode, because only its match
 *            emission scores will be used. The MSV filter inherently
 *            assumes a multihit local mode, and uses its own special
 *            state transition scores, not the scores in the profile.
 *
 * Args:      dsq     - digital target sequence, 1..L
 *            L       - length of dsq in residues
 *            om      - optimized profile
 *            ox      - filter DP matrix (one row)
 *            ret_sc  - RETURN: MSV score (in nats)
 *
 * Returns:   <eslOK> on success.
 *            <eslERANGE> if the score overflows the limited range; in
 *            this case, this is a high-scoring hit.
 *            <ox> may have been resized.
 *
 * Throws:    <eslEMEM> if <ox> reallocation fails.
 */
int
p7_MSVFilter_avx(const ESL_DSQ *dsq, int L, const P7_OPROFILE *om, P7_FILTERMX *ox, float *ret_sc)
{
#ifdef HAVE_AVX2
  uint8_t  xJ;                    /* special states' scores                                   */
  register __m256i mpv_AVX;       /* previous row values                                      */
  register __m256i xEv_AVX;       /* E state: keeps max for Mk->E as we go                    */
  register __m256i xBv_AVX;       /* B state: splatted vector of B[i-1] for B->Mk calculations */
  register __m256i sv_AVX;        /* temp storage of 1 curr row value in progress             */
  register __m256i biasv_AVX;     /* emission bias in a vector                                */
  __m256i *dp_AVX;                /* the dp row memory                                        */
  __m256i *rsc_AVX;               /* will point at om->rbv[x] for residue x[i]                */
  __m256i xJv_AVX;                /* vector for states score                                  */
  __m256i tjbmv_AVX;              /* vector for cost of moving {JN}->B->M                     */
  __m256i tecv_AVX;               /* vector for E->C cost                                     */
  __m256i basev_AVX;              /* offset for scores                                        */
  __m256i ceilingv_AVX;           /* saturated simd value used to test for overflow           */
  __m256i tempv_AVX;              /* work vector                                              */
  int Q_AVX = P7_NVB_AVX(om->M);  /* segment length: # of vectors                             */
  int q_AVX;                      /* counter over vectors 0..nq-1                             */
  int i;                          /* counter over sequence positions 1..L                     */
  int cmp;
  int status;
  //printf("Starting MSVFilter\n");

  /* Contract checks */
  ESL_DASSERT1(( om->mode == p7_LOCAL )); /* Production code assumes multilocal mode w/ length model <L> */
  ESL_DASSERT1(( om->L    == L ));        /* ... and it's easy to forget to set <om> that way            */
  ESL_DASSERT1(( om->nj   == 1.0f ));     /* ... hence the check                                         */
                                          /* ... which you can disable, if you're playing w/ config      */
  /* note however that it makes no sense to run MSV w/ a model in glocal mode */

  /* Try highly optimized Knudsen SSV filter first.
   * Note that SSV doesn't use any main memory (from <ox>) at all!
   */
  //extern uint64_t SSV_time;
  uint64_t filter_start_time = __rdtsc();
  status = p7_SSVFilter_avx(dsq, L, om, ret_sc);
  uint64_t filter_end_time = __rdtsc();
  //SSV_time += (filter_end_time - filter_start_time);
  if (status != eslENORESULT) return status;

  extern uint64_t full_MSV_calls;
  full_MSV_calls++;

  /* Resize the filter mx as needed */
  if (( status = p7_filtermx_GrowTo(ox, om->M)) != eslOK)
    ESL_EXCEPTION(status, "Reallocation of MSV filter matrix failed");

  dp_AVX = ox->dp_AVX;  /* ditto this */

  /* Matrix type and size must be set early, not late: debugging dump functions need this information. */
  ox->M    = om->M;
  ox->type = p7F_MSVFILTER;

  /* Initialization. In offset unsigned arithmetic, -infinity is 0, and 0 is om->base.
   */
  biasv_AVX = _mm256_set1_epi8((int8_t) om->bias_b); /* yes, you can set1() an unsigned char vector this way */
  for (q_AVX = 0; q_AVX < Q_AVX; q_AVX++) dp_AVX[q_AVX] = _mm256_setzero_si256();

  /* saturate simd register for overflow test */
  ceilingv_AVX = _mm256_cmpeq_epi8(biasv_AVX, biasv_AVX);
  basev_AVX    = _mm256_set1_epi8((int8_t) om->base_b);
  tjbmv_AVX    = _mm256_set1_epi8((int8_t) om->tjb_b + (int8_t) om->tbm_b);
  tecv_AVX     = _mm256_set1_epi8((int8_t) om->tec_b);
  xJv_AVX      = _mm256_subs_epu8(biasv_AVX, biasv_AVX);
  xBv_AVX      = _mm256_subs_epu8(basev_AVX, tjbmv_AVX);

#ifdef p7_DEBUGGING
  if (ox->do_dumping)
    {
      uint8_t xB;
      xB = _mm256_extract_epi8(xBv_AVX, 0);
      xJ = _mm256_extract_epi8(xJv_AVX, 0);
      p7_filtermx_DumpMFRow(ox, 0, 0, 0, xJ, xB, xJ);
    }
#endif

  for (i = 1; i <= L; i++)  /* Outer loop over residues */
    {
      rsc_AVX = om->rbv_AVX[dsq[i]];
      xEv_AVX = _mm256_setzero_si256();

      /* Right shifts by 1 byte. 4,8,12,x becomes x,4,8,12.
       * Because ia32 is littlendian, this means a left bit shift.
       * Zeros shift on automatically, which is our -infinity.
       */
      __m256i dp_temp_AVX = dp_AVX[Q_AVX - 1];
      mpv_AVX = esl_avx_leftshift_one(dp_temp_AVX);

      for (q_AVX = 0; q_AVX < Q_AVX; q_AVX++)
        {
          /* Calculate new MMXo(i,q); don't store it yet, hold it in sv. */
          sv_AVX  = _mm256_max_epu8(mpv_AVX, xBv_AVX);
          sv_AVX  = _mm256_adds_epu8(sv_AVX, biasv_AVX);
          sv_AVX  = _mm256_subs_epu8(sv_AVX, *rsc_AVX);   rsc_AVX++;
          xEv_AVX = _mm256_max_epu8(xEv_AVX, sv_AVX);

          mpv_AVX       = dp_AVX[q_AVX];  /* Load {MDI}(i-1,q) into mpv */
          dp_AVX[q_AVX] = sv_AVX;         /* Do delayed store of M(i,q) now that memory is usable */
        }

      /* test for the overflow condition */
      tempv_AVX = _mm256_adds_epu8(xEv_AVX, biasv_AVX);
      tempv_AVX = _mm256_cmpeq_epi8(tempv_AVX, ceilingv_AVX);
      cmp       = _mm256_movemask_epi8(tempv_AVX);

      /* Now the "special" states, which start from Mk->E (->C, ->J->B).
       * Use shuffles instead of shifts so when the last max has completed,
       * the last four elements of the simd register will contain the
       * max value. Then the last shuffle will broadcast the max value
       * to all simd elements.
       */
      /* broadcast the max byte of the original xEv_AVX to all bytes */
      xEv_AVX = _mm256_set1_epi8(esl_avx_hmax_epu8(xEv_AVX));

      /* immediately detect overflow */
      if (cmp != 0x0000)
        {
          // MSV_end_time = __rdtsc();
          // MSV_time += (MSV_end_time - MSV_start_time);
          *ret_sc = eslINFINITY;
          return eslERANGE;
        }

      xEv_AVX = _mm256_subs_epu8(xEv_AVX, tecv_AVX);
      xJv_AVX = _mm256_max_epu8(xJv_AVX, xEv_AVX);
      xBv_AVX = _mm256_max_epu8(basev_AVX, xJv_AVX);
      xBv_AVX = _mm256_subs_epu8(xBv_AVX, tjbmv_AVX);

#ifdef p7_DEBUGGING
      if (ox->do_dumping)
        {
          uint8_t xB, xE;
          xB = _mm256_extract_epi8(xBv_AVX, 0);
          xE = _mm256_extract_epi8(xEv_AVX, 0);
          xJ = _mm256_extract_epi8(xJv_AVX, 0);
          p7_filtermx_DumpMFRow(ox, i, xE, 0, xJ, xB, xJ);
        }
#endif
    } /* end loop over sequence residues 1..L */

  /* finally C->T, and add our missing precision on the NN,CC,JJ back */
  xJ = _mm256_extract_epi8(xJv_AVX, 0);

  *ret_sc = ((float) (xJ - om->tjb_b) - (float) om->base_b);
  *ret_sc /= om->scale_b;
  *ret_sc -= 3.0; /* that's ~ L \log \frac{L}{L+3}, for our NN,CC,JJ */
  /*
  MSV_end_time = __rdtsc();
  MSV_time += (MSV_end_time - MSV_start_time);
  */
  return eslOK;
#endif
#ifndef HAVE_AVX2
  return eslENORESULT;  /* Stub so we have something to link if we build without AVX2 support */
#endif
}
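The "offset unsigned arithmetic" the initialization comment mentions is what makes the saturating byte ops above work: scores live in unsigned bytes with 0 standing in for -infinity, so underflow clamps at 0 instead of wrapping. A tiny sketch with hypothetical values (requires AVX2):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    __m256i base = _mm256_set1_epi8(3);
    __m256i cost = _mm256_set1_epi8(10);
    __m256i r = _mm256_subs_epu8(base, cost);  /* 3 - 10 saturates to 0 */
    /* movemask over a compare with zero shows every lane pinned at 0. */
    int m = _mm256_movemask_epi8(_mm256_cmpeq_epi8(r, _mm256_setzero_si256()));
    printf("0x%08x\n", (unsigned)m);  /* 0xffffffff */
    return 0;
}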
size_t __FASTCALL strlen_fast_v1b_avx(const char * str)
{
    size_t len;
    register __m256i zero32, src32_low, src32_high;
    register size_t zero_mask_low, zero_mask_high;
    register uint64_t zero_mask;
    unsigned long zero_index;
    register const char * cur = str;
    // Get the misalignment from the address's last 6 bits.
    size_t misalignment = (size_t)str & 0x3F;
    if (misalignment != 0) {
        misalignment = (size_t)str & 0x0F;
        // Scan for the null terminator in the first misaligned bytes.
        register const char * end = cur + ((size_t)16UL - misalignment);
        while (cur < end) {
            // Look for the null terminator.
            if (*cur == '\0') {
                return (size_t)(cur - str);
            }
            cur++;
        }

        // Align the address up to 64 bytes for the main loop.
        end = (const char *)((size_t)str & ((size_t)~(size_t)0x3F)) + 64;

        register __m128i zero16, src16;
        register uint32_t zero_mask16;

        // Set the zero masks (16 bytes).
        INIT_ZERO_16(zero16);
        zero16 = _mm_xor_si128(zero16, zero16);

        // Minor 16-byte loop
        while (cur < end) {
            // Load 16 bytes of the string into an XMM register.
            src16 = _mm_load_si128((__m128i *)(cur));
            // Compare with the zero16 mask per byte.
            src16 = _mm_cmpeq_epi8(src16, zero16);
            // Pack the 16-byte compare result into 16 bits.
            zero_mask16 = (uint32_t)_mm_movemask_epi8(src16);
            // If any bit is 1, there is a null terminator
            // within these 16 bytes.
            if (zero_mask16 != 0) {
                // Get the index of the first bit set to 1.
                __BitScanForward(zero_index, zero_mask16);
                goto strlen_exit;
            }
            // One minor loop scans 16 bytes.
            cur += 16;
        }
    }

    // Set the zero masks (32 bytes).
    INIT_ZERO_32(zero32);
    zero32 = _mm256_xor_si256(zero32, zero32);

    // Main loop
    do {
        // Load 64 bytes of the string into two YMM registers.
        src32_low = _mm256_load_si256((__m256i *)(cur));
        src32_high = _mm256_load_si256((__m256i *)(cur + 32));
        // Compare with the zero32 mask per byte.
        src32_low = _mm256_cmpeq_epi8(src32_low, zero32);
        src32_high = _mm256_cmpeq_epi8(src32_high, zero32);
        // Pack each 32-byte compare result into 32 bits.
        zero_mask_low = (size_t)_mm256_movemask_epi8(src32_low);
        zero_mask_high = (size_t)_mm256_movemask_epi8(src32_high);

#if defined(_WIN64) || defined(WIN64) || defined(_M_X64) || defined(_M_AMD64) \
 || defined(_M_IA64) || defined(__amd64__) || defined(__x86_64__)
        // Combine the low 32 bits and high 32 bits into one mask.
        zero_mask = (zero_mask_high << 32) | zero_mask_low;

        // If any bit is 1, there is a null terminator
        // within these 64 bytes.
        if (zero_mask != 0) {
            // Get the index of the first bit set to 1.
            __BitScanForward64(zero_index, zero_mask);
            break;
        }
#else
        (void)zero_mask;
        // If any bit is 1, there is a null terminator
        // within these 64 bytes.
        if (zero_mask_low != 0) {
            // Get the index of the first bit set to 1.
            __BitScanForward(zero_index, zero_mask_low);
            break;
        }
        else if (zero_mask_high != 0) {
            // Get the index of the first bit set to 1.
            __BitScanForward(zero_index, zero_mask_high);
            zero_index += 32;
            break;
        }
#endif // _M_X64 || __x86_64__
        // One loop scans 64 bytes.
        cur += 64;
    } while (1);

strlen_exit:
    len = cur - str;
    len += zero_index;
    return len;
}
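The strlen variants above rely on __BitScanForward / __BitScanForward64 helper macros defined elsewhere in the project. One way such portability macros might be defined (an assumption, not the project's actual definitions): on MSVC they would wrap the _BitScanForward intrinsics, and elsewhere the GCC/Clang count-trailing-zeros builtins, both of which yield the index of the lowest set bit of a nonzero mask.

#if defined(_MSC_VER)
#include <intrin.h>
#define __BitScanForward(index, mask)    _BitScanForward(&(index), (unsigned long)(mask))
#define __BitScanForward64(index, mask)  _BitScanForward64(&(index), (mask))
#else
#define __BitScanForward(index, mask)    ((index) = (unsigned long)__builtin_ctzl(mask))
#define __BitScanForward64(index, mask)  ((index) = (unsigned long)__builtin_ctzll(mask))
#endif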
int test_mm256_movemask_epi8(__m256i a) {
  // CHECK-LABEL: test_mm256_movemask_epi8
  // CHECK: call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %{{.*}})
  return _mm256_movemask_epi8(a);
}
int test_mm256_movemask_epi8(__m256i a) {
  // CHECK: @llvm.x86.avx2.pmovmskb
  return _mm256_movemask_epi8(a);
}
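Both clang codegen tests simply pin the intrinsic to LLVM's avx2.pmovmskb intrinsic. As a reference model of its semantics (not how the instruction is implemented), the mask gathers the most significant bit of each of the 32 input bytes into the corresponding bit of a 32-bit integer:

#include <stdint.h>

/* Scalar reference for _mm256_movemask_epi8: byte i's top bit becomes
 * bit i of the result. */
static int movemask_epi8_ref(const uint8_t bytes[32])
{
    uint32_t mask = 0;
    for (int i = 0; i < 32; i++)
        mask |= (uint32_t)(bytes[i] >> 7) << i;
    return (int)mask;
}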
            zero_index += 32;
            goto strlen_exit;
        }
        // Advance to the next 64-byte-aligned address for the main loop.
        cur += 64;
    }
    else {
        // Align the address down to 64 bytes; the string starts 32+ bytes in.
        cur = (const char *)((size_t)cur & ((size_t)~(size_t)0x3F));
        // Load the high 32 bytes into a YMM register.
        src32_high = _mm256_load_si256((__m256i *)(cur + 32));
        // Compare with the zero32 mask per byte.
        src32_high = _mm256_cmpeq_epi8(src32_high, zero32);
        // Pack the 32-byte compare result into 32 bits.
        zero_mask_high = (size_t)_mm256_movemask_epi8(src32_high);

        // Skip 32 bytes.
        misalignment -= 32;
        // Clear the low misalignment bits, which precede the string.
        zero_mask_high >>= misalignment;
        zero_mask_high <<= misalignment;

        // If any bit is 1, there is a null terminator
        // within these 32 bytes.
        if (zero_mask_high != 0) {
            // Get the index of the first bit set to 1.
            __BitScanForward(zero_index, zero_mask_high);
            zero_index += 32;
            goto strlen_exit;
        }
        // Advance to the next 64-byte-aligned address for the main loop.