void mulrc16_shuffle_avx2(uint8_t *region, uint8_t constant, size_t length) { uint8_t *end; register __m256i in, out, t1, t2, m1, m2, l, h; register __m128i bc; if (constant == 0) { memset(region, 0, length); return; } if (constant == 1) return; bc = _mm_load_si128((void *)tl[constant]); t1 = __builtin_ia32_vbroadcastsi256(bc); bc = _mm_load_si128((void *)th[constant]); t2 = __builtin_ia32_vbroadcastsi256(bc); m1 = _mm256_set1_epi8(0x0f); m2 = _mm256_set1_epi8(0xf0); for (end=region+length; region<end; region+=32) { in = _mm256_load_si256((void *)region); l = _mm256_and_si256(in, m1); l = _mm256_shuffle_epi8(t1, l); h = _mm256_and_si256(in, m2); h = _mm256_srli_epi64(h, 4); h = _mm256_shuffle_epi8(t2, h); out = _mm256_xor_si256(h, l); _mm256_store_si256((void *)region, out); } }
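The shuffle variant above does two 16-entry nibble lookups per byte and XORs the results. A scalar sketch of the same per-byte computation, assuming tl[c] and th[c] are the low-/high-nibble product tables that the function loads (hypothetical helper, not part of the original source):

static void mulrc16_ref(uint8_t *region, uint8_t constant, size_t length)
{
	size_t i;
	if (constant == 0) {
		memset(region, 0, length);
		return;
	}
	if (constant == 1)
		return;
	/* Split each byte into nibbles, look both up, XOR the results;
	 * this is what the two shuffles above compute per lane. */
	for (i = 0; i < length; i++)
		region[i] = tl[constant][region[i] & 0x0f] ^ th[constant][region[i] >> 4];
}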
template <bool align> void EdgeBackgroundAdjustRangeMasked(uint8_t * backgroundCount, size_t backgroundCountStride, size_t width, size_t height, uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t threshold, const uint8_t * mask, size_t maskStride) { assert(width >= A); if(align) { assert(Aligned(backgroundValue) && Aligned(backgroundValueStride)); assert(Aligned(backgroundCount) && Aligned(backgroundCountStride)); assert(Aligned(mask) && Aligned(maskStride)); } const __m256i _threshold = _mm256_set1_epi8((char)threshold); size_t alignedWidth = AlignLo(width, A); __m256i tailMask = SetMask<uint8_t>(0, A - width + alignedWidth, 1); for(size_t row = 0; row < height; ++row) { for(size_t col = 0; col < alignedWidth; col += A) EdgeBackgroundAdjustRangeMasked<align>(backgroundCount, backgroundValue, mask, col, _threshold, K8_01); if(alignedWidth != width) EdgeBackgroundAdjustRangeMasked<false>(backgroundCount, backgroundValue, mask, width - A, _threshold, tailMask); backgroundValue += backgroundValueStride; backgroundCount += backgroundCountStride; mask += maskStride; } }
void* xmemchr(const void* src, int c, size_t n) {
    if (n < 32) {
        return xmemchr_tiny(src, c, n);
    }

    __m256i ymm0 = _mm256_set1_epi8((char)c), ymm1;
    int mask;
    size_t rem = n % 32;
    n /= 32;

    for (size_t i = 0; i < n; i++) {
        ymm1 = _mm256_loadu_si256((const __m256i*)src + i);
        ymm1 = _mm256_cmpeq_epi8(ymm1, ymm0);
        mask = _mm256_movemask_epi8(ymm1);
        if (mask) {
            __asm__("bsfl %0, %0\n\t"
                    :"=r"(mask)
                    :"0"(mask)
            );
            return (void*)((unsigned long)((const __m256i*)src + i) + mask);
        }
    }

    /* The remaining 'rem' bytes start n full 32-byte blocks past src,
     * not n bytes past it. */
    return xmemchr_tiny((const void*)((const __m256i*)src + n), c, rem);
}
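xmemchr defers short inputs and the trailing remainder to xmemchr_tiny, which is not shown here. A minimal sketch, assuming it is a plain byte-by-byte scan with the usual memchr semantics:

static void* xmemchr_tiny(const void* src, int c, size_t n)
{
    const unsigned char* p = (const unsigned char*)src;
    for (size_t i = 0; i < n; i++) {
        if (p[i] == (unsigned char)c)
            return (void*)(p + i);
    }
    return NULL;  /* not found */
}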
static inline __m256i enc_translate (const __m256i in)
{
	// LUT contains absolute offsets for all ranges:
	const __m256i lut = _mm256_setr_epi8(
		65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0,
		65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);

	// Translate values 0..63 to the Base64 alphabet. There are five sets:
	// #  From      To         Abs    Index    Characters
	// 0  [0..25]   [65..90]   +65    0        ABCDEFGHIJKLMNOPQRSTUVWXYZ
	// 1  [26..51]  [97..122]  +71    1        abcdefghijklmnopqrstuvwxyz
	// 2  [52..61]  [48..57]    -4    [2..11]  0123456789
	// 3  [62]      [43]       -19    12       +
	// 4  [63]      [47]       -16    13       /

	// Create LUT indices from input:
	// the index for range #0 is right, others are 1 less than expected:
	__m256i indices = _mm256_subs_epu8(in, _mm256_set1_epi8(51));

	// mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0:
	__m256i mask = CMPGT(in, 25);

	// subtract -1, i.e. add 1 to the indices for range #[1..4]; all indices are now correct:
	indices = _mm256_sub_epi8(indices, mask);

	// Add offsets to input values:
	__m256i out = _mm256_add_epi8(in, _mm256_shuffle_epi8(lut, indices));

	return out;
}
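The offset table can be cross-checked against a scalar version of the same five-range mapping; a hypothetical reference (not part of the library) that should agree with the vector path for every input 0..63:

static uint8_t enc_translate_scalar(uint8_t in)  /* in: 0..63 */
{
    if (in < 26)  return in + 65;   /* 'A'..'Z' */
    if (in < 52)  return in + 71;   /* 'a'..'z' */
    if (in < 62)  return in - 4;    /* '0'..'9' */
    if (in == 62) return in - 19;   /* '+'      */
    return in - 16;                 /* '/'      */
}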
void maddrc16_imul_avx2(uint8_t* region1, const uint8_t* region2, uint8_t constant, size_t length) { uint8_t *end; register __m256i reg1, reg2, ri[4], sp[4], mi[4]; const uint8_t *p = pt[constant]; if (constant == 0) return; if (constant == 1) { xorr_avx2(region1, region2, length); return; } mi[0] = _mm256_set1_epi8(0x11); mi[1] = _mm256_set1_epi8(0x22); mi[2] = _mm256_set1_epi8(0x44); mi[3] = _mm256_set1_epi8(0x88); sp[0] = _mm256_set1_epi16(p[0]); sp[1] = _mm256_set1_epi16(p[1]); sp[2] = _mm256_set1_epi16(p[2]); sp[3] = _mm256_set1_epi16(p[3]); for (end=region1+length; region1<end; region1+=32, region2+=32) { reg2 = _mm256_load_si256((void *)region2); reg1 = _mm256_load_si256((void *)region1); ri[0] = _mm256_and_si256(reg2, mi[0]); ri[1] = _mm256_and_si256(reg2, mi[1]); ri[2] = _mm256_and_si256(reg2, mi[2]); ri[3] = _mm256_and_si256(reg2, mi[3]); ri[1] = _mm256_srli_epi16(ri[1], 1); ri[2] = _mm256_srli_epi16(ri[2], 2); ri[3] = _mm256_srli_epi16(ri[3], 3); ri[0] = _mm256_mullo_epi16(ri[0], sp[0]); ri[1] = _mm256_mullo_epi16(ri[1], sp[1]); ri[2] = _mm256_mullo_epi16(ri[2], sp[2]); ri[3] = _mm256_mullo_epi16(ri[3], sp[3]); ri[0] = _mm256_xor_si256(ri[0], ri[1]); ri[2] = _mm256_xor_si256(ri[2], ri[3]); ri[0] = _mm256_xor_si256(ri[0], ri[2]); ri[0] = _mm256_xor_si256(ri[0], reg1); _mm256_store_si256((void *)region1, ri[0]); } }
uint64_t avx2_count_byte_popcount(const uint8_t* data, size_t size, uint8_t byte) { const __m256i v = _mm256_set1_epi8(byte); const uint8_t* end = data + size; const uint8_t* ptr = data; uint64_t result = 0; // 1. blocks of 8 registers while (ptr + 8*32 < end) { const __m256i eq0 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 0*32))); const __m256i eq1 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 1*32))); const __m256i eq2 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 2*32))); const __m256i eq3 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 3*32))); const __m256i eq4 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 4*32))); const __m256i eq5 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 5*32))); const __m256i eq6 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 6*32))); const __m256i eq7 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 7*32))); const __m256i eq0bit = _mm256_and_si256(eq0, _mm256_set1_epi8(0x01)); const __m256i eq1bit = _mm256_and_si256(eq1, _mm256_set1_epi8(0x02)); const __m256i eq2bit = _mm256_and_si256(eq2, _mm256_set1_epi8(0x04)); const __m256i eq3bit = _mm256_and_si256(eq3, _mm256_set1_epi8(0x08)); const __m256i eq4bit = _mm256_and_si256(eq4, _mm256_set1_epi8(0x10)); const __m256i eq5bit = _mm256_and_si256(eq5, _mm256_set1_epi8(0x20)); const __m256i eq6bit = _mm256_and_si256(eq6, _mm256_set1_epi8(0x40)); const __m256i eq7bit = _mm256_and_si256(eq7, _mm256_set1_epi8(int8_t(0x80))); const __m256i m01 = _mm256_or_si256(eq0bit, eq1bit); const __m256i m23 = _mm256_or_si256(eq2bit, eq3bit); const __m256i m45 = _mm256_or_si256(eq4bit, eq5bit); const __m256i m67 = _mm256_or_si256(eq6bit, eq7bit); const __m256i m0123 = _mm256_or_si256(m01, m23); const __m256i m4567 = _mm256_or_si256(m45, m67); const __m256i merged = _mm256_or_si256(m0123, m4567); result += __builtin_popcountll(_mm256_extract_epi64(merged, 0)); result += __builtin_popcountll(_mm256_extract_epi64(merged, 1)); result += __builtin_popcountll(_mm256_extract_epi64(merged, 2)); result += __builtin_popcountll(_mm256_extract_epi64(merged, 3)); ptr += 8 * 32; } return result + scalar_count_bytes(ptr, end - ptr, byte); }
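Both counting routines finish with scalar_count_bytes, which is not shown. A minimal sketch, assuming it is a straightforward scalar loop over the remaining bytes:

static uint64_t scalar_count_bytes(const uint8_t* data, size_t size, uint8_t byte)
{
    uint64_t count = 0;
    for (size_t i = 0; i < size; i++)
        count += (data[i] == byte);
    return count;
}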
uint64_t avx2_count_byte(const uint8_t* data, size_t size, uint8_t byte) { const __m256i v = _mm256_set1_epi8(byte); const uint8_t* end = data + size; const uint8_t* ptr = data; __m256i global_sum = _mm256_setzero_si256(); __m256i local_sum; // 1. blocks of 256 registers while (ptr + 255*32 < end) { local_sum = _mm256_setzero_si256(); // update 32 x 8-bit counter for (int i=0; i < 255; i++, ptr += 32) { const __m256i in = _mm256_loadu_si256((const __m256i*)ptr); const __m256i eq = _mm256_cmpeq_epi8(in, v); // 0 or -1 local_sum = _mm256_sub_epi8(local_sum, eq); } // update the global accumulator 2 x 64-bit const __m256i tmp = _mm256_sad_epu8(local_sum, _mm256_setzero_si256()); global_sum = _mm256_add_epi64(global_sum, tmp); } // 2. tail of < 256 registers local_sum = _mm256_setzero_si256(); while (ptr + 32 < end) { const __m256i in = _mm256_loadu_si256((const __m256i*)ptr); const __m256i eq = _mm256_cmpeq_epi8(in, v); local_sum = _mm256_sub_epi8(local_sum, eq); ptr += 32; } const __m256i tmp = _mm256_sad_epu8(local_sum, _mm256_setzero_si256()); global_sum = _mm256_add_epi64(global_sum, tmp); // 3. process tail < 32 bytes uint64_t result = _mm256_extract_epi64(global_sum, 0) + _mm256_extract_epi64(global_sum, 1) + _mm256_extract_epi64(global_sum, 2) + _mm256_extract_epi64(global_sum, 3); return result + scalar_count_bytes(ptr, end - ptr, byte); }
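The inner loop is capped at 255 iterations because each 8-bit lane of local_sum grows by at most one per 32-byte block, so the lane counters cannot wrap before they are flushed into global_sum via _mm256_sad_epu8. A small hypothetical usage example:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
    size_t size = 1 << 20;
    uint8_t* buf = (uint8_t*)malloc(size);
    memset(buf, 0xAB, size);
    buf[123] = 0;
    buf[456789] = 0;
    /* expect 2 */
    printf("%llu\n", (unsigned long long)avx2_count_byte(buf, size, 0x00));
    free(buf);
    return 0;
}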
uint8_t get_xE_avx(const ESL_DSQ *dsq, int L, const P7_OPROFILE *om) { #ifdef HAVE_AVX2 __m256i xEv_AVX; /* E state: keeps max for Mk->E as we go */ __m256i beginv_AVX; /* begin scores */ uint8_t retval_AVX; int q_AVX; /* counter over vectors 0..nq-1 */ int Q_AVX = P7_NVB_AVX(om->M); /* segment length: # of vectors */ int bands_AVX; /* the number of bands (rounds) to use */ beginv_AVX = _mm256_set1_epi8(128); xEv_AVX = beginv_AVX; /* function pointers for the various number of vectors to use */ __m256i (*fs_AVX[MAX_BANDS + 1]) (const ESL_DSQ *, int, const P7_OPROFILE *, int, register __m256i, __m256i) = {NULL , calc_band_1_AVX, calc_band_2_AVX, calc_band_3_AVX, calc_band_4_AVX, calc_band_5_AVX, calc_band_6_AVX #if MAX_BANDS > 6 , calc_band_7_AVX, calc_band_8_AVX, calc_band_9_AVX, calc_band_10_AVX, calc_band_11_AVX, calc_band_12_AVX , calc_band_13_AVX, calc_band_14_AVX #endif #if MAX_BANDS > 14 , calc_band_15_AVX, calc_band_16_AVX, calc_band_17_AVX, calc_band_18_AVX #endif }; int last_q; /* for saving the last q value to find band width */ int i; /* counter for bands */ last_q = 0; // reset in case we also ran SSE code /* Use the highest number of bands but no more than MAX_BANDS */ bands_AVX = (Q_AVX + MAX_BANDS - 1) / MAX_BANDS; for (i = 0; i < bands_AVX; i++) { q_AVX = (Q_AVX * (i + 1)) / bands_AVX; xEv_AVX = fs_AVX[q_AVX-last_q](dsq, L, om, last_q, beginv_AVX, xEv_AVX); last_q = q_AVX; } retval_AVX = esl_avx_hmax_epu8(xEv_AVX); return retval_AVX; #endif // HAVE_AVX2 #ifndef HAVE_AVX2 return 0; // Stub so there's something to link if we don't have AVX2 support #endif }
void maddrc16_shuffle_avx2(uint8_t* region1, const uint8_t* region2, uint8_t constant, size_t length) { uint8_t *end; register __m256i in1, in2, out, t1, t2, m1, m2, l, h; register __m128i bc; if (constant == 0) return; if (constant == 1) { xorr_avx2(region1, region2, length); return; } bc = _mm_load_si128((void *)tl[constant]); t1 = __builtin_ia32_vbroadcastsi256(bc); bc = _mm_load_si128((void *)th[constant]); t2 = __builtin_ia32_vbroadcastsi256(bc); m1 = _mm256_set1_epi8(0x0f); m2 = _mm256_set1_epi8(0xf0); for (end=region1+length; region1<end; region1+=32, region2+=32) { in2 = _mm256_load_si256((void *)region2); in1 = _mm256_load_si256((void *)region1); l = _mm256_and_si256(in2, m1); l = _mm256_shuffle_epi8(t1, l); h = _mm256_and_si256(in2, m2); h = _mm256_srli_epi64(h, 4); h = _mm256_shuffle_epi8(t2, h); out = _mm256_xor_si256(h,l); out = _mm256_xor_si256(out, in1); _mm256_store_si256((void *)region1, out); } }
static INLINE unsigned int masked_sad32xh_avx2( const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, int width, int height) { int x, y; __m256i res = _mm256_setzero_si256(); const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); const __m256i round_scale = _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); for (y = 0; y < height; y++) { for (x = 0; x < width; x += 32) { const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]); const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]); const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]); const __m256i m = _mm256_lddqu_si256((const __m256i *)&m_ptr[x]); const __m256i m_inv = _mm256_sub_epi8(mask_max, m); // Calculate 16 predicted pixels. // Note that the maximum value of any entry of 'pred_l' or 'pred_r' // is 64 * 255, so we have plenty of space to add rounding constants. const __m256i data_l = _mm256_unpacklo_epi8(a, b); const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv); __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l); pred_l = _mm256_mulhrs_epi16(pred_l, round_scale); const __m256i data_r = _mm256_unpackhi_epi8(a, b); const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv); __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r); pred_r = _mm256_mulhrs_epi16(pred_r, round_scale); const __m256i pred = _mm256_packus_epi16(pred_l, pred_r); res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src)); } src_ptr += src_stride; a_ptr += a_stride; b_ptr += b_stride; m_ptr += m_stride; } // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'. res = _mm256_shuffle_epi32(res, 0xd8); res = _mm256_permute4x64_epi64(res, 0xd8); res = _mm256_hadd_epi32(res, res); res = _mm256_hadd_epi32(res, res); int32_t sad = _mm256_extract_epi32(res, 0); return (sad + 31) >> 6; }
template <bool align> void SquaredDifferenceSumMasked( const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum) { assert(width < 0x10000); if(align) { assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride)); assert(Aligned(mask) && Aligned(maskStride)); } size_t bodyWidth = AlignLo(width, A); __m256i tailMask = SetMask<uint8_t>(0, A - width + bodyWidth, 0xFF); __m256i fullSum = _mm256_setzero_si256(); __m256i index_= _mm256_set1_epi8(index); for(size_t row = 0; row < height; ++row) { __m256i rowSum = _mm256_setzero_si256(); for(size_t col = 0; col < bodyWidth; col += A) { const __m256i mask_ = LoadMaskI8<align>((__m256i*)(mask + col), index_); const __m256i a_ = _mm256_and_si256(mask_, Load<align>((__m256i*)(a + col))); const __m256i b_ = _mm256_and_si256(mask_, Load<align>((__m256i*)(b + col))); rowSum = _mm256_add_epi32(rowSum, SquaredDifference(a_, b_)); } if(width - bodyWidth) { const __m256i mask_ = _mm256_and_si256(tailMask, LoadMaskI8<false>((__m256i*)(mask + width - A), index_)); const __m256i a_ = _mm256_and_si256(mask_, Load<false>((__m256i*)(a + width - A))); const __m256i b_ = _mm256_and_si256(mask_, Load<false>((__m256i*)(b + width - A))); rowSum = _mm256_add_epi32(rowSum, SquaredDifference(a_, b_)); } fullSum = _mm256_add_epi64(fullSum, HorizontalSum32(rowSum)); a += aStride; b += bStride; mask += maskStride; } *sum = ExtractSum<uint64_t>(fullSum); }
static INLINE void comp_mask_pred_line_avx2(const __m256i s0, const __m256i s1,
                                            const __m256i a,
                                            uint8_t *comp_pred) {
  const __m256i alpha_max = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const int16_t round_bits = 15 - AOM_BLEND_A64_ROUND_BITS;
  const __m256i round_offset = _mm256_set1_epi16(1 << (round_bits));

  const __m256i ma = _mm256_sub_epi8(alpha_max, a);

  const __m256i ssAL = _mm256_unpacklo_epi8(s0, s1);
  const __m256i aaAL = _mm256_unpacklo_epi8(a, ma);
  const __m256i ssAH = _mm256_unpackhi_epi8(s0, s1);
  const __m256i aaAH = _mm256_unpackhi_epi8(a, ma);

  const __m256i blendAL = _mm256_maddubs_epi16(ssAL, aaAL);
  const __m256i blendAH = _mm256_maddubs_epi16(ssAH, aaAH);
  const __m256i roundAL = _mm256_mulhrs_epi16(blendAL, round_offset);
  const __m256i roundAH = _mm256_mulhrs_epi16(blendAH, round_offset);

  const __m256i roundA = _mm256_packus_epi16(roundAL, roundAH);
  _mm256_storeu_si256((__m256i *)(comp_pred), roundA);
}
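Per pixel this is a weighted blend: maddubs forms a*s0 + (alpha_max - a)*s1 and mulhrs against 1 << (15 - AOM_BLEND_A64_ROUND_BITS) rounds and scales it back to 8 bits. A scalar sketch of the same arithmetic, assuming AOM_BLEND_A64_MAX_ALPHA is 64 and AOM_BLEND_A64_ROUND_BITS is 6 (the values the constants above imply):

static uint8_t blend_a64_scalar(uint8_t s0, uint8_t s1, uint8_t m)  /* m: 0..64 */
{
    /* _mm256_mulhrs_epi16(x, 1 << 9) == (x + 32) >> 6 for this value range */
    int32_t v = (int32_t)m * s0 + (int32_t)(64 - m) * s1;
    return (uint8_t)((v + 32) >> 6);
}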
static __m256i avx2_popcount(const __m256i vec) {

    const __m256i lookup = _mm256_setr_epi8(
        /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4,

        /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4
    );

    const __m256i low_mask = _mm256_set1_epi8(0x0f);

    const __m256i lo = _mm256_and_si256(vec, low_mask);
    const __m256i hi = _mm256_and_si256(_mm256_srli_epi16(vec, 4), low_mask);
    const __m256i popcnt1 = _mm256_shuffle_epi8(lookup, lo);
    const __m256i popcnt2 = _mm256_shuffle_epi8(lookup, hi);

    return _mm256_add_epi8(popcnt1, popcnt2);
}
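The return value holds 32 independent per-byte popcounts; to collapse them into a scalar, the usual companion step is a _mm256_sad_epu8 reduction. A sketch for a single vector (a real loop would accumulate the SAD results in a vector instead of extracting every time):

static uint64_t avx2_popcount_reduce(const __m256i vec)
{
    /* SAD against zero sums each group of 8 byte-counts into a 64-bit lane. */
    const __m256i sad = _mm256_sad_epu8(avx2_popcount(vec), _mm256_setzero_si256());
    return (uint64_t)_mm256_extract_epi64(sad, 0) +
           (uint64_t)_mm256_extract_epi64(sad, 1) +
           (uint64_t)_mm256_extract_epi64(sad, 2) +
           (uint64_t)_mm256_extract_epi64(sad, 3);
}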
template<bool align> void HistogramMasked(const uint8_t * src, size_t srcStride, size_t width, size_t height, const uint8_t * mask, size_t maskStride, uint8_t index, uint32_t * histogram) { Buffer<uint16_t> buffer(AlignHi(width, A), HISTOGRAM_SIZE + 8); size_t widthAligned4 = Simd::AlignLo(width, 4); size_t widthAlignedA = Simd::AlignLo(width, A); size_t widthAlignedDA = Simd::AlignLo(width, DA); __m256i _index = _mm256_set1_epi8(index); for(size_t row = 0; row < height; ++row) { size_t col = 0; for(; col < widthAlignedDA; col += DA) { MaskSrc<align, true>(src, mask, _index, col, buffer.v); MaskSrc<align, true>(src, mask, _index, col + A, buffer.v); } for(; col < widthAlignedA; col += A) MaskSrc<align, true>(src, mask, _index, col, buffer.v); if(width != widthAlignedA) MaskSrc<false, false>(src, mask, _index, width - A, buffer.v); for(col = 0; col < widthAligned4; col += 4) { ++buffer.h[0][buffer.v[col + 0]]; ++buffer.h[1][buffer.v[col + 1]]; ++buffer.h[2][buffer.v[col + 2]]; ++buffer.h[3][buffer.v[col + 3]]; } for(; col < width; ++col) ++buffer.h[0][buffer.v[col]]; src += srcStride; mask += maskStride; } SumHistograms(buffer.h[0], 8, histogram); }
int normHamming(const uchar* a, const uchar* b, int n) { CV_AVX_GUARD; int i = 0; int result = 0; #if CV_AVX2 { __m256i _r0 = _mm256_setzero_si256(); __m256i _0 = _mm256_setzero_si256(); __m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4); __m256i _popcnt_mask = _mm256_set1_epi8(0x0F); for(; i <= n - 32; i+= 32) { __m256i _a0 = _mm256_loadu_si256((const __m256i*)(a + i)); __m256i _b0 = _mm256_loadu_si256((const __m256i*)(b + i)); __m256i _xor = _mm256_xor_si256(_a0, _b0); __m256i _popc0 = _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_xor, _popcnt_mask)); __m256i _popc1 = _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_mm256_srli_epi16(_xor, 4), _popcnt_mask)); _r0 = _mm256_add_epi32(_r0, _mm256_sad_epu8(_0, _mm256_add_epi8(_popc0, _popc1))); } _r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2)); result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0); } #endif // CV_AVX2 #if CV_POPCNT { # if defined CV_POPCNT_U64 for(; i <= n - 8; i += 8) { result += (int)CV_POPCNT_U64(*(uint64*)(a + i) ^ *(uint64*)(b + i)); } # endif for(; i <= n - 4; i += 4) { result += CV_POPCNT_U32(*(uint*)(a + i) ^ *(uint*)(b + i)); } } #endif // CV_POPCNT #if CV_SIMD128 { v_uint32x4 t = v_setzero_u32(); for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes) { t += v_popcount(v_load(a + i) ^ v_load(b + i)); } result += v_reduce_sum(t); } #endif // CV_SIMD128 #if CV_ENABLE_UNROLLED for(; i <= n - 4; i += 4) { result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] + popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]]; } #endif for(; i < n; i++) { result += popCountTable[a[i] ^ b[i]]; } return result; }
/* Function: p7_MSVFilter() * Synopsis: Calculates MSV score, vewy vewy fast, in limited precision. * * Purpose: Calculates an approximation of the MSV score for sequence * <dsq> of length <L> residues, using optimized profile <om>, * and the one-row DP matrix <ox>. Return the * estimated MSV score (in nats) in <ret_sc>. * * Score may overflow (and will, on high-scoring * sequences), but will not underflow. * * <ox> will be resized if needed. It's fine if it was * just <_Reuse()'d> from a previous, smaller profile. * * The model may be in any mode, because only its match * emission scores will be used. The MSV filter inherently * assumes a multihit local mode, and uses its own special * state transition scores, not the scores in the profile. * * Args: dsq - digital target sequence, 1..L * L - length of dsq in residues * om - optimized profile * ox - filter DP matrix (one row) * ret_sc - RETURN: MSV score (in nats) * * Returns: <eslOK> on success. * <eslERANGE> if the score overflows the limited range; in * this case, this is a high-scoring hit. * <ox> may have been resized. * * Throws: <eslEMEML> if <ox> reallocation fails. */ int p7_MSVFilter_avx(const ESL_DSQ *dsq, int L, const P7_OPROFILE *om, P7_FILTERMX *ox, float *ret_sc) { #ifdef HAVE_AVX2 uint8_t xJ; /* special states' scores */ register __m256i mpv_AVX; /* previous row values */ register __m256i xEv_AVX; /* E state: keeps max for Mk->E as we go */ register __m256i xBv_AVX; /* B state: splatted vector of B[i-1] for B->Mk calculations */ register __m256i sv_AVX; /* temp storage of 1 curr row value in progress */ register __m256i biasv_AVX; /* emission bias in a vector */ __m256i *dp_AVX; /* the dp row memory */ __m256i *rsc_AVX; /* will point at om->rbv[x] for residue x[i] */ __m256i xJv_AVX; /* vector for states score */ __m256i tjbmv_AVX; /* vector for cost of moving {JN}->B->M */ __m256i tecv_AVX; /* vector for E->C cost */ __m256i basev_AVX; /* offset for scores */ __m256i ceilingv_AVX; /* saturated simd value used to test for overflow */ __m256i tempv_AVX; /* work vector */ int Q_AVX = P7_NVB_AVX(om->M); /* segment length: # of vectors */ int q_AVX; /* counter over vectors 0..nq-1 */ int i; /* counter over sequence positions 1..L */ int cmp; int status; //printf("Starting MSVFilter\n"); /* Contract checks */ ESL_DASSERT1(( om->mode == p7_LOCAL )); /* Production code assumes multilocal mode w/ length model <L> */ ESL_DASSERT1(( om->L == L )); /* ... and it's easy to forget to set <om> that way */ ESL_DASSERT1(( om->nj == 1.0f )); /* ... hence the check */ /* ... which you can disable, if you're playing w/ config */ /* note however that it makes no sense to run MSV w/ a model in glocal mode */ /* Try highly optimized Knudsen SSV filter first. * Note that SSV doesn't use any main memory (from <ox>) at all! */ //extern uint64_t SSV_time; uint64_t filter_start_time = __rdtsc(); status = p7_SSVFilter_avx(dsq, L, om, ret_sc); uint64_t filter_end_time = __rdtsc(); //SSV_time += (filter_end_time - filter_start_time); if (status != eslENORESULT) return status; extern uint64_t full_MSV_calls; full_MSV_calls++; /* Resize the filter mx as needed */ if (( status = p7_filtermx_GrowTo(ox, om->M)) != eslOK) ESL_EXCEPTION(status, "Reallocation of MSV filter matrix failed"); dp_AVX = ox->dp_AVX; /* ditto this */ /* Matrix type and size must be set early, not late: debugging dump functions need this information. */ ox->M = om->M; ox->type = p7F_MSVFILTER; /* Initialization. In offset unsigned arithmetic, -infinity is 0, and 0 is om->base. 
*/ biasv_AVX = _mm256_set1_epi8((int8_t) om->bias_b); /* yes, you can set1() an unsigned char vector this way */ for (q_AVX = 0; q_AVX < Q_AVX; q_AVX++) dp_AVX[q_AVX] = _mm256_setzero_si256(); /* saturate simd register for overflow test */ ceilingv_AVX = _mm256_cmpeq_epi8(biasv_AVX, biasv_AVX); basev_AVX = _mm256_set1_epi8((int8_t) om->base_b); tjbmv_AVX = _mm256_set1_epi8((int8_t) om->tjb_b + (int8_t) om->tbm_b); tecv_AVX = _mm256_set1_epi8((int8_t) om->tec_b); xJv_AVX = _mm256_subs_epu8(biasv_AVX, biasv_AVX); xBv_AVX = _mm256_subs_epu8(basev_AVX, tjbmv_AVX); #ifdef p7_DEBUGGING if (ox->do_dumping) { uint8_t xB; xB = _mm_extract_epi16(xBv, 0); xJ = _mm_extract_epi16(xJv, 0); p7_filtermx_DumpMFRow(ox, 0, 0, 0, xJ, xB, xJ); } #endif for (i = 1; i <= L; i++) /* Outer loop over residues*/ { rsc_AVX = om->rbv_AVX[dsq[i]]; xEv_AVX = _mm256_setzero_si256(); /* Right shifts by 1 byte. 4,8,12,x becomes x,4,8,12. * Because ia32 is littlendian, this means a left bit shift. * Zeros shift on automatically, which is our -infinity. */ __m256i dp_temp_AVX = dp_AVX[Q_AVX -1]; mpv_AVX = esl_avx_leftshift_one(dp_temp_AVX); for (q_AVX = 0; q_AVX < Q_AVX; q_AVX++) { /* Calculate new MMXo(i,q); don't store it yet, hold it in sv. */ sv_AVX = _mm256_max_epu8(mpv_AVX, xBv_AVX); sv_AVX = _mm256_adds_epu8(sv_AVX, biasv_AVX); sv_AVX = _mm256_subs_epu8(sv_AVX, *rsc_AVX); rsc_AVX++; xEv_AVX = _mm256_max_epu8(xEv_AVX, sv_AVX); mpv_AVX = dp_AVX[q_AVX]; /* Load {MDI}(i-1,q) into mpv */ dp_AVX[q_AVX] = sv_AVX; /* Do delayed store of M(i,q) now that memory is usable */ } /* test for the overflow condition */ tempv_AVX = _mm256_adds_epu8(xEv_AVX, biasv_AVX); tempv_AVX = _mm256_cmpeq_epi8(tempv_AVX, ceilingv_AVX); cmp = _mm256_movemask_epi8(tempv_AVX); /* Now the "special" states, which start from Mk->E (->C, ->J->B) * Use shuffles instead of shifts so when the last max has completed, * the last four elements of the simd register will contain the * max value. Then the last shuffle will broadcast the max value * to all simd elements. */ xEv_AVX = _mm256_set1_epi8(esl_avx_hmax_epu8(xEv_AVX)); // broadcast the max byte from original xEv_AVX // to all bytes of xEv_AVX /* immediately detect overflow */ if (cmp != 0x0000) { // MSV_end_time = __rdtsc(); // MSV_time += (MSV_end_time - MSV_start_time); *ret_sc = eslINFINITY; return eslERANGE; } xEv_AVX = _mm256_subs_epu8(xEv_AVX, tecv_AVX); xJv_AVX = _mm256_max_epu8(xJv_AVX,xEv_AVX); xBv_AVX = _mm256_max_epu8(basev_AVX, xJv_AVX); xBv_AVX = _mm256_subs_epu8(xBv_AVX, tjbmv_AVX); #ifdef p7_DEBUGGING if (ox->do_dumping) { uint8_t xB, xE; xB = _mm_extract_epi16(xBv, 0); xE = _mm_extract_epi16(xEv, 0); xJ = _mm_extract_epi16(xJv, 0); p7_filtermx_DumpMFRow(ox, i, xE, 0, xJ, xB, xJ); } #endif } /* end loop over sequence residues 1..L */ /* finally C->T, and add our missing precision on the NN,CC,JJ back */ xJ = _mm256_extract_epi8(xJv_AVX, 0); *ret_sc = ((float) (xJ - om->tjb_b) - (float) om->base_b); *ret_sc /= om->scale_b; *ret_sc -= 3.0; /* that's ~ L \log \frac{L}{L+3}, for our NN,CC,JJ */ /* MSV_end_time = __rdtsc(); MSV_time += (MSV_end_time - MSV_start_time); */ return eslOK; #endif #ifndef HAVE_AVX2 return eslENORESULT; // Stub so we have something to link if we build without AVX2 support #endif }
size_t vec_i8_count(const char *p, size_t n, char val) { size_t num = 0; #ifdef COREARRAY_SIMD_SSE2 // header 1, 16-byte aligned size_t h = (16 - ((size_t)p & 0x0F)) & 0x0F; for (; (n > 0) && (h > 0); n--, h--) if (*p++ == val) num++; # ifdef COREARRAY_SIMD_AVX2 // body, AVX2 const __m128i zeros = _mm_setzero_si128(); const __m256i mask = _mm256_set1_epi8(val); __m256i sum = _mm256_setzero_si256(); size_t offset = 0; // header 2, 32-byte aligned if ((n >= 16) && ((size_t)p & 0x10)) { __m128i v = _mm_load_si128((__m128i const*)p); __m128i c1 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask)); sum = MM_SET_M128(_mm_sub_epi8(zeros, c1), zeros); n -= 16; p += 16; } for (; n >= 128; n-=128) { __m256i v = _mm256_load_si256((__m256i const*)p); p += 32; sum = _mm256_sub_epi8(sum, _mm256_cmpeq_epi8(v, mask)); v = _mm256_load_si256((__m256i const*)p); p += 32; sum = _mm256_sub_epi8(sum, _mm256_cmpeq_epi8(v, mask)); v = _mm256_load_si256((__m256i const*)p); p += 32; sum = _mm256_sub_epi8(sum, _mm256_cmpeq_epi8(v, mask)); v = _mm256_load_si256((__m256i const*)p); p += 32; sum = _mm256_sub_epi8(sum, _mm256_cmpeq_epi8(v, mask)); offset += 4; if (offset >= 252) { num += vec_avx_sum_u8(sum); sum = _mm256_setzero_si256(); offset = 0; } } for (; n >= 32; n-=32) { __m256i v = _mm256_load_si256((__m256i const*)p); p += 32; sum = _mm256_sub_epi8(sum, _mm256_cmpeq_epi8(v, mask)); if ((++offset) >= 252) { num += vec_avx_sum_u8(sum); sum = _mm256_setzero_si256(); offset = 0; } } if (n >= 16) { __m128i v = _mm_load_si128((__m128i const*)p); __m128i c1 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask)); sum = _mm256_sub_epi8(sum, MM_SET_M128(zeros, c1)); n -= 16; p += 16; } if (offset > 0) num += vec_avx_sum_u8(sum); # else // body, SSE2 const __m128i mask = _mm_set1_epi8(val); __m128i sum = _mm_setzero_si128(); size_t offset = 0; for (; n >= 64; n-=64) { __m128i v = _mm_load_si128((__m128i const*)p); p += 16; sum = _mm_sub_epi8(sum, _mm_cmpeq_epi8(v, mask)); v = _mm_load_si128((__m128i const*)p); p += 16; sum = _mm_sub_epi8(sum, _mm_cmpeq_epi8(v, mask)); v = _mm_load_si128((__m128i const*)p); p += 16; sum = _mm_sub_epi8(sum, _mm_cmpeq_epi8(v, mask)); v = _mm_load_si128((__m128i const*)p); p += 16; sum = _mm_sub_epi8(sum, _mm_cmpeq_epi8(v, mask)); offset += 4; if (offset >= 252) { num += vec_sum_u8(sum); sum = _mm_setzero_si128(); offset = 0; } } for (; n >= 16; n-=16, p+=16) { __m128i v = _mm_load_si128((__m128i const*)p); sum = _mm_sub_epi8(sum, _mm_cmpeq_epi8(v, mask)); if ((++offset) >= 252) { num += vec_sum_u8(sum); sum = _mm_setzero_si128(); offset = 0; } } if (offset > 0) num += vec_sum_u8(sum); #endif #endif // tail for (; n > 0; n--) if (*p++ == val) num++; return num; }
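The AVX2 path flushes its 8-bit lane counters through vec_avx_sum_u8 once offset reaches 252 vectors (the SSE2 path uses vec_sum_u8 the same way); neither helper is shown. A minimal sketch of the AVX2 one, assuming it simply sums the 32 unsigned bytes of its argument:

static inline size_t vec_avx_sum_u8(__m256i s)
{
    /* SAD against zero gives four 64-bit partial sums of the 32 bytes. */
    __m256i sad = _mm256_sad_epu8(s, _mm256_setzero_si256());
    __m128i sum = _mm_add_epi64(_mm256_castsi256_si128(sad),
                                _mm256_extracti128_si256(sad, 1));
    sum = _mm_add_epi64(sum, _mm_unpackhi_epi64(sum, sum));
    return (size_t)_mm_cvtsi128_si64(sum);
}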
/*!
 * \brief Fill a packed vector by replicating a value
 */
ETL_STATIC_INLINE(avx_simd_byte) set(int8_t value) {
    return _mm256_set1_epi8(value);
}
void nibble_sort_beekman1(uint64_t *buf) { // already in the right order //__m256i // shuf0={0x1716151413121110ULL,0x1f1e1d1c1b1a1918ULL,0x0706050403020100ULL,0x0f0e0d0c0b0a0908ULL}; __m256i shuf1 = {0x1e161c141a121810ULL, 0x1f171d151b131911ULL, 0x0e060c040a020800ULL, 0x0f070d050b030901ULL}; __m256i shuf2 = {0x1d1c151419181110ULL, 0x1f1e17161b1a1312ULL, 0x0d0c050409080100ULL, 0x0f0e07060b0a0302ULL}; // use less instructions below //__m256i // shuf3={0x1b1a191813121110ULL,0x1f1e1d1c17161514ULL,0x0b0a090803020100ULL,0x0f0e0d0c07060504ULL}; __m256i shuf4 = {0x101d171615141311ULL, 0x1f1e1b191a181c12ULL, 0x000d070605040301ULL, 0x0f0e0b090a080c02ULL}; __m256i shuf5 = {0x171d151413111810ULL, 0x1f1e16191c1b1a12ULL, 0x070d050403010800ULL, 0x0f0e06090c0b0a02ULL}; __m256i shuf6 = {0x1e17161a15141211ULL, 0x1f101d1c1b191318ULL, 0x0e07060a05040201ULL, 0x0f000d0c0b090308ULL}; __m256i shuf7 = {0x171510161b131911ULL, 0x1f1d181e1c141a12ULL, 0x070500060b030901ULL, 0x0f0d080e0c040a02ULL}; __m256i shuf8 = {0x1715141613121110ULL, 0x1f1e1c1b1a19181dULL, 0x0705040603020100ULL, 0x0f0e0c0b0a09080dULL}; __m256i shuf9 = {0x171c1b1a19181615ULL, 0x1f1e14131211101dULL, 0x070c0b0a09080605ULL, 0x0f0e04030201000dULL}; __m256i nibblemask = _mm256_set1_epi8(0x0f); for (uint32_t i = 0; i < (1024 / 4); i += 1) { __m256i r0 = _mm256_loadu_si256(((__m256i *)buf) + i), r1 = r0, r2; r0 &= nibblemask; r1 ^= r0; r1 = _mm256_srli_epi64(r1, 4); #define sort_and_shuffle(n) \ r2 = _mm256_max_epi8(r0, r1); \ r0 = _mm256_min_epi8(r0, r1); \ r1 = (__m256i)_mm256_shuffle_pd((__m256d)r0, (__m256d)r2, 0b0000); \ r2 = (__m256i)_mm256_shuffle_pd((__m256d)r0, (__m256d)r2, 0b1111); \ r1 = _mm256_shuffle_epi8(r1, shuf##n); \ r2 = _mm256_shuffle_epi8(r2, shuf##n); \ r0 = (__m256i)_mm256_shuffle_pd((__m256d)r1, (__m256d)r2, 0b0000); \ r1 = (__m256i)_mm256_shuffle_pd((__m256d)r1, (__m256d)r2, 0b1111) sort_and_shuffle(1); sort_and_shuffle(2); { // sort_and_shuffle(3); r2 = _mm256_max_epi8(r0, r1); r0 = _mm256_min_epi8(r0, r1); r1 = (__m256i)_mm256_unpacklo_ps((__m256)r0, (__m256)r2); r2 = (__m256i)_mm256_unpackhi_ps((__m256)r0, (__m256)r2); r0 = (__m256i)_mm256_shuffle_pd((__m256d)r1, (__m256d)r2, 0b1111); r1 = (__m256i)_mm256_shuffle_pd((__m256d)r1, (__m256d)r2, 0b0000); } sort_and_shuffle(4); sort_and_shuffle(5); sort_and_shuffle(6); sort_and_shuffle(7); sort_and_shuffle(8); sort_and_shuffle(9); r1 = _mm256_slli_epi64(r1, 4); _mm256_storeu_si256(((__m256i *)buf) + i, r1 | r0); } }
/* * Do or undo the 'E8' preprocessing used in LZX. Before compression, the * uncompressed data is preprocessed by changing the targets of x86 CALL * instructions from relative offsets to absolute offsets. After decompression, * the translation is undone by changing the targets of x86 CALL instructions * from absolute offsets to relative offsets. * * Note that despite its intent, E8 preprocessing can be done on any data even * if it is not actually x86 machine code. In fact, E8 preprocessing appears to * always be used in LZX-compressed resources in WIM files; there is no bit to * indicate whether it is used or not, unlike in the LZX compressed format as * used in cabinet files, where a bit is reserved for that purpose. * * E8 preprocessing is disabled in the last 6 bytes of the uncompressed data, * which really means the 5-byte call instruction cannot start in the last 10 * bytes of the uncompressed data. This is one of the errors in the LZX * documentation. * * E8 preprocessing does not appear to be disabled after the 32768th chunk of a * WIM resource, which apparently is another difference from the LZX compression * used in cabinet files. * * E8 processing is supposed to take the file size as a parameter, as it is used * in calculating the translated jump targets. But in WIM files, this file size * is always the same (LZX_WIM_MAGIC_FILESIZE == 12000000). */ static void lzx_e8_filter(u8 *data, u32 size, void (*process_target)(void *, s32)) { #if !defined(__SSE2__) && !defined(__AVX2__) /* * A worthwhile optimization is to push the end-of-buffer check into the * relatively rare E8 case. This is possible if we replace the last six * bytes of data with E8 bytes; then we are guaranteed to hit an E8 byte * before reaching end-of-buffer. In addition, this scheme guarantees * that no translation can begin following an E8 byte in the last 10 * bytes because a 4-byte offset containing E8 as its high byte is a * large negative number that is not valid for translation. That is * exactly what we need. */ u8 *tail; u8 saved_bytes[6]; u8 *p; if (size <= 10) return; tail = &data[size - 6]; memcpy(saved_bytes, tail, 6); memset(tail, 0xE8, 6); p = data; for (;;) { while (*p != 0xE8) p++; if (p >= tail) break; (*process_target)(p + 1, p - data); p += 5; } memcpy(tail, saved_bytes, 6); #else /* SSE2 or AVX-2 optimized version for x86_64 */ u8 *p = data; u64 valid_mask = ~0; if (size <= 10) return; #ifdef __AVX2__ # define ALIGNMENT_REQUIRED 32 #else # define ALIGNMENT_REQUIRED 16 #endif /* Process one byte at a time until the pointer is properly aligned. */ while ((uintptr_t)p % ALIGNMENT_REQUIRED != 0) { if (p >= data + size - 10) return; if (*p == 0xE8 && (valid_mask & 1)) { (*process_target)(p + 1, p - data); valid_mask &= ~0x1F; } p++; valid_mask >>= 1; valid_mask |= (u64)1 << 63; } if (data + size - p >= 64) { /* Vectorized processing */ /* Note: we use a "trap" E8 byte to eliminate the need to check * for end-of-buffer in the inner loop. This byte is carefully * positioned so that it will never be changed by a previous * translation before it is detected. 
*/ u8 *trap = p + ((data + size - p) & ~31) - 32 + 4; u8 saved_byte = *trap; *trap = 0xE8; for (;;) { u32 e8_mask; u8 *orig_p = p; #ifdef __AVX2__ const __m256i e8_bytes = _mm256_set1_epi8(0xE8); for (;;) { __m256i bytes = *(const __m256i *)p; __m256i cmpresult = _mm256_cmpeq_epi8(bytes, e8_bytes); e8_mask = _mm256_movemask_epi8(cmpresult); if (e8_mask) break; p += 32; } #else const __m128i e8_bytes = _mm_set1_epi8(0xE8); for (;;) { /* Read the next 32 bytes of data and test them * for E8 bytes. */ __m128i bytes1 = *(const __m128i *)p; __m128i bytes2 = *(const __m128i *)(p + 16); __m128i cmpresult1 = _mm_cmpeq_epi8(bytes1, e8_bytes); __m128i cmpresult2 = _mm_cmpeq_epi8(bytes2, e8_bytes); u32 mask1 = _mm_movemask_epi8(cmpresult1); u32 mask2 = _mm_movemask_epi8(cmpresult2); /* The masks have a bit set for each E8 byte. * We stay in this fast inner loop as long as * there are no E8 bytes. */ if (mask1 | mask2) { e8_mask = mask1 | (mask2 << 16); break; } p += 32; } #endif /* Did we pass over data with no E8 bytes? */ if (p != orig_p) valid_mask = ~0; /* Are we nearing end-of-buffer? */ if (p == trap - 4) break; /* Process the E8 bytes. However, the AND with * 'valid_mask' ensures we never process an E8 byte that * was itself part of a translation target. */ while ((e8_mask &= valid_mask)) { unsigned bit = bsf32(e8_mask); (*process_target)(p + bit + 1, p + bit - data); valid_mask &= ~((u64)0x1F << bit); } valid_mask >>= 32; valid_mask |= 0xFFFFFFFF00000000; p += 32; } *trap = saved_byte; }
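A sketch of what a decode-side process_target callback might look like, following the description in the comment block above (turning a stored absolute CALL target back into a relative offset). The get_unaligned_le32/put_unaligned_le32 helpers are assumed, and the second LZX case for negative "compensating" targets is omitted:

static void undo_translate_target_sketch(void *target, s32 input_pos)
{
	s32 abs_offset = get_unaligned_le32(target);

	/* Only targets inside [0, LZX_WIM_MAGIC_FILESIZE) were translated. */
	if (abs_offset >= 0 && abs_offset < LZX_WIM_MAGIC_FILESIZE)
		put_unaligned_le32(abs_offset - input_pos, target);
}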
#else parasail_result_t *result = parasail_result_new_stats(); #endif #endif int32_t i = 0; int32_t j = 0; int32_t end_query = 0; int32_t end_ref = 0; const int8_t NEG_LIMIT = (-open < matrix->min ? INT8_MIN + open : INT8_MIN - matrix->min) + 1; const int8_t POS_LIMIT = INT8_MAX - matrix->max - 1; int8_t score = NEG_LIMIT; int8_t matches = NEG_LIMIT; int8_t similar = NEG_LIMIT; int8_t length = NEG_LIMIT; __m256i vNegLimit = _mm256_set1_epi8(NEG_LIMIT); __m256i vPosLimit = _mm256_set1_epi8(POS_LIMIT); __m256i vSaturationCheckMin = vPosLimit; __m256i vSaturationCheckMax = vNegLimit; __m256i vNegInf = _mm256_set1_epi8(NEG_LIMIT); __m256i vNegInf0 = _mm256_srli_si256_rpl(vNegInf, 1); /* shift in a 0 */ __m256i vOpen = _mm256_set1_epi8(open); __m256i vGap = _mm256_set1_epi8(gap); __m256i vZero = _mm256_set1_epi8(0); __m256i vOne = _mm256_set1_epi8(1); __m256i vOne16 = _mm256_set1_epi16(1); __m256i vNegOne16 = _mm256_set1_epi16(-1); __m256i vN16 = _mm256_set1_epi16(N); __m256i vILo16 = _mm256_set_epi16(16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31); __m256i vIHi16 = _mm256_set_epi16(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15); __m256i vJresetLo16 = _mm256_set_epi16(-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31);
_mm256_setr_epi8( 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12)); /* The bits have now been shifted to the right locations; * translate their values 0..63 to the Base64 alphabet. * Because AVX2 can only compare 'greater than', start from end of alphabet: */ /* set 5: 63, "/" */ s5mask = _mm256_cmpeq_epi8(res, _mm256_set1_epi8(63)); blockmask = s5mask; /* set 4: 62, "+" */ s4mask = _mm256_cmpeq_epi8(res, _mm256_set1_epi8(62)); blockmask = _mm256_or_si256(blockmask, s4mask); /* set 3: 52..61, "0123456789" */ s3mask = _mm256_andnot_si256(blockmask, _mm256_cmpgt_epi8(res, _mm256_set1_epi8(51))); blockmask = _mm256_or_si256(blockmask, s3mask); /* set 2: 26..51, "abcdefghijklmnopqrstuvwxyz" */ s2mask = _mm256_andnot_si256(blockmask, _mm256_cmpgt_epi8(res, _mm256_set1_epi8(25))); blockmask = _mm256_or_si256(blockmask, s2mask); /* set 1: 0..25, "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
void vec_i8_count3(const char *p, size_t n, char val1, char val2, char val3, size_t *out_n1, size_t *out_n2, size_t *out_n3) { size_t n1 = 0, n2 = 0, n3 = 0; #ifdef COREARRAY_SIMD_SSE2 // header 1, 16-byte aligned size_t h = (16 - ((size_t)p & 0x0F)) & 0x0F; for (; (n > 0) && (h > 0); n--, h--) { char v = *p++; if (v == val1) n1++; if (v == val2) n2++; if (v == val3) n3++; } # ifdef COREARRAY_SIMD_AVX2 // body, AVX2 const __m128i zeros = _mm_setzero_si128(); const __m256i mask1 = _mm256_set1_epi8(val1); const __m256i mask2 = _mm256_set1_epi8(val2); const __m256i mask3 = _mm256_set1_epi8(val3); __m256i sum1, sum2, sum3; sum1 = sum2 = sum3 = _mm256_setzero_si256(); size_t offset = 0; // header 2, 32-byte aligned if ((n >= 16) && ((size_t)p & 0x10)) { __m128i v = _mm_load_si128((__m128i const*)p); __m128i c1 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask1)); sum1 = MM_SET_M128(_mm_sub_epi8(zeros, c1), zeros); __m128i c2 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask2)); sum2 = MM_SET_M128(_mm_sub_epi8(zeros, c2), zeros); __m128i c3 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask3)); sum3 = MM_SET_M128(_mm_sub_epi8(zeros, c3), zeros); n -= 16; p += 16; } for (; n >= 32; n-=32, p+=32) { __m256i v = _mm256_load_si256((__m256i const*)p); sum1 = _mm256_sub_epi8(sum1, _mm256_cmpeq_epi8(v, mask1)); sum2 = _mm256_sub_epi8(sum2, _mm256_cmpeq_epi8(v, mask2)); sum3 = _mm256_sub_epi8(sum3, _mm256_cmpeq_epi8(v, mask3)); if ((++offset) >= 252) { n1 += vec_avx_sum_u8(sum1); n2 += vec_avx_sum_u8(sum2); n3 += vec_avx_sum_u8(sum3); sum1 = sum2 = sum3 = _mm256_setzero_si256(); offset = 0; } } if (n >= 16) { __m128i v = _mm_load_si128((__m128i const*)p); __m128i c1 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask1)); sum1 = _mm256_sub_epi8(sum1, MM_SET_M128(c1, zeros)); __m128i c2 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask2)); sum2 = _mm256_sub_epi8(sum2, MM_SET_M128(c2, zeros)); __m128i c3 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask3)); sum3 = _mm256_sub_epi8(sum3, MM_SET_M128(c3, zeros)); n -= 16; p += 16; } if (offset > 0) { n1 += vec_avx_sum_u8(sum1); n2 += vec_avx_sum_u8(sum2); n3 += vec_avx_sum_u8(sum3); } # else // body, SSE2 const __m128i mask1 = _mm_set1_epi8(val1); const __m128i mask2 = _mm_set1_epi8(val2); const __m128i mask3 = _mm_set1_epi8(val3); __m128i sum1, sum2, sum3; sum1 = sum2 = sum3 = _mm_setzero_si128(); size_t offset = 0; for (; n >= 16; n-=16, p+=16) { __m128i v = _mm_load_si128((__m128i const*)p); sum1 = _mm_sub_epi8(sum1, _mm_cmpeq_epi8(v, mask1)); sum2 = _mm_sub_epi8(sum2, _mm_cmpeq_epi8(v, mask2)); sum3 = _mm_sub_epi8(sum3, _mm_cmpeq_epi8(v, mask3)); if ((++offset) >= 252) { n1 += vec_sum_u8(sum1); n2 += vec_sum_u8(sum2); n3 += vec_sum_u8(sum3); sum1 = sum2 = sum3 = _mm_setzero_si128(); offset = 0; } } if (offset > 0) { n1 += vec_sum_u8(sum1); n2 += vec_sum_u8(sum2); n3 += vec_sum_u8(sum3); } #endif #endif // tail for (; n > 0; n--) { char v = *p++; if (v == val1) n1++; if (v == val2) n2++; if (v == val3) n3++; } if (out_n1) *out_n1 = n1; if (out_n2) *out_n2 = n2; if (out_n3) *out_n3 = n3; }
void vec_i8_replace(int8_t *p, size_t n, int8_t val, int8_t substitute) { #ifdef COREARRAY_SIMD_SSE2 // header 1, 16-byte aligned size_t h = (16 - ((size_t)p & 0x0F)) & 0x0F; for (; (n > 0) && (h > 0); n--, h--, p++) if (*p == val) *p = substitute; // body, SSE2 const __m128i mask = _mm_set1_epi8(val); const __m128i sub = _mm_set1_epi8(substitute); # ifdef COREARRAY_SIMD_AVX2 // header 2, 32-byte aligned if ((n >= 16) && ((size_t)p & 0x10)) { __m128i v = _mm_load_si128((__m128i const*)p); __m128i c = _mm_cmpeq_epi8(v, mask); if (_mm_movemask_epi8(c)) { _mm_store_si128((__m128i *)p, _mm_or_si128(_mm_and_si128(c, sub), _mm_andnot_si128(c, v))); } n -= 16; p += 16; } const __m256i mask2 = _mm256_set1_epi8(val); const __m256i sub32 = _mm256_set1_epi8(substitute); const __m256i zero = _mm256_setzero_si256(); const __m256i ones = _mm256_cmpeq_epi64(zero, zero); for (; n >= 32; n-=32, p+=32) { __m256i v = _mm256_load_si256((__m256i const*)p); __m256i c = _mm256_cmpeq_epi8(v, mask2); if (_mm256_movemask_epi8(c)) { // TODO _mm256_store_si256((__m256i *)p, _mm256_or_si256(_mm256_and_si256(c, sub32), _mm256_andnot_si256(c, v))); } } # endif for (; n >= 16; n-=16, p+=16) { __m128i v = _mm_load_si128((__m128i const*)p); __m128i c = _mm_cmpeq_epi8(v, mask); if (_mm_movemask_epi8(c)) _mm_maskmoveu_si128(sub, c, (char*)p); } #endif // tail for (; n > 0; n--, p++) if (*p == val) *p = substitute; }
void vec_i8_cnt_dosage2(const int8_t *p, int8_t *out, size_t n, int8_t val, int8_t missing, int8_t missing_substitute) { #ifdef COREARRAY_SIMD_SSE2 // header 1, 16-byte aligned size_t h = (16 - ((size_t)out & 0x0F)) & 0x0F; for (; (n > 0) && (h > 0); n--, h--, p+=2) { *out ++ = ((p[0] == missing) || (p[1] == missing)) ? missing_substitute : (p[0]==val ? 1 : 0) + (p[1]==val ? 1 : 0); } // body, SSE2 const __m128i val16 = _mm_set1_epi8(val); const __m128i miss16 = _mm_set1_epi8(missing); const __m128i sub16 = _mm_set1_epi8(missing_substitute); const __m128i mask = _mm_set1_epi16(0x00FF); # ifdef COREARRAY_SIMD_AVX2 // header 2, 32-byte aligned if ((n >= 16) && ((size_t)out & 0x10)) { __m128i w1 = MM_LOADU_128((__m128i const*)p); p += 16; __m128i w2 = MM_LOADU_128((__m128i const*)p); p += 16; __m128i v1 = _mm_packus_epi16(_mm_and_si128(w1, mask), _mm_and_si128(w2, mask)); __m128i v2 = _mm_packus_epi16(_mm_srli_epi16(w1, 8), _mm_srli_epi16(w2, 8)); __m128i c = _mm_setzero_si128(); c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v1, val16)); c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v2, val16)); w1 = _mm_cmpeq_epi8(v1, miss16); w2 = _mm_cmpeq_epi8(v2, miss16); __m128i w = _mm_or_si128(w1, w2); c = _mm_or_si128(_mm_and_si128(w, sub16), _mm_andnot_si128(w, c)); _mm_store_si128((__m128i *)out, c); n -= 16; out += 16; } const __m256i val32 = _mm256_set1_epi8(val); const __m256i miss32 = _mm256_set1_epi8(missing); const __m256i sub32 = _mm256_set1_epi8(missing_substitute); const __m256i mask2 = _mm256_set1_epi16(0x00FF); for (; n >= 32; n-=32) { __m256i w1 = MM_LOADU_256((__m256i const*)p); p += 32; __m256i w2 = MM_LOADU_256((__m256i const*)p); p += 32; __m256i v1 = _mm256_packus_epi16(_mm256_and_si256(w1, mask2), _mm256_and_si256(w2, mask2)); __m256i v2 = _mm256_packus_epi16(_mm256_srli_epi16(w1, 8), _mm256_srli_epi16(w2, 8)); __m256i c = _mm256_setzero_si256(); c = _mm256_sub_epi8(c, _mm256_cmpeq_epi8(v1, val32)); c = _mm256_sub_epi8(c, _mm256_cmpeq_epi8(v2, val32)); w1 = _mm256_cmpeq_epi8(v1, miss32); w2 = _mm256_cmpeq_epi8(v2, miss32); __m256i w = _mm256_or_si256(w1, w2); c = _mm256_or_si256(_mm256_and_si256(w, sub32), _mm256_andnot_si256(w, c)); c = _mm256_permute4x64_epi64(c, 0xD8); _mm256_store_si256((__m256i *)out, c); out += 32; } # endif // SSE2 only for (; n >= 16; n-=16) { __m128i w1 = MM_LOADU_128((__m128i const*)p); p += 16; __m128i w2 = MM_LOADU_128((__m128i const*)p); p += 16; __m128i v1 = _mm_packus_epi16(_mm_and_si128(w1, mask), _mm_and_si128(w2, mask)); __m128i v2 = _mm_packus_epi16(_mm_srli_epi16(w1, 8), _mm_srli_epi16(w2, 8)); __m128i c = _mm_setzero_si128(); c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v1, val16)); c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v2, val16)); w1 = _mm_cmpeq_epi8(v1, miss16); w2 = _mm_cmpeq_epi8(v2, miss16); __m128i w = _mm_or_si128(w1, w2); c = _mm_or_si128(_mm_and_si128(w, sub16), _mm_andnot_si128(w, c)); _mm_store_si128((__m128i *)out, c); out += 16; } #endif // tail for (; n > 0; n--, p+=2) { *out ++ = ((p[0] == missing) || (p[1] == missing)) ? missing_substitute : (p[0]==val ? 1 : 0) + (p[1]==val ? 1 : 0); } }
int8_t * const restrict del_pr = _del_pr+PAD; #ifdef PARASAIL_TABLE parasail_result_t *result = parasail_result_new_table1(s1Len, s2Len); #else #ifdef PARASAIL_ROWCOL parasail_result_t *result = parasail_result_new_rowcol1(s1Len, s2Len); #else parasail_result_t *result = parasail_result_new(); #endif #endif int32_t i = 0; int32_t j = 0; int32_t end_query = 0; int32_t end_ref = 0; int8_t score = NEG_INF; __m256i vNegInf = _mm256_set1_epi8(NEG_INF); __m256i vNegInf0 = _mm256_srli_si256_rpl(vNegInf, 1); /* shift in a 0 */ __m256i vOpen = _mm256_set1_epi8(open); __m256i vGap = _mm256_set1_epi8(gap); __m256i vZero = _mm256_set1_epi8(0); __m256i vOne16 = _mm256_set1_epi16(1); __m256i vNegOne16 = _mm256_set1_epi16(-1); __m256i vN16 = _mm256_set1_epi16(N); __m256i vILo16 = _mm256_set_epi16(16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31); __m256i vIHi16 = _mm256_set_epi16(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15); __m256i vJresetLo16 = _mm256_set_epi16(-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31); __m256i vJresetHi16 = _mm256_set_epi16(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15); __m256i vMax = vNegInf; __m256i vEndILo = vNegInf; __m256i vEndIHi = vNegInf; __m256i vEndJLo = vNegInf;
void* xmemset(void* dest, int c, size_t n) { void* ret = dest; if (n < 16) { xmemset_lt16(dest, c, n); return ret; } __m256i mm = _mm256_set1_epi8((char)c); if (((unsigned long)dest & 31) == 0) { for ( ; n >= 256; n -= 256) { _mm256_store_si256((__m256i*)dest, mm); _mm256_store_si256((__m256i*)dest + 1, mm); _mm256_store_si256((__m256i*)dest + 2, mm); _mm256_store_si256((__m256i*)dest + 3, mm); _mm256_store_si256((__m256i*)dest + 4, mm); _mm256_store_si256((__m256i*)dest + 5, mm); _mm256_store_si256((__m256i*)dest + 6, mm); _mm256_store_si256((__m256i*)dest + 7, mm); // 8 dest = (void*)((__m256i*)dest + 8); } if (n >= 128) { _mm256_store_si256((__m256i*)dest, mm); _mm256_store_si256((__m256i*)dest + 1, mm); _mm256_store_si256((__m256i*)dest + 2, mm); _mm256_store_si256((__m256i*)dest + 3, mm); dest = (void*)((__m256i*)dest + 4); n -= 128; } if (n >= 64) { _mm256_store_si256((__m256i*)dest, mm); _mm256_store_si256((__m256i*)dest + 1, mm); dest = (void*)((__m256i*)dest + 2); n -= 64; } if (n >= 32) { _mm256_store_si256((__m256i*)dest, mm); dest = (void*)((__m256i*)dest + 1); n -= 32; } if (n >= 16) { _mm_store_si128((__m128i*)dest, _mm_set1_epi8((char)c)); dest = (void*)((__m128i*)dest + 1); n -= 16; } } else { for ( ; n >= 256; n -= 256) { _mm256_storeu_si256((__m256i*)dest, mm); _mm256_storeu_si256((__m256i*)dest + 1, mm); _mm256_storeu_si256((__m256i*)dest + 2, mm); _mm256_storeu_si256((__m256i*)dest + 3, mm); _mm256_storeu_si256((__m256i*)dest + 4, mm); _mm256_storeu_si256((__m256i*)dest + 5, mm); _mm256_storeu_si256((__m256i*)dest + 6, mm); _mm256_storeu_si256((__m256i*)dest + 7, mm); // 8 dest = (void*)((__m256i*)dest + 8); } if (n >= 128) { _mm256_storeu_si256((__m256i*)dest, mm); _mm256_storeu_si256((__m256i*)dest + 1, mm); _mm256_storeu_si256((__m256i*)dest + 2, mm); _mm256_storeu_si256((__m256i*)dest + 3, mm); dest = (void*)((__m256i*)dest + 4); n -= 128; } if (n >= 64) { _mm256_storeu_si256((__m256i*)dest, mm); _mm256_storeu_si256((__m256i*)dest + 1, mm); dest = (void*)((__m256i*)dest + 2); n -= 64; } if (n >= 32) { _mm256_storeu_si256((__m256i*)dest, mm); dest = (void*)((__m256i*)dest + 1); n -= 32; } if (n >= 16) { _mm_storeu_si128((__m128i*)dest, _mm_set1_epi8((char)c)); dest = (void*)((__m128i*)dest + 1); n -= 16; } } xmemset_lt16(dest, c, n); return ret; }
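xmemset hands buffers shorter than 16 bytes, and the final remainder, to xmemset_lt16, which is not shown. A minimal sketch, assuming it is a plain byte loop:

static void xmemset_lt16(void* dest, int c, size_t n)
{
    unsigned char* p = (unsigned char*)dest;
    while (n--)
        *p++ = (unsigned char)c;
}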
// count genotype sum and number of calls, not requiring 16-aligned p COREARRAY_DLL_DEFAULT C_UInt8* vec_u8_geno_count(C_UInt8 *p, size_t n, C_Int32 &out_sum, C_Int32 &out_num) { C_Int32 sum=0, num=0; #if defined(COREARRAY_SIMD_AVX2) const __m256i three = _mm256_set1_epi8(3); const __m256i zero = _mm256_setzero_si256(); __m256i sum32 = zero, num32 = zero; size_t limit_by_U8 = 0; for (; n >= 32; ) { __m256i v = _mm256_loadu_si256((__m256i const*)p); p += 32; __m256i m = _mm256_cmpgt_epi8(three, _mm256_min_epu8(v, three)); sum32 = _mm256_add_epi8(sum32, _mm256_and_si256(v, m)); num32 = _mm256_sub_epi8(num32, m); n -= 32; limit_by_U8 ++; if ((limit_by_U8 >= 127) || (n < 32)) { // add to sum sum32 = _mm256_sad_epu8(sum32, zero); sum32 = _mm256_add_epi32(sum32, _mm256_permute4x64_epi64(sum32, _MM_SHUFFLE(1,0,3,2))); sum32 = _mm256_add_epi32(sum32, _mm256_permute4x64_epi64(sum32, _MM_SHUFFLE(0,0,0,1))); sum += _mm_cvtsi128_si32(_mm256_castsi256_si128(sum32)); // add to num num32 = _mm256_sad_epu8(num32, zero); num32 = _mm256_add_epi32(num32, _mm256_permute4x64_epi64(num32, _MM_SHUFFLE(1,0,3,2))); num32 = _mm256_add_epi32(num32, _mm256_permute4x64_epi64(num32, _MM_SHUFFLE(0,0,0,1))); num += _mm_cvtsi128_si32(_mm256_castsi256_si128(num32)); // reset sum32 = num32 = zero; limit_by_U8 = 0; } } #elif defined(COREARRAY_SIMD_SSE2) // header, 16-byte aligned size_t h = (16 - ((size_t)p & 0x0F)) & 0x0F; for (; (n > 0) && (h > 0); n--, h--, p++) if (*p <= 2) { sum += *p; num++; } const __m128i three = _mm_set1_epi8(3); const __m128i zero = _mm_setzero_si128(); __m128i sum16=zero, num16=zero; size_t limit_by_U8 = 0; for (; n >= 16; ) { __m128i v = _mm_load_si128((__m128i const*)p); p += 16; __m128i m = _mm_cmpgt_epi8(three, _mm_min_epu8(v, three)); sum16 = _mm_add_epi8(sum16, v & m); num16 = _mm_sub_epi8(num16, m); n -= 16; limit_by_U8 ++; if ((limit_by_U8 >= 127) || (n < 16)) { // add to sum sum16 = _mm_sad_epu8(sum16, zero); sum += _mm_cvtsi128_si32(sum16); sum += _mm_cvtsi128_si32(_mm_shuffle_epi32(sum16, 2)); // add to num num16 = _mm_sad_epu8(num16, zero); num += _mm_cvtsi128_si32(num16); num += _mm_cvtsi128_si32(_mm_shuffle_epi32(num16, 2)); // reset sum16 = num16 = zero; limit_by_U8 = 0; } } #endif for (; n > 0; n--, p++) if (*p <= 2) { sum += *p; num++; } out_sum = sum; out_num = num; return p; }