uint64_t avx2_count_byte(const uint8_t* data, size_t size, uint8_t byte) {
    const __m256i v = _mm256_set1_epi8(byte);

    const uint8_t* end = data + size;
    const uint8_t* ptr = data;
    __m256i global_sum = _mm256_setzero_si256();
    __m256i local_sum;

    // 1. blocks of 255 registers (the 8-bit counters can hold at most 255 hits each)
    while (ptr + 255*32 < end) {
        local_sum = _mm256_setzero_si256();

        // update 32 x 8-bit counters
        for (int i = 0; i < 255; i++, ptr += 32) {
            const __m256i in = _mm256_loadu_si256((const __m256i*)ptr);
            const __m256i eq = _mm256_cmpeq_epi8(in, v); // 0 or -1
            local_sum = _mm256_sub_epi8(local_sum, eq);
        }

        // update the global accumulator (4 x 64-bit)
        const __m256i tmp = _mm256_sad_epu8(local_sum, _mm256_setzero_si256());
        global_sum = _mm256_add_epi64(global_sum, tmp);
    }

    // 2. tail of < 255 registers
    local_sum = _mm256_setzero_si256();
    while (ptr + 32 < end) {
        const __m256i in = _mm256_loadu_si256((const __m256i*)ptr);
        const __m256i eq = _mm256_cmpeq_epi8(in, v);
        local_sum = _mm256_sub_epi8(local_sum, eq);
        ptr += 32;
    }

    const __m256i tmp = _mm256_sad_epu8(local_sum, _mm256_setzero_si256());
    global_sum = _mm256_add_epi64(global_sum, tmp);

    // 3. process tail < 32 bytes
    uint64_t result = _mm256_extract_epi64(global_sum, 0)
                    + _mm256_extract_epi64(global_sum, 1)
                    + _mm256_extract_epi64(global_sum, 2)
                    + _mm256_extract_epi64(global_sum, 3);

    return result + scalar_count_bytes(ptr, end - ptr, byte);
}
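avx2_count_byte defers the final sub-32-byte chunk to a scalar_count_bytes helper whose definition is not shown in this listing. A minimal sketch of such a helper, with the signature taken from the call site and a plain reference loop as the body (an assumption, not the original code):

#include <stddef.h>
#include <stdint.h>

// Hypothetical scalar fallback assumed by avx2_count_byte(): counts how many
// bytes in data[0..size) equal 'byte'.
static uint64_t scalar_count_bytes(const uint8_t* data, size_t size, uint8_t byte) {
    uint64_t count = 0;
    for (size_t i = 0; i < size; i++) {
        if (data[i] == byte)
            count++;
    }
    return count;
}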
static inline __m256i enc_translate(const __m256i in) {
    // The LUT contains the absolute offset for each range:
    const __m256i lut = _mm256_setr_epi8(
        65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0,
        65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);

    // Translate values 0..63 to the Base64 alphabet. There are five sets:
    // #  From      To         Abs    Index    Characters
    // 0  [0..25]   [65..90]   +65    0        ABCDEFGHIJKLMNOPQRSTUVWXYZ
    // 1  [26..51]  [97..122]  +71    1        abcdefghijklmnopqrstuvwxyz
    // 2  [52..61]  [48..57]   -4     [2..11]  0123456789
    // 3  [62]      [43]       -19    12       +
    // 4  [63]      [47]       -16    13       /

    // Create LUT indices from the input:
    // the index for range #0 is right, the others are 1 less than expected:
    __m256i indices = _mm256_subs_epu8(in, _mm256_set1_epi8(51));

    // mask is 0xFF (-1) for ranges #[1..4] and 0x00 for range #0:
    __m256i mask = CMPGT(in, 25);

    // subtract -1, i.e. add 1 to the indices for ranges #[1..4]; all indices are now correct:
    indices = _mm256_sub_epi8(indices, mask);

    // Add the offsets to the input values:
    __m256i out = _mm256_add_epi8(in, _mm256_shuffle_epi8(lut, indices));

    return out;
}
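enc_translate uses a CMPGT helper that is not defined in this excerpt. From its use (a per-byte greater-than against the constant 25 that yields an all-ones mask), it is presumably a thin wrapper around _mm256_cmpgt_epi8; the definition below is an assumption, not the original macro:

#include <immintrin.h>

// Assumed definition: signed byte-wise compare of each lane of x against the
// constant n, producing 0xFF where x > n and 0x00 otherwise; CMPGT(in, 25)
// then marks every input value belonging to ranges #1..#4.
#define CMPGT(x, n)  _mm256_cmpgt_epi8((x), _mm256_set1_epi8(n))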
static INLINE unsigned int masked_sad32xh_avx2(
    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
    int width, int height) {
  int x, y;
  __m256i res = _mm256_setzero_si256();
  const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m256i round_scale =
      _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x += 32) {
      const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]);
      const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]);
      const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]);
      const __m256i m = _mm256_lddqu_si256((const __m256i *)&m_ptr[x]);
      const __m256i m_inv = _mm256_sub_epi8(mask_max, m);

      // Calculate 16 predicted pixels in each of 'pred_l' and 'pred_r'.
      // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
      // is 64 * 255, so we have plenty of space to add rounding constants.
      const __m256i data_l = _mm256_unpacklo_epi8(a, b);
      const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv);
      __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l);
      pred_l = _mm256_mulhrs_epi16(pred_l, round_scale);

      const __m256i data_r = _mm256_unpackhi_epi8(a, b);
      const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv);
      __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r);
      pred_r = _mm256_mulhrs_epi16(pred_r, round_scale);

      const __m256i pred = _mm256_packus_epi16(pred_l, pred_r);
      res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src));
    }
    src_ptr += src_stride;
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  // At this point, 'res' holds four 32-bit partial SADs, one in the low dword
  // of each 64-bit group (32-bit lanes 0, 2, 4 and 6).
  res = _mm256_shuffle_epi32(res, 0xd8);
  res = _mm256_permute4x64_epi64(res, 0xd8);
  res = _mm256_hadd_epi32(res, res);
  res = _mm256_hadd_epi32(res, res);
  int32_t sad = _mm256_extract_epi32(res, 0);
  return (sad + 31) >> 6;
}
static INLINE void comp_mask_pred_line_avx2(const __m256i s0, const __m256i s1,
                                            const __m256i a,
                                            uint8_t *comp_pred) {
  const __m256i alpha_max = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const int16_t round_bits = 15 - AOM_BLEND_A64_ROUND_BITS;
  const __m256i round_offset = _mm256_set1_epi16(1 << (round_bits));

  const __m256i ma = _mm256_sub_epi8(alpha_max, a);

  const __m256i ssAL = _mm256_unpacklo_epi8(s0, s1);
  const __m256i aaAL = _mm256_unpacklo_epi8(a, ma);
  const __m256i ssAH = _mm256_unpackhi_epi8(s0, s1);
  const __m256i aaAH = _mm256_unpackhi_epi8(a, ma);

  const __m256i blendAL = _mm256_maddubs_epi16(ssAL, aaAL);
  const __m256i blendAH = _mm256_maddubs_epi16(ssAH, aaAH);

  const __m256i roundAL = _mm256_mulhrs_epi16(blendAL, round_offset);
  const __m256i roundAH = _mm256_mulhrs_epi16(blendAH, round_offset);

  const __m256i roundA = _mm256_packus_epi16(roundAL, roundAH);
  _mm256_storeu_si256((__m256i *)(comp_pred), roundA);
}
__m256i test_mm256_sub_epi8(__m256i a, __m256i b) {
  // CHECK: sub <32 x i8>
  return _mm256_sub_epi8(a, b);
}
/*!
 * \brief Subtract the two given values and return the result.
 */
ETL_STATIC_INLINE(avx_simd_byte) sub(avx_simd_byte lhs, avx_simd_byte rhs) {
    return _mm256_sub_epi8(lhs.value, rhs.value);
}
void extern
avx2_test (void)
{
  x = _mm256_sub_epi8 (x, x);
}
void vec_i8_cnt_dosage2(const int8_t *p, int8_t *out, size_t n, int8_t val,
    int8_t missing, int8_t missing_substitute)
{
#ifdef COREARRAY_SIMD_SSE2

    // header 1, 16-byte aligned
    size_t h = (16 - ((size_t)out & 0x0F)) & 0x0F;
    for (; (n > 0) && (h > 0); n--, h--, p+=2)
    {
        *out ++ = ((p[0] == missing) || (p[1] == missing)) ?
            missing_substitute :
            (p[0]==val ? 1 : 0) + (p[1]==val ? 1 : 0);
    }

    // body, SSE2
    const __m128i val16  = _mm_set1_epi8(val);
    const __m128i miss16 = _mm_set1_epi8(missing);
    const __m128i sub16  = _mm_set1_epi8(missing_substitute);
    const __m128i mask   = _mm_set1_epi16(0x00FF);

#   ifdef COREARRAY_SIMD_AVX2

    // header 2, 32-byte aligned
    if ((n >= 16) && ((size_t)out & 0x10))
    {
        __m128i w1 = MM_LOADU_128((__m128i const*)p); p += 16;
        __m128i w2 = MM_LOADU_128((__m128i const*)p); p += 16;

        __m128i v1 = _mm_packus_epi16(_mm_and_si128(w1, mask), _mm_and_si128(w2, mask));
        __m128i v2 = _mm_packus_epi16(_mm_srli_epi16(w1, 8), _mm_srli_epi16(w2, 8));

        __m128i c = _mm_setzero_si128();
        c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v1, val16));
        c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v2, val16));

        w1 = _mm_cmpeq_epi8(v1, miss16);
        w2 = _mm_cmpeq_epi8(v2, miss16);
        __m128i w = _mm_or_si128(w1, w2);
        c = _mm_or_si128(_mm_and_si128(w, sub16), _mm_andnot_si128(w, c));

        _mm_store_si128((__m128i *)out, c);
        n -= 16; out += 16;
    }

    const __m256i val32  = _mm256_set1_epi8(val);
    const __m256i miss32 = _mm256_set1_epi8(missing);
    const __m256i sub32  = _mm256_set1_epi8(missing_substitute);
    const __m256i mask2  = _mm256_set1_epi16(0x00FF);

    for (; n >= 32; n-=32)
    {
        __m256i w1 = MM_LOADU_256((__m256i const*)p); p += 32;
        __m256i w2 = MM_LOADU_256((__m256i const*)p); p += 32;

        __m256i v1 = _mm256_packus_epi16(_mm256_and_si256(w1, mask2), _mm256_and_si256(w2, mask2));
        __m256i v2 = _mm256_packus_epi16(_mm256_srli_epi16(w1, 8), _mm256_srli_epi16(w2, 8));

        __m256i c = _mm256_setzero_si256();
        c = _mm256_sub_epi8(c, _mm256_cmpeq_epi8(v1, val32));
        c = _mm256_sub_epi8(c, _mm256_cmpeq_epi8(v2, val32));

        w1 = _mm256_cmpeq_epi8(v1, miss32);
        w2 = _mm256_cmpeq_epi8(v2, miss32);
        __m256i w = _mm256_or_si256(w1, w2);
        c = _mm256_or_si256(_mm256_and_si256(w, sub32), _mm256_andnot_si256(w, c));

        c = _mm256_permute4x64_epi64(c, 0xD8);
        _mm256_store_si256((__m256i *)out, c);
        out += 32;
    }

#   endif

    // SSE2 only
    for (; n >= 16; n-=16)
    {
        __m128i w1 = MM_LOADU_128((__m128i const*)p); p += 16;
        __m128i w2 = MM_LOADU_128((__m128i const*)p); p += 16;

        __m128i v1 = _mm_packus_epi16(_mm_and_si128(w1, mask), _mm_and_si128(w2, mask));
        __m128i v2 = _mm_packus_epi16(_mm_srli_epi16(w1, 8), _mm_srli_epi16(w2, 8));

        __m128i c = _mm_setzero_si128();
        c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v1, val16));
        c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v2, val16));

        w1 = _mm_cmpeq_epi8(v1, miss16);
        w2 = _mm_cmpeq_epi8(v2, miss16);
        __m128i w = _mm_or_si128(w1, w2);
        c = _mm_or_si128(_mm_and_si128(w, sub16), _mm_andnot_si128(w, c));

        _mm_store_si128((__m128i *)out, c);
        out += 16;
    }

#endif

    // tail
    for (; n > 0; n--, p+=2)
    {
        *out ++ = ((p[0] == missing) || (p[1] == missing)) ?
            missing_substitute :
            (p[0]==val ? 1 : 0) + (p[1]==val ? 1 : 0);
    }
}
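vec_i8_cnt_dosage2 reads its input through MM_LOADU_128 / MM_LOADU_256 macros that are not part of this excerpt. Their names, and the fact that p carries no alignment guarantee at those call sites, suggest they are plain wrappers over the unaligned load intrinsics; a minimal assumed definition:

#include <immintrin.h>

// Assumed definitions: thin wrappers over the unaligned load intrinsics,
// matching how the macros are used on pointers with no alignment guarantee.
#define MM_LOADU_128(ptr)  _mm_loadu_si128(ptr)
#define MM_LOADU_256(ptr)  _mm256_loadu_si256(ptr)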
void vec_i8_count3(const char *p, size_t n, char val1, char val2, char val3,
    size_t *out_n1, size_t *out_n2, size_t *out_n3)
{
    size_t n1 = 0, n2 = 0, n3 = 0;

#ifdef COREARRAY_SIMD_SSE2

    // header 1, 16-byte aligned
    size_t h = (16 - ((size_t)p & 0x0F)) & 0x0F;
    for (; (n > 0) && (h > 0); n--, h--)
    {
        char v = *p++;
        if (v == val1) n1++;
        if (v == val2) n2++;
        if (v == val3) n3++;
    }

#   ifdef COREARRAY_SIMD_AVX2

    // body, AVX2
    const __m128i zeros = _mm_setzero_si128();
    const __m256i mask1 = _mm256_set1_epi8(val1);
    const __m256i mask2 = _mm256_set1_epi8(val2);
    const __m256i mask3 = _mm256_set1_epi8(val3);
    __m256i sum1, sum2, sum3;
    sum1 = sum2 = sum3 = _mm256_setzero_si256();
    size_t offset = 0;

    // header 2, 32-byte aligned
    if ((n >= 16) && ((size_t)p & 0x10))
    {
        __m128i v = _mm_load_si128((__m128i const*)p);
        __m128i c1 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask1));
        sum1 = MM_SET_M128(_mm_sub_epi8(zeros, c1), zeros);
        __m128i c2 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask2));
        sum2 = MM_SET_M128(_mm_sub_epi8(zeros, c2), zeros);
        __m128i c3 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask3));
        sum3 = MM_SET_M128(_mm_sub_epi8(zeros, c3), zeros);
        n -= 16; p += 16;
    }

    for (; n >= 32; n-=32, p+=32)
    {
        __m256i v = _mm256_load_si256((__m256i const*)p);
        sum1 = _mm256_sub_epi8(sum1, _mm256_cmpeq_epi8(v, mask1));
        sum2 = _mm256_sub_epi8(sum2, _mm256_cmpeq_epi8(v, mask2));
        sum3 = _mm256_sub_epi8(sum3, _mm256_cmpeq_epi8(v, mask3));
        if ((++offset) >= 252)
        {
            n1 += vec_avx_sum_u8(sum1);
            n2 += vec_avx_sum_u8(sum2);
            n3 += vec_avx_sum_u8(sum3);
            sum1 = sum2 = sum3 = _mm256_setzero_si256();
            offset = 0;
        }
    }

    if (n >= 16)
    {
        __m128i v = _mm_load_si128((__m128i const*)p);
        __m128i c1 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask1));
        sum1 = _mm256_sub_epi8(sum1, MM_SET_M128(c1, zeros));
        __m128i c2 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask2));
        sum2 = _mm256_sub_epi8(sum2, MM_SET_M128(c2, zeros));
        __m128i c3 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask3));
        sum3 = _mm256_sub_epi8(sum3, MM_SET_M128(c3, zeros));
        n -= 16; p += 16;
    }

    // flush the byte counters unconditionally: they can be non-zero even when
    // offset == 0, e.g. when only the aligned 16-byte header or tail was processed
    n1 += vec_avx_sum_u8(sum1);
    n2 += vec_avx_sum_u8(sum2);
    n3 += vec_avx_sum_u8(sum3);

#   else

    // body, SSE2
    const __m128i mask1 = _mm_set1_epi8(val1);
    const __m128i mask2 = _mm_set1_epi8(val2);
    const __m128i mask3 = _mm_set1_epi8(val3);
    __m128i sum1, sum2, sum3;
    sum1 = sum2 = sum3 = _mm_setzero_si128();
    size_t offset = 0;

    for (; n >= 16; n-=16, p+=16)
    {
        __m128i v = _mm_load_si128((__m128i const*)p);
        sum1 = _mm_sub_epi8(sum1, _mm_cmpeq_epi8(v, mask1));
        sum2 = _mm_sub_epi8(sum2, _mm_cmpeq_epi8(v, mask2));
        sum3 = _mm_sub_epi8(sum3, _mm_cmpeq_epi8(v, mask3));
        if ((++offset) >= 252)
        {
            n1 += vec_sum_u8(sum1);
            n2 += vec_sum_u8(sum2);
            n3 += vec_sum_u8(sum3);
            sum1 = sum2 = sum3 = _mm_setzero_si128();
            offset = 0;
        }
    }

    if (offset > 0)
    {
        n1 += vec_sum_u8(sum1);
        n2 += vec_sum_u8(sum2);
        n3 += vec_sum_u8(sum3);
    }

#   endif
#endif

    // tail
    for (; n > 0; n--)
    {
        char v = *p++;
        if (v == val1) n1++;
        if (v == val2) n2++;
        if (v == val3) n3++;
    }

    if (out_n1) *out_n1 = n1;
    if (out_n2) *out_n2 = n2;
    if (out_n3) *out_n3 = n3;
}
size_t vec_i8_count(const char *p, size_t n, char val)
{
    size_t num = 0;

#ifdef COREARRAY_SIMD_SSE2

    // header 1, 16-byte aligned
    size_t h = (16 - ((size_t)p & 0x0F)) & 0x0F;
    for (; (n > 0) && (h > 0); n--, h--)
        if (*p++ == val) num++;

#   ifdef COREARRAY_SIMD_AVX2

    // body, AVX2
    const __m128i zeros = _mm_setzero_si128();
    const __m256i mask = _mm256_set1_epi8(val);
    __m256i sum = _mm256_setzero_si256();
    size_t offset = 0;

    // header 2, 32-byte aligned
    if ((n >= 16) && ((size_t)p & 0x10))
    {
        __m128i v = _mm_load_si128((__m128i const*)p);
        __m128i c1 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask));
        sum = MM_SET_M128(_mm_sub_epi8(zeros, c1), zeros);
        n -= 16; p += 16;
    }

    for (; n >= 128; n-=128)
    {
        __m256i v = _mm256_load_si256((__m256i const*)p); p += 32;
        sum = _mm256_sub_epi8(sum, _mm256_cmpeq_epi8(v, mask));
        v = _mm256_load_si256((__m256i const*)p); p += 32;
        sum = _mm256_sub_epi8(sum, _mm256_cmpeq_epi8(v, mask));
        v = _mm256_load_si256((__m256i const*)p); p += 32;
        sum = _mm256_sub_epi8(sum, _mm256_cmpeq_epi8(v, mask));
        v = _mm256_load_si256((__m256i const*)p); p += 32;
        sum = _mm256_sub_epi8(sum, _mm256_cmpeq_epi8(v, mask));
        offset += 4;
        if (offset >= 252)
        {
            num += vec_avx_sum_u8(sum);
            sum = _mm256_setzero_si256();
            offset = 0;
        }
    }

    for (; n >= 32; n-=32)
    {
        __m256i v = _mm256_load_si256((__m256i const*)p); p += 32;
        sum = _mm256_sub_epi8(sum, _mm256_cmpeq_epi8(v, mask));
        if ((++offset) >= 252)
        {
            num += vec_avx_sum_u8(sum);
            sum = _mm256_setzero_si256();
            offset = 0;
        }
    }

    if (n >= 16)
    {
        __m128i v = _mm_load_si128((__m128i const*)p);
        __m128i c1 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask));
        sum = _mm256_sub_epi8(sum, MM_SET_M128(zeros, c1));
        n -= 16; p += 16;
    }

    // flush the byte counters unconditionally: they can be non-zero even when
    // offset == 0, e.g. when only the aligned 16-byte header or tail was processed
    num += vec_avx_sum_u8(sum);

#   else

    // body, SSE2
    const __m128i mask = _mm_set1_epi8(val);
    __m128i sum = _mm_setzero_si128();
    size_t offset = 0;

    for (; n >= 64; n-=64)
    {
        __m128i v = _mm_load_si128((__m128i const*)p); p += 16;
        sum = _mm_sub_epi8(sum, _mm_cmpeq_epi8(v, mask));
        v = _mm_load_si128((__m128i const*)p); p += 16;
        sum = _mm_sub_epi8(sum, _mm_cmpeq_epi8(v, mask));
        v = _mm_load_si128((__m128i const*)p); p += 16;
        sum = _mm_sub_epi8(sum, _mm_cmpeq_epi8(v, mask));
        v = _mm_load_si128((__m128i const*)p); p += 16;
        sum = _mm_sub_epi8(sum, _mm_cmpeq_epi8(v, mask));
        offset += 4;
        if (offset >= 252)
        {
            num += vec_sum_u8(sum);
            sum = _mm_setzero_si128();
            offset = 0;
        }
    }

    for (; n >= 16; n-=16, p+=16)
    {
        __m128i v = _mm_load_si128((__m128i const*)p);
        sum = _mm_sub_epi8(sum, _mm_cmpeq_epi8(v, mask));
        if ((++offset) >= 252)
        {
            num += vec_sum_u8(sum);
            sum = _mm_setzero_si128();
            offset = 0;
        }
    }

    if (offset > 0) num += vec_sum_u8(sum);

#   endif
#endif

    // tail
    for (; n > 0; n--)
        if (*p++ == val) num++;

    return num;
}
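The two counting routines above rely on helpers that are not part of this excerpt: MM_SET_M128 (build a 256-bit vector from two 128-bit halves) and vec_sum_u8 / vec_avx_sum_u8 (horizontal sums of the unsigned byte counters). The definitions below are sketches inferred from the call sites rather than the library's own code; in particular the argument order of MM_SET_M128 is an assumption, although it does not affect the byte sums computed here.

#include <immintrin.h>
#include <stddef.h>

// Assumed helper: combine two 128-bit halves into one 256-bit vector.
#define MM_SET_M128(hi, lo) \
    _mm256_insertf128_si256(_mm256_castsi128_si256((lo)), (hi), 1)

// Assumed helper: horizontal sum of the 16 unsigned bytes in an SSE register.
static inline size_t vec_sum_u8(__m128i s)
{
    // _mm_sad_epu8 against zero yields two 64-bit partial sums of 8 bytes each.
    __m128i t = _mm_sad_epu8(s, _mm_setzero_si128());
    return (size_t)_mm_cvtsi128_si32(t) +
           (size_t)_mm_cvtsi128_si32(_mm_shuffle_epi32(t, 2));
}

// Assumed helper: horizontal sum of the 32 unsigned bytes in an AVX2 register.
static inline size_t vec_avx_sum_u8(__m256i s)
{
    __m256i t = _mm256_sad_epu8(s, _mm256_setzero_si256());  // 4 x 64-bit sums
    __m128i r = _mm_add_epi64(_mm256_castsi256_si128(t),
                              _mm256_extracti128_si256(t, 1));
    return (size_t)_mm_cvtsi128_si64(r) +
           (size_t)_mm_cvtsi128_si64(_mm_unpackhi_epi64(r, r));
}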
// count genotype sum and number of calls, not requiring 16-aligned p
COREARRAY_DLL_DEFAULT C_UInt8* vec_u8_geno_count(C_UInt8 *p,
    size_t n, C_Int32 &out_sum, C_Int32 &out_num)
{
    C_Int32 sum=0, num=0;

#if defined(COREARRAY_SIMD_AVX2)

    const __m256i three = _mm256_set1_epi8(3);
    const __m256i zero = _mm256_setzero_si256();
    __m256i sum32 = zero, num32 = zero;
    size_t limit_by_U8 = 0;

    for (; n >= 32; )
    {
        __m256i v = _mm256_loadu_si256((__m256i const*)p); p += 32;
        __m256i m = _mm256_cmpgt_epi8(three, _mm256_min_epu8(v, three));
        sum32 = _mm256_add_epi8(sum32, _mm256_and_si256(v, m));
        num32 = _mm256_sub_epi8(num32, m);
        n -= 32;
        limit_by_U8 ++;
        if ((limit_by_U8 >= 127) || (n < 32))
        {
            // add to sum
            sum32 = _mm256_sad_epu8(sum32, zero);
            sum32 = _mm256_add_epi32(sum32,
                _mm256_permute4x64_epi64(sum32, _MM_SHUFFLE(1,0,3,2)));
            sum32 = _mm256_add_epi32(sum32,
                _mm256_permute4x64_epi64(sum32, _MM_SHUFFLE(0,0,0,1)));
            sum += _mm_cvtsi128_si32(_mm256_castsi256_si128(sum32));
            // add to num
            num32 = _mm256_sad_epu8(num32, zero);
            num32 = _mm256_add_epi32(num32,
                _mm256_permute4x64_epi64(num32, _MM_SHUFFLE(1,0,3,2)));
            num32 = _mm256_add_epi32(num32,
                _mm256_permute4x64_epi64(num32, _MM_SHUFFLE(0,0,0,1)));
            num += _mm_cvtsi128_si32(_mm256_castsi256_si128(num32));
            // reset
            sum32 = num32 = zero;
            limit_by_U8 = 0;
        }
    }

#elif defined(COREARRAY_SIMD_SSE2)

    // header, 16-byte aligned
    size_t h = (16 - ((size_t)p & 0x0F)) & 0x0F;
    for (; (n > 0) && (h > 0); n--, h--, p++)
        if (*p <= 2) { sum += *p; num++; }

    const __m128i three = _mm_set1_epi8(3);
    const __m128i zero = _mm_setzero_si128();
    __m128i sum16=zero, num16=zero;
    size_t limit_by_U8 = 0;

    for (; n >= 16; )
    {
        __m128i v = _mm_load_si128((__m128i const*)p); p += 16;
        __m128i m = _mm_cmpgt_epi8(three, _mm_min_epu8(v, three));
        sum16 = _mm_add_epi8(sum16, _mm_and_si128(v, m));
        num16 = _mm_sub_epi8(num16, m);
        n -= 16;
        limit_by_U8 ++;
        if ((limit_by_U8 >= 127) || (n < 16))
        {
            // add to sum
            sum16 = _mm_sad_epu8(sum16, zero);
            sum += _mm_cvtsi128_si32(sum16);
            sum += _mm_cvtsi128_si32(_mm_shuffle_epi32(sum16, 2));
            // add to num
            num16 = _mm_sad_epu8(num16, zero);
            num += _mm_cvtsi128_si32(num16);
            num += _mm_cvtsi128_si32(_mm_shuffle_epi32(num16, 2));
            // reset
            sum16 = num16 = zero;
            limit_by_U8 = 0;
        }
    }

#endif

    for (; n > 0; n--, p++)
        if (*p <= 2) { sum += *p; num++; }

    out_sum = sum;
    out_num = num;
    return p;
}
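A short usage sketch for vec_u8_geno_count, assuming the function above is declared in scope; the C_UInt8 / C_Int32 typedefs below are placeholders standing in for the library's own definitions, and the driver itself is hypothetical:

#include <stdint.h>
#include <stdio.h>

typedef uint8_t C_UInt8;   // assumption: placeholder for the library typedef
typedef int32_t C_Int32;   // assumption: placeholder for the library typedef

int main()
{
    // genotypes 0..2 are summed and counted; values greater than 2 are treated as missing
    C_UInt8 geno[] = { 0, 1, 2, 3, 2, 2, 0, 3, 1 };
    C_Int32 sum = 0, num = 0;
    vec_u8_geno_count(geno, sizeof(geno), sum, num);
    printf("sum=%d num=%d\n", (int)sum, (int)num);   // expected: sum=8 num=7
    return 0;
}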