static INLINE unsigned int highbd_masked_sad16xh_avx2(
    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
    int width, int height) {
  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
  int x, y;
  __m256i res = _mm256_setzero_si256();
  const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m256i round_const =
      _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
  const __m256i one = _mm256_set1_epi16(1);

  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x += 16) {
      const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]);
      const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]);
      const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]);
      // Zero-extend mask to 16 bits
      const __m256i m =
          _mm256_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)&m_ptr[x]));
      const __m256i m_inv = _mm256_sub_epi16(mask_max, m);

      const __m256i data_l = _mm256_unpacklo_epi16(a, b);
      const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv);
      __m256i pred_l = _mm256_madd_epi16(data_l, mask_l);
      pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const),
                                 AOM_BLEND_A64_ROUND_BITS);

      const __m256i data_r = _mm256_unpackhi_epi16(a, b);
      const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv);
      __m256i pred_r = _mm256_madd_epi16(data_r, mask_r);
      pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const),
                                 AOM_BLEND_A64_ROUND_BITS);

      // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
      // so it is safe to do signed saturation here.
      const __m256i pred = _mm256_packs_epi32(pred_l, pred_r);
      // There is no 16-bit SAD instruction, so we have to synthesize one:
      // madd against a vector of ones sums adjacent pairs of absolute
      // differences into eight 32-bit partial SADs, which we accumulate
      // here and fold together at the end.
      const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src));
      res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one));
    }
    src_ptr += src_stride;
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  // At this point, we have eight 32-bit partial SADs stored in 'res'. Two
  // rounds of hadd fold each 128-bit lane down to a single sum in elements
  // 0 and 4, which are then added together.
  res = _mm256_hadd_epi32(res, res);
  res = _mm256_hadd_epi32(res, res);
  int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4);
  return (sad + 31) >> 6;
}
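// A minimal standalone sketch (not from the original source) of the
// madd-with-ones idiom used above: absolute 16-bit differences are widened
// and pairwise-summed into eight 32-bit lanes, then folded with two hadds.
// Compile with -mavx2; the array contents are made up for the demo.
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  int16_t pred[16], src[16];
  for (int i = 0; i < 16; ++i) {
    pred[i] = (int16_t)(i * 100);
    src[i] = (int16_t)(i * 97);  // so |pred[i] - src[i]| = 3 * i
  }
  const __m256i p = _mm256_loadu_si256((const __m256i *)pred);
  const __m256i s = _mm256_loadu_si256((const __m256i *)src);
  const __m256i one = _mm256_set1_epi16(1);
  // 16 absolute differences -> 8 x 32-bit partial sums.
  const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(p, s));
  __m256i res = _mm256_madd_epi16(diff, one);
  // Fold exactly as the function above does.
  res = _mm256_hadd_epi32(res, res);
  res = _mm256_hadd_epi32(res, res);
  const int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4);
  printf("sad = %d\n", sad);  // 3 * (0 + 1 + ... + 15) = 360
  return 0;
}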
static uint32_t maxbitas32int(const __m256i accumulator) {
  // OR-fold each 128-bit lane onto itself: first by 8 bytes, then by
  // 4 bytes, so element 0 of each lane ends up as the OR of all four
  // 32-bit elements in that lane.
  const __m256i _tmp1 =
      _mm256_or_si256(_mm256_srli_si256(accumulator, 8), accumulator);
  const __m256i _tmp2 = _mm256_or_si256(_mm256_srli_si256(_tmp1, 4), _tmp1);
  uint32_t ans1 = _mm256_extract_epi32(_tmp2, 0);
  uint32_t ans2 = _mm256_extract_epi32(_tmp2, 4);
  // The larger of the two lane ORs has the same bit width as the largest
  // accumulated element.
  uint32_t ans = ans1 > ans2 ? ans1 : ans2;
  return ans;
}
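// A minimal usage sketch for maxbitas32int above (not from the original
// source; assumes AVX2 and a length that is a multiple of 8, with made-up
// data). The accumulator is built by OR-ing all inputs together, so the
// returned value has as many significant bits as the largest element --
// the usual "maxbits" computation in integer bit packing.
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t bit_width(uint32_t v) {  // number of significant bits
  uint32_t r = 0;
  while (v) {
    ++r;
    v >>= 1;
  }
  return r;
}

int main(void) {
  uint32_t data[16];
  for (int i = 0; i < 16; ++i) data[i] = (uint32_t)(i * i);  // max is 225
  __m256i acc = _mm256_setzero_si256();
  for (int i = 0; i < 16; i += 8)
    acc = _mm256_or_si256(acc, _mm256_loadu_si256((const __m256i *)&data[i]));
  printf("maxbits = %u\n", bit_width(maxbitas32int(acc)));  // prints 8
  return 0;
}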
int32_t avx2_sumsignedbytes(int8_t* array, size_t size) {
    __m256i accumulator = _mm256_setzero_si256();

    for (size_t i = 0; i < size; i += 32) {
        const __m256i v = _mm256_loadu_si256((__m256i*)(array + i));
        const __m128i lo = _mm256_extracti128_si256(v, 0);
        const __m128i hi = _mm256_extracti128_si256(v, 1);

        // Sign-extend each group of 8 bytes into 8 x 32-bit integers.
        const __m256i t0 = _mm256_cvtepi8_epi32(lo);
        const __m256i t1 = _mm256_cvtepi8_epi32(hi);
        const __m256i t2 = _mm256_cvtepi8_epi32(_mm_bsrli_si128(lo, 8));
        const __m256i t3 = _mm256_cvtepi8_epi32(_mm_bsrli_si128(hi, 8));

        accumulator = _mm256_add_epi32(accumulator, t0);
        accumulator = _mm256_add_epi32(accumulator, t1);
        accumulator = _mm256_add_epi32(accumulator, t2);
        accumulator = _mm256_add_epi32(accumulator, t3);
    }

    // Horizontal sum of the eight 32-bit partial sums.
    return int32_t(_mm256_extract_epi32(accumulator, 0)) +
           int32_t(_mm256_extract_epi32(accumulator, 1)) +
           int32_t(_mm256_extract_epi32(accumulator, 2)) +
           int32_t(_mm256_extract_epi32(accumulator, 3)) +
           int32_t(_mm256_extract_epi32(accumulator, 4)) +
           int32_t(_mm256_extract_epi32(accumulator, 5)) +
           int32_t(_mm256_extract_epi32(accumulator, 6)) +
           int32_t(_mm256_extract_epi32(accumulator, 7));
}
int32_t avx2_sumsignedbytes_variant2(int8_t* array, size_t size) {
    __m256i accumulator = _mm256_setzero_si256();

    for (size_t i = 0; i < size; i += 32) {
        const __m256i v = _mm256_loadu_si256((__m256i*)(array + i));
        // Sign-extend byte k of every 32-bit word by shifting it into the
        // top byte and arithmetic-shifting back down by 24 bits.
        const __m256i v0 = _mm256_srai_epi32(v, 3*8);
        const __m256i v1 = _mm256_srai_epi32(_mm256_slli_epi32(v, 1*8), 3*8);
        const __m256i v2 = _mm256_srai_epi32(_mm256_slli_epi32(v, 2*8), 3*8);
        const __m256i v3 = _mm256_srai_epi32(_mm256_slli_epi32(v, 3*8), 3*8);

        accumulator = _mm256_add_epi32(accumulator, v0);
        accumulator = _mm256_add_epi32(accumulator, v1);
        accumulator = _mm256_add_epi32(accumulator, v2);
        accumulator = _mm256_add_epi32(accumulator, v3);
    }

    // Horizontal sum of the eight 32-bit partial sums.
    return int32_t(_mm256_extract_epi32(accumulator, 0)) +
           int32_t(_mm256_extract_epi32(accumulator, 1)) +
           int32_t(_mm256_extract_epi32(accumulator, 2)) +
           int32_t(_mm256_extract_epi32(accumulator, 3)) +
           int32_t(_mm256_extract_epi32(accumulator, 4)) +
           int32_t(_mm256_extract_epi32(accumulator, 5)) +
           int32_t(_mm256_extract_epi32(accumulator, 6)) +
           int32_t(_mm256_extract_epi32(accumulator, 7));
}
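// A quick scalar cross-check for the two routines above (a sketch, not from
// the original source; assumes AVX2 and a size that is a multiple of 32,
// which both loops already require; the buffer contents are made up).
#include <immintrin.h>
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

int main(void) {
    int8_t buf[64];
    for (size_t i = 0; i < 64; ++i)
        buf[i] = (int8_t)(i * 37);  // wraps around, so some values are negative
    int32_t expected = 0;
    for (size_t i = 0; i < 64; ++i) expected += buf[i];
    printf("scalar=%d  variant1=%d  variant2=%d\n", expected,
           avx2_sumsignedbytes(buf, 64), avx2_sumsignedbytes_variant2(buf, 64));
    return 0;
}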
void inline Write8(unsigned char* out, int offset, __m256i v)
{
    // Byte-swap each 32-bit word so that the little-endian stores below
    // emit the words in big-endian order.
    v = _mm256_shuffle_epi8(
        v, _mm256_set_epi32(0x0C0D0E0FUL, 0x08090A0BUL, 0x04050607UL,
                            0x00010203UL, 0x0C0D0E0FUL, 0x08090A0BUL,
                            0x04050607UL, 0x00010203UL));
    WriteLE32(out + 0 + offset, _mm256_extract_epi32(v, 7));
    WriteLE32(out + 32 + offset, _mm256_extract_epi32(v, 6));
    WriteLE32(out + 64 + offset, _mm256_extract_epi32(v, 5));
    WriteLE32(out + 96 + offset, _mm256_extract_epi32(v, 4));
    WriteLE32(out + 128 + offset, _mm256_extract_epi32(v, 3));
    WriteLE32(out + 160 + offset, _mm256_extract_epi32(v, 2));
    WriteLE32(out + 192 + offset, _mm256_extract_epi32(v, 1));
    WriteLE32(out + 224 + offset, _mm256_extract_epi32(v, 0));
}
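// WriteLE32 is not defined in this snippet. A minimal sketch of the assumed
// helper: store a 32-bit value at an unaligned address in little-endian byte
// order (on a little-endian host a plain memcpy already does this).
#include <stdint.h>
#include <string.h>

static inline void WriteLE32(unsigned char* ptr, uint32_t x) {
    memcpy(ptr, &x, 4);  // assumes a little-endian host
}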
// Extracts and converts 8x32-bit results from result, adding the bias from wi
// and scaling by scales, before storing in *v. Note that wi, scales and v are
// expected to contain 8 consecutive elements or num_out if less.
inline void ExtractResults(__m256i& result, __m256i& shift_id,
                           const int8_t*& wi, const double*& scales,
                           int num_out, double*& v) {
  for (int out = 0; out < num_out; ++out) {
    int32_t res = _mm256_extract_epi32(result, 0);
    *v++ = (static_cast<double>(res) / MAX_INT8 + *wi++) * *scales++;
    // Rotate the results in int32_t units, so the next result is ready.
    result = _mm256_permutevar8x32_epi32(result, shift_id);
  }
}
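// ExtractResults leaves shift_id to the caller, and MAX_INT8 is likewise
// external (presumably the int8_t maximum, 127). A sketch of an index vector
// that rotates the eight 32-bit lanes down by one, so that each
// _mm256_permutevar8x32_epi32 brings the next result into lane 0
// (_mm256_set_epi32 lists lanes from high to low, so lane i reads lane i + 1
// and lane 7 wraps back to lane 0):
const __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);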
static INLINE unsigned int masked_sad32xh_avx2(
    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
    int width, int height) {
  int x, y;
  __m256i res = _mm256_setzero_si256();
  const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m256i round_scale =
      _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x += 32) {
      const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]);
      const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]);
      const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]);
      const __m256i m = _mm256_lddqu_si256((const __m256i *)&m_ptr[x]);
      const __m256i m_inv = _mm256_sub_epi8(mask_max, m);

      // Calculate 32 predicted pixels (16 in each of pred_l and pred_r).
      // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
      // is 64 * 255, so we have plenty of space to add rounding constants.
      const __m256i data_l = _mm256_unpacklo_epi8(a, b);
      const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv);
      __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l);
      pred_l = _mm256_mulhrs_epi16(pred_l, round_scale);

      const __m256i data_r = _mm256_unpackhi_epi8(a, b);
      const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv);
      __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r);
      pred_r = _mm256_mulhrs_epi16(pred_r, round_scale);

      const __m256i pred = _mm256_packus_epi16(pred_l, pred_r);
      res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src));
    }
    src_ptr += src_stride;
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  // At this point, we have four 32-bit partial SADs, stored in 32-bit
  // elements 0, 2, 4 and 6 of 'res' (each _mm256_sad_epu8 result is a
  // 64-bit sum whose upper half stays zero). Pack them together and fold
  // them down to a single total.
  res = _mm256_shuffle_epi32(res, 0xd8);
  res = _mm256_permute4x64_epi64(res, 0xd8);
  res = _mm256_hadd_epi32(res, res);
  res = _mm256_hadd_epi32(res, res);
  int32_t sad = _mm256_extract_epi32(res, 0);
  return (sad + 31) >> 6;
}
int __attribute__((target("avx"))) bar(__m256i a) {
  return _mm256_extract_epi32(a, 3);
}
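// A minimal dispatch sketch (hypothetical, not from the original): code that
// touches __m256i must itself be compiled for AVX, so the runtime check lives
// in plain code and the AVX work is confined to a target("avx") helper.
#include <immintrin.h>
#include <stdio.h>

static int __attribute__((target("avx"))) call_bar(void) {
  const __m256i a = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  return bar(a);  // extracts lane 3 -> 3
}

int main(void) {
  if (__builtin_cpu_supports("avx"))
    printf("bar() = %d\n", call_bar());
  else
    printf("AVX not available\n");
  return 0;
}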
int main(int argc, char **argv) {
  int w, h, bit_num = 0;
  char byte_acc = 0;
  long byte_total = 0;
  int i, iter = 50;
  double x, y, limit = 2.0;
  double Zr, Zi, Cr, Ci, Tr, Ti;

  w = h = argc > 1 ? atoi(argv[1]) : 32000;
  printf("P4\n%d %d\n", w, h);

#ifdef USEAVX512
  __m512i a = _mm512_set1_epi32(0);
  __m512i b = _mm512_set1_epi32(1);
  __m512i t;
#endif

  for (y = 0; y < h; ++y) {
#ifdef USEAVX512
    t = a;
    a = b;
#ifdef USEHEAVYAVX512
    b = _mm512_mul_epi32(b, t);
#else
    b = _mm512_add_epi32(b, t);
#endif
#endif
    for (x = 0; x < w; ++x) {
      Zr = Zi = Tr = Ti = 0.0;
      Cr = (2.0 * x / w - 1.5);
      Ci = (2.0 * y / h - 1.0);

      for (i = 0; i < iter && (Tr + Ti <= limit * limit); ++i) {
        Zi = 2.0 * Zr * Zi + Ci;
        Zr = Tr - Ti + Cr;
        Tr = Zr * Zr;
        Ti = Zi * Zi;
      }

      byte_acc <<= 1;
      if (Tr + Ti <= limit * limit) byte_acc |= 0x01;

      ++bit_num;

      if (bit_num == 8) {
        byte_total += byte_acc;
        // putc(byte_acc,stdout);
        byte_acc = 0;
        bit_num = 0;
      } else if (x == w - 1) {
        byte_acc <<= (8 - w % 8);
        byte_total += byte_acc;
        // putc(byte_acc,stdout);
        byte_acc = 0;
        bit_num = 0;
      }
    }
  }

#ifdef USEAVX512
  printf("we used avx512 %d \n",
         _mm256_extract_epi32(_mm512_extracti64x4_epi64(b, 1), 7));
#else
  printf("we did not use avx512\n");
#endif
  return byte_total;
}