/*
 * Multiply a memory region in place by a constant, using the AVX2 PSHUFB
 * nibble-table technique: tl[constant] maps low nibbles and th[constant]
 * maps (shifted-down) high nibbles to their products; the two lookups are
 * XOR-combined (addition in a GF(2^k) field — presumably GF(16), one
 * element per nibble; NOTE(review): confirm field width against tl/th).
 *
 * Preconditions (unchanged from the original):
 *  - region is 32-byte aligned (aligned load/store are used),
 *  - length is a multiple of 32 bytes,
 *  - tl[constant] / th[constant] are 16-byte aligned 16-entry tables.
 */
void mulrc16_shuffle_avx2(uint8_t *region, uint8_t constant, size_t length) {
	uint8_t *end;
	__m256i in, out, t1, t2, m1, m2, l, h;
	__m128i bc;

	/* 0 * x == 0: zero the whole region. */
	if (constant == 0) {
		memset(region, 0, length);
		return;
	}

	/* 1 * x == x: nothing to do. */
	if (constant == 1)
		return;

	/* Broadcast the two 16-byte lookup tables into both 128-bit lanes.
	 * _mm256_broadcastsi128_si256 is the documented, portable intrinsic
	 * for the GCC-internal __builtin_ia32_vbroadcastsi256 used before. */
	bc = _mm_load_si128((const void *)tl[constant]);
	t1 = _mm256_broadcastsi128_si256(bc);
	bc = _mm_load_si128((const void *)th[constant]);
	t2 = _mm256_broadcastsi128_si256(bc);
	m1 = _mm256_set1_epi8(0x0f);
	m2 = _mm256_set1_epi8(0xf0);

	for (end = region + length; region < end; region += 32) {
		in = _mm256_load_si256((const void *)region);
		/* Product contribution of the low nibbles. */
		l = _mm256_and_si256(in, m1);
		l = _mm256_shuffle_epi8(t1, l);
		/* Product contribution of the high nibbles; shifted down so
		 * they can index the 16-entry table via pshufb. The bits
		 * shifted across byte boundaries are zero after the mask. */
		h = _mm256_and_si256(in, m2);
		h = _mm256_srli_epi64(h, 4);
		h = _mm256_shuffle_epi8(t2, h);
		/* GF(2^k) addition is XOR. */
		out = _mm256_xor_si256(h, l);
		_mm256_store_si256((void *)region, out);
	}
}
/*
 * Quantize 8 32-bit transform coefficients at once (AVX2).
 *
 * qp[0..2] appear to be, from usage: qp[0] rounding offset, qp[1] quantizer
 * multiplier consumed as a Q(16 - log_scale) fixed-point factor, qp[2]
 * dequant step (also used as the dead-zone threshold) —
 * NOTE(review): inferred from the arithmetic below; confirm against caller.
 *
 * Writes 8 quantized values to qcoeff, 8 dequantized values to dqcoeff,
 * and folds the highest scan position with a nonzero dequant into *eob.
 */
static INLINE void quantize(const __m256i *qp, __m256i *c,
                            const int16_t *iscan_ptr, int log_scale,
                            tran_low_t *qcoeff, tran_low_t *dqcoeff,
                            __m256i *eob) {
  const __m256i abs_coeff = _mm256_abs_epi32(*c);
  /* Round: |coeff| + rounding offset. */
  __m256i q = _mm256_add_epi32(abs_coeff, qp[0]);
  /* 32x32 -> 64-bit multiply by qp[1], done separately for the even
   * 32-bit lanes (mul_epi32 reads lanes 0,2,4,6)... */
  __m256i q_lo = _mm256_mul_epi32(q, qp[1]);
  /* ...and for the odd lanes, moved down into even position first. */
  __m256i q_hi = _mm256_srli_epi64(q, 32);
  const __m256i qp_hi = _mm256_srli_epi64(qp[1], 32);
  q_hi = _mm256_mul_epi32(q_hi, qp_hi);
  /* Take the fixed-point scale: >> (16 - log_scale). */
  q_lo = _mm256_srli_epi64(q_lo, 16 - log_scale);
  q_hi = _mm256_srli_epi64(q_hi, 16 - log_scale);
  /* Re-interleave the odd-lane results back into their 32-bit slots. */
  q_hi = _mm256_slli_epi64(q_hi, 32);
  q = _mm256_or_si256(q_lo, q_hi);
  /* Dead zone: zero every lane where 2^(1+log_scale) * |coeff| < qp[2]. */
  const __m256i abs_s = _mm256_slli_epi32(abs_coeff, 1 + log_scale);
  const __m256i mask = _mm256_cmpgt_epi32(qp[2], abs_s);
  q = _mm256_andnot_si256(mask, q);
  /* Dequantize: q * qp[2] >> log_scale. */
  __m256i dq = _mm256_mullo_epi32(q, qp[2]);
  dq = _mm256_srai_epi32(dq, log_scale);
  /* Restore the sign of the original coefficients. */
  q = _mm256_sign_epi32(q, *c);
  dq = _mm256_sign_epi32(dq, *c);
  _mm256_storeu_si256((__m256i *)qcoeff, q);
  _mm256_storeu_si256((__m256i *)dqcoeff, dq);
  /* Widen the 8 int16 scan indices to int32. */
  const __m128i isc = _mm_loadu_si128((const __m128i *)iscan_ptr);
  const __m128i zr = _mm_setzero_si128();
  const __m128i lo = _mm_unpacklo_epi16(isc, zr);
  const __m128i hi = _mm_unpackhi_epi16(isc, zr);
  const __m256i iscan = _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
  const __m256i zero = _mm256_setzero_si256();
  /* nz = all-ones (-1) in lanes where dq != 0. */
  const __m256i zc = _mm256_cmpeq_epi32(dq, zero);
  const __m256i nz = _mm256_cmpeq_epi32(zc, zero);
  /* iscan - (-1) == iscan + 1 for nonzero lanes; masked to those lanes. */
  __m256i cur_eob = _mm256_sub_epi32(iscan, nz);
  cur_eob = _mm256_and_si256(cur_eob, nz);
  /* Keep the running maximum end-of-block position. */
  *eob = _mm256_max_epi32(cur_eob, *eob);
}
static inline __m256i mulhi_epu64(__m256i x, __m256i y) { __m256i x_hi = _mm256_srli_epi64(x, 32); __m256i y_hi = _mm256_srli_epi64(y, 32); // __m256i mask = _mm256_set1_epi64x(0xFFFFFFFFL); // __m256i x_lo = _mm256_and_si256(x, mask); // __m256i y_lo = _mm256_and_si256(y, mask); /// masking is unnecessary because _mm256_mul_epu32 does it for us (for free): __m256i x_lo = x; __m256i y_lo = y; ///////////// __m256i result = _mm256_mul_epu32(x_lo,y_lo); result = _mm256_srli_epi64(result, 32); __m256i result1 = _mm256_mul_epu32(x_hi,y_lo); __m256i result2 = _mm256_mul_epu32(x_lo,y_hi); result = _mm256_add_epi64(result, result1); result = _mm256_add_epi64(result, result2); result = _mm256_srli_epi64(result, 32); __m256i result3 = _mm256_mul_epu32(x_hi,y_hi); result = _mm256_add_epi64(result, result3); return result; }
// assume N is divisible by 4 uint32_t vectorsum(uint32_t * z, uint32_t N, uint32_t * accesses, uint32_t nmbr) { __m256i Nvec = _mm256_set1_epi32(N); __m128i sum = _mm_setzero_si128(); for(uint32_t j = 0; j < nmbr ; j+=4) { __m256i fourints = _mm256_loadu_si256((const __m256i *)(accesses + j)); __m256i four64bitsproducts = _mm256_mul_epu32(fourints, Nvec); __m256i fourtop32ints = _mm256_srli_epi64(four64bitsproducts,32); __m128i four32ints = _mm256_i64gather_epi32 (z,fourtop32ints , 4); sum = _mm_add_epi32(sum, four32ints); } uint32_t buffer[4]; _mm_storeu_si128((__m128i *)buffer,sum); return buffer[0] + buffer[1] + buffer[2] + buffer[3]; }
/*
 * Sum of absolute values of `length` high-bit-depth (32-bit) transform
 * coefficients. length is assumed to be a multiple of 8.
 */
int vpx_highbd_satd_avx2(const tran_low_t *coeff, int length) {
  __m256i vsum = _mm256_setzero_si256();
  int idx;

  /* Accumulate |coeff[i]| eight 32-bit lanes at a time. */
  for (idx = 0; idx < length; idx += 8, coeff += 8) {
    const __m256i row = _mm256_loadu_si256((const __m256i *)coeff);
    vsum = _mm256_add_epi32(vsum, _mm256_abs_epi32(row));
  }

  {
    /* Horizontal reduction: fold the two 128-bit halves together, then
     * fold 64-bit and 32-bit halves within the low lane. */
    __m128i half = _mm_add_epi32(_mm256_castsi256_si128(vsum),
                                 _mm256_extractf128_si256(vsum, 1));
    half = _mm_add_epi32(half, _mm_srli_si128(half, 8));
    half = _mm_add_epi32(half, _mm_srli_si128(half, 4));
    return _mm_cvtsi128_si32(half);
  }
}
/*
 * Sum of absolute values of `length` transform coefficients, loaded as
 * 16 x int16 lanes per iteration via the project's load_tran_low() helper.
 * length is assumed to be a multiple of 16.
 */
int vpx_satd_avx2(const tran_low_t *coeff, int length) {
  const __m256i ones = _mm256_set1_epi16(1);
  __m256i vsum = _mm256_setzero_si256();
  int idx;

  for (idx = 0; idx < length; idx += 16, coeff += 16) {
    const __m256i row = load_tran_low(coeff);
    /* madd(|x|, 1) pairs adjacent 16-bit absolute values into 32-bit
     * partial sums before they join the 32-bit accumulator. */
    const __m256i pair_sums = _mm256_madd_epi16(_mm256_abs_epi16(row), ones);
    vsum = _mm256_add_epi32(vsum, pair_sums);
  }

  {
    /* Horizontal reduction: fold the two 128-bit halves together, then
     * fold 64-bit and 32-bit halves within the low lane. */
    __m128i half = _mm_add_epi32(_mm256_castsi256_si128(vsum),
                                 _mm256_extractf128_si256(vsum, 1));
    half = _mm_add_epi32(half, _mm_srli_si128(half, 8));
    half = _mm_add_epi32(half, _mm_srli_si128(half, 4));
    return _mm_cvtsi128_si32(half);
  }
}
/*
 * Multiply-accumulate: region1 ^= constant * region2, using the AVX2
 * PSHUFB nibble-table technique (tl[constant] for low nibbles,
 * th[constant] for shifted-down high nibbles; XOR is field addition —
 * presumably GF(16); NOTE(review): confirm field width against tl/th).
 *
 * Preconditions (unchanged from the original):
 *  - region1 and region2 are 32-byte aligned (aligned load/store),
 *  - length is a multiple of 32 bytes,
 *  - tl[constant] / th[constant] are 16-byte aligned 16-entry tables.
 */
void maddrc16_shuffle_avx2(uint8_t* region1, const uint8_t* region2,
                           uint8_t constant, size_t length) {
	uint8_t *end;
	__m256i in1, in2, out, t1, t2, m1, m2, l, h;
	__m128i bc;

	/* 0 * region2 == 0: region1 is unchanged. */
	if (constant == 0)
		return;

	/* 1 * region2 == region2: plain XOR accumulate. */
	if (constant == 1) {
		xorr_avx2(region1, region2, length);
		return;
	}

	/* Broadcast the two 16-byte lookup tables into both 128-bit lanes.
	 * _mm256_broadcastsi128_si256 is the documented, portable intrinsic
	 * for the GCC-internal __builtin_ia32_vbroadcastsi256 used before. */
	bc = _mm_load_si128((const void *)tl[constant]);
	t1 = _mm256_broadcastsi128_si256(bc);
	bc = _mm_load_si128((const void *)th[constant]);
	t2 = _mm256_broadcastsi128_si256(bc);
	m1 = _mm256_set1_epi8(0x0f);
	m2 = _mm256_set1_epi8(0xf0);

	for (end = region1 + length; region1 < end; region1 += 32, region2 += 32) {
		in2 = _mm256_load_si256((const void *)region2);
		in1 = _mm256_load_si256((const void *)region1);
		/* Product contribution of the low nibbles of region2. */
		l = _mm256_and_si256(in2, m1);
		l = _mm256_shuffle_epi8(t1, l);
		/* Product contribution of the high nibbles, shifted down so
		 * they can index the 16-entry table via pshufb. */
		h = _mm256_and_si256(in2, m2);
		h = _mm256_srli_epi64(h, 4);
		h = _mm256_shuffle_epi8(t2, h);
		/* out = constant * in2; then accumulate into region1 by XOR. */
		out = _mm256_xor_si256(h, l);
		out = _mm256_xor_si256(out, in1);
		_mm256_store_si256((void *)region1, out);
	}
}
// Clang IR codegen test: the CHECK line below is a FileCheck directive and
// must stay byte-identical. It verifies that the intrinsic lowers to the
// x86 AVX2 @llvm.x86.avx2.psrli.q builtin (64-bit logical shift right).
__m256i test_mm256_srli_epi64(__m256i a) {
  // CHECK: @llvm.x86.avx2.psrli.q
  return _mm256_srli_epi64(a, 3);
}
// Clang IR codegen test (stricter variant): CHECK-LABEL anchors matching to
// this function, and the second CHECK matches the full typed call to the
// @llvm.x86.avx2.psrli.q builtin. Both lines are FileCheck directives and
// must stay byte-identical.
__m256i test_mm256_srli_epi64(__m256i a) {
  // CHECK-LABEL: test_mm256_srli_epi64
  // CHECK: call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %{{.*}}, i32 %{{.*}})
  return _mm256_srli_epi64(a, 3);
}
/*
 * Sort the 16 nibbles of each 64-bit word in `buf` (4 words per 256-bit
 * vector; the loop runs 1024/4 = 256 iterations, i.e. it assumes buf holds
 * exactly 1024 uint64_t values — NOTE(review): fixed-size assumption,
 * confirm against the caller).
 *
 * Strategy: split every byte into its two nibbles (one nibble per byte
 * lane, values 0..15 so signed byte min/max is safe), run a multi-stage
 * sorting network built from byte-wise min/max plus pshufb permutations
 * (the shufN tables below), then pack the two nibble planes back together.
 *
 * Uses GNU vector extensions (&=, ^=, |) and brace-initialized __m256i,
 * so this is GCC/Clang-specific code.
 */
void nibble_sort_beekman1(uint64_t *buf) {
  // already in the right order
  //__m256i
  // shuf0={0x1716151413121110ULL,0x1f1e1d1c1b1a1918ULL,0x0706050403020100ULL,0x0f0e0d0c0b0a0908ULL};
  /* Per-stage byte permutation tables consumed by _mm256_shuffle_epi8;
   * each byte selects a source byte within its own 128-bit lane. */
  __m256i shuf1 = {0x1e161c141a121810ULL, 0x1f171d151b131911ULL,
                   0x0e060c040a020800ULL, 0x0f070d050b030901ULL};
  __m256i shuf2 = {0x1d1c151419181110ULL, 0x1f1e17161b1a1312ULL,
                   0x0d0c050409080100ULL, 0x0f0e07060b0a0302ULL};
  // use less instructions below
  //__m256i
  // shuf3={0x1b1a191813121110ULL,0x1f1e1d1c17161514ULL,0x0b0a090803020100ULL,0x0f0e0d0c07060504ULL};
  __m256i shuf4 = {0x101d171615141311ULL, 0x1f1e1b191a181c12ULL,
                   0x000d070605040301ULL, 0x0f0e0b090a080c02ULL};
  __m256i shuf5 = {0x171d151413111810ULL, 0x1f1e16191c1b1a12ULL,
                   0x070d050403010800ULL, 0x0f0e06090c0b0a02ULL};
  __m256i shuf6 = {0x1e17161a15141211ULL, 0x1f101d1c1b191318ULL,
                   0x0e07060a05040201ULL, 0x0f000d0c0b090308ULL};
  __m256i shuf7 = {0x171510161b131911ULL, 0x1f1d181e1c141a12ULL,
                   0x070500060b030901ULL, 0x0f0d080e0c040a02ULL};
  __m256i shuf8 = {0x1715141613121110ULL, 0x1f1e1c1b1a19181dULL,
                   0x0705040603020100ULL, 0x0f0e0c0b0a09080dULL};
  __m256i shuf9 = {0x171c1b1a19181615ULL, 0x1f1e14131211101dULL,
                   0x070c0b0a09080605ULL, 0x0f0e04030201000dULL};
  __m256i nibblemask = _mm256_set1_epi8(0x0f);
  for (uint32_t i = 0; i < (1024 / 4); i += 1) {
    __m256i r0 = _mm256_loadu_si256(((__m256i *)buf) + i), r1 = r0, r2;
    /* r0 <- the low nibble of every byte. */
    r0 &= nibblemask;
    /* r1 <- high-nibble bits only (the xor clears the low-nibble bits)... */
    r1 ^= r0;
    /* ...moved down into low-nibble position; the bits shifted in across
     * byte boundaries are zero because the low nibbles were cleared. */
    r1 = _mm256_srli_epi64(r1, 4);
/* One compare-exchange + redistribution stage of the sorting network:
 * byte-wise min/max, regroup the qword halves with shuffle_pd, then
 * permute via the stage's shufN table. */
#define sort_and_shuffle(n) \
  r2 = _mm256_max_epi8(r0, r1); \
  r0 = _mm256_min_epi8(r0, r1); \
  r1 = (__m256i)_mm256_shuffle_pd((__m256d)r0, (__m256d)r2, 0b0000); \
  r2 = (__m256i)_mm256_shuffle_pd((__m256d)r0, (__m256d)r2, 0b1111); \
  r1 = _mm256_shuffle_epi8(r1, shuf##n); \
  r2 = _mm256_shuffle_epi8(r2, shuf##n); \
  r0 = (__m256i)_mm256_shuffle_pd((__m256d)r1, (__m256d)r2, 0b0000); \
  r1 = (__m256i)_mm256_shuffle_pd((__m256d)r1, (__m256d)r2, 0b1111)
    sort_and_shuffle(1);
    sort_and_shuffle(2);
    { // sort_and_shuffle(3);
      /* Stage 3 open-coded with unpack instead of shuffle_pd + table,
       * which needs fewer instructions (the unused shuf3 table is the
       * commented-out constant above). */
      r2 = _mm256_max_epi8(r0, r1);
      r0 = _mm256_min_epi8(r0, r1);
      r1 = (__m256i)_mm256_unpacklo_ps((__m256)r0, (__m256)r2);
      r2 = (__m256i)_mm256_unpackhi_ps((__m256)r0, (__m256)r2);
      r0 = (__m256i)_mm256_shuffle_pd((__m256d)r1, (__m256d)r2, 0b1111);
      r1 = (__m256i)_mm256_shuffle_pd((__m256d)r1, (__m256d)r2, 0b0000);
    }
    sort_and_shuffle(4);
    sort_and_shuffle(5);
    sort_and_shuffle(6);
    sort_and_shuffle(7);
    sort_and_shuffle(8);
    sort_and_shuffle(9);
    /* Recombine the two nibble planes: r1 supplies the high nibbles. */
    r1 = _mm256_slli_epi64(r1, 4);
    _mm256_storeu_si256(((__m256i *)buf) + i, r1 | r0);
  }
}
/*
 * Blend one group of 8 output pixels from a forward and a backward
 * motion-compensated reference, weighted by per-pixel occlusion masks and
 * a temporal position (MVTools-style flow interpolation —
 * NOTE(review): inferred from names; confirm against the caller).
 *
 * w                    - x offset of this 8-pixel group within the row
 * pdst                 - destination row
 * prefB / prefF        - backward / forward reference planes
 * VX*/VY* (int16)      - per-pixel motion vector components, consumed by
 *                        lookup_AVX2 (defined elsewhere)
 * MaskB / MaskF        - per-pixel blend masks, bytes in 0..255
 * nPelLog              - log2 of the sub-pel accuracy (scales x offsets)
 * dwords_time256 / dwords_256_time256 - temporal weights (presumably
 *                        t*256 and 256 - t*256; verify against caller)
 * dwords_ref_pitch     - reference plane pitch, broadcast per lane
 * dwords_hoffsets      - the 8 per-lane x offsets
 *
 * PixelType is 1 or 2 bytes; every sizeof(PixelType) branch is resolved
 * at compile time.
 */
static FORCE_INLINE void FlowInter_8px_AVX2(
        int w, PixelType *pdst, const PixelType *prefB, const PixelType *prefF,
        const int16_t *VXFullB, const int16_t *VXFullF,
        const int16_t *VYFullB, const int16_t *VYFullF,
        const uint8_t *MaskB, const uint8_t *MaskF,
        int nPelLog,
        const __m256i &dwords_time256, const __m256i &dwords_256_time256,
        const __m256i &dwords_ref_pitch, const __m256i &dwords_hoffsets) {
    /* Element offsets of the 8 pixels of this group. */
    __m256i dwords_w = _mm256_add_epi32(_mm256_set1_epi32(w << nPelLog), dwords_hoffsets);

    /* Motion-compensated samples fetched through the vector fields. */
    __m256i dstF = lookup_AVX2(VXFullF, VYFullF, prefF, w, dwords_time256, dwords_ref_pitch, dwords_w);
    __m256i dstB = lookup_AVX2(VXFullB, VYFullB, prefB, w, dwords_256_time256, dwords_ref_pitch, dwords_w);

    /* Zero-motion samples at the same positions; the gather reads 32 bits,
     * so mask down to the pixel width afterwards. */
    __m256i dstF0 = _mm256_i32gather_epi32((const int *)prefF, dwords_w, sizeof(PixelType));
    __m256i dstB0 = _mm256_i32gather_epi32((const int *)prefB, dwords_w, sizeof(PixelType));
    dstF0 = _mm256_and_si256(dstF0, _mm256_set1_epi32((1 << (sizeof(PixelType) * 8)) - 1));
    dstB0 = _mm256_and_si256(dstB0, _mm256_set1_epi32((1 << (sizeof(PixelType) * 8)) - 1));

    /* Widen the 8 mask bytes to 32-bit lanes; maskX_inv = 255 - maskX. */
    __m256i maskf = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)&MaskF[w]));
    __m256i maskb = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)&MaskB[w]));
    const __m256i dwords_255 = _mm256_set1_epi32(255);
    __m256i maskf_inv = _mm256_sub_epi32(dwords_255, maskf);
    __m256i maskb_inv = _mm256_sub_epi32(dwords_255, maskb);

    __m256i dstF_maskf_inv, dstB_maskb_inv, dstF0_maskb, dstB0_maskf;
    if (sizeof(PixelType) == 1) {
        /* 8-bit pixels: pixel (<=255) * mask (<=255) fits in 16 bits, so
         * the cheaper 16-bit multiply is enough per 32-bit lane. */
        dstF_maskf_inv = _mm256_mullo_epi16(dstF, maskf_inv);
        dstB_maskb_inv = _mm256_mullo_epi16(dstB, maskb_inv);
        dstF0_maskb = _mm256_mullo_epi16(dstF0, maskb);
        dstB0_maskf = _mm256_mullo_epi16(dstB0, maskf);
    } else {
        dstF_maskf_inv = _mm256_mullo_epi32(dstF, maskf_inv);
        dstB_maskb_inv = _mm256_mullo_epi32(dstB, maskb_inv);
        dstF0_maskb = _mm256_mullo_epi32(dstF0, maskb);
        dstB0_maskf = _mm256_mullo_epi32(dstB0, maskf);
    }

    /* Occlusion blend of motion-compensated vs. zero-motion samples. */
    __m256i f = _mm256_add_epi32(dstF0_maskb, dstB_maskb_inv);
    __m256i b = _mm256_add_epi32(dstB0_maskf, dstF_maskf_inv);

    if (sizeof(PixelType) == 1) {
        /* 8-bit path: the products fit in 32 bits; (x + 255) >> 8 is the
         * rounding division by 256 used throughout. */
        f = _mm256_mullo_epi32(f, maskf);
        b = _mm256_mullo_epi32(b, maskb);
        f = _mm256_add_epi32(f, dwords_255);
        b = _mm256_add_epi32(b, dwords_255);
        f = _mm256_srai_epi32(f, 8);
        b = _mm256_srai_epi32(b, 8);
    } else {
        /* 16-bit path: f * mask can exceed 32 bits, so do the multiply at
         * 64-bit width — even lanes via mul_epu32 directly, odd lanes by
         * shifting them down first — then recombine. */
        const __m256i qwords_255 = _mm256_set1_epi64x(255);
        __m256i tempf = _mm256_mul_epu32(f, maskf);
        __m256i tempb = _mm256_mul_epu32(b, maskb);
        tempf = _mm256_add_epi64(tempf, qwords_255);
        tempb = _mm256_add_epi64(tempb, qwords_255);
        tempf = _mm256_srli_epi64(tempf, 8);
        tempb = _mm256_srli_epi64(tempb, 8);
        f = _mm256_srli_epi64(f, 32);
        b = _mm256_srli_epi64(b, 32);
        f = _mm256_mul_epu32(f, _mm256_srli_epi64(maskf, 32));
        b = _mm256_mul_epu32(b, _mm256_srli_epi64(maskb, 32));
        f = _mm256_add_epi64(f, qwords_255);
        b = _mm256_add_epi64(b, qwords_255);
        f = _mm256_srli_epi64(f, 8);
        b = _mm256_srli_epi64(b, 8);
        /* Interleave the even-lane (tempf/tempb) and odd-lane results. */
        f = _mm256_or_si256(tempf, _mm256_slli_epi64(f, 32));
        b = _mm256_or_si256(tempb, _mm256_slli_epi64(b, 32));
    }

    /* Second blend level, again with rounding division by 256. */
    f = _mm256_add_epi32(f, dstF_maskf_inv);
    b = _mm256_add_epi32(b, dstB_maskb_inv);
    f = _mm256_add_epi32(f, dwords_255);
    b = _mm256_add_epi32(b, dwords_255);
    f = _mm256_srai_epi32(f, 8);
    b = _mm256_srai_epi32(b, 8);

    /* Temporal weighting. In the 8-bit path the values fit in the low 16
     * bits of each 32-bit lane, so madd_epi16 acts as a cheap multiply
     * (the high 16 bits are zero). */
    if (sizeof(PixelType) == 1) {
        f = _mm256_madd_epi16(f, dwords_256_time256);
        b = _mm256_madd_epi16(b, dwords_time256);
    } else {
        f = _mm256_mullo_epi32(f, dwords_256_time256);
        b = _mm256_mullo_epi32(b, dwords_time256);
    }
    __m256i dst = _mm256_add_epi32(f, b);
    dst = _mm256_srai_epi32(dst, 8);

    /* Narrow 8 x int32 to 8 x uint16 and compact into the low 128 bits. */
    dst = _mm256_packus_epi32(dst, dst);
    dst = _mm256_permute4x64_epi64(dst, 0xe8); // 0b11101000 - copy third qword to second qword
    __m128i dst128 = _mm256_castsi256_si128(dst);
    if (sizeof(PixelType) == 1) {
        /* Narrow once more to 8 bytes for 8-bit output. */
        dst128 = _mm_packus_epi16(dst128, dst128);
        _mm_storel_epi64((__m128i *)&pdst[w], dst128);
    } else {
        _mm_storeu_si128((__m128i *)&pdst[w], dst128);
    }
}