static uint64_t popcnt_harley_seal(const __m512i* data, const uint64_t size)
{
  __m256i total = _mm256_setzero_si256();
  __m512i ones = _mm512_setzero_si512();
  __m512i twos = _mm512_setzero_si512();
  __m512i fours = _mm512_setzero_si512();
  __m512i eights = _mm512_setzero_si512();
  __m512i sixteens = _mm512_setzero_si512();
  __m512i twosA, twosB, foursA, foursB, eightsA, eightsB;

  const uint64_t limit = size - size % 16;
  uint64_t i = 0;

  for (; i < limit; i += 16)
  {
    CSA(&twosA, &ones, ones, data[i+0], data[i+1]);
    CSA(&twosB, &ones, ones, data[i+2], data[i+3]);
    CSA(&foursA, &twos, twos, twosA, twosB);
    CSA(&twosA, &ones, ones, data[i+4], data[i+5]);
    CSA(&twosB, &ones, ones, data[i+6], data[i+7]);
    CSA(&foursB, &twos, twos, twosA, twosB);
    CSA(&eightsA, &fours, fours, foursA, foursB);
    CSA(&twosA, &ones, ones, data[i+8], data[i+9]);
    CSA(&twosB, &ones, ones, data[i+10], data[i+11]);
    CSA(&foursA, &twos, twos, twosA, twosB);
    CSA(&twosA, &ones, ones, data[i+12], data[i+13]);
    CSA(&twosB, &ones, ones, data[i+14], data[i+15]);
    CSA(&foursB, &twos, twos, twosA, twosB);
    CSA(&eightsB, &fours, fours, foursA, foursB);
    CSA(&sixteens, &eights, eights, eightsA, eightsB);

    total = _mm256_add_epi64(total, popcount(sixteens));
  }

  total = _mm256_slli_epi64(total, 4);                                      // * 16
  total = _mm256_add_epi64(total, _mm256_slli_epi64(popcount(eights), 3));  // += 8 * ...
  total = _mm256_add_epi64(total, _mm256_slli_epi64(popcount(fours), 2));   // += 4 * ...
  total = _mm256_add_epi64(total, _mm256_slli_epi64(popcount(twos), 1));    // += 2 * ...
  total = _mm256_add_epi64(total, popcount(ones));

  for (; i < size; i++)
    total = _mm256_add_epi64(total, popcount(data[i]));

  return avx2_sum_epu64(total);
}
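The loop relies on helpers the snippet does not show: a per-vector popcount() reduced to four 64-bit counters, a horizontal sum avx2_sum_epu64(), and the carry-save adder CSA(). A minimal sketch of CSA consistent with the call sites, assuming the AVX-512F ternary-logic instruction is available (0x96 is the three-way XOR truth table, 0xE8 the majority table):

#include <immintrin.h>

// Hypothetical helper, sketched from the call sites above: a carry-save adder.
// *l receives the bitwise sum (a ^ b ^ c), *h the carries (majority of a, b, c).
static inline void CSA(__m512i* h, __m512i* l, __m512i a, __m512i b, __m512i c)
{
  *l = _mm512_ternarylogic_epi32(c, b, a, 0x96); // three-way XOR
  *h = _mm512_ternarylogic_epi32(c, b, a, 0xe8); // majority
}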
inline void matrix32x8::transpose(square128& output, int x, int y)
{
    for (int j = 0; j < 8; j++)
    {
        int row = _mm256_movemask_epi8(whole);
        whole = _mm256_slli_epi64(whole, 1);
        // _mm_movemask_epi8 uses most significant bit, hence +7-j
        output.words[8*x+7-j][y] = row;
    }
}
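A self-contained sketch of the same idea, outside the matrix32x8 class (hypothetical helper): _mm256_movemask_epi8 samples the most significant bit of every byte, and a 64-bit left shift by one moves the next-lower bit of each byte into that slot. Bits that cross byte boundaries only ever land in the low bit of the next byte, which movemask ignores, so the cheap 64-bit shift is safe here.

#include <immintrin.h>
#include <stdint.h>

// Hypothetical standalone version of the loop above: extract the 8 bit planes
// of 32 bytes, most significant bit first.
static void bit_planes(__m256i v, uint32_t planes[8])
{
    for (int j = 0; j < 8; j++) {
        planes[j] = (uint32_t)_mm256_movemask_epi8(v); // bit (7 - j) of every byte
        v = _mm256_slli_epi64(v, 1);                   // bring the next bit into the MSB slot
    }
}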
static INLINE void quantize(const __m256i *qp, __m256i *c,
                            const int16_t *iscan_ptr, int log_scale,
                            tran_low_t *qcoeff, tran_low_t *dqcoeff,
                            __m256i *eob) {
  const __m256i abs_coeff = _mm256_abs_epi32(*c);
  __m256i q = _mm256_add_epi32(abs_coeff, qp[0]);

  // 32x32-bit multiply split into even/odd 64-bit products, merged back below.
  __m256i q_lo = _mm256_mul_epi32(q, qp[1]);
  __m256i q_hi = _mm256_srli_epi64(q, 32);
  const __m256i qp_hi = _mm256_srli_epi64(qp[1], 32);
  q_hi = _mm256_mul_epi32(q_hi, qp_hi);
  q_lo = _mm256_srli_epi64(q_lo, 16 - log_scale);
  q_hi = _mm256_srli_epi64(q_hi, 16 - log_scale);
  q_hi = _mm256_slli_epi64(q_hi, 32);
  q = _mm256_or_si256(q_lo, q_hi);

  const __m256i abs_s = _mm256_slli_epi32(abs_coeff, 1 + log_scale);
  const __m256i mask = _mm256_cmpgt_epi32(qp[2], abs_s);
  q = _mm256_andnot_si256(mask, q);

  __m256i dq = _mm256_mullo_epi32(q, qp[2]);
  dq = _mm256_srai_epi32(dq, log_scale);
  q = _mm256_sign_epi32(q, *c);
  dq = _mm256_sign_epi32(dq, *c);

  _mm256_storeu_si256((__m256i *)qcoeff, q);
  _mm256_storeu_si256((__m256i *)dqcoeff, dq);

  const __m128i isc = _mm_loadu_si128((const __m128i *)iscan_ptr);
  const __m128i zr = _mm_setzero_si128();
  const __m128i lo = _mm_unpacklo_epi16(isc, zr);
  const __m128i hi = _mm_unpackhi_epi16(isc, zr);
  const __m256i iscan =
      _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);

  // Track end-of-block: maximum of (iscan + 1) over the nonzero coefficients.
  const __m256i zero = _mm256_setzero_si256();
  const __m256i zc = _mm256_cmpeq_epi32(dq, zero);
  const __m256i nz = _mm256_cmpeq_epi32(zc, zero);
  __m256i cur_eob = _mm256_sub_epi32(iscan, nz);
  cur_eob = _mm256_and_si256(cur_eob, nz);
  *eob = _mm256_max_epi32(cur_eob, *eob);
}
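The q_lo/q_hi dance above is the standard way to get a full-width 32-bit multiply on AVX2: _mm256_mul_epi32 only multiplies the even 32-bit lanes, so the odd lanes are shifted down, multiplied separately, and merged back with a 64-bit left shift. A minimal, self-contained sketch of the same pattern (hypothetical helper, unsigned variant that keeps the full high 32 bits for simplicity):

#include <immintrin.h>

// Hypothetical helper: per-lane high 32 bits of an unsigned 32x32-bit product,
// built from even/odd 64-bit products the same way as the quantizer above.
static __m256i mulhi_epu32(__m256i a, __m256i b)
{
  __m256i even = _mm256_mul_epu32(a, b);                       // lanes 0,2,4,6 -> 64-bit products
  __m256i odd  = _mm256_mul_epu32(_mm256_srli_epi64(a, 32),
                                  _mm256_srli_epi64(b, 32));   // lanes 1,3,5,7 -> 64-bit products
  even = _mm256_srli_epi64(even, 32);                          // high halves into the even lanes
  odd  = _mm256_slli_epi64(_mm256_srli_epi64(odd, 32), 32);    // high halves back into the odd lanes
  return _mm256_or_si256(even, odd);
}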
__m256i test_mm256_slli_epi64(__m256i a) {
  // CHECK: @llvm.x86.avx2.pslli.q
  return _mm256_slli_epi64(a, 3);
}
__m256i test_mm256_slli_epi64(__m256i a) {
  // CHECK-LABEL: test_mm256_slli_epi64
  // CHECK: call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %{{.*}}, i32 %{{.*}})
  return _mm256_slli_epi64(a, 3);
}
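Both compiler tests pin _mm256_slli_epi64 to llvm.x86.avx2.pslli.q, the immediate-count form of VPSLLQ. For comparison, AVX2 also provides a form that takes the count from an XMM register and a per-lane variable shift; a small sketch for illustration only:

#include <immintrin.h>

// Sketch of the three AVX2 left shifts on 64-bit lanes.
__m256i shift_q_variants(__m256i v)
{
  __m256i a = _mm256_slli_epi64(v, 3);                             // immediate count (VPSLLQ imm8)
  __m256i b = _mm256_sll_epi64(v, _mm_cvtsi32_si128(3));           // count in the low qword of an XMM register
  __m256i c = _mm256_sllv_epi64(v, _mm256_set_epi64x(3, 2, 1, 0)); // per-lane counts (VPSLLVQ)
  return _mm256_or_si256(_mm256_or_si256(a, b), c);
}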
void nibble_sort_beekman1(uint64_t *buf) {
  // already in the right order
  //__m256i
  // shuf0={0x1716151413121110ULL,0x1f1e1d1c1b1a1918ULL,0x0706050403020100ULL,0x0f0e0d0c0b0a0908ULL};
  __m256i shuf1 = {0x1e161c141a121810ULL, 0x1f171d151b131911ULL,
                   0x0e060c040a020800ULL, 0x0f070d050b030901ULL};
  __m256i shuf2 = {0x1d1c151419181110ULL, 0x1f1e17161b1a1312ULL,
                   0x0d0c050409080100ULL, 0x0f0e07060b0a0302ULL};
  // use less instructions below
  //__m256i
  // shuf3={0x1b1a191813121110ULL,0x1f1e1d1c17161514ULL,0x0b0a090803020100ULL,0x0f0e0d0c07060504ULL};
  __m256i shuf4 = {0x101d171615141311ULL, 0x1f1e1b191a181c12ULL,
                   0x000d070605040301ULL, 0x0f0e0b090a080c02ULL};
  __m256i shuf5 = {0x171d151413111810ULL, 0x1f1e16191c1b1a12ULL,
                   0x070d050403010800ULL, 0x0f0e06090c0b0a02ULL};
  __m256i shuf6 = {0x1e17161a15141211ULL, 0x1f101d1c1b191318ULL,
                   0x0e07060a05040201ULL, 0x0f000d0c0b090308ULL};
  __m256i shuf7 = {0x171510161b131911ULL, 0x1f1d181e1c141a12ULL,
                   0x070500060b030901ULL, 0x0f0d080e0c040a02ULL};
  __m256i shuf8 = {0x1715141613121110ULL, 0x1f1e1c1b1a19181dULL,
                   0x0705040603020100ULL, 0x0f0e0c0b0a09080dULL};
  __m256i shuf9 = {0x171c1b1a19181615ULL, 0x1f1e14131211101dULL,
                   0x070c0b0a09080605ULL, 0x0f0e04030201000dULL};
  __m256i nibblemask = _mm256_set1_epi8(0x0f);
  for (uint32_t i = 0; i < (1024 / 4); i += 1) {
    __m256i r0 = _mm256_loadu_si256(((__m256i *)buf) + i), r1 = r0, r2;
    r0 &= nibblemask;
    r1 ^= r0;
    r1 = _mm256_srli_epi64(r1, 4);
#define sort_and_shuffle(n)                                                    \
  r2 = _mm256_max_epi8(r0, r1);                                                \
  r0 = _mm256_min_epi8(r0, r1);                                                \
  r1 = (__m256i)_mm256_shuffle_pd((__m256d)r0, (__m256d)r2, 0b0000);           \
  r2 = (__m256i)_mm256_shuffle_pd((__m256d)r0, (__m256d)r2, 0b1111);           \
  r1 = _mm256_shuffle_epi8(r1, shuf##n);                                       \
  r2 = _mm256_shuffle_epi8(r2, shuf##n);                                       \
  r0 = (__m256i)_mm256_shuffle_pd((__m256d)r1, (__m256d)r2, 0b0000);           \
  r1 = (__m256i)_mm256_shuffle_pd((__m256d)r1, (__m256d)r2, 0b1111)
    sort_and_shuffle(1);
    sort_and_shuffle(2);
    { // sort_and_shuffle(3);
      r2 = _mm256_max_epi8(r0, r1);
      r0 = _mm256_min_epi8(r0, r1);
      r1 = (__m256i)_mm256_unpacklo_ps((__m256)r0, (__m256)r2);
      r2 = (__m256i)_mm256_unpackhi_ps((__m256)r0, (__m256)r2);
      r0 = (__m256i)_mm256_shuffle_pd((__m256d)r1, (__m256d)r2, 0b1111);
      r1 = (__m256i)_mm256_shuffle_pd((__m256d)r1, (__m256d)r2, 0b0000);
    }
    sort_and_shuffle(4);
    sort_and_shuffle(5);
    sort_and_shuffle(6);
    sort_and_shuffle(7);
    sort_and_shuffle(8);
    sort_and_shuffle(9);
    r1 = _mm256_slli_epi64(r1, 4);
    _mm256_storeu_si256(((__m256i *)buf) + i, r1 | r0);
  }
}
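Stripped of the sorting network, the framing of the loop above is a nibble split and merge: the low nibbles are cleared before the 64-bit right shift so no bits leak across byte boundaries, and the high nibbles are shifted back up by four at the end. A minimal sketch of just that framing (hypothetical helper):

#include <immintrin.h>

// Hypothetical helper showing only the split/merge framing used above.
static __m256i split_merge_nibbles(__m256i v)
{
  const __m256i nibblemask = _mm256_set1_epi8(0x0f);
  __m256i r0 = _mm256_and_si256(v, nibblemask);    // low nibble of every byte
  __m256i r1 = _mm256_andnot_si256(nibblemask, v); // high nibble of every byte, low bits cleared
  r1 = _mm256_srli_epi64(r1, 4);                   // high nibbles down into the low positions
  // ... the sorting network operates on r0/r1 here ...
  return _mm256_or_si256(r0, _mm256_slli_epi64(r1, 4)); // recombine
}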
static FORCE_INLINE void FlowInter_8px_AVX2(
        int w, PixelType *pdst,
        const PixelType *prefB, const PixelType *prefF,
        const int16_t *VXFullB, const int16_t *VXFullF,
        const int16_t *VYFullB, const int16_t *VYFullF,
        const uint8_t *MaskB, const uint8_t *MaskF,
        int nPelLog,
        const __m256i &dwords_time256, const __m256i &dwords_256_time256,
        const __m256i &dwords_ref_pitch, const __m256i &dwords_hoffsets) {
    __m256i dwords_w = _mm256_add_epi32(_mm256_set1_epi32(w << nPelLog), dwords_hoffsets);

    __m256i dstF = lookup_AVX2(VXFullF, VYFullF, prefF, w, dwords_time256, dwords_ref_pitch, dwords_w);
    __m256i dstB = lookup_AVX2(VXFullB, VYFullB, prefB, w, dwords_256_time256, dwords_ref_pitch, dwords_w);

    __m256i dstF0 = _mm256_i32gather_epi32((const int *)prefF, dwords_w, sizeof(PixelType));
    __m256i dstB0 = _mm256_i32gather_epi32((const int *)prefB, dwords_w, sizeof(PixelType));
    dstF0 = _mm256_and_si256(dstF0, _mm256_set1_epi32((1 << (sizeof(PixelType) * 8)) - 1));
    dstB0 = _mm256_and_si256(dstB0, _mm256_set1_epi32((1 << (sizeof(PixelType) * 8)) - 1));

    __m256i maskf = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)&MaskF[w]));
    __m256i maskb = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)&MaskB[w]));

    const __m256i dwords_255 = _mm256_set1_epi32(255);

    __m256i maskf_inv = _mm256_sub_epi32(dwords_255, maskf);
    __m256i maskb_inv = _mm256_sub_epi32(dwords_255, maskb);

    __m256i dstF_maskf_inv, dstB_maskb_inv, dstF0_maskb, dstB0_maskf;

    if (sizeof(PixelType) == 1) {
        dstF_maskf_inv = _mm256_mullo_epi16(dstF, maskf_inv);
        dstB_maskb_inv = _mm256_mullo_epi16(dstB, maskb_inv);
        dstF0_maskb = _mm256_mullo_epi16(dstF0, maskb);
        dstB0_maskf = _mm256_mullo_epi16(dstB0, maskf);
    } else {
        dstF_maskf_inv = _mm256_mullo_epi32(dstF, maskf_inv);
        dstB_maskb_inv = _mm256_mullo_epi32(dstB, maskb_inv);
        dstF0_maskb = _mm256_mullo_epi32(dstF0, maskb);
        dstB0_maskf = _mm256_mullo_epi32(dstB0, maskf);
    }

    __m256i f = _mm256_add_epi32(dstF0_maskb, dstB_maskb_inv);
    __m256i b = _mm256_add_epi32(dstB0_maskf, dstF_maskf_inv);

    if (sizeof(PixelType) == 1) {
        f = _mm256_mullo_epi32(f, maskf);
        b = _mm256_mullo_epi32(b, maskb);

        f = _mm256_add_epi32(f, dwords_255);
        b = _mm256_add_epi32(b, dwords_255);

        f = _mm256_srai_epi32(f, 8);
        b = _mm256_srai_epi32(b, 8);
    } else {
        const __m256i qwords_255 = _mm256_set1_epi64x(255);

        __m256i tempf = _mm256_mul_epu32(f, maskf);
        __m256i tempb = _mm256_mul_epu32(b, maskb);
        tempf = _mm256_add_epi64(tempf, qwords_255);
        tempb = _mm256_add_epi64(tempb, qwords_255);
        tempf = _mm256_srli_epi64(tempf, 8);
        tempb = _mm256_srli_epi64(tempb, 8);

        f = _mm256_srli_epi64(f, 32);
        b = _mm256_srli_epi64(b, 32);
        f = _mm256_mul_epu32(f, _mm256_srli_epi64(maskf, 32));
        b = _mm256_mul_epu32(b, _mm256_srli_epi64(maskb, 32));
        f = _mm256_add_epi64(f, qwords_255);
        b = _mm256_add_epi64(b, qwords_255);
        f = _mm256_srli_epi64(f, 8);
        b = _mm256_srli_epi64(b, 8);

        f = _mm256_or_si256(tempf, _mm256_slli_epi64(f, 32));
        b = _mm256_or_si256(tempb, _mm256_slli_epi64(b, 32));
    }

    f = _mm256_add_epi32(f, dstF_maskf_inv);
    b = _mm256_add_epi32(b, dstB_maskb_inv);

    f = _mm256_add_epi32(f, dwords_255);
    b = _mm256_add_epi32(b, dwords_255);

    f = _mm256_srai_epi32(f, 8);
    b = _mm256_srai_epi32(b, 8);

    if (sizeof(PixelType) == 1) {
        f = _mm256_madd_epi16(f, dwords_256_time256);
        b = _mm256_madd_epi16(b, dwords_time256);
    } else {
        f = _mm256_mullo_epi32(f, dwords_256_time256);
        b = _mm256_mullo_epi32(b, dwords_time256);
    }

    __m256i dst = _mm256_add_epi32(f, b);
    dst = _mm256_srai_epi32(dst, 8);

    dst = _mm256_packus_epi32(dst, dst);
    dst = _mm256_permute4x64_epi64(dst, 0xe8); // 0b11101000 - copy third qword to second qword

    __m128i dst128 = _mm256_castsi256_si128(dst);

    if (sizeof(PixelType) == 1) {
        dst128 = _mm_packus_epi16(dst128, dst128);
        _mm_storel_epi64((__m128i *)&pdst[w], dst128);
    } else {
        _mm_storeu_si128((__m128i *)&pdst[w], dst128);
    }
}
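One detail worth noting in the gathers above: _mm256_i32gather_epi32 always loads 32 bits per index, so for 8- or 16-bit PixelType the code gathers with a scale of sizeof(PixelType) and then masks the result down to the pixel width. A minimal sketch of that idiom for 16-bit pixels (hypothetical helper); it reads a couple of bytes past the last pixel of the buffer, which the caller must account for:

#include <immintrin.h>
#include <stdint.h>

// Hypothetical helper: gather eight 16-bit pixels through the 32-bit gather,
// then mask off the extra high bytes, as done for dstF0/dstB0 above.
static __m256i gather_pixels_u16(const uint16_t *src, __m256i offsets)
{
    __m256i v = _mm256_i32gather_epi32((const int *)src, offsets, sizeof(uint16_t));
    return _mm256_and_si256(v, _mm256_set1_epi32(0xffff)); // keep only the 16-bit pixel
}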
void extern
avx2_test (void)
{
  /* x is presumably a file-scope __m256i in the full test file.  */
  x = _mm256_slli_epi64 (x, 13);
}