template <bool align> void SquaredDifferenceSum(
    const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride,
    size_t width, size_t height, uint64_t * sum)
{
    assert(width < 0x10000);
    if(align)
    {
        assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride));
    }

    size_t bodyWidth = AlignLo(width, A);
    __m256i tailMask = SetMask<uint8_t>(0, A - width + bodyWidth, 0xFF);
    __m256i fullSum = _mm256_setzero_si256();
    for(size_t row = 0; row < height; ++row)
    {
        __m256i rowSum = _mm256_setzero_si256();
        for(size_t col = 0; col < bodyWidth; col += A)
        {
            const __m256i a_ = Load<align>((__m256i*)(a + col));
            const __m256i b_ = Load<align>((__m256i*)(b + col));
            rowSum = _mm256_add_epi32(rowSum, SquaredDifference(a_, b_));
        }
        if(width - bodyWidth)
        {
            const __m256i a_ = _mm256_and_si256(tailMask, Load<false>((__m256i*)(a + width - A)));
            const __m256i b_ = _mm256_and_si256(tailMask, Load<false>((__m256i*)(b + width - A)));
            rowSum = _mm256_add_epi32(rowSum, SquaredDifference(a_, b_));
        }
        fullSum = _mm256_add_epi64(fullSum, HorizontalSum32(rowSum));
        a += aStride;
        b += bStride;
    }
    *sum = ExtractSum<uint64_t>(fullSum);
}
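// Hedged scalar reference for the routine above: it should produce the same
// quantity, the sum over all pixels of (a - b)^2, without the tail-masking
// tricks. Helpers such as SquaredDifference/HorizontalSum32/ExtractSum belong
// to the surrounding library; this sketch assumes only plain C.
static inline void SquaredDifferenceSumScalar(
    const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride,
    size_t width, size_t height, uint64_t *sum)
{
    uint64_t total = 0;
    for (size_t row = 0; row < height; ++row)
    {
        for (size_t col = 0; col < width; ++col)
        {
            int d = (int)a[col] - (int)b[col];   // per-pixel difference
            total += (uint64_t)(d * d);          // accumulate squared difference
        }
        a += aStride;
        b += bStride;
    }
    *sum = total;
}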
static inline void blend_unorm8_argb(struct reg *src, __m256i dst_argb)
{
    if (gt.blend.enable) {
        const __m256i mask = _mm256_set1_epi32(0xff);
        const __m256 scale = _mm256_set1_ps(1.0f / 255.0f);
        struct reg dst[4];

        /* Convert to float */
        dst[2].reg = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(dst_argb, mask)), scale);
        dst_argb = _mm256_srli_epi32(dst_argb, 8);
        dst[1].reg = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(dst_argb, mask)), scale);
        dst_argb = _mm256_srli_epi32(dst_argb, 8);
        dst[0].reg = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(dst_argb, mask)), scale);
        dst_argb = _mm256_srli_epi32(dst_argb, 8);
        dst[3].reg = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(dst_argb, mask)), scale);

        /* Blend, assuming src BLENDFACTOR_SRC_ALPHA, dst
         * BLENDFACTOR_INV_SRC_ALPHA, and BLENDFUNCTION_ADD. */
        const __m256 inv_alpha = _mm256_sub_ps(_mm256_set1_ps(1.0f), src[3].reg);
        src[0].reg = _mm256_add_ps(_mm256_mul_ps(src[3].reg, src[0].reg),
                                   _mm256_mul_ps(inv_alpha, dst[0].reg));
        src[1].reg = _mm256_add_ps(_mm256_mul_ps(src[3].reg, src[1].reg),
                                   _mm256_mul_ps(inv_alpha, dst[1].reg));
        src[2].reg = _mm256_add_ps(_mm256_mul_ps(src[3].reg, src[2].reg),
                                   _mm256_mul_ps(inv_alpha, dst[2].reg));
        src[3].reg = _mm256_add_ps(_mm256_mul_ps(src[3].reg, src[3].reg),
                                   _mm256_mul_ps(inv_alpha, dst[3].reg));
    }
}
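/* Hedged scalar sketch of the blend equation the function above applies to
 * eight pixels at once. gt.blend.enable and struct reg belong to the
 * surrounding driver; this helper assumes only normalized float channels. */
static inline float blend_src_alpha_one_channel(float src, float src_alpha, float dst)
{
    /* BLENDFACTOR_SRC_ALPHA * src + BLENDFACTOR_INV_SRC_ALPHA * dst, BLENDFUNCTION_ADD */
    return src_alpha * src + (1.0f - src_alpha) * dst;
}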
SIMD_INLINE __m256i BgraToGray32(__m256i bgra)
{
    const __m256i g0a0 = _mm256_and_si256(_mm256_srli_si256(bgra, 1), K16_00FF);
    const __m256i b0r0 = _mm256_and_si256(bgra, K16_00FF);
    const __m256i weightedSum = _mm256_add_epi32(
        _mm256_madd_epi16(g0a0, K16_GREEN_0000),
        _mm256_madd_epi16(b0r0, K16_BLUE_RED));
    return _mm256_srli_epi32(_mm256_add_epi32(weightedSum, K32_ROUND_TERM),
        Base::BGR_TO_GRAY_AVERAGING_SHIFT);
}
inline void avx2_positive_hexid_to_ringid_segid_runid(
    const __m256i hexid, __m256i& ringid, __m256i& segid, __m256i& runid)
{
  // ringid = positive_hexid_to_ringid(hexid);
  // unsigned iring = hexid - ringid_to_nsites_contained(ringid-1);
  // segid = int(iring/ringid);
  // runid = iring - segid*ringid;
  const __m256i one = _mm256_set1_epi32(1);
  ringid = avx2_positive_hexid_to_ringid(hexid);
  runid = _mm256_sub_epi32(hexid,
      avx2_ringid_to_nsites_contained(_mm256_sub_epi32(ringid, one)));
  segid = _mm256_setzero_si256();

  const __m256i ringid_minus_one = _mm256_sub_epi32(ringid, one);

  __m256i mask = _mm256_cmpgt_epi32(runid, ringid_minus_one);
  runid = _mm256_sub_epi32(runid, _mm256_and_si256(mask, ringid));
  segid = _mm256_add_epi32(segid, _mm256_and_si256(mask, one));

  mask = _mm256_cmpgt_epi32(runid, ringid_minus_one);
  runid = _mm256_sub_epi32(runid, _mm256_and_si256(mask, ringid));
  segid = _mm256_add_epi32(segid, _mm256_and_si256(mask, one));

  mask = _mm256_cmpgt_epi32(runid, ringid_minus_one);
  runid = _mm256_sub_epi32(runid, _mm256_and_si256(mask, ringid));
  segid = _mm256_add_epi32(segid, _mm256_and_si256(mask, one));

  mask = _mm256_cmpgt_epi32(runid, ringid_minus_one);
  runid = _mm256_sub_epi32(runid, _mm256_and_si256(mask, ringid));
  segid = _mm256_add_epi32(segid, _mm256_and_si256(mask, one));

  mask = _mm256_cmpgt_epi32(runid, ringid_minus_one);
  runid = _mm256_sub_epi32(runid, _mm256_and_si256(mask, ringid));
  segid = _mm256_add_epi32(segid, _mm256_and_si256(mask, one));
}
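// Hedged scalar sketch of why the five unrolled steps above suffice: each ring
// of a hexagonal grid splits into six segments, so segid never exceeds 5 and
// the division segid = iring / ringid can be replaced by at most five
// conditional subtractions. ringid would come from the scalar counterpart of
// avx2_positive_hexid_to_ringid; the closed form for the contained site count
// is an assumption about the usual hex-grid layout.
inline void scalar_positive_hexid_to_ringid_segid_runid(
    unsigned hexid, unsigned ringid, unsigned& segid, unsigned& runid)
{
  unsigned nsites_inner = 1 + 3*(ringid - 1)*ringid;  // sites in rings 0..ringid-1 (assumed formula)
  unsigned iring = hexid - nsites_inner;              // position within the ring
  segid = 0;
  runid = iring;
  while (runid >= ringid) {                           // executes at most 5 times
    runid -= ringid;
    ++segid;
  }
}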
void mulrc16_shuffle_avx2(uint8_t *region, uint8_t constant, size_t length)
{
    uint8_t *end;
    register __m256i in, out, t1, t2, m1, m2, l, h;
    register __m128i bc;

    if (constant == 0) {
        memset(region, 0, length);
        return;
    }

    if (constant == 1)
        return;

    bc = _mm_load_si128((void *)tl[constant]);
    t1 = __builtin_ia32_vbroadcastsi256(bc);
    bc = _mm_load_si128((void *)th[constant]);
    t2 = __builtin_ia32_vbroadcastsi256(bc);
    m1 = _mm256_set1_epi8(0x0f);
    m2 = _mm256_set1_epi8(0xf0);

    for (end=region+length; region<end; region+=32) {
        in = _mm256_load_si256((void *)region);
        l = _mm256_and_si256(in, m1);
        l = _mm256_shuffle_epi8(t1, l);
        h = _mm256_and_si256(in, m2);
        h = _mm256_srli_epi64(h, 4);
        h = _mm256_shuffle_epi8(t2, h);
        out = _mm256_xor_si256(h, l);
        _mm256_store_si256((void *)region, out);
    }
}
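/* Hedged portability note for the two table broadcasts above:
 * __builtin_ia32_vbroadcastsi256 is a GCC-specific builtin. On compilers that
 * only expose the standard AVX2 intrinsics, the same 128-bit-to-256-bit
 * broadcast of a nibble lookup table (tl/th above) can be written like this;
 * treat it as a sketch, not part of the original library. */
static inline __m256i broadcast_nibble_table(const void *table128)
{
    /* load one 16-byte shuffle table and replicate it into both 128-bit lanes */
    __m128i bc = _mm_loadu_si128((const __m128i *)table128);
    return _mm256_broadcastsi128_si256(bc);
}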
inline avx_m256_t newsin_ps(avx_m256_t x)
{
    avx_m256_t sign_bit = _mm256_and_ps(x, _ps_sign_mask);
    x = _mm256_and_ps(x, _ps_inv_sign_mask);

    avx_m256_t y = _mm256_mul_ps(x, _ps_cephes_FOPI);

    avx_m256i_t emm2 = _mm256_cvttps_epi32(y);
    emm2 = _mm256_add_epi32(emm2, _pi32_1);
    emm2 = _mm256_and_si256(emm2, _pi32_inv1);
    y = _mm256_cvtepi32_ps(emm2);

    avx_m256i_t emm0 = _mm256_and_si256(emm2, _pi32_4);
    emm0 = _mm256_slli_epi32(emm0, 29);

    emm2 = _mm256_and_si256(emm2, _pi32_2);
    emm2 = _mm256_cmpeq_epi32(emm2, _mm256_setzero_si256());

    avx_m256_t swap_sign_bit = _mm256_castsi256_ps(emm0);
    avx_m256_t poly_mask = _mm256_castsi256_ps(emm2);
    sign_bit = _mm256_xor_ps(sign_bit, swap_sign_bit);

    avx_m256_t temp = _ps_minus_cephes_DP123;
    temp = _mm256_mul_ps(y, temp);
    x = _mm256_add_ps(x, temp);

    avx_m256_t x2 = _mm256_mul_ps(x, x);
    avx_m256_t x3 = _mm256_mul_ps(x2, x);
    avx_m256_t x4 = _mm256_mul_ps(x2, x2);

    y = _ps_coscof_p0;
    avx_m256_t y2 = _ps_sincof_p0;
    y = _mm256_mul_ps(y, x2);
    y2 = _mm256_mul_ps(y2, x2);
    y = _mm256_add_ps(y, _ps_coscof_p1);
    y2 = _mm256_add_ps(y2, _ps_sincof_p1);
    y = _mm256_mul_ps(y, x2);
    y2 = _mm256_mul_ps(y2, x2);
    y = _mm256_add_ps(y, _ps_coscof_p2);
    y2 = _mm256_add_ps(y2, _ps_sincof_p2);
    y = _mm256_mul_ps(y, x4);
    y2 = _mm256_mul_ps(y2, x3);

    temp = _mm256_mul_ps(x2, _ps_0p5);
    temp = _mm256_sub_ps(temp, _ps_1);
    y = _mm256_sub_ps(y, temp);
    y2 = _mm256_add_ps(y2, x);

    y = _mm256_andnot_ps(poly_mask, y);
    y2 = _mm256_and_ps(poly_mask, y2);
    y = _mm256_add_ps(y, y2);

    y = _mm256_xor_ps(y, sign_bit);

    return y;
} // newsin_ps()
template <bool align> SIMD_INLINE void Bgr48pToBgra32(uint8_t * bgra,
    const uint8_t * blue, const uint8_t * green, const uint8_t * red, size_t offset, __m256i alpha)
{
    __m256i _blue = _mm256_and_si256(LoadPermuted<align>((__m256i*)(blue + offset)), K16_00FF);
    __m256i _green = _mm256_and_si256(LoadPermuted<align>((__m256i*)(green + offset)), K16_00FF);
    __m256i _red = _mm256_and_si256(LoadPermuted<align>((__m256i*)(red + offset)), K16_00FF);

    __m256i bg = _mm256_or_si256(_blue, _mm256_slli_si256(_green, 1));
    __m256i ra = _mm256_or_si256(_red, alpha);

    Store<align>((__m256i*)bgra + 0, _mm256_unpacklo_epi16(bg, ra));
    Store<align>((__m256i*)bgra + 1, _mm256_unpackhi_epi16(bg, ra));
}
template <bool align> SIMD_INLINE void EdgeBackgroundShiftRangeMasked(const uint8_t * value,
    uint8_t * background, const uint8_t * mask, size_t offset)
{
    const __m256i _value = Load<align>((__m256i*)(value + offset));
    const __m256i _background = Load<align>((__m256i*)(background + offset));
    const __m256i _mask = Load<align>((const __m256i*)(mask + offset));
    Store<align>((__m256i*)(background + offset),
        _mm256_or_si256(_mm256_and_si256(_mask, _value), _mm256_andnot_si256(_mask, _background)));
}
template <bool align> SIMD_INLINE void EdgeBackgroundGrowRangeSlow(const uint8_t * value,
    uint8_t * background, __m256i tailMask)
{
    const __m256i _value = Load<align>((__m256i*)value);
    const __m256i _background = Load<align>((__m256i*)background);
    const __m256i inc = _mm256_and_si256(tailMask, Greater8u(_value, _background));
    Store<align>((__m256i*)background, _mm256_adds_epu8(_background, inc));
}
static inline bool avxcontains(hashset_t * set, uint64_t target)
{
    __m256i vtarget = _mm256_set1_epi64x(target);
    __m256i vlocation = _mm256_and_si256(avxhash(vtarget, set->vmultiplier), set->sizemask);
    __m256i svalue = _mm256_i64gather_epi64((const long long int *) set->data, vlocation, 8);
    __m256i eq = _mm256_cmpeq_epi64(vtarget, svalue);
    return _mm256_testz_si256(eq, eq) == 0;
}
void maddrc16_imul_avx2(uint8_t* region1, const uint8_t* region2, uint8_t constant, size_t length)
{
    uint8_t *end;
    register __m256i reg1, reg2, ri[4], sp[4], mi[4];
    const uint8_t *p = pt[constant];

    if (constant == 0)
        return;

    if (constant == 1) {
        xorr_avx2(region1, region2, length);
        return;
    }

    mi[0] = _mm256_set1_epi8(0x11);
    mi[1] = _mm256_set1_epi8(0x22);
    mi[2] = _mm256_set1_epi8(0x44);
    mi[3] = _mm256_set1_epi8(0x88);
    sp[0] = _mm256_set1_epi16(p[0]);
    sp[1] = _mm256_set1_epi16(p[1]);
    sp[2] = _mm256_set1_epi16(p[2]);
    sp[3] = _mm256_set1_epi16(p[3]);

    for (end=region1+length; region1<end; region1+=32, region2+=32) {
        reg2 = _mm256_load_si256((void *)region2);
        reg1 = _mm256_load_si256((void *)region1);
        ri[0] = _mm256_and_si256(reg2, mi[0]);
        ri[1] = _mm256_and_si256(reg2, mi[1]);
        ri[2] = _mm256_and_si256(reg2, mi[2]);
        ri[3] = _mm256_and_si256(reg2, mi[3]);
        ri[1] = _mm256_srli_epi16(ri[1], 1);
        ri[2] = _mm256_srli_epi16(ri[2], 2);
        ri[3] = _mm256_srli_epi16(ri[3], 3);
        ri[0] = _mm256_mullo_epi16(ri[0], sp[0]);
        ri[1] = _mm256_mullo_epi16(ri[1], sp[1]);
        ri[2] = _mm256_mullo_epi16(ri[2], sp[2]);
        ri[3] = _mm256_mullo_epi16(ri[3], sp[3]);
        ri[0] = _mm256_xor_si256(ri[0], ri[1]);
        ri[2] = _mm256_xor_si256(ri[2], ri[3]);
        ri[0] = _mm256_xor_si256(ri[0], ri[2]);
        ri[0] = _mm256_xor_si256(ri[0], reg1);
        _mm256_store_si256((void *)region1, ri[0]);
    }
}
int main() {
    const ssize_t A = 3;
    const size_t Awidth = 2;
    const size_t Dwidth = 4;
    const ssize_t Dmin = (-1) * (1ll << (Dwidth - 1));
    const ssize_t Dmax = (1ll << (Dwidth - 1)) - 1;
    const ssize_t Cwidth = Awidth + Dwidth;
    const ssize_t AInv = ext_euklidean(A, Cwidth) & ((1ll << Cwidth) - 1);
    const size_t numCodewords = (1ull << Cwidth);
    std::cout << "numCodewords: " << numCodewords << std::endl;

    const size_t numMasks = numCodewords / (sizeof(int) * 4); // How many masks will we generate?
    int * pNonCodewordMasks = new int[numMasks];
    const int16_t c = ~((1ll << (Cwidth - 1)) - 1);
    std::cout << "c = 0x" << std::hex << c << std::dec << std::endl;
    for (ssize_t i = 0, cw = c, posMask = 0; i < numCodewords; ++posMask) {
        int tmpMask = 0;
        for (ssize_t k = 0; k < 16; ++k, ++cw, ++i) {
            if ((cw % A) != 0) { // we want the non-codewords
                // std::cout << "cw % A != 0: " << cw << std::endl;
                tmpMask |= (1ll << (k * 2)) | (1ll << (k * 2 + 1)); // expand to 32 bits, because AVX2 cannot movemask across lanes to 16 bits
            }
        }
        pNonCodewordMasks[posMask] = tmpMask;
    }
    std::cout << "numMasks: " << numMasks << std::endl;
    std::cout << "non-codeword-masks: 0x" << std::hex << std::setfill('0');
    for (size_t posMask = 0; posMask < numMasks; ++posMask) {
        std::cout << std::setw(8) << pNonCodewordMasks[posMask] << ':';
    }
    std::cout << std::dec << std::endl << std::setfill(' ');

    auto mmCodewords = _mm256_set_epi16(c+15, c+14, c+13, c+12, c+11, c+10, c+9, c+8,
                                        c+7, c+6, c+5, c+4, c+3, c+2, c+1, c);
    auto mmAddUp = _mm256_set1_epi16(16);
    auto mmAinv = _mm256_set1_epi16(AInv);
    auto mmDmin = _mm256_set1_epi16(Dmin);
    auto mmDmax = _mm256_set1_epi16(Dmax);
    const size_t posEnd = (1ull << Cwidth);
    __m256i mmFillUp[] = {_mm256_set1_epi16(0), _mm256_set1_epi16(~((1ll << Cwidth) - 1))}; // fill up all non-codeword bits with 1's if necessary
    std::cout << "posEnd = 0x" << std::hex << posEnd << std::dec << std::endl;
    std::cout << std::setfill('0') << std::hex;
    for(size_t pos = 15, posMask = 0; pos < posEnd; pos += 16, ++posMask) {
        auto isNeg = 0x1 & _mm256_movemask_epi8(_mm256_cmpgt_epi16(mmFillUp[0], mmCodewords));
        auto mm1 = _mm256_or_si256(_mm256_mullo_epi16(mmCodewords, mmAinv), mmFillUp[isNeg]);
        auto mm2 = _mm256_cmpgt_epi16(mm1, mmDmin);
        auto mm3 = _mm256_cmpgt_epi16(mmDmax, mm1);
        auto mm4 = _mm256_cmpeq_epi16(mmDmax, mm1);
        auto mm5 = _mm256_or_si256(mm3, mm4);
        auto mm6 = _mm256_and_si256(mm2, mm5);
        auto mask = _mm256_movemask_epi8(mm6);
        if (mask & pNonCodewordMasks[posMask]) {
            std::cout << "BAD @0x" << std::setw((Cwidth + 7) / 8) << pos << ": 0x" << mask
                      << " & 0x" << pNonCodewordMasks[posMask]
                      << " = 0x" << (mask & pNonCodewordMasks[posMask]) << std::endl;
        } else {
            std::cout << "OK @0x" << std::setw((Cwidth + 7) / 8) << pos << ": 0x" << mask
                      << " & 0x" << pNonCodewordMasks[posMask]
                      << " = 0x" << (mask & pNonCodewordMasks[posMask]) << std::endl;
        }
        mmCodewords = _mm256_add_epi16(mmCodewords, mmAddUp);
    }
    std::cout << std::setfill(' ') << std::dec;
}
SIMD_INLINE void MaskSrc(const uint8_t * src, const uint8_t * mask, const __m256i & index, ptrdiff_t offset, uint16_t * dst)
{
    const __m256i _src = Load<srcAlign>((__m256i*)(src + offset));
    const __m256i _mask = _mm256_and_si256(
        _mm256_cmpeq_epi8(Load<srcAlign>((__m256i*)(mask + offset)), index), K8_01);

    __m256i lo = _mm256_mullo_epi16(_mm256_add_epi16(K16_0008, UnpackU8<0>(_src)), UnpackU8<0>(_mask));
    __m256i hi = _mm256_mullo_epi16(_mm256_add_epi16(K16_0008, UnpackU8<1>(_src)), UnpackU8<1>(_mask));

    Store<dstAlign>((__m256i*)(dst + offset) + 0, _mm256_permute2x128_si256(lo, hi, 0x20));
    Store<dstAlign>((__m256i*)(dst + offset) + 1, _mm256_permute2x128_si256(lo, hi, 0x31));
}
static inline __m256i enc_reshuffle (__m256i in)
{
    // Spread out 32-bit words over both halves of the input register:
    in = _mm256_permutevar8x32_epi32(in, _mm256_setr_epi32(
        0, 1, 2, -1,
        3, 4, 5, -1));

    // Slice into 32-bit chunks and operate on all chunks in parallel.
    // All processing is done within the 32-bit chunk. First, shuffle:
    // before: [eeeeeeff|ccdddddd|bbbbcccc|aaaaaabb]
    // after:  [00000000|aaaaaabb|bbbbcccc|ccdddddd]
    in = _mm256_shuffle_epi8(in, _mm256_set_epi8(
        -1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2,
        -1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2));

    // cd      = [00000000|00000000|0000cccc|ccdddddd]
    const __m256i cd = _mm256_and_si256(in, _mm256_set1_epi32(0x00000FFF));

    // ab      = [0000aaaa|aabbbbbb|00000000|00000000]
    const __m256i ab = _mm256_and_si256(_mm256_slli_epi32(in, 4), _mm256_set1_epi32(0x0FFF0000));

    // merged  = [0000aaaa|aabbbbbb|0000cccc|ccdddddd]
    const __m256i merged = _mm256_or_si256(ab, cd);

    // bd      = [00000000|00bbbbbb|00000000|00dddddd]
    const __m256i bd = _mm256_and_si256(merged, _mm256_set1_epi32(0x003F003F));

    // ac      = [00aaaaaa|00000000|00cccccc|00000000]
    const __m256i ac = _mm256_and_si256(_mm256_slli_epi32(merged, 2), _mm256_set1_epi32(0x3F003F00));

    // indices = [00aaaaaa|00bbbbbb|00cccccc|00dddddd]
    const __m256i indices = _mm256_or_si256(ac, bd);

    // return  = [00dddddd|00cccccc|00bbbbbb|00aaaaaa]
    return _mm256_bswap_epi32(indices);
}
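// Hedged scalar sketch of what the reshuffle above prepares: every 3 input
// bytes become four 6-bit indices, one per output byte. The vector code does
// this for 24 bytes at once; turning the indices into ASCII happens in a later
// step of the encoder that is not shown here.
static inline void split_3bytes_to_4_indices(const uint8_t in[3], uint8_t idx[4])
{
    idx[0] = in[0] >> 2;                                       // aaaaaa
    idx[1] = (uint8_t)(((in[0] & 0x03) << 4) | (in[1] >> 4));  // bbbbbb
    idx[2] = (uint8_t)(((in[1] & 0x0F) << 2) | (in[2] >> 6));  // cccccc
    idx[3] = in[2] & 0x3F;                                     // dddddd
}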
// 32bpp optimized for 8-bit ARGB/RGBA. rmask should be 0x00FF,0x00FF,... etc
static inline __m256i stretchblt_line_bilinear_pixel_blend_avx_argb8(const __m256i cur, const __m256i nxt, const __m256i mul, const __m256i rmask) {
    __m256i rc, gc;
    __m256i rn, gn;
    __m256i d, sum;

    rc = _mm256_and_si256(cur, rmask);
    gc = _mm256_and_si256(_mm256_srli_epi16(cur, 8), rmask);

    rn = _mm256_and_si256(nxt, rmask);
    gn = _mm256_and_si256(_mm256_srli_epi16(nxt, 8), rmask);

    d = _mm256_sub_epi16(rn, rc);
    sum = _mm256_add_epi16(rc, _mm256_mulhi_epi16(_mm256_add_epi16(d, d), mul));
    d = _mm256_sub_epi16(gn, gc);
    sum = _mm256_add_epi16(_mm256_slli_epi16(_mm256_add_epi16(gc, _mm256_mulhi_epi16(_mm256_add_epi16(d, d), mul)), 8), sum);

    return sum;
}
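// Hedged scalar sketch of the per-channel arithmetic used above (and in the
// 16bpp variant further down): mulhi(2*d, mul) takes the high 16 bits of a
// 16x16 product, i.e. roughly (d * mul) / 32768, so each channel is linearly
// interpolated between the current and next pixel with a 0.15 fixed-point
// weight. This helper is an illustration, not part of the original code.
static inline uint16_t bilinear_blend_channel(uint16_t cur, uint16_t nxt, int32_t mul /* 0..0x7FFF */)
{
    int32_t d = (int32_t)nxt - (int32_t)cur;
    return (uint16_t)(cur + ((2 * d * mul) >> 16));   // cur + d * (mul / 32768), approximately
}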
template <bool align> SIMD_INLINE void EdgeBackgroundIncrementCount(const uint8_t * value,
    const uint8_t * backgroundValue, uint8_t * backgroundCount, size_t offset, __m256i tailMask)
{
    const __m256i _value = Load<align>((__m256i*)(value + offset));
    const __m256i _backgroundValue = Load<align>((__m256i*)(backgroundValue + offset));
    const __m256i _backgroundCount = Load<align>((__m256i*)(backgroundCount + offset));

    const __m256i inc = _mm256_and_si256(tailMask, Greater8u(_value, _backgroundValue));

    Store<align>((__m256i*)(backgroundCount + offset), _mm256_adds_epu8(_backgroundCount, inc));
}
int main(int, char**)
{
    /* AVX */
    _mm256_zeroall();
    __m256i a = _mm256_setzero_si256();

    /* AVX2 */
    __m256i b = _mm256_and_si256(a, a);
    __m256i result = _mm256_add_epi8(a, b);
    (void)result;
    return 0;
}
uint64_t avx2_count_byte_popcount(const uint8_t* data, size_t size, uint8_t byte)
{
    const __m256i v = _mm256_set1_epi8(byte);

    const uint8_t* end = data + size;
    const uint8_t* ptr = data;

    uint64_t result = 0;

    // 1. blocks of 8 registers
    while (ptr + 8*32 < end) {
        const __m256i eq0 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 0*32)));
        const __m256i eq1 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 1*32)));
        const __m256i eq2 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 2*32)));
        const __m256i eq3 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 3*32)));
        const __m256i eq4 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 4*32)));
        const __m256i eq5 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 5*32)));
        const __m256i eq6 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 6*32)));
        const __m256i eq7 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 7*32)));

        const __m256i eq0bit = _mm256_and_si256(eq0, _mm256_set1_epi8(0x01));
        const __m256i eq1bit = _mm256_and_si256(eq1, _mm256_set1_epi8(0x02));
        const __m256i eq2bit = _mm256_and_si256(eq2, _mm256_set1_epi8(0x04));
        const __m256i eq3bit = _mm256_and_si256(eq3, _mm256_set1_epi8(0x08));
        const __m256i eq4bit = _mm256_and_si256(eq4, _mm256_set1_epi8(0x10));
        const __m256i eq5bit = _mm256_and_si256(eq5, _mm256_set1_epi8(0x20));
        const __m256i eq6bit = _mm256_and_si256(eq6, _mm256_set1_epi8(0x40));
        const __m256i eq7bit = _mm256_and_si256(eq7, _mm256_set1_epi8(int8_t(0x80)));

        const __m256i m01    = _mm256_or_si256(eq0bit, eq1bit);
        const __m256i m23    = _mm256_or_si256(eq2bit, eq3bit);
        const __m256i m45    = _mm256_or_si256(eq4bit, eq5bit);
        const __m256i m67    = _mm256_or_si256(eq6bit, eq7bit);
        const __m256i m0123  = _mm256_or_si256(m01, m23);
        const __m256i m4567  = _mm256_or_si256(m45, m67);
        const __m256i merged = _mm256_or_si256(m0123, m4567);

        result += __builtin_popcountll(_mm256_extract_epi64(merged, 0));
        result += __builtin_popcountll(_mm256_extract_epi64(merged, 1));
        result += __builtin_popcountll(_mm256_extract_epi64(merged, 2));
        result += __builtin_popcountll(_mm256_extract_epi64(merged, 3));

        ptr += 8 * 32;
    }

    return result + scalar_count_bytes(ptr, end - ptr, byte);
}
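// The routine above hands the remaining tail bytes to scalar_count_bytes,
// which is not shown here. A minimal sketch of what such a tail handler is
// expected to do, assuming this exact name and signature:
static uint64_t scalar_count_bytes(const uint8_t* data, size_t size, uint8_t byte)
{
    uint64_t result = 0;
    for (size_t i = 0; i < size; i++)
        result += (data[i] == byte);   // count matching bytes one at a time
    return result;
}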
static inline __m256i dec_reshuffle (__m256i in)
{
    // Shuffle bytes to 32-bit bigendian:
    in = _mm256_bswap_epi32(in);

    // Mask in a single byte per shift:
    __m256i mask = _mm256_set1_epi32(0x3F000000);

    // Pack bytes together:
    __m256i out = _mm256_slli_epi32(_mm256_and_si256(in, mask), 2);
    mask = _mm256_srli_epi32(mask, 8);

    out = _mm256_or_si256(out, _mm256_slli_epi32(_mm256_and_si256(in, mask), 4));
    mask = _mm256_srli_epi32(mask, 8);

    out = _mm256_or_si256(out, _mm256_slli_epi32(_mm256_and_si256(in, mask), 6));
    mask = _mm256_srli_epi32(mask, 8);

    out = _mm256_or_si256(out, _mm256_slli_epi32(_mm256_and_si256(in, mask), 8));

    // Pack bytes together within 32-bit words, discarding words 3 and 7:
    out = _mm256_shuffle_epi8(out, _mm256_setr_epi8(
        3, 2, 1, 7, 6, 5, 11, 10, 9, 15, 14, 13, -1, -1, -1, -1,
        3, 2, 1, 7, 6, 5, 11, 10, 9, 15, 14, 13, -1, -1, -1, -1));

    // Pack 32-bit words together, squashing empty words 3 and 7:
    return _mm256_permutevar8x32_epi32(out, _mm256_setr_epi32(
        0, 1, 2, 4, 5, 6, -1, -1));
}
// credit: Harold Aptroot
uint32_t maskedvectorsum(uint32_t * z, uint32_t N, uint32_t * accesses, uint32_t nmbr) {
    __m256i Nvec = _mm256_set1_epi32(N - 1);
    __m256i sum = _mm256_setzero_si256();
    for(uint32_t j = 0; j < nmbr; j += 8) {
        __m256i indexes = _mm256_loadu_si256((__m256i*)(accesses + j));
        indexes = _mm256_and_si256(indexes, Nvec);
        __m256i fi = _mm256_i32gather_epi32((int*)z, indexes, 4);
        sum = _mm256_add_epi32(sum, fi);
    }
    __m128i sum128 = _mm_add_epi32(_mm256_extracti128_si256(sum, 0), _mm256_extracti128_si256(sum, 1));
    sum128 = _mm_hadd_epi32(sum128, sum128);
    return _mm_extract_epi32(sum128, 0) + _mm_extract_epi32(sum128, 1);
}
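// Hedged scalar reference for maskedvectorsum above: it should return the same
// (wrapping 32-bit) sum, assuming N is a power of two so that `& (N - 1)`
// keeps every index in bounds, and handles any nmbr rather than multiples of 8.
uint32_t maskedscalarsum(uint32_t * z, uint32_t N, uint32_t * accesses, uint32_t nmbr) {
    uint32_t sum = 0;
    for (uint32_t j = 0; j < nmbr; j++)
        sum += z[accesses[j] & (N - 1)];   // mask the index, then load and add
    return sum;
}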
static __m256i avx2_popcount(const __m256i vec) {

    const __m256i lookup = _mm256_setr_epi8(
        /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4,

        /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4
    );

    const __m256i low_mask = _mm256_set1_epi8(0x0f);

    const __m256i lo = _mm256_and_si256(vec, low_mask);
    const __m256i hi = _mm256_and_si256(_mm256_srli_epi16(vec, 4), low_mask);
    const __m256i popcnt1 = _mm256_shuffle_epi8(lookup, lo);
    const __m256i popcnt2 = _mm256_shuffle_epi8(lookup, hi);

    return _mm256_add_epi8(popcnt1, popcnt2);
}
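// Possible follow-up to avx2_popcount above: its result holds one per-byte
// population count in each of the 32 byte lanes. A common way to reduce that
// to a single total is _mm256_sad_epu8 against zero, which sums each group of
// 8 bytes into a 64-bit lane. This helper is a sketch, not part of the
// original code.
static uint64_t avx2_popcount_total(const __m256i vec)
{
    const __m256i bytes = avx2_popcount(vec);
    const __m256i sums  = _mm256_sad_epu8(bytes, _mm256_setzero_si256());
    return (uint64_t)_mm256_extract_epi64(sums, 0)
         + (uint64_t)_mm256_extract_epi64(sums, 1)
         + (uint64_t)_mm256_extract_epi64(sums, 2)
         + (uint64_t)_mm256_extract_epi64(sums, 3);
}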
int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw)
{
    const int AB_BITS = MAX(10, (int)INTER_BITS);
    int x1 = 0;
    __m256i fxy_mask = _mm256_set1_epi32(INTER_TAB_SIZE - 1);
    __m256i XX = _mm256_set1_epi32(X0), YY = _mm256_set1_epi32(Y0);
    for (; x1 <= bw - 16; x1 += 16)
    {
        __m256i tx0, tx1, ty0, ty1;
        tx0 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(adelta + x1)), XX);
        ty0 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(bdelta + x1)), YY);
        tx1 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(adelta + x1 + 8)), XX);
        ty1 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(bdelta + x1 + 8)), YY);

        tx0 = _mm256_srai_epi32(tx0, AB_BITS - INTER_BITS);
        ty0 = _mm256_srai_epi32(ty0, AB_BITS - INTER_BITS);
        tx1 = _mm256_srai_epi32(tx1, AB_BITS - INTER_BITS);
        ty1 = _mm256_srai_epi32(ty1, AB_BITS - INTER_BITS);

        __m256i fx_ = _mm256_packs_epi32(_mm256_and_si256(tx0, fxy_mask),
                                         _mm256_and_si256(tx1, fxy_mask));
        __m256i fy_ = _mm256_packs_epi32(_mm256_and_si256(ty0, fxy_mask),
                                         _mm256_and_si256(ty1, fxy_mask));
        tx0 = _mm256_packs_epi32(_mm256_srai_epi32(tx0, INTER_BITS),
                                 _mm256_srai_epi32(tx1, INTER_BITS));
        ty0 = _mm256_packs_epi32(_mm256_srai_epi32(ty0, INTER_BITS),
                                 _mm256_srai_epi32(ty1, INTER_BITS));
        fx_ = _mm256_adds_epi16(fx_, _mm256_slli_epi16(fy_, INTER_BITS));
        fx_ = _mm256_permute4x64_epi64(fx_, (3 << 6) + (1 << 4) + (2 << 2) + 0);

        _mm256_storeu_si256((__m256i*)(xy + x1 * 2), _mm256_unpacklo_epi16(tx0, ty0));
        _mm256_storeu_si256((__m256i*)(xy + x1 * 2 + 16), _mm256_unpackhi_epi16(tx0, ty0));
        _mm256_storeu_si256((__m256i*)(alpha + x1), fx_);
    }
    _mm256_zeroupper();
    return x1;
}
// Compare rank with all values currently in the queue. Returns -1 if the value already exists
// or is larger than all values.
// Otherwise, returns the index of the register in which the value should be inserted.
// Mask is replicated to both lanes, so it can be used for both the value and the rank lane.
int PriorityQueue_AVX2::compare(__m256i mrank, int &field, __m256i &gtmask)
{
    static const __m256i eq4mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
    __m256i eq, eq4;
    int reg, mask;

    // Because items are sorted in ascending order within each (double) register, the mask after GT
    // comparison must be of the form 000...1111, which is one less than a power of two.
    {
        __m256i r0_7 = _mm256_permute2x128_si256(_rv[1], _rv[0], 0x20);      // [0 .. 7]
        gtmask = _mm256_cmpgt_epi32(r0_7, mrank);
        mask = _mm256_movemask_ps(_mm256_castsi256_ps(gtmask));
        eq = _mm256_cmpeq_epi32(r0_7, mrank);
        _ASSERTE(((mask + 1) & mask) == 0);
        reg = 1;
    }
    if (!mask) {
        __m256i r8_15 = _mm256_permute2x128_si256(_rv[3], _rv[2], 0x20);     // [8 .. 15]
        gtmask = _mm256_cmpgt_epi32(r8_15, mrank);
        mask = _mm256_movemask_ps(_mm256_castsi256_ps(gtmask));
        eq = _mm256_or_si256(eq, _mm256_cmpeq_epi32(r8_15, mrank));
        _ASSERTE(((mask + 1) & mask) == 0);
        reg = 3;
    }
    if (!mask) {
        gtmask = _mm256_cmpgt_epi32(_rv[4], mrank);                          // [16 .. 19]; don't care about value
        eq4 = _mm256_and_si256(eq4mask, _mm256_cmpeq_epi32(mrank, _rv[4]));  // .. ditto
        mask = _mm256_movemask_ps(_mm256_castsi256_ps(gtmask)) & 0xF;        // ignore comparison with values
        eq = _mm256_or_si256(eq, eq4);
        _ASSERTE(((mask + 1) & mask) == 0);
        reg = 4;
    }

    if (_mm256_movemask_ps(_mm256_castsi256_ps(eq)) != 0)
        mask = 0;
    if (!mask)
        return -1;

    // Adjust register according to mask (higher 128 bits in a double register: one register lower).
    // There is no "previous" register to test against for equality if we need to insert in the
    // very first register. Also duplicate the same mask to both lanes.
    if (mask > 0xF) {
        mask >>= 4;
        --reg;
        gtmask = _mm256_permute2x128_si256(gtmask, gtmask, 0x11);            // replicate high lane to both
    }
void maddrc16_shuffle_avx2(uint8_t* region1, const uint8_t* region2, uint8_t constant, size_t length)
{
    uint8_t *end;
    register __m256i in1, in2, out, t1, t2, m1, m2, l, h;
    register __m128i bc;

    if (constant == 0)
        return;

    if (constant == 1) {
        xorr_avx2(region1, region2, length);
        return;
    }

    bc = _mm_load_si128((void *)tl[constant]);
    t1 = __builtin_ia32_vbroadcastsi256(bc);
    bc = _mm_load_si128((void *)th[constant]);
    t2 = __builtin_ia32_vbroadcastsi256(bc);
    m1 = _mm256_set1_epi8(0x0f);
    m2 = _mm256_set1_epi8(0xf0);

    for (end=region1+length; region1<end; region1+=32, region2+=32) {
        in2 = _mm256_load_si256((void *)region2);
        in1 = _mm256_load_si256((void *)region1);
        l = _mm256_and_si256(in2, m1);
        l = _mm256_shuffle_epi8(t1, l);
        h = _mm256_and_si256(in2, m2);
        h = _mm256_srli_epi64(h, 4);
        h = _mm256_shuffle_epi8(t2, h);
        out = _mm256_xor_si256(h, l);
        out = _mm256_xor_si256(out, in1);
        _mm256_store_si256((void *)region1, out);
    }
}
static FORCE_INLINE __m256i lookup_double_AVX2(const int16_t *VXFull, const int16_t *VYFull, const PixelType *pref, int w, const __m256i &dwords_ref_pitch, const __m256i &dwords_hoffsets) {
    __m256i vx = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i *)&VXFull[w]));
    vx = _mm256_srai_epi32(vx, 1);

    __m256i vy = _mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)&VYFull[w]));
    vy = _mm256_srai_epi16(vy, 1);

    __m256i addr = _mm256_madd_epi16(vy, dwords_ref_pitch);
    addr = _mm256_add_epi32(addr, vx);
    addr = _mm256_add_epi32(addr, dwords_hoffsets);

    // It's okay to read two or three bytes more than needed. pref is always padded,
    // unless the user chooses a horizontal padding of 0, which would be stupid.
    __m256i gathered = _mm256_i32gather_epi32((const int *)pref, addr, sizeof(PixelType));
    gathered = _mm256_and_si256(gathered, _mm256_set1_epi32((1 << (sizeof(PixelType) * 8)) - 1));

    return gathered;
}
static INLINE void quantize(const __m256i *qp, __m256i *c, const int16_t *iscan_ptr,
                            int log_scale, tran_low_t *qcoeff, tran_low_t *dqcoeff, __m256i *eob)
{
    const __m256i abs_coeff = _mm256_abs_epi32(*c);

    __m256i q = _mm256_add_epi32(abs_coeff, qp[0]);
    __m256i q_lo = _mm256_mul_epi32(q, qp[1]);
    __m256i q_hi = _mm256_srli_epi64(q, 32);
    const __m256i qp_hi = _mm256_srli_epi64(qp[1], 32);
    q_hi = _mm256_mul_epi32(q_hi, qp_hi);
    q_lo = _mm256_srli_epi64(q_lo, 16 - log_scale);
    q_hi = _mm256_srli_epi64(q_hi, 16 - log_scale);
    q_hi = _mm256_slli_epi64(q_hi, 32);
    q = _mm256_or_si256(q_lo, q_hi);

    const __m256i abs_s = _mm256_slli_epi32(abs_coeff, 1 + log_scale);
    const __m256i mask = _mm256_cmpgt_epi32(qp[2], abs_s);
    q = _mm256_andnot_si256(mask, q);

    __m256i dq = _mm256_mullo_epi32(q, qp[2]);
    dq = _mm256_srai_epi32(dq, log_scale);
    q = _mm256_sign_epi32(q, *c);
    dq = _mm256_sign_epi32(dq, *c);

    _mm256_storeu_si256((__m256i *)qcoeff, q);
    _mm256_storeu_si256((__m256i *)dqcoeff, dq);

    const __m128i isc = _mm_loadu_si128((const __m128i *)iscan_ptr);
    const __m128i zr = _mm_setzero_si128();
    const __m128i lo = _mm_unpacklo_epi16(isc, zr);
    const __m128i hi = _mm_unpackhi_epi16(isc, zr);
    const __m256i iscan = _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);

    const __m256i zero = _mm256_setzero_si256();
    const __m256i zc = _mm256_cmpeq_epi32(dq, zero);
    const __m256i nz = _mm256_cmpeq_epi32(zc, zero);
    __m256i cur_eob = _mm256_sub_epi32(iscan, nz);
    cur_eob = _mm256_and_si256(cur_eob, nz);
    *eob = _mm256_max_epi32(cur_eob, *eob);
}
void calculate_fma_float (unsigned char * out, double X0, double Y0, double scale, unsigned YSTART, unsigned SX, unsigned SY)
{
    __m256 dd = _mm256_set1_ps ((float) scale);
    __m256 XX0 = _mm256_set1_ps ((float) X0);

    for (unsigned j = YSTART; j < SY; j++) {
        __m256 y0 = _mm256_set1_ps (j*(float) scale + (float) Y0);
        for (unsigned i = 0; i < SX; i += 8) {
            __m256i ind = _mm256_setr_epi32 (i, i + 1, i + 2, i + 3, i + 4, i + 5, i + 6, i + 7);
            __m256 x0 = _mm256_fmadd_ps (dd, _mm256_cvtepi32_ps (ind), XX0);
            __m256 x = x0;
            __m256 y = y0;
            __m256i counts = _mm256_setzero_si256 ();
            __m256i cmp_mask = _mm256_set1_epi32 (0xFFFFFFFFu);

            for (unsigned n = 0; n < 255; n++) {
                __m256 x2 = _mm256_mul_ps (x, x);
                __m256 y2 = _mm256_mul_ps (y, y);
                __m256 abs = _mm256_add_ps (x2, y2);
                __m256i cmp = _mm256_castps_si256 (_mm256_cmp_ps (abs, _mm256_set1_ps (4), 1));
                cmp_mask = _mm256_and_si256 (cmp_mask, cmp);
                if (_mm256_testz_si256 (cmp_mask, cmp_mask)) {
                    break;
                }
                counts = _mm256_sub_epi32 (counts, cmp_mask);
                __m256 t = _mm256_add_ps (x, x);
                y = _mm256_fmadd_ps (t, y, y0);
                x = _mm256_add_ps (_mm256_sub_ps (x2, y2), x0);
            }
            __m256i result = _mm256_shuffle_epi8 (counts, _mm256_setr_epi8 (
                0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12,
                0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12));
            __m128i result128 = _128i_shuffle (_mm256_extractf128_si256 (result, 0),
                                               _mm256_extractf128_si256 (result, 1), 0, 0, 0, 0);
            result128 = _mm_shuffle_epi32 (result128, combine_4_2bits (0, 2, 0, 2));
            _mm_storel_epi64 ((__m128i*) out, result128);
            out += 8;
        }
    }
}
void calculate_fma_double (unsigned char * out, double X0, double Y0, double scale, unsigned YSTART, unsigned SX, unsigned SY)
{
    __m256d dd = _mm256_set1_pd (scale);
    __m256d XX0 = _mm256_set1_pd (X0);

    for (unsigned j = YSTART; j < SY; j++) {
        __m256d y0 = _mm256_set1_pd (j*scale + Y0);
        for (unsigned i = 0; i < SX; i += 4) {
            __m128i ind = _mm_setr_epi32 (i, i + 1, i + 2, i + 3);
            __m256d x0 = _mm256_fmadd_pd (dd, _mm256_cvtepi32_pd (ind), XX0);
            __m256d x = x0;
            __m256d y = y0;
            __m256i counts = _mm256_setzero_si256 ();
            __m256i cmp_mask = _mm256_set1_epi32 (0xFFFFFFFFu);

            for (unsigned n = 0; n < 255; n++) {
                __m256d x2 = _mm256_mul_pd (x, x);
                __m256d y2 = _mm256_mul_pd (y, y);
                __m256d abs = _mm256_add_pd (x2, y2);
                __m256i cmp = _mm256_castpd_si256 (_mm256_cmp_pd (abs, _mm256_set1_pd (4), 1));
                cmp_mask = _mm256_and_si256 (cmp_mask, cmp);
                if (_mm256_testz_si256 (cmp_mask, cmp_mask)) {
                    break;
                }
                counts = _mm256_sub_epi64 (counts, cmp_mask);
                __m256d t = _mm256_add_pd (x, x);
                y = _mm256_fmadd_pd (t, y, y0);
                x = _mm256_add_pd (_mm256_sub_pd (x2, y2), x0);
            }
            __m256i result = _mm256_shuffle_epi8 (counts, _mm256_setr_epi8 (
                0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8,
                0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8));
            *(uint32_t*) out = _mm_extract_epi16 (_mm256_extracti128_si256 (result, 0), 0)
                             | (_mm_extract_epi16 (_mm256_extracti128_si256 (result, 1), 0) << 16);
            out += 4;
        }
    }
}
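// Hedged scalar reference for the two escape-time kernels above: the same
// iteration for a single pixel, written without vectors. The 255-iteration cap
// and the |z|^2 < 4 escape test mirror the vector loops, and the returned byte
// is the iteration count, as stored by the vector versions.
static unsigned char mandelbrot_pixel(double x0, double y0)
{
    double x = x0, y = y0;
    unsigned n = 0;
    for (; n < 255; n++) {
        double x2 = x * x, y2 = y * y;
        if (x2 + y2 >= 4.0)       // escaped
            break;
        y = 2.0 * x * y + y0;     // z = z^2 + c, imaginary part
        x = x2 - y2 + x0;         //              real part
    }
    return (unsigned char)n;
}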
static inline void do_encode_12bytes(const char (*alphabet)[2], char *out, __m256i chunk)
{
    const __m256i shufflemask = _mm256_set_epi8(
        -1, 9, 10, 11,
        -1, 9, 10, 11,
        -1, 6, 7, 8,
        -1, 6, 7, 8,
        -1, 3, 4, 5,
        -1, 3, 4, 5,
        -1, 0, 1, 2,
        -1, 0, 1, 2
    );
    const __m256i shifts = _mm256_set_epi32(0, 12, 0, 12, 0, 12, 0, 12);
    const __m256i masks = _mm256_set1_epi32(4095);

    // convert from big endian and rearrange the bytes
    chunk = _mm256_shuffle_epi8(chunk, shufflemask);
    chunk = _mm256_srlv_epi32(chunk, shifts);
    chunk = _mm256_and_si256(chunk, masks);

    // write the two halves to memory
    do_encode_6bytes(alphabet, out + 0, _mm256_extracti128_si256(chunk, 0));
    do_encode_6bytes(alphabet, out + 8, _mm256_extracti128_si256(chunk, 1));
}
// 16bpp general R/G/B, usually 5/6/5 or 5/5/5
static inline __m256i stretchblt_line_bilinear_pixel_blend_avx_rgb16(const __m256i cur, const __m256i nxt, const __m256i mul,
    const __m256i rmask, const uint16_t rshift, const __m256i gmask, const uint16_t gshift, const __m256i bmask, const uint16_t bshift) {
    __m256i rc, gc, bc;
    __m256i rn, gn, bn;
    __m256i d, sum;

    rc = _mm256_and_si256(_mm256_srli_epi16(cur, rshift), rmask);
    gc = _mm256_and_si256(_mm256_srli_epi16(cur, gshift), gmask);
    bc = _mm256_and_si256(_mm256_srli_epi16(cur, bshift), bmask);

    rn = _mm256_and_si256(_mm256_srli_epi16(nxt, rshift), rmask);
    gn = _mm256_and_si256(_mm256_srli_epi16(nxt, gshift), gmask);
    bn = _mm256_and_si256(_mm256_srli_epi16(nxt, bshift), bmask);

    d = _mm256_sub_epi16(rn, rc);
    sum = _mm256_slli_epi16(_mm256_add_epi16(rc, _mm256_mulhi_epi16(_mm256_add_epi16(d, d), mul)), rshift);
    d = _mm256_sub_epi16(gn, gc);
    sum = _mm256_add_epi16(_mm256_slli_epi16(_mm256_add_epi16(gc, _mm256_mulhi_epi16(_mm256_add_epi16(d, d), mul)), gshift), sum);
    d = _mm256_sub_epi16(bn, bc);
    sum = _mm256_add_epi16(_mm256_slli_epi16(_mm256_add_epi16(bc, _mm256_mulhi_epi16(_mm256_add_epi16(d, d), mul)), bshift), sum);

    return sum;
}