static void write_uint8_linear(struct thread *t, const struct sfid_render_cache_args *args, __m256i r, __m256i g, __m256i b, __m256i a) { const int slice_y = args->rt.minimum_array_element * args->rt.qpitch; const int x = t->grf[1].uw[4]; const int y = t->grf[1].uw[5] + slice_y; __m256i rgba; rgba = _mm256_slli_epi32(a, 8); rgba = _mm256_or_si256(rgba, b); rgba = _mm256_slli_epi32(rgba, 8); rgba = _mm256_or_si256(rgba, g); rgba = _mm256_slli_epi32(rgba, 8); rgba = _mm256_or_si256(rgba, r); #define SWIZZLE(x, y, z, w) \ ( ((x) << 0) | ((y) << 2) | ((z) << 4) | ((w) << 6) ) /* Swizzle two middle pixel pairs so that dword 0-3 and 4-7 * form linear owords of pixels. */ rgba = _mm256_permute4x64_epi64(rgba, SWIZZLE(0, 2, 1, 3)); __m256i mask = _mm256_permute4x64_epi64(t->mask_q1, SWIZZLE(0, 2, 1, 3)); void *base = args->rt.pixels + x * args->rt.cpp + y * args->rt.stride; _mm_maskstore_epi32(base, _mm256_extractf128_si256(mask, 0), _mm256_extractf128_si256(rgba, 0)); _mm_maskstore_epi32(base + args->rt.stride, _mm256_extractf128_si256(mask, 1), _mm256_extractf128_si256(rgba, 1)); }
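As a quick sanity check on the SWIZZLE macro defined above (used by the other render-cache writers below as well), the immediate it produces works out as follows; this check is an illustration added here, not part of the original source.

/* Illustrative check (not in the original): qword order {0, 2, 1, 3}
 * packs to (0 << 0) | (2 << 2) | (1 << 4) | (3 << 6) = 0xD8, the
 * immediate _mm256_permute4x64_epi64 needs in order to interleave the
 * two 128-bit halves into linear owords of pixels. */
_Static_assert(SWIZZLE(0, 2, 1, 3) == 0xD8, "SWIZZLE(0,2,1,3) == 0xD8");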
template <bool align> SIMD_INLINE __m256i BgrToGray(const uint8_t * bgr, __m256i permuteBody, __m256i permuteTail, __m256i shuffle) { __m256i bgra[4]; bgra[0] = _mm256_or_si256(K32_01000000, PermuteAndShiffle(Load<align>((__m256i*)(bgr + 0)), permuteBody, shuffle)); bgra[1] = _mm256_or_si256(K32_01000000, PermuteAndShiffle(Load<false>((__m256i*)(bgr + 24)), permuteBody, shuffle)); bgra[2] = _mm256_or_si256(K32_01000000, PermuteAndShiffle(Load<false>((__m256i*)(bgr + 48)), permuteBody, shuffle)); bgra[3] = _mm256_or_si256(K32_01000000, PermuteAndShiffle(Load<align>((__m256i*)(bgr + 64)), permuteTail, shuffle)); return BgraToGray(bgra); }
static uint32_t maxbitas32int(const __m256i accumulator) { const __m256i _tmp1 = _mm256_or_si256(_mm256_srli_si256(accumulator, 8), accumulator); const __m256i _tmp2 = _mm256_or_si256(_mm256_srli_si256(_tmp1, 4), _tmp1); uint32_t ans1 = _mm256_extract_epi32(_tmp2, 0); uint32_t ans2 = _mm256_extract_epi32(_tmp2, 4); uint32_t ans = ans1 > ans2 ? ans1 : ans2; return ans; }
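A scalar reference for the reduction above may help when testing: _mm256_srli_si256 shifts within each 128-bit lane, so the intrinsic version ORs the four dwords of each lane together and then resolves the two lane results with a scalar max. For the usual purpose of this value, finding the highest set bit across all eight elements, max and OR of the lane results are interchangeable. The sketch below is an illustration, not code from the original library.

#include <stdint.h>

/* Illustrative scalar equivalent of maxbitas32int (not from the
 * original source): OR-fold each half of the 8-dword accumulator,
 * then keep the larger of the two halves. */
static uint32_t maxbitas32int_ref(const uint32_t v[8])
{
    uint32_t lo = v[0] | v[1] | v[2] | v[3];
    uint32_t hi = v[4] | v[5] | v[6] | v[7];
    return lo > hi ? lo : hi;
}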
int main() { const ssize_t A = 3; const size_t Awidth = 2; const size_t Dwidth = 4; const ssize_t Dmin = (-1) * (1ll << (Dwidth - 1)); const ssize_t Dmax = (1ll << (Dwidth - 1)) - 1; const ssize_t Cwidth = Awidth + Dwidth; const ssize_t AInv = ext_euklidean(A, Cwidth) & ((1ll << Cwidth) - 1); const size_t numCodewords = (1ull << Cwidth); std::cout << "numCodewords: " << numCodewords << std::endl; const size_t numMasks = numCodewords / (sizeof(int) * 4); // How many masks will we generate? int * pNonCodewordMasks = new int[numMasks]; const int16_t c = ~((1ll << (Cwidth - 1)) - 1); std::cout << "c = 0x" << std::hex << c << std::dec << std::endl; for (ssize_t i = 0, cw = c, posMask = 0; i < numCodewords; ++posMask) { int tmpMask = 0; for (ssize_t k = 0; k < 16; ++k, ++cw, ++i) { if ((cw % A) != 0) { // we want the non-codewords // std::cout << "cw % A != 0: " << cw << std::endl; tmpMask |= (1ll << (k * 2)) | (1ll << (k * 2 + 1)); // expand to 32 bits, because AVX2 cannot movemask across lanes to 16 bits } } pNonCodewordMasks[posMask] = tmpMask; } std::cout << "numMasks: " << numMasks << std::endl; std::cout << "non-codeword-masks: 0x" << std::hex << std::setfill('0'); for (size_t posMask = 0; posMask < numMasks; ++posMask) { std::cout << std::setw(8) << pNonCodewordMasks[posMask] << ':'; } std::cout << std::dec << std::endl << std::setfill(' '); auto mmCodewords = _mm256_set_epi16(c+15, c+14, c+13, c+12, c+11, c+10, c+9, c+8, c+7, c+6, c+5, c+4, c+3, c+2, c+1, c); auto mmAddUp = _mm256_set1_epi16(16); auto mmAinv = _mm256_set1_epi16(AInv); auto mmDmin = _mm256_set1_epi16(Dmin); auto mmDmax = _mm256_set1_epi16(Dmax); const size_t posEnd = (1ull << Cwidth); __m256i mmFillUp[] = {_mm256_set1_epi16(0), _mm256_set1_epi16(~((1ll << Cwidth) - 1))}; // fill up all non-codeword bits with 1's if necessary std::cout << "posEnd = 0x" << std::hex << posEnd << std::dec << std::endl; std::cout << std::setfill('0') << std::hex; for(size_t pos = 15, posMask = 0; pos < posEnd; pos += 16, ++posMask) { auto isNeg = 0x1 & _mm256_movemask_epi8(_mm256_cmpgt_epi16(mmFillUp[0], mmCodewords)); auto mm1 = _mm256_or_si256(_mm256_mullo_epi16(mmCodewords, mmAinv), mmFillUp[isNeg]); auto mm2 = _mm256_cmpgt_epi16(mm1, mmDmin); auto mm3 = _mm256_cmpgt_epi16(mmDmax, mm1); auto mm4 = _mm256_cmpeq_epi16(mmDmax, mm1); auto mm5 = _mm256_or_si256(mm3, mm4); auto mm6 = _mm256_and_si256(mm2, mm5); auto mask = _mm256_movemask_epi8(mm6); if (mask & pNonCodewordMasks[posMask]) { std::cout << "BAD @0x" << std::setw((Cwidth + 7) / 8) << pos << ": 0x" << mask << " & 0x" << pNonCodewordMasks[posMask] << " = 0x" << (mask & pNonCodewordMasks[posMask]) << std::endl; } else { std::cout << "OK @0x" << std::setw((Cwidth + 7) / 8) << pos << ": 0x" << mask << " & 0x" << pNonCodewordMasks[posMask] << " = 0x" << (mask & pNonCodewordMasks[posMask]) << std::endl; } mmCodewords = _mm256_add_epi16(mmCodewords, mmAddUp); } std::cout << std::setfill(' ') << std::dec; }
static void sfid_render_cache_rt_write_simd8_unorm8_ymajor(struct thread *t, const struct sfid_render_cache_args *args) { const int slice_y = args->rt.minimum_array_element * args->rt.qpitch; const int x = t->grf[1].uw[4]; const int y = t->grf[1].uw[5] + slice_y; const int cpp = 4; struct reg *src = &t->grf[args->src]; const __m256 scale = _mm256_set1_ps(255.0f); const __m256 half = _mm256_set1_ps(0.5f); __m256i r, g, b, a; __m256i rgba; switch (args->rt.format) { case SF_R8G8B8A8_UNORM: r = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[0].reg, scale), half)); g = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[1].reg, scale), half)); b = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[2].reg, scale), half)); a = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[3].reg, scale), half)); break; case SF_B8G8R8A8_UNORM: b = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[0].reg, scale), half)); g = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[1].reg, scale), half)); r = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[2].reg, scale), half)); a = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[3].reg, scale), half)); break; default: stub("unorm8 ymajor format"); return; } rgba = _mm256_slli_epi32(a, 8); rgba = _mm256_or_si256(rgba, b); rgba = _mm256_slli_epi32(rgba, 8); rgba = _mm256_or_si256(rgba, g); rgba = _mm256_slli_epi32(rgba, 8); rgba = _mm256_or_si256(rgba, r); /* Swizzle two middle pixel pairs so that dword 0-3 and 4-7 * form linear owords of pixels. */ rgba = _mm256_permute4x64_epi64(rgba, SWIZZLE(0, 2, 1, 3)); __m256i mask = _mm256_permute4x64_epi64(t->mask_q1, SWIZZLE(0, 2, 1, 3)); void *base = ymajor_offset(args->rt.pixels, x, y, args->rt.stride, cpp); _mm_maskstore_epi32(base, _mm256_extractf128_si256(mask, 0), _mm256_extractf128_si256(rgba, 0)); _mm_maskstore_epi32(base + 16, _mm256_extractf128_si256(mask, 1), _mm256_extractf128_si256(rgba, 1)); }
static void sfid_render_cache_rt_write_simd8_bgra_unorm8_xmajor(struct thread *t, const struct sfid_render_cache_args *args) { __m256i argb; const float scale = 255.0f; struct reg src[4]; memcpy(src, &t->grf[args->src], sizeof(src)); const int cpp = 4; const int slice_y = args->rt.minimum_array_element * args->rt.qpitch; const int x = t->grf[1].uw[4]; const int y = t->grf[1].uw[5] + slice_y; void *base = xmajor_offset(args->rt.pixels, x, y, args->rt.stride, cpp); if (gt.blend.enable) { /* Load unorm8 */ __m128i lo = _mm_load_si128(base); __m128i hi = _mm_load_si128(base + 512); __m256i dst_argb = _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1); dst_argb = _mm256_permute4x64_epi64(dst_argb, SWIZZLE(0, 2, 1, 3)); blend_unorm8_argb(src, dst_argb); } gamma_correct(args->rt.format, src); const __m256i r = to_unorm(src[0].reg, scale); const __m256i g = to_unorm(src[1].reg, scale); const __m256i b = to_unorm(src[2].reg, scale); const __m256i a = to_unorm(src[3].reg, scale); argb = _mm256_slli_epi32(a, 8); argb = _mm256_or_si256(argb, r); argb = _mm256_slli_epi32(argb, 8); argb = _mm256_or_si256(argb, g); argb = _mm256_slli_epi32(argb, 8); argb = _mm256_or_si256(argb, b); /* Swizzle two middle pixel pairs so that dword 0-3 and 4-7 * form linear owords of pixels. */ argb = _mm256_permute4x64_epi64(argb, SWIZZLE(0, 2, 1, 3)); __m256i mask = _mm256_permute4x64_epi64(t->mask_q1, SWIZZLE(0, 2, 1, 3)); _mm_maskstore_epi32(base, _mm256_extractf128_si256(mask, 0), _mm256_extractf128_si256(argb, 0)); _mm_maskstore_epi32(base + 512, _mm256_extractf128_si256(mask, 1), _mm256_extractf128_si256(argb, 1)); }
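The to_unorm helper used above is not shown in this excerpt. Judging from the inline unorm conversions in the other write paths of this file (sfid_render_cache_rt_write_simd8_unorm8_ymajor and sfid_render_cache_rt_write_simd8_rgba_unorm16_linear), it plausibly has the following shape; treat this as an assumption, not the actual ksim implementation.

/* Assumed shape of to_unorm(): scale the float vector, add 0.5 and
 * convert to 32-bit integers, mirroring the inline unorm conversions
 * elsewhere in this file. The real helper may also clamp the input
 * to [0.0, 1.0]. */
static inline __m256i to_unorm(__m256 reg, float scale_f)
{
    const __m256 scale = _mm256_set1_ps(scale_f);
    const __m256 half = _mm256_set1_ps(0.5f);
    return _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(reg, scale), half));
}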
template <bool align> SIMD_INLINE void Bgr48pToBgra32(uint8_t * bgra, const uint8_t * blue, const uint8_t * green, const uint8_t * red, size_t offset, __m256i alpha) { __m256i _blue = _mm256_and_si256(LoadPermuted<align>((__m256i*)(blue + offset)), K16_00FF); __m256i _green = _mm256_and_si256(LoadPermuted<align>((__m256i*)(green + offset)), K16_00FF); __m256i _red = _mm256_and_si256(LoadPermuted<align>((__m256i*)(red + offset)), K16_00FF); __m256i bg = _mm256_or_si256(_blue, _mm256_slli_si256(_green, 1)); __m256i ra = _mm256_or_si256(_red, alpha); Store<align>((__m256i*)bgra + 0, _mm256_unpacklo_epi16(bg, ra)); Store<align>((__m256i*)bgra + 1, _mm256_unpackhi_epi16(bg, ra)); }
// Compare rank with all values currently in the queue. Returns -1 if the value already exists // or is larger than all values. // Otherwise, returns the index of the register in which the value should be inserted. // Mask is replicated to both lanes, so it can be used for both value and rank lane. int PriorityQueue_AVX2::compare(__m256i mrank, int &field, __m256i &gtmask) { static const __m256i eq4mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); __m256i eq, eq4; int reg, mask; // Because items are sorted in ascending order within each (double) register, the mask after GT // comparison must be of the form 000...1111, which is one less than a power of two. { __m256i r0_7 = _mm256_permute2x128_si256(_rv[1], _rv[0], 0x20); // [0 .. 7] gtmask = _mm256_cmpgt_epi32(r0_7, mrank); mask = _mm256_movemask_ps(_mm256_castsi256_ps(gtmask)); eq = _mm256_cmpeq_epi32(r0_7, mrank); _ASSERTE(((mask + 1) & mask) == 0); reg = 1; } if (!mask) { __m256i r8_15 = _mm256_permute2x128_si256(_rv[3], _rv[2], 0x20); // [8 .. 15] gtmask = _mm256_cmpgt_epi32(r8_15, mrank); mask = _mm256_movemask_ps(_mm256_castsi256_ps(gtmask)); eq = _mm256_or_si256(eq, _mm256_cmpeq_epi32(r8_15, mrank)); _ASSERTE(((mask + 1) & mask) == 0); reg = 3; } if (!mask) { gtmask = _mm256_cmpgt_epi32(_rv[4], mrank); // [16 .. 19]; don't care about value eq4 = _mm256_and_si256(eq4mask, _mm256_cmpeq_epi32(mrank, _rv[4])); // .. ditto mask = _mm256_movemask_ps(_mm256_castsi256_ps(gtmask)) & 0xF; // ignore comparison with values eq = _mm256_or_si256(eq, eq4); _ASSERTE(((mask + 1) & mask) == 0); reg = 4; } if (_mm256_movemask_ps(_mm256_castsi256_ps(eq)) != 0) mask = 0; if (!mask) return -1; // Adjust register according to mask (higher 128-bits in double register: one register lower) // There is no "previous" register to test against for equality if we need to insert in the // very first register. Also duplicate the same mask to both lanes. if (mask > 0xF) { mask >>= 4; --reg; gtmask = _mm256_permute2x128_si256(gtmask, gtmask, 0x11); // replicate high lane to both }

static char * detile_xmajor(struct surface *s, __m256i alpha) { int height = align_u64(s->height, 8); void *pixels; int tile_stride = s->stride / 512; int ret; ret = posix_memalign(&pixels, 32, s->stride * height); ksim_assert(ret == 0); ksim_assert((s->stride & 511) == 0); for (int y = 0; y < height; y++) { int tile_y = y / 8; int iy = y & 7; void *src = s->pixels + tile_y * tile_stride * 4096 + iy * 512; void *dst = pixels + y * s->stride; for (int x = 0; x < tile_stride; x++) { for (int c = 0; c < 512; c += 32) { __m256i m = _mm256_load_si256(src + x * 4096 + c); m = _mm256_or_si256(m, alpha); _mm256_store_si256(dst + x * 512 + c, m); } } } return pixels; }
static char * detile_ymajor(struct surface *s, __m256i alpha) { int height = align_u64(s->height, 8); void *pixels; int tile_stride = s->stride / 128; const int column_stride = 32 * 16; const int columns = s->stride / 16; int ret; ret = posix_memalign(&pixels, 32, s->stride * height); ksim_assert(ret == 0); ksim_assert((s->stride & 127) == 0); for (int y = 0; y < height; y += 2) { int tile_y = y / 32; int iy = y & 31; void *src = s->pixels + tile_y * tile_stride * 4096 + iy * 16; void *dst = pixels + y * s->stride; for (int x = 0; x < columns ; x++) { __m256i m = _mm256_load_si256(src + x * column_stride); m = _mm256_or_si256(m, alpha); _mm_store_si128(dst + x * 16, _mm256_extractf128_si256(m, 0)); _mm_store_si128(dst + x * 16 + s->stride, _mm256_extractf128_si256(m, 1)); } } return pixels; }
void static avx2_test (void) { union256i_q s1, s2, res; long long int res_ref[4]; int i, j, sign = 1; int fail = 0; for (i = 0; i < 10; i++) { for (j = 0; j < 4; j++) { s1.a[j] = i * j * sign; s2.a[j] = (j + 20) * sign; sign = -sign; } res.x = _mm256_or_si256 (s1.x, s2.x); compute_por256 (s1.a, s2.a, res_ref); fail += check_union256i_q (res, res_ref); } if (fail != 0) abort (); }
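The test compares against compute_por256, which is defined elsewhere in the testsuite; its assumed shape is simply the element-wise scalar OR of the four 64-bit values.

/* Assumed reference helper (the actual compute_por256 lives elsewhere
 * in the testsuite): element-wise OR of two arrays of four long longs. */
static void compute_por256 (long long int *s1, long long int *s2, long long int *res_ref)
{
  int i;
  for (i = 0; i < 4; i++)
    res_ref[i] = s1[i] | s2[i];
}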
template <bool align> SIMD_INLINE void EdgeBackgroundShiftRangeMasked(const uint8_t * value, uint8_t * background, const uint8_t * mask, size_t offset) { const __m256i _value = Load<align>((__m256i*)(value + offset)); const __m256i _background = Load<align>((__m256i*)(background + offset)); const __m256i _mask = Load<align>((const __m256i*)(mask + offset)); Store<align>((__m256i*)(background + offset), _mm256_or_si256(_mm256_and_si256(_mask, _value), _mm256_andnot_si256(_mask, _background))); }
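The and/andnot/or sequence above is the classic branch-free select, mask ? value : background. When the mask bytes are known to be 0x00 or 0xFF (as they are when produced by a byte compare), the same result can be written with a byte blend; a small equivalent sketch, not taken from the library:

// Equivalent select using a byte blend (illustrative only, assumes the
// mask bytes are 0x00 or 0xFF): picks value where the mask byte's high
// bit is set, background otherwise.
static inline __m256i SelectByMask(__m256i mask, __m256i value, __m256i background)
{
    return _mm256_blendv_epi8(background, value, mask);
}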
SIMD_INLINE __m256i BinomialSum16(const __m256i & ab, const __m256i & cd) { #ifdef SIMD_MADDUBS_ERROR return _mm256_add_epi16(_mm256_maddubs_epi16(_mm256_or_si256(K_ZERO, ab), K8_01_03), _mm256_maddubs_epi16(_mm256_or_si256(K_ZERO, cd), K8_03_01)); #else return _mm256_add_epi16(_mm256_maddubs_epi16(ab, K8_01_03), _mm256_maddubs_epi16(cd, K8_03_01)); #endif }
template <> SIMD_INLINE void InterpolateX<1>(const __m256i * alpha, __m256i * buffer) { #if defined(_MSC_VER) // Workaround for Visual Studio 2012 compiler bug in release mode: __m256i _buffer = _mm256_or_si256(K_ZERO, _mm256_load_si256(buffer)); #else __m256i _buffer = _mm256_load_si256(buffer); #endif _mm256_store_si256(buffer, _mm256_maddubs_epi16(_buffer, _mm256_load_si256(alpha))); }
static inline __m256i enc_reshuffle (__m256i in) { // Spread out 32-bit words over both halves of the input register: in = _mm256_permutevar8x32_epi32(in, _mm256_setr_epi32( 0, 1, 2, -1, 3, 4, 5, -1)); // Slice into 32-bit chunks and operate on all chunks in parallel. // All processing is done within the 32-bit chunk. First, shuffle: // before: [eeeeeeff|ccdddddd|bbbbcccc|aaaaaabb] // after: [00000000|aaaaaabb|bbbbcccc|ccdddddd] in = _mm256_shuffle_epi8(in, _mm256_set_epi8( -1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2, -1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2)); // cd = [00000000|00000000|0000cccc|ccdddddd] const __m256i cd = _mm256_and_si256(in, _mm256_set1_epi32(0x00000FFF)); // ab = [0000aaaa|aabbbbbb|00000000|00000000] const __m256i ab = _mm256_and_si256(_mm256_slli_epi32(in, 4), _mm256_set1_epi32(0x0FFF0000)); // merged = [0000aaaa|aabbbbbb|0000cccc|ccdddddd] const __m256i merged = _mm256_or_si256(ab, cd); // bd = [00000000|00bbbbbb|00000000|00dddddd] const __m256i bd = _mm256_and_si256(merged, _mm256_set1_epi32(0x003F003F)); // ac = [00aaaaaa|00000000|00cccccc|00000000] const __m256i ac = _mm256_and_si256(_mm256_slli_epi32(merged, 2), _mm256_set1_epi32(0x3F003F00)); // indices = [00aaaaaa|00bbbbbb|00cccccc|00dddddd] const __m256i indices = _mm256_or_si256(ac, bd); // return = [00dddddd|00cccccc|00bbbbbb|00aaaaaa] return _mm256_bswap_epi32(indices); }
bool is_sorted_avx2_unrolled4(int32_t* a, size_t n) { const __m256i shuffle_pattern = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 7); size_t i = 0; while (i < n - (4*7 + 1)) { const __m256i curr0 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(a + i + 0*7)); const __m256i curr1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(a + i + 1*7)); const __m256i curr2 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(a + i + 2*7)); const __m256i curr3 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(a + i + 3*7)); const __m256i next0 = _mm256_permutevar8x32_epi32(curr0, shuffle_pattern); const __m256i next1 = _mm256_permutevar8x32_epi32(curr1, shuffle_pattern); const __m256i next2 = _mm256_permutevar8x32_epi32(curr2, shuffle_pattern); const __m256i next3 = _mm256_permutevar8x32_epi32(curr3, shuffle_pattern); const __m256i mask0 = _mm256_cmpgt_epi32(curr0, next0); const __m256i mask1 = _mm256_cmpgt_epi32(curr1, next1); const __m256i mask2 = _mm256_cmpgt_epi32(curr2, next2); const __m256i mask3 = _mm256_cmpgt_epi32(curr3, next3); const __m256i mask = _mm256_or_si256(mask0, _mm256_or_si256(mask1, _mm256_or_si256(mask2, mask3))); if (!_mm256_testz_si256(mask, mask)) { return false; } i += 7*4; } for (/**/; i + 1 < n; i++) { if (a[i] > a[i + 1]) return false; } return true; }
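A minimal driver for the routine above (my own, not part of the original benchmark). Note that the main loop's condition assumes n is large enough that n - (4*7 + 1) does not wrap, so keep the input well above 29 elements, and build for an AVX2 target (e.g. -mavx2).

#include <cstdint>
#include <cstdio>
#include <numeric>
#include <vector>

// Illustrative usage only: build a sorted array, check it, then break
// the order and check again.
int main()
{
    std::vector<int32_t> v(1024);
    std::iota(v.begin(), v.end(), 0);
    std::printf("sorted: %d\n", (int)is_sorted_avx2_unrolled4(v.data(), v.size()));
    v[500] = -1;
    std::printf("sorted: %d\n", (int)is_sorted_avx2_unrolled4(v.data(), v.size()));
    return 0;
}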
static inline __m256i dec_reshuffle (__m256i in) { // Shuffle bytes to 32-bit bigendian: in = _mm256_bswap_epi32(in); // Mask in a single byte per shift: __m256i mask = _mm256_set1_epi32(0x3F000000); // Pack bytes together: __m256i out = _mm256_slli_epi32(_mm256_and_si256(in, mask), 2); mask = _mm256_srli_epi32(mask, 8); out = _mm256_or_si256(out, _mm256_slli_epi32(_mm256_and_si256(in, mask), 4)); mask = _mm256_srli_epi32(mask, 8); out = _mm256_or_si256(out, _mm256_slli_epi32(_mm256_and_si256(in, mask), 6)); mask = _mm256_srli_epi32(mask, 8); out = _mm256_or_si256(out, _mm256_slli_epi32(_mm256_and_si256(in, mask), 8)); // Pack bytes together within 32-bit words, discarding words 3 and 7: out = _mm256_shuffle_epi8(out, _mm256_setr_epi8( 3, 2, 1, 7, 6, 5, 11, 10, 9, 15, 14, 13, -1, -1, -1, -1, 3, 2, 1, 7, 6, 5, 11, 10, 9, 15, 14, 13, -1, -1, -1, -1)); // Pack 32-bit words together, squashing empty words 3 and 7: return _mm256_permutevar8x32_epi32(out, _mm256_setr_epi32( 0, 1, 2, 4, 5, 6, -1, -1)); }
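Both enc_reshuffle and dec_reshuffle above call _mm256_bswap_epi32, which is not a standard AVX2 intrinsic but a helper defined elsewhere in the codec. A plausible definition, assuming the usual shuffle-based per-dword byte swap:

/* Assumed definition (the original lives elsewhere in the library):
 * reverse the byte order within each 32-bit word. _mm256_shuffle_epi8
 * indexes bytes per 128-bit lane, so the same 16-byte pattern is
 * repeated for both lanes. */
static inline __m256i _mm256_bswap_epi32 (const __m256i in)
{
    return _mm256_shuffle_epi8(in, _mm256_setr_epi8(
        3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
        3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12));
}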
uint64_t avx2_count_byte_popcount(const uint8_t* data, size_t size, uint8_t byte) { const __m256i v = _mm256_set1_epi8(byte); const uint8_t* end = data + size; const uint8_t* ptr = data; uint64_t result = 0; // 1. blocks of 8 registers while (ptr + 8*32 < end) { const __m256i eq0 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 0*32))); const __m256i eq1 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 1*32))); const __m256i eq2 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 2*32))); const __m256i eq3 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 3*32))); const __m256i eq4 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 4*32))); const __m256i eq5 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 5*32))); const __m256i eq6 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 6*32))); const __m256i eq7 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 7*32))); const __m256i eq0bit = _mm256_and_si256(eq0, _mm256_set1_epi8(0x01)); const __m256i eq1bit = _mm256_and_si256(eq1, _mm256_set1_epi8(0x02)); const __m256i eq2bit = _mm256_and_si256(eq2, _mm256_set1_epi8(0x04)); const __m256i eq3bit = _mm256_and_si256(eq3, _mm256_set1_epi8(0x08)); const __m256i eq4bit = _mm256_and_si256(eq4, _mm256_set1_epi8(0x10)); const __m256i eq5bit = _mm256_and_si256(eq5, _mm256_set1_epi8(0x20)); const __m256i eq6bit = _mm256_and_si256(eq6, _mm256_set1_epi8(0x40)); const __m256i eq7bit = _mm256_and_si256(eq7, _mm256_set1_epi8(int8_t(0x80))); const __m256i m01 = _mm256_or_si256(eq0bit, eq1bit); const __m256i m23 = _mm256_or_si256(eq2bit, eq3bit); const __m256i m45 = _mm256_or_si256(eq4bit, eq5bit); const __m256i m67 = _mm256_or_si256(eq6bit, eq7bit); const __m256i m0123 = _mm256_or_si256(m01, m23); const __m256i m4567 = _mm256_or_si256(m45, m67); const __m256i merged = _mm256_or_si256(m0123, m4567); result += __builtin_popcountll(_mm256_extract_epi64(merged, 0)); result += __builtin_popcountll(_mm256_extract_epi64(merged, 1)); result += __builtin_popcountll(_mm256_extract_epi64(merged, 2)); result += __builtin_popcountll(_mm256_extract_epi64(merged, 3)); ptr += 8 * 32; } return result + scalar_count_bytes(ptr, end - ptr, byte); }
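The tail of the routine above is handled by scalar_count_bytes, which is not shown; a straightforward assumed shape for that companion helper:

/* Assumed shape of the scalar tail helper (the original is not shown
 * here): count bytes equal to `byte` one at a time. */
static uint64_t scalar_count_bytes(const uint8_t* data, size_t size, uint8_t byte)
{
    uint64_t result = 0;
    for (size_t i = 0; i < size; i++)
        result += (data[i] == byte);
    return result;
}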
static INLINE void quantize(const __m256i *qp, __m256i *c, const int16_t *iscan_ptr, int log_scale, tran_low_t *qcoeff, tran_low_t *dqcoeff, __m256i *eob) { const __m256i abs_coeff = _mm256_abs_epi32(*c); __m256i q = _mm256_add_epi32(abs_coeff, qp[0]); __m256i q_lo = _mm256_mul_epi32(q, qp[1]); __m256i q_hi = _mm256_srli_epi64(q, 32); const __m256i qp_hi = _mm256_srli_epi64(qp[1], 32); q_hi = _mm256_mul_epi32(q_hi, qp_hi); q_lo = _mm256_srli_epi64(q_lo, 16 - log_scale); q_hi = _mm256_srli_epi64(q_hi, 16 - log_scale); q_hi = _mm256_slli_epi64(q_hi, 32); q = _mm256_or_si256(q_lo, q_hi); const __m256i abs_s = _mm256_slli_epi32(abs_coeff, 1 + log_scale); const __m256i mask = _mm256_cmpgt_epi32(qp[2], abs_s); q = _mm256_andnot_si256(mask, q); __m256i dq = _mm256_mullo_epi32(q, qp[2]); dq = _mm256_srai_epi32(dq, log_scale); q = _mm256_sign_epi32(q, *c); dq = _mm256_sign_epi32(dq, *c); _mm256_storeu_si256((__m256i *)qcoeff, q); _mm256_storeu_si256((__m256i *)dqcoeff, dq); const __m128i isc = _mm_loadu_si128((const __m128i *)iscan_ptr); const __m128i zr = _mm_setzero_si128(); const __m128i lo = _mm_unpacklo_epi16(isc, zr); const __m128i hi = _mm_unpackhi_epi16(isc, zr); const __m256i iscan = _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1); const __m256i zero = _mm256_setzero_si256(); const __m256i zc = _mm256_cmpeq_epi32(dq, zero); const __m256i nz = _mm256_cmpeq_epi32(zc, zero); __m256i cur_eob = _mm256_sub_epi32(iscan, nz); cur_eob = _mm256_and_si256(cur_eob, nz); *eob = _mm256_max_epi32(cur_eob, *eob); }
template <> SIMD_INLINE void InterpolateX<3>(const __m256i * alpha, __m256i * buffer) { __m256i src[3], shuffled; src[0] = _mm256_load_si256(buffer + 0); src[1] = _mm256_load_si256(buffer + 1); src[2] = _mm256_load_si256(buffer + 2); shuffled = _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[0], src[0], 0x21), K8_SHUFFLE_X3_00); shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(src[0], K8_SHUFFLE_X3_01)); shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[0], src[1], 0x21), K8_SHUFFLE_X3_02)); _mm256_store_si256(buffer + 0, _mm256_maddubs_epi16(shuffled, _mm256_load_si256(alpha + 0))); shuffled = _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[0], src[1], 0x21), K8_SHUFFLE_X3_10); shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(src[1], K8_SHUFFLE_X3_11)); shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[1], src[2], 0x21), K8_SHUFFLE_X3_12)); _mm256_store_si256(buffer + 1, _mm256_maddubs_epi16(shuffled, _mm256_load_si256(alpha + 1))); shuffled = _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[1], src[2], 0x21), K8_SHUFFLE_X3_20); shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(src[2], K8_SHUFFLE_X3_21)); shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[2], src[2], 0x21), K8_SHUFFLE_X3_22)); _mm256_store_si256(buffer + 2, _mm256_maddubs_epi16(shuffled, _mm256_load_si256(alpha + 2))); }
static FORCE_INLINE void FlowInter_8px_AVX2( int w, PixelType *pdst, const PixelType *prefB, const PixelType *prefF, const int16_t *VXFullB, const int16_t *VXFullF, const int16_t *VYFullB, const int16_t *VYFullF, const uint8_t *MaskB, const uint8_t *MaskF, int nPelLog, const __m256i &dwords_time256, const __m256i &dwords_256_time256, const __m256i &dwords_ref_pitch, const __m256i &dwords_hoffsets) { __m256i dwords_w = _mm256_add_epi32(_mm256_set1_epi32(w << nPelLog), dwords_hoffsets); __m256i dstF = lookup_AVX2(VXFullF, VYFullF, prefF, w, dwords_time256, dwords_ref_pitch, dwords_w); __m256i dstB = lookup_AVX2(VXFullB, VYFullB, prefB, w, dwords_256_time256, dwords_ref_pitch, dwords_w); __m256i dstF0 = _mm256_i32gather_epi32((const int *)prefF, dwords_w, sizeof(PixelType)); __m256i dstB0 = _mm256_i32gather_epi32((const int *)prefB, dwords_w, sizeof(PixelType)); dstF0 = _mm256_and_si256(dstF0, _mm256_set1_epi32((1 << (sizeof(PixelType) * 8)) - 1)); dstB0 = _mm256_and_si256(dstB0, _mm256_set1_epi32((1 << (sizeof(PixelType) * 8)) - 1)); __m256i maskf = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)&MaskF[w])); __m256i maskb = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)&MaskB[w])); const __m256i dwords_255 = _mm256_set1_epi32(255); __m256i maskf_inv = _mm256_sub_epi32(dwords_255, maskf); __m256i maskb_inv = _mm256_sub_epi32(dwords_255, maskb); __m256i dstF_maskf_inv, dstB_maskb_inv, dstF0_maskb, dstB0_maskf; if (sizeof(PixelType) == 1) { dstF_maskf_inv = _mm256_mullo_epi16(dstF, maskf_inv); dstB_maskb_inv = _mm256_mullo_epi16(dstB, maskb_inv); dstF0_maskb = _mm256_mullo_epi16(dstF0, maskb); dstB0_maskf = _mm256_mullo_epi16(dstB0, maskf); } else { dstF_maskf_inv = _mm256_mullo_epi32(dstF, maskf_inv); dstB_maskb_inv = _mm256_mullo_epi32(dstB, maskb_inv); dstF0_maskb = _mm256_mullo_epi32(dstF0, maskb); dstB0_maskf = _mm256_mullo_epi32(dstB0, maskf); } __m256i f = _mm256_add_epi32(dstF0_maskb, dstB_maskb_inv); __m256i b = _mm256_add_epi32(dstB0_maskf, dstF_maskf_inv); if (sizeof(PixelType) == 1) { f = _mm256_mullo_epi32(f, maskf); b = _mm256_mullo_epi32(b, maskb); f = _mm256_add_epi32(f, dwords_255); b = _mm256_add_epi32(b, dwords_255); f = _mm256_srai_epi32(f, 8); b = _mm256_srai_epi32(b, 8); } else { const __m256i qwords_255 = _mm256_set1_epi64x(255); __m256i tempf = _mm256_mul_epu32(f, maskf); __m256i tempb = _mm256_mul_epu32(b, maskb); tempf = _mm256_add_epi64(tempf, qwords_255); tempb = _mm256_add_epi64(tempb, qwords_255); tempf = _mm256_srli_epi64(tempf, 8); tempb = _mm256_srli_epi64(tempb, 8); f = _mm256_srli_epi64(f, 32); b = _mm256_srli_epi64(b, 32); f = _mm256_mul_epu32(f, _mm256_srli_epi64(maskf, 32)); b = _mm256_mul_epu32(b, _mm256_srli_epi64(maskb, 32)); f = _mm256_add_epi64(f, qwords_255); b = _mm256_add_epi64(b, qwords_255); f = _mm256_srli_epi64(f, 8); b = _mm256_srli_epi64(b, 8); f = _mm256_or_si256(tempf, _mm256_slli_epi64(f, 32)); b = _mm256_or_si256(tempb, _mm256_slli_epi64(b, 32)); } f = _mm256_add_epi32(f, dstF_maskf_inv); b = _mm256_add_epi32(b, dstB_maskb_inv); f = _mm256_add_epi32(f, dwords_255); b = _mm256_add_epi32(b, dwords_255); f = _mm256_srai_epi32(f, 8); b = _mm256_srai_epi32(b, 8); if (sizeof(PixelType) == 1) { f = _mm256_madd_epi16(f, dwords_256_time256); b = _mm256_madd_epi16(b, dwords_time256); } else { f = _mm256_mullo_epi32(f, dwords_256_time256); b = _mm256_mullo_epi32(b, dwords_time256); } __m256i dst = _mm256_add_epi32(f, b); dst = _mm256_srai_epi32(dst, 8); dst = _mm256_packus_epi32(dst, dst); dst = _mm256_permute4x64_epi64(dst, 0xe8); // 
0b11101000 - copy third qword to second qword __m128i dst128 = _mm256_castsi256_si128(dst); if (sizeof(PixelType) == 1) { dst128 = _mm_packus_epi16(dst128, dst128); _mm_storel_epi64((__m128i *)&pdst[w], dst128); } else { _mm_storeu_si128((__m128i *)&pdst[w], dst128); } }
static FORCE_INLINE void FlowInterSimple_generic_8px_AVX2( int w, PixelType *pdst, const PixelType *prefB, const PixelType *prefF, const int16_t *VXFullB, const int16_t *VXFullF, const int16_t *VYFullB, const int16_t *VYFullF, const uint8_t *MaskB, const uint8_t *MaskF, int nPelLog, const __m256i &dwords_time256, const __m256i &dwords_256_time256, const __m256i &dwords_ref_pitch, const __m256i &dwords_hoffsets) { __m256i dwords_w = _mm256_add_epi32(_mm256_set1_epi32(w << nPelLog), dwords_hoffsets); __m256i dstF = lookup_AVX2(VXFullF, VYFullF, prefF, w, dwords_time256, dwords_ref_pitch, dwords_w); __m256i dstB = lookup_AVX2(VXFullB, VYFullB, prefB, w, dwords_256_time256, dwords_ref_pitch, dwords_w); __m256i maskf = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)&MaskF[w])); __m256i maskb = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)&MaskB[w])); const __m256i dwords_255 = _mm256_set1_epi32(255); __m256i maskf_inv = _mm256_sub_epi32(dwords_255, maskf); __m256i maskb_inv = _mm256_sub_epi32(dwords_255, maskb); __m256i f, b; if (sizeof(PixelType) == 1) { __m256i dstF_dstB = _mm256_or_si256(dstF, _mm256_slli_epi32(dstB, 16)); maskf = _mm256_or_si256(_mm256_slli_epi32(maskf, 16), maskf_inv); maskb = _mm256_or_si256(maskb, _mm256_slli_epi32(maskb_inv, 16)); f = _mm256_madd_epi16(dstF_dstB, maskf); b = _mm256_madd_epi16(dstF_dstB, maskb); } else { __m256i dstF_maskf_inv = _mm256_mullo_epi32(dstF, maskf_inv); __m256i dstB_maskb_inv = _mm256_mullo_epi32(dstB, maskb_inv); __m256i dstB_maskf = _mm256_mullo_epi32(dstB, maskf); __m256i dstF_maskb = _mm256_mullo_epi32(dstF, maskb); f = _mm256_add_epi32(dstF_maskf_inv, dstB_maskf); b = _mm256_add_epi32(dstB_maskb_inv, dstF_maskb); } f = _mm256_add_epi32(f, dwords_255); b = _mm256_add_epi32(b, dwords_255); f = _mm256_srai_epi32(f, 8); b = _mm256_srai_epi32(b, 8); if (sizeof(PixelType) == 1) { f = _mm256_madd_epi16(f, dwords_256_time256); b = _mm256_madd_epi16(b, dwords_time256); } else { f = _mm256_mullo_epi32(f, dwords_256_time256); b = _mm256_mullo_epi32(b, dwords_time256); } __m256i dst = _mm256_add_epi32(f, b); dst = _mm256_srai_epi32(dst, 8); dst = _mm256_packus_epi32(dst, dst); dst = _mm256_permute4x64_epi64(dst, 0xe8); // 0b11101000 - copy third qword to second qword __m128i dst128 = _mm256_castsi256_si128(dst); if (sizeof(PixelType) == 1) { dst128 = _mm_packus_epi16(dst128, dst128); _mm_storel_epi64((__m128i *)&pdst[w], dst128); } else { _mm_storeu_si128((__m128i *)&pdst[w], dst128); } }
l1 = _mm_shuffle_epi8(l1, _mm_setr_epi8(2, 2, 1, 0, 5, 5, 4, 3, 8, 8, 7, 6, 11, 11, 10, 9)); /* Combine into a single 256-bit register: */ str = _mm256_castsi128_si256(l0); str = _mm256_insertf128_si256(str, l1, 1); /* Mask to pass through only the lower 6 bits of one byte: */ mask = _mm256_set1_epi32(0x3F000000); /* Shift bits by 2, mask in only the first byte: */ res = _mm256_and_si256(_mm256_srli_epi32(str, 2), mask); mask = _mm256_srli_epi32(mask, 8); /* Shift bits by 4, mask in only the second byte: */ res = _mm256_or_si256(res, _mm256_and_si256(_mm256_srli_epi32(str, 4), mask)); mask = _mm256_srli_epi32(mask, 8); /* Shift bits by 6, mask in only the third byte: */ res = _mm256_or_si256(res, _mm256_and_si256(_mm256_srli_epi32(str, 6) , mask)); mask = _mm256_srli_epi32(mask, 8); /* No shift necessary for the fourth byte because we duplicated * the third byte to this position; just mask: */ res = _mm256_or_si256(res, _mm256_and_si256(str, mask)); /* Reorder to 32-bit little-endian: */ res = _mm256_shuffle_epi8(res, _mm256_setr_epi8( 3, 2, 1, 0, 7, 6, 5, 4,
void vec_i8_cnt_dosage2(const int8_t *p, int8_t *out, size_t n, int8_t val, int8_t missing, int8_t missing_substitute) { #ifdef COREARRAY_SIMD_SSE2 // header 1, 16-byte aligned size_t h = (16 - ((size_t)out & 0x0F)) & 0x0F; for (; (n > 0) && (h > 0); n--, h--, p+=2) { *out ++ = ((p[0] == missing) || (p[1] == missing)) ? missing_substitute : (p[0]==val ? 1 : 0) + (p[1]==val ? 1 : 0); } // body, SSE2 const __m128i val16 = _mm_set1_epi8(val); const __m128i miss16 = _mm_set1_epi8(missing); const __m128i sub16 = _mm_set1_epi8(missing_substitute); const __m128i mask = _mm_set1_epi16(0x00FF); # ifdef COREARRAY_SIMD_AVX2 // header 2, 32-byte aligned if ((n >= 16) && ((size_t)out & 0x10)) { __m128i w1 = MM_LOADU_128((__m128i const*)p); p += 16; __m128i w2 = MM_LOADU_128((__m128i const*)p); p += 16; __m128i v1 = _mm_packus_epi16(_mm_and_si128(w1, mask), _mm_and_si128(w2, mask)); __m128i v2 = _mm_packus_epi16(_mm_srli_epi16(w1, 8), _mm_srli_epi16(w2, 8)); __m128i c = _mm_setzero_si128(); c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v1, val16)); c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v2, val16)); w1 = _mm_cmpeq_epi8(v1, miss16); w2 = _mm_cmpeq_epi8(v2, miss16); __m128i w = _mm_or_si128(w1, w2); c = _mm_or_si128(_mm_and_si128(w, sub16), _mm_andnot_si128(w, c)); _mm_store_si128((__m128i *)out, c); n -= 16; out += 16; } const __m256i val32 = _mm256_set1_epi8(val); const __m256i miss32 = _mm256_set1_epi8(missing); const __m256i sub32 = _mm256_set1_epi8(missing_substitute); const __m256i mask2 = _mm256_set1_epi16(0x00FF); for (; n >= 32; n-=32) { __m256i w1 = MM_LOADU_256((__m256i const*)p); p += 32; __m256i w2 = MM_LOADU_256((__m256i const*)p); p += 32; __m256i v1 = _mm256_packus_epi16(_mm256_and_si256(w1, mask2), _mm256_and_si256(w2, mask2)); __m256i v2 = _mm256_packus_epi16(_mm256_srli_epi16(w1, 8), _mm256_srli_epi16(w2, 8)); __m256i c = _mm256_setzero_si256(); c = _mm256_sub_epi8(c, _mm256_cmpeq_epi8(v1, val32)); c = _mm256_sub_epi8(c, _mm256_cmpeq_epi8(v2, val32)); w1 = _mm256_cmpeq_epi8(v1, miss32); w2 = _mm256_cmpeq_epi8(v2, miss32); __m256i w = _mm256_or_si256(w1, w2); c = _mm256_or_si256(_mm256_and_si256(w, sub32), _mm256_andnot_si256(w, c)); c = _mm256_permute4x64_epi64(c, 0xD8); _mm256_store_si256((__m256i *)out, c); out += 32; } # endif // SSE2 only for (; n >= 16; n-=16) { __m128i w1 = MM_LOADU_128((__m128i const*)p); p += 16; __m128i w2 = MM_LOADU_128((__m128i const*)p); p += 16; __m128i v1 = _mm_packus_epi16(_mm_and_si128(w1, mask), _mm_and_si128(w2, mask)); __m128i v2 = _mm_packus_epi16(_mm_srli_epi16(w1, 8), _mm_srli_epi16(w2, 8)); __m128i c = _mm_setzero_si128(); c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v1, val16)); c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v2, val16)); w1 = _mm_cmpeq_epi8(v1, miss16); w2 = _mm_cmpeq_epi8(v2, miss16); __m128i w = _mm_or_si128(w1, w2); c = _mm_or_si128(_mm_and_si128(w, sub16), _mm_andnot_si128(w, c)); _mm_store_si128((__m128i *)out, c); out += 16; } #endif // tail for (; n > 0; n--, p+=2) { *out ++ = ((p[0] == missing) || (p[1] == missing)) ? missing_substitute : (p[0]==val ? 1 : 0) + (p[1]==val ? 1 : 0); } }
/// get the number of non-zero size_t vec_i8_cnt_nonzero(const int8_t *p, size_t n) { size_t ans = 0; #ifdef COREARRAY_SIMD_SSE2 const __m128i ZERO = { 0LL, 0LL }; const __m128i ONES = { 0x0101010101010101LL, 0x0101010101010101LL }; const __m128i ONE = { 1LL, 1LL }; // header 1, 16-byte aligned size_t h = (16 - ((size_t)p & 0x0F)) & 0x0F; for (; (n > 0) && (h > 0); n--, h--) ans += (*p++) ? 1 : 0; # ifdef COREARRAY_SIMD_AVX2 // header 2, 32-byte aligned if ((n >= 16) && ((size_t)p & 0x10)) { __m128i c = _mm_cmpeq_epi8(_mm_load_si128((__m128i const*)p), ZERO); __m128i bit = _mm_and_si128(c, ONES); p += 16; n -= 16; uint64_t array[2] __attribute__((aligned(16))); *((__m128i*)array) = bit; ans += 16 - POPCNT_U64(array[0]) - POPCNT_U64(array[1]); } const __m256i ZERO2 = { 0LL, 0LL, 0LL, 0LL }; const __m256i ONES2 = { 0x0101010101010101LL, 0x0101010101010101LL, 0x0101010101010101LL, 0x0101010101010101LL }; // body, AVX2 for (; n >= 256; n -= 256) { __m256i c = _mm256_cmpeq_epi8(_mm256_load_si256((__m256i const*)p), ZERO2); __m256i bit = _mm256_and_si256(c, ONES2); p += 32; c = _mm256_cmpeq_epi8(_mm256_load_si256((__m256i const*)p), ZERO2); bit = _mm256_or_si256(_mm256_sll_epi64(bit, ONE), _mm256_and_si256(c, ONES2)); p += 32; c = _mm256_cmpeq_epi8(_mm256_load_si256((__m256i const*)p), ZERO2); bit = _mm256_or_si256(_mm256_sll_epi64(bit, ONE), _mm256_and_si256(c, ONES2)); p += 32; c = _mm256_cmpeq_epi8(_mm256_load_si256((__m256i const*)p), ZERO2); bit = _mm256_or_si256(_mm256_sll_epi64(bit, ONE), _mm256_and_si256(c, ONES2)); p += 32; c = _mm256_cmpeq_epi8(_mm256_load_si256((__m256i const*)p), ZERO2); bit = _mm256_or_si256(_mm256_sll_epi64(bit, ONE), _mm256_and_si256(c, ONES2)); p += 32; c = _mm256_cmpeq_epi8(_mm256_load_si256((__m256i const*)p), ZERO2); bit = _mm256_or_si256(_mm256_sll_epi64(bit, ONE), _mm256_and_si256(c, ONES2)); p += 32; c = _mm256_cmpeq_epi8(_mm256_load_si256((__m256i const*)p), ZERO2); bit = _mm256_or_si256(_mm256_sll_epi64(bit, ONE), _mm256_and_si256(c, ONES2)); p += 32; c = _mm256_cmpeq_epi8(_mm256_load_si256((__m256i const*)p), ZERO2); bit = _mm256_or_si256(_mm256_sll_epi64(bit, ONE), _mm256_and_si256(c, ONES2)); p += 32; uint64_t array[4] __attribute__((aligned(32))); *((__m256i*)array) = bit; ans += 256 - POPCNT_U64(array[0]) - POPCNT_U64(array[1]) - POPCNT_U64(array[2]) - POPCNT_U64(array[3]); } # endif // body, SSE2 for (; n >= 128; n -= 128) { __m128i c = _mm_cmpeq_epi8(_mm_load_si128((__m128i const*)p), ZERO); __m128i bit = _mm_and_si128(c, ONES); p += 16; c = _mm_cmpeq_epi8(_mm_load_si128((__m128i const*)p), ZERO); bit = _mm_or_si128(_mm_sll_epi64(bit, ONE), _mm_and_si128(c, ONES)); p += 16; c = _mm_cmpeq_epi8(_mm_load_si128((__m128i const*)p), ZERO); bit = _mm_or_si128(_mm_sll_epi64(bit, ONE), _mm_and_si128(c, ONES)); p += 16; c = _mm_cmpeq_epi8(_mm_load_si128((__m128i const*)p), ZERO); bit = _mm_or_si128(_mm_sll_epi64(bit, ONE), _mm_and_si128(c, ONES)); p += 16; c = _mm_cmpeq_epi8(_mm_load_si128((__m128i const*)p), ZERO); bit = _mm_or_si128(_mm_sll_epi64(bit, ONE), _mm_and_si128(c, ONES)); p += 16; c = _mm_cmpeq_epi8(_mm_load_si128((__m128i const*)p), ZERO); bit = _mm_or_si128(_mm_sll_epi64(bit, ONE), _mm_and_si128(c, ONES)); p += 16; c = _mm_cmpeq_epi8(_mm_load_si128((__m128i const*)p), ZERO); bit = _mm_or_si128(_mm_sll_epi64(bit, ONE), _mm_and_si128(c, ONES)); p += 16; c = _mm_cmpeq_epi8(_mm_load_si128((__m128i const*)p), ZERO); bit = _mm_or_si128(_mm_sll_epi64(bit, ONE), _mm_and_si128(c, ONES)); p += 16; uint64_t array[2] __attribute__((aligned(16))); 
*((__m128i*)array) = bit; ans += 128 - POPCNT_U64(array[0]) - POPCNT_U64(array[1]); } for (; n >= 16; n -= 16) { __m128i c = _mm_cmpeq_epi8(_mm_load_si128((__m128i const*)p), ZERO); __m128i bit = _mm_and_si128(c, ONES); p += 16; uint64_t array[2] __attribute__((aligned(16))); *((__m128i*)array) = bit; ans += 16 - POPCNT_U64(array[0]) - POPCNT_U64(array[1]); } #else // header, 8-byte aligned size_t h = (8 - ((size_t)p & 0x07)) & 0x07; for (; (n > 0) && (h > 0); n--, h--) ans += (*p++) ? 1 : 0; // body, unroll for (; n >= 8; n -= 8) { ans += (p[0] ? 1 : 0) + (p[1] ? 1 : 0) + (p[2] ? 1 : 0) + (p[3] ? 1 : 0) + (p[4] ? 1 : 0) + (p[5] ? 1 : 0) + (p[6] ? 1 : 0) + (p[7] ? 1 : 0); p += 8; } #endif // tail for (; n > 0; n--) ans += (*p++) ? 1 : 0; return ans; }
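Both vec_i8_cnt_dosage2 and vec_i8_cnt_nonzero above lean on a few helpers from the library's headers; plausible definitions are sketched here as assumptions rather than the library's actual code.

/* Assumed definitions of the helpers used above (the originals live in
 * the library's headers): unaligned 128/256-bit loads and a 64-bit
 * population count. */
#define MM_LOADU_128(p)  _mm_loadu_si128((__m128i const*)(p))
#define MM_LOADU_256(p)  _mm256_loadu_si256((__m256i const*)(p))
#define POPCNT_U64(x)    __builtin_popcountll(x)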
// # From To Add Characters // 1 [43] [62] +19 + // 2 [47] [63] +16 / // 3 [48..57] [52..61] +4 0..9 // 4 [65..90] [0..25] -65 A..Z // 5 [97..122] [26..51] -71 a..z // (6) Everything else => invalid input const __m256i set1 = CMPEQ(str, '+'); const __m256i set2 = CMPEQ(str, '/'); const __m256i set3 = RANGE(str, '0', '9'); const __m256i set4 = RANGE(str, 'A', 'Z'); const __m256i set5 = RANGE(str, 'a', 'z'); __m256i delta = REPLACE(set1, 19); delta = _mm256_or_si256(delta, REPLACE(set2, 16)); delta = _mm256_or_si256(delta, REPLACE(set3, 4)); delta = _mm256_or_si256(delta, REPLACE(set4, -65)); delta = _mm256_or_si256(delta, REPLACE(set5, -71)); // Check for invalid input: if any of the delta values are zero, // fall back on bytewise code to do error checking and reporting: if (_mm256_movemask_epi8(CMPEQ(delta, 0))) { break; } // Now simply add the delta values to the input: str = _mm256_add_epi8(str, delta); // Reshuffle the input to packed 12-byte output format: str = dec_reshuffle(str);
void vec_i8_replace(int8_t *p, size_t n, int8_t val, int8_t substitute) { #ifdef COREARRAY_SIMD_SSE2 // header 1, 16-byte aligned size_t h = (16 - ((size_t)p & 0x0F)) & 0x0F; for (; (n > 0) && (h > 0); n--, h--, p++) if (*p == val) *p = substitute; // body, SSE2 const __m128i mask = _mm_set1_epi8(val); const __m128i sub = _mm_set1_epi8(substitute); # ifdef COREARRAY_SIMD_AVX2 // header 2, 32-byte aligned if ((n >= 16) && ((size_t)p & 0x10)) { __m128i v = _mm_load_si128((__m128i const*)p); __m128i c = _mm_cmpeq_epi8(v, mask); if (_mm_movemask_epi8(c)) { _mm_store_si128((__m128i *)p, _mm_or_si128(_mm_and_si128(c, sub), _mm_andnot_si128(c, v))); } n -= 16; p += 16; } const __m256i mask2 = _mm256_set1_epi8(val); const __m256i sub32 = _mm256_set1_epi8(substitute); const __m256i zero = _mm256_setzero_si256(); const __m256i ones = _mm256_cmpeq_epi64(zero, zero); for (; n >= 32; n-=32, p+=32) { __m256i v = _mm256_load_si256((__m256i const*)p); __m256i c = _mm256_cmpeq_epi8(v, mask2); if (_mm256_movemask_epi8(c)) { // TODO _mm256_store_si256((__m256i *)p, _mm256_or_si256(_mm256_and_si256(c, sub32), _mm256_andnot_si256(c, v))); } } # endif for (; n >= 16; n-=16, p+=16) { __m128i v = _mm_load_si128((__m128i const*)p); __m128i c = _mm_cmpeq_epi8(v, mask); if (_mm_movemask_epi8(c)) _mm_maskmoveu_si128(sub, c, (char*)p); } #endif // tail for (; n > 0; n--, p++) if (*p == val) *p = substitute; }
__m256i test_mm256_or_si256(__m256i a, __m256i b) { // CHECK: or <4 x i64> return _mm256_or_si256(a, b); }
static void sfid_render_cache_rt_write_simd8_rgba_uint32_linear(struct thread *t, const struct sfid_render_cache_args *args) { const int slice_y = args->rt.minimum_array_element * args->rt.qpitch; const int x = t->grf[1].uw[4]; const int y = t->grf[1].uw[5] + slice_y; const struct reg *src = &t->grf[args->src]; __m128i *base0 = args->rt.pixels + x * args->rt.cpp + y * args->rt.stride; __m128i *base1 = (void *) base0 + args->rt.stride; __m256i rg0145 = _mm256_unpacklo_epi32(src[0].ireg, src[1].ireg); __m256i rg2367 = _mm256_unpackhi_epi32(src[0].ireg, src[1].ireg); __m256i ba0145 = _mm256_unpacklo_epi32(src[2].ireg, src[3].ireg); __m256i ba2367 = _mm256_unpackhi_epi32(src[2].ireg, src[3].ireg); __m256i rgba04 = _mm256_unpacklo_epi64(rg0145, ba0145); __m256i rgba15 = _mm256_unpackhi_epi64(rg0145, ba0145); __m256i rgba26 = _mm256_unpacklo_epi64(rg2367, ba2367); __m256i rgba37 = _mm256_unpackhi_epi64(rg2367, ba2367); struct reg mask = { .ireg = t->mask_q1 }; if (mask.d[0] < 0) base0[0] = _mm256_extractf128_si256(rgba04, 0); if (mask.d[1] < 0) base0[1] = _mm256_extractf128_si256(rgba15, 0); if (mask.d[2] < 0) base1[0] = _mm256_extractf128_si256(rgba26, 0); if (mask.d[3] < 0) base1[1] = _mm256_extractf128_si256(rgba37, 0); if (mask.d[4] < 0) base0[2] = _mm256_extractf128_si256(rgba04, 1); if (mask.d[5] < 0) base0[3] = _mm256_extractf128_si256(rgba15, 1); if (mask.d[6] < 0) base1[2] = _mm256_extractf128_si256(rgba26, 1); if (mask.d[7] < 0) base1[3] = _mm256_extractf128_si256(rgba37, 1); } static void write_uint16_linear(struct thread *t, const struct sfid_render_cache_args *args, __m256i r, __m256i g, __m256i b, __m256i a) { const int slice_y = args->rt.minimum_array_element * args->rt.qpitch; const int x = t->grf[1].uw[4]; const int y = t->grf[1].uw[5] + slice_y; __m256i rg, ba; rg = _mm256_slli_epi32(g, 16); rg = _mm256_or_si256(rg, r); ba = _mm256_slli_epi32(a, 16); ba = _mm256_or_si256(ba, b); __m256i p0 = _mm256_unpacklo_epi32(rg, ba); __m256i m0 = _mm256_cvtepi32_epi64(_mm256_extractf128_si256(t->mask_q1, 0)); __m256i p1 = _mm256_unpackhi_epi32(rg, ba); __m256i m1 = _mm256_cvtepi32_epi64(_mm256_extractf128_si256(t->mask_q1, 1)); void *base = args->rt.pixels + x * args->rt.cpp + y * args->rt.stride; _mm_maskstore_epi64(base, _mm256_extractf128_si256(m0, 0), _mm256_extractf128_si256(p0, 0)); _mm_maskstore_epi64((base + 16), _mm256_extractf128_si256(m1, 0), _mm256_extractf128_si256(p0, 1)); _mm_maskstore_epi64((base + args->rt.stride), _mm256_extractf128_si256(m0, 1), _mm256_extractf128_si256(p1, 0)); _mm_maskstore_epi64((base + args->rt.stride + 16), _mm256_extractf128_si256(m1, 1), _mm256_extractf128_si256(p1, 1)); } static void sfid_render_cache_rt_write_simd8_rgba_unorm16_linear(struct thread *t, const struct sfid_render_cache_args *args) { __m256i r, g, b, a; const __m256 scale = _mm256_set1_ps(65535.0f); const __m256 half = _mm256_set1_ps(0.5f); struct reg *src = &t->grf[args->src]; r = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[0].reg, scale), half)); g = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[1].reg, scale), half)); b = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[2].reg, scale), half)); a = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[3].reg, scale), half)); write_uint16_linear(t, args, r, g, b, a); }
/** * \brief quantize transformed coefficents * */ void kvz_quant_flat_avx2(const encoder_state_t * const state, coeff_t *coef, coeff_t *q_coef, int32_t width, int32_t height, int8_t type, int8_t scan_idx, int8_t block_type) { const encoder_control_t * const encoder = state->encoder_control; const uint32_t log2_block_size = kvz_g_convert_to_bit[width] + 2; const uint32_t * const scan = kvz_g_sig_last_scan[scan_idx][log2_block_size - 1]; int32_t qp_scaled = kvz_get_scaled_qp(type, state->global->QP, (encoder->bitdepth - 8) * 6); const uint32_t log2_tr_size = kvz_g_convert_to_bit[width] + 2; const int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)("\0\3\1\2"[type]); const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_size - 2][scalinglist_type][qp_scaled % 6]; const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - log2_tr_size; //!< Represents scaling through forward transform const int32_t q_bits = QUANT_SHIFT + qp_scaled / 6 + transform_shift; const int32_t add = ((state->global->slicetype == KVZ_SLICE_I) ? 171 : 85) << (q_bits - 9); const int32_t q_bits8 = q_bits - 8; assert(quant_coeff[0] <= (1 << 15) - 1 && quant_coeff[0] >= -(1 << 15)); //Assuming flat values to fit int16_t uint32_t ac_sum = 0; __m256i v_ac_sum = _mm256_setzero_si256(); __m256i v_quant_coeff = _mm256_set1_epi16(quant_coeff[0]); for (int32_t n = 0; n < width * height; n += 16) { __m256i v_level = _mm256_loadu_si256((__m256i*)&(coef[n])); __m256i v_sign = _mm256_cmpgt_epi16(_mm256_setzero_si256(), v_level); v_sign = _mm256_or_si256(v_sign, _mm256_set1_epi16(1)); v_level = _mm256_abs_epi16(v_level); __m256i low_a = _mm256_unpacklo_epi16(v_level, _mm256_set1_epi16(0)); __m256i high_a = _mm256_unpackhi_epi16(v_level, _mm256_set1_epi16(0)); __m256i low_b = _mm256_unpacklo_epi16(v_quant_coeff, _mm256_set1_epi16(0)); __m256i high_b = _mm256_unpackhi_epi16(v_quant_coeff, _mm256_set1_epi16(0)); __m256i v_level32_a = _mm256_madd_epi16(low_a, low_b); __m256i v_level32_b = _mm256_madd_epi16(high_a, high_b); v_level32_a = _mm256_add_epi32(v_level32_a, _mm256_set1_epi32(add)); v_level32_b = _mm256_add_epi32(v_level32_b, _mm256_set1_epi32(add)); v_level32_a = _mm256_srai_epi32(v_level32_a, q_bits); v_level32_b = _mm256_srai_epi32(v_level32_b, q_bits); v_level = _mm256_packs_epi32(v_level32_a, v_level32_b); v_level = _mm256_sign_epi16(v_level, v_sign); _mm256_storeu_si256((__m256i*)&(q_coef[n]), v_level); v_ac_sum = _mm256_add_epi32(v_ac_sum, v_level32_a); v_ac_sum = _mm256_add_epi32(v_ac_sum, v_level32_b); } __m128i temp = _mm_add_epi32(_mm256_castsi256_si128(v_ac_sum), _mm256_extracti128_si256(v_ac_sum, 1)); temp = _mm_add_epi32(temp, _mm_shuffle_epi32(temp, KVZ_PERMUTE(2, 3, 0, 1))); temp = _mm_add_epi32(temp, _mm_shuffle_epi32(temp, KVZ_PERMUTE(1, 0, 1, 0))); ac_sum += _mm_cvtsi128_si32(temp); if (!(encoder->sign_hiding && ac_sum >= 2)) return; int32_t delta_u[LCU_WIDTH*LCU_WIDTH >> 2]; for (int32_t n = 0; n < width * height; n += 16) { __m256i v_level = _mm256_loadu_si256((__m256i*)&(coef[n])); v_level = _mm256_abs_epi16(v_level); __m256i low_a = _mm256_unpacklo_epi16(v_level, _mm256_set1_epi16(0)); __m256i high_a = _mm256_unpackhi_epi16(v_level, _mm256_set1_epi16(0)); __m256i low_b = _mm256_unpacklo_epi16(v_quant_coeff, _mm256_set1_epi16(0)); __m256i high_b = _mm256_unpackhi_epi16(v_quant_coeff, _mm256_set1_epi16(0)); __m256i v_level32_a = _mm256_madd_epi16(low_a, low_b); __m256i v_level32_b = _mm256_madd_epi16(high_a, high_b); v_level32_a = _mm256_add_epi32(v_level32_a, 
_mm256_set1_epi32(add)); v_level32_b = _mm256_add_epi32(v_level32_b, _mm256_set1_epi32(add)); v_level32_a = _mm256_srai_epi32(v_level32_a, q_bits); v_level32_b = _mm256_srai_epi32(v_level32_b, q_bits); v_level = _mm256_packs_epi32(v_level32_a, v_level32_b); __m256i v_coef = _mm256_loadu_si256((__m256i*)&(coef[n])); __m256i v_coef_a = _mm256_unpacklo_epi16(_mm256_abs_epi16(v_coef), _mm256_set1_epi16(0)); __m256i v_coef_b = _mm256_unpackhi_epi16(_mm256_abs_epi16(v_coef), _mm256_set1_epi16(0)); __m256i v_quant_coeff_a = _mm256_unpacklo_epi16(v_quant_coeff, _mm256_set1_epi16(0)); __m256i v_quant_coeff_b = _mm256_unpackhi_epi16(v_quant_coeff, _mm256_set1_epi16(0)); v_coef_a = _mm256_madd_epi16(v_coef_a, v_quant_coeff_a); v_coef_b = _mm256_madd_epi16(v_coef_b, v_quant_coeff_b); v_coef_a = _mm256_sub_epi32(v_coef_a, _mm256_slli_epi32(_mm256_unpacklo_epi16(v_level, _mm256_set1_epi16(0)), q_bits) ); v_coef_b = _mm256_sub_epi32(v_coef_b, _mm256_slli_epi32(_mm256_unpackhi_epi16(v_level, _mm256_set1_epi16(0)), q_bits) ); v_coef_a = _mm256_srai_epi32(v_coef_a, q_bits8); v_coef_b = _mm256_srai_epi32(v_coef_b, q_bits8); _mm_storeu_si128((__m128i*)&(delta_u[n+0*4]), _mm256_castsi256_si128(v_coef_a)); _mm_storeu_si128((__m128i*)&(delta_u[n+2*4]), _mm256_extracti128_si256(v_coef_a, 1)); _mm_storeu_si128((__m128i*)&(delta_u[n+1*4]), _mm256_castsi256_si128(v_coef_b)); _mm_storeu_si128((__m128i*)&(delta_u[n+3*4]), _mm256_extracti128_si256(v_coef_b, 1)); } if (ac_sum >= 2) { #define SCAN_SET_SIZE 16 #define LOG2_SCAN_SET_SIZE 4 int32_t n, last_cg = -1, abssum = 0, subset, subpos; for (subset = (width*height - 1) >> LOG2_SCAN_SET_SIZE; subset >= 0; subset--) { int32_t first_nz_pos_in_cg = SCAN_SET_SIZE, last_nz_pos_in_cg = -1; subpos = subset << LOG2_SCAN_SET_SIZE; abssum = 0; // Find last coeff pos for (n = SCAN_SET_SIZE - 1; n >= 0; n--) { if (q_coef[scan[n + subpos]]) { last_nz_pos_in_cg = n; break; } } // First coeff pos for (n = 0; n <SCAN_SET_SIZE; n++) { if (q_coef[scan[n + subpos]]) { first_nz_pos_in_cg = n; break; } } // Sum all kvz_quant coeffs between first and last for (n = first_nz_pos_in_cg; n <= last_nz_pos_in_cg; n++) { abssum += q_coef[scan[n + subpos]]; } if (last_nz_pos_in_cg >= 0 && last_cg == -1) { last_cg = 1; } if (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4) { int32_t signbit = (q_coef[scan[subpos + first_nz_pos_in_cg]] > 0 ? 0 : 1); if (signbit != (abssum & 0x1)) { // compare signbit with sum_parity int32_t min_cost_inc = 0x7fffffff, min_pos = -1, cur_cost = 0x7fffffff; int16_t final_change = 0, cur_change = 0; for (n = (last_cg == 1 ? last_nz_pos_in_cg : SCAN_SET_SIZE - 1); n >= 0; n--) { uint32_t blkPos = scan[n + subpos]; if (q_coef[blkPos] != 0) { if (delta_u[blkPos] > 0) { cur_cost = -delta_u[blkPos]; cur_change = 1; } else if (n == first_nz_pos_in_cg && abs(q_coef[blkPos]) == 1) { cur_cost = 0x7fffffff; } else { cur_cost = delta_u[blkPos]; cur_change = -1; } } else if (n < first_nz_pos_in_cg && ((coef[blkPos] >= 0) ? 0 : 1) != signbit) { cur_cost = 0x7fffffff; } else { cur_cost = -delta_u[blkPos]; cur_change = 1; } if (cur_cost < min_cost_inc) { min_cost_inc = cur_cost; final_change = cur_change; min_pos = blkPos; } } // CG loop if (q_coef[min_pos] == 32767 || q_coef[min_pos] == -32768) { final_change = -1; } if (coef[min_pos] >= 0) q_coef[min_pos] += final_change; else q_coef[min_pos] -= final_change; } // Hide } if (last_cg == 1) last_cg = 0; } #undef SCAN_SET_SIZE #undef LOG2_SCAN_SET_SIZE }