/* Pack a SIMD8 span of 8-bit RGBA pixels and write it to a linearly
 * addressed (untiled) render target, honoring the per-pixel execution
 * mask.
 *
 * r/g/b/a each hold one 0-255 component per pixel in a 32-bit lane.
 * The eight pixels cover a 4x2 block: after the permute below, dwords
 * 0-3 land on row y and dwords 4-7 on row y + 1.
 */
static void
write_uint8_linear(struct thread *t,
		   const struct sfid_render_cache_args *args,
		   __m256i r, __m256i g, __m256i b, __m256i a)
{
	/* Offset y by the selected array slice of the render target. */
	const int slice_y = args->rt.minimum_array_element * args->rt.qpitch;
	/* Span origin comes from the pixel-shader dispatch payload in g1. */
	const int x = t->grf[1].uw[4];
	const int y = t->grf[1].uw[5] + slice_y;
	__m256i rgba;

	/* Pack one dword per pixel: r in bits 0-7, g in 8-15, b in 16-23,
	 * a in 24-31 -- i.e. R8G8B8A8 byte order in little-endian memory. */
	rgba = _mm256_slli_epi32(a, 8);
	rgba = _mm256_or_si256(rgba, b);
	rgba = _mm256_slli_epi32(rgba, 8);
	rgba = _mm256_or_si256(rgba, g);
	rgba = _mm256_slli_epi32(rgba, 8);
	rgba = _mm256_or_si256(rgba, r);

/* Immediate selector for 4-lane permutes/shuffles: result lane i takes
 * source lane x, y, z or w respectively (same encoding as _MM_SHUFFLE
 * but with arguments in low-to-high lane order). */
#define SWIZZLE(x, y, z, w) \
	( ((x) << 0) | ((y) << 2) | ((z) << 4) | ((w) << 6) )

	/* Swizzle two middle pixel pairs so that dword 0-3 and 4-7
	 * form linear owords of pixels. */
	rgba = _mm256_permute4x64_epi64(rgba, SWIZZLE(0, 2, 1, 3));
	/* The execution mask gets the same reordering so each mask lane
	 * stays aligned with its pixel. */
	__m256i mask = _mm256_permute4x64_epi64(t->mask_q1, SWIZZLE(0, 2, 1, 3));

	/* NOTE(review): arithmetic on void * relies on the GCC/Clang
	 * extension treating sizeof(void) as 1. */
	void *base = args->rt.pixels + x * args->rt.cpp + y * args->rt.stride;

	/* Masked 16-byte store of row y, then row y + 1 one stride below. */
	_mm_maskstore_epi32(base,
			    _mm256_extractf128_si256(mask, 0),
			    _mm256_extractf128_si256(rgba, 0));
	_mm_maskstore_epi32(base + args->rt.stride,
			    _mm256_extractf128_si256(mask, 1),
			    _mm256_extractf128_si256(rgba, 1));
}
/* SIMD8 render-target write of UNORM8 pixels to a Y-major tiled surface.
 *
 * Converts the four float channels of the message payload to 8-bit
 * unorm, packs them in the surface's byte order, and masked-stores the
 * resulting 4x2 pixel block at its tiled offset.
 */
static void
sfid_render_cache_rt_write_simd8_unorm8_ymajor(struct thread *t,
					       const struct sfid_render_cache_args *args)
{
	/* Offset y by the selected array slice of the render target. */
	const int slice_y = args->rt.minimum_array_element * args->rt.qpitch;
	const int x = t->grf[1].uw[4];
	const int y = t->grf[1].uw[5] + slice_y;
	const int cpp = 4;	/* both handled formats are 32 bpp */
	struct reg *src = &t->grf[args->src];

	/* multiply by 255 and add 0.5 implements round-to-nearest for the
	 * [0.0, 1.0] -> [0, 255] unorm conversion. */
	const __m256 scale = _mm256_set1_ps(255.0f);
	const __m256 half = _mm256_set1_ps(0.5f);
	__m256i r, g, b, a;
	__m256i rgba;

	/* The two cases differ only in which payload channels map to the
	 * r and b bytes of the packed pixel. */
	switch (args->rt.format) {
	case SF_R8G8B8A8_UNORM:
		r = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[0].reg, scale), half));
		g = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[1].reg, scale), half));
		b = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[2].reg, scale), half));
		a = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[3].reg, scale), half));
		break;
	case SF_B8G8R8A8_UNORM:
		b = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[0].reg, scale), half));
		g = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[1].reg, scale), half));
		r = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[2].reg, scale), half));
		a = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[3].reg, scale), half));
		break;
	default:
		stub("unorm8 ymajor format");
		return;
	}

	/* Pack one dword per pixel: bits 0-7 get r, 8-15 g, 16-23 b,
	 * 24-31 a (little-endian byte order). */
	rgba = _mm256_slli_epi32(a, 8);
	rgba = _mm256_or_si256(rgba, b);
	rgba = _mm256_slli_epi32(rgba, 8);
	rgba = _mm256_or_si256(rgba, g);
	rgba = _mm256_slli_epi32(rgba, 8);
	rgba = _mm256_or_si256(rgba, r);

	/* Swizzle two middle pixel pairs so that dword 0-3 and 4-7
	 * form linear owords of pixels. */
	rgba = _mm256_permute4x64_epi64(rgba, SWIZZLE(0, 2, 1, 3));
	__m256i mask = _mm256_permute4x64_epi64(t->mask_q1, SWIZZLE(0, 2, 1, 3));

	void *base = ymajor_offset(args->rt.pixels, x, y, args->rt.stride, cpp);

	/* In a Y-major tile the oword for the next row of this column is
	 * 16 bytes further on, hence base and base + 16. */
	_mm_maskstore_epi32(base,
			    _mm256_extractf128_si256(mask, 0),
			    _mm256_extractf128_si256(rgba, 0));
	_mm_maskstore_epi32(base + 16,
			    _mm256_extractf128_si256(mask, 1),
			    _mm256_extractf128_si256(rgba, 1));
}
/* SIMD8 render-target write of BGRA unorm8 pixels to an X-major tiled
 * surface, with optional fixed-function blending against the existing
 * destination pixels and sRGB gamma correction.
 */
static void
sfid_render_cache_rt_write_simd8_bgra_unorm8_xmajor(struct thread *t,
						    const struct sfid_render_cache_args *args)
{
	__m256i argb;
	const float scale = 255.0f;
	/* Copy the four source channel registers locally so blending and
	 * gamma correction can modify them without touching the GRF. */
	struct reg src[4];
	memcpy(src, &t->grf[args->src], sizeof(src));

	const int cpp = 4;
	/* Offset y by the selected array slice of the render target. */
	const int slice_y = args->rt.minimum_array_element * args->rt.qpitch;
	const int x = t->grf[1].uw[4];
	const int y = t->grf[1].uw[5] + slice_y;

	void *base = xmajor_offset(args->rt.pixels, x, y, args->rt.stride, cpp);

	if (gt.blend.enable) {
		/* Load unorm8 */
		/* Fetch the two destination rows (an X-major tile row is
		 * 512 bytes, so y + 1 is at base + 512), combine them into
		 * one ymm and reorder to match the source pixel order
		 * before blending into src. */
		__m128i lo = _mm_load_si128(base);
		__m128i hi = _mm_load_si128(base + 512);
		__m256i dst_argb = _mm256_inserti128_si256(_mm256_castsi128_si256(lo),
							   hi, 1);
		dst_argb = _mm256_permute4x64_epi64(dst_argb,
						    SWIZZLE(0, 2, 1, 3));
		blend_unorm8_argb(src, dst_argb);
	}

	gamma_correct(args->rt.format, src);

	const __m256i r = to_unorm(src[0].reg, scale);
	const __m256i g = to_unorm(src[1].reg, scale);
	const __m256i b = to_unorm(src[2].reg, scale);
	const __m256i a = to_unorm(src[3].reg, scale);

	/* Pack one dword per pixel: bits 0-7 get b, 8-15 g, 16-23 r,
	 * 24-31 a -- B8G8R8A8 byte order in little-endian memory
	 * (despite the variable being named "argb"). */
	argb = _mm256_slli_epi32(a, 8);
	argb = _mm256_or_si256(argb, r);
	argb = _mm256_slli_epi32(argb, 8);
	argb = _mm256_or_si256(argb, g);
	argb = _mm256_slli_epi32(argb, 8);
	argb = _mm256_or_si256(argb, b);

	/* Swizzle two middle pixel pairs so that dword 0-3 and 4-7
	 * form linear owords of pixels. */
	argb = _mm256_permute4x64_epi64(argb, SWIZZLE(0, 2, 1, 3));
	__m256i mask = _mm256_permute4x64_epi64(t->mask_q1, SWIZZLE(0, 2, 1, 3));

	/* Masked store of row y, then row y + 1 (512 bytes further in an
	 * X-major tile). */
	_mm_maskstore_epi32(base,
			    _mm256_extractf128_si256(mask, 0),
			    _mm256_extractf128_si256(argb, 0));
	_mm_maskstore_epi32(base + 512,
			    _mm256_extractf128_si256(mask, 1),
			    _mm256_extractf128_si256(argb, 1));
}
static void sfid_render_cache_rt_write_rep16_bgra_unorm8_xmajor(struct thread *t, const struct sfid_render_cache_args *args) { const __m128 scale = _mm_set1_ps(255.0f); const __m128 half = _mm_set1_ps(0.5f); struct reg src[1]; memcpy(src, &t->grf[args->src], sizeof(src)); if (srgb_format(args->rt.format)) { const __m256 inv_gamma = _mm256_set1_ps(1.0f / 2.4f); src[0].reg = _ZGVdN8vv_powf(src[0].reg, inv_gamma); /* Don't gamma correct alpha */ src[0].f[3] = t->grf[args->src].f[3]; } __m128 bgra = _mm_shuffle_ps(_mm256_castps256_ps128(src[0].reg), _mm256_castps256_ps128(src[0].reg), SWIZZLE(2, 1, 0, 3)); bgra = _mm_mul_ps(bgra, scale); bgra = _mm_add_ps(bgra, half); __m128i bgra_i = _mm_cvtps_epi32(bgra); bgra_i = _mm_packus_epi32(bgra_i, bgra_i); bgra_i = _mm_packus_epi16(bgra_i, bgra_i); /* Swizzle two middle mask pairs so that dword 0-3 and 4-7 * form linear owords of pixels. */ __m256i mask = _mm256_permute4x64_epi64(t->mask_q1, SWIZZLE(0, 2, 1, 3)); const int slice_y = args->rt.minimum_array_element * args->rt.qpitch; const int x0 = t->grf[1].uw[4]; const int y0 = t->grf[1].uw[5] + slice_y; const int cpp = 4; void *base0 = xmajor_offset(args->rt.pixels, x0, y0, args->rt.stride, cpp); _mm_maskstore_epi32(base0, _mm256_extractf128_si256(mask, 0), bgra_i); _mm_maskstore_epi32(base0 + 512, _mm256_extractf128_si256(mask, 1), bgra_i); const int x1 = t->grf[1].uw[8]; const int y1 = t->grf[1].uw[9] + slice_y; void *base1 = xmajor_offset(args->rt.pixels, x1, y1, args->rt.stride, 4); __m256i mask1 = _mm256_permute4x64_epi64(t->mask_q2, SWIZZLE(0, 2, 1, 3)); _mm_maskstore_epi32(base1, _mm256_extractf128_si256(mask1, 0), bgra_i); _mm_maskstore_epi32(base1 + 512, _mm256_extractf128_si256(mask1, 1), bgra_i); }
/* clang CodeGen test: _mm_maskstore_epi32 must lower to the
 * llvm.x86.avx2.maskstore.d intrinsic.  The CHECK line below is a
 * FileCheck directive matched against the emitted IR -- do not edit. */
void test_mm_maskstore_epi32(int *a, __m128i m, __m128i b) {
  // CHECK: @llvm.x86.avx2.maskstore.d
  _mm_maskstore_epi32(a, m, b);
}
/* clang CodeGen test (stricter variant): _mm_maskstore_epi32 must lower
 * to a call to llvm.x86.avx2.maskstore.d with i8*, <4 x i32> mask and
 * <4 x i32> value operands.  The CHECK lines are FileCheck directives
 * matched against the emitted IR -- do not edit. */
void test_mm_maskstore_epi32(int *a, __m128i m, __m128i b) {
  // CHECK-LABEL: test_mm_maskstore_epi32
  // CHECK: call void @llvm.x86.avx2.maskstore.d(i8* %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
  _mm_maskstore_epi32(a, m, b);
}