Example #1
File: render-cache.c  Project: krh/ksim
static void
write_uint8_linear(struct thread *t,
                   const struct sfid_render_cache_args *args,
                   __m256i r, __m256i g, __m256i b, __m256i a)
{
    const int slice_y = args->rt.minimum_array_element * args->rt.qpitch;
    const int x = t->grf[1].uw[4];
    const int y = t->grf[1].uw[5] + slice_y;
    __m256i rgba;

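    /* Pack the four 8-bit channels into one dword per pixel, with r
     * ending up in the low byte (R8G8B8A8 byte order in memory). */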
    rgba = _mm256_slli_epi32(a, 8);
    rgba = _mm256_or_si256(rgba, b);
    rgba = _mm256_slli_epi32(rgba, 8);
    rgba = _mm256_or_si256(rgba, g);
    rgba = _mm256_slli_epi32(rgba, 8);
    rgba = _mm256_or_si256(rgba, r);

#define SWIZZLE(x, y, z, w) \
        ( ((x) << 0) | ((y) << 2) | ((z) << 4) | ((w) << 6) )

    /* Swizzle the two middle pixel pairs so that dwords 0-3 and 4-7
     * form linear owords of pixels. */
    rgba = _mm256_permute4x64_epi64(rgba, SWIZZLE(0, 2, 1, 3));
    __m256i mask = _mm256_permute4x64_epi64(t->mask_q1, SWIZZLE(0, 2, 1, 3));

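    /* Linear (untiled) surface: pixel (x, y) lives at
     * x * cpp + y * stride bytes from the start of the surface. */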
    void *base = args->rt.pixels + x * args->rt.cpp + y * args->rt.stride;

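    /* Store one oword (four pixels) to each of the two rows covered by
     * this SIMD8 group, writing only the lanes enabled in the mask. */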
    _mm_maskstore_epi32(base,
                        _mm256_extractf128_si256(mask, 0),
                        _mm256_extractf128_si256(rgba, 0));
    _mm_maskstore_epi32(base + args->rt.stride,
                        _mm256_extractf128_si256(mask, 1),
                        _mm256_extractf128_si256(rgba, 1));
}
Example #2
File: render-cache.c  Project: krh/ksim
static void
sfid_render_cache_rt_write_simd8_unorm8_ymajor(struct thread *t,
        const struct sfid_render_cache_args *args)
{
    const int slice_y = args->rt.minimum_array_element * args->rt.qpitch;
    const int x = t->grf[1].uw[4];
    const int y = t->grf[1].uw[5] + slice_y;
    const int cpp = 4;
    struct reg *src = &t->grf[args->src];
    const __m256 scale = _mm256_set1_ps(255.0f);
    const __m256 half =  _mm256_set1_ps(0.5f);
    __m256i r, g, b, a;
    __m256i rgba;

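    /* Convert each float channel to unorm8: scale to 0..255 and add 0.5
     * before the float-to-int conversion; the channel order depends on
     * the render-target format. */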
    switch (args->rt.format) {
    case SF_R8G8B8A8_UNORM:
        r = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[0].reg, scale), half));
        g = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[1].reg, scale), half));
        b = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[2].reg, scale), half));
        a = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[3].reg, scale), half));
        break;
    case SF_B8G8R8A8_UNORM:
        b = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[0].reg, scale), half));
        g = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[1].reg, scale), half));
        r = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[2].reg, scale), half));
        a = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[3].reg, scale), half));
        break;
    default:
        stub("unorm8 ymajor format");
        return;
    }

    rgba = _mm256_slli_epi32(a, 8);
    rgba = _mm256_or_si256(rgba, b);
    rgba = _mm256_slli_epi32(rgba, 8);
    rgba = _mm256_or_si256(rgba, g);
    rgba = _mm256_slli_epi32(rgba, 8);
    rgba = _mm256_or_si256(rgba, r);

    /* Swizzle the two middle pixel pairs so that dwords 0-3 and 4-7
     * form linear owords of pixels. */
    rgba = _mm256_permute4x64_epi64(rgba, SWIZZLE(0, 2, 1, 3));
    __m256i mask = _mm256_permute4x64_epi64(t->mask_q1, SWIZZLE(0, 2, 1, 3));

    void *base = ymajor_offset(args->rt.pixels, x, y, args->rt.stride, cpp);

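    /* Y-tiled surface: within a 16-byte-wide tile column, vertically
     * adjacent rows are 16 bytes apart, so the second oword is written
     * at base + 16. */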
    _mm_maskstore_epi32(base,
                        _mm256_extractf128_si256(mask, 0),
                        _mm256_extractf128_si256(rgba, 0));
    _mm_maskstore_epi32(base + 16,
                        _mm256_extractf128_si256(mask, 1),
                        _mm256_extractf128_si256(rgba, 1));
}
Example #3
File: render-cache.c  Project: krh/ksim
static void
sfid_render_cache_rt_write_simd8_bgra_unorm8_xmajor(struct thread *t,
        const struct sfid_render_cache_args *args)
{
    __m256i argb;
    const float scale = 255.0f;
    struct reg src[4];

    memcpy(src, &t->grf[args->src], sizeof(src));

    const int cpp = 4;
    const int slice_y = args->rt.minimum_array_element * args->rt.qpitch;
    const int x = t->grf[1].uw[4];
    const int y = t->grf[1].uw[5] + slice_y;
    void *base = xmajor_offset(args->rt.pixels, x, y, args->rt.stride, cpp);

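    /* With blending enabled, read back the destination pixels so the
     * source can be blended against them before the final write. */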
    if (gt.blend.enable) {
        /* Load unorm8 */
        __m128i lo = _mm_load_si128(base);
        __m128i hi = _mm_load_si128(base + 512);
        __m256i dst_argb = _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
        dst_argb = _mm256_permute4x64_epi64(dst_argb, SWIZZLE(0, 2, 1, 3));

        blend_unorm8_argb(src, dst_argb);
    }

    gamma_correct(args->rt.format, src);

    const __m256i r = to_unorm(src[0].reg, scale);
    const __m256i g = to_unorm(src[1].reg, scale);
    const __m256i b = to_unorm(src[2].reg, scale);
    const __m256i a = to_unorm(src[3].reg, scale);

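    /* Pack the channels with b in the low byte (B8G8R8A8 byte order
     * in memory). */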
    argb = _mm256_slli_epi32(a, 8);
    argb = _mm256_or_si256(argb, r);
    argb = _mm256_slli_epi32(argb, 8);
    argb = _mm256_or_si256(argb, g);
    argb = _mm256_slli_epi32(argb, 8);
    argb = _mm256_or_si256(argb, b);

    /* Swizzle the two middle pixel pairs so that dwords 0-3 and 4-7
     * form linear owords of pixels. */
    argb = _mm256_permute4x64_epi64(argb, SWIZZLE(0, 2, 1, 3));
    __m256i mask = _mm256_permute4x64_epi64(t->mask_q1, SWIZZLE(0, 2, 1, 3));

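    /* X-tiled surface: a tile row is 512 bytes wide, so the same x
     * range on the next row down is 512 bytes further on. */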
    _mm_maskstore_epi32(base,
                        _mm256_extractf128_si256(mask, 0),
                        _mm256_extractf128_si256(argb, 0));
    _mm_maskstore_epi32(base + 512,
                        _mm256_extractf128_si256(mask, 1),
                        _mm256_extractf128_si256(argb, 1));
}
Example #4
File: render-cache.c  Project: krh/ksim
static void
sfid_render_cache_rt_write_rep16_bgra_unorm8_xmajor(struct thread *t,
        const struct sfid_render_cache_args *args)
{
    const __m128 scale = _mm_set1_ps(255.0f);
    const __m128 half =  _mm_set1_ps(0.5f);
    struct reg src[1];

    memcpy(src, &t->grf[args->src], sizeof(src));

    if (srgb_format(args->rt.format)) {
        const __m256 inv_gamma = _mm256_set1_ps(1.0f / 2.4f);
        src[0].reg = _ZGVdN8vv_powf(src[0].reg, inv_gamma);
        /* Don't gamma correct alpha */
        src[0].f[3] = t->grf[args->src].f[3];
    }

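    /* Reorder the replicated source color from RGBA to BGRA lane order. */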
    __m128 bgra = _mm_shuffle_ps(_mm256_castps256_ps128(src[0].reg),
                                 _mm256_castps256_ps128(src[0].reg),
                                 SWIZZLE(2, 1, 0, 3));

    bgra = _mm_mul_ps(bgra, scale);
    bgra = _mm_add_ps(bgra, half);

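    /* Convert to integers and pack down to bytes; every dword of bgra_i
     * now holds the same 32-bit BGRA pixel. */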
    __m128i bgra_i = _mm_cvtps_epi32(bgra);
    bgra_i = _mm_packus_epi32(bgra_i, bgra_i);
    bgra_i = _mm_packus_epi16(bgra_i, bgra_i);

    /* Swizzle the two middle mask pairs so that dwords 0-3 and 4-7
     * form linear owords of pixels. */
    __m256i mask = _mm256_permute4x64_epi64(t->mask_q1, SWIZZLE(0, 2, 1, 3));

    const int slice_y = args->rt.minimum_array_element * args->rt.qpitch;
    const int x0 = t->grf[1].uw[4];
    const int y0 = t->grf[1].uw[5] + slice_y;
    const int cpp = 4;
    void *base0 = xmajor_offset(args->rt.pixels, x0, y0, args->rt.stride, cpp);

    _mm_maskstore_epi32(base0, _mm256_extractf128_si256(mask, 0), bgra_i);
    _mm_maskstore_epi32(base0 + 512, _mm256_extractf128_si256(mask, 1), bgra_i);

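    /* The rep16 message covers 16 pixels; the second set of coordinates
     * and the q2 mask handle the other eight. */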
    const int x1 = t->grf[1].uw[8];
    const int y1 = t->grf[1].uw[9] + slice_y;
    void *base1 = xmajor_offset(args->rt.pixels, x1, y1, args->rt.stride, cpp);

    __m256i mask1 = _mm256_permute4x64_epi64(t->mask_q2, SWIZZLE(0, 2, 1, 3));
    _mm_maskstore_epi32(base1, _mm256_extractf128_si256(mask1, 0), bgra_i);
    _mm_maskstore_epi32(base1 + 512, _mm256_extractf128_si256(mask1, 1), bgra_i);
}
Example #5
void test_mm_maskstore_epi32(int *a, __m128i m, __m128i b) {
  // CHECK: @llvm.x86.avx2.maskstore.d
  _mm_maskstore_epi32(a, m, b);
}
Example #6
void test_mm_maskstore_epi32(int *a, __m128i m, __m128i b) {
  // CHECK-LABEL: test_mm_maskstore_epi32
  // CHECK: call void @llvm.x86.avx2.maskstore.d(i8* %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
  _mm_maskstore_epi32(a, m, b);
}
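
For comparison, here is a minimal, self-contained sketch (not taken from either project above) of what _mm_maskstore_epi32 does on its own: each 32-bit lane is stored only if the high bit of the corresponding mask element is set, and the remaining destination lanes are left untouched. It requires AVX2 (e.g. compile with -mavx2).

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    int dst[4] = { -1, -1, -1, -1 };

    /* Lanes 0 and 2 have the high bit set, so only they are written. */
    __m128i mask = _mm_setr_epi32(-1, 0, -1, 0);
    __m128i val  = _mm_setr_epi32(10, 20, 30, 40);

    _mm_maskstore_epi32(dst, mask, val);

    printf("%d %d %d %d\n", dst[0], dst[1], dst[2], dst[3]);  /* prints: 10 -1 30 -1 */
    return 0;
}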