示例#1
0
__m256i test_mm256_cvtepi32_epi64(__m128i a) {
  // CHECK-LABEL: test_mm256_cvtepi32_epi64
  // CHECK: sext <4 x i32> %{{.*}} to <4 x i64>
  return _mm256_cvtepi32_epi64(a);
}
示例#2
0
__m256i test_mm256_cvtepi32_epi64(__m128i a) {
  // CHECK: @llvm.x86.avx2.pmovsxdq
  return _mm256_cvtepi32_epi64(a);
}
示例#3
0
文件: render-cache.c 项目: krh/ksim
static void
sfid_render_cache_rt_write_simd8_rgba_uint32_linear(struct thread *t,
        const struct sfid_render_cache_args *args)
{
    const int slice_y = args->rt.minimum_array_element * args->rt.qpitch;
    const int x = t->grf[1].uw[4];
    const int y = t->grf[1].uw[5] + slice_y;
    const struct reg *src = &t->grf[args->src];

    __m128i *base0 = args->rt.pixels + x * args->rt.cpp + y * args->rt.stride;
    __m128i *base1 = (void *) base0 + args->rt.stride;

    __m256i rg0145 = _mm256_unpacklo_epi32(src[0].ireg, src[1].ireg);
    __m256i rg2367 = _mm256_unpackhi_epi32(src[0].ireg, src[1].ireg);
    __m256i ba0145 = _mm256_unpacklo_epi32(src[2].ireg, src[3].ireg);
    __m256i ba2367 = _mm256_unpackhi_epi32(src[2].ireg, src[3].ireg);

    __m256i rgba04 = _mm256_unpacklo_epi64(rg0145, ba0145);
    __m256i rgba15 = _mm256_unpackhi_epi64(rg0145, ba0145);

    __m256i rgba26 = _mm256_unpacklo_epi64(rg2367, ba2367);
    __m256i rgba37 = _mm256_unpackhi_epi64(rg2367, ba2367);

    struct reg mask = { .ireg = t->mask_q1 };

    if (mask.d[0] < 0)
        base0[0] = _mm256_extractf128_si256(rgba04, 0);
    if (mask.d[1] < 0)
        base0[1] = _mm256_extractf128_si256(rgba15, 0);
    if (mask.d[2] < 0)
        base1[0] = _mm256_extractf128_si256(rgba26, 0);
    if (mask.d[3] < 0)
        base1[1] = _mm256_extractf128_si256(rgba37, 0);

    if (mask.d[4] < 0)
        base0[2] = _mm256_extractf128_si256(rgba04, 1);
    if (mask.d[5] < 0)
        base0[3] = _mm256_extractf128_si256(rgba15, 1);
    if (mask.d[6] < 0)
        base1[2] = _mm256_extractf128_si256(rgba26, 1);
    if (mask.d[7] < 0)
        base1[3] = _mm256_extractf128_si256(rgba37, 1);
}

static void
write_uint16_linear(struct thread *t,
                    const struct sfid_render_cache_args *args,
                    __m256i r, __m256i g, __m256i b, __m256i a)
{
    const int slice_y = args->rt.minimum_array_element * args->rt.qpitch;
    const int x = t->grf[1].uw[4];
    const int y = t->grf[1].uw[5] + slice_y;
    __m256i rg, ba;

    rg = _mm256_slli_epi32(g, 16);
    rg = _mm256_or_si256(rg, r);
    ba = _mm256_slli_epi32(a, 16);
    ba = _mm256_or_si256(ba, b);

    __m256i p0 = _mm256_unpacklo_epi32(rg, ba);
    __m256i m0 = _mm256_cvtepi32_epi64(_mm256_extractf128_si256(t->mask_q1, 0));

    __m256i p1 = _mm256_unpackhi_epi32(rg, ba);
    __m256i m1 = _mm256_cvtepi32_epi64(_mm256_extractf128_si256(t->mask_q1, 1));

    void *base = args->rt.pixels + x * args->rt.cpp + y * args->rt.stride;

    _mm_maskstore_epi64(base,
                        _mm256_extractf128_si256(m0, 0),
                        _mm256_extractf128_si256(p0, 0));
    _mm_maskstore_epi64((base + 16),
                        _mm256_extractf128_si256(m1, 0),
                        _mm256_extractf128_si256(p0, 1));

    _mm_maskstore_epi64((base + args->rt.stride),
                        _mm256_extractf128_si256(m0, 1),
                        _mm256_extractf128_si256(p1, 0));
    _mm_maskstore_epi64((base + args->rt.stride + 16),
                        _mm256_extractf128_si256(m1, 1),
                        _mm256_extractf128_si256(p1, 1));
}

static void
sfid_render_cache_rt_write_simd8_rgba_unorm16_linear(struct thread *t,
        const struct sfid_render_cache_args *args)
{
    __m256i r, g, b, a;
    const __m256 scale = _mm256_set1_ps(65535.0f);
    const __m256 half =  _mm256_set1_ps(0.5f);
    struct reg *src = &t->grf[args->src];

    r = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[0].reg, scale), half));
    g = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[1].reg, scale), half));
    b = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[2].reg, scale), half));
    a = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[3].reg, scale), half));

    write_uint16_linear(t, args, r, g, b, a);
}