// FileCheck codegen test: _mm256_cvtepi32_epi64 should lower to a plain
// vector sign-extend (<4 x i32> -> <4 x i64>) rather than a target intrinsic.
// NOTE(review): the CHECK lines below are FileCheck directives, not ordinary
// comments — they must match the compiler's IR output exactly.
__m256i test_mm256_cvtepi32_epi64(__m128i a) {
  // CHECK-LABEL: test_mm256_cvtepi32_epi64
  // CHECK: sext <4 x i32> %{{.*}} to <4 x i64>
  return _mm256_cvtepi32_epi64(a);
}
// Older variant of the same test: here the expected lowering is the AVX2
// pmovsxdq target intrinsic instead of generic IR sext.
// NOTE(review): this redefines the same symbol as the test above — the two
// appear to come from different revisions of the test file; confirm only one
// is meant to be compiled per translation unit.
__m256i test_mm256_cvtepi32_epi64(__m128i a) {
  // CHECK: @llvm.x86.avx2.pmovsxdq
  return _mm256_cvtepi32_epi64(a);
}
/* SIMD8 RGBA render-target write, 32 bits per channel (16 bytes per pixel),
 * linear (non-tiled) surface layout.
 *
 * The 8 pixels arrive channel-planar: src[0..3] hold R, G, B, A for all 8
 * pixels. The unpack ladder below transposes that into per-pixel RGBA
 * quadruples, then each pixel is stored individually, predicated on the sign
 * bit of its execution-mask lane.
 *
 * NOTE(review): the base0/base1 split assumes the SIMD8 payload covers two
 * rows of the surface (pixels laid out in 2x2 subspans), with base1 one
 * stride below base0 — TODO confirm against the dispatch layout.
 */
static void
sfid_render_cache_rt_write_simd8_rgba_uint32_linear(struct thread *t,
						    const struct sfid_render_cache_args *args)
{
	/* Y origin of the selected array slice, then the subspan origin from
	 * the thread payload (grf1 uw4/uw5 hold the pixel X/Y). */
	const int slice_y = args->rt.minimum_array_element * args->rt.qpitch;
	const int x = t->grf[1].uw[4];
	const int y = t->grf[1].uw[5] + slice_y;
	const struct reg *src = &t->grf[args->src];

	/* Byte-addressed into the surface; the (void *) arithmetic for base1
	 * relies on the GNU void-pointer-arithmetic extension. */
	__m128i *base0 = args->rt.pixels + x * args->rt.cpp + y * args->rt.stride;
	__m128i *base1 = (void *) base0 + args->rt.stride;

	/* Transpose channel-planar -> pixel-interleaved.
	 * Step 1: interleave 32-bit R/G and B/A pairs. Lane naming encodes
	 * which pixels each ymm holds, e.g. rg0145 = RG for pixels 0,1,4,5. */
	__m256i rg0145 = _mm256_unpacklo_epi32(src[0].ireg, src[1].ireg);
	__m256i rg2367 = _mm256_unpackhi_epi32(src[0].ireg, src[1].ireg);
	__m256i ba0145 = _mm256_unpacklo_epi32(src[2].ireg, src[3].ireg);
	__m256i ba2367 = _mm256_unpackhi_epi32(src[2].ireg, src[3].ireg);

	/* Step 2: combine RG and BA halves into full 128-bit RGBA pixels;
	 * rgba04 holds pixel 0 in its low lane and pixel 4 in its high lane. */
	__m256i rgba04 = _mm256_unpacklo_epi64(rg0145, ba0145);
	__m256i rgba15 = _mm256_unpackhi_epi64(rg0145, ba0145);
	__m256i rgba26 = _mm256_unpacklo_epi64(rg2367, ba2367);
	__m256i rgba37 = _mm256_unpackhi_epi64(rg2367, ba2367);

	/* Per-pixel predication: lane i of the execution mask is all-ones
	 * (reads as negative) when pixel i is live. */
	struct reg mask = { .ireg = t->mask_q1 };
	if (mask.d[0] < 0)
		base0[0] = _mm256_extractf128_si256(rgba04, 0);
	if (mask.d[1] < 0)
		base0[1] = _mm256_extractf128_si256(rgba15, 0);
	if (mask.d[2] < 0)
		base1[0] = _mm256_extractf128_si256(rgba26, 0);
	if (mask.d[3] < 0)
		base1[1] = _mm256_extractf128_si256(rgba37, 0);
	if (mask.d[4] < 0)
		base0[2] = _mm256_extractf128_si256(rgba04, 1);
	if (mask.d[5] < 0)
		base0[3] = _mm256_extractf128_si256(rgba15, 1);
	if (mask.d[6] < 0)
		base1[2] = _mm256_extractf128_si256(rgba26, 1);
	if (mask.d[7] < 0)
		base1[3] = _mm256_extractf128_si256(rgba37, 1);
}

/* Pack 8 pixels of 32-bit-per-channel R/G/B/A into 16-bit-per-channel
 * RGBA (8 bytes per pixel) and store them to a linear surface, predicated
 * by the thread's execution mask.
 *
 * Each 32-bit channel value is assumed to already fit in 16 bits (the
 * callers produce values clamped/scaled into that range — TODO confirm);
 * the high halves are overwritten by the shifted G/A channels.
 */
static void
write_uint16_linear(struct thread *t,
		    const struct sfid_render_cache_args *args,
		    __m256i r, __m256i g, __m256i b, __m256i a)
{
	const int slice_y = args->rt.minimum_array_element * args->rt.qpitch;
	const int x = t->grf[1].uw[4];
	const int y = t->grf[1].uw[5] + slice_y;
	__m256i rg, ba;

	/* Pack two 16-bit channels per 32-bit lane: rg = G << 16 | R,
	 * ba = A << 16 | B. */
	rg = _mm256_slli_epi32(g, 16);
	rg = _mm256_or_si256(rg, r);
	ba = _mm256_slli_epi32(a, 16);
	ba = _mm256_or_si256(ba, b);

	/* Interleave rg/ba into full 64-bit RGBA pixels. p0 holds pixels
	 * 0,1 (low lane) and 4,5 (high lane); p1 holds 2,3 and 6,7.
	 * The execution mask is widened 32 -> 64 bits so each mask element
	 * covers one packed pixel for the maskstores below; m0 covers
	 * pixels 0-3, m1 covers pixels 4-7. */
	__m256i p0 = _mm256_unpacklo_epi32(rg, ba);
	__m256i m0 = _mm256_cvtepi32_epi64(_mm256_extractf128_si256(t->mask_q1, 0));

	__m256i p1 = _mm256_unpackhi_epi32(rg, ba);
	__m256i m1 = _mm256_cvtepi32_epi64(_mm256_extractf128_si256(t->mask_q1, 1));

	void *base = args->rt.pixels + x * args->rt.cpp + y * args->rt.stride;

	/* Four masked 16-byte stores: top row gets pixels 0,1 then 4,5;
	 * the row one stride down gets pixels 2,3 then 6,7 (2x2 subspan
	 * layout — NOTE(review): matches the uint32 path above; confirm
	 * against the SIMD8 dispatch layout). */
	_mm_maskstore_epi64(base,
			    _mm256_extractf128_si256(m0, 0),
			    _mm256_extractf128_si256(p0, 0));
	_mm_maskstore_epi64((base + 16),
			    _mm256_extractf128_si256(m1, 0),
			    _mm256_extractf128_si256(p0, 1));

	_mm_maskstore_epi64((base + args->rt.stride),
			    _mm256_extractf128_si256(m0, 1),
			    _mm256_extractf128_si256(p1, 0));
	_mm_maskstore_epi64((base + args->rt.stride + 16),
			    _mm256_extractf128_si256(m1, 1),
			    _mm256_extractf128_si256(p1, 1));
}

/* SIMD8 RGBA render-target write, UNORM16 format: convert each float
 * channel in [0, 1] to a 16-bit unsigned integer by scaling to 65535 with
 * round-to-nearest (the +0.5 before the float->int conversion), then let
 * write_uint16_linear() pack and store the result. */
static void
sfid_render_cache_rt_write_simd8_rgba_unorm16_linear(struct thread *t,
						     const struct sfid_render_cache_args *args)
{
	__m256i r, g, b, a;
	const __m256 scale = _mm256_set1_ps(65535.0f);
	const __m256 half = _mm256_set1_ps(0.5f);
	struct reg *src = &t->grf[args->src];

	r = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[0].reg, scale), half));
	g = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[1].reg, scale), half));
	b = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[2].reg, scale), half));
	a = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[3].reg, scale), half));

	write_uint16_linear(t, args, r, g, b, a);
}