template <class T>
static fstb_FORCEINLINE void TransLut_store_avx2 (T *dst_ptr, __m256 val)
{
	_mm256_store_si256 (
		reinterpret_cast <__m256i *> (dst_ptr),
		_mm256_cvtps_epi32 (val)
	);
}
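// Standalone check (my own, not part of the source above) of the rounding
// behaviour the store helper relies on: _mm256_cvtps_epi32 converts using
// the current MXCSR rounding mode (round-to-nearest-even by default), while
// _mm256_cvttps_epi32 always truncates toward zero. Note the helper also
// assumes dst_ptr is 32-byte aligned, as _mm256_store_si256 requires.
#include <immintrin.h>
#include <cstdio>

int main ()
{
	const __m256 v = _mm256_setr_ps (0.5f, 1.5f, 2.5f, -0.5f, -1.5f, 2.4f, 2.6f, -2.5f);
	alignas (32) int nearest [8];
	alignas (32) int truncated [8];
	_mm256_store_si256 (reinterpret_cast <__m256i *> (nearest),   _mm256_cvtps_epi32 (v));
	_mm256_store_si256 (reinterpret_cast <__m256i *> (truncated), _mm256_cvttps_epi32 (v));
	for (int i = 0; i < 8; ++i)
	{
		// Ties like 0.5 and 2.5 go to the even integer in the first column.
		std::printf ("%d %d\n", nearest [i], truncated [i]);
	}
	return 0;
}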
inline void avx2_xy_to_uv_f(__m256 x, __m256 y, __m256i& u, __m256i& v)
{
  // Convert X,Y first into U,V space, then round to the nearest integer.
  // That gets us close to the correct answer, mapping XY to a
  // lozenge-shaped space rather than a hexagonal one. We then correct the
  // four regions that lie outside the hexagonal cell, assigning them to
  // their correct neighboring cell.
  // Writer's note: see ~/Google Drive/Work/calin

  // double dv = y*c_vy_inv;
  // double du = x-dv*c_vx;
  // u = std::lround(du);
  // v = std::lround(dv);
  // du -= u;
  // dv -= v;
  y = _mm256_mul_ps(y, calin::math::simd::c_m256(_c_m256_vy_inv));
  x = _mm256_fnmadd_ps(y, calin::math::simd::c_m256(_c_m256_vx), x);
  u = _mm256_cvtps_epi32(x);
  v = _mm256_cvtps_epi32(y);
  x = _mm256_sub_ps(x, _mm256_cvtepi32_ps(u));
  y = _mm256_sub_ps(y, _mm256_cvtepi32_ps(v));

  // double c3 = dv-du;
  const __m256i c3 = _mm256_castps_si256(_mm256_sub_ps(y, x));

  __m256i uvshift;
  __m256i mask;

  // double c1 = du+0.5*dv;
  // double c2 = dv+0.5*du;
  // if(c3<0) {
  //   if(c1>=1) u++;
  //   else if(c2<-1) v--;
  // } else {
  //   if(c2>=1) v++;
  //   else if(c1<-1) u--;
  // }
  uvshift = _mm256_cvtps_epi32(_mm256_fmadd_ps(y, calin::math::simd::c_m256(_c_m256_one_half), x));
  mask = _mm256_srai_epi32(_mm256_xor_si256(uvshift, c3), 31);
  u = _mm256_blendv_epi8(u, _mm256_add_epi32(u, uvshift), mask);

  uvshift = _mm256_cvtps_epi32(_mm256_fmadd_ps(x, calin::math::simd::c_m256(_c_m256_one_half), y));
  mask = _mm256_srai_epi32(_mm256_xor_si256(uvshift, c3), 31);
  v = _mm256_blendv_epi8(_mm256_add_epi32(v, uvshift), v, mask);
}
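// Scalar reference (mine, lifted from the inlined comments above). The
// basis constants are assumptions here: a unit hex grid with the v axis at
// 60 degrees gives c_vx = 0.5 and c_vy = sqrt(3)/2, hence
// c_vy_inv = 2/sqrt(3); the real values live elsewhere in calin.
#include <cmath>

void scalar_xy_to_uv(double x, double y, int& u, int& v)
{
  const double c_vx     = 0.5;
  const double c_vy_inv = 2.0 / std::sqrt(3.0);

  double dv = y * c_vy_inv;
  double du = x - dv * c_vx;
  u = static_cast<int>(std::lround(du));
  v = static_cast<int>(std::lround(dv));
  du -= u;
  dv -= v;

  // Fix up the four corner regions of the lozenge that belong to a
  // neighboring hexagonal cell.
  const double c1 = du + 0.5 * dv;
  const double c2 = dv + 0.5 * du;
  const double c3 = dv - du;
  if (c3 < 0) {
    if (c1 >= 1)      u++;
    else if (c2 < -1) v--;
  } else {
    if (c2 >= 1)      v++;
    else if (c1 < -1) u--;
  }
}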
void mandel_avx(unsigned char *image, const struct spec *s)
{
    __m256 xmin = _mm256_set1_ps(s->xlim[0]);
    __m256 ymin = _mm256_set1_ps(s->ylim[0]);
    __m256 xscale = _mm256_set1_ps((s->xlim[1] - s->xlim[0]) / s->width);
    __m256 yscale = _mm256_set1_ps((s->ylim[1] - s->ylim[0]) / s->height);
    __m256 threshold = _mm256_set1_ps(4);
    __m256 one = _mm256_set1_ps(1);
    __m256 iter_scale = _mm256_set1_ps(1.0f / s->iterations);
    __m256 depth_scale = _mm256_set1_ps(s->depth - 1);
    #pragma omp parallel for schedule(dynamic, 1)
    for (int y = 0; y < s->height; y++) {
        for (int x = 0; x < s->width; x += 8) {
            __m256 mx = _mm256_set_ps(x + 7, x + 6, x + 5, x + 4,
                                      x + 3, x + 2, x + 1, x + 0);
            __m256 my = _mm256_set1_ps(y);
            __m256 cr = _mm256_add_ps(_mm256_mul_ps(mx, xscale), xmin);
            __m256 ci = _mm256_add_ps(_mm256_mul_ps(my, yscale), ymin);
            __m256 zr = cr;
            __m256 zi = ci;
            int k = 1;
            __m256 mk = _mm256_set1_ps(k);
            while (++k < s->iterations) {
                /* Compute z1 from z0 */
                __m256 zr2 = _mm256_mul_ps(zr, zr);
                __m256 zi2 = _mm256_mul_ps(zi, zi);
                __m256 zrzi = _mm256_mul_ps(zr, zi);
                /* zr1 = zr0 * zr0 - zi0 * zi0 + cr */
                /* zi1 = zr0 * zi0 + zr0 * zi0 + ci */
                zr = _mm256_add_ps(_mm256_sub_ps(zr2, zi2), cr);
                zi = _mm256_add_ps(_mm256_add_ps(zrzi, zrzi), ci);

                /* Increment k */
                zr2 = _mm256_mul_ps(zr, zr);
                zi2 = _mm256_mul_ps(zi, zi);
                __m256 mag2 = _mm256_add_ps(zr2, zi2);
                __m256 mask = _mm256_cmp_ps(mag2, threshold, _CMP_LT_OS);
                mk = _mm256_add_ps(_mm256_and_ps(mask, one), mk);

                /* Early bailout? */
                if (_mm256_testz_ps(mask, _mm256_set1_ps(-1)))
                    break;
            }
            mk = _mm256_mul_ps(mk, iter_scale);
            mk = _mm256_sqrt_ps(mk);
            mk = _mm256_mul_ps(mk, depth_scale);
            __m256i pixels = _mm256_cvtps_epi32(mk);
            unsigned char *dst = image + y * s->width * 3 + x * 3;
            unsigned char *src = (unsigned char *)&pixels;
            for (int i = 0; i < 8; i++) {
                dst[i * 3 + 0] = src[i * 4];
                dst[i * 3 + 1] = src[i * 4];
                dst[i * 3 + 2] = src[i * 4];
            }
        }
    }
}
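/* The struct spec definition is not shown in this snippet; the layout below
 * is a guess inferred from the fields mandel_avx() reads. The scalar
 * per-pixel reference mirrors the lane arithmetic (mk only advances while
 * the point stays inside the threshold), so it can spot-check the vector
 * path; note the final (unsigned char) cast truncates where
 * _mm256_cvtps_epi32 rounds to nearest. */
#include <math.h>

struct spec {
    float xlim[2], ylim[2];
    int width, height;
    int iterations, depth;
};

static unsigned char
mandel_scalar_pixel(const struct spec *s, int x, int y)
{
    float cr = x * (s->xlim[1] - s->xlim[0]) / s->width  + s->xlim[0];
    float ci = y * (s->ylim[1] - s->ylim[0]) / s->height + s->ylim[0];
    float zr = cr, zi = ci;
    int k = 1, mk = 1;
    while (++k < s->iterations) {
        float zr1 = zr * zr - zi * zi + cr;
        float zi1 = 2 * zr * zi + ci;
        zr = zr1;
        zi = zi1;
        if (zr * zr + zi * zi >= 4.0f)
            break;
        mk++;
    }
    return (unsigned char)(sqrtf((float)mk / s->iterations) * (s->depth - 1));
}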
static void
sfid_render_cache_rt_write_simd8_unorm8_ymajor(struct thread *t,
					       const struct sfid_render_cache_args *args)
{
	const int slice_y = args->rt.minimum_array_element * args->rt.qpitch;
	const int x = t->grf[1].uw[4];
	const int y = t->grf[1].uw[5] + slice_y;
	const int cpp = 4;
	struct reg *src = &t->grf[args->src];
	const __m256 scale = _mm256_set1_ps(255.0f);
	const __m256 half = _mm256_set1_ps(0.5f);
	__m256i r, g, b, a;
	__m256i rgba;

	switch (args->rt.format) {
	case SF_R8G8B8A8_UNORM:
		r = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[0].reg, scale), half));
		g = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[1].reg, scale), half));
		b = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[2].reg, scale), half));
		a = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[3].reg, scale), half));
		break;
	case SF_B8G8R8A8_UNORM:
		b = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[0].reg, scale), half));
		g = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[1].reg, scale), half));
		r = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[2].reg, scale), half));
		a = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[3].reg, scale), half));
		break;
	default:
		stub("unorm8 ymajor format");
		return;
	}

	rgba = _mm256_slli_epi32(a, 8);
	rgba = _mm256_or_si256(rgba, b);
	rgba = _mm256_slli_epi32(rgba, 8);
	rgba = _mm256_or_si256(rgba, g);
	rgba = _mm256_slli_epi32(rgba, 8);
	rgba = _mm256_or_si256(rgba, r);

	/* Swizzle two middle pixel pairs so that dword 0-3 and 4-7
	 * form linear owords of pixels. */
	rgba = _mm256_permute4x64_epi64(rgba, SWIZZLE(0, 2, 1, 3));
	__m256i mask = _mm256_permute4x64_epi64(t->mask_q1, SWIZZLE(0, 2, 1, 3));

	void *base = ymajor_offset(args->rt.pixels, x, y, args->rt.stride, cpp);
	_mm_maskstore_epi32(base,
			    _mm256_extractf128_si256(mask, 0),
			    _mm256_extractf128_si256(rgba, 0));
	_mm_maskstore_epi32(base + 16,
			    _mm256_extractf128_si256(mask, 1),
			    _mm256_extractf128_si256(rgba, 1));
}
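/* Scalar view (not ksim code) of the channel packing above: three shift-or
 * rounds fold the four 0-255 channel dwords into one a<<24 | b<<16 | g<<8 | r
 * dword, i.e. the little-endian R8G8B8A8 byte order. This sketch uses the
 * usual scalar add-half-then-truncate idiom; the vector code instead adds
 * half and converts with round-to-nearest. */
#include <stdint.h>

static inline uint32_t
pack_unorm8(float r, float g, float b, float a)
{
	uint32_t ri = (uint32_t)(r * 255.0f + 0.5f);
	uint32_t gi = (uint32_t)(g * 255.0f + 0.5f);
	uint32_t bi = (uint32_t)(b * 255.0f + 0.5f);
	uint32_t ai = (uint32_t)(a * 255.0f + 0.5f);

	return ((((ai << 8) | bi) << 8 | gi) << 8) | ri;
}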
static inline __m256i
to_unorm(__m256 reg, float scale_f)
{
	const __m256 scale = _mm256_set1_ps(scale_f);
	const __m256 one = _mm256_set1_ps(1.0f);
	const __m256 zero = _mm256_set1_ps(0.0f);
	const __m256 half = _mm256_set1_ps(0.5f);
	const __m256 clamped = _mm256_max_ps(_mm256_min_ps(reg, one), zero);

	return _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(clamped, scale), half));
}
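/* Standalone check (mine, not from the source above) of a subtle corner of
 * to_unorm(): _mm256_cvtps_epi32 already rounds to nearest-even under the
 * default MXCSR mode, so the explicit +0.5 double-rounds exact ties upward.
 * With scale 255.0f an input of exactly 1.0f becomes 255.5f and converts to
 * 256, one past the 8-bit maximum, unless a later pack saturates it. */
#include <immintrin.h>
#include <stdio.h>

int main(void)
{
	__m256 one = _mm256_set1_ps(1.0f);
	__m256i n = _mm256_cvtps_epi32(
		_mm256_add_ps(_mm256_mul_ps(one, _mm256_set1_ps(255.0f)),
			      _mm256_set1_ps(0.5f)));

	printf("%d\n", _mm256_extract_epi32(n, 0));	/* prints 256 */
	return 0;
}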
inline __m256i avx2_positive_hexid_to_ringid_root(const __m256i hexid)
{
  // The following algorithm works until hexid=12,589,056
  // const unsigned iarg = 1+4*(hexid-1)/3;
  // return (unsigned(std::sqrt(float(iarg)))+1)/2;
  __m256 arg = _mm256_cvtepi32_ps(hexid);
  arg = _mm256_fmsub_ps(arg, calin::math::simd::c_m256(_c_m256_four_thirds),
                        calin::math::simd::c_m256(_c_m256_one_third));
  arg = _mm256_sqrt_ps(arg);
  arg = _mm256_fmadd_ps(arg, calin::math::simd::c_m256(_c_m256_one_half),
                        calin::math::simd::c_m256(_c_m256_one_half));
  arg = _mm256_floor_ps(arg);
  return _mm256_cvtps_epi32(arg);
}
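// Scalar restatement (mine, for checking) of the vector routine above.
// Ring r of a hex grid starts at hexid 3*r*(r-1)+1, so inverting
// 3r^2 - 3r + 1 <= h gives r = floor(sqrt(4h/3 - 1/3)/2 + 1/2), which is
// exactly the fmsub/sqrt/fmadd/floor sequence fed to _mm256_cvtps_epi32.
#include <cmath>
#include <cassert>

inline unsigned scalar_positive_hexid_to_ringid(unsigned hexid)
{
  const float arg = float(hexid) * (4.0f / 3.0f) - (1.0f / 3.0f);
  return unsigned(std::floor(std::sqrt(arg) * 0.5f + 0.5f));
}

int main()
{
  // Rings 1..3 span hexids 1-6, 7-18 and 19-36 respectively.
  for (unsigned h = 1; h <= 6; ++h)   assert(scalar_positive_hexid_to_ringid(h) == 1);
  for (unsigned h = 7; h <= 18; ++h)  assert(scalar_positive_hexid_to_ringid(h) == 2);
  for (unsigned h = 19; h <= 36; ++h) assert(scalar_positive_hexid_to_ringid(h) == 3);
  return 0;
}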
void cvtScale_s16s32f32Line_AVX2(const short* src, int* dst,
                                 float scale, float shift, int width)
{
    int x = 0;

    __m256 scale256 = _mm256_set1_ps(scale);
    __m256 shift256 = _mm256_set1_ps(shift);
    const int shuffle = 0xD8;  // 0b11011000: reorder 64-bit quarters as 0,2,1,3

    for (; x <= width - 16; x += 16)
    {
        __m256i v_src = _mm256_loadu_si256((const __m256i *)(src + x));
        v_src = _mm256_permute4x64_epi64(v_src, shuffle);
        __m256i v_src_lo = _mm256_srai_epi32(_mm256_unpacklo_epi16(v_src, v_src), 16);
        __m256i v_src_hi = _mm256_srai_epi32(_mm256_unpackhi_epi16(v_src, v_src), 16);
        __m256 v_dst0 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(v_src_lo), scale256), shift256);
        __m256 v_dst1 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(v_src_hi), scale256), shift256);
        _mm256_storeu_si256((__m256i *)(dst + x), _mm256_cvtps_epi32(v_dst0));
        _mm256_storeu_si256((__m256i *)(dst + x + 8), _mm256_cvtps_epi32(v_dst1));
    }

    for (; x < width; x++)
        dst[x] = saturate_cast<int>(src[x] * scale + shift);
}
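// Standalone illustration (mine) of the widening trick used above:
// unpacking a register with itself duplicates each 16-bit lane into a
// 32-bit slot, and the arithmetic shift right by 16 sign-extends it. The
// 0xD8 permute pre-swaps the middle 64-bit quarters so the widened results
// come out in source order.
#include <immintrin.h>
#include <cstdio>

int main()
{
    short in[16];
    for (int i = 0; i < 16; ++i) in[i] = (short)(i - 8);

    __m256i v = _mm256_loadu_si256((const __m256i *)in);
    v = _mm256_permute4x64_epi64(v, 0xD8);
    __m256i lo = _mm256_srai_epi32(_mm256_unpacklo_epi16(v, v), 16);
    __m256i hi = _mm256_srai_epi32(_mm256_unpackhi_epi16(v, v), 16);

    int out[16];
    _mm256_storeu_si256((__m256i *)out, lo);
    _mm256_storeu_si256((__m256i *)(out + 8), hi);
    for (int i = 0; i < 16; ++i)
        std::printf("%d ", out[i]);  // prints -8 ... 7 in order
    std::printf("\n");
    return 0;
}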
void TransLut_FindIndexAvx2 <TransLut::MapperLin>::find_index (
	const TransLut::FloatIntMix val_arr [8], __m256i &index, __m256 &frac)
{
	assert (val_arr != 0);

	const __m256   scale     = _mm256_set1_ps (1 << LINLUT_RES_L2);
	const __m256i  offset    = _mm256_set1_epi32 (-LINLUT_MIN_F * (1 << LINLUT_RES_L2));
	const __m256i  val_min   = _mm256_setzero_si256 ();
	const __m256i  val_max   = _mm256_set1_epi32 (LINLUT_SIZE_F - 2);

	const __m256   v         = _mm256_load_ps (reinterpret_cast <const float *> (val_arr));
	const __m256   val_scl   = _mm256_mul_ps (v, scale);
	const __m256i  index_raw = _mm256_cvtps_epi32 (val_scl);
	__m256i        index_tmp = _mm256_add_epi32 (index_raw, offset);
	index_tmp = _mm256_min_epi32 (index_tmp, val_max);
	index     = _mm256_max_epi32 (index_tmp, val_min);
	frac      = _mm256_sub_ps (val_scl, _mm256_cvtepi32_ps (index_raw));
}
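// Generic restatement (not fmtconv's actual code) of the index+fraction
// split used above for linear LUT interpolation: scale the input so one LUT
// step is one unit, take the integer part with round-to-nearest, and keep
// the signed remainder as the interpolation weight. Note frac comes from
// the unclamped index, exactly as in the vector version.
#include <cmath>

struct IndexFrac { int index; float frac; };

inline IndexFrac lut_locate(float val, int res_log2, float lut_min, int lut_size)
{
	const float scaled = val * float(1 << res_log2);
	const int   raw    = int(std::lrint(scaled));          // _mm256_cvtps_epi32 analogue
	int idx = raw + int(-lut_min * float(1 << res_log2));  // shift so the LUT starts at 0
	if (idx < 0)            idx = 0;                       // clamp like val_min/val_max
	if (idx > lut_size - 2) idx = lut_size - 2;            // keep idx+1 in range too
	return { idx, scaled - float(raw) };                   // frac in [-0.5, 0.5]
}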
static __m128i cielabv (union hvrgbpix rgb)
{
    __m128 xvxyz[2] = {_mm_set1_ps(0.5),_mm_set1_ps(0.5)}; //,0.5,0.5,0.5);

    __m128 vcam0 = _mm_setr_ps(cielab_xyz_cam[0][0],cielab_xyz_cam[1][0],cielab_xyz_cam[2][0],0);
    __m128 vcam1 = _mm_setr_ps(cielab_xyz_cam[0][1],cielab_xyz_cam[1][1],cielab_xyz_cam[2][1],0);
    __m128 vcam2 = _mm_setr_ps(cielab_xyz_cam[0][2],cielab_xyz_cam[1][2],cielab_xyz_cam[2][2],0);
    __m128 vrgb0h = _mm_set1_ps(rgb.h.c[0]);
    __m128 vrgb1h = _mm_set1_ps(rgb.h.c[1]);
    __m128 vrgb2h = _mm_set1_ps(rgb.h.c[2]);
    __m128 vrgb0v = _mm_set1_ps(rgb.v.c[0]);
    __m128 vrgb1v = _mm_set1_ps(rgb.v.c[1]);
    __m128 vrgb2v = _mm_set1_ps(rgb.v.c[2]);

    xvxyz[0] = _mm_add_ps(xvxyz[0], _mm_mul_ps(vcam0,vrgb0h));
    xvxyz[0] = _mm_add_ps(xvxyz[0], _mm_mul_ps(vcam1,vrgb1h));
    xvxyz[0] = _mm_add_ps(xvxyz[0], _mm_mul_ps(vcam2,vrgb2h));
    xvxyz[1] = _mm_add_ps(xvxyz[1], _mm_mul_ps(vcam0,vrgb0v));
    xvxyz[1] = _mm_add_ps(xvxyz[1], _mm_mul_ps(vcam1,vrgb1v));
    xvxyz[1] = _mm_add_ps(xvxyz[1], _mm_mul_ps(vcam2,vrgb2v));

    xvxyz[0] = _mm_max_ps(_mm_set1_ps(0),
                          _mm_min_ps(_mm_set1_ps(0xffff),
                                     _mm_round_ps(xvxyz[0], _MM_FROUND_TO_ZERO)));
    xvxyz[1] = _mm_max_ps(_mm_set1_ps(0),
                          _mm_min_ps(_mm_set1_ps(0xffff),
                                     _mm_round_ps(xvxyz[1], _MM_FROUND_TO_ZERO)));
    __m128i loadaddrh = _mm_cvttps_epi32(xvxyz[0]);
    __m128i loadaddrv = _mm_cvttps_epi32(xvxyz[1]);
#ifdef __AVX__
    __m256 vlab,
           vxyz = {cielab_cbrt[_mm_extract_epi32(loadaddrh,1)],
                   cielab_cbrt[_mm_extract_epi32(loadaddrh,0)],
                   cielab_cbrt[_mm_extract_epi32(loadaddrh,1)],
                   0,
                   cielab_cbrt[_mm_extract_epi32(loadaddrv,1)],
                   cielab_cbrt[_mm_extract_epi32(loadaddrv,0)],
                   cielab_cbrt[_mm_extract_epi32(loadaddrv,1)],
                   0},
           vxyz2 = {0,
                    cielab_cbrt[_mm_extract_epi32(loadaddrh,1)],
                    cielab_cbrt[_mm_extract_epi32(loadaddrh,2)],
                    cielab_cbrt[_mm_extract_epi32(loadaddrh,0)],
                    0,
                    cielab_cbrt[_mm_extract_epi32(loadaddrv,1)],
                    cielab_cbrt[_mm_extract_epi32(loadaddrv,2)],
                    cielab_cbrt[_mm_extract_epi32(loadaddrv,0)]};

    vlab = _mm256_sub_ps(vxyz,vxyz2);
    vlab = _mm256_mul_ps(vlab, _mm256_setr_ps(116,500,200,0,116,500,200,0));
    vlab = _mm256_sub_ps(vlab, _mm256_setr_ps(16,0,0,0,16,0,0,0));
    vlab = _mm256_mul_ps(vlab,_mm256_set1_ps(64));
    vlab = _mm256_round_ps(vlab, _MM_FROUND_TO_ZERO);
    __m256i vlabi = _mm256_cvtps_epi32(vlab);
    return _mm_packs_epi32(_mm256_castsi256_si128(vlabi), ((__m128i*)&vlabi)[1]);
#else
    __m128 vlabh, vxyzh = {cielab_cbrt[_mm_extract_epi32(loadaddrh,0)],
                           cielab_cbrt[_mm_extract_epi32(loadaddrh,1)],
                           cielab_cbrt[_mm_extract_epi32(loadaddrh,2)],
                           0};
    __m128 vlabv, vxyzv = {cielab_cbrt[_mm_extract_epi32(loadaddrv,0)],
                           cielab_cbrt[_mm_extract_epi32(loadaddrv,1)],
                           cielab_cbrt[_mm_extract_epi32(loadaddrv,2)],
                           0};

    vlabh = _mm_sub_ps(_mm_shuffle_ps(vxyzh,vxyzh,_MM_SHUFFLE(0,1,0,1)),
                       _mm_shuffle_ps(vxyzh,vxyzh,_MM_SHUFFLE(0,2,1,3)));
    vlabh = _mm_mul_ps(vlabh,_mm_setr_ps(116,500,200,0));
    vlabh = _mm_sub_ps(vlabh,_mm_setr_ps(16,0,0,0));
    vlabh = _mm_mul_ps(vlabh,_mm_set_ps1(64));
    vlabh = _mm_round_ps(vlabh, _MM_FROUND_TO_ZERO);

    vlabv = _mm_sub_ps(_mm_shuffle_ps(vxyzv,vxyzv,_MM_SHUFFLE(0,1,0,1)),
                       _mm_shuffle_ps(vxyzv,vxyzv,_MM_SHUFFLE(0,2,1,3)));
    vlabv = _mm_mul_ps(vlabv,_mm_setr_ps(116,500,200,0));
    vlabv = _mm_sub_ps(vlabv,_mm_setr_ps(16,0,0,0));
    vlabv = _mm_mul_ps(vlabv,_mm_set_ps1(64));
    vlabv = _mm_round_ps(vlabv, _MM_FROUND_TO_ZERO);

    return _mm_set_epi64(_mm_cvtps_pi16(vlabv),_mm_cvtps_pi16(vlabh));
#endif
}
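// Scalar restatement (mine) of what both branches above compute per pixel:
// CIE L*a*b* from XYZ via L = 116*f(Y) - 16, a = 500*(f(X) - f(Y)),
// b = 200*(f(Y) - f(Z)), scaled by 64 into signed 16-bit fixed point. The
// table stand-in below is hypothetical: the real cielab_cbrt bakes in the
// CIE transfer function's linear toe and its own normalization.
#include <cmath>
#include <cstdint>

static float cbrt_lut(int v)           // stand-in for cielab_cbrt[v]
{
    return std::cbrt(v / 65535.0f);
}

inline void xyz_to_lab_fixed(int X, int Y, int Z, int16_t lab[3])
{
    const float fx = cbrt_lut(X), fy = cbrt_lut(Y), fz = cbrt_lut(Z);
    lab[0] = int16_t((116.0f * fy - 16.0f) * 64.0f);
    lab[1] = int16_t((500.0f * (fx - fy)) * 64.0f);
    lab[2] = int16_t((200.0f * (fy - fz)) * 64.0f);
}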
inline void newsincos_ps_dual(avx_m256_t x1, avx_m256_t x2,
		avx_m256_t *s1, avx_m256_t *s2,
		avx_m256_t *c1, avx_m256_t *c2)
{
	avx_m256_t tempa = _ps_sign_mask;
	avx_m256_t tempb = _ps_inv_sign_mask;
	avx_m256_t sign_bit1 = _mm256_and_ps(x1, tempa);
	avx_m256_t sign_bit2 = _mm256_and_ps(x2, tempa);
	x1 = _mm256_and_ps(x1, tempb);
	x2 = _mm256_and_ps(x2, tempb);

	tempa = _ps_cephes_FOPI;
	avx_m256_t y1 = _mm256_mul_ps(x1, tempa);
	avx_m256_t y2 = _mm256_mul_ps(x2, tempa);

	//avx_m256i_t emm21 = _mm256_cvttps_epi32(y1);
	//avx_m256i_t emm22 = _mm256_cvttps_epi32(y2);
	//emm21 = _mm256_add_epi32(emm21, _pi32_1);
	//emm22 = _mm256_add_epi32(emm22, _pi32_1);
	avx_m256i_t emm21 = _mm256_cvttps_epi32(_mm256_add_ps(y1, _ps_1));
	avx_m256i_t emm22 = _mm256_cvttps_epi32(_mm256_add_ps(y2, _ps_1));

	//emm21 = _mm256_and_si256(emm21, _pi32_inv1);
	//emm22 = _mm256_and_si256(emm22, _pi32_inv1);
	emm21 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm21),
			_mm256_castsi256_ps(_pi32_inv1)));
	emm22 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm22),
			_mm256_castsi256_ps(_pi32_inv1)));

	y1 = _mm256_cvtepi32_ps(emm21);
	y2 = _mm256_cvtepi32_ps(emm22);

	//avx_m256i_t tempia = _pi32_2;
	//avx_m256i_t cos_emm21 = _mm256_sub_epi32(emm21, tempia);
	//avx_m256i_t cos_emm22 = _mm256_sub_epi32(emm22, tempia);
	avx_m256i_t cos_emm21 = _mm256_cvtps_epi32(_mm256_sub_ps(_mm256_cvtepi32_ps(emm21), _ps_2));
	avx_m256i_t cos_emm22 = _mm256_cvtps_epi32(_mm256_sub_ps(_mm256_cvtepi32_ps(emm22), _ps_2));

	//avx_m256i_t tempib = _pi32_4;
	//avx_m256i_t emm01 = _mm256_and_si256(emm21, tempib);
	//avx_m256i_t emm02 = _mm256_and_si256(emm22, tempib);
	avx_m256i_t emm01 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm21),
			_mm256_castsi256_ps(_pi32_4)));
	avx_m256i_t emm02 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm22),
			_mm256_castsi256_ps(_pi32_4)));

	//avx_m256i_t cos_emm01 = _mm256_andnot_si256(cos_emm21, tempib);
	//avx_m256i_t cos_emm02 = _mm256_andnot_si256(cos_emm22, tempib);
	avx_m256i_t cos_emm01 = _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(cos_emm21),
			_mm256_castsi256_ps(_pi32_4)));
	avx_m256i_t cos_emm02 = _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(cos_emm22),
			_mm256_castsi256_ps(_pi32_4)));

	//emm01 = _mm256_slli_epi32(emm01, 29);
	__m128i emm0hi1 = _mm256_extractf128_si256(emm01, 0);
	__m128i emm0lo1 = _mm256_extractf128_si256(emm01, 1);
	emm0hi1 = _mm_slli_epi32(emm0hi1, 29);
	emm0lo1 = _mm_slli_epi32(emm0lo1, 29);
	emm01 = _mm256_insertf128_si256(emm01, emm0hi1, 0);
	emm01 = _mm256_insertf128_si256(emm01, emm0lo1, 1);

	//emm02 = _mm256_slli_epi32(emm02, 29);
	__m128i emm0hi2 = _mm256_extractf128_si256(emm02, 0);
	__m128i emm0lo2 = _mm256_extractf128_si256(emm02, 1);
	emm0hi2 = _mm_slli_epi32(emm0hi2, 29);
	emm0lo2 = _mm_slli_epi32(emm0lo2, 29);
	emm02 = _mm256_insertf128_si256(emm02, emm0hi2, 0);
	emm02 = _mm256_insertf128_si256(emm02, emm0lo2, 1);

	//cos_emm01 = _mm256_slli_epi32(cos_emm01, 29);
	__m128i cos_emm0hi1 = _mm256_extractf128_si256(cos_emm01, 0);
	__m128i cos_emm0lo1 = _mm256_extractf128_si256(cos_emm01, 1);
	cos_emm0hi1 = _mm_slli_epi32(cos_emm0hi1, 29);
	cos_emm0lo1 = _mm_slli_epi32(cos_emm0lo1, 29);
	cos_emm01 = _mm256_insertf128_si256(cos_emm01, cos_emm0hi1, 0);
	cos_emm01 = _mm256_insertf128_si256(cos_emm01, cos_emm0lo1, 1);

	//cos_emm02 = _mm256_slli_epi32(cos_emm02, 29);
	__m128i cos_emm0hi2 = _mm256_extractf128_si256(cos_emm02, 0);
	__m128i cos_emm0lo2 = _mm256_extractf128_si256(cos_emm02, 1);
	cos_emm0hi2 = _mm_slli_epi32(cos_emm0hi2, 29);
	cos_emm0lo2 = _mm_slli_epi32(cos_emm0lo2, 29);
	cos_emm02 = _mm256_insertf128_si256(cos_emm02, cos_emm0hi2, 0);
	cos_emm02 = _mm256_insertf128_si256(cos_emm02, cos_emm0lo2, 1);

	//tempia = _pi32_2;
	//tempib = _mm256_setzero_si256();
	//emm21 = _mm256_and_si256(emm21, tempia);
	//emm22 = _mm256_and_si256(emm22, tempia);
	emm21 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm21),
			_mm256_castsi256_ps(_pi32_2)));
	emm22 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm22),
			_mm256_castsi256_ps(_pi32_2)));

	//cos_emm21 = _mm256_and_si256(cos_emm21, tempia);
	//cos_emm22 = _mm256_and_si256(cos_emm22, tempia);
	cos_emm21 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(cos_emm21),
			_mm256_castsi256_ps(_pi32_2)));
	cos_emm22 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(cos_emm22),
			_mm256_castsi256_ps(_pi32_2)));

	//emm21 = _mm256_cmpeq_epi32(emm21, tempib);
	//emm22 = _mm256_cmpeq_epi32(emm22, tempib);
	emm21 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(emm21),
			_mm256_setzero_ps(), _CMP_EQ_UQ));
	emm22 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(emm22),
			_mm256_setzero_ps(), _CMP_EQ_UQ));

	//cos_emm21 = _mm256_cmpeq_epi32(cos_emm21, tempib);
	//cos_emm22 = _mm256_cmpeq_epi32(cos_emm22, tempib);
	cos_emm21 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(cos_emm21),
			_mm256_setzero_ps(), _CMP_EQ_UQ));
	cos_emm22 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(cos_emm22),
			_mm256_setzero_ps(), _CMP_EQ_UQ));

	avx_m256_t emm0f1 = _mm256_castsi256_ps(emm01);
	avx_m256_t emm0f2 = _mm256_castsi256_ps(emm02);
	avx_m256_t emm2f1 = _mm256_castsi256_ps(emm21);
	avx_m256_t emm2f2 = _mm256_castsi256_ps(emm22);
	avx_m256_t cos_emm0f1 = _mm256_castsi256_ps(cos_emm01);
	avx_m256_t cos_emm0f2 = _mm256_castsi256_ps(cos_emm02);
	avx_m256_t cos_emm2f1 = _mm256_castsi256_ps(cos_emm21);
	avx_m256_t cos_emm2f2 = _mm256_castsi256_ps(cos_emm22);

	sign_bit1 = _mm256_xor_ps(sign_bit1, emm0f1);
	sign_bit2 = _mm256_xor_ps(sign_bit2, emm0f2);

	tempa = _ps_minus_cephes_DP123;
	tempb = _mm256_mul_ps(y2, tempa);
	tempa = _mm256_mul_ps(y1, tempa);
	x2 = _mm256_add_ps(x2, tempb);
	x1 = _mm256_add_ps(x1, tempa);

	avx_m256_t x21 = _mm256_mul_ps(x1, x1);
	avx_m256_t x22 = _mm256_mul_ps(x2, x2);
	avx_m256_t x31 = _mm256_mul_ps(x21, x1);
	avx_m256_t x32 = _mm256_mul_ps(x22, x2);
	avx_m256_t x41 = _mm256_mul_ps(x21, x21);
	avx_m256_t x42 = _mm256_mul_ps(x22, x22);

	tempa = _ps_coscof_p0;
	tempb = _ps_sincof_p0;
	y1 = _mm256_mul_ps(x21, tempa);
	y2 = _mm256_mul_ps(x22, tempa);
	avx_m256_t y21 = _mm256_mul_ps(x21, tempb);
	avx_m256_t y22 = _mm256_mul_ps(x22, tempb);
	tempa = _ps_coscof_p1;
	tempb = _ps_sincof_p1;
	y1 = _mm256_add_ps(y1, tempa);
	y2 = _mm256_add_ps(y2, tempa);
	y21 = _mm256_add_ps(y21, tempb);
	y22 = _mm256_add_ps(y22, tempb);
	y1 = _mm256_mul_ps(y1, x21);
	y2 = _mm256_mul_ps(y2, x22);
	y21 = _mm256_mul_ps(y21, x21);
	y22 = _mm256_mul_ps(y22, x22);
	tempa = _ps_coscof_p2;
	tempb = _ps_sincof_p2;
	y1 = _mm256_add_ps(y1, tempa);
	y2 = _mm256_add_ps(y2, tempa);
	y21 = _mm256_add_ps(y21, tempb);
	y22 = _mm256_add_ps(y22, tempb);
	y1 = _mm256_mul_ps(y1, x41);
	y2 = _mm256_mul_ps(y2, x42);
	y21 = _mm256_mul_ps(y21, x31);
	y22 = _mm256_mul_ps(y22, x32);

	tempa = _ps_0p5;
	tempb = _ps_1;
	avx_m256_t temp_21 = _mm256_mul_ps(x21, tempa);
	avx_m256_t temp_22 = _mm256_mul_ps(x22, tempa);
	y21 = _mm256_add_ps(y21, x1);
	y22 = _mm256_add_ps(y22, x2);
	temp_21 = _mm256_sub_ps(temp_21, tempb);
	temp_22 = _mm256_sub_ps(temp_22, tempb);
	y1 = _mm256_sub_ps(y1, temp_21);
	y2 = _mm256_sub_ps(y2, temp_22);

	avx_m256_t cos_y1 = y1;
	avx_m256_t cos_y2 = y2;
	avx_m256_t cos_y21 = y21;
	avx_m256_t cos_y22 = y22;
	y1 = _mm256_andnot_ps(emm2f1, y1);
	y2 = _mm256_andnot_ps(emm2f2, y2);
	cos_y1 = _mm256_andnot_ps(cos_emm2f1, cos_y1);
	cos_y2 = _mm256_andnot_ps(cos_emm2f2, cos_y2);
	y21 = _mm256_and_ps(emm2f1, y21);
	y22 = _mm256_and_ps(emm2f2, y22);
	cos_y21 = _mm256_and_ps(cos_emm2f1, cos_y21);
	cos_y22 = _mm256_and_ps(cos_emm2f2, cos_y22);
	y1 = _mm256_add_ps(y1, y21);
	y2 = _mm256_add_ps(y2, y22);
	cos_y1 = _mm256_add_ps(cos_y1, cos_y21);
	cos_y2 = _mm256_add_ps(cos_y2, cos_y22);

	*s1 = _mm256_xor_ps(y1, sign_bit1);
	*s2 = _mm256_xor_ps(y2, sign_bit2);
	*c1 = _mm256_xor_ps(cos_y1, cos_emm0f1);
	*c2 = _mm256_xor_ps(cos_y2, cos_emm0f2);
} // newsincos_ps_dual()
inline void newsincos_ps(avx_m256_t x, avx_m256_t *s, avx_m256_t *c)
{
	avx_m256_t sign_bit = _mm256_and_ps(x, _ps_sign_mask);
	x = _mm256_and_ps(x, _ps_inv_sign_mask);

	avx_m256_t y = _mm256_mul_ps(x, _ps_cephes_FOPI);

	//avx_m256i_t emm2 = _mm256_cvttps_epi32(y);
	//emm2 = _mm256_add_epi32(emm2, _pi32_1);
	avx_m256i_t emm2 = _mm256_cvttps_epi32(_mm256_add_ps(y, _ps_1));

	//emm2 = _mm256_and_si256(emm2, _pi32_inv1);
	emm2 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm2),
			_mm256_castsi256_ps(_pi32_inv1)));

	y = _mm256_cvtepi32_ps(emm2);

	//avx_m256i_t cos_emm2 = _mm256_sub_epi32(emm2, _pi32_2);
	avx_m256i_t cos_emm2 = _mm256_cvtps_epi32(_mm256_sub_ps(_mm256_cvtepi32_ps(emm2), _ps_2));

	//avx_m256i_t emm0 = _mm256_and_si256(emm2, _pi32_4);
	avx_m256i_t emm0 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm2),
			_mm256_castsi256_ps(_pi32_4)));

	//avx_m256i_t cos_emm0 = _mm256_andnot_si256(cos_emm2, _pi32_4);
	avx_m256i_t cos_emm0 = _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(cos_emm2),
			_mm256_castsi256_ps(_pi32_4)));

	//emm0 = _mm256_slli_epi32(emm0, 29);
	__m128i emm0hi = _mm256_extractf128_si256(emm0, 0);
	__m128i emm0lo = _mm256_extractf128_si256(emm0, 1);
	emm0hi = _mm_slli_epi32(emm0hi, 29);
	emm0lo = _mm_slli_epi32(emm0lo, 29);
	emm0 = _mm256_insertf128_si256(emm0, emm0hi, 0);
	emm0 = _mm256_insertf128_si256(emm0, emm0lo, 1);

	//cos_emm0 = _mm256_slli_epi32(cos_emm0, 29);
	__m128i cos_emm0hi = _mm256_extractf128_si256(cos_emm0, 0);
	__m128i cos_emm0lo = _mm256_extractf128_si256(cos_emm0, 1);
	cos_emm0hi = _mm_slli_epi32(cos_emm0hi, 29);
	cos_emm0lo = _mm_slli_epi32(cos_emm0lo, 29);
	cos_emm0 = _mm256_insertf128_si256(cos_emm0, cos_emm0hi, 0);
	cos_emm0 = _mm256_insertf128_si256(cos_emm0, cos_emm0lo, 1);

	//emm2 = _mm256_and_si256(emm2, _pi32_2);
	emm2 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm2),
			_mm256_castsi256_ps(_pi32_2)));

	//cos_emm2 = _mm256_and_si256(cos_emm2, _pi32_2);
	cos_emm2 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(cos_emm2),
			_mm256_castsi256_ps(_pi32_2)));

	//emm2 = _mm256_cmpeq_epi32(emm2, _mm256_setzero_si256());
	emm2 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(emm2),
			_mm256_setzero_ps(), _CMP_EQ_UQ));

	//cos_emm2 = _mm256_cmpeq_epi32(cos_emm2, _mm256_setzero_si256());
	cos_emm2 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(cos_emm2),
			_mm256_setzero_ps(), _CMP_EQ_UQ));

	avx_m256_t emm0f = _mm256_castsi256_ps(emm0);
	avx_m256_t emm2f = _mm256_castsi256_ps(emm2);
	avx_m256_t cos_emm0f = _mm256_castsi256_ps(cos_emm0);
	avx_m256_t cos_emm2f = _mm256_castsi256_ps(cos_emm2);

	sign_bit = _mm256_xor_ps(sign_bit, emm0f);

	avx_m256_t temp_2 = _ps_minus_cephes_DP123;
	temp_2 = _mm256_mul_ps(y, temp_2);
	x = _mm256_add_ps(x, temp_2);

	avx_m256_t x2 = _mm256_mul_ps(x, x);
	avx_m256_t x3 = _mm256_mul_ps(x2, x);
	avx_m256_t x4 = _mm256_mul_ps(x2, x2);

	y = _ps_coscof_p0;
	avx_m256_t y2 = _ps_sincof_p0;

	y = _mm256_mul_ps(y, x2);
	y2 = _mm256_mul_ps(y2, x2);
	y = _mm256_add_ps(y, _ps_coscof_p1);
	y2 = _mm256_add_ps(y2, _ps_sincof_p1);
	y = _mm256_mul_ps(y, x2);
	y2 = _mm256_mul_ps(y2, x2);
	y = _mm256_add_ps(y, _ps_coscof_p2);
	y2 = _mm256_add_ps(y2, _ps_sincof_p2);
	y = _mm256_mul_ps(y, x4);
	y2 = _mm256_mul_ps(y2, x3);

	temp_2 = _mm256_mul_ps(x2, _ps_0p5);
	y2 = _mm256_add_ps(y2, x);
	temp_2 = _mm256_sub_ps(temp_2, _ps_1);
	y = _mm256_sub_ps(y, temp_2);

	avx_m256_t cos_y = y;
	avx_m256_t cos_y2 = y2;
	y = _mm256_andnot_ps(emm2f, y);
	cos_y = _mm256_andnot_ps(cos_emm2f, cos_y);
	y2 = _mm256_and_ps(emm2f, y2);
	cos_y2 = _mm256_and_ps(cos_emm2f, cos_y2);
	y = _mm256_add_ps(y, y2);
	cos_y = _mm256_add_ps(cos_y, cos_y2);

	*s = _mm256_xor_ps(y, sign_bit);
	*c = _mm256_xor_ps(cos_y, cos_emm0f);
} // newsincos_ps()
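// Scalar restatement (mine, for reference) of the Cephes-style reduction
// both functions above vectorize: force the quadrant index even, reduce by
// multiples of pi/4, then pick the polynomial and the sign from bits 1 and
// 2 of the index. pi/4 is collapsed to one constant here, where the vector
// code subtracts it in three parts (_ps_minus_cephes_DP123) for precision,
// and the polynomials are truncated Taylor series rather than the tuned
// minimax coefficients in _ps_sincof_* / _ps_coscof_*.
#include <cmath>

inline void sincos_scalar_sketch(float xin, float* s, float* c)
{
	const float FOPI = 1.27323954473516f;     // 4/pi, cf. _ps_cephes_FOPI
	const float PIO4 = 0.78539816339744831f;  // pi/4

	float x = std::fabs(xin);
	int j = ((int)(x * FOPI) + 1) & ~1;       // even quadrant index
	float y = x - (float)j * PIO4;            // reduced argument, |y| <= ~pi/4

	const float y2 = y * y;
	const float poly_sin = y + y * y2 * (-1.0f/6 + y2 * (1.0f/120 - y2/5040));
	const float poly_cos = 1 - y2 * (0.5f - y2 * (1.0f/24 - y2/720));

	// Bit 1 of the index swaps which polynomial feeds which output;
	// bit 2 (plus the input sign, for sine) controls the sign flip.
	float sv = (j & 2) ? poly_cos : poly_sin;
	float cv = ((j - 2) & 2) ? poly_cos : poly_sin;
	if (j & 4) sv = -sv;
	if (((j - 2) & 4) == 0) cv = -cv;
	if (std::signbit(xin)) sv = -sv;

	*s = sv;
	*c = cv;
}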
void
dump_surface(const char *filename, uint32_t binding_table_offset, int i)
{
	struct surface s;
	char *linear;
	__m256i alpha;

	get_surface(binding_table_offset, i, &s);

	int png_format;
	switch (s.format) {
	case SF_R8G8B8X8_UNORM:
	case SF_R8G8B8A8_UNORM:
	case SF_R8G8B8X8_UNORM_SRGB:
	case SF_R8G8B8A8_UNORM_SRGB:
		png_format = PNG_FORMAT_RGBA;
		break;
	case SF_B8G8R8A8_UNORM:
	case SF_B8G8R8X8_UNORM:
	case SF_B8G8R8A8_UNORM_SRGB:
	case SF_B8G8R8X8_UNORM_SRGB:
		png_format = PNG_FORMAT_BGRA;
		break;
	default:
		stub("image format");
		return;
	}

	switch (s.format) {
	case SF_R8G8B8X8_UNORM:
	case SF_B8G8R8X8_UNORM:
	case SF_R8G8B8X8_UNORM_SRGB:
	case SF_B8G8R8X8_UNORM_SRGB:
		alpha = _mm256_set1_epi32(0xff000000);
		break;
	default:
		alpha = _mm256_set1_epi32(0);
		break;
	}

	switch (s.tile_mode) {
	case LINEAR:
		linear = s.pixels;
		break;
	case XMAJOR:
		linear = detile_xmajor(&s, alpha);
		break;
	case YMAJOR:
		linear = detile_ymajor(&s, alpha);
		break;
	default:
		linear = s.pixels;
		stub("detile wmajor");
		break;
	}

	FILE *f = fopen(filename, "wb");
	ksim_assert(f != NULL);

	png_image pi = {
		.version = PNG_IMAGE_VERSION,
		.width = s.width,
		.height = s.height,
		.format = png_format
	};

	ksim_assert(png_image_write_to_stdio(&pi, f, 0, linear, s.stride, NULL));

	fclose(f);

	if (linear != s.pixels)
		free(linear);
}

static void
depth_test(struct primitive *p, struct dispatch *d)
{
	uint32_t cpp = depth_format_size(gt.depth.format);
	struct reg w_unorm;
	struct reg d24x8, cmp, d_f;

	void *base = ymajor_offset(p->depth.buffer, d->x, d->y, gt.depth.stride, cpp);

	if (gt.depth.test_enable) {
		const __m256 inv_scale = _mm256_set1_ps(1.0f / 16777215.0f);

		switch (gt.depth.format) {
		case D32_FLOAT:
			d_f.reg = _mm256_load_ps(base);
			break;
		case D24_UNORM_X8_UINT:
			d24x8.ireg = _mm256_load_si256(base);
			d_f.reg = _mm256_mul_ps(_mm256_cvtepi32_ps(d24x8.ireg), inv_scale);
			break;
		case D16_UNORM:
			stub("D16_UNORM");
		default:
			ksim_unreachable("invalid depth format");
		}

		/* Swizzle two middle pixel pairs so that dword 0-3 and 4-7
		 * match the shader dispatch subspan ordering. */
		d_f.ireg = _mm256_permute4x64_epi64(d_f.ireg, SWIZZLE(0, 2, 1, 3));

		switch (gt.depth.test_function) {
		case COMPAREFUNCTION_ALWAYS:
			cmp.reg = _mm256_cmp_ps(d_f.reg, d->w.reg, _CMP_TRUE_US);
			break;
		case COMPAREFUNCTION_NEVER:
			cmp.reg = _mm256_cmp_ps(d_f.reg, d->w.reg, _CMP_FALSE_OS);
			break;
		case COMPAREFUNCTION_LESS:
			cmp.reg = _mm256_cmp_ps(d_f.reg, d->w.reg, _CMP_LT_OS);
			break;
		case COMPAREFUNCTION_EQUAL:
			cmp.reg = _mm256_cmp_ps(d_f.reg, d->w.reg, _CMP_EQ_OS);
			break;
		case COMPAREFUNCTION_LEQUAL:
			cmp.reg = _mm256_cmp_ps(d_f.reg, d->w.reg, _CMP_LE_OS);
			break;
		case COMPAREFUNCTION_GREATER:
			cmp.reg = _mm256_cmp_ps(d_f.reg, d->w.reg, _CMP_GT_OS);
			break;
		case COMPAREFUNCTION_NOTEQUAL:
			cmp.reg = _mm256_cmp_ps(d_f.reg, d->w.reg, _CMP_NEQ_OS);
			break;
		case COMPAREFUNCTION_GEQUAL:
			cmp.reg = _mm256_cmp_ps(d_f.reg, d->w.reg, _CMP_GE_OS);
			break;
		}

		d->mask.ireg = _mm256_and_si256(cmp.ireg, d->mask.ireg);
	}

	if (gt.depth.write_enable) {
		const __m256 scale = _mm256_set1_ps(16777215.0f);
		const __m256 half = _mm256_set1_ps(0.5f);
		struct reg w;

		w.ireg = _mm256_permute4x64_epi64(d->w.ireg, SWIZZLE(0, 2, 1, 3));
		__m256i m = _mm256_permute4x64_epi64(d->mask.ireg, SWIZZLE(0, 2, 1, 3));

		switch (gt.depth.format) {
		case D32_FLOAT:
			_mm256_maskstore_ps(base, m, w.reg);
			break;
		case D24_UNORM_X8_UINT:
			w_unorm.ireg = _mm256_cvtps_epi32(
				_mm256_add_ps(_mm256_mul_ps(w.reg, scale), half));
			_mm256_maskstore_epi32(base, m, w_unorm.ireg);
			break;
		case D16_UNORM:
			stub("D16_UNORM");
		default:
			ksim_unreachable("invalid depth format");
		}
	}
}
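/* Quick standalone check (not ksim code) of the D24 UNORM round trip used
 * in depth_test(): encode d in [0,1] as round(d * (2^24 - 1)), decode with
 * the reciprocal scale. 16777215.0f is exactly 2^24 - 1 and fits a float's
 * 24-bit significand. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const float scale = 16777215.0f;

	for (float d = 0.0f; d <= 1.0f; d += 0.25f) {
		uint32_t bits = (uint32_t)(d * scale + 0.5f);	/* encode */
		float back = (float)bits * (1.0f / scale);	/* decode */
		printf("%f -> 0x%06x -> %f\n", d, bits, back);
	}
	return 0;
}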
INLINE const avxi min( const avxi& a, const avxi& b ) { return _mm256_cvtps_epi32(_mm256_min_ps(_mm256_cvtepi32_ps(a), _mm256_cvtepi32_ps(b))); }
INLINE const avxi operator *( const avxi& a, const avxi& b ) { return _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(a), _mm256_cvtepi32_ps(b))); }
INLINE explicit avxi( const __m256 a ) : m256(_mm256_cvtps_epi32(a)) {}
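// The three avxi operations above emulate 256-bit integer min/multiply/
// convert with float math, since full-width integer ops only arrived with
// AVX2. The round trip through float is exact only while every value (and,
// for operator*, the product) fits in float's 24-bit significand; a
// standalone demonstration (mine) of where it breaks down:
#include <immintrin.h>
#include <cstdio>

int main()
{
	// (1 << 24) + 1 is the first integer a float cannot represent, and
	// the product 3 * ((1 << 24) + 1) also exceeds the significand, so
	// the emulated multiply returns a rounded result.
	__m256 a = _mm256_cvtepi32_ps(_mm256_set1_epi32((1 << 24) + 1));
	__m256 b = _mm256_cvtepi32_ps(_mm256_set1_epi32(3));
	__m256i p = _mm256_cvtps_epi32(_mm256_mul_ps(a, b));

	alignas(32) int out[8];
	_mm256_store_si256((__m256i*)out, p);
	std::printf("%d (exact: %d)\n", out[0], 3 * ((1 << 24) + 1));
	return 0;
}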
static void
sfid_render_cache_rt_write_simd8_rgba_uint32_linear(struct thread *t,
						    const struct sfid_render_cache_args *args)
{
	const int slice_y = args->rt.minimum_array_element * args->rt.qpitch;
	const int x = t->grf[1].uw[4];
	const int y = t->grf[1].uw[5] + slice_y;
	const struct reg *src = &t->grf[args->src];

	__m128i *base0 = args->rt.pixels + x * args->rt.cpp + y * args->rt.stride;
	__m128i *base1 = (void *) base0 + args->rt.stride;

	__m256i rg0145 = _mm256_unpacklo_epi32(src[0].ireg, src[1].ireg);
	__m256i rg2367 = _mm256_unpackhi_epi32(src[0].ireg, src[1].ireg);
	__m256i ba0145 = _mm256_unpacklo_epi32(src[2].ireg, src[3].ireg);
	__m256i ba2367 = _mm256_unpackhi_epi32(src[2].ireg, src[3].ireg);

	__m256i rgba04 = _mm256_unpacklo_epi64(rg0145, ba0145);
	__m256i rgba15 = _mm256_unpackhi_epi64(rg0145, ba0145);
	__m256i rgba26 = _mm256_unpacklo_epi64(rg2367, ba2367);
	__m256i rgba37 = _mm256_unpackhi_epi64(rg2367, ba2367);

	struct reg mask = { .ireg = t->mask_q1 };

	if (mask.d[0] < 0) base0[0] = _mm256_extractf128_si256(rgba04, 0);
	if (mask.d[1] < 0) base0[1] = _mm256_extractf128_si256(rgba15, 0);
	if (mask.d[2] < 0) base1[0] = _mm256_extractf128_si256(rgba26, 0);
	if (mask.d[3] < 0) base1[1] = _mm256_extractf128_si256(rgba37, 0);

	if (mask.d[4] < 0) base0[2] = _mm256_extractf128_si256(rgba04, 1);
	if (mask.d[5] < 0) base0[3] = _mm256_extractf128_si256(rgba15, 1);
	if (mask.d[6] < 0) base1[2] = _mm256_extractf128_si256(rgba26, 1);
	if (mask.d[7] < 0) base1[3] = _mm256_extractf128_si256(rgba37, 1);
}

static void
write_uint16_linear(struct thread *t,
		    const struct sfid_render_cache_args *args,
		    __m256i r, __m256i g, __m256i b, __m256i a)
{
	const int slice_y = args->rt.minimum_array_element * args->rt.qpitch;
	const int x = t->grf[1].uw[4];
	const int y = t->grf[1].uw[5] + slice_y;
	__m256i rg, ba;

	rg = _mm256_slli_epi32(g, 16);
	rg = _mm256_or_si256(rg, r);
	ba = _mm256_slli_epi32(a, 16);
	ba = _mm256_or_si256(ba, b);

	__m256i p0 = _mm256_unpacklo_epi32(rg, ba);
	__m256i m0 = _mm256_cvtepi32_epi64(_mm256_extractf128_si256(t->mask_q1, 0));

	__m256i p1 = _mm256_unpackhi_epi32(rg, ba);
	__m256i m1 = _mm256_cvtepi32_epi64(_mm256_extractf128_si256(t->mask_q1, 1));

	void *base = args->rt.pixels + x * args->rt.cpp + y * args->rt.stride;

	_mm_maskstore_epi64(base,
			    _mm256_extractf128_si256(m0, 0),
			    _mm256_extractf128_si256(p0, 0));
	_mm_maskstore_epi64((base + 16),
			    _mm256_extractf128_si256(m1, 0),
			    _mm256_extractf128_si256(p0, 1));
	_mm_maskstore_epi64((base + args->rt.stride),
			    _mm256_extractf128_si256(m0, 1),
			    _mm256_extractf128_si256(p1, 0));
	_mm_maskstore_epi64((base + args->rt.stride + 16),
			    _mm256_extractf128_si256(m1, 1),
			    _mm256_extractf128_si256(p1, 1));
}

static void
sfid_render_cache_rt_write_simd8_rgba_unorm16_linear(struct thread *t,
						     const struct sfid_render_cache_args *args)
{
	__m256i r, g, b, a;
	const __m256 scale = _mm256_set1_ps(65535.0f);
	const __m256 half = _mm256_set1_ps(0.5f);
	struct reg *src = &t->grf[args->src];

	r = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[0].reg, scale), half));
	g = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[1].reg, scale), half));
	b = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[2].reg, scale), half));
	a = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[3].reg, scale), half));

	write_uint16_linear(t, args, r, g, b, a);
}
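/* Scalar view (not ksim code) of the pixel packing in write_uint16_linear():
 * two 16-bit channels are packed per dword, then the unpacklo/unpackhi pair
 * interleaves rg and ba dwords into full 64-bit rgba16 pixels. */
#include <stdint.h>

static inline uint64_t
pack_rgba16(uint16_t r, uint16_t g, uint16_t b, uint16_t a)
{
	uint32_t rg = ((uint32_t)g << 16) | r;	/* rg = (g << 16) | r */
	uint32_t ba = ((uint32_t)a << 16) | b;	/* ba = (a << 16) | b */

	return ((uint64_t)ba << 32) | rg;	/* one 64-bit rgba16 pixel */
}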