Example #1
template <class T>
static fstb_FORCEINLINE void	TransLut_store_avx2 (T *dst_ptr, __m256 val)
{
	_mm256_store_si256 (
		reinterpret_cast <__m256i *> (dst_ptr),
		_mm256_cvtps_epi32 (val)
	);
}
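A minimal usage sketch (hypothetical caller, not from the original project): _mm256_cvtps_epi32 rounds according to the current MXCSR mode (round-to-nearest-even by default), and the aligned store requires a 32-byte-aligned destination.

#include <immintrin.h>
#include <cstdint>

void store_example ()
{
	// 32-byte alignment is required by the _mm256_store_si256 inside
	// TransLut_store_avx2 (assumed in scope); 1.5f rounds to 2 under
	// the default round-to-nearest-even mode.
	alignas (32) int32_t out [8];
	TransLut_store_avx2 (out, _mm256_set1_ps (1.5f));
}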
Example #2
inline void avx2_xy_to_uv_f(__m256 x, __m256 y, __m256i& u, __m256i& v)
{
  // Convert X,Y first into U,V space, then round to the nearest
  // integer. That gets us close to the correct answer, mapping XY to a
  // lozenge-shaped space rather than a hexagonal one. We then correct the
  // four regions that lie outside the hexagonal cell, assigning them
  // to their correct neighboring cells.
  // Writer's note: see ~/Google Drive/Work/calin

  // double dv = y*c_vy_inv;
  // double du = x-dv*c_vx;
  // u = std::lround(du);
  // v = std::lround(dv);
  // du -= u;
  // dv -= v;

  y = _mm256_mul_ps(y, calin::math::simd::c_m256(_c_m256_vy_inv));
  x = _mm256_fnmadd_ps(y, calin::math::simd::c_m256(_c_m256_vx), x);
  u = _mm256_cvtps_epi32(x);
  v = _mm256_cvtps_epi32(y);
  x = _mm256_sub_ps(x, _mm256_cvtepi32_ps(u));
  y = _mm256_sub_ps(y, _mm256_cvtepi32_ps(v));

  // double c3 = dv-du;
  const __m256i c3 = _mm256_castps_si256(_mm256_sub_ps(y, x));

  __m256i uvshift;
  __m256i mask;

  // double c1 = du+0.5*dv;
  // double c2 = dv+0.5*du;
  // if(c3<0) {
  //   if(c1>=1) u++;
  //   else if(c2<-1) v--;
  // } else {
  //   if(c2>=1) v++;
  //   else if(c1<-1) u--;
  // }

  uvshift = _mm256_cvtps_epi32(_mm256_fmadd_ps(y, calin::math::simd::c_m256(_c_m256_one_half), x));
  mask = _mm256_srai_epi32(_mm256_xor_si256(uvshift, c3), 31);
  u = _mm256_blendv_epi8(u, _mm256_add_epi32(u, uvshift), mask);

  uvshift = _mm256_cvtps_epi32(_mm256_fmadd_ps(x, calin::math::simd::c_m256(_c_m256_one_half), y));
  mask = _mm256_srai_epi32(_mm256_xor_si256(uvshift, c3), 31);
  v = _mm256_blendv_epi8(_mm256_add_epi32(v, uvshift), v, mask);
}
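For reference, a scalar sketch of the algorithm spelled out in the comments above; c_vx and c_vy_inv are hypothetical stand-ins for the broadcast constants _c_m256_vx and _c_m256_vy_inv.

#include <cmath>

inline void scalar_xy_to_uv(double x, double y, int& u, int& v,
                            double c_vx, double c_vy_inv)
{
  // Round to the nearest lozenge cell first...
  double dv = y*c_vy_inv;
  double du = x - dv*c_vx;
  u = static_cast<int>(std::lround(du));
  v = static_cast<int>(std::lround(dv));
  du -= u;
  dv -= v;
  // ...then pull the four outlying regions into the correct hexagon.
  double c1 = du + 0.5*dv;
  double c2 = dv + 0.5*du;
  double c3 = dv - du;
  if (c3 < 0) {
    if (c1 >= 1) u++;
    else if (c2 < -1) v--;
  } else {
    if (c2 >= 1) v++;
    else if (c1 < -1) u--;
  }
}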
Example #3
void
mandel_avx(unsigned char *image, const struct spec *s)
{
    __m256 xmin = _mm256_set1_ps(s->xlim[0]);
    __m256 ymin = _mm256_set1_ps(s->ylim[0]);
    __m256 xscale = _mm256_set1_ps((s->xlim[1] - s->xlim[0]) / s->width);
    __m256 yscale = _mm256_set1_ps((s->ylim[1] - s->ylim[0]) / s->height);
    __m256 threshold = _mm256_set1_ps(4);
    __m256 one = _mm256_set1_ps(1);
    __m256 iter_scale = _mm256_set1_ps(1.0f / s->iterations);
    __m256 depth_scale = _mm256_set1_ps(s->depth - 1);

    #pragma omp parallel for schedule(dynamic, 1)
    for (int y = 0; y < s->height; y++) {
        for (int x = 0; x < s->width; x += 8) {
            __m256 mx = _mm256_set_ps(x + 7, x + 6, x + 5, x + 4,
                                      x + 3, x + 2, x + 1, x + 0);
            __m256 my = _mm256_set1_ps(y);
            __m256 cr = _mm256_add_ps(_mm256_mul_ps(mx, xscale), xmin);
            __m256 ci = _mm256_add_ps(_mm256_mul_ps(my, yscale), ymin);
            __m256 zr = cr;
            __m256 zi = ci;
            int k = 1;
            __m256 mk = _mm256_set1_ps(k);
            while (++k < s->iterations) {
                /* Compute z1 from z0 */
                __m256 zr2 = _mm256_mul_ps(zr, zr);
                __m256 zi2 = _mm256_mul_ps(zi, zi);
                __m256 zrzi = _mm256_mul_ps(zr, zi);
                /* zr1 = zr0 * zr0 - zi0 * zi0 + cr */
                /* zi1 = zr0 * zi0 + zr0 * zi0 + ci */
                zr = _mm256_add_ps(_mm256_sub_ps(zr2, zi2), cr);
                zi = _mm256_add_ps(_mm256_add_ps(zrzi, zrzi), ci);

                /* Increment k */
                zr2 = _mm256_mul_ps(zr, zr);
                zi2 = _mm256_mul_ps(zi, zi);
                __m256 mag2 = _mm256_add_ps(zr2, zi2);
                __m256 mask = _mm256_cmp_ps(mag2, threshold, _CMP_LT_OS);
                mk = _mm256_add_ps(_mm256_and_ps(mask, one), mk);

                /* Early bailout? */
                if (_mm256_testz_ps(mask, _mm256_set1_ps(-1)))
                    break;
            }
            mk = _mm256_mul_ps(mk, iter_scale);
            mk = _mm256_sqrt_ps(mk);
            mk = _mm256_mul_ps(mk, depth_scale);
            __m256i pixels = _mm256_cvtps_epi32(mk);
            unsigned char *dst = image + y * s->width * 3 + x * 3;
            unsigned char *src = (unsigned char *)&pixels;
            for (int i = 0; i < 8; i++) {
                dst[i * 3 + 0] = src[i * 4];
                dst[i * 3 + 1] = src[i * 4];
                dst[i * 3 + 2] = src[i * 4];
            }
        }
    }
}
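The routine assumes a spec struct along these lines; this is a sketch reconstructed from the fields used above, not necessarily the project's actual definition.

struct spec {
    float xlim[2];   /* real-axis bounds of the view */
    float ylim[2];   /* imaginary-axis bounds of the view */
    int width;       /* image width in pixels (a multiple of 8 here) */
    int height;      /* image height in pixels */
    int depth;       /* number of gray levels in the output */
    int iterations;  /* iteration cap */
};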
Example #4
static void
sfid_render_cache_rt_write_simd8_unorm8_ymajor(struct thread *t,
        const struct sfid_render_cache_args *args)
{
    const int slice_y = args->rt.minimum_array_element * args->rt.qpitch;
    const int x = t->grf[1].uw[4];
    const int y = t->grf[1].uw[5] + slice_y;
    const int cpp = 4;
    struct reg *src = &t->grf[args->src];
    const __m256 scale = _mm256_set1_ps(255.0f);
    const __m256 half =  _mm256_set1_ps(0.5f);
    __m256i r, g, b, a;
    __m256i rgba;

    switch (args->rt.format) {
    case SF_R8G8B8A8_UNORM:
        r = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[0].reg, scale), half));
        g = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[1].reg, scale), half));
        b = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[2].reg, scale), half));
        a = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[3].reg, scale), half));
        break;
    case SF_B8G8R8A8_UNORM:
        b = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[0].reg, scale), half));
        g = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[1].reg, scale), half));
        r = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[2].reg, scale), half));
        a = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[3].reg, scale), half));
        break;
    default:
        stub("unorm8 ymajor format");
        return;
    }

    rgba = _mm256_slli_epi32(a, 8);
    rgba = _mm256_or_si256(rgba, b);
    rgba = _mm256_slli_epi32(rgba, 8);
    rgba = _mm256_or_si256(rgba, g);
    rgba = _mm256_slli_epi32(rgba, 8);
    rgba = _mm256_or_si256(rgba, r);

    /* Swizzle two middle pixel pairs so that dword 0-3 and 4-7
     * form linear owords of pixels. */
    rgba = _mm256_permute4x64_epi64(rgba, SWIZZLE(0, 2, 1, 3));
    __m256i mask = _mm256_permute4x64_epi64(t->mask_q1, SWIZZLE(0, 2, 1, 3));

    void *base = ymajor_offset(args->rt.pixels, x, y, args->rt.stride, cpp);

    _mm_maskstore_epi32(base,
                        _mm256_extractf128_si256(mask, 0),
                        _mm256_extractf128_si256(rgba, 0));
    _mm_maskstore_epi32(base + 16,
                        _mm256_extractf128_si256(mask, 1),
                        _mm256_extractf128_si256(rgba, 1));
}
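SWIZZLE is not shown in this snippet; a plausible definition packs four 2-bit lane selectors into the immediate consumed by _mm256_permute4x64_epi64, so that result lane i comes from source lane ((imm >> (2*i)) & 3).

/* Hypothetical definition, consistent with the uses above. */
#define SWIZZLE(x, y, z, w) \
    (((x) << 0) | ((y) << 2) | ((z) << 4) | ((w) << 6))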
Example #5
static inline __m256i
to_unorm(__m256 reg, float scale_f)
{
    const __m256 scale = _mm256_set1_ps(scale_f);
    const __m256 one = _mm256_set1_ps(1.0f);
    const __m256 zero = _mm256_set1_ps(0.0f);
    const __m256 half =  _mm256_set1_ps(0.5f);

    const __m256 clamped = _mm256_max_ps(_mm256_min_ps(reg, one), zero);

    return _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(clamped, scale), half));
}
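A usage sketch (hypothetical wrapper): an 8-bit UNORM conversion passes 255.0f as the scale; the clamp keeps inputs in [0, 1] and the +0.5f bias gives round-to-nearest behavior on the convert.

static inline __m256i
to_unorm8(__m256 reg)
{
    /* Each lane still holds one int32 code in [0, 255] here; packing
     * down to bytes happens in a later step. */
    return to_unorm(reg, 255.0f);
}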
Example #6
inline __m256i avx2_positive_hexid_to_ringid_root(const __m256i hexid)
{
  // The following algorithm works until hexid=12,589,056
  // const unsigned iarg = 1+4*(hexid-1)/3;
  // return (unsigned(std::sqrt(float(iarg)))+1)/2;
  __m256 arg = _mm256_cvtepi32_ps(hexid);
  arg = _mm256_fmsub_ps(arg, calin::math::simd::c_m256(_c_m256_four_thirds),
    calin::math::simd::c_m256(_c_m256_one_third));
  arg = _mm256_sqrt_ps(arg);
  arg = _mm256_fmadd_ps(arg, calin::math::simd::c_m256(_c_m256_one_half),
    calin::math::simd::c_m256(_c_m256_one_half));
  arg = _mm256_floor_ps(arg);
  return _mm256_cvtps_epi32(arg);
}
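Written out as a scalar sketch (straight from the comment above; the float sqrt is what bounds validity to hexid = 12,589,056):

#include <cmath>

inline unsigned positive_hexid_to_ringid_scalar(unsigned hexid)
{
  const unsigned iarg = 1 + 4*(hexid - 1)/3;
  return (unsigned(std::sqrt(float(iarg))) + 1)/2;
}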
Example #7
void cvtScale_s16s32f32Line_AVX2(const short* src, int* dst, float scale, float shift, int width)
{
    int x = 0;

    __m256 scale256 = _mm256_set1_ps(scale);
    __m256 shift256 = _mm256_set1_ps(shift);
    const int shuffle = 0xD8; // 0b11011000: reorder 64-bit lanes so unpacklo/unpackhi yield elements 0..7 and 8..15 in order

    for (; x <= width - 16; x += 16)
    {
        __m256i v_src = _mm256_loadu_si256((const __m256i *)(src + x));
        v_src = _mm256_permute4x64_epi64(v_src, shuffle);
        __m256i v_src_lo = _mm256_srai_epi32(_mm256_unpacklo_epi16(v_src, v_src), 16);
        __m256i v_src_hi = _mm256_srai_epi32(_mm256_unpackhi_epi16(v_src, v_src), 16);
        __m256 v_dst0 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(v_src_lo), scale256), shift256);
        __m256 v_dst1 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(v_src_hi), scale256), shift256);
        _mm256_storeu_si256((__m256i *)(dst + x), _mm256_cvtps_epi32(v_dst0));
        _mm256_storeu_si256((__m256i *)(dst + x + 8), _mm256_cvtps_epi32(v_dst1));
    }

    for (; x < width; x++)
        dst[x] = saturate_cast<int>(src[x] * scale + shift);
}
Example #8
template <>
void	TransLut_FindIndexAvx2 <TransLut::MapperLin>::find_index (const TransLut::FloatIntMix val_arr [8], __m256i &index, __m256 &frac)
{
	assert (val_arr != 0);

	const __m256   scale     = _mm256_set1_ps (1 << LINLUT_RES_L2);
	const __m256i  offset    =
		_mm256_set1_epi32 (-LINLUT_MIN_F * (1 << LINLUT_RES_L2));
	const __m256i  val_min   = _mm256_setzero_si256 ();
	const __m256i  val_max   = _mm256_set1_epi32 (LINLUT_SIZE_F - 2);

	const __m256   v         =
		_mm256_load_ps (reinterpret_cast <const float *> (val_arr));
	const __m256   val_scl   = _mm256_mul_ps (v, scale);
	const __m256i  index_raw = _mm256_cvtps_epi32 (val_scl);
	__m256i        index_tmp = _mm256_add_epi32 (index_raw, offset);
	index_tmp = _mm256_min_epi32 (index_tmp, val_max);
	index     = _mm256_max_epi32 (index_tmp, val_min);
	frac      = _mm256_sub_ps (val_scl, _mm256_cvtepi32_ps (index_raw));
}
Example #9
static __m128i cielabv (union hvrgbpix rgb)
{
    __m128 xvxyz[2] = {_mm_set1_ps(0.5),_mm_set1_ps(0.5)};

    __m128 vcam0 = _mm_setr_ps(cielab_xyz_cam[0][0],cielab_xyz_cam[1][0],cielab_xyz_cam[2][0],0);
    __m128 vcam1 = _mm_setr_ps(cielab_xyz_cam[0][1],cielab_xyz_cam[1][1],cielab_xyz_cam[2][1],0);
    __m128 vcam2 = _mm_setr_ps(cielab_xyz_cam[0][2],cielab_xyz_cam[1][2],cielab_xyz_cam[2][2],0);
    __m128 vrgb0h = _mm_set1_ps(rgb.h.c[0]);
    __m128 vrgb1h = _mm_set1_ps(rgb.h.c[1]);
    __m128 vrgb2h = _mm_set1_ps(rgb.h.c[2]);
    __m128 vrgb0v = _mm_set1_ps(rgb.v.c[0]);
    __m128 vrgb1v = _mm_set1_ps(rgb.v.c[1]);
    __m128 vrgb2v = _mm_set1_ps(rgb.v.c[2]);

    xvxyz[0] = _mm_add_ps(xvxyz[0], _mm_mul_ps(vcam0,vrgb0h));
    xvxyz[0] = _mm_add_ps(xvxyz[0], _mm_mul_ps(vcam1,vrgb1h));
    xvxyz[0] = _mm_add_ps(xvxyz[0], _mm_mul_ps(vcam2,vrgb2h));
    xvxyz[1] = _mm_add_ps(xvxyz[1], _mm_mul_ps(vcam0,vrgb0v));
    xvxyz[1] = _mm_add_ps(xvxyz[1], _mm_mul_ps(vcam1,vrgb1v));
    xvxyz[1] = _mm_add_ps(xvxyz[1], _mm_mul_ps(vcam2,vrgb2v));

    xvxyz[0] = _mm_max_ps(_mm_set1_ps(0),
                          _mm_min_ps(_mm_set1_ps(0xffff),
                                     _mm_round_ps(xvxyz[0], _MM_FROUND_TO_ZERO)));
    xvxyz[1] = _mm_max_ps(_mm_set1_ps(0),
                          _mm_min_ps(_mm_set1_ps(0xffff),
                                     _mm_round_ps(xvxyz[1], _MM_FROUND_TO_ZERO)));
    __m128i loadaddrh = _mm_cvttps_epi32(xvxyz[0]);
    __m128i loadaddrv = _mm_cvttps_epi32(xvxyz[1]);
#ifdef __AVX__
    __m256 vlab,
           vxyz = { cielab_cbrt[_mm_extract_epi32(loadaddrh,1)],
                    cielab_cbrt[_mm_extract_epi32(loadaddrh,0)],
                    cielab_cbrt[_mm_extract_epi32(loadaddrh,1)],
                    0,
                    cielab_cbrt[_mm_extract_epi32(loadaddrv,1)],
                    cielab_cbrt[_mm_extract_epi32(loadaddrv,0)],
                    cielab_cbrt[_mm_extract_epi32(loadaddrv,1)],
                    0},
           vxyz2 =  {0,
                     cielab_cbrt[_mm_extract_epi32(loadaddrh,1)],
                     cielab_cbrt[_mm_extract_epi32(loadaddrh,2)],
                     cielab_cbrt[_mm_extract_epi32(loadaddrh,0)],
                     0,
                     cielab_cbrt[_mm_extract_epi32(loadaddrv,1)],
                     cielab_cbrt[_mm_extract_epi32(loadaddrv,2)],
                     cielab_cbrt[_mm_extract_epi32(loadaddrv,0)]};

    vlab = _mm256_sub_ps(vxyz,vxyz2);
    vlab = _mm256_mul_ps(vlab, _mm256_setr_ps(116,500,200,0,116,500,200,0));
    vlab = _mm256_sub_ps(vlab, _mm256_setr_ps(16,0,0,0,16,0,0,0));
    vlab = _mm256_mul_ps(vlab,_mm256_set1_ps(64));
    vlab = _mm256_round_ps(vlab, _MM_FROUND_TO_ZERO);
    __m256i vlabi = _mm256_cvtps_epi32(vlab);
    return _mm_packs_epi32(_mm256_castsi256_si128(vlabi), _mm256_extractf128_si256(vlabi, 1));
#else
    __m128 vlabh, vxyzh = {cielab_cbrt[_mm_extract_epi32(loadaddrh,0)],
                           cielab_cbrt[_mm_extract_epi32(loadaddrh,1)],
                           cielab_cbrt[_mm_extract_epi32(loadaddrh,2)],
                           0};
    __m128 vlabv, vxyzv = {cielab_cbrt[_mm_extract_epi32(loadaddrv,0)],
                           cielab_cbrt[_mm_extract_epi32(loadaddrv,1)],
                           cielab_cbrt[_mm_extract_epi32(loadaddrv,2)],
                           0};

    vlabh = _mm_sub_ps(_mm_shuffle_ps(vxyzh,vxyzh,_MM_SHUFFLE(0,1,0,1)),
                       _mm_shuffle_ps(vxyzh,vxyzh,_MM_SHUFFLE(0,2,1,3)));
    vlabh = _mm_mul_ps(vlabh,_mm_setr_ps(116,500,200,0));
    vlabh = _mm_sub_ps(vlabh,_mm_setr_ps(16,0,0,0));
    vlabh = _mm_mul_ps(vlabh,_mm_set_ps1(64));
    vlabh = _mm_round_ps(vlabh, _MM_FROUND_TO_ZERO);

    vlabv = _mm_sub_ps(_mm_shuffle_ps(vxyzv,vxyzv,_MM_SHUFFLE(0,1,0,1)),
                       _mm_shuffle_ps(vxyzv,vxyzv,_MM_SHUFFLE(0,2,1,3)));
    vlabv = _mm_mul_ps(vlabv,_mm_setr_ps(116,500,200,0));
    vlabv = _mm_sub_ps(vlabv,_mm_setr_ps(16,0,0,0));
    vlabv = _mm_mul_ps(vlabv,_mm_set_ps1(64));
    vlabv = _mm_round_ps(vlabv, _MM_FROUND_TO_ZERO);

    return _mm_set_epi64(_mm_cvtps_pi16(vlabv),_mm_cvtps_pi16(vlabh));
#endif
}
Example #10
inline void newsincos_ps_dual(avx_m256_t x1, avx_m256_t x2, avx_m256_t *s1, avx_m256_t *s2,
						avx_m256_t *c1, avx_m256_t *c2) {
	avx_m256_t tempa = _ps_sign_mask;
	avx_m256_t tempb = _ps_inv_sign_mask;
	avx_m256_t sign_bit1 = _mm256_and_ps(x1, tempa);
	avx_m256_t sign_bit2 = _mm256_and_ps(x2, tempa);
	x1 = _mm256_and_ps(x1, tempb);
	x2 = _mm256_and_ps(x2, tempb);

	tempa = _ps_cephes_FOPI;
	avx_m256_t y1 = _mm256_mul_ps(x1, tempa);
	avx_m256_t y2 = _mm256_mul_ps(x2, tempa);

	//avx_m256i_t emm21 = _mm256_cvttps_epi32(y1);
	//avx_m256i_t emm22 = _mm256_cvttps_epi32(y2);
	//emm21 = _mm256_add_epi32(emm21, _pi32_1);
	//emm22 = _mm256_add_epi32(emm22, _pi32_1);
	avx_m256i_t emm21 = _mm256_cvttps_epi32(_mm256_add_ps(y1, _ps_1));
	avx_m256i_t emm22 = _mm256_cvttps_epi32(_mm256_add_ps(y2, _ps_1));

	//emm21 = _mm256_and_si256(emm21, _pi32_inv1);
	//emm22 = _mm256_and_si256(emm22, _pi32_inv1);
	emm21 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm21), _mm256_castsi256_ps(_pi32_inv1)));
	emm22 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm22), _mm256_castsi256_ps(_pi32_inv1)));

	y1 = _mm256_cvtepi32_ps(emm21);
	y2 = _mm256_cvtepi32_ps(emm22);

	//avx_m256i_t tempia = _pi32_2;
	//avx_m256i_t cos_emm21 = _mm256_sub_epi32(emm21, tempia);
	//avx_m256i_t cos_emm22 = _mm256_sub_epi32(emm22, tempia);
	avx_m256i_t cos_emm21 = _mm256_cvtps_epi32(_mm256_sub_ps(_mm256_cvtepi32_ps(emm21), _ps_2));
	avx_m256i_t cos_emm22 = _mm256_cvtps_epi32(_mm256_sub_ps(_mm256_cvtepi32_ps(emm22), _ps_2));

	//avx_m256i_t tempib = _pi32_4;
	//avx_m256i_t emm01 = _mm256_and_si256(emm21, tempib);
	//avx_m256i_t emm02 = _mm256_and_si256(emm22, tempib);
	avx_m256i_t emm01 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm21),
											_mm256_castsi256_ps(_pi32_4)));
	avx_m256i_t emm02 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm22),
											_mm256_castsi256_ps(_pi32_4)));

	//avx_m256i_t cos_emm01 = _mm256_andnot_si256(cos_emm21, tempib);
	//avx_m256i_t cos_emm02 = _mm256_andnot_si256(cos_emm22, tempib);
	avx_m256i_t cos_emm01 = _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(cos_emm21),
											_mm256_castsi256_ps(_pi32_4)));
	avx_m256i_t cos_emm02 = _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(cos_emm22),
											_mm256_castsi256_ps(_pi32_4)));

	//emm01 = _mm256_slli_epi32(emm01, 29);
	__m128i emm0hi1 = _mm256_extractf128_si256(emm01, 0);
	__m128i emm0lo1 = _mm256_extractf128_si256(emm01, 1);
	emm0hi1 = _mm_slli_epi32(emm0hi1, 29);
	emm0lo1 = _mm_slli_epi32(emm0lo1, 29);
	emm01 = _mm256_insertf128_si256(emm01, emm0hi1, 0);
	emm01 = _mm256_insertf128_si256(emm01, emm0lo1, 1);

	//emm02 = _mm256_slli_epi32(emm02, 29);
	__m128i emm0hi2 = _mm256_extractf128_si256(emm02, 0);
	__m128i emm0lo2 = _mm256_extractf128_si256(emm02, 1);
	emm0hi2 = _mm_slli_epi32(emm0hi2, 29);
	emm0lo2 = _mm_slli_epi32(emm0lo2, 29);
	emm02 = _mm256_insertf128_si256(emm02, emm0hi2, 0);
	emm02 = _mm256_insertf128_si256(emm02, emm0lo2, 1);

	//cos_emm01 = _mm256_slli_epi32(cos_emm01, 29);
	__m128i cos_emm0hi1 = _mm256_extractf128_si256(cos_emm01, 0);
	__m128i cos_emm0lo1 = _mm256_extractf128_si256(cos_emm01, 1);
	cos_emm0hi1 = _mm_slli_epi32(cos_emm0hi1, 29);
	cos_emm0lo1 = _mm_slli_epi32(cos_emm0lo1, 29);
	cos_emm01 = _mm256_insertf128_si256(cos_emm01, cos_emm0hi1, 0);
	cos_emm01 = _mm256_insertf128_si256(cos_emm01, cos_emm0lo1, 1);

	//cos_emm02 = _mm256_slli_epi32(cos_emm02, 29);
	__m128i cos_emm0hi2 = _mm256_extractf128_si256(cos_emm02, 0);
	__m128i cos_emm0lo2 = _mm256_extractf128_si256(cos_emm02, 1);
	cos_emm0hi2 = _mm_slli_epi32(cos_emm0hi2, 29);
	cos_emm0lo2 = _mm_slli_epi32(cos_emm0lo2, 29);
	cos_emm02 = _mm256_insertf128_si256(cos_emm02, cos_emm0hi2, 0);
	cos_emm02 = _mm256_insertf128_si256(cos_emm02, cos_emm0lo2, 1);

	//tempia = _pi32_2;
	//tempib = _mm256_setzero_si256();
	//emm21 = _mm256_and_si256(emm21, tempia);
	//emm22 = _mm256_and_si256(emm22, tempia);
	emm21 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm21),
											_mm256_castsi256_ps(_pi32_2)));
	emm22 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm22),
											_mm256_castsi256_ps(_pi32_2)));

	//cos_emm21 = _mm256_and_si256(cos_emm21, tempia);
	//cos_emm22 = _mm256_and_si256(cos_emm22, tempia);
	cos_emm21 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(cos_emm21),
											_mm256_castsi256_ps(_pi32_2)));
	cos_emm22 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(cos_emm22),
											_mm256_castsi256_ps(_pi32_2)));

	//emm21 = _mm256_cmpeq_epi32(emm21, tempib);
	//emm22 = _mm256_cmpeq_epi32(emm22, tempib);
	emm21 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(emm21), _mm256_setzero_ps(), _CMP_EQ_UQ));
	emm22 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(emm22), _mm256_setzero_ps(), _CMP_EQ_UQ));

	//cos_emm21 = _mm256_cmpeq_epi32(cos_emm21, tempib);
	//cos_emm22 = _mm256_cmpeq_epi32(cos_emm22, tempib);
	cos_emm21 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(cos_emm21), _mm256_setzero_ps(), _CMP_EQ_UQ));
	cos_emm22 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(cos_emm22), _mm256_setzero_ps(), _CMP_EQ_UQ));
	
	avx_m256_t emm0f1 = _mm256_castsi256_ps(emm01);
	avx_m256_t emm0f2 = _mm256_castsi256_ps(emm02);
	avx_m256_t emm2f1 = _mm256_castsi256_ps(emm21);
	avx_m256_t emm2f2 = _mm256_castsi256_ps(emm22);
	avx_m256_t cos_emm0f1 = _mm256_castsi256_ps(cos_emm01);
	avx_m256_t cos_emm0f2 = _mm256_castsi256_ps(cos_emm02);
	avx_m256_t cos_emm2f1 = _mm256_castsi256_ps(cos_emm21);
	avx_m256_t cos_emm2f2 = _mm256_castsi256_ps(cos_emm22);

	sign_bit1 = _mm256_xor_ps(sign_bit1, emm0f1);
	sign_bit2 = _mm256_xor_ps(sign_bit2, emm0f2);

	tempa = _ps_minus_cephes_DP123;
	tempb = _mm256_mul_ps(y2, tempa);
	tempa = _mm256_mul_ps(y1, tempa);
	x2 = _mm256_add_ps(x2, tempb);
	x1 = _mm256_add_ps(x1, tempa);

	avx_m256_t x21 = _mm256_mul_ps(x1, x1);
	avx_m256_t x22 = _mm256_mul_ps(x2, x2);
	avx_m256_t x31 = _mm256_mul_ps(x21, x1);
	avx_m256_t x32 = _mm256_mul_ps(x22, x2);
	avx_m256_t x41 = _mm256_mul_ps(x21, x21);
	avx_m256_t x42 = _mm256_mul_ps(x22, x22);

	tempa = _ps_coscof_p0;
	tempb = _ps_sincof_p0;

	y1 = _mm256_mul_ps(x21, tempa);
	y2 = _mm256_mul_ps(x22, tempa);
	avx_m256_t y21 = _mm256_mul_ps(x21, tempb);
	avx_m256_t y22 = _mm256_mul_ps(x22, tempb);
	tempa = _ps_coscof_p1;
	tempb = _ps_sincof_p1;
	y1 = _mm256_add_ps(y1, tempa);
	y2 = _mm256_add_ps(y2, tempa);
	y21 = _mm256_add_ps(y21, tempb);
	y22 = _mm256_add_ps(y22, tempb);
	y1 = _mm256_mul_ps(y1, x21);
	y2 = _mm256_mul_ps(y2, x22);
	y21 = _mm256_mul_ps(y21, x21);
	y22 = _mm256_mul_ps(y22, x22);
	tempa = _ps_coscof_p2;
	tempb = _ps_sincof_p2;
	y1 = _mm256_add_ps(y1, tempa);
	y2 = _mm256_add_ps(y2, tempa);
	y21 = _mm256_add_ps(y21, tempb);
	y22 = _mm256_add_ps(y22, tempb);
	y1 = _mm256_mul_ps(y1, x41);
	y2 = _mm256_mul_ps(y2, x42);
	y21 = _mm256_mul_ps(y21, x31);
	y22 = _mm256_mul_ps(y22, x32);
	tempa = _ps_0p5;
	tempb = _ps_1;
	avx_m256_t temp_21 = _mm256_mul_ps(x21, tempa);
	avx_m256_t temp_22 = _mm256_mul_ps(x22, tempa);
	y21 = _mm256_add_ps(y21, x1);
	y22 = _mm256_add_ps(y22, x2);
	temp_21 = _mm256_sub_ps(temp_21, tempb);
	temp_22 = _mm256_sub_ps(temp_22, tempb);
	y1 = _mm256_sub_ps(y1, temp_21);
	y2 = _mm256_sub_ps(y2, temp_22);

	avx_m256_t cos_y1 = y1;
	avx_m256_t cos_y2 = y2;
	avx_m256_t cos_y21 = y21;
	avx_m256_t cos_y22 = y22;
	y1 = _mm256_andnot_ps(emm2f1, y1);
	y2 = _mm256_andnot_ps(emm2f2, y2);
	cos_y1 = _mm256_andnot_ps(cos_emm2f1, cos_y1);
	cos_y2 = _mm256_andnot_ps(cos_emm2f2, cos_y2);
	y21 = _mm256_and_ps(emm2f1, y21);
	y22 = _mm256_and_ps(emm2f2, y22);
	cos_y21 = _mm256_and_ps(cos_emm2f1, cos_y21);
	cos_y22 = _mm256_and_ps(cos_emm2f2, cos_y22);
	y1 = _mm256_add_ps(y1, y21);
	y2 = _mm256_add_ps(y2, y22);
	cos_y1 = _mm256_add_ps(cos_y1, cos_y21);
	cos_y2 = _mm256_add_ps(cos_y2, cos_y22);

	*s1 = _mm256_xor_ps(y1, sign_bit1);
	*s2 = _mm256_xor_ps(y2, sign_bit2);
	*c1 = _mm256_xor_ps(cos_y1, cos_emm0f1);
	*c2 = _mm256_xor_ps(cos_y2, cos_emm0f2);
} // newsincos_ps_dual()
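The extract/shift/insert sequences above exist because AVX1 provides no 256-bit integer shifts; on AVX2 each sequence collapses to a single instruction, as in this sketch:

#include <immintrin.h>

/* AVX2-only replacement for each extract / _mm_slli_epi32 / insert
 * sequence above: one full-width 32-bit shift. */
static inline __m256i slli29_avx2(__m256i v)
{
	return _mm256_slli_epi32(v, 29);
}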
Example #11
inline void newsincos_ps(avx_m256_t x, avx_m256_t *s, avx_m256_t *c) {
	avx_m256_t sign_bit = _mm256_and_ps(x, _ps_sign_mask);
	x = _mm256_and_ps(x, _ps_inv_sign_mask);

	avx_m256_t y = _mm256_mul_ps(x, _ps_cephes_FOPI);

	//avx_m256i_t emm2 = _mm256_cvttps_epi32(y);
	//emm2 = _mm256_add_epi32(emm2, _pi32_1);
	avx_m256i_t emm2 = _mm256_cvttps_epi32(_mm256_add_ps(y, _ps_1));

	//emm2 = _mm256_and_si256(emm2, _pi32_inv1);
	emm2 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm2), _mm256_castsi256_ps(_pi32_inv1)));

	y = _mm256_cvtepi32_ps(emm2);

	//avx_m256i_t cos_emm2 = _mm256_sub_epi32(emm2, _pi32_2);
	avx_m256i_t cos_emm2 = _mm256_cvtps_epi32(_mm256_sub_ps(_mm256_cvtepi32_ps(emm2), _ps_2));

	//avx_m256i_t emm0 = _mm256_and_si256(emm2, _pi32_4);
	avx_m256i_t emm0 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm2),
											_mm256_castsi256_ps(_pi32_4)));

	//avx_m256i_t cos_emm0 = _mm256_andnot_si256(cos_emm2, _pi32_4);
	avx_m256i_t cos_emm0 = _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(cos_emm2),
											_mm256_castsi256_ps(_pi32_4)));

	//emm0 = _mm256_slli_epi32(emm0, 29);
	__m128i emm0hi = _mm256_extractf128_si256(emm0, 0);
	__m128i emm0lo = _mm256_extractf128_si256(emm0, 1);
	emm0hi = _mm_slli_epi32(emm0hi, 29);
	emm0lo = _mm_slli_epi32(emm0lo, 29);
	emm0 = _mm256_insertf128_si256(emm0, emm0hi, 0);
	emm0 = _mm256_insertf128_si256(emm0, emm0lo, 1);

	//cos_emm0 = _mm256_slli_epi32(cos_emm0, 29);
	__m128i cos_emm0hi = _mm256_extractf128_si256(cos_emm0, 0);
	__m128i cos_emm0lo = _mm256_extractf128_si256(cos_emm0, 1);
	cos_emm0hi = _mm_slli_epi32(cos_emm0hi, 29);
	cos_emm0lo = _mm_slli_epi32(cos_emm0lo, 29);
	cos_emm0 = _mm256_insertf128_si256(cos_emm0, cos_emm0hi, 0);
	cos_emm0 = _mm256_insertf128_si256(cos_emm0, cos_emm0lo, 1);

	//emm2 = _mm256_and_si256(emm2, _pi32_2);
	emm2 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm2),
											_mm256_castsi256_ps(_pi32_2)));

	//cos_emm2 = _mm256_and_si256(cos_emm2, _pi32_2);
	cos_emm2 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(cos_emm2),
											_mm256_castsi256_ps(_pi32_2)));

	//emm2 = _mm256_cmpeq_epi32(emm2, _mm256_setzero_si256());
	emm2 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(emm2), _mm256_setzero_ps(), _CMP_EQ_UQ));

	//cos_emm2 = _mm256_cmpeq_epi32(cos_emm2, _mm256_setzero_si256());
	cos_emm2 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(cos_emm2), _mm256_setzero_ps(), _CMP_EQ_UQ));

	avx_m256_t emm0f = _mm256_castsi256_ps(emm0);
	avx_m256_t emm2f = _mm256_castsi256_ps(emm2);
	avx_m256_t cos_emm0f = _mm256_castsi256_ps(cos_emm0);
	avx_m256_t cos_emm2f = _mm256_castsi256_ps(cos_emm2);

	sign_bit = _mm256_xor_ps(sign_bit, emm0f);

	avx_m256_t temp_2 = _ps_minus_cephes_DP123;
	temp_2 = _mm256_mul_ps(y, temp_2);
	x = _mm256_add_ps(x, temp_2);

	avx_m256_t x2 = _mm256_mul_ps(x, x);
	avx_m256_t x3 = _mm256_mul_ps(x2, x);
	avx_m256_t x4 = _mm256_mul_ps(x2, x2);

	y = _ps_coscof_p0;
	avx_m256_t y2 = _ps_sincof_p0;
	y = _mm256_mul_ps(y, x2);
	y2 = _mm256_mul_ps(y2, x2);
	y = _mm256_add_ps(y, _ps_coscof_p1);
	y2 = _mm256_add_ps(y2, _ps_sincof_p1);
	y = _mm256_mul_ps(y, x2);
	y2 = _mm256_mul_ps(y2, x2);
	y = _mm256_add_ps(y, _ps_coscof_p2);
	y2 = _mm256_add_ps(y2, _ps_sincof_p2);
	y = _mm256_mul_ps(y, x4);
	y2 = _mm256_mul_ps(y2, x3);
	temp_2 = _mm256_mul_ps(x2, _ps_0p5);
	y2 = _mm256_add_ps(y2, x);
	temp_2 = _mm256_sub_ps(temp_2, _ps_1);
	y = _mm256_sub_ps(y, temp_2);

	avx_m256_t cos_y = y;
	avx_m256_t cos_y2 = y2;
	y = _mm256_andnot_ps(emm2f, y);
	cos_y = _mm256_andnot_ps(cos_emm2f, cos_y);
	y2 = _mm256_and_ps(emm2f, y2);
	cos_y2 = _mm256_and_ps(cos_emm2f, cos_y2);
	y = _mm256_add_ps(y, y2);
	cos_y = _mm256_add_ps(cos_y, cos_y2);

	*s = _mm256_xor_ps(y, sign_bit);
	*c = _mm256_xor_ps(cos_y, cos_emm0f);
} // newsincos_ps()
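A usage sketch, assuming avx_m256_t is a typedef for __m256 (the wrapper below is hypothetical, not part of the original):

void sincos8(const float* in, float* sin_out, float* cos_out)
{
	avx_m256_t s, c;
	newsincos_ps(_mm256_loadu_ps(in), &s, &c);
	_mm256_storeu_ps(sin_out, s);
	_mm256_storeu_ps(cos_out, c);
}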
Example #12
File: wm.c Project: krh/ksim
void
dump_surface(const char *filename, uint32_t binding_table_offset, int i)
{
	struct surface s;
	char *linear;
	__m256i alpha;

	get_surface(binding_table_offset, i, &s);

	int png_format;
	switch (s.format) {
	case SF_R8G8B8X8_UNORM:
	case SF_R8G8B8A8_UNORM:
	case SF_R8G8B8X8_UNORM_SRGB:
	case SF_R8G8B8A8_UNORM_SRGB:
		png_format = PNG_FORMAT_RGBA;
		break;
	case SF_B8G8R8A8_UNORM:
	case SF_B8G8R8X8_UNORM:
	case SF_B8G8R8A8_UNORM_SRGB:
	case SF_B8G8R8X8_UNORM_SRGB:
		png_format = PNG_FORMAT_BGRA;
		break;
	default:
		stub("image format");
		return;
	}

	switch (s.format) {
	case SF_R8G8B8X8_UNORM:
	case SF_B8G8R8X8_UNORM:
	case SF_R8G8B8X8_UNORM_SRGB:
	case SF_B8G8R8X8_UNORM_SRGB:
		alpha = _mm256_set1_epi32(0xff000000);
		break;
	default:
		alpha = _mm256_set1_epi32(0);
		break;
	}

	switch (s.tile_mode) {
	case LINEAR:
		linear = s.pixels;
		break;
	case XMAJOR:
		linear = detile_xmajor(&s, alpha);
		break;
	case YMAJOR:
		linear = detile_ymajor(&s, alpha);
		break;
	default:
		linear = s.pixels;
		stub("detile wmajor");
		break;
	}

	FILE *f = fopen(filename, "wb");
	ksim_assert(f != NULL);

	png_image pi = {
		.version = PNG_IMAGE_VERSION,
		.width = s.width,
		.height = s.height,
		.format = png_format
	};

	ksim_assert(png_image_write_to_stdio(&pi, f, 0, linear, s.stride, NULL));

	fclose(f);

	if (linear != s.pixels)
		free(linear);
}

static void
depth_test(struct primitive *p, struct dispatch *d)
{
	uint32_t cpp = depth_format_size(gt.depth.format);

	struct reg w_unorm;
	struct reg d24x8, cmp, d_f;

	void *base = ymajor_offset(p->depth.buffer, d->x, d->y, gt.depth.stride, cpp);

	if (gt.depth.test_enable) {
		const __m256 inv_scale = _mm256_set1_ps(1.0f / 16777215.0f);
		switch (gt.depth.format) {
		case D32_FLOAT:
			d_f.reg = _mm256_load_ps(base);
			break;
		case D24_UNORM_X8_UINT:
			d24x8.ireg = _mm256_load_si256(base);
			d_f.reg = _mm256_mul_ps(_mm256_cvtepi32_ps(d24x8.ireg),
						inv_scale);
			break;
		case D16_UNORM:
			stub("D16_UNORM");
		default:
			ksim_unreachable("invalid depth format");
		}

		/* Swizzle two middle pixel pairs so that dword 0-3 and 4-7
		 * match the shader dispatch subspan ordering. */
		d_f.ireg = _mm256_permute4x64_epi64(d_f.ireg, SWIZZLE(0, 2, 1, 3));

		switch (gt.depth.test_function) {
		case COMPAREFUNCTION_ALWAYS:
			cmp.reg = _mm256_cmp_ps(d_f.reg, d->w.reg, _CMP_TRUE_US);
			break;
		case COMPAREFUNCTION_NEVER:
			cmp.reg = _mm256_cmp_ps(d_f.reg, d->w.reg, _CMP_FALSE_OS);
			break;
		case COMPAREFUNCTION_LESS:
			cmp.reg = _mm256_cmp_ps(d_f.reg, d->w.reg, _CMP_LT_OS);
			break;
		case COMPAREFUNCTION_EQUAL:
			cmp.reg = _mm256_cmp_ps(d_f.reg, d->w.reg, _CMP_EQ_OS);
			break;
		case COMPAREFUNCTION_LEQUAL:
			cmp.reg = _mm256_cmp_ps(d_f.reg, d->w.reg, _CMP_LE_OS);
			break;
		case COMPAREFUNCTION_GREATER:
			cmp.reg = _mm256_cmp_ps(d_f.reg, d->w.reg, _CMP_GT_OS);
			break;
		case COMPAREFUNCTION_NOTEQUAL:
			cmp.reg = _mm256_cmp_ps(d_f.reg, d->w.reg, _CMP_NEQ_OS);
			break;
		case COMPAREFUNCTION_GEQUAL:
			cmp.reg = _mm256_cmp_ps(d_f.reg, d->w.reg, _CMP_GE_OS);
			break;
		}
		d->mask.ireg = _mm256_and_si256(cmp.ireg, d->mask.ireg);
	}

	if (gt.depth.write_enable) {
		const __m256 scale = _mm256_set1_ps(16777215.0f);
		const __m256 half =  _mm256_set1_ps(0.5f);

		struct reg w;
		w.ireg = _mm256_permute4x64_epi64(d->w.ireg, SWIZZLE(0, 2, 1, 3));
		__m256i m = _mm256_permute4x64_epi64(d->mask.ireg,
						     SWIZZLE(0, 2, 1, 3));

		switch (gt.depth.format) {
		case D32_FLOAT:
			_mm256_maskstore_ps(base, m, w.reg);
			break;
		case D24_UNORM_X8_UINT:
			w_unorm.ireg = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(w.reg, scale), half));
			_mm256_maskstore_epi32(base, m, w_unorm.ireg);
			break;
		case D16_UNORM:
			stub("D16_UNORM");
		default:
			ksim_unreachable("invalid depth format");
		}

	}
}
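The D24_UNORM constants above map depth in [0, 1] to 24-bit codes; a scalar sketch of the round trip (hypothetical helpers, matching the scale and inv_scale used in depth_test):

#include <stdint.h>

static inline uint32_t d24_encode(float d)
{
	/* scale = 2^24 - 1 = 16777215, +0.5f for round-to-nearest,
	 * as in the write_enable path. */
	return (uint32_t) (d * 16777215.0f + 0.5f);
}

static inline float d24_decode(uint32_t bits)
{
	/* Matches inv_scale in the test_enable path. */
	return (float) bits * (1.0f / 16777215.0f);
}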
Example #13
 INLINE const avxi min( const avxi& a, const avxi& b ) { return _mm256_cvtps_epi32(_mm256_min_ps(_mm256_cvtepi32_ps(a), _mm256_cvtepi32_ps(b))); }
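Note the float round trip: int32-to-float conversion is exact only for magnitudes up to 2^24, so this min can mis-compare larger values. On AVX2 the same operation is a single exact integer instruction; a hypothetical variant in the same style, assuming avxi converts to and from __m256i as the code above suggests:

 INLINE const avxi min_avx2( const avxi& a, const avxi& b ) { return _mm256_min_epi32(a, b); }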
Example #14
 INLINE const avxi operator *( const avxi& a, const avxi& b ) { return _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(a), _mm256_cvtepi32_ps(b))); }
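The same caveat applies more strongly here: the product is exact only while it fits in float's 24-bit significand. AVX2's _mm256_mullo_epi32 returns the low 32 bits of each product exactly; a hypothetical variant:

 INLINE const avxi mul_avx2( const avxi& a, const avxi& b ) { return _mm256_mullo_epi32(a, b); }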
Example #15
 INLINE explicit avxi( const __m256 a ) : m256(_mm256_cvtps_epi32(a)) {}
Example #16
static void
sfid_render_cache_rt_write_simd8_rgba_uint32_linear(struct thread *t,
        const struct sfid_render_cache_args *args)
{
    const int slice_y = args->rt.minimum_array_element * args->rt.qpitch;
    const int x = t->grf[1].uw[4];
    const int y = t->grf[1].uw[5] + slice_y;
    const struct reg *src = &t->grf[args->src];

    __m128i *base0 = args->rt.pixels + x * args->rt.cpp + y * args->rt.stride;
    __m128i *base1 = (void *) base0 + args->rt.stride;

    __m256i rg0145 = _mm256_unpacklo_epi32(src[0].ireg, src[1].ireg);
    __m256i rg2367 = _mm256_unpackhi_epi32(src[0].ireg, src[1].ireg);
    __m256i ba0145 = _mm256_unpacklo_epi32(src[2].ireg, src[3].ireg);
    __m256i ba2367 = _mm256_unpackhi_epi32(src[2].ireg, src[3].ireg);

    __m256i rgba04 = _mm256_unpacklo_epi64(rg0145, ba0145);
    __m256i rgba15 = _mm256_unpackhi_epi64(rg0145, ba0145);

    __m256i rgba26 = _mm256_unpacklo_epi64(rg2367, ba2367);
    __m256i rgba37 = _mm256_unpackhi_epi64(rg2367, ba2367);

    struct reg mask = { .ireg = t->mask_q1 };

    if (mask.d[0] < 0)
        base0[0] = _mm256_extractf128_si256(rgba04, 0);
    if (mask.d[1] < 0)
        base0[1] = _mm256_extractf128_si256(rgba15, 0);
    if (mask.d[2] < 0)
        base1[0] = _mm256_extractf128_si256(rgba26, 0);
    if (mask.d[3] < 0)
        base1[1] = _mm256_extractf128_si256(rgba37, 0);

    if (mask.d[4] < 0)
        base0[2] = _mm256_extractf128_si256(rgba04, 1);
    if (mask.d[5] < 0)
        base0[3] = _mm256_extractf128_si256(rgba15, 1);
    if (mask.d[6] < 0)
        base1[2] = _mm256_extractf128_si256(rgba26, 1);
    if (mask.d[7] < 0)
        base1[3] = _mm256_extractf128_si256(rgba37, 1);
}

static void
write_uint16_linear(struct thread *t,
                    const struct sfid_render_cache_args *args,
                    __m256i r, __m256i g, __m256i b, __m256i a)
{
    const int slice_y = args->rt.minimum_array_element * args->rt.qpitch;
    const int x = t->grf[1].uw[4];
    const int y = t->grf[1].uw[5] + slice_y;
    __m256i rg, ba;

    rg = _mm256_slli_epi32(g, 16);
    rg = _mm256_or_si256(rg, r);
    ba = _mm256_slli_epi32(a, 16);
    ba = _mm256_or_si256(ba, b);

    __m256i p0 = _mm256_unpacklo_epi32(rg, ba);
    __m256i m0 = _mm256_cvtepi32_epi64(_mm256_extractf128_si256(t->mask_q1, 0));

    __m256i p1 = _mm256_unpackhi_epi32(rg, ba);
    __m256i m1 = _mm256_cvtepi32_epi64(_mm256_extractf128_si256(t->mask_q1, 1));

    void *base = args->rt.pixels + x * args->rt.cpp + y * args->rt.stride;

    _mm_maskstore_epi64(base,
                        _mm256_extractf128_si256(m0, 0),
                        _mm256_extractf128_si256(p0, 0));
    _mm_maskstore_epi64((base + 16),
                        _mm256_extractf128_si256(m1, 0),
                        _mm256_extractf128_si256(p0, 1));

    _mm_maskstore_epi64((base + args->rt.stride),
                        _mm256_extractf128_si256(m0, 1),
                        _mm256_extractf128_si256(p1, 0));
    _mm_maskstore_epi64((base + args->rt.stride + 16),
                        _mm256_extractf128_si256(m1, 1),
                        _mm256_extractf128_si256(p1, 1));
}

static void
sfid_render_cache_rt_write_simd8_rgba_unorm16_linear(struct thread *t,
        const struct sfid_render_cache_args *args)
{
    __m256i r, g, b, a;
    const __m256 scale = _mm256_set1_ps(65535.0f);
    const __m256 half =  _mm256_set1_ps(0.5f);
    struct reg *src = &t->grf[args->src];

    r = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[0].reg, scale), half));
    g = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[1].reg, scale), half));
    b = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[2].reg, scale), half));
    a = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[3].reg, scale), half));

    write_uint16_linear(t, args, r, g, b, a);
}
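A scalar sketch of the channel packing at the top of write_uint16_linear (hypothetical helper): two 16-bit channels per 32-bit word, with the first channel in the low half.

#include <stdint.h>

/* Matches rg = (g << 16) | r and ba = (a << 16) | b above. */
static inline uint32_t pack_channels16(uint32_t lo, uint32_t hi)
{
    return (hi << 16) | lo;
}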