vector unit_vector(const __m256& x1, const __m256& y1, const __m256& x2, const __m256& y2)
    {
        const __m256 x_diff = _mm256_sub_ps(x2, x1);
        const __m256 y_diff = _mm256_sub_ps(y2, y1);

        const __m256 dist = distance(x1, y1, x2, y2);

        const __m256 ux = _mm256_div_ps(x_diff, dist);
        const __m256 uy = _mm256_div_ps(y_diff, dist);

        vector result = {ux, uy};

        return result;
    }
Esempio n. 2
0
double compute_pi_leibniz_avx_opt_single(size_t n)
{
	double pi = 0.0;
	register __m256 ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8;
	register __m256 ymm9, ymm10, ymm11, ymm12, ymm13;

	ymm0 = _mm256_set_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0);
	ymm1 = _mm256_set_ps(1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0);
	ymm2 = _mm256_set_ps(17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0, 31.0);
	ymm3 = _mm256_set_ps(33.0, 35.0, 37.0, 39.0, 41.0, 43.0, 45.0, 47.0);
	ymm4 = _mm256_set_ps(49.0, 51.0, 53.0, 55.0, 57.0, 59.0, 61.0, 63.0);
	ymm13 = _mm256_set1_ps(64.0);

	ymm5 = _mm256_setzero_ps();
	ymm6 = _mm256_setzero_ps();
	ymm7 = _mm256_setzero_ps();
	ymm8 = _mm256_setzero_ps();
	
	for (int i = 0; i <= n - 32; i += 32) {
		ymm9 = _mm256_div_ps(ymm0, ymm1);
		ymm1 = _mm256_add_ps(ymm1, ymm13);
		ymm10 = _mm256_div_ps(ymm0, ymm2);
		ymm2 = _mm256_add_ps(ymm2, ymm13);
		ymm11 = _mm256_div_ps(ymm0, ymm3);
		ymm3 = _mm256_add_ps(ymm3, ymm13);
		ymm12 = _mm256_div_ps(ymm0, ymm4);
		ymm4 = _mm256_add_ps(ymm4, ymm13);

		ymm5 = _mm256_add_ps(ymm5, ymm9);
		ymm6 = _mm256_add_ps(ymm6, ymm10);
		ymm7 = _mm256_add_ps(ymm7, ymm11);
		ymm8 = _mm256_add_ps(ymm8, ymm12);
	}
	float tmp[8] __attribute__((aligned(32)));
	_mm256_store_ps(tmp, ymm5);
	pi += tmp[0] + tmp[1] + tmp[2] + tmp[3] + \
	      tmp[4] + tmp[5] + tmp[6] + tmp[7];
	_mm256_store_ps(tmp, ymm6);
	pi += tmp[0] + tmp[1] + tmp[2] + tmp[3] + \
	      tmp[4] + tmp[5] + tmp[6] + tmp[7];
	_mm256_store_ps(tmp, ymm7);
	pi += tmp[0] + tmp[1] + tmp[2] + tmp[3] + \
	      tmp[4] + tmp[5] + tmp[6] + tmp[7];
	_mm256_store_ps(tmp, ymm8);
	pi += tmp[0] + tmp[1] + tmp[2] + tmp[3] + \
	      tmp[4] + tmp[5] + tmp[6] + tmp[7];

	return pi * 4.0;
}
Esempio n. 3
0
static void quantize_block(const float *in_data, float *out_data, float *quant_tbl)
{
	int zigzag;

	__m256 result, dct_values, quant_values;
	__m256 factor = _mm256_set1_ps(0.25f);

	for (zigzag = 0; zigzag < 64; zigzag += 8)
	{
		// Set the dct_values for the current interation
		dct_values = _mm256_set_ps(in_data[UV_indexes[zigzag + 7]], in_data[UV_indexes[zigzag + 6]],
				in_data[UV_indexes[zigzag + 5]], in_data[UV_indexes[zigzag + 4]],
				in_data[UV_indexes[zigzag + 3]], in_data[UV_indexes[zigzag + 2]],
				in_data[UV_indexes[zigzag + 1]], in_data[UV_indexes[zigzag]]);

		// Multiply with 0.25 to divide by 4.0
		result = _mm256_mul_ps(dct_values, factor);

		// Load quant-values and multiply with previous product
		quant_values = _mm256_load_ps(quant_tbl + zigzag);
		result = _mm256_div_ps(result, quant_values);

		// Round off values and store in out_data buffer
		result = c63_mm256_roundhalfawayfromzero_ps(result);
		_mm256_store_ps(out_data + zigzag, result);
	}
}
Esempio n. 4
0
void THFloatVector_divs_AVX(float *y, const float *x, const float c, const ptrdiff_t n) {
  ptrdiff_t i;
  __m256 YMM15 = _mm256_set_ps(c, c, c, c, c, c, c, c);
  __m256 YMM0, YMM1;
  for (i=0; i<=((n)-16); i+=16) {
    YMM0 = _mm256_loadu_ps(x+i);
    YMM1 = _mm256_loadu_ps(x+i+8);
    YMM0 = _mm256_div_ps(YMM0, YMM15);
    YMM1 = _mm256_div_ps(YMM1, YMM15);
    _mm256_storeu_ps(y+i, YMM0);
    _mm256_storeu_ps(y+i+8, YMM1);
  }
  for (; i<(n); i++) {
    y[i] = x[i] / c;
  }
}
Esempio n. 5
0
void NBodyAlgorithmCPU::calculateAcceleration(const float3(&posI)[8], const float massJ, const float3 posJ, float *accI) {
    __m256 pix = _mm256_set_ps(posI[7].x, posI[6].x, posI[5].x, posI[4].x, posI[3].x, posI[2].x, posI[1].x, posI[0].x);
    __m256 piy = _mm256_set_ps(posI[7].y, posI[6].y, posI[5].y, posI[4].y, posI[3].y, posI[2].y, posI[1].y, posI[0].y);
    __m256 piz = _mm256_set_ps(posI[7].z, posI[6].z, posI[5].z, posI[4].z, posI[3].z, posI[2].z, posI[1].z, posI[0].z);

    __m256 pjx = _mm256_set1_ps(posJ.x);
    __m256 pjy = _mm256_set1_ps(posJ.y);
    __m256 pjz = _mm256_set1_ps(posJ.z);

    __m256 rx = _mm256_sub_ps(pjx, pix);
    __m256 ry = _mm256_sub_ps(pjy, piy);
    __m256 rz = _mm256_sub_ps(pjz, piz);

    __m256 eps2 = _mm256_set1_ps(mp_properties->EPS2);

    __m256 rx2 = _mm256_mul_ps(rx, rx);
    __m256 ry2 = _mm256_mul_ps(ry, ry);
    __m256 rz2 = _mm256_mul_ps(rz, rz);
    __m256 rabs = _mm256_sqrt_ps(_mm256_add_ps(_mm256_add_ps(rx2, ry2), _mm256_add_ps(rz2, eps2)));

    __m256 m = _mm256_set1_ps(massJ);
    __m256 rabsInv = _mm256_div_ps(m, _mm256_mul_ps(_mm256_mul_ps(rabs, rabs), rabs));

    __m256 aix = _mm256_mul_ps(rx, rabsInv);
    __m256 aiy = _mm256_mul_ps(ry, rabsInv);
    __m256 aiz = _mm256_mul_ps(rz, rabsInv);

    _mm256_store_ps(accI, aix);
    _mm256_store_ps(accI + 8, aiy);
    _mm256_store_ps(accI + 16, aiz);

}
Esempio n. 6
0
void NBodyAlgorithmCPU::calculateAcceleration(const float3(&posI)[8], const float massJ, const float3 posJ, float3(&accI)[8]) {
    __m256 pix = _mm256_set_ps(posI[0].x, posI[1].x, posI[2].x, posI[3].x, posI[4].x, posI[5].x, posI[6].x, posI[7].x);
    __m256 piy = _mm256_set_ps(posI[0].y, posI[1].y, posI[2].y, posI[3].y, posI[4].y, posI[5].y, posI[6].y, posI[7].y);
    __m256 piz = _mm256_set_ps(posI[0].z, posI[1].z, posI[2].z, posI[3].z, posI[4].z, posI[5].z, posI[6].z, posI[7].z);

    __m256 pjx = _mm256_set1_ps(posJ.x);
    __m256 pjy = _mm256_set1_ps(posJ.y);
    __m256 pjz = _mm256_set1_ps(posJ.z);

    __m256 rx = _mm256_sub_ps(pjx, pix);
    __m256 ry = _mm256_sub_ps(pjy, piy);
    __m256 rz = _mm256_sub_ps(pjz, piz);

    __m256 eps2 = _mm256_set1_ps(mp_properties->EPS2);

    __m256 rx2 = _mm256_mul_ps(rx, rx);
    __m256 ry2 = _mm256_mul_ps(ry, ry);
    __m256 rz2 = _mm256_mul_ps(rz, rz);
    __m256 rabs = _mm256_sqrt_ps(_mm256_add_ps(_mm256_add_ps(rx2, ry2), _mm256_add_ps(rz2, eps2)));

    __m256 m = _mm256_set1_ps(massJ);
    __m256 rabsInv = _mm256_div_ps(m, _mm256_mul_ps(_mm256_mul_ps(rabs, rabs), rabs));

    __m256 aix = _mm256_mul_ps(rx, rabsInv);
    __m256 aiy = _mm256_mul_ps(ry, rabsInv);
    __m256 aiz = _mm256_mul_ps(rz, rabsInv);

    for (int i = 0; i < 8; i++) {
        accI[7 - i].x = aix.m256_f32[i];
        accI[7 - i].y = aiy.m256_f32[i];
        accI[7 - i].z = aiz.m256_f32[i];
    }

}
Esempio n. 7
0
void THFloatVector_cdiv_AVX(float *z, const float *x, const float *y, const ptrdiff_t n) {
  ptrdiff_t i;
  __m256 YMM0, YMM1, YMM2, YMM3;
  for (i=0; i<=((n)-16); i+=16) {
    YMM0 = _mm256_loadu_ps(x+i);
    YMM1 = _mm256_loadu_ps(x+i+8);
    YMM2 = _mm256_loadu_ps(y+i);
    YMM3 = _mm256_loadu_ps(y+i+8);
    YMM2 = _mm256_div_ps(YMM0, YMM2);
    YMM3 = _mm256_div_ps(YMM1, YMM3);
    _mm256_storeu_ps(z+i, YMM2);
    _mm256_storeu_ps(z+i+8, YMM3);
  }
  for (; i<(n); i++) {
    z[i] = x[i] / y[i];
  }
}
Esempio n. 8
0
void 
nv_vector_inv(nv_matrix_t *a, int am, const nv_matrix_t *x, int xm)
{
	NV_ASSERT(a->n >= x->n);
#if NV_ENABLE_AVX
	{
		__m256 xx, vv;
		int n;
		int pk_lp = (x->n & 0xfffffff8);

		vv = _mm256_set1_ps(1.0f);

		for (n = 0; n < pk_lp; n += 8) {
			xx = _mm256_load_ps(&NV_MAT_V(x, xm, n));
			xx = _mm256_div_ps(vv, xx);
			_mm256_store_ps(&NV_MAT_V(a, am, n), xx);
		}
		for (n = pk_lp; n < x->n; ++n) {
			NV_MAT_V(a, am, n) = 1.0f / NV_MAT_V(x, xm, n);
		}
	}
#elif NV_ENABLE_SSE2
	{
		__m128 xx, vv;
		int n;
		int pk_lp = (x->n & 0xfffffffc);

		vv = _mm_set1_ps(1.0f);

		for (n = 0; n < pk_lp; n += 4) {
			xx = _mm_load_ps(&NV_MAT_V(x, xm, n));
			xx = _mm_div_ps(vv, xx);
			_mm_store_ps(&NV_MAT_V(a, am, n), xx);
		}
		for (n = pk_lp; n < x->n; ++n) {
			NV_MAT_V(a, am, n) = 1.0f / NV_MAT_V(x, xm, n);
		}
	}
#else
	{
		int n;
		for (n = 0; n < x->n; ++n) {
			NV_MAT_V(a, am, n) = 1.0f / NV_MAT_V(x, xm, n);
		}
	}
#endif
}
Esempio n. 9
0
void static
avx_test (void)
{
  int i;
  union256 u, s1, s2;
  float e[8];

  s1.x = _mm256_set_ps (24.43, 68.346, 43.35, 546.46, 46.79, 82.78, 82.7, 9.4);
  s2.x = _mm256_set_ps (1.17, 2.16, 3.15, 4.14, 5.13, 6.12, 7.11, 8.9);
  u.x = _mm256_div_ps (s1.x, s2.x);

  for (i = 0; i < 8; i++)
    e[i] = s1.a[i] / s2.a[i];

  if (check_union256 (u, e))
    abort ();
}
Esempio n. 10
0
void NBodyAlgorithmCPU::calculateAccelerationWithColor(const float3(&posI)[8], const float massJ, const float3 posJ, float3(&accI)[8], unsigned int(&numNeighbours)[8]) {
    __m256 pix = _mm256_set_ps(posI[0].x, posI[1].x, posI[2].x, posI[3].x, posI[4].x, posI[5].x, posI[6].x, posI[7].x);
    __m256 piy = _mm256_set_ps(posI[0].y, posI[1].y, posI[2].y, posI[3].y, posI[4].y, posI[5].y, posI[6].y, posI[7].y);
    __m256 piz = _mm256_set_ps(posI[0].z, posI[1].z, posI[2].z, posI[3].z, posI[4].z, posI[5].z, posI[6].z, posI[7].z);

    __m256 pjx = _mm256_set1_ps(posJ.x);
    __m256 pjy = _mm256_set1_ps(posJ.y);
    __m256 pjz = _mm256_set1_ps(posJ.z);

    __m256 rx = _mm256_sub_ps(pjx, pix);
    __m256 ry = _mm256_sub_ps(pjy, piy);
    __m256 rz = _mm256_sub_ps(pjz, piz);

    __m256 eps2 = _mm256_set1_ps(mp_properties->EPS2);

    __m256 rx2 = _mm256_mul_ps(rx, rx);
    __m256 ry2 = _mm256_mul_ps(ry, ry);
    __m256 rz2 = _mm256_mul_ps(rz, rz);
    __m256 rabs = _mm256_sqrt_ps(_mm256_add_ps(_mm256_add_ps(rx2, ry2), _mm256_add_ps(rz2, eps2)));

    __m256 cmpDistance = _mm256_set1_ps(float(mp_properties->positionScale));
    __m256 close = _mm256_cmp_ps(rabs, cmpDistance, 2);

    for (int i = 0; i < 8; i++) {
        if (close.m256_f32[i] == 0) {
            numNeighbours[7 - i] = 0;
        }
    }

    __m256 m = _mm256_set1_ps(massJ);
    __m256 rabsInv = _mm256_div_ps(m, _mm256_mul_ps(_mm256_mul_ps(rabs, rabs), rabs));

    __m256 aix = _mm256_mul_ps(rx, rabsInv);
    __m256 aiy = _mm256_mul_ps(ry, rabsInv);
    __m256 aiz = _mm256_mul_ps(rz, rabsInv);

    for (int i = 0; i < 8; i++) {
        accI[7 - i].x = aix.m256_f32[i];
        accI[7 - i].y = aiy.m256_f32[i];
        accI[7 - i].z = aiz.m256_f32[i];
    }

}
Esempio n. 11
0
    div(avx_simd_complex_float<T> lhs, avx_simd_complex_float<T> rhs) {
        //lhs = [x1.real, x1.img, x2.real, x2.img ...]
        //rhs = [y1.real, y1.img, y2.real, y2.img ...]

        //ymm0 = [y1.real, y1.real, y2.real, y2.real, ...]
        __m256 ymm0 = _mm256_moveldup_ps(rhs.value);

        //ymm1 = [y1.imag, y1.imag, y2.imag, y2.imag]
        __m256 ymm1 = _mm256_movehdup_ps(rhs.value);

        //ymm2 = [x1.img, x1.real, x2.img, x2.real]
        __m256 ymm2 = _mm256_permute_ps(lhs.value, 0b10110001);

        //ymm4 = [x.img * y.img, x.real * y.img]
        __m256 ymm4 = _mm256_mul_ps(ymm2, ymm1);

        //ymm5 = subadd((lhs * ymm0), ymm4)

#ifdef __FMA__
        __m256 ymm5 = _mm256_fmsubadd_ps(lhs.value, ymm0, ymm4);
#else
        __m256 t1    = _mm256_mul_ps(lhs.value, ymm0);
        __m256 t2    = _mm256_sub_ps(_mm256_set1_ps(0.0), ymm4);
        __m256 ymm5  = _mm256_addsub_ps(t1, t2);
#endif

        //ymm3 = [y.imag^2, y.imag^2]
        __m256 ymm3 = _mm256_mul_ps(ymm1, ymm1);

        //ymm0 = (ymm0 * ymm0 + ymm3)

#ifdef __FMA__
        ymm0 = _mm256_fmadd_ps(ymm0, ymm0, ymm3);
#else
        __m256 t3    = _mm256_mul_ps(ymm0, ymm0);
        ymm0         = _mm256_add_ps(t3, ymm3);
#endif

        //result = ymm5 / ymm0
        return _mm256_div_ps(ymm5, ymm0);
    }
void AVXAccelerator::forcesFor(std::size_t i, std::vector<XY>& forces) const
{
    const std::size_t objs = m_objects->size();

    // AVX can be used for 8 element aligned packs.
    const std::size_t first_simd_idx = (i + 8) & (-8);
    const std::size_t last_simd_idx = objs & (-8);

    // pre AVX calculations (for elements before first_simd_idx)
    std::size_t j = i + 1;
    for(; j < std::min(first_simd_idx, objs); j++)
    {
        const XY force_vector = force(i, j);

        forces[i] += force_vector;
        forces[j] += -force_vector;
    }

    // AVX calculations (for elements between first_simd_idx and last_simd_idx)
    for(; j < last_simd_idx; j+=8)
    {
        const float G = 6.6732e-11;

        const float xi    = m_objects->getX()[i];
        const __m256 x0   = {xi, xi, xi, xi, xi, xi, xi, xi};
        const __m256 x1234 = _mm256_load_ps( &m_objects->getX()[j] );

        const float yi    = m_objects->getY()[i];
        const __m256 y0   = {yi, yi, yi, yi, yi, yi, yi, yi};
        const __m256 y1234 = _mm256_load_ps( &m_objects->getY()[j] );

        const float mi    = m_objects->getMass()[i];
        const __m256 m0   = {mi, mi, mi, mi, mi, mi, mi, mi};
        const __m256 m1234 = _mm256_load_ps( &m_objects->getMass()[j] );

        const __m256 dist = utils::distance(x0, y0, x1234, y1234);
        const __m256 dist2 = _mm256_mul_ps(dist, dist);

        const __m256 vG = {G, G, G, G, G, G, G, G};
        const __m256 vG_m0 = _mm256_mul_ps(vG, m0);

        const __m256 m1234_dist2 = _mm256_div_ps(m1234, dist2);

        const __m256 Fg = _mm256_mul_ps(vG_m0, m1234_dist2);

        utils::vector force_vector = utils::unit_vector(x0, y0, x1234, y1234);

        force_vector.x = _mm256_mul_ps(force_vector.x, Fg);
        force_vector.y = _mm256_mul_ps(force_vector.y, Fg);

        for (int k = 0; k < 8; k++)
        {
            forces[i] += XY(force_vector.x[k], force_vector.y[k]);
            forces[j + k] += XY(-force_vector.x[k], -force_vector.y[k]);
        }
    }

    // post AVX calculations (for elements after last_simd_idx)
    for(; j < objs; j++)
    {
        const XY force_vector = force(i, j);

        forces[i] += force_vector;
        forces[j] += -force_vector;
    }
}
Esempio n. 13
0
void molec_quadrant_neighbor_interaction_fma(molec_Quadrant_t q, molec_Quadrant_t q_n, float* Epot_)
{
#ifdef __AVX2__
    const __m256 sigLJ = _mm256_set1_ps(molec_parameter->sigLJ);
    const __m256 epsLJ = _mm256_set1_ps(molec_parameter->epsLJ);

    const __m256 Rcut2 = _mm256_set1_ps(molec_parameter->Rcut2);

    const int N = q.N;
    const int N_n = q_n.N_pad;

    __m256 Epot8 = _mm256_setzero_ps();
    __m256 _1 = _mm256_set1_ps(1.f);
    __m256 _2 = _mm256_set1_ps(2.f);
    __m256 _24epsLJ = _mm256_mul_ps(_mm256_set1_ps(24.f), epsLJ);

    for(int i = 0; i < N; ++i)
    {
        const __m256 xi = _mm256_set1_ps(q.x[i]);
        const __m256 yi = _mm256_set1_ps(q.y[i]);
        const __m256 zi = _mm256_set1_ps(q.z[i]);

        __m256 f_xi = _mm256_setzero_ps();
        __m256 f_yi = _mm256_setzero_ps();
        __m256 f_zi = _mm256_setzero_ps();

        for(int j = 0; j < N_n; j += 8)
        {
            // count number of interactions
            if(MOLEC_CELLLIST_COUNT_INTERACTION)
                ++num_potential_interactions;

            // load coordinates and fores into AVX vectors
            const __m256 xj = _mm256_load_ps(&q_n.x[j]);
            const __m256 yj = _mm256_load_ps(&q_n.y[j]);
            const __m256 zj = _mm256_load_ps(&q_n.z[j]);

            __m256 f_xj = _mm256_load_ps(&q_n.f_x[j]);
            __m256 f_yj = _mm256_load_ps(&q_n.f_y[j]);
            __m256 f_zj = _mm256_load_ps(&q_n.f_z[j]);


            // distance computation
            const __m256 xij = _mm256_sub_ps(xi, xj);
            const __m256 yij = _mm256_sub_ps(yi, yj);
            const __m256 zij = _mm256_sub_ps(zi, zj);


            const __m256 zij2 = _mm256_mul_ps(zij, zij);
            const __m256 r2 = _mm256_fmadd_ps(xij, xij, _mm256_fmadd_ps(yij, yij, zij2));


            // r2 < Rcut2
            const __m256 mask = _mm256_cmp_ps(r2, Rcut2, _CMP_LT_OQ);

            // if( any(r2 < R2) )
            if(_mm256_movemask_ps(mask))
            {
                const __m256 r2inv = _mm256_div_ps(_1, r2);

                const __m256 s2 = _mm256_mul_ps(_mm256_mul_ps(sigLJ, sigLJ), r2inv);
                const __m256 s6 = _mm256_mul_ps(_mm256_mul_ps(s2, s2), s2);
                const __m256 s12 = _mm256_mul_ps(s6, s6);

                const __m256 s12_minus_s6 = _mm256_sub_ps(s12, s6);
                const __m256 two_s12_minus_s6 = _mm256_sub_ps(_mm256_mul_ps(_2, s12), s6);

                Epot8 = _mm256_add_ps(Epot8, _mm256_and_ps(s12_minus_s6, mask));

                const __m256 fr = _mm256_mul_ps(_mm256_mul_ps(_24epsLJ, r2inv), two_s12_minus_s6);
                const __m256 fr_mask = _mm256_and_ps(fr, mask);


                // update forces
                f_xi = _mm256_fmadd_ps(fr_mask, xij,f_xi);
                f_yi = _mm256_fmadd_ps(fr_mask, yij,f_yi);
                f_zi = _mm256_fmadd_ps(fr_mask, zij,f_zi);

                f_xj = _mm256_fnmadd_ps(fr_mask,xij,f_xj);
                f_yj = _mm256_fnmadd_ps(fr_mask,yij,f_yj);
                f_zj = _mm256_fnmadd_ps(fr_mask,zij,f_zj);

                // store back j-forces
                _mm256_store_ps(&q_n.f_x[j], f_xj);
                _mm256_store_ps(&q_n.f_y[j], f_yj);
                _mm256_store_ps(&q_n.f_z[j], f_zj);
            }
        }

        // update i-forces
        float MOLEC_ALIGNAS(32) f_array[8];
        _mm256_store_ps(f_array, f_xi);
        q.f_x[i] += f_array[0] + f_array[1] + f_array[2] + f_array[3] + f_array[4] + f_array[5]
                    + f_array[6] + f_array[7];
        _mm256_store_ps(f_array, f_yi);
        q.f_y[i] += f_array[0] + f_array[1] + f_array[2] + f_array[3] + f_array[4] + f_array[5]
                    + f_array[6] + f_array[7];
        _mm256_store_ps(f_array, f_zi);
        q.f_z[i] += f_array[0] + f_array[1] + f_array[2] + f_array[3] + f_array[4] + f_array[5]
                    + f_array[6] + f_array[7];
    }

    float MOLEC_ALIGNAS(32) E_pot_array[8];
    _mm256_store_ps(E_pot_array, Epot8);

    // perform reduction of potential energy
    *Epot_ += 4
              * molec_parameter->epsLJ*(E_pot_array[0] + E_pot_array[1] + E_pot_array[2]
                                        + E_pot_array[3] + E_pot_array[4] + E_pot_array[5]
                                        + E_pot_array[6] + E_pot_array[7]);
#endif
}
Esempio n. 14
0
void animate()
{
	float mx;
	float my;
	if(ManualControl)
	{
		POINT pos;
		GetCursorPos(&pos);
		RECT rc;
		GetClientRect(hMainWnd, &rc);
		ScreenToClient(hMainWnd, &pos);

		mx = pos.x;
		my = pos.y;
	}
	else
	{
		UpdatePosition(mx, my);
	}


	const auto size = partCount;

	VertexData *pVertexBuffer;
	pVertexObject->Lock(0, 0, (void**)&pVertexBuffer, D3DLOCK_DISCARD);

	_mm256_zeroall();

#pragma omp parallel \
	shared(pVertexBuffer, particlesCoord, particlesVel, mx, my, size)
	{
#pragma omp for nowait
		for(int i = 0; i < size; i += 4)
		{
			float mouseCoordVec[8] = { mx, my, mx, my, mx, my, mx, my };

			float *particleCoordsVec = (float*)particlesCoord + i;
			float *velocityVec = (float*)particlesVel + i;

			auto xyCoord = _mm256_loadu_ps(particleCoordsVec);
			auto hwTempData = _mm256_sub_ps(xyCoord, _mm256_loadu_ps(mouseCoordVec));

			auto squares = _mm256_mul_ps(hwTempData, hwTempData);
			auto distSquare = _mm256_hadd_ps(squares, squares);
			distSquare = _mm256_shuffle_ps(distSquare, distSquare, 0x50);

			auto theForce = _mm256_div_ps(_mm256_set1_ps(G), distSquare);

			if(distSquare.m256_f32[0] < 400)
			{
				theForce.m256_f32[0] = 0;
				theForce.m256_f32[1] = 0;
			}

			if(distSquare.m256_f32[2] < 400)
			{
				theForce.m256_f32[2] = 0;
				theForce.m256_f32[3] = 0;
			}
			if(distSquare.m256_f32[4] < 400)
			{
				theForce.m256_f32[4] = 0;
				theForce.m256_f32[5] = 0;
			}

			if(distSquare.m256_f32[6] < 400)
			{
				theForce.m256_f32[6] = 0;
				theForce.m256_f32[7] = 0;
			}

			auto xyForces = _mm256_mul_ps(_mm256_xor_ps(hwTempData, _mm256_set1_ps(-0.f)), theForce);

			auto xyVelocities = _mm256_loadu_ps(velocityVec);
			xyVelocities = _mm256_mul_ps(xyVelocities, _mm256_set1_ps(Resistance));
			xyVelocities = _mm256_add_ps(xyVelocities, xyForces);

			xyCoord = _mm256_add_ps(xyCoord, xyVelocities);

			_mm256_storeu_ps(velocityVec, xyVelocities);
			_mm256_storeu_ps(particleCoordsVec, xyCoord);


			processIfOutOfBounds(((ParticleCoord*)particleCoordsVec)[0], ((ParticleVel*)velocityVec)[0]);
			processIfOutOfBounds(((ParticleCoord*)particleCoordsVec)[1], ((ParticleVel*)velocityVec)[1]);
			processIfOutOfBounds(((ParticleCoord*)particleCoordsVec)[2], ((ParticleVel*)velocityVec)[2]);
			processIfOutOfBounds(((ParticleCoord*)particleCoordsVec)[3], ((ParticleVel*)velocityVec)[3]);

			pVertexBuffer[i].x = ((ParticleCoord*)particleCoordsVec)[0].x;
			pVertexBuffer[i].y = ((ParticleCoord*)particleCoordsVec)[0].y;
			pVertexBuffer[i + 1].x = ((ParticleCoord*)particleCoordsVec)[1].x;
			pVertexBuffer[i + 1].y = ((ParticleCoord*)particleCoordsVec)[1].y;
			pVertexBuffer[i + 2].x = ((ParticleCoord*)particleCoordsVec)[2].x;
			pVertexBuffer[i + 2].y = ((ParticleCoord*)particleCoordsVec)[2].y;
			pVertexBuffer[i + 3].x = ((ParticleCoord*)particleCoordsVec)[3].x;
			pVertexBuffer[i + 3].y = ((ParticleCoord*)particleCoordsVec)[3].y;
		}
	}
	pVertexObject->Unlock();

	_mm256_zeroall();
}
void Decoder::ADMMDecoder_deg_6_7_2_3_6()
{
	int maxIter          = maxIteration;
	float mu             = 5.5f; 
	float tableau[12]    = { 0.0f };

    if ((mBlocklength == 576) && (mNChecks == 288))
    {
     	mu          = 3.37309f;//penalty
        tableau[2]  = 0.00001f;
	tableau[3]  = 2.00928f;
	tableau[6]  = 4.69438f;

    }
    else if((mBlocklength == 2304) && (mNChecks == 1152) )
    {
    	mu          = 3.81398683f;//penalty
        tableau[2]  = 0.29669288f; 
	tableau[3]  = 0.46964023f;
	tableau[6]  = 3.19548154f;
    }
    else
    {
    	mu          = 5.5;//penalty
        tableau[2]  = 0.8f;
	tableau[3]  = 0.8f;
	tableau[6]  = 0.8f;
    }

    const float rho      = 1.9f;    //over relaxation parameter;
    const float un_m_rho = 1.0 - rho;
    const auto  _rho      = _mm256_set1_ps(      rho );
    const auto  _un_m_rho = _mm256_set1_ps( un_m_rho );
    float tableaX[12];

    //
    // ON PRECALCULE LES CONSTANTES
    //
	#pragma  unroll
    for (int i = 0; i < 7; i++)
    {
        tableaX[i] = tableau[ i ] / mu;
    }
	const auto t_mu    = _mm256_set1_ps ( mu );

	const auto t2_amu  = _mm256_set1_ps (        tableau[ 2 ] / mu   );
	const auto t3_amu  = _mm256_set1_ps (        tableau[ 3 ] / mu   );
	const auto t6_amu  = _mm256_set1_ps (        tableau[ 6 ] / mu   );

	const auto t2_2amu = _mm256_set1_ps ( 2.0f * tableau[ 2 ] / mu );
	const auto t3_2amu = _mm256_set1_ps ( 2.0f * tableau[ 3 ] / mu );
	const auto t6_2amu = _mm256_set1_ps ( 2.0f * tableau[ 6 ] / mu );

	const auto t2_deg  = _mm256_set1_ps ( 2.0f );
	const auto t3_deg  = _mm256_set1_ps ( 3.0f );
	const auto t6_deg  = _mm256_set1_ps ( 6.0f );

	const auto zero    = _mm256_set1_ps ( 0.0f );
	const auto un      = _mm256_set1_ps ( 1.0f );
    const __m256 a     = _mm256_set1_ps ( 0.0f );
    const __m256 b     = _mm256_set1_ps ( 0.5f );

    //////////////////////////////////////////////////////////////////////////////////////
	#pragma  unroll
	for( int j = 0; j < _mPCheckMapSize; j+=8 )
    {
	_mm256_store_ps(&Lambda  [j],         a);
        _mm256_store_ps(&zReplica[j],         b);
        _mm256_store_ps(&latestProjVector[j], b);
    }
    //////////////////////////////////////////////////////////////////////////////////////

	for(int i = 0; i < maxIter; i++)
	{
        int ptr    = 0;
		mIteration = i + 1;

    	//
    	// MEASURE OF THE VN EXECUTION TIME
    	//
		#ifdef PROFILE_ON
				const auto start = timer();
		#endif

        //
		// VN processing kernel
		//
		#pragma  unroll
		for (int j = 0; j < _mBlocklength; j++)
        {
            const int degVn = VariableDegree[j];
            float M[8] __attribute__((aligned(64)));

            if( degVn == 2 ){
#if 1
            	const int dVN = 2;
            	for(int qq = 0; qq < 8; qq++) 
		{
    				M[qq] = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);      ptr   += 1;
    				#pragma  unroll
    				for(int k = 1; k < dVN; k++) 
				{
    					M[qq] += (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]); ptr   += 1;
    				}
    		}
    		const auto m      = _mm256_loadu_ps( M );
    		const auto llr    = _mm256_loadu_ps( &_LogLikelihoodRatio[j] );
    		const auto t1     = _mm256_sub_ps(m, _mm256_div_ps(llr, t_mu));
    		const auto xx     = _mm256_div_ps(_mm256_sub_ps(t1, t2_amu), _mm256_sub_ps(t2_deg, t2_2amu));
    		const auto vMin   = _mm256_max_ps(_mm256_min_ps(xx, un) , zero);
    		_mm256_storeu_ps(&OutputFromDecoder[j], vMin);
    		j += 7;
#else
            	const int degVN = 2;
                float temp = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
		#pragma unroll
		for(int k = 1; k < degVN; k++)
			temp += (zReplica[ t_row[ptr + k] ] + Lambda[ t_row[ptr + k] ]);
		ptr  += degVN;
	        const float _amu_    = tableaX[ degVN ];
	        const float _2_amu_  = _amu_+ _amu_;
	        const float llr  = _LogLikelihoodRatio[j];
	        const float t    = temp - llr / mu;
	        const float xx   = (t  -  _amu_)/(degVn - _2_amu_);
	        const float vMax = std::min(xx,   1.0f);
	        const float vMin = std::max(vMax, 0.0f);
		OutputFromDecoder[j] = vMin;
#endif
            }else if( degVn == 3 ){
#if 1
            	const int dVN = 3;
            	for(int qq = 0; qq < 8; qq++) 
		{
    			M[qq] = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);      ptr   += 1;
    			#pragma  unroll
    			for(int k = 1; k < dVN; k++) 
			{
    				M[qq] += (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]); ptr   += 1;
    			}
    		}
    		const auto m      = _mm256_loadu_ps( M );
    		const auto llr    = _mm256_loadu_ps( &_LogLikelihoodRatio[j] );
    		const auto t1     = _mm256_sub_ps(m, _mm256_div_ps(llr, t_mu));
    		const auto xx     = _mm256_div_ps(_mm256_sub_ps(t1, t3_amu), _mm256_sub_ps(t3_deg, t3_2amu));
    		const auto vMin   = _mm256_max_ps(_mm256_min_ps(xx, un) , zero);
    		_mm256_storeu_ps(&OutputFromDecoder[j], vMin);
    		j += 7;
#else
    		const int degVN = 3;
                float temp = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
		#pragma unroll
		for(int k = 1; k < degVN; k++)
			temp += (zReplica[ t_row[ptr + k] ] + Lambda[ t_row[ptr + k] ]);
		ptr  += degVN;
	        const float _amu_    = tableaX[ degVN ];
	        const float _2_amu_  = _amu_+ _amu_;
	        const float llr  = _LogLikelihoodRatio[j];
	        const float t    = temp - llr / mu;
	        const float xx   = (t  -  _amu_)/(degVn - _2_amu_);
	        const float vMax = std::min(xx,   1.0f);
	        const float vMin = std::max(vMax, 0.0f);
		OutputFromDecoder[j] = vMin;
#endif
		}else if( degVn == 6 ){
#if 1
            	const int dVN = 6;
            	for(int qq = 0; qq < 8; qq++) 
		{
    			M[qq] = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);      ptr   += 1;
    			#pragma  unroll
    			for(int k = 1; k < dVN; k++) 
			{
    				M[qq] += (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]); ptr   += 1;
    			}
    		}
    		const auto m      = _mm256_loadu_ps( M );
    		const auto llr    = _mm256_loadu_ps( &_LogLikelihoodRatio[j] );
    		const auto t1     = _mm256_sub_ps(m, _mm256_div_ps(llr, t_mu));
    		const auto xx     = _mm256_div_ps(_mm256_sub_ps(t1, t6_amu), _mm256_sub_ps(t6_deg, t6_2amu));
    		const auto vMin   = _mm256_max_ps(_mm256_min_ps(xx, un) , zero);
    		_mm256_storeu_ps(&OutputFromDecoder[j], vMin);
    		j += 7;
#else
    		const int degVN = 6;
                float temp = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
		#pragma unroll
		for(int k = 1; k < degVN; k++)
			temp += (zReplica[ t_row[ptr + k] ] + Lambda[ t_row[ptr + k] ]);
		ptr  += degVN;
	        const float _amu_    = tableaX[ degVN ];
	        const float _2_amu_  = _amu_+ _amu_;
	        const float llr  = _LogLikelihoodRatio[j];
	        const float t    = temp - llr / mu;
	        const float xx   = (t  -  _amu_)/(degVn - _2_amu_);
	        const float vMax = std::min(xx,   1.0f);
	        const float vMin = std::max(vMax, 0.0f);
		OutputFromDecoder[j] = vMin;
#endif
            }
        }

    	//
    	// MEASURE OF THE VN EXECUTION TIME
    	//
		#ifdef PROFILE_ON
				t_vn   += (timer() - start);
		#endif

		//
		// CN processing kernel
		//
	int CumSumCheckDegree = 0; // cumulative position of currect edge in factor graph
        int allVerified       = 0;
	float vector_before_proj[8] __attribute__((aligned(64)));

        const auto zero    = _mm256_set1_ps ( 0.0f    );
        const auto mask_6  = _mm256_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
        const auto mask_7  = _mm256_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
        const auto  dot5   = _mm256_set1_ps(     0.5f );

    	//
    	// MEASURE OF THE CN EXECUTION TIME
    	//
		#ifdef PROFILE_ON
				const auto starT = timer();
		#endif

    	const auto seuilProj = _mm256_set1_ps( 1e-5f );
        for(int j = 0; j < _mNChecks; j++)
		{
            if( CheckDegree[j] == 6 ){
            	const int  cDeg6       = 0x3F;
                const auto offsets    = _mm256_loadu_si256  ((const __m256i*)&t_col1  [CumSumCheckDegree]);
                const auto xpred      = _mm256_mask_i32gather_ps (zero, OutputFromDecoder, offsets, _mm256_castsi256_ps(mask_6), 4);
    		const auto synd       = _mm256_cmp_ps( xpred, dot5,   _CMP_GT_OS );
    		int test              = (_mm256_movemask_ps( synd ) & cDeg6);  // deg 6
    		const auto syndrom    = _mm_popcnt_u32( test );
    		const auto _Replica   = _mm256_loadu_ps( &zReplica[CumSumCheckDegree]);
    		const auto _ambda     = _mm256_loadu_ps( &Lambda  [CumSumCheckDegree]);
    		const auto v1         = _mm256_mul_ps  (xpred,      _rho );
    		const auto v2         = _mm256_mul_ps  ( _Replica, _un_m_rho );
    		const auto v3         = _mm256_add_ps  ( v1, v2 );
    		const auto vect_proj  = _mm256_sub_ps  ( v3, _ambda );

                //
                // ON REALISE LA PROJECTION !!!
                //
                allVerified       += ( syndrom & 0x01 );

    	    	//
    	    	// MEASURE OF THE PROJECTION EXECUTION TIME
    	    	//
                #ifdef PROFILE_ON
					const auto START = timer();
		#endif
    		const auto latest    = _mm256_loadu_ps(&latestProjVector[CumSumCheckDegree]);
    		const auto different = _mm256_sub_ps ( vect_proj, latest );
    		const auto maskAbsol = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    		const auto absolute  = _mm256_and_ps ( different, maskAbsol );
    	        const auto despass   = _mm256_cmp_ps( absolute, seuilProj, _CMP_GT_OS );
    	        int skip = (_mm256_movemask_ps( despass ) & cDeg6) == 0x00; // degree 6

    	        if( skip == false )
    	        {
    	        	const auto _ztemp  = mp.projection_deg6( vect_proj );
    	    		const auto _ztemp1 = _mm256_sub_ps(_ztemp,    xpred );
    	    		const auto _ztemp2 = _mm256_sub_ps(_ztemp, _Replica );
	    	    	const auto _ztemp3 = _mm256_mul_ps(_ztemp1, _rho);
	    	    	const auto _ztemp4 = _mm256_mul_ps(_ztemp2, _un_m_rho);
    			const auto nLambda = _mm256_add_ps( _ambda,  _ztemp3 );
    			const auto mLambda = _mm256_add_ps( nLambda, _ztemp4 );
    	    		_mm256_maskstore_ps(&  Lambda[CumSumCheckDegree],         mask_6,   mLambda);
    	    		_mm256_maskstore_ps(&zReplica[CumSumCheckDegree],         mask_6,    _ztemp);
    	        }
	    	_mm256_maskstore_ps(&latestProjVector[CumSumCheckDegree], mask_6, vect_proj);

    	    	//
    	    	// MEASURE OF THE PROJECTION EXECUTION TIME
    	    	//
    	        #ifdef PROFILE_ON
					t_pj   += (timer() - START);
		#endif
                CumSumCheckDegree += 6;

            }else if( CheckDegree[j] == 7 )
	    {
            	const int  cDeg7       = 0x7F;
                const auto offsets    = _mm256_loadu_si256  ((const __m256i*)&t_col1  [CumSumCheckDegree]);
                const auto xpred      = _mm256_mask_i32gather_ps (zero, OutputFromDecoder, offsets, _mm256_castsi256_ps(mask_7), 4);
    		const auto synd       = _mm256_cmp_ps( xpred, dot5,   _CMP_GT_OS );
    		const int  test       = (_mm256_movemask_ps( synd ) & cDeg7); // deg 7
    		const auto syndrom    = _mm_popcnt_u32( test );
    		const auto _Replica   = _mm256_loadu_ps( &zReplica[CumSumCheckDegree]);
    		const auto _ambda     = _mm256_loadu_ps( &Lambda  [CumSumCheckDegree]);
    		const auto v1         = _mm256_mul_ps  ( xpred,    _rho );
    		const auto v2         = _mm256_mul_ps  ( _Replica, _un_m_rho );
    		const auto v3         = _mm256_add_ps  ( v1, v2 );
    		const auto vect_proj  = _mm256_sub_ps  ( v3, _ambda );

                //
                // ON REALISE LA PROJECTION !!!
                //
                allVerified         += ( syndrom & 0x01 );

    	    	//
    	    	// MEASURE OF THE PROJECTION EXECUTION TIME
    	    	//
                #ifdef PROFILE_ON
					const auto START = timer();
		#endif
    		const auto latest    = _mm256_loadu_ps(&latestProjVector[CumSumCheckDegree]);
    		const auto different = _mm256_sub_ps ( vect_proj, latest );
    		const auto maskAbsol = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    		const auto absolute  = _mm256_and_ps ( different, maskAbsol );
    	        const auto despass   = _mm256_cmp_ps( absolute, seuilProj, _CMP_GT_OS );
    	        int skip = (_mm256_movemask_ps( despass ) & cDeg7) == 0x00; // degree 7

    	        if( skip == false )
    	        {
			const auto _ztemp  = mp.projection_deg7( vect_proj );
    	    		const auto _ztemp1 = _mm256_sub_ps(_ztemp,    xpred );
    	    		const auto _ztemp2 = _mm256_sub_ps(_ztemp, _Replica );
    	    		const auto _ztemp3 = _mm256_mul_ps(_ztemp1, _rho);
    	    		const auto _ztemp4 = _mm256_mul_ps(_ztemp2, _un_m_rho);
    			const auto nLambda = _mm256_add_ps( _ambda,  _ztemp3 );
    			const auto mLambda = _mm256_add_ps( nLambda, _ztemp4 );
    	    		_mm256_maskstore_ps(&  Lambda        [CumSumCheckDegree], mask_7,   mLambda);
    	    		_mm256_maskstore_ps(&zReplica        [CumSumCheckDegree], mask_7,    _ztemp);
    	        }
	    	_mm256_maskstore_ps(&latestProjVector[CumSumCheckDegree], mask_7, vect_proj);

    	    	//
    	    	// MEASURE OF THE PROJECTION EXECUTION TIME
    	    	//
    	        #ifdef PROFILE_ON
							t_pj   += (timer() - START);
		#endif

                CumSumCheckDegree += 7;

            }else{
                exit( 0 );
            }
        }

    	//
    	// MEASURE OF THE CN LOOP EXECUTION TIME
    	//
        #ifdef PROFILE_ON
				t_cn   += (timer() - starT);
	#endif
	#ifdef PROFILE_ON
	t_ex += 1;
		//FILE *ft=fopen("time.txt","a");
		//fprintf(ft,"%d \n", t_cn/t_ex);
		//fprintf(ft,"%d %d %d \n", t_cn, t_vn, t_pj);
		//fclose(ft);
	#endif
		if(allVerified == 0)
		{
			mAlgorithmConverge = true;
			mValidCodeword     = true;
			break;
		}
	}

	//
	// MEASURE OF THE NUMBER OF EXECUTION
	//
//	#ifdef PROFILE_ON
//		t_ex += 1;
//	#endif

}
Esempio n. 16
0
inline vec8 operator/(vec8 a, vec8 b) { return _mm256_div_ps(a, b); }
Esempio n. 17
0
 /*!
  * \brief Divide the two given vectors
  */
 ETL_STATIC_INLINE(avx_simd_float) div(avx_simd_float lhs, avx_simd_float rhs) {
     return _mm256_div_ps(lhs.value, rhs.value);
 }
    void run_softmax_int32_float_work_item_batch8x(nn_workload_item *const work_item, uint16_t NoBatch8)
    {
        nn_workload_data_t *input_view = work_item->input[0]->output;
        const auto &arguments = work_item->arguments.forward_softmax_fixedpoint;

        const auto input_width = input_view->parent->lengths.t[NN_DATA_COORD_z] * input_view->parent->lengths.t[NN_DATA_COORD_p];
        const auto output_width = work_item->output->view_end.t[NN_DATA_COORD_x] - work_item->output->view_begin.t[NN_DATA_COORD_x] + 1;
        const auto batch_size_global = work_item->output->parent->lengths.t[NN_DATA_COORD_n];
        const auto batch_size = 8;

        const auto num_full_blocks = output_width / C_max_acc;
        const auto partial_block_size = output_width % C_max_acc;

        const auto output_view_start = work_item->output->view_begin.t[NN_DATA_COORD_x] * batch_size * NoBatch8;
        const auto input_view_start = input_view->view_begin.t[NN_DATA_COORD_z] * input_view->parent->lengths.t[NN_DATA_COORD_p] * batch_size * NoBatch8;

        const auto out_fraction = arguments.input_fraction;

        float * input_f = (float*)_mm_malloc(input_width * batch_size * sizeof(float), 64);
        float * output_f = (float*)_mm_malloc(output_width * batch_size * sizeof(float), 64);

        auto input_buffer = &static_cast<int32_t*>(input_view->parent->data_buffer)[input_view_start + NoBatch8 * 8 * input_width];
        //auto input_buffer = &static_cast<int32_t*>(input_view->parent->data_buffer)[input_view_start];

        auto shift = out_fraction;
        if (shift > 0)
        {
            for (uint32_t i = 0; i < input_width * batch_size; i++)
                input_f[i] = (float)(input_buffer[i]) / (1 << shift);
        }
        else if (shift < 0)
        {
            for (uint32_t i = 0; i < input_width* batch_size; i++)
                input_f[i] = (float)(input_buffer[i]) * (1 << -shift);
        }
        else
        {
            for (uint32_t i = 0; i < input_width* batch_size; i++)
                input_f[i] = (float)(input_buffer[i]);
        }

        __m256 acc_sum = _mm256_setzero_ps();

        {
            auto input_buffer = input_f;
            //auto output_buffer = &static_cast<float*>(work_item->output->parent->data_buffer)[output_view_start + NoBatch8 * 8 * output_width];
            auto output_buffer = output_f;
            for (auto block = 0u; block < num_full_blocks; ++block)
            {
                // Run computation.
                softmax_compute_block<C_max_acc>(input_buffer, output_buffer, acc_sum);
            }

            switch (partial_block_size)
            {
            case  0: break;
            case  1: softmax_compute_block< 1>(input_buffer, output_buffer, acc_sum); break;
            case  2: softmax_compute_block< 2>(input_buffer, output_buffer, acc_sum); break;
            case  3: softmax_compute_block< 3>(input_buffer, output_buffer, acc_sum); break;
            case  4: softmax_compute_block< 4>(input_buffer, output_buffer, acc_sum); break;
            case  5: softmax_compute_block< 5>(input_buffer, output_buffer, acc_sum); break;
            case  6: softmax_compute_block< 6>(input_buffer, output_buffer, acc_sum); break;
            case  7: softmax_compute_block< 7>(input_buffer, output_buffer, acc_sum); break;
            case  8: softmax_compute_block< 8>(input_buffer, output_buffer, acc_sum); break;
            case  9: softmax_compute_block< 9>(input_buffer, output_buffer, acc_sum); break;
            case 10: softmax_compute_block<10>(input_buffer, output_buffer, acc_sum); break;
            case 11: softmax_compute_block<11>(input_buffer, output_buffer, acc_sum); break;
            case 12: softmax_compute_block<12>(input_buffer, output_buffer, acc_sum); break;
            case 13: softmax_compute_block<13>(input_buffer, output_buffer, acc_sum); break;
            case 14: softmax_compute_block<14>(input_buffer, output_buffer, acc_sum); break;
            default: NN_UNREACHABLE_CODE;
            }
        }

        acc_sum = _mm256_div_ps(_mm256_set1_ps(1.0f), acc_sum);

        {
            //auto output_buffer = &static_cast<float*>(work_item->output->parent->data_buffer)[output_view_start + NoBatch8 * 8 * output_width];
            auto output_buffer = output_f;
            for (auto block = 0u; block < num_full_blocks; ++block)
            {
                // Run computation.
                softmax_finalize_block<C_max_acc>(output_buffer, acc_sum);
            }

            switch (partial_block_size)
            {
            case  0: break;
            case  1: softmax_finalize_block< 1>(output_buffer, acc_sum); break;
            case  2: softmax_finalize_block< 2>(output_buffer, acc_sum); break;
            case  3: softmax_finalize_block< 3>(output_buffer, acc_sum); break;
            case  4: softmax_finalize_block< 4>(output_buffer, acc_sum); break;
            case  5: softmax_finalize_block< 5>(output_buffer, acc_sum); break;
            case  6: softmax_finalize_block< 6>(output_buffer, acc_sum); break;
            case  7: softmax_finalize_block< 7>(output_buffer, acc_sum); break;
            case  8: softmax_finalize_block< 8>(output_buffer, acc_sum); break;
            case  9: softmax_finalize_block< 9>(output_buffer, acc_sum); break;
            case 10: softmax_finalize_block<10>(output_buffer, acc_sum); break;
            case 11: softmax_finalize_block<11>(output_buffer, acc_sum); break;
            case 12: softmax_finalize_block<12>(output_buffer, acc_sum); break;
            case 13: softmax_finalize_block<13>(output_buffer, acc_sum); break;
            case 14: softmax_finalize_block<14>(output_buffer, acc_sum); break;
            default: NN_UNREACHABLE_CODE;
            }
        }

        auto output_buffer = &static_cast<float*>(work_item->output->parent->data_buffer)[output_view_start];

        for (auto itrW = 0; itrW < output_width; itrW++)
        for (auto itr8 = 0; itr8 < C_batch_size; itr8++)
            output_buffer[itr8 + itrW * batch_size_global + NoBatch8 * C_batch_size] = output_f[itr8 + itrW * C_batch_size];

        _mm_free(input_f);
        _mm_free(output_f);
    }
Esempio n. 19
0
void run_dct(int width, int height, float *quant, float *input, int32_t *output)
{
  float acosvals[8][8];

  /* Calculating cosines is expensive, and there
   * are only 64 cosines that need to be calculated
   * so precompute them and cache. */
  for (int i = 0; i < 8; i++)
  {
    for (int j = 0; j < 8; j++)
    {
      if (j == 0)
      {
        acosvals[i][j] = sqrt(1.0 / 8.0) * cos(PI / 8.0 * (i + 0.5d) * j);
      }
      else
      {
        acosvals[i][j] = 0.5 * cos(PI / 8.0 * (i + 0.5d) * j);
      }
    }
  }

/* Separate the parallel from the for, so each processor gets its
   * own copy of the buffers and variables. */
#pragma omp parallel
  {
    float avload[8] = {0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5};
    avload[0] = sqrt(1.0 / 8.0);
    __m256 row0, row1, row2, row3, row4, row5, row6, row7;
    __m256 loaderlow, loaderhigh;
    __m256 temp;
    __m256 minus128 = _mm256_set1_ps(-128.0);
    __m256 avxcosloader, avxcos;
    float avxcosmover;
    __m256i integer;

/* The DCT breaks the image into 8 by 8 blocks and then
   * transforms them into color frequencies. */
#pragma omp for
    for (int brow = 0; brow < height / 8; brow++)
    {
      for (int bcol = 0; bcol < width / 8; bcol++)
      {
        int head_pointer = bcol * 8 + brow * 8 * width;
        row0 = _mm256_setzero_ps();
        row1 = _mm256_setzero_ps();
        row2 = _mm256_setzero_ps();
        row3 = _mm256_setzero_ps();
        row4 = _mm256_setzero_ps();
        row5 = _mm256_setzero_ps();
        row6 = _mm256_setzero_ps();
        row7 = _mm256_setzero_ps();

        /* This pair of loops uses AVX instuctions to add the frequency
       * component from each pixel to all of the buckets at once.  Allows
       * us to do the DCT on a block in 64 iterations of a loop rather
       * than 64 iterations of 64 iterations of a loop (all 64 pixels affect
       * all 64 frequencies) */
        for (int x = 0; x < 8; x++)
        {
          for (int y = 0; y < 4; y++)
          {
            loaderlow = _mm256_broadcast_ss(&input[head_pointer + x + (y * width)]);
            loaderlow = _mm256_add_ps(loaderlow, minus128);
            loaderhigh = _mm256_broadcast_ss(&input[head_pointer + x + ((7 - y) * width)]);
            loaderhigh = _mm256_add_ps(loaderhigh, minus128);

            avxcos = _mm256_loadu_ps(&acosvals[x][0]);
            loaderlow = _mm256_mul_ps(loaderlow, avxcos);
            loaderhigh = _mm256_mul_ps(loaderhigh, avxcos);

            avxcosloader = _mm256_loadu_ps(&acosvals[y][0]);

            avxcosmover = avxcosloader[0];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row0 = _mm256_add_ps(row0, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row0 = _mm256_add_ps(row0, temp);

            avxcosmover = avxcosloader[1];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row1 = _mm256_add_ps(row1, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row1 = _mm256_sub_ps(row1, temp);

            avxcosmover = avxcosloader[2];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row2 = _mm256_add_ps(row2, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row2 = _mm256_add_ps(row2, temp);

            avxcosmover = avxcosloader[3];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row3 = _mm256_add_ps(row3, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row3 = _mm256_sub_ps(row3, temp);

            avxcosmover = avxcosloader[4];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row4 = _mm256_add_ps(row4, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row4 = _mm256_add_ps(row4, temp);

            avxcosmover = avxcosloader[5];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row5 = _mm256_add_ps(row5, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row5 = _mm256_sub_ps(row5, temp);

            avxcosmover = avxcosloader[6];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row6 = _mm256_add_ps(row6, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row6 = _mm256_add_ps(row6, temp);

            avxcosmover = avxcosloader[7];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row7 = _mm256_add_ps(row7, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row7 = _mm256_sub_ps(row7, temp);
          }
        }

        /* Each frequency stored as a float needs to be divided by
       * the quantization value, then rounded to the nearest integer.
       * Also changes the order of the values from pixel order to
       * each 8 by 8 block stored one after another. */
        temp = _mm256_loadu_ps(&quant[0]);
        row0 = _mm256_div_ps(row0, temp);
        row0 = _mm256_round_ps(row0, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row0);
        _mm256_storeu_si256(output + (bcol + brow * (width / 8)) * 64, integer);

        temp = _mm256_loadu_ps(&quant[8]);
        row1 = _mm256_div_ps(row1, temp);
        row1 = _mm256_round_ps(row1, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row1);
        _mm256_storeu_si256(output + 8 + (bcol + brow * (width / 8)) * 64, integer);

        temp = _mm256_loadu_ps(&quant[16]);
        row2 = _mm256_div_ps(row2, temp);
        row2 = _mm256_round_ps(row2, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row2);
        _mm256_storeu_si256(output + 16 + (bcol + brow * (width / 8)) * 64, integer);

        temp = _mm256_loadu_ps(&quant[24]);
        row3 = _mm256_div_ps(row3, temp);
        row3 = _mm256_round_ps(row3, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row3);
        _mm256_storeu_si256(output + 24 + (bcol + brow * (width / 8)) * 64, integer);

        temp = _mm256_loadu_ps(&quant[32]);
        row4 = _mm256_div_ps(row4, temp);
        row4 = _mm256_round_ps(row4, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row4);
        _mm256_storeu_si256(output + 32 + (bcol + brow * (width / 8)) * 64, integer);

        temp = _mm256_loadu_ps(&quant[40]);
        row5 = _mm256_div_ps(row5, temp);
        row5 = _mm256_round_ps(row5, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row5);
        _mm256_storeu_si256(output + 40 + (bcol + brow * (width / 8)) * 64, integer);

        temp = _mm256_loadu_ps(&quant[48]);
        row6 = _mm256_div_ps(row6, temp);
        row6 = _mm256_round_ps(row6, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row6);
        _mm256_storeu_si256(output + 48 + (bcol + brow * (width / 8)) * 64, integer);

        temp = _mm256_loadu_ps(&quant[56]);
        row7 = _mm256_div_ps(row7, temp);
        row7 = _mm256_round_ps(row7, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row7);
        _mm256_storeu_si256(output + 56 + (bcol + brow * (width / 8)) * 64, integer);
      }
    }
  }
}
    void run_softmax_int32_float_work_item_latency(nn_workload_item *const work_item)
    {
        nn_workload_data_t *input_view = work_item->input[0]->output;
        const auto &arguments = work_item->arguments.forward_softmax_fixedpoint;

        const auto input_width = input_view->parent->lengths.t[NN_DATA_COORD_z] * input_view->parent->lengths.t[NN_DATA_COORD_p];
        const auto output_width = work_item->output->view_end.t[NN_DATA_COORD_x] - work_item->output->view_begin.t[NN_DATA_COORD_x] + 1;

        const auto num_full_blocks = output_width / C_data_stride;
        const auto partial_block_size = (output_width / C_simd_width) % C_max_acc;
        const auto subsimd_block_size = output_width % C_simd_width;

        const auto output_view_start = work_item->output->view_begin.t[NN_DATA_COORD_x];

        const auto input_view_start = input_view->view_begin.t[NN_DATA_COORD_z] * input_view->parent->lengths.t[NN_DATA_COORD_p];

        const auto out_fraction = arguments.input_fraction;

        float * input_f = (float*)_mm_malloc(input_width * sizeof(float), 64);

        auto input_buffer = &static_cast<int32_t*>(input_view->parent->data_buffer)[input_view_start];

        auto shift = out_fraction;
        if (shift > 0)
        {
            for (uint32_t i = 0; i < input_width; i++)
                input_f[i] = (float)(input_buffer[i]) / (1 << shift);
        }
        else if (shift < 0)
        {
            for (uint32_t i = 0; i < input_width; i++)
                input_f[i] = (float)(input_buffer[i]) * (1 << -shift);
        }
        else
        {
            for (uint32_t i = 0; i < input_width; i++)
                input_f[i] = (float)(input_buffer[i]);
        }

        __m256 acc_sum = _mm256_setzero_ps();
        float subsimd_sum = 0.0f;
        {
            auto input_buffer = input_f;
            auto output_buffer = &static_cast<float*>(work_item->output->parent->data_buffer)[output_view_start];

            for (auto block = 0u; block < num_full_blocks; ++block)
            {
                // Run computation.
                softmax_compute_block<C_max_acc>(input_buffer, output_buffer, acc_sum);
            }

            switch (partial_block_size)
            {
            case  0: break;
            case  1: softmax_compute_block< 1>(input_buffer, output_buffer, acc_sum); break;
            case  2: softmax_compute_block< 2>(input_buffer, output_buffer, acc_sum); break;
            case  3: softmax_compute_block< 3>(input_buffer, output_buffer, acc_sum); break;
            case  4: softmax_compute_block< 4>(input_buffer, output_buffer, acc_sum); break;
            case  5: softmax_compute_block< 5>(input_buffer, output_buffer, acc_sum); break;
            case  6: softmax_compute_block< 6>(input_buffer, output_buffer, acc_sum); break;
            case  7: softmax_compute_block< 7>(input_buffer, output_buffer, acc_sum); break;
            case  8: softmax_compute_block< 8>(input_buffer, output_buffer, acc_sum); break;
            case  9: softmax_compute_block< 9>(input_buffer, output_buffer, acc_sum); break;
            case 10: softmax_compute_block<10>(input_buffer, output_buffer, acc_sum); break;
            case 11: softmax_compute_block<11>(input_buffer, output_buffer, acc_sum); break;
            case 12: softmax_compute_block<12>(input_buffer, output_buffer, acc_sum); break;
            case 13: softmax_compute_block<13>(input_buffer, output_buffer, acc_sum); break;
            case 14: softmax_compute_block<14>(input_buffer, output_buffer, acc_sum); break;
            default: NN_UNREACHABLE_CODE;
            }

            switch (subsimd_block_size)
            {
            case 0: break;
            case 1: softmax_compute_subsimd<1>(input_buffer, output_buffer, subsimd_sum); break;
            case 2: softmax_compute_subsimd<2>(input_buffer, output_buffer, subsimd_sum); break;
            case 3: softmax_compute_subsimd<3>(input_buffer, output_buffer, subsimd_sum); break;
            case 4: softmax_compute_subsimd<4>(input_buffer, output_buffer, subsimd_sum); break;
            case 5: softmax_compute_subsimd<5>(input_buffer, output_buffer, subsimd_sum); break;
            case 6: softmax_compute_subsimd<6>(input_buffer, output_buffer, subsimd_sum); break;
            case 7: softmax_compute_subsimd<7>(input_buffer, output_buffer, subsimd_sum); break;
            default: NN_UNREACHABLE_CODE;
            }
        }

        {
            __m256 intermediate_sum = _mm256_hadd_ps(acc_sum, acc_sum);
            intermediate_sum = _mm256_permutevar8x32_ps(intermediate_sum, _mm256_set_epi32(0, 1, 4, 5, 2, 3, 6, 7));
            intermediate_sum = _mm256_hadd_ps(intermediate_sum, intermediate_sum);
            intermediate_sum = _mm256_hadd_ps(intermediate_sum, intermediate_sum);

            acc_sum = _mm256_add_ps(intermediate_sum, _mm256_set1_ps(subsimd_sum));
            subsimd_sum = _mm_cvtss_f32(_mm256_extractf128_ps(acc_sum, 0));

            acc_sum = _mm256_div_ps(_mm256_set1_ps(1.0f), acc_sum);
            subsimd_sum = 1.0f / subsimd_sum;
        }

        {
            auto output_buffer = &static_cast<float*>(work_item->output->parent->data_buffer)[output_view_start];

            for (auto block = 0u; block < num_full_blocks; ++block)
            {
                // Run computation.
                softmax_finalize_block<C_max_acc>(output_buffer, acc_sum);
            }

            switch (partial_block_size)
            {
            case  0: break;
            case  1: softmax_finalize_block< 1>(output_buffer, acc_sum); break;
            case  2: softmax_finalize_block< 2>(output_buffer, acc_sum); break;
            case  3: softmax_finalize_block< 3>(output_buffer, acc_sum); break;
            case  4: softmax_finalize_block< 4>(output_buffer, acc_sum); break;
            case  5: softmax_finalize_block< 5>(output_buffer, acc_sum); break;
            case  6: softmax_finalize_block< 6>(output_buffer, acc_sum); break;
            case  7: softmax_finalize_block< 7>(output_buffer, acc_sum); break;
            case  8: softmax_finalize_block< 8>(output_buffer, acc_sum); break;
            case  9: softmax_finalize_block< 9>(output_buffer, acc_sum); break;
            case 10: softmax_finalize_block<10>(output_buffer, acc_sum); break;
            case 11: softmax_finalize_block<11>(output_buffer, acc_sum); break;
            case 12: softmax_finalize_block<12>(output_buffer, acc_sum); break;
            case 13: softmax_finalize_block<13>(output_buffer, acc_sum); break;
            case 14: softmax_finalize_block<14>(output_buffer, acc_sum); break;
            default: NN_UNREACHABLE_CODE;
            }

            switch (subsimd_block_size)
            {
            case 0: break;
            case 1: softmax_finalize_subsimd<1>(output_buffer, subsimd_sum); break;
            case 2: softmax_finalize_subsimd<2>(output_buffer, subsimd_sum); break;
            case 3: softmax_finalize_subsimd<3>(output_buffer, subsimd_sum); break;
            case 4: softmax_finalize_subsimd<4>(output_buffer, subsimd_sum); break;
            case 5: softmax_finalize_subsimd<5>(output_buffer, subsimd_sum); break;
            case 6: softmax_finalize_subsimd<6>(output_buffer, subsimd_sum); break;
            case 7: softmax_finalize_subsimd<7>(output_buffer, subsimd_sum); break;
            default: NN_UNREACHABLE_CODE;
            }
        }
        _mm_free(input_f);
    }
Esempio n. 21
0
Triangle* OctreeLeaf::Query(const Ray& ray, float& t) const
{
	float tBox = std::numeric_limits<float>::min();
	if (!Intersects(ray, bb, tBox) || tBox > t)
		return nullptr;

	const __m256 rayDirX = _mm256_set1_ps(ray.Direction.X);
	const __m256 rayDirY = _mm256_set1_ps(ray.Direction.Y);
	const __m256 rayDirZ = _mm256_set1_ps(ray.Direction.Z);

	const __m256 rayPosX = _mm256_set1_ps(ray.Origin.X);
	const __m256 rayPosY = _mm256_set1_ps(ray.Origin.Y);
	const __m256 rayPosZ = _mm256_set1_ps(ray.Origin.Z);

	union { float dists[MAXSIZE]; __m256 distances[MAXSIZE / NROFLANES]; };

	for (int i = 0; i < count; i++)
	{
		// Vector3F e1 = triangle.Vertices[1].Position - triangle.Vertices[0].Position;
		const __m256 e1X = edge1X8[i];
		const __m256 e1Y = edge1Y8[i];
		const __m256 e1Z = edge1Z8[i];

		// Vector3F e2 = triangle.Vertices[2].Position - triangle.Vertices[0].Position;
		const __m256 e2X = edge2X8[i];
		const __m256 e2Y = edge2Y8[i];
		const __m256 e2Z = edge2Z8[i];

		// Vector3F p = ray.Direction.Cross(e2);
		const __m256 pX = _mm256_sub_ps(_mm256_mul_ps(rayDirY, e2Z), _mm256_mul_ps(rayDirZ, e2Y));
		const __m256 pY = _mm256_sub_ps(_mm256_mul_ps(rayDirZ, e2X), _mm256_mul_ps(rayDirX, e2Z));
		const __m256 pZ = _mm256_sub_ps(_mm256_mul_ps(rayDirX, e2Y), _mm256_mul_ps(rayDirY, e2X));

		// float det = e1.Dot(p);
		const __m256 det = _mm256_add_ps(_mm256_mul_ps(e1X, pX), _mm256_add_ps(_mm256_mul_ps(e1Y, pY), _mm256_mul_ps(e1Z, pZ)));

		// if (det > -EPSILON && det < EPSILON)
		//     return false;
		__m256 mask = _mm256_or_ps(_mm256_cmp_ps(det, _mm256_set1_ps(-EPSILON), _CMP_LE_OS), _mm256_cmp_ps(det, _mm256_set1_ps(EPSILON), _CMP_GE_OS));

		// float invDet = 1 / det;
		const __m256 invDet = _mm256_div_ps(_mm256_set1_ps(1.0f), det);

		// Vector3F r = ray.Origin - triangle.Vertices[0].Position;
		const __m256 rX = _mm256_sub_ps(rayPosX, vert0X8[i]);
		const __m256 rY = _mm256_sub_ps(rayPosY, vert0Y8[i]);
		const __m256 rZ = _mm256_sub_ps(rayPosZ, vert0Z8[i]);

		// float u = r.Dot(p) * invDet;
		const __m256 u = _mm256_mul_ps(invDet, _mm256_add_ps(_mm256_mul_ps(rX, pX), _mm256_add_ps(_mm256_mul_ps(rY, pY), _mm256_mul_ps(rZ, pZ))));

		// if (u < 0 || u > 1)
		//	   return false;
		mask = _mm256_and_ps(mask, _mm256_cmp_ps(u, _mm256_setzero_ps(), _CMP_GE_OS));

		// Vector3F q = r.Cross(e1);
		const __m256 qX = _mm256_sub_ps(_mm256_mul_ps(rY, e1Z), _mm256_mul_ps(rZ, e1Y));
		const __m256 qY = _mm256_sub_ps(_mm256_mul_ps(rZ, e1X), _mm256_mul_ps(rX, e1Z));
		const __m256 qZ = _mm256_sub_ps(_mm256_mul_ps(rX, e1Y), _mm256_mul_ps(rY, e1X));

		// float v = ray.Direction.Dot(q) * invDet;
		const __m256 v = _mm256_mul_ps(invDet, _mm256_add_ps(_mm256_mul_ps(rayDirX, qX), _mm256_add_ps(_mm256_mul_ps(rayDirY, qY), _mm256_mul_ps(rayDirZ, qZ))));

		// if (v < 0 || u + v > 1)
		//     return false;
		mask = _mm256_and_ps(mask, _mm256_and_ps(_mm256_cmp_ps(v, _mm256_setzero_ps(), _CMP_GE_OS), _mm256_cmp_ps(_mm256_add_ps(u, v), _mm256_set1_ps(1.0f), _CMP_LE_OS)));

		// float tt = e2.Dot(q) * invDet;
		const __m256 tt = _mm256_mul_ps(invDet, _mm256_add_ps(_mm256_mul_ps(e2X, qX), _mm256_add_ps(_mm256_mul_ps(e2Y, qY), _mm256_mul_ps(e2Z, qZ))));

		// if (tt > EPSILON)
		// {
		//     t = tt;
		//     return true;
		// }
		//
		// return false;
		distances[i] = _mm256_and_ps(tt, mask);
	}

	Triangle* triangle = nullptr;
	for (int i = 0; i < count * NROFLANES; i++)
		if (dists[i] < t && dists[i] > EPSILON)
		{
			t = dists[i];
			triangle = triangles[i];
		}

	return triangle;
}