Example #1
void NBodyAlgorithmCPU::calculateAcceleration(const float3(&posI)[8], const float massJ, const float3 posJ, float3(&accI)[8]) {
    __m256 pix = _mm256_set_ps(posI[0].x, posI[1].x, posI[2].x, posI[3].x, posI[4].x, posI[5].x, posI[6].x, posI[7].x);
    __m256 piy = _mm256_set_ps(posI[0].y, posI[1].y, posI[2].y, posI[3].y, posI[4].y, posI[5].y, posI[6].y, posI[7].y);
    __m256 piz = _mm256_set_ps(posI[0].z, posI[1].z, posI[2].z, posI[3].z, posI[4].z, posI[5].z, posI[6].z, posI[7].z);
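    // Note: _mm256_set_ps takes its arguments high lane first, so posI[0] ends
    // up in lane 7; the unpacking loop at the end reverses this ordering.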

    __m256 pjx = _mm256_set1_ps(posJ.x);
    __m256 pjy = _mm256_set1_ps(posJ.y);
    __m256 pjz = _mm256_set1_ps(posJ.z);

    __m256 rx = _mm256_sub_ps(pjx, pix);
    __m256 ry = _mm256_sub_ps(pjy, piy);
    __m256 rz = _mm256_sub_ps(pjz, piz);

    __m256 eps2 = _mm256_set1_ps(mp_properties->EPS2);

    __m256 rx2 = _mm256_mul_ps(rx, rx);
    __m256 ry2 = _mm256_mul_ps(ry, ry);
    __m256 rz2 = _mm256_mul_ps(rz, rz);
    __m256 rabs = _mm256_sqrt_ps(_mm256_add_ps(_mm256_add_ps(rx2, ry2), _mm256_add_ps(rz2, eps2)));

    __m256 m = _mm256_set1_ps(massJ);
    __m256 rabsInv = _mm256_div_ps(m, _mm256_mul_ps(_mm256_mul_ps(rabs, rabs), rabs));

    __m256 aix = _mm256_mul_ps(rx, rabsInv);
    __m256 aiy = _mm256_mul_ps(ry, rabsInv);
    __m256 aiz = _mm256_mul_ps(rz, rabsInv);

    for (int i = 0; i < 8; i++) {
        accI[7 - i].x = aix.m256_f32[i];
        accI[7 - i].y = aiy.m256_f32[i];
        accI[7 - i].z = aiz.m256_f32[i];
    }

}
Example #2
void NBodyAlgorithmCPU::calculateAcceleration(const float3(&posI)[8], const float massJ, const float3 posJ, float *accI) {
    __m256 pix = _mm256_set_ps(posI[7].x, posI[6].x, posI[5].x, posI[4].x, posI[3].x, posI[2].x, posI[1].x, posI[0].x);
    __m256 piy = _mm256_set_ps(posI[7].y, posI[6].y, posI[5].y, posI[4].y, posI[3].y, posI[2].y, posI[1].y, posI[0].y);
    __m256 piz = _mm256_set_ps(posI[7].z, posI[6].z, posI[5].z, posI[4].z, posI[3].z, posI[2].z, posI[1].z, posI[0].z);

    __m256 pjx = _mm256_set1_ps(posJ.x);
    __m256 pjy = _mm256_set1_ps(posJ.y);
    __m256 pjz = _mm256_set1_ps(posJ.z);

    __m256 rx = _mm256_sub_ps(pjx, pix);
    __m256 ry = _mm256_sub_ps(pjy, piy);
    __m256 rz = _mm256_sub_ps(pjz, piz);

    __m256 eps2 = _mm256_set1_ps(mp_properties->EPS2);

    __m256 rx2 = _mm256_mul_ps(rx, rx);
    __m256 ry2 = _mm256_mul_ps(ry, ry);
    __m256 rz2 = _mm256_mul_ps(rz, rz);
    __m256 rabs = _mm256_sqrt_ps(_mm256_add_ps(_mm256_add_ps(rx2, ry2), _mm256_add_ps(rz2, eps2)));

    __m256 m = _mm256_set1_ps(massJ);
    __m256 rabsInv = _mm256_div_ps(m, _mm256_mul_ps(_mm256_mul_ps(rabs, rabs), rabs));

    __m256 aix = _mm256_mul_ps(rx, rabsInv);
    __m256 aiy = _mm256_mul_ps(ry, rabsInv);
    __m256 aiz = _mm256_mul_ps(rz, rabsInv);

    _mm256_store_ps(accI, aix);
    _mm256_store_ps(accI + 8, aiy);
    _mm256_store_ps(accI + 16, aiz);
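    // The results are written as three planar 8-float blocks (x, y, z); accI
    // must point to 32-byte-aligned storage for the aligned stores above.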

}
Example #3
extern "C" void product32x32_avxf(float *a, float *b, float *c, int n) 
{
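	// Computes C -= A^T * B for row-major n x n matrices: each iteration of i
	// accumulates one 32-float output row of C in four ymm registers.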
	for(int i=0; i<n; i++) {	
		__m256 t1 = _mm256_loadu_ps(&c[i*n +  0]);
		__m256 t2 = _mm256_loadu_ps(&c[i*n +  8]);
		__m256 t3 = _mm256_loadu_ps(&c[i*n + 16]);
		__m256 t4 = _mm256_loadu_ps(&c[i*n + 24]);		
		for(int k=0; k<n; k++) {
			__m256 a1 = _mm256_set1_ps(a[k*n+i]);
			
			__m256 b1 = _mm256_loadu_ps(&b[k*n+0]);
			t1 = _mm256_sub_ps(t1,_mm256_mul_ps(a1,b1));
			
			__m256 b2 = _mm256_loadu_ps(&b[k*n+8]);
			t2 = _mm256_sub_ps(t2,_mm256_mul_ps(a1,b2));

			__m256 b3 = _mm256_loadu_ps(&b[k*n+16]);
			t3 = _mm256_sub_ps(t3,_mm256_mul_ps(a1,b3));

			__m256 b4 = _mm256_loadu_ps(&b[k*n+24]);
			t4 = _mm256_sub_ps(t4,_mm256_mul_ps(a1,b4));			
		}
		_mm256_storeu_ps(&c[i*n +  0], t1);
		_mm256_storeu_ps(&c[i*n +  8], t2);
		_mm256_storeu_ps(&c[i*n + 16], t3);
		_mm256_storeu_ps(&c[i*n + 24], t4);
	}
}
Example #4
inline float DatabaseBuilder::Distance(PackedSample* x, PackedSample* y)
{ 
#ifdef AVX
	//Black magic
	//But it produces the same results as the non-AVX code
	__m256 accumulator;
	__m256 x_s = _mm256_load_ps(x->Features);
	__m256 y_s = _mm256_load_ps(y->Features);
	__m256 result = _mm256_sub_ps(x_s, y_s);
	accumulator = _mm256_mul_ps(result, result);

	x_s = _mm256_load_ps(&x->Features[8]);
	y_s = _mm256_load_ps(&y->Features[8]);
	result = _mm256_sub_ps(x_s, y_s);
	result = _mm256_mul_ps(result, result);
	accumulator = _mm256_add_ps(accumulator, result);

	x_s = _mm256_load_ps(&x->Features[16]);
	y_s = _mm256_load_ps(&y->Features[16]);
	result = _mm256_sub_ps(x_s, y_s);
	result = _mm256_mul_ps(result, result);
	accumulator = _mm256_add_ps(accumulator, result);

	x_s = _mm256_load_ps(&x->Features[24]);
	y_s = _mm256_load_ps(&y->Features[24]);
	result = _mm256_sub_ps(x_s, y_s);
	result = _mm256_mul_ps(result, result);
	accumulator = _mm256_add_ps(accumulator, result);
	//We now have a vector of 8 floats
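	//Horizontal reduction: each hadd sums adjacent pairs inside the two
	//128-bit lanes, then the upper lane is extracted and added to the lower
	//lane, leaving the total in element 0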

	__m256 t1 = _mm256_hadd_ps(accumulator, accumulator);
	__m256 t2 = _mm256_hadd_ps(t1, t1);
	__m128 t3 = _mm256_extractf128_ps(t2, 1);
	__m128 t4 = _mm_add_ss(_mm256_castps256_ps128(t2), t3);
	//And now we don't
	return std::sqrtf(_mm_cvtss_f32(t4));
#else
	//Can be autovectorized
	float accumulator[32];
	float distance = 0;
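	//Note: this path touches only the first 30 features, while the AVX path
	//above processes 32; the two can only agree if features 30 and 31 are
	//zero padding, which the comment above appears to assume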
	for (int i = 0; i < 30; i++)
	{
		accumulator[i] = x->Features[i] - y->Features[i];
	}

	//If done properly this should be 4(8) instructions
	for (int i = 0; i < 30; i++)
	{
		distance += accumulator[i] * accumulator[i];
	}

	return std::sqrtf(distance);
#endif

	
}
Example #5
__m256 mm256_exp_ps(__m256 x) {
  __m256 tmp = _mm256_setzero_ps(), fx;
  __m256i emm0;
  __m256 one = *(__m256*)m256_ps_1;

  x = _mm256_min_ps(x, *(__m256*)m256_ps_exp_hi);
  x = _mm256_max_ps(x, *(__m256*)m256_ps_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm256_mul_ps(x, *(__m256*)m256_ps_cephes_LOG2EF);
  fx = _mm256_add_ps(fx, *(__m256*)m256_ps_0p5);

  /* how to perform a floorf with SSE: just below */
  /* step 1 : cast to int */
  emm0 = _mm256_cvttps_epi32(fx);
  /* step 2 : cast back to float */
  tmp  = _mm256_cvtepi32_ps(emm0);

  /* if greater, subtract 1 */
  __m256 mask = _mm256_cmp_ps( tmp, fx, _CMP_GT_OS );
  mask = _mm256_and_ps(mask, one);
  fx = _mm256_sub_ps(tmp, mask);

  tmp = _mm256_mul_ps(fx, *(__m256*)m256_ps_cephes_exp_C1);
  __m256 z = _mm256_mul_ps(fx, *(__m256*)m256_ps_cephes_exp_C2);
  x = _mm256_sub_ps(x, tmp);
  x = _mm256_sub_ps(x, z);

  z = _mm256_mul_ps(x,x);
  
  __m256 y = *(__m256*)m256_ps_cephes_exp_p0;
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p1);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p2);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p3);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p4);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p5);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, x);
  y = _mm256_add_ps(y, one);

  /* build 2^n */
  emm0 = _mm256_cvttps_epi32(fx);
  emm0 = _mm256_add_epi32(emm0, *(__m256i*)m256_pi32_0x7f);
  emm0 = _mm256_slli_epi32(emm0, 23);
  __m256 pow2n = _mm256_castsi256_ps(emm0);

  y = _mm256_mul_ps(y, pow2n);
  _mm256_zeroupper();
  return y;
}
Example #6
v8sf exp256_ps(v8sf x) {
  v8sf tmp = _mm256_setzero_ps(), fx;
  v8si imm0;
  v8sf one = *(v8sf*)_ps256_1;

  x = _mm256_min_ps(x, *(v8sf*)_ps256_exp_hi);
  x = _mm256_max_ps(x, *(v8sf*)_ps256_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_LOG2EF);
  fx = _mm256_add_ps(fx, *(v8sf*)_ps256_0p5);

  /* how to perform a floorf with SSE: just below */
  //imm0 = _mm256_cvttps_epi32(fx);
  //tmp  = _mm256_cvtepi32_ps(imm0);
  
  tmp = _mm256_floor_ps(fx);

  /* if greater, subtract 1 */
  //v8sf mask = _mm256_cmpgt_ps(tmp, fx);    
  v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);    
  mask = _mm256_and_ps(mask, one);
  fx = _mm256_sub_ps(tmp, mask);

  tmp = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C1);
  v8sf z = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C2);
  x = _mm256_sub_ps(x, tmp);
  x = _mm256_sub_ps(x, z);

  z = _mm256_mul_ps(x,x);
  
  v8sf y = *(v8sf*)_ps256_cephes_exp_p0;
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p1);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p2);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p3);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p4);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p5);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, x);
  y = _mm256_add_ps(y, one);

  /* build 2^n */
  imm0 = _mm256_cvttps_epi32(fx);
  // another two AVX2 instructions
  imm0 = _mm256_add_epi32(imm0, *(v8si*)_pi32_256_0x7f);
  imm0 = _mm256_slli_epi32(imm0, 23);
  v8sf pow2n = _mm256_castsi256_ps(imm0);
  y = _mm256_mul_ps(y, pow2n);
  return y;
}
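A minimal call-site sketch for the routine above (assuming the v8sf typedef and the _ps256_*/_pi32_256_* constant tables from the surrounding avx_mathfun-style source are in scope):

float in[8] = {0.f, 0.5f, 1.f, 2.f, -1.f, -2.f, 10.f, -10.f};
float out[8];
v8sf x = _mm256_loadu_ps(in);
_mm256_storeu_ps(out, exp256_ps(x));  /* out[i] is approximately expf(in[i]) */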
Example #7
inline avx_m256_t newsin_ps(avx_m256_t x) {
	avx_m256_t sign_bit = _mm256_and_ps(x, _ps_sign_mask);
	x = _mm256_and_ps(x, _ps_inv_sign_mask);
	
	avx_m256_t y = _mm256_mul_ps(x, _ps_cephes_FOPI);

	avx_m256i_t emm2 = _mm256_cvttps_epi32(y);
	emm2 = _mm256_add_epi32(emm2, _pi32_1);
	emm2 = _mm256_and_si256(emm2, _pi32_inv1);
	y = _mm256_cvtepi32_ps(emm2);

	avx_m256i_t emm0 = _mm256_and_si256(emm2, _pi32_4);
	emm0 = _mm256_slli_epi32(emm0, 29);

	emm2 = _mm256_and_si256(emm2, _pi32_2);
	emm2 = _mm256_cmpeq_epi32(emm2, _mm256_setzero_si256());
	
	avx_m256_t swap_sign_bit = _mm256_castsi256_ps(emm0);
	avx_m256_t poly_mask = _mm256_castsi256_ps(emm2);
	sign_bit = _mm256_xor_ps(sign_bit, swap_sign_bit);
	
	avx_m256_t temp = _ps_minus_cephes_DP123;
	temp = _mm256_mul_ps(y, temp);
	x = _mm256_add_ps(x, temp);

	avx_m256_t x2 = _mm256_mul_ps(x, x);
	avx_m256_t x3 = _mm256_mul_ps(x2, x);
	avx_m256_t x4 = _mm256_mul_ps(x2, x2);

	y = _ps_coscof_p0;
	avx_m256_t y2 = _ps_sincof_p0;
	y = _mm256_mul_ps(y, x2);
	y2 = _mm256_mul_ps(y2, x2);
	y = _mm256_add_ps(y, _ps_coscof_p1);
	y2 = _mm256_add_ps(y2, _ps_sincof_p1);
	y = _mm256_mul_ps(y, x2);
	y2 = _mm256_mul_ps(y2, x2);
	y = _mm256_add_ps(y, _ps_coscof_p2);
	y2 = _mm256_add_ps(y2, _ps_sincof_p2);
	y = _mm256_mul_ps(y, x4);
	y2 = _mm256_mul_ps(y2, x3);
	temp = _mm256_mul_ps(x2, _ps_0p5);
	temp = _mm256_sub_ps(temp, _ps_1);
	y = _mm256_sub_ps(y, temp);
	y2 = _mm256_add_ps(y2, x);

	y = _mm256_andnot_ps(poly_mask, y);
	y2 = _mm256_and_ps(poly_mask, y2);
	y = _mm256_add_ps(y, y2);

	y = _mm256_xor_ps(y, sign_bit);

	return y;
} // newsin_ps()
Example #8
    __m256 distance(const __m256& x1, const __m256& y1, const __m256& x2, const __m256& y2)
    {
        const __m256 x_diff = _mm256_sub_ps(x1, x2);
        const __m256 y_diff = _mm256_sub_ps(y1, y2);
        const __m256 x_diff2 = _mm256_mul_ps(x_diff, x_diff);
        const __m256 y_diff2 = _mm256_mul_ps(y_diff, y_diff);
        const __m256 sum = _mm256_add_ps(x_diff2, y_diff2);
        const __m256 dist = _mm256_sqrt_ps(sum);

        return dist;
    }
Example #9
int SymmColumnVec_32f_Unsymm_AVX(const float** src, const float* ky, float* dst, float delta, int width, int ksize2)
{
    int i = 0, k;
    const float *S, *S2;
    const __m128 d4 = _mm_set1_ps(delta);
    const __m256 d8 = _mm256_set1_ps(delta);

    for (; i <= width - 16; i += 16)
    {
        __m256 f, s0 = d8, s1 = d8;
        __m256 x0;
        S = src[0] + i;

        for (k = 1; k <= ksize2; k++)
        {
            S = src[k] + i;
            S2 = src[-k] + i;
            f = _mm256_set1_ps(ky[k]);
            x0 = _mm256_sub_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2));
#if CV_FMA3
            s0 = _mm256_fmadd_ps(x0, f, s0);
#else
            s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f));
#endif
            x0 = _mm256_sub_ps(_mm256_loadu_ps(S + 8), _mm256_loadu_ps(S2 + 8));
#if CV_FMA3
            s1 = _mm256_fmadd_ps(x0, f, s1);
#else
            s1 = _mm256_add_ps(s1, _mm256_mul_ps(x0, f));
#endif
        }

        _mm256_storeu_ps(dst + i, s0);
        _mm256_storeu_ps(dst + i + 8, s1);
    }

    for (; i <= width - 4; i += 4)
    {
        __m128 f, x0, s0 = d4;

        for (k = 1; k <= ksize2; k++)
        {
            f = _mm_set1_ps(ky[k]);
            x0 = _mm_sub_ps(_mm_load_ps(src[k] + i), _mm_load_ps(src[-k] + i));
            s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
        }

        _mm_storeu_ps(dst + i, s0);
    }

    _mm256_zeroupper();
    return i;
}
Example #10
/* AVX implementation for Zero Forcing (ZF) solver */
inline void srslte_mat_2x2_zf_avx(__m256 y0, __m256 y1, __m256 h00, __m256 h01, __m256 h10, __m256 h11,
                                      __m256 *x0, __m256 *x1, float norm) {

  __m256 det = srslte_mat_2x2_det_avx(h00, h01, h10, h11);
  __m256 detrec = _mm256_mul_ps(srslte_mat_cf_recip_avx(det), _mm256_set1_ps(norm));

#ifdef LV_HAVE_FMA
  *x0 = _MM256_PROD_PS(_MM256_PROD_SUB_PS(h11, y0, _MM256_PROD_PS(h01, y1)), detrec);
  *x1 = _MM256_PROD_PS(_MM256_PROD_SUB_PS(h00, y1, _MM256_PROD_PS(h10, y0)), detrec);
#else
  *x0 = _MM256_PROD_PS(_mm256_sub_ps(_MM256_PROD_PS(h11, y0), _MM256_PROD_PS(h01, y1)), detrec);
  *x1 = _MM256_PROD_PS(_mm256_sub_ps(_MM256_PROD_PS(h00, y1), _MM256_PROD_PS(h10, y0)), detrec);
#endif /* LV_HAVE_FMA */
}
Example #11
    vector unit_vector(const __m256& x1, const __m256& y1, const __m256& x2, const __m256& y2)
    {
        const __m256 x_diff = _mm256_sub_ps(x2, x1);
        const __m256 y_diff = _mm256_sub_ps(y2, y1);

        const __m256 dist = distance(x1, y1, x2, y2);

        const __m256 ux = _mm256_div_ps(x_diff, dist);
        const __m256 uy = _mm256_div_ps(y_diff, dist);

        vector result = {ux, uy};

        return result;
    }
Example #12
/*
 * Compute: s = sqrt( t**2 - x**2 - y**2 - z**2 ), with s, t, x, y, z
 * member variables of the st_coords structure arr.
 *
 * Traverse elements randomly
 */
void
comp_s(st_coords arr, int L)
{
  for(int j=0; j<L; j+=8) {
    int i = (rand() % (L/8)) * 8;
    __m256 x = _mm256_load_ps(&arr.x[i]);
    __m256 y = _mm256_load_ps(&arr.y[i]);
    __m256 z = _mm256_load_ps(&arr.z[i]);
    __m256 t = _mm256_load_ps(&arr.t[i]);
#ifdef FMA
    register __m256 s0;
    s0 = _mm256_mul_ps(x, x);
    s0 = _mm256_fmadd_ps(y, y, s0);
    s0 = _mm256_fmadd_ps(z, z, s0);
    s0 = _mm256_fmsub_ps(t, t, s0);
    s0 = _mm256_sqrt_ps(s0);
#else
    register __m256 s0, s1;
    s1 = _mm256_mul_ps(x, x);
    s0 = _mm256_mul_ps(y, y);
    s1 = _mm256_add_ps(s0, s1);
    s0 = _mm256_mul_ps(z, z);
    s1 = _mm256_add_ps(s0, s1);
    s0 = _mm256_mul_ps(t, t);
    s1 = _mm256_sub_ps(s0, s1);
    s0 = _mm256_sqrt_ps(s1);
#endif
    
    _mm256_store_ps(&arr.s[i], s0);
  }  
  return;
}
Example #13
static inline void
blend_unorm8_argb(struct reg *src, __m256i dst_argb)
{
    if (gt.blend.enable) {
        const __m256i mask = _mm256_set1_epi32(0xff);
        const __m256 scale = _mm256_set1_ps(1.0f / 255.0f);
        struct reg dst[4];

        /* Convert to float */
        dst[2].reg = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(dst_argb, mask)), scale);
        dst_argb = _mm256_srli_epi32(dst_argb, 8);
        dst[1].reg = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(dst_argb, mask)), scale);
        dst_argb = _mm256_srli_epi32(dst_argb, 8);
        dst[0].reg = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(dst_argb, mask)), scale);
        dst_argb = _mm256_srli_epi32(dst_argb, 8);
        dst[3].reg = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(dst_argb, mask)), scale);

        /* Blend, assuming src BLENDFACTOR_SRC_ALPHA, dst
         * BLENDFACTOR_INV_SRC_ALPHA, and BLENDFUNCTION_ADD. */
        const __m256 inv_alpha = _mm256_sub_ps(_mm256_set1_ps(1.0f), src[3].reg);
        src[0].reg = _mm256_add_ps(_mm256_mul_ps(src[3].reg, src[0].reg),
                                   _mm256_mul_ps(inv_alpha, dst[0].reg));
        src[1].reg = _mm256_add_ps(_mm256_mul_ps(src[3].reg, src[1].reg),
                                   _mm256_mul_ps(inv_alpha, dst[1].reg));
        src[2].reg = _mm256_add_ps(_mm256_mul_ps(src[3].reg, src[2].reg),
                                   _mm256_mul_ps(inv_alpha, dst[2].reg));
        src[3].reg = _mm256_add_ps(_mm256_mul_ps(src[3].reg, src[3].reg),
                                   _mm256_mul_ps(inv_alpha, dst[3].reg));
    }
}
Example #14
/* AVX implementation for 2x2 determinant */
inline __m256 srslte_mat_2x2_det_avx(__m256 a00, __m256 a01, __m256 a10, __m256 a11) {
#ifdef LV_HAVE_FMA
  return _MM256_PROD_SUB_PS(a00, a11, _MM256_PROD_PS(a01, a10));
#else
  return _mm256_sub_ps(_MM256_PROD_PS(a00, a11), _MM256_PROD_PS(a01, a10));
#endif /* LV_HAVE_FMA */
}
Example #15
void neuralNet::activationPrime_avx(const float* neuronOutput, float* result)
{
	static const __m256 ones = _mm256_set1_ps(1.0f);
	static const __m256 sigCoefficients = _mm256_set1_ps(SIGMOIDCOEFFICIENT);

	__m256 temp;
	const __m256* vOutput = (__m256*)neuronOutput;

	// 1 - ans
	temp = _mm256_sub_ps(ones, *vOutput);
	// (1-ans) * ans
	temp = _mm256_mul_ps(temp, *vOutput);
	// ans * coefficient
	temp = _mm256_mul_ps(temp, sigCoefficients);

#ifndef NDEBUG
	const float* _temp = (float*)&temp;
	assert(fastabs(_temp[0] - activationPrime(neuronOutput[0])) < 0.05f);
	assert(fastabs(_temp[1] - activationPrime(neuronOutput[1])) < 0.05f);
	assert(fastabs(_temp[2] - activationPrime(neuronOutput[2])) < 0.05f);
	assert(fastabs(_temp[3] - activationPrime(neuronOutput[3])) < 0.05f);
	assert(fastabs(_temp[4] - activationPrime(neuronOutput[4])) < 0.05f);
	assert(fastabs(_temp[5] - activationPrime(neuronOutput[5])) < 0.05f);
	assert(fastabs(_temp[6] - activationPrime(neuronOutput[6])) < 0.05f);
	assert(fastabs(_temp[7] - activationPrime(neuronOutput[7])) < 0.05f);
#endif

	// return ans
	_mm256_store_ps(result, temp);
};
Example #16
inline void avx2_xy_to_uv_f(__m256 x, __m256 y, __m256i& u, __m256i& v)
{
  // Convert X,Y first into U,V space then round to nearest
  // integer. That gets us close to correct answer, mapping XY to a
  // lozenge-shaped space rather than hexagonal. We then correct the
  // four regions that lie outside the hexagonal cell assigning them
  // to their correct neighboring cell.
  // Writer's note: see ~/Google Drive/Work/calin

  // double dv = y*c_vy_inv;
  // double du = x-dv*c_vx;
  // u = std::lround(du);
  // v = std::lround(dv);
  // du -= u;
  // dv -= v;

  y = _mm256_mul_ps(y, calin::math::simd::c_m256(_c_m256_vy_inv));
  x = _mm256_fnmadd_ps(y, calin::math::simd::c_m256(_c_m256_vx), x);
  u = _mm256_cvtps_epi32(x);
  v = _mm256_cvtps_epi32(y);
  x = _mm256_sub_ps(x, _mm256_cvtepi32_ps(u));
  y = _mm256_sub_ps(y, _mm256_cvtepi32_ps(v));

  // double c3 = dv-du;
  const __m256i c3 = _mm256_castps_si256(_mm256_sub_ps(y, x));

  __m256i uvshift;
  __m256i mask;

  // double c1 = du+0.5*dv;
  // double c2 = dv+0.5*du;
  // if(c3<0) {
  //   if(c1>=1) u++;
  //   else if(c2<-1) v--;
  // } else {
  //   if(c2>=1) v++;
  //   else if(c1<-1) u--;
  // }

  uvshift = _mm256_cvtps_epi32(_mm256_fmadd_ps(y, calin::math::simd::c_m256(_c_m256_one_half), x));
  mask = _mm256_srai_epi32(_mm256_xor_si256(uvshift, c3), 31);
  u = _mm256_blendv_epi8(u, _mm256_add_epi32(u, uvshift), mask);

  uvshift = _mm256_cvtps_epi32(_mm256_fmadd_ps(x, calin::math::simd::c_m256(_c_m256_one_half), y));
  mask = _mm256_srai_epi32(_mm256_xor_si256(uvshift, c3), 31);
  v = _mm256_blendv_epi8(_mm256_add_epi32(v, uvshift), v, mask);
}
Example #17
void
mandel_avx(unsigned char *image, const struct spec *s)
{
    __m256 xmin = _mm256_set1_ps(s->xlim[0]);
    __m256 ymin = _mm256_set1_ps(s->ylim[0]);
    __m256 xscale = _mm256_set1_ps((s->xlim[1] - s->xlim[0]) / s->width);
    __m256 yscale = _mm256_set1_ps((s->ylim[1] - s->ylim[0]) / s->height);
    __m256 threshold = _mm256_set1_ps(4);
    __m256 one = _mm256_set1_ps(1);
    __m256 iter_scale = _mm256_set1_ps(1.0f / s->iterations);
    __m256 depth_scale = _mm256_set1_ps(s->depth - 1);

    #pragma omp parallel for schedule(dynamic, 1)
    for (int y = 0; y < s->height; y++) {
        for (int x = 0; x < s->width; x += 8) {
            __m256 mx = _mm256_set_ps(x + 7, x + 6, x + 5, x + 4,
                                      x + 3, x + 2, x + 1, x + 0);
            __m256 my = _mm256_set1_ps(y);
            __m256 cr = _mm256_add_ps(_mm256_mul_ps(mx, xscale), xmin);
            __m256 ci = _mm256_add_ps(_mm256_mul_ps(my, yscale), ymin);
            __m256 zr = cr;
            __m256 zi = ci;
            int k = 1;
            __m256 mk = _mm256_set1_ps(k);
            while (++k < s->iterations) {
                /* Compute z1 from z0 */
                __m256 zr2 = _mm256_mul_ps(zr, zr);
                __m256 zi2 = _mm256_mul_ps(zi, zi);
                __m256 zrzi = _mm256_mul_ps(zr, zi);
                /* zr1 = zr0 * zr0 - zi0 * zi0 + cr */
                /* zi1 = zr0 * zi0 + zr0 * zi0 + ci */
                zr = _mm256_add_ps(_mm256_sub_ps(zr2, zi2), cr);
                zi = _mm256_add_ps(_mm256_add_ps(zrzi, zrzi), ci);

                /* Increment k */
                zr2 = _mm256_mul_ps(zr, zr);
                zi2 = _mm256_mul_ps(zi, zi);
                __m256 mag2 = _mm256_add_ps(zr2, zi2);
                __m256 mask = _mm256_cmp_ps(mag2, threshold, _CMP_LT_OS);
                mk = _mm256_add_ps(_mm256_and_ps(mask, one), mk);

                /* Early bailout? */
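                /* _mm256_testz_ps returns 1 when no lane of mask has its sign
                   bit set, i.e. all eight points have escaped */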
                if (_mm256_testz_ps(mask, _mm256_set1_ps(-1)))
                    break;
            }
            mk = _mm256_mul_ps(mk, iter_scale);
            mk = _mm256_sqrt_ps(mk);
            mk = _mm256_mul_ps(mk, depth_scale);
            __m256i pixels = _mm256_cvtps_epi32(mk);
            unsigned char *dst = image + y * s->width * 3 + x * 3;
            unsigned char *src = (unsigned char *)&pixels;
            for (int i = 0; i < 8; i++) {
                dst[i * 3 + 0] = src[i * 4];
                dst[i * 3 + 1] = src[i * 4];
                dst[i * 3 + 2] = src[i * 4];
            }
        }
    }
}
Example #18
inline void avx2_xy_to_uv_with_remainder_f(
  __m256& x_in_dx_out, __m256& y_in_dy_out, __m256i& u, __m256i& v)
{
  avx2_xy_to_uv_f(x_in_dx_out, y_in_dy_out, u, v);
  x_in_dx_out = _mm256_sub_ps(x_in_dx_out, _mm256_cvtepi32_ps(u));
  __m256 vf = _mm256_cvtepi32_ps(v);
  x_in_dx_out = _mm256_fnmadd_ps(vf, calin::math::simd::c_m256(_c_m256_vx), x_in_dx_out);
  y_in_dy_out = _mm256_fnmadd_ps(vf, calin::math::simd::c_m256(_c_m256_vy), y_in_dy_out);
}
Example #19
void NBodyAlgorithmCPU::calculateAccelerationWithColor(const float3(&posI)[8], const float massJ, const float3 posJ, float3(&accI)[8], unsigned int(&numNeighbours)[8]) {
    __m256 pix = _mm256_set_ps(posI[0].x, posI[1].x, posI[2].x, posI[3].x, posI[4].x, posI[5].x, posI[6].x, posI[7].x);
    __m256 piy = _mm256_set_ps(posI[0].y, posI[1].y, posI[2].y, posI[3].y, posI[4].y, posI[5].y, posI[6].y, posI[7].y);
    __m256 piz = _mm256_set_ps(posI[0].z, posI[1].z, posI[2].z, posI[3].z, posI[4].z, posI[5].z, posI[6].z, posI[7].z);

    __m256 pjx = _mm256_set1_ps(posJ.x);
    __m256 pjy = _mm256_set1_ps(posJ.y);
    __m256 pjz = _mm256_set1_ps(posJ.z);

    __m256 rx = _mm256_sub_ps(pjx, pix);
    __m256 ry = _mm256_sub_ps(pjy, piy);
    __m256 rz = _mm256_sub_ps(pjz, piz);

    __m256 eps2 = _mm256_set1_ps(mp_properties->EPS2);

    __m256 rx2 = _mm256_mul_ps(rx, rx);
    __m256 ry2 = _mm256_mul_ps(ry, ry);
    __m256 rz2 = _mm256_mul_ps(rz, rz);
    __m256 rabs = _mm256_sqrt_ps(_mm256_add_ps(_mm256_add_ps(rx2, ry2), _mm256_add_ps(rz2, eps2)));

    __m256 cmpDistance = _mm256_set1_ps(float(mp_properties->positionScale));
    __m256 close = _mm256_cmp_ps(rabs, cmpDistance, _CMP_LE_OS);

    for (int i = 0; i < 8; i++) {
        if (close.m256_f32[i] == 0) {
            numNeighbours[7 - i] = 0;
        }
    }

    __m256 m = _mm256_set1_ps(massJ);
    __m256 rabsInv = _mm256_div_ps(m, _mm256_mul_ps(_mm256_mul_ps(rabs, rabs), rabs));

    __m256 aix = _mm256_mul_ps(rx, rabsInv);
    __m256 aiy = _mm256_mul_ps(ry, rabsInv);
    __m256 aiz = _mm256_mul_ps(rz, rabsInv);

    for (int i = 0; i < 8; i++) {
        accI[7 - i].x = aix.m256_f32[i];
        accI[7 - i].y = aiy.m256_f32[i];
        accI[7 - i].z = aiz.m256_f32[i];
    }

}
Example #20
inline void avx2_xy_trans_to_uv_f(__m256 x, __m256 y, __m256i& u, __m256i& v,
  const float crot, const float srot, const float scale, const float dx = 0, const float dy = 0)
{
  // x = (x - dx)/scale;
  // y = (y - dy)/scale;
  // double xx = x*crot + y*srot;
  // y = y*crot - x*srot;
  // xy_to_uv(xx,y,u,v);
  const __m256 vsrot = _mm256_set1_ps(srot);
  const __m256 vcrot = _mm256_set1_ps(crot);
  const __m256 vscale = _mm256_set1_ps(1.0f/scale);
  x = _mm256_mul_ps(_mm256_sub_ps(x, _mm256_set1_ps(dx)), vscale);
  y = _mm256_mul_ps(_mm256_sub_ps(y, _mm256_set1_ps(dy)), vscale);
  __m256 yy = _mm256_mul_ps(x, vsrot);
  yy = _mm256_fmsub_ps(y, vcrot, yy);
  x = _mm256_mul_ps(x, vcrot);
  x = _mm256_fmadd_ps(y, vsrot, x);
  avx2_xy_to_uv_f(x, yy, u, v);
}
Example #21
void somap::train(const imgdata & obj)
{
	if (this->weight <= 0.0)return;
	__m256 tmp;
	__m256 *v1 = (__m256*)(this->fvex);
	const __m256 *v2 = (__m256*)(obj.fvex);
	const __m256 ws = _mm256_set1_ps(this->weight);
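	// Standard SOM-style update: fvex += weight * (obj.fvex - fvex), eight floats at a time.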
	for (int i = 0; i < f; i++) {
		tmp = _mm256_sub_ps(v2[i], v1[i]);
		tmp = _mm256_mul_ps(tmp, ws);
		v1[i] = _mm256_add_ps(v1[i], tmp);
	}
}
Example #22
extern "C" void product64x64_avx(float *a, float *b, float *c, int n) 
{
	for(int i=0; i<n; i++) {	
		__m256 t1 = _mm256_loadu_ps(&c[i*n +  0]);
		__m256 t2 = _mm256_loadu_ps(&c[i*n +  8]);
		__m256 t3 = _mm256_loadu_ps(&c[i*n + 16]);
		__m256 t4 = _mm256_loadu_ps(&c[i*n + 24]);
		__m256 t5 = _mm256_loadu_ps(&c[i*n + 32]);
		__m256 t6 = _mm256_loadu_ps(&c[i*n + 40]);
		__m256 t7 = _mm256_loadu_ps(&c[i*n + 48]);
		__m256 t8 = _mm256_loadu_ps(&c[i*n + 56]);
		for(int k=0; k<n; k++) {
			__m256 a1 = _mm256_set1_ps(a[k*n+i]);
			
			__m256 b1 = _mm256_loadu_ps(&b[k*n+0]);
			t1 = _mm256_sub_ps(t1,_mm256_mul_ps(a1,b1));
			
			__m256 b2 = _mm256_loadu_ps(&b[k*n+8]);
			t2 = _mm256_sub_ps(t2,_mm256_mul_ps(a1,b2));

			__m256 b3 = _mm256_loadu_ps(&b[k*n+16]);
			t3 = _mm256_sub_ps(t3,_mm256_mul_ps(a1,b3));

			__m256 b4 = _mm256_loadu_ps(&b[k*n+24]);
			t4 = _mm256_sub_ps(t4,_mm256_mul_ps(a1,b4));

			__m256 b5 = _mm256_loadu_ps(&b[k*n+32]);
			t5 = _mm256_sub_ps(t5,_mm256_mul_ps(a1,b5));

			__m256 b6 = _mm256_loadu_ps(&b[k*n+40]);
			t6 = _mm256_sub_ps(t6,_mm256_mul_ps(a1,b6));

			__m256 b7 = _mm256_loadu_ps(&b[k*n+48]);
			t7 = _mm256_sub_ps(t7,_mm256_mul_ps(a1,b7));

			__m256 b8 = _mm256_loadu_ps(&b[k*n+56]);
			t8 = _mm256_sub_ps(t8,_mm256_mul_ps(a1,b8));
		}
		_mm256_storeu_ps(&c[i*n +  0], t1);
		_mm256_storeu_ps(&c[i*n +  8], t2);
		_mm256_storeu_ps(&c[i*n + 16], t3);
		_mm256_storeu_ps(&c[i*n + 24], t4);
		_mm256_storeu_ps(&c[i*n + 32], t5);
		_mm256_storeu_ps(&c[i*n + 40], t6);
		_mm256_storeu_ps(&c[i*n + 48], t7);
		_mm256_storeu_ps(&c[i*n + 56], t8);
	}
}
Example #23
void convertCAVX(int num, uint8_t *in, float *out){
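    /* Widens 8 unsigned bytes to 32-bit ints, converts to float, and maps
       [0,255] to roughly [-1,1) via (v - 128) / 128. */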
    int i;
    __m256 sub = _mm256_set1_ps(128.0);
    __m256 mul = _mm256_set1_ps(1/128.0);
    for(i=0; i<num; i+=8){
        __m128i val  = _mm_loadu_si128((__m128i *)(in + i));
        __m256i ints = _mm256_cvtepu8_epi32(val);
        __m256  cvtd = _mm256_cvtepi32_ps(ints);

        __m256  res  = _mm256_mul_ps(_mm256_sub_ps(cvtd, sub), mul);

        _mm256_storeu_ps(out + i, res);
    }
}
Example #24
void 
nv_vector_sub(nv_matrix_t *vec0, int m0,
			  const nv_matrix_t *vec1, int m1,
			  const nv_matrix_t *vec2, int m2)
{
	NV_ASSERT(vec1->n == vec2->n);
	NV_ASSERT(vec2->n == vec0->n);
	
#if NV_ENABLE_AVX	
	{
		__m256 x;
		int n;
		int pk_lp = (vec1->n & 0xfffffff8);

		for (n = 0; n < pk_lp; n += 8) {
			x = _mm256_load_ps(&NV_MAT_V(vec1, m1, n));
			x = _mm256_sub_ps(x, *(const __m256 *)&NV_MAT_V(vec2, m2, n));
			_mm256_store_ps(&NV_MAT_V(vec0, m0, n), x);
		}
		for (n = pk_lp; n < vec1->n; ++n) {
			NV_MAT_V(vec0, m0, n) = NV_MAT_V(vec1, m1, n) - NV_MAT_V(vec2, m2, n);
		}
	}
#elif NV_ENABLE_SSE2
	{
		int n;
		int pk_lp = (vec1->n & 0xfffffffc);

#ifdef _OPENMP
//#pragma omp parallel for
#endif
		for (n = 0; n < pk_lp; n += 4) {
			__m128 x = _mm_load_ps(&NV_MAT_V(vec1, m1, n));
			x = _mm_sub_ps(x, *(const __m128 *)&NV_MAT_V(vec2, m2, n));
			_mm_store_ps(&NV_MAT_V(vec0, m0, n), x);
		}
		for (n = pk_lp; n < vec1->n; ++n) {
			NV_MAT_V(vec0, m0, n) = NV_MAT_V(vec1, m1, n) - NV_MAT_V(vec2, m2, n);
		}
	}
#else
	{
		int n;
		for (n = 0; n < vec1->n; ++n) {
			NV_MAT_V(vec0, m0, n) = NV_MAT_V(vec1, m1, n) - NV_MAT_V(vec2, m2, n);
		}
	}
#endif
}
Example #25
inline void avx2_xy_trans_to_uv_with_remainder_f(
  __m256& x_in_dx_out, __m256& y_in_dy_out, __m256i& u, __m256i& v,
  const float crot, const float srot, const float scale, const float dx = 0, const float dy = 0)
{
  const __m256 vsrot = _mm256_set1_ps(srot);
  const __m256 vcrot = _mm256_set1_ps(crot);
  __m256 vscale = _mm256_set1_ps(1.0f/scale);
  x_in_dx_out = _mm256_mul_ps(_mm256_sub_ps(x_in_dx_out, _mm256_set1_ps(dx)), vscale);
  y_in_dy_out = _mm256_mul_ps(_mm256_sub_ps(y_in_dy_out, _mm256_set1_ps(dy)), vscale);
  __m256 yy = _mm256_mul_ps(x_in_dx_out, vsrot);
  yy = _mm256_fmsub_ps(y_in_dy_out, vcrot, yy);
  x_in_dx_out = _mm256_mul_ps(x_in_dx_out, vcrot);
  x_in_dx_out = _mm256_fmadd_ps(y_in_dy_out, vsrot, x_in_dx_out);
  avx2_xy_to_uv_with_remainder_f(x_in_dx_out, yy, u, v);

  vscale = _mm256_set1_ps(scale);
  y_in_dy_out = _mm256_mul_ps(yy, vcrot);
  y_in_dy_out = _mm256_fmadd_ps(x_in_dx_out, vsrot, y_in_dy_out);
  y_in_dy_out = _mm256_mul_ps(y_in_dy_out, vscale);

  x_in_dx_out = _mm256_mul_ps(x_in_dx_out, vcrot);
  x_in_dx_out = _mm256_fnmadd_ps(yy, vsrot, x_in_dx_out);
  x_in_dx_out = _mm256_mul_ps(x_in_dx_out, vscale);
}
Example #26
double Atomtype::CalcPE(int frame_i, const Trajectory &trj, const coordinates &rand_xyz, const cubicbox_m256 &box, double vol) const
{
    float pe = 0.0;
    int atom_i = 0;

    /* BEGIN SIMD SECTION */
    // This performs the exact same calculation after the SIMD section
    // but doing it on 8 atoms at a time using SIMD instructions.

    coordinates8 rand_xyz8(rand_xyz), atom_xyz;
    __m256 r2_8, mask, r6, ri6, pe_tmp;
    __m256 pe_sum = _mm256_setzero_ps();
    float result[8] __attribute__((aligned (32)));  /* 8 floats for one ymm; _mm256_store_ps requires 32-byte alignment */

    for (; atom_i < this->n-8; atom_i+=8)
    {
        atom_xyz = trj.GetXYZ8(frame_i, this->name, atom_i);
        r2_8 = distance2(atom_xyz, rand_xyz8, box);
        mask = _mm256_cmp_ps(r2_8, rcut2_8, _CMP_LT_OS);
        r6 = _mm256_and_ps(mask, _mm256_mul_ps(_mm256_mul_ps(r2_8, r2_8), r2_8));
        ri6 = _mm256_and_ps(mask, _mm256_rcp_ps(r6));
        pe_tmp = _mm256_and_ps(mask, _mm256_mul_ps(ri6, _mm256_sub_ps(_mm256_mul_ps(c12_8, ri6), c6_8)));
        pe_sum = _mm256_add_ps(pe_tmp, pe_sum);
    }
    _mm256_store_ps(result, pe_sum);
    for (int i = 0; i < 8; i++)
    {
        pe += result[i];
    }

    /* END SIMD SECTION */

    for (; atom_i < this->n; atom_i++)
    {
        coordinates atom_xyz = trj.GetXYZ(frame_i, this->name, atom_i);
        float r2 = distance2(atom_xyz, rand_xyz, cubicbox(box));
        if (r2 < this->rcut2)
        {
            float ri6 = 1.0/(pow(r2,3));
            pe += ri6*(this->c12*ri6 - this->c6);
        }
    }

    pe += this->n/vol * this->tail_factor;

    return pe;
}
Example #27
File: main.cpp Project: sclc/DPP
#include <stdio.h>
#include <immintrin.h>

int
main(void)
{
    float out[8];

    __m256 a=_mm256_setr_ps(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
    __m256 b=_mm256_setr_ps(1.1f, 2.2f, 3.3f, 4.4f, 5.5f, 6.6f, 7.7f, 8.8f);

    __m256 dst = _mm256_sub_ps(a, b);

    _mm256_storeu_ps(out, dst);

    for(int i=0; i<sizeof(out)/sizeof(out[0]); i++)
        printf("out[%d]=%5.1f\n", i, out[i]);

    return 0;
}
Example #28
void	TransLut_FindIndexAvx2 <TransLut::MapperLin>::find_index (const TransLut::FloatIntMix val_arr [8], __m256i &index, __m256 &frac)
{
	assert (val_arr != 0);

	const __m256   scale     = _mm256_set1_ps (1 << LINLUT_RES_L2);
	const __m256i  offset    =
		_mm256_set1_epi32 (-LINLUT_MIN_F * (1 << LINLUT_RES_L2));
	const __m256i  val_min   = _mm256_setzero_si256 ();
	const __m256i  val_max   = _mm256_set1_epi32 (LINLUT_SIZE_F - 2);

	const __m256   v         =
		_mm256_load_ps (reinterpret_cast <const float *> (val_arr));
	const __m256   val_scl   = _mm256_mul_ps (v, scale);
	const __m256i  index_raw = _mm256_cvtps_epi32 (val_scl);
	__m256i        index_tmp = _mm256_add_epi32 (index_raw, offset);
	index_tmp = _mm256_min_epi32 (index_tmp, val_max);
	index     = _mm256_max_epi32 (index_tmp, val_min);
	frac      = _mm256_sub_ps (val_scl, _mm256_cvtepi32_ps (index_raw));
}
Example #29
void neuralNet::activation_approx_avx(const float* _neuronOutput, float* result)
{
	BOOST_STATIC_ASSERT(SIGMOIDCOEFFICIENT == 4.0f);
	// code adapted from http://ybeernet.blogspot.com/2011/03/speeding-up-sigmoid-function-by.html
	// approximates sigmoid function with coefficient 4.0f
	static const __m256 ones = _mm256_set1_ps(1.0f);
	static const __m256 oneFourths = _mm256_set1_ps(0.25f);
	static const __m256 fours = _mm256_set1_ps(4.0f);

	__m256 temp;
	const __m256* vOutput = (__m256*)_neuronOutput;

	// min (output, 4.0)
	temp = _mm256_min_ps(*vOutput, fours);
	// multiply by 0.25
	temp = _mm256_mul_ps(temp, oneFourths);
	// 1 - ans
	temp = _mm256_sub_ps(ones, temp);
	// ans^16
	temp = _mm256_mul_ps(temp, temp);
	temp = _mm256_mul_ps(temp, temp);
	temp = _mm256_mul_ps(temp, temp);
	temp = _mm256_mul_ps(temp, temp);
	// 1 + ans
	temp = _mm256_add_ps(ones, temp);
	// 1 / ans
	temp = _mm256_rcp_ps(temp);

#ifndef NDEBUG
	const float* _temp = (float*)&temp;
	assert(fastabs(_temp[0] - activation(_neuronOutput[0])) < 0.05f);
	assert(fastabs(_temp[1] - activation(_neuronOutput[1])) < 0.05f);
	assert(fastabs(_temp[2] - activation(_neuronOutput[2])) < 0.05f);
	assert(fastabs(_temp[3] - activation(_neuronOutput[3])) < 0.05f);
	assert(fastabs(_temp[4] - activation(_neuronOutput[4])) < 0.05f);
	assert(fastabs(_temp[5] - activation(_neuronOutput[5])) < 0.05f);
	assert(fastabs(_temp[6] - activation(_neuronOutput[6])) < 0.05f);
	assert(fastabs(_temp[7] - activation(_neuronOutput[7])) < 0.05f);
#endif

	// return ans
	_mm256_store_ps(result, temp);
};
Example #30
    div(avx_simd_complex_float<T> lhs, avx_simd_complex_float<T> rhs) {
        //lhs = [x1.real, x1.img, x2.real, x2.img ...]
        //rhs = [y1.real, y1.img, y2.real, y2.img ...]
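        //Overall: x/y = x*conj(y) / |y|^2, evaluated lane-wise on four packed
        //complex floats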

        //ymm0 = [y1.real, y1.real, y2.real, y2.real, ...]
        __m256 ymm0 = _mm256_moveldup_ps(rhs.value);

        //ymm1 = [y1.imag, y1.imag, y2.imag, y2.imag]
        __m256 ymm1 = _mm256_movehdup_ps(rhs.value);

        //ymm2 = [x1.img, x1.real, x2.img, x2.real]
        __m256 ymm2 = _mm256_permute_ps(lhs.value, 0b10110001);

        //ymm4 = [x.img * y.img, x.real * y.img]
        __m256 ymm4 = _mm256_mul_ps(ymm2, ymm1);

        //ymm5 = subadd((lhs * ymm0), ymm4)

#ifdef __FMA__
        __m256 ymm5 = _mm256_fmsubadd_ps(lhs.value, ymm0, ymm4);
#else
        __m256 t1    = _mm256_mul_ps(lhs.value, ymm0);
        __m256 t2    = _mm256_sub_ps(_mm256_set1_ps(0.0), ymm4);
        __m256 ymm5  = _mm256_addsub_ps(t1, t2);
#endif

        //ymm3 = [y.imag^2, y.imag^2]
        __m256 ymm3 = _mm256_mul_ps(ymm1, ymm1);

        //ymm0 = (ymm0 * ymm0 + ymm3)

#ifdef __FMA__
        ymm0 = _mm256_fmadd_ps(ymm0, ymm0, ymm3);
#else
        __m256 t3    = _mm256_mul_ps(ymm0, ymm0);
        ymm0         = _mm256_add_ps(t3, ymm3);
#endif

        //result = ymm5 / ymm0
        return _mm256_div_ps(ymm5, ymm0);
    }