/**
 * Computes the acceleration contribution of one source body J on eight
 * target bodies I, one target per SIMD lane.
 *
 * Per lane k: r = posJ - posI[k], d = sqrt(r.x^2 + r.y^2 + r.z^2 + EPS2),
 * acc = r * massJ / d^3, where EPS2 is read from mp_properties.
 *
 * @param posI  positions of the eight target bodies
 * @param massJ mass of the source body
 * @param posJ  position of the source body
 * @param accI  output of 24 floats laid out as x[0..7], y[0..7], z[0..7];
 *              written with _mm256_store_ps, which requires 32-byte alignment
 */
void NBodyAlgorithmCPU::calculateAcceleration(const float3(&posI)[8], const float massJ, const float3 posJ, float *accI) {
    // One register per axis; lane k holds the coordinate of posI[k].
    const __m256 xi = _mm256_setr_ps(posI[0].x, posI[1].x, posI[2].x, posI[3].x, posI[4].x, posI[5].x, posI[6].x, posI[7].x);
    const __m256 yi = _mm256_setr_ps(posI[0].y, posI[1].y, posI[2].y, posI[3].y, posI[4].y, posI[5].y, posI[6].y, posI[7].y);
    const __m256 zi = _mm256_setr_ps(posI[0].z, posI[1].z, posI[2].z, posI[3].z, posI[4].z, posI[5].z, posI[6].z, posI[7].z);

    // Separation vector pointing from each target towards the source body.
    const __m256 dx = _mm256_sub_ps(_mm256_set1_ps(posJ.x), xi);
    const __m256 dy = _mm256_sub_ps(_mm256_set1_ps(posJ.y), yi);
    const __m256 dz = _mm256_sub_ps(_mm256_set1_ps(posJ.z), zi);

    // EPS2 is added under the square root so the distance never collapses to zero.
    const __m256 softening = _mm256_set1_ps(mp_properties->EPS2);
    const __m256 distSq = _mm256_add_ps(
        _mm256_add_ps(_mm256_mul_ps(dx, dx), _mm256_mul_ps(dy, dy)),
        _mm256_add_ps(_mm256_mul_ps(dz, dz), softening));
    const __m256 dist = _mm256_sqrt_ps(distSq);

    // massJ / d^3, shared by the three components.
    const __m256 distCubed = _mm256_mul_ps(_mm256_mul_ps(dist, dist), dist);
    const __m256 scale = _mm256_div_ps(_mm256_set1_ps(massJ), distCubed);

    _mm256_store_ps(accI, _mm256_mul_ps(dx, scale));
    _mm256_store_ps(accI + 8, _mm256_mul_ps(dy, scale));
    _mm256_store_ps(accI + 16, _mm256_mul_ps(dz, scale));
}
Example #2
0
File: main.cpp Project: sclc/DPP
//-----------------------------------------------------------------
// AOS -> SOA
//
// Splits interleaved three-channel float data into three planar arrays:
//
//    pBgr: b0,g0,r0, b1,g1,r1, b2,g2,r2, b3,g3,r3, b4,g4,r4, ...
// ->
//    pBlu:  b0, b1, b2, b3, b4, ...
//    pGrn:  g0, g1, g2, g3, g4, ...
//    pRed:  r0, r1, r2, r3, r4, ...
//
// length is the number of floats in pBgr. Each pass consumes 24 input
// floats (8 pixels) and writes 8 floats to every plane. pBgr is read with
// 16-byte aligned loads; the planes are written with 32-byte aligned stores.
void
aos2soa(float *pBgr, float *pBlu, float *pGrn, float *pRed, const size_t length)
{
    size_t out = 0;     // write offset shared by the three planes

    for (size_t in = 0; in < length; in += 24, out += 8)
    {
        const float *src = pBgr + in;

        // Lower 128-bit halves hold pixels 0..3, upper halves hold pixels 4..7.
        const __m256 m03 = _mm256_insertf128_ps(
            _mm256_castps128_ps256(_mm_load_ps(src + 0)), _mm_load_ps(src + 12), 1);
        const __m256 m14 = _mm256_insertf128_ps(
            _mm256_castps128_ps256(_mm_load_ps(src + 4)), _mm_load_ps(src + 16), 1);
        const __m256 m25 = _mm256_insertf128_ps(
            _mm256_castps128_ps256(_mm_load_ps(src + 8)), _mm_load_ps(src + 20), 1);

        // Intermediate pairs: upper part of b and g, lower part of g and r.
        const __m256 bgUpper = _mm256_shuffle_ps(m14, m25, _MM_SHUFFLE(2, 1, 3, 2));
        const __m256 grLower = _mm256_shuffle_ps(m03, m14, _MM_SHUFFLE(1, 0, 2, 1));

        // Final per-channel vectors.
        const __m256 blue  = _mm256_shuffle_ps(m03, bgUpper,     _MM_SHUFFLE(2, 0, 3, 0));
        const __m256 green = _mm256_shuffle_ps(grLower, bgUpper, _MM_SHUFFLE(3, 1, 2, 0));
        const __m256 red   = _mm256_shuffle_ps(grLower, m25,     _MM_SHUFFLE(3, 0, 3, 1));

        _mm256_store_ps(pBlu + out, blue);
        _mm256_store_ps(pGrn + out, green);
        _mm256_store_ps(pRed + out, red);
    }
}
/*
 * In-place rectifier: replaces every element with max(element, 0).
 *
 * Processes `blocks` consecutive groups of 5 x 8 = 40 floats starting at `a`,
 * using aligned 256-bit loads and stores.
 */
static inline void rectifier_kernel_avx5(float *a, const size_t blocks) {
    const __m256 zero = _mm256_setzero_ps();
    float *cursor = a;

    for (size_t block = 0; block < blocks; ++block) {
        /* Five 8-float vectors per block. */
        for (int vec = 0; vec < 5; ++vec, cursor += 8) {
            _mm256_store_ps(cursor, _mm256_max_ps(_mm256_load_ps(cursor), zero));
        }
    }
}
Example #4
0
/* Adds the eight float lanes of v from lane 0 to lane 7. */
static float leibniz_hsum_ps(__m256 v)
{
	float lanes[8] __attribute__((aligned(32)));
	_mm256_store_ps(lanes, v);
	return lanes[0] + lanes[1] + lanes[2] + lanes[3] +
	       lanes[4] + lanes[5] + lanes[6] + lanes[7];
}

/*
 * Approximates pi with the Leibniz series 4 * (1 - 1/3 + 1/5 - 1/7 + ...),
 * evaluated in single precision with four independent AVX accumulators.
 *
 * n is the number of series terms requested. Terms are consumed in groups
 * of 32; a trailing partial group (n % 32 terms) is not summed, so any
 * n < 32 returns 0.0.
 */
double compute_pi_leibniz_avx_opt_single(size_t n)
{
	/* _mm256_set_ps lists lanes from high to low, so the +1 sign pairs with
	 * denominators 1, 5, 9, ... and the -1 sign with 3, 7, 11, ... */
	const __m256 sign = _mm256_set_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0);
	const __m256 stride = _mm256_set1_ps(64.0);	/* 32 terms advance each odd denominator by 64 */

	__m256 den0 = _mm256_set_ps(1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0);
	__m256 den1 = _mm256_set_ps(17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0, 31.0);
	__m256 den2 = _mm256_set_ps(33.0, 35.0, 37.0, 39.0, 41.0, 43.0, 45.0, 47.0);
	__m256 den3 = _mm256_set_ps(49.0, 51.0, 53.0, 55.0, 57.0, 59.0, 61.0, 63.0);

	__m256 acc0 = _mm256_setzero_ps();
	__m256 acc1 = _mm256_setzero_ps();
	__m256 acc2 = _mm256_setzero_ps();
	__m256 acc3 = _mm256_setzero_ps();

	/* Count down in size_t: the former `int i <= n - 32` test wrapped around
	 * for n < 32 and never terminated properly. */
	for (size_t remaining = n; remaining >= 32; remaining -= 32) {
		__m256 t0 = _mm256_div_ps(sign, den0);
		den0 = _mm256_add_ps(den0, stride);
		__m256 t1 = _mm256_div_ps(sign, den1);
		den1 = _mm256_add_ps(den1, stride);
		__m256 t2 = _mm256_div_ps(sign, den2);
		den2 = _mm256_add_ps(den2, stride);
		__m256 t3 = _mm256_div_ps(sign, den3);
		den3 = _mm256_add_ps(den3, stride);

		acc0 = _mm256_add_ps(acc0, t0);
		acc1 = _mm256_add_ps(acc1, t1);
		acc2 = _mm256_add_ps(acc2, t2);
		acc3 = _mm256_add_ps(acc3, t3);
	}

	double pi = 0.0;
	pi += leibniz_hsum_ps(acc0);
	pi += leibniz_hsum_ps(acc1);
	pi += leibniz_hsum_ps(acc2);
	pi += leibniz_hsum_ps(acc3);

	return pi * 4.0;
}
/*
 * In-place rectifier (max(element, 0)) over `blocks` groups of 8 floats
 * starting at `a`, walking from the last group down to the first.
 *
 * blocks must be greater than zero. Loads and stores are aligned.
 */
static inline void rectifier_kernel_back_avx(float *a, const size_t blocks) {
    assert(blocks > 0);
    size_t i = blocks;
    do {
        /* Decrement first so the visited groups are blocks-1 .. 0; indexing
         * with the undecremented counter skipped group 0 and touched the
         * group one past the end. */
        --i;
        _mm256_store_ps( &a[i*8], _mm256_max_ps( _mm256_load_ps( &a[i*8] ) , _mm256_setzero_ps() ) );
    } while(i);
}
Example #6
0
File: Play.cpp Project: zhangce/nn
/*
 * Multiplies a4 by a repeating set of eight factors and writes the result
 * to a3: a3[i + k] = a4[i + k] * fck[k % 8] for k = 0..15 and
 * i = 0, 16, 32, ... while i < c.
 *
 * Each iteration handles 16 floats, so a c that is not a multiple of 16 is
 * effectively rounded up to the next multiple. All three arrays are accessed
 * with aligned 256-bit loads/stores.
 */
void aaaa(const int c, float * const a3, float * const a4, const float * const fck){
	/* The eight factors are loaded once and reused for every group. */
	const __m256 factors = _mm256_load_ps(fck);

	for(int i=0;i<c;i+=16){
		const __m256 lo = _mm256_mul_ps(_mm256_load_ps(&a4[i]), factors);
		const __m256 hi = _mm256_mul_ps(_mm256_load_ps(&a4[i+8]), factors);

		_mm256_store_ps(&a3[i], lo);
		_mm256_store_ps(&a3[i+8], hi);
	}
}
Example #7
0
static void quantize_block(const float *in_data, float *out_data, float *quant_tbl)
{
	int zigzag;

	__m256 result, dct_values, quant_values;
	__m256 factor = _mm256_set1_ps(0.25f);

	for (zigzag = 0; zigzag < 64; zigzag += 8)
	{
		// Set the dct_values for the current interation
		dct_values = _mm256_set_ps(in_data[UV_indexes[zigzag + 7]], in_data[UV_indexes[zigzag + 6]],
				in_data[UV_indexes[zigzag + 5]], in_data[UV_indexes[zigzag + 4]],
				in_data[UV_indexes[zigzag + 3]], in_data[UV_indexes[zigzag + 2]],
				in_data[UV_indexes[zigzag + 1]], in_data[UV_indexes[zigzag]]);

		// Multiply with 0.25 to divide by 4.0
		result = _mm256_mul_ps(dct_values, factor);

		// Load quant-values and multiply with previous product
		quant_values = _mm256_load_ps(quant_tbl + zigzag);
		result = _mm256_div_ps(result, quant_values);

		// Round off values and store in out_data buffer
		result = c63_mm256_roundhalfawayfromzero_ps(result);
		_mm256_store_ps(out_data + zigzag, result);
	}
}
Example #8
0
/*
 * Returns the dot product of the vec1->n elements addressed by
 * NV_MAT_V(vec1, m1, .) and NV_MAT_V(vec2, m2, .).
 *
 * Both vectors must have the same n. The AVX and SSE2 builds multiply and
 * accumulate whole groups of 8 / 4 floats with aligned loads, reduce the
 * lanes, and finish the remaining elements with a scalar loop.
 */
float 
nv_vector_dot(const nv_matrix_t *vec1, int m1,
			  const nv_matrix_t *vec2, int m2)
{
	NV_ASSERT(vec1->n == vec2->n);
	
#if NV_ENABLE_AVX
	{
		NV_ALIGNED(float, lanes[8], 32);
		const int packed_end = (vec1->n & 0xfffffff8);	/* n rounded down to a multiple of 8 */
		__m256 acc = _mm256_setzero_ps();
		float total;
		int i;
		
		for (i = 0; i < packed_end; i += 8) {
			const __m256 rhs = _mm256_load_ps(&NV_MAT_V(vec2, m2, i));
			const __m256 lhs = _mm256_load_ps(&NV_MAT_V(vec1, m1, i));
			acc = _mm256_add_ps(acc, _mm256_mul_ps(rhs, lhs));
		}
		/* Horizontal reduction, lane 0 first. */
		_mm256_store_ps(lanes, acc);
		total = lanes[0];
		for (i = 1; i < 8; ++i) {
			total += lanes[i];
		}
		/* Scalar tail. */
		for (i = packed_end; i < vec1->n; ++i) {
			total += NV_MAT_V(vec1, m1, i) * NV_MAT_V(vec2, m2, i);
		}
		
		return total;
	}
#elif NV_ENABLE_SSE2
	{
		NV_ALIGNED(float, lanes[4], 16);
		const int packed_end = (vec1->n & 0xfffffffc);	/* n rounded down to a multiple of 4 */
		__m128 acc = _mm_setzero_ps();
		float total;
		int i;

		for (i = 0; i < packed_end; i += 4) {
			const __m128 rhs = _mm_load_ps(&NV_MAT_V(vec2, m2, i));
			const __m128 lhs = _mm_load_ps(&NV_MAT_V(vec1, m1, i));
			acc = _mm_add_ps(acc, _mm_mul_ps(rhs, lhs));
		}
		/* Horizontal reduction, lane 0 first. */
		_mm_store_ps(lanes, acc);
		total = lanes[0];
		for (i = 1; i < 4; ++i) {
			total += lanes[i];
		}
		/* Scalar tail. */
		for (i = packed_end; i < vec1->n; ++i) {
			total += NV_MAT_V(vec1, m1, i) * NV_MAT_V(vec2, m2, i);
		}
  
		return total;
	}
#else
	{
		float total = 0.0f;
		int i;
		for (i = 0; i < vec1->n; ++i) {
			total += NV_MAT_V(vec1, m1, i) * NV_MAT_V(vec2, m2, i);
		}
		return total;
	}
#endif
}
Example #9
0
/*
 * Compute: s = sqrt( t**2 - x**2 - y**2 - z**2 ), with s, t, x, y, z
 * member variables of the st_coords structure arr.
 *
 * Elements are handled in groups of eight and the groups are visited in
 * random order: every pass draws a group start from rand(), so a group may
 * be computed more than once or not at all.
 */
void
comp_s(st_coords arr, int L)
{
  const int ngroups = L/8;

  for(int pass=0; pass<L; pass+=8) {
    /* Random start index, always a multiple of 8. */
    const int i = (rand() % ngroups) * 8;
    const __m256 x = _mm256_load_ps(&arr.x[i]);
    const __m256 y = _mm256_load_ps(&arr.y[i]);
    const __m256 z = _mm256_load_ps(&arr.z[i]);
    const __m256 t = _mm256_load_ps(&arr.t[i]);
#ifdef FMA
    /* spatial = x*x + y*y + z*z built with fused multiply-adds. */
    __m256 spatial = _mm256_mul_ps(x, x);
    spatial = _mm256_fmadd_ps(y, y, spatial);
    spatial = _mm256_fmadd_ps(z, z, spatial);
    const __m256 s = _mm256_sqrt_ps(_mm256_fmsub_ps(t, t, spatial));
#else
    const __m256 xy = _mm256_add_ps(_mm256_mul_ps(y, y), _mm256_mul_ps(x, x));
    const __m256 spatial = _mm256_add_ps(_mm256_mul_ps(z, z), xy);
    const __m256 s = _mm256_sqrt_ps(_mm256_sub_ps(_mm256_mul_ps(t, t), spatial));
#endif

    _mm256_store_ps(&arr.s[i], s);
  }
}
Example #10
0
/*
 * Fills ptr[0 .. length) with value.
 *
 * AVX build: a scalar head runs up to the index returned by
 * align_complement_f32(ptr), whole groups of 8 floats are then written with
 * aligned 256-bit stores, and a scalar tail finishes the rest.
 * NEON build: groups of 4 floats are written with vst1q_f32, then a scalar tail.
 * Otherwise: plain scalar loop.
 */
void memsetf(float *ptr, float value, size_t length) {
#ifdef __AVX__
  const __m256 fillvec = _mm256_set1_ps(value);
  /* NOTE(review): align_complement_f32 is assumed to return the number of
   * floats before the next 32-byte boundary; confirm against its definition. */
  size_t head = align_complement_f32(ptr);
  /* Never let the scalar head run past the buffer when it is shorter than
   * the distance to the alignment boundary. */
  if (head > length) {
    head = length;
  }

  size_t i = 0;
  for (; i < head; i++) {
    ptr[i] = value;
  }

  /* i <= length holds here, so the subtraction cannot wrap. */
  for (; length - i >= 8; i += 8) {
    _mm256_store_ps(ptr + i, fillvec);
  }

  for (; i < length; i++) {
    ptr[i] = value;
  }
#elif defined(__ARM_NEON__)
  const float32x4_t fillvec = vdupq_n_f32(value);
  size_t i = 0;
  for (; length - i >= 4; i += 4) {
    vst1q_f32(ptr + i, fillvec);
  }
  for (; i < length; i++) {
    ptr[i] = value;
  }
#else
  for (size_t i = 0; i < length; i++) {
    ptr[i] = value;
  }
#endif
}
Example #11
0
// Derivative of the activation for eight neuron outputs at once:
//   result[k] = (1 - neuronOutput[k]) * neuronOutput[k] * SIGMOIDCOEFFICIENT
// neuronOutput is read and result is written with aligned 256-bit accesses.
// Debug builds compare every lane against the scalar activationPrime().
void neuralNet::activationPrime_avx(const float* neuronOutput, float* result)
{
	static const __m256 ones = _mm256_set1_ps(1.0f);
	static const __m256 sigCoefficients = _mm256_set1_ps(SIGMOIDCOEFFICIENT);

	const __m256 ans = _mm256_load_ps(neuronOutput);

	// ((1 - ans) * ans) * coefficient
	__m256 temp = _mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(ones, ans), ans), sigCoefficients);

#ifndef NDEBUG
	const float* _temp = (float*)&temp;
	assert(fastabs(_temp[0] - activationPrime(neuronOutput[0])) < 0.05f);
	assert(fastabs(_temp[1] - activationPrime(neuronOutput[1])) < 0.05f);
	assert(fastabs(_temp[2] - activationPrime(neuronOutput[2])) < 0.05f);
	assert(fastabs(_temp[3] - activationPrime(neuronOutput[3])) < 0.05f);
	assert(fastabs(_temp[4] - activationPrime(neuronOutput[4])) < 0.05f);
	assert(fastabs(_temp[5] - activationPrime(neuronOutput[5])) < 0.05f);
	assert(fastabs(_temp[6] - activationPrime(neuronOutput[6])) < 0.05f);
	assert(fastabs(_temp[7] - activationPrime(neuronOutput[7])) < 0.05f);
#endif

	// hand the eight derivatives back to the caller
	_mm256_store_ps(result, temp);
};
Example #12
0
File: core.c Project: jfellus/agem
/*
 * Prints the eight float lanes of x to stdout, lane 0 first, each formatted
 * as "%8.2f " (no trailing newline).
 */
void _mm256_print_ps(__m256 x) {
    size_t i;
    float tab[8];
    /* tab carries no 32-byte alignment guarantee, so the unaligned store is
     * required; the aligned _mm256_store_ps may fault here. */
    _mm256_storeu_ps (tab,x);
    for (i=0;i<8;i++)
        printf ("%8.2f ",tab[i]);
}
Example #13
0
File: main.cpp Project: sclc/DPP
//-------------------------------------------------------------------
// blend
//
// Blends two float buffers in place:
//   pBuffer[0][i] = pBuffer[0][i] * weight + pBuffer[1][i] * (1 - weight)
// The image size comes from bmp; every row is processed in groups of eight
// floats with aligned loads/stores, and both buffers are walked
// contiguously from row to row.
void
effect(float *pBuffer[], const Cbmp *bmp, const float weight)
{
    const size_t cols = bmp->getWidth();
    const size_t rows = bmp->getAbsHeight();

    const __m256 wDst = _mm256_set1_ps(weight);
    const __m256 wSrc = _mm256_set1_ps(1.0f - weight);

    float *dst = pBuffer[0];    // blended result is written back here
    float *src = pBuffer[1];

    for (size_t y = 0; y < rows; y++)
    {
        for (size_t x = 0; x < cols; x += 8)
        {
            const __m256 a = _mm256_mul_ps(_mm256_load_ps(dst), wDst);
            const __m256 b = _mm256_mul_ps(_mm256_load_ps(src), wSrc);

            _mm256_store_ps(dst, _mm256_add_ps(a, b));

            dst += 8;
            src += 8;
        }
    }
}
Example #14
0
/*
 * compute: s = sqrt( t**2 - x**2 - y**2 - z**2 ), with s, t, x, y, z
 * member variables of the st_coords structure arr
 *
 * Elements are processed in order, eight per iteration, with aligned
 * 256-bit loads and stores:
 *
 *   s[i:i+8] = sqrt(t[i:i+8]**2 - (x[i:i+8]**2 + y[i:i+8]**2 + z[i:i+8]**2))
 */
void
comp_s(st_coords arr, int L)
{
  for(int i=0; i<L; i+=8) {
    __m256 x = _mm256_load_ps(&arr.x[i]);
    __m256 y = _mm256_load_ps(&arr.y[i]);
    __m256 z = _mm256_load_ps(&arr.z[i]);
    __m256 t = _mm256_load_ps(&arr.t[i]);
    __m256 s0, s1; /* Temporaries: s1 accumulates, s0 holds the latest product */

    s1 = _mm256_mul_ps(x, x);     /* x^2 */
    s0 = _mm256_mul_ps(y, y);     /* y^2 */
    s1 = _mm256_add_ps(s0, s1);   /* x^2 + y^2 */
    s0 = _mm256_mul_ps(z, z);     /* z^2 */
    s1 = _mm256_add_ps(s0, s1);   /* x^2 + y^2 + z^2 */
    s0 = _mm256_mul_ps(t, t);     /* t^2 */
    s1 = _mm256_sub_ps(s0, s1);   /* t^2 - (x^2 + y^2 + z^2) */
    s0 = _mm256_sqrt_ps(s1);

    _mm256_store_ps(&arr.s[i], s0);
  }  
  return;
}
Example #15
0
File: main.cpp Project: sclc/DPP
//-------------------------------------------------------------------
// effect
//
// Zeroes the blue and red planes and leaves the green plane untouched.
// Each of the `height` rows clears width / 8 groups of eight floats with
// aligned stores; the write position advances contiguously across rows.
static void
effect(float *pBlu, float *pGrn, float *pRed, const size_t height, const size_t width)
{
    const __m256 zeros = _mm256_setzero_ps();
    const size_t groupsPerRow = width / 8;
    size_t offset = 0;

    (void)pGrn;     // G is intentionally skipped

    for (size_t row = 0; row < height; row++)
    {
        for (size_t group = 0; group < groupsPerRow; group++, offset += 8)
        {
            _mm256_store_ps(pBlu + offset, zeros); // B
            _mm256_store_ps(pRed + offset, zeros); // R
        }
    }
}
// Converts half-precision values stored at srcp into single-precision floats
// at dstp, eight values per step (one 128-bit aligned load, one 256-bit
// aligned store). count is the number of values; work is done in whole
// groups of eight.
static __forceinline void
convert_half_to_float(float* dstp, const uint8_t* srcp, size_t count)
{
    const __m128i* src = reinterpret_cast<const __m128i*>(srcp);
    float* dst = dstp;

    // Eight halves occupy 16 bytes, i.e. exactly one __m128i per step.
    for (size_t done = 0; done < count; done += 8, ++src, dst += 8) {
        _mm256_store_ps(dst, _mm256_cvtph_ps(_mm_load_si128(src)));
    }
}
/*
 * In-place rectifier: a[i] = max(a[i], 0) for i in [0, len).
 *
 * Whole groups of eight floats go through aligned 256-bit loads/stores;
 * the remaining elements are handled one at a time.
 */
void rectifier_avx_2(float *a, const size_t len) {
	const __m256 zero = _mm256_setzero_ps();
	size_t i = 0;

	for (; i + 8 <= len; i += 8) {
		_mm256_store_ps(a + i, _mm256_max_ps(_mm256_load_ps(a + i), zero));
	}
	/* Scalar tail: anything that is not strictly positive becomes 0. */
	for (; i < len; ++i) {
		if (!(a[i] > 0.0)) {
			a[i] = 0.0;
		}
	}
}
Example #18
0
// =============================================================
// ====================== RGBX2BGRX_32F ========================
// =============================================================
// Copies rows _start.._stop (inclusive) of a four-channel float image from
// _src to _dest while swapping channels 0 and 2 of every pixel; channels
// 1 and 3 are copied unchanged.
//
// _pitchs / _pitchd are the row strides of source and destination, in floats.
// SIMD builds (USE_SSE) convert _pitchs / 8 groups of eight floats per row
// with aligned loads/stores and do not look at _width; the scalar build
// converts _width pixels per row.
void _rgbx2bgrx_32f(const float* _src, float* _dest, unsigned int _width,
                    unsigned int _pitchs, unsigned int _pitchd,
                    unsigned int _start, unsigned int _stop) {

#ifdef USE_SSE

    const unsigned int widthz = (_pitchs/8);

    for( unsigned int y=_start; y<=_stop; ++y ) {
        const float* tsrc = _src+(y*_pitchs);
        float* tdest = _dest+(y*_pitchd);
        for( unsigned int x=0; x<widthz; ++x ) {

#ifdef USE_AVX1
            // 0xc6 selects elements (2,1,0,3) inside each 128-bit lane.
            _mm256_store_ps(tdest, _mm256_permute_ps(_mm256_load_ps(tsrc), 0xc6));
#else
            // Same (2,1,0,3) selection, two pixels per step. This path used
            // to load the pixels without ever storing a result.
            const __m128 v0 = _mm_load_ps(tsrc);
            const __m128 v1 = _mm_load_ps(tsrc+4);

            _mm_store_ps(tdest,   _mm_shuffle_ps(v0, v0, 0xc6));
            _mm_store_ps(tdest+4, _mm_shuffle_ps(v1, v1, 0xc6));
#endif
            tsrc+=8;
            tdest+=8;
        }
    }

#else

    for( unsigned int y=_start; y<=_stop; ++y ) {
        const float* tsrc = _src+(y*_pitchs);
        float* tdest = _dest+(y*_pitchd);
        for( unsigned int x=0; x<_width; ++x ) {
            // Read the whole pixel first so in-place conversion stays correct.
            const float c0 = tsrc[4*x];
            const float c1 = tsrc[4*x+1];
            const float c2 = tsrc[4*x+2];
            const float c3 = tsrc[4*x+3];
            tdest[4*x]   = c2;
            tdest[4*x+1] = c1;   // channels 1 and 3 are copied like the SIMD paths do
            tdest[4*x+2] = c0;
            tdest[4*x+3] = c3;
        }
    }
#endif
}
Example #19
0
/*
 * Element-wise product d[i] = a[i] * b[i] over gNumFloats elements,
 * eight floats per step, using aligned 256-bit loads and stores.
 */
void AlignedAvxMult(float* d, float const* a, float const* b)
{
	for(int offset = 0; offset < gNumFloats; offset += 8)
	{
		const __m256 lhs = _mm256_load_ps(a + offset);
		const __m256 rhs = _mm256_load_ps(b + offset);
		_mm256_store_ps(d + offset, _mm256_mul_ps(lhs, rhs));
	}
}
Example #20
0
File: main.cpp Project: sclc/DPP
//--------------------------------------------------------------------------
// amp AVX
//
// Multiplies fData in place by amp, eight floats per step, with aligned
// 256-bit loads and stores. length is the number of floats to process.
static void
effectAvx(float fData[], const float amp, const size_t length)
{
    const __m256 gain = _mm256_set1_ps(amp);

    for (size_t i = 0; i < length; i += 8)
    {
        float *chunk = fData + i;
        _mm256_store_ps(chunk, _mm256_mul_ps(_mm256_load_ps(chunk), gain));
    }
}
Example #21
0
/*
 * Scales a block of 64 floats (eight groups of eight) from in_data into
 * out_data.
 *
 * Every group is multiplied by a1 = { ISQRT2, 1, 1, 1, 1, 1, 1, 1 }; the
 * first group is additionally multiplied by ISQRT2 in every lane. Both
 * buffers are accessed with aligned 256-bit loads/stores.
 */
static void scale_block(float *in_data, float *out_data)
{
	// Per-lane factors applied to every group
	static float a1_values[8] __attribute__((aligned(32))) = { ISQRT2, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
			1.0f, 1.0f };
	const __m256 a1 = _mm256_load_ps(a1_values);

	// Extra factor for the exception case (first group only)
	const __m256 a2 = _mm256_set1_ps(ISQRT2);
	int offset;

	/* The first group is the exception and needs two multiplications */
	_mm256_store_ps(out_data,
			_mm256_mul_ps(_mm256_mul_ps(_mm256_load_ps(in_data), a1), a2));

	// The remaining seven groups need a single multiplication each
	for (offset = 8; offset < 64; offset += 8)
	{
		_mm256_store_ps(out_data + offset,
				_mm256_mul_ps(_mm256_load_ps(in_data + offset), a1));
	}
}
Example #22
0
/*
 * Element-wise sum: for every i in [0, vec1->n),
 *   NV_MAT_V(vec0, m0, i) = NV_MAT_V(vec1, m1, i) + NV_MAT_V(vec2, m2, i).
 *
 * All three vectors must have the same n. The AVX and SSE2 builds add whole
 * groups of 8 / 4 floats with aligned accesses and finish the remainder
 * with a scalar loop.
 */
void 
nv_vector_add(nv_matrix_t *vec0, int m0,
			  const nv_matrix_t *vec1, int m1,
			  const nv_matrix_t *vec2, int m2)
{
	NV_ASSERT(vec1->n == vec2->n);
	NV_ASSERT(vec2->n == vec0->n);
	
#if NV_ENABLE_AVX
	{
		const int packed_end = (vec1->n & 0xfffffff8);	/* n rounded down to a multiple of 8 */
		int i;
		
		for (i = 0; i < packed_end; i += 8) {
			const __m256 lhs = _mm256_load_ps(&NV_MAT_V(vec1, m1, i));
			const __m256 rhs = _mm256_load_ps(&NV_MAT_V(vec2, m2, i));
			_mm256_store_ps(&NV_MAT_V(vec0, m0, i), _mm256_add_ps(lhs, rhs));
		}
		/* Scalar tail. */
		for (i = packed_end; i < vec1->n; ++i) {
			NV_MAT_V(vec0, m0, i) = NV_MAT_V(vec1, m1, i) + NV_MAT_V(vec2, m2, i);
		}
	}
#elif NV_ENABLE_SSE2
	{
		const int packed_end = (vec1->n & 0xfffffffc);	/* n rounded down to a multiple of 4 */
		int i;

		for (i = 0; i < packed_end; i += 4) {
			const __m128 lhs = _mm_load_ps(&NV_MAT_V(vec1, m1, i));
			const __m128 rhs = _mm_load_ps(&NV_MAT_V(vec2, m2, i));
			_mm_store_ps(&NV_MAT_V(vec0, m0, i), _mm_add_ps(lhs, rhs));
		}
		/* Scalar tail. */
		for (i = packed_end; i < vec1->n; ++i) {
			NV_MAT_V(vec0, m0, i) = NV_MAT_V(vec1, m1, i) + NV_MAT_V(vec2, m2, i);
		}
	}
#else
	{
		int i;
		for (i = 0; i < vec1->n; ++i) {
			NV_MAT_V(vec0, m0, i) = NV_MAT_V(vec1, m1, i) + NV_MAT_V(vec2, m2, i);
		}
	}
#endif
}
Example #23
0
/*
 * One-dimensional 8-point transform expressed as a vector-matrix product:
 *
 *   out_data[j] = sum over k of in_data[k] * lookup[8 * k + j],  j, k = 0..7
 *
 * lookup supplies the coefficients; the original notes say the inverse DCT
 * passes a transposed table here. lookup and out_data are accessed with
 * aligned 256-bit loads/stores.
 */
static void dct_1d_general(float* in_data, float* out_data, float lookup[64])
{
	int k;

	// Row 0 seeds the running sum: in_data[0] broadcast times lookup[0..7].
	__m256 sum = _mm256_mul_ps(_mm256_load_ps(lookup), _mm256_broadcast_ss(in_data));

	for (k = 1; k < 8; ++k)
	{
		// Broadcast one input scalar to every lane.
		const __m256 scalar = _mm256_broadcast_ss(in_data + k);
		// Matching row of eight coefficients.
		const __m256 row = _mm256_load_ps(lookup + 8 * k);
		// Multiply vertically and add to the running sum.
		sum = _mm256_add_ps(sum, _mm256_mul_ps(row, scalar));
	}

	_mm256_store_ps(out_data, sum);
}
Example #24
0
/*
 * Sums the pair energy between the point rand_xyz and every atom of this
 * type in frame frame_i, then adds the term n / vol * tail_factor.
 *
 * For each atom whose squared distance r2 is below the cutoff:
 *   pe += ri6 * (c12 * ri6 - c6),  with ri6 = 1 / r2^3.
 * Atoms are handled eight at a time with AVX while more than eight remain;
 * the rest go through the scalar loop. The SIMD path takes the reciprocal
 * with _mm256_rcp_ps.
 */
double Atomtype::CalcPE(int frame_i, const Trajectory &trj, const coordinates &rand_xyz, const cubicbox_m256 &box, double vol) const
{
    float pe = 0.0;
    int atom_i = 0;

    /* BEGIN SIMD SECTION */
    // This performs the exact same calculation after the SIMD section
    // but doing it on 8 atoms at a time using SIMD instructions.

    coordinates8 rand_xyz8(rand_xyz), atom_xyz;
    __m256 r2_8, mask, r6, ri6, pe_tmp;
    __m256 pe_sum = _mm256_setzero_ps();
    // Receives one full 256-bit vector through an aligned store, so it must
    // hold exactly 8 floats on a 32-byte boundary regardless of n.
    float result[8] __attribute__((aligned (32)));

    for (; atom_i < this->n-8; atom_i+=8)
    {
        atom_xyz = trj.GetXYZ8(frame_i, this->name, atom_i);
        r2_8 = distance2(atom_xyz, rand_xyz8, box);
        // Lanes beyond the cutoff are masked to zero at every step.
        mask = _mm256_cmp_ps(r2_8, rcut2_8, _CMP_LT_OS);
        r6 = _mm256_and_ps(mask, _mm256_mul_ps(_mm256_mul_ps(r2_8, r2_8), r2_8));
        ri6 = _mm256_and_ps(mask, _mm256_rcp_ps(r6));
        pe_tmp = _mm256_and_ps(mask, _mm256_mul_ps(ri6, _mm256_sub_ps(_mm256_mul_ps(c12_8, ri6), c6_8)));
        pe_sum = _mm256_add_ps(pe_tmp, pe_sum);
    }
    _mm256_store_ps(result, pe_sum);
    for (int i = 0; i < 8; i++)
    {
        pe += result[i];
    }

    /* END SIMD SECTION */

    for (; atom_i < this->n; atom_i++)
    {
        coordinates atom_xyz = trj.GetXYZ(frame_i, this->name, atom_i);
        float r2 = distance2(atom_xyz, rand_xyz, cubicbox(box));
        if (r2 < this->rcut2)
        {
            float ri6 = 1.0/(pow(r2,3));
            pe += ri6*(this->c12*ri6 - this->c6);
        }
    }

    pe += this->n/vol * this->tail_factor;

    return pe;
}
Example #25
0
/*
 * Element-wise reciprocal: for every i in [0, x->n),
 *   NV_MAT_V(a, am, i) = 1.0f / NV_MAT_V(x, xm, i).
 *
 * a must have at least as many elements as x. The AVX and SSE2 builds
 * divide whole groups of 8 / 4 floats with aligned accesses and finish the
 * remainder with a scalar loop.
 */
void 
nv_vector_inv(nv_matrix_t *a, int am, const nv_matrix_t *x, int xm)
{
	NV_ASSERT(a->n >= x->n);
#if NV_ENABLE_AVX
	{
		const __m256 one = _mm256_set1_ps(1.0f);
		const int packed_end = (x->n & 0xfffffff8);	/* n rounded down to a multiple of 8 */
		int i;

		for (i = 0; i < packed_end; i += 8) {
			const __m256 src = _mm256_load_ps(&NV_MAT_V(x, xm, i));
			_mm256_store_ps(&NV_MAT_V(a, am, i), _mm256_div_ps(one, src));
		}
		/* Scalar tail. */
		for (i = packed_end; i < x->n; ++i) {
			NV_MAT_V(a, am, i) = 1.0f / NV_MAT_V(x, xm, i);
		}
	}
#elif NV_ENABLE_SSE2
	{
		const __m128 one = _mm_set1_ps(1.0f);
		const int packed_end = (x->n & 0xfffffffc);	/* n rounded down to a multiple of 4 */
		int i;

		for (i = 0; i < packed_end; i += 4) {
			const __m128 src = _mm_load_ps(&NV_MAT_V(x, xm, i));
			_mm_store_ps(&NV_MAT_V(a, am, i), _mm_div_ps(one, src));
		}
		/* Scalar tail. */
		for (i = packed_end; i < x->n; ++i) {
			NV_MAT_V(a, am, i) = 1.0f / NV_MAT_V(x, xm, i);
		}
	}
#else
	{
		int i;
		for (i = 0; i < x->n; ++i) {
			NV_MAT_V(a, am, i) = 1.0f / NV_MAT_V(x, xm, i);
		}
	}
#endif
}
/*
 * Evaluates ret[i] = r^3 * (10 + r * (-15 + r * 6)) with r = r_values[i],
 * eight values per iteration, for i = 0, 8, 16, ... while i < num.
 *
 * Both arrays are accessed with aligned 256-bit loads/stores. When __FMA__
 * is defined the inner polynomial is built with fused multiply-adds.
 */
void polynomial(float *ret, const float *const r_values, int num) {
  const __m256 coeff_6 = _mm256_set1_ps(6.0f);
  const __m256 coeff_neg_15 = _mm256_set1_ps(-15.0f);
  const __m256 coeff_10 = _mm256_set1_ps(10.0f);

  for (int base = 0; base < num; base += 8) {

#ifdef USE_IACA
  IACA_START
#endif
    // aligned load of eight r values
    const __m256 r = _mm256_load_ps(r_values + base);
    // r * r * r
    const __m256 r_cubed = _mm256_mul_ps(_mm256_mul_ps(r, r), r);
#ifndef __FMA__
    // -15 + r * 6
    __m256 inner = _mm256_add_ps(_mm256_mul_ps(r, coeff_6), coeff_neg_15);
    // 10 + r * (-15 + r * 6)
    inner = _mm256_add_ps(_mm256_mul_ps(inner, r), coeff_10);
#else
    __m256 inner = _mm256_fmadd_ps(r, coeff_6, coeff_neg_15);
    inner = _mm256_fmadd_ps(r, inner, coeff_10);
#endif
    // r*r*r * (10 + r * (-15 + r * 6)), stored as eight results
    _mm256_store_ps(ret + base, _mm256_mul_ps(inner, r_cubed));

  }
#ifdef USE_IACA
  IACA_END
#endif
}
Example #27
0
// Approximate sigmoid (coefficient 4.0) for eight neuron outputs at once:
//   result[k] = 1 / (1 + (1 - min(x[k], 4) / 4)^16)
// The reciprocal is taken with _mm256_rcp_ps. The input is read and the
// result written with aligned 256-bit accesses. Debug builds compare every
// lane against the scalar activation().
void neuralNet::activation_approx_avx(const float* _neuronOutput, float* result)
{
	BOOST_STATIC_ASSERT(SIGMOIDCOEFFICIENT == 4.0f);
	// code adapted from http://ybeernet.blogspot.com/2011/03/speeding-up-sigmoid-function-by.html
	static const __m256 ones = _mm256_set1_ps(1.0f);
	static const __m256 oneFourths = _mm256_set1_ps(0.25f);
	static const __m256 fours = _mm256_set1_ps(4.0f);

	// 1 - min(output, 4.0) * 0.25
	__m256 temp = _mm256_min_ps(_mm256_load_ps(_neuronOutput), fours);
	temp = _mm256_sub_ps(ones, _mm256_mul_ps(temp, oneFourths));

	// raise to the 16th power by squaring four times
	for (int squaring = 0; squaring < 4; ++squaring)
		temp = _mm256_mul_ps(temp, temp);

	// 1 / (1 + ans)
	temp = _mm256_rcp_ps(_mm256_add_ps(ones, temp));

#ifndef NDEBUG
	const float* _temp = (float*)&temp;
	assert(fastabs(_temp[0] - activation(_neuronOutput[0])) < 0.05f);
	assert(fastabs(_temp[1] - activation(_neuronOutput[1])) < 0.05f);
	assert(fastabs(_temp[2] - activation(_neuronOutput[2])) < 0.05f);
	assert(fastabs(_temp[3] - activation(_neuronOutput[3])) < 0.05f);
	assert(fastabs(_temp[4] - activation(_neuronOutput[4])) < 0.05f);
	assert(fastabs(_temp[5] - activation(_neuronOutput[5])) < 0.05f);
	assert(fastabs(_temp[6] - activation(_neuronOutput[6])) < 0.05f);
	assert(fastabs(_temp[7] - activation(_neuronOutput[7])) < 0.05f);
#endif

	// hand the eight activations back to the caller
	_mm256_store_ps(result, temp);
};
Example #28
0
/*
 * Scales a vector by a scalar: for every i in [0, x->n),
 *   NV_MAT_V(a, am, i) = NV_MAT_V(x, xm, i) * v.
 *
 * a must have at least as many elements as x. The AVX and SSE2 builds
 * multiply whole groups of 8 / 4 floats with aligned accesses and finish
 * the remainder with a scalar loop.
 */
void 
nv_vector_muls(nv_matrix_t *a, int am, const nv_matrix_t *x, int xm, float v)
{
	NV_ASSERT(a->n >= x->n);
#if NV_ENABLE_AVX
	{
		const __m256 scale = _mm256_set1_ps(v);
		const int packed_end = (x->n & 0xfffffff8);	/* n rounded down to a multiple of 8 */
		int i;

		for (i = 0; i < packed_end; i += 8) {
			const __m256 src = _mm256_load_ps(&NV_MAT_V(x, xm, i));
			_mm256_store_ps(&NV_MAT_V(a, am, i), _mm256_mul_ps(scale, src));
		}
		/* Scalar tail. */
		for (i = packed_end; i < x->n; ++i) {
			NV_MAT_V(a, am, i) = NV_MAT_V(x, xm, i) * v;
		}
	}
#elif NV_ENABLE_SSE2
	{
		const __m128 scale = _mm_set1_ps(v);
		const int packed_end = (x->n & 0xfffffffc);	/* n rounded down to a multiple of 4 */
		int i;

		for (i = 0; i < packed_end; i += 4) {
			const __m128 src = _mm_load_ps(&NV_MAT_V(x, xm, i));
			_mm_store_ps(&NV_MAT_V(a, am, i), _mm_mul_ps(scale, src));
		}
		/* Scalar tail. */
		for (i = packed_end; i < x->n; ++i) {
			NV_MAT_V(a, am, i) = NV_MAT_V(x, xm, i) * v;
		}
	}
#else
	{
		int i;
		for (i = 0; i < x->n; ++i) {
			NV_MAT_V(a, am, i) = NV_MAT_V(x, xm, i) * v;
		}
	}
#endif
}
Example #29
0
static void dequantize_block(float *in_data, float *out_data, float *quant_tbl)
{
	int zigzag;

	// Temporary buffer
	float temp_buf[8] __attribute__((aligned(32)));

	__m256 result, dct_values, quant_values;
	__m256 factor = _mm256_set1_ps(0.25f);

	for (zigzag = 0; zigzag < 64; zigzag += 8)
	{
		// Load dct-values
		dct_values = _mm256_load_ps(in_data + zigzag);

		quant_values = _mm256_load_ps(quant_tbl + zigzag);
		result = _mm256_mul_ps(dct_values, quant_values);

		// Multiply with 0.25 to divide by 4.0
		result = _mm256_mul_ps(result, factor);

		// Round off products and store them temporarily
		result = c63_mm256_roundhalfawayfromzero_ps(result);
		_mm256_store_ps(temp_buf, result);

		// Store the results at the correct places in the out_data buffer
		out_data[UV_indexes[zigzag]] = temp_buf[0];
		out_data[UV_indexes[zigzag + 1]] = temp_buf[1];
		out_data[UV_indexes[zigzag + 2]] = temp_buf[2];
		out_data[UV_indexes[zigzag + 3]] = temp_buf[3];
		out_data[UV_indexes[zigzag + 4]] = temp_buf[4];
		out_data[UV_indexes[zigzag + 5]] = temp_buf[5];
		out_data[UV_indexes[zigzag + 6]] = temp_buf[6];
		out_data[UV_indexes[zigzag + 7]] = temp_buf[7];
	}
}
Example #30
0
// Writes the eight lanes of b to ptr with an aligned 256-bit store
// (_mm256_store_ps), so ptr must be 32-byte aligned.
INLINE void store8b(void *ptr, const avxb& b) {
  float *const dst = (float*)ptr;
  _mm256_store_ps(dst, b);
}