void NBodyAlgorithmCPU::calculateAcceleration(const float3(&posI)[8], const float massJ, const float3 posJ, float *accI) {
    // Gather the eight i-body positions into one AVX register per component.
    __m256 pix = _mm256_set_ps(posI[7].x, posI[6].x, posI[5].x, posI[4].x, posI[3].x, posI[2].x, posI[1].x, posI[0].x);
    __m256 piy = _mm256_set_ps(posI[7].y, posI[6].y, posI[5].y, posI[4].y, posI[3].y, posI[2].y, posI[1].y, posI[0].y);
    __m256 piz = _mm256_set_ps(posI[7].z, posI[6].z, posI[5].z, posI[4].z, posI[3].z, posI[2].z, posI[1].z, posI[0].z);

    // Broadcast the j-body position.
    __m256 pjx = _mm256_set1_ps(posJ.x);
    __m256 pjy = _mm256_set1_ps(posJ.y);
    __m256 pjz = _mm256_set1_ps(posJ.z);

    // r = posJ - posI
    __m256 rx = _mm256_sub_ps(pjx, pix);
    __m256 ry = _mm256_sub_ps(pjy, piy);
    __m256 rz = _mm256_sub_ps(pjz, piz);

    // |r| = sqrt(rx^2 + ry^2 + rz^2 + EPS2), softened to avoid division by zero.
    __m256 eps2 = _mm256_set1_ps(mp_properties->EPS2);
    __m256 rx2 = _mm256_mul_ps(rx, rx);
    __m256 ry2 = _mm256_mul_ps(ry, ry);
    __m256 rz2 = _mm256_mul_ps(rz, rz);
    __m256 rabs = _mm256_sqrt_ps(_mm256_add_ps(_mm256_add_ps(rx2, ry2), _mm256_add_ps(rz2, eps2)));

    // a = massJ * r / |r|^3
    __m256 m = _mm256_set1_ps(massJ);
    __m256 rabsInv = _mm256_div_ps(m, _mm256_mul_ps(_mm256_mul_ps(rabs, rabs), rabs));
    __m256 aix = _mm256_mul_ps(rx, rabsInv);
    __m256 aiy = _mm256_mul_ps(ry, rabsInv);
    __m256 aiz = _mm256_mul_ps(rz, rabsInv);

    // accI holds 24 floats: x components in [0..7], y in [8..15], z in [16..23].
    _mm256_store_ps(accI, aix);
    _mm256_store_ps(accI + 8, aiy);
    _mm256_store_ps(accI + 16, aiz);
}
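// Usage note (added, not part of the original source): the three stores above require
// accI to be 32-byte aligned and large enough for 24 floats -- x components land in
// accI[0..7], y in accI[8..15], z in accI[16..23]. A minimal, hypothetical call-site
// sketch under those assumptions ('algorithm', the filled posI array, and the aggregate
// initialization of float3 are made up for illustration):
void example_call(NBodyAlgorithmCPU &algorithm, const float3 (&posI)[8]) {
    alignas(32) float accI[24];             // 32-byte alignment required by _mm256_store_ps
    float3 posJ = { 1.0f, 2.0f, 3.0f };     // interacting body position (assumed aggregate)
    const float massJ = 5.0f;

    algorithm.calculateAcceleration(posI, massJ, posJ, accI);

    for (int k = 0; k < 8; ++k) {
        float ax = accI[k];        // x component for body k
        float ay = accI[8 + k];    // y component
        float az = accI[16 + k];   // z component
        (void)ax; (void)ay; (void)az;  // accumulate into the integrator here
    }
}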
//-----------------------------------------------------------------
// AOS -> SOA
//
// pBgr: b0,g0,r0, b1,g1,r1, b2,g2,r2, b3,g3,r3, b4,g4,r4, ...
//   ->
// pBlu: b0, b1, b2, b3, b4, ...
// pGrn: g0, g1, g2, g3, g4, ...
// pRed: r0, r1, r2, r3, r4, ...
void aos2soa(float *pBgr, float *pBlu, float *pGrn, float *pRed, const size_t length) {
    __m128 *bgr = (__m128 *)pBgr;
    float *b = pBlu;
    float *g = pGrn;
    float *r = pRed;

    for (size_t i = 0; i < length; i += 24, b += 8, g += 8, r += 8) {
        __m256 m03 = _mm256_castps128_ps256(*bgr++);     // load lower halves
        __m256 m14 = _mm256_castps128_ps256(*bgr++);
        __m256 m25 = _mm256_castps128_ps256(*bgr++);
        m03 = _mm256_insertf128_ps(m03, *bgr++, 1);      // load upper halves
        m14 = _mm256_insertf128_ps(m14, *bgr++, 1);
        m25 = _mm256_insertf128_ps(m25, *bgr++, 1);

        __m256 bg = _mm256_shuffle_ps(m14, m25, _MM_SHUFFLE(2, 1, 3, 2)); // upper parts of b and g
        __m256 gr = _mm256_shuffle_ps(m03, m14, _MM_SHUFFLE(1, 0, 2, 1)); // lower parts of g and r
        __m256 bb = _mm256_shuffle_ps(m03, bg, _MM_SHUFFLE(2, 0, 3, 0));
        __m256 gg = _mm256_shuffle_ps(gr, bg, _MM_SHUFFLE(3, 1, 2, 0));
        __m256 rr = _mm256_shuffle_ps(gr, m25, _MM_SHUFFLE(3, 0, 3, 1));

        _mm256_store_ps(b, bb);
        _mm256_store_ps(g, gg);
        _mm256_store_ps(r, rr);
    }
}
static inline void rectifier_kernel_avx5(float *a, const size_t blocks) {
    for (size_t i = 0; i < blocks; ++i) {
        _mm256_store_ps(&a[i*8*5 + 0*8], _mm256_max_ps(_mm256_load_ps(&a[i*8*5 + 0*8]), _mm256_setzero_ps()));
        _mm256_store_ps(&a[i*8*5 + 1*8], _mm256_max_ps(_mm256_load_ps(&a[i*8*5 + 1*8]), _mm256_setzero_ps()));
        _mm256_store_ps(&a[i*8*5 + 2*8], _mm256_max_ps(_mm256_load_ps(&a[i*8*5 + 2*8]), _mm256_setzero_ps()));
        _mm256_store_ps(&a[i*8*5 + 3*8], _mm256_max_ps(_mm256_load_ps(&a[i*8*5 + 3*8]), _mm256_setzero_ps()));
        _mm256_store_ps(&a[i*8*5 + 4*8], _mm256_max_ps(_mm256_load_ps(&a[i*8*5 + 4*8]), _mm256_setzero_ps()));
    }
}
double compute_pi_leibniz_avx_opt_single(size_t n) {
    double pi = 0.0;
    register __m256 ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8;
    register __m256 ymm9, ymm10, ymm11, ymm12, ymm13;

    // Alternating signs and the first four blocks of odd denominators.
    ymm0 = _mm256_set_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0);
    ymm1 = _mm256_set_ps(1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0);
    ymm2 = _mm256_set_ps(17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0, 31.0);
    ymm3 = _mm256_set_ps(33.0, 35.0, 37.0, 39.0, 41.0, 43.0, 45.0, 47.0);
    ymm4 = _mm256_set_ps(49.0, 51.0, 53.0, 55.0, 57.0, 59.0, 61.0, 63.0);
    ymm13 = _mm256_set1_ps(64.0);   // advance every denominator block by 64 per iteration

    // Four independent accumulators hide the division latency.
    ymm5 = _mm256_setzero_ps();
    ymm6 = _mm256_setzero_ps();
    ymm7 = _mm256_setzero_ps();
    ymm8 = _mm256_setzero_ps();

    // 32 terms per iteration; guard written as i + 32 <= n so a small n cannot underflow.
    for (size_t i = 0; i + 32 <= n; i += 32) {
        ymm9 = _mm256_div_ps(ymm0, ymm1);
        ymm1 = _mm256_add_ps(ymm1, ymm13);
        ymm10 = _mm256_div_ps(ymm0, ymm2);
        ymm2 = _mm256_add_ps(ymm2, ymm13);
        ymm11 = _mm256_div_ps(ymm0, ymm3);
        ymm3 = _mm256_add_ps(ymm3, ymm13);
        ymm12 = _mm256_div_ps(ymm0, ymm4);
        ymm4 = _mm256_add_ps(ymm4, ymm13);

        ymm5 = _mm256_add_ps(ymm5, ymm9);
        ymm6 = _mm256_add_ps(ymm6, ymm10);
        ymm7 = _mm256_add_ps(ymm7, ymm11);
        ymm8 = _mm256_add_ps(ymm8, ymm12);
    }

    // Horizontal reduction of the four accumulators.
    float tmp[8] __attribute__((aligned(32)));
    _mm256_store_ps(tmp, ymm5);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
    _mm256_store_ps(tmp, ymm6);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
    _mm256_store_ps(tmp, ymm7);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
    _mm256_store_ps(tmp, ymm8);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];

    return pi * 4.0;
}
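// Reference note (added, not in the original): the loop above unrolls the Leibniz series
// pi/4 = 1 - 1/3 + 1/5 - 1/7 + ... by 32 terms per iteration. A plain scalar version of
// the same sum, useful for sanity-checking results on small n (the helper name is
// hypothetical):
double compute_pi_leibniz_scalar(size_t n) {
    double sum = 0.0;
    double sign = 1.0;
    for (size_t k = 0; k < n; ++k) {
        sum += sign / (2.0 * k + 1.0);   // terms: +1, -1/3, +1/5, -1/7, ...
        sign = -sign;
    }
    return sum * 4.0;
}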
static inline void rectifier_kernel_back_avx(float *a, const size_t blocks) {
    assert(blocks > 0);
    // Backwards pass over the 8-float blocks, from index blocks-1 down to 0.
    // The counter is decremented before use so the store never runs past a[blocks*8-1].
    size_t i = blocks;
    do {
        --i;
        _mm256_store_ps(&a[i*8], _mm256_max_ps(_mm256_load_ps(&a[i*8]), _mm256_setzero_ps()));
    } while (i);
}
void aaaa(const int c, float * const a3, float * const a4, const float *fck) {
    __m256 a_fck = _mm256_load_ps(fck);        // 8 coefficients, reused for every block
    for (int i = 0; i < c; i += 16) {          // two 8-float blocks per iteration
        __m256 b_i = _mm256_load_ps(&a4[i]);
        __m256 out_i = _mm256_mul_ps(b_i, a_fck);
        __m256 c_i = _mm256_load_ps(&a4[i+8]);
        __m256 out_i2 = _mm256_mul_ps(c_i, a_fck);
        _mm256_store_ps(&a3[i], out_i);
        _mm256_store_ps(&a3[i+8], out_i2);
    }
}
static void quantize_block(const float *in_data, float *out_data, float *quant_tbl) {
    int zigzag;
    __m256 result, dct_values, quant_values;
    __m256 factor = _mm256_set1_ps(0.25f);

    for (zigzag = 0; zigzag < 64; zigzag += 8) {
        // Set the dct_values for the current iteration
        dct_values = _mm256_set_ps(in_data[UV_indexes[zigzag + 7]], in_data[UV_indexes[zigzag + 6]],
                                   in_data[UV_indexes[zigzag + 5]], in_data[UV_indexes[zigzag + 4]],
                                   in_data[UV_indexes[zigzag + 3]], in_data[UV_indexes[zigzag + 2]],
                                   in_data[UV_indexes[zigzag + 1]], in_data[UV_indexes[zigzag]]);

        // Multiply with 0.25 to divide by 4.0
        result = _mm256_mul_ps(dct_values, factor);

        // Load quant-values and divide the previous product by them
        quant_values = _mm256_load_ps(quant_tbl + zigzag);
        result = _mm256_div_ps(result, quant_values);

        // Round off values and store in out_data buffer
        result = c63_mm256_roundhalfawayfromzero_ps(result);
        _mm256_store_ps(out_data + zigzag, result);
    }
}
float nv_vector_dot(const nv_matrix_t *vec1, int m1, const nv_matrix_t *vec2, int m2) {
    NV_ASSERT(vec1->n == vec2->n);

#if NV_ENABLE_AVX
    {
        NV_ALIGNED(float, mm[8], 32);
        __m256 x, u;
        int n;
        int pk_lp = (vec1->n & 0xfffffff8);
        float dp = 0.0f;

        u = _mm256_setzero_ps();
        for (n = 0; n < pk_lp; n += 8) {
            x = _mm256_load_ps(&NV_MAT_V(vec2, m2, n));
            u = _mm256_add_ps(u, _mm256_mul_ps(x, *(__m256 *)&NV_MAT_V(vec1, m1, n)));
        }
        _mm256_store_ps(mm, u);
        dp = mm[0] + mm[1] + mm[2] + mm[3] + mm[4] + mm[5] + mm[6] + mm[7];
        for (n = pk_lp; n < vec1->n; ++n) {
            dp += NV_MAT_V(vec1, m1, n) * NV_MAT_V(vec2, m2, n);
        }
        return dp;
    }
#elif NV_ENABLE_SSE2
    {
        NV_ALIGNED(float, mm[4], 16);
        __m128 x, u;
        int n;
        int pk_lp = (vec1->n & 0xfffffffc);
        float dp = 0.0f;

        u = _mm_setzero_ps();
        for (n = 0; n < pk_lp; n += 4) {
            x = _mm_load_ps(&NV_MAT_V(vec2, m2, n));
            u = _mm_add_ps(u, _mm_mul_ps(x, *(__m128 *)&NV_MAT_V(vec1, m1, n)));
        }
        _mm_store_ps(mm, u);
        dp = mm[0] + mm[1] + mm[2] + mm[3];
        for (n = pk_lp; n < vec1->n; ++n) {
            dp += NV_MAT_V(vec1, m1, n) * NV_MAT_V(vec2, m2, n);
        }
        return dp;
    }
#else
    {
        int n;
        float dp = 0.0f;
        for (n = 0; n < vec1->n; ++n) {
            dp += NV_MAT_V(vec1, m1, n) * NV_MAT_V(vec2, m2, n);
        }
        return dp;
    }
#endif
}
/*
 * Compute: s = sqrt( t**2 - x**2 - y**2 - z**2 ), with s, t, x, y, z
 * member variables of the st_coords structure arr.
 *
 * Traverse elements randomly
 */
void comp_s(st_coords arr, int L) {
    for (int j = 0; j < L; j += 8) {
        int i = (rand() % (L/8)) * 8;
        __m256 x = _mm256_load_ps(&arr.x[i]);
        __m256 y = _mm256_load_ps(&arr.y[i]);
        __m256 z = _mm256_load_ps(&arr.z[i]);
        __m256 t = _mm256_load_ps(&arr.t[i]);
#ifdef FMA
        register __m256 s0;
        s0 = _mm256_mul_ps(x, x);
        s0 = _mm256_fmadd_ps(y, y, s0);
        s0 = _mm256_fmadd_ps(z, z, s0);
        s0 = _mm256_fmsub_ps(t, t, s0);
        s0 = _mm256_sqrt_ps(s0);
#else
        register __m256 s0, s1;
        s1 = _mm256_mul_ps(x, x);
        s0 = _mm256_mul_ps(y, y);
        s1 = _mm256_add_ps(s0, s1);
        s0 = _mm256_mul_ps(z, z);
        s1 = _mm256_add_ps(s0, s1);
        s0 = _mm256_mul_ps(t, t);
        s1 = _mm256_sub_ps(s0, s1);
        s0 = _mm256_sqrt_ps(s1);
#endif
        _mm256_store_ps(&arr.s[i], s0);
    }
    return;
}
void memsetf(float *ptr, float value, size_t length) {
#ifdef __AVX__
    const __m256 fillvec = _mm256_set1_ps(value);
    size_t startIndex = align_complement_f32(ptr);

    for (size_t i = 0; i < startIndex; i++) {
        ptr[i] = value;
    }
    for (int i = (int)startIndex; i < (int)length - 7; i += 8) {
        _mm256_store_ps(ptr + i, fillvec);
    }
    for (size_t i = startIndex + ((length - startIndex) & ~0x7); i < length; i++) {
        ptr[i] = value;
    }
#elif defined(__ARM_NEON__)
    const float32x4_t fillvec = vdupq_n_f32(value);

    for (int i = 0; i < (int)length - 3; i += 4) {
        vst1q_f32(ptr + i, fillvec);
    }
    for (size_t i = (length & ~0x3); i < length; i++) {
        ptr[i] = value;
    }
#else
    for (size_t i = 0; i < length; i++) {
        ptr[i] = value;
    }
#endif
}
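// Note (added): memsetf's AVX path calls align_complement_f32(), which is defined
// elsewhere in the original project and is not shown here. Judging from how it is used,
// it returns the number of leading floats to fill scalar-wise before ptr reaches a
// 32-byte boundary. A plausible sketch under that assumption (requires <stdint.h>):
static size_t align_complement_f32(const float *ptr) {
    uintptr_t addr = (uintptr_t)ptr;
    size_t misalign = (size_t)(addr & 31u);            // bytes past the last 32-byte boundary
    return misalign ? (32u - misalign) / sizeof(float) : 0u;
}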
void neuralNet::activationPrime_avx(const float* neuronOutput, float* result) {
    static const __m256 ones = _mm256_set1_ps(1.0f);
    static const __m256 sigCoefficients = _mm256_set1_ps(SIGMOIDCOEFFICIENT);

    __m256 temp;
    const __m256* vOutput = (__m256*)neuronOutput;

    // 1 - ans
    temp = _mm256_sub_ps(ones, *vOutput);
    // (1-ans) * ans
    temp = _mm256_mul_ps(temp, *vOutput);
    // ans * coefficient
    temp = _mm256_mul_ps(temp, sigCoefficients);

#ifndef NDEBUG
    const float* _temp = (float*)&temp;
    assert(fastabs(_temp[0] - activationPrime(neuronOutput[0])) < 0.05f);
    assert(fastabs(_temp[1] - activationPrime(neuronOutput[1])) < 0.05f);
    assert(fastabs(_temp[2] - activationPrime(neuronOutput[2])) < 0.05f);
    assert(fastabs(_temp[3] - activationPrime(neuronOutput[3])) < 0.05f);
    assert(fastabs(_temp[4] - activationPrime(neuronOutput[4])) < 0.05f);
    assert(fastabs(_temp[5] - activationPrime(neuronOutput[5])) < 0.05f);
    assert(fastabs(_temp[6] - activationPrime(neuronOutput[6])) < 0.05f);
    assert(fastabs(_temp[7] - activationPrime(neuronOutput[7])) < 0.05f);
#endif

    // return ans
    _mm256_store_ps(result, temp);
};
void _mm256_print_ps(__m256 x) {
    size_t i;
    float tab[8] __attribute__((aligned(32)));   // 32-byte alignment required by _mm256_store_ps
    _mm256_store_ps(tab, x);
    for (i = 0; i < 8; i++)
        printf("%8.2f ", tab[i]);
}
//-------------------------------------------------------------------
// blend
void effect(float *pBuffer[], const Cbmp *bmp, const float weight) {
    size_t width = bmp->getWidth();
    size_t height = bmp->getAbsHeight();

    const float fmul0 = weight;
    const float fmul1 = 1.0f - weight;
    __m256 weight0 = _mm256_broadcast_ss(&fmul0);
    __m256 weight1 = _mm256_broadcast_ss(&fmul1);

    float *pF[2];
    pF[0] = pBuffer[0];
    pF[1] = pBuffer[1];

    for (size_t y = 0; y < height; y++) {
        for (size_t x = 0; x < width; x += 8, pF[0] += 8, pF[1] += 8) {
            __m256 p0 = _mm256_load_ps(pF[0]);
            p0 = _mm256_mul_ps(p0, weight0);
            __m256 p1 = _mm256_load_ps(pF[1]);
            p1 = _mm256_mul_ps(p1, weight1);
            __m256 r = _mm256_add_ps(p0, p1);
            _mm256_store_ps(pF[0], r);
        }
    }
}
/*
 * compute: s = sqrt( t**2 - x**2 - y**2 - z**2 ), with s, t, x, y, z
 * member variables of the st_coords structure arr
 */
void comp_s(st_coords arr, int L) {
    for (int i = 0; i < L; i += 8) {
        __m256 x = _mm256_load_ps(&arr.x[i]);
        __m256 y = _mm256_load_ps(&arr.y[i]);
        __m256 z = _mm256_load_ps(&arr.z[i]);
        __m256 t = _mm256_load_ps(&arr.t[i]);
        register __m256 s0, s1; /* Temporaries */

        /* s[i:i+8] = sqrt(t[i:i+8]**2 - (x[i:i+8]**2 + y[i:i+8]**2 + z[i:i+8]**2)) */
        s1 = _mm256_mul_ps(x, x);
        s0 = _mm256_mul_ps(y, y);
        s1 = _mm256_add_ps(s0, s1);
        s0 = _mm256_mul_ps(z, z);
        s1 = _mm256_add_ps(s0, s1);
        s0 = _mm256_mul_ps(t, t);
        s1 = _mm256_sub_ps(s0, s1);
        s0 = _mm256_sqrt_ps(s1);

        _mm256_store_ps(&arr.s[i], s0);
    }
    return;
}
//-------------------------------------------------------------------
// effect
static void effect(float *pBlu, float *pGrn, float *pRed, const size_t height, const size_t width) {
    __m256 zeroPs = _mm256_set_ps(0, 0, 0, 0, 0, 0, 0, 0);
    float *b = pBlu;
    float *r = pRed;

    for (size_t y = 0; y < height; y++) {
        for (size_t x = 0; x < width / 8; x++, b += 8, r += 8) {
            _mm256_store_ps(b, zeroPs); // B
            // G, skip
            _mm256_store_ps(r, zeroPs); // R
        }
    }
}
static __forceinline void convert_half_to_float(float* dstp, const uint8_t* srcp, size_t count) {
    for (size_t x = 0; x < count; x += 8) {
        // Load 8 packed half-precision values and widen them to 8 floats (F16C).
        __m128i s = _mm_load_si128(reinterpret_cast<const __m128i*>(srcp + 2 * x));
        __m256 d = _mm256_cvtph_ps(s);
        _mm256_store_ps(dstp + x, d);
    }
}
void rectifier_avx_2(float *a, const size_t len) {
    float *p = a;
    // Vectorized ReLU over full 8-float blocks...
    for (; p + 8 <= &a[len]; p += 8) {
        _mm256_store_ps(p, _mm256_max_ps(_mm256_load_ps(p), _mm256_setzero_ps()));
    }
    // ...and a scalar tail for the remaining elements.
    for (; p < &a[len]; ++p) {
        *p = *p > 0.0f ? *p : 0.0f;
    }
}
// =============================================================
// ====================== RGBX2BGRX_32F ========================
// =============================================================
void _rgbx2bgrx_32f(const float* _src, float* _dest,
                    unsigned int _width, unsigned int _pitchs, unsigned int _pitchd,
                    unsigned int _start, unsigned int _stop) {
#ifdef USE_SSE
    const unsigned int widthz = (_pitchs/8);

    // Get start positions for buffers
    const float* tsrc;
    float* tdest;

    for( unsigned int y=_start; y<=_stop; ++y ) {
        tsrc = _src+(y*_pitchs);
        tdest = _dest+(y*_pitchd);

        for( unsigned int x=0; x<widthz; ++x ) {
#ifdef USE_AVX1
            const __m256 v0 = _mm256_load_ps(tsrc); tsrc+=8;
            __m256 r0 = _mm256_permute_ps(v0,0xc6);
            _mm256_store_ps(tdest, r0 ); tdest+=8;
#else
            // NOT TESTED
            const __m128 v0 = _mm_load_ps(tsrc); tsrc+=4;
            const __m128 v1 = _mm_load_ps(tsrc); tsrc+=4;
            //__m128 r0 = _mm_shuffle_ps(v0,0xc6);
            //__m128 r1 = _mm_shuffle_ps(v1,0xc6);
            //_mm_store_ps(tdest, r0 ); tdest+=4;
            //_mm_store_ps(tdest, r1 ); tdest+=4;
#endif
        }
    }
#else
    const float* tsrc;
    float* tdest;

    for( unsigned int y=_start; y<=_stop; ++y ) {
        tsrc = _src+(y*_pitchs);
        tdest = _dest+(y*_pitchd);

        for( unsigned int x=0; x<_width; ++x ) {
            float t = tsrc[4*x];
            tdest[4*x] = tsrc[4*x+2];
            tdest[4*x+2] = t;
        }
    }
#endif
}
void AlignedAvxMult(float* d, float const* a, float const* b) {
    for (int i = 0; i < gNumFloats; i += 8) {
        __m256 v1 = _mm256_load_ps(&a[i]);
        __m256 v2 = _mm256_load_ps(&b[i]);
        __m256 r = _mm256_mul_ps(v1, v2);
        _mm256_store_ps(&d[i], r);
    }
}
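// Usage note (added): the aligned load/store intrinsics in AlignedAvxMult require d, a
// and b to be 32-byte aligned and gNumFloats (a global from the surrounding benchmark)
// to be a multiple of 8. A hypothetical call site using _mm_malloc/_mm_free:
void example_aligned_mult() {
    float *a = (float *)_mm_malloc(gNumFloats * sizeof(float), 32);
    float *b = (float *)_mm_malloc(gNumFloats * sizeof(float), 32);
    float *d = (float *)_mm_malloc(gNumFloats * sizeof(float), 32);
    // ... fill a and b ...
    AlignedAvxMult(d, a, b);   // d[i] = a[i] * b[i] for all i
    _mm_free(a);
    _mm_free(b);
    _mm_free(d);
}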
//--------------------------------------------------------------------------
// amp AVX
static void effectAvx(float fData[], const float amp, const size_t length) {
    __m256 psAmp = _mm256_broadcast_ss(&amp);   // broadcast the gain to all 8 lanes
    __m256 *pIn = (__m256 *)fData;
    for (size_t i = 0; i < length; i += 8, pIn++) {
        __m256 a = _mm256_mul_ps(*pIn, psAmp);
        _mm256_store_ps(&fData[i], a);
    }
}
static void scale_block(float *in_data, float *out_data) {
    __m256 in_vector, result;

    // Load the a1 values into a register
    static float a1_values[8] __attribute__((aligned(32))) = { ISQRT2, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f };
    __m256 a1 = _mm256_load_ps(a1_values);

    // Load the a2 values into a register for the exception case
    __m256 a2 = _mm256_set1_ps(ISQRT2);

    /* First row is an exception
     * Requires two _mm256_mul_ps operations */
    in_vector = _mm256_load_ps(in_data);
    result = _mm256_mul_ps(in_vector, a1);
    result = _mm256_mul_ps(result, a2);
    _mm256_store_ps(out_data, result);

    // Remaining calculations can be done with one _mm256_mul_ps operation
    in_vector = _mm256_load_ps(in_data + 8);
    result = _mm256_mul_ps(in_vector, a1);
    _mm256_store_ps(out_data + 8, result);

    in_vector = _mm256_load_ps(in_data + 16);
    result = _mm256_mul_ps(in_vector, a1);
    _mm256_store_ps(out_data + 16, result);

    in_vector = _mm256_load_ps(in_data + 24);
    result = _mm256_mul_ps(in_vector, a1);
    _mm256_store_ps(out_data + 24, result);

    in_vector = _mm256_load_ps(in_data + 32);
    result = _mm256_mul_ps(in_vector, a1);
    _mm256_store_ps(out_data + 32, result);

    in_vector = _mm256_load_ps(in_data + 40);
    result = _mm256_mul_ps(in_vector, a1);
    _mm256_store_ps(out_data + 40, result);

    in_vector = _mm256_load_ps(in_data + 48);
    result = _mm256_mul_ps(in_vector, a1);
    _mm256_store_ps(out_data + 48, result);

    in_vector = _mm256_load_ps(in_data + 56);
    result = _mm256_mul_ps(in_vector, a1);
    _mm256_store_ps(out_data + 56, result);
}
void nv_vector_add(nv_matrix_t *vec0, int m0,
                   const nv_matrix_t *vec1, int m1,
                   const nv_matrix_t *vec2, int m2) {
    NV_ASSERT(vec1->n == vec2->n);
    NV_ASSERT(vec2->n == vec0->n);

#if NV_ENABLE_AVX
    {
        __m256 x;
        int n;
        int pk_lp = (vec1->n & 0xfffffff8);

        for (n = 0; n < pk_lp; n += 8) {
            x = _mm256_load_ps(&NV_MAT_V(vec1, m1, n));
            _mm256_store_ps(&NV_MAT_V(vec0, m0, n),
                _mm256_add_ps(x, *(const __m256 *)&NV_MAT_V(vec2, m2, n)));
        }
        for (n = pk_lp; n < vec1->n; ++n) {
            NV_MAT_V(vec0, m0, n) = NV_MAT_V(vec1, m1, n) + NV_MAT_V(vec2, m2, n);
        }
    }
#elif NV_ENABLE_SSE2
    {
        int n;
        int pk_lp = (vec1->n & 0xfffffffc);

#ifdef _OPENMP
//#pragma omp parallel for
#endif
        for (n = 0; n < pk_lp; n += 4) {
            __m128 x = _mm_load_ps(&NV_MAT_V(vec1, m1, n));
            _mm_store_ps(&NV_MAT_V(vec0, m0, n),
                _mm_add_ps(x, *(const __m128 *)&NV_MAT_V(vec2, m2, n)));
        }
        for (n = pk_lp; n < vec1->n; ++n) {
            NV_MAT_V(vec0, m0, n) = NV_MAT_V(vec1, m1, n) + NV_MAT_V(vec2, m2, n);
        }
    }
#else
    {
        int n;
        for (n = 0; n < vec1->n; ++n) {
            NV_MAT_V(vec0, m0, n) = NV_MAT_V(vec1, m1, n) + NV_MAT_V(vec2, m2, n);
        }
    }
#endif
}
static void dct_1d_general(float* in_data, float* out_data, float lookup[64]) {
    __m256 current, dct_values, multiplied, sum;

    current = _mm256_broadcast_ss(in_data);
    dct_values = _mm256_load_ps(lookup);
    multiplied = _mm256_mul_ps(dct_values, current);
    sum = multiplied;

    // Broadcasts a single float (scalar) to every element in 'current'.
    current = _mm256_broadcast_ss(in_data + 1);
    // Loads DCT values from the lookup table. iDCT uses a transposed lookup table here.
    dct_values = _mm256_load_ps(lookup + 8);
    // Vertically multiply the scalar with the DCT values.
    multiplied = _mm256_mul_ps(dct_values, current);
    // Vertically add to the previous sum.
    sum = _mm256_add_ps(sum, multiplied);

    current = _mm256_broadcast_ss(in_data + 2);
    dct_values = _mm256_load_ps(lookup + 16);
    multiplied = _mm256_mul_ps(dct_values, current);
    sum = _mm256_add_ps(sum, multiplied);

    current = _mm256_broadcast_ss(in_data + 3);
    dct_values = _mm256_load_ps(lookup + 24);
    multiplied = _mm256_mul_ps(dct_values, current);
    sum = _mm256_add_ps(sum, multiplied);

    current = _mm256_broadcast_ss(in_data + 4);
    dct_values = _mm256_load_ps(lookup + 32);
    multiplied = _mm256_mul_ps(dct_values, current);
    sum = _mm256_add_ps(sum, multiplied);

    current = _mm256_broadcast_ss(in_data + 5);
    dct_values = _mm256_load_ps(lookup + 40);
    multiplied = _mm256_mul_ps(dct_values, current);
    sum = _mm256_add_ps(sum, multiplied);

    current = _mm256_broadcast_ss(in_data + 6);
    dct_values = _mm256_load_ps(lookup + 48);
    multiplied = _mm256_mul_ps(dct_values, current);
    sum = _mm256_add_ps(sum, multiplied);

    current = _mm256_broadcast_ss(in_data + 7);
    dct_values = _mm256_load_ps(lookup + 56);
    multiplied = _mm256_mul_ps(dct_values, current);
    sum = _mm256_add_ps(sum, multiplied);

    _mm256_store_ps(out_data, sum);
}
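// Reference note (added): the unrolled routine above computes, for each lane j,
// out[j] = sum over i of in[i] * lookup[i*8 + j]. A scalar equivalent for comparison
// (helper name is hypothetical):
static void dct_1d_general_scalar(const float *in_data, float *out_data, const float lookup[64]) {
    for (int j = 0; j < 8; ++j) {
        float sum = 0.0f;
        for (int i = 0; i < 8; ++i) {
            sum += in_data[i] * lookup[i * 8 + j];
        }
        out_data[j] = sum;
    }
}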
double Atomtype::CalcPE(int frame_i, const Trajectory &trj, const coordinates &rand_xyz, const cubicbox_m256 &box, double vol) const {
    float pe = 0.0;
    int atom_i = 0;

    /* BEGIN SIMD SECTION */
    // This performs the exact same calculation as the code after the SIMD section,
    // but doing it on 8 atoms at a time using SIMD instructions.

    coordinates8 rand_xyz8(rand_xyz), atom_xyz;
    __m256 r2_8, mask, r6, ri6, pe_tmp;
    __m256 pe_sum = _mm256_setzero_ps();
    float result[8] __attribute__((aligned (32)));   // 32-byte alignment required by _mm256_store_ps

    for (; atom_i < this->n-8; atom_i+=8) {
        atom_xyz = trj.GetXYZ8(frame_i, this->name, atom_i);
        r2_8 = distance2(atom_xyz, rand_xyz8, box);
        mask = _mm256_cmp_ps(r2_8, rcut2_8, _CMP_LT_OS);
        r6 = _mm256_and_ps(mask, _mm256_mul_ps(_mm256_mul_ps(r2_8, r2_8), r2_8));
        ri6 = _mm256_and_ps(mask, _mm256_rcp_ps(r6));
        pe_tmp = _mm256_and_ps(mask, _mm256_mul_ps(ri6, _mm256_sub_ps(_mm256_mul_ps(c12_8, ri6), c6_8)));
        pe_sum = _mm256_add_ps(pe_tmp, pe_sum);
    }
    _mm256_store_ps(result, pe_sum);
    for (int i = 0; i < 8; i++) {
        pe += result[i];
    }

    /* END SIMD SECTION */

    for (; atom_i < this->n; atom_i++) {
        coordinates atom_xyz = trj.GetXYZ(frame_i, this->name, atom_i);
        float r2 = distance2(atom_xyz, rand_xyz, cubicbox(box));
        if (r2 < this->rcut2) {
            float ri6 = 1.0/(pow(r2,3));
            pe += ri6*(this->c12*ri6 - this->c6);
        }
    }

    pe += this->n/vol * this->tail_factor;

    return pe;
}
void nv_vector_inv(nv_matrix_t *a, int am, const nv_matrix_t *x, int xm) {
    NV_ASSERT(a->n >= x->n);

#if NV_ENABLE_AVX
    {
        __m256 xx, vv;
        int n;
        int pk_lp = (x->n & 0xfffffff8);

        vv = _mm256_set1_ps(1.0f);
        for (n = 0; n < pk_lp; n += 8) {
            xx = _mm256_load_ps(&NV_MAT_V(x, xm, n));
            xx = _mm256_div_ps(vv, xx);
            _mm256_store_ps(&NV_MAT_V(a, am, n), xx);
        }
        for (n = pk_lp; n < x->n; ++n) {
            NV_MAT_V(a, am, n) = 1.0f / NV_MAT_V(x, xm, n);
        }
    }
#elif NV_ENABLE_SSE2
    {
        __m128 xx, vv;
        int n;
        int pk_lp = (x->n & 0xfffffffc);

        vv = _mm_set1_ps(1.0f);
        for (n = 0; n < pk_lp; n += 4) {
            xx = _mm_load_ps(&NV_MAT_V(x, xm, n));
            xx = _mm_div_ps(vv, xx);
            _mm_store_ps(&NV_MAT_V(a, am, n), xx);
        }
        for (n = pk_lp; n < x->n; ++n) {
            NV_MAT_V(a, am, n) = 1.0f / NV_MAT_V(x, xm, n);
        }
    }
#else
    {
        int n;
        for (n = 0; n < x->n; ++n) {
            NV_MAT_V(a, am, n) = 1.0f / NV_MAT_V(x, xm, n);
        }
    }
#endif
}
void polynomial(float *ret, const float *const r_values, int num) {
    // r*r*r*(10+r*(-15+r*6));
    __m256 const_6 = _mm256_set1_ps(6.0f);
    __m256 const_neg_15 = _mm256_set1_ps(-15.0f);
    __m256 const_10 = _mm256_set1_ps(10.0f);
    // constants

    const int loop_factor = 8;

    for (int i = 0; i < num; i += loop_factor) {
#ifdef USE_IACA
        IACA_START
#endif
        __m256 r;
        __m256 left;
        __m256 right;

        // aligned load of 256 bits r
        r = _mm256_load_ps(&r_values[i]);
        left = _mm256_mul_ps(r, r);                 // r * r
#ifndef __FMA__
        right = _mm256_mul_ps(r, const_6);          // r * 6
        left = _mm256_mul_ps(left, r);              // r * r * r
        right = _mm256_add_ps(right, const_neg_15); // -15 + r * 6
        right = _mm256_mul_ps(right, r);            // r * (-15 + r * 6)
        right = _mm256_add_ps(right, const_10);     // 10 + (r * (-15 + r * 6))
#else
        right = _mm256_fmadd_ps(r, const_6, const_neg_15);
        left = _mm256_mul_ps(left, r);
        right = _mm256_fmadd_ps(r, right, const_10);
#endif
        right = _mm256_mul_ps(right, left);         // r*r*r * (10 + r * (-15 + r * 6))
        _mm256_store_ps(&ret[i], right);            // store 8 values to ret[i]
    }
#ifdef USE_IACA
    IACA_END
#endif
}
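// Reference note (added): a scalar version of the polynomial evaluated above, taken
// straight from the comment r*r*r*(10 + r*(-15 + r*6)); handy for verifying both the
// plain AVX and the FMA paths (helper name is hypothetical):
static void polynomial_scalar(float *ret, const float *r_values, int num) {
    for (int i = 0; i < num; ++i) {
        float r = r_values[i];
        ret[i] = r * r * r * (10.0f + r * (-15.0f + r * 6.0f));
    }
}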
void neuralNet::activation_approx_avx(const float* _neuronOutput, float* result) {
    BOOST_STATIC_ASSERT(SIGMOIDCOEFFICIENT == 4.0f);
    // code adapted from http://ybeernet.blogspot.com/2011/03/speeding-up-sigmoid-function-by.html
    // approximates sigmoid function with coefficient 4.0f
    static const __m256 ones = _mm256_set1_ps(1.0f);
    static const __m256 oneFourths = _mm256_set1_ps(0.25f);
    static const __m256 fours = _mm256_set1_ps(4.0f);

    __m256 temp;
    const __m256* vOutput = (__m256*)_neuronOutput;

    // min (output, 4.0)
    temp = _mm256_min_ps(*vOutput, fours);
    // multiply by 0.25
    temp = _mm256_mul_ps(temp, oneFourths);
    // 1 - ans
    temp = _mm256_sub_ps(ones, temp);
    // ans^16
    temp = _mm256_mul_ps(temp, temp);
    temp = _mm256_mul_ps(temp, temp);
    temp = _mm256_mul_ps(temp, temp);
    temp = _mm256_mul_ps(temp, temp);
    // 1 + ans
    temp = _mm256_add_ps(ones, temp);
    // 1 / ans
    temp = _mm256_rcp_ps(temp);

#ifndef NDEBUG
    const float* _temp = (float*)&temp;
    assert(fastabs(_temp[0] - activation(_neuronOutput[0])) < 0.05f);
    assert(fastabs(_temp[1] - activation(_neuronOutput[1])) < 0.05f);
    assert(fastabs(_temp[2] - activation(_neuronOutput[2])) < 0.05f);
    assert(fastabs(_temp[3] - activation(_neuronOutput[3])) < 0.05f);
    assert(fastabs(_temp[4] - activation(_neuronOutput[4])) < 0.05f);
    assert(fastabs(_temp[5] - activation(_neuronOutput[5])) < 0.05f);
    assert(fastabs(_temp[6] - activation(_neuronOutput[6])) < 0.05f);
    assert(fastabs(_temp[7] - activation(_neuronOutput[7])) < 0.05f);
#endif

    // return ans
    _mm256_store_ps(result, temp);
};
void nv_vector_muls(nv_matrix_t *a, int am, const nv_matrix_t *x, int xm, float v) {
    NV_ASSERT(a->n >= x->n);

#if NV_ENABLE_AVX
    {
        __m256 vv;
        int n;
        int pk_lp = (x->n & 0xfffffff8);

        vv = _mm256_set1_ps(v);
        for (n = 0; n < pk_lp; n += 8) {
            _mm256_store_ps(&NV_MAT_V(a, am, n),
                _mm256_mul_ps(vv, *(const __m256 *)&NV_MAT_V(x, xm, n)));
        }
        for (n = pk_lp; n < x->n; ++n) {
            NV_MAT_V(a, am, n) = NV_MAT_V(x, xm, n) * v;
        }
    }
#elif NV_ENABLE_SSE2
    {
        __m128 vv;
        int n;
        int pk_lp = (x->n & 0xfffffffc);

        vv = _mm_set1_ps(v);
        for (n = 0; n < pk_lp; n += 4) {
            _mm_store_ps(&NV_MAT_V(a, am, n),
                _mm_mul_ps(vv, *(const __m128 *)&NV_MAT_V(x, xm, n)));
        }
        for (n = pk_lp; n < x->n; ++n) {
            NV_MAT_V(a, am, n) = NV_MAT_V(x, xm, n) * v;
        }
    }
#else
    {
        int n;
        for (n = 0; n < x->n; ++n) {
            NV_MAT_V(a, am, n) = NV_MAT_V(x, xm, n) * v;
        }
    }
#endif
}
static void dequantize_block(float *in_data, float *out_data, float *quant_tbl) {
    int zigzag;

    // Temporary buffer
    float temp_buf[8] __attribute__((aligned(32)));

    __m256 result, dct_values, quant_values;
    __m256 factor = _mm256_set1_ps(0.25f);

    for (zigzag = 0; zigzag < 64; zigzag += 8) {
        // Load dct-values
        dct_values = _mm256_load_ps(in_data + zigzag);
        quant_values = _mm256_load_ps(quant_tbl + zigzag);
        result = _mm256_mul_ps(dct_values, quant_values);

        // Multiply with 0.25 to divide by 4.0
        result = _mm256_mul_ps(result, factor);

        // Round off products and store them temporarily
        result = c63_mm256_roundhalfawayfromzero_ps(result);
        _mm256_store_ps(temp_buf, result);

        // Store the results at the correct places in the out_data buffer
        out_data[UV_indexes[zigzag]] = temp_buf[0];
        out_data[UV_indexes[zigzag + 1]] = temp_buf[1];
        out_data[UV_indexes[zigzag + 2]] = temp_buf[2];
        out_data[UV_indexes[zigzag + 3]] = temp_buf[3];
        out_data[UV_indexes[zigzag + 4]] = temp_buf[4];
        out_data[UV_indexes[zigzag + 5]] = temp_buf[5];
        out_data[UV_indexes[zigzag + 6]] = temp_buf[6];
        out_data[UV_indexes[zigzag + 7]] = temp_buf[7];
    }
}
INLINE void store8b(void *ptr, const avxb& b) {
    _mm256_store_ps((float*)ptr, b);
}