/*
 * Compute: s = sqrt( t**2 - x**2 - y**2 - z**2 ), with s, t, x, y, z
 * member variables of the st_coords structure arr.
 *
 * Traverse elements randomly: every iteration draws, via rand(), the start
 * index i of one 8-float block (i is a multiple of 8 and i + 8 <= L) and
 * processes that block with AVX. A block may therefore be visited several
 * times or not at all.
 *
 * The aligned load/store intrinsics are used, so the addresses
 * &arr.x[i], &arr.y[i], &arr.z[i], &arr.t[i] and &arr.s[i] must be 32-byte
 * aligned for every multiple-of-8 index i.
 *
 * Nothing is done when L holds no complete block of 8 elements.
 */
void comp_s(st_coords arr, int L)
{
  const int nblocks = L / 8;

  /* Without a complete block, rand() % nblocks would be a modulo by zero. */
  if (nblocks <= 0)
    return;

  for (int j = 0; j < L; j += 8) {
    const int i = (rand() % nblocks) * 8;

    const __m256 x = _mm256_load_ps(&arr.x[i]);
    const __m256 y = _mm256_load_ps(&arr.y[i]);
    const __m256 z = _mm256_load_ps(&arr.z[i]);
    const __m256 t = _mm256_load_ps(&arr.t[i]);

    __m256 s0;
#ifdef FMA
    /* s0 = t*t - (z*z + (y*y + x*x)), using fused multiply-add/sub */
    s0 = _mm256_mul_ps(x, x);
    s0 = _mm256_fmadd_ps(y, y, s0);
    s0 = _mm256_fmadd_ps(z, z, s0);
    s0 = _mm256_fmsub_ps(t, t, s0);
    s0 = _mm256_sqrt_ps(s0);
#else
    __m256 s1;
    s1 = _mm256_mul_ps(x, x);
    s0 = _mm256_mul_ps(y, y);
    s1 = _mm256_add_ps(s0, s1);
    s0 = _mm256_mul_ps(z, z);
    s1 = _mm256_add_ps(s0, s1);
    s0 = _mm256_mul_ps(t, t);
    s1 = _mm256_sub_ps(s0, s1);
    s0 = _mm256_sqrt_ps(s1);
#endif

    _mm256_store_ps(&arr.s[i], s0);
  }
}
void NBodyAlgorithmCPU::calculateAcceleration(const float3(&posI)[8], const float massJ, const float3 posJ, float *accI) { __m256 pix = _mm256_set_ps(posI[7].x, posI[6].x, posI[5].x, posI[4].x, posI[3].x, posI[2].x, posI[1].x, posI[0].x); __m256 piy = _mm256_set_ps(posI[7].y, posI[6].y, posI[5].y, posI[4].y, posI[3].y, posI[2].y, posI[1].y, posI[0].y); __m256 piz = _mm256_set_ps(posI[7].z, posI[6].z, posI[5].z, posI[4].z, posI[3].z, posI[2].z, posI[1].z, posI[0].z); __m256 pjx = _mm256_set1_ps(posJ.x); __m256 pjy = _mm256_set1_ps(posJ.y); __m256 pjz = _mm256_set1_ps(posJ.z); __m256 rx = _mm256_sub_ps(pjx, pix); __m256 ry = _mm256_sub_ps(pjy, piy); __m256 rz = _mm256_sub_ps(pjz, piz); __m256 eps2 = _mm256_set1_ps(mp_properties->EPS2); __m256 rx2 = _mm256_mul_ps(rx, rx); __m256 ry2 = _mm256_mul_ps(ry, ry); __m256 rz2 = _mm256_mul_ps(rz, rz); __m256 rabs = _mm256_sqrt_ps(_mm256_add_ps(_mm256_add_ps(rx2, ry2), _mm256_add_ps(rz2, eps2))); __m256 m = _mm256_set1_ps(massJ); __m256 rabsInv = _mm256_div_ps(m, _mm256_mul_ps(_mm256_mul_ps(rabs, rabs), rabs)); __m256 aix = _mm256_mul_ps(rx, rabsInv); __m256 aiy = _mm256_mul_ps(ry, rabsInv); __m256 aiz = _mm256_mul_ps(rz, rabsInv); _mm256_store_ps(accI, aix); _mm256_store_ps(accI + 8, aiy); _mm256_store_ps(accI + 16, aiz); }
void NBodyAlgorithmCPU::calculateAcceleration(const float3(&posI)[8], const float massJ, const float3 posJ, float3(&accI)[8]) { __m256 pix = _mm256_set_ps(posI[0].x, posI[1].x, posI[2].x, posI[3].x, posI[4].x, posI[5].x, posI[6].x, posI[7].x); __m256 piy = _mm256_set_ps(posI[0].y, posI[1].y, posI[2].y, posI[3].y, posI[4].y, posI[5].y, posI[6].y, posI[7].y); __m256 piz = _mm256_set_ps(posI[0].z, posI[1].z, posI[2].z, posI[3].z, posI[4].z, posI[5].z, posI[6].z, posI[7].z); __m256 pjx = _mm256_set1_ps(posJ.x); __m256 pjy = _mm256_set1_ps(posJ.y); __m256 pjz = _mm256_set1_ps(posJ.z); __m256 rx = _mm256_sub_ps(pjx, pix); __m256 ry = _mm256_sub_ps(pjy, piy); __m256 rz = _mm256_sub_ps(pjz, piz); __m256 eps2 = _mm256_set1_ps(mp_properties->EPS2); __m256 rx2 = _mm256_mul_ps(rx, rx); __m256 ry2 = _mm256_mul_ps(ry, ry); __m256 rz2 = _mm256_mul_ps(rz, rz); __m256 rabs = _mm256_sqrt_ps(_mm256_add_ps(_mm256_add_ps(rx2, ry2), _mm256_add_ps(rz2, eps2))); __m256 m = _mm256_set1_ps(massJ); __m256 rabsInv = _mm256_div_ps(m, _mm256_mul_ps(_mm256_mul_ps(rabs, rabs), rabs)); __m256 aix = _mm256_mul_ps(rx, rabsInv); __m256 aiy = _mm256_mul_ps(ry, rabsInv); __m256 aiz = _mm256_mul_ps(rz, rabsInv); for (int i = 0; i < 8; i++) { accI[7 - i].x = aix.m256_f32[i]; accI[7 - i].y = aiy.m256_f32[i]; accI[7 - i].z = aiz.m256_f32[i]; } }
/*
 * Render the Mandelbrot set into image, eight horizontally adjacent pixels
 * at a time, using AVX.
 *
 * image is indexed as image[(y * s->width + x) * 3 + channel], i.e. three
 * bytes per pixel with rows of s->width pixels; the same gray value is
 * written to all three channels.
 *
 * Pixel (x, y) maps to c = (xlim[0] + x * xscale, ylim[0] + y * yscale).
 * Starting from z = c, z = z*z + c is iterated; each lane counts the
 * iterations after which |z|^2 is still below 4. The pixel value is
 * sqrt(count / iterations) * (depth - 1), converted to an integer, of which
 * the first byte in memory (the low byte on little-endian x86) is stored.
 *
 * Rows are distributed over threads by OpenMP when it is enabled.
 */
void
mandel_avx(unsigned char *image, const struct spec *s)
{
    __m256 xmin = _mm256_set1_ps(s->xlim[0]);
    __m256 ymin = _mm256_set1_ps(s->ylim[0]);
    __m256 xscale = _mm256_set1_ps((s->xlim[1] - s->xlim[0]) / s->width);
    __m256 yscale = _mm256_set1_ps((s->ylim[1] - s->ylim[0]) / s->height);
    __m256 threshold = _mm256_set1_ps(4);
    __m256 one = _mm256_set1_ps(1);
    __m256 iter_scale = _mm256_set1_ps(1.0f / s->iterations);
    __m256 depth_scale = _mm256_set1_ps(s->depth - 1);

    #pragma omp parallel for schedule(dynamic, 1)
    for (int y = 0; y < s->height; y++) {
        for (int x = 0; x < s->width; x += 8) {
            __m256 mx = _mm256_set_ps(x + 7, x + 6, x + 5, x + 4,
                                      x + 3, x + 2, x + 1, x + 0);
            __m256 my = _mm256_set1_ps(y);
            __m256 cr = _mm256_add_ps(_mm256_mul_ps(mx, xscale), xmin);
            __m256 ci = _mm256_add_ps(_mm256_mul_ps(my, yscale), ymin);
            __m256 zr = cr;
            __m256 zi = ci;
            int k = 1;
            __m256 mk = _mm256_set1_ps(k);
            while (++k < s->iterations) {
                /* Compute z1 from z0 */
                __m256 zr2 = _mm256_mul_ps(zr, zr);
                __m256 zi2 = _mm256_mul_ps(zi, zi);
                __m256 zrzi = _mm256_mul_ps(zr, zi);
                /* zr1 = zr0 * zr0 - zi0 * zi0 + cr */
                /* zi1 = zr0 * zi0 + zr0 * zi0 + ci */
                zr = _mm256_add_ps(_mm256_sub_ps(zr2, zi2), cr);
                zi = _mm256_add_ps(_mm256_add_ps(zrzi, zrzi), ci);

                /* Increment the per-lane count where |z|^2 < threshold */
                zr2 = _mm256_mul_ps(zr, zr);
                zi2 = _mm256_mul_ps(zi, zi);
                __m256 mag2 = _mm256_add_ps(zr2, zi2);
                __m256 mask = _mm256_cmp_ps(mag2, threshold, _CMP_LT_OS);
                mk = _mm256_add_ps(_mm256_and_ps(mask, one), mk);

                /* Early bailout once no lane is below the threshold */
                if (_mm256_testz_ps(mask, _mm256_set1_ps(-1)))
                    break;
            }
            mk = _mm256_mul_ps(mk, iter_scale);
            mk = _mm256_sqrt_ps(mk);
            mk = _mm256_mul_ps(mk, depth_scale);
            __m256i pixels = _mm256_cvtps_epi32(mk);
            unsigned char *dst = image + y * s->width * 3 + x * 3;
            unsigned char *src = (unsigned char *)&pixels;

            /*
             * Only write the lanes that lie inside the row. When the width
             * is not a multiple of 8, the last vector of a row would
             * otherwise spill into the next row and, on the last row, past
             * the end of the image.
             */
            int lanes = s->width - x;
            if (lanes > 8)
                lanes = 8;
            for (int i = 0; i < lanes; i++) {
                dst[i * 3 + 0] = src[i * 4];
                dst[i * 3 + 1] = src[i * 4];
                dst[i * 3 + 2] = src[i * 4];
            }
        }
    }
}
/*
 * Sqrt
 *
 * Element-wise square root of a, forwarded to the backend selected at
 * compile time: USE_SSE -> _mm_sqrt_ps, USE_AVX -> _mm256_sqrt_ps,
 * USE_IBM -> vec_sqrt. The first macro that is defined wins.
 *
 * NOTE: when none of the three macros is defined the body is empty and the
 * function has no return statement.
 */
__SIMD _SIMD_sqrt_ps(__SIMD a)
{
#if defined(USE_SSE)
  return _mm_sqrt_ps(a);
#elif defined(USE_AVX)
  return _mm256_sqrt_ps(a);
#elif defined(USE_IBM)
  return vec_sqrt(a);
#endif
}
/// Euclidean distance between eight pairs of 2-D points, one pair per lane.
///
/// @param x1,y1 coordinates of the first point of each pair
/// @param x2,y2 coordinates of the second point of each pair
/// @return sqrt((x1 - x2)^2 + (y1 - y2)^2) for every lane
__m256 distance(const __m256& x1, const __m256& y1, const __m256& x2, const __m256& y2)
{
    const __m256 dx = _mm256_sub_ps(x1, x2);
    const __m256 dy = _mm256_sub_ps(y1, y2);

    // Squared length first, then a single packed square root.
    const __m256 len2 = _mm256_add_ps(_mm256_mul_ps(dx, dx), _mm256_mul_ps(dy, dy));
    return _mm256_sqrt_ps(len2);
}
inline __m256i avx2_positive_hexid_to_ringid_root(const __m256i hexid) { // The following algorithm works until hexid=12,589,056 // const unsigned iarg = 1+4*(hexid-1)/3; // return (unsigned(std::sqrt(float(iarg)))+1)/2; __m256 arg = _mm256_cvtepi32_ps(hexid); arg = _mm256_fmsub_ps(arg, calin::math::simd::c_m256(_c_m256_four_thirds), calin::math::simd::c_m256(_c_m256_one_third)); arg = _mm256_sqrt_ps(arg); arg = _mm256_fmadd_ps(arg, calin::math::simd::c_m256(_c_m256_one_half), calin::math::simd::c_m256(_c_m256_one_half)); arg = _mm256_floor_ps(arg); return _mm256_cvtps_epi32(arg); }
void NBodyAlgorithmCPU::calculateAccelerationWithColor(const float3(&posI)[8], const float massJ, const float3 posJ, float3(&accI)[8], unsigned int(&numNeighbours)[8]) { __m256 pix = _mm256_set_ps(posI[0].x, posI[1].x, posI[2].x, posI[3].x, posI[4].x, posI[5].x, posI[6].x, posI[7].x); __m256 piy = _mm256_set_ps(posI[0].y, posI[1].y, posI[2].y, posI[3].y, posI[4].y, posI[5].y, posI[6].y, posI[7].y); __m256 piz = _mm256_set_ps(posI[0].z, posI[1].z, posI[2].z, posI[3].z, posI[4].z, posI[5].z, posI[6].z, posI[7].z); __m256 pjx = _mm256_set1_ps(posJ.x); __m256 pjy = _mm256_set1_ps(posJ.y); __m256 pjz = _mm256_set1_ps(posJ.z); __m256 rx = _mm256_sub_ps(pjx, pix); __m256 ry = _mm256_sub_ps(pjy, piy); __m256 rz = _mm256_sub_ps(pjz, piz); __m256 eps2 = _mm256_set1_ps(mp_properties->EPS2); __m256 rx2 = _mm256_mul_ps(rx, rx); __m256 ry2 = _mm256_mul_ps(ry, ry); __m256 rz2 = _mm256_mul_ps(rz, rz); __m256 rabs = _mm256_sqrt_ps(_mm256_add_ps(_mm256_add_ps(rx2, ry2), _mm256_add_ps(rz2, eps2))); __m256 cmpDistance = _mm256_set1_ps(float(mp_properties->positionScale)); __m256 close = _mm256_cmp_ps(rabs, cmpDistance, 2); for (int i = 0; i < 8; i++) { if (close.m256_f32[i] == 0) { numNeighbours[7 - i] = 0; } } __m256 m = _mm256_set1_ps(massJ); __m256 rabsInv = _mm256_div_ps(m, _mm256_mul_ps(_mm256_mul_ps(rabs, rabs), rabs)); __m256 aix = _mm256_mul_ps(rx, rabsInv); __m256 aiy = _mm256_mul_ps(ry, rabsInv); __m256 aiz = _mm256_mul_ps(rz, rabsInv); for (int i = 0; i < 8; i++) { accI[7 - i].x = aix.m256_f32[i]; accI[7 - i].y = aiy.m256_f32[i]; accI[7 - i].z = aiz.m256_f32[i]; } }
// Square root of a, computed with _mm256_sqrt_ps.
inline vec8 sqrt(vec8 a)
{
    vec8 roots = _mm256_sqrt_ps(a);
    return roots;
}
/*!
 * \brief Compute the square root of each element in the given vector
 * \param x The vector whose elements are rooted (its packed value is read from x.value)
 * \return a vector containing the square root of each input element
 */
ETL_STATIC_INLINE(avx_simd_float) sqrt(avx_simd_float x) {
    const __m256 roots = _mm256_sqrt_ps(x.value);
    return roots;
}
// -----------------------------------------------------------------------------
// Main routine
// -----------------------------------------------------------------------------
// Benchmarks b[i] = sqrt(sqrt(sqrt(a[i]))) over 10,000,000 floats in three
// ways (scalar sqrtf, SSE, AVX). After each variant the elapsed time and
// b[42] are printed; b is cleared between variants.
// The elapsed time is printed as (e-s)*1000 "ms", so getCurrentTime() is
// presumably returning seconds -- confirm against its definition.
// Returns 0 on success and 1 when the buffers cannot be allocated.
int main()
{
  int i;
  srand48(0);                   // seed PRNG
  double e,s;                   // timestamp variables
  float *a, *b;                 // data pointers
  float *pA,*pB;                // work pointer
  __m128 rA,rB;                 // variables for SSE
  __m256 rA_AVX, rB_AVX;        // variables for AVX

  // define vector size (a multiple of 8, so neither SIMD loop needs a tail)
  const int vector_size = 10000000;

  // allocate memory, 32-byte aligned as required by the aligned AVX load/store
  a = (float*) _mm_malloc (vector_size*sizeof(float),32);
  b = (float*) _mm_malloc (vector_size*sizeof(float),32);
  if (!a || !b) {
    // two buffers of about 40 MB each: do not run on a failed allocation
    fprintf(stderr, "memory allocation failed\n");
    if (a) _mm_free(a);
    if (b) _mm_free(b);
    return 1;
  }

  // initialize vectors
  for(i=0;i<vector_size;i++) {
    a[i]=fabs(drand48());
    b[i]=0.0f;
  }

  // +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  // Naive implementation
  // +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  s = getCurrentTime();
  // __builtin_assume_aligned returns void*; the cast keeps this valid C++ too
  pA = (float*) __builtin_assume_aligned(a, 16);
  pB = (float*) __builtin_assume_aligned(b, 16);
  for (i=0; i<vector_size; i++){
    pB[i] = sqrtf(sqrtf(sqrtf(pA[i])));
  }
  e = getCurrentTime();
  printf("%lf ms b[42] = %lf\n",(e-s)*1000,b[42]);

  // -----------------------------------------------------------------------------
  for(i=0;i<vector_size;i++) {
    b[i]=0.0f;
  }
  // -----------------------------------------------------------------------------

  // +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  // SSE2 implementation
  // +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  pA = a; pB = b;

  s = getCurrentTime();
  for (i=0; i<vector_size; i+=4){
    rA   = _mm_load_ps(pA);
    rB   = _mm_sqrt_ps(_mm_sqrt_ps(_mm_sqrt_ps(rA)));
    _mm_store_ps(pB,rB);
    pA += 4;
    pB += 4;
  }
  e = getCurrentTime();
  printf("%lf ms b[42] = %lf\n",(e-s)*1000,b[42]);

  // -----------------------------------------------------------------------------
  for(i=0;i<vector_size;i++) {
    b[i]=0.0f;
  }
  // -----------------------------------------------------------------------------

  // +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  // AVX implementation
  // +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  pA = a; pB = b;

  s = getCurrentTime();
  for (i=0; i<vector_size; i+=8){
    rA_AVX   = _mm256_load_ps(pA);
    rB_AVX   = _mm256_sqrt_ps(_mm256_sqrt_ps(_mm256_sqrt_ps(rA_AVX)));
    _mm256_store_ps(pB,rB_AVX);
    pA += 8;
    pB += 8;
  }
  e = getCurrentTime();
  printf("%lf ms b[42] = %lf\n",(e-s)*1000,b[42]);

  _mm_free(a);
  _mm_free(b);

  return 0;
}
DLL_LOCAL void fname(const float *a00, const float *a01, const float *a02, const float *a11, const float *a12, const float *a22, float *ev0, float *ev1, float *ev2, const size_t len) { const size_t avx_end = len & ~7; __m256 v_inv3 = _mm256_set1_ps(1.0 / 3.0); __m256 v_root3 = _mm256_sqrt_ps(_mm256_set1_ps(3.0)); __m256 two = _mm256_set1_ps(2.0); __m256 half = _mm256_set1_ps(0.5); __m256 zero = _mm256_setzero_ps(); for (size_t i = 0; i < avx_end; i += 8) { __m256 v_a00 = _mm256_loadu_ps(a00 + i); __m256 v_a01 = _mm256_loadu_ps(a01 + i); __m256 v_a02 = _mm256_loadu_ps(a02 + i); __m256 v_a11 = _mm256_loadu_ps(a11 + i); __m256 v_a12 = _mm256_loadu_ps(a12 + i); __m256 v_a22 = _mm256_loadu_ps(a22 + i); __m256 c0 = _avx_sub(_avx_sub(_avx_sub(_avx_add(_avx_mul(_avx_mul(v_a00, v_a11), v_a22), _avx_mul(_avx_mul(_avx_mul(two, v_a01), v_a02), v_a12)), _avx_mul(_avx_mul(v_a00, v_a12), v_a12)), _avx_mul(_avx_mul(v_a11, v_a02), v_a02)), _avx_mul(_avx_mul(v_a22, v_a01), v_a01)); __m256 c1 = _avx_sub(_avx_add(_avx_sub(_avx_add(_avx_sub(_avx_mul(v_a00, v_a11), _avx_mul(v_a01, v_a01)), _avx_mul(v_a00, v_a22)), _avx_mul(v_a02, v_a02)), _avx_mul(v_a11, v_a22)), _avx_mul(v_a12, v_a12)); __m256 c2 = _avx_add(_avx_add(v_a00, v_a11), v_a22); __m256 c2Div3 = _avx_mul(c2, v_inv3); __m256 aDiv3 = _avx_mul(_avx_sub(c1, _avx_mul(c2, c2Div3)), v_inv3); aDiv3 = _mm256_min_ps(aDiv3, zero); __m256 mbDiv2 = _avx_mul(half, _avx_add(c0, _avx_mul(c2Div3, _avx_sub(_avx_mul(_avx_mul(two, c2Div3), c2Div3), c1)))); __m256 q = _avx_add(_avx_mul(mbDiv2, mbDiv2), _avx_mul(_avx_mul(aDiv3, aDiv3), aDiv3)); q = _mm256_min_ps(q, zero); __m256 magnitude = _mm256_sqrt_ps(_avx_neg(aDiv3)); __m256 angle = _avx_mul(atan2_256_ps(_mm256_sqrt_ps(_avx_neg(q)), mbDiv2), v_inv3); __m256 cs, sn; sincos256_ps(angle, &sn, &cs); __m256 r0 = _avx_add(c2Div3, _avx_mul(_avx_mul(two, magnitude), cs)); __m256 r1 = _avx_sub(c2Div3, _avx_mul(magnitude, _avx_add(cs, _avx_mul(v_root3, sn)))); __m256 r2 = _avx_sub(c2Div3, 
_avx_mul(magnitude, _avx_sub(cs, _avx_mul(v_root3, sn)))); __m256 v_r0_tmp = _mm256_min_ps(r0, r1); __m256 v_r1_tmp = _mm256_max_ps(r0, r1); __m256 v_r0 = _mm256_min_ps(v_r0_tmp, r2); __m256 v_r2_tmp = _mm256_max_ps(v_r0_tmp, r2); __m256 v_r1 = _mm256_min_ps(v_r1_tmp, v_r2_tmp); __m256 v_r2 = _mm256_max_ps(v_r1_tmp, v_r2_tmp); _mm256_storeu_ps(ev2 + i, v_r0); _mm256_storeu_ps(ev1 + i, v_r1); _mm256_storeu_ps(ev0 + i, v_r2); } for (size_t i = avx_end; i < len; ++i) { float inv3 = 1.0 / 3.0; float root3 = sqrt(3.0); float c0 = a00[i] * a11[i] * a22[i] + 2.0 * a01[i] * a02[i] * a12[i] - a00[i] * a12[i] * a12[i] - a11[i] * a02[i] * a02[i] - a22[i] * a01[i] * a01[i]; float c1 = a00[i] * a11[i] - a01[i] * a01[i] + a00[i] * a22[i] - a02[i] * a02[i] + a11[i] * a22[i] - a12[i] * a12[i]; float c2 = a00[i] + a11[i] + a22[i]; float c2Div3 = c2 * inv3; float aDiv3 = (c1 - c2 * c2Div3) * inv3; if (aDiv3 > 0.0) aDiv3 = 0.0; float mbDiv2 = 0.5 * (c0 + c2Div3 * (2.0 * c2Div3 * c2Div3 - c1)); float q = mbDiv2 * mbDiv2 + aDiv3 * aDiv3 * aDiv3; if (q > 0.0) q = 0.0; float magnitude = sqrt(-aDiv3); float angle = atan2(sqrt(-q), mbDiv2) * inv3; float cs = cos(angle); float sn = sin(angle); float r0 = (c2Div3 + 2.0 * magnitude * cs); float r1 = (c2Div3 - magnitude * (cs + root3 * sn)); float r2 = (c2Div3 - magnitude * (cs - root3 * sn)); if (r0 < r1) swap(&r0, &r1); if (r0 < r2) swap(&r0, &r2); if (r1 < r2) swap(&r1, &r2); ev0[i] = r0; ev1[i] = r1; ev2[i] = r2; } }
template<class Extension,class Info> struct call<sqrt_,tag::simd_(tag::arithmetic_,Extension),Info> { template<class Sig> struct result; template<class This,class A0> struct result<This(A0)> : meta::strip<A0>{};// NT2_FUNCTOR_CALL_DISPATCH( 1, typename nt2::meta::scalar_of<A0>::type, (7, (float,double,uint64_t,uint32_t,uint16_t,uint8_t,arithmetic_)) ) NT2_FUNCTOR_CALL_EVAL_IF(1, float) { A0 that = { _mm256_sqrt_ps(a0)}; return that; } NT2_FUNCTOR_CALL_EVAL_IF(1, double) { A0 that = { _mm256_sqrt_pd(a0)}; return that; } NT2_FUNCTOR_CALL_EVAL_IF(1, uint64_t) { return simd::native_cast<A0>(toint(sqrt(tofloat(a0)))); } NT2_FUNCTOR_CALL_EVAL_IF(1, uint32_t) { A0 const na = isnez(a0); A0 const z1 = add(shri(a0, 6), integral_constant<A0,16>()); A0 const z2 = add(shri(a0,10), integral_constant<A0,256>()); A0 const z3 = add(shri(a0,13), integral_constant<A0,2048>());