/*
 * Compute: s = sqrt( t**2 - x**2 - y**2 - z**2 ), with s, t, x, y, z
 * member variables of the st_coords structure arr.
 *
 * Traverse elements randomly: each iteration picks a random 8-float
 * block, so duplicates and omissions are intentional (random-access
 * memory pattern), not a bug.
 */
void comp_s(st_coords arr, int L)
{
  /* Guard: with L < 8 the original computed rand() % 0, which is
     undefined behaviour (division by zero). Nothing to do anyway. */
  if (L < 8)
    return;

  for (int j = 0; j < L; j += 8) {
    /* Random block index, aligned to an 8-float boundary. */
    int i = (rand() % (L / 8)) * 8;

    /* NOTE(review): _mm256_load_ps requires 32-byte-aligned pointers;
       assumes arr.{x,y,z,t,s} are allocated aligned — confirm at the
       allocation site. */
    __m256 x = _mm256_load_ps(&arr.x[i]);
    __m256 y = _mm256_load_ps(&arr.y[i]);
    __m256 z = _mm256_load_ps(&arr.z[i]);
    __m256 t = _mm256_load_ps(&arr.t[i]);

    /* 'register' dropped from both branches: deprecated since C++11
       and removed in C++17; it had no effect on codegen. */
#ifdef FMA
    /* Fused path: accumulate x^2 + y^2 + z^2, subtract from t^2. */
    __m256 s0;
    s0 = _mm256_mul_ps(x, x);
    s0 = _mm256_fmadd_ps(y, y, s0);   /* x^2 + y^2            */
    s0 = _mm256_fmadd_ps(z, z, s0);   /* ... + z^2            */
    s0 = _mm256_fmsub_ps(t, t, s0);   /* t^2 - (x^2+y^2+z^2)  */
    s0 = _mm256_sqrt_ps(s0);
#else
    /* Non-FMA path: explicit multiply/add chain, same result shape. */
    __m256 s0, s1;
    s1 = _mm256_mul_ps(x, x);
    s0 = _mm256_mul_ps(y, y);
    s1 = _mm256_add_ps(s0, s1);       /* x^2 + y^2            */
    s0 = _mm256_mul_ps(z, z);
    s1 = _mm256_add_ps(s0, s1);       /* ... + z^2            */
    s0 = _mm256_mul_ps(t, t);
    s1 = _mm256_sub_ps(s0, s1);       /* t^2 - (x^2+y^2+z^2)  */
    s0 = _mm256_sqrt_ps(s1);
#endif

    _mm256_store_ps(&arr.s[i], s0);
  }
}
inline __m256i avx2_positive_hexid_to_ringid_root(const __m256i hexid) { // The following algorithm works until hexid=12,589,056 // const unsigned iarg = 1+4*(hexid-1)/3; // return (unsigned(std::sqrt(float(iarg)))+1)/2; __m256 arg = _mm256_cvtepi32_ps(hexid); arg = _mm256_fmsub_ps(arg, calin::math::simd::c_m256(_c_m256_four_thirds), calin::math::simd::c_m256(_c_m256_one_third)); arg = _mm256_sqrt_ps(arg); arg = _mm256_fmadd_ps(arg, calin::math::simd::c_m256(_c_m256_one_half), calin::math::simd::c_m256(_c_m256_one_half)); arg = _mm256_floor_ps(arg); return _mm256_cvtps_epi32(arg); }
/* Verify _mm256_fmsub_ps against a scalar reference:
   expected[i] = a[i]*b[i] - c[i] for each of the 8 lanes; abort on
   any mismatch (per the test-suite convention). */
void
check_mm256_fmsub_ps (__m256 __A, __m256 __B, __m256 __C)
{
  union256 a, b, c, e;
  float expected[8];

  a.x = __A;
  b.x = __B;
  c.x = __C;
  e.x = _mm256_fmsub_ps (__A, __B, __C);

  for (int i = 0; i < 8; i++)
    expected[i] = a.a[i] * b.a[i] - c.a[i];

  if (check_union256 (e, expected))
    abort ();
}
inline void avx2_xy_trans_to_uv_f(__m256 x, __m256 y, __m256i& u, __m256i& v, const float crot, const float srot, const float scale, const float dx = 0, const float dy = 0) { // x = (x - dx)/scale; // y = (y - dy)/scale; // double xx = x*crot + y*srot; // y = y*crot - x*srot; // xy_to_uv(xx,y,u,v); const __m256 vsrot = _mm256_set1_ps(srot); const __m256 vcrot = _mm256_set1_ps(crot); const __m256 vscale = _mm256_set1_ps(1.0f/scale); x = _mm256_mul_ps(_mm256_sub_ps(x, _mm256_set1_ps(dx)), vscale); y = _mm256_mul_ps(_mm256_sub_ps(y, _mm256_set1_ps(dy)), vscale); __m256 yy = _mm256_mul_ps(x, vsrot); yy = _mm256_fmsub_ps(y, vcrot, yy); x = _mm256_mul_ps(x, vcrot); x = _mm256_fmadd_ps(y, vsrot, x); avx2_xy_to_uv_f(x, yy, u, v); }
inline void avx2_uv_to_xy_trans_f(const __m256i u, const __m256i v, __m256& x, __m256& y, const float crot, const float srot, const float scale, const float dx = 0, const float dy = 0) { // uv_to_xy(u,v,x,y); // double xx = x*crot - y*srot; // y = scale * (y*crot + x*srot) + dy; // x = scale * xx + dx; avx2_uv_to_xy_f(u,v,x,y); const __m256 vsrot = _mm256_set1_ps(srot); const __m256 vcrot = _mm256_set1_ps(crot); const __m256 vscale = _mm256_set1_ps(scale); __m256 xx = _mm256_mul_ps(y, vsrot); xx = _mm256_fmsub_ps(x, vcrot, xx); y = _mm256_mul_ps(y, vcrot); y = _mm256_fmadd_ps(x, vsrot, y); x = xx; x = _mm256_fmadd_ps(x, vscale, _mm256_set1_ps(dx)); y = _mm256_fmadd_ps(y, vscale, _mm256_set1_ps(dy)); }
// As avx2_xy_trans_to_uv_f, but additionally returns the in-cell
// remainder, mapped back to the original (untransformed) frame:
// inputs x/y are consumed and overwritten with the remainder dx/dy.
inline void avx2_xy_trans_to_uv_with_remainder_f(
  __m256& x_in_dx_out, __m256& y_in_dy_out, __m256i& u, __m256i& v,
  const float crot, const float srot, const float scale,
  const float dx = 0, const float dy = 0)
{
  const __m256 vcrot = _mm256_set1_ps(crot);
  const __m256 vsrot = _mm256_set1_ps(srot);
  const __m256 vinvscale = _mm256_set1_ps(1.0f/scale);

  // Forward transform: de-translate, de-scale, rotate (yy = y*crot -
  // x*srot, x' = x*crot + y*srot; fusion structure unchanged).
  x_in_dx_out = _mm256_mul_ps(
      _mm256_sub_ps(x_in_dx_out, _mm256_set1_ps(dx)), vinvscale);
  y_in_dy_out = _mm256_mul_ps(
      _mm256_sub_ps(y_in_dy_out, _mm256_set1_ps(dy)), vinvscale);
  __m256 yy =
      _mm256_fmsub_ps(y_in_dy_out, vcrot, _mm256_mul_ps(x_in_dx_out, vsrot));
  x_in_dx_out =
      _mm256_fmadd_ps(y_in_dy_out, vsrot, _mm256_mul_ps(x_in_dx_out, vcrot));

  // Quantize to (u,v); x_in_dx_out and yy now hold the remainder in
  // the grid frame.
  avx2_xy_to_uv_with_remainder_f(x_in_dx_out, yy, u, v);

  // Inverse transform of the remainder: rotate back and re-scale
  // (y first, since it reads the not-yet-overwritten x remainder).
  const __m256 vscale = _mm256_set1_ps(scale);
  y_in_dy_out =
      _mm256_fmadd_ps(x_in_dx_out, vsrot, _mm256_mul_ps(yy, vcrot));
  y_in_dy_out = _mm256_mul_ps(y_in_dy_out, vscale);
  x_in_dx_out =
      _mm256_fnmadd_ps(yy, vsrot, _mm256_mul_ps(x_in_dx_out, vcrot));
  x_in_dx_out = _mm256_mul_ps(x_in_dx_out, vscale);
}
// Clang codegen test: with this lowering, fmsub is emitted as fmadd of a
// negated addend. The "CHECK" comments below are FileCheck directives —
// they are part of the test and must match the compiler's IR output
// exactly; do not edit them as ordinary comments.
__m256 test_mm256_fmsub_ps(__m256 a, __m256 b, __m256 c) {
  // CHECK-LABEL: test_mm256_fmsub_ps
  // CHECK: [[NEG:%.+]] = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
  // CHECK: @llvm.x86.fma.vfmadd.ps.256(<8 x float> %{{.+}}, <8 x float> %{{.+}}, <8 x float> [[NEG]])
  return _mm256_fmsub_ps(a, b, c);
}
// Clang codegen test: with this lowering, fmsub maps directly to the
// vfmsub intrinsic. The "CHECK" comment is a FileCheck directive — it is
// part of the test and must match the compiler's IR output exactly.
__m256 test_mm256_fmsub_ps(__m256 a, __m256 b, __m256 c) {
  // CHECK: @llvm.x86.fma.vfmsub.ps.256
  return _mm256_fmsub_ps(a, b, c);
}
__m256 check_mm256_fmsub_ps (__m256 a, __m256 b, __m256 c) { return _mm256_fmsub_ps (a, b, c); }