// Find the (u,v) cell index for eight (x,y) points at once and replace the
// input coordinates with the offset of each point from the position
// (u + v*vx, v*vy), where vx and vy are the _c_m256_vx / _c_m256_vy constants.
//
//   x_in_dx_out : in  - x coordinates;  out - x minus (u + v*vx)
//   y_in_dy_out : in  - y coordinates;  out - y minus v*vy
//   u, v        : out - integer cell indices from avx2_xy_to_uv_f
//
// NOTE(review): (u + v*vx, v*vy) is presumably the centre of cell (u,v), so
// the outputs would be offsets from the cell centre - confirm against the
// scalar implementation.
inline void avx2_xy_to_uv_with_remainder_f(
  __m256& x_in_dx_out, __m256& y_in_dy_out, __m256i& u, __m256i& v)
{
  avx2_xy_to_uv_f(x_in_dx_out, y_in_dy_out, u, v);

  // Integer indices converted back to float so they can be used in the
  // subtraction below.
  const __m256 u_as_float = _mm256_cvtepi32_ps(u);
  const __m256 v_as_float = _mm256_cvtepi32_ps(v);

  // dx = (x - u) - v*vx
  __m256 dx = _mm256_sub_ps(x_in_dx_out, u_as_float);
  dx = _mm256_fnmadd_ps(v_as_float, calin::math::simd::c_m256(_c_m256_vx), dx);
  x_in_dx_out = dx;

  // dy = y - v*vy
  y_in_dy_out = _mm256_fnmadd_ps(
    v_as_float, calin::math::simd::c_m256(_c_m256_vy), y_in_dy_out);
}
inline void avx2_xy_to_uv_f(__m256 x, __m256 y, __m256i& u, __m256i& v) { // Convert X,Y first into U,V space then round to nearest // integer. That gets us close to correct answer, mapping XY to a // lozenge-shaped space rather than hexagonal. We then correct the // four regions that lie outside the hexagonal cell assigning them // to their correct neighboring cell. // Writer's note: see ~/Google Drive/Work/calin // double dv = y*c_vy_inv; // double du = x-dv*c_vx; // u = std::lround(du); // v = std::lround(dv); // du -= u; // dv -= v; y = _mm256_mul_ps(y, calin::math::simd::c_m256(_c_m256_vy_inv)); x = _mm256_fnmadd_ps(y, calin::math::simd::c_m256(_c_m256_vx), x); u = _mm256_cvtps_epi32(x); v = _mm256_cvtps_epi32(y); x = _mm256_sub_ps(x, _mm256_cvtepi32_ps(u)); y = _mm256_sub_ps(y, _mm256_cvtepi32_ps(v)); // double c3 = dv-du; const __m256i c3 = _mm256_castps_si256(_mm256_sub_ps(y, x)); __m256i uvshift; __m256i mask; // double c1 = du+0.5*dv; // double c2 = dv+0.5*du; // if(c3<0) { // if(c1>=1) u++; // else if(c2<-1) v--; // } else { // if(c2>=1) v++; // else if(c1<-1) u--; // } uvshift = _mm256_cvtps_epi32(_mm256_fmadd_ps(y, calin::math::simd::c_m256(_c_m256_one_half), x)); mask = _mm256_srai_epi32(_mm256_xor_si256(uvshift, c3), 31); u = _mm256_blendv_epi8(u, _mm256_add_epi32(u, uvshift), mask); uvshift = _mm256_cvtps_epi32(_mm256_fmadd_ps(x, calin::math::simd::c_m256(_c_m256_one_half), y)); mask = _mm256_srai_epi32(_mm256_xor_si256(uvshift, c3), 31); v = _mm256_blendv_epi8(_mm256_add_epi32(v, uvshift), v, mask); }
// Variant of avx2_xy_to_uv_with_remainder_f for a grid that is shifted,
// scaled and rotated relative to the input coordinates.
//
// The input points are translated by (-dx,-dy), divided by `scale`, and
// rotated using (crot, srot) before the cell lookup. The resulting remainders
// are then rotated back with the inverse rotation and multiplied by `scale`.
//
//   x_in_dx_out : in  - x coordinates;  out - x remainder in the input frame
//   y_in_dy_out : in  - y coordinates;  out - y remainder in the input frame
//   u, v        : out - integer cell indices
//   crot, srot  : cosine and sine of the rotation (going by the names)
//   scale       : grid scale; its reciprocal is taken, so it must be non-zero
//   dx, dy      : grid origin offset, default 0
inline void avx2_xy_trans_to_uv_with_remainder_f(
  __m256& x_in_dx_out, __m256& y_in_dy_out, __m256i& u, __m256i& v,
  const float crot, const float srot, const float scale,
  const float dx = 0, const float dy = 0)
{
  const __m256 sin_rot = _mm256_set1_ps(srot);
  const __m256 cos_rot = _mm256_set1_ps(crot);
  const __m256 inv_scale = _mm256_set1_ps(1.0f/scale);

  // Translate to the grid origin and normalise by the scale.
  const __m256 x_offset = _mm256_set1_ps(dx);
  x_in_dx_out = _mm256_mul_ps(_mm256_sub_ps(x_in_dx_out, x_offset), inv_scale);
  const __m256 y_offset = _mm256_set1_ps(dy);
  y_in_dy_out = _mm256_mul_ps(_mm256_sub_ps(y_in_dy_out, y_offset), inv_scale);

  // Forward rotation:
  //   y_rot = y*crot - x*srot
  //   x     = x*crot + y*srot
  __m256 y_rot = _mm256_fmsub_ps(
    y_in_dy_out, cos_rot, _mm256_mul_ps(x_in_dx_out, sin_rot));
  x_in_dx_out = _mm256_mul_ps(x_in_dx_out, cos_rot);
  x_in_dx_out = _mm256_fmadd_ps(y_in_dy_out, sin_rot, x_in_dx_out);

  // Cell lookup; x_in_dx_out and y_rot now hold the remainders in the
  // rotated, unit-scale frame.
  avx2_xy_to_uv_with_remainder_f(x_in_dx_out, y_rot, u, v);

  const __m256 out_scale = _mm256_set1_ps(scale);

  // Inverse rotation and rescale. dy must be formed first because it reads
  // the rotated-frame x remainder, which is overwritten just after.
  //   dy = (y_rot*crot + x*srot) * scale
  y_in_dy_out = _mm256_mul_ps(y_rot, cos_rot);
  y_in_dy_out = _mm256_fmadd_ps(x_in_dx_out, sin_rot, y_in_dy_out);
  y_in_dy_out = _mm256_mul_ps(y_in_dy_out, out_scale);

  //   dx = (x*crot - y_rot*srot) * scale
  x_in_dx_out = _mm256_mul_ps(x_in_dx_out, cos_rot);
  x_in_dx_out = _mm256_fnmadd_ps(y_rot, sin_rot, x_in_dx_out);
  x_in_dx_out = _mm256_mul_ps(x_in_dx_out, out_scale);
}
// Codegen test for the 256-bit single-precision fused negate-multiply-add
// intrinsic. The FileCheck directives inside the body expect the negation to
// be emitted as an fsub from a vector of -0.0, followed by a call to the
// vfmadd.ps.256 LLVM intrinsic that takes the negated value as an operand.
__m256 test_mm256_fnmadd_ps(__m256 a, __m256 b, __m256 c) {
  // CHECK-LABEL: test_mm256_fnmadd_ps
  // CHECK: [[NEG:%.+]] = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
  // CHECK: @llvm.x86.fma.vfmadd.ps.256(<8 x float> [[NEG]], <8 x float> %{{.+}}, <8 x float> %{{.+}})
  const __m256 fused = _mm256_fnmadd_ps(a, b, c);
  return fused;
}
// Codegen test for the 256-bit single-precision fused negate-multiply-add
// intrinsic. The FileCheck directive inside the body expects a direct call
// to the vfnmadd.ps.256 LLVM intrinsic.
__m256 test_mm256_fnmadd_ps(__m256 a, __m256 b, __m256 c) {
  // CHECK: @llvm.x86.fma.vfnmadd.ps.256
  const __m256 fused = _mm256_fnmadd_ps(a, b, c);
  return fused;
}
// Specialisation of fnmadd_ps for template argument 2: forwards to the
// 256-bit single-precision intrinsic, which computes -(a*b) + c per lane
// with a single rounding.
// NOTE(review): the primary template is not visible here; the argument 2
// presumably selects this vector width / instruction set - confirm.
template <>
__m256 fnmadd_ps<2>(__m256 a, __m256 b, __m256 c)
{
  const __m256 fused = _mm256_fnmadd_ps(a, b, c);
  return fused;
}
#ifdef __AVX2__
/* Add up the eight lanes of `lanes8`, strictly from lane 0 to lane 7. */
static inline float molec_quadrant_lane_sum_(__m256 lanes8)
{
    float MOLEC_ALIGNAS(32) lanes[8];
    _mm256_store_ps(lanes, lanes8);
    return lanes[0] + lanes[1] + lanes[2] + lanes[3]
           + lanes[4] + lanes[5] + lanes[6] + lanes[7];
}
#endif

/*
 * Pairwise interaction between every particle of quadrant `q` and every
 * (padded) particle of the neighbouring quadrant `q_n`, eight neighbours at
 * a time, using FMA intrinsics.
 *
 * For each pair with squared distance r2 < Rcut2 the 12-6 term
 * (s^12 - s^6), with s^2 = sigLJ^2 / r2, is accumulated into the potential
 * energy and the force 24*epsLJ/r2 * (2*s^12 - s^6) * r_ij is added to
 * particle i and subtracted from particle j.
 *
 *   q      quadrant whose particles 0..q.N-1 are the "i" particles; its
 *          f_x/f_y/f_z arrays are updated in place
 *   q_n    neighbour quadrant; iterated up to q_n.N_pad in steps of 8 with
 *          aligned loads/stores; its f_x/f_y/f_z arrays are updated in place
 *   Epot_  accumulator; 4*epsLJ*sum(s^12 - s^6) is added to *Epot_
 *
 * The whole body is compiled only when __AVX2__ is defined; otherwise the
 * function does nothing.
 *
 * NOTE(review): lanes belonging to padding entries of q_n are presumably
 * kept outside the cut-off by the way padding coordinates are initialised -
 * confirm at the place where the quadrants are built.
 */
void molec_quadrant_neighbor_interaction_fma(molec_Quadrant_t q, molec_Quadrant_t q_n, float* Epot_)
{
#ifdef __AVX2__
    const __m256 sigma = _mm256_set1_ps(molec_parameter->sigLJ);
    const __m256 epsilon = _mm256_set1_ps(molec_parameter->epsLJ);
    const __m256 cutoff2 = _mm256_set1_ps(molec_parameter->Rcut2);

    const int num_i = q.N;
    const int num_j_padded = q_n.N_pad;

    /* per-lane partial sums of (s^12 - s^6), reduced at the very end */
    __m256 energy_lanes = _mm256_setzero_ps();

    __m256 one = _mm256_set1_ps(1.f);
    __m256 two = _mm256_set1_ps(2.f);
    __m256 eps_times_24 = _mm256_mul_ps(_mm256_set1_ps(24.f), epsilon);

    for(int i = 0; i < num_i; ++i)
    {
        /* broadcast particle i to all lanes */
        const __m256 pos_xi = _mm256_set1_ps(q.x[i]);
        const __m256 pos_yi = _mm256_set1_ps(q.y[i]);
        const __m256 pos_zi = _mm256_set1_ps(q.z[i]);

        /* per-lane force contributions on particle i */
        __m256 force_xi = _mm256_setzero_ps();
        __m256 force_yi = _mm256_setzero_ps();
        __m256 force_zi = _mm256_setzero_ps();

        for(int j = 0; j < num_j_padded; j += 8)
        {
            /* bumped once per batch of eight neighbours */
            if(MOLEC_CELLLIST_COUNT_INTERACTION)
                ++num_potential_interactions;

            /* positions and current forces of eight neighbours */
            const __m256 pos_xj = _mm256_load_ps(&q_n.x[j]);
            const __m256 pos_yj = _mm256_load_ps(&q_n.y[j]);
            const __m256 pos_zj = _mm256_load_ps(&q_n.z[j]);

            __m256 force_xj = _mm256_load_ps(&q_n.f_x[j]);
            __m256 force_yj = _mm256_load_ps(&q_n.f_y[j]);
            __m256 force_zj = _mm256_load_ps(&q_n.f_z[j]);

            /* separation vector and squared distance */
            const __m256 sep_x = _mm256_sub_ps(pos_xi, pos_xj);
            const __m256 sep_y = _mm256_sub_ps(pos_yi, pos_yj);
            const __m256 sep_z = _mm256_sub_ps(pos_zi, pos_zj);

            const __m256 sep_z2 = _mm256_mul_ps(sep_z, sep_z);
            const __m256 dist2 =
                _mm256_fmadd_ps(sep_x, sep_x, _mm256_fmadd_ps(sep_y, sep_y, sep_z2));

            /* all-ones in lanes where dist2 < Rcut2 */
            const __m256 in_range = _mm256_cmp_ps(dist2, cutoff2, _CMP_LT_OQ);

            /* nothing within the cut-off in this batch: skip the arithmetic */
            if(!_mm256_movemask_ps(in_range))
                continue;

            const __m256 inv_dist2 = _mm256_div_ps(one, dist2);

            const __m256 s2 = _mm256_mul_ps(_mm256_mul_ps(sigma, sigma), inv_dist2);
            const __m256 s6 = _mm256_mul_ps(_mm256_mul_ps(s2, s2), s2);
            const __m256 s12 = _mm256_mul_ps(s6, s6);

            const __m256 energy_term = _mm256_sub_ps(s12, s6);
            const __m256 force_term = _mm256_sub_ps(_mm256_mul_ps(two, s12), s6);

            /* AND with the mask zeroes the lanes outside the cut-off */
            energy_lanes = _mm256_add_ps(energy_lanes, _mm256_and_ps(energy_term, in_range));

            const __m256 scale = _mm256_mul_ps(_mm256_mul_ps(eps_times_24, inv_dist2), force_term);
            const __m256 scale_in_range = _mm256_and_ps(scale, in_range);

            /* equal and opposite force updates: +scale*sep on i, -scale*sep on j */
            force_xi = _mm256_fmadd_ps(scale_in_range, sep_x, force_xi);
            force_yi = _mm256_fmadd_ps(scale_in_range, sep_y, force_yi);
            force_zi = _mm256_fmadd_ps(scale_in_range, sep_z, force_zi);

            force_xj = _mm256_fnmadd_ps(scale_in_range, sep_x, force_xj);
            force_yj = _mm256_fnmadd_ps(scale_in_range, sep_y, force_yj);
            force_zj = _mm256_fnmadd_ps(scale_in_range, sep_z, force_zj);

            /* write the neighbour forces back */
            _mm256_store_ps(&q_n.f_x[j], force_xj);
            _mm256_store_ps(&q_n.f_y[j], force_yj);
            _mm256_store_ps(&q_n.f_z[j], force_zj);
        }

        /* fold the eight lanes into the scalar force of particle i */
        q.f_x[i] += molec_quadrant_lane_sum_(force_xi);
        q.f_y[i] += molec_quadrant_lane_sum_(force_yi);
        q.f_z[i] += molec_quadrant_lane_sum_(force_zi);
    }

    /* reduce the potential energy and apply the 4*epsLJ prefactor */
    *Epot_ += 4 * molec_parameter->epsLJ * molec_quadrant_lane_sum_(energy_lanes);
#endif
}
/* Thin wrapper around _mm256_fnmadd_ps: for each of the eight float lanes
   returns -(a*b) + c, computed as a fused operation.  */
__m256
check_mm256_fnmadd_ps (__m256 a, __m256 b, __m256 c)
{
  const __m256 fused = _mm256_fnmadd_ps (a, b, c);
  return fused;
}