void gblas_quantizer::quantization(const double* input, double* output, int rows, int cols)
{
  std::cerr << "Deprecated method: gblas_quantizer::quantization()" << std::endl;
  exit(0);

  // Scalar equivalent of the SIMD loop below:
  //   for (int i = 0; i < rows*cols; i++)
  //     output[i] = (int)(input[i]/q_step + ZERO_DOT_FIVE);

  __m128d curr;
  __m128d inv_q_step = _mm_div_pd(_mm_set1_pd(1.0), _mm_set1_pd(q_step));

  const double* in_p  = input;
  double*       out_p = output;

  // Two doubles per iteration; assumes rows*cols is even and the buffers
  // are 16-byte aligned (required by _mm_load_pd/_mm_store_pd).
  for (int i = ((rows*cols) >> 1); i > 0; i--)
  {
    curr = _mm_load_pd(in_p);
    in_p += 2;
    curr = _mm_mul_pd(curr, inv_q_step);             // x / q_step
    curr = _mm_add_pd(curr, _MM_ZERO_DOT_FIVE_D);    // + 0.5
    curr = _mm_cvtepi32_pd(_mm_cvttpd_epi32(curr));  // truncate, back to double
    _mm_store_pd(out_p, curr);
    out_p += 2;
  }
}
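/* The quantizer above rounds by adding 0.5 and truncating with
 * _mm_cvttpd_epi32(). A minimal standalone sketch of that trick (names and
 * values here are illustrative, not from gblas): truncation after +0.5
 * implements round-half-up only for non-negative inputs, which the SIMD
 * loop implicitly assumes.
 */
#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
    __m128d v = _mm_setr_pd(2.6, -2.6);
    __m128d r = _mm_add_pd(v, _mm_set1_pd(0.5));   /* 3.1, -2.1 */
    __m128i t = _mm_cvttpd_epi32(r);               /* truncate toward zero */
    printf("%d %d\n",
           _mm_cvtsi128_si32(t),
           _mm_cvtsi128_si32(_mm_srli_si128(t, 4)));
    /* prints "3 -2": the negative input ends up rounded toward zero
       instead of to the nearest integer (-3). */
    return 0;
}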
__m128i test_mm_cvttpd_epi32(__m128d A) {
  // DAG-LABEL: test_mm_cvttpd_epi32
  // DAG: call <4 x i32> @llvm.x86.sse2.cvttpd2dq
  //
  // ASM-LABEL: test_mm_cvttpd_epi32
  // ASM: cvttpd2dq
  return _mm_cvttpd_epi32(A);
}
SIMDValue SIMDInt32x4Operation::OpFromFloat64x2(const SIMDValue& value)
{
    X86SIMDValue x86Result;
    X86SIMDValue v = X86SIMDValue::ToX86SIMDValue(value);

    // Convert the 2 double-precision floating-point values to 32-bit signed
    // integers using truncation (_mm_cvttpd_epi32 instead of _mm_cvtpd_epi32).
    x86Result.m128i_value = _mm_cvttpd_epi32(v.m128d_value);

    return X86SIMDValue::ToSIMDValue(x86Result);
}
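/* The comment above draws the distinction that matters here:
 * _mm_cvttpd_epi32() always truncates toward zero, while _mm_cvtpd_epi32()
 * honors the current MXCSR rounding mode (round-to-nearest-even by
 * default). A minimal sketch of the difference:
 */
#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
    __m128d v = _mm_setr_pd(1.7, -1.7);
    __m128i t = _mm_cvttpd_epi32(v); /* truncate:          1, -1 */
    __m128i r = _mm_cvtpd_epi32(v);  /* round-to-nearest:  2, -2 */
    printf("trunc: %d %d\n", _mm_cvtsi128_si32(t),
           _mm_cvtsi128_si32(_mm_srli_si128(t, 4)));
    printf("round: %d %d\n", _mm_cvtsi128_si32(r),
           _mm_cvtsi128_si32(_mm_srli_si128(r, 4)));
    return 0;
}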
/* xvm_expma:
 *   Compute the component-wise exponential minus <a>:
 *       r[i] <-- e^x[i] - a
 *
 *   The following comments apply to the SSE2 version of this code:
 *
 *   Computation is done four doubles at a time by doing computation in
 *   parallel on two vectors of two doubles using SSE2 intrinsics. If size
 *   is not a multiple of 4, the remaining elements are computed using the
 *   stdlib exp().
 *
 *   The computation is done by first doing a range reduction of the
 *   argument of the type e^x = 2^k * e^f, choosing k and f so that f is in
 *   [-0.5, 0.5]. Then 2^k can be computed exactly using bit operations to
 *   build the double result and e^f can be efficiently computed with enough
 *   precision using a polynomial approximation.
 *
 *   The polynomial approximation is done with an 11th-order polynomial
 *   computed by the Remez algorithm with the Sollya suite, instead of the
 *   more classical Pade polynomial form, because it is better suited to
 *   parallel execution. In order to achieve the same precision, a Pade form
 *   seems to require three fewer multiplications but needs a very costly
 *   division, so it would be less efficient.
 *
 *   The maximum error is less than 1 lsb and special cases are correctly
 *   handled:
 *       +inf or +oor  -->  return +inf
 *       -inf or -oor  -->  return  0.0
 *       qNaN or sNaN  -->  return qNaN
 *
 *   This code is copyright 2004-2012 Thomas Lavergne and licensed under the
 *   BSD license like the rest of Wapiti.
 */
void xvm_expma(double r[], const double x[], double a, uint64_t N) {
#if defined(__SSE2__) && !defined(XVM_ANSI)
  #define xvm_vconst(v) (_mm_castsi128_pd(_mm_set1_epi64x((v))))
	assert(r != NULL && ((uintptr_t)r % 16) == 0);
	assert(x != NULL && ((uintptr_t)x % 16) == 0);
	const __m128i vl  = _mm_set1_epi64x(0x3ff0000000000000ULL);
	const __m128d ehi = xvm_vconst(0x4086232bdd7abcd2ULL);
	const __m128d elo = xvm_vconst(0xc086232bdd7abcd2ULL);
	const __m128d l2e = xvm_vconst(0x3ff71547652b82feULL);
	const __m128d hal = xvm_vconst(0x3fe0000000000000ULL);
	const __m128d nan = xvm_vconst(0xfff8000000000000ULL);
	const __m128d inf = xvm_vconst(0x7ff0000000000000ULL);
	const __m128d c1  = xvm_vconst(0x3fe62e4000000000ULL);
	const __m128d c2  = xvm_vconst(0x3eb7f7d1cf79abcaULL);
	const __m128d p0  = xvm_vconst(0x3feffffffffffffeULL);
	const __m128d p1  = xvm_vconst(0x3ff000000000000bULL);
	const __m128d p2  = xvm_vconst(0x3fe0000000000256ULL);
	const __m128d p3  = xvm_vconst(0x3fc5555555553a2aULL);
	const __m128d p4  = xvm_vconst(0x3fa55555554e57d3ULL);
	const __m128d p5  = xvm_vconst(0x3f81111111362f4fULL);
	const __m128d p6  = xvm_vconst(0x3f56c16c25f3bae1ULL);
	const __m128d p7  = xvm_vconst(0x3f2a019fc9310c33ULL);
	const __m128d p8  = xvm_vconst(0x3efa01825f3cb28bULL);
	const __m128d p9  = xvm_vconst(0x3ec71e2bd880fdd8ULL);
	const __m128d p10 = xvm_vconst(0x3e9299068168ac8fULL);
	const __m128d p11 = xvm_vconst(0x3e5ac52350b60b19ULL);
	const __m128d va  = _mm_set1_pd(a);
	for (uint64_t n = 0; n < N; n += 4) {
		__m128d mn1, mn2, mi1, mi2;
		__m128d t1, t2, d1, d2;
		__m128d v1, v2, w1, w2;
		__m128i k1, k2;
		__m128d f1, f2;
		// Load the next four values
		__m128d x1 = _mm_load_pd(x + n    );
		__m128d x2 = _mm_load_pd(x + n + 2);
		// Check for out-of-range values, infinities and NaN
		mn1 = _mm_cmpneq_pd(x1, x1);  mn2 = _mm_cmpneq_pd(x2, x2);
		mi1 = _mm_cmpgt_pd(x1, ehi);  mi2 = _mm_cmpgt_pd(x2, ehi);
		x1  = _mm_max_pd(x1, elo);    x2  = _mm_max_pd(x2, elo);
		// Range reduction: we search k and f such that e^x = 2^k * e^f
		// with f in [-0.5, 0.5]
		t1  = _mm_mul_pd(x1, l2e);    t2  = _mm_mul_pd(x2, l2e);
		t1  = _mm_add_pd(t1, hal);    t2  = _mm_add_pd(t2, hal);
		k1  = _mm_cvttpd_epi32(t1);   k2  = _mm_cvttpd_epi32(t2);
		d1  = _mm_cvtepi32_pd(k1);    d2  = _mm_cvtepi32_pd(k2);
		t1  = _mm_mul_pd(d1, c1);     t2  = _mm_mul_pd(d2, c1);
		f1  = _mm_sub_pd(x1, t1);     f2  = _mm_sub_pd(x2, t2);
		t1  = _mm_mul_pd(d1, c2);     t2  = _mm_mul_pd(d2, c2);
		f1  = _mm_sub_pd(f1, t1);     f2  = _mm_sub_pd(f2, t2);
		// Evaluation of e^f using an 11th-order polynomial in Horner form
		v1  = _mm_mul_pd(f1, p11);    v2  = _mm_mul_pd(f2, p11);
		v1  = _mm_add_pd(v1, p10);    v2  = _mm_add_pd(v2, p10);
		v1  = _mm_mul_pd(v1, f1);     v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p9);     v2  = _mm_add_pd(v2, p9);
		v1  = _mm_mul_pd(v1, f1);     v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p8);     v2  = _mm_add_pd(v2, p8);
		v1  = _mm_mul_pd(v1, f1);     v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p7);     v2  = _mm_add_pd(v2, p7);
		v1  = _mm_mul_pd(v1, f1);     v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p6);     v2  = _mm_add_pd(v2, p6);
		v1  = _mm_mul_pd(v1, f1);     v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p5);     v2  = _mm_add_pd(v2, p5);
		v1  = _mm_mul_pd(v1, f1);     v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p4);     v2  = _mm_add_pd(v2, p4);
		v1  = _mm_mul_pd(v1, f1);     v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p3);     v2  = _mm_add_pd(v2, p3);
		v1  = _mm_mul_pd(v1, f1);     v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p2);     v2  = _mm_add_pd(v2, p2);
		v1  = _mm_mul_pd(v1, f1);     v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p1);     v2  = _mm_add_pd(v2, p1);
		v1  = _mm_mul_pd(v1, f1);     v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p0);     v2  = _mm_add_pd(v2, p0);
		// Evaluation of 2^k using bitops to achieve exact computation
		k1  = _mm_slli_epi32(k1, 20);       k2  = _mm_slli_epi32(k2, 20);
		k1  = _mm_shuffle_epi32(k1, 0x72);  k2  = _mm_shuffle_epi32(k2, 0x72);
		k1  = _mm_add_epi32(k1, vl);        k2  = _mm_add_epi32(k2, vl);
		w1  = _mm_castsi128_pd(k1);         w2  = _mm_castsi128_pd(k2);
		// Return to full range to subtract <a>
		v1  = _mm_mul_pd(v1, w1);     v2  = _mm_mul_pd(v2, w2);
		v1  = _mm_sub_pd(v1, va);     v2  = _mm_sub_pd(v2, va);
		// Finally apply infinity and NaN where needed
		v1  = _mm_or_pd(_mm_and_pd(mi1, inf), _mm_andnot_pd(mi1, v1));
		v2  = _mm_or_pd(_mm_and_pd(mi2, inf), _mm_andnot_pd(mi2, v2));
		v1  = _mm_or_pd(_mm_and_pd(mn1, nan), _mm_andnot_pd(mn1, v1));
		v2  = _mm_or_pd(_mm_and_pd(mn2, nan), _mm_andnot_pd(mn2, v2));
		// Store the results
		_mm_store_pd(r + n,     v1);
		_mm_store_pd(r + n + 2, v2);
	}
#else
	for (uint64_t n = 0; n < N; n++)
		r[n] = exp(x[n]) - a;
#endif
}
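/* The "2^k using bitops" step in xvm_expma() relies on the IEEE-754 layout
 * of a double: placing k+1023 in the exponent field yields exactly 2^k.
 * A scalar sketch of the same trick (pow2i is a hypothetical helper, not
 * part of Wapiti); valid for the normal range -1022 <= k <= 1023:
 */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

static double pow2i(int k)
{
    uint64_t bits = (uint64_t)(k + 1023) << 52; /* exponent field of a double */
    double d;
    memcpy(&d, &bits, sizeof d);
    return d;
}

int main(void)
{
    printf("%g %g %g\n", pow2i(0), pow2i(10), pow2i(-3)); /* 1 1024 0.125 */
    return 0;
}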
void AVXFMA4DNoise(Vector3d& result, const Vector3d& EPoint)
{
    DBL x, y, z;
    int ix, iy, iz;
    int ixiy_hash, ixjy_hash, jxiy_hash, jxjy_hash;

    // TODO FIXME - global statistics reference
    // Stats[Calls_To_DNoise]++;

    x = EPoint[X];
    y = EPoint[Y];
    z = EPoint[Z];

    /* its equivalent integer lattice point. */
    /*ix = (int)x; iy = (int)y; iz = (int)z;
    x_ix = x - ix; y_iy = y - iy; z_iz = z - iz;*/
    /* JB fix for the range problem */

    __m128d xy = _mm_setr_pd(x, y);
    __m128d zn = _mm_set_sd(z);
    __m128d epsy = _mm_set1_pd(1.0 - EPSILON);
    __m128d xy_e = _mm_sub_pd(xy, epsy);
    __m128d zn_e = _mm_sub_sd(zn, epsy);
    // Select the shifted value wherever the input is negative (blendv keys
    // on the sign bit), so the truncating conversion below behaves as floor().
    __m128i tmp_xy = _mm_cvttpd_epi32(_mm_blendv_pd(xy, xy_e, xy));
    __m128i tmp_zn = _mm_cvttpd_epi32(_mm_blendv_pd(zn, zn_e, zn));

    __m128i noise_min_xy = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, 0, 0);
    __m128i noise_min_zn = _mm_set1_epi32(NOISE_MINZ);

    __m128d xy_ixy = _mm_sub_pd(xy, _mm_cvtepi32_pd(tmp_xy));
    __m128d zn_izn = _mm_sub_sd(zn, _mm_cvtepi32_pd(tmp_zn));

    const __m128i fff = _mm_set1_epi32(0xfff);
    __m128i i_xy = _mm_and_si128(_mm_sub_epi32(tmp_xy, noise_min_xy), fff);
    __m128i i_zn = _mm_and_si128(_mm_sub_epi32(tmp_zn, noise_min_zn), fff);

    ix = _mm_extract_epi32(i_xy, 0);
    iy = _mm_extract_epi32(i_xy, 1);
    iz = _mm_extract_epi32(i_zn, 0);

    ixiy_hash = Hash2d(ix, iy);
    jxiy_hash = Hash2d(ix + 1, iy);
    ixjy_hash = Hash2d(ix, iy + 1);
    jxjy_hash = Hash2d(ix + 1, iy + 1);

    DBL* mp1 = &RTable[Hash1dRTableIndex(ixiy_hash, iz)];
    DBL* mp2 = &RTable[Hash1dRTableIndex(jxiy_hash, iz)];
    DBL* mp3 = &RTable[Hash1dRTableIndex(jxjy_hash, iz)];
    DBL* mp4 = &RTable[Hash1dRTableIndex(ixjy_hash, iz)];
    DBL* mp5 = &RTable[Hash1dRTableIndex(ixjy_hash, iz + 1)];
    DBL* mp6 = &RTable[Hash1dRTableIndex(jxjy_hash, iz + 1)];
    DBL* mp7 = &RTable[Hash1dRTableIndex(jxiy_hash, iz + 1)];
    DBL* mp8 = &RTable[Hash1dRTableIndex(ixiy_hash, iz + 1)];

    const __m128d three = _mm_set1_pd(3.0);
    const __m128d two   = _mm_set1_pd(2.0);
    const __m128d one   = _mm_set1_pd(1.0);

    __m128d ix_mm = _mm_unpacklo_pd(xy_ixy, xy_ixy);
    __m128d iy_mm = _mm_unpackhi_pd(xy_ixy, xy_ixy);
    __m128d iz_mm = _mm_unpacklo_pd(zn_izn, zn_izn);

    __m128d jx_mm = _mm_sub_pd(ix_mm, one);
    __m128d jy_mm = _mm_sub_pd(iy_mm, one);
    __m128d jz_mm = _mm_sub_pd(iz_mm, one);

    // Smoothstep weights s = x^2 * (3 - 2x); _mm_nmacc_pd(a,b,c) = c - a*b.
    __m128d mm_sz  = _mm_mul_pd(_mm_mul_pd(iz_mm, iz_mm), _mm_nmacc_pd(two, iz_mm, three));
    __m128d mm_tz  = _mm_sub_pd(one, mm_sz);
    __m128d mm_sxy = _mm_mul_pd(_mm_mul_pd(xy_ixy, xy_ixy), _mm_nmacc_pd(two, xy_ixy, three));
    __m128d mm_txy = _mm_sub_pd(one, mm_sxy);

    __m128d mm_tysy      = _mm_unpackhi_pd(mm_txy, mm_sxy);
    __m128d mm_txty_txsy = _mm_mul_pd(_mm_unpacklo_pd(mm_txy, mm_txy), mm_tysy);
    __m128d mm_sxty_sxsy = _mm_mul_pd(_mm_unpacklo_pd(mm_sxy, mm_sxy), mm_tysy);

    __m128d mm_txty_txsy_tz = _mm_mul_pd(mm_txty_txsy, mm_tz);
    __m128d mm_txty_txsy_sz = _mm_mul_pd(mm_txty_txsy, mm_sz);
    __m128d mm_sxty_sxsy_tz = _mm_mul_pd(mm_sxty_sxsy, mm_tz);
    __m128d mm_sxty_sxsy_sz = _mm_mul_pd(mm_sxty_sxsy, mm_sz);

    __m128d mp_t1, mp_t2, mp1_mm, mp2_mm, mp4_mm, mp6_mm, sum_p;
    __m128d sum_X_Y = _mm_setzero_pd();
    __m128d sum__Z  = _mm_setzero_pd();

    __m128d mm_s1 = _mm_unpacklo_pd(mm_txty_txsy_tz, mm_txty_txsy_tz);
    INCRSUMP2(mp1, mp1 + 8, mm_s1, ix_mm, iy_mm, iz_mm, sum_X_Y);

    __m128d mm_s2 = _mm_unpacklo_pd(mm_sxty_sxsy_tz, mm_sxty_sxsy_tz);
    INCRSUMP2(mp2, mp2 + 8, mm_s2, jx_mm, iy_mm, iz_mm, sum_X_Y);

    __m128d mm_s3 = _mm_unpackhi_pd(mm_sxty_sxsy_tz, mm_sxty_sxsy_tz);
    INCRSUMP2(mp3, mp3 + 8, mm_s3, jx_mm, jy_mm, iz_mm, sum_X_Y);

    __m128d mm_s4 = _mm_unpackhi_pd(mm_txty_txsy_tz, mm_txty_txsy_tz);
    INCRSUMP2(mp4, mp4 + 8, mm_s4, ix_mm, jy_mm, iz_mm, sum_X_Y);

    __m128d mm_s5 = _mm_unpackhi_pd(mm_txty_txsy_sz, mm_txty_txsy_sz);
    INCRSUMP2(mp5, mp5 + 8, mm_s5, ix_mm, jy_mm, jz_mm, sum_X_Y);

    __m128d mm_s6 = _mm_unpackhi_pd(mm_sxty_sxsy_sz, mm_sxty_sxsy_sz);
    INCRSUMP2(mp6, mp6 + 8, mm_s6, jx_mm, jy_mm, jz_mm, sum_X_Y);

    __m128d mm_s7 = _mm_unpacklo_pd(mm_sxty_sxsy_sz, mm_sxty_sxsy_sz);
    INCRSUMP2(mp7, mp7 + 8, mm_s7, jx_mm, iy_mm, jz_mm, sum_X_Y);

    __m128d mm_s8 = _mm_unpacklo_pd(mm_txty_txsy_sz, mm_txty_txsy_sz);
    INCRSUMP2(mp8, mp8 + 8, mm_s8, ix_mm, iy_mm, jz_mm, sum_X_Y);

    __m128d iy_jy = _mm_unpacklo_pd(iy_mm, jy_mm);
    INCRSUMP2(mp1 + 16, mp4 + 16, mm_txty_txsy_tz, ix_mm, iy_jy, iz_mm, sum__Z);
    INCRSUMP2(mp8 + 16, mp5 + 16, mm_txty_txsy_sz, ix_mm, iy_jy, jz_mm, sum__Z);
    INCRSUMP2(mp2 + 16, mp3 + 16, mm_sxty_sxsy_tz, jx_mm, iy_jy, iz_mm, sum__Z);
    INCRSUMP2(mp7 + 16, mp6 + 16, mm_sxty_sxsy_sz, jx_mm, iy_jy, jz_mm, sum__Z);

    sum__Z = _mm_hadd_pd(sum__Z, sum__Z);

    _mm_storeu_pd(*result, sum_X_Y);
    _mm_store_sd(&result[Z], sum__Z);
}
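/* Both noise routines floor their inputs without touching the rounding
 * mode: _mm_blendv_pd(xy, xy_e, xy) picks the lane shifted down by
 * (1 - EPSILON) wherever the sign bit of xy is set, so the truncating
 * conversion rounds negative values downward. A standalone sketch of the
 * idea (floor2_epi32 and the 1e-10 epsilon are illustrative choices, not
 * POV-Ray's):
 */
#include <smmintrin.h> /* SSE4.1 for _mm_blendv_pd */
#include <stdio.h>

static __m128i floor2_epi32(__m128d v)
{
    /* Shift negative inputs down by almost 1, so truncation toward zero
       lands on the lattice point below. */
    __m128d shifted = _mm_sub_pd(v, _mm_set1_pd(1.0 - 1e-10));
    return _mm_cvttpd_epi32(_mm_blendv_pd(v, shifted, v));
}

int main(void)
{
    __m128i r = floor2_epi32(_mm_setr_pd(2.7, -2.7));
    printf("%d %d\n", _mm_cvtsi128_si32(r),
           _mm_cvtsi128_si32(_mm_srli_si128(r, 4))); /* 2 -3 */
    return 0;
}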
DBL AVXFMA4Noise(const Vector3d& EPoint, int noise_generator)
{
    DBL x, y, z;
    DBL *mp;
    int ix, iy, iz;
    int ixiy_hash, ixjy_hash, jxiy_hash, jxjy_hash;
    DBL sum;

    // TODO FIXME - global statistics reference
    // Stats[Calls_To_Noise]++;

    if (noise_generator == kNoiseGen_Perlin)
    {
        // The 1.59 and 0.985 are to correct for some biasing problems with
        // the random # generator used to create the noise tables. Final
        // range of values is about 5.0e-4 below 0.0 and above 1.0. Mean
        // value is 0.49 (ideally it would be 0.5).
        sum = 0.5 * (1.59 * SolidNoise(EPoint) + 0.985);

        // Clamp final value to 0-1 range
        if (sum < 0.0) sum = 0.0;
        if (sum > 1.0) sum = 1.0;

        return sum;
    }

    x = EPoint[X];
    y = EPoint[Y];
    z = EPoint[Z];

    /* its equivalent integer lattice point. */
    /* ix = (int)x; iy = (int)y; iz = (long)z; */
    /* JB fix for the range problem */

    __m128d xy = _mm_setr_pd(x, y);
    __m128d zn = _mm_set_sd(z);
    __m128d epsy = _mm_set1_pd(1.0 - EPSILON);
    __m128d xy_e = _mm_sub_pd(xy, epsy);
    __m128d zn_e = _mm_sub_sd(zn, epsy);
    __m128i tmp_xy = _mm_cvttpd_epi32(_mm_blendv_pd(xy, xy_e, xy));
    __m128i tmp_zn = _mm_cvttpd_epi32(_mm_blendv_pd(zn, zn_e, zn));

    __m128i noise_min_xy = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, 0, 0);
    __m128i noise_min_zn = _mm_set1_epi32(NOISE_MINZ);

    __m128d xy_ixy = _mm_sub_pd(xy, _mm_cvtepi32_pd(tmp_xy));
    __m128d zn_izn = _mm_sub_sd(zn, _mm_cvtepi32_pd(tmp_zn));

    const __m128i fff = _mm_set1_epi32(0xfff);
    __m128i i_xy = _mm_and_si128(_mm_sub_epi32(tmp_xy, noise_min_xy), fff);
    __m128i i_zn = _mm_and_si128(_mm_sub_epi32(tmp_zn, noise_min_zn), fff);

    ix = _mm_extract_epi32(i_xy, 0);
    iy = _mm_extract_epi32(i_xy, 1);
    iz = _mm_extract_epi32(i_zn, 0);

    ixiy_hash = Hash2d(ix, iy);
    jxiy_hash = Hash2d(ix + 1, iy);
    ixjy_hash = Hash2d(ix, iy + 1);
    jxjy_hash = Hash2d(ix + 1, iy + 1);

    mp = &RTable[Hash1dRTableIndex(ixiy_hash, iz)];
    DBL *mp2 = &RTable[Hash1dRTableIndex(ixjy_hash, iz)];
    DBL *mp3 = &RTable[Hash1dRTableIndex(ixiy_hash, iz + 1)];
    DBL *mp4 = &RTable[Hash1dRTableIndex(ixjy_hash, iz + 1)];
    DBL *mp5 = &RTable[Hash1dRTableIndex(jxiy_hash, iz)];
    DBL *mp6 = &RTable[Hash1dRTableIndex(jxjy_hash, iz)];
    DBL *mp7 = &RTable[Hash1dRTableIndex(jxiy_hash, iz + 1)];
    DBL *mp8 = &RTable[Hash1dRTableIndex(jxjy_hash, iz + 1)];

    const __m128d three = _mm_set1_pd(3.0);
    const __m128d two   = _mm_set1_pd(2.0);
    const __m128d one   = _mm_set1_pd(1.0);

    __m128d ix_mm = _mm_unpacklo_pd(xy_ixy, xy_ixy);
    __m128d iy_mm = _mm_unpackhi_pd(xy_ixy, xy_ixy);
    __m128d iz_mm = _mm_unpacklo_pd(zn_izn, zn_izn);

    __m128d jx_mm = _mm_sub_pd(ix_mm, one);
    __m128d jy_mm = _mm_sub_pd(iy_mm, one);
    __m128d jz_mm = _mm_sub_pd(iz_mm, one);

    __m128d mm_sxy = _mm_mul_pd(_mm_mul_pd(xy_ixy, xy_ixy), _mm_nmacc_pd(two, xy_ixy, three));
    __m128d mm_sz  = _mm_mul_pd(_mm_mul_pd(iz_mm, iz_mm), _mm_nmacc_pd(two, iz_mm, three));

    __m128d mm_tz  = _mm_sub_pd(one, mm_sz);
    __m128d mm_txy = _mm_sub_pd(one, mm_sxy);

    __m128d mm_tysy      = _mm_unpackhi_pd(mm_txy, mm_sxy);
    __m128d mm_txty_txsy = _mm_mul_pd(_mm_unpacklo_pd(mm_txy, mm_txy), mm_tysy);
    __m128d mm_sxty_sxsy = _mm_mul_pd(_mm_unpacklo_pd(mm_sxy, mm_sxy), mm_tysy);

    __m128d y_mm = _mm_unpacklo_pd(iy_mm, jy_mm);

    __m128d mp_t1, mp_t2, mp1_mm, mp2_mm, mp4_mm, mp6_mm, sum_p, s_mm;
    __m128d int_sum1 = _mm_setzero_pd();

    s_mm = _mm_mul_pd(mm_txty_txsy, mm_tz);
    INCRSUMP2(mp, mp2, s_mm, ix_mm, y_mm, iz_mm, int_sum1);

    s_mm = _mm_mul_pd(mm_txty_txsy, mm_sz);
    INCRSUMP2(mp3, mp4, s_mm, ix_mm, y_mm, jz_mm, int_sum1);

    s_mm = _mm_mul_pd(mm_sxty_sxsy, mm_tz);
    INCRSUMP2(mp5, mp6, s_mm, jx_mm, y_mm, iz_mm, int_sum1);

    s_mm = _mm_mul_pd(mm_sxty_sxsy, mm_sz);
    INCRSUMP2(mp7, mp8, s_mm, jx_mm, y_mm, jz_mm, int_sum1);

    int_sum1 = _mm_hadd_pd(int_sum1, int_sum1);

    if (noise_generator == kNoiseGen_RangeCorrected)
    {
        /* details of range here:
           Min, max: -1.05242, 0.988997
           Mean: -0.0191481, Median: -0.535493, Std Dev: 0.256828

           We want to change it to as close to [0,1] as possible. */
        const __m128d r2   = _mm_set_sd(0.48985582);
        const __m128d r1r2 = _mm_set_sd(1.05242 * 0.48985582);
        /* sum <- 0.48985582 * (sum + 1.05242), which maps the observed
           [-1.05242, 0.988997] range onto approximately [0, 1]. */
        int_sum1 = _mm_macc_sd(int_sum1, r2, r1r2);
    }
    else
    {
        int_sum1 = _mm_add_sd(int_sum1, _mm_set_sd(0.5));
    }

    int_sum1 = _mm_min_sd(one, int_sum1);
    int_sum1 = _mm_max_sd(_mm_setzero_pd(), int_sum1);
    _mm_store_sd(&sum, int_sum1);

    return (sum);
}
static inline __m128d
my_invrsq_pd(__m128d x)
{
	const __m128d three = (const __m128d) {3.0, 3.0};
	const __m128d half  = (const __m128d) {0.5, 0.5};

	__m128  t  = _mm_rsqrt_ps(_mm_cvtpd_ps(x)); /* Convert to single precision and do _mm_rsqrt_ps() */
	__m128d t1 = _mm_cvtps_pd(t);               /* Convert back to double precision */

	/* First Newton-Raphson step, accuracy is now 24 bits */
	__m128d t2 = _mm_mul_pd(half, _mm_mul_pd(t1, _mm_sub_pd(three, _mm_mul_pd(x, _mm_mul_pd(t1, t1)))));

	/* Return second Newton-Raphson step, accuracy 48 bits */
	return (__m128d) _mm_mul_pd(half, _mm_mul_pd(t2, _mm_sub_pd(three, _mm_mul_pd(x, _mm_mul_pd(t2, t2)))));
}

/* to extract single 32-bit integers from a __m128i datatype
   (despite the name, offsets are in 32-bit units) */
#define _mm_extract_epi64(x, imm) \
	_mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))

void nb_kernel400_x86_64_sse2(int * p_nri, int * iinr, int * jindex, int * jjnr,
                              int * shift, double * shiftvec, double * fshift,
                              int * gid, double * pos, double * faction,
                              double * charge, double * p_facel, double * p_krf,
                              double * p_crf, double * Vc, int * type,
                              int * p_ntype, double * vdwparam, double * Vvdw,
                              double * p_tabscale, double * VFtab,
                              double * invsqrta, double * dvda,
                              double * p_gbtabscale, double * GBtab,
                              int * p_nthreads, int * count, void * mtx,
                              int * outeriter, int * inneriter, double * work)
{
	int nri,ntype,nthreads,offset;
	int n,ii,is3,ii3,k,nj0,nj1,jnr1,jnr2,j13,j23,ggid;
	double facel,krf,crf,tabscl,gbtabscl,vct,vgbt;
	double shX,shY,shZ,isai_d,dva;
	gmx_gbdata_t *gbdata;
	float * gpol;

	__m128d ix,iy,iz,jx,jy,jz;
	__m128d dx,dy,dz,t1,t2,t3;
	__m128d fix,fiy,fiz,rsq11,rinv,r,fscal,rt,eps,eps2;
	__m128d q,iq,qq,isai,isaj,isaprod,vcoul,gbscale,dvdai,dvdaj;
	__m128d Y,F,G,H,Fp,VV,FF,vgb,fijC,dvdatmp,dvdasum,vctot,vgbtot,n0d;
	__m128d xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;
	__m128d fac,tabscale,gbtabscale;
	__m128i n0,nnn;

	const __m128d neg   = {-1.0,-1.0};
	const __m128d zero  = { 0.0, 0.0};
	const __m128d half  = { 0.5, 0.5};
	const __m128d two   = { 2.0, 2.0};
	const __m128d three = { 3.0, 3.0};

	gbdata = (gmx_gbdata_t *)work;
	gpol   = gbdata->gpol;

	nri      = *p_nri;
	ntype    = *p_ntype;
	nthreads = *p_nthreads;
	facel    = (*p_facel) * (1.0 - (1.0/gbdata->gb_epsilon_solvent));
	krf      = *p_krf;
	crf      = *p_crf;
	tabscl   = *p_tabscale;
	gbtabscl = *p_gbtabscale;
	nj1      = 0;

	/* Splat variables */
	fac        = _mm_load1_pd(&facel);
	tabscale   = _mm_load1_pd(&tabscl);
	gbtabscale = _mm_load1_pd(&gbtabscl);

	/* Keep compiler happy */
	dvdatmp = _mm_setzero_pd();
	vgb     = _mm_setzero_pd();
	dvdaj   = _mm_setzero_pd();
	isaj    = _mm_setzero_pd();
	vcoul   = _mm_setzero_pd();
	t1      = _mm_setzero_pd();
	t2      = _mm_setzero_pd();
	t3      = _mm_setzero_pd();

	jnr1 = jnr2 = 0;
	j13  = j23  = 0;

	for(n=0; n<nri; n++)
	{
		is3 = 3*shift[n];
		shX = shiftvec[is3];
		shY = shiftvec[is3+1];
		shZ = shiftvec[is3+2];

		nj0    = jindex[n];
		nj1    = jindex[n+1];
		offset = (nj1-nj0)%2;

		ii  = iinr[n];
		ii3 = ii*3;

		ix = _mm_set1_pd(shX+pos[ii3+0]);
		iy = _mm_set1_pd(shY+pos[ii3+1]);
		iz = _mm_set1_pd(shZ+pos[ii3+2]);

		q  = _mm_set1_pd(charge[ii]);
		iq = _mm_mul_pd(fac,q);

		isai_d = invsqrta[ii];
		isai   = _mm_load1_pd(&isai_d);

		fix = _mm_setzero_pd();
		fiy = _mm_setzero_pd();
		fiz = _mm_setzero_pd();

		dvdasum = _mm_setzero_pd();
		vctot   = _mm_setzero_pd();
		vgbtot  = _mm_setzero_pd();

		for(k=nj0; k<nj1-offset; k+=2)
		{
			jnr1 = jjnr[k];
			jnr2 = jjnr[k+1];

			j13 = jnr1 * 3;
			j23 = jnr2 * 3;

			/* Load coordinates */
			xmm1 = _mm_loadu_pd(pos+j13);  /* x1 y1 */
			xmm2 = _mm_loadu_pd(pos+j23);  /* x2 y2 */
			xmm5 = _mm_load_sd(pos+j13+2); /* z1 -  */
			xmm6 = _mm_load_sd(pos+j23+2); /* z2 -  */

			/* transpose */
			jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
			jy = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
			jz = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0));

			/* distances */
			dx = _mm_sub_pd(ix,jx);
			dy = _mm_sub_pd(iy,jy);
			dz = _mm_sub_pd(iz,jz);

			rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
			rinv  = my_invrsq_pd(rsq11);

			/* Load invsqrta */
			isaj    = _mm_loadl_pd(isaj,invsqrta+jnr1);
			isaj    = _mm_loadh_pd(isaj,invsqrta+jnr2);
			isaprod = _mm_mul_pd(isai,isaj);

			/* Load charges */
			q  = _mm_loadl_pd(q,charge+jnr1);
			q  = _mm_loadh_pd(q,charge+jnr2);
			qq = _mm_mul_pd(iq,q);

			vcoul   = _mm_mul_pd(qq,rinv);
			fscal   = _mm_mul_pd(vcoul,rinv);
			qq      = _mm_mul_pd(isaprod,qq);
			qq      = _mm_mul_pd(qq,neg);
			gbscale = _mm_mul_pd(isaprod,gbtabscale);

			/* Load dvdaj */
			dvdaj = _mm_loadl_pd(dvdaj, dvda+jnr1);
			dvdaj = _mm_loadh_pd(dvdaj, dvda+jnr2);

			r    = _mm_mul_pd(rsq11,rinv);
			rt   = _mm_mul_pd(r,gbscale);
			n0   = _mm_cvttpd_epi32(rt);
			n0d  = _mm_cvtepi32_pd(n0);
			eps  = _mm_sub_pd(rt,n0d);
			eps2 = _mm_mul_pd(eps,eps);

			nnn  = _mm_slli_epi64(n0,2);

			xmm1 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0)));   /* Y1 F1 */
			xmm2 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1)));   /* Y2 F2 */
			xmm3 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
			xmm4 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */

			Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
			F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
			G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
			H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */

			G  = _mm_mul_pd(G,eps);
			H  = _mm_mul_pd(H,eps2);
			Fp = _mm_add_pd(F,G);
			Fp = _mm_add_pd(Fp,H);
			VV = _mm_mul_pd(Fp,eps);
			VV = _mm_add_pd(Y,VV);
			H  = _mm_mul_pd(two,H);
			FF = _mm_add_pd(Fp,G);
			FF = _mm_add_pd(FF,H);

			vgb  = _mm_mul_pd(qq,VV);
			fijC = _mm_mul_pd(qq,FF);
			fijC = _mm_mul_pd(fijC,gbscale);

			dvdatmp = _mm_mul_pd(fijC,r);
			dvdatmp = _mm_add_pd(vgb,dvdatmp);
			dvdatmp = _mm_mul_pd(dvdatmp,neg);
			dvdatmp = _mm_mul_pd(dvdatmp,half);
			dvdasum = _mm_add_pd(dvdasum,dvdatmp);

			xmm1  = _mm_mul_pd(dvdatmp,isaj);
			xmm1  = _mm_mul_pd(xmm1,isaj);
			dvdaj = _mm_add_pd(dvdaj,xmm1);

			/* store dvda */
			_mm_storel_pd(dvda+jnr1,dvdaj);
			_mm_storeh_pd(dvda+jnr2,dvdaj);

			vctot  = _mm_add_pd(vctot,vcoul);
			vgbtot = _mm_add_pd(vgbtot,vgb);

			fscal = _mm_sub_pd(fijC,fscal);
			fscal = _mm_mul_pd(fscal,neg);
			fscal = _mm_mul_pd(fscal,rinv);

			/* calculate partial force terms */
			t1 = _mm_mul_pd(fscal,dx);
			t2 = _mm_mul_pd(fscal,dy);
			t3 = _mm_mul_pd(fscal,dz);

			/* update the i force */
			fix = _mm_add_pd(fix,t1);
			fiy = _mm_add_pd(fiy,t2);
			fiz = _mm_add_pd(fiz,t3);

			/* accumulate forces from memory */
			xmm1 = _mm_loadu_pd(faction+j13);   /* fx1 fy1 */
			xmm2 = _mm_loadu_pd(faction+j23);   /* fx2 fy2 */
			xmm5 = _mm_load1_pd(faction+j13+2); /* fz1 fz1 */
			xmm6 = _mm_load1_pd(faction+j23+2); /* fz2 fz2 */

			/* transpose */
			xmm7 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fz1 fz2 */
			xmm5 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* fx1 fx2 */
			xmm6 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */

			/* subtract partial forces */
			xmm5 = _mm_sub_pd(xmm5,t1);
			xmm6 = _mm_sub_pd(xmm6,t2);
			xmm7 = _mm_sub_pd(xmm7,t3);

			xmm1 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fx1 fy1 */
			xmm2 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(1,1)); /* fx2 fy2 */

			/* store fx and fy */
			_mm_storeu_pd(faction+j13,xmm1);
			_mm_storeu_pd(faction+j23,xmm2);

			/* .. then fz */
			_mm_storel_pd(faction+j13+2,xmm7);
			_mm_storeh_pd(faction+j23+2,xmm7);
		}

		/* In double precision, offset can only be either 0 or 1 */
		if(offset!=0)
		{
			jnr1 = jjnr[k];
			j13  = jnr1*3;

			jx = _mm_load_sd(pos+j13);
			jy = _mm_load_sd(pos+j13+1);
			jz = _mm_load_sd(pos+j13+2);

			isaj    = _mm_load_sd(invsqrta+jnr1);
			isaprod = _mm_mul_sd(isai,isaj);
			dvdaj   = _mm_load_sd(dvda+jnr1);
			q       = _mm_load_sd(charge+jnr1);
			qq      = _mm_mul_sd(iq,q);

			dx = _mm_sub_sd(ix,jx);
			dy = _mm_sub_sd(iy,jy);
			dz = _mm_sub_sd(iz,jz);

			rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
			rinv  = my_invrsq_pd(rsq11);

			vcoul   = _mm_mul_sd(qq,rinv);
			fscal   = _mm_mul_sd(vcoul,rinv);
			qq      = _mm_mul_sd(isaprod,qq);
			qq      = _mm_mul_sd(qq,neg);
			gbscale = _mm_mul_sd(isaprod,gbtabscale);

			r    = _mm_mul_sd(rsq11,rinv);
			rt   = _mm_mul_sd(r,gbscale);
			n0   = _mm_cvttpd_epi32(rt);
			n0d  = _mm_cvtepi32_pd(n0);
			eps  = _mm_sub_sd(rt,n0d);
			eps2 = _mm_mul_sd(eps,eps);

			nnn  = _mm_slli_epi64(n0,2);

			xmm1 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0)));
			xmm2 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1)));
			xmm3 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))+2);
			xmm4 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))+2);

			Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
			F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
			G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0));
			H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1));

			G  = _mm_mul_sd(G,eps);
			H  = _mm_mul_sd(H,eps2);
			Fp = _mm_add_sd(F,G);
			Fp = _mm_add_sd(Fp,H);
			VV = _mm_mul_sd(Fp,eps);
			VV = _mm_add_sd(Y,VV);
			H  = _mm_mul_sd(two,H);
			FF = _mm_add_sd(Fp,G);
			FF = _mm_add_sd(FF,H);

			vgb  = _mm_mul_sd(qq,VV);
			fijC = _mm_mul_sd(qq,FF);
			fijC = _mm_mul_sd(fijC,gbscale);

			dvdatmp = _mm_mul_sd(fijC,r);
			dvdatmp = _mm_add_sd(vgb,dvdatmp);
			dvdatmp = _mm_mul_sd(dvdatmp,neg);
			dvdatmp = _mm_mul_sd(dvdatmp,half);
			dvdasum = _mm_add_sd(dvdasum,dvdatmp);

			xmm1  = _mm_mul_sd(dvdatmp,isaj);
			xmm1  = _mm_mul_sd(xmm1,isaj);
			dvdaj = _mm_add_sd(dvdaj,xmm1);

			/* store dvda */
			_mm_storel_pd(dvda+jnr1,dvdaj);

			vctot  = _mm_add_sd(vctot,vcoul);
			vgbtot = _mm_add_sd(vgbtot,vgb);

			fscal = _mm_sub_sd(fijC,fscal);
			fscal = _mm_mul_sd(fscal,neg);
			fscal = _mm_mul_sd(fscal,rinv);

			/* calculate partial force terms */
			t1 = _mm_mul_sd(fscal,dx);
			t2 = _mm_mul_sd(fscal,dy);
			t3 = _mm_mul_sd(fscal,dz);

			/* update the i force */
			fix = _mm_add_sd(fix,t1);
			fiy = _mm_add_sd(fiy,t2);
			fiz = _mm_add_sd(fiz,t3);

			/* accumulate forces from memory */
			xmm5 = _mm_load_sd(faction+j13);   /* fx */
			xmm6 = _mm_load_sd(faction+j13+1); /* fy */
			xmm7 = _mm_load_sd(faction+j13+2); /* fz */

			/* subtract partial forces */
			xmm5 = _mm_sub_sd(xmm5,t1);
			xmm6 = _mm_sub_sd(xmm6,t2);
			xmm7 = _mm_sub_sd(xmm7,t3);

			/* store forces */
			_mm_store_sd(faction+j13,xmm5);
			_mm_store_sd(faction+j13+1,xmm6);
			_mm_store_sd(faction+j13+2,xmm7);
		}

		/* fix/fiy/fiz now contain four partial terms, that all should be
		 * added to the i particle forces
		 */
		t1 = _mm_unpacklo_pd(t1,fix);
		t2 = _mm_unpacklo_pd(t2,fiy);
		t3 = _mm_unpacklo_pd(t3,fiz);

		fix = _mm_add_pd(fix,t1);
		fiy = _mm_add_pd(fiy,t2);
		fiz = _mm_add_pd(fiz,t3);

		fix = _mm_shuffle_pd(fix,fix,_MM_SHUFFLE2(1,1));
		fiy = _mm_shuffle_pd(fiy,fiy,_MM_SHUFFLE2(1,1));
		fiz = _mm_shuffle_pd(fiz,fiz,_MM_SHUFFLE2(1,1));

		/* Load i forces from memory */
		xmm1 = _mm_load_sd(faction+ii3);
		xmm2 = _mm_load_sd(faction+ii3+1);
		xmm3 = _mm_load_sd(faction+ii3+2);

		/* Add to i force */
		fix = _mm_add_sd(fix,xmm1);
		fiy = _mm_add_sd(fiy,xmm2);
		fiz = _mm_add_sd(fiz,xmm3);

		/* store i forces to memory */
		_mm_store_sd(faction+ii3,fix);
		_mm_store_sd(faction+ii3+1,fiy);
		_mm_store_sd(faction+ii3+2,fiz);

		/* now do dvda */
		dvdatmp = _mm_unpacklo_pd(dvdatmp,dvdasum);
		dvdasum = _mm_add_pd(dvdasum,dvdatmp);
		_mm_storeh_pd(&dva,dvdasum);
		dvda[ii] = dvda[ii] + dva*isai_d*isai_d;

		ggid = gid[n];

		/* Coulomb potential */
		vcoul = _mm_unpacklo_pd(vcoul,vctot);
		vctot = _mm_add_pd(vctot,vcoul);
		_mm_storeh_pd(&vct,vctot);
		Vc[ggid] = Vc[ggid] + vct;

		/* GB potential */
		vgb    = _mm_unpacklo_pd(vgb,vgbtot);
		vgbtot = _mm_add_pd(vgbtot,vgb);
		_mm_storeh_pd(&vgbt,vgbtot);
		gpol[ggid] = gpol[ggid] + vgbt;
	}

	*outeriter = nri;
	*inneriter = nj1;
}
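/* my_invrsq_pd() above seeds 1/sqrt(x) with the ~12-bit _mm_rsqrt_ps()
 * estimate and refines it with two Newton-Raphson steps,
 * y' = y*(3 - x*y*y)/2, each of which roughly doubles the number of
 * correct bits. A scalar sketch of the iteration (illustrative, not from
 * GROMACS):
 */
#include <math.h>
#include <stdio.h>

static double nr_step(double x, double y)
{
    /* One Newton-Raphson step for y ~ 1/sqrt(x). */
    return 0.5 * y * (3.0 - x * y * y);
}

int main(void)
{
    double x = 2.0;
    double y = 0.7;    /* crude seed for 1/sqrt(2) = 0.7071067812 */
    y = nr_step(x, y); /* ~3 correct digits */
    y = nr_step(x, y); /* ~7 correct digits */
    printf("%.10f vs %.10f\n", y, 1.0 / sqrt(2.0));
    return 0;
}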
void nb_kernel430_ia32_sse2(int * p_nri, int * iinr, int * jindex, int * jjnr,
                            int * shift, double * shiftvec, double * fshift,
                            int * gid, double * pos, double * faction,
                            double * charge, double * p_facel, double * p_krf,
                            double * p_crf, double * vc, int * type,
                            int * p_ntype, double * vdwparam, double * vvdw,
                            double * p_tabscale, double * VFtab,
                            double * invsqrta, double * dvda,
                            double * p_gbtabscale, double * GBtab,
                            int * p_nthreads, int * count, void * mtx,
                            int * outeriter, int * inneriter, double * work)
{
	int nri,ntype,nthreads;
	int n,ii,is3,ii3,k,nj0,nj1,ggid;
	double shX,shY,shZ;
	int offset,nti;
	int jnrA,jnrB;
	int j3A,j3B;
	int tjA,tjB;
	gmx_gbdata_t *gbdata;
	double * gpol;

	__m128d iq,qq,jq,isai;
	__m128d ix,iy,iz;
	__m128d jx,jy,jz;
	__m128d dx,dy,dz;
	__m128d vctot,vvdwtot,vgbtot,dvdasum,gbfactor;
	__m128d fix,fiy,fiz,tx,ty,tz,rsq;
	__m128d rinv,isaj,isaprod;
	__m128d vcoul,fscal,gbscale,c6,c12;
	__m128d rinvsq,r,rtab;
	__m128d eps,Y,F,G,H;
	__m128d VV,FF,Fp;
	__m128d vgb,fijGB,dvdatmp;
	__m128d rinvsix,vvdw6,vvdw12,vvdwtmp;
	__m128d facel,gbtabscale,dvdaj;
	__m128d fijD,fijR;
	__m128d xmm1,tabscale,eps2;
	__m128i n0, nnn;

	const __m128d neg       = _mm_set1_pd(-1.0);
	const __m128d zero      = _mm_set1_pd(0.0);
	const __m128d minushalf = _mm_set1_pd(-0.5);
	const __m128d two       = _mm_set1_pd(2.0);

	gbdata = (gmx_gbdata_t *)work;
	gpol   = gbdata->gpol;

	nri   = *p_nri;
	ntype = *p_ntype;

	gbfactor   = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));
	gbtabscale = _mm_load1_pd(p_gbtabscale);
	facel      = _mm_load1_pd(p_facel);
	tabscale   = _mm_load1_pd(p_tabscale);

	nj1  = 0;
	jnrA = jnrB = 0;
	j3A  = j3B  = 0;
	jx   = _mm_setzero_pd();
	jy   = _mm_setzero_pd();
	jz   = _mm_setzero_pd();
	c6   = _mm_setzero_pd();
	c12  = _mm_setzero_pd();

	for(n=0; n<nri; n++)
	{
		is3 = 3*shift[n];
		shX = shiftvec[is3];
		shY = shiftvec[is3+1];
		shZ = shiftvec[is3+2];

		nj0 = jindex[n];
		nj1 = jindex[n+1];

		ii  = iinr[n];
		ii3 = 3*ii;

		ix = _mm_set1_pd(shX+pos[ii3+0]);
		iy = _mm_set1_pd(shY+pos[ii3+1]);
		iz = _mm_set1_pd(shZ+pos[ii3+2]);

		iq = _mm_load1_pd(charge+ii);
		iq = _mm_mul_pd(iq,facel);

		isai = _mm_load1_pd(invsqrta+ii);

		nti = 2*ntype*type[ii];

		vctot   = _mm_setzero_pd();
		vvdwtot = _mm_setzero_pd();
		vgbtot  = _mm_setzero_pd();
		dvdasum = _mm_setzero_pd();
		fix     = _mm_setzero_pd();
		fiy     = _mm_setzero_pd();
		fiz     = _mm_setzero_pd();

		for(k=nj0; k<nj1-1; k+=2)
		{
			jnrA = jjnr[k];
			jnrB = jjnr[k+1];

			j3A = jnrA * 3;
			j3B = jnrB * 3;

			GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);

			dx = _mm_sub_pd(ix,jx);
			dy = _mm_sub_pd(iy,jy);
			dz = _mm_sub_pd(iz,jz);

			rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);

			rinv   = gmx_mm_invsqrt_pd(rsq);
			rinvsq = _mm_mul_pd(rinv,rinv);

			/***********************************/
			/* INTERACTION SECTION STARTS HERE */
			/***********************************/
			GMX_MM_LOAD_2VALUES_PD(charge+jnrA,charge+jnrB,jq);
			GMX_MM_LOAD_2VALUES_PD(invsqrta+jnrA,invsqrta+jnrB,isaj);

			/* Lennard-Jones */
			tjA = nti+2*type[jnrA];
			tjB = nti+2*type[jnrB];

			GMX_MM_LOAD_2PAIRS_PD(vdwparam+tjA,vdwparam+tjB,c6,c12);

			isaprod = _mm_mul_pd(isai,isaj);
			qq      = _mm_mul_pd(iq,jq);
			vcoul   = _mm_mul_pd(qq,rinv);
			fscal   = _mm_mul_pd(vcoul,rinv);
			vctot   = _mm_add_pd(vctot,vcoul);

			/* Polarization interaction */
			qq      = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor));
			gbscale = _mm_mul_pd(isaprod,gbtabscale);

			/* Calculate GB table index */
			r    = _mm_mul_pd(rsq,rinv);
			rtab = _mm_mul_pd(r,gbscale);

			n0  = _mm_cvttpd_epi32(rtab);
			eps = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
			nnn = _mm_slli_epi32(n0,2);

			/* the tables are 16-byte aligned, so we can use _mm_load_pd */
			Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
			F = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
			GMX_MM_TRANSPOSE2_PD(Y,F);
			G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
			H = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
			GMX_MM_TRANSPOSE2_PD(G,H);

			G = _mm_mul_pd(G,eps);
			H = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
			F = _mm_add_pd(F, _mm_add_pd( G , H ) );
			Y = _mm_add_pd(Y, _mm_mul_pd(F, eps));
			F = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
			vgb   = _mm_mul_pd(Y, qq);
			fijGB = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));

			dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);

			vgbtot  = _mm_add_pd(vgbtot, vgb);

			dvdasum = _mm_add_pd(dvdasum, dvdatmp);
			dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));

			GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);

			/* Calculate VDW table index */
			rtab = _mm_mul_pd(r,tabscale);
			n0   = _mm_cvttpd_epi32(rtab);
			eps  = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
			eps2 = _mm_mul_pd(eps,eps);
			nnn  = _mm_slli_epi32(n0,3);

			/* Dispersion */
			Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0)));
			F = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1)));
			GMX_MM_TRANSPOSE2_PD(Y,F);
			G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2);
			H = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+2);
			GMX_MM_TRANSPOSE2_PD(G,H);

			G    = _mm_mul_pd(G,eps);
			H    = _mm_mul_pd(H,eps2);
			Fp   = _mm_add_pd(F,G);
			Fp   = _mm_add_pd(Fp,H);
			VV   = _mm_mul_pd(Fp,eps);
			VV   = _mm_add_pd(Y,VV);
			xmm1 = _mm_mul_pd(two,H);
			FF   = _mm_add_pd(Fp,G);
			FF   = _mm_add_pd(FF,xmm1);

			vvdw6 = _mm_mul_pd(c6,VV);
			fijD  = _mm_mul_pd(c6,FF);

			/* Repulsion */
			Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4);
			F = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+4);
			GMX_MM_TRANSPOSE2_PD(Y,F);
			G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6);
			H = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+6);
			GMX_MM_TRANSPOSE2_PD(G,H);

			G    = _mm_mul_pd(G,eps);
			H    = _mm_mul_pd(H,eps2);
			Fp   = _mm_add_pd(F,G);
			Fp   = _mm_add_pd(Fp,H);
			VV   = _mm_mul_pd(Fp,eps);
			VV   = _mm_add_pd(Y,VV);
			xmm1 = _mm_mul_pd(two,H);
			FF   = _mm_add_pd(Fp,G);
			FF   = _mm_add_pd(FF,xmm1);

			vvdw12 = _mm_mul_pd(c12,VV);
			fijR   = _mm_mul_pd(c12,FF);

			vvdwtmp = _mm_add_pd(vvdw12,vvdw6);
			vvdwtot = _mm_add_pd(vvdwtot,vvdwtmp);

			xmm1  = _mm_add_pd(fijD,fijR);
			xmm1  = _mm_mul_pd(xmm1,tabscale);
			xmm1  = _mm_add_pd(xmm1,fijGB);
			xmm1  = _mm_sub_pd(xmm1,fscal);
			fscal = _mm_mul_pd(xmm1,neg);
			fscal = _mm_mul_pd(fscal,rinv);

			/***********************************/
			/*  INTERACTION SECTION ENDS HERE  */
			/***********************************/

			/* Calculate temporary vectorial force */
			tx = _mm_mul_pd(fscal,dx);
			ty = _mm_mul_pd(fscal,dy);
			tz = _mm_mul_pd(fscal,dz);

			/* Increment i atom force */
			fix = _mm_add_pd(fix,tx);
			fiy = _mm_add_pd(fiy,ty);
			fiz = _mm_add_pd(fiz,tz);

			/* Store j forces back */
			GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
		}

		/* In double precision, offset can only be either 0 or 1 */
		if(k<nj1)
		{
			jnrA = jjnr[k];
			j3A  = jnrA * 3;

			GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);

			dx = _mm_sub_sd(ix,jx);
			dy = _mm_sub_sd(iy,jy);
			dz = _mm_sub_sd(iz,jz);

			rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);

			rinv   = gmx_mm_invsqrt_pd(rsq);
			rinvsq = _mm_mul_sd(rinv,rinv);

			/* The reason for zeroing these variables here is to fix bug 585.
			 * What happens is that __m128d _mm_add_sd(a,b) gives back r0=a[0]+b[0],
			 * and r1=0, but it should be r1=a[1]. This might be a compiler issue
			 * (tested with gcc-4.1.3 and -O3). To work around it, we zero these
			 * variables and use _mm_add_pd (**) instead. Note that the only
			 * variables that get affected are the energies, since the total sum
			 * needs to be correct.
			 */
			vgb     = _mm_setzero_pd();
			vcoul   = _mm_setzero_pd();
			dvdatmp = _mm_setzero_pd();
			vvdw6   = _mm_setzero_pd();
			vvdw12  = _mm_setzero_pd();

			/***********************************/
			/* INTERACTION SECTION STARTS HERE */
			/***********************************/
			GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
			GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);

			/* Lennard-Jones */
			tjA = nti+2*type[jnrA];

			GMX_MM_LOAD_1PAIR_PD(vdwparam+tjA,c6,c12);

			isaprod = _mm_mul_sd(isai,isaj);
			qq      = _mm_mul_sd(jq,iq);
			vcoul   = _mm_mul_sd(qq,rinv);
			fscal   = _mm_mul_sd(vcoul,rinv);
			vctot   = _mm_add_pd(vctot,vcoul); /* (**) */

			/* Polarization interaction */
			qq      = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
			gbscale = _mm_mul_sd(isaprod,gbtabscale);

			/* Calculate GB table index */
			r    = _mm_mul_sd(rsq,rinv);
			rtab = _mm_mul_sd(r,gbscale);

			n0  = _mm_cvttpd_epi32(rtab);
			eps = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
			nnn = _mm_slli_epi32(n0,2);

			/* the tables are 16-byte aligned, so we can use _mm_load_pd */
			Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
			F = _mm_setzero_pd();
			GMX_MM_TRANSPOSE2_PD(Y,F);
			G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
			H = _mm_setzero_pd();
			GMX_MM_TRANSPOSE2_PD(G,H);

			G = _mm_mul_sd(G,eps);
			H = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
			F = _mm_add_sd(F, _mm_add_sd( G , H ) );
			Y = _mm_add_sd(Y, _mm_mul_sd(F, eps));
			F = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
			vgb   = _mm_mul_sd(Y, qq);
			fijGB = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));

			dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);

			vgbtot  = _mm_add_pd(vgbtot, vgb); /* (**) */

			dvdasum = _mm_add_pd(dvdasum, dvdatmp); /* (**) */
			dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));

			GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);

			/* Calculate VDW table index */
			rtab = _mm_mul_sd(r,tabscale);
			n0   = _mm_cvttpd_epi32(rtab);
			eps  = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
			eps2 = _mm_mul_sd(eps,eps);
			nnn  = _mm_slli_epi32(n0,3);

			/* Dispersion */
			Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0)));
			F = _mm_setzero_pd();
			GMX_MM_TRANSPOSE2_PD(Y,F);
			G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2);
			H = _mm_setzero_pd();
			GMX_MM_TRANSPOSE2_PD(G,H);

			G    = _mm_mul_sd(G,eps);
			H    = _mm_mul_sd(H,eps2);
			Fp   = _mm_add_sd(F,G);
			Fp   = _mm_add_sd(Fp,H);
			VV   = _mm_mul_sd(Fp,eps);
			VV   = _mm_add_sd(Y,VV);
			xmm1 = _mm_mul_sd(two,H);
			FF   = _mm_add_sd(Fp,G);
			FF   = _mm_add_sd(FF,xmm1);

			vvdw6 = _mm_mul_sd(c6,VV);
			fijD  = _mm_mul_sd(c6,FF);

			/* Repulsion */
			Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4);
			F = _mm_setzero_pd();
			GMX_MM_TRANSPOSE2_PD(Y,F);
			G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6);
			H = _mm_setzero_pd();
			GMX_MM_TRANSPOSE2_PD(G,H);

			G    = _mm_mul_sd(G,eps);
			H    = _mm_mul_sd(H,eps2);
			Fp   = _mm_add_sd(F,G);
			Fp   = _mm_add_sd(Fp,H);
			VV   = _mm_mul_sd(Fp,eps);
			VV   = _mm_add_sd(Y,VV);
			xmm1 = _mm_mul_sd(two,H);
			FF   = _mm_add_sd(Fp,G);
			FF   = _mm_add_sd(FF,xmm1);

			vvdw12 = _mm_mul_sd(c12,VV);
			fijR   = _mm_mul_sd(c12,FF);

			vvdwtmp = _mm_add_sd(vvdw12,vvdw6);
			vvdwtot = _mm_add_pd(vvdwtot,vvdwtmp); /* (**) */

			xmm1  = _mm_add_sd(fijD,fijR);
			xmm1  = _mm_mul_sd(xmm1,tabscale);
			xmm1  = _mm_add_sd(xmm1,fijGB);
			xmm1  = _mm_sub_sd(xmm1,fscal);
			fscal = _mm_mul_sd(xmm1,neg);
			fscal = _mm_mul_sd(fscal,rinv);

			/***********************************/
			/*  INTERACTION SECTION ENDS HERE  */
			/***********************************/

			/* Calculate temporary vectorial force */
			tx = _mm_mul_sd(fscal,dx);
			ty = _mm_mul_sd(fscal,dy);
			tz = _mm_mul_sd(fscal,dz);

			/* Increment i atom force */
			fix = _mm_add_sd(fix,tx);
			fiy = _mm_add_sd(fiy,ty);
			fiz = _mm_add_sd(fiz,tz);

			/* Store j forces back */
			GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz);
		}

		dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
		gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);

		ggid = gid[n];

		gmx_mm_update_1pot_pd(vctot,vc+ggid);
		gmx_mm_update_1pot_pd(vgbtot,gpol+ggid);
		gmx_mm_update_1pot_pd(dvdasum,dvda+ii);
		gmx_mm_update_1pot_pd(vvdwtot,vvdw+ggid);
	}

	*outeriter = nri;
	*inneriter = nj1;
}
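/* Both kernels use the same table layout: four doubles Y, F, G, H per
 * point, evaluated with a fractional offset eps as
 *     V  = Y + F*eps + G*eps^2 + H*eps^3   (potential)
 *     FF = F + 2*G*eps + 3*H*eps^2         (derivative, for the force)
 * exactly as the Fp/VV/FF sequence above computes it. A scalar sketch with
 * a GB-style stride of 4 (the table values below are made up):
 */
#include <stdio.h>

static void table_eval(const double *tab, double rt, double *V, double *FF)
{
    int    n   = (int)rt;    /* truncation, cf. _mm_cvttpd_epi32 */
    double eps = rt - n;     /* fractional offset in [0,1) */
    const double *p = tab + 4 * n;
    double Y = p[0], F = p[1], G = p[2], H = p[3];
    double Geps  = G * eps;
    double Heps2 = H * eps * eps;
    double Fp    = F + Geps + Heps2;
    *V  = Y + Fp * eps;
    *FF = Fp + Geps + 2.0 * Heps2;
}

int main(void)
{
    double tab[8] = { 1.0, -0.5, 0.25, -0.125,   /* point 0: Y F G H */
                      0.6, -0.3, 0.15, -0.075 }; /* point 1: Y F G H */
    double V, FF;
    table_eval(tab, 0.5, &V, &FF);
    printf("V=%g FF=%g\n", V, FF);
    return 0;
}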
// The input must be in domain [-1686629712, 1686629712].
//
// I tried to optimize the double to int conversion by using `magic`, but
// it was actually slower than using `_mm_cvttpd_epi32()` and it didn't
// offer a greater domain for `x`.
static SIMD_INLINE __m128d sin_cephes_pd(__m128d x) {
  SIMD_CONST_SQ(sign     , SIMD_UINT64_C(0x8000000000000000));
  SIMD_CONST_SQ(inv_sign , SIMD_UINT64_C(0x7FFFFFFFFFFFFFFF));
  SIMD_CONST_SI(int32_one, 1);
  SIMD_CONST_SD(4_DIV_PI , 1.27323954473516268615107010698);
  SIMD_CONST_SD(DP1      , 7.85398125648498535156e-1);
  SIMD_CONST_SD(DP2      , 3.77489470793079817668e-8);
  SIMD_CONST_SD(DP3      , 2.69515142907905952645e-15);

#define DEFINE_DATA(name, x0, x1, x2, x3, x4, x5, xm, xa, y0, y1, y2, y3, y4, y5, ym, ya) \
  SIMD_ALIGN_VAR(static const double, name[], 16) = { \
    x0, x0, x1, x1, x2, x2, x3, x3, x4, x4, x5, x5, xm, xm, xa, xa, \
    y0, x0, y1, x1, y2, x2, y3, x3, y4, x4, y5, x5, ym, xm, ya, xa, \
    x0, y0, x1, y1, x2, y2, x3, y3, x4, y4, x5, y5, xm, ym, xa, ya, \
    y0, y0, y1, y1, y2, y2, y3, y3, y4, y4, y5, y5, ym, ym, ya, ya  \
  }

  DEFINE_DATA(sincos_coeff,
    1.58962301576546568060e-10,-2.50507477628578072866e-8 ,
    2.75573136213857245213e-6 ,-1.98412698295895385996e-4 ,
    8.33333333332211858878e-3 ,-1.66666666666666307295e-1 , 1.0, 0.0,
   -1.13585365213876817300e-11, 2.08757008419747316778e-9 ,
   -2.75573141792967388112e-7 , 2.48015872888517045348e-5 ,
   -1.38888888888730564116e-3 , 4.16666666666665929218e-2 ,-0.5, 1.0);

  __m128d y;
  __m128d sign = x;                              // Sign bit.

  x = _mm_and_pd(x, SIMD_GET_PD(inv_sign));      // Take the absolute value.
  y = _mm_mul_pd(x, SIMD_GET_PD(4_DIV_PI));      // Integer part of `x * 4 / PI`.

  __m128i ival = _mm_cvttpd_epi32(y);            // Extract the integer part of y.
  __m128i ione = SIMD_GET_PI(int32_one);

  ival = _mm_add_epi32(ival, ione);              // j += 1.
  ival = _mm_andnot_si128(ione, ival);           // j &= ~1.

  y = _mm_cvtepi32_pd(ival);
  ival = _mm_unpacklo_epi32(ival, ival);

  sign = _mm_xor_pd(sign,                        // Swap the sign bit if `j & 4`.
    _mm_castsi128_pd(_mm_slli_epi64(ival, 61)));
  sign = _mm_and_pd(sign, SIMD_GET_PD(sign));    // Keep only the sign bit.

  // Get the polynomial selection mask (j & 2):
  //   1. `0x0000000000000000` => `0    <= x <= PI/4`
  //   2. `0xFFFFFFFFFFFFFFFF` => `PI/4 <  x <= PI/2`
  ival = _mm_slli_epi32(ival, 30);
  ival = _mm_srai_epi32(ival, 31);

  // Extended precision modular arithmetic:
  //   x = ((x - y * DP1) - y * DP2) - y * DP3
  x = _mm_sub_pd(x, _mm_mul_pd(y, SIMD_GET_PD(DP1)));
  x = _mm_sub_pd(x, _mm_mul_pd(y, SIMD_GET_PD(DP2)));
  x = _mm_sub_pd(x, _mm_mul_pd(y, SIMD_GET_PD(DP3)));

  // Get the polynomial coefficients for each lane (sin/cos).
  __m128d poly_mask = _mm_castsi128_pd(ival);
  const __m128d* coeff = reinterpret_cast<const __m128d*>(sincos_coeff) +
    static_cast<uintptr_t>(_mm_movemask_pd(poly_mask)) * 8;

  __m128d xx = _mm_mul_pd(x, x);
  y = coeff[0];
  y = Simd128::mad(y, xx, coeff[1]);
  y = Simd128::mad(y, xx, coeff[2]);
  y = Simd128::mad(y, xx, coeff[3]);
  y = Simd128::mad(y, xx, coeff[4]);
  y = Simd128::mad(y, xx, coeff[5]);
  y = _mm_mul_pd(y, xx);

  __m128d x_or_xx = _mm_or_pd(
    _mm_and_pd(xx, poly_mask),
    _mm_andnot_pd(poly_mask, x));

  y = _mm_mul_pd(y, x_or_xx);
  y = _mm_add_pd(y, _mm_mul_pd(x_or_xx, coeff[6]));
  y = _mm_add_pd(y, coeff[7]);

  return _mm_xor_pd(y, sign);
}
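/* The DP1/DP2/DP3 subtraction above is Cephes-style extended-precision
 * range reduction: the three constants sum to pi/4 with far more than
 * double precision, so subtracting j*DP1, j*DP2, j*DP3 in sequence removes
 * whole quadrants from x without one large rounding error. A scalar sketch
 * of the same step (the quadrant count j is computed as in sin_cephes_pd):
 */
#include <stdio.h>

static double reduce(double x, double j)
{
    const double DP1 = 7.85398125648498535156e-1;  /* high bits of pi/4 */
    const double DP2 = 3.77489470793079817668e-8;  /* middle bits */
    const double DP3 = 2.69515142907905952645e-15; /* low bits */
    x -= j * DP1;
    x -= j * DP2;
    x -= j * DP3;
    return x;
}

int main(void)
{
    double x = 100.0;
    double j = 128.0; /* ((int)(100 * 4/PI) + 1) & ~1, as in the code above */
    printf("%.17g\n", reduce(x, j)); /* reduced argument, within [-pi/4, pi/4] */
    return 0;
}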