__m128d test_mm_and_pd(__m128d A, __m128d B) {
  // DAG-LABEL: test_mm_and_pd
  // DAG: and <4 x i32>
  //
  // ASM-LABEL: test_mm_and_pd
  // ASM: pand
  return _mm_and_pd(A, B);
}
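/*
 * A minimal, self-contained runtime check (ours, not part of the test suite
 * above) that _mm_and_pd really is a bitwise AND over the 64-bit patterns of
 * the two doubles. ANDing with 0x7FFF...FF clears each lane's sign bit.
 * Compile with e.g. `cc -msse2 check_and_pd.c`.
 */
#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    __m128d a = _mm_set_pd(-1.5, 3.0);   /* lanes: [3.0, -1.5] */
    __m128d m = _mm_castsi128_pd(_mm_set1_epi64x(0x7fffffffffffffffLL));
    __m128d r = _mm_and_pd(a, m);        /* clears the sign bit per lane */

    double out[2];
    _mm_storeu_pd(out, r);
    printf("%g %g\n", out[0], out[1]);   /* expected: 3 1.5 */
    return 0;
}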
inline F64vec2 abs(const F64vec2 &a) {
    // Mask off the sign bit (bit 63) of each double. On little-endian x86,
    // i[0]/i[1] form the low element and i[2]/i[3] the high element.
    static const union {
        int i[4];
        __m128d m;
    } __f64vec2_abs_mask = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff };
    return _mm_and_pd(a, __f64vec2_abs_mask.m);
}
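/*
 * A sketch of the same abs idiom without the union, assuming SSE2 only: the
 * 0x7FFF...FF mask is built directly in a vector register. The helper name
 * f64x2_abs is ours, not from the original source.
 */
#include <emmintrin.h>

static inline __m128d f64x2_abs(__m128d a) {
    const __m128d abs_mask =
        _mm_castsi128_pd(_mm_set1_epi64x(0x7fffffffffffffffLL));
    return _mm_and_pd(a, abs_mask);   /* clear sign bits of both lanes */
}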
/**
 * Processes two doubles at a time.
 */
int _mandelbrot_2(double const *const c_re_arg,
                  double const *const c_im_arg,
                  int max_iter) {
    __m128d z_re = _mm_load_pd(c_re_arg);
    __m128d z_im = _mm_load_pd(c_im_arg);
    __m128d y_re;
    __m128d y_im;
    __m128d c_re = z_re;
    __m128d c_im = z_im;

    __m128i count = _mm_set1_epi64x(0);

    __m128d md;
    __m128d mt;
    __m128i mi = _mm_set1_epi16(0xffff);

    __m128d two = _mm_set1_pd(2.0);
    __m128i one = _mm_set1_epi64x(1);

    for (int i = 0; i < max_iter; i += 1) {
        // y = z .* z;
        y_re = _mm_mul_pd(z_re, z_re);
        y_im = _mm_mul_pd(z_im, z_im);

        // y = z * z;
        y_re = _mm_sub_pd(y_re, y_im);
        y_im = _mm_mul_pd(z_re, z_im);
        y_im = _mm_add_pd(y_im, y_im);

        // z = z * z + c
        z_re = _mm_add_pd(y_re, c_re);
        z_im = _mm_add_pd(y_im, c_im);

        // Bailout test: the exact |z|^2 < 4 test is left commented out;
        // the component-wise test below is a cheaper, looser approximation.
        // md = _mm_add_pd(z_re, z_im);
        // md = _mm_cmplt_pd(md, four);
        md = _mm_cmplt_pd(z_re, two);
        mt = _mm_cmplt_pd(z_im, two);
        md = _mm_and_pd(md, mt);
        mi = _mm_and_si128(mi, _mm_castpd_si128(md));
        // PRINT_M128I(mi);
        if (!_mm_movemask_pd(md)) {
            break;
        }

        // count iterations
        count = _mm_add_epi64(count, _mm_and_si128(mi, one));
    }

    int val;
    count = _mm_add_epi64(_mm_srli_si128(count, 8), count);
    val = _mm_cvtsi128_si64(count);
    return val;
}
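/*
 * For reference, a sketch of the exact escape test that the commented-out
 * lines above hint at: a point stays active while re(z)^2 + im(z)^2 < 4.
 * y_re2/y_im2 stand for the squares already computed at the top of the loop;
 * this helper is our illustration, not the original author's code.
 */
#include <emmintrin.h>

static inline __m128d mandel_active_mask(__m128d y_re2, __m128d y_im2) {
    const __m128d four = _mm_set1_pd(4.0);
    /* all-ones in a lane while that point has not escaped yet */
    return _mm_cmplt_pd(_mm_add_pd(y_re2, y_im2), four);
}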
__SIMDd _SIMD_and_pd(__SIMDd a, __SIMDd b) {
#ifdef USE_SSE
    return _mm_and_pd(a, b);
#elif defined USE_AVX
    return _mm256_and_pd(a, b);
#elif defined USE_IBM
    return vec_and(a, b);
#endif
}
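/*
 * Usage sketch for the wrapper above, assuming the translation unit is built
 * with -DUSE_SSE and that __SIMDd is typedef'd to __m128d (that typedef lives
 * elsewhere in this library; it is repeated here only to keep the example
 * self-contained).
 */
#include <emmintrin.h>
typedef __m128d __SIMDd;

int main(void) {
    __SIMDd a = _mm_set1_pd(-2.0);
    __SIMDd m = _mm_castsi128_pd(_mm_set1_epi64x(0x7fffffffffffffffLL));
    __SIMDd r = _SIMD_and_pd(a, m);   /* clears sign bits: both lanes 2.0 */
    (void)r;
    return 0;
}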
__SIMDd _SIMD_sel_pd(__SIMDd a, __SIMDd b, void **resultPtr) {
#ifdef USE_SSE
    __SIMDd *result = (__SIMDd *)(*resultPtr);
    /* select: take b where the mask bits are set, a elsewhere */
    return _mm_or_pd(_mm_andnot_pd(*result, a), _mm_and_pd(*result, b));
#elif defined USE_AVX
    __SIMDd *result = (__SIMDd *)(*resultPtr);
    return _mm256_or_pd(_mm256_andnot_pd(*result, a),
                        _mm256_and_pd(*result, b));
#elif defined USE_IBM
    __SIMDd *result = (__SIMDd *)(*resultPtr);
    return vec_sel(a, b, *result);
#endif
}
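/*
 * The andnot/and/or triple above is the classic SSE2 "select" idiom:
 * r = (~mask & a) | (mask & b). On SSE4.1 targets the same operation is one
 * instruction; a sketch (our code, not this library's). Note the caveat:
 * _mm_blendv_pd looks only at each lane's sign bit, while the and/andnot
 * form uses every mask bit — for masks produced by _mm_cmp*_pd (all-ones or
 * all-zeros per lane) the two agree.
 */
#include <smmintrin.h>   /* SSE4.1 */

static inline __m128d sel_pd_sse41(__m128d a, __m128d b, __m128d mask) {
    /* picks b where the sign bit of mask is set, a elsewhere */
    return _mm_blendv_pd(a, b, mask);
}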
/** Return the significand and the exponent both in double precision **/
__m128d frexp_sse(__m128d x, __m128d *e) {
    /* Integer exponent */
    __m128i ei;

    /* Save the exponent */
    ei = _mm_and_si128(_mm_castpd_si128(x), *(__m128i *)pi64_mantissa_mask);
    ei = _mm_srli_epi64(ei, 52);
    ei = _mm_shuffle_epi32(ei, 216);
    ei = _mm_sub_epi32(ei, *(__m128i *)pi32_bias4i);
    *e = _mm_cvtepi32_pd(ei);

    /* Save the significand */
    x = _mm_and_pd(x, *(__m128d *)pi64_inv_mantissa_mask);
    x = _mm_or_pd(x, *(__m128d *)pd_half_mask);
    return x;
}
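/*
 * The four tables above are defined elsewhere in the original source.
 * Plausible definitions reconstructed from how they are used (names kept,
 * exact values are our inference, not the original text): despite its name,
 * pi64_mantissa_mask selects the 11 exponent bits; the inverse mask keeps
 * sign + fraction; pd_half_mask ORs in the exponent of 0.5 so the
 * significand lands in [0.5, 1); and the bias is 1022 rather than 1023 so
 * that x == significand * 2^exponent, matching frexp().
 */
#include <stdint.h>

__attribute__((aligned(16)))
static const uint64_t pi64_mantissa_mask[2] =
    { 0x7ff0000000000000ULL, 0x7ff0000000000000ULL };
__attribute__((aligned(16)))
static const uint64_t pi64_inv_mantissa_mask[2] =
    { 0x800fffffffffffffULL, 0x800fffffffffffffULL };
__attribute__((aligned(16)))
static const uint64_t pd_half_mask[2] =
    { 0x3fe0000000000000ULL, 0x3fe0000000000000ULL };
__attribute__((aligned(16)))
static const int32_t pi32_bias4i[4] = { 1022, 1022, 1022, 1022 };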
/* vms_expma:
 *   Compute the component-wise exponential minus <a>:
 *       r[i] <-- e^x[i] - a
 *
 * The following comments apply to the SSE2 version of this code:
 *
 * Computation is done four doubles at a time by working in parallel on two
 * vectors of two doubles using SSE2 intrinsics. If size is not a multiple
 * of 4, the remaining elements are computed using the stdlib exp().
 *
 * The computation is done by first performing a range reduction of the
 * argument of the form e^x = 2^k * e^f, choosing k and f so that f is in
 * [-0.5, 0.5]. Then 2^k can be computed exactly using bit operations to
 * build the double result, and e^f can be computed efficiently with enough
 * precision using a polynomial approximation.
 *
 * The polynomial approximation is an 11th-order polynomial computed by the
 * Remez algorithm with the Sollya suite, instead of the more classical Padé
 * form, because it is better suited to parallel execution. To achieve the
 * same precision, a Padé form seems to require three fewer multiplications
 * but needs a very costly division, so it would be less efficient.
 *
 * The maximum error is less than 1 lsb and special cases are correctly
 * handled:
 *   +inf or +oor  -->  return +inf
 *   -inf or -oor  -->  return  0.0
 *   qNaN or sNaN  -->  return  qNaN
 *
 * This code is copyright 2004-2012 Thomas Lavergne and licensed under the
 * BSD licence like the rest of Wapiti.
 */
void xvm_expma(double r[], const double x[], double a, uint64_t N) {
#if defined(__SSE2__) && !defined(XVM_ANSI)
#define xvm_vconst(v) (_mm_castsi128_pd(_mm_set1_epi64x((v))))
    assert(r != NULL && ((uintptr_t)r % 16) == 0);
    assert(x != NULL && ((uintptr_t)x % 16) == 0);
    const __m128i vl  = _mm_set1_epi64x(0x3ff0000000000000ULL);
    const __m128d ehi = xvm_vconst(0x4086232bdd7abcd2ULL);
    const __m128d elo = xvm_vconst(0xc086232bdd7abcd2ULL);
    const __m128d l2e = xvm_vconst(0x3ff71547652b82feULL);
    const __m128d hal = xvm_vconst(0x3fe0000000000000ULL);
    const __m128d nan = xvm_vconst(0xfff8000000000000ULL);
    const __m128d inf = xvm_vconst(0x7ff0000000000000ULL);
    const __m128d c1  = xvm_vconst(0x3fe62e4000000000ULL);
    const __m128d c2  = xvm_vconst(0x3eb7f7d1cf79abcaULL);
    const __m128d p0  = xvm_vconst(0x3feffffffffffffeULL);
    const __m128d p1  = xvm_vconst(0x3ff000000000000bULL);
    const __m128d p2  = xvm_vconst(0x3fe0000000000256ULL);
    const __m128d p3  = xvm_vconst(0x3fc5555555553a2aULL);
    const __m128d p4  = xvm_vconst(0x3fa55555554e57d3ULL);
    const __m128d p5  = xvm_vconst(0x3f81111111362f4fULL);
    const __m128d p6  = xvm_vconst(0x3f56c16c25f3bae1ULL);
    const __m128d p7  = xvm_vconst(0x3f2a019fc9310c33ULL);
    const __m128d p8  = xvm_vconst(0x3efa01825f3cb28bULL);
    const __m128d p9  = xvm_vconst(0x3ec71e2bd880fdd8ULL);
    const __m128d p10 = xvm_vconst(0x3e9299068168ac8fULL);
    const __m128d p11 = xvm_vconst(0x3e5ac52350b60b19ULL);
    const __m128d va  = _mm_set1_pd(a);
    for (uint64_t n = 0; n < N; n += 4) {
        __m128d mn1, mn2, mi1, mi2;
        __m128d t1, t2, d1, d2;
        __m128d v1, v2, w1, w2;
        __m128i k1, k2;
        __m128d f1, f2;
        // Load the next four values
        __m128d x1 = _mm_load_pd(x + n    );
        __m128d x2 = _mm_load_pd(x + n + 2);
        // Check for out-of-range values, infinities and NaN
        mn1 = _mm_cmpneq_pd(x1, x1);  mn2 = _mm_cmpneq_pd(x2, x2);
        mi1 = _mm_cmpgt_pd(x1, ehi);  mi2 = _mm_cmpgt_pd(x2, ehi);
        x1  = _mm_max_pd(x1, elo);    x2  = _mm_max_pd(x2, elo);
        // Range reduction: we search k and f such that e^x = 2^k * e^f
        // with f in [-0.5, 0.5]
        t1 = _mm_mul_pd(x1, l2e);     t2 = _mm_mul_pd(x2, l2e);
        t1 = _mm_add_pd(t1, hal);     t2 = _mm_add_pd(t2, hal);
        k1 = _mm_cvttpd_epi32(t1);    k2 = _mm_cvttpd_epi32(t2);
        d1 = _mm_cvtepi32_pd(k1);     d2 = _mm_cvtepi32_pd(k2);
        t1 = _mm_mul_pd(d1, c1);      t2 = _mm_mul_pd(d2, c1);
        f1 = _mm_sub_pd(x1, t1);      f2 = _mm_sub_pd(x2, t2);
        t1 = _mm_mul_pd(d1, c2);      t2 = _mm_mul_pd(d2, c2);
        f1 = _mm_sub_pd(f1, t1);      f2 = _mm_sub_pd(f2, t2);
        // Evaluation of e^f using an 11th-order polynomial in Horner form
        v1 = _mm_mul_pd(f1, p11);     v2 = _mm_mul_pd(f2, p11);
        v1 = _mm_add_pd(v1, p10);     v2 = _mm_add_pd(v2, p10);
        v1 = _mm_mul_pd(v1, f1);      v2 = _mm_mul_pd(v2, f2);
        v1 = _mm_add_pd(v1, p9);      v2 = _mm_add_pd(v2, p9);
        v1 = _mm_mul_pd(v1, f1);      v2 = _mm_mul_pd(v2, f2);
        v1 = _mm_add_pd(v1, p8);      v2 = _mm_add_pd(v2, p8);
        v1 = _mm_mul_pd(v1, f1);      v2 = _mm_mul_pd(v2, f2);
        v1 = _mm_add_pd(v1, p7);      v2 = _mm_add_pd(v2, p7);
        v1 = _mm_mul_pd(v1, f1);      v2 = _mm_mul_pd(v2, f2);
        v1 = _mm_add_pd(v1, p6);      v2 = _mm_add_pd(v2, p6);
        v1 = _mm_mul_pd(v1, f1);      v2 = _mm_mul_pd(v2, f2);
        v1 = _mm_add_pd(v1, p5);      v2 = _mm_add_pd(v2, p5);
        v1 = _mm_mul_pd(v1, f1);      v2 = _mm_mul_pd(v2, f2);
        v1 = _mm_add_pd(v1, p4);      v2 = _mm_add_pd(v2, p4);
        v1 = _mm_mul_pd(v1, f1);      v2 = _mm_mul_pd(v2, f2);
        v1 = _mm_add_pd(v1, p3);      v2 = _mm_add_pd(v2, p3);
        v1 = _mm_mul_pd(v1, f1);      v2 = _mm_mul_pd(v2, f2);
        v1 = _mm_add_pd(v1, p2);      v2 = _mm_add_pd(v2, p2);
        v1 = _mm_mul_pd(v1, f1);      v2 = _mm_mul_pd(v2, f2);
        v1 = _mm_add_pd(v1, p1);      v2 = _mm_add_pd(v2, p1);
        v1 = _mm_mul_pd(v1, f1);      v2 = _mm_mul_pd(v2, f2);
        v1 = _mm_add_pd(v1, p0);      v2 = _mm_add_pd(v2, p0);
        // Evaluation of 2^k using bitops to achieve exact computation
        k1 = _mm_slli_epi32(k1, 20);       k2 = _mm_slli_epi32(k2, 20);
        k1 = _mm_shuffle_epi32(k1, 0x72);  k2 = _mm_shuffle_epi32(k2, 0x72);
        k1 = _mm_add_epi32(k1, vl);        k2 = _mm_add_epi32(k2, vl);
        w1 = _mm_castsi128_pd(k1);         w2 = _mm_castsi128_pd(k2);
        // Return to full range to subtract <a>
        v1 = _mm_mul_pd(v1, w1);      v2 = _mm_mul_pd(v2, w2);
        v1 = _mm_sub_pd(v1, va);      v2 = _mm_sub_pd(v2, va);
        // Finally apply infinity and NaN where needed
        v1 = _mm_or_pd(_mm_and_pd(mi1, inf), _mm_andnot_pd(mi1, v1));
        v2 = _mm_or_pd(_mm_and_pd(mi2, inf), _mm_andnot_pd(mi2, v2));
        v1 = _mm_or_pd(_mm_and_pd(mn1, nan), _mm_andnot_pd(mn1, v1));
        v2 = _mm_or_pd(_mm_and_pd(mn2, nan), _mm_andnot_pd(mn2, v2));
        // Store the results
        _mm_store_pd(r + n,     v1);
        _mm_store_pd(r + n + 2, v2);
    }
#else
    for (uint64_t n = 0; n < N; n++)
        r[n] = exp(x[n]) - a;
#endif
}
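/*
 * A usage sketch for xvm_expma (our example, not Wapiti code). The SSE2 path
 * requires 16-byte-aligned buffers, and the vector loop above steps four
 * elements at a time, so N is kept a multiple of 4 here.
 */
#include <stdio.h>
#include <stdlib.h>

int main(void) {
    enum { N = 8 };
    double *x = aligned_alloc(16, N * sizeof *x);
    double *r = aligned_alloc(16, N * sizeof *r);
    for (int i = 0; i < N; i++)
        x[i] = 0.25 * i;
    xvm_expma(r, x, 1.0, N);    /* r[i] = e^x[i] - 1 */
    printf("%f\n", r[4]);       /* x[4] = 1.0, prints ~1.718282 */
    free(x);
    free(r);
    return 0;
}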
mlib_status F_NAME( mlib_d64 *dst, const mlib_d64 *src, mlib_s32 dlb, mlib_s32 slb, mlib_s32 wid, mlib_s32 hgt) { mlib_u8 *buff, *buff1; mlib_u8 *sl, *sp0, *sp1, *sp2, *sp3, *dl; __m128d *dp0, *dp1; __m128d aa, bb, c0, c1, c2, cc, d0, d1, d2, dd, r0, r1, t0, t1; __m128d e_mask; mlib_s32 i, j, wid16, tail; wid = (wid - 2) * SSIZE; wid16 = (wid + 15) & ~15; buff = __mlib_malloc(2 * wid16); buff1 = buff + wid16; sl = (mlib_u8 *)src; /* dst ptrs skip top j and left col */ dl = (mlib_u8 *)dst + dlb + SSIZE; tail = wid & 15; ((mlib_d64 *)&e_mask)[0] = ((mlib_d64 *)((__m128d *) mlib_mask128i_arr + tail))[0]; ((mlib_d64 *)&e_mask)[1] = ((mlib_d64 *)((__m128d *) mlib_mask128i_arr + tail))[1]; sp0 = buff; sp1 = buff1; sp2 = sl; sp3 = sp2 + slb; sl += 2 * slb; for (i = 0; i < wid; i += 16) { c0 = _mm_loadu_pd((mlib_d64 *)sp2); c1 = _mm_loadu_pd((mlib_d64 *)(sp2 + SSIZE)); c2 = _mm_loadu_pd((mlib_d64 *)(sp2 + 2 * SSIZE)); d0 = _mm_loadu_pd((mlib_d64 *)sp3); d1 = _mm_loadu_pd((mlib_d64 *)(sp3 + SSIZE)); d2 = _mm_loadu_pd((mlib_d64 *)(sp3 + 2 * SSIZE)); cc = C_COMP(c0, c1); dd = C_COMP(d0, d1); cc = C_COMP(cc, c2); dd = C_COMP(dd, d2); _mm_storeu_pd((mlib_d64 *)sp0, cc); _mm_storeu_pd((mlib_d64 *)sp1, dd); sp0 += 16; sp1 += 16; sp2 += 16; sp3 += 16; } for (j = 0; j <= (hgt - 2 - 2); j += 2) { dp0 = (void *)dl; dp1 = (void *)(dl + dlb); sp0 = buff; sp1 = buff1; sp2 = sl; sp3 = sp2 + slb; /* * line0: aa * line1: bb * line2: c0 c1 c2 * line3: d0 d1 d2 */ for (i = 0; i <= wid - 16; i += 16) { aa = _mm_loadu_pd((mlib_d64 *)sp0); bb = _mm_loadu_pd((mlib_d64 *)sp1); c0 = _mm_loadu_pd((mlib_d64 *)sp2); c1 = _mm_loadu_pd((mlib_d64 *)(sp2 + SSIZE)); c2 = _mm_loadu_pd((mlib_d64 *)(sp2 + 2 * SSIZE)); d0 = _mm_loadu_pd((mlib_d64 *)sp3); d1 = _mm_loadu_pd((mlib_d64 *)(sp3 + SSIZE)); d2 = _mm_loadu_pd((mlib_d64 *)(sp3 + 2 * SSIZE)); cc = C_COMP(c0, c1); dd = C_COMP(d0, d1); cc = C_COMP(cc, c2); dd = C_COMP(dd, d2); bb = C_COMP(bb, cc); r0 = C_COMP(aa, bb); r1 = C_COMP(bb, dd); _mm_storeu_pd((mlib_d64 *)sp0, cc); _mm_storeu_pd((mlib_d64 *)sp1, dd); _mm_storeu_pd((mlib_d64 *)dp0, r0); dp0++; _mm_storeu_pd((mlib_d64 *)dp1, r1); dp1++; sp0 += 16; sp1 += 16; sp2 += 16; sp3 += 16; } if (tail) { aa = _mm_loadu_pd((mlib_d64 *)sp0); bb = _mm_loadu_pd((mlib_d64 *)sp1); c0 = _mm_loadu_pd((mlib_d64 *)sp2); c1 = _mm_loadu_pd((mlib_d64 *)(sp2 + SSIZE)); c2 = _mm_loadu_pd((mlib_d64 *)(sp2 + 2 * SSIZE)); d0 = _mm_loadu_pd((mlib_d64 *)sp3); d1 = _mm_loadu_pd((mlib_d64 *)(sp3 + SSIZE)); d2 = _mm_loadu_pd((mlib_d64 *)(sp3 + 2 * SSIZE)); cc = C_COMP(c0, c1); dd = C_COMP(d0, d1); cc = C_COMP(cc, c2); dd = C_COMP(dd, d2); bb = C_COMP(bb, cc); r0 = C_COMP(aa, bb); r1 = C_COMP(bb, dd); _mm_storeu_pd((mlib_d64 *)sp0, cc); _mm_storeu_pd((mlib_d64 *)sp1, dd); t0 = _mm_loadu_pd((mlib_d64 *)dp0); t1 = _mm_loadu_pd((mlib_d64 *)dp1); t0 = _mm_or_pd(_mm_and_pd(e_mask, r0), _mm_andnot_pd(e_mask, t0)); t1 = _mm_or_pd(_mm_and_pd(e_mask, r1), _mm_andnot_pd(e_mask, t1)); _mm_storeu_pd((mlib_d64 *)dp0, t0); _mm_storeu_pd((mlib_d64 *)dp1, t1); } sl += 2 * slb; dl += 2 * dlb; } /* last line */ if (j == (hgt - 3)) { dp0 = (void *)dl; dp1 = (void *)(dl + dlb); sp0 = buff; sp1 = buff1; sp2 = sl; for (i = 0; i <= wid - 16; i += 16) { aa = _mm_loadu_pd((mlib_d64 *)sp0); bb = _mm_loadu_pd((mlib_d64 *)sp1); c0 = _mm_loadu_pd((mlib_d64 *)sp2); c1 = _mm_loadu_pd((mlib_d64 *)(sp2 + SSIZE)); c2 = _mm_loadu_pd((mlib_d64 *)(sp2 + 2 * SSIZE)); cc = C_COMP(c0, c1); cc = C_COMP(cc, c2); r0 = C_COMP(aa, bb); r0 = C_COMP(r0, cc); _mm_storeu_pd((mlib_d64 *)dp0, r0); dp0++; 
sp0 += 16; sp1 += 16; sp2 += 16; } if (tail) { aa = _mm_loadu_pd((mlib_d64 *)sp0); bb = _mm_loadu_pd((mlib_d64 *)sp1); c0 = _mm_loadu_pd((mlib_d64 *)sp2); c1 = _mm_loadu_pd((mlib_d64 *)(sp2 + SSIZE)); c2 = _mm_loadu_pd((mlib_d64 *)(sp2 + 2 * SSIZE)); c1 = C_COMP(c0, c1); cc = C_COMP(c1, c2); r0 = C_COMP(aa, bb); r0 = C_COMP(r0, cc); t0 = _mm_loadu_pd((mlib_d64 *)dp0); t0 = _mm_or_pd(_mm_and_pd(e_mask, r0), _mm_andnot_pd(e_mask, t0)); _mm_storeu_pd((mlib_d64 *)dp0, t0); } } __mlib_free(buff); return (MLIB_SUCCESS); }
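/*
 * The tail handling in the kernel above is a masked read-modify-write. A
 * standalone sketch of the same merge (our code, not mediaLib's): lanes
 * where the mask is all-ones receive the new value, the remaining lanes keep
 * the old destination, so nothing past the row end is disturbed. C_COMP
 * itself is defined per operation elsewhere in mediaLib — plausibly
 * _mm_min_pd for erosion and _mm_max_pd for dilation, though that is our
 * guess, not the library text.
 */
#include <emmintrin.h>

static inline __m128d masked_store_merge_pd(__m128d old_dst, __m128d newval,
                                            __m128d mask) {
    return _mm_or_pd(_mm_and_pd(mask, newval), _mm_andnot_pd(mask, old_dst));
}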
mlib_status F_NAME( mlib_d64 *dst, const mlib_d64 *src, mlib_s32 dlb, mlib_s32 slb, mlib_s32 wid, mlib_s32 hgt) { mlib_u8 *pbuff, *buff0, *buff1, *buff2, *buff3, *buff4, *buff5, *buffT; mlib_u8 *sl, *sp0, *sp1, *sp2, *sp3, *sp4, *sp5, *sp6, *sp7, *dl; __m128d *dp0, *dp1; __m128d aa, bb, cc, dd, ee, ff, r0, r1, t0, t1; __m128d g0, g1, g2, g3, g4, g5, g6, gg; __m128d h0, h1, h2, h3, h4, h5, h6, hh; __m128d e_mask; mlib_s32 i, j, wid16, tail; wid = (wid - KSIZE1) * SSIZE; wid16 = (wid + 15) & ~15; pbuff = __mlib_malloc(KSIZE1 * wid16); buff0 = pbuff; buff1 = buff0 + wid16; buff2 = buff1 + wid16; buff3 = buff2 + wid16; buff4 = buff3 + wid16; buff5 = buff4 + wid16; sl = (mlib_u8 *)src; dl = (mlib_u8 *)dst + (KSIZE1 / 2) * (dlb + SSIZE); tail = wid & 15; ((mlib_d64 *)&e_mask)[0] = ((mlib_d64 *)((__m128d *) mlib_mask128i_arr + tail))[0]; ((mlib_d64 *)&e_mask)[1] = ((mlib_d64 *)((__m128d *) mlib_mask128i_arr + tail))[1]; for (j = 0; j < 3; j++) { sp0 = buff4; sp1 = buff5; sp6 = sl; sp7 = sl + slb; sl += 2 * slb; for (i = 0; i < wid; i += 16) { g0 = _mm_loadu_pd((mlib_d64 *)sp6); g1 = _mm_loadu_pd((mlib_d64 *)(sp6 + SSIZE)); g2 = _mm_loadu_pd((mlib_d64 *)(sp6 + 2 * SSIZE)); g3 = _mm_loadu_pd((mlib_d64 *)(sp6 + 3 * SSIZE)); g4 = _mm_loadu_pd((mlib_d64 *)(sp6 + 4 * SSIZE)); g5 = _mm_loadu_pd((mlib_d64 *)(sp6 + 5 * SSIZE)); g6 = _mm_loadu_pd((mlib_d64 *)(sp6 + 6 * SSIZE)); h0 = _mm_loadu_pd((mlib_d64 *)sp7); h1 = _mm_loadu_pd((mlib_d64 *)(sp7 + SSIZE)); h2 = _mm_loadu_pd((mlib_d64 *)(sp7 + 2 * SSIZE)); h3 = _mm_loadu_pd((mlib_d64 *)(sp7 + 3 * SSIZE)); h4 = _mm_loadu_pd((mlib_d64 *)(sp7 + 4 * SSIZE)); h5 = _mm_loadu_pd((mlib_d64 *)(sp7 + 5 * SSIZE)); h6 = _mm_loadu_pd((mlib_d64 *)(sp7 + 6 * SSIZE)); gg = C_COMP(g0, g1); hh = C_COMP(h0, h1); g2 = C_COMP(g2, g3); h2 = C_COMP(h2, h3); g4 = C_COMP(g4, g5); h4 = C_COMP(h4, h5); gg = C_COMP(gg, g2); hh = C_COMP(hh, h2); gg = C_COMP(gg, g4); hh = C_COMP(hh, h4); gg = C_COMP(gg, g6); hh = C_COMP(hh, h6); _mm_storeu_pd((mlib_d64 *)sp0, gg); _mm_storeu_pd((mlib_d64 *)sp1, hh); sp0 += 16; sp1 += 16; sp6 += 16; sp7 += 16; } if (j < 2) { buffT = buff0; buff0 = buff2; buff2 = buff4; buff4 = buffT; buffT = buff1; buff1 = buff3; buff3 = buff5; buff5 = buffT; } } for (j = 0; j <= (hgt - KSIZE1 - 2); j += 2) { dp0 = (void *)dl; dp1 = (void *)(dl + dlb); sp0 = buff0; sp1 = buff1; sp2 = buff2; sp3 = buff3; sp4 = buff4; sp5 = buff5; sp6 = sl; sp7 = sl + slb; /* * line0: aa * line1: bb * line2: cc * line3: dd * line4: ee * line5: ff * line4: g0 g1 g2 g3 g4 g5 g6 * line5: h0 h1 h2 h3 h4 h5 h6 */ for (i = 0; i <= wid - 16; i += 16) { g0 = _mm_loadu_pd((mlib_d64 *)sp6); g1 = _mm_loadu_pd((mlib_d64 *)(sp6 + SSIZE)); g2 = _mm_loadu_pd((mlib_d64 *)(sp6 + 2 * SSIZE)); g3 = _mm_loadu_pd((mlib_d64 *)(sp6 + 3 * SSIZE)); g4 = _mm_loadu_pd((mlib_d64 *)(sp6 + 4 * SSIZE)); g5 = _mm_loadu_pd((mlib_d64 *)(sp6 + 5 * SSIZE)); g6 = _mm_loadu_pd((mlib_d64 *)(sp6 + 6 * SSIZE)); h0 = _mm_loadu_pd((mlib_d64 *)sp7); h1 = _mm_loadu_pd((mlib_d64 *)(sp7 + SSIZE)); h2 = _mm_loadu_pd((mlib_d64 *)(sp7 + 2 * SSIZE)); h3 = _mm_loadu_pd((mlib_d64 *)(sp7 + 3 * SSIZE)); h4 = _mm_loadu_pd((mlib_d64 *)(sp7 + 4 * SSIZE)); h5 = _mm_loadu_pd((mlib_d64 *)(sp7 + 5 * SSIZE)); h6 = _mm_loadu_pd((mlib_d64 *)(sp7 + 6 * SSIZE)); gg = C_COMP(g0, g1); hh = C_COMP(h0, h1); g2 = C_COMP(g2, g3); h2 = C_COMP(h2, h3); g4 = C_COMP(g4, g5); h4 = C_COMP(h4, h5); gg = C_COMP(gg, g2); hh = C_COMP(hh, h2); gg = C_COMP(gg, g4); hh = C_COMP(hh, h4); gg = C_COMP(gg, g6); hh = C_COMP(hh, h6); aa = _mm_loadu_pd((mlib_d64 *)sp0); bb = 
_mm_loadu_pd((mlib_d64 *)sp1); cc = _mm_loadu_pd((mlib_d64 *)sp2); dd = _mm_loadu_pd((mlib_d64 *)sp3); ee = _mm_loadu_pd((mlib_d64 *)sp4); ff = _mm_loadu_pd((mlib_d64 *)sp5); bb = C_COMP(bb, cc); dd = C_COMP(dd, ee); ff = C_COMP(ff, gg); bb = C_COMP(bb, dd); bb = C_COMP(bb, ff); r0 = C_COMP(aa, bb); r1 = C_COMP(bb, hh); _mm_storeu_pd((mlib_d64 *)sp0, gg); _mm_storeu_pd((mlib_d64 *)sp1, hh); _mm_storeu_pd((mlib_d64 *)dp0, r0); dp0++; _mm_storeu_pd((mlib_d64 *)dp1, r1); dp1++; sp0 += 16; sp1 += 16; sp2 += 16; sp3 += 16; sp4 += 16; sp5 += 16; sp6 += 16; sp7 += 16; } if (tail) { g0 = _mm_loadu_pd((mlib_d64 *)sp6); g1 = _mm_loadu_pd((mlib_d64 *)(sp6 + SSIZE)); g2 = _mm_loadu_pd((mlib_d64 *)(sp6 + 2 * SSIZE)); g3 = _mm_loadu_pd((mlib_d64 *)(sp6 + 3 * SSIZE)); g4 = _mm_loadu_pd((mlib_d64 *)(sp6 + 4 * SSIZE)); g5 = _mm_loadu_pd((mlib_d64 *)(sp6 + 5 * SSIZE)); g6 = _mm_loadu_pd((mlib_d64 *)(sp6 + 6 * SSIZE)); h0 = _mm_loadu_pd((mlib_d64 *)sp7); h1 = _mm_loadu_pd((mlib_d64 *)(sp7 + SSIZE)); h2 = _mm_loadu_pd((mlib_d64 *)(sp7 + 2 * SSIZE)); h3 = _mm_loadu_pd((mlib_d64 *)(sp7 + 3 * SSIZE)); h4 = _mm_loadu_pd((mlib_d64 *)(sp7 + 4 * SSIZE)); h5 = _mm_loadu_pd((mlib_d64 *)(sp7 + 5 * SSIZE)); h6 = _mm_loadu_pd((mlib_d64 *)(sp7 + 6 * SSIZE)); gg = C_COMP(g0, g1); hh = C_COMP(h0, h1); g2 = C_COMP(g2, g3); h2 = C_COMP(h2, h3); g4 = C_COMP(g4, g5); h4 = C_COMP(h4, h5); gg = C_COMP(gg, g2); hh = C_COMP(hh, h2); gg = C_COMP(gg, g4); hh = C_COMP(hh, h4); gg = C_COMP(gg, g6); hh = C_COMP(hh, h6); aa = _mm_loadu_pd((mlib_d64 *)sp0); bb = _mm_loadu_pd((mlib_d64 *)sp1); cc = _mm_loadu_pd((mlib_d64 *)sp2); dd = _mm_loadu_pd((mlib_d64 *)sp3); ee = _mm_loadu_pd((mlib_d64 *)sp4); ff = _mm_loadu_pd((mlib_d64 *)sp5); bb = C_COMP(bb, cc); dd = C_COMP(dd, ee); ff = C_COMP(ff, gg); bb = C_COMP(bb, dd); bb = C_COMP(bb, ff); r0 = C_COMP(aa, bb); r1 = C_COMP(bb, hh); _mm_storeu_pd((mlib_d64 *)sp0, gg); _mm_storeu_pd((mlib_d64 *)sp1, hh); t0 = _mm_loadu_pd((mlib_d64 *)dp0); t1 = _mm_loadu_pd((mlib_d64 *)dp1); t0 = _mm_or_pd(_mm_and_pd(e_mask, r0), _mm_andnot_pd(e_mask, t0)); t1 = _mm_or_pd(_mm_and_pd(e_mask, r1), _mm_andnot_pd(e_mask, t1)); _mm_storeu_pd((mlib_d64 *)dp0, t0); _mm_storeu_pd((mlib_d64 *)dp1, t1); } buffT = buff0; buff0 = buff2; buff2 = buff4; buff4 = buffT; buffT = buff1; buff1 = buff3; buff3 = buff5; buff5 = buffT; sl += 2 * slb; dl += 2 * dlb; } /* last line */ if (j == (hgt - KSIZE1 - 1)) { dp0 = (void *)dl; dp1 = (void *)(dl + dlb); sp0 = buff0; sp1 = buff1; sp2 = buff2; sp3 = buff3; sp4 = buff4; sp5 = buff5; sp6 = sl; for (i = 0; i <= wid - 16; i += 16) { g0 = _mm_loadu_pd((mlib_d64 *)sp6); g1 = _mm_loadu_pd((mlib_d64 *)(sp6 + SSIZE)); g2 = _mm_loadu_pd((mlib_d64 *)(sp6 + 2 * SSIZE)); g3 = _mm_loadu_pd((mlib_d64 *)(sp6 + 3 * SSIZE)); g4 = _mm_loadu_pd((mlib_d64 *)(sp6 + 4 * SSIZE)); g5 = _mm_loadu_pd((mlib_d64 *)(sp6 + 5 * SSIZE)); g6 = _mm_loadu_pd((mlib_d64 *)(sp6 + 6 * SSIZE)); gg = C_COMP(g0, g1); g2 = C_COMP(g2, g3); g4 = C_COMP(g4, g5); gg = C_COMP(gg, g2); gg = C_COMP(gg, g4); gg = C_COMP(gg, g6); aa = _mm_loadu_pd((mlib_d64 *)sp0); bb = _mm_loadu_pd((mlib_d64 *)sp1); cc = _mm_loadu_pd((mlib_d64 *)sp2); dd = _mm_loadu_pd((mlib_d64 *)sp3); ee = _mm_loadu_pd((mlib_d64 *)sp4); ff = _mm_loadu_pd((mlib_d64 *)sp5); bb = C_COMP(bb, cc); dd = C_COMP(dd, ee); ff = C_COMP(ff, gg); bb = C_COMP(bb, dd); bb = C_COMP(bb, ff); r0 = C_COMP(aa, bb); _mm_storeu_pd((mlib_d64 *)dp0, r0); dp0++; sp0 += 16; sp1 += 16; sp2 += 16; sp3 += 16; sp4 += 16; sp5 += 16; sp6 += 16; } if (tail) { g0 = _mm_loadu_pd((mlib_d64 *)sp6); g1 
= _mm_loadu_pd((mlib_d64 *)(sp6 + SSIZE)); g2 = _mm_loadu_pd((mlib_d64 *)(sp6 + 2 * SSIZE)); g3 = _mm_loadu_pd((mlib_d64 *)(sp6 + 3 * SSIZE)); g4 = _mm_loadu_pd((mlib_d64 *)(sp6 + 4 * SSIZE)); g5 = _mm_loadu_pd((mlib_d64 *)(sp6 + 5 * SSIZE)); g6 = _mm_loadu_pd((mlib_d64 *)(sp6 + 6 * SSIZE)); gg = C_COMP(g0, g1); g2 = C_COMP(g2, g3); g4 = C_COMP(g4, g5); gg = C_COMP(gg, g2); gg = C_COMP(gg, g4); gg = C_COMP(gg, g6); aa = _mm_loadu_pd((mlib_d64 *)sp0); bb = _mm_loadu_pd((mlib_d64 *)sp1); cc = _mm_loadu_pd((mlib_d64 *)sp2); dd = _mm_loadu_pd((mlib_d64 *)sp3); ee = _mm_loadu_pd((mlib_d64 *)sp4); ff = _mm_loadu_pd((mlib_d64 *)sp5); bb = C_COMP(bb, cc); dd = C_COMP(dd, ee); ff = C_COMP(ff, gg); bb = C_COMP(bb, dd); bb = C_COMP(bb, ff); r0 = C_COMP(aa, bb); t0 = _mm_loadu_pd((mlib_d64 *)dp0); t0 = _mm_or_pd(_mm_and_pd(e_mask, r0), _mm_andnot_pd(e_mask, t0)); _mm_storeu_pd((mlib_d64 *)dp0, t0); } } __mlib_free(pbuff); return (MLIB_SUCCESS); }
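/*
 * The buffT swaps in the function above implement a rolling window of six
 * pre-reduced row buffers: after each pair of output rows, the oldest two
 * buffers are recycled to receive the next two reduced rows. A minimal
 * sketch of that rotation (our illustration):
 */
static void rotate6(unsigned char **b0, unsigned char **b1,
                    unsigned char **b2, unsigned char **b3,
                    unsigned char **b4, unsigned char **b5) {
    unsigned char *t;
    t = *b0; *b0 = *b2; *b2 = *b4; *b4 = t;   /* even-row buffers */
    t = *b1; *b1 = *b3; *b3 = *b5; *b5 = t;   /* odd-row buffers  */
}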
inline F64vec2 mask_and(const F64vec2 &l, const F64vec2 &r) { return _mm_and_pd(l, r); }
double bst_compute_121_m128_aligned4(void *_bst_obj, double *p, double *q,
                                     size_t nn) {
    segments_t *mem = (segments_t *)_bst_obj;
    int n, i, r, l_end, l_end_pre, j;
    double t, e_tmp;
    double *e = mem->e, *w = mem->w;
    int *root = mem->r;
    __m128d v_tmp;
    __m128d v00, v01, v02, v03;
    __m128d v10, v11, v12, v13;
    __m128i v_cur_roots, v_old_roots, v_new_roots;
    __m128  v_rootmask;

    // initialization
    // mem->n = nn;
    n = nn;  // subtractions with n potentially negative. say hello to all the bugs

    int idx1, idx2, idx3, pad, pad_r;
    idx1 = (n + 1) * (n + 2) / 2 + n / 2;
    e[idx1] = q[n];
    idx1++;
    pad = 1;
    // pad contains the padding for row i+1; for row n it's always 1
    for (i = n - 1; i >= 0; --i) {
        idx1 -= 2 * (n - i) + 1 + pad;
        idx2 = idx1 + 1;
        e[idx1] = q[i];
        w[idx1] = q[i];
        for (j = i + 1; j < n + 1; ++j, ++idx2) {
            e[idx2] = INFINITY;
            w[idx2] = w[idx2 - 1] + p[j - 1] + q[j];
        }
        // idx2 now points to the beginning of the next line.
        idx2 += pad;  // padding of line i+1

        idx3 = idx1;
        pad_r = pad;  // padding of line r
        for (r = i; r < n; ++r) {
            pad_r = !pad_r;  // padding of line r+1
            // idx2 = IDX(r+1, r+1);
            idx1 = idx3;
            l_end = idx2 + (n - r);
            e_tmp = e[idx1++];
            // scalar prologue until a multiple of 4 doubles is left
            // (4 = 2 x 2 doubles per 128-bit vector)
            l_end_pre = idx2 + ((n - r) & 3);
            for (; (idx2 < l_end_pre) && (idx2 < l_end); ++idx2) {
                t = e_tmp + e[idx2] + w[idx1];
                if (t < e[idx1]) {
                    e[idx1] = t;
                    root[idx1] = r;
                }
                idx1++;
            }

            v_tmp = _mm_set_pd(e_tmp, e_tmp);
            // vectorized main loop: four doubles per iteration,
            // processed as two 2-wide vectors
            v_cur_roots = _mm_set_epi32(r, r, r, r);
            for (; idx2 < l_end; idx2 += 4) {
                v01 = _mm_load_pd(&w[idx1]);
                v11 = _mm_load_pd(&w[idx1 + 2]);
                v00 = _mm_load_pd(&e[idx2]);
                v01 = _mm_add_pd(v01, v_tmp);  // suboptimal: RAW dependency
                v10 = _mm_load_pd(&e[idx2 + 2]);
                v11 = _mm_add_pd(v11, v_tmp);
                v01 = _mm_add_pd(v01, v00);
                v03 = _mm_load_pd(&e[idx1]);
                v11 = _mm_add_pd(v11, v10);
                v13 = _mm_load_pd(&e[idx1 + 2]);
                v02 = _mm_cmplt_pd(v01, v03);
                v12 = _mm_cmplt_pd(v11, v13);
                v00 = _mm_or_pd(_mm_and_pd(v02, v01),
                                _mm_andnot_pd(v02, v03));
                v10 = _mm_or_pd(_mm_and_pd(v12, v11),
                                _mm_andnot_pd(v12, v13));
                _mm_store_pd(&e[idx1], v00);
                _mm_store_pd(&e[idx1 + 2], v10);

                // pack the two 64-bit compare masks into four 32-bit lanes,
                // keeping lane order (lo, hi, lo, hi)
                v_rootmask = _mm_shuffle_ps(_mm_castpd_ps(v02),
                                            _mm_castpd_ps(v12),
                                            _MM_SHUFFLE(2, 0, 2, 0));

                v_old_roots = _mm_lddqu_si128((__m128i const *)&root[idx1]);
                // select: new root index where the mask is set, old otherwise
                v_new_roots = _mm_or_si128(
                    _mm_and_si128(v_cur_roots, _mm_castps_si128(v_rootmask)),
                    _mm_andnot_si128(_mm_castps_si128(v_rootmask),
                                     v_old_roots));
                _mm_storeu_si128((__m128i *)&root[idx1], v_new_roots);

                idx1 += 4;
            }

            idx2 += pad_r;
            idx3++;
        }
        pad = !pad;  // every other line has padding 0 or 1, respectively
    }

    // if n is even, the total number of entries in the first
    // row of the table is odd, so we need padding
    return e[n + !(n & 1)];
}
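/*
 * The core of the inner loop above is a fused min / arg-min update. A
 * self-contained sketch of the same idiom (names are ours): update two best
 * values with a branchless masked select, then use _mm_movemask_pd() on the
 * comparison mask to patch the corresponding 32-bit indices. The kernel
 * above does the index update fully vectorized via a shuffled 32-bit mask.
 */
#include <emmintrin.h>

static inline void min_argmin_update(double *best, int *best_idx,
                                     __m128d cand, int cand_idx) {
    __m128d old  = _mm_loadu_pd(best);
    __m128d lt   = _mm_cmplt_pd(cand, old);   /* all-ones where cand < old */
    __m128d newv = _mm_or_pd(_mm_and_pd(lt, cand), _mm_andnot_pd(lt, old));
    _mm_storeu_pd(best, newv);

    int mask = _mm_movemask_pd(lt);           /* one bit per lane */
    if (mask & 1) best_idx[0] = cand_idx;
    if (mask & 2) best_idx[1] = cand_idx;
}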
// The input must be in domain [-1686629712, 1686629712].
//
// I tried to optimize the double to int conversion by using `magic`, but
// it was actually slower than using `_mm_cvttpd_epi32()` and it didn't
// offer a greater domain for `x`.
static SIMD_INLINE __m128d sin_cephes_pd(__m128d x) {
  SIMD_CONST_SQ(sign     , SIMD_UINT64_C(0x8000000000000000));
  SIMD_CONST_SQ(inv_sign , SIMD_UINT64_C(0x7FFFFFFFFFFFFFFF));
  SIMD_CONST_SI(int32_one, 1);
  SIMD_CONST_SD(4_DIV_PI , 1.27323954473516268615107010698);
  SIMD_CONST_SD(DP1      , 7.85398125648498535156e-1);
  SIMD_CONST_SD(DP2      , 3.77489470793079817668e-8);
  SIMD_CONST_SD(DP3      , 2.69515142907905952645e-15);

#define DEFINE_DATA(name, x0, x1, x2, x3, x4, x5, xm, xa, y0, y1, y2, y3, y4, y5, ym, ya) \
  SIMD_ALIGN_VAR(static const double, name[], 16) = { \
    x0, x0, x1, x1, x2, x2, x3, x3, x4, x4, x5, x5, xm, xm, xa, xa, \
    y0, x0, y1, x1, y2, x2, y3, x3, y4, x4, y5, x5, ym, xm, ya, xa, \
    x0, y0, x1, y1, x2, y2, x3, y3, x4, y4, x5, y5, xm, ym, xa, ya, \
    y0, y0, y1, y1, y2, y2, y3, y3, y4, y4, y5, y5, ym, ym, ya, ya  \
  }

  DEFINE_DATA(sincos_coeff,
    1.58962301576546568060e-10,-2.50507477628578072866e-8,
    2.75573136213857245213e-6 ,-1.98412698295895385996e-4,
    8.33333333332211858878e-3 ,-1.66666666666666307295e-1, 1.0, 0.0,

   -1.13585365213876817300e-11, 2.08757008419747316778e-9,
   -2.75573141792967388112e-7 , 2.48015872888517045348e-5,
   -1.38888888888730564116e-3 , 4.16666666666665929218e-2,-0.5, 1.0);

  __m128d y;
  __m128d sign = x;                              // Sign bit.

  x = _mm_and_pd(x, SIMD_GET_PD(inv_sign));      // Take the absolute value.
  y = _mm_mul_pd(x, SIMD_GET_PD(4_DIV_PI));      // Integer part of `x * 4 / PI`.

  __m128i ival = _mm_cvttpd_epi32(y);            // Extract the integer part of y.
  __m128i ione = SIMD_GET_PI(int32_one);

  ival = _mm_add_epi32(ival, ione);              // j += 1.
  ival = _mm_andnot_si128(ione, ival);           // j &= ~1.

  y = _mm_cvtepi32_pd(ival);
  ival = _mm_unpacklo_epi32(ival, ival);

  sign = _mm_xor_pd(sign,                        // Swap the sign bit if `j & 4`.
    _mm_castsi128_pd(_mm_slli_epi64(ival, 61)));
  sign = _mm_and_pd(sign, SIMD_GET_PD(sign));    // Keep only the sign bit.

  // Get the polynomial selection mask (j & 2):
  //   1. `0x0000000000000000` => `0 <= x <= PI/4`
  //   2. `0xFFFFFFFFFFFFFFFF` => `PI/4 < x <= PI/2`
  ival = _mm_slli_epi32(ival, 30);
  ival = _mm_srai_epi32(ival, 31);

  // Extended precision modular arithmetic:
  //   x = ((x - y * DP1) - y * DP2) - y * DP3
  x = _mm_sub_pd(x, _mm_mul_pd(y, SIMD_GET_PD(DP1)));
  x = _mm_sub_pd(x, _mm_mul_pd(y, SIMD_GET_PD(DP2)));
  x = _mm_sub_pd(x, _mm_mul_pd(y, SIMD_GET_PD(DP3)));

  // Get the polynomial coefficients for each lane (sin/cos).
  __m128d poly_mask = _mm_castsi128_pd(ival);
  const __m128d* coeff = reinterpret_cast<const __m128d*>(sincos_coeff) +
    static_cast<uintptr_t>(_mm_movemask_pd(poly_mask)) * 8;

  __m128d xx = _mm_mul_pd(x, x);
  y = coeff[0];
  y = Simd128::mad(y, xx, coeff[1]);
  y = Simd128::mad(y, xx, coeff[2]);
  y = Simd128::mad(y, xx, coeff[3]);
  y = Simd128::mad(y, xx, coeff[4]);
  y = Simd128::mad(y, xx, coeff[5]);
  y = _mm_mul_pd(y, xx);

  __m128d x_or_xx = _mm_or_pd(
    _mm_and_pd(xx, poly_mask),
    _mm_andnot_pd(poly_mask, x));

  y = _mm_mul_pd(y, x_or_xx);
  y = _mm_add_pd(y, _mm_mul_pd(x_or_xx, coeff[6]));
  y = _mm_add_pd(y, coeff[7]);

  return _mm_xor_pd(y, sign);
}
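/*
 * Sketch of the coefficient-table selection trick used above (a standalone
 * illustration of ours, not the library's code): _mm_movemask_pd() collapses
 * the two 64-bit lane masks into a 2-bit index (0..3), which selects one of
 * four pre-interleaved coefficient rows, so each lane gets the sin or cos
 * polynomial without any branching.
 */
#include <emmintrin.h>
#include <stdio.h>

int main() {
  __m128d a    = _mm_set_pd(1.0, -1.0);   // lanes: [-1.0, 1.0]
  __m128d zero = _mm_setzero_pd();
  __m128d mask = _mm_cmplt_pd(a, zero);   // lane0 all-ones, lane1 zeros
  int idx = _mm_movemask_pd(mask);        // lane sign bits -> bits 0 and 1
  printf("row index = %d\n", idx);        // prints 1
  return 0;
}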
__m128d t1(void) { return _mm_and_pd (magic_a,magic_b); }
int calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top, const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md,int gb_algorithm) { int i,ai,k,n,ii,ii3,is3,nj0,nj1,at0,at1,offset; int jnrA,jnrB; int j3A,j3B; double shX,shY,shZ; double rr,rr_inv,rr_inv2,sum_tmp,sum,sum2,sum3,gbr; double sum_ai2, sum_ai3,tsum,tchain,doffset; double *obc_param; double *gb_radius; double *work; int * jjnr; double *dadx; double *shiftvec; double min_rad,rad; __m128d ix,iy,iz,jx,jy,jz; __m128d dx,dy,dz,t1,t2,t3,t4; __m128d rsq,rinv,r; __m128d rai,rai_inv,raj, raj_inv,rai_inv2,sk,sk2,lij,dlij,duij; __m128d uij,lij2,uij2,lij3,uij3,diff2; __m128d lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum; __m128d sum_ai, tmp_ai,sk_ai,sk_aj,sk2_ai,sk2_aj,sk2_rinv; __m128d dadx1,dadx2; __m128d logterm; __m128d mask; __m128d obc_mask1,obc_mask2,obc_mask3; __m128d oneeighth = _mm_set1_pd(0.125); __m128d onefourth = _mm_set1_pd(0.25); const __m128d half = _mm_set1_pd(0.5); const __m128d three = _mm_set1_pd(3.0); const __m128d one = _mm_set1_pd(1.0); const __m128d two = _mm_set1_pd(2.0); const __m128d zero = _mm_set1_pd(0.0); const __m128d neg = _mm_set1_pd(-1.0); /* Set the dielectric offset */ doffset = born->gb_doffset; gb_radius = born->gb_radius; obc_param = born->param; work = born->gpol_hct_work; jjnr = nl->jjnr; dadx = fr->dadx; shiftvec = fr->shift_vec[0]; jx = _mm_setzero_pd(); jy = _mm_setzero_pd(); jz = _mm_setzero_pd(); jnrA = jnrB = 0; for(i=0;i<born->nr;i++) { work[i] = 0; } for(i=0;i<nl->nri;i++) { ii = nl->iinr[i]; ii3 = ii*3; is3 = 3*nl->shift[i]; shX = shiftvec[is3]; shY = shiftvec[is3+1]; shZ = shiftvec[is3+2]; nj0 = nl->jindex[i]; nj1 = nl->jindex[i+1]; ix = _mm_set1_pd(shX+x[ii3+0]); iy = _mm_set1_pd(shY+x[ii3+1]); iz = _mm_set1_pd(shZ+x[ii3+2]); rai = _mm_load1_pd(gb_radius+ii); rai_inv= gmx_mm_inv_pd(rai); sum_ai = _mm_setzero_pd(); sk_ai = _mm_load1_pd(born->param+ii); sk2_ai = _mm_mul_pd(sk_ai,sk_ai); for(k=nj0;k<nj1-1;k+=2) { jnrA = jjnr[k]; jnrB = jjnr[k+1]; j3A = 3*jnrA; j3B = 3*jnrB; GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A,x+j3B,jx,jy,jz); GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA,gb_radius+jnrB,raj); GMX_MM_LOAD_2VALUES_PD(obc_param+jnrA,obc_param+jnrB,sk_aj); dx = _mm_sub_pd(ix, jx); dy = _mm_sub_pd(iy, jy); dz = _mm_sub_pd(iz, jz); rsq = gmx_mm_calc_rsq_pd(dx,dy,dz); rinv = gmx_mm_invsqrt_pd(rsq); r = _mm_mul_pd(rsq,rinv); /* Compute raj_inv aj1-4 */ raj_inv = gmx_mm_inv_pd(raj); /* Evaluate influence of atom aj -> ai */ t1 = _mm_add_pd(r,sk_aj); t2 = _mm_sub_pd(r,sk_aj); t3 = _mm_sub_pd(sk_aj,r); obc_mask1 = _mm_cmplt_pd(rai, t1); obc_mask2 = _mm_cmplt_pd(rai, t2); obc_mask3 = _mm_cmplt_pd(rai, t3); uij = gmx_mm_inv_pd(t1); lij = _mm_or_pd( _mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)), _mm_andnot_pd(obc_mask2,rai_inv)); dlij = _mm_and_pd(one,obc_mask2); uij2 = _mm_mul_pd(uij, uij); uij3 = _mm_mul_pd(uij2,uij); lij2 = _mm_mul_pd(lij, lij); lij3 = _mm_mul_pd(lij2,lij); diff2 = _mm_sub_pd(uij2,lij2); lij_inv = gmx_mm_invsqrt_pd(lij2); sk2_aj = _mm_mul_pd(sk_aj,sk_aj); sk2_rinv = _mm_mul_pd(sk2_aj,rinv); prod = _mm_mul_pd(onefourth,sk2_rinv); logterm = gmx_mm_log_pd(_mm_mul_pd(uij,lij_inv)); t1 = _mm_sub_pd(lij,uij); t2 = _mm_mul_pd(diff2, _mm_sub_pd(_mm_mul_pd(onefourth,r), prod)); t3 = _mm_mul_pd(half,_mm_mul_pd(rinv,logterm)); t1 = _mm_add_pd(t1,_mm_add_pd(t2,t3)); t4 = _mm_mul_pd(two,_mm_sub_pd(rai_inv,lij)); t4 = _mm_and_pd(t4,obc_mask3); t1 = _mm_mul_pd(half,_mm_add_pd(t1,t4)); sum_ai = _mm_add_pd(sum_ai, _mm_and_pd(t1,obc_mask1) ); t1 = 
_mm_add_pd(_mm_mul_pd(half,lij2), _mm_mul_pd(prod,lij3)); t1 = _mm_sub_pd(t1, _mm_mul_pd(onefourth, _mm_add_pd(_mm_mul_pd(lij,rinv), _mm_mul_pd(lij3,r)))); t2 = _mm_mul_pd(onefourth, _mm_add_pd(_mm_mul_pd(uij,rinv), _mm_mul_pd(uij3,r))); t2 = _mm_sub_pd(t2, _mm_add_pd(_mm_mul_pd(half,uij2), _mm_mul_pd(prod,uij3))); t3 = _mm_mul_pd(_mm_mul_pd(onefourth,logterm), _mm_mul_pd(rinv,rinv)); t3 = _mm_sub_pd(t3, _mm_mul_pd(_mm_mul_pd(diff2,oneeighth), _mm_add_pd(one, _mm_mul_pd(sk2_rinv,rinv)))); t1 = _mm_mul_pd(rinv, _mm_add_pd(_mm_mul_pd(dlij,t1), _mm_add_pd(t2,t3))); dadx1 = _mm_and_pd(t1,obc_mask1); /* Evaluate influence of atom ai -> aj */ t1 = _mm_add_pd(r,sk_ai); t2 = _mm_sub_pd(r,sk_ai); t3 = _mm_sub_pd(sk_ai,r); obc_mask1 = _mm_cmplt_pd(raj, t1); obc_mask2 = _mm_cmplt_pd(raj, t2); obc_mask3 = _mm_cmplt_pd(raj, t3); uij = gmx_mm_inv_pd(t1); lij = _mm_or_pd( _mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)), _mm_andnot_pd(obc_mask2,raj_inv)); dlij = _mm_and_pd(one,obc_mask2); uij2 = _mm_mul_pd(uij, uij); uij3 = _mm_mul_pd(uij2,uij); lij2 = _mm_mul_pd(lij, lij); lij3 = _mm_mul_pd(lij2,lij); diff2 = _mm_sub_pd(uij2,lij2); lij_inv = gmx_mm_invsqrt_pd(lij2); sk2_rinv = _mm_mul_pd(sk2_ai,rinv); prod = _mm_mul_pd(onefourth,sk2_rinv); logterm = gmx_mm_log_pd(_mm_mul_pd(uij,lij_inv)); t1 = _mm_sub_pd(lij,uij); t2 = _mm_mul_pd(diff2, _mm_sub_pd(_mm_mul_pd(onefourth,r), prod)); t3 = _mm_mul_pd(half,_mm_mul_pd(rinv,logterm)); t1 = _mm_add_pd(t1,_mm_add_pd(t2,t3)); t4 = _mm_mul_pd(two,_mm_sub_pd(raj_inv,lij)); t4 = _mm_and_pd(t4,obc_mask3); t1 = _mm_mul_pd(half,_mm_add_pd(t1,t4)); GMX_MM_INCREMENT_2VALUES_PD(work+jnrA,work+jnrB,_mm_and_pd(t1,obc_mask1)); t1 = _mm_add_pd(_mm_mul_pd(half,lij2), _mm_mul_pd(prod,lij3)); t1 = _mm_sub_pd(t1, _mm_mul_pd(onefourth, _mm_add_pd(_mm_mul_pd(lij,rinv), _mm_mul_pd(lij3,r)))); t2 = _mm_mul_pd(onefourth, _mm_add_pd(_mm_mul_pd(uij,rinv), _mm_mul_pd(uij3,r))); t2 = _mm_sub_pd(t2, _mm_add_pd(_mm_mul_pd(half,uij2), _mm_mul_pd(prod,uij3))); t3 = _mm_mul_pd(_mm_mul_pd(onefourth,logterm), _mm_mul_pd(rinv,rinv)); t3 = _mm_sub_pd(t3, _mm_mul_pd(_mm_mul_pd(diff2,oneeighth), _mm_add_pd(one, _mm_mul_pd(sk2_rinv,rinv)))); t1 = _mm_mul_pd(rinv, _mm_add_pd(_mm_mul_pd(dlij,t1), _mm_add_pd(t2,t3))); dadx2 = _mm_and_pd(t1,obc_mask1); _mm_store_pd(dadx,dadx1); dadx += 2; _mm_store_pd(dadx,dadx2); dadx += 2; } /* end normal inner loop */ if(k<nj1) { jnrA = jjnr[k]; j3A = 3*jnrA; GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A,jx,jy,jz); GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA,raj); GMX_MM_LOAD_1VALUE_PD(obc_param+jnrA,sk_aj); dx = _mm_sub_sd(ix, jx); dy = _mm_sub_sd(iy, jy); dz = _mm_sub_sd(iz, jz); rsq = gmx_mm_calc_rsq_pd(dx,dy,dz); rinv = gmx_mm_invsqrt_pd(rsq); r = _mm_mul_sd(rsq,rinv); /* Compute raj_inv aj1-4 */ raj_inv = gmx_mm_inv_pd(raj); /* Evaluate influence of atom aj -> ai */ t1 = _mm_add_sd(r,sk_aj); t2 = _mm_sub_sd(r,sk_aj); t3 = _mm_sub_sd(sk_aj,r); obc_mask1 = _mm_cmplt_sd(rai, t1); obc_mask2 = _mm_cmplt_sd(rai, t2); obc_mask3 = _mm_cmplt_sd(rai, t3); uij = gmx_mm_inv_pd(t1); lij = _mm_or_pd(_mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)), _mm_andnot_pd(obc_mask2,rai_inv)); dlij = _mm_and_pd(one,obc_mask2); uij2 = _mm_mul_sd(uij, uij); uij3 = _mm_mul_sd(uij2,uij); lij2 = _mm_mul_sd(lij, lij); lij3 = _mm_mul_sd(lij2,lij); diff2 = _mm_sub_sd(uij2,lij2); lij_inv = gmx_mm_invsqrt_pd(lij2); sk2_aj = _mm_mul_sd(sk_aj,sk_aj); sk2_rinv = _mm_mul_sd(sk2_aj,rinv); prod = _mm_mul_sd(onefourth,sk2_rinv); logterm = gmx_mm_log_pd(_mm_mul_sd(uij,lij_inv)); t1 = _mm_sub_sd(lij,uij); t2 = _mm_mul_sd(diff2, 
_mm_sub_sd(_mm_mul_pd(onefourth,r), prod)); t3 = _mm_mul_sd(half,_mm_mul_sd(rinv,logterm)); t1 = _mm_add_sd(t1,_mm_add_sd(t2,t3)); t4 = _mm_mul_sd(two,_mm_sub_sd(rai_inv,lij)); t4 = _mm_and_pd(t4,obc_mask3); t1 = _mm_mul_sd(half,_mm_add_sd(t1,t4)); sum_ai = _mm_add_sd(sum_ai, _mm_and_pd(t1,obc_mask1) ); t1 = _mm_add_sd(_mm_mul_sd(half,lij2), _mm_mul_sd(prod,lij3)); t1 = _mm_sub_sd(t1, _mm_mul_sd(onefourth, _mm_add_sd(_mm_mul_sd(lij,rinv), _mm_mul_sd(lij3,r)))); t2 = _mm_mul_sd(onefourth, _mm_add_sd(_mm_mul_sd(uij,rinv), _mm_mul_sd(uij3,r))); t2 = _mm_sub_sd(t2, _mm_add_sd(_mm_mul_sd(half,uij2), _mm_mul_sd(prod,uij3))); t3 = _mm_mul_sd(_mm_mul_sd(onefourth,logterm), _mm_mul_sd(rinv,rinv)); t3 = _mm_sub_sd(t3, _mm_mul_sd(_mm_mul_sd(diff2,oneeighth), _mm_add_sd(one, _mm_mul_sd(sk2_rinv,rinv)))); t1 = _mm_mul_sd(rinv, _mm_add_sd(_mm_mul_sd(dlij,t1), _mm_add_pd(t2,t3))); dadx1 = _mm_and_pd(t1,obc_mask1); /* Evaluate influence of atom ai -> aj */ t1 = _mm_add_sd(r,sk_ai); t2 = _mm_sub_sd(r,sk_ai); t3 = _mm_sub_sd(sk_ai,r); obc_mask1 = _mm_cmplt_sd(raj, t1); obc_mask2 = _mm_cmplt_sd(raj, t2); obc_mask3 = _mm_cmplt_sd(raj, t3); uij = gmx_mm_inv_pd(t1); lij = _mm_or_pd( _mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)), _mm_andnot_pd(obc_mask2,raj_inv)); dlij = _mm_and_pd(one,obc_mask2); uij2 = _mm_mul_sd(uij, uij); uij3 = _mm_mul_sd(uij2,uij); lij2 = _mm_mul_sd(lij, lij); lij3 = _mm_mul_sd(lij2,lij); diff2 = _mm_sub_sd(uij2,lij2); lij_inv = gmx_mm_invsqrt_pd(lij2); sk2_rinv = _mm_mul_sd(sk2_ai,rinv); prod = _mm_mul_sd(onefourth,sk2_rinv); logterm = gmx_mm_log_pd(_mm_mul_sd(uij,lij_inv)); t1 = _mm_sub_sd(lij,uij); t2 = _mm_mul_sd(diff2, _mm_sub_sd(_mm_mul_sd(onefourth,r), prod)); t3 = _mm_mul_sd(half,_mm_mul_sd(rinv,logterm)); t1 = _mm_add_sd(t1,_mm_add_sd(t2,t3)); t4 = _mm_mul_sd(two,_mm_sub_sd(raj_inv,lij)); t4 = _mm_and_pd(t4,obc_mask3); t1 = _mm_mul_sd(half,_mm_add_sd(t1,t4)); GMX_MM_INCREMENT_1VALUE_PD(work+jnrA,_mm_and_pd(t1,obc_mask1)); t1 = _mm_add_sd(_mm_mul_sd(half,lij2), _mm_mul_sd(prod,lij3)); t1 = _mm_sub_sd(t1, _mm_mul_sd(onefourth, _mm_add_sd(_mm_mul_sd(lij,rinv), _mm_mul_sd(lij3,r)))); t2 = _mm_mul_sd(onefourth, _mm_add_sd(_mm_mul_sd(uij,rinv), _mm_mul_sd(uij3,r))); t2 = _mm_sub_sd(t2, _mm_add_sd(_mm_mul_sd(half,uij2), _mm_mul_sd(prod,uij3))); t3 = _mm_mul_sd(_mm_mul_sd(onefourth,logterm), _mm_mul_sd(rinv,rinv)); t3 = _mm_sub_sd(t3, _mm_mul_sd(_mm_mul_sd(diff2,oneeighth), _mm_add_sd(one, _mm_mul_sd(sk2_rinv,rinv)))); t1 = _mm_mul_sd(rinv, _mm_add_sd(_mm_mul_sd(dlij,t1), _mm_add_sd(t2,t3))); dadx2 = _mm_and_pd(t1,obc_mask1); _mm_store_pd(dadx,dadx1); dadx += 2; _mm_store_pd(dadx,dadx2); dadx += 2; } gmx_mm_update_1pot_pd(sum_ai,work+ii); } /* Parallel summations */ if(PARTDECOMP(cr)) { gmx_sum(natoms, work, cr); } else if(DOMAINDECOMP(cr)) { dd_atom_sum_real(cr->dd, work); } if(gb_algorithm==egbHCT) { /* HCT */ for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */ { if(born->use[i] != 0) { rr = top->atomtypes.gb_radius[md->typeA[i]]-doffset; sum = 1.0/rr - work[i]; min_rad = rr + doffset; rad = 1.0/sum; born->bRad[i] = rad > min_rad ? 
rad : min_rad; fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]); } } /* Extra communication required for DD */ if(DOMAINDECOMP(cr)) { dd_atom_spread_real(cr->dd, born->bRad); dd_atom_spread_real(cr->dd, fr->invsqrta); } } else { /* OBC */ for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */ { if(born->use[i] != 0) { rr = top->atomtypes.gb_radius[md->typeA[i]]; rr_inv2 = 1.0/rr; rr = rr-doffset; rr_inv = 1.0/rr; sum = rr * work[i]; sum2 = sum * sum; sum3 = sum2 * sum; tsum = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3); born->bRad[i] = rr_inv - tsum*rr_inv2; born->bRad[i] = 1.0 / born->bRad[i]; fr->invsqrta[i]=gmx_invsqrt(born->bRad[i]); tchain = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2); born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2; } } /* Extra (local) communication required for DD */ if(DOMAINDECOMP(cr)) { dd_atom_spread_real(cr->dd, born->bRad); dd_atom_spread_real(cr->dd, fr->invsqrta); dd_atom_spread_real(cr->dd, born->drobc); } } return 0; }
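/*
 * Throughout the Born-radius kernel above, _mm_and_pd(value, mask) with a
 * mask produced by _mm_cmplt_pd is used as a branchless "conditional zero":
 * lanes that fail the comparison contribute exactly +0.0 to the
 * accumulators. A standalone sketch of the idiom (our illustration, not
 * GROMACS code):
 */
#include <emmintrin.h>

static inline __m128d add_if_lt(__m128d acc, __m128d term,
                                __m128d x, __m128d limit) {
    __m128d m = _mm_cmplt_pd(x, limit);          /* all-ones where x < limit */
    return _mm_add_pd(acc, _mm_and_pd(term, m)); /* other lanes add +0.0 */
}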
BOOST_FORCEINLINE __m128d __vectorcall operator & ( __m128d const left, __m128d const right ) { return _mm_and_pd ( left, right ); }
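/*
 * With the operator& overload above in scope, mask arithmetic on __m128d
 * reads like scalar code. A usage sketch (the abs_pd helper and its mask are
 * ours, built here only for illustration):
 */
#include <emmintrin.h>

__m128d abs_pd(__m128d v) {
    const __m128d abs_mask =
        _mm_castsi128_pd(_mm_set1_epi64x(0x7fffffffffffffffLL));
    return v & abs_mask;   // invokes operator&, i.e. _mm_and_pd
}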