Beispiel #1
// Codegen test for the SSE2 intrinsic _mm_and_pd: returns the bitwise AND of
// the two __m128d arguments.
// The DAG- and ASM-prefixed comment lines inside the body are check directives
// (presumably consumed by a FileCheck-style tool -- confirm against the RUN
// lines of the full test file) and must stay verbatim.
__m128d test_mm_and_pd(__m128d A, __m128d B) {
  // DAG-LABEL: test_mm_and_pd
  // DAG: and <4 x i32>
  // ASM-LABEL: test_mm_and_pd
  // ASM: pand
  return _mm_and_pd(A, B);
Beispiel #2
 // Lane-wise absolute value of a packed pair of doubles.
 // ANDs `a` with a constant whose 32-bit words are {0xffffffff, 0x7fffffff}
 // for each 64-bit lane; with the low word stored first (little-endian, as on
 // x86) that keeps every bit except bit 63, the IEEE-754 sign bit.
 inline F64vec2 abs(const F64vec2 &a)
     // The union lets the mask be written as four ints and read back as __m128d.
     static const union
         int i[4];
         __m128d m;
     } __f64vec2_abs_mask = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff };
     return _mm_and_pd(a, __f64vec2_abs_mask.m);
Beispiel #3
 /* Processes two doubles at a time. */
// Iterates z <- z*z + c for two complex points at once, one point per 64-bit
// SSE2 lane, and counts iterations.
//   c_re_arg, c_im_arg  real / imaginary parts of the two points; each is read
//                       with _mm_load_pd, so it must address two doubles on a
//                       16-byte boundary.
//   max_iter            upper bound on the number of loop iterations.
// Returns the two lanes' iteration counters added together, narrowed to int.
_mandelbrot_2( double const * const c_re_arg, 
	           double const * const c_im_arg, 
	           int                  max_iter 
	__m128d z_re = _mm_load_pd(c_re_arg);
	__m128d z_im = _mm_load_pd(c_im_arg);
	__m128d y_re;
	__m128d y_im;
	// c is the starting value of z, so the first pass computes c*c + c.
	__m128d c_re = z_re;
	__m128d c_im = z_im;

	// One 64-bit iteration counter per lane.
	__m128i count = _mm_set1_epi64x(0);

	__m128d md;
	__m128d mt;
	// Per-lane "still iterating" mask; starts with every bit set.
	__m128i mi = _mm_set1_epi16(0xffff);;

	__m128d two = _mm_set1_pd(2.0);
	__m128i one = _mm_set1_epi64x(1);

	for (int i = 0; i<max_iter; i+=1)
		// y = z .* z;
		y_re = _mm_mul_pd(z_re, z_re);
		y_im = _mm_mul_pd(z_im, z_im);

		// y = z * z;
		// real part: re^2 - im^2, imaginary part: 2 * re * im
		y_re = _mm_sub_pd(y_re, y_im);
		y_im = _mm_mul_pd(z_re, z_im);
		y_im = _mm_add_pd(y_im, y_im);

		// z = z * z + c
		z_re = _mm_add_pd(y_re, c_re);
		z_im = _mm_add_pd(y_im, c_im);

		// if condition
		// md = _mm_add_pd(z_re, z_im);
		// md = _mm_cmplt_pd(md, four);
		// A lane passes while z_re < 2 AND z_im < 2. This is a per-component
		// upper bound, not a magnitude test, so negative components always pass.
		md = _mm_cmplt_pd(z_re, two);
		mt = _mm_cmplt_pd(z_im, two);
		md = _mm_and_pd(md, mt);
		// Sticky: once a lane fails the test its bits in mi stay zero.
		mi = _mm_and_si128(mi, (__m128i) md);
		// PRINT_M128I(mi);
		// Stop as soon as neither lane passes the current test.
		if ( !_mm_movemask_pd(md) ) { break; }

		// count iterations
		// (mi & 1) adds 1 only to lanes that are still iterating.
		count = _mm_add_epi64( count, _mm_and_si128( mi, one) );

	int val;
	// Fold the high lane onto the low lane, then read the low 64 bits.
	count = _mm_add_epi64( _mm_srli_si128(count, 8), count );
	val   = _mm_cvtsi128_si64( count );

	return val;
Beispiel #4
/*
 * Bitwise AND of two packed double-precision vectors.
 *
 * Dispatches to the intrinsic of whichever backend macro is defined:
 *   USE_SSE -> _mm_and_pd
 *   USE_AVX -> _mm256_and_pd
 *   USE_IBM -> vec_and
 * Returns a & b computed on the raw bit patterns of every lane.
 */
__SIMDd _SIMD_and_pd(__SIMDd a, __SIMDd b)
{
#ifdef  USE_SSE
  return _mm_and_pd(a,b);
#elif defined USE_AVX
  /* Was `_m256_and_ps`: that name lacks the `_mm256_` prefix, so it does not
   * name an AVX intrinsic, and `_ps` is the single-precision suffix. The
   * packed-double form used by the other AVX wrappers is `_mm256_and_pd`. */
  return _mm256_and_pd(a,b);
#elif defined USE_IBM
  return vec_and(a,b);
#endif
}
Beispiel #5
// Bitwise select between two packed-double vectors under a mask.
// In the SSE and AVX branches the result is (~mask & a) | (mask & b): bits
// come from b where the mask bit is set and from a where it is clear.
//   a, b       the two candidate vectors.
//   resultPtr  supplies the mask vector (see the notes below).
// NOTE(review): the SSE branch dereferences resultPtr before casting
// ((__SIMDd*)(*resultPtr)) while the AVX branch casts resultPtr itself; the two
// read the mask from different addresses, so one of them is presumably wrong --
// confirm against the callers.
// NOTE(review): the IBM branch passes `c`, which is not declared anywhere in
// this function as shown.
__SIMDd _SIMD_sel_pd(__SIMDd a, __SIMDd b, void** resultPtr)
#ifdef  USE_SSE
  __SIMDd* result = (__SIMDd*) (*resultPtr);
  return _mm_or_pd(_mm_andnot_pd(*result,a),_mm_and_pd(*result,b));
#elif defined USE_AVX
  __SIMDd* result = (__SIMDd*) resultPtr;
  return _mm256_or_pd(_mm256_andnot_pd(*result,a),_mm256_and_pd(*result,b));
#elif defined USE_IBM
  return vec_sel(a,b,c);
Beispiel #6
/**
 * Two-lane frexp: splits each double in x into a significand (returned) and an
 * exponent (stored through e), both as packed doubles.
 *
 * The masks pi64_mantissa_mask, pi32_bias4i, pi64_inv_mantissa_mask and
 * pd_half_mask are defined elsewhere in the file.
 * NOTE(review): despite its name, pi64_mantissa_mask is the mask applied before
 * the 52-bit right shift, so it presumably selects the exponent field; likewise
 * pd_half_mask presumably carries the exponent bits of 0.5 so that the result
 * lands in [0.5, 1) -- confirm against the constant definitions.
 **/
__m128d frexp_sse(__m128d x, __m128d *e)
    /* Integer exponent */
    __m128i ei;

    /* Save the exponent */
    ei = _mm_and_si128(_mm_castpd_si128(x), *(__m128i*)pi64_mantissa_mask);
    /* Move the masked field down to bit 0 of each 64-bit lane. */
    ei = _mm_srli_epi64(ei, 52);
    /* 216 == _MM_SHUFFLE(3,1,2,0): gathers the low dword of each 64-bit lane
       into dwords 0 and 1, the two that _mm_cvtepi32_pd converts below. */
    ei = _mm_shuffle_epi32(ei,216);
    ei = _mm_sub_epi32(ei, *(__m128i*)pi32_bias4i);

    *e = _mm_cvtepi32_pd(ei);

    /* Save the significand */
    /* Clear the bits removed by the inverse mask, then OR in pd_half_mask. */
    x = _mm_and_pd(x, *(__m128d*)pi64_inv_mantissa_mask);
    x = _mm_or_pd(x, *(__m128d*)pd_half_mask);

    return x;
Beispiel #7
/* xvm_expma:
 *   Compute the component-wise exponential minus <a>:
 *       r[i] <-- e^x[i] - a
 *   The following comments apply to the SSE2 version of this code:
 *   Computation is done four doubles at a time by doing computation in parallel
 *   on two vectors of two doubles using SSE2 intrinsics.  If size is not a
 *   multiple of 4, the remaining elements are computed using the stdlib exp().
 *   The computation is done by first doing a range reduction of the argument of
 *   the type e^x = 2^k * e^f choosing k and f so that f is in [-0.5, 0.5].
 *   Then 2^k can be computed exactly using bit operations to build the double
 *   result and e^f can be efficiently computed with enough precision using a
 *   polynomial approximation.
 *   The polynomial approximation is done with an 11th order polynomial computed
 *   by the Remez algorithm with the Sollya suite, instead of the more classical
 *   Pade polynomial form, because it is better suited to parallel execution. In
 *   order to achieve the same precision, a Pade form seems to require three
 *   fewer multiplications but needs a very costly division, so it will be less
 *   efficient.
 *   The maximum error is less than 1lsb and special cases are correctly
 *   handled:
 *     +inf or +oor  -->   return +inf
 *     -inf or -oor  -->   return  0.0
 *     qNaN or sNaN  -->   return qNaN
 *   This code is copyright 2004-2012 Thomas Lavergne and licensed under the
 *   BSD licence like the rest of Wapiti.
 */
// Computes r[i] = exp(x[i]) - a for i in [0, N).
//   r, x  output / input arrays; the SSE2 path asserts that both are non-NULL
//         and 16-byte aligned.
//   a     constant subtracted from every exponential.
//   N     number of elements.
// NOTE(review): the SSE2 loop advances four elements per pass with the bound
// n < N and no scalar tail is visible in that branch, so when N is not a
// multiple of 4 it reads and writes up to three elements past N -- presumably
// callers pad the arrays; confirm.
void xvm_expma(double r[], const double x[], double a, uint64_t N) {
#if defined(__SSE2__) && !defined(XVM_ANSI)
  // Broadcasts a 64-bit pattern to both lanes and reinterprets it as doubles.
  #define xvm_vconst(v) (_mm_castsi128_pd(_mm_set1_epi64x((v))))
	assert(r != NULL && ((uintptr_t)r % 16) == 0);
	assert(x != NULL && ((uintptr_t)x % 16) == 0);
	// Bit pattern of 1.0; added to the shifted k below to form 2^k.
	const __m128i vl  = _mm_set1_epi64x(0x3ff0000000000000ULL);
	// Upper and lower input bounds: same magnitude, opposite sign.
	const __m128d ehi = xvm_vconst(0x4086232bdd7abcd2ULL);
	const __m128d elo = xvm_vconst(0xc086232bdd7abcd2ULL);
	// log2(e), 0.5, a quiet NaN and +infinity.
	const __m128d l2e = xvm_vconst(0x3ff71547652b82feULL);
	const __m128d hal = xvm_vconst(0x3fe0000000000000ULL);
	const __m128d nan = xvm_vconst(0xfff8000000000000ULL);
	const __m128d inf = xvm_vconst(0x7ff0000000000000ULL);
	// Subtracted from x as k*c1 then k*c2 (presumably the high and low parts
	// of ln 2, split to keep the reduction accurate -- confirm).
	const __m128d c1  = xvm_vconst(0x3fe62e4000000000ULL);
	const __m128d c2  = xvm_vconst(0x3eb7f7d1cf79abcaULL);
	// Polynomial coefficients, p0 = constant term ... p11 = leading term.
	const __m128d p0  = xvm_vconst(0x3feffffffffffffeULL);
	const __m128d p1  = xvm_vconst(0x3ff000000000000bULL);
	const __m128d p2  = xvm_vconst(0x3fe0000000000256ULL);
	const __m128d p3  = xvm_vconst(0x3fc5555555553a2aULL);
	const __m128d p4  = xvm_vconst(0x3fa55555554e57d3ULL);
	const __m128d p5  = xvm_vconst(0x3f81111111362f4fULL);
	const __m128d p6  = xvm_vconst(0x3f56c16c25f3bae1ULL);
	const __m128d p7  = xvm_vconst(0x3f2a019fc9310c33ULL);
	const __m128d p8  = xvm_vconst(0x3efa01825f3cb28bULL);
	const __m128d p9  = xvm_vconst(0x3ec71e2bd880fdd8ULL);
	const __m128d p10 = xvm_vconst(0x3e9299068168ac8fULL);
	const __m128d p11 = xvm_vconst(0x3e5ac52350b60b19ULL);
	const __m128d va  = _mm_set1_pd(a);
	// Each pass handles x[n..n+3] as two independent vectors (suffix 1 and 2).
	for (uint64_t n = 0; n < N; n += 4) {
		__m128d mn1, mn2, mi1, mi2;
		__m128d t1,  t2,  d1,  d2;
		__m128d v1,  v2,  w1,  w2;
		__m128i k1,  k2;
		__m128d f1,  f2;
		// Load the next four values
		__m128d x1 = _mm_load_pd(x + n    );
		__m128d x2 = _mm_load_pd(x + n + 2);
		// Check for out of ranges, infinites and NaN
		// mn: lanes where x != x (NaN); mi: lanes above ehi; then clamp x from
		// below to elo.
		mn1 = _mm_cmpneq_pd(x1, x1);	mn2 = _mm_cmpneq_pd(x2, x2);
		mi1 = _mm_cmpgt_pd(x1, ehi);	mi2 = _mm_cmpgt_pd(x2, ehi);
		x1  = _mm_max_pd(x1, elo);	x2  = _mm_max_pd(x2, elo);
		// Range reduction: we search k and f such that e^x = 2^k * e^f
		// with f in [-0.5, 0.5]
		// k = trunc(x * log2(e) + 0.5); _mm_cvttpd_epi32 truncates toward zero.
		t1  = _mm_mul_pd(x1, l2e);	t2  = _mm_mul_pd(x2, l2e);
		t1  = _mm_add_pd(t1, hal);	t2  = _mm_add_pd(t2, hal);
		k1  = _mm_cvttpd_epi32(t1);	k2  = _mm_cvttpd_epi32(t2);
		d1  = _mm_cvtepi32_pd(k1);	d2  = _mm_cvtepi32_pd(k2);
		// f = (x - k*c1) - k*c2
		t1  = _mm_mul_pd(d1, c1);	t2  = _mm_mul_pd(d2, c1);
		f1  = _mm_sub_pd(x1, t1);	f2  = _mm_sub_pd(x2, t2);
		t1  = _mm_mul_pd(d1, c2);	t2  = _mm_mul_pd(d2, c2);
		f1  = _mm_sub_pd(f1, t1);	f2  = _mm_sub_pd(f2, t2);
		// Evaluation of e^f using a 11th order polynom in Horner form
		v1  = _mm_mul_pd(f1, p11);	v2  = _mm_mul_pd(f2, p11);
		v1  = _mm_add_pd(v1, p10);	v2  = _mm_add_pd(v2, p10);
		v1  = _mm_mul_pd(v1, f1);	v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p9);	v2  = _mm_add_pd(v2, p9);
		v1  = _mm_mul_pd(v1, f1);	v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p8);	v2  = _mm_add_pd(v2, p8);
		v1  = _mm_mul_pd(v1, f1);	v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p7);	v2  = _mm_add_pd(v2, p7);
		v1  = _mm_mul_pd(v1, f1);	v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p6);	v2  = _mm_add_pd(v2, p6);
		v1  = _mm_mul_pd(v1, f1);	v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p5);	v2  = _mm_add_pd(v2, p5);
		v1  = _mm_mul_pd(v1, f1);	v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p4);	v2  = _mm_add_pd(v2, p4);
		v1  = _mm_mul_pd(v1, f1);	v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p3);	v2  = _mm_add_pd(v2, p3);
		v1  = _mm_mul_pd(v1, f1);	v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p2);	v2  = _mm_add_pd(v2, p2);
		v1  = _mm_mul_pd(v1, f1);	v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p1);	v2  = _mm_add_pd(v2, p1);
		v1  = _mm_mul_pd(v1, f1);	v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p0);	v2  = _mm_add_pd(v2, p0);
		// Evaluation of 2^k using bitops to achieve exact computation
		// k << 20 within a dword is bit 52 of a double once that dword is the
		// high half of a 64-bit lane. Shuffle 0x72 moves dwords 0 and 1 (the
		// two k values) into dwords 1 and 3 and the zero dwords into 0 and 2;
		// adding vl then applies the exponent bias.
		k1  = _mm_slli_epi32(k1, 20);	k2  = _mm_slli_epi32(k2, 20);
		k1  = _mm_shuffle_epi32(k1, 0x72);
		k2  = _mm_shuffle_epi32(k2, 0x72);
		k1  = _mm_add_epi32(k1, vl);	k2  = _mm_add_epi32(k2, vl);
		w1  = _mm_castsi128_pd(k1);	w2  = _mm_castsi128_pd(k2);
		// Return to full range to substract <a>
	        v1  = _mm_mul_pd(v1, w1);	v2  = _mm_mul_pd(v2, w2);
		v1  = _mm_sub_pd(v1, va);	v2  = _mm_sub_pd(v2, va);
		// Finally apply infinite and NaN where needed
		// Masked blends: lanes flagged in mi become +inf, then lanes flagged
		// in mn become NaN. Both overrides replace the value after <a> was
		// subtracted.
		v1  = _mm_or_pd(_mm_and_pd(mi1, inf), _mm_andnot_pd(mi1, v1));
		v2  = _mm_or_pd(_mm_and_pd(mi2, inf), _mm_andnot_pd(mi2, v2));
		v1  = _mm_or_pd(_mm_and_pd(mn1, nan), _mm_andnot_pd(mn1, v1));
		v2  = _mm_or_pd(_mm_and_pd(mn2, nan), _mm_andnot_pd(mn2, v2));
		// Store the results
		_mm_store_pd(r + n,     v1);
		_mm_store_pd(r + n + 2, v2);
	// Portable path using the C library exp(). Presumably this is the #else
	// branch of the #if above; the #else / #endif lines are not present in
	// this excerpt.
	for (uint64_t n = 0; n < N; n++)
		r[n] = exp(x[n]) - a;
    mlib_d64 *dst,
    const mlib_d64 *src,
    mlib_s32 dlb,
    mlib_s32 slb,
    mlib_s32 wid,
    mlib_s32 hgt)
/*
 * Combines every 3x3 neighbourhood of the source image with C_COMP (defined
 * elsewhere; presumably a lane-wise min or max -- confirm) and writes the
 * result to the interior of dst. The function's name and return type are not
 * present in this excerpt.
 *   dst, src   destination / source pixel data (mlib_d64)
 *   dlb, slb   destination / source line strides, applied as byte offsets
 *   wid, hgt   image width and height in elements
 * Each source row is first reduced horizontally over three neighbouring
 * columns; those per-row results are cached in two scratch rows so that every
 * source row is reduced once and reused for the vertical step.
 * Returns MLIB_SUCCESS.
 * NOTE(review): the __mlib_malloc result is used without a NULL check and no
 * matching free is visible in this excerpt -- confirm against the full file.
 */
	mlib_u8 *buff, *buff1;
	mlib_u8 *sl, *sp0, *sp1, *sp2, *sp3, *dl;
	__m128d *dp0, *dp1;
	__m128d aa, bb, c0, c1, c2, cc, d0, d1, d2, dd, r0, r1, t0, t1;
	__m128d e_mask;
	mlib_s32 i, j, wid16, tail;

	/* From here on wid is the output row width in bytes (SSIZE is presumably
	   the element size); wid16 rounds it up to a multiple of 16. */
	wid = (wid - 2) * SSIZE;
	wid16 = (wid + 15) & ~15;
	/* Two scratch rows of wid16 bytes each. */
	buff = __mlib_malloc(2 * wid16);
	buff1 = buff + wid16;

	sl = (mlib_u8 *)src;
/* dst ptrs skip top j and left col */
	dl = (mlib_u8 *)dst + dlb + SSIZE;

	/* Bytes left over after the whole 16-byte vectors of a row. */
	tail = wid & 15;

	/* e_mask = 16-byte entry number `tail` of mlib_mask128i_arr, copied as two
	   doubles (presumably all-ones in the first `tail` bytes -- confirm). */
	((mlib_d64 *)&e_mask)[0] =
		((mlib_d64 *)((__m128d *) mlib_mask128i_arr + tail))[0];
	((mlib_d64 *)&e_mask)[1] =
		((mlib_d64 *)((__m128d *) mlib_mask128i_arr + tail))[1];

	sp0 = buff;
	sp1 = buff1;
	sp2 = sl;
	sp3 = sp2 + slb;
	sl += 2 * slb;

	/* Prime the scratch rows with the horizontal reduction of source rows 0
	   and 1. */
	for (i = 0; i < wid; i += 16) {
		c0 = _mm_loadu_pd((mlib_d64 *)sp2);
		c1 = _mm_loadu_pd((mlib_d64 *)(sp2 + SSIZE));
		c2 = _mm_loadu_pd((mlib_d64 *)(sp2 + 2 * SSIZE));
		d0 = _mm_loadu_pd((mlib_d64 *)sp3);
		d1 = _mm_loadu_pd((mlib_d64 *)(sp3 + SSIZE));
		d2 = _mm_loadu_pd((mlib_d64 *)(sp3 + 2 * SSIZE));

		cc = C_COMP(c0, c1);
		dd = C_COMP(d0, d1);
		cc = C_COMP(cc, c2);
		dd = C_COMP(dd, d2);

		_mm_storeu_pd((mlib_d64 *)sp0, cc);
		_mm_storeu_pd((mlib_d64 *)sp1, dd);

		sp0 += 16;
		sp1 += 16;
		sp2 += 16;
		sp3 += 16;

	/* Main loop: two output rows per pass. aa/bb come from the scratch rows,
	   cc/dd are the horizontal reductions of the next two source rows. */
	for (j = 0; j <= (hgt - 2 - 2); j += 2) {
		dp0 = (void *)dl;
		dp1 = (void *)(dl + dlb);
		sp0 = buff;
		sp1 = buff1;
		sp2 = sl;
		sp3 = sp2 + slb;

 *    line0:     aa
 *    line1:     bb
 *    line2:  c0 c1 c2
 *    line3:  d0 d1 d2

		for (i = 0; i <= wid - 16; i += 16) {

			aa = _mm_loadu_pd((mlib_d64 *)sp0);
			bb = _mm_loadu_pd((mlib_d64 *)sp1);
			c0 = _mm_loadu_pd((mlib_d64 *)sp2);
			c1 = _mm_loadu_pd((mlib_d64 *)(sp2 + SSIZE));
			c2 = _mm_loadu_pd((mlib_d64 *)(sp2 + 2 * SSIZE));
			d0 = _mm_loadu_pd((mlib_d64 *)sp3);
			d1 = _mm_loadu_pd((mlib_d64 *)(sp3 + SSIZE));
			d2 = _mm_loadu_pd((mlib_d64 *)(sp3 + 2 * SSIZE));

			cc = C_COMP(c0, c1);
			dd = C_COMP(d0, d1);
			cc = C_COMP(cc, c2);
			dd = C_COMP(dd, d2);

			/* r0 combines rows aa,bb,cc; r1 combines rows bb,cc,dd. */
			bb = C_COMP(bb, cc);
			r0 = C_COMP(aa, bb);
			r1 = C_COMP(bb, dd);

			/* cc/dd become aa/bb of the next pass. */
			_mm_storeu_pd((mlib_d64 *)sp0, cc);
			_mm_storeu_pd((mlib_d64 *)sp1, dd);

			_mm_storeu_pd((mlib_d64 *)dp0, r0);
			_mm_storeu_pd((mlib_d64 *)dp1, r1);

			sp0 += 16;
			sp1 += 16;
			sp2 += 16;
			sp3 += 16;

		/* Partial last vector: same computation, but the store is a blend
		   under e_mask so bytes outside the mask keep their old dst value. */
		if (tail) {
			aa = _mm_loadu_pd((mlib_d64 *)sp0);
			bb = _mm_loadu_pd((mlib_d64 *)sp1);
			c0 = _mm_loadu_pd((mlib_d64 *)sp2);
			c1 = _mm_loadu_pd((mlib_d64 *)(sp2 + SSIZE));
			c2 = _mm_loadu_pd((mlib_d64 *)(sp2 + 2 * SSIZE));
			d0 = _mm_loadu_pd((mlib_d64 *)sp3);
			d1 = _mm_loadu_pd((mlib_d64 *)(sp3 + SSIZE));
			d2 = _mm_loadu_pd((mlib_d64 *)(sp3 + 2 * SSIZE));

			cc = C_COMP(c0, c1);
			dd = C_COMP(d0, d1);
			cc = C_COMP(cc, c2);
			dd = C_COMP(dd, d2);

			bb = C_COMP(bb, cc);
			r0 = C_COMP(aa, bb);
			r1 = C_COMP(bb, dd);

			_mm_storeu_pd((mlib_d64 *)sp0, cc);
			_mm_storeu_pd((mlib_d64 *)sp1, dd);

			t0 = _mm_loadu_pd((mlib_d64 *)dp0);
			t1 = _mm_loadu_pd((mlib_d64 *)dp1);
			t0 =
			    _mm_or_pd(_mm_and_pd(e_mask, r0),
			    _mm_andnot_pd(e_mask, t0));
			t1 =
			    _mm_or_pd(_mm_and_pd(e_mask, r1),
			    _mm_andnot_pd(e_mask, t1));
			_mm_storeu_pd((mlib_d64 *)dp0, t0);
			_mm_storeu_pd((mlib_d64 *)dp1, t1);

		sl += 2 * slb;
		dl += 2 * dlb;

/* last line */

	/* One output row is left when the paired loop stopped at j == hgt - 3;
	   only one new source row (cc) is reduced and only dp0 is written. */
	if (j == (hgt - 3)) {
		dp0 = (void *)dl;
		dp1 = (void *)(dl + dlb);
		sp0 = buff;
		sp1 = buff1;
		sp2 = sl;

		for (i = 0; i <= wid - 16; i += 16) {
			aa = _mm_loadu_pd((mlib_d64 *)sp0);
			bb = _mm_loadu_pd((mlib_d64 *)sp1);
			c0 = _mm_loadu_pd((mlib_d64 *)sp2);
			c1 = _mm_loadu_pd((mlib_d64 *)(sp2 + SSIZE));
			c2 = _mm_loadu_pd((mlib_d64 *)(sp2 + 2 * SSIZE));

			cc = C_COMP(c0, c1);
			cc = C_COMP(cc, c2);

			r0 = C_COMP(aa, bb);
			r0 = C_COMP(r0, cc);

			_mm_storeu_pd((mlib_d64 *)dp0, r0);

			sp0 += 16;
			sp1 += 16;
			sp2 += 16;

		if (tail) {
			aa = _mm_loadu_pd((mlib_d64 *)sp0);
			bb = _mm_loadu_pd((mlib_d64 *)sp1);
			c0 = _mm_loadu_pd((mlib_d64 *)sp2);
			c1 = _mm_loadu_pd((mlib_d64 *)(sp2 + SSIZE));
			c2 = _mm_loadu_pd((mlib_d64 *)(sp2 + 2 * SSIZE));

			c1 = C_COMP(c0, c1);
			cc = C_COMP(c1, c2);

			r0 = C_COMP(aa, bb);
			r0 = C_COMP(r0, cc);

			t0 = _mm_loadu_pd((mlib_d64 *)dp0);
			t0 =
			    _mm_or_pd(_mm_and_pd(e_mask, r0),
			    _mm_andnot_pd(e_mask, t0));
			_mm_storeu_pd((mlib_d64 *)dp0, t0);


	return (MLIB_SUCCESS);
    mlib_d64 *dst,
    const mlib_d64 *src,
    mlib_s32 dlb,
    mlib_s32 slb,
    mlib_s32 wid,
    mlib_s32 hgt)
/*
 * Same scheme as a small-kernel neighbourhood filter, but over seven columns
 * (g0..g6 / h0..h6) and eight source rows per pair of output rows, i.e. a 7x7
 * neighbourhood combined with C_COMP (defined elsewhere; presumably a
 * lane-wise min or max -- confirm). The function's name and return type are
 * not present in this excerpt.
 *   dst, src   destination / source pixel data (mlib_d64)
 *   dlb, slb   destination / source line strides, applied as byte offsets
 *   wid, hgt   image width and height in elements
 * Six scratch rows (buff0..buff5) cache the horizontal reductions of the six
 * most recent source rows; the buffer pointers are rotated by two after every
 * pair of rows. KSIZE1 is defined elsewhere; the six buffers carved out of
 * KSIZE1 * wid16 bytes imply it is at least 6.
 * Returns MLIB_SUCCESS.
 * NOTE(review): the __mlib_malloc result is used without a NULL check and no
 * matching free is visible in this excerpt -- confirm against the full file.
 */
	mlib_u8 *pbuff, *buff0, *buff1, *buff2, *buff3, *buff4, *buff5, *buffT;
	mlib_u8 *sl, *sp0, *sp1, *sp2, *sp3, *sp4, *sp5, *sp6, *sp7, *dl;
	__m128d *dp0, *dp1;
	__m128d aa, bb, cc, dd, ee, ff, r0, r1, t0, t1;
	__m128d g0, g1, g2, g3, g4, g5, g6, gg;
	__m128d h0, h1, h2, h3, h4, h5, h6, hh;
	__m128d e_mask;
	mlib_s32 i, j, wid16, tail;

	/* From here on wid is the output row width in bytes; wid16 rounds it up
	   to a multiple of 16. */
	wid = (wid - KSIZE1) * SSIZE;
	wid16 = (wid + 15) & ~15;
	pbuff = __mlib_malloc(KSIZE1 * wid16);
	buff0 = pbuff;
	buff1 = buff0 + wid16;
	buff2 = buff1 + wid16;
	buff3 = buff2 + wid16;
	buff4 = buff3 + wid16;
	buff5 = buff4 + wid16;

	sl = (mlib_u8 *)src;
	/* Skip KSIZE1 / 2 rows and KSIZE1 / 2 elements: the untouched border. */
	dl = (mlib_u8 *)dst + (KSIZE1 / 2) * (dlb + SSIZE);

	/* Bytes left over after the whole 16-byte vectors of a row. */
	tail = wid & 15;

	/* e_mask = 16-byte entry number `tail` of mlib_mask128i_arr, copied as two
	   doubles (presumably all-ones in the first `tail` bytes -- confirm). */
	((mlib_d64 *)&e_mask)[0] =
		((mlib_d64 *)((__m128d *) mlib_mask128i_arr + tail))[0];
	((mlib_d64 *)&e_mask)[1] =
		((mlib_d64 *)((__m128d *) mlib_mask128i_arr + tail))[1];

	/* Prime the scratch rows: three passes, each reducing two source rows
	   horizontally into buff4/buff5 and (except after the last pass) rotating
	   the buffers, so buff0..buff5 end up holding source rows 0..5. */
	for (j = 0; j < 3; j++) {
		sp0 = buff4;
		sp1 = buff5;
		sp6 = sl;
		sp7 = sl + slb;
		sl += 2 * slb;

		for (i = 0; i < wid; i += 16) {
			g0 = _mm_loadu_pd((mlib_d64 *)sp6);
			g1 = _mm_loadu_pd((mlib_d64 *)(sp6 + SSIZE));
			g2 = _mm_loadu_pd((mlib_d64 *)(sp6 + 2 * SSIZE));
			g3 = _mm_loadu_pd((mlib_d64 *)(sp6 + 3 * SSIZE));
			g4 = _mm_loadu_pd((mlib_d64 *)(sp6 + 4 * SSIZE));
			g5 = _mm_loadu_pd((mlib_d64 *)(sp6 + 5 * SSIZE));
			g6 = _mm_loadu_pd((mlib_d64 *)(sp6 + 6 * SSIZE));
			h0 = _mm_loadu_pd((mlib_d64 *)sp7);
			h1 = _mm_loadu_pd((mlib_d64 *)(sp7 + SSIZE));
			h2 = _mm_loadu_pd((mlib_d64 *)(sp7 + 2 * SSIZE));
			h3 = _mm_loadu_pd((mlib_d64 *)(sp7 + 3 * SSIZE));
			h4 = _mm_loadu_pd((mlib_d64 *)(sp7 + 4 * SSIZE));
			h5 = _mm_loadu_pd((mlib_d64 *)(sp7 + 5 * SSIZE));
			h6 = _mm_loadu_pd((mlib_d64 *)(sp7 + 6 * SSIZE));

			/* Pairwise reduction of the seven columns of each row. */
			gg = C_COMP(g0, g1);
			hh = C_COMP(h0, h1);
			g2 = C_COMP(g2, g3);
			h2 = C_COMP(h2, h3);
			g4 = C_COMP(g4, g5);
			h4 = C_COMP(h4, h5);
			gg = C_COMP(gg, g2);
			hh = C_COMP(hh, h2);
			gg = C_COMP(gg, g4);
			hh = C_COMP(hh, h4);
			gg = C_COMP(gg, g6);
			hh = C_COMP(hh, h6);

			_mm_storeu_pd((mlib_d64 *)sp0, gg);
			_mm_storeu_pd((mlib_d64 *)sp1, hh);

			sp0 += 16;
			sp1 += 16;
			sp6 += 16;
			sp7 += 16;

		/* Rotate the three buffer pairs by one position. */
		if (j < 2) {
			buffT = buff0;
			buff0 = buff2;
			buff2 = buff4;
			buff4 = buffT;
			buffT = buff1;
			buff1 = buff3;
			buff3 = buff5;
			buff5 = buffT;

	/* Main loop: two output rows per pass. aa..ff come from the scratch rows,
	   gg/hh are the horizontal reductions of the next two source rows. */
	for (j = 0; j <= (hgt - KSIZE1 - 2); j += 2) {
		dp0 = (void *)dl;
		dp1 = (void *)(dl + dlb);
		sp0 = buff0;
		sp1 = buff1;
		sp2 = buff2;
		sp3 = buff3;
		sp4 = buff4;
		sp5 = buff5;
		sp6 = sl;
		sp7 = sl + slb;

 *    line0:           aa
 *    line1:           bb
 *    line2:           cc
 *    line3:           dd
 *    line4:           ee
 *    line5:           ff
 *    line4:  g0 g1 g2 g3 g4 g5 g6
 *    line5:  h0 h1 h2 h3 h4 h5 h6

		for (i = 0; i <= wid - 16; i += 16) {
			g0 = _mm_loadu_pd((mlib_d64 *)sp6);
			g1 = _mm_loadu_pd((mlib_d64 *)(sp6 + SSIZE));
			g2 = _mm_loadu_pd((mlib_d64 *)(sp6 + 2 * SSIZE));
			g3 = _mm_loadu_pd((mlib_d64 *)(sp6 + 3 * SSIZE));
			g4 = _mm_loadu_pd((mlib_d64 *)(sp6 + 4 * SSIZE));
			g5 = _mm_loadu_pd((mlib_d64 *)(sp6 + 5 * SSIZE));
			g6 = _mm_loadu_pd((mlib_d64 *)(sp6 + 6 * SSIZE));
			h0 = _mm_loadu_pd((mlib_d64 *)sp7);
			h1 = _mm_loadu_pd((mlib_d64 *)(sp7 + SSIZE));
			h2 = _mm_loadu_pd((mlib_d64 *)(sp7 + 2 * SSIZE));
			h3 = _mm_loadu_pd((mlib_d64 *)(sp7 + 3 * SSIZE));
			h4 = _mm_loadu_pd((mlib_d64 *)(sp7 + 4 * SSIZE));
			h5 = _mm_loadu_pd((mlib_d64 *)(sp7 + 5 * SSIZE));
			h6 = _mm_loadu_pd((mlib_d64 *)(sp7 + 6 * SSIZE));

			gg = C_COMP(g0, g1);
			hh = C_COMP(h0, h1);
			g2 = C_COMP(g2, g3);
			h2 = C_COMP(h2, h3);
			g4 = C_COMP(g4, g5);
			h4 = C_COMP(h4, h5);
			gg = C_COMP(gg, g2);
			hh = C_COMP(hh, h2);
			gg = C_COMP(gg, g4);
			hh = C_COMP(hh, h4);
			gg = C_COMP(gg, g6);
			hh = C_COMP(hh, h6);

			aa = _mm_loadu_pd((mlib_d64 *)sp0);
			bb = _mm_loadu_pd((mlib_d64 *)sp1);
			cc = _mm_loadu_pd((mlib_d64 *)sp2);
			dd = _mm_loadu_pd((mlib_d64 *)sp3);
			ee = _mm_loadu_pd((mlib_d64 *)sp4);
			ff = _mm_loadu_pd((mlib_d64 *)sp5);

			/* bb accumulates the six rows shared by both outputs
			   (bb..gg); r0 adds aa on top, r1 adds hh below. */
			bb = C_COMP(bb, cc);
			dd = C_COMP(dd, ee);
			ff = C_COMP(ff, gg);
			bb = C_COMP(bb, dd);
			bb = C_COMP(bb, ff);

			r0 = C_COMP(aa, bb);
			r1 = C_COMP(bb, hh);

			/* Overwrite the two oldest scratch rows with gg/hh; the
			   rotation after the row loops makes them the newest pair. */
			_mm_storeu_pd((mlib_d64 *)sp0, gg);
			_mm_storeu_pd((mlib_d64 *)sp1, hh);

			_mm_storeu_pd((mlib_d64 *)dp0, r0);
			_mm_storeu_pd((mlib_d64 *)dp1, r1);

			sp0 += 16;
			sp1 += 16;
			sp2 += 16;
			sp3 += 16;
			sp4 += 16;
			sp5 += 16;
			sp6 += 16;
			sp7 += 16;

		/* Partial last vector: same computation, but the store is a blend
		   under e_mask so bytes outside the mask keep their old dst value. */
		if (tail) {
			g0 = _mm_loadu_pd((mlib_d64 *)sp6);
			g1 = _mm_loadu_pd((mlib_d64 *)(sp6 + SSIZE));
			g2 = _mm_loadu_pd((mlib_d64 *)(sp6 + 2 * SSIZE));
			g3 = _mm_loadu_pd((mlib_d64 *)(sp6 + 3 * SSIZE));
			g4 = _mm_loadu_pd((mlib_d64 *)(sp6 + 4 * SSIZE));
			g5 = _mm_loadu_pd((mlib_d64 *)(sp6 + 5 * SSIZE));
			g6 = _mm_loadu_pd((mlib_d64 *)(sp6 + 6 * SSIZE));
			h0 = _mm_loadu_pd((mlib_d64 *)sp7);
			h1 = _mm_loadu_pd((mlib_d64 *)(sp7 + SSIZE));
			h2 = _mm_loadu_pd((mlib_d64 *)(sp7 + 2 * SSIZE));
			h3 = _mm_loadu_pd((mlib_d64 *)(sp7 + 3 * SSIZE));
			h4 = _mm_loadu_pd((mlib_d64 *)(sp7 + 4 * SSIZE));
			h5 = _mm_loadu_pd((mlib_d64 *)(sp7 + 5 * SSIZE));
			h6 = _mm_loadu_pd((mlib_d64 *)(sp7 + 6 * SSIZE));

			gg = C_COMP(g0, g1);
			hh = C_COMP(h0, h1);
			g2 = C_COMP(g2, g3);
			h2 = C_COMP(h2, h3);
			g4 = C_COMP(g4, g5);
			h4 = C_COMP(h4, h5);
			gg = C_COMP(gg, g2);
			hh = C_COMP(hh, h2);
			gg = C_COMP(gg, g4);
			hh = C_COMP(hh, h4);
			gg = C_COMP(gg, g6);
			hh = C_COMP(hh, h6);

			aa = _mm_loadu_pd((mlib_d64 *)sp0);
			bb = _mm_loadu_pd((mlib_d64 *)sp1);
			cc = _mm_loadu_pd((mlib_d64 *)sp2);
			dd = _mm_loadu_pd((mlib_d64 *)sp3);
			ee = _mm_loadu_pd((mlib_d64 *)sp4);
			ff = _mm_loadu_pd((mlib_d64 *)sp5);

			bb = C_COMP(bb, cc);
			dd = C_COMP(dd, ee);
			ff = C_COMP(ff, gg);
			bb = C_COMP(bb, dd);
			bb = C_COMP(bb, ff);

			r0 = C_COMP(aa, bb);
			r1 = C_COMP(bb, hh);

			_mm_storeu_pd((mlib_d64 *)sp0, gg);
			_mm_storeu_pd((mlib_d64 *)sp1, hh);

			t0 = _mm_loadu_pd((mlib_d64 *)dp0);
			t1 = _mm_loadu_pd((mlib_d64 *)dp1);
			t0 =
			    _mm_or_pd(_mm_and_pd(e_mask, r0),
			    _mm_andnot_pd(e_mask, t0));
			t1 =
			    _mm_or_pd(_mm_and_pd(e_mask, r1),
			    _mm_andnot_pd(e_mask, t1));
			_mm_storeu_pd((mlib_d64 *)dp0, t0);
			_mm_storeu_pd((mlib_d64 *)dp1, t1);

		/* Rotate the buffer pairs so the rows just written become the newest
		   pair (buff4/buff5) for the next pass. */
		buffT = buff0;
		buff0 = buff2;
		buff2 = buff4;
		buff4 = buffT;
		buffT = buff1;
		buff1 = buff3;
		buff3 = buff5;
		buff5 = buffT;

		sl += 2 * slb;
		dl += 2 * dlb;

/* last line */

	/* One output row is left when the paired loop stopped at
	   j == hgt - KSIZE1 - 1; only one new source row (gg) is reduced and only
	   dp0 is written. */
	if (j == (hgt - KSIZE1 - 1)) {
		dp0 = (void *)dl;
		dp1 = (void *)(dl + dlb);
		sp0 = buff0;
		sp1 = buff1;
		sp2 = buff2;
		sp3 = buff3;
		sp4 = buff4;
		sp5 = buff5;
		sp6 = sl;

		for (i = 0; i <= wid - 16; i += 16) {
			g0 = _mm_loadu_pd((mlib_d64 *)sp6);
			g1 = _mm_loadu_pd((mlib_d64 *)(sp6 + SSIZE));
			g2 = _mm_loadu_pd((mlib_d64 *)(sp6 + 2 * SSIZE));
			g3 = _mm_loadu_pd((mlib_d64 *)(sp6 + 3 * SSIZE));
			g4 = _mm_loadu_pd((mlib_d64 *)(sp6 + 4 * SSIZE));
			g5 = _mm_loadu_pd((mlib_d64 *)(sp6 + 5 * SSIZE));
			g6 = _mm_loadu_pd((mlib_d64 *)(sp6 + 6 * SSIZE));

			gg = C_COMP(g0, g1);
			g2 = C_COMP(g2, g3);
			g4 = C_COMP(g4, g5);
			gg = C_COMP(gg, g2);
			gg = C_COMP(gg, g4);
			gg = C_COMP(gg, g6);

			aa = _mm_loadu_pd((mlib_d64 *)sp0);
			bb = _mm_loadu_pd((mlib_d64 *)sp1);
			cc = _mm_loadu_pd((mlib_d64 *)sp2);
			dd = _mm_loadu_pd((mlib_d64 *)sp3);
			ee = _mm_loadu_pd((mlib_d64 *)sp4);
			ff = _mm_loadu_pd((mlib_d64 *)sp5);

			bb = C_COMP(bb, cc);
			dd = C_COMP(dd, ee);
			ff = C_COMP(ff, gg);
			bb = C_COMP(bb, dd);
			bb = C_COMP(bb, ff);

			r0 = C_COMP(aa, bb);

			_mm_storeu_pd((mlib_d64 *)dp0, r0);

			sp0 += 16;
			sp1 += 16;
			sp2 += 16;
			sp3 += 16;
			sp4 += 16;
			sp5 += 16;
			sp6 += 16;

		if (tail) {
			g0 = _mm_loadu_pd((mlib_d64 *)sp6);
			g1 = _mm_loadu_pd((mlib_d64 *)(sp6 + SSIZE));
			g2 = _mm_loadu_pd((mlib_d64 *)(sp6 + 2 * SSIZE));
			g3 = _mm_loadu_pd((mlib_d64 *)(sp6 + 3 * SSIZE));
			g4 = _mm_loadu_pd((mlib_d64 *)(sp6 + 4 * SSIZE));
			g5 = _mm_loadu_pd((mlib_d64 *)(sp6 + 5 * SSIZE));
			g6 = _mm_loadu_pd((mlib_d64 *)(sp6 + 6 * SSIZE));

			gg = C_COMP(g0, g1);
			g2 = C_COMP(g2, g3);
			g4 = C_COMP(g4, g5);
			gg = C_COMP(gg, g2);
			gg = C_COMP(gg, g4);
			gg = C_COMP(gg, g6);

			aa = _mm_loadu_pd((mlib_d64 *)sp0);
			bb = _mm_loadu_pd((mlib_d64 *)sp1);
			cc = _mm_loadu_pd((mlib_d64 *)sp2);
			dd = _mm_loadu_pd((mlib_d64 *)sp3);
			ee = _mm_loadu_pd((mlib_d64 *)sp4);
			ff = _mm_loadu_pd((mlib_d64 *)sp5);

			bb = C_COMP(bb, cc);
			dd = C_COMP(dd, ee);
			ff = C_COMP(ff, gg);
			bb = C_COMP(bb, dd);
			bb = C_COMP(bb, ff);

			r0 = C_COMP(aa, bb);

			t0 = _mm_loadu_pd((mlib_d64 *)dp0);
			t0 =
			    _mm_or_pd(_mm_and_pd(e_mask, r0),
			    _mm_andnot_pd(e_mask, t0));
			_mm_storeu_pd((mlib_d64 *)dp0, t0);


	return (MLIB_SUCCESS);
Beispiel #10
// Bitwise AND of two packed-double vectors, forwarded to _mm_and_pd.
// The name suggests the operands are comparison masks, but nothing here
// depends on that: any two F64vec2 values are ANDed bit for bit.
inline F64vec2 mask_and(const F64vec2 &l, const F64vec2 &r)
    return _mm_and_pd(l, r);
Beispiel #11
/*
 * Fills the cost table e, the weight table w and the root table of the
 * segments_t object behind _bst_obj, using SSE2 to process four table entries
 * (two vectors of two doubles) per step of the inner loop.
 *
 *   _bst_obj  pointer to a segments_t providing the e, w and r arrays
 *   p, q      input weights; row setup uses w[i][j] = w[i][j-1] + p[j-1] + q[j]
 *   nn        problem size (narrowed to int for the signed index arithmetic)
 *
 * Rows are stored back to back with 0 or 1 padding entries, alternating per
 * row, so that the vector accesses start on a 16-byte boundary (the loads and
 * stores on e and w are the aligned variants).
 *
 * Returns e[n + !(n&1)], the first-row entry that covers the whole range.
 *
 * Fixes relative to the previous revision:
 *   - the root blend used _mm_andnot_si128(old, mask) == ~old & mask, which
 *     wiped every entry that should have been kept; it is now ~mask & old.
 *   - the compare masks were narrowed with _MM_SHUFFLE(0,2,0,2), which swaps
 *     the two lanes of each vector so root updates landed on the neighbouring
 *     entry; _MM_SHUFFLE(2,0,2,0) keeps them in element order.
 *   - the root load/store now pass properly typed __m128i pointers.
 */
double bst_compute_121_m128_aligned4( void*_bst_obj, double* p, double* q, size_t nn ) {
    segments_t* mem = (segments_t*) _bst_obj;
    int n, i, r, l_end, l_end_pre, j;
    double t, e_tmp;
    double* e = mem->e, *w = mem->w;
    int* root = mem->r;
    __m128d v_tmp;
    __m128d v00, v01, v02, v03;
    __m128d v10, v11, v12, v13;
    __m128i v_cur_roots, v_old_roots, v_new_roots, v_rootmask_i;
    __m128 v_rootmask;
    // initialization
    // mem->n = nn;
    // The index arithmetic below subtracts from n and counts i down past zero,
    // so it is done in signed int.
    n = (int) nn;

    int idx1, idx2, idx3, pad, pad_r;
    idx1 = (n+1)*(n+2)/2 + n/2;
    e[idx1] = q[n];
    pad = 1;
    // pad contains the padding for row i+1
    // for row n it's always 1
    for (i = n-1; i >= 0; --i) {
        idx1 -= 2*(n-i)+1 + pad;
        idx2 = idx1 + 1;
        e[idx1] = q[i];
        w[idx1] = q[i];
        for (j = i+1; j < n+1; ++j,++idx2) {
            e[idx2] = INFINITY;
            w[idx2] = w[idx2-1] + p[j-1] + q[j];
        }
        // idx2 now points to the beginning of the next line.
        idx2 += pad; // padding of line i+1

        idx3 = idx1;
        pad_r = pad; // padding of line r
        for (r = i; r < n; ++r) {
            pad_r = !pad_r; // padding of line r+1
            // idx2 = IDX(r+1, r+1);
            idx1 = idx3;
            l_end = idx2 + (n-r);
            e_tmp = e[idx1++];
            // Scalar peel: handle (n-r) mod 4 entries so that the vector loop
            // below is left with a multiple of 4 doubles (2 vectors of 2).
            // NOTE(review): idx1 is not advanced inside this loop in the code
            // as received, so every peeled step targets the same e[idx1];
            // confirm against the upstream source before relying on it.
            l_end_pre = idx2 + ((n-r)&3);
            for( ; (idx2 < l_end_pre) && (idx2 < l_end); ++idx2 ) {
                t = e_tmp + e[idx2] + w[idx1];
                if (t < e[idx1]) {
                    e[idx1] = t;
                    root[idx1] = r;
                }
            }

            v_tmp = _mm_set_pd( e_tmp, e_tmp );
            // Vector loop: 2 vectors of 2 doubles per step.
            v_cur_roots = _mm_set_epi32(r, r, r, r);
            for( ; idx2 < l_end; idx2 += 4 ) {
                v01 = _mm_load_pd( &w[idx1  ] );
                v11 = _mm_load_pd( &w[idx1+2] );

                v00 = _mm_load_pd( &e[idx2  ] );
                v01 = _mm_add_pd( v01, v_tmp ); // suboptimal: read-after-write dependency
                v10 = _mm_load_pd( &e[idx2+2] );
                v11 = _mm_add_pd( v11, v_tmp );

                // candidate cost t = e_tmp + e[idx2..] + w[idx1..]
                v01 = _mm_add_pd( v01, v00 );
                v03 = _mm_load_pd( &e[idx1  ] );
                v11 = _mm_add_pd( v11, v10 );
                v13 = _mm_load_pd( &e[idx1+2] );

                // per-lane mask: candidate is strictly cheaper than the stored cost
                v02 = _mm_cmplt_pd( v01, v03 );
                v12 = _mm_cmplt_pd( v11, v13 );

                // blend: take the candidate where the mask is set, else keep e
                v00 = _mm_or_pd( _mm_and_pd( v02, v01 ), _mm_andnot_pd( v02, v03 ));
                v10 = _mm_or_pd( _mm_and_pd( v12, v11 ), _mm_andnot_pd( v12, v13 ));

                _mm_store_pd( &e[idx1  ], v00 );
                _mm_store_pd( &e[idx1+2], v10 );

                // Narrow the four 64-bit masks to four 32-bit masks in element
                // order: dword 0 and 2 of v02, then dword 0 and 2 of v12.
                v_rootmask = _mm_shuffle_ps(
                        _mm_castpd_ps( v02 ),
                        _mm_castpd_ps( v12 ),
                        _MM_SHUFFLE(2,0,2,0) );
                v_rootmask_i = _mm_castps_si128( v_rootmask );

                // root = mask ? r : old root
                v_old_roots = _mm_lddqu_si128( (const __m128i*) &root[idx1] );
                v_new_roots = _mm_or_si128(
                        _mm_and_si128(    v_rootmask_i, v_cur_roots ),
                        _mm_andnot_si128( v_rootmask_i, v_old_roots ) );
                _mm_storeu_si128( (__m128i*) &root[idx1], v_new_roots );

                idx1 += 4;
            }

            idx2 += pad_r;
        }
        pad = !pad;
        // every other line has padding 0, or 1, respectively
    }

    // if n is even, the total number of entries in the first
    // row of the table is odd, so we need padding
    return e[n + !(n&1)];
}
Beispiel #12
// Evaluates the sine of the two packed doubles in `x` (per the function name
// and the sin/cos coefficient selection below) and returns the packed result.
//
// Visible steps: take |x|, scale by 4/PI and truncate to an integer `j`,
// round `j` up to an even value, subtract `j * DP1`, `j * DP2` and `j * DP3`
// from |x|, evaluate one of two polynomials per lane, then XOR in the sign.
//
// The input must be in domain [-1686629712, 1686629712].
// I tried to optimize the double to int conversion by using `magic`, but
// it was actually slower than using `_mm_cvttpd_epi32()` and it didn't
// offer greater domain for `x`.
//
// NOTE(review): `inv_sign` is read below but is not among the constants
// declared in the visible lines - confirm where it is defined.
static SIMD_INLINE __m128d sin_cephes_pd(__m128d x) {
    SIMD_CONST_SQ(sign     , SIMD_UINT64_C(0x8000000000000000));
    SIMD_CONST_SI(int32_one, 1);
    SIMD_CONST_SD(4_DIV_PI , 1.27323954473516268615107010698);
    SIMD_CONST_SD(DP1      , 7.85398125648498535156e-1);
    SIMD_CONST_SD(DP2      , 3.77489470793079817668e-8);
    SIMD_CONST_SD(DP3      , 2.69515142907905952645e-15);

// Coefficient table layout: four rows of 16 doubles (eight __m128d each).
// Row 0 holds the x* set in both lanes, row 3 the y* set in both lanes, and
// rows 1 and 2 mix the two sets lane by lane.
// NOTE(review): in this listing the macro body is never closed and the line
// that invokes it is missing; the `sincos_coeff` table indexed further down
// is presumably what that invocation defines - confirm against the original.
#define DEFINE_DATA(name, x0, x1, x2, x3, x4, x5, xm, xa, y0, y1, y2, y3, y4, y5, ym, ya) \
  SIMD_ALIGN_VAR(static const double, name[], 16) = { \
    x0, x0, x1, x1, x2, x2, x3, x3, x4, x4, x5, x5, xm, xm, xa, xa, \
    y0, x0, y1, x1, y2, x2, y3, x3, y4, x4, y5, x5, ym, xm, ya, xa, \
    x0, y0, x1, y1, x2, y2, x3, y3, x4, y4, x5, y5, xm, ym, xa, ya, \
    y0, y0, y1, y1, y2, y2, y3, y3, y4, y4, y5, y5, ym, ym, ya, ya  \

                2.75573136213857245213e-6 ,-1.98412698295895385996e-4,
                8.33333333332211858878e-3 ,-1.66666666666666307295e-1, 1.0, 0.0,

                -1.13585365213876817300e-11, 2.08757008419747316778e-9,
                -2.75573141792967388112e-7 , 2.48015872888517045348e-5,
                -1.38888888888730564116e-3 , 4.16666666666665929218e-2,-0.5, 1.0);

    __m128d y;
    __m128d sign = x;                                        // Sign bit.

    x = _mm_and_pd(x, SIMD_GET_PD(inv_sign));                // Take the absolute value.
    y = _mm_mul_pd(x, SIMD_GET_PD(4_DIV_PI));                // Integer part of `x * 4 / PI`.

    __m128i ival = _mm_cvttpd_epi32(y);                      // Extract the integer part of y.
    __m128i ione = SIMD_GET_PI(int32_one);

    ival = _mm_add_epi32(ival, ione);                        // j += 1.
    ival = _mm_andnot_si128(ione, ival);                     // j &=~1.

    // Convert the even `j` back to double, then duplicate each 32-bit `j` so
    // that both halves of every 64-bit lane hold the same value.
    y = _mm_cvtepi32_pd(ival);
    ival = _mm_unpacklo_epi32(ival, ival);

    // Shifting left by 61 moves bit 2 of `j` (value 4) into bit 63, the sign
    // position of a double.
    sign = _mm_xor_pd(sign,                                  // Swap the sign bit if `j & 4`.
                      _mm_castsi128_pd(_mm_slli_epi64(ival, 61)));
    sign = _mm_and_pd(sign, SIMD_GET_PD(sign));              // Keep only the sign bit.

    // Get the polynomial selection mask (j & 2):
    //   1. `0x0000000000000000` => `0    <= x <= PI/4`
    //   2. `0xFFFFFFFFFFFFFFFF` => `PI/4 <  x <= PI/2`
    // Bit 1 of `j` is moved to bit 31 and then broadcast by the arithmetic
    // right shift; both 32-bit halves of a lane carry the same `j`, so the
    // whole 64-bit lane becomes all ones or all zeros.
    ival = _mm_slli_epi32(ival, 30);
    ival = _mm_srai_epi32(ival, 31);

    // Extended precision modular arithmetic:
    //   x = ((x - y * DP1) - y * DP2) - y * DP3
    x = _mm_sub_pd(x, _mm_mul_pd(y, SIMD_GET_PD(DP1)));
    x = _mm_sub_pd(x, _mm_mul_pd(y, SIMD_GET_PD(DP2)));
    x = _mm_sub_pd(x, _mm_mul_pd(y, SIMD_GET_PD(DP3)));

    // Get the polynomial coefficients for each lane (sin/cos).
    // The two-bit movemask (0..3) picks one of the four table rows; each row
    // spans eight __m128d values, hence the `* 8`.
    __m128d poly_mask = _mm_castsi128_pd(ival);
    const __m128d* coeff = reinterpret_cast<const __m128d*>(sincos_coeff) +
                           static_cast<uintptr_t>(_mm_movemask_pd(poly_mask)) * 8;

    // Polynomial in xx = x * x over coeff[0..5].
    // NOTE(review): `Simd128::mad(a, b, c)` is presumably `a * b + c`; its
    // definition is not visible here - confirm.
    __m128d xx = _mm_mul_pd(x, x);
    y = coeff[0];
    y = Simd128::mad(y, xx, coeff[1]);
    y = Simd128::mad(y, xx, coeff[2]);
    y = Simd128::mad(y, xx, coeff[3]);
    y = Simd128::mad(y, xx, coeff[4]);
    y = Simd128::mad(y, xx, coeff[5]);
    y = _mm_mul_pd(y, xx);

    // Per lane: `xx` where the selection mask is set, `x` otherwise.
    __m128d x_or_xx = _mm_or_pd(
                          _mm_and_pd(xx, poly_mask),
                          _mm_andnot_pd(poly_mask, x));

    // Final combination: y * x_or_xx + x_or_xx * coeff[6] + coeff[7].
    y = _mm_mul_pd(y, x_or_xx);
    y = _mm_add_pd(y, _mm_mul_pd(x_or_xx, coeff[6]));
    y = _mm_add_pd(y, coeff[7]);

    // Flip the result's sign bit wherever `sign` has it set.
    return _mm_xor_pd(y, sign);
Beispiel #13
/* NOTE(review): orphaned statement - the enclosing function header and the
 * definitions of magic_a / magic_b are not present in this listing. It
 * returns the bitwise AND of the two packed-double values. */
return _mm_and_pd (magic_a,magic_b);
/*
 * Generalized-Born radius kernel written with SSE2 double-precision
 * intrinsics (two doubles per __m128d).
 *
 * From the visible statements the routine:
 *   1. sets up per-i-atom data (shifted coordinates, radius, parameter) and
 *      evaluates pair terms for two j atoms at a time with packed (_pd)
 *      operations, followed by a scalar (_sd) variant for a single j atom;
 *   2. runs the parallel summation of the per-atom `work` array;
 *   3. converts `work` into born->bRad, plus fr->invsqrta in the block
 *      marked HCT and born->drobc in the block marked OBC.
 *
 * NOTE(review): this listing is truncated. Loop and branch headers, the
 * loads of jx/jy/jz, raj and sk_aj, the stores through `dadx`, and the
 * continuation lines of several multi-line expressions (statements ending
 * in an open parenthesis) are missing. The comments below describe only
 * what is visible; confirm everything else against the original source.
 *
 * Parameters as used in the visible lines:
 *   cr           - passed to gmx_sum / DOMAINDECOMP; cr->dd to the dd_* calls.
 *   fr           - supplies dadx, shift_vec, natoms_force; invsqrta is written.
 *   natoms       - element count handed to gmx_sum.
 *   top          - source of atomtypes.gb_radius.
 *   atype        - not referenced in the visible lines.
 *   x            - coordinate array, indexed as x[3*atom + {0,1,2}].
 *   nl           - neighbour list (iinr, shift, jindex, jjnr).
 *   born         - GB data: gb_doffset, gb_radius, param, gpol_hct_work, use,
 *                  obc_alpha/beta/gamma are read; bRad and drobc are written.
 *   md           - supplies typeA for the radius lookup.
 *   gb_algorithm - not referenced in the visible lines; presumably selects
 *                  the HCT or OBC block - TODO confirm.
 *
 * Returns 0.
 */
calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
                                const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md,int gb_algorithm)
	int i,ai,k,n,ii,ii3,is3,nj0,nj1,at0,at1,offset;
    int jnrA,jnrB;
    int j3A,j3B;
	double shX,shY,shZ;
	double rr,rr_inv,rr_inv2,sum_tmp,sum,sum2,sum3,gbr;
	double sum_ai2, sum_ai3,tsum,tchain,doffset;
	double *obc_param;
    double *gb_radius;
    double *work;
    int *  jjnr;
    double *dadx;
    double *shiftvec;
    double min_rad,rad;
	__m128d ix,iy,iz,jx,jy,jz;
	__m128d dx,dy,dz,t1,t2,t3,t4;
	__m128d rsq,rinv,r;
	__m128d rai,rai_inv,raj, raj_inv,rai_inv2,sk,sk2,lij,dlij,duij;
	__m128d uij,lij2,uij2,lij3,uij3,diff2;
	__m128d lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
	__m128d sum_ai, tmp_ai,sk_ai,sk_aj,sk2_ai,sk2_aj,sk2_rinv;
	__m128d dadx1,dadx2;
    __m128d logterm;
	__m128d mask;
	__m128d obc_mask1,obc_mask2,obc_mask3;    
    /* Broadcast constants (same value in both lanes). */
    __m128d oneeighth   = _mm_set1_pd(0.125);
    __m128d onefourth   = _mm_set1_pd(0.25);
	const __m128d half  = _mm_set1_pd(0.5);
	const __m128d three = _mm_set1_pd(3.0);
	const __m128d one   = _mm_set1_pd(1.0);
	const __m128d two   = _mm_set1_pd(2.0);
	const __m128d zero  = _mm_set1_pd(0.0);
	const __m128d neg   = _mm_set1_pd(-1.0);
	/* Set the dielectric offset */
	doffset   = born->gb_doffset;
	/* Cache frequently used array pointers in locals. */
	gb_radius = born->gb_radius;
    obc_param = born->param;
    work      = born->gpol_hct_work;
    jjnr      = nl->jjnr;
    dadx      = fr->dadx;
    shiftvec  = fr->shift_vec[0];
    jx        = _mm_setzero_pd();
    jy        = _mm_setzero_pd();
    jz        = _mm_setzero_pd();
    jnrA = jnrB = 0;
		/* NOTE(review): the loop headers around the following statements are
		 * not visible in this listing. */
		work[i] = 0;
        /* Per-i-atom setup: atom index, shift vector and j-index range. */
        ii     = nl->iinr[i];
		ii3	   = ii*3;
        is3    = 3*nl->shift[i];     
        shX    = shiftvec[is3];  
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];      
        nj1    = nl->jindex[i+1];    
        /* Shifted i coordinates, broadcast to both lanes. */
        ix     = _mm_set1_pd(shX+x[ii3+0]);
		iy     = _mm_set1_pd(shY+x[ii3+1]);
		iz     = _mm_set1_pd(shZ+x[ii3+2]);
		/* Radius of atom i in both lanes; gmx_mm_inv_pd is presumably the
		 * reciprocal - TODO confirm. */
		rai    = _mm_load1_pd(gb_radius+ii);
		rai_inv= gmx_mm_inv_pd(rai);
		sum_ai = _mm_setzero_pd();
		sk_ai  = _mm_load1_pd(born->param+ii);
		sk2_ai = _mm_mul_pd(sk_ai,sk_ai);
			/* Packed path: two j atoms (A and B) per step. */
			jnrA        = jjnr[k];   
			jnrB        = jjnr[k+1];
            j3A         = 3*jnrA;  
			j3B         = 3*jnrB;
            /* NOTE(review): the loads that fill jx/jy/jz, raj and sk_aj for
             * these two atoms are not visible in this listing. */
            dx    = _mm_sub_pd(ix, jx);
			dy    = _mm_sub_pd(iy, jy);
			dz    = _mm_sub_pd(iz, jz);
            /* r = rsq * rinv; the helpers presumably return dx^2+dy^2+dz^2 and
             * 1/sqrt(rsq) - TODO confirm. */
            rsq         = gmx_mm_calc_rsq_pd(dx,dy,dz);
            rinv        = gmx_mm_invsqrt_pd(rsq);
            r           = _mm_mul_pd(rsq,rinv);
			/* Compute raj_inv aj1-4 */
            raj_inv     = gmx_mm_inv_pd(raj);
            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_pd(r,sk_aj);
            t2            = _mm_sub_pd(r,sk_aj);
            t3            = _mm_sub_pd(sk_aj,r);
            /* Lane masks: rai < r+sk_aj, rai < r-sk_aj, rai < sk_aj-r. */
            obc_mask1     = _mm_cmplt_pd(rai, t1);
            obc_mask2     = _mm_cmplt_pd(rai, t2);
            obc_mask3     = _mm_cmplt_pd(rai, t3);
            uij           = gmx_mm_inv_pd(t1);
            /* lij takes the reciprocal of t2 where obc_mask2 holds; the other
             * operand of the OR is cut off in this listing. */
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
            /* dlij is 1.0 where obc_mask2 holds and 0.0 elsewhere. */
            dlij          = _mm_and_pd(one,obc_mask2);
            uij2          = _mm_mul_pd(uij, uij);
            uij3          = _mm_mul_pd(uij2,uij);
            lij2          = _mm_mul_pd(lij, lij);
            lij3          = _mm_mul_pd(lij2,lij);
            diff2         = _mm_sub_pd(uij2,lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_aj        = _mm_mul_pd(sk_aj,sk_aj);
            sk2_rinv      = _mm_mul_pd(sk2_aj,rinv);
            prod          = _mm_mul_pd(onefourth,sk2_rinv);
            /* Logarithm of uij * lij_inv (gmx_mm_log_pd is presumably the
             * natural log - TODO confirm). */
            logterm       = gmx_mm_log_pd(_mm_mul_pd(uij,lij_inv));
            t1            = _mm_sub_pd(lij,uij);
            t2            = _mm_mul_pd(diff2,
            t3            = _mm_mul_pd(half,_mm_mul_pd(rinv,logterm));
            t1            = _mm_add_pd(t1,_mm_add_pd(t2,t3));
            /* The 2*(rai_inv - lij) term is kept only where obc_mask3 holds. */
            t4            = _mm_mul_pd(two,_mm_sub_pd(rai_inv,lij));
            t4            = _mm_and_pd(t4,obc_mask3);
            t1            = _mm_mul_pd(half,_mm_add_pd(t1,t4));
            /* Accumulate into sum_ai only in lanes where obc_mask1 holds. */
            sum_ai        = _mm_add_pd(sum_ai, _mm_and_pd(t1,obc_mask1) );
            /* NOTE(review): the next seven statements are truncated (open
             * parentheses); their second operands are not visible. */
            t1            = _mm_add_pd(_mm_mul_pd(half,lij2),
            t1            = _mm_sub_pd(t1,
            t2            = _mm_mul_pd(onefourth,
            t2            = _mm_sub_pd(t2,
            t3            = _mm_mul_pd(_mm_mul_pd(onefourth,logterm),
            t3            = _mm_sub_pd(t3,
            t1            = _mm_mul_pd(rinv,
            dadx1         = _mm_and_pd(t1,obc_mask1);
            /* Evaluate influence of atom ai -> aj */
            /* Same sequence with the roles swapped: sk_ai, raj and raj_inv. */
            t1            = _mm_add_pd(r,sk_ai);
            t2            = _mm_sub_pd(r,sk_ai);
            t3            = _mm_sub_pd(sk_ai,r);
            obc_mask1     = _mm_cmplt_pd(raj, t1);
            obc_mask2     = _mm_cmplt_pd(raj, t2);
            obc_mask3     = _mm_cmplt_pd(raj, t3);
            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
            dlij          = _mm_and_pd(one,obc_mask2);
            uij2          = _mm_mul_pd(uij, uij);
            uij3          = _mm_mul_pd(uij2,uij);
            lij2          = _mm_mul_pd(lij, lij);
            lij3          = _mm_mul_pd(lij2,lij);
            diff2         = _mm_sub_pd(uij2,lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv      = _mm_mul_pd(sk2_ai,rinv);
            prod          = _mm_mul_pd(onefourth,sk2_rinv);
            logterm       = gmx_mm_log_pd(_mm_mul_pd(uij,lij_inv));
            t1            = _mm_sub_pd(lij,uij);
            t2            = _mm_mul_pd(diff2,
            t3            = _mm_mul_pd(half,_mm_mul_pd(rinv,logterm));
            t1            = _mm_add_pd(t1,_mm_add_pd(t2,t3));
            t4            = _mm_mul_pd(two,_mm_sub_pd(raj_inv,lij));
            t4            = _mm_and_pd(t4,obc_mask3);
            t1            = _mm_mul_pd(half,_mm_add_pd(t1,t4));
            /* NOTE(review): no accumulation of this t1 into a per-j sum is
             * visible here, unlike the sum_ai update above - confirm. */
            t1            = _mm_add_pd(_mm_mul_pd(half,lij2),
            t1            = _mm_sub_pd(t1,
            t2            = _mm_mul_pd(onefourth,
            t2            = _mm_sub_pd(t2,
            t3            = _mm_mul_pd(_mm_mul_pd(onefourth,logterm),
            t3            = _mm_sub_pd(t3,
            t1            = _mm_mul_pd(rinv,
            dadx2         = _mm_and_pd(t1,obc_mask1);
            /* dadx advances by four doubles in total. NOTE(review): stores of
             * dadx1/dadx2 presumably preceded each increment - not visible. */
            dadx += 2;
            dadx += 2;
        } /* end normal inner loop */
			/* Single-j path: same sequence as above for one j atom (A). The
			 * _sd intrinsics compute the low lane only. */
			jnrA        = jjnr[k];   
            j3A         = 3*jnrA;  
            dx    = _mm_sub_sd(ix, jx);
			dy    = _mm_sub_sd(iy, jy);
			dz    = _mm_sub_sd(iz, jz);
            rsq         = gmx_mm_calc_rsq_pd(dx,dy,dz);
            rinv        = gmx_mm_invsqrt_pd(rsq);
            r           = _mm_mul_sd(rsq,rinv);
			/* Compute raj_inv aj1-4 */
            raj_inv     = gmx_mm_inv_pd(raj);
            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_sd(r,sk_aj);
            t2            = _mm_sub_sd(r,sk_aj);
            t3            = _mm_sub_sd(sk_aj,r);
            obc_mask1     = _mm_cmplt_sd(rai, t1);
            obc_mask2     = _mm_cmplt_sd(rai, t2);
            obc_mask3     = _mm_cmplt_sd(rai, t3);
            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(_mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
            dlij          = _mm_and_pd(one,obc_mask2);
            uij2          = _mm_mul_sd(uij, uij);
            uij3          = _mm_mul_sd(uij2,uij);
            lij2          = _mm_mul_sd(lij, lij);
            lij3          = _mm_mul_sd(lij2,lij);
            diff2         = _mm_sub_sd(uij2,lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_aj        = _mm_mul_sd(sk_aj,sk_aj);
            sk2_rinv      = _mm_mul_sd(sk2_aj,rinv);
            prod          = _mm_mul_sd(onefourth,sk2_rinv);
            logterm       = gmx_mm_log_pd(_mm_mul_sd(uij,lij_inv));
            t1            = _mm_sub_sd(lij,uij);
            t2            = _mm_mul_sd(diff2,
            t3            = _mm_mul_sd(half,_mm_mul_sd(rinv,logterm));
            t1            = _mm_add_sd(t1,_mm_add_sd(t2,t3));
            t4            = _mm_mul_sd(two,_mm_sub_sd(rai_inv,lij));
            t4            = _mm_and_pd(t4,obc_mask3);
            t1            = _mm_mul_sd(half,_mm_add_sd(t1,t4));
            /* Masked accumulation into the low lane of sum_ai. */
            sum_ai        = _mm_add_sd(sum_ai, _mm_and_pd(t1,obc_mask1) );
            t1            = _mm_add_sd(_mm_mul_sd(half,lij2),
            t1            = _mm_sub_sd(t1,
            t2            = _mm_mul_sd(onefourth,
            t2            = _mm_sub_sd(t2,
            t3            = _mm_mul_sd(_mm_mul_sd(onefourth,logterm),
            t3            = _mm_sub_sd(t3,
            t1            = _mm_mul_sd(rinv,
            dadx1         = _mm_and_pd(t1,obc_mask1);
            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_sd(r,sk_ai);
            t2            = _mm_sub_sd(r,sk_ai);
            t3            = _mm_sub_sd(sk_ai,r);
            obc_mask1     = _mm_cmplt_sd(raj, t1);
            obc_mask2     = _mm_cmplt_sd(raj, t2);
            obc_mask3     = _mm_cmplt_sd(raj, t3);
            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
            dlij          = _mm_and_pd(one,obc_mask2);
            uij2          = _mm_mul_sd(uij, uij);
            uij3          = _mm_mul_sd(uij2,uij);
            lij2          = _mm_mul_sd(lij, lij);
            lij3          = _mm_mul_sd(lij2,lij);
            diff2         = _mm_sub_sd(uij2,lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv      = _mm_mul_sd(sk2_ai,rinv);
            prod          = _mm_mul_sd(onefourth,sk2_rinv);
            logterm       = gmx_mm_log_pd(_mm_mul_sd(uij,lij_inv));
            t1            = _mm_sub_sd(lij,uij);
            t2            = _mm_mul_sd(diff2,
            t3            = _mm_mul_sd(half,_mm_mul_sd(rinv,logterm));
            t1            = _mm_add_sd(t1,_mm_add_sd(t2,t3));
            t4            = _mm_mul_sd(two,_mm_sub_sd(raj_inv,lij));
            t4            = _mm_and_pd(t4,obc_mask3);
            t1            = _mm_mul_sd(half,_mm_add_sd(t1,t4));
            t1            = _mm_add_sd(_mm_mul_sd(half,lij2),
            t1            = _mm_sub_sd(t1,
            t2            = _mm_mul_sd(onefourth,
            t2            = _mm_sub_sd(t2,
            t3            = _mm_mul_sd(_mm_mul_sd(onefourth,logterm),
            t3            = _mm_sub_sd(t3,
            t1            = _mm_mul_sd(rinv,
            dadx2         = _mm_and_pd(t1,obc_mask1);
            dadx += 2;
            dadx += 2;
	/* Parallel summations */
	/* NOTE(review): the condition guarding the gmx_sum call is not visible. */
		gmx_sum(natoms, work, cr);
	else if(DOMAINDECOMP(cr))
		dd_atom_sum_real(cr->dd, work);
        /* HCT */
        /* For atoms flagged in born->use:
         *   rr = gb_radius - doffset, rad = 1 / (1/rr - work[i]),
         *   bRad = max(rad, rr + doffset), invsqrta = gmx_invsqrt(bRad). */
        for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
			if(born->use[i] != 0)
                rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset; 
                sum     = 1.0/rr - work[i];
                min_rad = rr + doffset;
                rad     = 1.0/sum; 
                born->bRad[i]   = rad > min_rad ? rad : min_rad;
                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
        /* Extra communication required for DD */
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
        /* OBC */
        /* For atoms flagged in born->use, with rr the offset radius and
         * sum = rr * work[i]:
         *   tsum  = tanh(alpha*sum - beta*sum^2 + gamma*sum^3)
         *   bRad  = 1 / (1/rr - tsum / gb_radius)
         *   drobc = (1 - tsum^2) * rr*(alpha - 2*beta*sum + 3*gamma*sum^2) / gb_radius
         * tchain is the derivative of the tanh argument with respect to
         * work[i], and (1 - tsum^2) is the derivative of tanh itself. */
        for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
			if(born->use[i] != 0)
                rr      = top->atomtypes.gb_radius[md->typeA[i]];
                rr_inv2 = 1.0/rr;
                rr      = rr-doffset; 
                rr_inv  = 1.0/rr;
                sum     = rr * work[i];
                sum2    = sum  * sum;
                sum3    = sum2 * sum;
                tsum    = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
                born->bRad[i] = rr_inv - tsum*rr_inv2;
                born->bRad[i] = 1.0 / born->bRad[i];
                /* NOTE(review): fr->invsqrta is spread below but is not
                 * written in the visible part of this loop - confirm. */
                tchain  = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
                born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
        /* Extra (local) communication required for DD */
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
            dd_atom_spread_real(cr->dd, born->drobc);
	return 0;
Beispiel #15
BOOST_FORCEINLINE __m128d __vectorcall operator & ( __m128d const left, __m128d const right ) {
    return _mm_and_pd   ( left, right );