Exemple #1
 * Processes two doubles at a time
// Runs the iteration z = z*z + c on two complex points at once, one per
// 64-bit SSE2 lane.
//   c_re_arg, c_im_arg  real / imaginary parts of the two points; loaded with
//                       _mm_load_pd, so both must be 16-byte aligned
//   max_iter            upper bound on the number of iterations
// Returns the two lanes' iteration counts added together, narrowed to int.
// NOTE(review): the return type, the closing parenthesis of the parameter
// list and all braces are absent from this listing; the indentation below
// shows the intended nesting.
_mandelbrot_2( double const * const c_re_arg,
	           double const * const c_im_arg,
	           int                  max_iter
	__m128d z_re = _mm_load_pd(c_re_arg);
	__m128d z_im = _mm_load_pd(c_im_arg);
	__m128d y_re;
	__m128d y_im;
	__m128d c_re = z_re;
	__m128d c_im = z_im;

	// per-lane iteration counters (two 64-bit integers)
	__m128i count = _mm_set1_epi64x(0);

	__m128d md;
	__m128d mt;
	// all bits set: both lanes start out active
	__m128i mi = _mm_set1_epi32(-1);

	__m128d two = _mm_set1_pd(2.0);
	__m128i one = _mm_set1_epi64x(1);

	for (int i = 0; i<max_iter; i+=1)
		// component squares: re^2 and im^2
		y_re = _mm_mul_pd(z_re, z_re);
		y_im = _mm_mul_pd(z_im, z_im);

		// y = z * z  ->  (re^2 - im^2) + i * (2 * re * im)
		y_re = _mm_sub_pd(y_re, y_im);
		y_im = _mm_mul_pd(z_re, z_im);
		y_im = _mm_add_pd(y_im, y_im);

		// z = z * z + c
		z_re = _mm_add_pd(y_re, c_re);
		z_im = _mm_add_pd(y_im, c_im);

		// A lane passes while both of its components are below 2.
		// NOTE(review): this is a signed per-component test, not |z|^2 < 4,
		// so large negative components also pass -- confirm this is intended.
		md = _mm_cmplt_pd(z_re, two);
		mt = _mm_cmplt_pd(z_im, two);
		md = _mm_and_pd(md, mt);
		// mi remembers every failure: once a lane fails it stays inactive
		mi = _mm_and_si128(mi, _mm_castpd_si128(md));
		// stop as soon as neither lane passes the test in this iteration
		if ( !_mm_movemask_pd(md) ) { break; }

		// count this iteration for the lanes that are still active
		count = _mm_add_epi64( count, _mm_and_si128( mi, one) );

	int val;
	// fold the upper lane's counter onto the lower one and extract the sum
	count = _mm_add_epi64( _mm_srli_si128(count, 8), count );
	val   = _mm_cvtsi128_si64( count );

	return val;
Exemple #2
	int c=col++;
	unsigned _m=mx,mxx=16777216/_m;
	double _x=xx,_y=yy,_w=wh;
		__m128d cr=_mm_set1_pd(_x+_w*c);
		for(int j=0;j<512;j+=2){
			__m128d zr=cr,
			unsigned mk=mx-1;
			uint64_t kk[2]__attribute__((aligned(16)))={mk,mk};
			__m128i k=_mm_load_si128((__m128i*)kk);
				__m128d n=_mm_cmplt_pd(_mm_add_pd(zr2,zi2),_mm_set1_pd(4));
Exemple #3
// Codegen check for _mm_cmplt_pd: the patterns below expect the IR to call
// llvm.x86.sse2.cmp.pd with predicate 1 and the assembly to contain cmpltpd.
__m128d test_mm_cmplt_pd(__m128d A, __m128d B) {
  // DAG-LABEL: test_mm_cmplt_pd
  // DAG: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 1)
  // ASM-LABEL: test_mm_cmplt_pd
  // ASM: cmpltpd
  return _mm_cmplt_pd(A, B);
Exemple #4
void _SIMD_cmplt_pd(__SIMDd a, __SIMDd b, void** resultPtr)
  __SIMDd* result = (__SIMDd*)malloc(sizeof(__SIMDd));
  *resultPtr = result;
#ifdef  USE_SSE
  *result = _mm_cmplt_pd(a,b);
#elif defined USE_AVX
  *result = _mm256_cmp(a,b,17);
#elif defined USE_IBM
  *result = vec_cmplt(a,b);
Exemple #5
// Lane-wise less-than for sse_double: passes both packed operands to
// _mm_cmplt_pd and returns the resulting per-lane mask wrapped in an
// sse_double.
BI_FORCE_INLINE inline sse_double operator<(const sse_double& o1,
    const sse_double& o2) {
  sse_double res;
  res.packed = _mm_cmplt_pd(o1.packed, o2.packed);
  return res;
// Fills the e, w and root tables of the segments_t behind _bst_obj from the
// input arrays p and q and returns e[IDX(0,n)].
//   _bst_obj  handle that is cast to segments_t*; its e, w and r members are
//             the tables written here
//   p, q      inputs; p[0..n-1] and q[0..n] are read
//   nn        problem size (copied into the signed int n)
// Each row segment is processed 8 doubles per iteration with unaligned
// 128-bit loads and masked stores; a scalar loop first handles the
// (n-r) mod 8 leftover entries.
// NOTE(review): presumably the optimal-binary-search-tree recurrence (going
// by the name) -- confirm against the project documentation.
// NOTE(review): closing braces are absent from this listing; the indentation
// shows the intended nesting.
double bst_compute_123_m128_unaligned8_maskstore( void*_bst_obj, double* p, double* q, size_t nn ) {
    segments_t* mem = (segments_t*) _bst_obj;
    int n, i, r, l_end, j, l_end_pre;
    double t, e_tmp;
    double* e = mem->e, *w = mem->w;
    int* root = mem->r;
    __m128d v_tmp;
    __m128d v00, v01, v02, v03;
    __m128d v10, v11, v12, v13;
    __m128d v20, v21, v22, v23;
    __m128d v30, v31, v32, v33;
    __m128i v_cur_roots;
    __m128 v_rootmask0, v_rootmask1;
    // initialization
    // signed copy of nn: the loops below count down and subtract from n
    n = nn;

    int idx1, idx2, idx3;
    idx1 = IDX(n,n);
    e[idx1] = q[n];
    for (i = n-1; i >= 0; --i) {
        idx1 -= 2*(n-i)+1;
        idx2 = idx1 + 1;
        e[idx1] = q[i];
        w[idx1] = q[i];
        // rest of row i: e starts at INFINITY, w is the running sum of p and q
        for (j = i+1; j < n+1; ++j,++idx2) {
            e[idx2] = INFINITY;
            w[idx2] = w[idx2-1] + p[j-1] + q[j];
        idx3 = idx1;
        for (r = i; r < n; ++r) {
            // idx2 = IDX(r+1, r+1);
            idx1 = idx3;
            l_end = idx2 + (n-r);
            // l_end points to the first entry after the current row
            e_tmp = e[idx1++];
            // scalar loop until a multiple of 8 doubles is left
            // 8 = 4 * 2 128-bit vectors
            // NOTE(review): no statement that advances idx1 is visible in
            // this loop -- check the full source.
            l_end_pre = idx2 + ((n-r)&7);
            for( ; (idx2 < l_end_pre) && (idx2 < l_end); ++idx2 ) {
                t = e_tmp + e[idx2] + w[idx1];
                if (t < e[idx1]) {
                    e[idx1] = t;
                    root[idx1] = r;
            v_tmp = _mm_set_pd( e_tmp, e_tmp );
            // vector loop: 4 vectors of 2 doubles per iteration
            v_cur_roots = _mm_set_epi32(r, r, r, r);
            for( ; idx2 < l_end; idx2 += 8 ) {
                v01 = _mm_loadu_pd( &w[idx1  ] );
                v11 = _mm_loadu_pd( &w[idx1+2] );
                v21 = _mm_loadu_pd( &w[idx1+4] );
                v31 = _mm_loadu_pd( &w[idx1+6] );

                // candidate t = w[idx1..] + e_tmp + e[idx2..]
                v00 = _mm_loadu_pd( &e[idx2  ] );
                v01 = _mm_add_pd( v01, v_tmp );
                v10 = _mm_loadu_pd( &e[idx2+2] );
                v11 = _mm_add_pd( v11, v_tmp );
                v20 = _mm_loadu_pd( &e[idx2+4] );
                v21 = _mm_add_pd( v21, v_tmp );
                v30 = _mm_loadu_pd( &e[idx2+6] );
                v31 = _mm_add_pd( v31, v_tmp );

                v01 = _mm_add_pd( v01, v00 );
                v03 = _mm_loadu_pd( &e[idx1  ] );
                v11 = _mm_add_pd( v11, v10 );
                v13 = _mm_loadu_pd( &e[idx1+2] );
                v21 = _mm_add_pd( v21, v20 );
                v23 = _mm_loadu_pd( &e[idx1+4] );
                v31 = _mm_add_pd( v31, v30 );
                v33 = _mm_loadu_pd( &e[idx1+6] );

                // per-lane masks: candidate < current e
                v02 = _mm_cmplt_pd( v01, v03 );
                v12 = _mm_cmplt_pd( v11, v13 );
                v22 = _mm_cmplt_pd( v21, v23 );
                v32 = _mm_cmplt_pd( v31, v33 );

                // overwrite e only in the lanes where the candidate won
                _mm_maskstore_pd( &e[idx1  ],
                        _mm_castpd_si128( v02 ), v01 );
                _mm_maskstore_pd( &e[idx1+2],
                        _mm_castpd_si128( v12 ), v11 );
                _mm_maskstore_pd( &e[idx1+4],
                        _mm_castpd_si128( v22 ), v21 );
                _mm_maskstore_pd( &e[idx1+6],
                        _mm_castpd_si128( v32 ), v31 );

                // Narrow every 64-bit compare mask to one 32-bit lane so the
                // masks line up with the int entries of root.  Picking 32-bit
                // lanes 0 and 2 of each source keeps them in memory order:
                // rootmask0 covers idx1..idx1+3, rootmask1 idx1+4..idx1+7.
                v_rootmask0 = _mm_shuffle_ps(
                        _mm_castpd_ps( v02 ),
                        _mm_castpd_ps( v12 ),
                        _MM_SHUFFLE(2,0,2,0) );
                v_rootmask1 = _mm_shuffle_ps(
                        _mm_castpd_ps( v22 ),
                        _mm_castpd_ps( v32 ),
                        _MM_SHUFFLE(2,0,2,0) );

                // record r as the root wherever e was updated; the float
                // store is only used to move the 32-bit integer bit patterns
                _mm_maskstore_ps( (float*) &root[idx1],
                        _mm_castps_si128( v_rootmask0 ),
                        _mm_castsi128_ps( v_cur_roots ) );
                _mm_maskstore_ps( (float*) &root[idx1+4],
                        _mm_castps_si128( v_rootmask1 ),
                        _mm_castsi128_ps( v_cur_roots ) );
                idx1 += 8;

    return e[IDX(0,n)];
Exemple #7
  template<class Dummy>
  struct  call< tag::is_less_ ( tag::simd_<tag::double_,tag::sse_>
                              , tag::simd_<tag::double_,tag::sse_>
              , tag::cpu_, Dummy
        : callable
    template<class Sig>           struct result;
    template<class This,class A0>
    struct result<This(A0,A0)> : meta::strip<A0> {};

      A0 that = { _mm_cmplt_pd(a0,a1) };
      return that;
} }

// Overloads implementation for float
NT2_REGISTER_DISPATCH ( tag::is_less_, tag::cpu_, (A0)
                      , ((simd_<float_<A0>,tag::sse_>))

namespace nt2 { namespace ext
double bst_compute_121_m128_aligned4( void*_bst_obj, double* p, double* q, size_t nn ) {
    segments_t* mem = (segments_t*) _bst_obj;
    int n, i, r, l_end, l_end_pre, j;
    double t, e_tmp;
    double* e = mem->e, *w = mem->w;
    int* root = mem->r;
    __m128d v_tmp;
    __m128d v00, v01, v02, v03;
    __m128d v10, v11, v12, v13;
    __m128i v_cur_roots, v_old_roots, v_new_roots;
    __m128 v_rootmask;
    // initialization
    // mem->n = nn;
    n = nn; // subtractions with n potentially negative. say hello to all the bugs

    int idx1, idx2, idx3, pad, pad_r;
    idx1 = (n+1)*(n+2)/2 + n/2;
    e[idx1] = q[n];
    pad = 1;
    // pad contains the padding for row i+1
    // for row n it's always 1
    for (i = n-1; i >= 0; --i) {
        idx1 -= 2*(n-i)+1 + pad;
        idx2 = idx1 + 1;
        e[idx1] = q[i];
        w[idx1] = q[i];
        for (j = i+1; j < n+1; ++j,++idx2) {
            e[idx2] = INFINITY;
            w[idx2] = w[idx2-1] + p[j-1] + q[j];
        // idx2 now points to the beginning of the next line.
        idx2 += pad; // padding of line i+1

        idx3 = idx1;
        pad_r = pad; // padding of line r
        for (r = i; r < n; ++r) {
            pad_r = !pad_r; // padding of line r+1
            // idx2 = IDX(r+1, r+1);
            idx1 = idx3;
            l_end = idx2 + (n-r);
            e_tmp = e[idx1++];
            // calculate until a multiple of 8 doubles is left
            // 8 = 4 * 2 128-bit vectors
            l_end_pre = idx2 + ((n-r)&3);
            for( ; (idx2 < l_end_pre) && (idx2 < l_end); ++idx2 ) {
                t = e_tmp + e[idx2] + w[idx1];
                if (t < e[idx1]) {
                    e[idx1] = t;
                    root[idx1] = r;

            v_tmp = _mm_set_pd( e_tmp, e_tmp );
            // execute the shit for 4 vectors of size 2
            v_cur_roots = _mm_set_epi32(r, r, r, r);
            for( ; idx2 < l_end; idx2 += 4 ) {
                v01 = _mm_load_pd( &w[idx1  ] );
                v11 = _mm_load_pd( &w[idx1+2] );

                v00 = _mm_load_pd( &e[idx2  ] );
                v01 = _mm_add_pd( v01, v_tmp ); // supoptimal for raw-dependency
                v10 = _mm_load_pd( &e[idx2+2] );
                v11 = _mm_add_pd( v11, v_tmp );

                v01 = _mm_add_pd( v01, v00 );
                v03 = _mm_load_pd( &e[idx1  ] );
                v11 = _mm_add_pd( v11, v10 );
                v13 = _mm_load_pd( &e[idx1+2] );

                v02 = _mm_cmplt_pd( v01, v03 );
                v12 = _mm_cmplt_pd( v11, v13 );

                v00 = _mm_or_pd( _mm_and_pd( v02, v01 ), _mm_andnot_pd( v02, v03 ));
                v10 = _mm_or_pd( _mm_and_pd( v12, v11 ), _mm_andnot_pd( v12, v13 ));

                _mm_store_pd( &e[idx1  ], v00 );
                _mm_store_pd( &e[idx1+2], v10 );

                v_rootmask = _mm_shuffle_ps(
                        _mm_castpd_ps( v02 ),
                        _mm_castpd_ps( v12 ),
                        _MM_SHUFFLE(0,2,0,2) );

                v_old_roots = _mm_lddqu_si128( &root[idx1] );
                v_new_roots = _mm_or_si128(
                        _mm_and_si128(    v_cur_roots, 
                            _mm_castps_si128( v_rootmask ) ),
                        _mm_andnot_si128( v_old_roots,
                            _mm_castps_si128( v_rootmask ) )
                _mm_storeu_si128( &root[idx1], v_new_roots );

                idx1 += 4;

            idx2 += pad_r;
        pad = !pad;
        // every other line as padding 0, or 1, respectively

    // if n is even, the total number of entries in the first
    // row of the table is odd, so we need padding
    return e[n + !(n&1)];
calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
                                const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md,int gb_algorithm)
	int i,ai,k,n,ii,ii3,is3,nj0,nj1,at0,at1,offset;
    int jnrA,jnrB;
    int j3A,j3B;
	double shX,shY,shZ;
	double rr,rr_inv,rr_inv2,sum_tmp,sum,sum2,sum3,gbr;
	double sum_ai2, sum_ai3,tsum,tchain,doffset;
	double *obc_param;
    double *gb_radius;
    double *work;
    int *  jjnr;
    double *dadx;
    double *shiftvec;
    double min_rad,rad;
	__m128d ix,iy,iz,jx,jy,jz;
	__m128d dx,dy,dz,t1,t2,t3,t4;
	__m128d rsq,rinv,r;
	__m128d rai,rai_inv,raj, raj_inv,rai_inv2,sk,sk2,lij,dlij,duij;
	__m128d uij,lij2,uij2,lij3,uij3,diff2;
	__m128d lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
	__m128d sum_ai, tmp_ai,sk_ai,sk_aj,sk2_ai,sk2_aj,sk2_rinv;
	__m128d dadx1,dadx2;
    __m128d logterm;
	__m128d mask;
	__m128d obc_mask1,obc_mask2,obc_mask3;    
    __m128d oneeighth   = _mm_set1_pd(0.125);
    __m128d onefourth   = _mm_set1_pd(0.25);
	const __m128d half  = _mm_set1_pd(0.5);
	const __m128d three = _mm_set1_pd(3.0);
	const __m128d one   = _mm_set1_pd(1.0);
	const __m128d two   = _mm_set1_pd(2.0);
	const __m128d zero  = _mm_set1_pd(0.0);
	const __m128d neg   = _mm_set1_pd(-1.0);
	/* Set the dielectric offset */
	doffset   = born->gb_doffset;
	gb_radius = born->gb_radius;
    obc_param = born->param;
    work      = born->gpol_hct_work;
    jjnr      = nl->jjnr;
    dadx      = fr->dadx;
    shiftvec  = fr->shift_vec[0];
    jx        = _mm_setzero_pd();
    jy        = _mm_setzero_pd();
    jz        = _mm_setzero_pd();
    jnrA = jnrB = 0;
		work[i] = 0;
        ii     = nl->iinr[i];
		ii3	   = ii*3;
        is3    = 3*nl->shift[i];     
        shX    = shiftvec[is3];  
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];      
        nj1    = nl->jindex[i+1];    
        ix     = _mm_set1_pd(shX+x[ii3+0]);
		iy     = _mm_set1_pd(shY+x[ii3+1]);
		iz     = _mm_set1_pd(shZ+x[ii3+2]);
		rai    = _mm_load1_pd(gb_radius+ii);
		rai_inv= gmx_mm_inv_pd(rai);
		sum_ai = _mm_setzero_pd();
		sk_ai  = _mm_load1_pd(born->param+ii);
		sk2_ai = _mm_mul_pd(sk_ai,sk_ai);
			jnrA        = jjnr[k];   
			jnrB        = jjnr[k+1];
            j3A         = 3*jnrA;  
			j3B         = 3*jnrB;
            dx    = _mm_sub_pd(ix, jx);
			dy    = _mm_sub_pd(iy, jy);
			dz    = _mm_sub_pd(iz, jz);
            rsq         = gmx_mm_calc_rsq_pd(dx,dy,dz);
            rinv        = gmx_mm_invsqrt_pd(rsq);
            r           = _mm_mul_pd(rsq,rinv);
			/* Compute raj_inv aj1-4 */
            raj_inv     = gmx_mm_inv_pd(raj);
            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_pd(r,sk_aj);
            t2            = _mm_sub_pd(r,sk_aj);
            t3            = _mm_sub_pd(sk_aj,r);
            obc_mask1     = _mm_cmplt_pd(rai, t1);
            obc_mask2     = _mm_cmplt_pd(rai, t2);
            obc_mask3     = _mm_cmplt_pd(rai, t3);
            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
            dlij          = _mm_and_pd(one,obc_mask2);
            uij2          = _mm_mul_pd(uij, uij);
            uij3          = _mm_mul_pd(uij2,uij);
            lij2          = _mm_mul_pd(lij, lij);
            lij3          = _mm_mul_pd(lij2,lij);
            diff2         = _mm_sub_pd(uij2,lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_aj        = _mm_mul_pd(sk_aj,sk_aj);
            sk2_rinv      = _mm_mul_pd(sk2_aj,rinv);
            prod          = _mm_mul_pd(onefourth,sk2_rinv);
            logterm       = gmx_mm_log_pd(_mm_mul_pd(uij,lij_inv));
            t1            = _mm_sub_pd(lij,uij);
            t2            = _mm_mul_pd(diff2,
            t3            = _mm_mul_pd(half,_mm_mul_pd(rinv,logterm));
            t1            = _mm_add_pd(t1,_mm_add_pd(t2,t3));
            t4            = _mm_mul_pd(two,_mm_sub_pd(rai_inv,lij));
            t4            = _mm_and_pd(t4,obc_mask3);
            t1            = _mm_mul_pd(half,_mm_add_pd(t1,t4));
            sum_ai        = _mm_add_pd(sum_ai, _mm_and_pd(t1,obc_mask1) );
            t1            = _mm_add_pd(_mm_mul_pd(half,lij2),
            t1            = _mm_sub_pd(t1,
            t2            = _mm_mul_pd(onefourth,
            t2            = _mm_sub_pd(t2,
            t3            = _mm_mul_pd(_mm_mul_pd(onefourth,logterm),
            t3            = _mm_sub_pd(t3,
            t1            = _mm_mul_pd(rinv,
            dadx1         = _mm_and_pd(t1,obc_mask1);
            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_pd(r,sk_ai);
            t2            = _mm_sub_pd(r,sk_ai);
            t3            = _mm_sub_pd(sk_ai,r);
            obc_mask1     = _mm_cmplt_pd(raj, t1);
            obc_mask2     = _mm_cmplt_pd(raj, t2);
            obc_mask3     = _mm_cmplt_pd(raj, t3);
            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
            dlij          = _mm_and_pd(one,obc_mask2);
            uij2          = _mm_mul_pd(uij, uij);
            uij3          = _mm_mul_pd(uij2,uij);
            lij2          = _mm_mul_pd(lij, lij);
            lij3          = _mm_mul_pd(lij2,lij);
            diff2         = _mm_sub_pd(uij2,lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv      = _mm_mul_pd(sk2_ai,rinv);
            prod          = _mm_mul_pd(onefourth,sk2_rinv);
            logterm       = gmx_mm_log_pd(_mm_mul_pd(uij,lij_inv));
            t1            = _mm_sub_pd(lij,uij);
            t2            = _mm_mul_pd(diff2,
            t3            = _mm_mul_pd(half,_mm_mul_pd(rinv,logterm));
            t1            = _mm_add_pd(t1,_mm_add_pd(t2,t3));
            t4            = _mm_mul_pd(two,_mm_sub_pd(raj_inv,lij));
            t4            = _mm_and_pd(t4,obc_mask3);
            t1            = _mm_mul_pd(half,_mm_add_pd(t1,t4));
            t1            = _mm_add_pd(_mm_mul_pd(half,lij2),
            t1            = _mm_sub_pd(t1,
            t2            = _mm_mul_pd(onefourth,
            t2            = _mm_sub_pd(t2,
            t3            = _mm_mul_pd(_mm_mul_pd(onefourth,logterm),
            t3            = _mm_sub_pd(t3,
            t1            = _mm_mul_pd(rinv,
            dadx2         = _mm_and_pd(t1,obc_mask1);
            dadx += 2;
            dadx += 2;
        } /* end normal inner loop */
			jnrA        = jjnr[k];   
            j3A         = 3*jnrA;  
            dx    = _mm_sub_sd(ix, jx);
			dy    = _mm_sub_sd(iy, jy);
			dz    = _mm_sub_sd(iz, jz);
            rsq         = gmx_mm_calc_rsq_pd(dx,dy,dz);
            rinv        = gmx_mm_invsqrt_pd(rsq);
            r           = _mm_mul_sd(rsq,rinv);
			/* Compute raj_inv aj1-4 */
            raj_inv     = gmx_mm_inv_pd(raj);
            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_sd(r,sk_aj);
            t2            = _mm_sub_sd(r,sk_aj);
            t3            = _mm_sub_sd(sk_aj,r);
            obc_mask1     = _mm_cmplt_sd(rai, t1);
            obc_mask2     = _mm_cmplt_sd(rai, t2);
            obc_mask3     = _mm_cmplt_sd(rai, t3);
            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(_mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
            dlij          = _mm_and_pd(one,obc_mask2);
            uij2          = _mm_mul_sd(uij, uij);
            uij3          = _mm_mul_sd(uij2,uij);
            lij2          = _mm_mul_sd(lij, lij);
            lij3          = _mm_mul_sd(lij2,lij);
            diff2         = _mm_sub_sd(uij2,lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_aj        = _mm_mul_sd(sk_aj,sk_aj);
            sk2_rinv      = _mm_mul_sd(sk2_aj,rinv);
            prod          = _mm_mul_sd(onefourth,sk2_rinv);
            logterm       = gmx_mm_log_pd(_mm_mul_sd(uij,lij_inv));
            t1            = _mm_sub_sd(lij,uij);
            t2            = _mm_mul_sd(diff2,
            t3            = _mm_mul_sd(half,_mm_mul_sd(rinv,logterm));
            t1            = _mm_add_sd(t1,_mm_add_sd(t2,t3));
            t4            = _mm_mul_sd(two,_mm_sub_sd(rai_inv,lij));
            t4            = _mm_and_pd(t4,obc_mask3);
            t1            = _mm_mul_sd(half,_mm_add_sd(t1,t4));
            sum_ai        = _mm_add_sd(sum_ai, _mm_and_pd(t1,obc_mask1) );
            t1            = _mm_add_sd(_mm_mul_sd(half,lij2),
            t1            = _mm_sub_sd(t1,
            t2            = _mm_mul_sd(onefourth,
            t2            = _mm_sub_sd(t2,
            t3            = _mm_mul_sd(_mm_mul_sd(onefourth,logterm),
            t3            = _mm_sub_sd(t3,
            t1            = _mm_mul_sd(rinv,
            dadx1         = _mm_and_pd(t1,obc_mask1);
            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_sd(r,sk_ai);
            t2            = _mm_sub_sd(r,sk_ai);
            t3            = _mm_sub_sd(sk_ai,r);
            obc_mask1     = _mm_cmplt_sd(raj, t1);
            obc_mask2     = _mm_cmplt_sd(raj, t2);
            obc_mask3     = _mm_cmplt_sd(raj, t3);
            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
            dlij          = _mm_and_pd(one,obc_mask2);
            uij2          = _mm_mul_sd(uij, uij);
            uij3          = _mm_mul_sd(uij2,uij);
            lij2          = _mm_mul_sd(lij, lij);
            lij3          = _mm_mul_sd(lij2,lij);
            diff2         = _mm_sub_sd(uij2,lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv      = _mm_mul_sd(sk2_ai,rinv);
            prod          = _mm_mul_sd(onefourth,sk2_rinv);
            logterm       = gmx_mm_log_pd(_mm_mul_sd(uij,lij_inv));
            t1            = _mm_sub_sd(lij,uij);
            t2            = _mm_mul_sd(diff2,
            t3            = _mm_mul_sd(half,_mm_mul_sd(rinv,logterm));
            t1            = _mm_add_sd(t1,_mm_add_sd(t2,t3));
            t4            = _mm_mul_sd(two,_mm_sub_sd(raj_inv,lij));
            t4            = _mm_and_pd(t4,obc_mask3);
            t1            = _mm_mul_sd(half,_mm_add_sd(t1,t4));
            t1            = _mm_add_sd(_mm_mul_sd(half,lij2),
            t1            = _mm_sub_sd(t1,
            t2            = _mm_mul_sd(onefourth,
            t2            = _mm_sub_sd(t2,
            t3            = _mm_mul_sd(_mm_mul_sd(onefourth,logterm),
            t3            = _mm_sub_sd(t3,
            t1            = _mm_mul_sd(rinv,
            dadx2         = _mm_and_pd(t1,obc_mask1);
            dadx += 2;
            dadx += 2;
	/* Parallel summations */
		gmx_sum(natoms, work, cr);
	else if(DOMAINDECOMP(cr))
		dd_atom_sum_real(cr->dd, work);
        /* HCT */
        for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
			if(born->use[i] != 0)
                rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset; 
                sum     = 1.0/rr - work[i];
                min_rad = rr + doffset;
                rad     = 1.0/sum; 
                born->bRad[i]   = rad > min_rad ? rad : min_rad;
                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
        /* Extra communication required for DD */
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
        /* OBC */
        for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
			if(born->use[i] != 0)
                rr      = top->atomtypes.gb_radius[md->typeA[i]];
                rr_inv2 = 1.0/rr;
                rr      = rr-doffset; 
                rr_inv  = 1.0/rr;
                sum     = rr * work[i];
                sum2    = sum  * sum;
                sum3    = sum2 * sum;
                tsum    = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
                born->bRad[i] = rr_inv - tsum*rr_inv2;
                born->bRad[i] = 1.0 / born->bRad[i];
                tchain  = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
                born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
        /* Extra (local) communication required for DD */
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
            dd_atom_spread_real(cr->dd, born->drobc);
	return 0;
Exemple #10
// Codegen check for _mm_cmplt_pd: the patterns below expect a call to
// llvm.x86.sse2.cmp.pd with predicate 1.
__m128d test_mm_cmplt_pd(__m128d __a, __m128d __b) {
  // CHECK-LABEL: @test_mm_cmplt_pd
  // CHECK: @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 1)
  return _mm_cmplt_pd(__a, __b);
// Lane-wise a < b on two packed doubles; returns the mask produced by
// _mm_cmplt_pd.  Neither operand is modified, so both are taken by const
// reference, which also lets const objects and temporaries be passed as b.
Packet2d plt(const Packet2d& a, const Packet2d& b) { return _mm_cmplt_pd(a,b); }