Example #1
/**
 * Processes two doubles at a time
 */
int
_mandelbrot_2( double const * const c_re_arg, 
	           double const * const c_im_arg, 
	           int                  max_iter 
	         )
{
	__m128d z_re = _mm_load_pd(c_re_arg);
	__m128d z_im = _mm_load_pd(c_im_arg);
	__m128d y_re;
	__m128d y_im;
	__m128d c_re = z_re;
	__m128d c_im = z_im;

	__m128i count = _mm_set1_epi64x(0);

	__m128d md;
	__m128d mt;
	__m128i mi = _mm_set1_epi16(0xffff); // all-ones lane mask: lanes still iterating

	__m128d four = _mm_set1_pd(4.0);
	__m128i one = _mm_set1_epi64x(1);

	for (int i = 0; i<max_iter; i+=1)
	{
		// y = z .* z;
		y_re = _mm_mul_pd(z_re, z_re);
		y_im = _mm_mul_pd(z_im, z_im);

		// y = z * z;
		y_re = _mm_sub_pd(y_re, y_im);
		y_im = _mm_mul_pd(z_re, z_im);
		y_im = _mm_add_pd(y_im, y_im);

		// z = z * z + c
		z_re = _mm_add_pd(y_re, c_re);
		z_im = _mm_add_pd(y_im, c_im);

		// escape test: |z|^2 = z_re^2 + z_im^2 < 4
		md = _mm_mul_pd(z_re, z_re);
		mt = _mm_mul_pd(z_im, z_im);
		md = _mm_cmplt_pd(_mm_add_pd(md, mt), four);
		mi = _mm_and_si128(mi, _mm_castpd_si128(md));
		// PRINT_M128I(mi);
		if ( !_mm_movemask_pd(md) ) { break; }

		// count iterations
		count = _mm_add_epi64( count, _mm_and_si128( mi, one) );
	}

	int val;
	count = _mm_add_epi64( _mm_srli_si128(count, 8), count );
	val   = _mm_cvtsi128_si64( count );

	return val;
}
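A minimal caller sketch for the routine above (the main harness and the sample coordinates are illustrative, not from the original source). _mm_load_pd requires 16-byte-aligned input, so the two coordinate pairs are aligned explicitly; the return value is the sum of the iteration counts of both lanes.

/* hypothetical driver for _mandelbrot_2; assumes the function above is
   defined earlier in the same translation unit */
#include <stdio.h>
#include <emmintrin.h>

int main(void)
{
	/* _mm_load_pd needs 16-byte alignment */
	double c_re[2] __attribute__((aligned(16))) = { -0.75, 0.25 };
	double c_im[2] __attribute__((aligned(16))) = {  0.10, 0.50 };

	int iters = _mandelbrot_2(c_re, c_im, 1000);
	printf("summed iteration count of both lanes: %d\n", iters);
	return 0;
}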
Example #2
void*drawman(void*x){
	int c=col++;
	unsigned _m=mx,mxx=16777216/_m;
	double _x=xx,_y=yy,_w=wh;
	do{
		__m128d cr=_mm_set1_pd(_x+_w*c);
		for(int j=0;j<512;j+=2){
			__m128d zr=cr,
				zi=_mm_set_pd(_y+_w*j,_y+_w*(j+1)),ci=zi,
				zr2=_mm_mul_pd(zr,zr),zi2=_mm_mul_pd(zi,zi);
			unsigned mk=mx-1;
			uint64_t kk[2]__attribute__((aligned(16)))={mk,mk};
			__m128i k=_mm_load_si128((__m128i*)kk);
			do{
				zi=_mm_mul_pd(zi,zr);
				zi=_mm_add_pd(_mm_add_pd(zi,zi),ci);
				zr=_mm_add_pd(_mm_sub_pd(zr2,zi2),cr);
				zr2=_mm_mul_pd(zr,zr);
				zi2=_mm_mul_pd(zi,zi);
				__m128d n=_mm_cmplt_pd(_mm_add_pd(zr2,zi2),_mm_set1_pd(4));
				if(!_mm_movemask_pd(n))break;
				k=_mm_add_epi64(k,_mm_castpd_si128(n));
			}while(--mk);
			_mm_store_si128((__m128i*)kk,k);
			manor[c][j]=kk[1]*mxx>>16;
			manor[c][j+1]=kk[0]*mxx>>16;
		}
		done[c>>6]|=1ULL<<(c&63);
		c=col++;
	}while(c<512&&!pull);
	return 0;
}
Example #3
__m128d test_mm_cmplt_pd(__m128d A, __m128d B) {
  // DAG-LABEL: test_mm_cmplt_pd
  // DAG: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 1)
  //
  // ASM-LABEL: test_mm_cmplt_pd
  // ASM: cmpltpd
  return _mm_cmplt_pd(A, B);
}
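The test above only checks code generation (cmppd with predicate 1). As a semantics reminder, here is a small standalone sketch (illustrative, not from any of the listed projects): each result lane is all-ones when the comparison holds and all-zeros otherwise, which is why the other snippets in this listing feed the result into _mm_movemask_pd or into bitwise blends.

#include <stdio.h>
#include <emmintrin.h>

int main(void)
{
  __m128d a = _mm_set_pd(3.0, 1.0);  /* lanes (low, high) = (1.0, 3.0) */
  __m128d b = _mm_set_pd(2.0, 2.0);  /* lanes (low, high) = (2.0, 2.0) */

  __m128d m = _mm_cmplt_pd(a, b);    /* per lane: a < b ? all-ones : all-zeros */

  /* movemask packs the sign bit of each lane: low lane true, high lane false */
  printf("mask bits = 0x%x\n", _mm_movemask_pd(m));  /* prints 0x1 */
  return 0;
}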
Example #4
void _SIMD_cmplt_pd(__SIMDd a, __SIMDd b, void** resultPtr)
{
  __SIMDd* result = (__SIMDd*)malloc(sizeof(__SIMDd));
  *resultPtr = result;
#ifdef  USE_SSE
  *result = _mm_cmplt_pd(a,b);
#elif defined USE_AVX
  *result = _mm256_cmp_pd(a, b, _CMP_LT_OQ);
#elif defined USE_IBM
  *result = vec_cmplt(a,b);
#endif
}
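A possible caller for the wrapper above (the __SIMDd typedef and the USE_SSE path are assumptions inferred from the #ifdef branches, and the wrapper is assumed to be declared in the same translation unit). The wrapper heap-allocates the result and hands it back through resultPtr, so the caller owns the allocation and must free it.

#include <stdio.h>
#include <stdlib.h>
#include <emmintrin.h>

typedef __m128d __SIMDd;   /* assumed definition for the USE_SSE branch */

int main(void)
{
  __SIMDd a = _mm_set_pd(1.0, 4.0);
  __SIMDd b = _mm_set_pd(2.0, 3.0);

  void *res = NULL;
  _SIMD_cmplt_pd(a, b, &res);            /* stores a malloc'd mask via resultPtr */

  printf("lanes where a < b: 0x%x\n", _mm_movemask_pd(*(__SIMDd *)res));

  free(res);                             /* caller releases the result */
  return 0;
}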
Example #5
BI_FORCE_INLINE inline sse_double operator<(const sse_double& o1,
    const sse_double& o2) {
  sse_double res;
  res.packed = _mm_cmplt_pd(o1.packed, o2.packed);
  return res;
}
Example #6
double bst_compute_123_m128_unaligned8_maskstore( void*_bst_obj, double* p, double* q, size_t nn ) {
    segments_t* mem = (segments_t*) _bst_obj;
    int n, i, r, l_end, j, l_end_pre;
    double t, e_tmp;
    double* e = mem->e, *w = mem->w;
    int* root = mem->r;
    __m128d v_tmp;
    __m128d v00, v01, v02, v03;
    __m128d v10, v11, v12, v13;
    __m128d v20, v21, v22, v23;
    __m128d v30, v31, v32, v33;
    __m128i v_cur_roots;
    __m128 v_rootmask0, v_rootmask1;
    // initialization
    // mem->n = nn;
    n = nn; // n is used in subtractions that can go negative, so it must be a signed int

    int idx1, idx2, idx3;
    
    idx1 = IDX(n,n);
    e[idx1] = q[n];
    idx1++;
    for (i = n-1; i >= 0; --i) {
        idx1 -= 2*(n-i)+1;
        idx2 = idx1 + 1;
        e[idx1] = q[i];
        w[idx1] = q[i];
        for (j = i+1; j < n+1; ++j,++idx2) {
            e[idx2] = INFINITY;
            w[idx2] = w[idx2-1] + p[j-1] + q[j];
        }
        idx3 = idx1; 
        for (r = i; r < n; ++r) {
            // idx2 = IDX(r+1, r+1);
            idx1 = idx3;
            l_end = idx2 + (n-r);
            // l_end points to the first entry after the current row
            e_tmp = e[idx1++];
            // calculate until a multiple of 8 doubles is left
            // 8 = 4 * 2 128-bit vectors
            l_end_pre = idx2 + ((n-r)&7);
            for( ; (idx2 < l_end_pre) && (idx2 < l_end); ++idx2 ) {
                t = e_tmp + e[idx2] + w[idx1];
                if (t < e[idx1]) {
                    e[idx1] = t;
                    root[idx1] = r;
                }
                idx1++;
            }
            
            v_tmp = _mm_set_pd( e_tmp, e_tmp );
            // main body: four 2-element vectors (8 doubles) per iteration
            v_cur_roots = _mm_set_epi32(r, r, r, r);
            for( ; idx2 < l_end; idx2 += 8 ) {
                v01 = _mm_loadu_pd( &w[idx1  ] );
                v11 = _mm_loadu_pd( &w[idx1+2] );
                v21 = _mm_loadu_pd( &w[idx1+4] );
                v31 = _mm_loadu_pd( &w[idx1+6] );

                v00 = _mm_loadu_pd( &e[idx2  ] );
                v01 = _mm_add_pd( v01, v_tmp ); 
                v10 = _mm_loadu_pd( &e[idx2+2] );
                v11 = _mm_add_pd( v11, v_tmp );
                v20 = _mm_loadu_pd( &e[idx2+4] );
                v21 = _mm_add_pd( v21, v_tmp );
                v30 = _mm_loadu_pd( &e[idx2+6] );
                v31 = _mm_add_pd( v31, v_tmp );

                v01 = _mm_add_pd( v01, v00 );
                v03 = _mm_loadu_pd( &e[idx1  ] );
                v11 = _mm_add_pd( v11, v10 );
                v13 = _mm_loadu_pd( &e[idx1+2] );
                v21 = _mm_add_pd( v21, v20 );
                v23 = _mm_loadu_pd( &e[idx1+4] );
                v31 = _mm_add_pd( v31, v30 );
                v33 = _mm_loadu_pd( &e[idx1+6] );

                v02 = _mm_cmplt_pd( v01, v03 );
                v12 = _mm_cmplt_pd( v11, v13 );
                v22 = _mm_cmplt_pd( v21, v23 );
                v32 = _mm_cmplt_pd( v31, v33 );

                _mm_maskstore_pd( &e[idx1  ],
                        _mm_castpd_si128( v02 ), v01 );
                _mm_maskstore_pd( &e[idx1+2],
                        _mm_castpd_si128( v12 ), v11 );
                _mm_maskstore_pd( &e[idx1+4],
                        _mm_castpd_si128( v22 ), v21 );
                _mm_maskstore_pd( &e[idx1+6], 
                        _mm_castpd_si128( v32 ), v31 );

                v_rootmask0 = _mm_shuffle_ps(
                        _mm_castpd_ps( v02 ),
                        _mm_castpd_ps( v12 ),
                        _MM_SHUFFLE(0,2,0,2) );
                v_rootmask1 = _mm_shuffle_ps(
                        _mm_castpd_ps( v22 ),
                        _mm_castpd_ps( v32 ),
                        _MM_SHUFFLE(0,2,0,2) );

                _mm_maskstore_ps( (float*)&root[idx1],
                        _mm_castps_si128( v_rootmask0 ),
                        _mm_castsi128_ps( v_cur_roots ) );
                _mm_maskstore_ps( (float*)&root[idx1+4],
                        _mm_castps_si128( v_rootmask1 ),
                        _mm_castsi128_ps( v_cur_roots ) );
                
                idx1 += 8;
            }
            idx3++;
        }
    }


    return e[IDX(0,n)];
}
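The kernel above and the one in Example #8 share the same mask idiom: compare candidate costs against the stored ones with _mm_cmplt_pd, then write back the winners either with a maskstore (above) or with an and/andnot blend (Example #8). A reduced, self-contained sketch of the blend form, with illustrative names:

#include <stdint.h>
#include <emmintrin.h>

/* branchless "keep the minimum and remember which candidate won"
   for two doubles at a time; cand/best/best_idx are placeholder names */
static void min2_with_index(const double *cand, double *best,
                            int64_t *best_idx, int64_t cand_idx)
{
    __m128d c = _mm_loadu_pd(cand);
    __m128d b = _mm_loadu_pd(best);
    __m128d m = _mm_cmplt_pd(c, b);              /* all-ones where cand < best */

    /* blend values: (m & c) | (~m & b) */
    _mm_storeu_pd(best, _mm_or_pd(_mm_and_pd(m, c), _mm_andnot_pd(m, b)));

    /* blend the 64-bit indices with the same mask, reinterpreted as integers */
    __m128i mi = _mm_castpd_si128(m);
    __m128i oi = _mm_loadu_si128((const __m128i *)best_idx);
    __m128i ci = _mm_set1_epi64x(cand_idx);
    _mm_storeu_si128((__m128i *)best_idx,
                     _mm_or_si128(_mm_and_si128(mi, ci),
                                  _mm_andnot_si128(mi, oi)));
}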
Example #7
namespace nt2 { namespace ext
{
  template<class Dummy>
  struct  call< tag::is_less_ ( tag::simd_<tag::double_,tag::sse_>
                              , tag::simd_<tag::double_,tag::sse_>
                              )
              , tag::cpu_, Dummy
              >
        : callable
  {
    template<class Sig>           struct result;
    template<class This,class A0>
    struct result<This(A0,A0)> : meta::strip<A0> {};

    NT2_FUNCTOR_CALL(2)
    {
      A0 that = { _mm_cmplt_pd(a0,a1) };
      return that;
    }
  };
} }

////////////////////////////////////////////////////////////////////////////////
// Overloads implementation for float
////////////////////////////////////////////////////////////////////////////////
NT2_REGISTER_DISPATCH ( tag::is_less_, tag::cpu_, (A0)
                      , ((simd_<float_<A0>,tag::sse_>))
                        ((simd_<float_<A0>,tag::sse_>))
                      );

namespace nt2 { namespace ext
{
Example #8
double bst_compute_121_m128_aligned4( void*_bst_obj, double* p, double* q, size_t nn ) {
    segments_t* mem = (segments_t*) _bst_obj;
    int n, i, r, l_end, l_end_pre, j;
    double t, e_tmp;
    double* e = mem->e, *w = mem->w;
    int* root = mem->r;
    __m128d v_tmp;
    __m128d v00, v01, v02, v03;
    __m128d v10, v11, v12, v13;
    __m128i v_cur_roots, v_old_roots, v_new_roots;
    __m128 v_rootmask;
    // initialization
    // mem->n = nn;
    n = nn; // n is used in subtractions that can go negative, so it must be a signed int

    int idx1, idx2, idx3, pad, pad_r;
    
    idx1 = (n+1)*(n+2)/2 + n/2;
    e[idx1] = q[n];
    idx1++;
    pad = 1;
    // pad contains the padding for row i+1
    // for row n it's always 1
    for (i = n-1; i >= 0; --i) {
        idx1 -= 2*(n-i)+1 + pad;
        idx2 = idx1 + 1;
        e[idx1] = q[i];
        w[idx1] = q[i];
        for (j = i+1; j < n+1; ++j,++idx2) {
            e[idx2] = INFINITY;
            w[idx2] = w[idx2-1] + p[j-1] + q[j];
        }
        // idx2 now points to the beginning of the next line.
        idx2 += pad; // padding of line i+1

        idx3 = idx1;
        pad_r = pad; // padding of line r
        for (r = i; r < n; ++r) {
            pad_r = !pad_r; // padding of line r+1
            // idx2 = IDX(r+1, r+1);
            idx1 = idx3;
            l_end = idx2 + (n-r);
            e_tmp = e[idx1++];
            // calculate until a multiple of 4 doubles is left
            // 4 = 2 * 2-element 128-bit vectors
            l_end_pre = idx2 + ((n-r)&3);
            for( ; (idx2 < l_end_pre) && (idx2 < l_end); ++idx2 ) {
                t = e_tmp + e[idx2] + w[idx1];
                if (t < e[idx1]) {
                    e[idx1] = t;
                    root[idx1] = r;
                }
                idx1++;
            }

            v_tmp = _mm_set_pd( e_tmp, e_tmp );
            // main body: two 2-element vectors (4 doubles) per iteration
            v_cur_roots = _mm_set_epi32(r, r, r, r);
            for( ; idx2 < l_end; idx2 += 4 ) {
                v01 = _mm_load_pd( &w[idx1  ] );
                v11 = _mm_load_pd( &w[idx1+2] );

                v00 = _mm_load_pd( &e[idx2  ] );
                v01 = _mm_add_pd( v01, v_tmp ); // suboptimal: read-after-write dependency
                v10 = _mm_load_pd( &e[idx2+2] );
                v11 = _mm_add_pd( v11, v_tmp );

                v01 = _mm_add_pd( v01, v00 );
                v03 = _mm_load_pd( &e[idx1  ] );
                v11 = _mm_add_pd( v11, v10 );
                v13 = _mm_load_pd( &e[idx1+2] );

                v02 = _mm_cmplt_pd( v01, v03 );
                v12 = _mm_cmplt_pd( v11, v13 );

                v00 = _mm_or_pd( _mm_and_pd( v02, v01 ), _mm_andnot_pd( v02, v03 ));
                v10 = _mm_or_pd( _mm_and_pd( v12, v11 ), _mm_andnot_pd( v12, v13 ));

                _mm_store_pd( &e[idx1  ], v00 );
                _mm_store_pd( &e[idx1+2], v10 );

                v_rootmask = _mm_shuffle_ps(
                        _mm_castpd_ps( v02 ),
                        _mm_castpd_ps( v12 ),
                        _MM_SHUFFLE(0,2,0,2) );

                v_old_roots = _mm_lddqu_si128( (__m128i const*)&root[idx1] );
                v_new_roots = _mm_or_si128(
                        _mm_and_si128(    v_cur_roots,
                            _mm_castps_si128( v_rootmask ) ),
                        _mm_andnot_si128( _mm_castps_si128( v_rootmask ),
                            v_old_roots )
                        );
                _mm_storeu_si128( (__m128i*)&root[idx1], v_new_roots );

                idx1 += 4;
            }

            idx2 += pad_r;
            idx3++;
        }
        pad = !pad;
        // every other line has padding 0 or 1, respectively
    }

    // if n is even, the total number of entries in the first
    // row of the table is odd, so we need padding
    return e[n + !(n&1)];
}
Example #9
int
calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
                                const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md,int gb_algorithm)
{
	int i,ai,k,n,ii,ii3,is3,nj0,nj1,at0,at1,offset;
    int jnrA,jnrB;
    int j3A,j3B;
	double shX,shY,shZ;
	double rr,rr_inv,rr_inv2,sum_tmp,sum,sum2,sum3,gbr;
	double sum_ai2, sum_ai3,tsum,tchain,doffset;
	double *obc_param;
    double *gb_radius;
    double *work;
    int *  jjnr;
    double *dadx;
    double *shiftvec;
    double min_rad,rad;
    
	__m128d ix,iy,iz,jx,jy,jz;
	__m128d dx,dy,dz,t1,t2,t3,t4;
	__m128d rsq,rinv,r;
	__m128d rai,rai_inv,raj, raj_inv,rai_inv2,sk,sk2,lij,dlij,duij;
	__m128d uij,lij2,uij2,lij3,uij3,diff2;
	__m128d lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
	__m128d sum_ai, tmp_ai,sk_ai,sk_aj,sk2_ai,sk2_aj,sk2_rinv;
	__m128d dadx1,dadx2;
    __m128d logterm;
	__m128d mask;
	__m128d obc_mask1,obc_mask2,obc_mask3;    
    
    __m128d oneeighth   = _mm_set1_pd(0.125);
    __m128d onefourth   = _mm_set1_pd(0.25);
    
	const __m128d half  = _mm_set1_pd(0.5);
	const __m128d three = _mm_set1_pd(3.0);
	const __m128d one   = _mm_set1_pd(1.0);
	const __m128d two   = _mm_set1_pd(2.0);
	const __m128d zero  = _mm_set1_pd(0.0);
	const __m128d neg   = _mm_set1_pd(-1.0);
	
	/* Set the dielectric offset */
	doffset   = born->gb_doffset;
	gb_radius = born->gb_radius;
    obc_param = born->param;
    work      = born->gpol_hct_work;
    jjnr      = nl->jjnr;
    dadx      = fr->dadx;
    shiftvec  = fr->shift_vec[0];
    
    jx        = _mm_setzero_pd();
    jy        = _mm_setzero_pd();
    jz        = _mm_setzero_pd();
    
    jnrA = jnrB = 0;
    
	for(i=0;i<born->nr;i++)
	{
		work[i] = 0;
	}
	
	for(i=0;i<nl->nri;i++)
	{
        ii     = nl->iinr[i];
		ii3	   = ii*3;
        is3    = 3*nl->shift[i];     
        shX    = shiftvec[is3];  
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];      
        nj1    = nl->jindex[i+1];    
        
        ix     = _mm_set1_pd(shX+x[ii3+0]);
		iy     = _mm_set1_pd(shY+x[ii3+1]);
		iz     = _mm_set1_pd(shZ+x[ii3+2]);
		        
		rai    = _mm_load1_pd(gb_radius+ii);
		rai_inv= gmx_mm_inv_pd(rai);
        
		sum_ai = _mm_setzero_pd();
		
		sk_ai  = _mm_load1_pd(born->param+ii);
		sk2_ai = _mm_mul_pd(sk_ai,sk_ai);
        
		for(k=nj0;k<nj1-1;k+=2)
		{
			jnrA        = jjnr[k];   
			jnrB        = jjnr[k+1];
			
            j3A         = 3*jnrA;  
			j3B         = 3*jnrB;
            
            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A,x+j3B,jx,jy,jz);
            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA,gb_radius+jnrB,raj);
            GMX_MM_LOAD_2VALUES_PD(obc_param+jnrA,obc_param+jnrB,sk_aj);
			
            dx    = _mm_sub_pd(ix, jx);
			dy    = _mm_sub_pd(iy, jy);
			dz    = _mm_sub_pd(iz, jz);
			
            rsq         = gmx_mm_calc_rsq_pd(dx,dy,dz);
            
            rinv        = gmx_mm_invsqrt_pd(rsq);
            r           = _mm_mul_pd(rsq,rinv);
            
			/* Compute raj_inv aj1-4 */
            raj_inv     = gmx_mm_inv_pd(raj);
            
            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_pd(r,sk_aj);
            t2            = _mm_sub_pd(r,sk_aj);
            t3            = _mm_sub_pd(sk_aj,r);
            obc_mask1     = _mm_cmplt_pd(rai, t1);
            obc_mask2     = _mm_cmplt_pd(rai, t2);
            obc_mask3     = _mm_cmplt_pd(rai, t3);
            
            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
                                      _mm_andnot_pd(obc_mask2,rai_inv));
            dlij          = _mm_and_pd(one,obc_mask2);
            uij2          = _mm_mul_pd(uij, uij);
            uij3          = _mm_mul_pd(uij2,uij);
            lij2          = _mm_mul_pd(lij, lij);
            lij3          = _mm_mul_pd(lij2,lij);
                        
            diff2         = _mm_sub_pd(uij2,lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_aj        = _mm_mul_pd(sk_aj,sk_aj);
            sk2_rinv      = _mm_mul_pd(sk2_aj,rinv);
            prod          = _mm_mul_pd(onefourth,sk2_rinv);
                        
            logterm       = gmx_mm_log_pd(_mm_mul_pd(uij,lij_inv));
            
            t1            = _mm_sub_pd(lij,uij);
            t2            = _mm_mul_pd(diff2,
                                       _mm_sub_pd(_mm_mul_pd(onefourth,r),
                                                  prod));
            t3            = _mm_mul_pd(half,_mm_mul_pd(rinv,logterm));
            t1            = _mm_add_pd(t1,_mm_add_pd(t2,t3));
            t4            = _mm_mul_pd(two,_mm_sub_pd(rai_inv,lij));
            t4            = _mm_and_pd(t4,obc_mask3);
            t1            = _mm_mul_pd(half,_mm_add_pd(t1,t4));
                        
            sum_ai        = _mm_add_pd(sum_ai, _mm_and_pd(t1,obc_mask1) );
            
            t1            = _mm_add_pd(_mm_mul_pd(half,lij2),
                                       _mm_mul_pd(prod,lij3));
            t1            = _mm_sub_pd(t1,
                                       _mm_mul_pd(onefourth,
                                                  _mm_add_pd(_mm_mul_pd(lij,rinv),
                                                             _mm_mul_pd(lij3,r))));
            t2            = _mm_mul_pd(onefourth,
                                       _mm_add_pd(_mm_mul_pd(uij,rinv),
                                                  _mm_mul_pd(uij3,r)));
            t2            = _mm_sub_pd(t2,
                                       _mm_add_pd(_mm_mul_pd(half,uij2),
                                                  _mm_mul_pd(prod,uij3)));
            t3            = _mm_mul_pd(_mm_mul_pd(onefourth,logterm),
                                       _mm_mul_pd(rinv,rinv));
            t3            = _mm_sub_pd(t3,
                                       _mm_mul_pd(_mm_mul_pd(diff2,oneeighth),
                                                  _mm_add_pd(one,
                                                             _mm_mul_pd(sk2_rinv,rinv))));
            t1            = _mm_mul_pd(rinv,
                                       _mm_add_pd(_mm_mul_pd(dlij,t1),
                                                  _mm_add_pd(t2,t3)));
            
            dadx1         = _mm_and_pd(t1,obc_mask1);
            
            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_pd(r,sk_ai);
            t2            = _mm_sub_pd(r,sk_ai);
            t3            = _mm_sub_pd(sk_ai,r);
            obc_mask1     = _mm_cmplt_pd(raj, t1);
            obc_mask2     = _mm_cmplt_pd(raj, t2);
            obc_mask3     = _mm_cmplt_pd(raj, t3);
            
            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
                                      _mm_andnot_pd(obc_mask2,raj_inv));
            dlij          = _mm_and_pd(one,obc_mask2);
            uij2          = _mm_mul_pd(uij, uij);
            uij3          = _mm_mul_pd(uij2,uij);
            lij2          = _mm_mul_pd(lij, lij);
            lij3          = _mm_mul_pd(lij2,lij);
                        
            diff2         = _mm_sub_pd(uij2,lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv      = _mm_mul_pd(sk2_ai,rinv);
            prod          = _mm_mul_pd(onefourth,sk2_rinv);
                        
            logterm       = gmx_mm_log_pd(_mm_mul_pd(uij,lij_inv));
            
            t1            = _mm_sub_pd(lij,uij);
            t2            = _mm_mul_pd(diff2,
                                       _mm_sub_pd(_mm_mul_pd(onefourth,r),
                                                  prod));
            t3            = _mm_mul_pd(half,_mm_mul_pd(rinv,logterm));
            t1            = _mm_add_pd(t1,_mm_add_pd(t2,t3));
            t4            = _mm_mul_pd(two,_mm_sub_pd(raj_inv,lij));
            t4            = _mm_and_pd(t4,obc_mask3);
            t1            = _mm_mul_pd(half,_mm_add_pd(t1,t4));
                        
            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA,work+jnrB,_mm_and_pd(t1,obc_mask1));
            
            t1            = _mm_add_pd(_mm_mul_pd(half,lij2),
                                       _mm_mul_pd(prod,lij3));
            t1            = _mm_sub_pd(t1,
                                       _mm_mul_pd(onefourth,
                                                  _mm_add_pd(_mm_mul_pd(lij,rinv),
                                                             _mm_mul_pd(lij3,r))));
            t2            = _mm_mul_pd(onefourth,
                                       _mm_add_pd(_mm_mul_pd(uij,rinv),
                                                  _mm_mul_pd(uij3,r)));
            t2            = _mm_sub_pd(t2,
                                       _mm_add_pd(_mm_mul_pd(half,uij2),
                                                  _mm_mul_pd(prod,uij3)));
            t3            = _mm_mul_pd(_mm_mul_pd(onefourth,logterm),
                                       _mm_mul_pd(rinv,rinv));
            t3            = _mm_sub_pd(t3,
                                       _mm_mul_pd(_mm_mul_pd(diff2,oneeighth),
                                                  _mm_add_pd(one,
                                                             _mm_mul_pd(sk2_rinv,rinv))));
            t1            = _mm_mul_pd(rinv,
                                       _mm_add_pd(_mm_mul_pd(dlij,t1),
                                                  _mm_add_pd(t2,t3)));
            
            dadx2         = _mm_and_pd(t1,obc_mask1);
            
            _mm_store_pd(dadx,dadx1);
            dadx += 2;
            _mm_store_pd(dadx,dadx2);
            dadx += 2;
        } /* end normal inner loop */
        
		if(k<nj1)
		{
			jnrA        = jjnr[k];   
			
            j3A         = 3*jnrA;  
            
            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A,jx,jy,jz);
            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA,raj);
            GMX_MM_LOAD_1VALUE_PD(obc_param+jnrA,sk_aj);
			
            dx    = _mm_sub_sd(ix, jx);
			dy    = _mm_sub_sd(iy, jy);
			dz    = _mm_sub_sd(iz, jz);
			
            rsq         = gmx_mm_calc_rsq_pd(dx,dy,dz);
            
            rinv        = gmx_mm_invsqrt_pd(rsq);
            r           = _mm_mul_sd(rsq,rinv);
            
			/* Compute raj_inv aj1-4 */
            raj_inv     = gmx_mm_inv_pd(raj);
            
            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_sd(r,sk_aj);
            t2            = _mm_sub_sd(r,sk_aj);
            t3            = _mm_sub_sd(sk_aj,r);
            obc_mask1     = _mm_cmplt_sd(rai, t1);
            obc_mask2     = _mm_cmplt_sd(rai, t2);
            obc_mask3     = _mm_cmplt_sd(rai, t3);
            
            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(_mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
                                      _mm_andnot_pd(obc_mask2,rai_inv));
            dlij          = _mm_and_pd(one,obc_mask2);
            uij2          = _mm_mul_sd(uij, uij);
            uij3          = _mm_mul_sd(uij2,uij);
            lij2          = _mm_mul_sd(lij, lij);
            lij3          = _mm_mul_sd(lij2,lij);
            
            diff2         = _mm_sub_sd(uij2,lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_aj        = _mm_mul_sd(sk_aj,sk_aj);
            sk2_rinv      = _mm_mul_sd(sk2_aj,rinv);
            prod          = _mm_mul_sd(onefourth,sk2_rinv);
            
            logterm       = gmx_mm_log_pd(_mm_mul_sd(uij,lij_inv));
            
            t1            = _mm_sub_sd(lij,uij);
            t2            = _mm_mul_sd(diff2,
                                       _mm_sub_sd(_mm_mul_pd(onefourth,r),
                                                  prod));
            t3            = _mm_mul_sd(half,_mm_mul_sd(rinv,logterm));
            t1            = _mm_add_sd(t1,_mm_add_sd(t2,t3));
            t4            = _mm_mul_sd(two,_mm_sub_sd(rai_inv,lij));
            t4            = _mm_and_pd(t4,obc_mask3);
            t1            = _mm_mul_sd(half,_mm_add_sd(t1,t4));
            
            sum_ai        = _mm_add_sd(sum_ai, _mm_and_pd(t1,obc_mask1) );
            
            t1            = _mm_add_sd(_mm_mul_sd(half,lij2),
                                       _mm_mul_sd(prod,lij3));
            t1            = _mm_sub_sd(t1,
                                       _mm_mul_sd(onefourth,
                                                  _mm_add_sd(_mm_mul_sd(lij,rinv),
                                                             _mm_mul_sd(lij3,r))));
            t2            = _mm_mul_sd(onefourth,
                                       _mm_add_sd(_mm_mul_sd(uij,rinv),
                                                  _mm_mul_sd(uij3,r)));
            t2            = _mm_sub_sd(t2,
                                       _mm_add_sd(_mm_mul_sd(half,uij2),
                                                  _mm_mul_sd(prod,uij3)));
            t3            = _mm_mul_sd(_mm_mul_sd(onefourth,logterm),
                                       _mm_mul_sd(rinv,rinv));
            t3            = _mm_sub_sd(t3,
                                       _mm_mul_sd(_mm_mul_sd(diff2,oneeighth),
                                                  _mm_add_sd(one,
                                                             _mm_mul_sd(sk2_rinv,rinv))));
            t1            = _mm_mul_sd(rinv,
                                       _mm_add_sd(_mm_mul_sd(dlij,t1),
                                                  _mm_add_pd(t2,t3)));
            
            dadx1         = _mm_and_pd(t1,obc_mask1);
            
            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_sd(r,sk_ai);
            t2            = _mm_sub_sd(r,sk_ai);
            t3            = _mm_sub_sd(sk_ai,r);
            obc_mask1     = _mm_cmplt_sd(raj, t1);
            obc_mask2     = _mm_cmplt_sd(raj, t2);
            obc_mask3     = _mm_cmplt_sd(raj, t3);
            
            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
                                      _mm_andnot_pd(obc_mask2,raj_inv));
            dlij          = _mm_and_pd(one,obc_mask2);
            uij2          = _mm_mul_sd(uij, uij);
            uij3          = _mm_mul_sd(uij2,uij);
            lij2          = _mm_mul_sd(lij, lij);
            lij3          = _mm_mul_sd(lij2,lij);
            
            diff2         = _mm_sub_sd(uij2,lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv      = _mm_mul_sd(sk2_ai,rinv);
            prod          = _mm_mul_sd(onefourth,sk2_rinv);
            
            logterm       = gmx_mm_log_pd(_mm_mul_sd(uij,lij_inv));
            
            t1            = _mm_sub_sd(lij,uij);
            t2            = _mm_mul_sd(diff2,
                                       _mm_sub_sd(_mm_mul_sd(onefourth,r),
                                                  prod));
            t3            = _mm_mul_sd(half,_mm_mul_sd(rinv,logterm));
            t1            = _mm_add_sd(t1,_mm_add_sd(t2,t3));
            t4            = _mm_mul_sd(two,_mm_sub_sd(raj_inv,lij));
            t4            = _mm_and_pd(t4,obc_mask3);
            t1            = _mm_mul_sd(half,_mm_add_sd(t1,t4));
            
            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA,_mm_and_pd(t1,obc_mask1));
            
            t1            = _mm_add_sd(_mm_mul_sd(half,lij2),
                                       _mm_mul_sd(prod,lij3));
            t1            = _mm_sub_sd(t1,
                                       _mm_mul_sd(onefourth,
                                                  _mm_add_sd(_mm_mul_sd(lij,rinv),
                                                             _mm_mul_sd(lij3,r))));
            t2            = _mm_mul_sd(onefourth,
                                       _mm_add_sd(_mm_mul_sd(uij,rinv),
                                                  _mm_mul_sd(uij3,r)));
            t2            = _mm_sub_sd(t2,
                                       _mm_add_sd(_mm_mul_sd(half,uij2),
                                                  _mm_mul_sd(prod,uij3)));
            t3            = _mm_mul_sd(_mm_mul_sd(onefourth,logterm),
                                       _mm_mul_sd(rinv,rinv));
            t3            = _mm_sub_sd(t3,
                                       _mm_mul_sd(_mm_mul_sd(diff2,oneeighth),
                                                  _mm_add_sd(one,
                                                             _mm_mul_sd(sk2_rinv,rinv))));
            t1            = _mm_mul_sd(rinv,
                                       _mm_add_sd(_mm_mul_sd(dlij,t1),
                                                  _mm_add_sd(t2,t3)));
            
            dadx2         = _mm_and_pd(t1,obc_mask1);
            
            _mm_store_pd(dadx,dadx1);
            dadx += 2;
            _mm_store_pd(dadx,dadx2);
            dadx += 2;
        } 
        gmx_mm_update_1pot_pd(sum_ai,work+ii);
        
	}
	
	/* Parallel summations */
	if(PARTDECOMP(cr))
	{
		gmx_sum(natoms, work, cr);
	}
	else if(DOMAINDECOMP(cr))
	{
		dd_atom_sum_real(cr->dd, work);
	}
	
    if(gb_algorithm==egbHCT)
    {
        /* HCT */
        for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
        {
			if(born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset; 
                sum     = 1.0/rr - work[i];
                min_rad = rr + doffset;
                rad     = 1.0/sum; 
                
                born->bRad[i]   = rad > min_rad ? rad : min_rad;
                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
            }
        }
        
        /* Extra communication required for DD */
        if(DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
        }
    }
    else
    {
        /* OBC */
        for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
        {
			if(born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]];
                rr_inv2 = 1.0/rr;
                rr      = rr-doffset; 
                rr_inv  = 1.0/rr;
                sum     = rr * work[i];
                sum2    = sum  * sum;
                sum3    = sum2 * sum;
                
                tsum    = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
                born->bRad[i] = rr_inv - tsum*rr_inv2;
                born->bRad[i] = 1.0 / born->bRad[i];
                
                fr->invsqrta[i]=gmx_invsqrt(born->bRad[i]);
                
                tchain  = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
                born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
            }
        }
        /* Extra (local) communication required for DD */
        if(DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
            dd_atom_spread_real(cr->dd, born->drobc);
        }
    }
    
	
	
	return 0;
}
Example #10
__m128d test_mm_cmplt_pd(__m128d __a, __m128d __b) {
  // CHECK-LABEL: @test_mm_cmplt_pd
  // CHECK: @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 1)
  return _mm_cmplt_pd(__a, __b);
}
Example #11
Packet2d plt(const Packet2d& a, const Packet2d& b) { return _mm_cmplt_pd(a,b); }