/* ZDot: accumulate the product of a real double vector A with a complex double
 * vector B, optionally conjugating the result.
 * NOTE(review): `c2`, `Maybe<>` and `IsAligned()` come from an enclosing
 * template/header not visible here — presumably c2 selects conjugation and
 * reversed iteration direction of B; confirm against the declaration.
 * NOTE(review): this definition appears truncated in this view (trailing
 * closing braces are missing). */
static std::complex<double> ZDot(int n, const double* A, const std::complex<double>* B) { if (n) {
#ifdef __SSE2__
    std::complex<double> sum(0);
    // Peel scalar iterations until A reaches a 16-byte boundary.
    while (n && !IsAligned(A) ) { sum += *A * *B; ++A; Maybe<!c2>::increment(B); --n; }
    int n_2 = (n>>1);     // number of 2-wide SIMD iterations
    int nb = n-(n_2<<1);  // leftover scalar element (0 or 1)
    if (n_2) {
        // xsum accumulates A[even]*B[even]; xsum2 accumulates A[odd]*B[odd].
        union { __m128d xm; double xd[2]; } xsum;
        xsum.xm = _mm_set1_pd(0.);
        __m128d xsum2 = _mm_set1_pd(0.);
        const std::complex<double>* B1 = Maybe<!c2>::plus(B,1);
        assert(IsAligned(A));
        assert(IsAligned(B));
        do {
            // xA = [A0,A1]; xB1/xB2 = (re,im) pairs of two consecutive B elements.
            const __m128d& xA = *(const __m128d*)(A);
            const __m128d& xB1 = *(const __m128d*)(B);
            const __m128d& xB2 = *(const __m128d*)(B1);
            A += 2; Maybe<!c2>::increment(B,2); Maybe<!c2>::increment(B1,2);
            __m128d xA1 = _mm_shuffle_pd(xA,xA,_MM_SHUFFLE2(0,0)); // broadcast A0
            __m128d xA2 = _mm_shuffle_pd(xA,xA,_MM_SHUFFLE2(1,1)); // broadcast A1
            __m128d x1 = _mm_mul_pd(xA1,xB1); // A0 * (re0, im0)
            __m128d x2 = _mm_mul_pd(xA2,xB2); // A1 * (re1, im1)
            xsum.xm = _mm_add_pd(xsum.xm,x1);
            xsum2 = _mm_add_pd(xsum2,x2);
        } while (--n_2);
        xsum.xm = _mm_add_pd(xsum.xm,xsum2);
        sum += std::complex<double>(xsum.xd[0],xsum.xd[1]); // (re, im) lanes
    }
    // Scalar tail (the ++A / increment here are dead stores, kept for symmetry).
    if (nb) { sum += *A * *B; ++A; Maybe<!c2>::increment(B); }
    return Maybe<c2>::conj(sum);
#else
    // Plain scalar fallback when SSE2 is unavailable.
    std::complex<double> sum = 0.;
    do { sum += *A * *B; ++A; Maybe<!c2>::increment(B); } while (--n);
    return Maybe<c2>::conj(sum);
#endif
} else { return 0.;
/* zdotc_: BLAS-style conjugated complex-double dot product,
 *     rho = sum_{i=0}^{n-1} conj(x[i]) * z[i],
 * with strides inc_x / inc_z (in units of dcomplex), two double lanes at a
 * time via SSE (uses the SSE3 intrinsics _mm_loaddup_pd and _mm_addsub_pd).
 * NOTE(review): x is read with the aligned _mm_load_pd, so each x1 element
 * must sit on a 16-byte boundary — TODO confirm callers guarantee this. */
dcomplex zdotc_( int* n, dcomplex* x, int* inc_x, dcomplex* z, int* inc_z )
{
	dcomplex* restrict x1;
	dcomplex* restrict z1;
	int i;
	v2df_t rho1v;              /* accumulator: lanes (imag-ish, real-ish), see below */
	v2df_t z11v, z12v;         /* broadcast real / imag parts of z[i] */
	v2df_t x1v, x1rv;          /* x[i] as (re, im) and lane-swapped (im, re) */
	dcomplex rho;
	int n1 = *n;
	int incx = *inc_x;
	int incz = *inc_z;
	x1 = x;
	z1 = z;
	rho1v.v = _mm_setzero_pd();
	{
		v2df_t bcac, adbd;
		for ( i = 0; i < n1; ++i )
		{
			z11v.v = _mm_loaddup_pd( ( double* )&(z1->real) );
			z12v.v = _mm_loaddup_pd( ( double* )&(z1->imag) );
			x1v.v = _mm_load_pd( ( double* )x1 );
			/* swap lanes: (re, im) -> (im, re) */
			x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) );
			bcac.v = x1rv.v * z11v.v;  /* (x.im*z.re, x.re*z.re) */
			adbd.v = x1v.v * z12v.v;   /* (x.re*z.im, x.im*z.im) */
			/* addsub: lo = x.im*z.re - x.re*z.im, hi = x.re*z.re + x.im*z.im */
			rho1v.v = rho1v.v + _mm_addsub_pd( bcac.v, adbd.v );
			x1 += incx;
			z1 += incz;
		}
		/* swap to (Re, Im) order, then negate the imag lane so the result is
		 * conj(x).z rather than conj(z).x */
		rho1v.v = _mm_shuffle_pd( rho1v.v, rho1v.v, _MM_SHUFFLE2 (0,1) );
		rho1v.d[1] = -rho1v.d[1];
	}
	rho.real = rho1v.d[0];
	rho.imag = rho1v.d[1];
	return rho;
}
/* xvm_dot:
 *   Return the dot product of the two given vectors of length N.
 *   Both vectors must be non-NULL and 16-byte aligned (asserted) when the
 *   SSE2 path is compiled in. Two independent accumulators process four
 *   doubles per iteration; the <4 leftover elements are summed scalar-wise.
 */
double xvm_dot(const double x[], const double y[], uint64_t N) {
	double r = 0.0;
#if defined(__SSE2__) && !defined(XVM_ANSI)
	assert(x != NULL && ((uintptr_t)x % 16) == 0);
	assert(y != NULL && ((uintptr_t)y % 16) == 0);
	const uint64_t rem = N % 4;       /* elements left for the scalar tail */
	const uint64_t lim = N - rem;     /* vectorized portion */
	__m128d acc_lo = _mm_setzero_pd();
	__m128d acc_hi = _mm_setzero_pd();
	uint64_t i = 0;
	while (i < lim) {
		const __m128d xa = _mm_load_pd(x + i);
		const __m128d xb = _mm_load_pd(x + i + 2);
		const __m128d ya = _mm_load_pd(y + i);
		const __m128d yb = _mm_load_pd(y + i + 2);
		acc_lo = _mm_add_pd(acc_lo, _mm_mul_pd(xa, ya));
		acc_hi = _mm_add_pd(acc_hi, _mm_mul_pd(xb, yb));
		i += 4;
	}
	/* combine the two accumulators, then add the high lane into the low one */
	acc_lo = _mm_add_pd(acc_lo, acc_hi);
	acc_hi = _mm_shuffle_pd(acc_lo, acc_lo, _MM_SHUFFLE2(1, 1));
	acc_lo = _mm_add_pd(acc_lo, acc_hi);
	_mm_store_sd(&r, acc_lo);
	/* scalar tail */
	while (i < N) {
		r += x[i] * y[i];
		i++;
	}
#else
	for (uint64_t i = 0; i < N; i++)
		r += x[i] * y[i];
#endif
	return r;
}
/* use compiler intrinsics for 2x parallel processing
 *
 * Chi-squared distance between two double vectors of length n:
 *     sum_i (x[i]-y[i])^2 / (x[i]+y[i] + DBL_MIN)
 * DBL_MIN in the denominator guards against division by zero when
 * x[i]+y[i] == 0. Pairs of elements are processed with SSE2; an odd
 * trailing element is delegated to the scalar chi2_baseline_double.
 *
 * Fixes vs. previous revision:
 *  - removed the unused `zero` constant;
 *  - removed the trailing _mm_empty(): EMMS is only required after MMX
 *    instructions, and this function uses none, so it was dead overhead.
 */
static inline double chi2_intrinsic_double(int n, const double* x, const double* y) {
	double result = 0;
	const __m128d eps = _mm_set1_pd(DBL_MIN);
	__m128d chi2 = _mm_setzero_pd();
	for ( ; n > 1; n -= 2) {
		const __m128d a = _mm_loadu_pd(x);
		const __m128d b = _mm_loadu_pd(y);
		x += 2;
		y += 2;
		const __m128d a_plus_b          = _mm_add_pd(a, b);
		const __m128d a_plus_b_plus_eps = _mm_add_pd(a_plus_b, eps);
		const __m128d a_minus_b         = _mm_sub_pd(a, b);
		const __m128d a_minus_b_sq      = _mm_mul_pd(a_minus_b, a_minus_b);
		const __m128d quotient          = _mm_div_pd(a_minus_b_sq, a_plus_b_plus_eps);
		chi2 = _mm_add_pd(chi2, quotient);
	}
	/* horizontal add of the two lanes; with SSE3 we could use hadd_pd,
	 * but the difference is negligible */
	const __m128d shuffle = _mm_shuffle_pd(chi2, chi2, _MM_SHUFFLE2(0, 1));
	const __m128d sum = _mm_add_pd(chi2, shuffle);
	_mm_store_sd(&result, sum);
	if (n) /* odd length: one remaining entry handled by the scalar baseline */
		result += chi2_baseline_double(n, x, y);
	return result;
}
/* inner_product_gdouble_cubic_1_sse2:
 * Cubic-interpolated inner product:
 *     o[0] = sum_{k=0..3} icoeff[k] * ( sum_{i=0..len-1} a[i] * c_k[i] )
 * where c_k is the k-th coefficient row, rows spaced bstride BYTES apart
 * starting at b. Two doubles per step with SSE2.
 * NOTE(review): len is assumed even, and each b row is read with the aligned
 * _mm_load_pd (a is read unaligned) — confirm the caller guarantees 16-byte
 * aligned, even-length rows. */
static inline void
inner_product_gdouble_cubic_1_sse2 (gdouble * o, const gdouble * a,
    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
{
  gint i;
  __m128d f[2], sum[4], t;
  /* the four filter rows, bstride bytes apart */
  const gdouble *c[4] = { (gdouble *) ((gint8 *) b + 0 * bstride),
    (gdouble *) ((gint8 *) b + 1 * bstride),
    (gdouble *) ((gint8 *) b + 2 * bstride),
    (gdouble *) ((gint8 *) b + 3 * bstride)
  };

  f[0] = _mm_loadu_pd (icoeff + 0);   /* icoeff[0], icoeff[1] */
  f[1] = _mm_loadu_pd (icoeff + 2);   /* icoeff[2], icoeff[3] */
  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_pd ();

  /* accumulate a . c_k for all four rows in parallel */
  for (i = 0; i < len; i += 2) {
    t = _mm_loadu_pd (a + i + 0);
    sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i)));
    sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i)));
    sum[2] = _mm_add_pd (sum[2], _mm_mul_pd (t, _mm_load_pd (c[2] + i)));
    sum[3] = _mm_add_pd (sum[3], _mm_mul_pd (t, _mm_load_pd (c[3] + i)));
  }
  /* scale each partial sum by its broadcast interpolation coefficient */
  sum[0] = _mm_mul_pd (sum[0], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (0, 0)));
  sum[1] = _mm_mul_pd (sum[1], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (1, 1)));
  sum[2] = _mm_mul_pd (sum[2], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (0, 0)));
  sum[3] = _mm_mul_pd (sum[3], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (1, 1)));
  sum[0] = _mm_add_pd (sum[0], sum[1]);
  sum[2] = _mm_add_pd (sum[2], sum[3]);
  sum[0] = _mm_add_pd (sum[0], sum[2]);
  /* horizontal add: low lane += high lane, then store the scalar result */
  sum[0] = _mm_add_sd (sum[0], _mm_unpackhi_pd (sum[0], sum[0]));
  _mm_store_sd (o, sum[0]);
}
/* Small demo of _mm_shuffle_pd lane selection: with immediate
 * _MM_SHUFFLE2(0,1) the result takes lane 1 of the first operand and
 * lane 0 of the second, i.e. [1, 4] here. */
int main(){
    double res[2] __attribute__((aligned(16)));
    const __m128d lo  = _mm_set_pd(1, 2);  /* lanes: [2, 1] */
    const __m128d hi  = _mm_set_pd(3, 4);  /* lanes: [4, 3] */
    const __m128d mix = _mm_shuffle_pd(lo, hi, _MM_SHUFFLE2(0, 1));
    _mm_store_pd(res, mix);
    printf("%f %f\n", res[0] , res[1]);
    return 0;
}
/* Build a vector from one lane of each operand.
 * NOTE(review): lower_i0 / upper_i0 are non-type template parameters of an
 * enclosing template that is not visible here — presumably compile-time lane
 * selectors (0 or 1); confirm at the template declaration.
 * Result lanes: lane0 = lower[lower_i0], lane1 = upper[upper_i0]
 * (_MM_SHUFFLE2 takes the high selector first). */
BOOST_FORCEINLINE __m128d shuffle(__m128d const lower, __m128d const upper) { return _mm_shuffle_pd(lower, upper, _MM_SHUFFLE2(upper_i0, lower_i0)); }
static inline __m128d my_invrsq_pd(__m128d x) { const __m128d three = (const __m128d) {3.0f, 3.0f}; const __m128d half = (const __m128d) {0.5f, 0.5f}; __m128 t = _mm_rsqrt_ps(_mm_cvtpd_ps(x)); /* Convert to single precision and do _mm_rsqrt_ps() */ __m128d t1 = _mm_cvtps_pd(t); /* Convert back to double precision */ /* First Newton-Rapson step, accuracy is now 24 bits */ __m128d t2 = _mm_mul_pd(half,_mm_mul_pd(t1,_mm_sub_pd(three,_mm_mul_pd(x,_mm_mul_pd(t1,t1))))); /* Return second Newton-Rapson step, accuracy 48 bits */ return (__m128d) _mm_mul_pd(half,_mm_mul_pd(t2,_mm_sub_pd(three,_mm_mul_pd(x,_mm_mul_pd(t2,t2))))); } /* to extract single integers from a __m128i datatype */ #define _mm_extract_epi64(x, imm) \ _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm))) void nb_kernel400_x86_64_sse2(int * p_nri, int * iinr, int * jindex, int * jjnr, int * shift, double * shiftvec, double * fshift, int * gid, double * pos, double * faction, double * charge, double * p_facel, double * p_krf, double * p_crf, double * Vc, int * type, int * p_ntype, double * vdwparam, double * Vvdw, double * p_tabscale, double * VFtab, double * invsqrta, double * dvda, double * p_gbtabscale, double * GBtab, int * p_nthreads, int * count, void * mtx, int * outeriter, int * inneriter, double * work) { int nri,ntype,nthreads,offset; int n,ii,is3,ii3,k,nj0,nj1,jnr1,jnr2,j13,j23,ggid; double facel,krf,crf,tabscl,gbtabscl,vct,vgbt; double shX,shY,shZ,isai_d,dva; gmx_gbdata_t *gbdata; float * gpol; __m128d ix,iy,iz,jx,jy,jz; __m128d dx,dy,dz,t1,t2,t3; __m128d fix,fiy,fiz,rsq11,rinv,r,fscal,rt,eps,eps2; __m128d q,iq,qq,isai,isaj,isaprod,vcoul,gbscale,dvdai,dvdaj; __m128d Y,F,G,H,Fp,VV,FF,vgb,fijC,dvdatmp,dvdasum,vctot,vgbtot,n0d; __m128d xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8; __m128d fac,tabscale,gbtabscale; __m128i n0,nnn; const __m128d neg = {-1.0f,-1.0f}; const __m128d zero = {0.0f,0.0f}; const __m128d half = {0.5f,0.5f}; const __m128d two = {2.0f,2.0f}; const __m128d three = {3.0f,3.0f}; 
gbdata = (gmx_gbdata_t *)work; gpol = gbdata->gpol; nri = *p_nri; ntype = *p_ntype; nthreads = *p_nthreads; facel = (*p_facel) * (1.0 - (1.0/gbdata->gb_epsilon_solvent)); krf = *p_krf; crf = *p_crf; tabscl = *p_tabscale; gbtabscl = *p_gbtabscale; nj1 = 0; /* Splat variables */ fac = _mm_load1_pd(&facel); tabscale = _mm_load1_pd(&tabscl); gbtabscale = _mm_load1_pd(&gbtabscl); /* Keep compiler happy */ dvdatmp = _mm_setzero_pd(); vgb = _mm_setzero_pd(); dvdaj = _mm_setzero_pd(); isaj = _mm_setzero_pd(); vcoul = _mm_setzero_pd(); t1 = _mm_setzero_pd(); t2 = _mm_setzero_pd(); t3 = _mm_setzero_pd(); jnr1=jnr2=0; j13=j23=0; for(n=0;n<nri;n++) { is3 = 3*shift[n]; shX = shiftvec[is3]; shY = shiftvec[is3+1]; shZ = shiftvec[is3+2]; nj0 = jindex[n]; nj1 = jindex[n+1]; offset = (nj1-nj0)%2; ii = iinr[n]; ii3 = ii*3; ix = _mm_set1_pd(shX+pos[ii3+0]); iy = _mm_set1_pd(shX+pos[ii3+1]); iz = _mm_set1_pd(shX+pos[ii3+2]); q = _mm_set1_pd(charge[ii]); iq = _mm_mul_pd(fac,q); isai_d = invsqrta[ii]; isai = _mm_load1_pd(&isai_d); fix = _mm_setzero_pd(); fiy = _mm_setzero_pd(); fiz = _mm_setzero_pd(); dvdasum = _mm_setzero_pd(); vctot = _mm_setzero_pd(); vgbtot = _mm_setzero_pd(); for(k=nj0;k<nj1-offset; k+=2) { jnr1 = jjnr[k]; jnr2 = jjnr[k+1]; j13 = jnr1 * 3; j23 = jnr2 * 3; /* Load coordinates */ xmm1 = _mm_loadu_pd(pos+j13); /* x1 y1 */ xmm2 = _mm_loadu_pd(pos+j23); /* x2 y2 */ xmm5 = _mm_load_sd(pos+j13+2); /* z1 - */ xmm6 = _mm_load_sd(pos+j23+2); /* z2 - */ /* transpose */ jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); jy = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); jz = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* distances */ dx = _mm_sub_pd(ix,jx); dy = _mm_sub_pd(iy,jy); dz = _mm_sub_pd(iz,jz); rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) ); rinv = my_invrsq_pd(rsq11); /* Load invsqrta */ isaj = _mm_loadl_pd(isaj,invsqrta+jnr1); isaj = _mm_loadh_pd(isaj,invsqrta+jnr2); isaprod = _mm_mul_pd(isai,isaj); /* Load charges */ q 
= _mm_loadl_pd(q,charge+jnr1); q = _mm_loadh_pd(q,charge+jnr2); qq = _mm_mul_pd(iq,q); vcoul = _mm_mul_pd(qq,rinv); fscal = _mm_mul_pd(vcoul,rinv); qq = _mm_mul_pd(isaprod,qq); qq = _mm_mul_pd(qq,neg); gbscale = _mm_mul_pd(isaprod,gbtabscale); /* Load dvdaj */ dvdaj = _mm_loadl_pd(dvdaj, dvda+jnr1); dvdaj = _mm_loadh_pd(dvdaj, dvda+jnr2); r = _mm_mul_pd(rsq11,rinv); rt = _mm_mul_pd(r,gbscale); n0 = _mm_cvttpd_epi32(rt); n0d = _mm_cvtepi32_pd(n0); eps = _mm_sub_pd(rt,n0d); eps2 = _mm_mul_pd(eps,eps); nnn = _mm_slli_epi64(n0,2); xmm1 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))); /* Y1 F1 */ xmm2 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))); /* Y2 F2 */ xmm3 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */ xmm4 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */ Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */ F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */ G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */ H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */ G = _mm_mul_pd(G,eps); H = _mm_mul_pd(H,eps2); Fp = _mm_add_pd(F,G); Fp = _mm_add_pd(Fp,H); VV = _mm_mul_pd(Fp,eps); VV = _mm_add_pd(Y,VV); H = _mm_mul_pd(two,H); FF = _mm_add_pd(Fp,G); FF = _mm_add_pd(FF,H); vgb = _mm_mul_pd(qq,VV); fijC = _mm_mul_pd(qq,FF); fijC = _mm_mul_pd(fijC,gbscale); dvdatmp = _mm_mul_pd(fijC,r); dvdatmp = _mm_add_pd(vgb,dvdatmp); dvdatmp = _mm_mul_pd(dvdatmp,neg); dvdatmp = _mm_mul_pd(dvdatmp,half); dvdasum = _mm_add_pd(dvdasum,dvdatmp); xmm1 = _mm_mul_pd(dvdatmp,isaj); xmm1 = _mm_mul_pd(xmm1,isaj); dvdaj = _mm_add_pd(dvdaj,xmm1); /* store dvda */ _mm_storel_pd(dvda+jnr1,dvdaj); _mm_storeh_pd(dvda+jnr2,dvdaj); vctot = _mm_add_pd(vctot,vcoul); vgbtot = _mm_add_pd(vgbtot,vgb); fscal = _mm_sub_pd(fijC,fscal); fscal = _mm_mul_pd(fscal,neg); fscal = _mm_mul_pd(fscal,rinv); /* calculate partial force terms */ t1 = _mm_mul_pd(fscal,dx); t2 = _mm_mul_pd(fscal,dy); t3 = _mm_mul_pd(fscal,dz); /* update the i force */ fix = 
_mm_add_pd(fix,t1); fiy = _mm_add_pd(fiy,t2); fiz = _mm_add_pd(fiz,t3); /* accumulate forces from memory */ xmm1 = _mm_loadu_pd(faction+j13); /* fx1 fy1 */ xmm2 = _mm_loadu_pd(faction+j23); /* fx2 fy2 */ xmm5 = _mm_load1_pd(faction+j13+2); /* fz1 fz1 */ xmm6 = _mm_load1_pd(faction+j23+2); /* fz2 fz2 */ /* transpose */ xmm7 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fz1 fz2 */ xmm5 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* fx1 fx2 */ xmm6 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */ /* subtract partial forces */ xmm5 = _mm_sub_pd(xmm5,t1); xmm6 = _mm_sub_pd(xmm6,t2); xmm7 = _mm_sub_pd(xmm7,t3); xmm1 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fx1 fy1 */ xmm2 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */ /* store fx and fy */ _mm_storeu_pd(faction+j13,xmm1); _mm_storeu_pd(faction+j23,xmm2); /* .. then fz */ _mm_storel_pd(faction+j13+2,xmm7); _mm_storel_pd(faction+j23+2,xmm7); } /* In double precision, offset can only be either 0 or 1 */ if(offset!=0) { jnr1 = jjnr[k]; j13 = jnr1*3; jx = _mm_load_sd(pos+j13); jy = _mm_load_sd(pos+j13+1); jz = _mm_load_sd(pos+j13+2); isaj = _mm_load_sd(invsqrta+jnr1); isaprod = _mm_mul_sd(isai,isaj); dvdaj = _mm_load_sd(dvda+jnr1); q = _mm_load_sd(charge+jnr1); qq = _mm_mul_sd(iq,q); dx = _mm_sub_sd(ix,jx); dy = _mm_sub_sd(iy,jy); dz = _mm_sub_sd(iz,jz); rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) ); rinv = my_invrsq_pd(rsq11); vcoul = _mm_mul_sd(qq,rinv); fscal = _mm_mul_sd(vcoul,rinv); qq = _mm_mul_sd(isaprod,qq); qq = _mm_mul_sd(qq,neg); gbscale = _mm_mul_sd(isaprod,gbtabscale); r = _mm_mul_sd(rsq11,rinv); rt = _mm_mul_sd(r,gbscale); n0 = _mm_cvttpd_epi32(rt); n0d = _mm_cvtepi32_pd(n0); eps = _mm_sub_sd(rt,n0d); eps2 = _mm_mul_sd(eps,eps); nnn = _mm_slli_epi64(n0,2); xmm1 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))); xmm2 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))); xmm3 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))+2); xmm4 = 
_mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))+2); Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); G = _mm_mul_sd(G,eps); H = _mm_mul_sd(H,eps2); Fp = _mm_add_sd(F,G); Fp = _mm_add_sd(Fp,H); VV = _mm_mul_sd(Fp,eps); VV = _mm_add_sd(Y,VV); H = _mm_mul_sd(two,H); FF = _mm_add_sd(Fp,G); FF = _mm_add_sd(FF,H); vgb = _mm_mul_sd(qq,VV); fijC = _mm_mul_sd(qq,FF); fijC = _mm_mul_sd(fijC,gbscale); dvdatmp = _mm_mul_sd(fijC,r); dvdatmp = _mm_add_sd(vgb,dvdatmp); dvdatmp = _mm_mul_sd(dvdatmp,neg); dvdatmp = _mm_mul_sd(dvdatmp,half); dvdasum = _mm_add_sd(dvdasum,dvdatmp); xmm1 = _mm_mul_sd(dvdatmp,isaj); xmm1 = _mm_mul_sd(xmm1,isaj); dvdaj = _mm_add_sd(dvdaj,xmm1); /* store dvda */ _mm_storel_pd(dvda+jnr1,dvdaj); vctot = _mm_add_sd(vctot,vcoul); vgbtot = _mm_add_sd(vgbtot,vgb); fscal = _mm_sub_sd(fijC,fscal); fscal = _mm_mul_sd(fscal,neg); fscal = _mm_mul_sd(fscal,rinv); /* calculate partial force terms */ t1 = _mm_mul_sd(fscal,dx); t2 = _mm_mul_sd(fscal,dy); t3 = _mm_mul_sd(fscal,dz); /* update the i force */ fix = _mm_add_sd(fix,t1); fiy = _mm_add_sd(fiy,t2); fiz = _mm_add_sd(fiz,t3); /* accumulate forces from memory */ xmm5 = _mm_load_sd(faction+j13); /* fx */ xmm6 = _mm_load_sd(faction+j13+1); /* fy */ xmm7 = _mm_load_sd(faction+j13+2); /* fz */ /* subtract partial forces */ xmm5 = _mm_sub_sd(xmm5,t1); xmm6 = _mm_sub_sd(xmm6,t2); xmm7 = _mm_sub_sd(xmm7,t3); /* store forces */ _mm_store_sd(faction+j13,xmm5); _mm_store_sd(faction+j13+1,xmm6); _mm_store_sd(faction+j13+2,xmm7); } /* fix/fiy/fiz now contain four partial terms, that all should be * added to the i particle forces */ t1 = _mm_unpacklo_pd(t1,fix); t2 = _mm_unpacklo_pd(t2,fiy); t3 = _mm_unpacklo_pd(t3,fiz); fix = _mm_add_pd(fix,t1); fiy = _mm_add_pd(fiy,t2); fiz = _mm_add_pd(fiz,t3); fix = _mm_shuffle_pd(fix,fix,_MM_SHUFFLE2(1,1)); fiy = 
_mm_shuffle_pd(fiy,fiy,_MM_SHUFFLE2(1,1)); fiz = _mm_shuffle_pd(fiz,fiz,_MM_SHUFFLE2(1,1)); /* Load i forces from memory */ xmm1 = _mm_load_sd(faction+ii3); xmm2 = _mm_load_sd(faction+ii3+1); xmm3 = _mm_load_sd(faction+ii3+2); /* Add to i force */ fix = _mm_add_sd(fix,xmm1); fiy = _mm_add_sd(fiy,xmm2); fiz = _mm_add_sd(fiz,xmm3); /* store i forces to memory */ _mm_store_sd(faction+ii3,fix); _mm_store_sd(faction+ii3+1,fiy); _mm_store_sd(faction+ii3+2,fiz); /* now do dvda */ dvdatmp = _mm_unpacklo_pd(dvdatmp,dvdasum); dvdasum = _mm_add_pd(dvdasum,dvdatmp); _mm_storeh_pd(&dva,dvdasum); dvda[ii] = dvda[ii] + dva*isai_d*isai_d; ggid = gid[n]; /* Coulomb potential */ vcoul = _mm_unpacklo_pd(vcoul,vctot); vctot = _mm_add_pd(vctot,vcoul); _mm_storeh_pd(&vct,vctot); Vc[ggid] = Vc[ggid] + vct; /* GB potential */ vgb = _mm_unpacklo_pd(vgb,vgbtot); vgbtot = _mm_add_pd(vgbtot,vgb); _mm_storeh_pd(&vgbt,vgbtot); gpol[ggid] = gpol[ggid] + vgbt; } *outeriter = nri; *inneriter = nj1; }
/* ffts_transpose: transpose a w x h matrix of 64-bit elements from `in`
 * (row stride w) into `out` (row stride h).
 * NOTE(review): the SSE2 path tiles by TSIZE x TSIZE and the inner kernel
 * hard-codes 8 rows/8 columns, so it relies on TSIZE == 8 and on w, h being
 * multiples of TSIZE — confirm at the TSIZE definition and call sites.
 * NOTE(review): `#elif HAVE_SSE2` (value test, not defined()) is inconsistent
 * with `#ifdef HAVE_NEON` above — HAVE_SSE2 must be defined to a nonzero
 * value for this branch to compile in. */
void ffts_transpose(uint64_t *in, uint64_t *out, int w, int h)
{
#ifdef HAVE_NEON
#if 0
    neon_transpose4(in, out, w, h);
#else
    neon_transpose8(in, out, w, h);
#endif
#elif HAVE_SSE2
    uint64_t FFTS_ALIGN(64) tmp[TSIZE*TSIZE];
    int tx, ty;
    /* int x; */
    int y;
    int tw = w / TSIZE;
    int th = h / TSIZE;
    for (ty = 0; ty < th; ty++) {
        for (tx = 0; tx < tw; tx++) {
            uint64_t *ip0 = in + w*TSIZE*ty + tx * TSIZE;
            uint64_t *op0 = tmp; /* out + h*TSIZE*tx + ty*TSIZE; */

            /* copy/transpose to tmp: each pass reads an 8-row x 2-column
             * slice of the tile and writes it as a 2-row x 8-column slice
             * (uint64 pairs moved with double loads/shuffles/stores). */
            for (y = 0; y < TSIZE; y += 2) {
                /* for (x=0;x<TSIZE;x+=2) { op[x*TSIZE] = ip[x]; */
                __m128d q0 = _mm_load_pd((double*)(ip0 + 0*w));
                __m128d q1 = _mm_load_pd((double*)(ip0 + 1*w));
                __m128d q2 = _mm_load_pd((double*)(ip0 + 2*w));
                __m128d q3 = _mm_load_pd((double*)(ip0 + 3*w));
                __m128d q4 = _mm_load_pd((double*)(ip0 + 4*w));
                __m128d q5 = _mm_load_pd((double*)(ip0 + 5*w));
                __m128d q6 = _mm_load_pd((double*)(ip0 + 6*w));
                __m128d q7 = _mm_load_pd((double*)(ip0 + 7*w));

                /* 2x2 transposes: t(2k) = lane0s, t(2k+1) = lane1s */
                __m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0));
                __m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1));
                __m128d t2 = _mm_shuffle_pd(q2, q3, _MM_SHUFFLE2(0, 0));
                __m128d t3 = _mm_shuffle_pd(q2, q3, _MM_SHUFFLE2(1, 1));
                __m128d t4 = _mm_shuffle_pd(q4, q5, _MM_SHUFFLE2(0, 0));
                __m128d t5 = _mm_shuffle_pd(q4, q5, _MM_SHUFFLE2(1, 1));
                __m128d t6 = _mm_shuffle_pd(q6, q7, _MM_SHUFFLE2(0, 0));
                __m128d t7 = _mm_shuffle_pd(q6, q7, _MM_SHUFFLE2(1, 1));
                ip0 += 2; /* advance two columns */

                /* _mm_store_pd((double *)(op0 + y*h + x), t0);
                   _mm_store_pd((double *)(op0 + y*h + x + h), t1); */
                _mm_store_pd((double*)(op0 + 0 ), t0);
                _mm_store_pd((double*)(op0 + 0 + TSIZE), t1);
                _mm_store_pd((double*)(op0 + 2 ), t2);
                _mm_store_pd((double*)(op0 + 2 + TSIZE), t3);
                _mm_store_pd((double*)(op0 + 4 ), t4);
                _mm_store_pd((double*)(op0 + 4 + TSIZE), t5);
                _mm_store_pd((double*)(op0 + 6 ), t6);
                _mm_store_pd((double*)(op0 + 6 + TSIZE), t7);
                /* } */
                op0 += 2*TSIZE; /* advance two rows in the transposed tile */
            }

            /* copy the transposed tile from tmp to its destination */
            op0 = out + h*tx*TSIZE + ty*TSIZE;
            ip0 = tmp;
            for (y = 0; y < TSIZE; y += 1) {
                /* memcpy(op0, ip0, TSIZE * sizeof(*ip0)); */
                __m128d q0 = _mm_load_pd((double*)(ip0 + 0));
                __m128d q1 = _mm_load_pd((double*)(ip0 + 2));
                __m128d q2 = _mm_load_pd((double*)(ip0 + 4));
                __m128d q3 = _mm_load_pd((double*)(ip0 + 6));
                _mm_store_pd((double*)(op0 + 0), q0);
                _mm_store_pd((double*)(op0 + 2), q1);
                _mm_store_pd((double*)(op0 + 4), q2);
                _mm_store_pd((double*)(op0 + 6), q3);
                op0 += h;
                ip0 += TSIZE;
            }
        }
    }
/*
    size_t i,j;
    for(i=0;i<w;i+=2) {
        for(j=0;j<h;j+=2) {
//          out[i*h + j] = in[j*w + i];
            __m128d q0 = _mm_load_pd((double *)(in + j*w + i));
            __m128d q1 = _mm_load_pd((double *)(in + j*w + i + w));
            __m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0));
            __m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1));
            _mm_store_pd((double *)(out + i*h + j), t0);
            _mm_store_pd((double *)(out + i*h + j + h), t1);
        }
    }
*/
#else
    /* scalar fallback: 8x1 blocks, remainders handled below */
    const int bw = 1;
    const int bh = 8;
    int i = 0, j = 0;
    for (; i <= h - bh; i += bh) {
        for (j = 0; j <= w - bw; j += bw) {
            uint64_t const *ib = &in[w*i + j];
            uint64_t *ob = &out[h*j + i];
            uint64_t s_0_0 = ib[0*w + 0];
            uint64_t s_1_0 = ib[1*w + 0];
            uint64_t s_2_0 = ib[2*w + 0];
            uint64_t s_3_0 = ib[3*w + 0];
            uint64_t s_4_0 = ib[4*w + 0];
            uint64_t s_5_0 = ib[5*w + 0];
            uint64_t s_6_0 = ib[6*w + 0];
            uint64_t s_7_0 = ib[7*w + 0];
            ob[0*h + 0] = s_0_0;
            ob[0*h + 1] = s_1_0;
            ob[0*h + 2] = s_2_0;
            ob[0*h + 3] = s_3_0;
            ob[0*h + 4] = s_4_0;
            ob[0*h + 5] = s_5_0;
            ob[0*h + 6] = s_6_0;
            ob[0*h + 7] = s_7_0;
        }
    }
    /* remaining rows */
    if (i < h) {
        int i1;
        for (i1 = 0; i1 < w; i1++) {
            for (j = i; j < h; j++) {
                out[i1*h + j] = in[j*w + i1];
            }
        }
    }
    /* remaining columns */
    if (j < w) {
        int j1;
        for (i = j; i < w; i++) {
            for (j1 = 0; j1 < h; j1++) {
                out[i*h + j1] = in[j1*w + i];
            }
        }
    }
#endif
}
/* Permute the two double lanes of an SSE2-backed Simd value.
 * NOTE(review): i0 / i1 are compile-time selectors from an enclosing template
 * not visible here (presumably each 0 or 1) — confirm at the declaration.
 * With _MM_SHUFFLE2(i0, i1) the result is lane0 = arg[i1], lane1 = arg[i0]
 * (the high selector comes first); verify call sites expect that mapping. */
static inline Simd shuffle(const Simd& arg) { Simd ret; ret.reg = _mm_shuffle_pd(arg.reg, arg.reg, _MM_SHUFFLE2(i0, i1)); return ret; }