Predictor(const Particle &p, const v2df ti) { const v4sf dt = __builtin_ia32_cvtpd2ps(ti - v2df(p.time)); const v4sf s0 = __builtin_ia32_shufps(dt, dt, 0x00); const v4sf s1 = s0 + s0; const v4sf s2 = s0 * (v4sf)REP4(1.5f); this->posH = p.posH; this->posL = v4sf(p.posL) + s0*(v4sf(p.vel) + s0*(v4sf(p.acc2) + s0*(v4sf(p.jrk6)))); this->vel = v4sf(p.vel) + s1*(v4sf(p.acc2) + s2*(v4sf(p.jrk6))); }
Predictor( const Particle &p, const double ti) { const float dt = float(ti - p.time); const v4sf s0 = {dt, dt, dt, dt}; const v4sf s1 = s0 + s0; const v4sf s2 = s0 * (v4sf){1.5f, 1.5f, 1.5f, 1.5f}; posH = p.posH; posL = v4sf(p.posL) + s0*(v4sf(p.vel) + s0*(v4sf(p.acc2) + s0*(v4sf(p.jrk6)))); vel = v4sf(p.vel) + s1*(v4sf(p.acc2) + s2*(v4sf(p.jrk6))); }
void cnbint( int i, const double pos[][3], const double vel[][3], const double mass[], int nnb, int list[], double f[3], double fdot[3]){ const int NBMAX = 512; assert(nnb <= NBMAX); float xbuf[NBMAX] __attribute__ ((aligned(16))); float ybuf[NBMAX] __attribute__ ((aligned(16))); float zbuf[NBMAX] __attribute__ ((aligned(16))); float vxbuf[NBMAX] __attribute__ ((aligned(16))); float vybuf[NBMAX] __attribute__ ((aligned(16))); float vzbuf[NBMAX] __attribute__ ((aligned(16))); float mbuf[NBMAX] __attribute__ ((aligned(16))); assert((unsigned long)xbuf % 16 == 0); double xi = pos[i][0]; double yi = pos[i][1]; double zi = pos[i][2]; float vxi = vel[i][0]; float vyi = vel[i][1]; float vzi = vel[i][2]; for(int k=0; k<nnb; k++){ int j = list[k]; #if 1 int jj = list[k+4]; __builtin_prefetch(pos[jj]); __builtin_prefetch(pos[jj]+2); __builtin_prefetch(vel[jj]); __builtin_prefetch(vel[jj]+2); __builtin_prefetch(&mass[jj]); #endif #if 0 xbuf[k] = pos[j][0] - xi; ybuf[k] = pos[j][1] - yi; zbuf[k] = pos[j][2] - zi; vxbuf[k] = vel[j][0] - vxi; vybuf[k] = vel[j][1] - vyi; vzbuf[k] = vel[j][2] - vzi; mbuf[k] = mass[j]; #else double xj = pos[j][0]; double yj = pos[j][1]; double zj = pos[j][2]; float vxj = vel[j][0]; float vyj = vel[j][1]; float vzj = vel[j][2]; float mj = mass[j]; xj -= xi; yj -= yi; zj -= zi; vxj -= vxi; vyj -= vyi; vzj -= vzi; xbuf[k] = xj; ybuf[k] = yj; zbuf[k] = zj; vxbuf[k] = vxj; vybuf[k] = vyj; vzbuf[k] = vzj; mbuf[k] = mj; #endif } for(int k=nnb; k%4; k++){ xbuf[k] = 16.0f; ybuf[k] = 16.0f; zbuf[k] = 16.0f; vxbuf[k] = 0.0f; vybuf[k] = 0.0f; vzbuf[k] = 0.0f; mbuf[k] = 0.0f; } v4df ax(0.0), ay(0.0), az(0.0); v4sf jx(0.0), jy(0.0), jz(0.0); for(int k=0; k<nnb; k+=4){ v4sf dx(xbuf + k); v4sf dy(ybuf + k); v4sf dz(zbuf + k); v4sf dvx(vxbuf + k); v4sf dvy(vybuf + k); v4sf dvz(vzbuf + k); v4sf mj(mbuf + k); v4sf r2 = dx*dx + dy*dy + dz*dz; v4sf rv = dx*dvx + dy*dvy + dz*dvz; rv *= v4sf(-3.0f); // v4sf rinv1 = r2.rsqrt() & v4sf(mask); #if 1 v4sf rinv1 = r2.rsqrt(); #else v4sf rinv1 = v4sf(v4df(1.0) / v4df(r2).sqrt()); #endif v4sf rinv2 = rinv1 * rinv1; rv *= rinv2; v4sf rinv3 = mj * rinv1 * rinv2; dx *= rinv3; ax += v4df(dx); dy *= rinv3; ay += v4df(dy); dz *= rinv3; az += v4df(dz); dvx *= rinv3; jx += dvx; dvy *= rinv3; jy += dvy; dvz *= rinv3; jz += dvz; dx *= rv; jx += dx; dy *= rv; jy += dy; dz *= rv; jz += dz; } f[0] = ax.sum(); f[1] = ay.sum(); f[2] = az.sum(); fdot[0] = (v4df(jx)).sum(); fdot[1] = (v4df(jy)).sum(); fdot[2] = (v4df(jz)).sum(); assert(f[0] == f[0]); assert(f[1] == f[1]); assert(f[2] == f[2]); assert(fdot[0] == fdot[0]); assert(fdot[1] == fdot[1]); assert(fdot[2] == fdot[2]); }
void cnbint( int i, const double pos[][3], const double vel[][3], const double mass[], int nnb, int list[], double f[3], double fdot[3]){ const int NBMAX = 512; assert(nnb <= NBMAX); double xbuf[NBMAX] __attribute__ ((aligned(16))); double ybuf[NBMAX] __attribute__ ((aligned(16))); double zbuf[NBMAX] __attribute__ ((aligned(16))); float vxbuf[NBMAX] __attribute__ ((aligned(16))); float vybuf[NBMAX] __attribute__ ((aligned(16))); float vzbuf[NBMAX] __attribute__ ((aligned(16))); float mbuf[NBMAX] __attribute__ ((aligned(16))); for(int k=0; k<nnb; k++){ int j = list[k]; xbuf[k] = pos[j][0]; ybuf[k] = pos[j][1]; zbuf[k] = pos[j][2]; vxbuf[k] = vel[j][0]; vybuf[k] = vel[j][1]; vzbuf[k] = vel[j][2]; mbuf[k] = mass[j]; } for(int k=nnb; k%4; k++){ xbuf[k] = 16.0; ybuf[k] = 16.0; zbuf[k] = 16.0; vxbuf[k] = 0.0f; vybuf[k] = 0.0f; vzbuf[k] = 0.0f; mbuf[k] = 0.0f; } v4df xi(pos[i][0]); v4df yi(pos[i][1]); v4df zi(pos[i][2]); v4sf vxi(vel[i][0]); v4sf vyi(vel[i][1]); v4sf vzi(vel[i][2]); v4df ax(0.0), ay(0.0), az(0.0); v4sf jx(0.0), jy(0.0), jz(0.0); for(int k=0; k<nnb; k+=4){ v4df xj(xbuf + k); v4df yj(ybuf + k); v4df zj(zbuf + k); v4sf vxj(vxbuf + k); v4sf vyj(vybuf + k); v4sf vzj(vzbuf + k); v4sf mj(mbuf + k); v4sf dx = v4sf::_v4sf(xj - xi); v4sf dy = v4sf::_v4sf(yj - yi); v4sf dz = v4sf::_v4sf(zj - zi); v4sf dvx = vxj - vxi; v4sf dvy = vyj - vyi; v4sf dvz = vzj - vzi; v4sf r2 = dx*dx + dy*dy + dz*dz; v4sf rv = dx*dvx + dy*dvy + dz*dvz; rv *= v4sf(-3.0f); // v4sf rinv1 = r2.rsqrt() & v4sf(mask); v4sf rinv1 = r2.rsqrt(); v4sf rinv2 = rinv1 * rinv1; rv *= rinv2; v4sf rinv3 = mj * rinv1 * rinv2; dx *= rinv3; ax += v4df(dx); dy *= rinv3; ay += v4df(dy); dz *= rinv3; az += v4df(dz); dvx *= rinv3; jx += dvx; dvy *= rinv3; jy += dvy; dvz *= rinv3; jz += dvz; dx *= rv; jx += dx; dy *= rv; jy += dy; dz *= rv; jz += dz; } f[0] = ax.sum(); f[1] = ay.sum(); f[2] = az.sum(); fdot[0] = (v4df(jx)).sum(); fdot[1] = (v4df(jy)).sum(); fdot[2] = (v4df(jz)).sum(); assert(f[0] == f[0]); assert(f[1] == f[1]); assert(f[2] == f[2]); assert(fdot[0] == fdot[0]); assert(fdot[1] == fdot[1]); assert(fdot[2] == fdot[2]); }
static inline v4sf rsqrt_1st(const v4sf rhs) { v4sf x0 = v4sf(_mm_rsqrt_ps(rhs.val)); v4sf h = v4sf(1.) - rhs * x0 * x0; return x0 + v4sf(0.5) * h * x0; }
static inline v4sf rsqrt_1st_phantom(const v4sf rhs) { v4sf x0 = v4sf(_mm_rsqrt_ps(rhs.val)); return v4sf(x0 * (rhs * x0 * x0 - v4sf(3.))); }
static inline v4sf sqrt(const v4sf rhs) { return v4sf(_mm_sqrt_ps(rhs.val)); }
static inline v4sf rsqrt_0th(const v4sf rhs) { return v4sf(_mm_rsqrt_ps(rhs.val)); }
static inline v4sf rcp_1st(const v4sf rhs) { v4sf x0 = _mm_rcp_ps(rhs.val); v4sf h = v4sf(1.) - rhs * x0; return x0 + h * x0; }
v4sf operator / (const v4sf rhs) const { return v4sf(_mm_div_ps(val, rhs.val)); }
v4sf operator * (const v4sf rhs) const { return v4sf(val * rhs.val); }
v4sf operator - (const v4sf rhs) const { return v4sf(val - rhs.val); }
v4sf operator + (const v4sf rhs) const { return v4sf(val + rhs.val); }