void GPUNB_regf( int ni, double h2d[], double dtr[], double xid[][3], double vid[][3], double acc[][3], double jrk[][3], double pot[], int lmax, int nbmax, int *listbase, int m_flag){ // std::cout << " Call GPUNB_regf " << ni << std::endl; time_grav -= get_wtime(); numInter += ni * nbody; ::icall++; ::ini +=ni; #pragma omp parallel for for(int i=0; i<ni; i+=4){ int tid = omp_get_thread_num(); nblist[tid][0].clear(); nblist[tid][1].clear(); nblist[tid][2].clear(); nblist[tid][3].clear(); int nii = std::min(4, ni-i); v4sf xi = {xid[i+0][0], xid[i+1][0], xid[i+2][0], xid[i+3][0]}; v4sf yi = {xid[i+0][1], xid[i+1][1], xid[i+2][1], xid[i+3][1]}; v4sf zi = {xid[i+0][2], xid[i+1][2], xid[i+2][2], xid[i+3][2]}; v4sf vxi = {vid[i+0][0], vid[i+1][0], vid[i+2][0], vid[i+3][0]}; v4sf vyi = {vid[i+0][1], vid[i+1][1], vid[i+2][1], vid[i+3][1]}; v4sf vzi = {vid[i+0][2], vid[i+1][2], vid[i+2][2], vid[i+3][2]}; v4sf h2i = {h2d[i+0], h2d[i+1], h2d[i+2], h2d[i+3]}; static const v4sf h2mask[5] = { {0.0, 0.0, 0.0, 0.0}, {1.0, 0.0, 0.0, 0.0}, {1.0, 1.0, 0.0, 0.0}, {1.0, 1.0, 1.0, 0.0}, {1.0, 1.0, 1.0, 1.0}, }; h2i *= h2mask[nii]; v4sf dtri = {dtr[i+0], dtr[i+1], dtr[i+2], dtr[i+3]}; v4sf Ax = {0.f, 0.f, 0.f, 0.f}; v4sf Ay = {0.f, 0.f, 0.f, 0.f}; v4sf Az = {0.f, 0.f, 0.f, 0.f}; v4sf Jx = {0.f, 0.f, 0.f, 0.f}; v4sf Jy = {0.f, 0.f, 0.f, 0.f}; v4sf Jz = {0.f, 0.f, 0.f, 0.f}; v4sf poti = {0.f, 0.f, 0.f, 0.f}; v4sf *jpp = (v4sf *)jp_host; for(int j=0; j<nbody; j++, jpp+=2){ v4sf jp0 = jpp[0]; v4sf jp1 = jpp[1]; v4sf xj = __builtin_ia32_shufps(jp0, jp0, 0x00); v4sf yj = __builtin_ia32_shufps(jp0, jp0, 0x55); v4sf zj = __builtin_ia32_shufps(jp0, jp0, 0xaa); v4sf mj = __builtin_ia32_shufps(jp0, jp0, 0xff); v4sf vxj = __builtin_ia32_shufps(jp1, jp1, 0x00); v4sf vyj = __builtin_ia32_shufps(jp1, jp1, 0x55); v4sf vzj = __builtin_ia32_shufps(jp1, jp1, 0xaa); v4sf dx = xj - xi; v4sf dy = yj - yi; v4sf dz = zj - zi; v4sf dvx = vxj - vxi; v4sf dvy = vyj - vyi; v4sf dvz = vzj - vzi; v4sf dxp = dx + dtri * dvx; v4sf dyp = dy + dtri * dvy; v4sf dzp = dz + dtri * dvz; v4sf r2 = dx*dx + dy*dy + dz*dz; v4sf rv = dx*dvx + dy*dvy + dz*dvz; v4sf r2p = dxp*dxp + dyp*dyp + dzp*dzp; v4sf mask; // v4sf mask = (v4sf)__builtin_ia32_cmpltps(r2, h2i); if(m_flag) { v4sf mh2i = mj * h2i; mask = (v4sf)__builtin_ia32_cmpltps( __builtin_ia32_minps(r2,r2p), mh2i); } else { mask = (v4sf)__builtin_ia32_cmpltps( __builtin_ia32_minps(r2,r2p), h2i); } int bits = __builtin_ia32_movmskps(mask); // mj = __builtin_ia32_andnps(mask, mj); if(bits){ if (bits&1) nblist[tid][0].push_back(j); if (bits&2) nblist[tid][1].push_back(j); if (bits&4) nblist[tid][2].push_back(j); if (bits&8) nblist[tid][3].push_back(j); } v4sf rinv1 = v4sf_rsqrt(r2); rinv1 = __builtin_ia32_andnps(mask, rinv1); // v4sf rinv1 = __builtin_ia32_rsqrtps(r2); v4sf rinv2 = rinv1 * rinv1; rinv1 *= mj; poti += rinv1; v4sf rinv3 = rinv1 * rinv2; rv *= (v4sf){-3.f, -3.f, -3.f, -3.f} * rinv2; Ax += rinv3 * dx; Ay += rinv3 * dy; Az += rinv3 * dz; Jx += rinv3 * (dvx + rv * dx); Jy += rinv3 * (dvy + rv * dy); Jz += rinv3 * (dvz + rv * dz); } // for(j) union { struct{ v4sf Ax, Ay, Az, Jx, Jy, Jz, Pot; }; struct{ float acc[3][4], jrk[3][4], pot[4]; }; } u; u.Ax = Ax; u.Ay = Ay; u.Az = Az; u.Jx = Jx; u.Jy = Jy; u.Jz = Jz; u.Pot = poti; for(int ii=0; ii<nii; ii++){ for(int k=0; k<3; k++){ acc[i+ii][k] = u.acc[k][ii]; jrk[i+ii][k] = u.jrk[k][ii]; } pot[i+ii] = u.pot[ii]; int nnb = nblist[tid][ii].size(); int *nnbp = listbase + lmax * (i+ii); int *nblistp = nnbp + 1; if(nnb > nbmax){ *nnbp = -nnb; }else{ *nnbp = nnb; for(int k=0; k<nnb; k++){ nblistp[k] = nblist[tid][ii][k]; } } } } // printf("gpu: %e %e %e %d\n", xid[0][0], acc[0][0], jrk[0][0], *listbase); #if 0 if(ni > 0){ FILE *fp = fopen("Force.sse", "w"); assert(fp); for(int i=0; i<ni; i++){ int nnb = listbase[i*lmax]; fprintf(fp, "%d %9.2e %9.2e %9.2e %9.2e %9.2e %9.2e %d\n", i, acc[i][0], acc[i][1], acc[i][2], jrk[i][0], jrk[i][1], jrk[i][2], nnb); } fprintf(fp, "\n"); fclose(fp); exit(1); } #endif time_grav += get_wtime(); }
void cmpltps(float * arg1,float * arg2,float * retval) { v4sf x = __builtin_ia32_loadups(arg1); v4sf y = __builtin_ia32_loadups(arg2); __builtin_ia32_storeups(retval,__builtin_ia32_cmpltps(x,y)); }