Exemplo n.º 1
0
void GPUNB_regf(
		int ni,
		double h2d[],
		double dtr[],
		double xid[][3],
		double vid[][3],
		double acc[][3],
		double jrk[][3],
		double pot[],
		int lmax,
		int nbmax,
		int *listbase,
        int m_flag){
	// std::cout << " Call GPUNB_regf " << ni << std::endl;
	time_grav -= get_wtime();
	numInter += ni * nbody;
    ::icall++;
    ::ini +=ni;
#pragma omp parallel for
	for(int i=0; i<ni; i+=4){
		int tid = omp_get_thread_num();
		nblist[tid][0].clear();
		nblist[tid][1].clear();
		nblist[tid][2].clear();
		nblist[tid][3].clear();
		int nii = std::min(4, ni-i);

		v4sf xi  = {xid[i+0][0], xid[i+1][0], xid[i+2][0], xid[i+3][0]}; 
		v4sf yi  = {xid[i+0][1], xid[i+1][1], xid[i+2][1], xid[i+3][1]}; 
		v4sf zi  = {xid[i+0][2], xid[i+1][2], xid[i+2][2], xid[i+3][2]}; 
		v4sf vxi = {vid[i+0][0], vid[i+1][0], vid[i+2][0], vid[i+3][0]}; 
		v4sf vyi = {vid[i+0][1], vid[i+1][1], vid[i+2][1], vid[i+3][1]}; 
		v4sf vzi = {vid[i+0][2], vid[i+1][2], vid[i+2][2], vid[i+3][2]}; 
		v4sf h2i = {h2d[i+0], h2d[i+1], h2d[i+2], h2d[i+3]}; 
		static const v4sf h2mask[5] = {
			{0.0, 0.0, 0.0, 0.0},
			{1.0, 0.0, 0.0, 0.0},
			{1.0, 1.0, 0.0, 0.0},
			{1.0, 1.0, 1.0, 0.0},
			{1.0, 1.0, 1.0, 1.0},
		};
		h2i *= h2mask[nii];
		v4sf dtri = {dtr[i+0], dtr[i+1], dtr[i+2], dtr[i+3]}; 
		v4sf Ax = {0.f, 0.f, 0.f, 0.f};
		v4sf Ay = {0.f, 0.f, 0.f, 0.f};
		v4sf Az = {0.f, 0.f, 0.f, 0.f};
		v4sf Jx = {0.f, 0.f, 0.f, 0.f};
		v4sf Jy = {0.f, 0.f, 0.f, 0.f};
		v4sf Jz = {0.f, 0.f, 0.f, 0.f};
		v4sf poti = {0.f, 0.f, 0.f, 0.f};
		v4sf *jpp = (v4sf *)jp_host;
		for(int j=0; j<nbody; j++, jpp+=2){
			v4sf jp0 = jpp[0];
			v4sf jp1 = jpp[1];

			v4sf xj = __builtin_ia32_shufps(jp0, jp0, 0x00);
			v4sf yj = __builtin_ia32_shufps(jp0, jp0, 0x55);
			v4sf zj = __builtin_ia32_shufps(jp0, jp0, 0xaa);
			v4sf mj = __builtin_ia32_shufps(jp0, jp0, 0xff);
			v4sf vxj = __builtin_ia32_shufps(jp1, jp1, 0x00);
			v4sf vyj = __builtin_ia32_shufps(jp1, jp1, 0x55);
			v4sf vzj = __builtin_ia32_shufps(jp1, jp1, 0xaa);

			v4sf dx = xj - xi;
			v4sf dy = yj - yi;
			v4sf dz = zj - zi;
			v4sf dvx = vxj - vxi;
			v4sf dvy = vyj - vyi;
			v4sf dvz = vzj - vzi;

			v4sf dxp = dx + dtri * dvx;
			v4sf dyp = dy + dtri * dvy;
			v4sf dzp = dz + dtri * dvz;

			v4sf r2 = dx*dx + dy*dy + dz*dz;
			v4sf rv = dx*dvx + dy*dvy + dz*dvz;
			v4sf r2p = dxp*dxp + dyp*dyp + dzp*dzp;
            v4sf mask;
            //          v4sf mask = (v4sf)__builtin_ia32_cmpltps(r2, h2i);
            if(m_flag) {
              v4sf mh2i = mj * h2i;
              mask = (v4sf)__builtin_ia32_cmpltps(
                       __builtin_ia32_minps(r2,r2p), mh2i);
            }
            else {
              mask = (v4sf)__builtin_ia32_cmpltps(
                       __builtin_ia32_minps(r2,r2p), h2i);
            }
			int bits = __builtin_ia32_movmskps(mask);
			// mj = __builtin_ia32_andnps(mask, mj);
			if(bits){
				if (bits&1) nblist[tid][0].push_back(j);
				if (bits&2) nblist[tid][1].push_back(j);
				if (bits&4) nblist[tid][2].push_back(j);
				if (bits&8) nblist[tid][3].push_back(j);
			}

			v4sf rinv1 = v4sf_rsqrt(r2);
			rinv1 = __builtin_ia32_andnps(mask, rinv1);
			// v4sf rinv1 = __builtin_ia32_rsqrtps(r2);
			v4sf rinv2 = rinv1 * rinv1;
			rinv1 *= mj;
			poti += rinv1;
			v4sf rinv3 = rinv1 * rinv2;
			rv *= (v4sf){-3.f, -3.f, -3.f, -3.f} * rinv2;

			Ax += rinv3 * dx;
			Ay += rinv3 * dy;
			Az += rinv3 * dz;
			Jx += rinv3 * (dvx + rv * dx);
			Jy += rinv3 * (dvy + rv * dy);
			Jz += rinv3 * (dvz + rv * dz);
		} // for(j)
		union {
			struct{
				v4sf Ax, Ay, Az, Jx, Jy, Jz, Pot;
			};
			struct{
				float acc[3][4], jrk[3][4], pot[4];
			};
		} u;
		u.Ax = Ax;
		u.Ay = Ay;
		u.Az = Az;
		u.Jx = Jx;
		u.Jy = Jy;
		u.Jz = Jz;
		u.Pot = poti;
		for(int ii=0; ii<nii; ii++){
			for(int k=0; k<3; k++){
				acc[i+ii][k] = u.acc[k][ii];
				jrk[i+ii][k] = u.jrk[k][ii];
			}
			pot[i+ii] = u.pot[ii];
			int nnb = nblist[tid][ii].size();
			int *nnbp = listbase + lmax * (i+ii);
			int *nblistp = nnbp + 1;
			if(nnb > nbmax){
				*nnbp = -nnb;
			}else{
				*nnbp = nnb;
				for(int k=0; k<nnb; k++){
					nblistp[k] = nblist[tid][ii][k];
				}
			}
		}
	}
	// printf("gpu: %e %e %e %d\n", xid[0][0], acc[0][0], jrk[0][0], *listbase);
#if 0
	if(ni > 0){
		FILE *fp = fopen("Force.sse", "w");
		assert(fp);
		for(int i=0; i<ni; i++){
			int nnb =  listbase[i*lmax];
			fprintf(fp, "%d %9.2e %9.2e %9.2e %9.2e %9.2e %9.2e %d\n",
					i, acc[i][0], acc[i][1], acc[i][2], 
					   jrk[i][0], jrk[i][1], jrk[i][2], nnb);
		}
		fprintf(fp, "\n");
		fclose(fp);
		exit(1);
	}
#endif
	time_grav += get_wtime();
}
Exemplo n.º 2
0
void cmpltps(float * arg1,float * arg2,float * retval) {
    v4sf x = __builtin_ia32_loadups(arg1);
    v4sf y = __builtin_ia32_loadups(arg2);
    __builtin_ia32_storeups(retval,__builtin_ia32_cmpltps(x,y));
}