PotentialFieldSolver::PotentialFieldSolver() { m_M_eval=0; m_N_mass=0; m_cell_h=0; m_gridx=m_gridy=m_gridz=0; m_hashx=m_hashy=m_hashz=0; m_initialized=false; m_origin=make_double4(0,0,0,0); m_L=0; m_K=0; m_center=make_double3(0,0,0); m_total_mass=0; m_evalPos = new GpuArrayf4; m_evalNormal = new GpuArrayd3; m_evalPos_Reorder = new GpuArrayf4; m_p_massPos = new GpuArrayf4; m_p_massPos_Reorder = new GpuArrayf4; m_particle_mass = new GpuArrayd; m_particle_mass_Reorder = new GpuArrayd; m_grid_density = new GpuArrayd; m_grid_Rhs = new GpuArrayd; m_grid_phi = new GpuArrayd; m_particle_gradPhi = new GpuArrayd3; m_particle_dphidn = new GpuArrayd; m_particle_gradPhi_deorder = new GpuArrayd3; m_grid_gradPhi = new GpuArrayd3; m_far_gradPhi = new GpuArrayd3; m_SLP_area = new GpuArrayd; }
BiotSavartSolver::BiotSavartSolver() { m_N_vort = 0; m_M_eval = 0; m_gridx = m_gridy = m_gridz = 0; m_hashx = m_hashy = m_hashz = 0; m_cell_h = 0.0; m_origin = make_double4(0,0,0,0); m_center = make_double3(0,0,0); m_L = 0.0; m_initialized = false; m_K = 0; m_evalPos = new gf_GpuArray<float4>; m_evalPos_Reorder = new gf_GpuArray<float4>; m_p_vortPos = new gf_GpuArray<float4>; m_p_vortPos_Reorder = new GpuArrayf4; m_isVIC = false; for (int i=0;i<NUM_COMPONENTS; i++) { m_total_vort[i]=0; m_grid_Rhs[i] = new GpuArrayd; m_particle_vort[i] = new GpuArrayd; m_particle_vort_Reorder[i] = new GpuArrayd; m_grid_vort[i] = new GpuArrayd; m_grid_Psi[i] = new GpuArrayd; m_particle_U[i] = new GpuArrayd; m_particle_U_deorder[i] = new GpuArrayd; m_grid_U[i] = new GpuArrayd; m_far_U[i] = new GpuArrayd; } }
double4 ParticleListCPUSorted::subcycle_stats(PlasmaData* pdata) { double scale = pdata->npiccard_outer; double mean = 0; double mean2 = 0; double mins = num_subcycles[0]/scale; double maxs = num_subcycles[0]/scale; int imax = 0; int imin = 0; double nsubcycles_total = 0; for(int i=0;i<nptcls;i++) { if(mins > num_subcycles[i]/scale) { mins = num_subcycles[i]/scale; imin = i; } if(maxs < num_subcycles[i]/scale) { maxs = num_subcycles[i]/scale; imax = i; } nsubcycles_total += num_subcycles[i]; mean += num_subcycles[i]/((double)nptcls*scale); //mean2 += num_subcycles[i]*num_subcycles[i]/((double)nptcls*scale*scale*nptcls); mean2 = mean*mean; } double std_diff; std_diff = sqrt(fabs(mean*mean - mean2)/((double)nptcls*scale)); printf("Particle Subcycle Stats:\n"); printf("Avg Subcycles: %f +/- %f\n",mean,std_diff); printf("Min / Max: %f[%i] / %f[%i]\n",mins,imin,maxs,imax); printf("Total number of subcycles was %e\n",nsubcycles_total); return make_double4(mean,std_diff,mins,maxs); }
double4 ParticleListCPUSorted::piccard_stats(PlasmaData* pdata) { double scale = pdata->npiccard_outer; double mean = 0; double mean2 = 0; double mins = num_piccard[0]/num_subcycles[0]; double maxs = num_piccard[0]/num_subcycles[0]; int imax = 0; int imin = 0; for(int i=0;i<nptcls;i++) { if(mins > num_piccard[i]/num_subcycles[i]) { mins = num_piccard[i]/num_subcycles[i]; imin = i; } if(maxs < num_piccard[i]/num_subcycles[i]) { maxs = num_piccard[i]/num_subcycles[i]; imax = i; } mean += num_piccard[i]/((double)nptcls*num_subcycles[i]); mean2 += num_piccard[i]*num_piccard[i]/((double)nptcls*num_subcycles[i]*num_subcycles[i]); } double std_diff; std_diff = sqrt(fabs(mean*mean - mean2)); printf("Particle Piccard Stats(CPU):\n"); printf("Avg Piccard: %f +/- %f\n",mean,std_diff); printf("Min / Max: %f[%i] / %f[%i]\n",mins,imin,maxs,imax); return make_double4(mean,std_diff,mins,maxs); }
static inline __host__ __device__ double4 _pixMake(Ncv64f x, Ncv64f y, Ncv64f z, Ncv64f w) {return make_double4(x,y,z,w);}
template<> inline __host__ __device__ double4 _pixMakeZero<double4>() {return make_double4(0.,0.,0.,0.);}
void sapporo::startGravCalc(int nj, int ni, int id[], double xi[][3], double vi[][3], double a[][3], double j6old[][3], double phiold[3], double eps2, double h2[], double eps2_i[]) { #ifdef DEBUG_PRINT cerr << "calc_firsthalf ni: " << ni << "\tnj: " << nj << "integrationOrder: "<< integrationOrder << endl; #endif if(ni == 0 || nj == 0) return; //Prevent unused compiler warning j6old = j6old; phiold = phiold; //Its not allowed to send more particles than n_pipes assert(ni <= get_n_pipes()); EPS2 = eps2; //Copy i-particles to device structures, first only to device 0 //then from device 0 we use memcpy to get the data into the //other devs buffers int toDevice = 0; for (int i = 0; i < ni; i++) { deviceList[toDevice]->pos_i[i] = make_double4(xi[i][0], xi[i][1], xi[i][2], h2[i]); if(integrationOrder > GRAPE5) { deviceList[toDevice]->id_i[i] = id[i]; deviceList[toDevice]->vel_i[i] = make_double4(vi[i][0], vi[i][1], vi[i][2], eps2); if(eps2_i != NULL) //Seperate softening for i-particles deviceList[toDevice]->vel_i[i].w = eps2_i[i]; if(integrationOrder > FOURTH) { deviceList[toDevice]->acc_i[i] = make_double4(a[i][0], a[i][1], a[i][2], 0); } } #ifdef DEBUG_PRINT if(integrationOrder == GRAPE5) { fprintf(stderr, "Inpdevice= %d,\ti: %d\tindex: %d\teps2: %f\t%f\t%f\t%f \n", -1,i, 0, eps2, xi[i][0],xi[i][1],xi[i][2]); } else { fprintf(stderr, "Inpdevice= %d,\ti: %d\tindex: %d\teps2: %f\t%f\t%f\t%f\t%f\t%f\t%f", -1,i,id[i], eps2, xi[i][0],xi[i][1],xi[i][2],vi[i][0],vi[i][1] ,vi[i][2]); if(integrationOrder > FOURTH) fprintf(stderr, "\t%f %f %f\n", a[i][0], a[i][1], a[i][2]); else fprintf(stderr, "\n"); } #endif }//for i //Copy i particles from host buffer of device 0 to the devices host side buffers for(int i = toDevice+1; i < nCUDAdevices; i++) { memcpy(&deviceList[i]->pos_i[0], &deviceList[0]->pos_i[0], sizeof(double4) * ni); if(integrationOrder > GRAPE5) { memcpy(&deviceList[i]->vel_i[0], &deviceList[0]->vel_i[0], sizeof(double4) * ni); memcpy(&deviceList[i]->id_i[0], &deviceList[0]->id_i[0], sizeof(int) * ni); if(integrationOrder > FOURTH) memcpy(&deviceList[i]->acc_i[0], &deviceList[0]->acc_i[0], sizeof(double4) * ni); } } #pragma omp parallel num_threads(numberOfGPUUsedBySapporo) { if (nj_updated) { //Get the number of particles set for this device int devCount = jCopyInformation[omp_get_thread_num()].count; if(devCount > 0) { send_j_particles_to_device(devCount); } } //ni is the number of particles in the pipes send_i_particles_to_device(ni); //nj is the total number of particles to which the i particles have to //be calculated. For direct N-body this is usually equal to the total //number of nj particles that have been set by the calling code //Calculate the number of nj particles that are used per device int nj_per_dev = nj / nCUDAdevices; if(omp_get_thread_num() < (nj % nCUDAdevices)) nj_per_dev++; evaluate_gravity(ni, nj_per_dev); sapdevice->dev_ni = ni; }//end parallel section nj_modified = -1; predict = false; nj_updated = false; //Clear the address to dev/location mapping mappingFromIndexToDevIndex.clear(); } //end calc_first
int sapporo::set_j_particle(int address, int id, double tj, double dtj, double mass, double k18[3], double j6[3], double a2[3], double v[3], double x[3], double snp[3], double crk[3], double eps) { #ifdef DEBUG_PRINT cerr << "set_j_particle (Addr: " << address << " Id: " << id << " )\n"; #endif //Prevent unused compiler warning k18 = k18; predJOnHost = false; //Reset the buffers on the device since they can be modified nj_updated = true; //There are particles that are updated //Check if the address does not fall outside the allocated memory range //if it falls outside that range increase j-memory by 10% if (address >= nj_max) { fprintf(stderr, "Increasing nj_max! Nj_max was: %d to be stored address: %d \n", nj_max, address); increase_jMemory(); //Extra check, if we are still outside nj_max, we quit since particles are not //nicely send in order if (address >= nj_max) { fprintf(stderr, "Increasing nj_max was not enough! Send particles in order to the library! Exit\n"); exit(-1); } } //Memory has been allocated, now we can store the particles //First calculate on which device this particle has to be stored //and on which physical address on that device. Note that the particles //are distributed to the different devices in a round-robin way (based on the addres) int dev = address % nCUDAdevices; int devAddr = address / nCUDAdevices; int storeLoc = jCopyInformation[dev].count; //Store this information, incase particles get overwritten map<int, int4>::iterator iterator = mappingFromIndexToDevIndex.find(address); map<int, int4>::iterator end = mappingFromIndexToDevIndex.end(); if(iterator != end) { //Particle with this address has been set before, retrieve previous //calculated indices and overwrite them with the new info int4 addrInfo = (*iterator).second; dev = addrInfo.x; storeLoc = addrInfo.y; devAddr = addrInfo.z; } else { //New particle not set before, save address info and increase particles //on that specific device by one mappingFromIndexToDevIndex[address] = make_int4(dev, storeLoc, devAddr, -1); jCopyInformation[dev].count++; } deviceList[dev]->pos_j_temp[storeLoc] = make_double4(x[0], x[1], x[2], mass); deviceList[dev]->address_j[storeLoc] = devAddr; if(integrationOrder > GRAPE5) { deviceList[dev]->t_j_temp[storeLoc] = make_double2(tj, dtj); deviceList[dev]->vel_j_temp[storeLoc] = make_double4(v[0], v[1], v[2], eps); deviceList[dev]->acc_j_temp[storeLoc] = make_double4(a2[0], a2[1], a2[2], 0.0); deviceList[dev]->jrk_j_temp[storeLoc] = make_double4(j6[0], j6[1], j6[2], 0.0); deviceList[dev]->id_j_temp[storeLoc] = id; //For 6th and 8 order we need more parameters if(integrationOrder > FOURTH) { deviceList[dev]->snp_j_temp[storeLoc] = make_double4(snp[0], snp[1], snp[2], 0.0); deviceList[dev]->crk_j_temp[storeLoc] = make_double4(crk[0], crk[1], crk[2], 0.0); } } #ifdef CPU_SUPPORT //Put the new j particles directly in the correct location on the host side. deviceList[dev]->pos_j[devAddr] = make_double4(x[0], x[1], x[2], mass); if(integrationOrder > GRAPE5) { deviceList[dev]->t_j[devAddr] = make_double2(tj, dtj); deviceList[dev]->vel_j[devAddr] = make_double4(v[0], v[1], v[2], eps); deviceList[dev]->acc_j[devAddr] = make_double4(a2[0], a2[1], a2[2], 0.0); deviceList[dev]->jrk_j[devAddr] = make_double4(j6[0], j6[1], j6[2], 0.0); deviceList[dev]->id_j[devAddr] = id; //For 6th and 8 order we need more parameters if(integrationOrder > FOURTH) { deviceList[dev]->snp_j[devAddr] = make_double4(snp[0], snp[1], snp[2], 0.0); deviceList[dev]->crk_j[devAddr] = make_double4(crk[0], crk[1], crk[2], 0.0); } } #endif #ifdef DEBUG_PRINT if(integrationOrder == GRAPE5) { fprintf(stderr, "Setj ad: %d\tid: %d storeLoc: %d \tpos: %f %f %f m: %f \n", address, id, storeLoc, x[0],x[1],x[2], mass); } else { fprintf(stderr, "Setj ad: %d\tid: %d storeLoc: %d \tpos: %f %f %f\t mass: %f \tvel: %f %f %f", address, id, storeLoc, x[0],x[1],x[2],mass, v[0],v[1],v[2]); fprintf(stderr, "\tacc: %f %f %f \n", a2[0],a2[1],a2[2]); if(integrationOrder > FOURTH) { fprintf(stderr, "\tsnp: %f %f %f ", snp[0],snp[1],snp[2]); fprintf(stderr, "\tcrk: %f %f %f \n", crk[0],crk[1],crk[2]); } } #endif return 0; };
void sapporo::evaluate_gravity_host(int ni_total, int nj) { executedOnHost = true; #pragma omp for for(int i=0; i < ni_total; i++) { double4 pos_i = sapdevice->pos_i[i]; double4 vel_i = sapdevice->vel_i[i]; int id_i = sapdevice->id_i[i]; double4 acc_i = make_double4(0,0,0,0); double4 jrk_i = make_double4(0,0,0,0); double ds_min = 10e10; int nnb = -1; for(int j=0; j < nj; j++) { double4 pos_j = sapdevice->pPos_j[j]; double4 vel_j = sapdevice->pVel_j[j]; int id_j = sapdevice->id_j [j]; if(id_i == id_j) continue; //Skip self-gravity //Compute the force const double4 dr = make_double4(pos_j.x - pos_i.x, pos_j.y - pos_i.y, pos_j.z - pos_i.z, 0); const double ds2 = ((dr.x*dr.x + (dr.y*dr.y)) + dr.z*dr.z); if(ds2 < ds_min) //keep track of nearest neighbour { ds_min = ds2; nnb = id_j; } const double inv_ds = 1.0/sqrt(ds2+EPS2); const double mass = pos_j.w; const double minvr1 = mass*inv_ds; const double invr2 = inv_ds*inv_ds; const double minvr3 = minvr1*invr2; // Acceleration acc_i.x += minvr3 * dr.x; acc_i.y += minvr3 * dr.y; acc_i.z += minvr3 * dr.z; acc_i.w += (-1.0)*minvr1; //Jerk const double4 dv = make_double4(vel_j.x - vel_i.x, vel_j.y - vel_i.y, vel_j.z - vel_i.z, 0); const double drdv = (-3.0) * (minvr3*invr2) * (dr.x*dv.x + dr.y*dv.y + dr.z*dv.z); jrk_i.x += minvr3 * dv.x + drdv * dr.x; jrk_i.y += minvr3 * dv.y + drdv * dr.y; jrk_i.z += minvr3 * dv.z + drdv * dr.z; }//for j sapdevice->ds_i[i].x = nnb; sapdevice->ds_i[i].y = ds_min; sapdevice->iParticleResults[i ] = acc_i; sapdevice->iParticleResults[i+ni_total] = jrk_i; }//for i }
void sapporo::predictJParticles_host(int nj) { if(integrationOrder == GRAPE5) { //GRAPE5 has no prediction memcpy(&sapdevice->pPos_j[0], &sapdevice->pos_j[0], sizeof(double4) * nj); return; } double4 snp = make_double4(0,0,0,0); double4 crk = make_double4(0,0,0,0); for(int i=0; i < nj; i++) { double dt = t_i - sapdevice->t_j[i].x; double dt2 = (1./2.)*dt; double dt3 = (1./3.)*dt; double dt4 = (1./4.)*dt; double dt5 = (1./5.)*dt; double4 pos = sapdevice->pos_j[i]; double4 vel = sapdevice->vel_j[i]; double4 acc = sapdevice->acc_j[i]; double4 jrk = sapdevice->jrk_j[i]; if(integrationOrder > FOURTH) { snp = sapdevice->snp_j[i]; crk = sapdevice->crk_j[i]; } //Positions pos.x += dt * (vel.x + dt2 * (acc.x + dt3 * (jrk.x + dt4 * (snp.x + dt5 * (crk.x))))); pos.y += dt * (vel.y + dt2 * (acc.y + dt3 * (jrk.y + dt4 * (snp.y + dt5 * (crk.y))))); pos.z += dt * (vel.z + dt2 * (acc.z + dt3 * (jrk.z + dt4 * (snp.z + dt5 * (crk.z))))); sapdevice->pPos_j[i] = pos; //Velocities vel.x += dt * (acc.x + dt2 * (jrk.x + dt3 * (snp.x + dt4 * (crk.x)))); vel.y += dt * (acc.y + dt2 * (jrk.y + dt3 * (snp.y + dt4 * (crk.y)))); vel.z += dt * (acc.z + dt2 * (jrk.z + dt3 * (snp.z + dt4 * (crk.z)))); sapdevice->pVel_j[i] = vel; if(integrationOrder > FOURTH) { //Accelerations acc.x += dt * (jrk.x + dt2 * (snp.x + dt3 * (crk.x))); acc.y += dt * (jrk.y + dt2 * (snp.y + dt3 * (crk.y))); acc.z += dt * (jrk.z + dt2 * (snp.z + dt3 * (crk.z))); sapdevice->pAcc_j[i] = acc; } }//for i }
bool PotentialFieldSolver::m_ParticleToMesh() { m_SpatialHasher_mass.setSpatialHashGrid(m_gridx, m_L/(double)m_gridx, make_float3(m_origin.x,m_origin.y,m_origin.z), m_N_mass); m_SpatialHasher_mass.setHashParam(); m_SpatialHasher_mass.doSpatialHash(m_p_massPos->getDevicePtr(),m_N_mass); m_p_massPos_Reorder->memset(make_float4(0,0,0,0)); m_SpatialHasher_mass.reorderData(m_N_mass, (void*)(m_p_massPos->getDevicePtr()), (void*)(m_p_massPos_Reorder->getDevicePtr()), 4, 1); m_particle_mass_Reorder->memset(0); m_SpatialHasher_mass.reorderData(m_N_mass, (void*)(m_particle_mass->getDevicePtr()), (void*)(m_particle_mass_Reorder->getDevicePtr()), 1, 2); m_grid_density->memset(0); ParticleToMesh(m_SpatialHasher_mass.getStartTable(), m_SpatialHasher_mass.getEndTable(), m_p_massPos_Reorder->getDevicePtr(), m_particle_mass_Reorder->getDevicePtr(), m_SpatialHasher_mass.getCellSize().x, m_grid_density->getDevicePtr(), make_uint3(m_gridx,m_gridy,m_gridz), make_uint3(m_gridx,m_gridy,m_gridz), m_N_mass, m_origin); cudaMemcpy(m_grid_Rhs->getDevicePtr(), m_grid_density->getDevicePtr(), m_grid_Rhs->getSize()*m_grid_Rhs->typeSize(), cudaMemcpyDeviceToDevice); ComputeRHS(m_grid_Rhs->getDevicePtr(), m_SpatialHasher_mass.getCellSize().x*m_SpatialHasher_mass.getCellSize().x, -1.0, m_gridx*m_gridy*m_gridz); m_p_massPos_Reorder->copy(gf_GpuArray<float4>::DEVICE_TO_HOST); m_particle_mass_Reorder->copy(gf_GpuArray<double>::DEVICE_TO_HOST); double total_weight = 0; double total_mass = 0; for(int i=0; i<m_N_mass; i++) { double *host = m_particle_mass_Reorder->getHostPtr(); total_weight += fabs(host[i]); total_mass += host[i]; } double cx=0, cy=0, cz=0; for(int i=0; i<m_N_mass; i++) { float4 *hpos = m_p_massPos_Reorder->getHostPtr(); double *hmass = m_particle_mass_Reorder->getHostPtr(); cx+=hpos[i].x*fabs(hmass[i]); cy+=hpos[i].y*fabs(hmass[i]); cz+=hpos[i].z*fabs(hmass[i]); //printf("%f,%f,%f\n",cx,cy,cz); } cx=cx/total_weight; cy=cy/total_weight; cz=cz/total_weight; m_center.x = cx; m_center.y = cy; m_center.z = cz; m_total_mass = total_mass; applyDirichlet(m_grid_Rhs->getDevicePtr(), make_double4(cx,cy,cz,0), m_total_mass, make_double4(m_origin.x,m_origin.y,m_origin.z,0), m_SpatialHasher_mass.getCellSize().x, m_gridx, m_gridy, m_gridz); return true; }