void octgrav::allocate_cuda_memory() {
  allocateCUDAarray((void**)&dev.bodies_pos,  n_bodies * sizeof(float4));
  allocateCUDAarray((void**)&dev.bodies_grav, n_norm(n_bodies, 256) * sizeof(float4));

  allocateCUDAarray((void**)&dev.children, children_list.size() * sizeof(int4));

  allocateCUDAarray((void**)&dev.node_Qu,  node_list.size() * sizeof(float4));
  allocateCUDAarray((void**)&dev.node_Qd,  node_list.size() * sizeof(float4));
  allocateCUDAarray((void**)&dev.Oct1,     node_list.size() * sizeof(float4));
  allocateCUDAarray((void**)&dev.Oct2,     node_list.size() * sizeof(float4));
  allocateCUDAarray((void**)&dev.Oct3,     node_list.size() * sizeof(float2));
  allocateCUDAarray((void**)&dev.node_pos, node_list.size() * sizeof(float4));
  allocateCUDAarray((void**)&dev.node_com, node_list.size() * sizeof(float4));
  allocateCUDAarray((void**)&dev.n_in_node,          node_list.size() * sizeof(int));
  allocateCUDAarray((void**)&dev.node_bodies_offset, node_list.size() * sizeof(int));

//   allocateCUDAarray((void**)&dev.leaf_pos,  leaf_list.size() * sizeof(float4));
//   allocateCUDAarray((void**)&dev.leaf_com,  leaf_list.size() * sizeof(float4));
//   allocateCUDAarray((void**)&dev.n_in_leaf, leaf_list.size() * sizeof(int));
//   allocateCUDAarray((void**)&dev.leaf_bodies_offset, leaf_list.size() * sizeof(int));

  allocateCUDAarray((void**)&dev.cell_pos, cell_list.size() * sizeof(float4));
  allocateCUDAarray((void**)&dev.cell_com, cell_list.size() * sizeof(float4));
  allocateCUDAarray((void**)&dev.n_in_cell,          cell_list.size() * sizeof(int));
  allocateCUDAarray((void**)&dev.cell_bodies_offset, cell_list.size() * sizeof(int));
}
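Neither allocateCUDAarray nor n_norm is defined in this excerpt. Below is a minimal sketch of what they plausibly look like, assuming allocateCUDAarray is a checked cudaMalloc wrapper and n_norm(n, j) rounds n up to the next multiple of j (inferred from the padded bodies_grav allocation above); the bodies are illustrative assumptions, not the library's actual code.

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

//Sketch (assumption): allocate device memory, abort with a message on failure
void allocateCUDAarray(void** ptr, size_t n_bytes) {
  cudaError_t err = cudaMalloc(ptr, n_bytes);
  if (err != cudaSuccess) {
    fprintf(stderr, "cudaMalloc of %zu bytes failed: %s\n",
            n_bytes, cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }
}

//Sketch (assumption): round n up to the next multiple of j,
//e.g. n_norm(1000, 256) == 1024. Padding bodies_grav this way would let a
//kernel launch whole 256-thread blocks without a partial last block.
inline int n_norm(int n, int j) {
  return ((n + j - 1) / j) * j;
}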
double sapporo::evaluate_gravity(int ni_total, int nj)
{
  //This function is called inside an omp parallel section
  #ifdef DEBUG_PRINT
    cerr << "evaluate_gravity ni: " << ni_total << "\tnj: " << nj << endl;
  #endif

  if(ni_total == 0 || nj == 0) return 0.0;

  //Use this to indicate whether we did gravity on the host, to disable memory copies
  executedOnHost = false;

#ifdef CPU_SUPPORT
  //Compute the number of interactions to be done and compare it to the CPU threshold
  long long int nInter = ni_total * (long long int)nj;
  if(nInter < CPUThreshold)
  {
    fprintf(stderr, "CPU EXEC || ni: %d nj: %d CPUThreshold: %d nInter: %lld\n",
            ni_total, nj, CPUThreshold, nInter);
    predictJParticles_host(nj);                  //Predict the particles
    //evaluate_gravity_host(ni_total, nj);       //Non-vector version
    evaluate_gravity_host_vector(ni_total, nj);  //Vector version
    executedOnHost = true;
    return 0.0;
  }
#endif

  //ni is the number of i-particles that are set and for which we compute the force
  //nj is the current number of j-particles that are used as sources

  //If particles were updated, move them from the temp buffers to their final
  //locations in device memory
  copyJInDev(nj);

  //Execute prediction, if necessary
  predictJParticles(nj);

  //Reset the memory buffers so we can accumulate into them with atomicAdds
  int argIdx    = 0;
  int doNGB     = true;
  int doNGBList = true;
  sapdevice->resetDevBuffers.set_arg<int  >(argIdx++, &ni_total);
  sapdevice->resetDevBuffers.set_arg<int  >(argIdx++, &doNGB);
  sapdevice->resetDevBuffers.set_arg<int  >(argIdx++, &doNGBList);
  sapdevice->resetDevBuffers.set_arg<int  >(argIdx++, &integrationOrder);
  sapdevice->resetDevBuffers.set_arg<void*>(argIdx++, sapdevice->iParticleResults.ptr());
  sapdevice->resetDevBuffers.set_arg<void*>(argIdx++, sapdevice->ds_i.ptr());
  sapdevice->resetDevBuffers.set_arg<void*>(argIdx++, sapdevice->ngb_count_i.ptr());
  sapdevice->resetDevBuffers.setWork_2D(256, ni_total);
  sapdevice->resetDevBuffers.execute();

  int ni = 0;
  //Loop over the i-particles in jumps equal to the number of threads
  for(int ni_offset = 0; ni_offset < ni_total; ni_offset += NTHREADS)
  {
    //Determine the number of particles to be integrated in this iteration
    ni = min(ni_total - ni_offset, NTHREADS);

    //Set the properties for the gravity kernel: calculate the number of
    //blocks, groups, etc. For efficiency we always launch thread-blocks
    //whose size is a multiple of the warp/wavefront size.
    int multipleSize = 1;

    if(integrationOrder <= FOURTH)
    {
      //This is only possible for 4th order; the 6th order requires too many
      //resources to launch big thread-blocks. get_workGroupMultiple
      //retrieves the warp/wavefront size.
      if(ni > 128)
        multipleSize = sapdevice->evalgravKernelTemplate.get_workGroupMultiple();
      else if(ni > 96)
        multipleSize = sapdevice->evalgravKernelTemplate.get_workGroupMultiple() / 2;
    }

    //Round ni up to a multiple of the warp/wavefront size. We can safely pad
    //ni since all results of non-used (non-requested) particles are ignored.
#if 1 //Disable this for block timing. BLOCK_TIMING
    int temp = ni / multipleSize;
    if((ni % multipleSize) != 0) temp++;
    ni = temp * multipleSize;
#endif

    //Dimensions of one thread-block; this can be of the 2D form if there are
    //multiple y-dimensions (q) with an x-dimension of p.
    int p = ni;
    int q = min(NTHREADS2/ni, 32);
    //NOTE NTHREADS2 is used to make 2D blocks for devices with enough
    //resources to allow for 2x NTHREADS block-sizes.

    //The above is the default and always works; the code below applies some
    //extra device/algorithm specific tuning.
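    //Illustrative worked example (not in the original source), assuming a
    //warp size of 32 (get_workGroupMultiple() == 32) and NTHREADS2 == 512:
    //a request of ni = 100 falls in the "ni > 96" branch above, so
    //multipleSize = 32/2 = 16 and ni is padded up to 112. The default block
    //shape is then p = 112 threads in x and q = min(512/112, 32) = 4 in y,
    //i.e. four j-slices are processed concurrently per thread-block.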
    //Set the amount of shared memory and possibly improve the 2D block sizes
    //by using specific optimizations.
    //NOTE this is also device/resource dependent and can cause 'out of resource' crashes!!
    if(integrationOrder == FOURTH)
    {
      if(integrationPrecision == DOUBLESINGLE)
      {
        #ifdef ENABLE_THREAD_BLOCK_SIZE_OPTIMIZATION
          //This is the most optimal configuration for fourth order double-single
          if(ni <= 256 && ni >= 32)
            q = min(sapdevice->evalgravKernelTemplate.get_workGroupMaxSize()/ni, 32);
        #endif
      }
    }

    // q = 1; //Use this when testing optimal thread/block/multi size. Disables 2D thread-blocks. BLOCK_TIMING

    int sharedMemSizeEval = p*q*(sapdevice->sharedMemPerThread);

    //Compute the number of j-particles used per block (note there can be
    //multiple blocks per thread-block in the 2D case)
    int nj_scaled       = n_norm(nj, q*(sapdevice->get_NBLOCKS()));
    int thisBlockScaled = nj_scaled/((sapdevice->get_NBLOCKS())*q);

    #ifdef DEBUG_PRINT
      fprintf(stderr, "Offset: %d --> Total: %d Current step: %d \n",
              ni_offset, ni_total, ni);
      fprintf(stderr, "EvalGrav config: p: %d q: %d nj: %d nj_scaled: %d thisblockscaled: %d ni: %d EPS: %f \n",
              p, q, nj, nj_scaled, thisBlockScaled, ni, EPS2);
      fprintf(stderr, "Shared memory configuration, size eval: %d Per thread: %d (bytes)\n",
              sharedMemSizeEval, sapdevice->sharedMemPerThread);
    #endif

    argIdx = 0;
    sapdevice->evalgravKernelTemplate.set_arg<int   >(argIdx++, &nj); //Total number of j-particles
    sapdevice->evalgravKernelTemplate.set_arg<int   >(argIdx++, &thisBlockScaled);
    sapdevice->evalgravKernelTemplate.set_arg<int   >(argIdx++, &ni_offset);
    sapdevice->evalgravKernelTemplate.set_arg<int   >(argIdx++, &ni_total);
    sapdevice->evalgravKernelTemplate.set_arg<void* >(argIdx++, sapdevice->pPos_j.ptr());
    sapdevice->evalgravKernelTemplate.set_arg<void* >(argIdx++, sapdevice->pos_i.ptr());
    sapdevice->evalgravKernelTemplate.set_arg<void* >(argIdx++, sapdevice->iParticleResults.ptr());
    sapdevice->evalgravKernelTemplate.set_arg<double>(argIdx++, &EPS2);
    sapdevice->evalgravKernelTemplate.set_arg<void* >(argIdx++, sapdevice->pVel_j.ptr());
    sapdevice->evalgravKernelTemplate.set_arg<void* >(argIdx++, sapdevice->id_j.ptr());
    sapdevice->evalgravKernelTemplate.set_arg<void* >(argIdx++, sapdevice->vel_i.ptr());
    sapdevice->evalgravKernelTemplate.set_arg<void* >(argIdx++, sapdevice->id_i.ptr());
    sapdevice->evalgravKernelTemplate.set_arg<void* >(argIdx++, sapdevice->ds_i.ptr());
    sapdevice->evalgravKernelTemplate.set_arg<void* >(argIdx++, sapdevice->ngb_count_i.ptr());
    sapdevice->evalgravKernelTemplate.set_arg<void* >(argIdx++, sapdevice->ngb_list_i.ptr());
    sapdevice->evalgravKernelTemplate.set_arg<void* >(argIdx++, sapdevice->acc_i.ptr());
    sapdevice->evalgravKernelTemplate.set_arg<void* >(argIdx++, sapdevice->pAcc_j.ptr());
    sapdevice->evalgravKernelTemplate.set_arg<int   >(argIdx++, NULL, (sharedMemSizeEval)/sizeof(int)); //Shared memory
    sapdevice->evalgravKernelTemplate.setWork_threadblock2D(p, q, (sapdevice->get_NBLOCKS()), 1); //Default

    sapdevice->evalgravKernelTemplate.execute();
  } //Loop over ni

  return 0.0;
} //end evaluate_gravity
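The nj_scaled / thisBlockScaled arithmetic above determines how many j-particles each of the NBLOCKS*q concurrently running blocks sweeps while accumulating partial forces. Below is a self-contained sketch of just that partitioning step, with assumed values standing in for NBLOCKS (normally sapdevice->get_NBLOCKS()), q, and nj.

#include <cstdio>

//Same rounding contract as the n_norm used above: round n up to a multiple of j
inline int n_norm(int n, int j) { return ((n + j - 1) / j) * j; }

int main() {
  const int NBLOCKS = 30;     //assumed device block count (sapdevice->get_NBLOCKS())
  const int q       = 4;      //assumed y-dimension of the 2D thread-block
  const int nj      = 100000; //assumed number of j-particles (sources)

  //Pad nj so it divides evenly over the NBLOCKS * q independent blocks
  int nj_scaled       = n_norm(nj, q * NBLOCKS);
  int thisBlockScaled = nj_scaled / (NBLOCKS * q);

  //Here nj_scaled = 100080 and each block sweeps 834 j-particles; the
  //per-block partial results are combined afterwards, which is why the
  //result buffers were reset for atomicAdds before the kernel loop
  printf("nj_scaled = %d, j-particles per block = %d\n", nj_scaled, thisBlockScaled);
  return 0;
}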