// Sorts the dust particles of `tree` by key.
//
// Steps, all issued on execStream:
//   1. Run boundaryReduction over tree.dust_pos and finish the reduction of
//      the per-block partial results (devMemRMIN / devMemRMAX) on the host.
//   2. Derive a cubic domain (corner + cell size) from that bounding box and
//      build the keys with build_key_list.
//   3. Sort the keys with gpuSort into tree.dust_key.
//   4. Reorder dust_pos, dust_vel, dust_acc0 and dust_ids using dust_key and
//      copy the reordered data back into the tree.
//
// Returns immediately when the tree holds no dust. Sub-ranges of
// tree.generalBuffer1 are used as scratch space and are reused between steps.
void octree::sort_dust(tree_structure &tree)
{
  if (tree.n_dust == 0) return;

  devContext.startTiming(execStream->s());

  //Start reduction to get the boundaries of the dust
  boundaryReduction.set_arg<int>(0, &tree.n_dust);
  boundaryReduction.set_arg<cl_mem>(1, tree.dust_pos.p());
  boundaryReduction.set_arg<cl_mem>(2, devMemRMIN.p());
  boundaryReduction.set_arg<cl_mem>(3, devMemRMAX.p());

  //NTHREAD_BOUNDARY threads per block, NBLOCK_BOUNDARY blocks in total.
  //NOTE(review): the item count passed here is tree.n while the kernel itself
  //is given tree.n_dust -- confirm whether setWork uses the item count at all
  //when the number of blocks is specified explicitly.
  boundaryReduction.setWork(tree.n, NTHREAD_BOUNDARY, NBLOCK_BOUNDARY);
  boundaryReduction.execute(execStream->s());

  devMemRMIN.d2h();     //Need to be defined and initialized somewhere outside this function
  devMemRMAX.d2h();     //Need to be defined and initialized somewhere outside this function

  real4 r_min = make_real4(+1e10, +1e10, +1e10, +1e10);
  real4 r_max = make_real4(-1e10, -1e10, -1e10, -1e10);

  //Reduce the blocks, done on host since it is
  //A) faster and B) we need the results anyway.
  //The loop bound is the block count used for the launch above (previously a
  //hard-coded 120), so the two can never drift apart.
  for (int i = 0; i < NBLOCK_BOUNDARY; i++)
  {
    r_min.x = std::min(r_min.x, devMemRMIN[i].x);
    r_min.y = std::min(r_min.y, devMemRMIN[i].y);
    r_min.z = std::min(r_min.z, devMemRMIN[i].z);

    r_max.x = std::max(r_max.x, devMemRMAX[i].x);
    r_max.y = std::max(r_max.y, devMemRMAX[i].y);
    r_max.z = std::max(r_max.z, devMemRMAX[i].z);
  }

  LOG("Found dust boundarys, number of particles %d : \n", tree.n_dust);
  LOG("min: %f\t%f\t%f\tmax: %f\t%f\t%f \n", r_min.x,r_min.y,r_min.z,r_max.x,r_max.y,r_max.z);

  //Compute the boundaries of the dust, needed to get the PH key.
  //The domain is a cube: its edge is the largest extent, enlarged by 0.1%.
  real size = 1.001f*std::max(r_max.z - r_min.z,
                     std::max(r_max.y - r_min.y, r_max.x - r_min.x));

  //Lower corner of the cube, centred on the bounding box
  float4 corner = make_real4(0.5f*(r_min.x + r_max.x) - 0.5f*size,
                             0.5f*(r_min.y + r_max.y) - 0.5f*size,
                             0.5f*(r_min.z + r_max.z) - 0.5f*size, size);

  //Cell size at the deepest level; it replaces the edge length in corner.w
  float domain_fac = size/(1 << MAXLEVELS);
  corner.w = domain_fac;

  //Compute the keys
  my_dev::dev_mem<uint4> srcValues(devContext);

  //The generalBuffer1 has size uint*4*N*3
  //this buffer gets part: 0-uint*4*N
  srcValues.cmalloc_copy(tree.generalBuffer1, tree.n_dust, 0);

  //Compute the keys directly into srcValues
  build_key_list.set_arg<cl_mem>(0, srcValues.p());
  build_key_list.set_arg<cl_mem>(1, tree.dust_pos.p());
  build_key_list.set_arg<int>(2, &tree.n_dust);
  build_key_list.set_arg<real4>(3, &corner);
  build_key_list.setWork(tree.n_dust, 128); //128 threads per block
  build_key_list.execute(execStream->s());

  // If srcValues and buffer are different, then the original values
  // are preserved, if they are the same srcValues will be overwritten
  gpuSort(devContext, srcValues, tree.dust_key, srcValues, tree.n_dust, 32, 3, tree);

  //Sort the relevant properties. Note we can optimize this
  //further as done with the normal particles
  my_dev::dev_mem<real4> real4Buffer1(devContext);
  my_dev::dev_mem<real4> real4Buffer2(devContext);
  my_dev::dev_mem<real4> real4Buffer3(devContext);

  //These scratch buffers start again at offset 0 of generalBuffer1, i.e. they
  //reuse the range srcValues was mapped onto; the sort result is in tree.dust_key.
  int memOffset1 = real4Buffer1.cmalloc_copy(tree.generalBuffer1, tree.n_dust, 0);
      memOffset1 = real4Buffer2.cmalloc_copy(tree.generalBuffer1, tree.n_dust, memOffset1);
      memOffset1 = real4Buffer3.cmalloc_copy(tree.generalBuffer1, tree.n_dust, memOffset1);

  dataReorderCombined.set_arg<int>(0, &tree.n_dust);
  dataReorderCombined.set_arg<cl_mem>(1, tree.dust_key.p());
  dataReorderCombined.setWork(tree.n_dust, 512);

  //Position, velocity and acc0: (source, destination) pairs
  dataReorderCombined.set_arg<cl_mem>(2, tree.dust_pos.p());
  dataReorderCombined.set_arg<cl_mem>(3, real4Buffer1.p());
  dataReorderCombined.set_arg<cl_mem>(4, tree.dust_vel.p());
  dataReorderCombined.set_arg<cl_mem>(5, real4Buffer2.p());
  dataReorderCombined.set_arg<cl_mem>(6, tree.dust_acc0.p());
  dataReorderCombined.set_arg<cl_mem>(7, real4Buffer3.p());
  dataReorderCombined.execute(execStream->s());

  //Copy the reordered data back into the tree
  tree.dust_pos.copy(real4Buffer1,  tree.n_dust);
  tree.dust_vel.copy(real4Buffer2,  tree.n_dust);
  tree.dust_acc0.copy(real4Buffer3, tree.n_dust);

  //Second pass for the ids; the scratch space is carved up again from offset 0.
  //intBuffer is mapped but not used further in this function.
  my_dev::dev_mem<int>    intBuffer(devContext);
  my_dev::dev_mem<float2> float2Buffer(devContext);
  my_dev::dev_mem<int>    sortPermutation(devContext);

  memOffset1 = float2Buffer.cmalloc_copy   (tree.generalBuffer1, tree.n_dust, 0);
  memOffset1 = sortPermutation.cmalloc_copy(tree.generalBuffer1, tree.n_dust, memOffset1);
  memOffset1 = intBuffer.cmalloc_copy      (tree.generalBuffer1, tree.n_dust, memOffset1);

  dataReorderF2.set_arg<int>(0, &tree.n_dust);
  dataReorderF2.set_arg<cl_mem>(1, tree.dust_key.p());
  dataReorderF2.set_arg<cl_mem>(2, float2Buffer.p());     //Place holder, dust has no time
  dataReorderF2.set_arg<cl_mem>(3, float2Buffer.p());     //Reuse as destination1
  dataReorderF2.set_arg<cl_mem>(4, tree.dust_ids.p());
  dataReorderF2.set_arg<cl_mem>(5, sortPermutation.p());  //Reuse as destination2
  dataReorderF2.setWork(tree.n_dust, 512);
  dataReorderF2.execute(execStream->s());

  //sortPermutation now holds the reordered ids (it was the destination above)
  tree.dust_ids.copy(sortPermutation, sortPermutation.get_size());

  devContext.stopTiming("DustSortReorder", -1, execStream->s());
}
void octree::sort_bodies(tree_structure &tree, my_dev::dev_mem<float4> &bodies_pos, my_dev::dev_mem<uint> &sortPermutation, int n_bodies) { //We assume the bodies are already onthe GPU devContext.startTiming(); this->allocateParticleSpecificBuffers(n_bodies); //Call the GPUSort function, since we made it general //into a uint4 so we can extend the tree to 96bit key //we have to convert to 64bit key to a 96bit for sorting //and back from 96 to 64 my_dev::dev_mem<uint4> srcValues(devContext); my_dev::dev_mem<uint4> output(devContext); my_dev::dev_mem<uint4> bodies_key(devContext); //Allocate memory for the generalBuffer //The generalBuffer1 has size uint*4*N*3 //this buffer gets part: 0-uint*4*N srcValues.cmalloc_copy(tree.generalBuffer1.get_pinned(), tree.generalBuffer1.get_flags(), tree.generalBuffer1.get_devMem(), &tree.generalBuffer1[0], 0, n_bodies, getAllignmentOffset(0)); //this buffer gets part: uint*4*N-uint*4*N*2 output.cmalloc_copy(tree.generalBuffer1.get_pinned(), tree.generalBuffer1.get_flags(), tree.generalBuffer1.get_devMem(), &tree.generalBuffer1[4*n_bodies], 4*n_bodies, n_bodies, getAllignmentOffset(4*n_bodies)); int prevOffset = getAllignmentOffset(4*n_bodies); bodies_key.cmalloc_copy(tree.generalBuffer1.get_pinned(), tree.generalBuffer1.get_flags(), tree.generalBuffer1.get_devMem(), &tree.generalBuffer1[8*n_bodies], 8*n_bodies, n_bodies, prevOffset + getAllignmentOffset(8*n_bodies + prevOffset)); //This function computes the keys, seperate since we compute keys also before //buidling the tree-structure //Corner and size are not stored, since we can use sorting without building a tree float4 corner; float domain_fac; compute_keys(bodies_pos, bodies_key, n_bodies, corner, domain_fac); //Extract the keys convertKey64to96.set_arg<cl_mem>(0, bodies_key.p()); convertKey64to96.set_arg<cl_mem>(1, srcValues.p()); convertKey64to96.set_arg<int>(2, &n_bodies); convertKey64to96.setWork(n_bodies, 256); convertKey64to96.execute(); //Sort the keys // If srcValues 
(2nd argument) and buffer (4th argument) are different, then the original values // are preserved, if they are the same srcValues will be overwritten gpuSort(devContext, srcValues, output, srcValues, n_bodies, 32, 3, tree); //Extract the keys and get the permuation required to sort the other //properties of the particles //Extract the sorted keys extractKeyAndPerm.set_arg<cl_mem>(0, output.p()); extractKeyAndPerm.set_arg<cl_mem>(1, bodies_key.p()); extractKeyAndPerm.set_arg<cl_mem>(2, sortPermutation.p()); extractKeyAndPerm.set_arg<int>(3, &n_bodies); extractKeyAndPerm.setWork(n_bodies, 256); extractKeyAndPerm.execute(); devContext.stopTiming("Sorting", 0); }