void octree::compute_keys(my_dev::dev_mem<float4> &bodies_pos,
                          my_dev::dev_mem<uint4>  &bodies_key,
                          int n_bodies, float4 &corner, float &domain_fac)
{
  this->getCorner(bodies_pos, n_bodies, corner, domain_fac);

  //Compute the keys
  build_key_list.set_arg<cl_mem>(0, bodies_key.p());
  build_key_list.set_arg<cl_mem>(1, bodies_pos.p());
  build_key_list.set_arg<int>(2,    &n_bodies);
  build_key_list.set_arg<real4>(3,  &corner);

  build_key_list.setWork(n_bodies, 128); //128 threads per block
  build_key_list.execute();
}
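//----------------------------------------------------------------------------
//Illustration only: a minimal host-side sketch of mapping a position to a
//space-filling-curve key, assuming a plain Morton (bit-interleaved) layout
//with 20 bits per axis. The actual key layout is defined by the
//build_key_list device kernel (not shown here) and may differ; 'corner' and
//'domain_fac' are used as produced by getCorner above, i.e.
//cell = (pos - corner) * domain_fac. morton_key_host is a hypothetical
//helper, useful only for cross-checking device results.
//----------------------------------------------------------------------------
#include <cstdint>

static uint64_t morton_key_host(const float4 &pos, const float4 &corner,
                                float domain_fac)
{
  //Map the position to integer cell coordinates (assumes pos lies inside the domain)
  uint64_t ix = (uint64_t)((pos.x - corner.x) * domain_fac);
  uint64_t iy = (uint64_t)((pos.y - corner.y) * domain_fac);
  uint64_t iz = (uint64_t)((pos.z - corner.z) * domain_fac);

  uint64_t key = 0;
  for (int bit = 0; bit < 20; bit++) //20 bits per axis -> 60-bit key
  {
    key |= ((ix >> bit) & 1ULL) << (3*bit + 2);
    key |= ((iy >> bit) & 1ULL) << (3*bit + 1);
    key |= ((iz >> bit) & 1ULL) << (3*bit + 0);
  }
  return key;
}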
void octree::getBoundaries(my_dev::dev_mem<float4> &bodies_pos, int n_bodies,
                           real4 &r_min, real4 &r_max)
{
  //Start the reduction to get the boundaries of the system
  boundaryReduction.set_arg<int>(0,    &n_bodies);
  boundaryReduction.set_arg<cl_mem>(1, bodies_pos.p());
  boundaryReduction.set_arg<cl_mem>(2, devMemRMIN.p());
  boundaryReduction.set_arg<cl_mem>(3, devMemRMAX.p());

  boundaryReduction.setWork(n_bodies, NTHREAD_BOUNDARY, NBLOCK_BOUNDARY); //256 threads and 120 blocks in total
  boundaryReduction.execute();

  devMemRMIN.d2h(); //Needs to be defined and initialized somewhere outside this function
  devMemRMAX.d2h(); //Needs to be defined and initialized somewhere outside this function

  r_min = (real4){+1e10, +1e10, +1e10, +1e10};
  r_max = (real4){-1e10, -1e10, -1e10, -1e10};

  //Reduce the per-block results on the host, since (a) it is faster
  //and (b) we need the results on the host anyway
  for (int i = 0; i < NBLOCK_BOUNDARY; i++) //120 blocks
  {
    r_min.x = fmin(r_min.x, devMemRMIN[i].x);
    r_min.y = fmin(r_min.y, devMemRMIN[i].y);
    r_min.z = fmin(r_min.z, devMemRMIN[i].z);

    r_max.x = fmax(r_max.x, devMemRMAX[i].x);
    r_max.y = fmax(r_max.y, devMemRMAX[i].y);
    r_max.z = fmax(r_max.z, devMemRMAX[i].z);
    // printf("%f\t%f\t%f\t || \t%f\t%f\t%f\n", rMIN[i].x,rMIN[i].y,rMIN[i].z,rMAX[i].x,rMAX[i].y,rMAX[i].z);
  }

  printf("Found boundaries, number of particles %d : \n", n_bodies);
  printf("min: %f\t%f\t%f\tmax: %f\t%f\t%f \n",
         r_min.x, r_min.y, r_min.z, r_max.x, r_max.y, r_max.z);
}
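//----------------------------------------------------------------------------
//A minimal host-side reference for the reduction above, assuming the
//positions are available in a std::vector on the host; handy for validating
//the per-block partial results that boundaryReduction writes into
//devMemRMIN/devMemRMAX.
//----------------------------------------------------------------------------
#include <vector>

static void boundaries_host(const std::vector<float4> &pos,
                            real4 &r_min, real4 &r_max)
{
  r_min = (real4){+1e10, +1e10, +1e10, +1e10};
  r_max = (real4){-1e10, -1e10, -1e10, -1e10};

  for (size_t i = 0; i < pos.size(); i++)
  {
    r_min.x = fmin(r_min.x, pos[i].x);  r_max.x = fmax(r_max.x, pos[i].x);
    r_min.y = fmin(r_min.y, pos[i].y);  r_max.y = fmax(r_max.y, pos[i].y);
    r_min.z = fmin(r_min.z, pos[i].z);  r_max.z = fmax(r_max.z, pos[i].z);
  }
}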
void octree::gpuSort_32b(my_dev::context &devContext,
                         my_dev::dev_mem<uint> &srcKeys,      my_dev::dev_mem<uint> &srcValues,
                         my_dev::dev_mem<int>  &keysOutput,   my_dev::dev_mem<uint> &keysAPing,
                         my_dev::dev_mem<uint> &valuesOutput, my_dev::dev_mem<uint> &valuesAPing,
                         int N, int numberOfBits)
{
  int bitIdx = 0;

  //Step 1, do the count
  //Memory that should be allocated outside the function:
  setupParams sParam;
  sParam.jobs                = (N / 64) / 480; //64=32*2, 2 items per loop; 480 = 120*4, the number of processors
  sParam.blocksWithExtraJobs = (N / 64) % 480;
  sParam.extraElements       = N % 64;
  sParam.extraOffset         = N - sParam.extraElements;

  sortCount.set_arg<cl_mem>(0, srcKeys.p());
  sortCount.set_arg<cl_mem>(1, this->devMemCounts.p());
  sortCount.set_arg<uint>(2,   &N);
  sortCount.set_arg<int>(3, NULL, 128); //smem size
  sortCount.set_arg<setupParams>(4, &sParam);
  sortCount.set_arg<int>(5, &bitIdx);

  vector<size_t> localWork(2), globalWork(2);
  globalWork[0] = 32*120; globalWork[1] = 4;
  localWork [0] = 32;     localWork [1] = 4;
  sortCount.setWork(globalWork, localWork);

  ///////////////

  exScanBlock.set_arg<cl_mem>(0, this->devMemCounts.p());
  int blocks = 120*4;
  exScanBlock.set_arg<int>(1, &blocks);
  exScanBlock.set_arg<cl_mem>(2, this->devMemCountsx.p());
  exScanBlock.set_arg<int>(3, NULL, 512); //shared memory allocation

  globalWork[0] = 512; globalWork[1] = 1;
  localWork [0] = 512; localWork [1] = 1;
  exScanBlock.setWork(globalWork, localWork);

  //////////////

  sortMove.set_arg<cl_mem>(0, srcKeys.p());
  sortMove.set_arg<cl_mem>(1, keysOutput.p());
  sortMove.set_arg<cl_mem>(2, srcValues.p());
  sortMove.set_arg<cl_mem>(3, valuesOutput.p());
  sortMove.set_arg<cl_mem>(4, this->devMemCounts.p());
  sortMove.set_arg<uint>(5,   &N);
  sortMove.set_arg<uint>(6, NULL, 192);  //Dynamic shared memory, 128+64, prefix-sum buffer
  sortMove.set_arg<uint>(7, NULL, 64*4); //Dynamic shared memory, stage buffer
  sortMove.set_arg<uint>(8, NULL, 64*4); //Dynamic shared memory, stage_values buffer
  sortMove.set_arg<setupParams>(9, &sParam);
  sortMove.set_arg<int>(10, &bitIdx);

  globalWork[0] = 120*32; globalWork[1] = 4;
  localWork [0] = 32;     localWork [1] = 4;
  sortMove.setWork(globalWork, localWork);

  bool pingPong = false;

  //Execute bitIdx 0
  sortCount.execute(execStream->s());
  exScanBlock.execute(execStream->s());
  sortMove.execute(execStream->s());

  //Swap buffers
  sortCount.set_arg<cl_mem>(0, keysOutput.p());
  sortMove.set_arg<cl_mem>(0, keysOutput.p());
  sortMove.set_arg<cl_mem>(1, keysAPing.p());
  sortMove.set_arg<cl_mem>(2, valuesOutput.p());
  sortMove.set_arg<cl_mem>(3, valuesAPing.p());

  //Remaining bits, ping-ponging the buffers
  for (int i = 1; i < numberOfBits; i++)
  {
    bitIdx = i;
    sortCount.set_arg<int>(5, &bitIdx);
    sortMove.set_arg<int>(10, &bitIdx);

    sortCount.execute(execStream->s());
    exScanBlock.execute(execStream->s());
    sortMove.execute(execStream->s());

    //Switch buffers
    if (pingPong)
    {
      sortCount.set_arg<cl_mem>(0, keysOutput.p());
      sortMove.set_arg<cl_mem>(0, keysOutput.p());
      sortMove.set_arg<cl_mem>(1, keysAPing.p());
      sortMove.set_arg<cl_mem>(2, valuesOutput.p());
      sortMove.set_arg<cl_mem>(3, valuesAPing.p());
      pingPong = false;
    }
    else
    {
      sortCount.set_arg<cl_mem>(0, keysAPing.p());
      sortMove.set_arg<cl_mem>(0, keysAPing.p());
      sortMove.set_arg<cl_mem>(1, keysOutput.p());
      sortMove.set_arg<cl_mem>(2, valuesAPing.p());
      sortMove.set_arg<cl_mem>(3, valuesOutput.p());
      pingPong = true;
    }
  }
}
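//----------------------------------------------------------------------------
//Worked example of the setupParams arithmetic used above (and in gpuSplit):
//each of the 480 blocks (120 processors x 4) handles 'jobs' chunks of 64
//elements, the first 'blocksWithExtraJobs' blocks take one extra chunk, and
//the tail of N % 64 elements is handled separately starting at 'extraOffset'.
//For N = 1000003:
//  N / 64              = 15625 full chunks
//  jobs                = 15625 / 480 = 32
//  blocksWithExtraJobs = 15625 % 480 = 265
//  extraElements       = 1000003 % 64 = 3
//  extraOffset         = 1000003 - 3  = 1000000
//----------------------------------------------------------------------------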
// If srcValues and buffer are different, then the original values
// are preserved; if they are the same, srcValues will be overwritten
void octree::gpuSort(my_dev::context &devContext,
                     my_dev::dev_mem<uint4> &srcValues,
                     my_dev::dev_mem<uint4> &output,
                     my_dev::dev_mem<uint4> &buffer,
                     int N, int numberOfBits, int subItems,
                     tree_structure &tree)
{
#if defined(USE_B40C)

  sorter->sort(srcValues, output, N);

#elif defined(USE_THRUST) && defined(USE_THRUST_96)

  //Extra buffer values
  my_dev::dev_mem<uint> permutation(devContext); //Permutation values, for sorting the int4 data
  my_dev::dev_mem<uint> temp_buffer(devContext); //Temporary uint buffer

  //Permutation has to be allocated after the two previously
  //allocated buffers, get the right offset
  int memOffset = permutation.getGlobalMemAllignmentPadding(8*N);
  memOffset    += 8*N;
  memOffset = permutation.cmalloc_copy(tree.generalBuffer1, N, memOffset);
  memOffset = temp_buffer.cmalloc_copy(tree.generalBuffer1, N, memOffset);

  thrust_sort_96b(srcValues, output, temp_buffer, permutation, N);

#else

  //Extra buffer values
  my_dev::dev_mem<uint> simpleKeys(devContext);   //Int keys
  my_dev::dev_mem<uint> permutation(devContext);  //Permutation values, for sorting the int4 data
  my_dev::dev_mem<int>  output32b(devContext);    //32-bit key output buffer
  my_dev::dev_mem<uint> valuesOutput(devContext); //Buffer for the values, which are the indexes

  //Permutation has to be allocated after the two previously
  //allocated buffers, get the right offset
  int memOffset = simpleKeys.getGlobalMemAllignmentPadding(8*N);
  memOffset    += 8*N;
  memOffset = simpleKeys.cmalloc_copy  (tree.generalBuffer1, N, memOffset);
  memOffset = permutation.cmalloc_copy (tree.generalBuffer1, N, memOffset);
  memOffset = output32b.cmalloc_copy   (tree.generalBuffer1, N, memOffset);
  memOffset = valuesOutput.cmalloc_copy(tree.generalBuffer1, N, memOffset);

  //Dimensions for the kernels that shuffle and extract data
  const int blockSize = 256;
  extractInt.setWork(N, blockSize);
  reOrderKeysValues.setWork(N, blockSize);

  //Idx depends on subItems; z goes first, x last if subItems == 3:
  //subItems == 3 -> idx = 2
  //subItems == 2 -> idx = 1
  //subItems == 1 -> idx = 0
  //intIdx = subItems-1
  int intIdx = subItems-1;

  //Extract a 32-bit key and fill a sequence
  extractInt.set_arg<cl_mem>(0, srcValues.p());
  extractInt.set_arg<cl_mem>(1, simpleKeys.p());
  extractInt.set_arg<cl_mem>(2, permutation.p());
  extractInt.set_arg<uint>(3,   &N);
  extractInt.set_arg<int>(4,    &intIdx); //integer idx

  reOrderKeysValues.set_arg<cl_mem>(0, srcValues.p());
  reOrderKeysValues.set_arg<cl_mem>(1, output.p());
  reOrderKeysValues.set_arg<cl_mem>(2, valuesOutput.p());
  reOrderKeysValues.set_arg<uint>(3,   &N);

  extractInt.execute(execStream->s());

#ifdef USE_THRUST
  thrust_sort_32b(devContext,
                  simpleKeys, permutation,
                  output32b, simpleKeys,
                  valuesOutput, permutation,
                  N, 32);
#else
  //Now sort the first 32-bit keys
  //Using a 32-bit sort with key and value separated
  gpuSort_32b(devContext,
              simpleKeys, permutation,
              output32b, simpleKeys,
              valuesOutput, permutation,
              N, 32);
#endif

  //Now reorder the main keys
  //Use output as the new output/src buffer
  reOrderKeysValues.execute(execStream->s());

  if(subItems == 1)
  {
    //Only doing one 32-bit sort. Data is already in output, so we are done
    return;
  }

  //2nd set of 32-bit keys
  //Idx depends on subItems; z goes first, x last if subItems == 3:
  //subItems == 3 -> idx = 1
  //subItems == 2 -> idx = 0
  //subItems == 1 -> completed in the previous round
  //intIdx = subItems-2
  intIdx = subItems-2;

  extractInt.set_arg<cl_mem>(0, output.p());
  extractInt.set_arg<int>(4,    &intIdx); //integer idx

  extractInt.execute(execStream->s());

#ifdef USE_THRUST
  thrust_sort_32b(devContext,
                  simpleKeys, permutation,
                  output32b, simpleKeys,
                  valuesOutput, permutation,
                  N, 32);
#else
  //Now sort the 2nd 32-bit keys
  //Using a 32-bit sort with key and value separated
  gpuSort_32b(devContext,
              simpleKeys, permutation,
              output32b, simpleKeys,
              valuesOutput, permutation,
              N, 32);
#endif

  reOrderKeysValues.set_arg<cl_mem>(0, output.p());
  reOrderKeysValues.set_arg<cl_mem>(1, buffer.p());
  reOrderKeysValues.execute(execStream->s());

  if(subItems == 2)
  {
    //Doing two 32-bit sorts. Data is in buffer,
    //so move the data from buffer to output
    output.copy(buffer, buffer.get_size());
    return;
  }

  //3rd set of 32-bit keys
  //Idx depends on subItems; z goes first, x last if subItems == 3:
  //subItems == 3 -> idx = 0
  //subItems == 2 -> completed in the previous round
  //subItems == 1 -> completed in the previous round
  intIdx = 0;

  extractInt.set_arg<cl_mem>(0, buffer.p());
  extractInt.set_arg<int>(4,    &intIdx); //integer idx

  extractInt.execute(execStream->s());

  //Now sort the final set of 32-bit keys
#ifdef USE_THRUST
  thrust_sort_32b(devContext,
                  simpleKeys, permutation,
                  output32b, simpleKeys,
                  valuesOutput, permutation,
                  N, 32);
#else
  gpuSort_32b(devContext,
              simpleKeys, permutation,
              output32b, simpleKeys,
              valuesOutput, permutation,
              N, 32);
#endif

  reOrderKeysValues.set_arg<cl_mem>(0, buffer.p());
  reOrderKeysValues.set_arg<cl_mem>(1, output.p());
  reOrderKeysValues.execute(execStream->s());

#endif // USE_THRUST_96
}
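//----------------------------------------------------------------------------
//Host-side sketch of the three-pass idea behind gpuSort: an LSD radix sort
//over 32-bit words is stable, so sorting on the least significant word, then
//the middle, then the most significant yields the full 96-bit order. This
//assumes x holds the most significant word and z the least significant,
//consistent with the "z goes first, x last" comments above; it sorts keys
//only, whereas the code above also carries a permutation along.
//----------------------------------------------------------------------------
#include <algorithm>
#include <vector>

static void sort96_host(std::vector<uint4> &keys)
{
  auto sortByWord = [&keys](int w) {
    std::stable_sort(keys.begin(), keys.end(),
      [w](const uint4 &a, const uint4 &b) {
        return (w == 2) ? a.z < b.z : (w == 1) ? a.y < b.y : a.x < b.x;
      });
  };
  sortByWord(2); //least significant word first ("z goes first")
  sortByWord(1);
  sortByWord(0); //most significant word last ("x last")
}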
//Splits an array of integers; the values in srcValid indicate whether a
//value is valid (1 == valid, anything else is invalid). Returns the
//split values in the output array (first all valid
//values and then the invalid ones); the total
//number of valid items is stored in 'count'
void octree::gpuSplit(my_dev::context &devContext,
                      my_dev::dev_mem<uint> &srcValues,
                      my_dev::dev_mem<uint> &output,
                      int N, int *validCount) //If validCount is NULL, leave the count on the device
{
  //In the next step we associate the GPU memory with the kernel arguments
  //my_dev::dev_mem<uint> counts(devContext, 512), countx(devContext, 512);
  //Memory that should be allocated outside the function:
  //devMemCounts and devMemCountsx

  //Make sure the previous reset has finished
  this->devMemCountsx.waitForCopyEvent();

  //Kernel configuration parameters
  setupParams sParam;
  sParam.jobs                = (N / 64) / 480; //64=32*2, 2 items per loop; 480 = 120*4, the number of processors
  sParam.blocksWithExtraJobs = (N / 64) % 480;
  sParam.extraElements       = N % 64;
  sParam.extraOffset         = N - sParam.extraElements;

  compactCount.set_arg<cl_mem>(0, srcValues.p());
  compactCount.set_arg<cl_mem>(1, this->devMemCounts.p());
  compactCount.set_arg<uint>(2,   &N);
  compactCount.set_arg<int>(3, NULL, 128);
  compactCount.set_arg<setupParams>(4, &sParam);
  compactCount.set_arg<cl_mem>(5, this->devMemCountsx.p());

  vector<size_t> localWork(2), globalWork(2);
  globalWork[0] = 32*120; globalWork[1] = 4;
  localWork [0] = 32;     localWork [1] = 4;
  compactCount.setWork(globalWork, localWork);

  ///////////////

  exScanBlock.set_arg<cl_mem>(0, this->devMemCounts.p());
  int blocks = 120*4;
  exScanBlock.set_arg<int>(1, &blocks);
  exScanBlock.set_arg<cl_mem>(2, this->devMemCountsx.p());
  exScanBlock.set_arg<int>(3, NULL, 512); //shared memory allocation

  globalWork[0] = 512; globalWork[1] = 1;
  localWork [0] = 512; localWork [1] = 1;
  exScanBlock.setWork(globalWork, localWork);

  //////////////

  splitMove.set_arg<cl_mem>(0, srcValues.p());
  splitMove.set_arg<cl_mem>(1, output.p());
  splitMove.set_arg<cl_mem>(2, this->devMemCounts.p());
  splitMove.set_arg<uint>(3,   &N);
  splitMove.set_arg<uint>(4, NULL, 192); //Dynamic shared memory
  splitMove.set_arg<setupParams>(5, &sParam);

  globalWork[0] = 120*32; globalWork[1] = 4;
  localWork [0] = 32;     localWork [1] = 4;
  splitMove.setWork(globalWork, localWork);

  ////////////////////

  compactCount.execute(execStream->s());
  exScanBlock.execute(execStream->s());
  splitMove.execute(execStream->s());

  if (validCount)
  {
    this->devMemCountsx.d2h();
    *validCount = this->devMemCountsx[0];
  }
}
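//----------------------------------------------------------------------------
//Host-side reference for the split semantics above: valid entries first (in
//their original order), invalid entries after them, and the valid count
//returned. This sketch assumes the validity flag can be tested per element
//(here via a separate 'valid' array); useful for unit-testing the
//compactCount / exScanBlock / splitMove pipeline.
//----------------------------------------------------------------------------
#include <vector>

static int split_host(std::vector<uint> &values, const std::vector<uint> &valid)
{
  std::vector<uint> out;
  out.reserve(values.size());

  int count = 0;
  for (size_t i = 0; i < values.size(); i++)  //first pass: the valid items
    if (valid[i] == 1) { out.push_back(values[i]); count++; }
  for (size_t i = 0; i < values.size(); i++)  //second pass: everything else
    if (valid[i] != 1) out.push_back(values[i]);

  values.swap(out);
  return count; //number of valid items, cf. devMemCountsx[0]
}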
void octree::approximate_gravity(tree_structure &tree,
    my_dev::dev_mem<float4> &j_bodies_pos, //Bodies that are part of the tree-structure
    my_dev::dev_mem<float4> &j_bodies_h,
    my_dev::dev_mem<int>    &j_bodies_idx,
    my_dev::dev_mem<float4> &i_bodies_pos, //Bodies that are part of the groups
    my_dev::dev_mem<float4> &i_bodies_h,
    my_dev::dev_mem<int>    &i_bodies_idx,
    int n_groupBodies,                     //Number of bodies that are part of the groups
    my_dev::dev_mem<real4>  &i_bodies_acc,
    my_dev::dev_mem<real>   &i_bodies_ds2,
    my_dev::dev_mem<int>    &i_bodies_ngb)
{
#if 1
  uint2 node_begend;
  int level_start = 2;
  node_begend.x = tree.level_list[level_start].x;
  node_begend.y = tree.level_list[level_start].y;

  //Reset the active particles
  tree.activePartList.zeroMem();
  this->atomicValues.zeroMem();

  float eps2 = this->eps2;

  //Set the kernel parameters, many!
  int argIdx = 0;
  approxGrav.set_arg<int>(argIdx++,    &tree.n_groups);
  approxGrav.set_arg<float>(argIdx++,  &eps2);
  approxGrav.set_arg<uint2>(argIdx++,  &node_begend);
  approxGrav.set_arg<cl_mem>(argIdx++, this->atomicValues.p());
  approxGrav.set_arg<cl_mem>(argIdx++, j_bodies_pos.p());
  approxGrav.set_arg<cl_mem>(argIdx++, j_bodies_h .p());
  approxGrav.set_arg<cl_mem>(argIdx++, i_bodies_acc.p());
  approxGrav.set_arg<cl_mem>(argIdx++, i_bodies_pos.p());
  approxGrav.set_arg<cl_mem>(argIdx++, i_bodies_h .p());
  approxGrav.set_arg<cl_mem>(argIdx++, i_bodies_ds2.p());
  approxGrav.set_arg<cl_mem>(argIdx++, i_bodies_ngb.p());
  approxGrav.set_arg<cl_mem>(argIdx++, tree.activePartList.p());
  approxGrav.set_arg<cl_mem>(argIdx++, tree.interactions.p());
  approxGrav.set_arg<cl_mem>(argIdx++, tree.group_list.p());
  approxGrav.set_arg<cl_mem>(argIdx++, tree.multipole.p());
  approxGrav.set_arg<cl_mem>(argIdx++, tree.boxSizeInfo.p());
  approxGrav.set_arg<cl_mem>(argIdx++, tree.boxCenterInfo.p());
  approxGrav.set_arg<cl_mem>(argIdx++, tree.generalBuffer1.p()); //Instead of using local memory
#if 1
  approxGrav.set_arg<cl_mem>(argIdx++, j_bodies_idx.p()); //Particle IDs, j bodies (the tree-particles)
  approxGrav.set_arg<cl_mem>(argIdx++, i_bodies_idx.p()); //Particle IDs, i bodies (the group particles)
#endif
  approxGrav.set_arg<real4>(argIdx++, tree.boxSizeInfo,   4, "texNodeSize");
  approxGrav.set_arg<real4>(argIdx++, tree.boxCenterInfo, 4, "texNodeCenter");
  approxGrav.set_arg<real4>(argIdx++, tree.multipole,     4, "texMultipole");
  approxGrav.set_arg<real4>(argIdx++, j_bodies_pos,       4, "texBody");

  approxGrav.setWork(-1, NTHREAD, nBlocksForTreeWalk);
//approxGrav.setWork(-1, NTHREAD, 1);

#if 0
  devContext.startTiming();
#endif
  approxGrav.execute(execStream->s()); //First half
#if 0
  execStream->sync();
  devContext.stopTiming("Gravity", 1);
#endif

  //Print interaction statistics
#if 0
  tree.n = n_groupBodies;
  tree.body2group_list.d2h();
  tree.interactions.d2h();
  long long directSum  = 0;
  long long apprSum    = 0;
  long long directSum2 = 0;
  long long apprSum2   = 0;

  int maxDir  = -1;
  int maxAppr = -1;

  for (int i = 0; i < tree.n; i++)
  {
    apprSum   += tree.interactions[i].x;
    directSum += tree.interactions[i].y;

    maxAppr = max(maxAppr, tree.interactions[i].x);
    maxDir  = max(maxDir,  tree.interactions[i].y);

    apprSum2   += tree.interactions[i].x*tree.interactions[i].x;
    directSum2 += tree.interactions[i].y*tree.interactions[i].y;
  }

  //cerr << "Interaction at iter: " << iter << "\tdirect: " << directSum << "\tappr: " << apprSum << "\t";
  //cerr << "avg dir: " << directSum / tree.n << "\tavg appr: " << apprSum / tree.n << endl;
  cout << "Interaction at (rank= " << 0 << " ) iter: " << 0
       << "\tdirect: " << directSum << "\tappr: " << apprSum << "\t";
  cout << "avg dir: " << directSum / tree.n << "\tavg appr: " << apprSum / tree.n
       << "\tMaxdir: " << maxDir << "\tmaxAppr: " << maxAppr << endl;
  cout << "sigma dir: " << sqrt((directSum2 - directSum) / tree.n)
       << "\tsigma appr: " << std::sqrt((apprSum2 - apprSum) / tree.n) << endl;

#if 0
  //Histogram of the number of interactions
  const int bins = 256;
  const int jump = 15;
  int histoIDX[bins+1];
  for (int i = 0; i < bins; i++)
    histoIDX[i] = 0;

  for (int i = 0; i < tree.n; i++)
  {
    int idx = tree.interactions[i].x / jump;
    if (idx >= bins) idx = bins;
    histoIDX[idx]++;
  }

  for (int i = 0; i < bins; i++)
  {
    if (histoIDX[i] == 0)
      fprintf(stderr, "HISTO %d\t-\n",  i*jump);
    else
      fprintf(stderr, "HISTO %d\t%d\n", i*jump, histoIDX[i]);
  }
#endif
#endif

#if 0
  i_bodies_ngb.d2h();
  i_bodies_acc.d2h();
  i_bodies_ds2.d2h();
  for (int i = 0; i < n_groupBodies; i++)
  {
    fprintf(stderr, "%d\t Acc: %f %f %f %f \t Ds2: %f \t NGB: %d \n", i,
            i_bodies_acc[i].x, i_bodies_acc[i].y, i_bodies_acc[i].z, i_bodies_acc[i].w,
            i_bodies_ds2[i], i_bodies_ngb[i]);
  }
#endif

  /*
  //Reduce the number of valid particles
  getNActive.set_arg<int>(0,    &tree.n);
  getNActive.set_arg<cl_mem>(1, tree.activePartList.p());
  getNActive.set_arg<cl_mem>(2, this->nactive.p());
  getNActive.set_arg<int>(3, NULL, 128); //Dynamic shared memory, equal to the number of threads
  getNActive.setWork(-1, 128, NBLOCK_REDUCE);
  CU_SAFE_CALL(cuCtxSynchronize()); //Synchronize all streams; makes sure the approx stream is finished
  getNActive.execute();

  //Reduce the last parts on the host
  this->nactive.d2h();
  tree.n_active_particles = this->nactive[0];
  for (int i = 1; i < NBLOCK_REDUCE; i++)
    tree.n_active_particles += this->nactive[i];

  printf("Active particles: %d \n", tree.n_active_particles);
  */

  my_dev::base_mem::printMemUsage();
#endif
}
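//----------------------------------------------------------------------------
//Small host helper capturing the interaction-statistics math from the #if 0
//block above (mean, max, and spread of the approximate-interaction counts),
//assuming the counters were copied back with interactions.d2h() and are
//stored as a uint2 per body (.x = approximate, .y = direct). Note it uses
//the standard deviation sqrt(E[x^2] - E[x]^2); the debug code above prints
//a cheaper sqrt((sum2 - sum)/n) estimate instead.
//----------------------------------------------------------------------------
#include <cmath>
#include <cstdio>

static void interaction_stats(const uint2 *interactions, int n)
{
  long long sum = 0, sum2 = 0;
  int maxAppr = -1;

  for (int i = 0; i < n; i++)
  {
    const int x = interactions[i].x; //approximate interactions for body i
    sum  += x;
    sum2 += (long long)x * x;
    if (x > maxAppr) maxAppr = x;
  }

  const double mean  = (double)sum / n;
  const double sigma = std::sqrt((double)sum2 / n - mean*mean);
  printf("appr: mean %.1f\tmax %d\tsigma %.1f\n", mean, maxAppr, sigma);
}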
void octree::get_ngb(tree_structure &tree,
    my_dev::dev_mem<float4> &j_bodies_pos, //Bodies that are part of the tree-structure
    my_dev::dev_mem<int>    &j_bodies_idx,
    my_dev::dev_mem<float4> &i_bodies_pos, //Bodies that are part of the groups
    my_dev::dev_mem<int>    &i_bodies_idx,
    my_dev::dev_mem<int>    &i_bodies_ngb)
{
  uint2 node_begend;
  int level_start = 2;
  node_begend.x = tree.level_list[level_start].x;
  node_begend.y = tree.level_list[level_start].y;

  //Reset the active particles
  tree.activePartList.zeroMem();
  this->atomicValues.zeroMem();

  //Set the kernel parameters, many!
  int argIdx = 0;
  getNGB.set_arg<int>(argIdx++,    &tree.n_groups);
  getNGB.set_arg<uint2>(argIdx++,  &node_begend);
  getNGB.set_arg<cl_mem>(argIdx++, this->atomicValues.p());
  getNGB.set_arg<cl_mem>(argIdx++, j_bodies_pos.p());
  getNGB.set_arg<cl_mem>(argIdx++, i_bodies_pos.p());
  getNGB.set_arg<cl_mem>(argIdx++, i_bodies_ngb.p());
  getNGB.set_arg<cl_mem>(argIdx++, tree.activePartList.p());
  getNGB.set_arg<cl_mem>(argIdx++, tree.interactions.p());
  getNGB.set_arg<cl_mem>(argIdx++, tree.group_list.p());
  getNGB.set_arg<cl_mem>(argIdx++, tree.boxSizeInfo.p());
  getNGB.set_arg<cl_mem>(argIdx++, tree.boxCenterInfo.p());
  getNGB.set_arg<cl_mem>(argIdx++, j_bodies_idx.p()); //Particle IDs, j bodies (the tree-particles)
  getNGB.set_arg<cl_mem>(argIdx++, i_bodies_idx.p()); //Particle IDs, i bodies (the group particles)
  getNGB.set_arg<cl_mem>(argIdx++, tree.generalBuffer1.p()); //Instead of using local memory
  getNGB.set_arg<real4>(argIdx++, tree.boxSizeInfo,   4, "texNodeSize");
  getNGB.set_arg<real4>(argIdx++, tree.boxCenterInfo, 4, "texNodeCenter");
  getNGB.set_arg<real4>(argIdx++, j_bodies_pos,       4, "texBody");

  getNGB.setWork(-1, NTHREAD, nBlocksForTreeWalk);

  devContext.startTiming();
  getNGB.execute(execStream->s()); //First half
  execStream->sync();
  devContext.stopTiming("GetNGB", 1);

  //Print interaction statistics
  //(stale debug code: n_groupBodies, i_bodies_acc and i_bodies_ds2 are not
  //defined in this function)
#if 0
  tree.n = n_groupBodies;
  tree.body2group_list.d2h();
  tree.interactions.d2h();
  long long directSum  = 0;
  long long apprSum    = 0;
  long long directSum2 = 0;
  long long apprSum2   = 0;

  int maxDir  = -1;
  int maxAppr = -1;

  for (int i = 0; i < tree.n; i++)
  {
    apprSum   += tree.interactions[i].x;
    directSum += tree.interactions[i].y;

    maxAppr = max(maxAppr, tree.interactions[i].x);
    maxDir  = max(maxDir,  tree.interactions[i].y);

    apprSum2   += tree.interactions[i].x*tree.interactions[i].x;
    directSum2 += tree.interactions[i].y*tree.interactions[i].y;
  }

  //cerr << "Interaction at iter: " << iter << "\tdirect: " << directSum << "\tappr: " << apprSum << "\t";
  //cerr << "avg dir: " << directSum / tree.n << "\tavg appr: " << apprSum / tree.n << endl;
  cout << "Interaction at (rank= " << 0 << " ) iter: " << 0
       << "\tdirect: " << directSum << "\tappr: " << apprSum << "\t";
  cout << "avg dir: " << directSum / tree.n << "\tavg appr: " << apprSum / tree.n
       << "\tMaxdir: " << maxDir << "\tmaxAppr: " << maxAppr << endl;
  cout << "sigma dir: " << sqrt((directSum2 - directSum) / tree.n)
       << "\tsigma appr: " << std::sqrt((apprSum2 - apprSum) / tree.n) << endl;
#endif

#if 0
  i_bodies_ngb.d2h();
  i_bodies_acc.d2h();
  i_bodies_ds2.d2h();
  for (int i = 0; i < n_groupBodies; i++)
  {
    fprintf(stderr, "%d\t Acc: %f %f %f %f \t Ds2: %f \t NGB: %d \n", i,
            i_bodies_acc[i].x, i_bodies_acc[i].y, i_bodies_acc[i].z, i_bodies_acc[i].w,
            i_bodies_ds2[i], i_bodies_ngb[i]);
  }
#endif

  my_dev::base_mem::printMemUsage();
}
void octree::sort_bodies(tree_structure &tree,
                         my_dev::dev_mem<float4> &bodies_pos,
                         my_dev::dev_mem<uint>   &sortPermutation,
                         int n_bodies)
{
  //We assume the bodies are already on the GPU
  devContext.startTiming();

  this->allocateParticleSpecificBuffers(n_bodies);

  //Call the gpuSort function. Since we made it general enough to handle
  //uint4, we can extend the tree to 96-bit keys: we convert the 64-bit key
  //to a 96-bit one for sorting, and back from 96 to 64 afterwards
  my_dev::dev_mem<uint4> srcValues(devContext);
  my_dev::dev_mem<uint4> output(devContext);
  my_dev::dev_mem<uint4> bodies_key(devContext);

  //Allocate memory inside generalBuffer1, which has size uint*4*N*3

  //this buffer gets part: 0 - uint*4*N
  srcValues.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                         tree.generalBuffer1.get_flags(),
                         tree.generalBuffer1.get_devMem(),
                         &tree.generalBuffer1[0], 0,
                         n_bodies, getAllignmentOffset(0));

  //this buffer gets part: uint*4*N - uint*4*N*2
  output.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                      tree.generalBuffer1.get_flags(),
                      tree.generalBuffer1.get_devMem(),
                      &tree.generalBuffer1[4*n_bodies], 4*n_bodies,
                      n_bodies, getAllignmentOffset(4*n_bodies));

  int prevOffset = getAllignmentOffset(4*n_bodies);

  bodies_key.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                          tree.generalBuffer1.get_flags(),
                          tree.generalBuffer1.get_devMem(),
                          &tree.generalBuffer1[8*n_bodies], 8*n_bodies,
                          n_bodies,
                          prevOffset + getAllignmentOffset(8*n_bodies + prevOffset));

  //This function computes the keys. It is kept separate since we also
  //compute keys before building the tree-structure.
  //Corner and size are not stored, since we can sort without building a tree
  float4 corner;
  float  domain_fac;
  compute_keys(bodies_pos, bodies_key, n_bodies, corner, domain_fac);

  //Extract the keys
  convertKey64to96.set_arg<cl_mem>(0, bodies_key.p());
  convertKey64to96.set_arg<cl_mem>(1, srcValues.p());
  convertKey64to96.set_arg<int>(2,    &n_bodies);
  convertKey64to96.setWork(n_bodies, 256);
  convertKey64to96.execute();

  //Sort the keys.
  //If srcValues (2nd argument) and buffer (4th argument) are different, the
  //original values are preserved; if they are the same, srcValues is overwritten
  gpuSort(devContext, srcValues, output, srcValues, n_bodies, 32, 3, tree);

  //Extract the sorted keys and get the permutation required to sort the
  //other properties of the particles
  extractKeyAndPerm.set_arg<cl_mem>(0, output.p());
  extractKeyAndPerm.set_arg<cl_mem>(1, bodies_key.p());
  extractKeyAndPerm.set_arg<cl_mem>(2, sortPermutation.p());
  extractKeyAndPerm.set_arg<int>(3,   &n_bodies);
  extractKeyAndPerm.setWork(n_bodies, 256);
  extractKeyAndPerm.execute();

  devContext.stopTiming("Sorting", 0);
}
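//----------------------------------------------------------------------------
//Sketch of how the permutation produced by sort_bodies is typically applied
//to the remaining per-particle arrays (velocities, IDs, ...). This assumes
//perm[i] holds the pre-sort index of the particle that ends up at position i,
//i.e. a gather; on the device this is a one-line kernel, here a hypothetical
//host-side equivalent for clarity.
//----------------------------------------------------------------------------
#include <vector>

template<typename T>
static std::vector<T> apply_permutation(const std::vector<T>    &in,
                                        const std::vector<uint> &perm)
{
  std::vector<T> out(in.size());
  for (size_t i = 0; i < in.size(); i++)
    out[i] = in[perm[i]]; //gather: out is the sorted order
  return out;
}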
// If srcValues and buffer are different, then the original values
// are preserved; if they are the same, srcValues will be overwritten
void octree::gpuSort(my_dev::context &devContext,
                     my_dev::dev_mem<uint4> &srcValues,
                     my_dev::dev_mem<uint4> &output,
                     my_dev::dev_mem<uint4> &buffer,
                     int N, int numberOfBits, int subItems,
                     tree_structure &tree)
{
  //Extra buffer values
//my_dev::dev_mem<uint> simpleKeys(devContext, N);   //Int keys
//my_dev::dev_mem<uint> permutation(devContext, N);  //Permutation values, for sorting the int4 data
//my_dev::dev_mem<int>  output32b(devContext, N);    //32-bit key output buffer
//my_dev::dev_mem<uint> valuesOutput(devContext, N); //Buffers for the values, which are the indexes

  my_dev::dev_mem<uint> simpleKeys(devContext);   //Int keys
  my_dev::dev_mem<uint> permutation(devContext);  //Permutation values, for sorting the int4 data
  my_dev::dev_mem<int>  output32b(devContext);    //32-bit key output buffer
  my_dev::dev_mem<uint> valuesOutput(devContext); //Buffers for the values, which are the indexes

  int prevOffsetSum = getAllignmentOffset(4*N); //The offset of output

  simpleKeys.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                          tree.generalBuffer1.get_flags(),
                          tree.generalBuffer1.get_devMem(),
                          &tree.generalBuffer1[8*N], 8*N,
                          N, prevOffsetSum + getAllignmentOffset(8*N + prevOffsetSum)); //Offset 8*N since there are 2 uint4 buffers before us

  prevOffsetSum += getAllignmentOffset(8*N + prevOffsetSum);

  permutation.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                           tree.generalBuffer1.get_flags(),
                           tree.generalBuffer1.get_devMem(),
                           &tree.generalBuffer1[9*N], 9*N,
                           N, prevOffsetSum + getAllignmentOffset(9*N + prevOffsetSum)); //N elements after simpleKeys

  prevOffsetSum += getAllignmentOffset(9*N + prevOffsetSum);

  output32b.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                         tree.generalBuffer1.get_flags(),
                         tree.generalBuffer1.get_devMem(),
                         &tree.generalBuffer1[10*N], 10*N,
                         N, prevOffsetSum + getAllignmentOffset(10*N + prevOffsetSum)); //N elements after permutation

  prevOffsetSum += getAllignmentOffset(10*N + prevOffsetSum);

  valuesOutput.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                            tree.generalBuffer1.get_flags(),
                            tree.generalBuffer1.get_devMem(),
                            &tree.generalBuffer1[11*N], 11*N,
                            N, prevOffsetSum + getAllignmentOffset(11*N + prevOffsetSum)); //N elements after output32b

  //Dimensions for the kernels that shuffle and extract data
  const int blockSize = 256;
  int ng = (N)/blockSize + 1;
  int nx = (int)sqrt(ng);
  int ny = (ng-1)/nx + 1;

  vector<size_t> localWork(2), globalWork(2);
  globalWork[0] = nx*blockSize; globalWork[1] = ny;
  localWork [0] = blockSize;    localWork [1] = 1;

  extractInt.setWork(globalWork, localWork);
  fillSequence.setWork(globalWork, localWork);
  reOrderKeysValues.setWork(globalWork, localWork);

  //Idx depends on subItems; z goes first, x last if subItems == 3:
  //subItems == 3 -> idx = 2
  //subItems == 2 -> idx = 1
  //subItems == 1 -> idx = 0
  //intIdx = subItems-1
  int intIdx = subItems-1;

  extractInt.set_arg<cl_mem>(0, srcValues.p());
  extractInt.set_arg<cl_mem>(1, simpleKeys.p());
  extractInt.set_arg<uint>(2,   &N);
  extractInt.set_arg<int>(3,    &intIdx); //integer idx

  fillSequence.set_arg<cl_mem>(0, permutation.p());
  fillSequence.set_arg<uint>(1,   &N);

  reOrderKeysValues.set_arg<cl_mem>(0, srcValues.p());
  reOrderKeysValues.set_arg<cl_mem>(1, output.p());
  reOrderKeysValues.set_arg<cl_mem>(2, valuesOutput.p());
  reOrderKeysValues.set_arg<uint>(3,   &N);

  extractInt.execute();
  fillSequence.execute();

  //Now sort the first 32-bit keys
  //Using a 32-bit sort with key and value separated
  gpuSort_32b(devContext,
              simpleKeys, permutation,
//            output32b, aPing32b,
              output32b, simpleKeys,
//            valuesOutput, valuesAPing,
              valuesOutput, permutation,
//            count,
              N, 32);

  //Now reorder the main keys
  //Use output as the new output/src buffer
  reOrderKeysValues.execute();

  if(subItems == 1)
  {
    //Only doing one 32-bit sort. Data is already in output, so we are done
    return;
  }

  //2nd set of 32-bit keys
  //Idx depends on subItems; z goes first, x last if subItems == 3:
  //subItems == 3 -> idx = 1
  //subItems == 2 -> idx = 0
  //subItems == 1 -> completed in the previous round
  //intIdx = subItems-2
  intIdx = subItems-2;

  extractInt.set_arg<cl_mem>(0, output.p());
  extractInt.set_arg<int>(3,    &intIdx); //integer idx

  reOrderKeysValues.set_arg<cl_mem>(0, output.p());
  reOrderKeysValues.set_arg<cl_mem>(1, buffer.p());

  extractInt.execute();
  fillSequence.execute();

  //Now sort the 2nd 32-bit keys
  //Using a 32-bit sort with key and value separated
  gpuSort_32b(devContext,
              simpleKeys, permutation,
//            output32b, aPing32b,
              output32b, simpleKeys,
//            valuesOutput, valuesAPing,
              valuesOutput, permutation,
//            count,
              N, 32);

  reOrderKeysValues.execute();

  if(subItems == 2)
  {
    //Doing two 32-bit sorts. Data is in buffer,
    //so move the data from buffer to output
    output.copy(buffer, buffer.get_size());
    return;
  }

  //3rd set of 32-bit keys
  //Idx depends on subItems; z goes first, x last if subItems == 3:
  //subItems == 3 -> idx = 0
  //subItems == 2 -> completed in the previous round
  //subItems == 1 -> completed in the previous round
  intIdx = 0;

  extractInt.set_arg<cl_mem>(0, buffer.p());
  extractInt.set_arg<int>(3,    &intIdx); //integer idx

  reOrderKeysValues.set_arg<cl_mem>(0, buffer.p());
  reOrderKeysValues.set_arg<cl_mem>(1, output.p());

  extractInt.execute();
  fillSequence.execute();

  //Now sort the final set of 32-bit keys
  //Using a 32-bit sort with key and value separated
  //(see sortArray4 for the int2 variant with key and value combined)
  gpuSort_32b(devContext,
              simpleKeys, permutation,
//            output32b, aPing32b,
              output32b, simpleKeys,
//            valuesOutput, valuesAPing,
              valuesOutput, permutation,
//            count,
              N, 32);

  reOrderKeysValues.execute();

  clFinish(devContext.get_command_queue());

//fprintf(stderr, "sortArray2 done in %g sec (Without memory alloc & compilation) \n", get_time() - t0);
}
//Splits an array of integers; the values in srcValid indicate whether a
//value is valid (1 == valid, anything else is invalid). Returns the
//split values in the output array (first all valid
//values and then the invalid ones); the total
//number of valid items is stored in 'count'
void octree::gpuSplit(my_dev::context &devContext,
                      my_dev::dev_mem<uint> &srcValues,
                      my_dev::dev_mem<uint> &output,
                      int N, int *validCount)
{
  //In the next step we associate the GPU memory with the kernel arguments
  //my_dev::dev_mem<uint> counts(devContext, 512), countx(devContext, 512);
  //Memory that should be allocated outside the function:
  //devMemCounts and devMemCountsx

  //Kernel configuration parameters
  setupParams sParam;
  sParam.jobs                = (N / 64) / 480; //64=32*2, 2 items per loop; 480 = 120*4, the number of processors
  sParam.blocksWithExtraJobs = (N / 64) % 480;
  sParam.extraElements       = N % 64;
  sParam.extraOffset         = N - sParam.extraElements;

//printf("Param info: %d %d %d %d \n", sParam.jobs, sParam.blocksWithExtraJobs, sParam.extraElements, sParam.extraOffset);

  compactCount.set_arg<cl_mem>(0, srcValues.p());
  compactCount.set_arg<cl_mem>(1, this->devMemCounts.p());
  compactCount.set_arg<uint>(2,   &N);
  compactCount.set_arg<int>(3, NULL, 128);
  compactCount.set_arg<setupParams>(4, &sParam);

  vector<size_t> localWork(2), globalWork(2);
  globalWork[0] = 32*120; globalWork[1] = 4;
  localWork [0] = 32;     localWork [1] = 4;
  compactCount.setWork(globalWork, localWork);

  ///////////////

  exScanBlock.set_arg<cl_mem>(0, this->devMemCounts.p());
  int blocks = 120*4;
  exScanBlock.set_arg<int>(1, &blocks);
  exScanBlock.set_arg<cl_mem>(2, this->devMemCountsx.p());
  exScanBlock.set_arg<int>(3, NULL, 512); //shared memory allocation

  globalWork[0] = 512; globalWork[1] = 1;
  localWork [0] = 512; localWork [1] = 1;
  exScanBlock.setWork(globalWork, localWork);

  //////////////

  splitMove.set_arg<cl_mem>(0, srcValues.p());
  splitMove.set_arg<cl_mem>(1, output.p());
  splitMove.set_arg<cl_mem>(2, this->devMemCounts.p());
  splitMove.set_arg<uint>(3,   &N);
  splitMove.set_arg<uint>(4, NULL, 192); //Dynamic shared memory
  splitMove.set_arg<setupParams>(5, &sParam);

  globalWork[0] = 120*32; globalWork[1] = 4;
  localWork [0] = 32;     localWork [1] = 4;
  splitMove.setWork(globalWork, localWork);

  ////////////////////

  compactCount.execute();

//exit(0);
//counts.d2h();
//for(int i=0; i < 482; i++)
//{
//  printf("%d\t%d\n", i, counts[i]);
//}

  exScanBlock.execute();
  splitMove.execute();

  //TODO fix the damn clFinish function
#ifdef USE_CUDA
  cuCtxSynchronize();
#else
  clFinish(devContext.get_command_queue());
#endif

  this->devMemCountsx.d2h();
  *validCount = this->devMemCountsx[0];
  //printf("Total number of valid items: %d \n", countx[0]);
}
void octree::compute_properties(tree_structure &tree,
                                my_dev::dev_mem<float4> &bodies_pos,
                                int n_bodies)
{
  /*****************************************************
  Assign the memory buffers. Note that we check the size
  first and, if needed, increase the size of generalBuffer1.

  Size required:
    - multipoleD         -> double4*3_n_nodes -> 2*3*n_nodes*uint4
    - lower/upper bounds -> 2*n_nodes*uint4
    - node lower/upper   -> 2*n_nodes*uint4
    - SUM: 10*n_nodes*uint4

  generalBuffer1 has the default size: 3*N*uint4.
  Check if 10*n_nodes > 3*N; if so, realloc.
  (Note that generalBuffer1 might be larger because of the tree-walk stack)
  *****************************************************/

  if(10*tree.n_nodes > 3*tree.n)
  {
#ifdef _DEBUG_PRINT_
    fprintf(stderr, "Resizing the generalBuffer1 \n");
#endif
    tree.generalBuffer1.cresize(10*tree.n_nodes*4, false);
  }

  my_dev::dev_mem<double4> multipoleD(devContext);
  my_dev::dev_mem<real4>   nodeLowerBounds(devContext); //Lower bounds, used for scaling? TODO
  my_dev::dev_mem<real4>   nodeUpperBounds(devContext); //Upper bounds, used for scaling? TODO

  multipoleD.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                          tree.generalBuffer1.get_flags(),
                          tree.generalBuffer1.get_devMem(),
                          &tree.generalBuffer1[0], 0,
                          3*tree.n_nodes, getAllignmentOffset(0));

  //Offset is in uints, so: double4 = 8 uints * 3 * n_nodes
  nodeLowerBounds.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                               tree.generalBuffer1.get_flags(),
                               tree.generalBuffer1.get_devMem(),
                               &tree.generalBuffer1[8*3*tree.n_nodes], 8*3*tree.n_nodes,
                               tree.n_nodes, getAllignmentOffset(8*3*tree.n_nodes));

  int prevOffsetSum = getAllignmentOffset(8*3*tree.n_nodes); //The offset of output

  nodeUpperBounds.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                               tree.generalBuffer1.get_flags(),
                               tree.generalBuffer1.get_devMem(),
                               &tree.generalBuffer1[8*3*tree.n_nodes + 4*tree.n_nodes],
                               8*3*tree.n_nodes + 4*tree.n_nodes,
                               tree.n_nodes,
                               prevOffsetSum + getAllignmentOffset(8*3*tree.n_nodes + 4*tree.n_nodes + prevOffsetSum));

  //Compute the tree-properties (size, cm, monopole, quadrupole, etc.)
  //Start the kernel for the leaf-type nodes
  propsLeafD.set_arg<int>(0,    &tree.n_leafs);
  propsLeafD.set_arg<cl_mem>(1, tree.leafNodeIdx.p());
  propsLeafD.set_arg<cl_mem>(2, tree.node_bodies.p());
  propsLeafD.set_arg<cl_mem>(3, bodies_pos.p());
  propsLeafD.set_arg<cl_mem>(4, multipoleD.p());
  propsLeafD.set_arg<cl_mem>(5, nodeLowerBounds.p());
  propsLeafD.set_arg<cl_mem>(6, nodeUpperBounds.p());
//propsLeafD.set_arg<cl_mem>(7, tree.bodies_Pvel.p()); //Velocity to get max eps

  propsLeafD.setWork(tree.n_leafs, 128);
#ifdef _DEBUG_PRINT_
  printf("PropsLeaf: ");
  propsLeafD.printWorkSize();
#endif
  propsLeafD.execute();

  int temp = tree.n_nodes - tree.n_leafs;
  propsNonLeafD.set_arg<int>(0,    &temp);
  propsNonLeafD.set_arg<cl_mem>(1, tree.leafNodeIdx.p());
  propsNonLeafD.set_arg<cl_mem>(2, tree.node_level_list.p());
  propsNonLeafD.set_arg<cl_mem>(3, tree.n_children.p());
  propsNonLeafD.set_arg<cl_mem>(4, multipoleD.p());
  propsNonLeafD.set_arg<cl_mem>(5, nodeLowerBounds.p());
  propsNonLeafD.set_arg<cl_mem>(6, nodeUpperBounds.p());

  //Work from the bottom up
  for(int i = tree.n_levels; i >= 1; i--)
  {
    propsNonLeafD.set_arg<int>(0, &i);
    {
      int totalOnThisLevel = tree.node_level_list[i] - tree.node_level_list[i-1];
      propsNonLeafD.setWork(totalOnThisLevel, 128);
#ifdef _DEBUG_PRINT_
      printf("PropsNonLeaf, nodes on level %d : %d (start: %d end: %d) , config: \t",
             i, totalOnThisLevel, tree.node_level_list[i-1], tree.node_level_list[i]);
#endif
      propsNonLeafD.printWorkSize();
    }
    propsNonLeafD.set_arg<int>(0, &i); //set the level
    propsNonLeafD.execute();
  }

  propsScalingD.set_arg<int>(0,    &tree.n_nodes);
  propsScalingD.set_arg<cl_mem>(1, multipoleD.p());
  propsScalingD.set_arg<cl_mem>(2, nodeLowerBounds.p());
  propsScalingD.set_arg<cl_mem>(3, nodeUpperBounds.p());
  propsScalingD.set_arg<cl_mem>(4, tree.n_children.p());
  propsScalingD.set_arg<cl_mem>(5, tree.multipole.p());
  propsScalingD.set_arg<float >(6, &theta);
  propsScalingD.set_arg<cl_mem>(7, tree.boxSizeInfo.p());
  propsScalingD.set_arg<cl_mem>(8, tree.boxCenterInfo.p());
  propsScalingD.set_arg<cl_mem>(9, tree.node_bodies.p());

  propsScalingD.setWork(tree.n_nodes, 128);
#ifdef _DEBUG_PRINT_
  printf("propsScaling: \t ");
  propsScalingD.printWorkSize();
#endif
  propsScalingD.execute();

#if 0
#ifdef INDSOFT
  //If we use individual softening we need the max softening value,
  //to be broadcast during the exchange of the LET boundaries.
  //Only copy the root node, which contains the max value
  my_dev::dev_stream memCpyStream;
  tree.multipole.d2h(3, false, memCpyStream.s());
#endif

  //Set the group properties; note that they are no longer based on the nodes
  //but on self-created groups based on particle order: setPHGroupData
  copyNodeDataToGroupData.set_arg<int>(0,    &tree.n_groups);
  copyNodeDataToGroupData.set_arg<int>(1,    &tree.n);
  copyNodeDataToGroupData.set_arg<cl_mem>(2, tree.bodies_Ppos.p());
  copyNodeDataToGroupData.set_arg<cl_mem>(3, tree.group_list_test.p());
  copyNodeDataToGroupData.set_arg<cl_mem>(4, tree.groupCenterInfo.p());
  copyNodeDataToGroupData.set_arg<cl_mem>(5, tree.groupSizeInfo.p());
  copyNodeDataToGroupData.setWork(-1, NCRIT, tree.n_groups);
  copyNodeDataToGroupData.printWorkSize();
  copyNodeDataToGroupData.execute();

#ifdef INDSOFT
  memCpyStream.sync();
  this->maxLocalEps = tree.multipole[0*3 + 1].w; //Softening value
#else
#endif

  //Get the local domain boundary based on group positions and sizes
  real4 r_min, r_max;
  getBoundariesGroups(tree, r_min, r_max);
#endif
}
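//----------------------------------------------------------------------------
//Host-side sketch of the leaf work done by propsLeafD: the monopole is the
//total mass plus the mass-weighted mean position (center of mass), while the
//node bounds are the component-wise min/max over the leaf's bodies. A
//double-precision reference, assuming the mass is stored in pos.w as is
//common for float4 body arrays; the real kernel also accumulates the
//quadrupole moments.
//----------------------------------------------------------------------------
struct LeafProps { double4 com; real4 lo, hi; }; //com.w holds the total mass

static LeafProps leaf_props_host(const float4 *pos, int first, int n)
{
  LeafProps p;
  p.com = (double4){0.0, 0.0, 0.0, 0.0};
  p.lo  = (real4){+1e10, +1e10, +1e10, 0};
  p.hi  = (real4){-1e10, -1e10, -1e10, 0};

  for (int i = first; i < first + n; i++)
  {
    const double m = pos[i].w;
    p.com.x += m * pos[i].x;
    p.com.y += m * pos[i].y;
    p.com.z += m * pos[i].z;
    p.com.w += m;

    p.lo.x = fmin(p.lo.x, pos[i].x);  p.hi.x = fmax(p.hi.x, pos[i].x);
    p.lo.y = fmin(p.lo.y, pos[i].y);  p.hi.y = fmax(p.hi.y, pos[i].y);
    p.lo.z = fmin(p.lo.z, pos[i].z);  p.hi.z = fmax(p.hi.z, pos[i].z);
  }

  if (p.com.w > 0.0) //normalize to get the center of mass
  {
    p.com.x /= p.com.w;
    p.com.y /= p.com.w;
    p.com.z /= p.com.w;
  }
  return p;
}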