//Direct (O(n_dust * n)) gravity from all bodies onto the dust particles.
//Configures the shared-memory direct-summation kernel and launches it
//asynchronously on the gravity stream.
void octree::direct_dust(tree_structure &tree)
{
  //Nothing to do when there is no dust
  if(tree.n_dust == 0) return;

  const int blockSize = 256;
  const int numBlocks = (tree.n_dust + blockSize - 1) / blockSize;

  //Kernel arguments: dust accelerations (out), dust positions and predicted
  //body positions (in), particle counts, softening^2 and a float4
  //shared-memory staging area of one entry per thread.
  directGrav.set_arg<cl_mem>(0, tree.dust_acc1.p());
  directGrav.set_arg<cl_mem>(1, tree.dust_pos.p());
  directGrav.set_arg<cl_mem>(2, tree.bodies_Ppos.p());
  directGrav.set_arg<int>   (3, &tree.n_dust);
  directGrav.set_arg<int>   (4, &tree.n);
  directGrav.set_arg<float> (5, &(this->eps2));
  directGrav.set_arg<float4>(6, NULL, 256);

  //One thread per dust particle, rounded up to whole blocks
  std::vector<size_t> local(2), global(2);
  local [0] = blockSize;                       local [1] = 1;
  global[0] = (size_t)numBlocks * blockSize;   global[1] = 1;
  directGrav.setWork(global, local);

  directGrav.execute(gravStream->s()); //First half
}
//Radix sort of 32bit keys with attached 32bit values, one bit per pass:
//per-block histogram (sortCount) -> exclusive scan over the block counts
//(exScanBlock) -> scatter (sortMove). Passes after the first ping-pong
//between the *Output and *APing buffers.
//NOTE(review): even pass indices write to keysOutput/valuesOutput, odd
//ones to keysAPing/valuesAPing, so with numberOfBits == 32 the final
//data ends up in the APing buffers -- verify against the buffers the
//caller (gpuSort) reads afterwards.
void octree::gpuSort_32b(my_dev::context &devContext,
                         my_dev::dev_mem<uint> &srcKeys,      my_dev::dev_mem<uint> &srcValues,
                         my_dev::dev_mem<int>  &keysOutput,   my_dev::dev_mem<uint> &keysAPing,
                         my_dev::dev_mem<uint> &valuesOutput, my_dev::dev_mem<uint> &valuesAPing,
                         int N, int numberOfBits)
{
  //Bit position processed by the current pass; re-bound via set_arg each pass
  int bitIdx = 0;

  //Step 1, do the count
  //Memory that should be alloced outside the function:

  //Work division over 480 blocks (120 procs * 4), 64 items per job;
  //the remainder is handled as 'extraElements' starting at 'extraOffset'
  setupParams sParam;
  sParam.jobs = (N / 64) / 480; //64=32*2 2 items per look, 480 is 120*4, number of procs
  sParam.blocksWithExtraJobs = (N / 64) % 480;
  sParam.extraElements = N % 64;
  sParam.extraOffset = N - sParam.extraElements;

  sortCount.set_arg<cl_mem>(0, srcKeys.p());
  sortCount.set_arg<cl_mem>(1, this->devMemCounts.p());
  sortCount.set_arg<uint>(2, &N);
  sortCount.set_arg<int>(3, NULL, 128);//smem size
  sortCount.set_arg<setupParams>(4, &sParam);
  sortCount.set_arg<int>(5, &bitIdx);

  vector<size_t> localWork(2), globalWork(2);
  globalWork[0] = 32*120;   globalWork[1] = 4;
  localWork [0] = 32;       localWork [1] = 4;
  sortCount.setWork(globalWork, localWork);

  ///////////////

  //Exclusive scan over the 120*4 per-block counters; total goes to devMemCountsx
  exScanBlock.set_arg<cl_mem>(0, this->devMemCounts.p());
  int blocks = 120*4;
  exScanBlock.set_arg<int>(1, &blocks);
  exScanBlock.set_arg<cl_mem>(2, this->devMemCountsx.p());
  exScanBlock.set_arg<int>(3, NULL, 512); //shared memory allocation

  globalWork[0] = 512;  globalWork[1] = 1;
  localWork [0] = 512;  localWork [1] = 1;
  exScanBlock.setWork(globalWork, localWork);

  //////////////

  //Scatter keys and values into their sorted positions for the current bit
  sortMove.set_arg<cl_mem>(0, srcKeys.p());
  sortMove.set_arg<cl_mem>(1, keysOutput.p());
  sortMove.set_arg<cl_mem>(2, srcValues.p());
  sortMove.set_arg<cl_mem>(3, valuesOutput.p());
  sortMove.set_arg<cl_mem>(4, this->devMemCounts.p());
  sortMove.set_arg<uint>(5, &N);
  sortMove.set_arg<uint>(6, NULL, 192);  //Dynamic shared memory 128+64 , prefux sum buffer
  sortMove.set_arg<uint>(7, NULL, 64*4); //Dynamic shared memory stage buffer
  sortMove.set_arg<uint>(8, NULL, 64*4); //Dynamic shared memory stage_values buffer
  sortMove.set_arg<setupParams>(9, &sParam);
  sortMove.set_arg<int>(10, &bitIdx);

  globalWork[0] = 120*32;  globalWork[1] = 4;
  localWork [0] = 32;      localWork [1] = 4;
  sortMove.setWork(globalWork, localWork);

  bool pingPong = false;

  //Execute bitIdx 0
  sortCount.execute(execStream->s());
  exScanBlock.execute(execStream->s());
  sortMove.execute(execStream->s());

  //Swap buffers: pass 1 reads the pass-0 result from keysOutput/valuesOutput
  //and writes into the APing buffers
  sortCount.set_arg<cl_mem>(0, keysOutput.p());
  sortMove.set_arg<cl_mem>(0, keysOutput.p());
  sortMove.set_arg<cl_mem>(1, keysAPing.p());
  sortMove.set_arg<cl_mem>(2, valuesOutput.p());
  sortMove.set_arg<cl_mem>(3, valuesAPing.p());

  //Remaining bits, ping ponging buffers
  for(int i=1; i < numberOfBits; i++)
  {
    bitIdx = i;
    sortCount.set_arg<int>(5, &bitIdx);
    sortMove.set_arg<int>(10, &bitIdx);

    sortCount.execute(execStream->s());
    exScanBlock.execute(execStream->s());
    sortMove.execute(execStream->s());

    //Switch buffers for the next pass
    if(pingPong)
    {
      sortCount.set_arg<cl_mem>(0, keysOutput.p());
      sortMove.set_arg<cl_mem>(0, keysOutput.p());
      sortMove.set_arg<cl_mem>(1, keysAPing.p());
      sortMove.set_arg<cl_mem>(2, valuesOutput.p());
      sortMove.set_arg<cl_mem>(3, valuesAPing.p());
      pingPong = false;
    }
    else
    {
      sortCount.set_arg<cl_mem>(0, keysAPing.p());
      sortMove.set_arg<cl_mem>(0, keysAPing.p());
      sortMove.set_arg<cl_mem>(1, keysOutput.p());
      sortMove.set_arg<cl_mem>(2, valuesAPing.p());
      sortMove.set_arg<cl_mem>(3, valuesOutput.p());
      pingPong = true;
    }
  }
}
//Splits an array of integers, the values in srcValid indicate if a //value is valid (1 == valid anything else is UNvalid) returns the //splitted values in the output array (first all valid //number and then the invalid ones) and the total //number of valid items is stored in 'count' void octree::gpuSplit(my_dev::context &devContext, my_dev::dev_mem<uint> &srcValues, my_dev::dev_mem<uint> &output, int N, int *validCount) // if validCount NULL leave count on device { //In the next step we associate the GPU memory with the Kernel arguments //my_dev::dev_mem<uint> counts(devContext, 512), countx(devContext, 512); //Memory that should be alloced outside the function: //devMemCounts and devMemCountsx // make sure previous reset has finished. this->devMemCountsx.waitForCopyEvent(); //Kernel configuration parameters setupParams sParam; sParam.jobs = (N / 64) / 480 ; //64=32*2 2 items per look, 480 is 120*4, number of procs sParam.blocksWithExtraJobs = (N / 64) % 480; sParam.extraElements = N % 64; sParam.extraOffset = N - sParam.extraElements; compactCount.set_arg<cl_mem>(0, srcValues.p()); compactCount.set_arg<cl_mem>(1, this->devMemCounts.p()); compactCount.set_arg<uint>(2, &N); compactCount.set_arg<int>(3, NULL, 128); compactCount.set_arg<setupParams>(4, &sParam); compactCount.set_arg<cl_mem>(5, this->devMemCountsx.p()); vector<size_t> localWork(2), globalWork(2); globalWork[0] = 32*120; globalWork[1] = 4; localWork [0] = 32; localWork[1] = 4; compactCount.setWork(globalWork, localWork); /////////////// exScanBlock.set_arg<cl_mem>(0, this->devMemCounts.p()); int blocks = 120*4; exScanBlock.set_arg<int>(1, &blocks); exScanBlock.set_arg<cl_mem>(2, this->devMemCountsx.p()); exScanBlock.set_arg<int>(3, NULL, 512); //shared memory allocation globalWork[0] = 512; globalWork[1] = 1; localWork [0] = 512; localWork [1] = 1; exScanBlock.setWork(globalWork, localWork); ////////////// splitMove.set_arg<cl_mem>(0, srcValues.p()); splitMove.set_arg<cl_mem>(1, output.p()); 
splitMove.set_arg<cl_mem>(2, this->devMemCounts.p()); splitMove.set_arg<uint>(3, &N); splitMove.set_arg<uint>(4, NULL, 192); //Dynamic shared memory splitMove.set_arg<setupParams>(5, &sParam); globalWork[0] = 120*32; globalWork[1] = 4; localWork [0] = 32; localWork [1] = 4; splitMove.setWork(globalWork, localWork); //////////////////// compactCount.execute(execStream->s()); exScanBlock.execute(execStream->s()); splitMove.execute(execStream->s()); if (validCount) { this->devMemCountsx.d2h(); *validCount = this->devMemCountsx[0]; } }
//Tree-property computation entry point (single precision version).
//NOTE(review): the entire single-precision implementation below is
//compiled out with '#if 0' (it starts with an fprintf+exit guard and is
//marked as out of date); the live path simply forwards to
//compute_properties_double().
void octree::compute_properties (tree_structure &tree) {
#if 0
  fprintf(stderr,"This file is not up to date anymore! %s\n", __FILE__);
  exit(0);

  //Computes the tree-properties (size, cm, monopole, quadropole, etc)
  //start the kernel for the leaf-type nodes
  propsLeaf.set_arg<int>(0,    &tree.n_leafs);
  propsLeaf.set_arg<cl_mem>(1, tree.leafNodeIdx.p());
  propsLeaf.set_arg<cl_mem>(2, tree.node_bodies.p());
  propsLeaf.set_arg<cl_mem>(3, tree.bodies_Ppos.p());
//   propsLeaf.set_arg<cl_mem>(3, tree.bodies_pos.p());
  propsLeaf.set_arg<cl_mem>(4, tree.multipole.p());
  propsLeaf.set_arg<cl_mem>(5, tree.nodeLowerBounds.p());
  propsLeaf.set_arg<cl_mem>(6, tree.nodeUpperBounds.p());
  propsLeaf.set_arg<cl_mem>(7, tree.lowerBounds.p());
  propsLeaf.set_arg<cl_mem>(8, tree.upperBounds.p());
  propsLeaf.set_arg<cl_mem>(9, tree.bodies_Pvel.p());  //Velocity to get max eps
  propsLeaf.setWork(tree.n_leafs, 128);
  printf("PropsLeaf: "); propsLeaf.printWorkSize();
  propsLeaf.execute();

  //Non-leaf nodes, processed level by level from the bottom up
  int temp = tree.n_nodes-tree.n_leafs;
  propsNonLeaf.set_arg<int>(0,    &temp);
  propsNonLeaf.set_arg<cl_mem>(1, tree.leafNodeIdx.p());
  propsNonLeaf.set_arg<cl_mem>(2, tree.node_level_list.p());
  propsNonLeaf.set_arg<cl_mem>(3, tree.n_children.p());
  propsNonLeaf.set_arg<cl_mem>(4, tree.multipole.p());
  propsNonLeaf.set_arg<cl_mem>(5, tree.nodeLowerBounds.p());
  propsNonLeaf.set_arg<cl_mem>(6, tree.nodeUpperBounds.p());

  for(int i=tree.n_levels; i >= 1; i--)
  {
    propsNonLeaf.set_arg<int>(0,    &i);
    {
      vector<size_t> localWork(2), globalWork(2);
      int totalOnThisLevel;
      totalOnThisLevel = tree.node_level_list[i]-tree.node_level_list[i-1];
      propsNonLeaf.setWork(totalOnThisLevel, 128);
      printf("PropsNonLeaf, nodes on level %d : %d (start: %d end: %d) , config: \t",
             i, totalOnThisLevel, tree.node_level_list[i-1], tree.node_level_list[i]);
      propsNonLeaf.printWorkSize();
    }
    propsNonLeaf.set_arg<int>(0,    &i); //set the level
    propsNonLeaf.execute();
  }

  //Scale/convert the node data and apply the opening criterion (theta)
  float theta2 = theta;
  propsScaling.set_arg<int>(0,    &tree.n_nodes);
  propsScaling.set_arg<real4>(1,  &tree.corner);
  propsScaling.set_arg<cl_mem>(2, tree.multipole.p());
  propsScaling.set_arg<cl_mem>(3, tree.nodeLowerBounds.p());
  propsScaling.set_arg<cl_mem>(4, tree.nodeUpperBounds.p());
  propsScaling.set_arg<cl_mem>(5, tree.n_children.p());
  propsScaling.set_arg<cl_mem>(6, tree.node_data.p());
  propsScaling.set_arg<float >(7, &theta2);
  propsScaling.set_arg<cl_mem>(8, tree.boxSizeInfo.p());
  propsScaling.set_arg<cl_mem>(9, tree.boxCenterInfo.p());
  propsScaling.setWork(tree.n_nodes, 128);
  printf("propsScaling: \t "); propsScaling.printWorkSize();
  propsScaling.execute();

  //tree.multipole.d2h();
  //printf("COM: %f %f %f %f \n",tree.multipole[0].x, tree.multipole[0].y, tree.multipole[0].z, tree.multipole[0].w);

#ifdef USE_CUDA
  cuCtxSynchronize();
#else
  clFinish(devContext.get_command_queue());
#endif

  tree.nodeLowerBounds.d2h();
  tree.nodeUpperBounds.d2h();

  //Derive the per-group data from the node data
  copyNodeDataToGroupData.set_arg<int>(0,    &tree.n_groups);
  copyNodeDataToGroupData.set_arg<int>(1,    &tree.n_nodes);
  copyNodeDataToGroupData.set_arg<cl_mem>(2, tree.node_data.p());
  copyNodeDataToGroupData.set_arg<cl_mem>(3, tree.group_data.p());
  copyNodeDataToGroupData.set_arg<cl_mem>(4, tree.node_bodies.p());
  copyNodeDataToGroupData.set_arg<cl_mem>(5, tree.group_list.p());
  copyNodeDataToGroupData.set_arg<cl_mem>(6, tree.boxCenterInfo.p());
  copyNodeDataToGroupData.set_arg<cl_mem>(7, tree.boxSizeInfo.p());
  copyNodeDataToGroupData.set_arg<cl_mem>(8, tree.groupCenterInfo.p());
  copyNodeDataToGroupData.set_arg<cl_mem>(9, tree.groupSizeInfo.p());
  copyNodeDataToGroupData.setWork(tree.n_nodes, 128);
  printf("copyNodeDataToGroupData: \t ");
  copyNodeDataToGroupData.printWorkSize();
  copyNodeDataToGroupData.execute();

//   tree.multipole.d2h();
//   testRes.d2h();
//   for(int i=0; i < tree.n_nodes; i++)
//   for(int i=tree.n_nodes-10; i < tree.n_nodes; i++)
  /*
  for(int i=0; i < 10; i++)
  {
    fprintf(stderr,"%d\t%f\t%f\t%f\t%f\n", i,
            tree.multipole[i*3+0].x,tree.multipole[i*3+0].y,tree.multipole[i*3+0].z, tree.multipole[i*3+0].w);
//     fprintf(stderr,"%d\t%f\t%f\t%f\t%f\t%f\n", i,
//             tree.multipole[i*3+1].x,tree.multipole[i*3+1].y,tree.multipole[i*3+1].z, tree.multipole[i*3+1].w, testRes[i]);
    fprintf(stderr,"%d\t%f\t%f\t%f\t%f\t%f\n", i,
            tree.multipole[i*3+1].x,tree.multipole[i*3+1].y,tree.multipole[i*3+1].z, tree.multipole[i*3+1].w, 0);
    fprintf(stderr,"%d\t%f\t%f\t%f\t%f\n", i,
            tree.multipole[i*3+2].x,tree.multipole[i*3+2].y,tree.multipole[i*3+2].z, tree.multipole[i*3+2].w);
  }
  exit(0);
  */
#else
  //Live path: double precision implementation
  compute_properties_double(tree);
#endif
}
//Double precision computation of the tree properties (node boundaries,
//center of mass, monopole and quadrupole moments), followed by scaling
//of the double precision results into the single precision buffers used
//during the tree walk, plus per-group data and local domain boundaries.
//Scratch space (multipoleD and the node bounds) is carved out of
//tree.generalBuffer1.
void octree::compute_properties_double(tree_structure &tree) {

  /*****************************************************
  Assign the memory buffers, note that we check the size first
  and if needed we increase the size of the generalBuffer1
  Size required:
  - multipoleD -> double4*3_n_nodes -> 6*n_nodes*uint4
  - lower/upperbounds -> 2*n_nodes*uint4
  - node lower/upper -> 2*n_nodes*uint4
  - SUM: 10*n_nodes*uint4
  - generalBuffer1 has default size: 3*N*uint4
  check if 10*n_nodes < 3*N if so realloc
  *****************************************************/
  if(10*tree.n_nodes > 3*tree.n)
  {
    fprintf(stderr, "Resizeing the generalBuffer1 \n");
    //Resize to the full 10*n_nodes*uint4 documented above. The previous
    //8*tree.n_nodes*4 left no slack for the getAllignmentOffset paddings
    //below and was inconsistent with both the 10*n_nodes guard and the
    //float4 overload of compute_properties, which resizes to 10*n_nodes*4.
    tree.generalBuffer1.cresize(10*tree.n_nodes*4, false);
  }

  //multipoleD: 3 double4 per node at the start of the buffer
  my_dev::dev_mem<double4> multipoleD(devContext);
  multipoleD.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                          tree.generalBuffer1.get_flags(),
                          tree.generalBuffer1.get_devMem(),
                          &tree.generalBuffer1[0], 0,
                          3*tree.n_nodes, getAllignmentOffset(0));

  //Offset is in uint, so: double4 = 8uint*3*n_nodes
  tree.nodeLowerBounds.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                          tree.generalBuffer1.get_flags(),
                          tree.generalBuffer1.get_devMem(),
                          &tree.generalBuffer1[8*3*tree.n_nodes],
                          8*3*tree.n_nodes,
                          tree.n_nodes, getAllignmentOffset(8*3*tree.n_nodes));

  int prevOffsetSum = getAllignmentOffset(8*3*tree.n_nodes); //The offset of output

  tree.nodeUpperBounds.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                          tree.generalBuffer1.get_flags(),
                          tree.generalBuffer1.get_devMem(),
                          &tree.generalBuffer1[8*3*tree.n_nodes + 4*tree.n_nodes],
                          8*3*tree.n_nodes + 4*tree.n_nodes,
                          tree.n_nodes,
                          prevOffsetSum + getAllignmentOffset(8*3*tree.n_nodes + 4*tree.n_nodes + prevOffsetSum));

  //Computes the tree-properties (size, cm, monopole, quadropole, etc)
  //start the kernel for the leaf-type nodes
  propsLeafD.set_arg<int>(0,    &tree.n_leafs);
  propsLeafD.set_arg<cl_mem>(1, tree.leafNodeIdx.p());
  propsLeafD.set_arg<cl_mem>(2, tree.node_bodies.p());
  propsLeafD.set_arg<cl_mem>(3, tree.bodies_Ppos.p());
  propsLeafD.set_arg<cl_mem>(4, multipoleD.p());
  propsLeafD.set_arg<cl_mem>(5, tree.nodeLowerBounds.p());
  propsLeafD.set_arg<cl_mem>(6, tree.nodeUpperBounds.p());
  propsLeafD.set_arg<cl_mem>(7, tree.bodies_Pvel.p());  //Velocity to get max eps
  propsLeafD.setWork(tree.n_leafs, 128);
  printf("PropsLeaf: "); propsLeafD.printWorkSize();
  propsLeafD.execute();

  //Non-leaf nodes, processed per tree level
  int temp = tree.n_nodes-tree.n_leafs;
  propsNonLeafD.set_arg<int>(0,    &temp);
  propsNonLeafD.set_arg<cl_mem>(1, tree.leafNodeIdx.p());
  propsNonLeafD.set_arg<cl_mem>(2, tree.node_level_list.p());
  propsNonLeafD.set_arg<cl_mem>(3, tree.n_children.p());
  propsNonLeafD.set_arg<cl_mem>(4, multipoleD.p());
  propsNonLeafD.set_arg<cl_mem>(5, tree.nodeLowerBounds.p());
  propsNonLeafD.set_arg<cl_mem>(6, tree.nodeUpperBounds.p());

  //Work from the bottom up
  for(int i=tree.n_levels; i >= 1; i--)
  {
    propsNonLeafD.set_arg<int>(0,    &i);
    {
      vector<size_t> localWork(2), globalWork(2);
      int totalOnThisLevel;
      totalOnThisLevel = tree.node_level_list[i]-tree.node_level_list[i-1];
      propsNonLeafD.setWork(totalOnThisLevel, 128);
      printf("PropsNonLeaf, nodes on level %d : %d (start: %d end: %d) , config: \t",
             i, totalOnThisLevel, tree.node_level_list[i-1], tree.node_level_list[i]);
      propsNonLeafD.printWorkSize();
    }
    propsNonLeafD.set_arg<int>(0,    &i); //set the level
    propsNonLeafD.execute();
  }

  //Scale the double precision results into the single precision
  //tree-walk buffers and apply the opening criterion (theta)
  float theta2 = theta;
  propsScalingD.set_arg<int>(0,    &tree.n_nodes);
  propsScalingD.set_arg<real4>(1,  &tree.corner);
  propsScalingD.set_arg<cl_mem>(2, multipoleD.p());
  propsScalingD.set_arg<cl_mem>(3, tree.nodeLowerBounds.p());
  propsScalingD.set_arg<cl_mem>(4, tree.nodeUpperBounds.p());
  propsScalingD.set_arg<cl_mem>(5, tree.n_children.p());
  propsScalingD.set_arg<cl_mem>(6, tree.multipole.p());
  propsScalingD.set_arg<float >(7, &theta2);
  propsScalingD.set_arg<cl_mem>(8, tree.boxSizeInfo.p());
  propsScalingD.set_arg<cl_mem>(9, tree.boxCenterInfo.p());
  propsScalingD.set_arg<cl_mem>(10, tree.node_bodies.p());
  propsScalingD.setWork(tree.n_nodes, 128);
  printf("propsScaling: \t "); propsScalingD.printWorkSize();
  propsScalingD.execute();

#ifdef INDSOFT
  //If we use individual softening we need to get the max softening value
  //to be broadcasted during the exchange of the LET boundaries.
  //Only copy the root node that contains the max value
  my_dev::dev_stream memCpyStream;
  tree.multipole.d2h(3, false, memCpyStream.s());
#endif

  //Set the group properties, note that it is not based on the nodes anymore
  //but on self created groups based on particle order setPHGroupData
  copyNodeDataToGroupData.set_arg<int>(0,    &tree.n_groups);
  copyNodeDataToGroupData.set_arg<int>(1,    &tree.n);
  copyNodeDataToGroupData.set_arg<cl_mem>(2, tree.bodies_Ppos.p());
  copyNodeDataToGroupData.set_arg<cl_mem>(3, tree.group_list_test.p());
  copyNodeDataToGroupData.set_arg<cl_mem>(4, tree.groupCenterInfo.p());
  copyNodeDataToGroupData.set_arg<cl_mem>(5, tree.groupSizeInfo.p());
  copyNodeDataToGroupData.setWork(-1, NCRIT, tree.n_groups);
  copyNodeDataToGroupData.printWorkSize();
  copyNodeDataToGroupData.execute();

#ifdef INDSOFT
  memCpyStream.sync();
  this->maxLocalEps = tree.multipole[0*3 + 1].w; //Softening value
#else
#endif

  //Get the local domain boundary based on group positions and sizes
  real4 r_min, r_max;
  getBoundariesGroups(tree, r_min, r_max);

#if 0
  //Write the tree structure to file (debugging aid)
  string nodeFileName = "fullTreeStructure.txt";
  char fileName[256];
  sprintf(fileName, "fullTreeStructure-%d.txt", mpiGetRank());
  ofstream nodeFile;
  //nodeFile.open(nodeFileName.c_str());
  nodeFile.open(fileName);
  tree.multipole.d2h();
  tree.boxSizeInfo.d2h();
  tree.boxCenterInfo.d2h();
  for(int i=0; i < tree.n_nodes; i++)
  {
    //nodeFile << i << "\t" << tree.boxCenterInfo[i].x << "\t" << tree.boxCenterInfo[i].y;
    //nodeFile << "\t" << 2*tree.boxSizeInfo[i].x << "\t" << 2*tree.boxSizeInfo[i].y << "\t";
    nodeFile << i << "\t" << tree.boxCenterInfo[i].x-tree.boxSizeInfo[i].x << "\t" << tree.boxCenterInfo[i].y-tree.boxSizeInfo[i].y;
    nodeFile << "\t" << tree.boxCenterInfo[i].x+tree.boxSizeInfo[i].x << "\t" << tree.boxCenterInfo[i].y+tree.boxSizeInfo[i].y << "\t";
    nodeFile << tree.multipole[i*3+0].x << "\t" << tree.multipole[i*3+0].w << "\n";
  }
  nodeFile.close();

  sprintf(fileName, "fullTreeStructureParticles-%d.txt", mpiGetRank());
  ofstream partFile;
  partFile.open(fileName);
  tree.bodies_pos.d2h();
  for(int i=0; i < tree.n; i++)
  {
    float4 pos = tree.bodies_pos[i];
    partFile << i << "\t" << pos.x << "\t" << pos.y << "\t" << pos.z << endl;
  }
  partFile.close();
#endif
}
// If srcValues and buffer are different, then the original values
// are preserved, if they are the same srcValues will be overwritten
//Sorts uint4 keys that consist of 'subItems' 32bit words (z word first,
//x word last) by performing one 32bit radix sort plus a full reorder per
//word (least significant word first). Scratch buffers are carved out of
//tree.generalBuffer1 behind the two uint4 arrays already stored there.
void octree::gpuSort(my_dev::context &devContext,
                     my_dev::dev_mem<uint4> &srcValues,
                     my_dev::dev_mem<uint4> &output,
                     my_dev::dev_mem<uint4> &buffer,
                     int N, int numberOfBits, int subItems,
                     tree_structure &tree)
{
  //Extra buffer values
//   my_dev::dev_mem<uint> simpleKeys(devContext, N);    //Int keys,
//   my_dev::dev_mem<uint> permutation(devContext, N);   //Permutation values, for sorting the int4 data
//   my_dev::dev_mem<int> output32b(devContext, N);      //Permutation values, for sorting the int4 data
//   my_dev::dev_mem<uint> valuesOutput(devContext, N);  //Buffers for the values which are the indexes

  my_dev::dev_mem<uint> simpleKeys(devContext);    //Int keys,
  my_dev::dev_mem<uint> permutation(devContext);   //Permutation values, for sorting the int4 data
  my_dev::dev_mem<int>  output32b(devContext);     //Permutation values, for sorting the int4 data
  my_dev::dev_mem<uint> valuesOutput(devContext);  //Buffers for the values which are the indexes

  //Scratch space starts after the two leading uint4 arrays (2*4*N uint)
  //in generalBuffer1; each sub-buffer is padded via getAllignmentOffset
  int prevOffsetSum = getAllignmentOffset(4*N); //The offset of output

  simpleKeys.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                          tree.generalBuffer1.get_flags(),
                          tree.generalBuffer1.get_devMem(),
                          &tree.generalBuffer1[8*N], 8*N, N,
                          prevOffsetSum + getAllignmentOffset(8*N + prevOffsetSum)); //Ofset 8 since we have 2 uint4 before

  prevOffsetSum += getAllignmentOffset(8*N + prevOffsetSum);

  permutation.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                          tree.generalBuffer1.get_flags(),
                          tree.generalBuffer1.get_devMem(),
                          &tree.generalBuffer1[9*N], 9*N, N,
                          prevOffsetSum + getAllignmentOffset(9*N + prevOffsetSum)); //N elements after simpleKeys

  prevOffsetSum += getAllignmentOffset(9*N + prevOffsetSum);

  output32b.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                          tree.generalBuffer1.get_flags(),
                          tree.generalBuffer1.get_devMem(),
                          &tree.generalBuffer1[10*N], 10*N, N,
                          prevOffsetSum + getAllignmentOffset(10*N + prevOffsetSum)); //N elements after permutation

  prevOffsetSum += getAllignmentOffset(10*N + prevOffsetSum);

  valuesOutput.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                          tree.generalBuffer1.get_flags(),
                          tree.generalBuffer1.get_devMem(),
                          &tree.generalBuffer1[11*N], 11*N, N,
                          prevOffsetSum + getAllignmentOffset(11*N + prevOffsetSum)); //N elements after output32b

  //Dimensions for the kernels that shuffle and extract data
  const int blockSize = 256;
  int ng = (N)/blockSize + 1;
  int nx = (int)sqrt(ng);
  int ny = (ng-1)/nx + 1;

  vector<size_t> localWork(2), globalWork(2);
  globalWork[0] = nx*blockSize;   globalWork[1] = ny;
  localWork [0] = blockSize;      localWork [1] = 1;
  extractInt.setWork(globalWork, localWork);
  fillSequence.setWork(globalWork, localWork);
  reOrderKeysValues.setWork(globalWork, localWork);

  //Idx depends on subitems, z goes first, x last if subitems = 3
  //subitems = 3, than idx=2
  //subitems = 2, than idx=1
  //subitems = 1, than idx=0
  //intIdx = subItems-1
  int intIdx = subItems-1;

  extractInt.set_arg<cl_mem>(0, srcValues.p());
  extractInt.set_arg<cl_mem>(1, simpleKeys.p());
  extractInt.set_arg<uint>(2, &N);
  extractInt.set_arg<int>(3, &intIdx);//bit idx

  fillSequence.set_arg<cl_mem>(0, permutation.p());
  fillSequence.set_arg<uint>(1, &N);

  reOrderKeysValues.set_arg<cl_mem>(0, srcValues.p());
  reOrderKeysValues.set_arg<cl_mem>(1, output.p());
  reOrderKeysValues.set_arg<cl_mem>(2, valuesOutput.p());
  reOrderKeysValues.set_arg<uint>(3, &N);

  extractInt.execute();
  fillSequence.execute();

  //Now sort the first 32bit keys
  //Using 32bit sort with key and value seperated
  gpuSort_32b(devContext, simpleKeys, permutation,
//             output32b, aPing32b,
              output32b, simpleKeys,
//             valuesOutput,valuesAPing,
              valuesOutput,permutation,
//             count,
              N, 32);

  //Now reorder the main keys
  //Use output as the new output/src value thing buffer
  reOrderKeysValues.execute();

  if(subItems == 1)
  {
    //Only doing one 32bit sort. Data is already in output so done
    return;
  }

  //2nd set of 32bit keys
  //Idx depends on subitems, z goes first, x last if subitems = 3
  //subitems = 3, than idx=1
  //subitems = 2, than idx=0
  //subitems = 1, completed previous round
  //intIdx = subItems-2
  intIdx = subItems-2;

  extractInt.set_arg<cl_mem>(0, output.p());
  extractInt.set_arg<int>(3, &intIdx);//smem size

  reOrderKeysValues.set_arg<cl_mem>(0, output.p());
  reOrderKeysValues.set_arg<cl_mem>(1, buffer.p());

  extractInt.execute();
  fillSequence.execute();

  //Now sort the 2nd 32bit keys
  //Using 32bit sort with key and value seperated
  gpuSort_32b(devContext, simpleKeys, permutation,
              output32b, simpleKeys,
//             output32b, aPing32b,
//             valuesOutput,valuesAPing,
              valuesOutput,permutation,
//             count,
              N, 32);

  reOrderKeysValues.execute();

  if(subItems == 2)
  {
    //Doing two 32bit sorts. Data is in buffer
    //so move the data from buffer to output
    output.copy(buffer, buffer.get_size());
    return;
  }

  //3th set of 32bit keys
  //Idx depends on subitems, z goes first, x last if subitems = 3
  //subitems = 3, than idx=0
  //subitems = 2, completed previous round
  //subitems = 1, completed previous round
  //intIdx = subItems-2
  intIdx = 0;

  extractInt.set_arg<cl_mem>(0, buffer.p());
  extractInt.set_arg<int>(3, &intIdx);//integer idx

  reOrderKeysValues.set_arg<cl_mem>(0, buffer.p());
  reOrderKeysValues.set_arg<cl_mem>(1, output.p());

  extractInt.execute();
  fillSequence.execute();

  //Now sort the 32bit keys
  //Using int2 with key and value combined
  //See sortArray4
  //Using key and value in a seperate array
  //Now sort the 2nd 32bit keys
  //Using 32bit sort with key and value seperated
  gpuSort_32b(devContext, simpleKeys, permutation,
              output32b, simpleKeys,
//             output32b, aPing32b,
//             valuesOutput,valuesAPing,
              valuesOutput,permutation,
//             count,
              N, 32);

  reOrderKeysValues.execute();

  clFinish(devContext.get_command_queue());

//   fprintf(stderr, "sortArray2 done in %g sec (Without memory alloc & compilation) \n", get_time() - t0);
}
//Splits an array of integers, the values in srcValid indicate if a //value is valid (1 == valid anything else is UNvalid) returns the //splitted values in the output array (first all valid //number and then the invalid ones) and the total //number of valid items is stored in 'count' void octree::gpuSplit(my_dev::context &devContext, my_dev::dev_mem<uint> &srcValues, my_dev::dev_mem<uint> &output, int N, int *validCount) { // In the next step we associate the GPU memory with the Kernel arguments // my_dev::dev_mem<uint> counts(devContext, 512), countx(devContext, 512); //Memory that should be alloced outside the function: //devMemCounts and devMemCountsx //Kernel configuration parameters setupParams sParam; sParam.jobs = (N / 64) / 480 ; //64=32*2 2 items per look, 480 is 120*4, number of procs sParam.blocksWithExtraJobs = (N / 64) % 480; sParam.extraElements = N % 64; sParam.extraOffset = N - sParam.extraElements; // printf("Param info: %d %d %d %d \n", sParam.jobs, sParam.blocksWithExtraJobs, sParam.extraElements, sParam.extraOffset); compactCount.set_arg<cl_mem>(0, srcValues.p()); compactCount.set_arg<cl_mem>(1, this->devMemCounts.p()); compactCount.set_arg<uint>(2, &N); compactCount.set_arg<int>(3, NULL, 128); compactCount.set_arg<setupParams>(4, &sParam); vector<size_t> localWork(2), globalWork(2); globalWork[0] = 32*120; globalWork[1] = 4; localWork [0] = 32; localWork[1] = 4; compactCount.setWork(globalWork, localWork); /////////////// exScanBlock.set_arg<cl_mem>(0, this->devMemCounts.p()); int blocks = 120*4; exScanBlock.set_arg<int>(1, &blocks); exScanBlock.set_arg<cl_mem>(2, this->devMemCountsx.p()); exScanBlock.set_arg<int>(3, NULL, 512); //shared memory allocation globalWork[0] = 512; globalWork[1] = 1; localWork [0] = 512; localWork [1] = 1; exScanBlock.setWork(globalWork, localWork); ////////////// splitMove.set_arg<cl_mem>(0, srcValues.p()); splitMove.set_arg<cl_mem>(1, output.p()); splitMove.set_arg<cl_mem>(2, this->devMemCounts.p()); 
splitMove.set_arg<uint>(3, &N); splitMove.set_arg<uint>(4, NULL, 192); //Dynamic shared memory splitMove.set_arg<setupParams>(5, &sParam); globalWork[0] = 120*32; globalWork[1] = 4; localWork [0] = 32; localWork [1] = 4; splitMove.setWork(globalWork, localWork); //////////////////// compactCount.execute(); // exit(0); // counts.d2h(); // for(int i=0; i < 482; i++) // { // printf("%d\t%d\n", i, counts[i]); // } // exScanBlock.execute(); splitMove.execute(); //TODO fix the damn clFinish function #ifdef USE_CUDA cuCtxSynchronize(); #else clFinish(devContext.get_command_queue()); #endif this->devMemCountsx.d2h(); *validCount = this->devMemCountsx[0]; //printf("Total number of valid items: %d \n", countx[0]); }
void octree::compute_properties(tree_structure &tree, my_dev::dev_mem<float4> &bodies_pos, int n_bodies) { /***************************************************** Assign the memory buffers, note that we check the size first and if needed we increase the size of the generalBuffer1 Size required: - multipoleD -> double4*3_n_nodes -> 2*3*n_nodes*uint4 - lower/upperbounds -> 2*n_nodes*uint4 - node lower/upper -> 2*n_nodes*uint4 - SUM: 10*n_nodes*uint4 - generalBuffer1 has default size: 3*N*uint4 check if 10*n_nodes < 3*N if so realloc (Note that generalBuffer might be larger because of tree-walk stack) *****************************************************/ if(10*tree.n_nodes > 3*tree.n) { #ifdef _DEBUG_PRINT_ fprintf(stderr, "Resizeing the generalBuffer1 \n"); #endif tree.generalBuffer1.cresize(10*tree.n_nodes*4, false); } my_dev::dev_mem<double4> multipoleD(devContext); my_dev::dev_mem<real4> nodeLowerBounds(devContext); //Lower bounds used for scaling? TODO my_dev::dev_mem<real4> nodeUpperBounds(devContext); //Upper bounds used for scaling? 
TODO multipoleD.cmalloc_copy(tree.generalBuffer1.get_pinned(), tree.generalBuffer1.get_flags(), tree.generalBuffer1.get_devMem(), &tree.generalBuffer1[0], 0, 3*tree.n_nodes, getAllignmentOffset(0)); //Offset is in uint, so: double4 = 8uint*3*n_nodes nodeLowerBounds.cmalloc_copy(tree.generalBuffer1.get_pinned(), tree.generalBuffer1.get_flags(), tree.generalBuffer1.get_devMem(), &tree.generalBuffer1[8*3*tree.n_nodes], 8*3*tree.n_nodes, tree.n_nodes, getAllignmentOffset(8*3*tree.n_nodes)); int prevOffsetSum = getAllignmentOffset(8*3*tree.n_nodes); //The offset of output nodeUpperBounds.cmalloc_copy(tree.generalBuffer1.get_pinned(), tree.generalBuffer1.get_flags(), tree.generalBuffer1.get_devMem(), &tree.generalBuffer1[8*3*tree.n_nodes + 4*tree.n_nodes], 8*3*tree.n_nodes + 4*tree.n_nodes, tree.n_nodes, prevOffsetSum + getAllignmentOffset(8*3*tree.n_nodes + 4*tree.n_nodes + prevOffsetSum)); //Computes the tree-properties (size, cm, monopole, quadropole, etc) //start the kernel for the leaf-type nodes propsLeafD.set_arg<int>(0, &tree.n_leafs); propsLeafD.set_arg<cl_mem>(1, tree.leafNodeIdx.p()); propsLeafD.set_arg<cl_mem>(2, tree.node_bodies.p()); propsLeafD.set_arg<cl_mem>(3, bodies_pos.p()); propsLeafD.set_arg<cl_mem>(4, multipoleD.p()); propsLeafD.set_arg<cl_mem>(5, nodeLowerBounds.p()); propsLeafD.set_arg<cl_mem>(6, nodeUpperBounds.p()); // propsLeafD.set_arg<cl_mem>(7, tree.bodies_Pvel.p()); //Velocity to get max eps propsLeafD.setWork(tree.n_leafs, 128); #ifdef _DEBUG_PRINT_ printf("PropsLeaf: "); propsLeafD.printWorkSize(); #endif propsLeafD.execute(); int temp = tree.n_nodes-tree.n_leafs; propsNonLeafD.set_arg<int>(0, &temp); propsNonLeafD.set_arg<cl_mem>(1, tree.leafNodeIdx.p()); propsNonLeafD.set_arg<cl_mem>(2, tree.node_level_list.p()); propsNonLeafD.set_arg<cl_mem>(3, tree.n_children.p()); propsNonLeafD.set_arg<cl_mem>(4, multipoleD.p()); propsNonLeafD.set_arg<cl_mem>(5, nodeLowerBounds.p()); propsNonLeafD.set_arg<cl_mem>(6, nodeUpperBounds.p()); //Work from 
the bottom up for(int i=tree.n_levels; i >= 1; i--) { propsNonLeafD.set_arg<int>(0, &i); { vector<size_t> localWork(2), globalWork(2); int totalOnThisLevel; totalOnThisLevel = tree.node_level_list[i]-tree.node_level_list[i-1]; propsNonLeafD.setWork(totalOnThisLevel, 128); #ifdef _DEBUG_PRINT_ printf("PropsNonLeaf, nodes on level %d : %d (start: %d end: %d) , config: \t", i, totalOnThisLevel, tree.node_level_list[i-1], tree.node_level_list[i]); #endif propsNonLeafD.printWorkSize(); } propsNonLeafD.set_arg<int>(0, &i); //set the level propsNonLeafD.execute(); } propsScalingD.set_arg<int>(0, &tree.n_nodes); propsScalingD.set_arg<cl_mem>(1, multipoleD.p()); propsScalingD.set_arg<cl_mem>(2, nodeLowerBounds.p()); propsScalingD.set_arg<cl_mem>(3, nodeUpperBounds.p()); propsScalingD.set_arg<cl_mem>(4, tree.n_children.p()); propsScalingD.set_arg<cl_mem>(5, tree.multipole.p()); propsScalingD.set_arg<float >(6, &theta); propsScalingD.set_arg<cl_mem>(7, tree.boxSizeInfo.p()); propsScalingD.set_arg<cl_mem>(8, tree.boxCenterInfo.p()); propsScalingD.set_arg<cl_mem>(9, tree.node_bodies.p()); propsScalingD.setWork(tree.n_nodes, 128); #ifdef _DEBUG_PRINT_ printf("propsScaling: \t "); propsScalingD.printWorkSize(); #endif propsScalingD.execute(); #if 0 #ifdef INDSOFT //If we use individual softening we need to get the max softening value //to be broadcasted during the exchange of the LET boundaries. 
//Only copy the root node that contains the max value my_dev::dev_stream memCpyStream; tree.multipole.d2h(3, false, memCpyStream.s()); #endif //Set the group properties, note that it is not based on the nodes anymore //but on self created groups based on particle order setPHGroupData copyNodeDataToGroupData.set_arg<int>(0, &tree.n_groups); copyNodeDataToGroupData.set_arg<int>(1, &tree.n); copyNodeDataToGroupData.set_arg<cl_mem>(2, tree.bodies_Ppos.p()); copyNodeDataToGroupData.set_arg<cl_mem>(3, tree.group_list_test.p()); copyNodeDataToGroupData.set_arg<cl_mem>(4, tree.groupCenterInfo.p()); copyNodeDataToGroupData.set_arg<cl_mem>(5, tree.groupSizeInfo.p()); copyNodeDataToGroupData.setWork(-1, NCRIT, tree.n_groups); copyNodeDataToGroupData.printWorkSize(); copyNodeDataToGroupData.execute(); #ifdef INDSOFT memCpyStream.sync(); this->maxLocalEps = tree.multipole[0*3 + 1].w; //Softening value #else #endif //Get the local domain boundary based on group positions and sizes real4 r_min, r_max; getBoundariesGroups(tree, r_min, r_max); #endif }