/**
 * Computes the node properties of @p tree and the derived group data.
 *
 * Steps, in order:
 *   1. carve multipoleD, tree.nodeLowerBounds and tree.nodeUpperBounds out
 *      of tree.generalBuffer1 (growing that buffer first when needed),
 *   2. run propsLeafD over the leaf nodes,
 *   3. run propsNonLeafD once per level, from tree.n_levels down to 1,
 *   4. run propsScalingD over all nodes,
 *   5. run copyNodeDataToGroupData and call getBoundariesGroups.
 *
 * When INDSOFT is defined the first three multipole entries are copied back
 * on a separate stream and this->maxLocalEps is read from them.
 *
 * @param tree  tree whose buffers are read and written in place
 */
void octree::compute_properties_double(tree_structure &tree) {

  /*****************************************************
    Assign the memory buffers, note that we check the size first
    and if needed we increase the size of the generalBuffer1

    Payload carved out below (offsets are expressed in uint):
      - multipoleD   -> double4 * 3*n_nodes -> 6*n_nodes*uint4
      - lower bounds ->                        1*n_nodes*uint4
      - upper bounds ->                        1*n_nodes*uint4
      - plus the alignment padding returned by getAllignmentOffset

    We reserve 10*n_nodes*uint4 so the padding fits as well.
    generalBuffer1 has default size: 3*N*uint4, so grow it
    when 10*n_nodes > 3*N.
   *****************************************************/
  if(10*tree.n_nodes > 3*tree.n)
  {
    fprintf(stderr, "Resizeing the generalBuffer1 \n");
    //Must match the guard above (and compute_properties): asking for only
    //8*n_nodes*4 leaves no room for the alignment padding and can even be
    //smaller than the default 3*N*4 the buffer already has.
    tree.generalBuffer1.cresize(10*tree.n_nodes*4, false);
  }

  //Start positions inside generalBuffer1, counted in uint
  //double4 = 8 uint, three of them per node; one real4 per node = 4 uint
  const int lowerBoundsStart = 8*3*tree.n_nodes;
  const int upperBoundsStart = 8*3*tree.n_nodes + 4*tree.n_nodes;

  my_dev::dev_mem<double4> multipoleD(devContext);

  multipoleD.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                          tree.generalBuffer1.get_flags(),
                          tree.generalBuffer1.get_devMem(),
                          &tree.generalBuffer1[0], 0,
                          3*tree.n_nodes, getAllignmentOffset(0));

  tree.nodeLowerBounds.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                          tree.generalBuffer1.get_flags(),
                          tree.generalBuffer1.get_devMem(),
                          &tree.generalBuffer1[lowerBoundsStart], lowerBoundsStart,
                          tree.n_nodes, getAllignmentOffset(lowerBoundsStart));

  //Padding already consumed by the previous sub-buffer
  int prevOffsetSum = getAllignmentOffset(lowerBoundsStart);

  tree.nodeUpperBounds.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                          tree.generalBuffer1.get_flags(),
                          tree.generalBuffer1.get_devMem(),
                          &tree.generalBuffer1[upperBoundsStart], upperBoundsStart,
                          tree.n_nodes,
                          prevOffsetSum + getAllignmentOffset(upperBoundsStart + prevOffsetSum));

  //Computes the tree-properties (size, cm, monopole, quadropole, etc)
  //start the kernel for the leaf-type nodes
  propsLeafD.set_arg<int>(0,    &tree.n_leafs);
  propsLeafD.set_arg<cl_mem>(1, tree.leafNodeIdx.p());
  propsLeafD.set_arg<cl_mem>(2, tree.node_bodies.p());
  propsLeafD.set_arg<cl_mem>(3, tree.bodies_Ppos.p());
  propsLeafD.set_arg<cl_mem>(4, multipoleD.p());
  propsLeafD.set_arg<cl_mem>(5, tree.nodeLowerBounds.p());
  propsLeafD.set_arg<cl_mem>(6, tree.nodeUpperBounds.p());
  propsLeafD.set_arg<cl_mem>(7, tree.bodies_Pvel.p());  //Velocity to get max eps

  propsLeafD.setWork(tree.n_leafs, 128);
  printf("PropsLeaf: ");
  propsLeafD.printWorkSize();
  propsLeafD.execute();

  //Argument 0 is replaced by the level index inside the loop below
  int temp = tree.n_nodes-tree.n_leafs;
  propsNonLeafD.set_arg<int>(0,    &temp);
  propsNonLeafD.set_arg<cl_mem>(1, tree.leafNodeIdx.p());
  propsNonLeafD.set_arg<cl_mem>(2, tree.node_level_list.p());
  propsNonLeafD.set_arg<cl_mem>(3, tree.n_children.p());
  propsNonLeafD.set_arg<cl_mem>(4, multipoleD.p());
  propsNonLeafD.set_arg<cl_mem>(5, tree.nodeLowerBounds.p());
  propsNonLeafD.set_arg<cl_mem>(6, tree.nodeUpperBounds.p());

  //Work from the bottom up
  for(int i=tree.n_levels; i >= 1; i--)
  {
    //node_level_list holds the end position of each level
    int totalOnThisLevel = tree.node_level_list[i]-tree.node_level_list[i-1];

    propsNonLeafD.setWork(totalOnThisLevel, 128);
    printf("PropsNonLeaf, nodes on level %d : %d (start: %d end: %d) , config: \t", i, totalOnThisLevel,
           tree.node_level_list[i-1], tree.node_level_list[i]);
    propsNonLeafD.printWorkSize();

    propsNonLeafD.set_arg<int>(0, &i); //set the level
    propsNonLeafD.execute();
  }

  float theta2 = theta;

  propsScalingD.set_arg<int>(0,    &tree.n_nodes);
  propsScalingD.set_arg<real4>(1,  &tree.corner);
  propsScalingD.set_arg<cl_mem>(2, multipoleD.p());
  propsScalingD.set_arg<cl_mem>(3, tree.nodeLowerBounds.p());
  propsScalingD.set_arg<cl_mem>(4, tree.nodeUpperBounds.p());
  propsScalingD.set_arg<cl_mem>(5, tree.n_children.p());
  propsScalingD.set_arg<cl_mem>(6, tree.multipole.p());
  propsScalingD.set_arg<float >(7, &theta2);
  propsScalingD.set_arg<cl_mem>(8, tree.boxSizeInfo.p());
  propsScalingD.set_arg<cl_mem>(9, tree.boxCenterInfo.p());
  propsScalingD.set_arg<cl_mem>(10, tree.node_bodies.p());

  propsScalingD.setWork(tree.n_nodes, 128);
  printf("propsScaling: \t ");
  propsScalingD.printWorkSize();
  propsScalingD.execute();

#ifdef INDSOFT
  //If we use individual softening we need to get the max softening value
  //to be broadcasted during the exchange of the LET boundaries.
  //Only copy the root node that contains the max value
  my_dev::dev_stream memCpyStream;
  tree.multipole.d2h(3, false, memCpyStream.s());
#endif

  //Set the group properties, note that it is not based on the nodes anymore
  //but on self created groups based on particle order setPHGroupData
  copyNodeDataToGroupData.set_arg<int>(0,    &tree.n_groups);
  copyNodeDataToGroupData.set_arg<int>(1,    &tree.n);
  copyNodeDataToGroupData.set_arg<cl_mem>(2, tree.bodies_Ppos.p());
  copyNodeDataToGroupData.set_arg<cl_mem>(3, tree.group_list_test.p());
  copyNodeDataToGroupData.set_arg<cl_mem>(4, tree.groupCenterInfo.p());
  copyNodeDataToGroupData.set_arg<cl_mem>(5, tree.groupSizeInfo.p());
  copyNodeDataToGroupData.setWork(-1, NCRIT, tree.n_groups);
  copyNodeDataToGroupData.printWorkSize();
  copyNodeDataToGroupData.execute();

#ifdef INDSOFT
  //Wait for the copy started above before reading the host side
  memCpyStream.sync();
  this->maxLocalEps = tree.multipole[0*3 + 1].w; //Softening value
#endif

  //Get the local domain boundary based on group positions and sizes
  real4 r_min, r_max;
  getBoundariesGroups(tree, r_min, r_max);

#if 0
  //Write the tree structure to file
  string nodeFileName = "fullTreeStructure.txt";
  char fileName[256];
  sprintf(fileName, "fullTreeStructure-%d.txt", mpiGetRank());
  ofstream nodeFile;
  //nodeFile.open(nodeFileName.c_str());
  nodeFile.open(fileName);

  tree.multipole.d2h();
  tree.boxSizeInfo.d2h();
  tree.boxCenterInfo.d2h();

  for(int i=0; i < tree.n_nodes; i++)
  {
    //nodeFile << i << "\t" << tree.boxCenterInfo[i].x << "\t" << tree.boxCenterInfo[i].y;
    //nodeFile << "\t" << 2*tree.boxSizeInfo[i].x << "\t" << 2*tree.boxSizeInfo[i].y << "\t";
    nodeFile << i << "\t" << tree.boxCenterInfo[i].x-tree.boxSizeInfo[i].x << "\t" << tree.boxCenterInfo[i].y-tree.boxSizeInfo[i].y;
    nodeFile << "\t" << tree.boxCenterInfo[i].x+tree.boxSizeInfo[i].x << "\t" << tree.boxCenterInfo[i].y+tree.boxSizeInfo[i].y << "\t";
    nodeFile << tree.multipole[i*3+0].x << "\t" << tree.multipole[i*3+0].w << "\n";
  }
  nodeFile.close();

  sprintf(fileName, "fullTreeStructureParticles-%d.txt", mpiGetRank());
  ofstream partFile;
  partFile.open(fileName);
  tree.bodies_pos.d2h();

  for(int i=0; i < tree.n; i++)
  {
    float4 pos = tree.bodies_pos[i];
    partFile << i << "\t" << pos.x << "\t" << pos.y << "\t" << pos.z << endl;
  }
  partFile.close();
#endif
}
void octree::sort_bodies(tree_structure &tree, my_dev::dev_mem<float4> &bodies_pos, my_dev::dev_mem<uint> &sortPermutation, int n_bodies) { //We assume the bodies are already onthe GPU devContext.startTiming(); this->allocateParticleSpecificBuffers(n_bodies); //Call the GPUSort function, since we made it general //into a uint4 so we can extend the tree to 96bit key //we have to convert to 64bit key to a 96bit for sorting //and back from 96 to 64 my_dev::dev_mem<uint4> srcValues(devContext); my_dev::dev_mem<uint4> output(devContext); my_dev::dev_mem<uint4> bodies_key(devContext); //Allocate memory for the generalBuffer //The generalBuffer1 has size uint*4*N*3 //this buffer gets part: 0-uint*4*N srcValues.cmalloc_copy(tree.generalBuffer1.get_pinned(), tree.generalBuffer1.get_flags(), tree.generalBuffer1.get_devMem(), &tree.generalBuffer1[0], 0, n_bodies, getAllignmentOffset(0)); //this buffer gets part: uint*4*N-uint*4*N*2 output.cmalloc_copy(tree.generalBuffer1.get_pinned(), tree.generalBuffer1.get_flags(), tree.generalBuffer1.get_devMem(), &tree.generalBuffer1[4*n_bodies], 4*n_bodies, n_bodies, getAllignmentOffset(4*n_bodies)); int prevOffset = getAllignmentOffset(4*n_bodies); bodies_key.cmalloc_copy(tree.generalBuffer1.get_pinned(), tree.generalBuffer1.get_flags(), tree.generalBuffer1.get_devMem(), &tree.generalBuffer1[8*n_bodies], 8*n_bodies, n_bodies, prevOffset + getAllignmentOffset(8*n_bodies + prevOffset)); //This function computes the keys, seperate since we compute keys also before //buidling the tree-structure //Corner and size are not stored, since we can use sorting without building a tree float4 corner; float domain_fac; compute_keys(bodies_pos, bodies_key, n_bodies, corner, domain_fac); //Extract the keys convertKey64to96.set_arg<cl_mem>(0, bodies_key.p()); convertKey64to96.set_arg<cl_mem>(1, srcValues.p()); convertKey64to96.set_arg<int>(2, &n_bodies); convertKey64to96.setWork(n_bodies, 256); convertKey64to96.execute(); //Sort the keys // If srcValues 
(2nd argument) and buffer (4th argument) are different, then the original values // are preserved, if they are the same srcValues will be overwritten gpuSort(devContext, srcValues, output, srcValues, n_bodies, 32, 3, tree); //Extract the keys and get the permuation required to sort the other //properties of the particles //Extract the sorted keys extractKeyAndPerm.set_arg<cl_mem>(0, output.p()); extractKeyAndPerm.set_arg<cl_mem>(1, bodies_key.p()); extractKeyAndPerm.set_arg<cl_mem>(2, sortPermutation.p()); extractKeyAndPerm.set_arg<int>(3, &n_bodies); extractKeyAndPerm.setWork(n_bodies, 256); extractKeyAndPerm.execute(); devContext.stopTiming("Sorting", 0); }
extractKeyAndPerm.set_arg<int>(3, &n_bodies); extractKeyAndPerm.setWork(n_bodies, 256); extractKeyAndPerm.execute(); devContext.stopTiming("Sorting", 0); } #if 0 //Use calloc here so valgrind does not complain about uninitialized values my_dev::dev_mem<real4> real4Buffer(devContext); real4Buffer.cmalloc_copy(tree.generalBuffer1.get_pinned(), tree.generalBuffer1.get_flags(), tree.generalBuffer1.get_devMem(), &tree.generalBuffer1[4*tree.n], 4*tree.n, tree.n, getAllignmentOffset(4*tree.n)); //Call the reorder data function //For the position dataReorderR4.set_arg<int>(0, &tree.n); dataReorderR4.set_arg<cl_mem>(1, tree.bodies_pos.p()); dataReorderR4.set_arg<cl_mem>(2, real4Buffer.p()); dataReorderR4.set_arg<cl_mem>(3, sortPermutation.p()); dataReorderR4.setWork(tree.n, 256); devContext.startTiming(); dataReorderR4.execute();
// Sorts N uint4 items from srcValues into output, one 32-bit component per
// pass. subItems selects the number of passes (1, 2 or 3).
//
// Each pass extracts one component into simpleKeys (extractInt), fills
// permutation with a sequence (fillSequence), sorts both with gpuSort_32b and
// then reorders the uint4 items (reOrderKeysValues). The passes ping-pong:
//   pass 1: srcValues -> output
//   pass 2: output    -> buffer   (copied back to output if subItems == 2)
//   pass 3: buffer    -> output
//
// If srcValues and buffer are different, then the original values
// are preserved, if they are the same srcValues will be overwritten
//
// The four scratch arrays are carved out of tree.generalBuffer1 starting at
// uint offset 8*N.
// NOTE(review): this presumably relies on srcValues/output occupying the
// first 8*N uints of that buffer - confirm against the callers.
//
// numberOfBits is not referenced in the body; gpuSort_32b is always called
// with 32. clFinish is only reached on the three-pass path.
void octree::gpuSort(my_dev::context &devContext,
                     my_dev::dev_mem<uint4> &srcValues,
                     my_dev::dev_mem<uint4> &output,
                     my_dev::dev_mem<uint4> &buffer,
                     int N, int numberOfBits, int subItems,
                     tree_structure &tree) {

  //Extra buffer values
  //(older variant that allocated its own memory, kept for reference)
//   my_dev::dev_mem<uint> simpleKeys(devContext, N);    //Int keys,
//   my_dev::dev_mem<uint> permutation(devContext, N);   //Permutation values, for sorting the int4 data
//   my_dev::dev_mem<int>  output32b(devContext, N);     //Permutation values, for sorting the int4 data
//   my_dev::dev_mem<uint> valuesOutput(devContext, N);  //Buffers for the values which are the indexes

  my_dev::dev_mem<uint> simpleKeys(devContext);    //Int keys,
  my_dev::dev_mem<uint> permutation(devContext);   //Permutation values, for sorting the int4 data
  my_dev::dev_mem<int>  output32b(devContext);     //Output keys of the 32bit sort
  my_dev::dev_mem<uint> valuesOutput(devContext);  //Buffers for the values which are the indexes

  //Running sum of the alignment padding, starting with that of output
  int prevOffsetSum = getAllignmentOffset(4*N); //The offset of output

  simpleKeys.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                          tree.generalBuffer1.get_flags(),
                          tree.generalBuffer1.get_devMem(),
                          &tree.generalBuffer1[8*N], 8*N,
                          N, prevOffsetSum + getAllignmentOffset(8*N + prevOffsetSum)); //Offset 8 since we have 2 uint4 before

  prevOffsetSum += getAllignmentOffset(8*N + prevOffsetSum);

  permutation.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                          tree.generalBuffer1.get_flags(),
                          tree.generalBuffer1.get_devMem(),
                          &tree.generalBuffer1[9*N], 9*N,
                          N, prevOffsetSum + getAllignmentOffset(9*N + prevOffsetSum)); //N elements after simpleKeys

  prevOffsetSum += getAllignmentOffset(9*N + prevOffsetSum);

  output32b.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                          tree.generalBuffer1.get_flags(),
                          tree.generalBuffer1.get_devMem(),
                          &tree.generalBuffer1[10*N], 10*N,
                          N, prevOffsetSum + getAllignmentOffset(10*N + prevOffsetSum)); //N elements after permutation

  prevOffsetSum += getAllignmentOffset(10*N + prevOffsetSum);

  valuesOutput.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                          tree.generalBuffer1.get_flags(),
                          tree.generalBuffer1.get_devMem(),
                          &tree.generalBuffer1[11*N], 11*N,
                          N, prevOffsetSum + getAllignmentOffset(11*N + prevOffsetSum)); //N elements after output32b

  //Dimensions for the kernels that shuffle and extract data
  //ng blocks of blockSize items, laid out on an nx by ny grid
  const int blockSize = 256;
  int ng = (N)/blockSize + 1;
  int nx = (int)sqrt(ng);
  int ny = (ng-1)/nx + 1;

  vector<size_t> localWork(2), globalWork(2);
  globalWork[0] = nx*blockSize;   globalWork[1] = ny;
  localWork [0] = blockSize;      localWork[1] = 1;

  extractInt.setWork(globalWork, localWork);
  fillSequence.setWork(globalWork, localWork);
  reOrderKeysValues.setWork(globalWork, localWork);

  //Idx depends on subitems, z goes first, x last if subitems = 3
  //subitems = 3, than idx=2
  //subitems = 2, than idx=1
  //subitems = 1, than idx=0
  //intIdx = subItems-1
  int intIdx = subItems-1;

  extractInt.set_arg<cl_mem>(0, srcValues.p());
  extractInt.set_arg<cl_mem>(1, simpleKeys.p());
  extractInt.set_arg<uint>(2, &N);
  extractInt.set_arg<int>(3, &intIdx);//integer idx of the component to extract

  fillSequence.set_arg<cl_mem>(0, permutation.p());
  fillSequence.set_arg<uint>(1, &N);

  reOrderKeysValues.set_arg<cl_mem>(0, srcValues.p());
  reOrderKeysValues.set_arg<cl_mem>(1, output.p());
  reOrderKeysValues.set_arg<cl_mem>(2, valuesOutput.p());
  reOrderKeysValues.set_arg<uint>(3, &N);

  extractInt.execute();

  fillSequence.execute();

  //Now sort the first 32bit keys
  //Using 32bit sort with key and value seperated
  //(commented arguments belong to an older gpuSort_32b signature)
  gpuSort_32b(devContext,
              simpleKeys, permutation,
//            output32b, aPing32b,
              output32b, simpleKeys,
//            valuesOutput,valuesAPing,
              valuesOutput,permutation,
//            count,
              N, 32);

  //Now reorder the main keys
  //Use output as the new output/src value thing buffer
  reOrderKeysValues.execute();

  if(subItems == 1)
  {
    //Only doing one 32bit sort. Data is already in output so done
    return;
  }

  //2nd set of 32bit keys
  //Idx depends on subitems, z goes first, x last if subitems = 3
  //subitems = 3, than idx=1
  //subitems = 2, than idx=0
  //subitems = 1, completed previous round
  //intIdx = subItems-2
  intIdx = subItems-2;

  //This pass reads from output and writes into buffer
  extractInt.set_arg<cl_mem>(0, output.p());
  extractInt.set_arg<int>(3, &intIdx);//integer idx of the component to extract

  reOrderKeysValues.set_arg<cl_mem>(0, output.p());
  reOrderKeysValues.set_arg<cl_mem>(1, buffer.p());

  extractInt.execute();

  fillSequence.execute();

  //Now sort the 2nd 32bit keys
  //Using 32bit sort with key and value seperated
  gpuSort_32b(devContext,
              simpleKeys, permutation,
              output32b, simpleKeys,
//            output32b, aPing32b,
//            valuesOutput,valuesAPing,
              valuesOutput,permutation,
//            count,
              N, 32);

  reOrderKeysValues.execute();

  if(subItems == 2)
  {
    //Doing two 32bit sorts. Data is in buffer
    //so move the data from buffer to output
    output.copy(buffer, buffer.get_size());
    return;
  }

  //3th set of 32bit keys
  //Idx depends on subitems, z goes first, x last if subitems = 3
  //subitems = 3, than idx=0
  //subitems = 2, completed previous round
  //subitems = 1, completed previous round
  //intIdx is fixed to 0 for this last pass
  intIdx = 0;

  //This pass reads from buffer and writes into output
  extractInt.set_arg<cl_mem>(0, buffer.p());
  extractInt.set_arg<int>(3, &intIdx);//integer idx

  reOrderKeysValues.set_arg<cl_mem>(0, buffer.p());
  reOrderKeysValues.set_arg<cl_mem>(1, output.p());

  extractInt.execute();

  fillSequence.execute();

  //Now sort the 3rd 32bit keys
  //Using 32bit sort with key and value seperated
  gpuSort_32b(devContext,
              simpleKeys, permutation,
              output32b, simpleKeys,
//            output32b, aPing32b,
//            valuesOutput,valuesAPing,
              valuesOutput,permutation,
//            count,
              N, 32);

  reOrderKeysValues.execute();

  //Block until everything queued on the command queue has completed
  clFinish(devContext.get_command_queue());

//  fprintf(stderr, "sortArray2 done in %g sec (Without memory alloc & compilation) \n", get_time() - t0);
}
/**
 * Builds the tree structure for the (already sorted) particles of @p tree
 * and defines the particle groups.
 *
 * Steps, in order:
 *   1. build_key_list, then per level: build_valid_list, gpuCompact and
 *      build_nodes until a level yields no nodes or MAXLEVELS is reached,
 *   2. link_tree over all nodes,
 *   3. gpuSplit into leaf / non-leaf ids and build_level_list, followed by
 *      a host-side compression of tree.node_level_list to end positions,
 *   4. define_groups / store_groups and (re)allocation of the active group
 *      lists.
 *
 * On return tree.n_nodes, tree.n_levels, tree.n_leafs and tree.n_groups are
 * set. If the tree needs MAXLEVELS or more levels, maxlevels_exceeded is set
 * and the function returns right after the level construction (only
 * tree.n_nodes has been updated at that point).
 *
 * @param tree  tree whose buffers are read and written in place
 */
void octree::build (tree_structure &tree) {

  int level      = 0;
  int validCount = 0;
  int offset     = 0;

  /******** create memory buffers **********/
  //Both lists live in generalBuffer1, tree.n*2 entries each
  my_dev::dev_mem<uint>  validList(devContext);
  my_dev::dev_mem<uint>  compactList(devContext);

  validList.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                         tree.generalBuffer1.get_flags(),
                         tree.generalBuffer1.get_devMem(),
                         &tree.generalBuffer1[0], 0,
                         tree.n*2, getAllignmentOffset(0));
  validList.zeroMem();

  compactList.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                           tree.generalBuffer1.get_flags(),
                           tree.generalBuffer1.get_devMem(),
                           &tree.generalBuffer1[tree.n*2], tree.n*2,
                           tree.n*2, getAllignmentOffset(tree.n*2));

  /******** set kernels parameters **********/
  build_key_list.set_arg<cl_mem>(0,   tree.bodies_key.p());
  build_key_list.set_arg<cl_mem>(1,   tree.bodies_Ppos.p());
  build_key_list.set_arg<int>(2,      &tree.n);
  build_key_list.set_arg<real4>(3,    &tree.corner);
  build_key_list.setWork(tree.n, 128);

  build_valid_list.set_arg<int>(0,    &tree.n);
  build_valid_list.set_arg<int>(1,    &level);
  build_valid_list.set_arg<cl_mem>(2, tree.bodies_key.p());
  build_valid_list.set_arg<cl_mem>(3, validList.p());
  build_valid_list.setWork(tree.n, 128);

  build_nodes.set_arg<int>(0,     &level);
  build_nodes.set_arg<int>(1,     &validCount);
  build_nodes.set_arg<int>(2,     &offset);
  build_nodes.set_arg<cl_mem>(3,  compactList.p());
  build_nodes.set_arg<cl_mem>(4,  tree.bodies_key.p());
  build_nodes.set_arg<cl_mem>(5,  tree.node_key.p());
  build_nodes.set_arg<cl_mem>(6,  tree.n_children.p());
  build_nodes.set_arg<cl_mem>(7,  tree.node_bodies.p());

  link_tree.set_arg<int>(0,     &offset);
  link_tree.set_arg<cl_mem>(1,  tree.n_children.p());
  link_tree.set_arg<cl_mem>(2,  tree.node_bodies.p());
  link_tree.set_arg<cl_mem>(3,  tree.bodies_Ppos.p());
  link_tree.set_arg<real4>(4,   &tree.corner);
  link_tree.set_arg<cl_mem>(5,  tree.level_list.p());
  link_tree.set_arg<cl_mem>(6,  validList.p());
  link_tree.set_arg<cl_mem>(7,  tree.node_key.p());
  link_tree.set_arg<cl_mem>(8,  tree.bodies_key.p());
  link_tree.set_arg<int>(9,     &level);

  /********** build  list of keys ********/
  build_key_list.execute();

  /******  build the levels *********/
  int nodeSum = 0;
  for (level = 0; level < MAXLEVELS; level++) {
    // mark bodies to be combined into nodes
    build_valid_list.set_arg<int>(1, &level);
    build_valid_list.execute();

    //gpuCompact to get number of created nodes
    gpuCompact(devContext, validList, compactList, tree.n*2, &validCount);

    //The compacted list holds two entries per node
    nodeSum += validCount / 2;
    printf("ValidCount (%d): %d \tSum: %d Offset: %d\n", mpiGetRank(), validCount, nodeSum, offset);

    validCount /= 2;

    if (validCount == 0) break;

    // asssemble nodes
    build_nodes.setWork(validCount, 128);
    build_nodes.set_arg<int>(0, &level);
    build_nodes.set_arg<int>(1, &validCount);
    build_nodes.set_arg<int>(2, &offset);
    build_nodes.execute();

    tree.level_list[level] = (uint2){offset, offset + validCount};
    offset += validCount;
  } //end for lvl

  //Put the last level + 1 index to 0,0
  //so we dont need an extra if statement in the linking phase.
  //If the loop ran to completion level == MAXLEVELS; that case is rejected
  //below, so do not write the sentinel one slot past the levels that were
  //filled in.
  //NOTE(review): the capacity of level_list is not visible here; this
  //guards against it holding exactly MAXLEVELS entries - confirm.
  if(level < MAXLEVELS)
  {
    tree.level_list[level] = (uint2){0, 0};
  }
  tree.level_list.h2d();

  int n_nodes  = offset;
  tree.n_nodes = n_nodes;

  /***** Link the tree ******/
  link_tree.set_arg<int>(0, &offset);   //Offset=number of nodes
  link_tree.set_arg<int>(9, &level);    //level=highest number of levels

  //The maximum number of levels that can be used is MAXLEVEl
  //if max level is larger than that the program will exit
  printf("Max level : %d \n", level);
  if(level >= MAXLEVELS)
  {
    cerr << "The tree has become too deep, the program will exit. \n";
    cerr << "Consider the removal of far away particles to prevent a too large box. \n";
    maxlevels_exceeded = true;
    return;
    //exit(0);
  }

  link_tree.setWork(n_nodes, 128);
  printf("Link_tree: ");
  link_tree.printWorkSize();

  tree.n_levels = level-1;

  for(int i=0; i < level; i++)
    printf("%d\t%d\t%d\n", i, tree.level_list[i].x, tree.level_list[i].y);

  //Link the tree
  link_tree.execute();

  //After executing link_tree, the id_list contains for each node the ID of its parent.
  //Valid_list contains for each node if its a leaf (valid) or a normal node -> non_valid
  //Execute a split on the validList to get seperate id lists
  //for the leafs and nodes. Used when computing multipoles
  tree.leafNodeIdx.cmalloc(tree.n_nodes , false);

  //Split the leaf ids and non-leaf node ids
  gpuSplit(devContext, validList, tree.leafNodeIdx, tree.n_nodes, &tree.n_leafs);

  printf("Total nodes: %d N_leafs: %d non-leafs: %d \n", tree.n_nodes, tree.n_leafs, tree.n_nodes - tree.n_leafs);

  build_level_list.set_arg<int>(0,    &tree.n_nodes);
  build_level_list.set_arg<int>(1,    &tree.n_leafs);
  build_level_list.set_arg<cl_mem>(2, tree.leafNodeIdx.p());
  build_level_list.set_arg<cl_mem>(3, tree.node_bodies.p());
  build_level_list.set_arg<cl_mem>(4, validList.p());
  build_level_list.setWork(tree.n_nodes-tree.n_leafs, 128);

  validList.zeroMem();

  //Build the level list based on the leafIdx list
  //required for easy access in the compute node properties
  build_level_list.execute();

  tree.node_level_list.cmalloc(level*2 , false);

  int levelThing;
  gpuCompact(devContext, validList, tree.node_level_list,
             2*(tree.n_nodes-tree.n_leafs), &levelThing);

  tree.node_level_list.d2h();

  //We only care about end positions, so compress the list:
  //keep every second entry and close it with (last entry + 1)
  //NOTE(review): when gpuCompact reports 0 entries (no non-leaf nodes) the
  //read at [levelThing-1] below uses index -1 - confirm this cannot happen.
  int j=0;
  for(int i=0; i < levelThing; i+=2, j++)
    tree.node_level_list[j] = tree.node_level_list[i];

  tree.node_level_list[j] = tree.node_level_list[levelThing-1]+1; //Add 1 to make it the end position
  levelThing = j+1;
  tree.node_level_list.h2d();

  printf("Finished level list \n");

  for(int i=0; i < levelThing; i++)
  {
    printf("node_level_list: %d \t%d\n", i, tree.node_level_list[i]);
  }

  ///******   Start building the particle groups *******///////

  //Compute the box size, the max length of one of the sides of the rectangle
  real size = fmax(fabs(rMaxLocalTree.z - rMinLocalTree.z),
              fmax(fabs(rMaxLocalTree.y - rMinLocalTree.y),
                   fabs(rMaxLocalTree.x - rMinLocalTree.x)));
  //Squared length of the diagonal of the local box
  real dist = ((rMaxLocalTree.z - rMinLocalTree.z) * (rMaxLocalTree.z - rMinLocalTree.z) +
               (rMaxLocalTree.y - rMinLocalTree.y) * (rMaxLocalTree.y - rMinLocalTree.y) +
               (rMaxLocalTree.x - rMinLocalTree.x) * (rMaxLocalTree.x - rMinLocalTree.x));

  //A tenth of the diagonal, handed to define_groups
  float maxDist = sqrt(dist) / 10;
  maxDist *= maxDist; //Square since we dont do sqrt on device

  fprintf(stderr,"Box max size: %f en max dist: %f \t %f en %f \n", size, dist, sqrt(dist), maxDist);

  //maxDist = 50;

  validList.zeroMem();
  //The newest group creation method!
  define_groups.set_arg<int>(0,    &tree.n);
  define_groups.set_arg<cl_mem>(1, validList.p());
  define_groups.set_arg<cl_mem>(2, tree.bodies_Ppos.p());
  define_groups.set_arg<float>(3,  &maxDist);
  define_groups.setWork(tree.n, 128);
  define_groups.execute();

  //gpuCompact
  gpuCompact(devContext, validList, compactList, tree.n*2, &validCount);

  printf("Found number of groups: %d \n", validCount/2);

  tree.n_groups = validCount/2;

  //Now compact validList to get the list of group ids
  tree.group_list_test.cmalloc(tree.n_groups , false);

  store_groups.set_arg<int>(0,    &tree.n);
  store_groups.set_arg<int>(1,    &tree.n_groups);
  store_groups.set_arg<cl_mem>(2, compactList.p());
  store_groups.set_arg<cl_mem>(3, tree.body2group_list.p());
  store_groups.set_arg<cl_mem>(4, tree.group_list_test.p());
  store_groups.setWork(-1, NCRIT, tree.n_groups);
  store_groups.execute();

  //Memory allocation for the valid group lists
  //Resize when they were allocated before, allocate otherwise
  if(tree.active_group_list.get_size() > 0)
  {
    tree.active_group_list.cresize(tree.n_groups, false);
    tree.activeGrpList.cresize(tree.n_groups, false);
  }
  else
  {
    tree.active_group_list.cmalloc(tree.n_groups, false);
    tree.activeGrpList.cmalloc(tree.n_groups, false);
  }

  printf("Tree built complete!\n");

  /*************************/
}
void octree::compute_properties(tree_structure &tree, my_dev::dev_mem<float4> &bodies_pos, int n_bodies) { /***************************************************** Assign the memory buffers, note that we check the size first and if needed we increase the size of the generalBuffer1 Size required: - multipoleD -> double4*3_n_nodes -> 2*3*n_nodes*uint4 - lower/upperbounds -> 2*n_nodes*uint4 - node lower/upper -> 2*n_nodes*uint4 - SUM: 10*n_nodes*uint4 - generalBuffer1 has default size: 3*N*uint4 check if 10*n_nodes < 3*N if so realloc (Note that generalBuffer might be larger because of tree-walk stack) *****************************************************/ if(10*tree.n_nodes > 3*tree.n) { #ifdef _DEBUG_PRINT_ fprintf(stderr, "Resizeing the generalBuffer1 \n"); #endif tree.generalBuffer1.cresize(10*tree.n_nodes*4, false); } my_dev::dev_mem<double4> multipoleD(devContext); my_dev::dev_mem<real4> nodeLowerBounds(devContext); //Lower bounds used for scaling? TODO my_dev::dev_mem<real4> nodeUpperBounds(devContext); //Upper bounds used for scaling? 
TODO multipoleD.cmalloc_copy(tree.generalBuffer1.get_pinned(), tree.generalBuffer1.get_flags(), tree.generalBuffer1.get_devMem(), &tree.generalBuffer1[0], 0, 3*tree.n_nodes, getAllignmentOffset(0)); //Offset is in uint, so: double4 = 8uint*3*n_nodes nodeLowerBounds.cmalloc_copy(tree.generalBuffer1.get_pinned(), tree.generalBuffer1.get_flags(), tree.generalBuffer1.get_devMem(), &tree.generalBuffer1[8*3*tree.n_nodes], 8*3*tree.n_nodes, tree.n_nodes, getAllignmentOffset(8*3*tree.n_nodes)); int prevOffsetSum = getAllignmentOffset(8*3*tree.n_nodes); //The offset of output nodeUpperBounds.cmalloc_copy(tree.generalBuffer1.get_pinned(), tree.generalBuffer1.get_flags(), tree.generalBuffer1.get_devMem(), &tree.generalBuffer1[8*3*tree.n_nodes + 4*tree.n_nodes], 8*3*tree.n_nodes + 4*tree.n_nodes, tree.n_nodes, prevOffsetSum + getAllignmentOffset(8*3*tree.n_nodes + 4*tree.n_nodes + prevOffsetSum)); //Computes the tree-properties (size, cm, monopole, quadropole, etc) //start the kernel for the leaf-type nodes propsLeafD.set_arg<int>(0, &tree.n_leafs); propsLeafD.set_arg<cl_mem>(1, tree.leafNodeIdx.p()); propsLeafD.set_arg<cl_mem>(2, tree.node_bodies.p()); propsLeafD.set_arg<cl_mem>(3, bodies_pos.p()); propsLeafD.set_arg<cl_mem>(4, multipoleD.p()); propsLeafD.set_arg<cl_mem>(5, nodeLowerBounds.p()); propsLeafD.set_arg<cl_mem>(6, nodeUpperBounds.p()); // propsLeafD.set_arg<cl_mem>(7, tree.bodies_Pvel.p()); //Velocity to get max eps propsLeafD.setWork(tree.n_leafs, 128); #ifdef _DEBUG_PRINT_ printf("PropsLeaf: "); propsLeafD.printWorkSize(); #endif propsLeafD.execute(); int temp = tree.n_nodes-tree.n_leafs; propsNonLeafD.set_arg<int>(0, &temp); propsNonLeafD.set_arg<cl_mem>(1, tree.leafNodeIdx.p()); propsNonLeafD.set_arg<cl_mem>(2, tree.node_level_list.p()); propsNonLeafD.set_arg<cl_mem>(3, tree.n_children.p()); propsNonLeafD.set_arg<cl_mem>(4, multipoleD.p()); propsNonLeafD.set_arg<cl_mem>(5, nodeLowerBounds.p()); propsNonLeafD.set_arg<cl_mem>(6, nodeUpperBounds.p()); //Work from 
the bottom up for(int i=tree.n_levels; i >= 1; i--) { propsNonLeafD.set_arg<int>(0, &i); { vector<size_t> localWork(2), globalWork(2); int totalOnThisLevel; totalOnThisLevel = tree.node_level_list[i]-tree.node_level_list[i-1]; propsNonLeafD.setWork(totalOnThisLevel, 128); #ifdef _DEBUG_PRINT_ printf("PropsNonLeaf, nodes on level %d : %d (start: %d end: %d) , config: \t", i, totalOnThisLevel, tree.node_level_list[i-1], tree.node_level_list[i]); #endif propsNonLeafD.printWorkSize(); } propsNonLeafD.set_arg<int>(0, &i); //set the level propsNonLeafD.execute(); } propsScalingD.set_arg<int>(0, &tree.n_nodes); propsScalingD.set_arg<cl_mem>(1, multipoleD.p()); propsScalingD.set_arg<cl_mem>(2, nodeLowerBounds.p()); propsScalingD.set_arg<cl_mem>(3, nodeUpperBounds.p()); propsScalingD.set_arg<cl_mem>(4, tree.n_children.p()); propsScalingD.set_arg<cl_mem>(5, tree.multipole.p()); propsScalingD.set_arg<float >(6, &theta); propsScalingD.set_arg<cl_mem>(7, tree.boxSizeInfo.p()); propsScalingD.set_arg<cl_mem>(8, tree.boxCenterInfo.p()); propsScalingD.set_arg<cl_mem>(9, tree.node_bodies.p()); propsScalingD.setWork(tree.n_nodes, 128); #ifdef _DEBUG_PRINT_ printf("propsScaling: \t "); propsScalingD.printWorkSize(); #endif propsScalingD.execute(); #if 0 #ifdef INDSOFT //If we use individual softening we need to get the max softening value //to be broadcasted during the exchange of the LET boundaries. 
//Only copy the root node that contains the max value my_dev::dev_stream memCpyStream; tree.multipole.d2h(3, false, memCpyStream.s()); #endif //Set the group properties, note that it is not based on the nodes anymore //but on self created groups based on particle order setPHGroupData copyNodeDataToGroupData.set_arg<int>(0, &tree.n_groups); copyNodeDataToGroupData.set_arg<int>(1, &tree.n); copyNodeDataToGroupData.set_arg<cl_mem>(2, tree.bodies_Ppos.p()); copyNodeDataToGroupData.set_arg<cl_mem>(3, tree.group_list_test.p()); copyNodeDataToGroupData.set_arg<cl_mem>(4, tree.groupCenterInfo.p()); copyNodeDataToGroupData.set_arg<cl_mem>(5, tree.groupSizeInfo.p()); copyNodeDataToGroupData.setWork(-1, NCRIT, tree.n_groups); copyNodeDataToGroupData.printWorkSize(); copyNodeDataToGroupData.execute(); #ifdef INDSOFT memCpyStream.sync(); this->maxLocalEps = tree.multipole[0*3 + 1].w; //Softening value #else #endif //Get the local domain boundary based on group positions and sizes real4 r_min, r_max; getBoundariesGroups(tree, r_min, r_max); #endif }