void octree::make_dust_groups(tree_structure &tree) { if(tree.n_dust == 0) return; //Split the dust particles into groups //This is done slightly different than with the normal //particles. We do an extra split, similar to how we create //a tree-level. This to prevent that particles far away //get into the same box because of jumps into the PH my_dev::dev_mem<uint> validList(devContext); my_dev::dev_mem<uint> compactList(devContext); int n_bodies = tree.n_dust; int memOffset1 = validList.cmalloc_copy(tree.generalBuffer1,n_bodies*2, 0); memOffset1 = compactList.cmalloc_copy(tree.generalBuffer1,n_bodies*2, memOffset1); validList.zeroMem(); // set devMemCountsx to 1 because it is used to early out when it hits zero this->devMemCountsx[0] = 1; this->devMemCountsx.h2d(1); int level = 5; //This level proofs to be sort of OK. We can tune it depending on the data-set build_valid_list.set_arg<int>(0, &n_bodies); build_valid_list.set_arg<int>(1, &level); build_valid_list.set_arg<cl_mem>(2, tree.dust_key.p()); build_valid_list.set_arg<cl_mem>(3, validList.p()); build_valid_list.set_arg<cl_mem>(4, this->devMemCountsx.p()); build_valid_list.setWork(tree.n, 128); build_valid_list.execute(execStream->s()); //Now we reuse the results of the build_valid_list //it already has the breaks in the right places //Just add breaks every NGROUP items //The newest group creation method! define_dust_groups.set_arg<int>(0, &n_bodies); define_dust_groups.set_arg<cl_mem>(1, tree.dust_pos.p()); define_dust_groups.set_arg<cl_mem>(2, validList.p()); define_dust_groups.setWork(n_bodies, 256); define_dust_groups.execute(execStream->s()); // reset counts to 1 so next compact proceeds... this->devMemCountsx[0] = 1; this->devMemCountsx.h2d(1); int validCount; gpuCompact(devContext, validList, compactList, n_bodies*2, &validCount); tree.n_dust_groups = validCount / 2; LOGF(stderr, "Ngroups_dust: %d \n", tree.n_dust_groups); this->allocateDustGroupBuffers(tree); store_dust_groups.set_arg<int>(0, &tree.n_dust_groups); store_dust_groups.set_arg<cl_mem>(1, compactList.p()); store_dust_groups.set_arg<cl_mem>(2, tree.dust2group_list.p()); store_dust_groups.set_arg<cl_mem>(3, tree.dust_group_list.p()); store_dust_groups.set_arg<cl_mem>(4, tree.activeDustGrouplist.p()); store_dust_groups.setWork(-1, NCRIT, tree.n_dust_groups); store_dust_groups.execute(execStream->s()); }
void octree::build (tree_structure &tree) { int level = 0; int validCount = 0; int offset = 0; /******** load kernels **********/ /******** create memory buffers **********/ my_dev::dev_mem<uint> validList(devContext); my_dev::dev_mem<uint> compactList(devContext); validList.cmalloc_copy(tree.generalBuffer1.get_pinned(), tree.generalBuffer1.get_flags(), tree.generalBuffer1.get_devMem(), &tree.generalBuffer1[0], 0, tree.n*2, getAllignmentOffset(0)); validList.zeroMem(); compactList.cmalloc_copy(tree.generalBuffer1.get_pinned(), tree.generalBuffer1.get_flags(), tree.generalBuffer1.get_devMem(), &tree.generalBuffer1[tree.n*2], tree.n*2, tree.n*2, getAllignmentOffset(tree.n*2)); /******** set kernels parameters **********/ build_key_list.set_arg<cl_mem>(0, tree.bodies_key.p()); build_key_list.set_arg<cl_mem>(1, tree.bodies_Ppos.p()); build_key_list.set_arg<int>(2, &tree.n); build_key_list.set_arg<real4>(3, &tree.corner); build_key_list.setWork(tree.n, 128); build_valid_list.set_arg<int>(0, &tree.n); build_valid_list.set_arg<int>(1, &level); build_valid_list.set_arg<cl_mem>(2, tree.bodies_key.p()); build_valid_list.set_arg<cl_mem>(3, validList.p()); build_valid_list.setWork(tree.n, 128); build_nodes.set_arg<int>(0, &level); build_nodes.set_arg<int>(1, &validCount); build_nodes.set_arg<int>(2, &offset); build_nodes.set_arg<cl_mem>(3, compactList.p()); build_nodes.set_arg<cl_mem>(4, tree.bodies_key.p()); build_nodes.set_arg<cl_mem>(5, tree.node_key.p()); build_nodes.set_arg<cl_mem>(6, tree.n_children.p()); build_nodes.set_arg<cl_mem>(7, tree.node_bodies.p()); link_tree.set_arg<int>(0, &offset); link_tree.set_arg<cl_mem>(1, tree.n_children.p()); link_tree.set_arg<cl_mem>(2, tree.node_bodies.p()); link_tree.set_arg<cl_mem>(3, tree.bodies_Ppos.p()); link_tree.set_arg<real4>(4, &tree.corner); link_tree.set_arg<cl_mem>(5, tree.level_list.p()); link_tree.set_arg<cl_mem>(6, validList.p()); link_tree.set_arg<cl_mem>(7, tree.node_key.p()); link_tree.set_arg<cl_mem>(8, tree.bodies_key.p()); link_tree.set_arg<int>(9, &level); /********** build list of keys ********/ build_key_list.execute(); /****** build the levels *********/ int nodeSum = 0; for (level = 0; level < MAXLEVELS; level++) { // mark bodies to be combined into nodes build_valid_list.set_arg<int>(1, &level); build_valid_list.execute(); //gpuCompact to get number of created nodes gpuCompact(devContext, validList, compactList, tree.n*2, &validCount); nodeSum += validCount / 2; printf("ValidCount (%d): %d \tSum: %d Offset: %d\n", mpiGetRank(), validCount, nodeSum, offset); validCount /= 2; if (validCount == 0) break; // asssemble nodes build_nodes.setWork(validCount, 128); build_nodes.set_arg<int>(0, &level); build_nodes.set_arg<int>(1, &validCount); build_nodes.set_arg<int>(2, &offset); build_nodes.execute(); tree.level_list[level] = (uint2){offset, offset + validCount}; offset += validCount; } //end for lvl //Put the last level + 1 index to 0,0 //so we dont need an extra if statement in the linking phase tree.level_list[level] = (uint2){0, 0}; tree.level_list.h2d(); int n_nodes = offset; tree.n_nodes = n_nodes; /***** Link the tree ******/ link_tree.set_arg<int>(0, &offset); //Offset=number of nodes link_tree.set_arg<int>(9, &level); //level=highest number of levels //The maximum number of levels that can be used is MAXLEVEl //if max level is larger than that the program will exit printf("Max level : %d \n", level); if(level >= MAXLEVELS) { cerr << "The tree has become too deep, the program will exit. \n"; cerr << "Consider the removal of far away particles to prevent a too large box. \n"; maxlevels_exceeded = true; return; //exit(0); } link_tree.setWork(n_nodes, 128); printf("Link_tree: "); link_tree.printWorkSize(); tree.n_levels = level-1; for(int i=0; i < level; i++) printf("%d\t%d\t%d\n", i, tree.level_list[i].x, tree.level_list[i].y); //Link the tree link_tree.execute(); //After executing link_tree, the id_list contains for each node //the ID of its parent. //Valid_list contains for each node if its a leaf (valid) or a normal //node -> non_valid //Execute a split on the validList to get seperate id lists //for the leafs and nodes. Used when computing multipoles tree.leafNodeIdx.cmalloc(tree.n_nodes , false); //Split the leaf ids and non-leaf node ids gpuSplit(devContext, validList, tree.leafNodeIdx, tree.n_nodes, &tree.n_leafs); printf("Total nodes: %d N_leafs: %d non-leafs: %d \n", tree.n_nodes, tree.n_leafs, tree.n_nodes - tree.n_leafs); build_level_list.set_arg<int>(0, &tree.n_nodes); build_level_list.set_arg<int>(1, &tree.n_leafs); build_level_list.set_arg<cl_mem>(2, tree.leafNodeIdx.p()); build_level_list.set_arg<cl_mem>(3, tree.node_bodies.p()); build_level_list.set_arg<cl_mem>(4, validList.p()); build_level_list.setWork(tree.n_nodes-tree.n_leafs, 128); validList.zeroMem(); //Build the level list based on the leafIdx list //required for easy access in the compute node properties build_level_list.execute(); tree.node_level_list.cmalloc(level*2 , false); int levelThing; gpuCompact(devContext, validList, tree.node_level_list, 2*(tree.n_nodes-tree.n_leafs), &levelThing); tree.node_level_list.d2h(); //We only care about end positions, so compress the list: int j=0; for(int i=0; i < levelThing; i+=2, j++) tree.node_level_list[j] = tree.node_level_list[i]; tree.node_level_list[j] =tree.node_level_list[levelThing-1]+1; //Add 1 to make it the end position levelThing = j+1; tree.node_level_list.h2d(); printf("Finished level list \n"); for(int i=0; i < levelThing; i++) { printf("node_level_list: %d \t%d\n", i, tree.node_level_list[i]); } ///****** Start building the particle groups *******/////// //Compute the box size, the max length of one of the sides of the rectangle real size = fmax(fabs(rMaxLocalTree.z - rMinLocalTree.z), fmax(fabs(rMaxLocalTree.y - rMinLocalTree.y), fabs(rMaxLocalTree.x - rMinLocalTree.x))); real dist = ((rMaxLocalTree.z - rMinLocalTree.z) * (rMaxLocalTree.z - rMinLocalTree.z) + (rMaxLocalTree.y - rMinLocalTree.y) * (rMaxLocalTree.y - rMinLocalTree.y) + (rMaxLocalTree.x - rMinLocalTree.x) * (rMaxLocalTree.x - rMinLocalTree.x)); float maxDist = sqrt(dist) / 10; maxDist *= maxDist; //Square since we dont do sqrt on device fprintf(stderr,"Box max size: %f en max dist: %f \t %f en %f \n", size, dist, sqrt(dist), maxDist); //maxDist = 50; validList.zeroMem(); //The newest group creation method! define_groups.set_arg<int>(0, &tree.n); define_groups.set_arg<cl_mem>(1, validList.p()); define_groups.set_arg<cl_mem>(2, tree.bodies_Ppos.p()); define_groups.set_arg<float>(3, &maxDist); define_groups.setWork(tree.n, 128); define_groups.execute(); //gpuCompact gpuCompact(devContext, validList, compactList, tree.n*2, &validCount); printf("Found number of groups: %d \n", validCount/2); tree.n_groups = validCount/2; //Now compact validList to get the list of group ids tree.group_list_test.cmalloc(tree.n_groups , false); store_groups.set_arg<int>(0, &tree.n); store_groups.set_arg<int>(1, &tree.n_groups); store_groups.set_arg<cl_mem>(2, compactList.p()); store_groups.set_arg<cl_mem>(3, tree.body2group_list.p()); store_groups.set_arg<cl_mem>(4, tree.group_list_test.p()); store_groups.setWork(-1, NCRIT, tree.n_groups); store_groups.execute(); //Memory allocation for the valid group lists if(tree.active_group_list.get_size() > 0) { tree.active_group_list.cresize(tree.n_groups, false); tree.activeGrpList.cresize(tree.n_groups, false); } else { tree.active_group_list.cmalloc(tree.n_groups, false); tree.activeGrpList.cmalloc(tree.n_groups, false); } printf("Tree built complete!\n"); /*************************/ }