Beispiel #1
0
// Sorts the dust particles of `tree` by their space-filling-curve key and
// reorders the dust properties (position, velocity, acc0, ids) accordingly.
//
// Outline (all kernels are issued on execStream):
//   1. boundaryReduction writes per-block min/max of the dust positions into
//      devMemRMIN / devMemRMAX; the per-block results are reduced on the host.
//   2. A cubic domain (corner + cell size) is derived from the bounding box
//      and build_key_list computes one key per dust particle.
//   3. gpuSort sorts the keys; the result is placed in tree.dust_key
//      (presumably including the sort permutation -- confirm in gpuSort).
//   4. dataReorderCombined / dataReorderF2 write the reordered properties into
//      scratch buffers, which are then copied back over the dust arrays.
//
// tree.generalBuffer1 is used as scratch space throughout; its previous
// contents are overwritten. The function returns immediately when the tree
// holds no dust.
void octree::sort_dust(tree_structure &tree)
{
  if(tree.n_dust == 0) return;
  
  devContext.startTiming(execStream->s());

  //Start reduction to get the boundaries of the dust
  boundaryReduction.set_arg<int>(0, &tree.n_dust);
  boundaryReduction.set_arg<cl_mem>(1, tree.dust_pos.p());
  boundaryReduction.set_arg<cl_mem>(2, devMemRMIN.p());
  boundaryReduction.set_arg<cl_mem>(3, devMemRMAX.p());

  //The kernel works on the n_dust dust positions, so size the launch on
  //n_dust (not on the number of regular bodies). The launch uses
  //NTHREAD_BOUNDARY threads per block and NBLOCK_BOUNDARY blocks.
  boundaryReduction.setWork(tree.n_dust, NTHREAD_BOUNDARY, NBLOCK_BOUNDARY);
  boundaryReduction.execute(execStream->s());
  
   
  //devMemRMIN / devMemRMAX must be defined and allocated outside this function
  devMemRMIN.d2h();
  devMemRMAX.d2h();
  real4 r_min = make_real4(+1e10, +1e10, +1e10, +1e10); 
  real4 r_max = make_real4(-1e10, -1e10, -1e10, -1e10);   
  
  //Reduce the per-block results on the host, since it is
  //A) faster and B) we need the results on the host anyway.
  //One entry per launched block, hence the NBLOCK_BOUNDARY loop bound.
  for (int i = 0; i < NBLOCK_BOUNDARY; i++) {    
    r_min.x = std::min(r_min.x, devMemRMIN[i].x);
    r_min.y = std::min(r_min.y, devMemRMIN[i].y);
    r_min.z = std::min(r_min.z, devMemRMIN[i].z);
    
    r_max.x = std::max(r_max.x, devMemRMAX[i].x);
    r_max.y = std::max(r_max.y, devMemRMAX[i].y);
    r_max.z = std::max(r_max.z, devMemRMAX[i].z);    
  }
  
  
  LOG("Found dust boundarys, number of particles %d : \n", tree.n_dust);
  LOG("min: %f\t%f\t%f\tmax: %f\t%f\t%f \n", r_min.x,r_min.y,r_min.z,r_max.x,r_max.y,r_max.z);

  //Compute the cubic domain enclosing the dust, needed to get the PH key.
  //The edge length is the largest extent, enlarged by 0.1%.
  real size     = 1.001f*std::max(r_max.z - r_min.z,
                         std::max(r_max.y - r_min.y, r_max.x - r_min.x));
  
  //Lower corner of the cube, centred on the bounding box
  real4 corner    = make_real4(0.5f*(r_min.x + r_max.x) - 0.5f*size,
                               0.5f*(r_min.y + r_max.y) - 0.5f*size,
                               0.5f*(r_min.z + r_max.z) - 0.5f*size, size); 
       
  //Edge length divided by 2^MAXLEVELS; passed to the key kernel in corner.w
  real domain_fac   = size/(1 << MAXLEVELS);
  
  corner.w = domain_fac;  
  

  //Compute the keys
  my_dev::dev_mem<uint4>  srcValues(devContext);
    
  //The generalBuffer1 has size uint*4*N*3
  //this buffer gets part: 0-uint*4*N
  srcValues.cmalloc_copy(tree.generalBuffer1, tree.n_dust, 0);
  
  //Compute the keys directly into srcValues   
  build_key_list.set_arg<cl_mem>(0,   srcValues.p());
  build_key_list.set_arg<cl_mem>(1,   tree.dust_pos.p());
  build_key_list.set_arg<int>(2,      &tree.n_dust);
  build_key_list.set_arg<real4>(3,    &corner);
  build_key_list.setWork(tree.n_dust, 128); //128 threads per block
  build_key_list.execute(execStream->s());  
  
  // srcValues is passed both as source and as work buffer, so its contents
  // are overwritten by the sort; the sorted result goes to tree.dust_key.
  gpuSort(devContext, srcValues, tree.dust_key,srcValues, tree.n_dust, 32, 3, tree);


  
  //Sort the relevant properties. Note we can optimize this 
  //further as done with the normal particles

  my_dev::dev_mem<real4>  real4Buffer1(devContext);
  my_dev::dev_mem<real4>  real4Buffer2(devContext);
  my_dev::dev_mem<real4>  real4Buffer3(devContext);
  
  //Carve three consecutive real4 destination buffers out of generalBuffer1;
  //each call returns the offset at which the next buffer starts.
  int memOffset1 = real4Buffer1.cmalloc_copy(tree.generalBuffer1,tree.n_dust, 0);
      memOffset1 = real4Buffer2.cmalloc_copy(tree.generalBuffer1,tree.n_dust, memOffset1);
      memOffset1 = real4Buffer3.cmalloc_copy(tree.generalBuffer1,tree.n_dust, memOffset1);
  
  dataReorderCombined.set_arg<int>(0,      &tree.n_dust);
  dataReorderCombined.set_arg<cl_mem>(1,   tree.dust_key.p());  
  dataReorderCombined.setWork(tree.n_dust, 512);   
  
  
  //Position, velocity and acc0: (source, destination) argument pairs
  dataReorderCombined.set_arg<cl_mem>(2,   tree.dust_pos.p());
  dataReorderCombined.set_arg<cl_mem>(3,   real4Buffer1.p()); 
  dataReorderCombined.set_arg<cl_mem>(4,   tree.dust_vel.p()); 
  dataReorderCombined.set_arg<cl_mem>(5,   real4Buffer2.p()); 
  dataReorderCombined.set_arg<cl_mem>(6,   tree.dust_acc0.p()); 
  dataReorderCombined.set_arg<cl_mem>(7,   real4Buffer3.p()); 
  dataReorderCombined.execute(execStream->s());
  //Copy the reordered data back over the original arrays
  tree.dust_pos.copy(real4Buffer1,  tree.n_dust);
  tree.dust_vel.copy(real4Buffer2,  tree.n_dust);
  tree.dust_acc0.copy(real4Buffer3, tree.n_dust);
  

  my_dev::dev_mem<int>     intBuffer(devContext);
  my_dev::dev_mem<float2>  float2Buffer(devContext);
  my_dev::dev_mem<int>     sortPermutation(devContext);
  
  //Re-partition generalBuffer1 for the second reorder pass.
  //NOTE(review): intBuffer is set up here but not used below -- confirm
  //whether it can be dropped.
  memOffset1 = float2Buffer.cmalloc_copy   (tree.generalBuffer1,tree.n_dust, 0);
  memOffset1 = sortPermutation.cmalloc_copy(tree.generalBuffer1,tree.n_dust, memOffset1);
  memOffset1 = intBuffer.cmalloc_copy      (tree.generalBuffer1,tree.n_dust, memOffset1);
  
  
  dataReorderF2.set_arg<int>(0,      &tree.n_dust);
  dataReorderF2.set_arg<cl_mem>(1,   tree.dust_key.p());    
  dataReorderF2.set_arg<cl_mem>(2,   float2Buffer.p()); //Place holder, dust has no time
  dataReorderF2.set_arg<cl_mem>(3,   float2Buffer.p()); //Reuse as destination1
  dataReorderF2.set_arg<cl_mem>(4,   tree.dust_ids.p()); 
  dataReorderF2.set_arg<cl_mem>(5,   sortPermutation.p()); //Reuse as destination2  
  dataReorderF2.setWork(tree.n_dust, 512);   
  dataReorderF2.execute(execStream->s());

  //sortPermutation now holds the reordered ids; copy them back
  tree.dust_ids.copy(sortPermutation, sortPermutation.get_size());  
  
  
  devContext.stopTiming("DustSortReorder", -1, execStream->s());  
  
}
Beispiel #2
0
// Computes the sort order of n_bodies particles.
//
// The bodies are expected to reside on the device already. Keys are computed
// from bodies_pos, converted to the uint4 layout gpuSort works on, sorted,
// and finally split into the sorted keys and the permutation, which is
// written to sortPermutation. Three uint4 work arrays are carved out of
// tree.generalBuffer1, whose previous contents are therefore overwritten.
void octree::sort_bodies(tree_structure &tree, my_dev::dev_mem<float4>  &bodies_pos, 
                         my_dev::dev_mem<uint>  &sortPermutation, int n_bodies) {

  devContext.startTiming();

  this->allocateParticleSpecificBuffers(n_bodies);

  // Layout of the three work arrays inside generalBuffer1 (the buffer has
  // size uint*4*N*3): each array starts 4*n_bodies elements after the
  // previous one, plus an alignment correction.
  const int sortedStart = 4*n_bodies;
  const int keyStart    = 8*n_bodies;

  const int inputAlign  = getAllignmentOffset(0);
  const int sortedAlign = getAllignmentOffset(sortedStart);
  const int keyAlign    = sortedAlign + getAllignmentOffset(keyStart + sortedAlign);

  // gpuSort is written for uint4 keys (so the tree can be extended to 96bit
  // keys); the 64bit keys are widened before sorting and narrowed afterwards.

  // First third of generalBuffer1: unsorted uint4 keys (sort input)
  my_dev::dev_mem<uint4>  keysIn(devContext);
  keysIn.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                      tree.generalBuffer1.get_flags(),
                      tree.generalBuffer1.get_devMem(),
                      &tree.generalBuffer1[0], 0,
                      n_bodies, inputAlign);

  // Second third: sorted uint4 keys (sort output)
  my_dev::dev_mem<uint4>  keysSorted(devContext);
  keysSorted.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                          tree.generalBuffer1.get_flags(),
                          tree.generalBuffer1.get_devMem(),
                          &tree.generalBuffer1[sortedStart], sortedStart,
                          n_bodies, sortedAlign);

  // Last third: the keys as produced by compute_keys
  my_dev::dev_mem<uint4>  rawKeys(devContext);
  rawKeys.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                       tree.generalBuffer1.get_flags(),
                       tree.generalBuffer1.get_devMem(),
                       &tree.generalBuffer1[keyStart], keyStart,
                       n_bodies, keyAlign);

  // Key computation is a separate routine because keys are also computed
  // before building the tree-structure. The domain corner and scale factor
  // handed to it are not kept: sorting can be used without building a tree.
  float4 corner;
  float domain_fac;
  compute_keys(bodies_pos, rawKeys, n_bodies, corner, domain_fac);

  // Widen the keys into the sort input buffer
  convertKey64to96.set_arg<cl_mem>(0,   rawKeys.p());
  convertKey64to96.set_arg<cl_mem>(1,   keysIn.p());
  convertKey64to96.set_arg<int>(2,      &n_bodies);
  convertKey64to96.setWork(n_bodies, 256);
  convertKey64to96.execute();

  // Sort the keys. The source (2nd argument) doubles as the work buffer
  // (4th argument), so the unsorted keys are overwritten; passing distinct
  // buffers would preserve them.
  gpuSort(devContext, keysIn, keysSorted, keysIn, n_bodies, 32, 3, tree);

  // Split the sorted result into the sorted keys and the permutation that is
  // required to sort the other properties of the particles
  extractKeyAndPerm.set_arg<cl_mem>(0,   keysSorted.p());
  extractKeyAndPerm.set_arg<cl_mem>(1,   rawKeys.p());
  extractKeyAndPerm.set_arg<cl_mem>(2,   sortPermutation.p());
  extractKeyAndPerm.set_arg<int>(3,      &n_bodies);
  extractKeyAndPerm.setWork(n_bodies, 256);
  extractKeyAndPerm.execute();

  devContext.stopTiming("Sorting", 0);
}