コード例 #1
0
ファイル: sort_bodies_gpu.cpp プロジェクト: Ingwar/amuse
// Determines the domain corner / scale for the given bodies (via getCorner)
// and then launches the build_key_list kernel to fill bodies_key.
//
//  bodies_pos : device buffer with the body positions (kernel input)
//  bodies_key : device buffer receiving the generated keys (kernel output)
//  n_bodies   : number of bodies to process
//  corner     : out, filled by getCorner and forwarded to the kernel
//  domain_fac : out, filled by getCorner (not passed to the kernel here)
void octree::compute_keys(my_dev::dev_mem<float4>  &bodies_pos, 
                          my_dev::dev_mem<uint4>  &bodies_key, int n_bodies,
                          float4 &corner, float &domain_fac)
{
  // The corner must be known before the kernel arguments are bound.
  getCorner(bodies_pos, n_bodies, corner, domain_fac);

  // Bind the kernel arguments in positional order.
  int argIdx = 0;
  build_key_list.set_arg<cl_mem>(argIdx++, bodies_key.p());
  build_key_list.set_arg<cl_mem>(argIdx++, bodies_pos.p());
  build_key_list.set_arg<int>   (argIdx++, &n_bodies);
  build_key_list.set_arg<real4> (argIdx++, &corner);

  // One work item per body, 128 threads per block.
  const int threadsPerBlock = 128;
  build_key_list.setWork(n_bodies, threadsPerBlock);
  build_key_list.execute();
}
コード例 #2
0
ファイル: sort_bodies_gpu.cpp プロジェクト: Ingwar/amuse
// Computes the axis-aligned bounding box of the first n_bodies positions.
//
// The boundaryReduction kernel is launched with NTHREAD_BOUNDARY threads and
// NBLOCK_BOUNDARY blocks and writes partial minima / maxima into devMemRMIN /
// devMemRMAX (presumably one entry per block). Those partial results are
// copied to the host and combined here.
//
//  bodies_pos : device buffer with the body positions
//  n_bodies   : number of bodies to include
//  r_min      : out, component-wise minimum (x, y, z); w is left at +1e10
//  r_max      : out, component-wise maximum (x, y, z); w is left at -1e10
//
// devMemRMIN and devMemRMAX must be allocated and initialized by the caller
// side of this class before this function is used.
void octree::getBoundaries(my_dev::dev_mem<float4>  &bodies_pos, int n_bodies, real4 &r_min, real4 &r_max)
{

  //Start reduction to get the boundaries of the system
  boundaryReduction.set_arg<int>(0, &n_bodies);
  boundaryReduction.set_arg<cl_mem>(1, bodies_pos.p());
  boundaryReduction.set_arg<cl_mem>(2, devMemRMIN.p());
  boundaryReduction.set_arg<cl_mem>(3, devMemRMAX.p());

  boundaryReduction.setWork(n_bodies, NTHREAD_BOUNDARY, NBLOCK_BOUNDARY);
  boundaryReduction.execute();

  //Fetch the partial results from the device
  devMemRMIN.d2h();
  devMemRMAX.d2h();

  //Neutral starting values for the min / max reduction
  r_min = (real4){+1e10, +1e10, +1e10, +1e10}; 
  r_max = (real4){-1e10, -1e10, -1e10, -1e10};   

  //Reduce the per-block results on the host: it is cheap and the result is
  //needed on the host anyway. The trip count must match the number of blocks
  //the kernel was launched with, so use NBLOCK_BOUNDARY instead of a
  //hard-coded literal.
  for (int i = 0; i < NBLOCK_BOUNDARY; i++) {    
    r_min.x = fmin(r_min.x, devMemRMIN[i].x);
    r_min.y = fmin(r_min.y, devMemRMIN[i].y);
    r_min.z = fmin(r_min.z, devMemRMIN[i].z);

    r_max.x = fmax(r_max.x, devMemRMAX[i].x);
    r_max.y = fmax(r_max.y, devMemRMAX[i].y);
    r_max.z = fmax(r_max.z, devMemRMAX[i].z);    
  }

  printf("Found boundarys, number of particles %d : \n", n_bodies);
  printf("min: %f\t%f\t%f\tmax: %f\t%f\t%f \n", r_min.x,r_min.y,r_min.z,r_max.x,r_max.y,r_max.z);
}
コード例 #3
0
ファイル: load_kernels.cpp プロジェクト: BerndDoser/Bonsai
// Sorts N 32-bit keys together with their values by running `numberOfBits`
// passes of three kernels (sortCount, exScanBlock, sortMove) on execStream,
// one pass per bit index 0 .. numberOfBits-1.
//
//  devContext                : not used inside this function
//  srcKeys / srcValues       : input of the first pass (bitIdx 0)
//  keysOutput / valuesOutput : destination of pass 0 and of every even pass
//  keysAPing / valuesAPing   : destination of every odd pass
//  N                         : number of key/value pairs
//  numberOfBits              : number of passes to run
//
// devMemCounts and devMemCountsx are class-owned scratch buffers that must be
// allocated before this function is called.
//
// NOTE(review): following the rebinding below, a pass with an odd bit index
// writes into keysAPing / valuesAPing. For an even numberOfBits the last pass
// therefore appears to land in the APing buffers rather than the *Output
// buffers - confirm that callers read the buffer that holds the final result.
void octree::gpuSort_32b(my_dev::context &devContext, 
                    my_dev::dev_mem<uint> &srcKeys,     my_dev::dev_mem<uint> &srcValues,
                    my_dev::dev_mem<int>  &keysOutput,  my_dev::dev_mem<uint> &keysAPing,
                    my_dev::dev_mem<uint> &valuesOutput,my_dev::dev_mem<uint> &valuesAPing,
                    int N, int numberOfBits)
{

  // Bit index handled by the current pass; rebound to the kernels per pass.
  int bitIdx = 0;

  //Step 1, configure the count kernel
  //Scratch memory (devMemCounts / devMemCountsx) is allocated outside this function

  // Work distribution: chunks of 64 items (32*2, two items per lane) spread
  // over 480 (120*4) workers; the remainder is described by the extra* fields.
  setupParams sParam;
  sParam.jobs = (N / 64) / 480  ; //full chunks every worker gets
  sParam.blocksWithExtraJobs = (N / 64) % 480; //workers that get one more chunk
  sParam.extraElements = N % 64; //items that do not fill a whole chunk
  sParam.extraOffset = N - sParam.extraElements; //index where those leftover items start

  sortCount.set_arg<cl_mem>(0, srcKeys.p());
  sortCount.set_arg<cl_mem>(1, this->devMemCounts.p());
  sortCount.set_arg<uint>(2, &N);
  sortCount.set_arg<int>(3, NULL, 128);//shared/local memory argument (NULL pointer, size 128)
  sortCount.set_arg<setupParams>(4, &sParam);
  sortCount.set_arg<int>(5, &bitIdx);
  
  // 2D launch: 120 groups of 32x4 work items.
  vector<size_t> localWork(2), globalWork(2);
  globalWork[0] = 32*120;   globalWork[1] = 4;
  localWork [0] = 32;       localWork[1] = 4;
  sortCount.setWork(globalWork, localWork);

  ///////////////
  // Step 2, configure the scan kernel that runs over the 480 per-worker counts

  exScanBlock.set_arg<cl_mem>(0, this->devMemCounts.p());
  int blocks = 120*4;
  exScanBlock.set_arg<int>(1, &blocks);
  exScanBlock.set_arg<cl_mem>(2, this->devMemCountsx.p());
  exScanBlock.set_arg<int>(3, NULL, 512); //shared memory allocation

  // A single group of 512 work items.
  globalWork[0] = 512; globalWork[1] = 1;
  localWork [0] = 512; localWork [1] = 1;

  exScanBlock.setWork(globalWork, localWork);

  //////////////
  // Step 3, configure the move kernel (src keys/values -> dst keys/values)

  sortMove.set_arg<cl_mem>(0, srcKeys.p());
  sortMove.set_arg<cl_mem>(1, keysOutput.p());
  sortMove.set_arg<cl_mem>(2, srcValues.p());
  sortMove.set_arg<cl_mem>(3, valuesOutput.p());
  sortMove.set_arg<cl_mem>(4, this->devMemCounts.p());
  sortMove.set_arg<uint>(5, &N);
  sortMove.set_arg<uint>(6, NULL, 192); //Dynamic shared memory 128+64 , prefix sum buffer
  sortMove.set_arg<uint>(7, NULL, 64*4); //Dynamic shared memory stage buffer
  sortMove.set_arg<uint>(8, NULL, 64*4); //Dynamic shared memory stage_values buffer
  sortMove.set_arg<setupParams>(9, &sParam);
  sortMove.set_arg<int>(10, &bitIdx);

  // Same launch shape as sortCount.
  globalWork[0] = 120*32;  globalWork[1] = 4;
  localWork [0] = 32;      localWork [1] = 4;

  sortMove.setWork(globalWork, localWork);

  // false: the next pass reads *Output and writes *APing; true: the reverse.
  bool pingPong = false;

  //Execute bitIdx 0: src* -> *Output

  sortCount.execute(execStream->s());
  exScanBlock.execute(execStream->s());
  sortMove.execute(execStream->s());  

  //Swap buffers: from now on the original src buffers are no longer read,
  //the next pass goes *Output -> *APing
  sortCount.set_arg<cl_mem>(0, keysOutput.p());
  sortMove.set_arg<cl_mem>(0, keysOutput.p());
  sortMove.set_arg<cl_mem>(1, keysAPing.p());
  sortMove.set_arg<cl_mem>(2, valuesOutput.p());
  sortMove.set_arg<cl_mem>(3, valuesAPing.p());

  //Remaining bits, ping ponging buffers
  for(int i=1; i < numberOfBits; i++)
  {
    // Rebind the bit index for this pass.
    bitIdx = i;
    sortCount.set_arg<int>(5, &bitIdx);
    sortMove.set_arg<int>(10, &bitIdx);

    sortCount.execute(execStream->s());
    exScanBlock.execute(execStream->s()); 
    
    sortMove.execute(execStream->s());

    //Switch buffers for the NEXT pass (source and destination trade places)
    if(pingPong)
    {
      // Next pass: *Output -> *APing
      sortCount.set_arg<cl_mem>(0, keysOutput.p());

      sortMove.set_arg<cl_mem>(0, keysOutput.p());
      sortMove.set_arg<cl_mem>(1, keysAPing.p());

      sortMove.set_arg<cl_mem>(2, valuesOutput.p());
      sortMove.set_arg<cl_mem>(3, valuesAPing.p());

      pingPong = false;
    }
    else
    {
      // Next pass: *APing -> *Output
      sortCount.set_arg<cl_mem>(0, keysAPing.p());

      sortMove.set_arg<cl_mem>(0, keysAPing.p());
      sortMove.set_arg<cl_mem>(1, keysOutput.p());

      sortMove.set_arg<cl_mem>(2, valuesAPing.p());
      sortMove.set_arg<cl_mem>(3, valuesOutput.p());

      pingPong = true;
    }
  }
 

}
コード例 #4
0
ファイル: load_kernels.cpp プロジェクト: BerndDoser/Bonsai
// Sorts N uint4 elements from srcValues into output. The implementation is
// chosen at compile time:
//   USE_B40C                    : delegates to sorter->sort
//   USE_THRUST + USE_THRUST_96  : delegates to thrust_sort_96b
//   otherwise                   : up to three successive 32-bit key sorts
//                                 (thrust_sort_32b if USE_THRUST is defined,
//                                 gpuSort_32b otherwise), one per sub item
//
//  devContext   : device context used to create the temporary buffers
//  srcValues    : input elements
//  output       : receives the sorted elements
//  buffer       : intermediate storage for the 2nd pass of the default path
//  N            : number of elements
//  numberOfBits : not referenced in this function (the 32-bit sorts are
//                 always called with 32)
//  subItems     : number of 32-bit sub keys to sort on (1, 2 or 3)
//  tree         : supplies generalBuffer1, from which temporaries are carved
//
// If srcValues and buffer are different, then the original values
// are preserved, if they are the same srcValues will be overwritten
void  octree::gpuSort(my_dev::context &devContext,
                      my_dev::dev_mem<uint4> &srcValues,
                      my_dev::dev_mem<uint4> &output,
                      my_dev::dev_mem<uint4> &buffer,
                      int N, int numberOfBits, int subItems,
                      tree_structure &tree) {

#if defined (USE_B40C)
  sorter->sort(srcValues, output, N);

#elif defined(USE_THRUST) && defined(USE_THRUST_96)
  //Extra buffer values
  my_dev::dev_mem<uint> permutation(devContext);   // Permutation values, for sorting the int4 data
  my_dev::dev_mem<uint> temp_buffer(devContext);  // temporary uint buffer
  
  //Permutation has to be allocated after the two previous
  //allocated buffers, get the right offset
  //(8*N skips the first 8*N elements of generalBuffer1, plus alignment padding)
  int memOffset  = permutation.getGlobalMemAllignmentPadding(8*N);
      memOffset += 8*N; 

      // Each cmalloc_copy takes the current offset and returns the next one.
      memOffset = permutation.cmalloc_copy(tree.generalBuffer1, N, memOffset);
      memOffset = temp_buffer.cmalloc_copy(tree.generalBuffer1, N, memOffset);      
      
  thrust_sort_96b(srcValues, output, temp_buffer, permutation, N);
  
#else
  //Extra buffer values
  my_dev::dev_mem<uint> simpleKeys(devContext);    //Int keys,
  my_dev::dev_mem<uint> permutation(devContext);   //Permutation values, for sorting the int4 data
  my_dev::dev_mem<int>  output32b(devContext);       //Key output buffer handed to the 32-bit sort
  my_dev::dev_mem<uint> valuesOutput(devContext);  //Buffers for the values which are the indexes
  
  //These temporaries have to be allocated after the two previous
  //allocated buffers, get the right offset
  //(8*N skips the first 8*N elements of generalBuffer1, plus alignment padding)
  int memOffset = simpleKeys.getGlobalMemAllignmentPadding(8*N);
      memOffset += 8*N; 
      // Each cmalloc_copy takes the current offset and returns the next one.
      memOffset = simpleKeys.cmalloc_copy(tree.generalBuffer1, N, memOffset);
      memOffset = permutation.cmalloc_copy(tree.generalBuffer1, N, memOffset);   
      memOffset = output32b.cmalloc_copy(tree.generalBuffer1, N, memOffset); 
      memOffset = valuesOutput.cmalloc_copy(tree.generalBuffer1, N, memOffset); 
  
    
  //Dimensions for the kernels that shuffle and extract data
  const int blockSize = 256;
  
  extractInt.setWork(N, blockSize); 
  reOrderKeysValues.setWork(N, blockSize); 

  //Idx depends on subitems, z goes first, x last if subitems = 3
  //subitems = 3, than idx=2
  //subitems = 2, than idx=1
  //subitems = 1, than idx=0
  //intIdx = subItems-1   
  int intIdx = subItems-1;

  //Extracts a 32bit key and fills a sequence
  extractInt.set_arg<cl_mem>(0, srcValues.p());
  extractInt.set_arg<cl_mem>(1, simpleKeys.p());
  extractInt.set_arg<cl_mem>(2, permutation.p());
  extractInt.set_arg<uint>(3, &N);
  extractInt.set_arg<int>(4, &intIdx);//integer idx


  // Reorder kernel: srcValues -> output, driven by valuesOutput.
  reOrderKeysValues.set_arg<cl_mem>(0, srcValues.p());
  reOrderKeysValues.set_arg<cl_mem>(1, output.p());
  reOrderKeysValues.set_arg<cl_mem>(2, valuesOutput.p());
  reOrderKeysValues.set_arg<uint>(3, &N);

  extractInt.execute(execStream->s());
  
  #ifdef USE_THRUST
  
  thrust_sort_32b(devContext, 
                   simpleKeys, permutation,
                   output32b, simpleKeys,
                   valuesOutput,permutation,
                   N, 32);
  
  #else
    //Now sort the first 32bit keys
    //Using 32bit sort with key and value separated
    //(simpleKeys / permutation double as the ping buffers)
    gpuSort_32b(devContext, 
                    simpleKeys, permutation,
                    output32b, simpleKeys,
                    valuesOutput,permutation,
                    N, 32);
  #endif  

    
  //Now reorder the main keys
  //Use output as the new output/src value thing buffer
  reOrderKeysValues.execute(execStream->s());
  
  if(subItems == 1)
  {
    //Only doing one 32bit sort. Data is already in output so done
    return;
  }


  //2nd set of 32bit keys
  //Idx depends on subitems, z goes first, x last if subitems = 3  
  //subitems = 3, than idx=1
  //subitems = 2, than idx=0
  //subitems = 1, completed previous round
  //intIdx = subItems-2   
  intIdx = subItems-2;
  
  // The second pass reads the result of the first pass.
  extractInt.set_arg<cl_mem>(0, output.p());
  extractInt.set_arg<int>(4, &intIdx);//integer idx
  extractInt.execute(execStream->s());

  #ifdef USE_THRUST
  
    thrust_sort_32b(devContext, 
                    simpleKeys, permutation,
                    output32b, simpleKeys,
                    valuesOutput,permutation,
                    N, 32);
  
  #else
    //Now sort the 2nd 32bit keys
    //Using 32bit sort with key and value separated
    gpuSort_32b(devContext, 
                    simpleKeys, permutation,
                    output32b, simpleKeys,
                    valuesOutput,permutation,
                    N, 32);
  #endif   

  // Reorder output -> buffer (this overwrites srcValues if buffer aliases it).
  reOrderKeysValues.set_arg<cl_mem>(0, output.p());
  reOrderKeysValues.set_arg<cl_mem>(1, buffer.p());
  reOrderKeysValues.execute(execStream->s());

  if(subItems == 2)
  {
    //Doing two 32bit sorts. Data is in buffer
    //so move the data from buffer to output    
    output.copy(buffer, buffer.get_size());    
    return;
  }

  //3rd set of 32bit keys
  //Idx depends on subitems, z goes first, x last if subitems = 3  
  //subitems = 3, than idx=0
  //subitems = 2, completed previous round
  //subitems = 1, completed previous round
  //intIdx is always 0 for the last pass
  intIdx = 0;
 
  // The third pass reads the result of the second pass.
  extractInt.set_arg<cl_mem>(0, buffer.p());
  extractInt.set_arg<int>(4, &intIdx);//integer idx
  extractInt.execute(execStream->s());


  //Now sort the final set of 32bit keys
  #ifdef USE_THRUST  
    thrust_sort_32b(devContext, 
                    simpleKeys, permutation,
                    output32b, simpleKeys,
                    valuesOutput,permutation,
                    N, 32);
  
  #else
    gpuSort_32b(devContext, 
                    simpleKeys, permutation,
                    output32b, simpleKeys,
                    valuesOutput,permutation,
                    N, 32);
  #endif   
  
  // Final reorder buffer -> output.
  reOrderKeysValues.set_arg<cl_mem>(0, buffer.p());
  reOrderKeysValues.set_arg<cl_mem>(1, output.p());
  reOrderKeysValues.execute(execStream->s());  
#endif // USE_B40C / USE_THRUST_96 / default implementation
}
コード例 #5
0
ファイル: load_kernels.cpp プロジェクト: BerndDoser/Bonsai
//Partitions an array of integers into valid and invalid entries.
//An entry of srcValues counts as valid when it equals 1, anything else is
//invalid. The output array receives all valid entries first, followed by
//the invalid ones. When validCount is non-NULL the number of valid entries
//is copied back to the host and stored there; when it is NULL the count
//stays on the device (in devMemCountsx).
//
//devMemCounts and devMemCountsx are class-owned scratch buffers that have
//to be allocated before this function is called. devContext is not used.
void octree::gpuSplit(my_dev::context &devContext, 
                      my_dev::dev_mem<uint> &srcValues,
                      my_dev::dev_mem<uint> &output,                        
                      int N, 
                      int *validCount)  // if validCount NULL leave count on device
{
  // A previous reset of the counter buffer has to be finished first.
  this->devMemCountsx.waitForCopyEvent();

  // Work distribution: chunks of 64 items (32*2, two items per lane)
  // handed out to 480 (120*4) workers.
  const int itemsPerChunk = 64;
  const int workerCount   = 120*4;

  setupParams layout;
  layout.jobs                = (N / itemsPerChunk) / workerCount;
  layout.blocksWithExtraJobs = (N / itemsPerChunk) % workerCount; 
  layout.extraElements       = N % itemsPerChunk;
  layout.extraOffset         = N - layout.extraElements;

  // Launch geometry, reused for all three kernels.
  vector<size_t> groupDim(2), gridDim(2);

  // --- counting kernel ---
  int slot = 0;
  compactCount.set_arg<cl_mem>(slot++, srcValues.p());
  compactCount.set_arg<cl_mem>(slot++, this->devMemCounts.p());
  compactCount.set_arg<uint>(slot++, &N);
  compactCount.set_arg<int>(slot++, NULL, 128);
  compactCount.set_arg<setupParams>(slot++, &layout);
  compactCount.set_arg<cl_mem>(slot++, this->devMemCountsx.p());

  gridDim [0] = 32*120;   gridDim [1] = 4;
  groupDim[0] = 32;       groupDim[1] = 4;
  compactCount.setWork(gridDim, groupDim);

  // --- scan over the per-worker counts ---
  int scanBlocks = 120*4;
  slot = 0;
  exScanBlock.set_arg<cl_mem>(slot++, this->devMemCounts.p());
  exScanBlock.set_arg<int>(slot++, &scanBlocks);
  exScanBlock.set_arg<cl_mem>(slot++, this->devMemCountsx.p());
  exScanBlock.set_arg<int>(slot++, NULL, 512); //shared memory allocation

  gridDim [0] = 512; gridDim [1] = 1;
  groupDim[0] = 512; groupDim[1] = 1;
  exScanBlock.setWork(gridDim, groupDim);

  // --- move kernel that writes the partitioned output ---
  slot = 0;
  splitMove.set_arg<cl_mem>(slot++, srcValues.p());
  splitMove.set_arg<cl_mem>(slot++, output.p());
  splitMove.set_arg<cl_mem>(slot++, this->devMemCounts.p());
  splitMove.set_arg<uint>(slot++, &N);
  splitMove.set_arg<uint>(slot++, NULL, 192); //Dynamic shared memory
  splitMove.set_arg<setupParams>(slot++, &layout);

  gridDim [0] = 120*32;  gridDim [1] = 4;
  groupDim[0] = 32;      groupDim[1] = 4;
  splitMove.setWork(gridDim, groupDim);

  // Run the three stages back to back on the execution stream.
  compactCount.execute(execStream->s());
  exScanBlock.execute(execStream->s());
  splitMove.execute(execStream->s());

  // Without a destination the count is left on the device.
  if (!validCount)
    return;

  this->devMemCountsx.d2h();
  *validCount = this->devMemCountsx[0];
}
コード例 #6
0
// Binds all arguments of the approxGrav kernel and launches it on execStream.
// The launch is not followed by a sync in the enabled code path, so results
// in i_bodies_acc / i_bodies_ds2 / i_bodies_ngb must not be assumed complete
// on return unless the caller synchronizes (the only sync here is inside a
// disabled #if 0 region).
//
//  tree          : supplies n_groups, level_list, group_list, multipole,
//                  boxSizeInfo, boxCenterInfo, activePartList, interactions
//                  and generalBuffer1 (scratch used instead of local memory)
//  j_bodies_*    : bodies that are part of the tree structure
//  i_bodies_*    : bodies that are part of the groups
//  n_groupBodies : only referenced inside disabled (#if 0) debug code
//  i_bodies_acc, i_bodies_ds2, i_bodies_ngb : kernel output buffers
//                  (presumably acceleration, squared neighbour distance and
//                  neighbour index - confirm against the kernel source)
void octree::approximate_gravity(tree_structure &tree,
                                 my_dev::dev_mem<float4>  &j_bodies_pos,  //Bodies that are part of the tree-structure
                                 my_dev::dev_mem<float4>  &j_bodies_h,
                                 my_dev::dev_mem<int>     &j_bodies_idx,
                                 my_dev::dev_mem<float4>  &i_bodies_pos,  //Bodies that are part of the groups
                                 my_dev::dev_mem<float4>  &i_bodies_h,
                                 my_dev::dev_mem<int>     &i_bodies_idx,
                                 int n_groupBodies,                     //Number of bodies that are part of the groups
                                 my_dev::dev_mem<real4>  &i_bodies_acc,
                                 my_dev::dev_mem<real>   &i_bodies_ds2,
                                 my_dev::dev_mem<int>    &i_bodies_ngb)
{ 
#if 1
  // Node range of tree level 2, handed to the kernel as its starting range.
  uint2 node_begend;
  int level_start = 2;
  node_begend.x   = tree.level_list[level_start].x;
  node_begend.y   = tree.level_list[level_start].y;

  //Reset the active particles
  tree.activePartList.zeroMem();
  this->atomicValues.zeroMem();


  // Local copy so that a pointer to it can be passed to set_arg.
  float eps2 = this->eps2;


  //Set the kernel parameters, many!
  //The order below is positional and has to match the kernel signature.
  int argIdx = 0;
  approxGrav.set_arg<int>(argIdx++,    &tree.n_groups);
  approxGrav.set_arg<float>(argIdx++,  &eps2);
  approxGrav.set_arg<uint2>(argIdx++,  &node_begend);
  approxGrav.set_arg<cl_mem>(argIdx++, this->atomicValues.p());  
  approxGrav.set_arg<cl_mem>(argIdx++, j_bodies_pos.p());
  approxGrav.set_arg<cl_mem>(argIdx++, j_bodies_h  .p());
  approxGrav.set_arg<cl_mem>(argIdx++, i_bodies_acc.p());
  approxGrav.set_arg<cl_mem>(argIdx++, i_bodies_pos.p());
  approxGrav.set_arg<cl_mem>(argIdx++, i_bodies_h  .p());
  approxGrav.set_arg<cl_mem>(argIdx++, i_bodies_ds2.p());
  approxGrav.set_arg<cl_mem>(argIdx++, i_bodies_ngb.p());
  approxGrav.set_arg<cl_mem>(argIdx++, tree.activePartList.p());
  approxGrav.set_arg<cl_mem>(argIdx++, tree.interactions.p());
  approxGrav.set_arg<cl_mem>(argIdx++, tree.group_list.p());  
  approxGrav.set_arg<cl_mem>(argIdx++, tree.multipole.p());
  approxGrav.set_arg<cl_mem>(argIdx++, tree.boxSizeInfo.p());
  approxGrav.set_arg<cl_mem>(argIdx++, tree.boxCenterInfo.p());
  approxGrav.set_arg<cl_mem>(argIdx++, tree.generalBuffer1.p()); //Instead of using Local memory
  
#if 1
  approxGrav.set_arg<cl_mem>(argIdx++, j_bodies_idx.p()); //Particle IDs, j bodies (the tree-particles)
  approxGrav.set_arg<cl_mem>(argIdx++, i_bodies_idx.p()); //Particle IDs, i bodies (the group particles)
#endif 
  
  // Named bindings (presumably textures, judging by the "tex*" names) for
  // buffers that were also passed as plain arguments above.
  approxGrav.set_arg<real4>(argIdx++, tree.boxSizeInfo, 4, "texNodeSize");
  approxGrav.set_arg<real4>(argIdx++, tree.boxCenterInfo, 4, "texNodeCenter");
  approxGrav.set_arg<real4>(argIdx++, tree.multipole, 4, "texMultipole");
  approxGrav.set_arg<real4>(argIdx++, j_bodies_pos, 4, "texBody");
  approxGrav.setWork(-1, NTHREAD, nBlocksForTreeWalk);
//   approxGrav.setWork(-1, NTHREAD, 1);    

#if 0
  devContext.startTiming();
#endif
  approxGrav.execute(execStream->s());  //First half

#if 0
  execStream->sync();
  devContext.stopTiming("Gravity", 1);
#endif

  //Print interaction statistics (disabled debug code)
  #if 0
  tree.n = n_groupBodies;
  
  tree.body2group_list.d2h();
  tree.interactions.d2h();
    long long directSum = 0;
    long long apprSum = 0;
    long long directSum2 = 0;
    long long apprSum2 = 0;
    
    
    int maxDir = -1;
    int maxAppr = -1;

    for(int i=0; i < tree.n; i++)
    {
      apprSum     += tree.interactions[i].x;
      directSum   += tree.interactions[i].y;
      
      maxAppr = max(maxAppr,tree.interactions[i].x);
      maxDir  = max(maxDir,tree.interactions[i].y);
      
      apprSum2     += tree.interactions[i].x*tree.interactions[i].x;
      directSum2   += tree.interactions[i].y*tree.interactions[i].y;      
    }
  
    //cerr << "Interaction at iter: " << iter << "\tdirect: " << directSum << "\tappr: " << apprSum << "\t";
    //cerr << "avg dir: " << directSum / tree.n << "\tavg appr: " << apprSum / tree.n << endl;

    cout << "Interaction at (rank= " << 0 << " ) iter: " << 0 << "\tdirect: " << directSum << "\tappr: " << apprSum << "\t";
    cout << "avg dir: " << directSum / tree.n << "\tavg appr: " << apprSum / tree.n << "\tMaxdir: " << maxDir << "\tmaxAppr: " << maxAppr <<  endl;
    cout << "sigma dir: " << sqrt((directSum2  - directSum)/ tree.n) << "\tsigma appr: " << std::sqrt((apprSum2 - apprSum) / tree.n)  <<  endl;    

      #if 0
      //Histogram of number of interactions
      //NOTE(review): histoIDX has bins+1 entries but only the first `bins`
      //are zeroed and printed, while idx is clamped to `bins`; the first
      //fprintf also passes one argument more than its format consumes.
      //Fix before enabling this block.
      const int bins = 256;
      const int jump = 15;
      int histoIDX[bins+1];
      for(int i=0; i < bins; i++)
        histoIDX[i] = 0;
      
      for(int i=0; i < tree.n; i++)
      {
          int idx = tree.interactions[i].x / jump;
          if(idx >= bins)
            idx = bins;
          histoIDX[idx]++;  
      }
      for(int i=0; i < bins; i++)
      {
        if(histoIDX[i] == 0)
          fprintf(stderr, "HISTO %d\t-\n", i*jump, histoIDX[i]);
        else
          fprintf(stderr, "HISTO %d\t%d\n", i*jump, histoIDX[i]);
      }     
    #endif
  #endif
  
  // Disabled debug dump of the per-body results.
  #if 0
    i_bodies_ngb.d2h();
    i_bodies_acc.d2h();
    i_bodies_ds2.d2h();
    
    for(int i=0; i < n_groupBodies; i++)
    {
        fprintf(stderr, "%d\t Acc: %f %f %f %f  \t Ds2: %f \t NGB: %d \n", 
                i, i_bodies_acc[i].x, i_bodies_acc[i].y,
                i_bodies_acc[i].z, i_bodies_acc[i].w,
                i_bodies_ds2[i],
                i_bodies_ngb[i]);
    }
  #endif
  
  
  
  
  
/*
    //Reduce the number of valid particles    
    getNActive.set_arg<int>(0,    &tree.n);
    getNActive.set_arg<cl_mem>(1, tree.activePartlist.p());
    getNActive.set_arg<cl_mem>(2, this->nactive.p());
    getNActive.set_arg<int>(3,    NULL, 128); //Dynamic shared memory , equal to number of threads
    getNActive.setWork(-1, 128,   NBLOCK_REDUCE);
    
    CU_SAFE_CALL(cuCtxSynchronize()); //Synchronize all streams, makes sure that the approx stream is finished
    getNActive.execute();
    
    //Reduce the last parts on the host
    this->nactive.d2h();
    tree.n_active_particles = this->nactive[0];
    for (int i = 1; i < NBLOCK_REDUCE ; i++)
        tree.n_active_particles += this->nactive[i];

    printf("Active particles: %d \n", tree.n_active_particles);
    */

  // Report the current memory usage (runs on every call).
  my_dev::base_mem::printMemUsage();
#endif
}
コード例 #7
0
// Binds all arguments of the getNGB kernel, launches it on execStream and
// waits for it: execStream->sync() is called before the timer is stopped,
// so the kernel has finished when this function returns.
//
//  tree         : supplies n_groups, level_list, group_list, boxSizeInfo,
//                 boxCenterInfo, activePartList, interactions and
//                 generalBuffer1 (scratch used instead of local memory)
//  j_bodies_*   : bodies that are part of the tree structure
//  i_bodies_*   : bodies that are part of the groups
//  i_bodies_ngb : kernel output buffer (presumably the neighbour index per
//                 group body - confirm against the kernel source)
void octree::get_ngb(tree_structure &tree,
                      my_dev::dev_mem<float4>  &j_bodies_pos,  //Bodies that are part of the tree-structure
                      my_dev::dev_mem<int>     &j_bodies_idx,
                      my_dev::dev_mem<float4>  &i_bodies_pos,  //Bodies that are part of the groups
                      my_dev::dev_mem<int>     &i_bodies_idx,
                      my_dev::dev_mem<int>    &i_bodies_ngb
){ 
  // Node range of tree level 2, handed to the kernel as its starting range.
  uint2 node_begend;
  int level_start = 2;
  node_begend.x   = tree.level_list[level_start].x;
  node_begend.y   = tree.level_list[level_start].y;

  //Reset the active particles
  tree.activePartList.zeroMem();
  this->atomicValues.zeroMem();

  //Set the kernel parameters, many!
  //The order below is positional and has to match the kernel signature.
  int argIdx = 0;
  getNGB.set_arg<int>(argIdx++,    &tree.n_groups);
  getNGB.set_arg<uint2>(argIdx++,  &node_begend);
  getNGB.set_arg<cl_mem>(argIdx++, this->atomicValues.p());  
  getNGB.set_arg<cl_mem>(argIdx++, j_bodies_pos.p());
  getNGB.set_arg<cl_mem>(argIdx++, i_bodies_pos.p());
  getNGB.set_arg<cl_mem>(argIdx++, i_bodies_ngb.p());
  getNGB.set_arg<cl_mem>(argIdx++, tree.activePartList.p());
  getNGB.set_arg<cl_mem>(argIdx++, tree.interactions.p());
  getNGB.set_arg<cl_mem>(argIdx++, tree.group_list.p());  

  getNGB.set_arg<cl_mem>(argIdx++, tree.boxSizeInfo.p());
  getNGB.set_arg<cl_mem>(argIdx++, tree.boxCenterInfo.p());

  getNGB.set_arg<cl_mem>(argIdx++, j_bodies_idx.p()); //Particle IDs, j bodies (the tree-particles)
  getNGB.set_arg<cl_mem>(argIdx++, i_bodies_idx.p()); //Particle IDs, i bodies (the group particles)

  getNGB.set_arg<cl_mem>(argIdx++, tree.generalBuffer1.p()); //Instead of using Local memory
  
  
  // Named bindings (presumably textures, judging by the "tex*" names) for
  // buffers that were also passed as plain arguments above.
  getNGB.set_arg<real4>(argIdx++, tree.boxSizeInfo, 4, "texNodeSize");
  getNGB.set_arg<real4>(argIdx++, tree.boxCenterInfo, 4, "texNodeCenter");
  getNGB.set_arg<real4>(argIdx++, j_bodies_pos, 4, "texBody");
  getNGB.setWork(-1, NTHREAD, nBlocksForTreeWalk);


  // Time the kernel; the sync makes the measurement cover the whole run.
  devContext.startTiming();
  getNGB.execute(execStream->s());  //First half

  execStream->sync();
  devContext.stopTiming("GetNGB", 1);

  //Print interaction statistics (disabled debug code)
  //NOTE(review): the disabled blocks below use n_groupBodies, i_bodies_acc
  //and i_bodies_ds2, none of which is a parameter or local of this function;
  //they look copied from approximate_gravity and will likely need adapting
  //before they can be enabled.
  #if 0
  tree.n = n_groupBodies;
  
  tree.body2group_list.d2h();
  tree.interactions.d2h();
    long long directSum = 0;
    long long apprSum = 0;
    long long directSum2 = 0;
    long long apprSum2 = 0;
    
    
    int maxDir = -1;
    int maxAppr = -1;

    for(int i=0; i < tree.n; i++)
    {
      apprSum     += tree.interactions[i].x;
      directSum   += tree.interactions[i].y;
      
      maxAppr = max(maxAppr,tree.interactions[i].x);
      maxDir  = max(maxDir,tree.interactions[i].y);
      
      apprSum2     += tree.interactions[i].x*tree.interactions[i].x;
      directSum2   += tree.interactions[i].y*tree.interactions[i].y;      
    }
  
    //cerr << "Interaction at iter: " << iter << "\tdirect: " << directSum << "\tappr: " << apprSum << "\t";
    //cerr << "avg dir: " << directSum / tree.n << "\tavg appr: " << apprSum / tree.n << endl;

    cout << "Interaction at (rank= " << 0 << " ) iter: " << 0 << "\tdirect: " << directSum << "\tappr: " << apprSum << "\t";
    cout << "avg dir: " << directSum / tree.n << "\tavg appr: " << apprSum / tree.n << "\tMaxdir: " << maxDir << "\tmaxAppr: " << maxAppr <<  endl;
    cout << "sigma dir: " << sqrt((directSum2  - directSum)/ tree.n) << "\tsigma appr: " << std::sqrt((apprSum2 - apprSum) / tree.n)  <<  endl;    

  
  #endif
  
  // Disabled debug dump of the per-body results.
  #if 0
    i_bodies_ngb.d2h();
    i_bodies_acc.d2h();
    i_bodies_ds2.d2h();
    
    for(int i=0; i < n_groupBodies; i++)
    {
        fprintf(stderr, "%d\t Acc: %f %f %f %f  \t Ds2: %f \t NGB: %d \n", 
                i, i_bodies_acc[i].x, i_bodies_acc[i].y,
                i_bodies_acc[i].z, i_bodies_acc[i].w,
                i_bodies_ds2[i],
                i_bodies_ngb[i]);
    }
  #endif
  
  

  // Report the current memory usage (runs on every call).
  my_dev::base_mem::printMemUsage();
}
コード例 #8
0
ファイル: sort_bodies_gpu.cpp プロジェクト: Ingwar/amuse
// Computes a key for every body, sorts the keys on the device and writes the
// permutation that brings the bodies into key order to sortPermutation.
//
//  tree            : supplies generalBuffer1, which is carved into three
//                    uint4 slices used as temporaries
//  bodies_pos      : positions of the bodies, assumed to be on the device
//  sortPermutation : out, permutation produced by extractKeyAndPerm
//  n_bodies        : number of bodies
//
// The whole routine is timed under the label "Sorting".
void octree::sort_bodies(tree_structure &tree, my_dev::dev_mem<float4>  &bodies_pos, 
                         my_dev::dev_mem<uint>  &sortPermutation, int n_bodies) {

  devContext.startTiming();

  allocateParticleSpecificBuffers(n_bodies);

  // gpuSort works on uint4 elements (so that 96-bit keys are possible),
  // therefore the 64-bit keys are widened before and narrowed after sorting.
  my_dev::dev_mem<uint4>  keys96(devContext);     // widened keys, sort input
  my_dev::dev_mem<uint4>  sorted96(devContext);   // sort result
  my_dev::dev_mem<uint4>  keys64(devContext);     // keys as produced by compute_keys

  // generalBuffer1 has room for uint*4*N*3; every temporary gets one third.
  const int firstSlice  = 0;
  const int secondSlice = 4*n_bodies;
  const int thirdSlice  = 8*n_bodies;

  // Slice 1: [0, uint*4*N)
  keys96.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                      tree.generalBuffer1.get_flags(),
                      tree.generalBuffer1.get_devMem(),
                      &tree.generalBuffer1[firstSlice], firstSlice,
                      n_bodies, getAllignmentOffset(firstSlice));

  // Slice 2: [uint*4*N, uint*4*N*2)
  sorted96.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                        tree.generalBuffer1.get_flags(),
                        tree.generalBuffer1.get_devMem(),
                        &tree.generalBuffer1[secondSlice], secondSlice,
                        n_bodies, getAllignmentOffset(secondSlice));

  // Slice 3 has to account for the alignment padding of slice 2 as well.
  int paddingSoFar = getAllignmentOffset(secondSlice);

  keys64.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                      tree.generalBuffer1.get_flags(),
                      tree.generalBuffer1.get_devMem(),
                      &tree.generalBuffer1[thirdSlice], thirdSlice,
                      n_bodies, paddingSoFar + getAllignmentOffset(thirdSlice + paddingSoFar));

  // Key generation is a separate step because it is also used before the
  // tree is built. Corner and size are throw-away values here: sorting does
  // not need a tree.
  float4 corner;
  float domain_fac;
  compute_keys(bodies_pos, keys64, n_bodies, corner, domain_fac);

  // Widen the keys into the sort input.
  int slot = 0;
  convertKey64to96.set_arg<cl_mem>(slot++, keys64.p());
  convertKey64to96.set_arg<cl_mem>(slot++, keys96.p());
  convertKey64to96.set_arg<int>(slot++,    &n_bodies);
  convertKey64to96.setWork(n_bodies, 256);
  convertKey64to96.execute();

  // Sort. Input (2nd argument) and buffer (4th argument) are the same
  // object, so the input is overwritten; passing different buffers would
  // preserve it.
  gpuSort(devContext, keys96, sorted96, keys96, n_bodies, 32, 3, tree);

  // Split the sorted elements back into keys and the permutation that is
  // needed to reorder the remaining particle properties.
  slot = 0;
  extractKeyAndPerm.set_arg<cl_mem>(slot++, sorted96.p());
  extractKeyAndPerm.set_arg<cl_mem>(slot++, keys64.p());
  extractKeyAndPerm.set_arg<cl_mem>(slot++, sortPermutation.p());
  extractKeyAndPerm.set_arg<int>(slot++,    &n_bodies);
  extractKeyAndPerm.setWork(n_bodies, 256);
  extractKeyAndPerm.execute();

  devContext.stopTiming("Sorting", 0);
}
コード例 #9
0
ファイル: load_kernels.cpp プロジェクト: Ingwar/amuse
// If srcValues and buffer are different, then the original values
// are preserved, if they are the same srcValues will be overwritten
void  octree::gpuSort(my_dev::context &devContext,
                      my_dev::dev_mem<uint4> &srcValues,
                      my_dev::dev_mem<uint4> &output,
                      my_dev::dev_mem<uint4> &buffer,
                      int N, int numberOfBits, int subItems,
                      tree_structure &tree) {

  //Extra buffer values

//   my_dev::dev_mem<uint> simpleKeys(devContext, N);    //Int keys,
//   my_dev::dev_mem<uint> permutation(devContext, N);   //Permutation values, for sorting the int4 data
//   my_dev::dev_mem<int> output32b(devContext, N); //Permutation values, for sorting the int4 data
//   my_dev::dev_mem<uint> valuesOutput(devContext, N);  //Buffers for the values which are the indexes
  
  
  my_dev::dev_mem<uint> simpleKeys(devContext);    //Int keys,
  my_dev::dev_mem<uint> permutation(devContext);   //Permutation values, for sorting the int4 data
  my_dev::dev_mem<int>  output32b(devContext);       //Permutation values, for sorting the int4 data
  my_dev::dev_mem<uint> valuesOutput(devContext);  //Buffers for the values which are the indexes
  
  int prevOffsetSum = getAllignmentOffset(4*N); //The offset of output

  
  simpleKeys.cmalloc_copy(tree.generalBuffer1.get_pinned(), 
                          tree.generalBuffer1.get_flags(), 
                          tree.generalBuffer1.get_devMem(),
                          &tree.generalBuffer1[8*N], 8*N,
                          N, prevOffsetSum + getAllignmentOffset(8*N + prevOffsetSum));    //Ofset 8 since we have 2 uint4 before
  
  prevOffsetSum += getAllignmentOffset(8*N + prevOffsetSum);
  
  permutation.cmalloc_copy(tree.generalBuffer1.get_pinned(), 
                          tree.generalBuffer1.get_flags(), 
                          tree.generalBuffer1.get_devMem(),
                          &tree.generalBuffer1[9*N], 9*N,
                          N, prevOffsetSum + getAllignmentOffset(9*N + prevOffsetSum));  //N elements after simpleKeys    

  prevOffsetSum += getAllignmentOffset(9*N + prevOffsetSum);
  

  output32b.cmalloc_copy(tree.generalBuffer1.get_pinned(), 
                          tree.generalBuffer1.get_flags(), 
                          tree.generalBuffer1.get_devMem(),
                          &tree.generalBuffer1[10*N], 10*N,
                          N, prevOffsetSum + getAllignmentOffset(10*N + prevOffsetSum));  //N elements after permutation      
  
  prevOffsetSum += getAllignmentOffset(10*N + prevOffsetSum);
  
  valuesOutput.cmalloc_copy(tree.generalBuffer1.get_pinned(), 
                          tree.generalBuffer1.get_flags(), 
                          tree.generalBuffer1.get_devMem(),
                          &tree.generalBuffer1[11*N], 11*N,
                          N, prevOffsetSum + getAllignmentOffset(11*N + prevOffsetSum));  //N elements after output32b        

    
  //Dimensions for the kernels that shuffle and extract data
  const int blockSize = 256;
  int ng = (N)/blockSize + 1;
  int nx = (int)sqrt(ng);
  int ny = (ng-1)/nx + 1;

  vector<size_t> localWork(2), globalWork(2);
  globalWork[0] = nx*blockSize;   globalWork[1] = ny;
  localWork [0] = blockSize;       localWork[1] = 1;

  extractInt.setWork(globalWork, localWork);
  fillSequence.setWork(globalWork, localWork);
  reOrderKeysValues.setWork(globalWork, localWork);
  
  //Idx depends on subitems, z goes first, x last if subitems = 3
  //subitems = 3, than idx=2
  //subitems = 2, than idx=1
  //subitems = 1, than idx=0
  //intIdx = subItems-1   
  int intIdx = subItems-1;

  extractInt.set_arg<cl_mem>(0, srcValues.p());
  extractInt.set_arg<cl_mem>(1, simpleKeys.p());
  extractInt.set_arg<uint>(2, &N);
  extractInt.set_arg<int>(3, &intIdx);//bit idx

  fillSequence.set_arg<cl_mem>(0, permutation.p());
  fillSequence.set_arg<uint>(1, &N);

  reOrderKeysValues.set_arg<cl_mem>(0, srcValues.p());
  reOrderKeysValues.set_arg<cl_mem>(1, output.p());
  reOrderKeysValues.set_arg<cl_mem>(2, valuesOutput.p());
  reOrderKeysValues.set_arg<uint>(3, &N);

  extractInt.execute();
  fillSequence.execute();

  //Now sort the first 32bit keys
  //Using 32bit sort with key and value seperated    
  gpuSort_32b(devContext, 
                   simpleKeys, permutation,
//                     output32b, aPing32b,
                   output32b, simpleKeys,
//                    valuesOutput,valuesAPing,
                   valuesOutput,permutation,
//                   count,
                   N, 32);


  //Now reorder the main keys
  //Use output as the new output/src value thing buffer
  reOrderKeysValues.execute();
  
  if(subItems == 1)
  {
    //Only doing one 32bit sort. Data is already in output so done
    return;
  }


  //2nd set of 32bit keys
  //Idx depends on subitems, z goes first, x last if subitems = 3  
  //subitems = 3, than idx=1
  //subitems = 2, than idx=0
  //subitems = 1, completed previous round
  //intIdx = subItems-2   
  intIdx = subItems-2;
  
  extractInt.set_arg<cl_mem>(0, output.p());
  extractInt.set_arg<int>(3, &intIdx);//smem size

  reOrderKeysValues.set_arg<cl_mem>(0, output.p());
  reOrderKeysValues.set_arg<cl_mem>(1, buffer.p());
 
  extractInt.execute();
  
  fillSequence.execute();

  //Now sort the 2nd 32bit keys
  //Using 32bit sort with key and value seperated    
  gpuSort_32b(devContext, 
                   simpleKeys, permutation,
                   output32b, simpleKeys,
//                    output32b, aPing32b,
//                   valuesOutput,valuesAPing,
                   valuesOutput,permutation,
                   //count,
                   N, 32);
                   
  reOrderKeysValues.execute();
  

  if(subItems == 2)
  {
    //Doing two 32bit sorts. Data is in buffer
    //so move the data from buffer to output    
    output.copy(buffer, buffer.get_size());    
    return;
  }

  //3th set of 32bit keys
  //Idx depends on subitems, z goes first, x last if subitems = 3  
  //subitems = 3, than idx=0
  //subitems = 2, completed previous round
  //subitems = 1, completed previous round
  //intIdx = subItems-2     
  intIdx = 0;
  
  extractInt.set_arg<cl_mem>(0, buffer.p());
  extractInt.set_arg<int>(3, &intIdx);//integer idx

  reOrderKeysValues.set_arg<cl_mem>(0, buffer.p());
  reOrderKeysValues.set_arg<cl_mem>(1, output.p());

  extractInt.execute();
  fillSequence.execute();
  //Now sort the 32bit keys
  //Using int2 with key and value combined
  //See sortArray4
  //Using key and value in a seperate array
  //Now sort the 2nd 32bit keys
  //Using 32bit sort with key and value seperated    
  gpuSort_32b(devContext, 
              simpleKeys, permutation,
              output32b, simpleKeys,
//               output32b, aPing32b,
//               valuesOutput,valuesAPing,
              valuesOutput,permutation,
              //count,
              N, 32);  

  reOrderKeysValues.execute();

  clFinish(devContext.get_command_queue());

//   fprintf(stderr, "sortArray2 done in %g sec (Without memory alloc & compilation) \n", get_time() - t0);
}
コード例 #10
0
ファイル: load_kernels.cpp プロジェクト: Ingwar/amuse
//Splits an array of integers. Per the original description of this routine,
//the entries of srcValues flag whether an item is valid (1 == valid, anything
//else is invalid); output receives first all valid items and then the invalid
//ones, and the total number of valid items is written to *validCount.
//
//The member buffers devMemCounts and devMemCountsx are used as scratch space
//and have to be allocated outside of this function.
//
//  devContext : context whose command queue is waited on (non-CUDA build)
//  srcValues  : input buffer, bound to the compactCount and splitMove kernels
//  output     : destination buffer, bound to the splitMove kernel
//  N          : number of items
//  validCount : receives devMemCountsx[0] once it is copied back to the host
void octree::gpuSplit(my_dev::context &devContext, 
                        my_dev::dev_mem<uint> &srcValues,
                        my_dev::dev_mem<uint> &output,                        
                        int N, int *validCount)
{
  //Work is divided in chunks of 64 items (64 = 32*2, two items per pass)
  //that are spread over 480 (480 = 120*4) blocks
  const int chunkSize = 64;
  const int nBlocks   = 480;
  const int nChunks   = N / chunkSize;

  setupParams split;
  split.jobs                = nChunks / nBlocks;
  split.blocksWithExtraJobs = nChunks % nBlocks;
  split.extraElements       = N % chunkSize;
  split.extraOffset         = N - split.extraElements;

  //Launch shape shared by the count and the move kernel:
  //120 work-groups of 32x4 work-items
  vector<size_t> wideGlobal(2), wideLocal(2);
  wideGlobal[0] = 32*120;
  wideGlobal[1] = 4;
  wideLocal [0] = 32;
  wideLocal [1] = 4;

  //Launch shape of the scan kernel: one single group of 512 work-items
  vector<size_t> scanGlobal(2), scanLocal(2);
  scanGlobal[0] = 512;
  scanGlobal[1] = 1;
  scanLocal [0] = 512;
  scanLocal [1] = 1;

  //Kernel 1: the count kernel, results go to devMemCounts
  compactCount.set_arg<cl_mem>(0, srcValues.p());
  compactCount.set_arg<cl_mem>(1, devMemCounts.p());
  compactCount.set_arg<uint>(2, &N);
  compactCount.set_arg<int>(3, NULL, 128);       //size-only argument
  compactCount.set_arg<setupParams>(4, &split);
  compactCount.setWork(wideGlobal, wideLocal);

  //Kernel 2: the block scan over the 120*4 entries of devMemCounts,
  //it also gets devMemCountsx bound
  int scanEntries = 120*4;
  exScanBlock.set_arg<cl_mem>(0, devMemCounts.p());
  exScanBlock.set_arg<int>(1, &scanEntries);
  exScanBlock.set_arg<cl_mem>(2, devMemCountsx.p());
  exScanBlock.set_arg<int>(3, NULL, 512);        //shared memory allocation
  exScanBlock.setWork(scanGlobal, scanLocal);

  //Kernel 3: the move kernel, goes from srcValues to output and uses
  //the contents of devMemCounts
  splitMove.set_arg<cl_mem>(0, srcValues.p());
  splitMove.set_arg<cl_mem>(1, output.p());
  splitMove.set_arg<cl_mem>(2, devMemCounts.p());
  splitMove.set_arg<uint>(3, &N);
  splitMove.set_arg<uint>(4, NULL, 192);         //Dynamic shared memory
  splitMove.set_arg<setupParams>(5, &split);
  splitMove.setWork(wideGlobal, wideLocal);

  //Everything is configured, launch the kernels back to back
  compactCount.execute();
  exScanBlock.execute();
  splitMove.execute();

  //Wait for the device before the result is read back
  //TODO fix the damn clFinish function
  #ifdef USE_CUDA
    cuCtxSynchronize();
  #else
    clFinish(devContext.get_command_queue());
  #endif

  //The first element of devMemCountsx holds the number of valid items
  devMemCountsx.d2h();
  *validCount = devMemCountsx[0];
}
コード例 #11
0
// Computes the properties of the tree nodes by launching three kernels:
//   1. propsLeafD    : over the tree.n_leafs leaf nodes
//   2. propsNonLeafD : once per level, from level tree.n_levels down to 1
//   3. propsScalingD : over all tree.n_nodes nodes; it gets tree.multipole,
//                      tree.boxSizeInfo and tree.boxCenterInfo bound next to
//                      the temporary buffers
//
// The temporary buffers (multipoleD, nodeLowerBounds, nodeUpperBounds) are
// not allocated separately, they are mapped onto tree.generalBuffer1.
//
//   tree       : tree structure; supplies the node/leaf counts, the level
//                list and the buffers that are bound to the kernels
//   bodies_pos : particle positions, bound to the leaf kernel
//   n_bodies   : not referenced in the active code of this function
void octree::compute_properties(tree_structure &tree,  my_dev::dev_mem<float4>  &bodies_pos, int n_bodies) {

    /*****************************************************
      Assign the memory buffers, note that we check the size first
      and if needed we increase the size of the generalBuffer1
      Size required:
        - multipoleD -> double4*3*n_nodes -> 2*3*n_nodes*uint4
        - lower/upperbounds ->               2*n_nodes*uint4
        - node lower/upper  ->               2*n_nodes*uint4
        - SUM:                               10*n_nodes*uint4
        - generalBuffer1 has default size: 3*N*uint4

      check if 10*n_nodes > 3*N, if so realloc
      (Note that generalBuffer might be larger because of tree-walk stack)

      NOTE(review): the test below compares against the default size
      (3*tree.n) and not against the actual size of the buffer - confirm
      that cresize can not shrink a buffer that was enlarged elsewhere.
     *****************************************************/

    if(10*tree.n_nodes > 3*tree.n)
    {
#ifdef _DEBUG_PRINT_
        fprintf(stderr, "Resizeing the generalBuffer1 \n");
#endif
        //The size is given in uint elements: 10*n_nodes uint4 = 10*n_nodes*4 uint
        tree.generalBuffer1.cresize(10*tree.n_nodes*4, false);
    }

    my_dev::dev_mem<double4> multipoleD(devContext);
    my_dev::dev_mem<real4>  nodeLowerBounds(devContext); //Lower bounds used for scaling? TODO
    my_dev::dev_mem<real4>  nodeUpperBounds(devContext); //Upper bounds used for scaling? TODO

    //3*n_nodes double4 elements at the start of generalBuffer1
    multipoleD.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                            tree.generalBuffer1.get_flags(),
                            tree.generalBuffer1.get_devMem(),
                            &tree.generalBuffer1[0], 0,
                            3*tree.n_nodes, getAllignmentOffset(0));

    //Offset is in uint, so: double4 = 8uint*3*n_nodes
    nodeLowerBounds.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                                 tree.generalBuffer1.get_flags(),
                                 tree.generalBuffer1.get_devMem(),
                                 &tree.generalBuffer1[8*3*tree.n_nodes],  8*3*tree.n_nodes,
                                 tree.n_nodes, getAllignmentOffset(8*3*tree.n_nodes));

    int prevOffsetSum = getAllignmentOffset(8*3*tree.n_nodes); //The offset of output

    //The upper bounds follow the lower bounds: n_nodes real4 = 4*n_nodes uint further
    nodeUpperBounds.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                                 tree.generalBuffer1.get_flags(),
                                 tree.generalBuffer1.get_devMem(),
                                 &tree.generalBuffer1[8*3*tree.n_nodes + 4*tree.n_nodes],
                                 8*3*tree.n_nodes + 4*tree.n_nodes,
                                 tree.n_nodes,
                                 prevOffsetSum + getAllignmentOffset(8*3*tree.n_nodes + 4*tree.n_nodes + prevOffsetSum));

    //Computes the tree-properties (size, cm, monopole, quadropole, etc)
    //start the kernel for the leaf-type nodes
    propsLeafD.set_arg<int>(0,    &tree.n_leafs);
    propsLeafD.set_arg<cl_mem>(1, tree.leafNodeIdx.p());
    propsLeafD.set_arg<cl_mem>(2, tree.node_bodies.p());
    propsLeafD.set_arg<cl_mem>(3, bodies_pos.p());
    propsLeafD.set_arg<cl_mem>(4, multipoleD.p());
    propsLeafD.set_arg<cl_mem>(5, nodeLowerBounds.p());
    propsLeafD.set_arg<cl_mem>(6, nodeUpperBounds.p());
//   propsLeafD.set_arg<cl_mem>(7, tree.bodies_Pvel.p());  //Velocity to get max eps

    propsLeafD.setWork(tree.n_leafs, 128);
#ifdef _DEBUG_PRINT_
    printf("PropsLeaf: ");
    propsLeafD.printWorkSize();
#endif
    propsLeafD.execute();


    //Argument 0 only gets an initial value here (number of non-leaf nodes),
    //it is replaced by the level index before each launch in the loop below
    int temp = tree.n_nodes-tree.n_leafs;
    propsNonLeafD.set_arg<int>(0,    &temp);
    propsNonLeafD.set_arg<cl_mem>(1, tree.leafNodeIdx.p());
    propsNonLeafD.set_arg<cl_mem>(2, tree.node_level_list.p());
    propsNonLeafD.set_arg<cl_mem>(3, tree.n_children.p());
    propsNonLeafD.set_arg<cl_mem>(4, multipoleD.p());
    propsNonLeafD.set_arg<cl_mem>(5, nodeLowerBounds.p());
    propsNonLeafD.set_arg<cl_mem>(6, nodeUpperBounds.p());

    //Work from the bottom up
    for(int i=tree.n_levels; i >= 1; i--)
    {
        //The launch size is the difference between two consecutive
        //entries of the level list
        const int totalOnThisLevel = tree.node_level_list[i]-tree.node_level_list[i-1];

        propsNonLeafD.setWork(totalOnThisLevel, 128);

#ifdef _DEBUG_PRINT_
        printf("PropsNonLeaf, nodes on level %d : %d (start: %d end: %d) , config: \t", i, totalOnThisLevel,
               tree.node_level_list[i-1], tree.node_level_list[i]);
        //Only print the work size in debug builds, same as for the leaf
        //and the scaling kernel
        propsNonLeafD.printWorkSize();
#endif

        propsNonLeafD.set_arg<int>(0,    &i); //set the level
        propsNonLeafD.execute();
    }

    propsScalingD.set_arg<int>(0,    &tree.n_nodes);
    propsScalingD.set_arg<cl_mem>(1, multipoleD.p());
    propsScalingD.set_arg<cl_mem>(2, nodeLowerBounds.p());
    propsScalingD.set_arg<cl_mem>(3, nodeUpperBounds.p());
    propsScalingD.set_arg<cl_mem>(4, tree.n_children.p());
    propsScalingD.set_arg<cl_mem>(5, tree.multipole.p());
    propsScalingD.set_arg<float >(6, &theta);
    propsScalingD.set_arg<cl_mem>(7, tree.boxSizeInfo.p());
    propsScalingD.set_arg<cl_mem>(8, tree.boxCenterInfo.p());
    propsScalingD.set_arg<cl_mem>(9, tree.node_bodies.p());

    propsScalingD.setWork(tree.n_nodes, 128);
#ifdef _DEBUG_PRINT_
    printf("propsScaling: \t ");
    propsScalingD.printWorkSize();
#endif
    propsScalingD.execute();


    //The remainder of the function (group properties and local domain
    //boundaries) is disabled
#if 0
#ifdef INDSOFT
    //If we use individual softening we need to get the max softening value
    //to be broadcasted during the exchange of the LET boundaries.
    //Only copy the root node that contains the max value
    my_dev::dev_stream memCpyStream;
    tree.multipole.d2h(3, false, memCpyStream.s());
#endif


    //Set the group properties, note that it is not based on the nodes anymore
    //but on self created groups based on particle order setPHGroupData
    copyNodeDataToGroupData.set_arg<int>(0,    &tree.n_groups);
    copyNodeDataToGroupData.set_arg<int>(1,    &tree.n);
    copyNodeDataToGroupData.set_arg<cl_mem>(2, tree.bodies_Ppos.p());
    copyNodeDataToGroupData.set_arg<cl_mem>(3, tree.group_list_test.p());
    copyNodeDataToGroupData.set_arg<cl_mem>(4, tree.groupCenterInfo.p());
    copyNodeDataToGroupData.set_arg<cl_mem>(5, tree.groupSizeInfo.p());
    copyNodeDataToGroupData.setWork(-1, NCRIT, tree.n_groups);
    copyNodeDataToGroupData.printWorkSize();
    copyNodeDataToGroupData.execute();

#ifdef INDSOFT
    memCpyStream.sync();
    this->maxLocalEps = tree.multipole[0*3 + 1].w; //Softening value
#else
#endif

    //Get the local domain boundary based on group positions and sizes
    real4 r_min, r_max;
    getBoundariesGroups(tree, r_min, r_max);

#endif

}