Example #1
0
// Allocates the device-side buffers referenced through `dev`.
// All sizes are read from n_bodies and from the host-side children, node and
// cell lists at the moment of the call.
void octgrav::allocate_cuda_memory() {
  // Element counts of the host-side lists.
  const size_t child_count = children_list.size();
  const size_t node_count  = node_list.size();
  const size_t cell_count  = cell_list.size();

  // Byte sizes shared by several of the per-node and per-cell buffers.
  const size_t node_float4_bytes = node_count * sizeof(float4);
  const size_t node_int_bytes    = node_count * sizeof(int);
  const size_t cell_float4_bytes = cell_count * sizeof(float4);
  const size_t cell_int_bytes    = cell_count * sizeof(int);

  // Per-body buffers. The gravity buffer is sized through n_norm(n_bodies, 256)
  // (presumably a round-up to a multiple of 256 -- confirm against n_norm).
  allocateCUDAarray((void**)&dev.bodies_pos,  n_bodies * sizeof(float4));
  allocateCUDAarray((void**)&dev.bodies_grav, n_norm(n_bodies, 256) * sizeof(float4));

  // One int4 entry per element of the children list.
  allocateCUDAarray((void**)&dev.children, child_count * sizeof(int4));

  // Per-node buffers: Qu/Qd, then Oct1..Oct3 (Oct3 holds float2 entries).
  allocateCUDAarray((void**)&dev.node_Qu, node_float4_bytes);
  allocateCUDAarray((void**)&dev.node_Qd, node_float4_bytes);

  allocateCUDAarray((void**)&dev.Oct1, node_float4_bytes);
  allocateCUDAarray((void**)&dev.Oct2, node_float4_bytes);
  allocateCUDAarray((void**)&dev.Oct3, node_count * sizeof(float2));

  // Per-node position, centre of mass, body count and body offset.
  allocateCUDAarray((void**)&dev.node_pos,           node_float4_bytes);
  allocateCUDAarray((void**)&dev.node_com,           node_float4_bytes);
  allocateCUDAarray((void**)&dev.n_in_node,          node_int_bytes);
  allocateCUDAarray((void**)&dev.node_bodies_offset, node_int_bytes);

  // The matching leaf buffers (leaf_pos, leaf_com, n_in_leaf,
  // leaf_bodies_offset) are deliberately not allocated; that code is disabled.

  // Per-cell position, centre of mass, body count and body offset.
  allocateCUDAarray((void**)&dev.cell_pos,           cell_float4_bytes);
  allocateCUDAarray((void**)&dev.cell_com,           cell_float4_bytes);
  allocateCUDAarray((void**)&dev.n_in_cell,          cell_int_bytes);
  allocateCUDAarray((void**)&dev.cell_bodies_offset, cell_int_bytes);
}
Example #2
0
// Evaluates gravity on ni_total i-particles using nj j-particles as sources.
//
// In CPU_SUPPORT builds, problems with fewer interactions than CPUThreshold are
// handled by the host routines. Otherwise the j-particles are copied and
// predicted, the resetDevBuffers kernel is run, and the gravity kernel is
// launched once per chunk of at most NTHREADS i-particles.
//
//   ni_total : number of i-particles for which the force is computed
//   nj       : current number of j-particles used as sources
//   returns  : always 0.0
//
// NOTE: this function is called inside an omp parallel section.
double sapporo::evaluate_gravity(int ni_total, int nj)
{
  #ifdef DEBUG_PRINT
    cerr << "evaluate_gravity ni: " << ni_total << "\tnj: " << nj << endl;
  #endif

  //Without i-particles or without j-particles there is nothing to do
  if(ni_total == 0 || nj == 0)
  {
    return 0.0;
  }

  //Flag telling whether gravity was done on the host (used to disable memory copies)
  executedOnHost = false;

  #ifdef CPU_SUPPORT
    //Number of interactions, compared against the CPU threshold
    const long long int nInter = (long long int) ni_total * nj;
    if (nInter < CPUThreshold)
    {
        fprintf(stderr, "CPU EXEC || ni: %d  nj: %d BThreshold: %d nInter: %lld\n", ni_total, nj, CPUThreshold, nInter);

        //Predict on the host, then run the vectorised host gravity routine
        //(evaluate_gravity_host(ni_total, nj) is the non-vector alternative)
        predictJParticles_host(nj);
        evaluate_gravity_host_vector(ni_total, nj);

        executedOnHost = true;
        return 0.0;
    }
  #endif

  //Move updated j-particles from the temporary buffers to their final
  //location in device memory
  copyJInDev(nj);

  //Run the prediction step if it is required
  predictJParticles(nj);

  //Reset the device buffers so that the gravity kernel can use atomicAdds
  int doNGB     = true;
  int doNGBList = true;
  int resetArg  = 0;

  sapdevice->resetDevBuffers.set_arg<int  >(resetArg++, &ni_total);
  sapdevice->resetDevBuffers.set_arg<int  >(resetArg++, &doNGB);
  sapdevice->resetDevBuffers.set_arg<int  >(resetArg++, &doNGBList);
  sapdevice->resetDevBuffers.set_arg<int  >(resetArg++, &integrationOrder);
  sapdevice->resetDevBuffers.set_arg<void*>(resetArg++, sapdevice->iParticleResults.ptr());
  sapdevice->resetDevBuffers.set_arg<void*>(resetArg++, sapdevice->ds_i.ptr());
  sapdevice->resetDevBuffers.set_arg<void*>(resetArg++, sapdevice->ngb_count_i.ptr());
  sapdevice->resetDevBuffers.setWork_2D(256, ni_total);
  sapdevice->resetDevBuffers.execute();

  //Walk over the i-particles in chunks of NTHREADS
  for(int ni_offset = 0; ni_offset < ni_total; ni_offset += NTHREADS)
  {
    //Number of i-particles handled by this launch
    int ni = min(ni_total - ni_offset, NTHREADS);

    //Granularity to which ni is rounded up. Only orders up to FOURTH use the
    //warp/wavefront size (as returned by get_workGroupMultiple) or half of it;
    //the 6th order requires too many resources to launch big thread-blocks.
    int multipleSize = 1;
    if(integrationOrder <= FOURTH && ni > 96)
    {
      multipleSize = (ni > 128)
                   ? sapdevice->evalgravKernelTemplate.get_workGroupMultiple()
                   : sapdevice->evalgravKernelTemplate.get_workGroupMultiple() / 2;
    }

    //Round ni up to a multiple of multipleSize. This is allowed because the
    //results of non-requested particles are ignored.
#if 1  //Disable this for block timing. BLOCK_TIMING
    const int nGroups = (ni / multipleSize) + (((ni % multipleSize) != 0) ? 1 : 0);
    ni = nGroups * multipleSize;
#endif

    //Thread-block shape: blockDimX threads in x and, when resources allow,
    //blockDimY rows in y. NTHREADS2 allows blocks of up to 2x NTHREADS.
    const int blockDimX = ni;
    int       blockDimY = min(NTHREADS2/ni, 32);

    //The shape above is the default; below is an optional device/algorithm
    //specific tuning for fourth order double-single.
    //NOTE this depends on the device resources and can cause 'out of resource' crashes!
#ifdef ENABLE_THREAD_BLOCK_SIZE_OPTIMIZATION
    if(integrationOrder == FOURTH && integrationPrecision == DOUBLESINGLE)
    {
      if(ni >= 32 && ni <= 256)
        blockDimY = min(sapdevice->evalgravKernelTemplate.get_workGroupMaxSize()/ni, 32);
    }
#endif

//     blockDimY = 1; //Use this when testing optimal thread/block/multi size. Disables 2D thread-blocks. BLOCK_TIMING

    //Shared memory requested for one thread-block
    int sharedMemSizeEval = blockDimX*blockDimY*(sapdevice->sharedMemPerThread);

    //Number of j-particles per block (in the 2D case a thread-block holds
    //several of these blocks)
    int njScaled   = n_norm(nj, blockDimY*(sapdevice->get_NBLOCKS()));
    int njPerBlock = njScaled/((sapdevice->get_NBLOCKS())*blockDimY);

    #ifdef DEBUG_PRINT
      fprintf(stderr, "Offset: %d  --> Total: %d Current step: %d \n", ni_offset, ni_total, ni);
      fprintf(stderr, "EvalGrav config: p: %d q: %d  nj: %d nj_scaled: %d thisblockscaled: %d ni: %d EPS: %f \n",
                      blockDimX, blockDimY, nj, njScaled, njPerBlock, ni, EPS2);
      fprintf(stderr, "Shared memory configuration, size eval: %d  Per thread: %d (bytes)\n",
                      sharedMemSizeEval, sapdevice->sharedMemPerThread);
    #endif

    //Kernel arguments; the order below is the order the kernel receives them in
    int gravArg = 0;

    //Scalar launch parameters
    sapdevice->evalgravKernelTemplate.set_arg<int  >(gravArg++, &nj);          //Total number of j particles
    sapdevice->evalgravKernelTemplate.set_arg<int  >(gravArg++, &njPerBlock);
    sapdevice->evalgravKernelTemplate.set_arg<int  >(gravArg++, &ni_offset);
    sapdevice->evalgravKernelTemplate.set_arg<int  >(gravArg++, &ni_total);

    //Positions, result buffer and EPS2
    sapdevice->evalgravKernelTemplate.set_arg<void* >(gravArg++, sapdevice->pPos_j.ptr());
    sapdevice->evalgravKernelTemplate.set_arg<void* >(gravArg++, sapdevice->pos_i.ptr());
    sapdevice->evalgravKernelTemplate.set_arg<void* >(gravArg++, sapdevice->iParticleResults.ptr());
    sapdevice->evalgravKernelTemplate.set_arg<double>(gravArg++, &EPS2);

    //Remaining device buffers
    sapdevice->evalgravKernelTemplate.set_arg<void*>(gravArg++, sapdevice->pVel_j.ptr());
    sapdevice->evalgravKernelTemplate.set_arg<void*>(gravArg++, sapdevice->id_j.ptr());
    sapdevice->evalgravKernelTemplate.set_arg<void*>(gravArg++, sapdevice->vel_i.ptr());
    sapdevice->evalgravKernelTemplate.set_arg<void*>(gravArg++, sapdevice->id_i.ptr());
    sapdevice->evalgravKernelTemplate.set_arg<void*>(gravArg++, sapdevice->ds_i.ptr());
    sapdevice->evalgravKernelTemplate.set_arg<void*>(gravArg++, sapdevice->ngb_count_i.ptr());
    sapdevice->evalgravKernelTemplate.set_arg<void*>(gravArg++, sapdevice->ngb_list_i.ptr());
    sapdevice->evalgravKernelTemplate.set_arg<void*>(gravArg++, sapdevice->acc_i.ptr());
    sapdevice->evalgravKernelTemplate.set_arg<void*>(gravArg++, sapdevice->pAcc_j.ptr());

    //Shared memory, given as a number of ints
    sapdevice->evalgravKernelTemplate.set_arg<int>(gravArg++, NULL, (sharedMemSizeEval)/sizeof(int));

    //Launch with the default work configuration
    sapdevice->evalgravKernelTemplate.setWork_threadblock2D(blockDimX, blockDimY, (sapdevice->get_NBLOCKS()), 1);
    sapdevice->evalgravKernelTemplate.execute();
  } //Loop over ni

  return 0.0;
} //end evaluate gravity