Ejemplo n.º 1
0
void octree::load_kernels() {

  if (!devContext_flag) set_context();
  
  //If we arive here we have aquired a device, configure parts of the code
  
  //Get the number of multiprocessors and compute number of 
  //blocks to be used during the tree-walk
  nMultiProcessors      = devContext.multiProcessorCount;
  const int blocksPerSM = getTreeWalkBlocksPerSM(
                          this->getDevContext()->getComputeCapabilityMajor(),
                          this->getDevContext()->getComputeCapabilityMinor());
 
  nBlocksForTreeWalk = nMultiProcessors*blocksPerSM;
  

  std::string pathName;


  //AMUSE specific
  if(this->src_directory != NULL)
  {
    pathName.assign(this->src_directory);
  }
  else
  {  
    //Strip the executable name, to get the path name
    std::string temp(execPath);
    int idx = (int)temp.find_last_of("/\\");
    pathName.assign(temp.substr(0, idx+1));
  }
  

  // load scan & sort kernels

  compactCount.setContext(devContext);
  exScanBlock.setContext(devContext);
  compactMove.setContext(devContext);
  splitMove.setContext(devContext);
  sortCount.setContext(devContext);
  sortMove.setContext(devContext);
  extractInt.setContext(devContext);
  reOrderKeysValues.setContext(devContext);
  convertKey64to96.setContext(devContext);
  extractKeyAndPerm.setContext(devContext);
  dataReorderR4.setContext(devContext);
  dataReorderF2.setContext(devContext);
  dataReorderI1.setContext(devContext);
  dataReorderCombined.setContext(devContext);

#ifdef USE_CUDA
  compactCount.load_source("./scanKernels.ptx", pathName.c_str());
  compactCount.create("compact_count", (const void*)&compact_count);
  

  exScanBlock.load_source("./scanKernels.ptx", pathName.c_str());
  exScanBlock.create("exclusive_scan_block", (const void*)&exclusive_scan_block);
  
  compactMove.load_source("./scanKernels.ptx", pathName.c_str());
  compactMove.create("compact_move", (const void*)&compact_move);
  
  splitMove.load_source("./scanKernels.ptx", pathName.c_str());
  splitMove.create("split_move", (const void*)split_move);
  
  sortCount.load_source("./sortKernels.ptx", pathName.c_str());
  sortCount.create("sort_count", (const void*)sort_count);

  sortMove.load_source("./sortKernels.ptx", pathName.c_str());
  sortMove.create("sort_move_stage_key_value", (const void*)sort_move_stage_key_value);

  extractInt.load_source("./sortKernels.ptx", pathName.c_str());
  extractInt.create("extractInt", (const void*)extractInt_kernel);
  
  reOrderKeysValues.load_source("./sortKernels.ptx", pathName.c_str());
  reOrderKeysValues.create("reOrderKeysValues", (const void*)&reOrderKeysValues_kernel);
  
  extractKeyAndPerm.load_source("./sortKernels.ptx", pathName.c_str());
  extractKeyAndPerm.create("extractKeyAndPerm", (const void*)&gpu_extractKeyAndPerm);
  
  convertKey64to96.load_source("./sortKernels.ptx", pathName.c_str());
  convertKey64to96.create("convertKey64to96", (const void*)&gpu_convertKey64to96);
  
  dataReorderR4.load_source("./sortKernels.ptx", pathName.c_str());
//  dataReorderR4.create("dataReorderR4");  
  dataReorderR4.create("dataReorderCombined4", (const void*)&dataReorderCombined4);
  
  dataReorderF2.load_source("./sortKernels.ptx", pathName.c_str());
  dataReorderF2.create("dataReorderF2", (const void*)&gpu_dataReorderF2);

  dataReorderI1.load_source("./sortKernels.ptx", pathName.c_str());
  dataReorderI1.create("dataReorderI1", (const void*)&gpu_dataReorderI1);
  
  dataReorderCombined.load_source("./sortKernels.ptx", pathName.c_str());
  dataReorderCombined.create("dataReorderCombined", (const void*)&gpu_dataReorderCombined);

  
  
#else
  compactCount.load_source("scanKernels.cl", "OpenCLKernels");
  compactCount.create("compact_count");
  
  exScanBlock.load_source("scanKernels.cl", "OpenCLKernels");
  exScanBlock.create("exclusive_scan_block");
  
  compactMove.load_source("scanKernels.cl", "OpenCLKernels");
  compactMove.create("compact_move");
  
  splitMove.load_source("scanKernels.cl", "OpenCLKernels");
  splitMove.create("split_move");
#endif


  // load tree-build kernels
  
  /* set context */
  
  build_key_list.setContext(devContext);
  build_valid_list.setContext(devContext);
  build_nodes.setContext(devContext);
  link_tree.setContext(devContext);
  define_groups.setContext(devContext);
  build_level_list.setContext(devContext);
  boundaryReduction.setContext(devContext);
  boundaryReductionGroups.setContext(devContext);  
  build_body2group_list.setContext(devContext);
  store_groups.setContext(devContext);
  segmentedCoarseGroupBoundary.setContext(devContext);
  /* load kernels tree properties */
  
#ifdef USE_CUDA
  build_key_list.load_source("./build_tree.ptx", pathName.c_str());
  build_valid_list.load_source("./build_tree.ptx", pathName.c_str());
  build_nodes.load_source("./build_tree.ptx", pathName.c_str());
  link_tree.load_source("./build_tree.ptx", pathName.c_str());
  define_groups.load_source("./build_tree.ptx", pathName.c_str());
  build_level_list.load_source("./build_tree.ptx", pathName.c_str());
  boundaryReduction.load_source("./build_tree.ptx", pathName.c_str());
  boundaryReductionGroups.load_source("./build_tree.ptx", pathName.c_str());
  build_body2group_list.load_source("./build_tree.ptx", pathName.c_str());
  store_groups.load_source("./build_tree.ptx", pathName.c_str());
  segmentedCoarseGroupBoundary.load_source("./build_tree.ptx", pathName.c_str());
  
  /* create kernels */

  build_key_list.create("cl_build_key_list", (const void*)&cl_build_key_list);
  build_valid_list.create("cl_build_valid_list", (const void*)&cl_build_valid_list);
  build_nodes.create("cl_build_nodes", (const void*)&cl_build_nodes);
  link_tree.create("cl_link_tree", (const void*)&cl_link_tree);
  define_groups.create("build_group_list2", (const void*)&build_group_list2);
  build_level_list.create("build_level_list", (const void*)&gpu_build_level_list);
  boundaryReduction.create("boundaryReduction", (const void*)&gpu_boundaryReduction);
  boundaryReductionGroups.create("boundaryReductionGroups", (const void*)&gpu_boundaryReductionGroups);
//  build_body2group_list.create("build_body2group_list", (const void*)&gpu_build_body2group_list);
  store_groups.create("store_group_list", (const void*)&store_group_list);
  segmentedCoarseGroupBoundary.create("segmentedCoarseGroupBoundary", (const void*)&gpu_segmentedCoarseGroupBoundary);


#else
  build_key_list.load_source("build_tree.cl", "");
  build_valid_list.load_source("build_tree.cl", "");
  build_nodes.load_source("build_tree.cl", "");
  link_tree.load_source("build_tree.cl", "");
  
  /* create kernels */

  build_key_list.create("cl_build_key_list");
  build_valid_list.create("cl_build_valid_list");
  build_nodes.create("cl_build_nodes");
  link_tree.create("cl_link_tree");
#endif

  // load tree-props kernels
  propsNonLeafD.setContext(devContext);
  propsLeafD.setContext(devContext);
  propsScalingD.setContext(devContext);
  
  setPHGroupData.setContext(devContext);
  setPHGroupDataGetKey.setContext(devContext);
  setPHGroupDataGetKey2.setContext(devContext);
 
  /* load kernels */
  
#ifdef USE_CUDA

  propsNonLeafD.load_source("./compute_propertiesD.ptx", pathName.c_str(), "", -1);
  propsLeafD.load_source("./compute_propertiesD.ptx", pathName.c_str(), "", -1);
  propsScalingD.load_source("./compute_propertiesD.ptx", pathName.c_str(), "",-1);
  
  setPHGroupData.load_source("./compute_propertiesD.ptx", pathName.c_str());
  setPHGroupDataGetKey.load_source("./compute_propertiesD.ptx", pathName.c_str());
  setPHGroupDataGetKey2.load_source("./compute_propertiesD.ptx", pathName.c_str());
  /* create kernels */
  
  propsNonLeafD.create("compute_non_leaf", (const void*)&compute_non_leaf);
  propsLeafD.create("compute_leaf", (const void*)&compute_leaf);
  propsScalingD.create("compute_scaling", (const void*)&compute_scaling);

  setPHGroupData.create("setPHGroupData", (const void*)&gpu_setPHGroupData);
  setPHGroupDataGetKey.create("setPHGroupDataGetKey", (const void*)&gpu_setPHGroupDataGetKey);
  setPHGroupDataGetKey2.create("setPHGroupDataGetKey2", (const void*)&gpu_setPHGroupDataGetKey2);

  
#else
  propsNonLeaf.load_source("compProps.cl", "");
  propsLeaf.load_source("compProps.cl", "");
  propsScaling.load_source("compProps.cl", ""); 
  
  /* create kernels */

  propsNonLeaf.create("compute_non_leaf");
  propsLeaf.create("compute_leaf");
  propsScaling.create("compute_scaling");  
#endif

  /* Tree iteration */
  getTNext.setContext(devContext);
  predictParticles.setContext(devContext);
  getNActive.setContext(devContext);
  approxGrav.setContext(devContext);
  directGrav.setContext(devContext);
  correctParticles.setContext(devContext);
  computeDt.setContext(devContext);
  computeEnergy.setContext(devContext);
  setActiveGrps.setContext(devContext);
  distanceCheck.setContext(devContext);  


  approxGravLET.setContext(devContext);
  determineLET.setContext(devContext);

#ifdef USE_CUDA
  getTNext.load_source("./timestep.ptx", pathName.c_str(), "", -1);
  predictParticles.load_source("./timestep.ptx", pathName.c_str(), "", -1);
  getNActive.load_source("./timestep.ptx", pathName.c_str(), "", -1);
  approxGrav.load_source("./dev_approximate_gravity.ptx", pathName.c_str(), "", 64);
  directGrav.load_source("./dev_direct_gravity.ptx", pathName.c_str(), "", 64);
  correctParticles.load_source("./timestep.ptx", pathName.c_str(), "", -1);
  computeDt.load_source("./timestep.ptx", pathName.c_str(), "", -1);
  computeEnergy.load_source("./timestep.ptx", pathName.c_str(), "", -1);
  setActiveGrps.load_source("./timestep.ptx", pathName.c_str(), "", -1);
  distanceCheck.load_source("./timestep.ptx", pathName.c_str(), "", -1);  
  
  approxGravLET.load_source("./dev_approximate_gravity.ptx", pathName.c_str(), "", 64);  
  determineLET.load_source("./dev_approximate_gravity.ptx", pathName.c_str(), "", 64);
  /* create kernels */

  getTNext.create("get_Tnext", (const void*)&get_Tnext);
  predictParticles.create("predict_particles", (const void*)&predict_particles);
  getNActive.create("get_nactive", (const void*)&get_nactive);
  approxGrav.create("dev_approximate_gravity", (const void*)&dev_approximate_gravity);

#ifdef KEPLER /* preferL1 equal egaburov */
  cudaFuncSetCacheConfig((const void*)&dev_approximate_gravity, cudaFuncCachePreferL1);
  cudaFuncSetCacheConfig((const void*)&dev_approximate_gravity_let, cudaFuncCachePreferL1);
#if 0
#if 1
  cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte);
#else
  cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
#endif
#endif
#endif


  directGrav.create("dev_direct_gravity", (const void*)&dev_direct_gravity);
  correctParticles.create("correct_particles", (const void*)&correct_particles);
  computeDt.create("compute_dt", (const void*)&compute_dt);
  setActiveGrps.create("setActiveGroups", (const void*)&setActiveGroups);

  computeEnergy.create("compute_energy_double", (const void*)&compute_energy_double);
  distanceCheck.create("distanceCheck", (const void*)&distanceCheck);
  
  approxGravLET.create("dev_approximate_gravity_let", (const void*)&dev_approximate_gravity_let);
#if 0  /* egaburov, doesn't compile with this */
  determineLET.create("dev_determineLET", (const void*)&dev_determineLET);
#endif

#else
  getTNext.load_source("", "");
  
  /* create kernels */

  getTNext.create("");
  
#endif

  //Parallel kernels
  domainCheck.setContext(devContext);  
  extractSampleParticles.setContext(devContext);  
  extractOutOfDomainR4.setContext(devContext);  
  extractOutOfDomainBody.setContext(devContext);  
  insertNewParticles.setContext(devContext);  
  internalMove.setContext(devContext);  
  build_parallel_grps.setContext(devContext);
  segmentedSummaryBasic.setContext(devContext);
  domainCheckSFC.setContext(devContext);
  internalMoveSFC.setContext(devContext);
  internalMoveSFC2.setContext(devContext);
  extractOutOfDomainParticlesAdvancedSFC.setContext(devContext);
  extractOutOfDomainParticlesAdvancedSFC2.setContext(devContext);
  insertNewParticlesSFC.setContext(devContext);
  extractSampleParticlesSFC.setContext(devContext);
  domainCheckSFCAndAssign.setContext(devContext);



#ifdef USE_CUDA
  domainCheck.load_source("./parallel.ptx", pathName.c_str());
  extractSampleParticles.load_source("./parallel.ptx", pathName.c_str());
  extractOutOfDomainR4.load_source("./parallel.ptx", pathName.c_str());
  extractOutOfDomainBody.load_source("./parallel.ptx", pathName.c_str());
  insertNewParticles.load_source("./parallel.ptx", pathName.c_str());
  internalMove.load_source("./parallel.ptx", pathName.c_str());

  
  build_parallel_grps.load_source("./build_tree.ptx", pathName.c_str());
  segmentedSummaryBasic.load_source("./build_tree.ptx", pathName.c_str());
  domainCheckSFC.load_source("./parallel.ptx", pathName.c_str());
  internalMoveSFC.load_source("./parallel.ptx", pathName.c_str());
  internalMoveSFC2.load_source("./parallel.ptx", pathName.c_str());
  extractOutOfDomainParticlesAdvancedSFC.load_source("./parallel.ptx", pathName.c_str());
  extractOutOfDomainParticlesAdvancedSFC2.load_source("./parallel.ptx", pathName.c_str());
  insertNewParticlesSFC.load_source("./parallel.ptx", pathName.c_str());
  extractSampleParticlesSFC.load_source("./parallel.ptx", pathName.c_str());
  domainCheckSFCAndAssign.load_source("./parallel.ptx", pathName.c_str());

  domainCheck.create("doDomainCheck", (const void*)&doDomainCheck);
  extractSampleParticles.create("extractSampleParticles", (const void*)&gpu_extractSampleParticles);
  extractOutOfDomainR4.create("extractOutOfDomainParticlesR4", (const void*)&extractOutOfDomainParticlesR4);
  extractOutOfDomainBody.create("extractOutOfDomainParticlesAdvanced", (const void*)&extractOutOfDomainParticlesAdvanced);
  insertNewParticles.create("insertNewParticles", (const void*)&gpu_insertNewParticles);
  internalMove.create("internalMove", (const void*)&gpu_internalMove);

  extractSampleParticlesSFC.create("build_parallel_grps", (const void*)&gpu_extractSampleParticlesSFC);
  build_parallel_grps.create("build_parallel_grps", (const void*)&gpu_build_parallel_grps);
  segmentedSummaryBasic.create("segmentedSummaryBasic", (const void*)&gpu_segmentedSummaryBasic);
  domainCheckSFC.create("domainCheckSFC", (const void*)&gpu_domainCheckSFC);
  internalMoveSFC.create("internalMoveSFC", (const void*)&gpu_internalMoveSFC);
  internalMoveSFC2.create("internalMoveSFC2", (const void*)&gpu_internalMoveSFC2);
  extractOutOfDomainParticlesAdvancedSFC.create("extractOutOfDomainParticlesAdvancedSFC", (const void*)&gpu_extractOutOfDomainParticlesAdvancedSFC);
  extractOutOfDomainParticlesAdvancedSFC2.create("extractOutOfDomainParticlesAdvancedSFC2", (const void*)&gpu_extractOutOfDomainParticlesAdvancedSFC2);
  insertNewParticlesSFC.create("insertNewParticlesSFC", (const void*)&gpu_insertNewParticlesSFC);
  domainCheckSFCAndAssign.create("domainCheckSFCAndAssign", (const void*)&gpu_domainCheckSFCAndAssign);

#else


#endif
  
#ifdef USE_DUST
   define_dust_groups.setContext(devContext);
   define_dust_groups.load_source("./build_tree.ptx", pathName.c_str());
   define_dust_groups.create("define_dust_groups");
   
   store_dust_groups.setContext(devContext);
   store_dust_groups.load_source("./build_tree.ptx", pathName.c_str());
   store_dust_groups.create("store_dust_groups");
   
   predictDust.setContext(devContext);
   predictDust.load_source("./build_tree.ptx", pathName.c_str());
   predictDust.create("predict_dust_particles");   
   
   correctDust.setContext(devContext);
   correctDust.load_source("./build_tree.ptx", pathName.c_str());
   correctDust.create("correct_dust_particles");     
   
   
   
#endif  


}
Ejemplo n.º 2
0
int main(int argc, char *argv[])
{
  typedef int                 IndexType;
  typedef double              ValueType;
  typedef cusp::device_memory MemorySpace;
  //typedef cusp::row_major     Orientation;

  bool success = true;
  bool verbose = false;
  try {

    // Setup command line options
    Teuchos::CommandLineProcessor CLP;
    CLP.setDocString("This test performance of block multiply routines.\n");
    IndexType n = 32;
    CLP.setOption("n", &n, "Number of mesh points in the each direction");
    IndexType nrhs_begin = 32;
    CLP.setOption("begin", &nrhs_begin,
                  "Staring number of right-hand-sides");
    IndexType nrhs_end = 512;
    CLP.setOption("end", &nrhs_end,
                  "Ending number of right-hand-sides");
    IndexType nrhs_step = 32;
    CLP.setOption("step", &nrhs_step,
                  "Increment in number of right-hand-sides");
    IndexType nits = 10;
    CLP.setOption("nits", &nits,
                  "Number of multiply iterations");
    int device_id = 0;
    CLP.setOption("device", &device_id, "CUDA device ID");
    CLP.parse( argc, argv );

    // Set CUDA device
    cudaSetDevice(device_id);
    cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);

    // create 3D Poisson problem
    cusp::csr_matrix<IndexType, ValueType, MemorySpace> A;
    cusp::gallery::poisson27pt(A, n, n, n);

    std::cout << "nrhs , num_rows , num_entries , row_time , row_gflops , "
              << "col_time , col_gflops" << std::endl;

    for (IndexType nrhs = nrhs_begin; nrhs <= nrhs_end; nrhs += nrhs_step) {

      double flops =
        2.0 * static_cast<double>(A.num_entries) * static_cast<double>(nrhs);

      // test row-major storage
      cusp::array2d<ValueType, MemorySpace, cusp::row_major> x_row(
        A.num_rows, nrhs, 1);
      cusp::array2d<ValueType, MemorySpace, cusp::row_major> y_row(
        A.num_rows, nrhs, 0);

      cusp::detail::timer row_timer;
      row_timer.start();
      for (IndexType iter=0; iter<nits; ++iter) {
        cusp::MVmultiply(A, x_row, y_row);
      }
      cudaDeviceSynchronize();
      double row_time = row_timer.seconds_elapsed() / nits;
      double row_gflops = 1.0e-9 * flops / row_time;

      // test column-major storage
      cusp::array2d<ValueType, MemorySpace, cusp::column_major> x_col(
        A.num_rows, nrhs, 1);
      cusp::array2d<ValueType, MemorySpace, cusp::column_major> y_col(
        A.num_rows, nrhs, 0);

      cusp::detail::timer col_timer;
      col_timer.start();
      for (IndexType iter=0; iter<nits; ++iter) {
        cusp::MVmultiply(A, x_col, y_col);
      }
      cudaDeviceSynchronize();
      double col_time = col_timer.seconds_elapsed() / nits;
      double col_gflops = 1.0e-9 * flops / col_time;

      std::cout << nrhs << " , "
                << A.num_rows << " , "  << A.num_entries << " , "
                << row_time << " , " << row_gflops << " , "
                << col_time << " , " << col_gflops
                << std::endl;

    }

  }
  TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success);

  if (success)
    return 0;
  return -1;
}