Example #1
// 'step' is used as a template argument below, so it must be a compile-time constant;
// in the original source this launcher is presumably itself templated on step.
void FGHKernelLauncher(const FGHKernelArgs* h_ctx, const KernelConfiguration& config) {
	if(step > 1) {
		std::cout << "This kernel is only valid for 2-step RK" << std::endl;
		exit(-1);
	}

	//Upload parameters to the GPU
	KPSIMULATOR_CHECK_CUDA(cudaMemcpyToSymbolAsync(fgh_ctx, h_ctx, sizeof(FGHKernelArgs), 0, cudaMemcpyHostToDevice, config.stream));

	//Launch kernel
	cudaFuncSetCacheConfig(FGHKernel<KPSIMULATOR_FLUX_BLOCK_WIDTH, KPSIMULATOR_FLUX_BLOCK_HEIGHT, step>, cudaFuncCachePreferShared);
	FGHKernel<KPSIMULATOR_FLUX_BLOCK_WIDTH, KPSIMULATOR_FLUX_BLOCK_HEIGHT, step><<<config.grid, config.block, 0, config.stream>>>();
	KPSIMULATOR_CHECK_CUDA_ERROR("fluxSourceKernel");
}
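
The launcher above follows a common CUDA pattern: kernel arguments are staged in __constant__ memory with cudaMemcpyToSymbolAsync, the cache configuration is set, and the kernel is launched with an empty parameter list. A minimal, self-contained sketch of that pattern (hypothetical names, not taken from the original source):

#include <cuda_runtime.h>

struct Args { int nx, ny; float dt; };   // hypothetical argument block

__constant__ Args g_args;                 // device-side copy, filled from the host

__global__ void exampleKernel() {
	// Arguments are read from constant memory instead of the <<<...>>> parameter list.
	int cells = g_args.nx * g_args.ny;
	(void)cells;
}

void exampleLauncher(const Args* h_args, dim3 grid, dim3 block, cudaStream_t stream) {
	cudaMemcpyToSymbolAsync(g_args, h_args, sizeof(Args), 0, cudaMemcpyHostToDevice, stream);
	cudaFuncSetCacheConfig(exampleKernel, cudaFuncCachePreferShared);
	exampleKernel<<<grid, block, 0, stream>>>();
}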
Example #2
  // Constructor of the CudaParallelLaunch helper (excerpted); DriverType is presumably
  // a template parameter of the enclosing class.
  inline
  CudaParallelLaunch( const DriverType & driver ,
                      const dim3       & grid ,
                      const dim3       & block ,
                      const int          shmem )
  {
    if ( sizeof( KokkosArray::Impl::CudaTraits::ConstantGlobalBufferType ) <
         sizeof( DriverType ) ) {
      KokkosArray::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
    }

    if ( CudaTraits::SharedMemoryCapacity < shmem ) {
      KokkosArray::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
    }
    else if ( shmem ) {
      cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferShared );
    }

    // Copy functor to constant memory on the device
    cudaMemcpyToSymbol( kokkos_impl_cuda_constant_memory_buffer , & driver , sizeof(DriverType) );

    // Invoke the driver function on the device
    cuda_parallel_launch_constant_memory< DriverType ><<< grid , block , shmem >>>();
  }
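
For context, the constant-memory buffer and the trampoline kernel that this launcher fills and invokes look roughly like the following. This is only a paraphrased sketch of the mechanism, assuming the DriverType functor exposes a __device__ operator(); the actual KokkosArray declarations and buffer size differ.

__constant__ unsigned long kokkos_impl_cuda_constant_memory_buffer[ 0x8000 / sizeof(unsigned long) ];  // size assumed

template< class DriverType >
__global__ void cuda_parallel_launch_constant_memory()
{
  // Reinterpret the bytes copied in by cudaMemcpyToSymbol as the functor and execute it.
  const DriverType & driver =
    *reinterpret_cast< const DriverType * >( kokkos_impl_cuda_constant_memory_buffer );
  driver();
}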
Example #3
void octree::load_kernels() {

  if (!devContext_flag) set_context();
  
  //If we arrive here we have acquired a device; configure parts of the code
  
  //Get the number of multiprocessors and compute the number of
  //blocks to be used during the tree-walk
  nMultiProcessors      = devContext.multiProcessorCount;
  const int blocksPerSM = getTreeWalkBlocksPerSM(
                          this->getDevContext()->getComputeCapabilityMajor(),
                          this->getDevContext()->getComputeCapabilityMinor());
 
  nBlocksForTreeWalk = nMultiProcessors*blocksPerSM;
  

  std::string pathName;


  //AMUSE specific
  if(this->src_directory != NULL)
  {
    pathName.assign(this->src_directory);
  }
  else
  {  
    //Strip the executable name, to get the path name
    std::string temp(execPath);
    int idx = (int)temp.find_last_of("/\\");
    pathName.assign(temp.substr(0, idx+1));
  }
  

  // load scan & sort kernels

  compactCount.setContext(devContext);
  exScanBlock.setContext(devContext);
  compactMove.setContext(devContext);
  splitMove.setContext(devContext);
  sortCount.setContext(devContext);
  sortMove.setContext(devContext);
  extractInt.setContext(devContext);
  reOrderKeysValues.setContext(devContext);
  convertKey64to96.setContext(devContext);
  extractKeyAndPerm.setContext(devContext);
  dataReorderR4.setContext(devContext);
  dataReorderF2.setContext(devContext);
  dataReorderI1.setContext(devContext);
  dataReorderCombined.setContext(devContext);

#ifdef USE_CUDA
  compactCount.load_source("./scanKernels.ptx", pathName.c_str());
  compactCount.create("compact_count", (const void*)&compact_count);
  

  exScanBlock.load_source("./scanKernels.ptx", pathName.c_str());
  exScanBlock.create("exclusive_scan_block", (const void*)&exclusive_scan_block);
  
  compactMove.load_source("./scanKernels.ptx", pathName.c_str());
  compactMove.create("compact_move", (const void*)&compact_move);
  
  splitMove.load_source("./scanKernels.ptx", pathName.c_str());
  splitMove.create("split_move", (const void*)split_move);
  
  sortCount.load_source("./sortKernels.ptx", pathName.c_str());
  sortCount.create("sort_count", (const void*)sort_count);

  sortMove.load_source("./sortKernels.ptx", pathName.c_str());
  sortMove.create("sort_move_stage_key_value", (const void*)sort_move_stage_key_value);

  extractInt.load_source("./sortKernels.ptx", pathName.c_str());
  extractInt.create("extractInt", (const void*)extractInt_kernel);
  
  reOrderKeysValues.load_source("./sortKernels.ptx", pathName.c_str());
  reOrderKeysValues.create("reOrderKeysValues", (const void*)&reOrderKeysValues_kernel);
  
  extractKeyAndPerm.load_source("./sortKernels.ptx", pathName.c_str());
  extractKeyAndPerm.create("extractKeyAndPerm", (const void*)&gpu_extractKeyAndPerm);
  
  convertKey64to96.load_source("./sortKernels.ptx", pathName.c_str());
  convertKey64to96.create("convertKey64to96", (const void*)&gpu_convertKey64to96);
  
  dataReorderR4.load_source("./sortKernels.ptx", pathName.c_str());
//  dataReorderR4.create("dataReorderR4");  
  dataReorderR4.create("dataReorderCombined4", (const void*)&dataReorderCombined4);
  
  dataReorderF2.load_source("./sortKernels.ptx", pathName.c_str());
  dataReorderF2.create("dataReorderF2", (const void*)&gpu_dataReorderF2);

  dataReorderI1.load_source("./sortKernels.ptx", pathName.c_str());
  dataReorderI1.create("dataReorderI1", (const void*)&gpu_dataReorderI1);
  
  dataReorderCombined.load_source("./sortKernels.ptx", pathName.c_str());
  dataReorderCombined.create("dataReorderCombined", (const void*)&gpu_dataReorderCombined);

  
  
#else
  compactCount.load_source("scanKernels.cl", "OpenCLKernels");
  compactCount.create("compact_count");
  
  exScanBlock.load_source("scanKernels.cl", "OpenCLKernels");
  exScanBlock.create("exclusive_scan_block");
  
  compactMove.load_source("scanKernels.cl", "OpenCLKernels");
  compactMove.create("compact_move");
  
  splitMove.load_source("scanKernels.cl", "OpenCLKernels");
  splitMove.create("split_move");
#endif


  // load tree-build kernels
  
  /* set context */
  
  build_key_list.setContext(devContext);
  build_valid_list.setContext(devContext);
  build_nodes.setContext(devContext);
  link_tree.setContext(devContext);
  define_groups.setContext(devContext);
  build_level_list.setContext(devContext);
  boundaryReduction.setContext(devContext);
  boundaryReductionGroups.setContext(devContext);  
  build_body2group_list.setContext(devContext);
  store_groups.setContext(devContext);
  segmentedCoarseGroupBoundary.setContext(devContext);
  /* load kernels tree properties */
  
#ifdef USE_CUDA
  build_key_list.load_source("./build_tree.ptx", pathName.c_str());
  build_valid_list.load_source("./build_tree.ptx", pathName.c_str());
  build_nodes.load_source("./build_tree.ptx", pathName.c_str());
  link_tree.load_source("./build_tree.ptx", pathName.c_str());
  define_groups.load_source("./build_tree.ptx", pathName.c_str());
  build_level_list.load_source("./build_tree.ptx", pathName.c_str());
  boundaryReduction.load_source("./build_tree.ptx", pathName.c_str());
  boundaryReductionGroups.load_source("./build_tree.ptx", pathName.c_str());
  build_body2group_list.load_source("./build_tree.ptx", pathName.c_str());
  store_groups.load_source("./build_tree.ptx", pathName.c_str());
  segmentedCoarseGroupBoundary.load_source("./build_tree.ptx", pathName.c_str());
  
  /* create kernels */

  build_key_list.create("cl_build_key_list", (const void*)&cl_build_key_list);
  build_valid_list.create("cl_build_valid_list", (const void*)&cl_build_valid_list);
  build_nodes.create("cl_build_nodes", (const void*)&cl_build_nodes);
  link_tree.create("cl_link_tree", (const void*)&cl_link_tree);
  define_groups.create("build_group_list2", (const void*)&build_group_list2);
  build_level_list.create("build_level_list", (const void*)&gpu_build_level_list);
  boundaryReduction.create("boundaryReduction", (const void*)&gpu_boundaryReduction);
  boundaryReductionGroups.create("boundaryReductionGroups", (const void*)&gpu_boundaryReductionGroups);
//  build_body2group_list.create("build_body2group_list", (const void*)&gpu_build_body2group_list);
  store_groups.create("store_group_list", (const void*)&store_group_list);
  segmentedCoarseGroupBoundary.create("segmentedCoarseGroupBoundary", (const void*)&gpu_segmentedCoarseGroupBoundary);


#else
  build_key_list.load_source("build_tree.cl", "");
  build_valid_list.load_source("build_tree.cl", "");
  build_nodes.load_source("build_tree.cl", "");
  link_tree.load_source("build_tree.cl", "");
  
  /* create kernels */

  build_key_list.create("cl_build_key_list");
  build_valid_list.create("cl_build_valid_list");
  build_nodes.create("cl_build_nodes");
  link_tree.create("cl_link_tree");
#endif

  // load tree-props kernels
  propsNonLeafD.setContext(devContext);
  propsLeafD.setContext(devContext);
  propsScalingD.setContext(devContext);
  
  setPHGroupData.setContext(devContext);
  setPHGroupDataGetKey.setContext(devContext);
  setPHGroupDataGetKey2.setContext(devContext);
 
  /* load kernels */
  
#ifdef USE_CUDA
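
  // The extra load_source arguments are presumably compiler options and a maximum
  // register count (-1 = compiler default); the gravity kernels below request 64.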

  propsNonLeafD.load_source("./compute_propertiesD.ptx", pathName.c_str(), "", -1);
  propsLeafD.load_source("./compute_propertiesD.ptx", pathName.c_str(), "", -1);
  propsScalingD.load_source("./compute_propertiesD.ptx", pathName.c_str(), "",-1);
  
  setPHGroupData.load_source("./compute_propertiesD.ptx", pathName.c_str());
  setPHGroupDataGetKey.load_source("./compute_propertiesD.ptx", pathName.c_str());
  setPHGroupDataGetKey2.load_source("./compute_propertiesD.ptx", pathName.c_str());
  /* create kernels */
  
  propsNonLeafD.create("compute_non_leaf", (const void*)&compute_non_leaf);
  propsLeafD.create("compute_leaf", (const void*)&compute_leaf);
  propsScalingD.create("compute_scaling", (const void*)&compute_scaling);

  setPHGroupData.create("setPHGroupData", (const void*)&gpu_setPHGroupData);
  setPHGroupDataGetKey.create("setPHGroupDataGetKey", (const void*)&gpu_setPHGroupDataGetKey);
  setPHGroupDataGetKey2.create("setPHGroupDataGetKey2", (const void*)&gpu_setPHGroupDataGetKey2);

  
#else
  propsNonLeaf.load_source("compProps.cl", "");
  propsLeaf.load_source("compProps.cl", "");
  propsScaling.load_source("compProps.cl", ""); 
  
  /* create kernels */

  propsNonLeaf.create("compute_non_leaf");
  propsLeaf.create("compute_leaf");
  propsScaling.create("compute_scaling");  
#endif

  /* Tree iteration */
  getTNext.setContext(devContext);
  predictParticles.setContext(devContext);
  getNActive.setContext(devContext);
  approxGrav.setContext(devContext);
  directGrav.setContext(devContext);
  correctParticles.setContext(devContext);
  computeDt.setContext(devContext);
  computeEnergy.setContext(devContext);
  setActiveGrps.setContext(devContext);
  distanceCheck.setContext(devContext);  


  approxGravLET.setContext(devContext);
  determineLET.setContext(devContext);

#ifdef USE_CUDA
  getTNext.load_source("./timestep.ptx", pathName.c_str(), "", -1);
  predictParticles.load_source("./timestep.ptx", pathName.c_str(), "", -1);
  getNActive.load_source("./timestep.ptx", pathName.c_str(), "", -1);
  approxGrav.load_source("./dev_approximate_gravity.ptx", pathName.c_str(), "", 64);
  directGrav.load_source("./dev_direct_gravity.ptx", pathName.c_str(), "", 64);
  correctParticles.load_source("./timestep.ptx", pathName.c_str(), "", -1);
  computeDt.load_source("./timestep.ptx", pathName.c_str(), "", -1);
  computeEnergy.load_source("./timestep.ptx", pathName.c_str(), "", -1);
  setActiveGrps.load_source("./timestep.ptx", pathName.c_str(), "", -1);
  distanceCheck.load_source("./timestep.ptx", pathName.c_str(), "", -1);  
  
  approxGravLET.load_source("./dev_approximate_gravity.ptx", pathName.c_str(), "", 64);  
  determineLET.load_source("./dev_approximate_gravity.ptx", pathName.c_str(), "", 64);
  /* create kernels */

  getTNext.create("get_Tnext", (const void*)&get_Tnext);
  predictParticles.create("predict_particles", (const void*)&predict_particles);
  getNActive.create("get_nactive", (const void*)&get_nactive);
  approxGrav.create("dev_approximate_gravity", (const void*)&dev_approximate_gravity);

#ifdef KEPLER /* preferL1 equal egaburov */
  cudaFuncSetCacheConfig((const void*)&dev_approximate_gravity, cudaFuncCachePreferL1);
  cudaFuncSetCacheConfig((const void*)&dev_approximate_gravity_let, cudaFuncCachePreferL1);
#if 0
#if 1
  cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte);
#else
  cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
#endif
#endif
#endif


  directGrav.create("dev_direct_gravity", (const void*)&dev_direct_gravity);
  correctParticles.create("correct_particles", (const void*)&correct_particles);
  computeDt.create("compute_dt", (const void*)&compute_dt);
  setActiveGrps.create("setActiveGroups", (const void*)&setActiveGroups);

  computeEnergy.create("compute_energy_double", (const void*)&compute_energy_double);
  distanceCheck.create("distanceCheck", (const void*)&distanceCheck);
  
  approxGravLET.create("dev_approximate_gravity_let", (const void*)&dev_approximate_gravity_let);
#if 0  /* egaburov, doesn't compile with this */
  determineLET.create("dev_determineLET", (const void*)&dev_determineLET);
#endif

#else
  getTNext.load_source("", "");
  
  /* create kernels */

  getTNext.create("");
  
#endif

  //Parallel kernels
  domainCheck.setContext(devContext);  
  extractSampleParticles.setContext(devContext);  
  extractOutOfDomainR4.setContext(devContext);  
  extractOutOfDomainBody.setContext(devContext);  
  insertNewParticles.setContext(devContext);  
  internalMove.setContext(devContext);  
  build_parallel_grps.setContext(devContext);
  segmentedSummaryBasic.setContext(devContext);
  domainCheckSFC.setContext(devContext);
  internalMoveSFC.setContext(devContext);
  internalMoveSFC2.setContext(devContext);
  extractOutOfDomainParticlesAdvancedSFC.setContext(devContext);
  extractOutOfDomainParticlesAdvancedSFC2.setContext(devContext);
  insertNewParticlesSFC.setContext(devContext);
  extractSampleParticlesSFC.setContext(devContext);
  domainCheckSFCAndAssign.setContext(devContext);



#ifdef USE_CUDA
  domainCheck.load_source("./parallel.ptx", pathName.c_str());
  extractSampleParticles.load_source("./parallel.ptx", pathName.c_str());
  extractOutOfDomainR4.load_source("./parallel.ptx", pathName.c_str());
  extractOutOfDomainBody.load_source("./parallel.ptx", pathName.c_str());
  insertNewParticles.load_source("./parallel.ptx", pathName.c_str());
  internalMove.load_source("./parallel.ptx", pathName.c_str());

  
  build_parallel_grps.load_source("./build_tree.ptx", pathName.c_str());
  segmentedSummaryBasic.load_source("./build_tree.ptx", pathName.c_str());
  domainCheckSFC.load_source("./parallel.ptx", pathName.c_str());
  internalMoveSFC.load_source("./parallel.ptx", pathName.c_str());
  internalMoveSFC2.load_source("./parallel.ptx", pathName.c_str());
  extractOutOfDomainParticlesAdvancedSFC.load_source("./parallel.ptx", pathName.c_str());
  extractOutOfDomainParticlesAdvancedSFC2.load_source("./parallel.ptx", pathName.c_str());
  insertNewParticlesSFC.load_source("./parallel.ptx", pathName.c_str());
  extractSampleParticlesSFC.load_source("./parallel.ptx", pathName.c_str());
  domainCheckSFCAndAssign.load_source("./parallel.ptx", pathName.c_str());

  domainCheck.create("doDomainCheck", (const void*)&doDomainCheck);
  extractSampleParticles.create("extractSampleParticles", (const void*)&gpu_extractSampleParticles);
  extractOutOfDomainR4.create("extractOutOfDomainParticlesR4", (const void*)&extractOutOfDomainParticlesR4);
  extractOutOfDomainBody.create("extractOutOfDomainParticlesAdvanced", (const void*)&extractOutOfDomainParticlesAdvanced);
  insertNewParticles.create("insertNewParticles", (const void*)&gpu_insertNewParticles);
  internalMove.create("internalMove", (const void*)&gpu_internalMove);

  extractSampleParticlesSFC.create("build_parallel_grps", (const void*)&gpu_extractSampleParticlesSFC);
  build_parallel_grps.create("build_parallel_grps", (const void*)&gpu_build_parallel_grps);
  segmentedSummaryBasic.create("segmentedSummaryBasic", (const void*)&gpu_segmentedSummaryBasic);
  domainCheckSFC.create("domainCheckSFC", (const void*)&gpu_domainCheckSFC);
  internalMoveSFC.create("internalMoveSFC", (const void*)&gpu_internalMoveSFC);
  internalMoveSFC2.create("internalMoveSFC2", (const void*)&gpu_internalMoveSFC2);
  extractOutOfDomainParticlesAdvancedSFC.create("extractOutOfDomainParticlesAdvancedSFC", (const void*)&gpu_extractOutOfDomainParticlesAdvancedSFC);
  extractOutOfDomainParticlesAdvancedSFC2.create("extractOutOfDomainParticlesAdvancedSFC2", (const void*)&gpu_extractOutOfDomainParticlesAdvancedSFC2);
  insertNewParticlesSFC.create("insertNewParticlesSFC", (const void*)&gpu_insertNewParticlesSFC);
  domainCheckSFCAndAssign.create("domainCheckSFCAndAssign", (const void*)&gpu_domainCheckSFCAndAssign);

#else


#endif
  
#ifdef USE_DUST
   define_dust_groups.setContext(devContext);
   define_dust_groups.load_source("./build_tree.ptx", pathName.c_str());
   define_dust_groups.create("define_dust_groups");
   
   store_dust_groups.setContext(devContext);
   store_dust_groups.load_source("./build_tree.ptx", pathName.c_str());
   store_dust_groups.create("store_dust_groups");
   
   predictDust.setContext(devContext);
   predictDust.load_source("./build_tree.ptx", pathName.c_str());
   predictDust.create("predict_dust_particles");   
   
   correctDust.setContext(devContext);
   correctDust.load_source("./build_tree.ptx", pathName.c_str());
   correctDust.create("correct_dust_particles");     
   
   
   
#endif  


}
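
Loading kernels from .ptx files as above is typically backed by the CUDA driver API: a module is loaded from the PTX image and the named entry point is looked up as a CUfunction. A minimal, self-contained sketch of that mechanism (hypothetical helper, not the wrapper class used by the original code):

#include <cuda.h>
#include <cstdio>

// Load one kernel from a PTX file and return its handle (sketch only).
// Assumes cuInit() has been called and a CUcontext is current, which is
// assumed to be handled by the devContext setup in the original code.
CUfunction loadKernelFromPTX(const char* ptxPath, const char* entryName) {
  CUmodule   module   = nullptr;
  CUfunction function = nullptr;
  if (cuModuleLoad(&module, ptxPath) != CUDA_SUCCESS ||
      cuModuleGetFunction(&function, module, entryName) != CUDA_SUCCESS) {
    std::fprintf(stderr, "Failed to load %s from %s\n", entryName, ptxPath);
    return nullptr;
  }
  return function;
}

// Hypothetical usage, mirroring the compactCount setup above:
//   CUfunction compact_count = loadKernelFromPTX("./scanKernels.ptx", "compact_count");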
Example #4
cudaError_t WINAPI wine_cudaFuncSetCacheConfig( const char *func, enum cudaFuncCache cacheConfig ) {
    WINE_TRACE("\n");
    return cudaFuncSetCacheConfig( func, cacheConfig );
}
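
This Wine thunk targets the older runtime interface in which device functions were identified by a const char* entry name; current CUDA versions use the untyped const void* overload (or a typed template overload) instead, identifying the function by its address, as in this small sketch:

#include <cuda_runtime.h>

__global__ void dummyKernel() { }   // hypothetical kernel

void setCachePreference() {
    // Same effect as the forwarded call above, using the address-based API.
    cudaFuncSetCacheConfig(reinterpret_cast<const void*>(&dummyKernel), cudaFuncCachePreferL1);
}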
Example #5
// threads, N, S, E and W are used as template arguments below, so they must be
// compile-time constants (presumably parameters of an enclosing template).
inline void BCKernelLauncher(const KernelConfiguration& config) {
	cudaFuncSetCacheConfig(BCKernel<threads, N, S, E, W>, cudaFuncCachePreferL1);
	BCKernel<threads, N, S, E, W><<<config.grid, config.block, 0, config.stream>>>();
	KPSIMULATOR_CHECK_CUDA_ERROR("BCKernel");
}