// 'step' must be a compile-time constant, since it is passed as a template
// argument to FGHKernel below; the template header is restored accordingly.
template<int step>
void FGHKernelLauncher(const FGHKernelArgs* h_ctx, const KernelConfiguration& config) {
    if (step > 1) {
        std::cout << "This kernel is only valid for 2-step RK" << std::endl;
        exit(-1);
    }

    //Upload parameters to the GPU
    KPSIMULATOR_CHECK_CUDA(cudaMemcpyToSymbolAsync(fgh_ctx, h_ctx, sizeof(FGHKernelArgs), 0,
            cudaMemcpyHostToDevice, config.stream));

    //Launch kernel
    cudaFuncSetCacheConfig(FGHKernel<KPSIMULATOR_FLUX_BLOCK_WIDTH, KPSIMULATOR_FLUX_BLOCK_HEIGHT, step>,
            cudaFuncCachePreferShared);
    FGHKernel<KPSIMULATOR_FLUX_BLOCK_WIDTH, KPSIMULATOR_FLUX_BLOCK_HEIGHT, step>
            <<<config.grid, config.block, 0, config.stream>>>();
    KPSIMULATOR_CHECK_CUDA_ERROR("fluxSourceKernel");
}
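// ---------------------------------------------------------------------------
// A minimal sketch (not from the original source) of the device-side
// counterpart the launcher above assumes: a __constant__ instance named
// fgh_ctx that cudaMemcpyToSymbolAsync fills before each launch, and a kernel
// that reads its arguments straight from constant memory. The kernel body is
// illustrative only.
// ---------------------------------------------------------------------------
__constant__ FGHKernelArgs fgh_ctx;

template<int block_width, int block_height, int step>
__global__ void FGHKernel() {
    // Arguments arrive through the constant buffer rather than as kernel
    // parameters, so every launch reuses the same (empty) signature.
    const FGHKernelArgs& args = fgh_ctx;
    // ... flux and source-term computation would use 'args' here ...
}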
// DriverType is a template parameter in the original KokkosArray source; the
// snippet omitted the enclosing template declaration and return type, which
// are restored here so the code is self-contained.
template< class DriverType >
inline void CudaParallelLaunch( const DriverType & driver,
                                const dim3 & grid,
                                const dim3 & block,
                                const int shmem )
{
  if ( sizeof( KokkosArray::Impl::CudaTraits::ConstantGlobalBufferType ) < sizeof( DriverType ) ) {
    KokkosArray::Impl::throw_runtime_exception(
      std::string("CudaParallelLaunch FAILED: Functor is too large") );
  }

  if ( CudaTraits::SharedMemoryCapacity < shmem ) {
    KokkosArray::Impl::throw_runtime_exception(
      std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
  }
  else if ( shmem ) {
    cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType >,
                            cudaFuncCachePreferShared );
  }

  // Copy functor to constant memory on the device
  cudaMemcpyToSymbol( kokkos_impl_cuda_constant_memory_buffer, &driver, sizeof(DriverType) );

  // Invoke the driver function on the device
  cuda_parallel_launch_constant_memory< DriverType ><<< grid, block, shmem >>>();
}
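// ---------------------------------------------------------------------------
// Hedged reconstruction, inferred from the calls above rather than copied
// from the KokkosArray source: the __constant__ staging buffer that
// cudaMemcpyToSymbol fills, and the trampoline kernel that reinterprets it as
// the functor and invokes it. The buffer size is illustrative.
// ---------------------------------------------------------------------------
__constant__ unsigned long kokkos_impl_cuda_constant_memory_buffer[ 0x2000 ];

template< class DriverType >
__global__ static void cuda_parallel_launch_constant_memory()
{
  // Reinterpret the constant-memory bytes as the host-constructed functor.
  const DriverType & driver =
    *reinterpret_cast< const DriverType * >( kokkos_impl_cuda_constant_memory_buffer );
  driver();
}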
void octree::load_kernels() {
  if (!devContext_flag) set_context();

  //If we arrive here we have acquired a device, configure parts of the code

  //Get the number of multiprocessors and compute number of
  //blocks to be used during the tree-walk
  nMultiProcessors      = devContext.multiProcessorCount;
  const int blocksPerSM = getTreeWalkBlocksPerSM(
                            this->getDevContext()->getComputeCapabilityMajor(),
                            this->getDevContext()->getComputeCapabilityMinor());
  nBlocksForTreeWalk = nMultiProcessors * blocksPerSM;

  std::string pathName;

  //AMUSE specific
  if(this->src_directory != NULL)
  {
    pathName.assign(this->src_directory);
  }
  else
  {
    //Strip the executable name, to get the path name
    std::string temp(execPath);
    int idx = (int)temp.find_last_of("/\\");
    pathName.assign(temp.substr(0, idx+1));
  }

  // load scan & sort kernels

  compactCount.setContext(devContext);
  exScanBlock.setContext(devContext);
  compactMove.setContext(devContext);
  splitMove.setContext(devContext);
  sortCount.setContext(devContext);
  sortMove.setContext(devContext);
  extractInt.setContext(devContext);
  reOrderKeysValues.setContext(devContext);
  convertKey64to96.setContext(devContext);
  extractKeyAndPerm.setContext(devContext);
  dataReorderR4.setContext(devContext);
  dataReorderF2.setContext(devContext);
  dataReorderI1.setContext(devContext);
  dataReorderCombined.setContext(devContext);

#ifdef USE_CUDA
  compactCount.load_source("./scanKernels.ptx", pathName.c_str());
  compactCount.create("compact_count", (const void*)&compact_count);
  exScanBlock.load_source("./scanKernels.ptx", pathName.c_str());
  exScanBlock.create("exclusive_scan_block", (const void*)&exclusive_scan_block);
  compactMove.load_source("./scanKernels.ptx", pathName.c_str());
  compactMove.create("compact_move", (const void*)&compact_move);
  splitMove.load_source("./scanKernels.ptx", pathName.c_str());
  splitMove.create("split_move", (const void*)split_move);
  sortCount.load_source("./sortKernels.ptx", pathName.c_str());
  sortCount.create("sort_count", (const void*)sort_count);
  sortMove.load_source("./sortKernels.ptx", pathName.c_str());
  sortMove.create("sort_move_stage_key_value", (const void*)sort_move_stage_key_value);
  extractInt.load_source("./sortKernels.ptx", pathName.c_str());
  extractInt.create("extractInt", (const void*)extractInt_kernel);
  reOrderKeysValues.load_source("./sortKernels.ptx", pathName.c_str());
  reOrderKeysValues.create("reOrderKeysValues", (const void*)&reOrderKeysValues_kernel);
  extractKeyAndPerm.load_source("./sortKernels.ptx", pathName.c_str());
  extractKeyAndPerm.create("extractKeyAndPerm", (const void*)&gpu_extractKeyAndPerm);
  convertKey64to96.load_source("./sortKernels.ptx", pathName.c_str());
  convertKey64to96.create("convertKey64to96", (const void*)&gpu_convertKey64to96);
  dataReorderR4.load_source("./sortKernels.ptx", pathName.c_str());
//  dataReorderR4.create("dataReorderR4");
  dataReorderR4.create("dataReorderCombined4", (const void*)&dataReorderCombined4);
  dataReorderF2.load_source("./sortKernels.ptx", pathName.c_str());
  dataReorderF2.create("dataReorderF2", (const void*)&gpu_dataReorderF2);
  dataReorderI1.load_source("./sortKernels.ptx", pathName.c_str());
  dataReorderI1.create("dataReorderI1", (const void*)&gpu_dataReorderI1);
  dataReorderCombined.load_source("./sortKernels.ptx", pathName.c_str());
  dataReorderCombined.create("dataReorderCombined", (const void*)&gpu_dataReorderCombined);
#else
  compactCount.load_source("scanKernels.cl", "OpenCLKernels");
  compactCount.create("compact_count");
  exScanBlock.load_source("scanKernels.cl", "OpenCLKernels");
exScanBlock.create("exclusive_scan_block"); compactMove.load_source("scanKernels.cl", "OpenCLKernels"); compactMove.create("compact_move"); splitMove.load_source("scanKernels.cl", "OpenCLKernels"); splitMove.create("split_move"); #endif // load tree-build kernels /* set context */ build_key_list.setContext(devContext); build_valid_list.setContext(devContext); build_nodes.setContext(devContext); link_tree.setContext(devContext); define_groups.setContext(devContext); build_level_list.setContext(devContext); boundaryReduction.setContext(devContext); boundaryReductionGroups.setContext(devContext); build_body2group_list.setContext(devContext); store_groups.setContext(devContext); segmentedCoarseGroupBoundary.setContext(devContext); /* load kernels tree properties */ #ifdef USE_CUDA build_key_list.load_source("./build_tree.ptx", pathName.c_str()); build_valid_list.load_source("./build_tree.ptx", pathName.c_str()); build_nodes.load_source("./build_tree.ptx", pathName.c_str()); link_tree.load_source("./build_tree.ptx", pathName.c_str()); define_groups.load_source("./build_tree.ptx", pathName.c_str()); build_level_list.load_source("./build_tree.ptx", pathName.c_str()); boundaryReduction.load_source("./build_tree.ptx", pathName.c_str()); boundaryReductionGroups.load_source("./build_tree.ptx", pathName.c_str()); build_body2group_list.load_source("./build_tree.ptx", pathName.c_str()); store_groups.load_source("./build_tree.ptx", pathName.c_str()); segmentedCoarseGroupBoundary.load_source("./build_tree.ptx", pathName.c_str()); /* create kernels */ build_key_list.create("cl_build_key_list", (const void*)&cl_build_key_list); build_valid_list.create("cl_build_valid_list", (const void*)&cl_build_valid_list); build_nodes.create("cl_build_nodes", (const void*)&cl_build_nodes); link_tree.create("cl_link_tree", (const void*)&cl_link_tree); define_groups.create("build_group_list2", (const void*)&build_group_list2); build_level_list.create("build_level_list", (const void*)&gpu_build_level_list); boundaryReduction.create("boundaryReduction", (const void*)&gpu_boundaryReduction); boundaryReductionGroups.create("boundaryReductionGroups", (const void*)&gpu_boundaryReductionGroups); // build_body2group_list.create("build_body2group_list", (const void*)&gpu_build_body2group_list); store_groups.create("store_group_list", (const void*)&store_group_list); segmentedCoarseGroupBoundary.create("segmentedCoarseGroupBoundary", (const void*)&gpu_segmentedCoarseGroupBoundary); #else build_key_list.load_source("build_tree.cl", ""); build_valid_list.load_source("build_tree.cl", ""); build_nodes.load_source("build_tree.cl", ""); link_tree.load_source("build_tree.cl", ""); /* create kernels */ build_key_list.create("cl_build_key_list"); build_valid_list.create("cl_build_valid_list"); build_nodes.create("cl_build_nodes"); link_tree.create("cl_link_tree"); #endif // load tree-props kernels propsNonLeafD.setContext(devContext); propsLeafD.setContext(devContext); propsScalingD.setContext(devContext); setPHGroupData.setContext(devContext); setPHGroupDataGetKey.setContext(devContext); setPHGroupDataGetKey2.setContext(devContext); /* load kernels */ #ifdef USE_CUDA propsNonLeafD.load_source("./compute_propertiesD.ptx", pathName.c_str(), "", -1); propsLeafD.load_source("./compute_propertiesD.ptx", pathName.c_str(), "", -1); propsScalingD.load_source("./compute_propertiesD.ptx", pathName.c_str(), "",-1); setPHGroupData.load_source("./compute_propertiesD.ptx", pathName.c_str()); setPHGroupDataGetKey.load_source("./compute_propertiesD.ptx", 
  setPHGroupDataGetKey2.load_source("./compute_propertiesD.ptx", pathName.c_str());

  /* create kernels */
  propsNonLeafD.create("compute_non_leaf", (const void*)&compute_non_leaf);
  propsLeafD.create("compute_leaf", (const void*)&compute_leaf);
  propsScalingD.create("compute_scaling", (const void*)&compute_scaling);
  setPHGroupData.create("setPHGroupData", (const void*)&gpu_setPHGroupData);
  setPHGroupDataGetKey.create("setPHGroupDataGetKey", (const void*)&gpu_setPHGroupDataGetKey);
  setPHGroupDataGetKey2.create("setPHGroupDataGetKey2", (const void*)&gpu_setPHGroupDataGetKey2);
#else
  propsNonLeaf.load_source("compProps.cl", "");
  propsLeaf.load_source("compProps.cl", "");
  propsScaling.load_source("compProps.cl", "");

  /* create kernels */
  propsNonLeaf.create("compute_non_leaf");
  propsLeaf.create("compute_leaf");
  propsScaling.create("compute_scaling");
#endif

  /* Tree iteration */
  getTNext.setContext(devContext);
  predictParticles.setContext(devContext);
  getNActive.setContext(devContext);
  approxGrav.setContext(devContext);
  directGrav.setContext(devContext);
  correctParticles.setContext(devContext);
  computeDt.setContext(devContext);
  computeEnergy.setContext(devContext);
  setActiveGrps.setContext(devContext);
  distanceCheck.setContext(devContext);
  approxGravLET.setContext(devContext);
  determineLET.setContext(devContext);

#ifdef USE_CUDA
  getTNext.load_source("./timestep.ptx", pathName.c_str(), "", -1);
  predictParticles.load_source("./timestep.ptx", pathName.c_str(), "", -1);
  getNActive.load_source("./timestep.ptx", pathName.c_str(), "", -1);
  approxGrav.load_source("./dev_approximate_gravity.ptx", pathName.c_str(), "", 64);
  directGrav.load_source("./dev_direct_gravity.ptx", pathName.c_str(), "", 64);
  correctParticles.load_source("./timestep.ptx", pathName.c_str(), "", -1);
  computeDt.load_source("./timestep.ptx", pathName.c_str(), "", -1);
  computeEnergy.load_source("./timestep.ptx", pathName.c_str(), "", -1);
  setActiveGrps.load_source("./timestep.ptx", pathName.c_str(), "", -1);
  distanceCheck.load_source("./timestep.ptx", pathName.c_str(), "", -1);
  approxGravLET.load_source("./dev_approximate_gravity.ptx", pathName.c_str(), "", 64);
  determineLET.load_source("./dev_approximate_gravity.ptx", pathName.c_str(), "", 64);

  /* create kernels */
  getTNext.create("get_Tnext", (const void*)&get_Tnext);
  predictParticles.create("predict_particles", (const void*)&predict_particles);
  getNActive.create("get_nactive", (const void*)&get_nactive);
  approxGrav.create("dev_approximate_gravity", (const void*)&dev_approximate_gravity);
#ifdef KEPLER
  /* preferL1 equal egaburov */
  cudaFuncSetCacheConfig((const void*)&dev_approximate_gravity,     cudaFuncCachePreferL1);
  cudaFuncSetCacheConfig((const void*)&dev_approximate_gravity_let, cudaFuncCachePreferL1);
#if 0
#if 1
  cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte);
#else
  cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
#endif
#endif
#endif
  directGrav.create("dev_direct_gravity", (const void*)&dev_direct_gravity);
  correctParticles.create("correct_particles", (const void*)&correct_particles);
  computeDt.create("compute_dt", (const void*)&compute_dt);
  setActiveGrps.create("setActiveGroups", (const void*)&setActiveGroups);
  computeEnergy.create("compute_energy_double", (const void*)&compute_energy_double);
  distanceCheck.create("distanceCheck", (const void*)&distanceCheck);
  approxGravLET.create("dev_approximate_gravity_let", (const void*)&dev_approximate_gravity_let);
#if 0 /* egaburov, doesn't compile with this */
determineLET.create("dev_determineLET", (const void*)&dev_determineLET); #endif #else getTNext.load_source("", ""); /* create kernels */ getTNext.create(""); #endif //Parallel kernels domainCheck.setContext(devContext); extractSampleParticles.setContext(devContext); extractOutOfDomainR4.setContext(devContext); extractOutOfDomainBody.setContext(devContext); insertNewParticles.setContext(devContext); internalMove.setContext(devContext); build_parallel_grps.setContext(devContext); segmentedSummaryBasic.setContext(devContext); domainCheckSFC.setContext(devContext); internalMoveSFC.setContext(devContext); internalMoveSFC2.setContext(devContext); extractOutOfDomainParticlesAdvancedSFC.setContext(devContext); extractOutOfDomainParticlesAdvancedSFC2.setContext(devContext); insertNewParticlesSFC.setContext(devContext); extractSampleParticlesSFC.setContext(devContext); domainCheckSFCAndAssign.setContext(devContext); #ifdef USE_CUDA domainCheck.load_source("./parallel.ptx", pathName.c_str()); extractSampleParticles.load_source("./parallel.ptx", pathName.c_str()); extractOutOfDomainR4.load_source("./parallel.ptx", pathName.c_str()); extractOutOfDomainBody.load_source("./parallel.ptx", pathName.c_str()); insertNewParticles.load_source("./parallel.ptx", pathName.c_str()); internalMove.load_source("./parallel.ptx", pathName.c_str()); build_parallel_grps.load_source("./build_tree.ptx", pathName.c_str()); segmentedSummaryBasic.load_source("./build_tree.ptx", pathName.c_str()); domainCheckSFC.load_source("./parallel.ptx", pathName.c_str()); internalMoveSFC.load_source("./parallel.ptx", pathName.c_str()); internalMoveSFC2.load_source("./parallel.ptx", pathName.c_str()); extractOutOfDomainParticlesAdvancedSFC.load_source("./parallel.ptx", pathName.c_str()); extractOutOfDomainParticlesAdvancedSFC2.load_source("./parallel.ptx", pathName.c_str()); insertNewParticlesSFC.load_source("./parallel.ptx", pathName.c_str()); extractSampleParticlesSFC.load_source("./parallel.ptx", pathName.c_str()); domainCheckSFCAndAssign.load_source("./parallel.ptx", pathName.c_str()); domainCheck.create("doDomainCheck", (const void*)&doDomainCheck); extractSampleParticles.create("extractSampleParticles", (const void*)&gpu_extractSampleParticles); extractOutOfDomainR4.create("extractOutOfDomainParticlesR4", (const void*)&extractOutOfDomainParticlesR4); extractOutOfDomainBody.create("extractOutOfDomainParticlesAdvanced", (const void*)&extractOutOfDomainParticlesAdvanced); insertNewParticles.create("insertNewParticles", (const void*)&gpu_insertNewParticles); internalMove.create("internalMove", (const void*)&gpu_internalMove); extractSampleParticlesSFC.create("build_parallel_grps", (const void*)&gpu_extractSampleParticlesSFC); build_parallel_grps.create("build_parallel_grps", (const void*)&gpu_build_parallel_grps); segmentedSummaryBasic.create("segmentedSummaryBasic", (const void*)&gpu_segmentedSummaryBasic); domainCheckSFC.create("domainCheckSFC", (const void*)&gpu_domainCheckSFC); internalMoveSFC.create("internalMoveSFC", (const void*)&gpu_internalMoveSFC); internalMoveSFC2.create("internalMoveSFC2", (const void*)&gpu_internalMoveSFC2); extractOutOfDomainParticlesAdvancedSFC.create("extractOutOfDomainParticlesAdvancedSFC", (const void*)&gpu_extractOutOfDomainParticlesAdvancedSFC); extractOutOfDomainParticlesAdvancedSFC2.create("extractOutOfDomainParticlesAdvancedSFC2", (const void*)&gpu_extractOutOfDomainParticlesAdvancedSFC2); insertNewParticlesSFC.create("insertNewParticlesSFC", (const void*)&gpu_insertNewParticlesSFC); 
  domainCheckSFCAndAssign.create("domainCheckSFCAndAssign", (const void*)&gpu_domainCheckSFCAndAssign);
#else
#endif

#ifdef USE_DUST
  define_dust_groups.setContext(devContext);
  define_dust_groups.load_source("./build_tree.ptx", pathName.c_str());
  define_dust_groups.create("define_dust_groups");

  store_dust_groups.setContext(devContext);
  store_dust_groups.load_source("./build_tree.ptx", pathName.c_str());
  store_dust_groups.create("store_dust_groups");

  predictDust.setContext(devContext);
  predictDust.load_source("./build_tree.ptx", pathName.c_str());
  predictDust.create("predict_dust_particles");

  correctDust.setContext(devContext);
  correctDust.load_source("./build_tree.ptx", pathName.c_str());
  correctDust.create("correct_dust_particles");
#endif
}
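// ---------------------------------------------------------------------------
// The cudaFuncSetCacheConfig calls inside load_kernels() discard their return
// status. A minimal hedged sketch of a checked variant; the helper name is
// ours, not part of the original tree code.
// ---------------------------------------------------------------------------
#include <cstdio>

static void setCachePreferenceChecked(const void* func, cudaFuncCache pref)
{
    const cudaError_t err = cudaFuncSetCacheConfig(func, pref);
    if (err != cudaSuccess) {
        std::fprintf(stderr, "cudaFuncSetCacheConfig failed: %s\n",
                     cudaGetErrorString(err));
    }
}

// Example use, mirroring the KEPLER branch above:
//   setCachePreferenceChecked((const void*)&dev_approximate_gravity,
//                             cudaFuncCachePreferL1);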
cudaError_t WINAPI wine_cudaFuncSetCacheConfig( const char *func, enum cudaFuncCache cacheConfig )
{
    WINE_TRACE("\n");
    return cudaFuncSetCacheConfig( func, cacheConfig );
}
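/* --------------------------------------------------------------------------
 * Illustrative companion sketch, not taken from Wine's sources: the
 * device-wide counterpart cudaDeviceSetCacheConfig can be forwarded with the
 * same pass-through pattern as the wrapper above.
 * -------------------------------------------------------------------------- */
cudaError_t WINAPI wine_cudaDeviceSetCacheConfig( enum cudaFuncCache cacheConfig )
{
    WINE_TRACE("\n");
    return cudaDeviceSetCacheConfig( cacheConfig );
}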
// threads, N, S, E, and W must be compile-time constants, since they are
// passed as template arguments to BCKernel; the template header is restored.
template<int threads, int N, int S, int E, int W>
inline void BCKernelLauncher(const KernelConfiguration& config) {
    cudaFuncSetCacheConfig(BCKernel<threads, N, S, E, W>, cudaFuncCachePreferL1);
    BCKernel<threads, N, S, E, W><<<config.grid, config.block, 0, config.stream>>>();
    KPSIMULATOR_CHECK_CUDA_ERROR("BCKernel");
}
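// ---------------------------------------------------------------------------
// Hedged usage sketch: instantiating the launcher with illustrative values
// (the thread count and boundary flags below are not from the original).
// ---------------------------------------------------------------------------
// KernelConfiguration config = ...;             // grid/block/stream set up elsewhere
// BCKernelLauncher<128, 1, 1, 0, 0>(config);    // e.g. north/south walls active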