// Loads, configures and JIT-creates every GPU kernel used by the tree-code:
// scan/sort kernels, tree-construction kernels, tree-property kernels,
// time-integration kernels and the parallel (multi-process) domain kernels.
// Requires a device context; acquires one via set_context() if not yet done.
// Kernel sources are resolved relative to src_directory (AMUSE mode) or to
// the directory containing the executable (execPath).
void octree::load_kernels() {

  if (!devContext_flag) set_context();

  // If we arrive here we have acquired a device; configure parts of the code.
  // Get the number of multiprocessors and compute the number of
  // blocks to be used during the tree-walk.
  nMultiProcessors   = devContext.multiProcessorCount;
  const int blocksPerSM = getTreeWalkBlocksPerSM(
      this->getDevContext()->getComputeCapabilityMajor(),
      this->getDevContext()->getComputeCapabilityMinor());
  nBlocksForTreeWalk = nMultiProcessors*blocksPerSM;

  std::string pathName;

  // AMUSE specific: an explicitly set source directory overrides the default.
  if(this->src_directory != NULL)
  {
    pathName.assign(this->src_directory);
  }
  else
  {
    // Strip the executable name, to get the path name
    std::string temp(execPath);
    int idx = (int)temp.find_last_of("/\\");
    pathName.assign(temp.substr(0, idx+1));
  }

  // ---------------------------------------------------------------- //
  // Scan & sort kernels                                              //
  // ---------------------------------------------------------------- //
  compactCount.setContext(devContext);
  exScanBlock.setContext(devContext);
  compactMove.setContext(devContext);
  splitMove.setContext(devContext);
  sortCount.setContext(devContext);
  sortMove.setContext(devContext);
  extractInt.setContext(devContext);
  reOrderKeysValues.setContext(devContext);
  convertKey64to96.setContext(devContext);
  extractKeyAndPerm.setContext(devContext);
  dataReorderR4.setContext(devContext);
  dataReorderF2.setContext(devContext);
  dataReorderI1.setContext(devContext);
  dataReorderCombined.setContext(devContext);

#ifdef USE_CUDA
  // CUDA build: load pre-compiled PTX and bind each wrapper both to the
  // kernel name in the PTX and to the host-side device-function pointer.
  compactCount.load_source("./scanKernels.ptx", pathName.c_str());
  compactCount.create("compact_count", (const void*)&compact_count);
  exScanBlock.load_source("./scanKernels.ptx", pathName.c_str());
  exScanBlock.create("exclusive_scan_block", (const void*)&exclusive_scan_block);
  compactMove.load_source("./scanKernels.ptx", pathName.c_str());
  compactMove.create("compact_move", (const void*)&compact_move);
  splitMove.load_source("./scanKernels.ptx", pathName.c_str());
  splitMove.create("split_move", (const void*)split_move);
  sortCount.load_source("./sortKernels.ptx", pathName.c_str());
  sortCount.create("sort_count", (const void*)sort_count);
  sortMove.load_source("./sortKernels.ptx", pathName.c_str());
  sortMove.create("sort_move_stage_key_value", (const void*)sort_move_stage_key_value);
  extractInt.load_source("./sortKernels.ptx", pathName.c_str());
  extractInt.create("extractInt", (const void*)extractInt_kernel);
  reOrderKeysValues.load_source("./sortKernels.ptx", pathName.c_str());
  reOrderKeysValues.create("reOrderKeysValues", (const void*)&reOrderKeysValues_kernel);
  extractKeyAndPerm.load_source("./sortKernels.ptx", pathName.c_str());
  extractKeyAndPerm.create("extractKeyAndPerm", (const void*)&gpu_extractKeyAndPerm);
  convertKey64to96.load_source("./sortKernels.ptx", pathName.c_str());
  convertKey64to96.create("convertKey64to96", (const void*)&gpu_convertKey64to96);
  dataReorderR4.load_source("./sortKernels.ptx", pathName.c_str());
  // NOTE(review): dataReorderR4 is bound to the "dataReorderCombined4" kernel
  // (the plain "dataReorderR4" create below was disabled) — confirm intended.
  // dataReorderR4.create("dataReorderR4");
  dataReorderR4.create("dataReorderCombined4", (const void*)&dataReorderCombined4);
  dataReorderF2.load_source("./sortKernels.ptx", pathName.c_str());
  dataReorderF2.create("dataReorderF2", (const void*)&gpu_dataReorderF2);
  dataReorderI1.load_source("./sortKernels.ptx", pathName.c_str());
  dataReorderI1.create("dataReorderI1", (const void*)&gpu_dataReorderI1);
  dataReorderCombined.load_source("./sortKernels.ptx", pathName.c_str());
  dataReorderCombined.create("dataReorderCombined", (const void*)&gpu_dataReorderCombined);
#else
  // OpenCL build: compile from .cl source; kernels are resolved by name only.
  compactCount.load_source("scanKernels.cl", "OpenCLKernels");
  compactCount.create("compact_count");
  exScanBlock.load_source("scanKernels.cl", "OpenCLKernels");
  exScanBlock.create("exclusive_scan_block");
  compactMove.load_source("scanKernels.cl", "OpenCLKernels");
  compactMove.create("compact_move");
  splitMove.load_source("scanKernels.cl", "OpenCLKernels");
  splitMove.create("split_move");
#endif

  // ---------------------------------------------------------------- //
  // Tree-build kernels                                               //
  // ---------------------------------------------------------------- //
  /* set context */
  build_key_list.setContext(devContext);
  build_valid_list.setContext(devContext);
  build_nodes.setContext(devContext);
  link_tree.setContext(devContext);
  define_groups.setContext(devContext);
  build_level_list.setContext(devContext);
  boundaryReduction.setContext(devContext);
  boundaryReductionGroups.setContext(devContext);
  build_body2group_list.setContext(devContext);
  store_groups.setContext(devContext);
  segmentedCoarseGroupBoundary.setContext(devContext);

  /* load kernels tree properties */
#ifdef USE_CUDA
  build_key_list.load_source("./build_tree.ptx", pathName.c_str());
  build_valid_list.load_source("./build_tree.ptx", pathName.c_str());
  build_nodes.load_source("./build_tree.ptx", pathName.c_str());
  link_tree.load_source("./build_tree.ptx", pathName.c_str());
  define_groups.load_source("./build_tree.ptx", pathName.c_str());
  build_level_list.load_source("./build_tree.ptx", pathName.c_str());
  boundaryReduction.load_source("./build_tree.ptx", pathName.c_str());
  boundaryReductionGroups.load_source("./build_tree.ptx", pathName.c_str());
  build_body2group_list.load_source("./build_tree.ptx", pathName.c_str());
  store_groups.load_source("./build_tree.ptx", pathName.c_str());
  segmentedCoarseGroupBoundary.load_source("./build_tree.ptx", pathName.c_str());

  /* create kernels */
  build_key_list.create("cl_build_key_list", (const void*)&cl_build_key_list);
  build_valid_list.create("cl_build_valid_list", (const void*)&cl_build_valid_list);
  build_nodes.create("cl_build_nodes", (const void*)&cl_build_nodes);
  link_tree.create("cl_link_tree", (const void*)&cl_link_tree);
  define_groups.create("build_group_list2", (const void*)&build_group_list2);
  build_level_list.create("build_level_list", (const void*)&gpu_build_level_list);
  boundaryReduction.create("boundaryReduction", (const void*)&gpu_boundaryReduction);
  boundaryReductionGroups.create("boundaryReductionGroups", (const void*)&gpu_boundaryReductionGroups);
  // NOTE(review): build_body2group_list gets setContext()/load_source() above
  // but its create() is commented out — using it before a create() elsewhere
  // would be an error; confirm it is created somewhere else or unused.
  // build_body2group_list.create("build_body2group_list", (const void*)&gpu_build_body2group_list);
  store_groups.create("store_group_list", (const void*)&store_group_list);
  segmentedCoarseGroupBoundary.create("segmentedCoarseGroupBoundary", (const void*)&gpu_segmentedCoarseGroupBoundary);
#else
  build_key_list.load_source("build_tree.cl", "");
  build_valid_list.load_source("build_tree.cl", "");
  build_nodes.load_source("build_tree.cl", "");
  link_tree.load_source("build_tree.cl", "");
  /* create kernels */
  build_key_list.create("cl_build_key_list");
  build_valid_list.create("cl_build_valid_list");
  build_nodes.create("cl_build_nodes");
  link_tree.create("cl_link_tree");
#endif

  // ---------------------------------------------------------------- //
  // Tree-properties kernels                                          //
  // ---------------------------------------------------------------- //
  propsNonLeafD.setContext(devContext);
  propsLeafD.setContext(devContext);
  propsScalingD.setContext(devContext);
  setPHGroupData.setContext(devContext);
  setPHGroupDataGetKey.setContext(devContext);
  setPHGroupDataGetKey2.setContext(devContext);

  /* load kernels */
#ifdef USE_CUDA
  propsNonLeafD.load_source("./compute_propertiesD.ptx", pathName.c_str(), "", -1);
  propsLeafD.load_source("./compute_propertiesD.ptx", pathName.c_str(), "", -1);
  propsScalingD.load_source("./compute_propertiesD.ptx", pathName.c_str(), "",-1);
  setPHGroupData.load_source("./compute_propertiesD.ptx", pathName.c_str());
  setPHGroupDataGetKey.load_source("./compute_propertiesD.ptx", pathName.c_str());
  setPHGroupDataGetKey2.load_source("./compute_propertiesD.ptx", pathName.c_str());
  /* create kernels */
  propsNonLeafD.create("compute_non_leaf", (const void*)&compute_non_leaf);
  propsLeafD.create("compute_leaf", (const void*)&compute_leaf);
  propsScalingD.create("compute_scaling", (const void*)&compute_scaling);
  setPHGroupData.create("setPHGroupData", (const void*)&gpu_setPHGroupData);
  setPHGroupDataGetKey.create("setPHGroupDataGetKey", (const void*)&gpu_setPHGroupDataGetKey);
  setPHGroupDataGetKey2.create("setPHGroupDataGetKey2", (const void*)&gpu_setPHGroupDataGetKey2);
#else
  // NOTE(review): this OpenCL branch uses propsNonLeaf/propsLeaf/propsScaling
  // (without the 'D' suffix) while only the 'D' members get setContext() above
  // — this path looks stale; verify it still compiles/runs when USE_CUDA is off.
  propsNonLeaf.load_source("compProps.cl", "");
  propsLeaf.load_source("compProps.cl", "");
  propsScaling.load_source("compProps.cl", "");
  /* create kernels */
  propsNonLeaf.create("compute_non_leaf");
  propsLeaf.create("compute_leaf");
  propsScaling.create("compute_scaling");
#endif

  // ---------------------------------------------------------------- //
  // Tree iteration (time-integration / gravity) kernels              //
  // ---------------------------------------------------------------- //
  getTNext.setContext(devContext);
  predictParticles.setContext(devContext);
  getNActive.setContext(devContext);
  approxGrav.setContext(devContext);
  directGrav.setContext(devContext);
  correctParticles.setContext(devContext);
  computeDt.setContext(devContext);
  computeEnergy.setContext(devContext);
  setActiveGrps.setContext(devContext);
  distanceCheck.setContext(devContext);
  approxGravLET.setContext(devContext);
  determineLET.setContext(devContext);

#ifdef USE_CUDA
  // The gravity kernels request 64 registers per thread; -1 leaves the
  // register count up to the compiler.
  getTNext.load_source("./timestep.ptx", pathName.c_str(), "", -1);
  predictParticles.load_source("./timestep.ptx", pathName.c_str(), "", -1);
  getNActive.load_source("./timestep.ptx", pathName.c_str(), "", -1);
  approxGrav.load_source("./dev_approximate_gravity.ptx", pathName.c_str(), "", 64);
  directGrav.load_source("./dev_direct_gravity.ptx", pathName.c_str(), "", 64);
  correctParticles.load_source("./timestep.ptx", pathName.c_str(), "", -1);
  computeDt.load_source("./timestep.ptx", pathName.c_str(), "", -1);
  computeEnergy.load_source("./timestep.ptx", pathName.c_str(), "", -1);
  setActiveGrps.load_source("./timestep.ptx", pathName.c_str(), "", -1);
  distanceCheck.load_source("./timestep.ptx", pathName.c_str(), "", -1);
  approxGravLET.load_source("./dev_approximate_gravity.ptx", pathName.c_str(), "", 64);
  determineLET.load_source("./dev_approximate_gravity.ptx", pathName.c_str(), "", 64);

  /* create kernels */
  getTNext.create("get_Tnext", (const void*)&get_Tnext);
  predictParticles.create("predict_particles", (const void*)&predict_particles);
  getNActive.create("get_nactive", (const void*)&get_nactive);
  approxGrav.create("dev_approximate_gravity", (const void*)&dev_approximate_gravity);

#ifdef KEPLER
  /* prefer L1 cache over shared memory for the gravity kernels (egaburov) */
  cudaFuncSetCacheConfig((const void*)&dev_approximate_gravity, cudaFuncCachePreferL1);
  cudaFuncSetCacheConfig((const void*)&dev_approximate_gravity_let, cudaFuncCachePreferL1);
#if 0
  // Disabled experiment: shared-memory bank-size configuration.
#if 1
  cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte);
#else
  cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
#endif
#endif
#endif

  directGrav.create("dev_direct_gravity", (const void*)&dev_direct_gravity);
  correctParticles.create("correct_particles", (const void*)&correct_particles);
  computeDt.create("compute_dt", (const void*)&compute_dt);
  setActiveGrps.create("setActiveGroups", (const void*)&setActiveGroups);
  computeEnergy.create("compute_energy_double", (const void*)&compute_energy_double);
  // NOTE(review): &distanceCheck takes the address of this kernel-wrapper
  // member itself (the member shadows any identically named global kernel
  // symbol in this scope) — verify this binds the intended device function.
  distanceCheck.create("distanceCheck", (const void*)&distanceCheck);
  approxGravLET.create("dev_approximate_gravity_let", (const void*)&dev_approximate_gravity_let);
#if 0
  /* egaburov, doesn't compile with this */
  determineLET.create("dev_determineLET", (const void*)&dev_determineLET);
#endif
#else
  // OpenCL path for the iteration kernels is not implemented.
  getTNext.load_source("", "");
  /* create kernels */
  getTNext.create("");
#endif

  // ---------------------------------------------------------------- //
  // Parallel (multi-process domain decomposition) kernels            //
  // ---------------------------------------------------------------- //
  domainCheck.setContext(devContext);
  extractSampleParticles.setContext(devContext);
  extractOutOfDomainR4.setContext(devContext);
  extractOutOfDomainBody.setContext(devContext);
  insertNewParticles.setContext(devContext);
  internalMove.setContext(devContext);
  build_parallel_grps.setContext(devContext);
  segmentedSummaryBasic.setContext(devContext);
  domainCheckSFC.setContext(devContext);
  internalMoveSFC.setContext(devContext);
  internalMoveSFC2.setContext(devContext);
  extractOutOfDomainParticlesAdvancedSFC.setContext(devContext);
  extractOutOfDomainParticlesAdvancedSFC2.setContext(devContext);
  insertNewParticlesSFC.setContext(devContext);
  extractSampleParticlesSFC.setContext(devContext);
  domainCheckSFCAndAssign.setContext(devContext);

#ifdef USE_CUDA
  domainCheck.load_source("./parallel.ptx", pathName.c_str());
  extractSampleParticles.load_source("./parallel.ptx", pathName.c_str());
  extractOutOfDomainR4.load_source("./parallel.ptx", pathName.c_str());
  extractOutOfDomainBody.load_source("./parallel.ptx", pathName.c_str());
  insertNewParticles.load_source("./parallel.ptx", pathName.c_str());
  internalMove.load_source("./parallel.ptx", pathName.c_str());
  build_parallel_grps.load_source("./build_tree.ptx", pathName.c_str());
  segmentedSummaryBasic.load_source("./build_tree.ptx", pathName.c_str());
  domainCheckSFC.load_source("./parallel.ptx", pathName.c_str());
  internalMoveSFC.load_source("./parallel.ptx", pathName.c_str());
  internalMoveSFC2.load_source("./parallel.ptx", pathName.c_str());
  extractOutOfDomainParticlesAdvancedSFC.load_source("./parallel.ptx", pathName.c_str());
  extractOutOfDomainParticlesAdvancedSFC2.load_source("./parallel.ptx", pathName.c_str());
  insertNewParticlesSFC.load_source("./parallel.ptx", pathName.c_str());
  extractSampleParticlesSFC.load_source("./parallel.ptx", pathName.c_str());
  domainCheckSFCAndAssign.load_source("./parallel.ptx", pathName.c_str());

  domainCheck.create("doDomainCheck", (const void*)&doDomainCheck);
  extractSampleParticles.create("extractSampleParticles", (const void*)&gpu_extractSampleParticles);
  extractOutOfDomainR4.create("extractOutOfDomainParticlesR4", (const void*)&extractOutOfDomainParticlesR4);
  extractOutOfDomainBody.create("extractOutOfDomainParticlesAdvanced", (const void*)&extractOutOfDomainParticlesAdvanced);
  insertNewParticles.create("insertNewParticles", (const void*)&gpu_insertNewParticles);
  internalMove.create("internalMove", (const void*)&gpu_internalMove);
  // NOTE(review): the kernel-name string "build_parallel_grps" does not match
  // the bound function gpu_extractSampleParticlesSFC — looks like a copy-paste
  // slip; confirm the name string is ignored when a function pointer is given.
  extractSampleParticlesSFC.create("build_parallel_grps", (const void*)&gpu_extractSampleParticlesSFC);
  build_parallel_grps.create("build_parallel_grps", (const void*)&gpu_build_parallel_grps);
  segmentedSummaryBasic.create("segmentedSummaryBasic", (const void*)&gpu_segmentedSummaryBasic);
  domainCheckSFC.create("domainCheckSFC", (const void*)&gpu_domainCheckSFC);
  internalMoveSFC.create("internalMoveSFC", (const void*)&gpu_internalMoveSFC);
  internalMoveSFC2.create("internalMoveSFC2", (const void*)&gpu_internalMoveSFC2);
  extractOutOfDomainParticlesAdvancedSFC.create("extractOutOfDomainParticlesAdvancedSFC", (const void*)&gpu_extractOutOfDomainParticlesAdvancedSFC);
  extractOutOfDomainParticlesAdvancedSFC2.create("extractOutOfDomainParticlesAdvancedSFC2", (const void*)&gpu_extractOutOfDomainParticlesAdvancedSFC2);
  insertNewParticlesSFC.create("insertNewParticlesSFC", (const void*)&gpu_insertNewParticlesSFC);
  domainCheckSFCAndAssign.create("domainCheckSFCAndAssign", (const void*)&gpu_domainCheckSFCAndAssign);
#else
  // No OpenCL implementation for the parallel kernels.
#endif

#ifdef USE_DUST
  // Optional dust-particle kernels (resolved by name only, no host pointer).
  define_dust_groups.setContext(devContext);
  define_dust_groups.load_source("./build_tree.ptx", pathName.c_str());
  define_dust_groups.create("define_dust_groups");

  store_dust_groups.setContext(devContext);
  store_dust_groups.load_source("./build_tree.ptx", pathName.c_str());
  store_dust_groups.create("store_dust_groups");

  predictDust.setContext(devContext);
  predictDust.load_source("./build_tree.ptx", pathName.c_str());
  predictDust.create("predict_dust_particles");

  correctDust.setContext(devContext);
  correctDust.load_source("./build_tree.ptx", pathName.c_str());
  correctDust.create("correct_dust_particles");
#endif
}
int main(int argc, char *argv[]) { typedef int IndexType; typedef double ValueType; typedef cusp::device_memory MemorySpace; //typedef cusp::row_major Orientation; bool success = true; bool verbose = false; try { // Setup command line options Teuchos::CommandLineProcessor CLP; CLP.setDocString("This test performance of block multiply routines.\n"); IndexType n = 32; CLP.setOption("n", &n, "Number of mesh points in the each direction"); IndexType nrhs_begin = 32; CLP.setOption("begin", &nrhs_begin, "Staring number of right-hand-sides"); IndexType nrhs_end = 512; CLP.setOption("end", &nrhs_end, "Ending number of right-hand-sides"); IndexType nrhs_step = 32; CLP.setOption("step", &nrhs_step, "Increment in number of right-hand-sides"); IndexType nits = 10; CLP.setOption("nits", &nits, "Number of multiply iterations"); int device_id = 0; CLP.setOption("device", &device_id, "CUDA device ID"); CLP.parse( argc, argv ); // Set CUDA device cudaSetDevice(device_id); cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte); // create 3D Poisson problem cusp::csr_matrix<IndexType, ValueType, MemorySpace> A; cusp::gallery::poisson27pt(A, n, n, n); std::cout << "nrhs , num_rows , num_entries , row_time , row_gflops , " << "col_time , col_gflops" << std::endl; for (IndexType nrhs = nrhs_begin; nrhs <= nrhs_end; nrhs += nrhs_step) { double flops = 2.0 * static_cast<double>(A.num_entries) * static_cast<double>(nrhs); // test row-major storage cusp::array2d<ValueType, MemorySpace, cusp::row_major> x_row( A.num_rows, nrhs, 1); cusp::array2d<ValueType, MemorySpace, cusp::row_major> y_row( A.num_rows, nrhs, 0); cusp::detail::timer row_timer; row_timer.start(); for (IndexType iter=0; iter<nits; ++iter) { cusp::MVmultiply(A, x_row, y_row); } cudaDeviceSynchronize(); double row_time = row_timer.seconds_elapsed() / nits; double row_gflops = 1.0e-9 * flops / row_time; // test column-major storage cusp::array2d<ValueType, MemorySpace, cusp::column_major> x_col( A.num_rows, nrhs, 1); 
cusp::array2d<ValueType, MemorySpace, cusp::column_major> y_col( A.num_rows, nrhs, 0); cusp::detail::timer col_timer; col_timer.start(); for (IndexType iter=0; iter<nits; ++iter) { cusp::MVmultiply(A, x_col, y_col); } cudaDeviceSynchronize(); double col_time = col_timer.seconds_elapsed() / nits; double col_gflops = 1.0e-9 * flops / col_time; std::cout << nrhs << " , " << A.num_rows << " , " << A.num_entries << " , " << row_time << " , " << row_gflops << " , " << col_time << " , " << col_gflops << std::endl; } } TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success); if (success) return 0; return -1; }