int main(int argc, char **argv) { OptionParser op; op.addOption("verbose", OPT_BOOL, "", "enable verbose output", 'v'); op.addOption("passes", OPT_INT, "10", "specify number of passes", 'n'); op.addOption("size", OPT_INT, "1", "specify problem size", 's'); op.addOption("target", OPT_INT, "0", "specify MIC target device number", 't'); // If benchmark has any specific options, add those addBenchmarkSpecOptions(op); if (!op.parse(argc, argv)) { op.usage(); return -1; } ResultDatabase resultDB; // Run the test RunBenchmark(op, resultDB); // Print out results to stdout resultDB.DumpDetailed(cout); return 0; }
void RunBenchmark(cl::Device& devcpp, cl::Context& ctxcpp, cl::CommandQueue& queuecpp, ResultDatabase &resultDB, OptionParser &op) { // Convert from C++ bindings to C bindings // TODO propagate use of C++ bindings cl_device_id dev = devcpp(); cl_context ctx = ctxcpp(); cl_command_queue queue = queuecpp(); // Collect basic MPI information int size, rank; MPI_Comm_size(MPI_COMM_WORLD, &size); MPI_Comm_rank(MPI_COMM_WORLD, &rank); // Always run single precision test // OpenCL doesn't support templated kernels, so we have to use macros string spMacros = "-DSINGLE_PRECISION"; runTest<float> ("TPScan-SP", dev, ctx, queue, resultDB, op, spMacros); // If double precision is supported, run the DP test if (checkExtension(dev, "cl_khr_fp64")) { cout << "DP Supported\n"; string dpMacros = "-DK_DOUBLE_PRECISION "; runTest<double> ("TPScan-DP", dev, ctx, queue, resultDB, op, dpMacros); } else if (checkExtension(dev, "cl_amd_fp64")) { cout << "DP Supported\n"; string dpMacros = "-DAMD_DOUBLE_PRECISION "; runTest<double> ("TPScan-DP", dev, ctx, queue, resultDB, op, dpMacros); } else { char atts[1024] = "DP_Not_Supported"; cout << "Warning, rank " << rank << "'s device does not support DP\n"; // ResultDB requires every rank to report something. If this rank // doesn't support DP, submit FLT_MAX (this is handled as no result by // ResultDB. int passes = op.getOptionInt("passes"); for (int k = 0; k < passes; k++) { resultDB.AddResult("TPScan-DP-Kernel" , atts, "GB/s", FLT_MAX); resultDB.AddResult("TPScan-DP-Kernel+PCIe" , atts, "GB/s", FLT_MAX); resultDB.AddResult("TPScan-DP-MPI_ExScan" , atts, "GB/s", FLT_MAX); resultDB.AddResult("TPScan-DP-Overall" , atts, "GB/s", FLT_MAX); } } }
void RunBenchmark(cl::Device& devcpp, cl::Context& ctxcpp, cl::CommandQueue& queuecpp, ResultDatabase &resultDB, OptionParser &op) { // convert from C++ bindings to C bindings // TODO propagate use of C++ bindings cl_device_id dev = devcpp(); cl_context ctx = ctxcpp(); cl_command_queue queue = queuecpp(); // Always run single precision test // OpenCL doesn't support templated kernels, so we have to use macros runTest<float>("SGEMM", dev, ctx, queue, resultDB, op, "-DSINGLE_PRECISION"); // If double precision is supported, run the DP test if (checkExtension(dev, "cl_khr_fp64")) { cout << "DP Supported\n"; runTest<double>("DGEMM", dev, ctx, queue, resultDB, op, "-DK_DOUBLE_PRECISION "); } else if (checkExtension(dev, "cl_amd_fp64")) { cout << "DP Supported\n"; runTest<double>("DGEMM", dev, ctx, queue, resultDB, op, "-DAMD_DOUBLE_PRECISION "); } else { cout << "DP Not Supported\n"; char atts[1024] = "DP_Not_Supported"; // resultDB requires neg entry for every possible result int passes = op.getOptionInt("passes"); for (; passes > 0; --passes) { for (int i = 0; i < 2; i++) { const char transb = i ? 'T' : 'N'; string testName="DGEMM"; resultDB.AddResult(testName+"-"+transb, atts, "GFlops", FLT_MAX); resultDB.AddResult(testName+"-"+transb+"_PCIe", atts, "GFlops", FLT_MAX); resultDB.AddResult(testName+"-"+transb+"_Parity", atts, "N", FLT_MAX); } } } }
void RunBenchmark(cl::Device& devcpp, cl::Context& ctxcpp, cl::CommandQueue& queuecpp, ResultDatabase &resultDB, OptionParser &op) { // convert from C++ bindings to C bindings // TODO propagate use of C++ bindings cl_device_id dev = devcpp(); cl_context ctx = ctxcpp(); cl_command_queue queue = queuecpp(); // Always run single precision test // OpenCL doesn't support templated kernels, so we have to use macros string spMacros = "-DSINGLE_PRECISION "; RunTest<float>("S3D-SP", dev, ctx, queue, resultDB, op, spMacros); // If double precision is supported, run the DP test if (checkExtension(dev, "cl_khr_fp64")) { cout << "DP Supported\n"; string dpMacros = "-DK_DOUBLE_PRECISION "; RunTest<double> ("S3D-DP", dev, ctx, queue, resultDB, op, dpMacros); } else if (checkExtension(dev, "cl_amd_fp64")) { cout << "DP Supported\n"; string dpMacros = "-DAMD_DOUBLE_PRECISION "; RunTest<double> ("S3D-DP", dev, ctx, queue, resultDB, op, dpMacros); } else { cout << "DP Not Supported\n"; char atts[1024] = "DP_Not_Supported"; // resultDB requires neg entry for every possible result int passes = op.getOptionInt("passes"); for (int k = 0; k < passes; k++) { resultDB.AddResult("S3D-DP" , atts, "GB/s", FLT_MAX); resultDB.AddResult("S3D-DP_PCIe" , atts, "GB/s", FLT_MAX); resultDB.AddResult("S3D-DP_Parity" , atts, "GB/s", FLT_MAX); } } }
void RunBenchmark(cl_device_id dev, cl_context ctx, cl_command_queue queue, ResultDatabase &resultDB, OptionParser &op) { // Always run single precision test // OpenCL doesn't support templated kernels, so we have to use macros string spMacros = "-DSINGLE_PRECISION"; runTest<float, float4, float4> ("MD-LJ", dev, ctx, queue, resultDB, op, spMacros); // If double precision is supported, run the DP test if (checkExtension(dev, "cl_khr_fp64")) { cout << "DP Supported\n"; string dpMacros = "-DK_DOUBLE_PRECISION "; runTest<double, double4, double4> ("MD-LJ-DP", dev, ctx, queue, resultDB, op, dpMacros); } else if (checkExtension(dev, "cl_amd_fp64")) { cout << "DP Supported\n"; string dpMacros = "-DAMD_DOUBLE_PRECISION "; runTest<double, double4, double4> ("MD-LJ-DP", dev, ctx, queue, resultDB, op, dpMacros); } else { cout << "DP Not Supported\n"; char atts[32] = "DP_Not_Supported"; // resultDB requires neg entry for every possible result int passes = op.getOptionInt("passes"); for (int i = 0; i < passes; i++) { resultDB.AddResult("MD-LJ-DP" , atts, "GB/s", FLT_MAX); resultDB.AddResult("MD-LJ-DP_PCIe" , atts, "GB/s", FLT_MAX); resultDB.AddResult("MD-LJ-DP-Bandwidth", atts, "GB/s", FLT_MAX); resultDB.AddResult("MD-LJ-DP-Bandwidth_PCIe", atts, "GB/s", FLT_MAX); resultDB.AddResult("MD-LJ-DP_Parity" , atts, "GB/s", FLT_MAX); } } }
void RunBenchmark(OptionParser& opts, ResultDatabase& resultDB ) { int device; #if defined(PARALLEL) int cwrank; MPI_Comm_rank( MPI_COMM_WORLD, &cwrank ); #endif // defined(PARALLEL) #if defined(PARALLEL) if( cwrank == 0 ) { #endif // defined(PARALLEL) std::cout << "Running single precision test" << std::endl; #if defined(PARALLEL) } #endif // defined(PARALLEL) //omp_set_num_threads(124); DoTest<float>( "SP_Sten2D", resultDB, opts ); // check if we can run double precision tests if( //deviceProps.major == 1) && (deviceProps.minor >= 3)) || //eviceProps.major >= 2)) 1) { #if defined(PARALLEL) if( cwrank == 0 ) { #endif // defined(PARALLEL) std::cout << "DP supported\n" << std::endl; #if defined(PARALLEL) } #endif // defined(PARALLEL) //omp_set_num_threads(93); DoTest<double>( "DP_Sten2D", resultDB, opts ); } else { #if defined(PARALLEL) if( cwrank == 0 ) { #endif // defined(PARALLEL) std::cout << "Double precision not supported - skipping" << std::endl; #if defined(PARALLEL) } #endif // defined(PARALLEL) // resultDB requires neg entry for every possible result int nPasses = (int)opts.getOptionInt( "passes" ); for( int p = 0; p < nPasses; p++ ) { resultDB.AddResult( (const char*)"DP_Sten2D", "N/A", "s", FLT_MAX); } } }
void RunBenchmark(ResultDatabase &resultDB, OptionParser &op) { // Test to see if this device supports double precision cudaGetDevice(&fftDevice); cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, fftDevice); bool has_dp = (deviceProp.major == 1 && deviceProp.minor >= 3) || (deviceProp.major >= 2); cout << "Running single precision test" << endl; runTest<float2>("SP-FFT", resultDB, op); if (has_dp) { cout << "Running double precision test" << endl; runTest<double2>("DP-FFT", resultDB, op); } else { cout << "Skipping double precision test" << endl; char atts[32] = "DP_Not_Supported"; // resultDB requires neg entry for every possible result int passes = op.getOptionInt("passes"); for (int k=0; k<passes; k++) { resultDB.AddResult("DP-FFT" , atts, "GB/s", FLT_MAX); resultDB.AddResult("DP-FFT_PCIe" , atts, "GB/s", FLT_MAX); resultDB.AddResult("DP-FFT_Parity" , atts, "GB/s", FLT_MAX); resultDB.AddResult("DP-FFT-INV" , atts, "GB/s", FLT_MAX); resultDB.AddResult("DP-FFT-INV_PCIe" , atts, "GB/s", FLT_MAX); resultDB.AddResult("DP-FFT-INV_Parity" , atts, "GB/s", FLT_MAX); } } }
static void fillResultDB(const string& name, const string& reason, OptionParser &op, ResultDatabase& resultDB) { // resultDB requires neg entry for every possible result int passes = op.getOptionInt("passes"); for (int k=0; k<passes; k++) { resultDB.AddResult(name , reason, "GB/s", FLT_MAX); resultDB.AddResult(name+"_PCIe" , reason, "GB/s", FLT_MAX); resultDB.AddResult(name+"_Parity" , reason, "GB/s", FLT_MAX); resultDB.AddResult(name+"-INV" , reason, "GB/s", FLT_MAX); resultDB.AddResult(name+"-INV_PCIe" , reason, "GB/s", FLT_MAX); resultDB.AddResult(name+"-INV_Parity" , reason, "GB/s", FLT_MAX); } }
void RunTest(const string& testName, cl_device_id dev, cl_context ctx, cl_command_queue queue, ResultDatabase &resultDB, OptionParser &op, string& compileFlags) { int n_species = 22; int i, j, err; int probSizes_SP[4] = { 24, 32, 40, 48}; int probSizes_DP[4] = { 16, 24, 32, 40}; int *probSizes = (sizeof(T) == sizeof(double)) ? probSizes_DP : probSizes_SP; int size = probSizes[op.getOptionInt("size")-1]; // The number of grid points int n = size * size * size; // For now these conversion factors are just 1 T pconv = 1.0; // 1418365.88544; T tconv = 1.0; //120.0; T rateconv = 1.0; //11.0393507649917; // Host copies of data T* h_t = new T[n]; T* h_p = new T[n]; T* h_y = new T[n*n_species]; T* h_wdot = new T[n*n_species]; T* h_molwt = new T[n_species]; // Device data cl_mem d_t; // Temperatures array cl_mem d_p; // Pressures array cl_mem d_y; // Input variables cl_mem d_wdot; // Output variables // intermediate variables cl_mem d_rf, d_rb, d_rklow, d_c, d_a, d_eg, d_molwt; // Initialize host memory for (i=0; i<n; i++) { h_p[i] = 1.0132e6; h_t[i] = 1000.0; } for (j=0; j<22; j++) { for (i=0; i<n; i++) { h_y[(j*n)+i]= 0.0; if (j==14) h_y[(j*n)+i] = 0.064; if (j==3) h_y[(j*n)+i] = 0.218; if (j==21) h_y[(j*n)+i] = 0.718; } } for (int i=0; i<n_species; i++) { h_molwt[i] = 1.0f; } // // Initialize molecular weights // h_molwt[0]= 2.01594E-03; // h_molwt[1]= 1.00797E-03; // h_molwt[2]= 1.59994E-02; // h_molwt[3]= 3.19988E-02; // h_molwt[4]= 1.700737E-02; // h_molwt[5]= 1.801534E-02; // h_molwt[6]= 3.300677E-02; // h_molwt[7]= 3.401473999999999E-02; // h_molwt[8]= 1.503506E-02; // h_molwt[9] = 1.604303E-02; // h_molwt[10] = 2.801055E-02; // h_molwt[11] = 4.400995E-02; // h_molwt[12] = 3.002649E-02; // h_molwt[13] = 2.603824E-02; // h_molwt[14] = 2.805418E-02; // h_molwt[15] = 3.007012E-02; // h_molwt[16] = 4.102967E-02; // h_molwt[17] = 4.203764E-02; // h_molwt[18] = 4.405358E-02; // h_molwt[19] = 4.10733E-02; // h_molwt[20] = 4.208127E-02; // h_molwt[21] = 2.80134E-02; // Allocate device memory size_t base = n * sizeof(T); clMalloc(d_t, base); clMalloc(d_p, base); clMalloc(d_y, n_species*base); clMalloc(d_wdot, n_species*base); clMalloc(d_rf, 206*base); clMalloc(d_rb, 206*base); clMalloc(d_rklow, 21*base); clMalloc(d_c, C_SIZE*base); clMalloc(d_a, A_SIZE*base); clMalloc(d_eg, EG_SIZE*base); clMalloc(d_molwt, n_species*sizeof(T)); // Copy over input params long inputTransferTime = 0; Event evTransfer("PCIe Transfer"); clMemtoDevice(d_t, h_t, base); evTransfer.FillTimingInfo(); inputTransferTime += evTransfer.StartEndRuntime(); clMemtoDevice(d_p, h_p, base); evTransfer.FillTimingInfo(); inputTransferTime += evTransfer.StartEndRuntime(); clMemtoDevice(d_y, h_y, n_species*base); evTransfer.FillTimingInfo(); inputTransferTime += evTransfer.StartEndRuntime(); clMemtoDevice(d_molwt, h_molwt, n_species*sizeof(T)); evTransfer.FillTimingInfo(); inputTransferTime += evTransfer.StartEndRuntime(); // Set up macros compileFlags += "-DDIM=" + toString(size) + " " + "-DN_GP=" + toString(n) + " "; unsigned int passes = op.getOptionInt("passes"); for (unsigned int i = 0; i < passes; i++) { size_t globalWorkSize = n; size_t localWorkSize = 128; // -------------------- phase 1 ----------------- // Setup Program Objects (phase 1) clProg(gr_prog, cl_source_gr_base); clProg(rdsmh_prog, cl_source_rdsmh); clProg(ratt_prog, cl_source_ratt); clProg(ratt2_prog, cl_source_ratt2); clProg(ratt3_prog, cl_source_ratt3); clProg(ratt4_prog, cl_source_ratt4); clProg(ratt5_prog, cl_source_ratt5); clProg(ratt6_prog, cl_source_ratt6); clProg(ratt7_prog, cl_source_ratt7); clProg(ratt8_prog, cl_source_ratt8); clProg(ratt9_prog, cl_source_ratt9); clProg(ratt10_prog, cl_source_ratt10); clProg(ratx_prog, cl_source_ratx); clProg(ratxb_prog, cl_source_ratxb); clProg(ratx2_prog, cl_source_ratx2); clProg(ratx4_prog, cl_source_ratx4); // Build the kernels (phase 1) cout << "Compiling kernels (phase 1)..."; cout.flush(); clBuild(gr_prog); clBuild(rdsmh_prog); clBuild(ratt_prog); clBuild(ratt2_prog); clBuild(ratt3_prog); clBuild(ratt4_prog); clBuild(ratt5_prog); clBuild(ratt6_prog); clBuild(ratt7_prog); clBuild(ratt8_prog); clBuild(ratt9_prog); clBuild(ratt10_prog); clBuild(ratx_prog); clBuild(ratxb_prog); clBuild(ratx2_prog); clBuild(ratx4_prog); cout << "done." << endl; // Extract out kernel objects (phase 1) cout << "Generating OpenCL Kernel Objects (phase 1)..."; cout.flush(); // GR Base Kernels cl_kernel grBase_kernel = clCreateKernel(gr_prog, "gr_base", &err); CL_CHECK_ERROR(err); // RDSMH Kernels cl_kernel rdsmh_kernel = clCreateKernel(rdsmh_prog, "rdsmh_kernel", &err); CL_CHECK_ERROR(err); // RATT Kernels cl_kernel ratt_kernel = clCreateKernel(ratt_prog, "ratt_kernel", &err); CL_CHECK_ERROR(err); cl_kernel ratt2_kernel = clCreateKernel(ratt2_prog, "ratt2_kernel", &err); CL_CHECK_ERROR(err); cl_kernel ratt3_kernel = clCreateKernel(ratt3_prog, "ratt3_kernel", &err); CL_CHECK_ERROR(err); cl_kernel ratt4_kernel = clCreateKernel(ratt4_prog, "ratt4_kernel", &err); CL_CHECK_ERROR(err); cl_kernel ratt5_kernel = clCreateKernel(ratt5_prog, "ratt5_kernel", &err); CL_CHECK_ERROR(err); cl_kernel ratt6_kernel = clCreateKernel(ratt6_prog, "ratt6_kernel", &err); CL_CHECK_ERROR(err); cl_kernel ratt7_kernel = clCreateKernel(ratt7_prog, "ratt7_kernel", &err); CL_CHECK_ERROR(err); cl_kernel ratt8_kernel = clCreateKernel(ratt8_prog, "ratt8_kernel", &err); CL_CHECK_ERROR(err); cl_kernel ratt9_kernel = clCreateKernel(ratt9_prog, "ratt9_kernel", &err); CL_CHECK_ERROR(err); cl_kernel ratt10_kernel = clCreateKernel(ratt10_prog, "ratt10_kernel", &err); CL_CHECK_ERROR(err); // RATX Kernels cl_kernel ratx_kernel = clCreateKernel(ratx_prog, "ratx_kernel", &err); CL_CHECK_ERROR(err); cl_kernel ratxb_kernel = clCreateKernel(ratxb_prog, "ratxb_kernel", &err); CL_CHECK_ERROR(err); cl_kernel ratx2_kernel = clCreateKernel(ratx2_prog, "ratx2_kernel", &err); CL_CHECK_ERROR(err); cl_kernel ratx4_kernel = clCreateKernel(ratx4_prog, "ratx4_kernel", &err); CL_CHECK_ERROR(err); cout << "done." << endl; //Set kernel arguments (phase 1) err = clSetKernelArg(grBase_kernel, 0, sizeof(cl_mem), (void*)&d_p); CL_CHECK_ERROR(err); err = clSetKernelArg(grBase_kernel, 1, sizeof(cl_mem), (void*)&d_t); CL_CHECK_ERROR(err); err = clSetKernelArg(grBase_kernel, 2, sizeof(cl_mem), (void*)&d_y); CL_CHECK_ERROR(err); err = clSetKernelArg(grBase_kernel, 3, sizeof(cl_mem), (void*)&d_c); CL_CHECK_ERROR(err); err = clSetKernelArg(grBase_kernel, 4, sizeof(T), (void*)&tconv); CL_CHECK_ERROR(err); err = clSetKernelArg(grBase_kernel, 5, sizeof(T), (void*)&pconv); CL_CHECK_ERROR(err); err = clSetKernelArg(rdsmh_kernel, 0, sizeof(cl_mem), (void*)&d_t); CL_CHECK_ERROR(err); err = clSetKernelArg(rdsmh_kernel, 1, sizeof(cl_mem), (void*)&d_eg); CL_CHECK_ERROR(err); err = clSetKernelArg(rdsmh_kernel, 2, sizeof(T), (void*)&tconv); CL_CHECK_ERROR(err); err = clSetKernelArg(ratt_kernel, 0, sizeof(cl_mem), (void*)&d_t); CL_CHECK_ERROR(err); err = clSetKernelArg(ratt_kernel, 1, sizeof(cl_mem), (void*)&d_rf); CL_CHECK_ERROR(err); err = clSetKernelArg(ratt_kernel, 2, sizeof(T), (void*)&tconv); CL_CHECK_ERROR(err); clSetRattArg(ratt2_kernel); clSetRattArg(ratt3_kernel); clSetRattArg(ratt4_kernel); clSetRattArg(ratt5_kernel); clSetRattArg(ratt6_kernel); clSetRattArg(ratt7_kernel); clSetRattArg(ratt8_kernel); clSetRattArg(ratt9_kernel); err = clSetKernelArg(ratt10_kernel, 0, sizeof(cl_mem), (void*)&d_t); CL_CHECK_ERROR(err); err = clSetKernelArg(ratt10_kernel, 1, sizeof(cl_mem), (void*)&d_rklow); CL_CHECK_ERROR(err); err = clSetKernelArg(ratt10_kernel, 2, sizeof(T), (void*)&tconv); CL_CHECK_ERROR(err); clSetRatxArg(ratx_kernel); clSetRatxArg(ratxb_kernel); err = clSetKernelArg(ratx2_kernel, 0, sizeof(cl_mem), (void*)&d_c); CL_CHECK_ERROR(err); err = clSetKernelArg(ratx2_kernel, 1, sizeof(cl_mem), (void*)&d_rf); CL_CHECK_ERROR(err); err = clSetKernelArg(ratx4_kernel, 0, sizeof(cl_mem), (void*)&d_c); CL_CHECK_ERROR(err); err = clSetKernelArg(ratx4_kernel, 1, sizeof(cl_mem), (void*)&d_rb); CL_CHECK_ERROR(err); // Execute kernels (phase 1) cout << "Executing kernels (phase 1)..."; cout.flush(); Event evFirst_1("first kernel phase 1"); Event evLast_1("last kernel phase 1"); clLaunchKernelEv(grBase_kernel, evFirst_1.CLEvent()); clLaunchKernel(ratt_kernel); clLaunchKernel(rdsmh_kernel); clLaunchKernel(ratt2_kernel); clLaunchKernel(ratt3_kernel); clLaunchKernel(ratt4_kernel); clLaunchKernel(ratt5_kernel); clLaunchKernel(ratt6_kernel); clLaunchKernel(ratt7_kernel); clLaunchKernel(ratt8_kernel); clLaunchKernel(ratt9_kernel); clLaunchKernel(ratt10_kernel); clLaunchKernel(ratx_kernel); clLaunchKernel(ratxb_kernel); clLaunchKernel(ratx2_kernel); clLaunchKernelEv(ratx4_kernel, evLast_1.CLEvent()); err = clFinish(queue); CL_CHECK_ERROR(err); cout << "done. " << endl; evFirst_1.FillTimingInfo(); evLast_1.FillTimingInfo(); double total_phase1 = evLast_1.EndTime() - evFirst_1.StartTime(); // Release Kernels (phase 1) clReleaseKernel(grBase_kernel); clReleaseKernel(rdsmh_kernel); clReleaseKernel(ratt_kernel); clReleaseKernel(ratt2_kernel); clReleaseKernel(ratt3_kernel); clReleaseKernel(ratt4_kernel); clReleaseKernel(ratt5_kernel); clReleaseKernel(ratt6_kernel); clReleaseKernel(ratt7_kernel); clReleaseKernel(ratt8_kernel); clReleaseKernel(ratt9_kernel); clReleaseKernel(ratt10_kernel); clReleaseKernel(ratx_kernel); clReleaseKernel(ratxb_kernel); clReleaseKernel(ratx2_kernel); clReleaseKernel(ratx4_kernel); // Release Programs (phase 1) clReleaseProgram(gr_prog); clReleaseProgram(rdsmh_prog); clReleaseProgram(ratt_prog); clReleaseProgram(ratt2_prog); clReleaseProgram(ratt3_prog); clReleaseProgram(ratt4_prog); clReleaseProgram(ratt5_prog); clReleaseProgram(ratt6_prog); clReleaseProgram(ratt7_prog); clReleaseProgram(ratt8_prog); clReleaseProgram(ratt9_prog); clReleaseProgram(ratt10_prog); clReleaseProgram(ratx_prog); clReleaseProgram(ratxb_prog); clReleaseProgram(ratx2_prog); clReleaseProgram(ratx4_prog); // -------------------- phase 2 ----------------- // Setup Program Objects (phase 2) clProg(qssa_prog, cl_source_qssa); clProg(qssab_prog, cl_source_qssab); clProg(qssa2_prog, cl_source_qssa2); clProg(rdwdot_prog, cl_source_rdwdot); clProg(rdwdot2_prog, cl_source_rdwdot2); clProg(rdwdot3_prog, cl_source_rdwdot3); clProg(rdwdot6_prog, cl_source_rdwdot6); clProg(rdwdot7_prog, cl_source_rdwdot7); clProg(rdwdot8_prog, cl_source_rdwdot8); clProg(rdwdot9_prog, cl_source_rdwdot9); clProg(rdwdot10_prog, cl_source_rdwdot10); // Build the kernels (phase 2) cout << "Compiling kernels (phase 2)..."; cout.flush(); clBuild(qssa_prog); clBuild(qssab_prog); clBuild(qssa2_prog); clBuild(rdwdot_prog); clBuild(rdwdot2_prog); clBuild(rdwdot3_prog); clBuild(rdwdot6_prog); clBuild(rdwdot7_prog); clBuild(rdwdot8_prog); clBuild(rdwdot9_prog); clBuild(rdwdot10_prog); cout << "done." << endl; // Extract out kernel objects (phase 2) cout << "Generating OpenCL Kernel Objects (phase 2)..."; cout.flush(); // QSSA Kernels cl_kernel qssa_kernel = clCreateKernel(qssa_prog, "qssa_kernel", &err); CL_CHECK_ERROR(err); cl_kernel qssab_kernel = clCreateKernel(qssab_prog, "qssab_kernel", &err); CL_CHECK_ERROR(err); cl_kernel qssa2_kernel = clCreateKernel(qssa2_prog, "qssa2_kernel", &err); CL_CHECK_ERROR(err); // RDWDOT Kernels cl_kernel rdwdot_kernel = clCreateKernel(rdwdot_prog, "rdwdot_kernel", &err); CL_CHECK_ERROR(err); cl_kernel rdwdot2_kernel = clCreateKernel(rdwdot2_prog, "rdwdot2_kernel", &err); CL_CHECK_ERROR(err); cl_kernel rdwdot3_kernel = clCreateKernel(rdwdot3_prog, "rdwdot3_kernel", &err); CL_CHECK_ERROR(err); cl_kernel rdwdot6_kernel = clCreateKernel(rdwdot6_prog, "rdwdot6_kernel", &err); CL_CHECK_ERROR(err); cl_kernel rdwdot7_kernel = clCreateKernel(rdwdot7_prog, "rdwdot7_kernel", &err); CL_CHECK_ERROR(err); cl_kernel rdwdot8_kernel = clCreateKernel(rdwdot8_prog, "rdwdot8_kernel", &err); CL_CHECK_ERROR(err); cl_kernel rdwdot9_kernel = clCreateKernel(rdwdot9_prog, "rdwdot9_kernel", &err); CL_CHECK_ERROR(err); cl_kernel rdwdot10_kernel = clCreateKernel(rdwdot10_prog, "rdwdot10_kernel", &err); CL_CHECK_ERROR(err); cout << "done." << endl; //Set kernel arguments (phase 2) clSetQssaArg(qssa_kernel); clSetQssaArg(qssab_kernel); clSetQssaArg(qssa2_kernel); clSetRdwdotArg(rdwdot_kernel); clSetRdwdotArg(rdwdot2_kernel); clSetRdwdotArg(rdwdot3_kernel); clSetRdwdotArg(rdwdot6_kernel); clSetRdwdotArg(rdwdot7_kernel); clSetRdwdotArg(rdwdot8_kernel); clSetRdwdotArg(rdwdot9_kernel); clSetRdwdotArg(rdwdot10_kernel); // Execute kernels (phase 2) cout << "Executing kernels (phase 2)..."; cout.flush(); Event evFirst_2("first kernel phase 2"); Event evLast_2("last kernel phase 2"); clLaunchKernelEv(qssa_kernel, evFirst_2.CLEvent()); clLaunchKernel(qssab_kernel); clLaunchKernel(qssa2_kernel); clLaunchKernel(rdwdot_kernel); clLaunchKernel(rdwdot2_kernel); clLaunchKernel(rdwdot3_kernel); clLaunchKernel(rdwdot6_kernel); clLaunchKernel(rdwdot7_kernel); clLaunchKernel(rdwdot8_kernel); clLaunchKernel(rdwdot9_kernel); clLaunchKernelEv(rdwdot10_kernel, evLast_2.CLEvent()); err = clFinish(queue); CL_CHECK_ERROR(err); cout << "done. " << endl; evFirst_2.FillTimingInfo(); evLast_2.FillTimingInfo(); double total_phase2 = evLast_2.EndTime() - evFirst_2.StartTime(); // Release Kernels (phase 2) clReleaseKernel(qssa_kernel); clReleaseKernel(qssab_kernel); clReleaseKernel(qssa2_kernel); clReleaseKernel(rdwdot_kernel); clReleaseKernel(rdwdot2_kernel); clReleaseKernel(rdwdot3_kernel); clReleaseKernel(rdwdot6_kernel); clReleaseKernel(rdwdot7_kernel); clReleaseKernel(rdwdot8_kernel); clReleaseKernel(rdwdot9_kernel); clReleaseKernel(rdwdot10_kernel); // Release Programs (phase 2) clReleaseProgram(qssa_prog); clReleaseProgram(qssab_prog); clReleaseProgram(qssa2_prog); clReleaseProgram(rdwdot_prog); clReleaseProgram(rdwdot2_prog); clReleaseProgram(rdwdot3_prog); clReleaseProgram(rdwdot6_prog); clReleaseProgram(rdwdot7_prog); clReleaseProgram(rdwdot8_prog); clReleaseProgram(rdwdot9_prog); clReleaseProgram(rdwdot10_prog); // -------------------- timings ----------------- double total = total_phase1 + total_phase2; // Estimate GFLOPs (roughly 10k flops / point) double gflops = (n*10000.) / total; // Copy results back err = clEnqueueReadBuffer(queue, d_wdot, true, 0, n*n_species*sizeof(T), h_wdot, 0, NULL, &evTransfer.CLEvent()); CL_CHECK_ERROR(err); err = clFinish(queue); CL_CHECK_ERROR(err); evTransfer.FillTimingInfo(); double totalTransferTime = inputTransferTime + evTransfer.StartEndRuntime(); double gflops_pcie = (n*10000.) / (total + totalTransferTime); resultDB.AddResult(testName, "cubic", "GFLOPS", gflops); resultDB.AddResult(testName+"_PCIe", "cubic", "GFLOPS", gflops_pcie); resultDB.AddResult(testName+"_Parity", "cubic", "n", totalTransferTime / total ); } // Release Memory err = clReleaseMemObject(d_t); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_p); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_y); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_wdot); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_rf); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_rb); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_c); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_eg); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_rklow); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_a); CL_CHECK_ERROR(err); // Cleanup Host Memory Objects delete[] h_t; delete[] h_p; delete[] h_y; delete[] h_wdot; delete[] h_molwt; }
// **************************************************************************** // Function: RunBenchmark // // Purpose: // Runs the stablity test. The algorithm for the parallel // version of the test, which enables testing of an entire GPU // cluster at the same time, is as follows. Each participating node // first allocates its data, while node zero additionally determines // start and finish times based on a user input parameter. All nodes // then enter the outermost loop, copying fresh data from the CPU // before entering the core of the test. In the core, each node // performs a loop consisting of the forward kernel, a potential // check, and then the inverse kernel. After performing a configurable // number of forward/inverse iterations, along with a configurable // number of checks, each node sends the number of failures it // encountered to node zero. Node zero collects and reports the error // counts, determines whether the test has run its course, and // broadcasts the decision. If the decision is to proceed, each node // begins the next iteration of the outer loop, copying fresh data and // then performing the kernels and checks of the core loop. // // Arguments: // resultDB: the benchmark stores its results in this ResultDatabase // op: the options parser / parameter database // // Returns: nothing // // Programmer: Collin McCurdy // Creation: September 08, 2009 // // Modifications: // // **************************************************************************** void RunBenchmark(ResultDatabase &resultDB, OptionParser& op) { int mpi_rank, mpi_size, node_rank; int i, j; float2* source, * result; void* work, * chk; #ifdef PARALLEL MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); NodeInfo NI; node_rank = NI.nodeRank(); cout << "MPI Task " << mpi_rank << " of " << mpi_size << " (noderank=" << node_rank << ") starting....\n"; #else mpi_rank = 0; mpi_size = 1; node_rank = 0; #endif // ensure chk buffer alloc succeeds before grabbing the // rest of available memory. allocDeviceBuffer(&chk, 1); unsigned long avail_bytes = findAvailBytes(); // unsigned long avail_bytes = 1024*1024*1024-1; // now determine how much available memory will be used (subject // to CUDA's constraint on the maximum block dimension size) int blocks = avail_bytes / (512*sizeof(float2)); int slices = 1; while (blocks/slices > 65535) { slices *= 2; } int half_n_ffts = ((blocks/slices)*slices)/2; int n_ffts = half_n_ffts * 2; fprintf(stderr, "avail_bytes=%ld, blocks=%d, n_ffts=%d\n", avail_bytes, blocks, n_ffts); int half_n_cmplx = half_n_ffts * 512; unsigned long used_bytes = half_n_cmplx * 2 * sizeof(float2); cout << mpi_rank << ": testing " << used_bytes/((double)1024*1024) << " MBs\n"; // allocate host memory source = (float2*)malloc(used_bytes); result = (float2*)malloc(used_bytes); // alloc device memory allocDeviceBuffer(&work, used_bytes); // alloc gather buffer int* recvbuf = (int*)malloc(mpi_size*sizeof(int)); // compute start and finish times time_t start = time(NULL); time_t finish = start + (time_t)(op.getOptionInt("time")*60); struct tm start_tm, finish_tm; localtime_r(&start, &start_tm); localtime_r(&finish, &finish_tm); if (mpi_rank == 0) { printf("start = %s", asctime(&start_tm)); printf("finish = %s", asctime(&finish_tm)); } for (int iter = 0; ; iter++) { bool failed = false; int errorCount = 0, stop = 0; // (re-)init host memory... for (i = 0; i < half_n_cmplx; i++) { source[i].x = (rand()/(float)RAND_MAX)*2-1; source[i].y = (rand()/(float)RAND_MAX)*2-1; source[i+half_n_cmplx].x = source[i].x; source[i+half_n_cmplx].y = source[i].y; } // copy to device copyToDevice(work, source, used_bytes); copyToDevice(chk, &errorCount, 1); forward(work, n_ffts); if (check(work, chk, half_n_ffts, half_n_cmplx)) { fprintf(stderr, "First check failed..."); failed = true; } if (!failed) { for (i = 1; i <= CHECKS; i++) { for (j = 1; j <= ITERS_PER_CHECK; j++) { inverse(work, n_ffts); forward(work, n_ffts); } if (check(work, chk, half_n_ffts, half_n_cmplx)) { failed = true; break; } } } // failing node is responsible for verifying failure, counting // errors and reporting count to node 0. if (failed) { fprintf(stderr, "Failure on node %d, iter %d:", mpi_rank, iter); // repeat check on CPU copyFromDevice(result, work, used_bytes); float2* result2 = result + half_n_cmplx; for (j = 0; j < half_n_cmplx; j++) { if (result[j].x != result2[j].x || result[j].y != result2[j].y) { errorCount++; } } if (!errorCount) { fprintf(stderr, "verification failed!\n"); } else { fprintf(stderr, "%d errors\n", errorCount); } } #ifdef PARALLEL MPI_Gather(&errorCount, 1, MPI_INT, recvbuf, 1, MPI_INT, 0, MPI_COMM_WORLD); #else recvbuf[0] = errorCount; #endif // node 0 collects and reports error counts, determines // whether test has run its course, and broadcasts decision if (mpi_rank == 0) { time_t curtime = time(NULL); struct tm curtm; localtime_r(&curtime, &curtm); fprintf(stderr, "iter=%d: %s", iter, asctime(&curtm)); for (i = 0; i < mpi_size; i++) { if (recvbuf[i]) { fprintf(stderr, "--> %d failures on node %d\n", recvbuf[i], i); } } if (curtime > finish) { stop = 1; } } #ifdef PARALLEL MPI_Bcast(&stop, 1, MPI_INT, 0, MPI_COMM_WORLD); #endif resultDB.AddResult("Check", "", "Failures", errorCount); if (stop) break; } freeDeviceBuffer(work); freeDeviceBuffer(chk); free(source); free(result); free(recvbuf); }
void RunTest(string testName, ResultDatabase &resultDB, OptionParser &op) { int pbIndex = op.getOptionInt("size") - 1; int passes = op.getOptionInt("passes"); int iters = op.getOptionInt("iterations"); int micdev = op.getOptionInt("target"); int nThreads = 240; // Default #pragma offload target(mic) inout(nThreads) { nThreads = sysconf(_SC_NPROCESSORS_ONLN) - 4; // Leave something for the OS } printf("Using %d available threads for MIC run.\n", nThreads); size_t szOptimum = L1B * nThreads; int pbSizesMB[] = { 1, 8, 32, 64, 128, 256, 512, 768 }; size_t pbSizeBytes = szOptimum * pbSizesMB[pbIndex] / 8; int pbSizeElements = pbSizeBytes / sizeof(T); // Allocate Host Memory __declspec(target(MIC)) static T* h_idata; __declspec(target(MIC)) static T* reference; __declspec(target(MIC)) static T* h_odata; h_idata = (T*)_mm_malloc(pbSizeBytes + ALIGN * sizeof(T), ALIGN); reference = (T*)_mm_malloc(pbSizeBytes + ALIGN * sizeof(T), ALIGN); h_odata = (T*)_mm_malloc(pbSizeBytes + ALIGN * sizeof(T), ALIGN); //Manually align memory h_idata += ALIGN - 1; h_odata += ALIGN - 1; srand(time(NULL)); // Initialize host memory for (int i = 0; i < pbSizeElements; i++) { h_idata[i] = rand() % 21 - 10; // Fill with some pattern h_odata[i] = 0.0; reference[i] = 0.0; } // Allocate data to mic #pragma offload target(mic:micdev) in(h_idata:length(pbSizeElements + 1) free_if(0)) \ out(h_odata:length(pbSizeElements + 1) free_if(0)) { } double start = curr_second(); // Get data transfer time #pragma offload target(mic:micdev) in(h_idata:length(pbSizeElements + 1) alloc_if(0) \ free_if(0)) out(h_odata:length(pbSizeElements + 1) alloc_if(0) free_if(0)) { } float transferTime = curr_second()-start; cout << "Running benchmark with size " << pbSizeElements << endl; for (int k = 0; k < passes; k++) { double totalScanTime = 0.0f; start = curr_second(); #pragma offload target(mic:micdev) nocopy(h_idata:length(pbSizeElements + 1) \ alloc_if(0) free_if(0)) nocopy(h_odata:length(pbSizeElements + 1) \ alloc_if(0) free_if(0)) { if (pbIndex > 0) { size_t elementsOptimum = szOptimum / sizeof(T); int nChunks = pbSizesMB[pbIndex] / 8; T fOffset = 0; for (int iChunk = 0; iChunk < nChunks; iChunk++) { SCAN_KNC<T>(h_idata + iChunk * elementsOptimum, h_odata + iChunk * elementsOptimum, elementsOptimum, iters, fOffset); fOffset = (h_odata + iChunk * elementsOptimum)[elementsOptimum - 1]; } } else SCAN_KNC<T>(h_idata, h_odata, szOptimum / (8 * sizeof(T)), iters, 0.0); } double stop = curr_second(); totalScanTime = (stop-start); #pragma offload target(mic:micdev) out(h_odata:length(pbSizeElements + 1) \ alloc_if(0) free_if(0)) { } // If results aren't correct, don't report perf numbers if (! scanCPU<T>(h_idata, reference, h_odata, pbSizeElements)) { return; } char atts[1024]; double avgTime = (totalScanTime / (double) iters); sprintf(atts, "%d items", pbSizeElements); double gb = (double)(pbSizeElements * sizeof(T)) / (1000. * 1000. * 1000.); resultDB.AddResult(testName, atts, "GB/s", gb / avgTime); resultDB.AddResult(testName+"_PCIe", atts, "GB/s", gb / (avgTime + transferTime)); resultDB.AddResult(testName+"_Parity", atts, "N", transferTime / avgTime); } // Clean up #pragma offload target(mic:micdev) in(h_idata:length(pbSizeElements + 1) alloc_if(0) ) \ out(h_odata:length(pbSizeElements + 1) alloc_if(0)) { } _mm_free(h_idata - ALIGN + 1); _mm_free(h_odata - ALIGN + 1); _mm_free(reference); }
void RunBenchmark( ResultDatabase& resultDB, OptionParser& opts ) { int device; #if defined(PARALLEL) int cwrank; MPI_Comm_rank( MPI_COMM_WORLD, &cwrank ); #endif // defined(PARALLEL) cudaGetDevice( &device ); cudaDeviceProp deviceProps; cudaGetDeviceProperties( &deviceProps, device ); // Configure to allocate performance-critical memory in // a programming model-specific way. Matrix2D<float>::SetAllocator( new CUDAPMSMemMgr<float> ); #if defined(PARALLEL) if( cwrank == 0 ) { #endif // defined(PARALLEL) std::cout << "Running single precision test" << std::endl; #if defined(PARALLEL) } #endif // defined(PARALLEL) DoTest<float>( "SP_Sten2D", resultDB, opts ); // check if we can run double precision tests if( ((deviceProps.major == 1) && (deviceProps.minor >= 3)) || (deviceProps.major >= 2) ) { // Configure to allocate performance-critical memory in // a programming model-specific way. Matrix2D<double>::SetAllocator( new CUDAPMSMemMgr<double> ); #if defined(PARALLEL) if( cwrank == 0 ) { #endif // defined(PARALLEL) std::cout << "\n\nDP supported" << std::endl; #if defined(PARALLEL) } #endif // defined(PARALLEL) DoTest<double>( "DP_Sten2D", resultDB, opts ); } else { #if defined(PARALLEL) if( cwrank == 0 ) { #endif // defined(PARALLEL) std::cout << "Double precision not supported - skipping" << std::endl; #if defined(PARALLEL) } #endif // defined(PARALLEL) // resultDB requires neg entry for every possible result int nPasses = (int)opts.getOptionInt( "passes" ); for( int p = 0; p < nPasses; p++ ) { resultDB.AddResult( (const char*)"DP_Sten2D", "N/A", "GFLOPS", FLT_MAX ); } } std::cout << "\n" << std::endl; }
void DoTest( const char* timerDesc, ResultDatabase& resultDB, OptionParser& opts ) { StencilFactory<T>* stdStencilFactory = NULL; Stencil<T>* stdStencil = NULL; StencilFactory<T>* testStencilFactory = NULL; Stencil<T>* testStencil = NULL; try { #if defined(PARALLEL) stdStencilFactory = new MPIHostStencilFactory<T>; testStencilFactory = new MPICUDAStencilFactory<T>; #else stdStencilFactory = new HostStencilFactory<T>; testStencilFactory = new CUDAStencilFactory<T>; #endif // defined(PARALLEL) assert( (stdStencilFactory != NULL) && (testStencilFactory != NULL) ); // do a sanity check on option values CheckOptions( opts ); stdStencilFactory->CheckOptions( opts ); testStencilFactory->CheckOptions( opts ); // extract and validate options std::vector<long long> arrayDims = opts.getOptionVecInt( "customSize" ); if( arrayDims.size() != 2 ) { cerr << "Dim size: " << arrayDims.size() << "\n"; throw InvalidArgValue( "all overall dimensions must be positive" ); } if (arrayDims[0] == 0) // User has not specified a custom size { int sizeClass = opts.getOptionInt("size"); arrayDims = StencilFactory<T>::GetStandardProblemSize( sizeClass ); } long int seed = (long)opts.getOptionInt( "seed" ); bool beVerbose = opts.getOptionBool( "verbose" ); unsigned int nIters = (unsigned int)opts.getOptionInt( "num-iters" ); double valErrThreshold = (double)opts.getOptionFloat( "val-threshold" ); unsigned int nValErrsToPrint = (unsigned int)opts.getOptionInt( "val-print-limit" ); #if defined(PARALLEL) unsigned int haloWidth = (unsigned int)opts.getOptionInt( "iters-per-exchange" ); #else unsigned int haloWidth = 1; #endif // defined(PARALLEL) float haloVal = (float)opts.getOptionFloat( "haloVal" ); // build a description of this experiment std::vector<long long> lDims = opts.getOptionVecInt( "lsize" ); assert( lDims.size() == 2 ); std::ostringstream experimentDescriptionStr; experimentDescriptionStr << nIters << ':' << arrayDims[0] << 'x' << arrayDims[1] << ':' << lDims[0] << 'x' << lDims[1]; unsigned int nPasses = (unsigned int)opts.getOptionInt( "passes" ); unsigned int nWarmupPasses = (unsigned int)opts.getOptionInt( "warmupPasses" ); // compute the expected result on the host // or read it from a pre-existing file std::string matrixFilenameBase = (std::string)opts.getOptionString( "expMatrixFile" ); #if defined(PARALLEL) int cwrank; MPI_Comm_rank( MPI_COMM_WORLD, &cwrank ); if( cwrank == 0 ) { #endif // defined(PARALLEL) if( !matrixFilenameBase.empty() ) { std::cout << "\nReading expected stencil operation result from file for later comparison with CUDA output\n" << std::endl; } else { std::cout << "\nPerforming stencil operation on host for later comparison with CUDA output\n" << "Depending on host capabilities, this may take a while." << std::endl; } #if defined(PARALLEL) } #endif // defined(PARALLEL) Matrix2D<T> expected( arrayDims[0] + 2*haloWidth, arrayDims[1] + 2*haloWidth ); Initialize<T> init( seed, haloWidth, haloVal ); bool haveExpectedData = false; if( ! matrixFilenameBase.empty() ) { bool readOK = ReadMatrixFromFile( expected, GetMatrixFileName<T>( matrixFilenameBase ) ); if( readOK ) { if( (expected.GetNumRows() != arrayDims[0] + 2*haloWidth) || (expected.GetNumColumns() != arrayDims[1] + 2*haloWidth) ) { std::cerr << "The matrix read from file \'" << GetMatrixFileName<T>( matrixFilenameBase ) << "\' does not match the matrix size specified on the command line.\n"; expected.Reset( arrayDims[0] + 2*haloWidth, arrayDims[1] + 2*haloWidth ); } else { haveExpectedData = true; } } if( !haveExpectedData ) { std::cout << "\nSince we could not read the expected matrix values,\nperforming stencil operation on host for later comparison with CUDA output.\n" << "Depending on host capabilities, this may take a while." << std::endl; } } if( !haveExpectedData ) { init( expected ); haveExpectedData = true; if( beVerbose ) { std::cout << "initial state:\n" << expected << std::endl; } stdStencil = stdStencilFactory->BuildStencil( opts ); (*stdStencil)( expected, nIters ); } if( beVerbose ) { std::cout << "expected result:\n" << expected << std::endl; } // determine whether we are to save the expected matrix values to a file // to speed up future runs matrixFilenameBase = (std::string)opts.getOptionString( "saveExpMatrixFile" ); if( !matrixFilenameBase.empty() ) { SaveMatrixToFile( expected, GetMatrixFileName<T>( matrixFilenameBase ) ); } assert( haveExpectedData ); // compute the result on the CUDA device Matrix2D<T> data( arrayDims[0] + 2*haloWidth, arrayDims[1] + 2*haloWidth ); Stencil<T>* testStencil = testStencilFactory->BuildStencil( opts ); // Compute the number of floating point operations we will perform. // // Note: in the truly-parallel case, we count flops for redundant // work due to the need for a halo. // But we do not add to the count for the local 1-wide halo since // we aren't computing new values for those items. unsigned long npts = (arrayDims[0] + 2*haloWidth - 2) * (arrayDims[1] + 2*haloWidth - 2); #if defined(PARALLEL) MPICUDAStencil<T>* mpiTestStencil = static_cast<MPICUDAStencil<T>*>( testStencil ); assert( mpiTestStencil != NULL ); int participating = mpiTestStencil->ParticipatingInProgram() ? 1 : 0; int numParticipating = 0; MPI_Allreduce( &participating, // src &numParticipating, // dest 1, // count MPI_INT, // type MPI_SUM, // op MPI_COMM_WORLD ); // communicator npts *= numParticipating; #endif // defined(PARALLEL) // In our 9-point stencil, there are 11 floating point operations // per point (3 multiplies and 11 adds): // // newval = weight_center * centerval + // weight_cardinal * (northval + southval + eastval + westval) + // weight_diagnoal * (neval + nwval + seval + swval) // // we do this stencil operation 'nIters' times unsigned long nflops = npts * 11 * nIters; #if defined(PARALLEL) if( cwrank == 0 ) { #endif // defined(PARALLEL) std::cout << "Performing " << nWarmupPasses << " warmup passes..."; #if defined(PARALLEL) } #endif // defined(PARALLEL) for( unsigned int pass = 0; pass < nWarmupPasses; pass++ ) { init(data); (*testStencil)( data, nIters ); } #if defined(PARALLEL) if( cwrank == 0 ) { #endif // defined(PARALLEL) std::cout << "done." << std::endl; #if defined(PARALLEL) } #endif // defined(PARALLEL) #if defined(PARALLEL) MPI_Comm_rank( MPI_COMM_WORLD, &cwrank ); if( cwrank == 0 ) { #endif // defined(PARALLEL) std::cout << "\nPerforming stencil operation on chosen device, " << nPasses << " passes.\n" << "Depending on chosen device, this may take a while." << std::endl; #if defined(PARALLEL) } #endif // defined(PARALLEL) #if !defined(PARALLEL) std::cout << "At the end of each pass the number of validation\nerrors observed will be printed to the standard output." << std::endl; #endif // !defined(PARALLEL) for( unsigned int pass = 0; pass < nPasses; pass++ ) { #if !defined(PARALLEL) std::cout << "pass " << pass << ": "; #endif // !defined(PARALLEL) init( data ); int timerHandle = Timer::Start(); (*testStencil)( data, nIters ); double elapsedTime = Timer::Stop( timerHandle, "CUDA stencil" ); // find and report the computation rate double gflops = (nflops / elapsedTime) / 1e9; resultDB.AddResult( timerDesc, experimentDescriptionStr.str(), "GFLOPS", gflops ); if( beVerbose ) { std::cout << "observed result, pass " << pass << ":\n" << data << std::endl; } // validate the result #if defined(PARALLEL) StencilValidater<T>* validater = new MPIStencilValidater<T>; #else StencilValidater<T>* validater = new SerialStencilValidater<T>; #endif // defined(PARALLEL) validater->ValidateResult( expected, data, valErrThreshold, nValErrsToPrint ); } } catch( ... ) { // clean up - abnormal termination // wish we didn't have to do this, but C++ exceptions do not // support a try-catch-finally approach delete stdStencil; delete stdStencilFactory; delete testStencil; delete testStencilFactory; throw; } // clean up - normal termination delete stdStencil; delete stdStencilFactory; delete testStencil; delete testStencilFactory; }
void runTest(const string& testName, cl_device_id dev, cl_context ctx, cl_command_queue queue, ResultDatabase& resultDB, OptionParser& op, const string& compileFlags) { int err = 0; // Collect basic MPI information int mpi_size, mpi_rank; MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); // Program Setup cl_program prog = clCreateProgramWithSource(ctx, 1, &cl_source_scan, NULL, &err); CL_CHECK_ERROR(err); // Before proceeding, make sure the kernel code compiles and // all kernels are valid. if (mpi_rank == 0) { cout << "Compiling scan kernels." << endl; } err = clBuildProgram(prog, 1, &dev, compileFlags.c_str(), NULL, NULL); CL_CHECK_ERROR(err); if (err != CL_SUCCESS) { char log[5000]; size_t retsize = 0; err = clGetProgramBuildInfo(prog, dev, CL_PROGRAM_BUILD_LOG, 5000 * sizeof(char), log, &retsize); CL_CHECK_ERROR(err); cout << "Build error." << endl; cout << "Retsize: " << retsize << endl; cout << "Log: " << log << endl; return; } // Extract out the 3 kernels cl_kernel reduce = clCreateKernel(prog, "reduce", &err); CL_CHECK_ERROR(err); cl_kernel top_scan = clCreateKernel(prog, "top_scan", &err); CL_CHECK_ERROR(err); cl_kernel bottom_scan = clCreateKernel(prog, "bottom_scan", &err); CL_CHECK_ERROR(err); // If the device doesn't support at least 256 work items in a // group, use a different kernel (TODO) if (getMaxWorkGroupSize(dev) < 256) { cout << "Scan requires work group size of at least 256" << endl; char atts[1024] = "GSize_Not_Supported"; // resultDB requires neg entry for every possible result int passes = op.getOptionInt("passes"); for (int k = 0; k < passes; k++) { resultDB.AddResult(testName+"-Kernel" , atts, "GB/s", FLT_MAX); resultDB.AddResult(testName+"-Kernel+PCIe" , atts, "GB/s", FLT_MAX); resultDB.AddResult(testName+"-MPI_ExScan" , atts, "GB/s", FLT_MAX); resultDB.AddResult(testName+"-Overall" , atts, "GB/s", FLT_MAX); } return; } // Problem Sizes int probSizes[4] = { 1, 8, 32, 64 }; int size = probSizes[op.getOptionInt("size")-1]; // Convert to MB size = (size * 1024 * 1024) / sizeof(T); // Create input data on CPU unsigned int bytes = size * sizeof(T); // Allocate pinned host memory for input data (h_idata) cl_mem h_i = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, bytes, NULL, &err); CL_CHECK_ERROR(err); T* h_idata = (T*)clEnqueueMapBuffer(queue, h_i, true, CL_MAP_READ|CL_MAP_WRITE, 0, bytes, 0, NULL, NULL, &err); CL_CHECK_ERROR(err); // Allocate pinned host memory for output data (h_odata) cl_mem h_o = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, bytes, NULL, &err); CL_CHECK_ERROR(err); T* h_odata = (T*)clEnqueueMapBuffer(queue, h_o, true, CL_MAP_READ|CL_MAP_WRITE, 0, bytes, 0, NULL, NULL, &err); CL_CHECK_ERROR(err); // Initialize host memory if (mpi_rank == 0) { cout << "Initializing host memory." << endl; } for (int i = 0; i < size; i++) { h_idata[i] = i % 2; //Fill with some pattern h_odata[i] = -1; } // Allocate device memory for input array cl_mem d_idata = clCreateBuffer(ctx, CL_MEM_READ_WRITE, bytes, NULL, &err); CL_CHECK_ERROR(err); // Allocate device memory for output array cl_mem d_odata = clCreateBuffer(ctx, CL_MEM_READ_WRITE, bytes, NULL, &err); CL_CHECK_ERROR(err); // Number of local work items per group const size_t local_wsize = 256; // Number of local work groups and total work items const size_t num_work_groups = 64; const size_t global_wsize = local_wsize * num_work_groups; // Allocate device memory for local work group intermediate sums cl_mem d_isums = clCreateBuffer(ctx, CL_MEM_READ_WRITE, num_work_groups * sizeof(T), NULL, &err); CL_CHECK_ERROR(err); // Allocate pinned host memory for intermediate block sums (h_isums) cl_mem h_b = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, num_work_groups * sizeof(T), NULL, &err); CL_CHECK_ERROR(err); T* h_isums = (T*)clEnqueueMapBuffer(queue, h_b, true, CL_MAP_READ|CL_MAP_WRITE, 0, num_work_groups * sizeof(T), 0, NULL, NULL, &err); CL_CHECK_ERROR(err); // Set the kernel arguments for the reduction kernel err = clSetKernelArg(reduce, 0, sizeof(cl_mem), (void*)&d_idata); CL_CHECK_ERROR(err); err = clSetKernelArg(reduce, 1, sizeof(cl_mem), (void*)&d_isums); CL_CHECK_ERROR(err); err = clSetKernelArg(reduce, 2, sizeof(cl_int), (void*)&size); CL_CHECK_ERROR(err); err = clSetKernelArg(reduce, 3, local_wsize * sizeof(T), NULL); CL_CHECK_ERROR(err); // Set the kernel arguments for the top-level scan err = clSetKernelArg(top_scan, 0, sizeof(cl_mem), (void*)&d_isums); CL_CHECK_ERROR(err); err = clSetKernelArg(top_scan, 1, sizeof(cl_int), (void*)&num_work_groups); CL_CHECK_ERROR(err); err = clSetKernelArg(top_scan, 2, local_wsize * 2 * sizeof(T), NULL); CL_CHECK_ERROR(err); // Set the kernel arguments for the bottom-level scan err = clSetKernelArg(bottom_scan, 0, sizeof(cl_mem), (void*)&d_idata); CL_CHECK_ERROR(err); err = clSetKernelArg(bottom_scan, 1, sizeof(cl_mem), (void*)&d_isums); CL_CHECK_ERROR(err); err = clSetKernelArg(bottom_scan, 2, sizeof(cl_mem), (void*)&d_odata); CL_CHECK_ERROR(err); err = clSetKernelArg(bottom_scan, 3, sizeof(cl_int), (void*)&size); CL_CHECK_ERROR(err); err = clSetKernelArg(bottom_scan, 4, local_wsize * 2 * sizeof(T), NULL); CL_CHECK_ERROR(err); // Repeat the test multiple times to get a good measurement int passes = op.getOptionInt("passes"); if (mpi_rank == 0) { cout << "Running benchmark with size " << size << endl; } for (int k = 0; k < passes; k++) { // Timing variables double pcie_time=0., kernel_time=0., mpi_time=0.; // Copy data to GPU Event evTransfer("PCIe transfer"); double time_temp = 0.; err = clEnqueueWriteBuffer(queue, d_idata, true, 0, bytes, h_idata, 0, NULL, &evTransfer.CLEvent()); CL_CHECK_ERROR(err); evTransfer.FillTimingInfo(); pcie_time += (double)evTransfer.StartEndRuntime() / 1e9; // This code uses a reduce-then-scan strategy. // The major steps of the algorithm are: // 1. Local reduction on a node // 2. Global exclusive scan of the reduction values // 3. Local inclusive scan, seeded with the node's result // from the global exclusive scan Event ev_reduce("Reduction Kernel"); err = clEnqueueNDRangeKernel(queue, reduce, 1, NULL, &global_wsize, &local_wsize, 0, NULL, &ev_reduce.CLEvent()); err = clFinish(queue); ev_reduce.FillTimingInfo(); kernel_time += (double)ev_reduce.StartEndRuntime() * 1e-9; // Next step is to copy the reduced blocks back to the host, // sum them, and perform the MPI exlcusive (top level) scan. err = clEnqueueReadBuffer(queue, d_isums, true, 0, num_work_groups*sizeof(T), h_isums, 0, NULL, &evTransfer.CLEvent()); CL_CHECK_ERROR(err); evTransfer.FillTimingInfo(); pcie_time += (double)evTransfer.StartEndRuntime() * 1e-9; // Start the timer for MPI Scan int globscan_th = Timer::Start(); T reduced=0., scanned=0.; // To get the true sum for this node, we have to add up // the block sums before MPI scanning. for (int i = 0; i < num_work_groups; i++) { reduced += h_isums[i]; } // Next step is an exclusive scan across MPI ranks. // Then a local scan seeded with the result from MPI. globalExscan(&reduced, &scanned); mpi_time += Timer::Stop(globscan_th, "Global Scan"); // Now, scanned contains all the information we need from other nodes // Next step is to perform the local top level (i.e. across blocks) scan, // but seed it with the "scanned", the sum of elems on all lower ranks. h_isums[0] += scanned; err = clEnqueueWriteBuffer(queue, d_isums, true, 0, sizeof(T), h_isums, 0, NULL, &evTransfer.CLEvent()); CL_CHECK_ERROR(err); evTransfer.FillTimingInfo(); pcie_time += (double)evTransfer.StartEndRuntime() * 1e-9; Event ev_scan("Scan Kernel"); err = clEnqueueNDRangeKernel(queue, top_scan, 1, NULL, &local_wsize, &local_wsize, 0, NULL, &ev_scan.CLEvent()); err = clFinish(queue); CL_CHECK_ERROR(err); ev_scan.FillTimingInfo(); kernel_time += ((double)ev_scan.StartEndRuntime() * 1.e-9); // Finally, a bottom-level scan is performed by each block // that is seeded with the scanned value in block sums err = clEnqueueNDRangeKernel(queue, bottom_scan, 1, NULL, &global_wsize, &local_wsize, 0, NULL, &ev_scan.CLEvent()); err = clFinish(queue); CL_CHECK_ERROR(err); ev_scan.FillTimingInfo(); kernel_time += ((double)ev_scan.StartEndRuntime() * 1.e-9); // Read data back for correctness check err = clEnqueueReadBuffer(queue, d_odata, true, 0, bytes, h_odata, 0, NULL, &evTransfer.CLEvent()); CL_CHECK_ERROR(err); // Lightweight correctness check -- won't apply // if data is not initialized to i%2 above if (mpi_rank == mpi_size-1) { if (h_odata[size-1] != (mpi_size * size) / 2) { cout << "Test Failed\n"; } else { cout << "Test Passed\n"; } } char atts[1024]; sprintf(atts, "%d items", size); double global_gb = (double)(mpi_size * size * sizeof(T)) / (1000. * 1000. * 1000.); resultDB.AddResult(testName+"-Kernel" , atts, "GB/s", global_gb / kernel_time); resultDB.AddResult(testName+"-Kernel+PCIe" , atts, "GB/s", global_gb / (kernel_time + pcie_time)); resultDB.AddResult(testName+"-MPI_ExScan" , atts, "GB/s", (mpi_size * sizeof(T) *1e-9) / mpi_time); resultDB.AddResult(testName+"-Overall" , atts, "GB/s", global_gb / (kernel_time + pcie_time + mpi_time)); } // Clean up device memory err = clReleaseMemObject(d_idata); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_odata); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_isums); CL_CHECK_ERROR(err); // Clean up pinned host memory err = clEnqueueUnmapMemObject(queue, h_i, h_idata, 0, NULL, NULL); CL_CHECK_ERROR(err); err = clEnqueueUnmapMemObject(queue, h_o, h_odata, 0, NULL, NULL); CL_CHECK_ERROR(err); err = clEnqueueUnmapMemObject(queue, h_b, h_isums, 0, NULL, NULL); CL_CHECK_ERROR(err); err = clReleaseMemObject(h_i); CL_CHECK_ERROR(err); err = clReleaseMemObject(h_o); CL_CHECK_ERROR(err); err = clReleaseMemObject(h_b); CL_CHECK_ERROR(err); err = clReleaseProgram(prog); CL_CHECK_ERROR(err); err = clReleaseKernel(reduce); CL_CHECK_ERROR(err); err = clReleaseKernel(top_scan); CL_CHECK_ERROR(err); err = clReleaseKernel(bottom_scan); CL_CHECK_ERROR(err); }
void runTest(const string& testName, cl_device_id dev, cl_context ctx, cl_command_queue queue, ResultDatabase& resultDB, OptionParser& op, const string& compileFlags) { int N; if (op.getOptionInt("KiB") == 0) { int probSizes[4] = { 1, 4, 8, 16 }; N = probSizes[op.getOptionInt("size")-1] * 1024 / sizeof(T); } else { N = op.getOptionInt("KiB") * 1024 / sizeof(T); } cl_int err; int waitForEvents = 1; size_t m = N, n = N, k = N; size_t lda, ldb, ldc; const T alpha = 1; const T beta = -1; int i, j; lda = ldb = ldc = N; cl_uint numDimensions = 0; clGetDeviceInfo (dev, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), &numDimensions, NULL); size_t *maxWorkSizes = new size_t[numDimensions]; clGetDeviceInfo (dev, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*numDimensions, maxWorkSizes, NULL); if (numDimensions<2 || maxWorkSizes[0]<16 || maxWorkSizes[1] < 4) { cout << "SGEMM needs a 2-dimensional work group size of at least {16,4}." << endl; int passes = op.getOptionInt("passes"); char atts[1024] = "GSize_Not_Supported"; for (; passes > 0; --passes) { for (i = 0; i < 2; i++) { const char transb = i ? 'T' : 'N'; resultDB.AddResult(testName+"-"+transb, atts, "GFlops", FLT_MAX); resultDB.AddResult(testName+"-"+transb+"_PCIe", atts, "GFlops", FLT_MAX); resultDB.AddResult(testName+"-"+transb+"_Parity", atts, "N", FLT_MAX); } } return; } size_t localWorkSize[2] = {16,4}; // Create program object cl_program prog = clCreateProgramWithSource(ctx, 1, &cl_source_sgemmN, NULL, &err); CL_CHECK_ERROR(err); string flags = compileFlags + "-cl-mad-enable"; err = clBuildProgram(prog, 0, NULL, flags.c_str(), NULL, NULL); CL_CHECK_ERROR(err); // If compilation fails, print error messages and return if (err != CL_SUCCESS) { char log[5000]; size_t retsize = 0; err = clGetProgramBuildInfo (prog, dev, CL_PROGRAM_BUILD_LOG, 5000*sizeof(char), log, &retsize); CL_CHECK_ERROR(err); cout << "Retsize: " << retsize << endl; cout << "Log: " << log << endl; exit(-1); } // Generate the kernel objects cl_kernel sgemmNN = clCreateKernel(prog, "sgemmNN", &err); CL_CHECK_ERROR(err); cl_kernel sgemmNT = clCreateKernel(prog, "sgemmNT", &err); CL_CHECK_ERROR(err); // Allocate memory for the matrices T *A, *B, *C; cl_mem Aobj, Bobj, Cobj; if (true) // pinned { Aobj = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(T)*N*N, NULL, &err); CL_CHECK_ERROR(err); A =(T*)clEnqueueMapBuffer(queue,Aobj,true,CL_MAP_READ|CL_MAP_WRITE, 0,sizeof(T)*N*N,0, NULL,NULL,&err); CL_CHECK_ERROR(err); Bobj = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(T)*N*N, NULL, &err); CL_CHECK_ERROR(err); B =(T*)clEnqueueMapBuffer(queue,Bobj,true,CL_MAP_READ|CL_MAP_WRITE, 0,sizeof(T)*N*N,0, NULL,NULL,&err); CL_CHECK_ERROR(err); Cobj = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(T)*N*N, NULL, &err); CL_CHECK_ERROR(err); C =(T*)clEnqueueMapBuffer(queue,Cobj,true,CL_MAP_READ|CL_MAP_WRITE, 0,sizeof(T)*N*N,0, NULL,NULL,&err); CL_CHECK_ERROR(err); } else { A = (T*)malloc( N*N*sizeof( T ) ); B = (T*)malloc( N*N*sizeof( T ) ); C = (T*)malloc( N*N*sizeof( T ) ); } // Initialize inputs srand48(13579862); for(i=0; i<m; ++i){ for(j=0; j<k; ++j){ A[i*k+j] = (T)(0.5 + drand48()*1.5); } } for(i=0; i<k; ++i){ for(j=0; j<n; ++j){ B[i*n+j] = (T)(0.5 + drand48()*1.5); } } for(i=0; i<m; ++i){ for(j=0; j<n; ++j){ C[i*n+j] = 0.0; } } // Pass A and B to the GPU and create a GPU buffer for C cl_mem Agpu = clCreateBuffer(ctx, CL_MEM_READ_WRITE, m*k * sizeof(T), NULL, &err); CL_BAIL_ON_ERROR(err); cl_mem Bgpu = clCreateBuffer(ctx, CL_MEM_READ_WRITE, k*n * sizeof(T), NULL, &err); CL_BAIL_ON_ERROR(err); cl_mem Cgpu = clCreateBuffer(ctx, CL_MEM_READ_WRITE, m*n * sizeof(T), NULL, &err); CL_BAIL_ON_ERROR(err); // Set arguments to the sgemmNN kernel err = clSetKernelArg(sgemmNN, 0, sizeof(cl_mem), (void*)&Agpu); CL_BAIL_ON_ERROR(err); err = clSetKernelArg(sgemmNN, 1, sizeof(int), (void*)&lda); CL_BAIL_ON_ERROR(err); err = clSetKernelArg(sgemmNN, 2, sizeof(cl_mem), (void*)&Bgpu); CL_BAIL_ON_ERROR(err); err = clSetKernelArg(sgemmNN, 3, sizeof(int), (void*)&ldb); CL_BAIL_ON_ERROR(err); err = clSetKernelArg(sgemmNN, 4, sizeof(cl_mem), (void*)&Cgpu); CL_BAIL_ON_ERROR(err); err = clSetKernelArg(sgemmNN, 5, sizeof(int), (void*)&ldc); CL_BAIL_ON_ERROR(err); err = clSetKernelArg(sgemmNN, 6, sizeof(int), (void*)&k); CL_BAIL_ON_ERROR(err); err = clSetKernelArg(sgemmNN, 7, sizeof(T), (void*)&alpha); CL_BAIL_ON_ERROR(err); err = clSetKernelArg(sgemmNN, 8, sizeof(T), (void*)&beta); CL_BAIL_ON_ERROR(err); // Pass arguments to the sgemmNT kernel err = clSetKernelArg(sgemmNT, 0, sizeof(cl_mem), (void*)&Agpu); CL_BAIL_ON_ERROR(err); err = clSetKernelArg(sgemmNT, 1, sizeof(int), (void*)&lda); CL_BAIL_ON_ERROR(err); err = clSetKernelArg(sgemmNT, 2, sizeof(cl_mem), (void*)&Bgpu); CL_BAIL_ON_ERROR(err); err = clSetKernelArg(sgemmNT, 3, sizeof(int), (void*)&ldb); CL_BAIL_ON_ERROR(err); err = clSetKernelArg(sgemmNT, 4, sizeof(cl_mem), (void*)&Cgpu); CL_BAIL_ON_ERROR(err); err = clSetKernelArg(sgemmNT, 5, sizeof(int), (void*)&ldc); CL_BAIL_ON_ERROR(err); err = clSetKernelArg(sgemmNT, 6, sizeof(int), (void*)&k); CL_BAIL_ON_ERROR(err); err = clSetKernelArg(sgemmNT, 7, sizeof(T), (void*)&alpha); CL_BAIL_ON_ERROR(err); err = clSetKernelArg(sgemmNT, 8, sizeof(T), (void*)&beta); CL_BAIL_ON_ERROR(err); const size_t globalWorkSize[2] = {m/4,n/4}; int passes = op.getOptionInt("passes"); // Run NN for (int i = 0; i < passes; i++) { Event evDownload1("Download A"); Event evUpload("Upload"); Event evNN("sgemmNN"); err = clEnqueueWriteBuffer(queue, Agpu, CL_TRUE, 0, m*n*sizeof(T), A, 0, NULL, &evDownload1.CLEvent()); err = clEnqueueWriteBuffer(queue, Bgpu, CL_TRUE, 0, m*n*sizeof(T), B, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, Cgpu, CL_TRUE, 0, m*n*sizeof(T), C, 0, NULL, NULL); // Wait until data transfers finish clFinish(queue); CL_BAIL_ON_ERROR(err); //Launch Kernels err = clEnqueueNDRangeKernel(queue, sgemmNN, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &evNN.CLEvent()); clFinish(queue); CL_BAIL_ON_ERROR(err); err = clEnqueueReadBuffer(queue, Cgpu, CL_TRUE, 0, m*n*sizeof(T), C, 0, NULL, &evUpload.CLEvent()); clFinish(queue); CL_BAIL_ON_ERROR(err); evNN.FillTimingInfo(); evDownload1.FillTimingInfo(); evUpload.FillTimingInfo(); double user_wait_time = 0.0; double gemm_pure_time = 0.0; gemm_pure_time = evNN.SubmitEndRuntime(); user_wait_time = evUpload.EndTime() - evDownload1.QueuedTime(); double transfer_time = user_wait_time - gemm_pure_time; double flops = 2.0*(double)N*N*N; resultDB.AddResult(testName+"-N", toString(N), "GFLOPS", flops / gemm_pure_time); resultDB.AddResult(testName+"-N_PCIe", toString(N), "GFLOPS", flops / user_wait_time); resultDB.AddResult(testName+"-N_Parity", toString(N), "N", transfer_time / gemm_pure_time); } // Run NT for (int i = 0; i < passes; i++) { Event evDownload1("Download A"); Event evUpload("Upload"); Event evNT("sgemmNT"); err = clEnqueueWriteBuffer(queue, Agpu, CL_TRUE, 0, m*n*sizeof(T), A, 0, NULL, &evDownload1.CLEvent()); err = clEnqueueWriteBuffer(queue, Bgpu, CL_TRUE, 0, m*n*sizeof(T), B, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, Cgpu, CL_TRUE, 0, m*n*sizeof(T), C, 0, NULL, NULL); clFinish(queue); CL_BAIL_ON_ERROR(err); //Launch Kernels err = clEnqueueNDRangeKernel(queue, sgemmNT, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &evNT.CLEvent()); clFinish(queue); CL_BAIL_ON_ERROR(err); err = clEnqueueReadBuffer(queue, Cgpu, CL_TRUE, 0, m*n*sizeof(T), C, 0, NULL, &evUpload.CLEvent()); clFinish(queue); CL_BAIL_ON_ERROR(err); evNT.FillTimingInfo(); evDownload1.FillTimingInfo(); evUpload.FillTimingInfo(); double user_wait_time = 0.0; double gemm_pure_time = 0.0; gemm_pure_time = evNT.SubmitEndRuntime(); user_wait_time = evUpload.EndTime() - evDownload1.QueuedTime(); double transfer_time = user_wait_time - gemm_pure_time; double flops = 2.0*(double)N*N*N; resultDB.AddResult(testName+"-T", toString(N), "GFLOPS", flops / gemm_pure_time); resultDB.AddResult(testName+"-T_PCIe", toString(N), "GFLOPS", flops / user_wait_time); resultDB.AddResult(testName+"-T_Parity", toString(N), "N", transfer_time / gemm_pure_time); } if (true) // pinned { err = clReleaseMemObject(Aobj); CL_CHECK_ERROR(err); err = clReleaseMemObject(Bobj); CL_CHECK_ERROR(err); err = clReleaseMemObject(Cobj); CL_CHECK_ERROR(err); } else { free(A); free(B); free(C); } err = clReleaseProgram(prog); CL_CHECK_ERROR(err); err = clReleaseKernel(sgemmNN); CL_CHECK_ERROR(err); err = clReleaseKernel(sgemmNT); CL_CHECK_ERROR(err); err = clReleaseMemObject(Agpu); CL_CHECK_ERROR(err); err = clReleaseMemObject(Bgpu); CL_CHECK_ERROR(err); err = clReleaseMemObject(Cgpu); CL_CHECK_ERROR(err); }
template <class T> void RunTest(cl_device_id id, cl_context ctx, cl_command_queue queue, ResultDatabase &resultDB, int npasses, int verbose, int quiet, float repeatF, size_t localWorkSize, ProgressBar &pb, const char* typeName, const char* precision, const char* pragmaText) { int err; cl_mem mem1; char sizeStr[128]; T *hostMem, *hostMem2; int aIdx = 0; while ((aTests!=0) && (aTests[aIdx].name!=0)) { ostringstream oss; struct _benchmark_type temp = aTests[aIdx]; // Calculate adjusted repeat factor int tentativeRepeats = (int)round(repeatF*temp.numRepeats); if (tentativeRepeats < 2) { tentativeRepeats = 2; double realRepeatF = ((double)tentativeRepeats) / temp.numRepeats; if (realRepeatF>8.0*repeatF) // do not cut the number of unrolls // by more than a factor of 8 realRepeatF = 8.0*repeatF; temp.numUnrolls = (int)round(repeatF*temp.numUnrolls/realRepeatF); } temp.numRepeats = tentativeRepeats; // Generate kernel source code generateKernel(oss, temp, typeName, pragmaText); std::string kernelCode(oss.str()); // If in verbose mode, print the kernel if (verbose) { cout << "Code for kernel " << temp.name << ":\n" + kernelCode << endl; } // Alloc host memory int halfNumFloatsMax = temp.halfBufSizeMax*1024; int numFloatsMax = 2*halfNumFloatsMax; hostMem = new T[numFloatsMax]; hostMem2 = new T[numFloatsMax]; // Allocate device memory mem1 = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(T)*numFloatsMax, NULL, &err); CL_CHECK_ERROR(err); // Issue a copy to force device allocation err = clEnqueueWriteBuffer(queue, mem1, true, 0, numFloatsMax*sizeof(T), hostMem, 0, NULL, NULL); CL_CHECK_ERROR(err); // Create kernel program object const char* progSource[] = {kernelCode.c_str()}; cl_program prog = clCreateProgramWithSource(ctx, 1, progSource, NULL, &err); CL_CHECK_ERROR(err); // Compile the program err = clBuildProgram(prog, 1, &id, opts, NULL, NULL); CL_CHECK_ERROR(err); if (err != 0) { char log[5000]; size_t retsize = 0; err = clGetProgramBuildInfo(prog, id, CL_PROGRAM_BUILD_LOG, 5000*sizeof(char), log, &retsize); CL_CHECK_ERROR(err); cout << "Build error." << endl; cout << "Retsize: " << retsize << endl; cout << "Log: " << log << endl; return; } // Check if we have to dump the PTX (NVIDIA only) // Disabled by default // Set environment variable DUMP_PTX to enable char* dumpPtx = getenv("DUMP_PTX"); if (dumpPtx && !strcmp(dumpPtx, "1")) { // must dump the PTX dumpPTXCode(ctx, prog, temp.name); } // Extract out kernel cl_kernel kernel_madd = clCreateKernel(prog, temp.name, &err); CL_CHECK_ERROR(err); err = clSetKernelArg (kernel_madd, 0, sizeof(cl_mem), (void*)&mem1); CL_CHECK_ERROR (err); err = clSetKernelArg (kernel_madd, 1, sizeof(cl_int), (void*)&temp.numRepeats); CL_CHECK_ERROR (err); if (verbose) { cout << "Running kernel " << temp.name << endl; } for (int halfNumFloats=temp.halfBufSizeMin*1024 ; halfNumFloats<=temp.halfBufSizeMax*1024 ; halfNumFloats*=temp.halfBufSizeStride) { // Set up input memory, first half = second half int numFloats = 2*halfNumFloats; for (int j=0; j<halfNumFloats; ++j) { hostMem[j] = hostMem[numFloats-j-1] = (T)(drand48()*5.0); } size_t globalWorkSize = numFloats; for (int pas=0 ; pas<npasses ; ++pas) { err = clEnqueueWriteBuffer (queue, mem1, true, 0, numFloats*sizeof(T), hostMem, 0, NULL, NULL); CL_CHECK_ERROR(err); Event evKernel(temp.name); err = clEnqueueNDRangeKernel(queue, kernel_madd, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, &evKernel.CLEvent()); CL_CHECK_ERROR(err); err = clWaitForEvents(1, &evKernel.CLEvent()); CL_CHECK_ERROR(err); evKernel.FillTimingInfo(); double flopCount = (double)numFloats * temp.flopCount * temp.numRepeats * temp.numUnrolls * temp.numStreams; double gflop = flopCount / (double)(evKernel.SubmitEndRuntime()); sprintf (sizeStr, "Size:%07d", numFloats); resultDB.AddResult(string(temp.name)+precision, sizeStr, "GFLOPS", gflop); // Zero out the test host memory for (int j=0 ; j<numFloats ; ++j) { hostMem2[j] = 0.0; } // Read the result device memory back to the host err = clEnqueueReadBuffer(queue, mem1, true, 0, numFloats*sizeof(T), hostMem2, 0, NULL, NULL); CL_CHECK_ERROR(err); // Check the result -- At a minimum the first half of memory // should match the second half exactly for (int j=0 ; j<halfNumFloats ; ++j) { if (hostMem2[j] != hostMem2[numFloats-j-1]) { cout << "Error; hostMem2[" << j << "]=" << hostMem2[j] << " is different from its twin element hostMem2[" << (numFloats-j-1) << "]=" << hostMem2[numFloats-j-1] <<"; stopping check\n"; break; } } // update progress bar pb.addItersDone(); if (!verbose && !quiet) pb.Show(stdout); } } err = clReleaseKernel (kernel_madd); CL_CHECK_ERROR(err); err = clReleaseProgram (prog); CL_CHECK_ERROR(err); err = clReleaseMemObject(mem1); CL_CHECK_ERROR(err); aIdx += 1; delete[] hostMem; delete[] hostMem2; } // Now, test hand-tuned custom kernels // 2D - width and height of input const int w = 2048, h = 2048; const int bytes = w * h * sizeof(T); // Allocate some device memory mem1 = clCreateBuffer(ctx, CL_MEM_READ_WRITE, bytes, NULL, &err); CL_CHECK_ERROR(err); // Get a couple non-zero random numbers float val1 = 0, val2 = 0; while (val1==0 || val2==0) { val1 = drand48(); val2 = drand48(); } // For each custom kernel for (int kCounter = 0; kCounter < 2; kCounter++) { // Calculate adjusted repeat factor int tentativeRepeats = (int)round(repeatF*5); int nUnrolls = 100; if (tentativeRepeats < 2) { tentativeRepeats = 2; double realRepeatF = ((double)tentativeRepeats) / 5; if (realRepeatF>8.0*repeatF) // do not cut the number of unrolls // by more than a factor of 8 realRepeatF = 8.0*repeatF; nUnrolls = (int)round(repeatF*100/realRepeatF); } // Double precision not currently supported string kSource = generateUKernel(kCounter, false, tentativeRepeats, nUnrolls, typeName, pragmaText); const char* progSource[] = {kSource.c_str()}; cl_program prog = clCreateProgramWithSource(ctx, 1, progSource, NULL, &err); CL_CHECK_ERROR(err); // Compile kernel err = clBuildProgram(prog, 1, &id, opts, NULL, NULL); CL_CHECK_ERROR(err); // Extract out kernel cl_kernel kernel_madd = clCreateKernel(prog, "peak", &err); // Calculate kernel launch parameters //size_t localWorkSize = maxGroupSize<128?maxGroupSize:128; size_t globalWorkSize = w * h; // Set the arguments err = clSetKernelArg(kernel_madd, 0, sizeof(cl_mem), (void*)&mem1); CL_CHECK_ERROR(err); err = clSetKernelArg(kernel_madd, 1, sizeof(T), (void*)&val1); CL_CHECK_ERROR(err); err = clSetKernelArg(kernel_madd, 2, sizeof(T), (void*)&val2); CL_CHECK_ERROR(err); // Event object for timing Event evKernel_madd("madd"); for (int passCounter=0; passCounter < npasses; passCounter++) { err = clEnqueueNDRangeKernel(queue, kernel_madd, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, &evKernel_madd.CLEvent()); CL_CHECK_ERROR(err); // Wait for the kernel to finish err = clWaitForEvents(1, &evKernel_madd.CLEvent()); CL_CHECK_ERROR(err); evKernel_madd.FillTimingInfo(); // Calculate result and add to DB char atts[1024]; double nflopsPerItem = getUFlopCount(kCounter, false, tentativeRepeats, nUnrolls); sprintf(atts, "Size:%d", w*h); double gflops = (double) (nflopsPerItem*w*h) / (double) evKernel_madd.SubmitEndRuntime(); if (kCounter) { resultDB.AddResult(string("MulMAddU")+precision, atts, "GFLOPS", gflops); } else { resultDB.AddResult(string("MAddU")+precision, atts, "GFLOPS", gflops); } // update progress bar pb.addItersDone(); if (!verbose && !quiet) { pb.Show(stdout); } } err = clReleaseKernel(kernel_madd); CL_CHECK_ERROR(err); err = clReleaseProgram(prog); CL_CHECK_ERROR(err); } err = clReleaseMemObject(mem1); CL_CHECK_ERROR(err); }
void runTest(const string& name, ResultDatabase &resultDB, OptionParser& op) { int i, j; void* work, * chk; T2* source, * result; unsigned long bytes = 0; if (op.getOptionInt("MB") == 0) { int probSizes[4] = { 1, 8, 96, 256 }; int sizeIndex = op.getOptionInt("size")-1; if (sizeIndex < 0 || sizeIndex >= 4) { cerr << "Invalid size index specified\n"; exit(-1); } bytes = probSizes[sizeIndex]; } else { bytes = op.getOptionInt("MB"); } // Convert to MB bytes *= 1024 * 1024; bool do_dp = dp<T2>(); init(op, do_dp); int passes = op.getOptionInt("passes"); // now determine how much available memory will be used int half_n_ffts = bytes / (512*sizeof(T2)*2); int n_ffts = half_n_ffts * 2; int half_n_cmplx = half_n_ffts * 512; unsigned long used_bytes = half_n_cmplx * 2 * sizeof(T2); double N = half_n_cmplx*2; // allocate host and device memory allocHostBuffer((void**)&source, used_bytes); allocHostBuffer((void**)&result, used_bytes); // init host memory... for (i = 0; i < half_n_cmplx; i++) { source[i].x = (rand()/(float)RAND_MAX)*2-1; source[i].y = (rand()/(float)RAND_MAX)*2-1; source[i+half_n_cmplx].x = source[i].x; source[i+half_n_cmplx].y = source[i].y; } // alloc device memory allocDeviceBuffer(&work, used_bytes); allocDeviceBuffer(&chk, 1); // Copy to device, and record transfer time fprintf(stderr, "used_bytes=%d, N=%g\n", used_bytes, N); int pcie_TH = Timer::Start(); copyToDevice(work, source, used_bytes); double transfer_time = Timer::Stop(pcie_TH, "PCIe Transfer Time"); char chk_init = 0; copyToDevice(chk, &chk_init, 1); const char *sizeStr; stringstream ss; ss << "N=" << (long)N; sizeStr = strdup(ss.str().c_str()); for (int k=0; k<passes; k++) { // time fft kernel int TH = Timer::Start(); forward(work, n_ffts); double t = Timer::Stop(TH, "fft"); double fftsz = 512; double Gflops = n_ffts*(5*fftsz*log2(fftsz))/(t*1e9f); double gflopsPCIe = n_ffts*(5*fftsz*log2(fftsz)) / ((transfer_time+t)*1e9f); resultDB.AddResult(name, sizeStr, "GFLOPS", Gflops); resultDB.AddResult(name+"_PCIe", sizeStr, "GFLOPS", gflopsPCIe); resultDB.AddResult(name+"_Parity", sizeStr, "N", transfer_time / t); // time ifft kernel TH = Timer::Start(); inverse(work, n_ffts); t = Timer::Stop(TH, "ifft"); Gflops = n_ffts*(5*fftsz*log2(fftsz))/(t*1e9f); gflopsPCIe = n_ffts*(5*fftsz*log2(fftsz)) / ((transfer_time+t)*1e9f); resultDB.AddResult(name+"-INV", sizeStr, "GFLOPS", Gflops); resultDB.AddResult(name+"-INV_PCIe", sizeStr, "GFLOPS", gflopsPCIe); resultDB.AddResult(name+"-INV_Parity", sizeStr, "N", transfer_time / t); // time check kernel int failed = check(work, chk, half_n_ffts, half_n_cmplx); cout << "pass " << k << ((failed) ? ": failed\n" : ": passed\n"); } freeDeviceBuffer(work); freeDeviceBuffer(chk); freeHostBuffer(source); freeHostBuffer(result); }
// Modifications: // Jeremy Meredith, Wed Nov 10 14:20:47 EST 2010 // Split timing reports into detailed and summary. For // serial code, we report all trial values. // void SerialStencilTimingReporter::ReportTimings( ResultDatabase& resultDB ) const { resultDB.DumpDetailed( std::cout ); }
void csrTest(cl_device_id dev, cl_context ctx, string compileFlags, cl_command_queue queue, ResultDatabase& resultDB, OptionParser& op, floatType* h_val, int* h_cols, int* h_rowDelimiters, floatType* h_vec, floatType* h_out, int numRows, int numNonZeroes, floatType* refOut, bool padded, const size_t maxImgWidth) { if (devSupportsImages) { char texflags[64]; sprintf(texflags," -DUSE_TEXTURE -DMAX_IMG_WIDTH=%ld", maxImgWidth); compileFlags+=string(texflags); } // Set up OpenCL Program Object int err = 0; cl_program prog = clCreateProgramWithSource(ctx, 1, &cl_source_spmv, NULL, &err); CL_CHECK_ERROR(err); // Build the openCL kernels err = clBuildProgram(prog, 1, &dev, compileFlags.c_str(), NULL, NULL); // CL_CHECK_ERROR(err); // if we check and fail here, we never get to see // the OpenCL compiler's build log // If there is a build error, print the output and return if (err != CL_SUCCESS) { char log[5000]; size_t retsize = 0; err = clGetProgramBuildInfo(prog, dev, CL_PROGRAM_BUILD_LOG, 5000 * sizeof(char), log, &retsize); CL_CHECK_ERROR(err); cout << "Retsize: " << retsize << endl; cout << "Log: " << log << endl; return; } // Device data structures cl_mem d_val, d_vec, d_out; cl_mem d_cols, d_rowDelimiters; // Allocate device memory d_val = clCreateBuffer(ctx, CL_MEM_READ_WRITE, numNonZeroes * sizeof(clFloatType), NULL, &err); CL_CHECK_ERROR(err); d_cols = clCreateBuffer(ctx, CL_MEM_READ_WRITE, numNonZeroes * sizeof(cl_int), NULL, &err); CL_CHECK_ERROR(err); int imgHeight = 0; if (devSupportsImages) { imgHeight=(numRows+maxImgWidth-1)/maxImgWidth; cl_image_format fmt; fmt.image_channel_data_type=CL_FLOAT; if(sizeof(floatType)==4) fmt.image_channel_order=CL_R; else fmt.image_channel_order=CL_RG; d_vec = clCreateImage2D( ctx, CL_MEM_READ_ONLY, &fmt, maxImgWidth, imgHeight, 0, NULL, &err); CL_CHECK_ERROR(err); } else { d_vec = clCreateBuffer(ctx, CL_MEM_READ_WRITE, numRows * sizeof(clFloatType), NULL, &err); CL_CHECK_ERROR(err); } d_out = clCreateBuffer(ctx, CL_MEM_READ_WRITE, numRows * sizeof(clFloatType), NULL, &err); CL_CHECK_ERROR(err); d_rowDelimiters = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (numRows+1) * sizeof(cl_int), NULL, &err); CL_CHECK_ERROR(err); // Setup events for timing Event valTransfer("transfer Val data over PCIe bus"); Event colsTransfer("transfer cols data over PCIe bus"); Event vecTransfer("transfer vec data over PCIe bus"); Event rowDelimitersTransfer("transfer rowDelimiters data over PCIe bus"); // Transfer data to device err = clEnqueueWriteBuffer(queue, d_val, true, 0, numNonZeroes * sizeof(floatType), h_val, 0, NULL, &valTransfer.CLEvent()); CL_CHECK_ERROR(err); err = clEnqueueWriteBuffer(queue, d_cols, true, 0, numNonZeroes * sizeof(int), h_cols, 0, NULL, &colsTransfer.CLEvent()); CL_CHECK_ERROR(err); if (devSupportsImages) { size_t offset[3]={0}; size_t size[3]={maxImgWidth,(size_t)imgHeight,1}; err = clEnqueueWriteImage(queue,d_vec, true, offset, size, 0, 0, h_vec, 0, NULL, &vecTransfer.CLEvent()); CL_CHECK_ERROR(err); } else { err = clEnqueueWriteBuffer(queue, d_vec, true, 0, numRows * sizeof(floatType), h_vec, 0, NULL, &vecTransfer.CLEvent()); CL_CHECK_ERROR(err); } err = clEnqueueWriteBuffer(queue, d_rowDelimiters, true, 0, (numRows+1) * sizeof(int), h_rowDelimiters, 0, NULL, &rowDelimitersTransfer.CLEvent()); CL_CHECK_ERROR(err); err = clFinish(queue); CL_CHECK_ERROR(err); valTransfer.FillTimingInfo(); colsTransfer.FillTimingInfo(); vecTransfer.FillTimingInfo(); rowDelimitersTransfer.FillTimingInfo(); double iTransferTime = valTransfer.StartEndRuntime() + colsTransfer.StartEndRuntime() + vecTransfer.StartEndRuntime() + rowDelimitersTransfer.StartEndRuntime(); int passes = op.getOptionInt("passes"); int iters = op.getOptionInt("iterations"); // Results description info char atts[TEMP_BUFFER_SIZE]; sprintf(atts, "%d_elements_%d_rows", numNonZeroes, numRows); string prefix = ""; prefix += (padded) ? "Padded_" : ""; double gflop = 2 * (double) numNonZeroes; cout << "CSR Scalar Kernel\n"; Event kernelExec("kernel Execution"); // Set up CSR Kernels cl_kernel csrScalar, csrVector; csrScalar = clCreateKernel(prog, "spmv_csr_scalar_kernel", &err); CL_CHECK_ERROR(err); err = clSetKernelArg(csrScalar, 0, sizeof(cl_mem), (void*) &d_val); CL_CHECK_ERROR(err); err = clSetKernelArg(csrScalar, 1, sizeof(cl_mem), (void*) &d_vec); CL_CHECK_ERROR(err); err = clSetKernelArg(csrScalar, 2, sizeof(cl_mem), (void*) &d_cols); CL_CHECK_ERROR(err); err = clSetKernelArg(csrScalar, 3, sizeof(cl_mem), (void*) &d_rowDelimiters); CL_CHECK_ERROR(err); err = clSetKernelArg(csrScalar, 4, sizeof(cl_int), (void*) &numRows); CL_CHECK_ERROR(err); err = clSetKernelArg(csrScalar, 5, sizeof(cl_mem), (void*) &d_out); CL_CHECK_ERROR(err); csrVector = clCreateKernel(prog, "spmv_csr_vector_kernel", &err); CL_CHECK_ERROR(err); err = clSetKernelArg(csrVector, 0, sizeof(cl_mem), (void*) &d_val); CL_CHECK_ERROR(err); err = clSetKernelArg(csrVector, 1, sizeof(cl_mem), (void*) &d_vec); CL_CHECK_ERROR(err); err = clSetKernelArg(csrVector, 2, sizeof(cl_mem), (void*) &d_cols); CL_CHECK_ERROR(err); err = clSetKernelArg(csrVector, 3, sizeof(cl_mem), (void*) &d_rowDelimiters); CL_CHECK_ERROR(err); err = clSetKernelArg(csrVector, 4, sizeof(cl_int), (void*) &numRows); CL_CHECK_ERROR(err); err = clSetKernelArg(csrVector, 5, sizeof(cl_mem), (void*) &d_out); CL_CHECK_ERROR(err); // Append correct suffix to resultsDB entry string suffix; if (sizeof(floatType) == sizeof(float)) { suffix = "-SP"; } else { suffix = "-DP"; } const size_t scalarGlobalWSize = numRows; size_t localWorkSize = BLOCK_SIZE; for (int k = 0; k < passes; k++) { double scalarKernelTime = 0.0; // Run Scalar Kernel for (int j = 0; j < iters; j++) { err = clEnqueueNDRangeKernel(queue, csrScalar, 1, NULL, &scalarGlobalWSize, &localWorkSize, 0, NULL, &kernelExec.CLEvent()); CL_CHECK_ERROR(err); err = clFinish(queue); CL_CHECK_ERROR(err); kernelExec.FillTimingInfo(); scalarKernelTime += kernelExec.StartEndRuntime(); } // Transfer data back to host Event outTransfer("d->h data transfer"); err = clEnqueueReadBuffer(queue, d_out, true, 0, numRows * sizeof(floatType), h_out, 0, NULL, &outTransfer.CLEvent()); CL_CHECK_ERROR(err); err = clFinish(queue); CL_CHECK_ERROR(err); outTransfer.FillTimingInfo(); double oTransferTime = outTransfer.StartEndRuntime(); // Compare reference solution to GPU result if (! verifyResults(refOut, h_out, numRows, k)) { return; // If results don't match, don't report performance } scalarKernelTime = scalarKernelTime / (double)iters; string testName = prefix+"CSR-Scalar"+suffix; double totalTransfer = iTransferTime + oTransferTime; resultDB.AddResult(testName, atts, "Gflop/s", gflop/(scalarKernelTime)); resultDB.AddResult(testName+"_PCIe", atts, "Gflop/s", gflop / (scalarKernelTime+totalTransfer)); } // Clobber correct answer, so we can be sure the vector kernel is correct err = clEnqueueWriteBuffer(queue, d_out, true, 0, numRows * sizeof(floatType), h_vec, 0, NULL, NULL); CL_CHECK_ERROR(err); cout << "CSR Vector Kernel\n"; // Verify Local work group size size_t maxLocal = getMaxWorkGroupSize(ctx, csrVector); if (maxLocal < 32) { cout << "Warning: CSRVector requires a work group size >= 32" << endl; cout << "Skipping this kernel." << endl; err = clReleaseMemObject(d_rowDelimiters); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_vec); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_out); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_val); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_cols); CL_CHECK_ERROR(err); err = clReleaseKernel(csrScalar); CL_CHECK_ERROR(err); err = clReleaseKernel(csrVector); CL_CHECK_ERROR(err); err = clReleaseProgram(prog); CL_CHECK_ERROR(err); return; } localWorkSize = VECTOR_SIZE; while (localWorkSize+VECTOR_SIZE <= maxLocal && localWorkSize+VECTOR_SIZE <= BLOCK_SIZE) { localWorkSize += VECTOR_SIZE; } const size_t vectorGlobalWSize = numRows * VECTOR_SIZE; // 1 warp per row for (int k = 0; k < passes; k++) { // Run Vector Kernel double vectorKernelTime = 0.0; for (int j = 0; j < iters; j++) { err = clEnqueueNDRangeKernel(queue, csrVector, 1, NULL, &vectorGlobalWSize, &localWorkSize, 0, NULL, &kernelExec.CLEvent()); CL_CHECK_ERROR(err); err = clFinish(queue); CL_CHECK_ERROR(err); kernelExec.FillTimingInfo(); vectorKernelTime += kernelExec.StartEndRuntime(); } Event outTransfer("d->h data transfer"); err = clEnqueueReadBuffer(queue, d_out, true, 0, numRows * sizeof(floatType), h_out, 0, NULL, &outTransfer.CLEvent()); CL_CHECK_ERROR(err); err = clFinish(queue); CL_CHECK_ERROR(err); outTransfer.FillTimingInfo(); double oTransferTime = outTransfer.StartEndRuntime(); // Compare reference solution to GPU result if (! verifyResults(refOut, h_out, numRows, k)) { return; // If results don't match, don't report performance } vectorKernelTime = vectorKernelTime / (double)iters; string testName = prefix+"CSR-Vector"+suffix; double totalTransfer = iTransferTime + oTransferTime; resultDB.AddResult(testName, atts, "Gflop/s", gflop/vectorKernelTime); resultDB.AddResult(testName+"_PCIe", atts, "Gflop/s", gflop/(vectorKernelTime+totalTransfer)); } // Free device memory err = clReleaseMemObject(d_rowDelimiters); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_vec); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_out); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_val); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_cols); CL_CHECK_ERROR(err); err = clReleaseKernel(csrScalar); CL_CHECK_ERROR(err); err = clReleaseKernel(csrVector); CL_CHECK_ERROR(err); err = clReleaseProgram(prog); CL_CHECK_ERROR(err); }
void runTest(const string& testName, cl_device_id dev, cl_context ctx, cl_command_queue queue, ResultDatabase& resultDB, OptionParser& op, const string& compileFlags) { int err = 0; // Program Setup cl_program prog = clCreateProgramWithSource(ctx, 1, &cl_source_sort, NULL, &err); CL_CHECK_ERROR(err); // Before proceeding, make sure the kernel code compiles and // all kernels are valid. cout << "Compiling sort kernels." << endl; err = clBuildProgram(prog, 1, &dev, compileFlags.c_str(), NULL, NULL); CL_CHECK_ERROR(err); if (err != CL_SUCCESS) { char log[5000]; size_t retsize = 0; err = clGetProgramBuildInfo(prog, dev, CL_PROGRAM_BUILD_LOG, 5000 * sizeof(char), log, &retsize); CL_CHECK_ERROR(err); cout << "Build error." << endl; cout << "Retsize: " << retsize << endl; cout << "Log: " << log << endl; return; } // Extract out the 3 kernels // Note that these kernels are analogs of those in use for // scan, but have had "visiting" logic added to them // as described by Merrill et al. See // http://www.cs.virginia.edu/~dgm4d/ cl_kernel reduce = clCreateKernel(prog, "reduce", &err); CL_CHECK_ERROR(err); cl_kernel top_scan = clCreateKernel(prog, "top_scan", &err); CL_CHECK_ERROR(err); cl_kernel bottom_scan = clCreateKernel(prog, "bottom_scan", &err); CL_CHECK_ERROR(err); // If the device doesn't support at least 256 work items in a // group, use a different kernel (TODO) if (getMaxWorkGroupSize(dev) < 256) { cout << "Scan requires work group size of at least 256" << endl; char atts[1024] = "GSize_Not_Supported"; // resultDB requires neg entry for every possible result int passes = op.getOptionInt("passes"); for (int k = 0; k < passes; k++) { resultDB.AddResult(testName , atts, "GB/s", FLT_MAX); resultDB.AddResult(testName+"_PCIe" , atts, "GB/s", FLT_MAX); resultDB.AddResult(testName+"_Parity" , atts, "GB/s", FLT_MAX); } return; } // Problem Sizes int probSizes[4] = { 1, 8, 32, 64 }; int size = probSizes[op.getOptionInt("size")-1]; // Convert to MiB size = (size * 1024 * 1024) / sizeof(T); // Create input data on CPU unsigned int bytes = size * sizeof(T); // Allocate pinned host memory for input data (h_idata) cl_mem h_i = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, bytes, NULL, &err); CL_CHECK_ERROR(err); T* h_idata = (T*)clEnqueueMapBuffer(queue, h_i, true, CL_MAP_READ|CL_MAP_WRITE, 0, bytes, 0, NULL, NULL, &err); CL_CHECK_ERROR(err); // Allocate pinned host memory for output data (h_odata) cl_mem h_o = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, bytes, NULL, &err); CL_CHECK_ERROR(err); T* h_odata = (T*)clEnqueueMapBuffer(queue, h_o, true, CL_MAP_READ|CL_MAP_WRITE, 0, bytes, 0, NULL, NULL, &err); CL_CHECK_ERROR(err); // Initialize host memory cout << "Initializing host memory." << endl; for (int i = 0; i < size; i++) { h_idata[i] = i % 16; // Fill with some pattern h_odata[i] = -1; } // The radix width in bits const int radix_width = 4; // Changing this requires major kernel updates const int num_digits = (int)pow((double)2, radix_width); // n possible digits // Allocate device memory for input array cl_mem d_idata = clCreateBuffer(ctx, CL_MEM_READ_WRITE, bytes, NULL, &err); CL_CHECK_ERROR(err); // Allocate device memory for output array cl_mem d_odata = clCreateBuffer(ctx, CL_MEM_READ_WRITE, bytes, NULL, &err); CL_CHECK_ERROR(err); // Number of local work items per group const size_t local_wsize = 256; // Number of global work items const size_t global_wsize = 16384; // i.e. 64 work groups const size_t num_work_groups = global_wsize / local_wsize; // Allocate device memory for local work group intermediate sums cl_mem d_isums = clCreateBuffer(ctx, CL_MEM_READ_WRITE, num_work_groups * num_digits * sizeof(T), NULL, &err); CL_CHECK_ERROR(err); // Set the kernel arguments for the reduction kernel err = clSetKernelArg(reduce, 0, sizeof(cl_mem), (void*)&d_idata); CL_CHECK_ERROR(err); err = clSetKernelArg(reduce, 1, sizeof(cl_mem), (void*)&d_isums); CL_CHECK_ERROR(err); err = clSetKernelArg(reduce, 2, sizeof(cl_int), (void*)&size); CL_CHECK_ERROR(err); err = clSetKernelArg(reduce, 3, local_wsize * sizeof(T), NULL); CL_CHECK_ERROR(err); // Set the kernel arguments for the top-level scan err = clSetKernelArg(top_scan, 0, sizeof(cl_mem), (void*)&d_isums); CL_CHECK_ERROR(err); err = clSetKernelArg(top_scan, 1, sizeof(cl_int), (void*)&num_work_groups); CL_CHECK_ERROR(err); err = clSetKernelArg(top_scan, 2, local_wsize * 2 * sizeof(T), NULL); CL_CHECK_ERROR(err); // Set the kernel arguments for the bottom-level scan err = clSetKernelArg(bottom_scan, 0, sizeof(cl_mem), (void*)&d_idata); CL_CHECK_ERROR(err); err = clSetKernelArg(bottom_scan, 1, sizeof(cl_mem), (void*)&d_isums); CL_CHECK_ERROR(err); err = clSetKernelArg(bottom_scan, 2, sizeof(cl_mem), (void*)&d_odata); CL_CHECK_ERROR(err); err = clSetKernelArg(bottom_scan, 3, sizeof(cl_int), (void*)&size); CL_CHECK_ERROR(err); err = clSetKernelArg(bottom_scan, 4, local_wsize * 2 * sizeof(T), NULL); CL_CHECK_ERROR(err); // Copy data to GPU cout << "Copying input data to device." << endl; Event evTransfer("PCIe transfer"); err = clEnqueueWriteBuffer(queue, d_idata, true, 0, bytes, h_idata, 0, NULL, &evTransfer.CLEvent()); CL_CHECK_ERROR(err); err = clFinish(queue); CL_CHECK_ERROR(err); evTransfer.FillTimingInfo(); double inTransferTime = evTransfer.StartEndRuntime(); // Repeat the test multiplie times to get a good measurement int passes = op.getOptionInt("passes"); cout << "Running benchmark with size " << size << endl; for (int k = 0; k < passes; k++) { int th = Timer::Start(); // Assuming an 8 bit byte. for (int shift = 0; shift < sizeof(T)*8; shift += radix_width) { // Like scan, we use a reduce-then-scan approach // But before proceeding, update the shift appropriately // for each kernel. This is how many bits to shift to the // right used in binning. err = clSetKernelArg(reduce, 4, sizeof(cl_int), (void*)&shift); CL_CHECK_ERROR(err); err = clSetKernelArg(bottom_scan, 5, sizeof(cl_int), (void*)&shift); CL_CHECK_ERROR(err); // Also, the sort is not in place, so swap the input and output // buffers on each pass. bool even = ((shift / radix_width) % 2 == 0) ? true : false; if (even) { // Set the kernel arguments for the reduction kernel err = clSetKernelArg(reduce, 0, sizeof(cl_mem), (void*)&d_idata); CL_CHECK_ERROR(err); // Set the kernel arguments for the bottom-level scan err = clSetKernelArg(bottom_scan, 0, sizeof(cl_mem), (void*)&d_idata); CL_CHECK_ERROR(err); err = clSetKernelArg(bottom_scan, 2, sizeof(cl_mem), (void*)&d_odata); CL_CHECK_ERROR(err); } else // i.e. odd pass { // Set the kernel arguments for the reduction kernel err = clSetKernelArg(reduce, 0, sizeof(cl_mem), (void*)&d_odata); CL_CHECK_ERROR(err); // Set the kernel arguments for the bottom-level scan err = clSetKernelArg(bottom_scan, 0, sizeof(cl_mem), (void*)&d_odata); CL_CHECK_ERROR(err); err = clSetKernelArg(bottom_scan, 2, sizeof(cl_mem), (void*)&d_idata); CL_CHECK_ERROR(err); } // Each thread block gets an equal portion of the // input array, and computes occurrences of each digit. err = clEnqueueNDRangeKernel(queue, reduce, 1, NULL, &global_wsize, &local_wsize, 0, NULL, NULL); // Next, a top-level exclusive scan is performed on the // per block histograms. This is done by a single // work group (note global size here is the same as local). err = clEnqueueNDRangeKernel(queue, top_scan, 1, NULL, &local_wsize, &local_wsize, 0, NULL, NULL); // Finally, a bottom-level scan is performed by each block // that is seeded with the scanned histograms which rebins, // locally scans, then scatters keys to global memory err = clEnqueueNDRangeKernel(queue, bottom_scan, 1, NULL, &global_wsize, &local_wsize, 0, NULL, NULL); } err = clFinish(queue); CL_CHECK_ERROR(err); double total_sort = Timer::Stop(th, "total sort time"); err = clEnqueueReadBuffer(queue, d_idata, true, 0, bytes, h_odata, 0, NULL, &evTransfer.CLEvent()); CL_CHECK_ERROR(err); err = clFinish(queue); CL_CHECK_ERROR(err); evTransfer.FillTimingInfo(); double totalTransfer = inTransferTime + evTransfer.StartEndRuntime(); totalTransfer /= 1.e9; // Convert to seconds // If answer is incorrect, stop test and do not report performance if (! verifySort(h_odata, size)) { return; } char atts[1024]; double avgTime = total_sort; double gbs = (double) (size * sizeof(T)) / (1000. * 1000. * 1000.); sprintf(atts, "%d_items", size); resultDB.AddResult(testName, atts, "GB/s", gbs / (avgTime)); resultDB.AddResult(testName+"_PCIe", atts, "GB/s", gbs / (avgTime + totalTransfer)); resultDB.AddResult(testName+"_Parity", atts, "N", totalTransfer / avgTime); } // Clean up device memory err = clReleaseMemObject(d_idata); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_odata); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_isums); CL_CHECK_ERROR(err); // Clean up pinned host memory err = clEnqueueUnmapMemObject(queue, h_i, h_idata, 0, NULL, NULL); CL_CHECK_ERROR(err); err = clEnqueueUnmapMemObject(queue, h_o, h_odata, 0, NULL, NULL); CL_CHECK_ERROR(err); err = clReleaseMemObject(h_i); CL_CHECK_ERROR(err); err = clReleaseMemObject(h_o); CL_CHECK_ERROR(err); // Clean up program and kernel objects err = clReleaseProgram(prog); CL_CHECK_ERROR(err); err = clReleaseKernel(reduce); CL_CHECK_ERROR(err); err = clReleaseKernel(top_scan); CL_CHECK_ERROR(err); err = clReleaseKernel(bottom_scan); CL_CHECK_ERROR(err); }
void runTest(const string& testName, cl_device_id dev, cl_context ctx, cl_command_queue queue, ResultDatabase& resultDB, OptionParser& op, string compileFlags) { // Problem Parameters const int probSizes[4] = { 12288, 24576, 36864, 73728 }; int sizeClass = op.getOptionInt("size"); assert(sizeClass >= 0 && sizeClass < 5); int nAtom = probSizes[sizeClass - 1]; // Allocate problem data on host cl_mem h_pos, h_force, h_neigh; posVecType* position; forceVecType* force; int* neighborList; int passes = op.getOptionInt("passes"); int iter = op.getOptionInt("iterations"); // Allocate and map pinned host memory int err = 0; // Position h_pos = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(posVecType)*nAtom, NULL, &err); CL_CHECK_ERROR(err); position = (posVecType*)clEnqueueMapBuffer(queue, h_pos, true, CL_MAP_READ|CL_MAP_WRITE, 0, sizeof(posVecType)*nAtom , 0, NULL, NULL, &err); CL_CHECK_ERROR(err); // Force h_force = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(forceVecType)*nAtom, NULL, &err); CL_CHECK_ERROR(err); force = (forceVecType*)clEnqueueMapBuffer(queue, h_force, true, CL_MAP_READ|CL_MAP_WRITE, 0, sizeof(forceVecType)*nAtom , 0, NULL, NULL, &err); CL_CHECK_ERROR(err); // Neighbor List h_neigh = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(int) * nAtom * maxNeighbors, NULL, &err); CL_CHECK_ERROR(err); neighborList = (int*)clEnqueueMapBuffer(queue, h_neigh, true, CL_MAP_READ|CL_MAP_WRITE, 0, sizeof(int) * nAtom * maxNeighbors, 0, NULL, NULL, &err); CL_CHECK_ERROR(err); // Allocate device memory cl_mem d_force = clCreateBuffer(ctx, CL_MEM_READ_WRITE, nAtom * sizeof(forceVecType), NULL, &err); CL_CHECK_ERROR(err); cl_mem d_position = clCreateBuffer(ctx, CL_MEM_READ_WRITE, nAtom * sizeof(posVecType), NULL, &err); CL_CHECK_ERROR(err); // Allocate device memory neighbor list cl_mem d_neighborList = clCreateBuffer(ctx, CL_MEM_READ_WRITE, maxNeighbors * nAtom * sizeof(int), NULL, &err); CL_CHECK_ERROR(err); size_t maxGroupSize = getMaxWorkGroupSize(dev); if (maxGroupSize < 128) { cout << "MD requires a work group size of at least 128" << endl; // Add special values to the results database char atts[1024]; sprintf(atts, "GSize_Not_Supported"); for (int i=0 ; i<passes ; ++i) { resultDB.AddResult(testName, atts, "GFLOPS", FLT_MAX); resultDB.AddResult(testName + "_PCIe", atts, "GFLOPS", FLT_MAX); resultDB.AddResult(testName+"-Bandwidth", atts, "GB/s", FLT_MAX); resultDB.AddResult(testName+"-Bandwidth_PCIe", atts, "GB/s", FLT_MAX); resultDB.AddResult(testName+"_Parity", atts, "N", FLT_MAX); } return; } size_t localSize = 128; size_t globalSize = nAtom; cout << "Initializing test problem (this can take several " "minutes for large problems).\n "; // Seed random number generator srand48(8650341L); // Initialize positions -- random distribution in cubic domain for (int i = 0; i < nAtom; i++) { position[i].x = (drand48() * domainEdge); position[i].y = (drand48() * domainEdge); position[i].z = (drand48() * domainEdge); } // Copy position to GPU Event evTransfer("h->d transfer"); err = clEnqueueWriteBuffer(queue, d_position, true, 0, nAtom * sizeof(posVecType), position, 0, NULL, &evTransfer.CLEvent()); CL_CHECK_ERROR(err); err = clFinish(queue); CL_CHECK_ERROR(err); evTransfer.FillTimingInfo(); long transferTime = evTransfer.StartEndRuntime(); // Keep track of how many atoms are within the cutoff distance to // accurately calculate FLOPS later int totalPairs = buildNeighborList<T, posVecType>(nAtom, position, neighborList); cout << "Finished.\n"; cout << totalPairs << " of " << nAtom*maxNeighbors << " pairs within cutoff distance = " << 100.0 * ((double)totalPairs / (nAtom*maxNeighbors)) << " %" << endl; // Copy data to GPU err = clEnqueueWriteBuffer(queue, d_neighborList, true, 0, maxNeighbors * nAtom * sizeof(int), neighborList, 0, NULL, &evTransfer.CLEvent()); CL_CHECK_ERROR(err); clFinish(queue); evTransfer.FillTimingInfo(); transferTime += evTransfer.StartEndRuntime(); // Build the openCL kernel cl_program prog = clCreateProgramWithSource(ctx, 1, &cl_source_md, NULL, &err); CL_CHECK_ERROR(err); err = clBuildProgram(prog, 1, &dev, compileFlags.c_str(), NULL, NULL); CL_CHECK_ERROR(err); // If there is a build error, print the output and return if (err != CL_SUCCESS) { char log[5000]; size_t retsize = 0; err = clGetProgramBuildInfo(prog, dev, CL_PROGRAM_BUILD_LOG, 50000 * sizeof(char), log, &retsize); CL_CHECK_ERROR(err); cout << "Retsize: " << retsize << endl; cout << "Log: " << log << endl; return; } // Extract out the kernels cl_kernel lj_kernel = clCreateKernel(prog, "compute_lj_force", &err); CL_CHECK_ERROR(err); T lj1_t = (T) lj1; T lj2_t = (T) lj2; T cutsq_t = (T) cutsq; // Set kernel arguments err = clSetKernelArg(lj_kernel, 0, sizeof(cl_mem), (void*) &d_force); CL_CHECK_ERROR(err); err = clSetKernelArg(lj_kernel, 1, sizeof(cl_mem), (void*) &d_position); CL_CHECK_ERROR(err); err = clSetKernelArg(lj_kernel, 2, sizeof(cl_int), (void*) &maxNeighbors); CL_CHECK_ERROR(err); err = clSetKernelArg(lj_kernel, 3, sizeof(cl_mem), (void*) &d_neighborList); CL_CHECK_ERROR(err); err = clSetKernelArg(lj_kernel, 4, sizeof(T), (void*) &cutsq_t); CL_CHECK_ERROR(err); err = clSetKernelArg(lj_kernel, 5, sizeof(T), (void*) &lj1_t); CL_CHECK_ERROR(err); err = clSetKernelArg(lj_kernel, 6, sizeof(T), (void*) &lj2_t); CL_CHECK_ERROR(err); err = clSetKernelArg(lj_kernel, 7, sizeof(cl_int), (void*) &nAtom); CL_CHECK_ERROR(err); Event evLJ("computeLJ"); // Warm up the kernel and check correctness err = clEnqueueNDRangeKernel(queue, lj_kernel, 1, NULL, &globalSize, &localSize, 0, NULL, &evLJ.CLEvent()); CL_CHECK_ERROR(err); err = clFinish(queue); CL_CHECK_ERROR(err); err = clEnqueueReadBuffer(queue, d_force, true, 0, nAtom * sizeof(forceVecType), force, 0, NULL, &evTransfer.CLEvent()); CL_CHECK_ERROR(err); err = clFinish(queue); CL_CHECK_ERROR(err); evTransfer.FillTimingInfo(); transferTime += evTransfer.StartEndRuntime(); cout << "Performing Correctness Check (can take several minutes)\n"; // If results are correct, skip the performance tests if (!checkResults<T, forceVecType, posVecType>(force, position, neighborList, nAtom)) { return; } for (int i = 0; i < passes; i++) { double total_time = 0.0; for (int j = 0; j < iter; j++) { //Launch Kernels err = clEnqueueNDRangeKernel(queue, lj_kernel, 1, NULL, &globalSize, &localSize, 0, NULL, &evLJ.CLEvent()); CL_CHECK_ERROR(err); err = clFinish(queue); CL_CHECK_ERROR(err); // Collect timing info from events evLJ.FillTimingInfo(); total_time += evLJ.SubmitEndRuntime(); } char atts[1024]; long int nflops = (8 * nAtom * maxNeighbors) + (totalPairs * 13); sprintf(atts, "%d_atoms", nAtom); total_time /= (double) iter; resultDB.AddResult(testName, atts, "GFLOPS", ((double) nflops) / total_time); resultDB.AddResult(testName + "_PCIe", atts, "GFLOPS", ((double) nflops) / (total_time + transferTime)); long int numPairs = nAtom * maxNeighbors; long int nbytes = (3 * sizeof(T) * (1+numPairs)) + // position data (3 * sizeof(T) * nAtom) + // force for each atom (sizeof(int) * numPairs); // neighbor list double gbytes = (double)nbytes / (1000. * 1000. * 1000.); double seconds = total_time / 1.e9; resultDB.AddResult(testName+"-Bandwidth", atts, "GB/s", gbytes / seconds); resultDB.AddResult(testName+"-Bandwidth_PCIe", atts, "GB/s", gbytes / (seconds + (transferTime / 1.e9))); resultDB.AddResult(testName+"_Parity", atts, "N", (transferTime / 1.e9) / seconds); } // Clean up // Host memory err = clEnqueueUnmapMemObject(queue, h_pos, position, 0, NULL, NULL); CL_CHECK_ERROR(err); err = clEnqueueUnmapMemObject(queue, h_force, force, 0, NULL, NULL); CL_CHECK_ERROR(err); err = clEnqueueUnmapMemObject(queue, h_neigh, neighborList, 0, NULL, NULL); CL_CHECK_ERROR(err); err = clReleaseMemObject(h_pos); CL_CHECK_ERROR(err); err = clReleaseMemObject(h_force); CL_CHECK_ERROR(err); err = clReleaseMemObject(h_neigh); CL_CHECK_ERROR(err); // Program Objects err = clReleaseProgram(prog); CL_CHECK_ERROR(err); err = clReleaseKernel(lj_kernel); CL_CHECK_ERROR(err); // Device Memory err = clReleaseMemObject(d_force); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_position); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_neighborList); CL_CHECK_ERROR(err); }
void DoTest( const char* timerDesc, ResultDatabase& resultDB, OptionParser& opts ) { StencilFactory<T>* stdStencilFactory = NULL; Stencil<T>* stdStencil = NULL; StencilFactory<T>* testStencilFactory = NULL; Stencil<T>* testStencil = NULL; //try { stdStencilFactory = new HostStencilFactory<T>; testStencilFactory = new MICStencilFactory<T>; assert( (stdStencilFactory != NULL) && (testStencilFactory != NULL) ); // do a sanity check on option values CheckOptions( opts ); stdStencilFactory->CheckOptions( opts ); testStencilFactory->CheckOptions( opts ); // extract and validate options std::vector<long long> arrayDims = opts.getOptionVecInt( "customSize" ); if( arrayDims.size() != 2 ) { cerr << "Dim size: " << arrayDims.size() << "\n"; //throw InvalidArgValue( "all overall dimensions must be positive" ); } if (arrayDims[0] == 0) // User has not specified a custom size { const int probSizes[4] = { 768, 1408, 2048, 4096 }; int sizeClass = opts.getOptionInt("size"); if (!(sizeClass >= 0 && sizeClass < 5)) { //throw InvalidArgValue( "Size class must be between 1-4" ); } arrayDims[0] = arrayDims[1] =probSizes[sizeClass - 1]; } long int seed = (long)opts.getOptionInt( "seed" ); bool beVerbose = opts.getOptionBool( "verbose" ); unsigned int nIters = (unsigned int)opts.getOptionInt( "num-iters" ); double valErrThreshold = (double)opts.getOptionFloat( "val-threshold" ); unsigned int nValErrsToPrint = (unsigned int)opts.getOptionInt( "val-print-limit" ); #if defined(PARALLEL) unsigned int haloWidth = (unsigned int)opts.getOptionInt( "iters-per-exchange" ); #else unsigned int haloWidth = 1; #endif // defined(PARALLEL) float haloVal = (float)opts.getOptionFloat( "haloVal" ); // build a description of this experiment std::ostringstream experimentDescriptionStr; experimentDescriptionStr << nIters << ':' << arrayDims[0] << 'x' << arrayDims[1] << ':' << LROWS << 'x' << LCOLS; unsigned int nPasses =(unsigned int)opts.getOptionInt( "passes" ); unsigned long npts = (arrayDims[0] + 2*haloWidth - 2) * (arrayDims[1] + 2*haloWidth - 2); unsigned long nflops = npts * 11 * nIters; cout<<"flops are = "<<nflops<<endl; // compute the expected result on the host #if defined(PARALLEL) int cwrank; MPI_Comm_rank( MPI_COMM_WORLD, &cwrank ); if( cwrank == 0 ) { #endif // defined(PARALLEL) std::cout << "\nPerforming stencil operation on host for later comparison with MIC output\n" << "Depending on host capabilities, this may take a while." << std::endl; #if defined(PARALLEL) } #endif // defined(PARALLEL) Matrix2D<T> exp( arrayDims[0] + 2*haloWidth, arrayDims[1] + 2*haloWidth ); Initialize<T> init( seed, haloWidth, haloVal ); init( exp ); if( beVerbose ) { std::cout << "initial state:\n" << exp << std::endl; } Stencil<T>* stdStencil = stdStencilFactory->BuildStencil( opts ); (*stdStencil)( exp, nIters ); if( beVerbose ) { std::cout << "expected result:\n" << exp << std::endl; } // compute the result on the MIC device Matrix2D<T> data( arrayDims[0] + 2*haloWidth, arrayDims[1] + 2*haloWidth ); Stencil<T>* testStencil = testStencilFactory->BuildStencil( opts ); #if defined(PARALLEL) MPI_Comm_rank( MPI_COMM_WORLD, &cwrank ); if( cwrank == 0 ) { #endif // defined(PARALLEL) std::cout << "\nPerforming stencil operation on chosen device, " << nPasses << " passes.\n" << "Depending on chosen device, this may take a while." << std::endl; #if defined(PARALLEL) } #endif // defined(PARALLEL) #if !defined(PARALLEL) std::cout << "At the end of each pass the number of validation\nerrors observed will be printed to the standard output." << std::endl; #endif // !defined(PARALLEL) std::cout<<"Passes:"<<nPasses<<endl; for( unsigned int pass = 0; pass < nPasses; pass++ ) { init( data ); double start = curr_second(); (*testStencil)( data, nIters ); double elapsedTime = curr_second()-start; double gflops = (nflops / elapsedTime) / 1e9; resultDB.AddResult( timerDesc, experimentDescriptionStr.str(), "GFLOPS", gflops ); if( beVerbose ) { std::cout << "observed result, pass " << pass << ":\n" << data << std::endl; } // validate the result #if defined(PARALLEL) //StencilValidater<T>* validater = new MPIStencilValidater<T>; #else //StencilValidater<T>* validater = new SerialStencilValidater<T>; #endif // defined(PARALLEL) MICValidate(exp,data,valErrThreshold,nValErrsToPrint); /*validater->ValidateResult( exp, data, valErrThreshold, nValErrsToPrint );*/ } } /* catch( ... ) { // clean up - abnormal termination // wish we didn't have to do this, but C++ exceptions do not // support a try-catch-finally approach delete stdStencil; delete stdStencilFactory; delete testStencil; delete testStencilFactory; throw; }*/ // clean up - normal termination delete stdStencil; delete stdStencilFactory; delete testStencil; delete testStencilFactory; }
void RunTest(ResultDatabase &resultDB, const int npasses, const int verbose, const int noPB, const float repeatF, ProgressBar &pb, const char* precision, const int micdev) { char sizeStr[128]; static __declspec(target(mic)) T *hostMem; int realRepeats = (int)round(repeatF*20); if (realRepeats < 2) realRepeats = 2; // Allocate host memory int halfNumFloats = 1024*1024; int numFloats = 2*halfNumFloats; hostMem = (T*)_mm_malloc(sizeof(T)*numFloats,64); sprintf (sizeStr, "Size:%07d", numFloats); float t = 0.0f; double TH; double flopCount; double gflop; for (int pass=0 ; pass<npasses ; ++pass) { ////////// Add1 ////////// InitData<T>(hostMem,numFloats); #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0)) {} TH = curr_second(); #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem) { Add1_MIC<T>(numFloats,hostMem, realRepeats, 10.0); } t = curr_second()-TH; flopCount = (double)numFloats * realRepeats * omp_get_num_threads(); gflop = flopCount / (double)(t*1e9); resultDB.AddResult(string("Add1")+precision, sizeStr, "GFLOPS", gflop); #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0)) {} CheckResults<T>(hostMem,numFloats); pb.addItersDone(); if (!verbose && !noPB)pb.Show(stdout); ////////// Add2 ////////// InitData<T>(hostMem,numFloats); #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0)) {} TH = curr_second(); #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem) { Add2_MIC<T>(numFloats,hostMem, realRepeats, 10.0); } t = curr_second()-TH; flopCount = (double)numFloats * realRepeats * 120 * 2; gflop = flopCount / (double)(t*1e9); resultDB.AddResult(string("Add2")+precision, sizeStr, "GFLOPS", gflop); #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0)) {} CheckResults<T>(hostMem,numFloats); pb.addItersDone(); if (!verbose && !noPB)pb.Show(stdout); ////////// Add4 ////////// InitData<T>(hostMem,numFloats); #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0)) {} TH = curr_second(); #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem) { Add4_MIC<T>(numFloats,hostMem, realRepeats, 10.0); } t = curr_second()-TH; flopCount = (double)numFloats * realRepeats * 60 * 4; gflop = flopCount / (double)(t*1e9); resultDB.AddResult(string("Add4")+precision, sizeStr, "GFLOPS", gflop); #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0)) {} CheckResults<T>(hostMem,numFloats); pb.addItersDone(); if (!verbose && !noPB)pb.Show(stdout); ////////// Add8 ////////// InitData<T>(hostMem,numFloats); #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0)) {} TH = curr_second(); #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem) { Add8_MIC<T>(numFloats,hostMem, realRepeats, 10.0); } t = curr_second()-TH; flopCount = (double)numFloats * realRepeats * 80 * 3; gflop = flopCount / (double)(t*1e9); resultDB.AddResult(string("Add8")+precision, sizeStr, "GFLOPS", gflop); #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0)) {} CheckResults<T>(hostMem,numFloats); pb.addItersDone(); if (!verbose && !noPB)pb.Show(stdout); ////////// Mul1 ////////// InitData<T>(hostMem,numFloats); #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0)) {} TH = curr_second(); #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem) { Mul1_MIC<T>(numFloats,hostMem, realRepeats, 1.01); } t = curr_second()-TH; flopCount = (double)numFloats * 2 * realRepeats * 200; gflop = flopCount / (double)(t*1e9); resultDB.AddResult(string("Mul1")+precision, sizeStr, "GFLOPS", gflop); #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0)) {} CheckResults<T>(hostMem,numFloats); pb.addItersDone(); if (!verbose && !noPB)pb.Show(stdout); ////////// Mul2 ////////// InitData<T>(hostMem,numFloats); #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0)) {} TH = curr_second(); #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem) { Mul2_MIC<T>(numFloats,hostMem, realRepeats, 1.01); } t = curr_second()-TH; flopCount = (double)numFloats * 2 * realRepeats * 100 * 2; gflop = flopCount / (double)(t*1e9); resultDB.AddResult(string("Mul2")+precision, sizeStr, "GFLOPS", gflop); #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0)) {} CheckResults<T>(hostMem,numFloats); pb.addItersDone(); if (!verbose && !noPB)pb.Show(stdout); ////////// Mul4 ////////// InitData<T>(hostMem,numFloats); #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0)) {} TH = curr_second(); #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem) { Mul4_MIC<T>(numFloats,hostMem, realRepeats, 1.01); } t = curr_second()-TH; flopCount = (double)numFloats * 2 * realRepeats * 50 * 4; gflop = flopCount / (double)(t*1e9); resultDB.AddResult(string("Mul4")+precision, sizeStr, "GFLOPS", gflop); #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0)) {} CheckResults<T>(hostMem,numFloats); pb.addItersDone(); if (!verbose && !noPB)pb.Show(stdout); ////////// Mul8 ////////// InitData<T>(hostMem,numFloats); #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0)) {} TH = curr_second(); #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem) { Mul8_MIC<T>(numFloats,hostMem, realRepeats, 1.01); } t = curr_second()-TH; flopCount = (double)numFloats * 2 * realRepeats * 25 * 8; gflop = flopCount / (double)(t*1e9); resultDB.AddResult(string("Mul8")+precision, sizeStr, "GFLOPS", gflop); #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0)) {} CheckResults<T>(hostMem,numFloats); pb.addItersDone(); if (!verbose && !noPB)pb.Show(stdout); ////////// MAdd1 ////////// InitData<T>(hostMem,numFloats); #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0)) {} TH = curr_second(); #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem) { MAdd1_MIC<T>(numFloats,hostMem, realRepeats, 10.0, 0.9899); } t = curr_second()-TH; flopCount = (double)numFloats * 2 * realRepeats * omp_get_num_threads() * 1; gflop = flopCount / (double)(t*1e9); resultDB.AddResult(string("MAdd1")+precision, sizeStr, "GFLOPS", gflop); #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0)) {} CheckResults<T>(hostMem,numFloats); pb.addItersDone(); if (!verbose && !noPB)pb.Show(stdout); ////////// MAdd2 ////////// InitData<T>(hostMem,numFloats); #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0)) {} TH = curr_second(); #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem) { MAdd2_MIC<T>(numFloats,hostMem, realRepeats, 10.0, 0.9899); } t = curr_second()-TH; flopCount = (double)numFloats * 2 * realRepeats * 120 * 2; gflop = flopCount / (double)(t*1e9); resultDB.AddResult(string("MAdd2")+precision, sizeStr, "GFLOPS", gflop); #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0)) {} CheckResults<T>(hostMem,numFloats); pb.addItersDone(); if (!verbose && !noPB)pb.Show(stdout); ////////// MAdd4 ////////// InitData<T>(hostMem,numFloats); #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0)) {} TH = curr_second(); #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem) { MAdd4_MIC<T>(numFloats,hostMem, realRepeats, 10.0, 0.9899); } t = curr_second()-TH; flopCount = (double)numFloats * 2 * realRepeats * 60 * 4; gflop = flopCount / (double)(t*1e9); resultDB.AddResult(string("MAdd4")+precision, sizeStr, "GFLOPS", gflop); #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0)) {} CheckResults<T>(hostMem,numFloats); pb.addItersDone(); if (!verbose && !noPB)pb.Show(stdout); ////////// MAdd8 ////////// InitData<T>(hostMem,numFloats); #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0)) {} TH = curr_second(); #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem) { MAdd8_MIC<T>(numFloats,hostMem, realRepeats, 10.0, 0.9899); } t = curr_second()-TH; flopCount = (double)numFloats * 2 * realRepeats * 30 * 8; gflop = flopCount / (double)(t*1e9); resultDB.AddResult(string("MAdd8")+precision, sizeStr, "GFLOPS", gflop); #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0)) {} CheckResults<T>(hostMem,numFloats); pb.addItersDone(); if (!verbose && !noPB)pb.Show(stdout); ////////// MulMAdd1 ////////// InitData<T>(hostMem,numFloats); #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0)) {} TH = curr_second(); #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem) { MulMAdd1_MIC<T>(numFloats,hostMem, realRepeats, 3.75, 0.355); } t = curr_second()-TH; flopCount = (double)numFloats * 3 * realRepeats * 160 * 1; gflop = flopCount / (double)(t*1e9); resultDB.AddResult(string("MulMAdd1")+precision, sizeStr, "GFLOPS", gflop); #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0)) {} CheckResults<T>(hostMem,numFloats); pb.addItersDone(); if (!verbose && !noPB)pb.Show(stdout); ////////// MulMAdd2 ////////// InitData<T>(hostMem,numFloats); #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0)) {} TH = curr_second(); #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem) { MulMAdd2_MIC<T>(numFloats,hostMem, realRepeats, 3.75, 0.355); } t = curr_second()-TH; flopCount = (double)numFloats * 3 * realRepeats * 80 * 2; gflop = flopCount / (double)(t*1e9); resultDB.AddResult(string("MulMAdd2")+precision, sizeStr, "GFLOPS", gflop); #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0)) {} CheckResults<T>(hostMem,numFloats); pb.addItersDone(); if (!verbose && !noPB)pb.Show(stdout); ////////// MulMAdd4 ////////// InitData<T>(hostMem,numFloats); #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0)) {} TH = curr_second(); #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem) { MulMAdd4_MIC<T>(numFloats,hostMem, realRepeats, 3.75, 0.355); } t = curr_second()-TH; flopCount = (double)numFloats * 3 * realRepeats * 40 * 4; gflop = flopCount / (double)(t*1e9); resultDB.AddResult(string("MulMAdd4")+precision, sizeStr, "GFLOPS", gflop); #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0)) {} CheckResults<T>(hostMem,numFloats); pb.addItersDone(); if (!verbose && !noPB)pb.Show(stdout); ////////// MulMAdd8 ////////// InitData<T>(hostMem,numFloats); #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0)) {} TH = curr_second(); #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem) { MulMAdd8_MIC<T>(numFloats,hostMem, realRepeats, 3.75, 0.355); } t = curr_second()-TH; flopCount = (double)numFloats * 3 * realRepeats * 20 * 8; gflop = flopCount / (double)(t*1e9); resultDB.AddResult(string("MulMAdd8")+precision, sizeStr, "GFLOPS", gflop); #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0)) {} CheckResults<T>(hostMem,numFloats); pb.addItersDone(); if (!verbose && !noPB)pb.Show(stdout); } _mm_free(hostMem); }
void RunBenchmark(OptionParser &op, ResultDatabase &resultDB) { const bool verbose = op.getOptionBool("verbose"); // Sizes are in kb const int nSizes = 17; int sizes[nSizes] = {1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384, 32768, 65536}; long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; // Create host memory hostMem = (float*)_mm_malloc(numMaxFloats*sizeof(float),ALIGN); if(hostMem==NULL) { cerr << "Couldn't allocate CPU memory! \n"; cerr << "Test failed." << endl; return; } // Initialize memory with some pattern. for (int i = 0; i < numMaxFloats; i++) { hostMem[i] = i % 77; } const unsigned int passes = op.getOptionInt("passes"); int micdev = op.getOptionInt("target"); // Allocate memory on the card #pragma offload target(mic:micdev) \ nocopy(hostMem:length(numMaxFloats) alloc_if(1) free_if(0) align(ALIGN) ) { } // Three passes, forward and backward both for (int pass = 0; pass < passes; pass++) { // Step through sizes forward on even passes and backward on odd for (int i = 0; i < nSizes; i++) { int sizeIndex; if ((pass % 2) == 0) { sizeIndex = i; } else { sizeIndex = (nSizes - 1) - i; } int nbytes = sizes[sizeIndex] * 1024; // D->H test double start = curr_second(); #pragma offload target(mic:micdev) \ out(hostMem:length((1024*sizes[sizeIndex]/4)) \ free_if(0) alloc_if(0) ) { } double t = curr_second()-start; if (verbose) { cerr << "Size " << sizes[sizeIndex] << "k took " << t << " sec\n"; } double speed = (double(sizes[sizeIndex]) * 1024 / (1000. * 1000. * 1000.)) / t; char sizeStr[256]; sprintf(sizeStr, "% 6dkB", sizes[sizeIndex]); resultDB.AddResult("ReadbackSpeed", sizeStr, "GB/sec", speed); resultDB.AddResult("ReadbackTime", sizeStr, "ms", t*1000); } } // Free memory allocated on the mic #pragma offload target(mic:micdev) \ in(hostMem:length(numMaxFloats) alloc_if(0) ) { } // Cleanup _mm_free(hostMem); }
void runTest(const string& testName, cl_device_id dev, cl_context ctx, cl_command_queue queue, ResultDatabase& resultDB, OptionParser& op, const string& compileFlags) { // Collect basic MPI information int mpi_size, mpi_rank; MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); int err; int waitForEvents = 1; // Program Setup cl_program prog = clCreateProgramWithSource(ctx, 1, &cl_source_reduction, NULL, &err); CL_CHECK_ERROR(err); if (mpi_rank == 0) { cout << "Compiling reduction kernel." << endl; } err = clBuildProgram(prog, 1, &dev, compileFlags.c_str(), NULL, NULL); CL_CHECK_ERROR(err); if (err != 0) { char log[5000]; size_t retsize = 0; err = clGetProgramBuildInfo(prog, dev, CL_PROGRAM_BUILD_LOG, 5000*sizeof(char), log, &retsize); CL_CHECK_ERROR(err); cout << "Build error." << endl; cout << "Retsize: " << retsize << endl; cout << "Log: " << log << endl; return; } // Extract out the kernels cl_kernel reduce = clCreateKernel(prog, "reduce", &err); CL_CHECK_ERROR(err); cl_kernel cpureduce = clCreateKernel(prog, "reduceNoLocal", &err); CL_CHECK_ERROR(err); size_t localWorkSize = 256; bool nolocal = false; if (getMaxWorkGroupSize(ctx, reduce) == 1) { nolocal = true; localWorkSize = 1; } int probSizes[4] = { 1, 8, 64, 128 }; int size = probSizes[op.getOptionInt("size")-1]; size = (size * 1024 * 1024) / sizeof(T); unsigned int bytes = size * sizeof(T); // Allocate pinned host memory for input data cl_mem h_i = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, bytes, NULL, &err); CL_CHECK_ERROR(err); T* h_idata = (T*)clEnqueueMapBuffer(queue, h_i, true, CL_MAP_READ|CL_MAP_WRITE, 0, bytes, 0, NULL, NULL, &err); CL_CHECK_ERROR(err); // Initialize host memory if (mpi_rank == 0) { cout << "Initializing host memory." << endl; } for(int i=0; i<size; i++) { h_idata[i] = i % 2; //Fill with some pattern } // Allocate device memory for input data cl_mem d_idata = clCreateBuffer(ctx, CL_MEM_READ_WRITE, bytes, NULL, &err); CL_CHECK_ERROR(err); int num_blocks; if (!nolocal) { num_blocks = 64; } else { num_blocks = 1; // NB: This should only be the case on Apple's CPU // implementation, which is quite restrictive on // work group sizes. } // Allocate host memory for output cl_mem h_o = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(T)*num_blocks, NULL, &err); CL_CHECK_ERROR(err); T* h_odata = (T*)clEnqueueMapBuffer(queue, h_o, true, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(T) * num_blocks , 0, NULL, NULL, &err); CL_CHECK_ERROR(err); // Allocate device memory for output cl_mem d_odata = clCreateBuffer(ctx, CL_MEM_READ_WRITE, num_blocks * sizeof(T), NULL, &err); CL_CHECK_ERROR(err); // Copy data to GPU Event evTransfer("PCIe Transfer"); err = clEnqueueWriteBuffer(queue, d_idata, true, 0, bytes, h_idata, 0, NULL, &evTransfer.CLEvent()); CL_CHECK_ERROR(err); evTransfer.FillTimingInfo(); double inputTransfer = evTransfer.StartEndRuntime(); err = clSetKernelArg(reduce, 0, sizeof(cl_mem), (void*)&d_idata); CL_CHECK_ERROR(err); err = clSetKernelArg(reduce, 1, sizeof(cl_mem), (void*)&d_odata); CL_CHECK_ERROR(err); err = clSetKernelArg(reduce, 2, localWorkSize * sizeof(T), NULL); CL_CHECK_ERROR(err); err = clSetKernelArg(reduce, 3, sizeof(cl_int), (void*)&size); CL_CHECK_ERROR(err); err = clSetKernelArg(cpureduce, 0, sizeof(cl_mem), (void*)&d_idata); CL_CHECK_ERROR(err); err = clSetKernelArg(cpureduce, 1, sizeof(cl_mem), (void*)&d_odata); CL_CHECK_ERROR(err); err = clSetKernelArg(cpureduce, 2, sizeof(cl_int), (void*)&size); CL_CHECK_ERROR(err); size_t globalWorkSize; if (!nolocal) { globalWorkSize = localWorkSize * 64; // Use 64 work groups } else { globalWorkSize = 1; } int passes = op.getOptionInt("passes"); int iters = op.getOptionInt("iterations"); if (mpi_rank == 0) { cout << "Running benchmark." << endl; } for (int k = 0; k < passes; k++) { // Synch processes at the start of each test. MPI_Barrier(MPI_COMM_WORLD); double totalReduceTime = 0.0; Event evKernel("reduce kernel"); for (int m = 0; m < iters; m++) { if (nolocal) { err = clEnqueueNDRangeKernel(queue, cpureduce, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, &evKernel.CLEvent()); } else { err = clEnqueueNDRangeKernel(queue, reduce, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, &evKernel.CLEvent()); } CL_CHECK_ERROR(err); err = clFinish(queue); CL_CHECK_ERROR (err); evKernel.FillTimingInfo(); totalReduceTime += (evKernel.SubmitEndRuntime() / 1.e9); } err = clEnqueueReadBuffer(queue, d_odata, true, 0, num_blocks*sizeof(T), h_odata, 0, NULL, &evTransfer.CLEvent()); CL_CHECK_ERROR(err); evTransfer.FillTimingInfo(); double totalTransfer = (inputTransfer + evTransfer.StartEndRuntime()) / 1.e9; T local_result = 0.0f, global_result = 0.0f; // Start a wallclock timer for MPI int TH_global = Timer::Start(); // Perform reduction of block sums and MPI allreduce call for (int m = 0; m < iters; m++) { local_result = 0.0f; for (int i=0; i<num_blocks; i++) { local_result += h_odata[i]; } global_result = 0.0f; globalReduction(&local_result, &global_result); } double mpi_time = Timer::Stop(TH_global,"global all reduce") / iters; // Compute local reference solution T cpu_result = reduceCPU<T>(h_idata, size); // Use some error threshold for floating point rounding double threshold = 1.0e-6; T diff = fabs(local_result - cpu_result); if (diff > threshold) { cout << "Error in local reduction detected in rank " << mpi_rank << "\n"; cout << "Diff: " << diff << endl; } if (global_result != (mpi_size * local_result)) { cout << "Test Failed, error in global all reduce detected in rank " << mpi_rank << endl; } else { if (mpi_rank == 0) { cout << "Test Passed.\n"; } } // Calculate results char atts[1024]; sprintf(atts, "%d_itemsPerRank",size); double local_gbytes = (double)(size*sizeof(T))/(1000.*1000.*1000.); double global_gbytes = local_gbytes * mpi_size; totalReduceTime /= iters; // use average time over the iterations resultDB.AddResult(testName+"-Kernel", atts, "GB/s", global_gbytes / totalReduceTime); resultDB.AddResult(testName+"-Kernel+PCIe", atts, "GB/s", global_gbytes / (totalReduceTime + totalTransfer)); resultDB.AddResult(testName+"-MPI_Allreduce", atts, "GB/s", (sizeof(T)*mpi_size*1.e-9) / (mpi_time)); resultDB.AddResult(testName+"-Overall", atts, "GB/s", global_gbytes / (totalReduceTime + totalTransfer + mpi_time)); } err = clEnqueueUnmapMemObject(queue, h_i, h_idata, 0, NULL, NULL); CL_CHECK_ERROR(err); err = clEnqueueUnmapMemObject(queue, h_o, h_odata, 0, NULL, NULL); CL_CHECK_ERROR(err); err = clReleaseMemObject(h_i); CL_CHECK_ERROR(err); err = clReleaseMemObject(h_o); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_idata); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_odata); CL_CHECK_ERROR(err); err = clReleaseProgram(prog); CL_CHECK_ERROR(err); err = clReleaseKernel(reduce); CL_CHECK_ERROR(err); }
void RunBenchmark( cl::Device& dev, cl::Context& ctx, cl::CommandQueue& queue, ResultDatabase& resultDB, OptionParser& op ) { #if defined(PARALLEL) int cwrank; #endif // defined(PARALLEL) // single precision DoTest<float>( "SP_Sten2D", dev, ctx, queue, resultDB, op, "-DSINGLE_PRECISION" ); // double precision - might not be supported if( checkExtension( dev, "cl_khr_fp64" )) { #if defined(PARALLEL) MPI_Comm_rank( MPI_COMM_WORLD, &cwrank ); if( cwrank == 0 ) { #endif // defined(PARALLEL) std::cout << "\nDP supported\n"; #if defined(PARALLEL) } #endif // defined(PARALLEL) DoTest<double>( "DP_Sten2D", dev, ctx, queue, resultDB, op, "-DK_DOUBLE_PRECISION" ); } else if( checkExtension( dev, "cl_amd_fp64" )) { #if defined(PARALLEL) MPI_Comm_rank( MPI_COMM_WORLD, &cwrank ); if( cwrank == 0 ) { #endif // defined(PARALLEL) std::cout << "\nDP supported\n"; #if defined(PARALLEL) } #endif // defined(PARALLEL) DoTest<double>( "DP_Sten2D", dev, ctx, queue, resultDB, op, "-DAMD_DOUBLE_PRECISION" ); } else { #if defined(PARALLEL) MPI_Comm_rank( MPI_COMM_WORLD, &cwrank ); if( cwrank == 0 ) { #endif // defined(PARALLEL) std::cout << "\nDP not supported\n"; #if defined(PARALLEL) } #endif // defined(PARALLEL) // resultDB requires an entry for every possible result int nPasses = (int)op.getOptionInt( "passes" ); for( unsigned int p = 0; p < nPasses; p++ ) { resultDB.AddResult( (const char*)"DP_Sten2D", "N/A", "GFLOPS", FLT_MAX ); } } std::cout << '\n' << std::endl; }
void ellPackTest(cl_device_id dev, cl_context ctx, string compileFlags, cl_command_queue queue, ResultDatabase& resultDB, OptionParser& op, floatType* h_val, int* h_cols, int* h_rowDelimiters, floatType* h_vec, floatType* h_out, int numRows, int numNonZeroes, floatType* refOut, bool padded, int paddedSize, const size_t maxImgWidth) { if (devSupportsImages) { char texflags[64]; sprintf(texflags," -DUSE_TEXTURE -DMAX_IMG_WIDTH=%ld", maxImgWidth); compileFlags+=string(texflags); } // Set up OpenCL Program Object int err = 0; cl_program prog = clCreateProgramWithSource(ctx, 1, &cl_source_spmv, NULL, &err); CL_CHECK_ERROR(err); // Build the openCL kernels err = clBuildProgram(prog, 1, &dev, compileFlags.c_str(), NULL, NULL); CL_CHECK_ERROR(err); // If there is a build error, print the output and return if (err != CL_SUCCESS) { char log[5000]; size_t retsize = 0; err = clGetProgramBuildInfo(prog, dev, CL_PROGRAM_BUILD_LOG, 50000 * sizeof(char), log, &retsize); CL_CHECK_ERROR(err); cout << "Retsize: " << retsize << endl; cout << "Log: " << log << endl; return; } int *h_rowLengths = new int[paddedSize]; int maxrl = 0; for (int k=0; k<numRows; k++) { h_rowLengths[k] = h_rowDelimiters[k+1] - h_rowDelimiters[k]; if (h_rowLengths[k] > maxrl) { maxrl = h_rowLengths[k]; } } for (int p=numRows; p < paddedSize; p++) { h_rowLengths[p] = 0; } // Column major format host data structures int cmSize = padded ? paddedSize : numRows; floatType *h_valcm = new floatType[maxrl * cmSize]; int *h_colscm = new int[maxrl * cmSize]; convertToColMajor(h_val, h_cols, numRows, h_rowDelimiters, h_valcm, h_colscm, h_rowLengths, maxrl, padded); // Device data structures cl_mem d_val, d_vec, d_out; // floating point cl_mem d_cols, d_rowLengths; // integer // Allocate device memory d_val = clCreateBuffer(ctx, CL_MEM_READ_WRITE, maxrl * cmSize * sizeof(clFloatType), NULL, &err); CL_CHECK_ERROR(err); d_cols = clCreateBuffer(ctx, CL_MEM_READ_WRITE, maxrl * cmSize * sizeof(int), NULL, &err); CL_CHECK_ERROR(err); int imgHeight = 0; if (devSupportsImages) { imgHeight=(numRows+maxImgWidth-1)/maxImgWidth; cl_image_format fmt; fmt.image_channel_data_type=CL_FLOAT; if(sizeof(floatType)==4) fmt.image_channel_order=CL_R; else fmt.image_channel_order=CL_RG; d_vec = clCreateImage2D( ctx, CL_MEM_READ_ONLY, &fmt, maxImgWidth, imgHeight, 0, NULL, &err); CL_CHECK_ERROR(err); } else { d_vec = clCreateBuffer(ctx, CL_MEM_READ_WRITE, numRows * sizeof(clFloatType), NULL, &err); CL_CHECK_ERROR(err); } d_out = clCreateBuffer(ctx, CL_MEM_READ_WRITE, paddedSize * sizeof(clFloatType), NULL, &err); CL_CHECK_ERROR(err); d_rowLengths = clCreateBuffer(ctx, CL_MEM_READ_WRITE, cmSize * sizeof(int), NULL, &err); CL_CHECK_ERROR(err); // Setup events for timing Event valTransfer("transfer Val data over PCIe bus"); Event colsTransfer("transfer cols data over PCIe bus"); Event vecTransfer("transfer vec data over PCIe bus"); Event rowLengthsTransfer("transfer rowLengths data over PCIe bus"); // Transfer data to device err = clEnqueueWriteBuffer(queue, d_val, true, 0, maxrl * cmSize * sizeof(clFloatType), h_valcm, 0, NULL, &valTransfer.CLEvent()); CL_CHECK_ERROR(err); err = clEnqueueWriteBuffer(queue, d_cols, true, 0, maxrl * cmSize * sizeof(cl_int), h_colscm, 0, NULL, &colsTransfer.CLEvent()); CL_CHECK_ERROR(err); if (devSupportsImages) { size_t offset[3]={0}; size_t size[3]={maxImgWidth,(size_t)imgHeight,1}; err = clEnqueueWriteImage(queue,d_vec, true, offset, size, 0, 0, h_vec, 0, NULL, &vecTransfer.CLEvent()); CL_CHECK_ERROR(err); } else { err = clEnqueueWriteBuffer(queue, d_vec, true, 0, numRows * sizeof(clFloatType), h_vec, 0, NULL, &vecTransfer.CLEvent()); CL_CHECK_ERROR(err); } err = clEnqueueWriteBuffer(queue, d_rowLengths, true, 0, cmSize * sizeof(int), h_rowLengths, 0, NULL, &rowLengthsTransfer.CLEvent()); CL_CHECK_ERROR(err); err = clFinish(queue); CL_CHECK_ERROR(err); valTransfer.FillTimingInfo(); colsTransfer.FillTimingInfo(); vecTransfer.FillTimingInfo(); rowLengthsTransfer.FillTimingInfo(); double iTransferTime = valTransfer.StartEndRuntime() + colsTransfer.StartEndRuntime() + vecTransfer.StartEndRuntime() + rowLengthsTransfer.StartEndRuntime(); // Set up kernel arguments cl_kernel ellpackr = clCreateKernel(prog, "spmv_ellpackr_kernel", &err); CL_CHECK_ERROR(err); err = clSetKernelArg(ellpackr, 0, sizeof(cl_mem), (void*) &d_val); CL_CHECK_ERROR(err); err = clSetKernelArg(ellpackr, 1, sizeof(cl_mem), (void*) &d_vec); CL_CHECK_ERROR(err); err = clSetKernelArg(ellpackr, 2, sizeof(cl_mem), (void*) &d_cols); CL_CHECK_ERROR(err); err = clSetKernelArg(ellpackr, 3, sizeof(cl_mem), (void*) &d_rowLengths); CL_CHECK_ERROR(err); err = clSetKernelArg(ellpackr, 4, sizeof(cl_int), (void*) &cmSize); CL_CHECK_ERROR(err); err = clSetKernelArg(ellpackr, 5, sizeof(cl_mem), (void*) &d_out); CL_CHECK_ERROR(err); const size_t globalWorkSize = cmSize; const size_t localWorkSize = BLOCK_SIZE; Event kernelExec("ELLPACKR Kernel Execution"); int passes = op.getOptionInt("passes"); int iters = op.getOptionInt("iterations"); for (int k = 0; k < passes; k++) { double totalKernelTime = 0.0; for (int j = 0; j < iters; j++) { err = clEnqueueNDRangeKernel(queue, ellpackr, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, &kernelExec.CLEvent()); CL_CHECK_ERROR(err); err = clFinish(queue); CL_CHECK_ERROR(err); kernelExec.FillTimingInfo(); totalKernelTime += kernelExec.StartEndRuntime(); } Event outTransfer("d->h data transfer"); err = clEnqueueReadBuffer(queue, d_out, true, 0, numRows * sizeof(clFloatType), h_out, 0, NULL, &outTransfer.CLEvent()); CL_CHECK_ERROR(err); err = clFinish(queue); CL_CHECK_ERROR(err); outTransfer.FillTimingInfo(); double oTransferTime = outTransfer.StartEndRuntime(); // Compare reference solution to GPU result if (! verifyResults(refOut, h_out, numRows, k)) { return; } char atts[TEMP_BUFFER_SIZE]; char benchName[TEMP_BUFFER_SIZE]; double avgTime = totalKernelTime / (double)iters; sprintf(atts, "%d_elements_%d_rows", numNonZeroes, cmSize); double gflop = 2 * (double) numNonZeroes; bool dpTest = (sizeof(floatType) == sizeof(double)); sprintf(benchName, "%sELLPACKR-%s", padded ? "Padded_":"", dpTest ? "DP":"SP"); resultDB.AddResult(benchName, atts, "Gflop/s", gflop/avgTime); sprintf(benchName, "%s_PCIe", benchName); resultDB.AddResult(benchName, atts, "Gflop/s", gflop / (avgTime + iTransferTime + oTransferTime)); } err = clReleaseProgram(prog); CL_CHECK_ERROR(err); err = clReleaseKernel(ellpackr); CL_CHECK_ERROR(err); // Free device memory err = clReleaseMemObject(d_rowLengths); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_vec); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_out); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_val); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_cols); CL_CHECK_ERROR(err); // Free host memory delete[] h_rowLengths; delete[] h_valcm; delete[] h_colscm; }
void RunTest(string testName, ResultDatabase &resultDB, OptionParser &op) { int probSizes[4] = { 1, 8, 48, 96 }; int size = probSizes[op.getOptionInt("size")-1]; // Convert to MiB size = (size*1024*1024)/sizeof(T); // Create input data on CPU unsigned int bytes = size * sizeof(T); // Allocate Host Memory __declspec(target(MIC)) static T *hkey, *outkey; __declspec(target(MIC)) static T *hvalue, *outvalue; hkey = (T*)_mm_malloc(bytes,ALIGN); hvalue = (T*)_mm_malloc(bytes,ALIGN); outkey = (T*)_mm_malloc(bytes,ALIGN); outvalue = (T*)_mm_malloc(bytes,ALIGN); // Initialize host memory cout << "Initializing host memory." << endl; srand(time(NULL)); for (int i = 0; i < size; i++) { hkey[i] = hvalue[i]= (i+255) % 1089; // Fill with some pattern } int micdev = op.getOptionInt("target"); int iters = op.getOptionInt("passes"); int numThreads = op.getOptionInt("nthreads"); cout << "nthreads = " <<numThreads<< endl; cout << "Running benchmark" << endl; for(int it=0;it<iters;it++) { // Allocating buffer on card #pragma offload target(mic:micdev) in(hkey:length(size) free_if(0)) \ in(hvalue:length(size) free_if(0))\ out(outkey:length(size) free_if(0))\ out(outvalue:length(size) free_if(0)) { } double start = curr_second(); // Get data transfer time #pragma offload target(mic:micdev) in(hkey:length(size) alloc_if(0) \ free_if(0)) in(hvalue:length(size) alloc_if(0) free_if(0)) { } float transferTime = curr_second()-start; double totalRunTime = 0.0f; start = curr_second(); #pragma offload target(mic:micdev) nocopy(hkey:length(size) \ alloc_if(0) free_if(0)) \ nocopy(hvalue:length(size) alloc_if(0) free_if(0)) \ nocopy(outkey:length(size) alloc_if(0) free_if(0)) \ nocopy(outvalue:length(size) alloc_if(0) free_if(0))\ in(numThreads) { sortKernel<T>(hkey, hvalue, outkey, outvalue, size, numThreads); } totalRunTime = curr_second()-start; #pragma offload target(mic:micdev) nocopy(hkey:length(size) \ alloc_if(0) free_if(1)) \ nocopy(hvalue:length(size) alloc_if(0) free_if(1)) \ out(outkey:length(size) alloc_if(0)) \ out(outvalue:length(size) alloc_if(0)) { } // If results aren't correct, don't report perf numbers if (!verifyResult<T>(outkey, outvalue, size)) { return; } char atts[1024]; double avgTime = (totalRunTime / (double) iters); sprintf(atts, "%d items", size); double gb = (double)(size * sizeof(T)) / (1000. * 1000. * 1000.); resultDB.AddResult(testName, atts, "GB/s", gb / avgTime); resultDB.AddResult(testName+"_PCIe", atts, "GB/s", gb / (avgTime + transferTime)); resultDB.AddResult(testName+"_Parity", atts, "N", transferTime / avgTime); } // Clean up _mm_free(hkey); _mm_free(hvalue); }
// **************************************************************************** // Function: RunBenchmark // // Purpose: // Executes the sparse matrix - vector multiplication benchmark // // Arguments: // dev: the opencl device id to use for the benchmark // ctx: the opencl context to use for the benchmark // queue: the opencl command queue to issue commands to // resultDB: stores results from the benchmark // op: the options parser / parameter database // // Returns: nothing // Programmer: Lukasz Wesolowski // Creation: August 13, 2010 // // Modifications: // // **************************************************************************** void RunBenchmark(cl_device_id dev, cl_context ctx, cl_command_queue queue, ResultDatabase &resultDB, OptionParser &op) { //create list of problem sizes int probSizes[4] = {1024, 8192, 12288, 16384}; int sizeClass = op.getOptionInt("size") - 1; // Always run single precision test // OpenCL doesn't support templated kernels, so we have to use macros cout <<"Single precision tests:\n"; string spMacros = "-DSINGLE_PRECISION "; RunTest<float, cl_float> (dev, ctx, queue, resultDB, op, spMacros, probSizes[sizeClass]); // If double precision is supported, run the DP test if (checkExtension(dev, "cl_khr_fp64")) { cout << "Double precision tests\n"; string dpMacros = "-DK_DOUBLE_PRECISION "; RunTest<double, cl_double> (dev, ctx, queue, resultDB, op, dpMacros, probSizes[sizeClass]); } else if (checkExtension(dev, "cl_amd_fp64")) { cout << "Double precision tests\n"; string dpMacros = "-DAMD_DOUBLE_PRECISION "; RunTest<double, cl_double> (dev, ctx, queue, resultDB, op, dpMacros, probSizes[sizeClass]); } else { std::cout << "Double precision not supported by chosen device, skipping" << std::endl; // driver script still needs entries for all tests, even if not run int nPasses = (int)op.getOptionInt( "passes" ); for( unsigned int p = 0; p < nPasses; p++ ) { resultDB.AddResult( (const char*)"CSR-Scalar-DP", "N/A", "Gflop/s", FLT_MAX ); resultDB.AddResult( (const char*)"CSR-Scalar-DP_PCIe", "N/A", "Gflop/s", FLT_MAX ); resultDB.AddResult( (const char*)"CSR-Vector-DP", "N/A", "Gflop/s", FLT_MAX ); resultDB.AddResult( (const char*)"CSR-Vector-DP_PCIe", "N/A", "Gflop/s", FLT_MAX ); resultDB.AddResult( (const char*)"ELLPACKR-DP", "N/A", "Gflop/s", FLT_MAX ); resultDB.AddResult( (const char*)"ELLPACKR-DP_PCIe", "N/A", "Gflop/s", FLT_MAX ); resultDB.AddResult( (const char*)"Padded_CSR-Scalar-DP", "N/A", "Gflop/s", FLT_MAX ); resultDB.AddResult( (const char*)"Padded_CSR-Scalar-DP_PCIe", "N/A", "Gflop/s", FLT_MAX ); resultDB.AddResult( (const char*)"Padded_CSR-Vector-DP", "N/A", "Gflop/s", FLT_MAX ); resultDB.AddResult( (const char*)"Padded_CSR-Vector-DP_PCIe", "N/A", "Gflop/s", FLT_MAX ); } } }
// **************************************************************************** // Method: main() // // Purpose: // serial and parallel main for OpenCL level0 benchmarks // // Arguments: // argc, argv // // Programmer: SHOC Team // Creation: The Epoch // // Modifications: // Jeremy Meredith, Tue Jan 12 15:09:33 EST 2010 // Changed the way device selection works. It now defaults to the device // index corresponding to the process's rank within a node if no devices // are specified on the command command line, and otherwise, round-robins // the list of devices among the tasks. // // Gabriel Marin, Tue Jun 01 15:38 EST 2010 // Check that we have valid (not NULL) context and queue objects before // running the benchmarks. Errors inside CreateContextFromSingleDevice or // CreateCommandQueueForContextAndDevice were not propagated out to the main // program. // // Jeremy Meredith, Wed Nov 10 14:20:47 EST 2010 // Split timing reports into detailed and summary. For serial code, we // report all trial values, and for parallel, skip the per-process vals. // Also detect and print outliers from parallel runs. // // **************************************************************************** int main(int argc, char *argv[]) { int ret = 0; try { #ifdef PARALLEL int rank, size; MPI_Init(&argc,&argv); MPI_Comm_size(MPI_COMM_WORLD, &size); MPI_Comm_rank(MPI_COMM_WORLD, &rank); cout << "MPI Task "<< rank << "/" << size - 1 << " starting....\n"; #endif OptionParser op; //Add shared options to the parser op.addOption("platform", OPT_INT, "0", "specify OpenCL platform to use", 'p'); op.addOption("device", OPT_VECINT, "", "specify device(s) to run on", 'd'); op.addOption("passes", OPT_INT, "10", "specify number of passes", 'n'); op.addOption("size", OPT_VECINT, "1", "specify problem size", 's'); op.addOption("infoDevices", OPT_BOOL, "", "show info for available platforms and devices", 'i'); op.addOption("verbose", OPT_BOOL, "", "enable verbose output", 'v'); op.addOption("quiet", OPT_BOOL, "", "write minimum necessary to standard output", 'q'); addBenchmarkSpecOptions(op); if (!op.parse(argc, argv)) { #ifdef PARALLEL if (rank == 0) op.usage(); MPI_Finalize(); #else op.usage(); #endif return (op.HelpRequested() ? 0 : 1 ); } if (op.getOptionBool("infoDevices")) { #define DEBUG_DEVICE_CONTAINER 0 #ifdef PARALLEL // execute following code only if I am the process of lowest // rank on this node NodeInfo NI; int mynoderank = NI.nodeRank(); if (mynoderank==0) { int nlrrank, nlrsize; MPI_Comm nlrcomm = NI.getNLRComm(); MPI_Comm_size(nlrcomm, &nlrsize); MPI_Comm_rank(nlrcomm, &nlrrank); OpenCLNodePlatformContainer ndc1; OpenCLMultiNodeContainer localMnc(ndc1); localMnc.doMerge (nlrrank, nlrsize, nlrcomm); if (rank==0) // I am the global rank 0, print all configurations localMnc.Print (cout); } #else OpenCLNodePlatformContainer ndc1; ndc1.Print (cout); #if DEBUG_DEVICE_CONTAINER OpenCLMultiNodeContainer mnc1(ndc1), mnc2; mnc1.Print (cout); ostringstream oss; mnc1.writeObject (oss); std::string temp(oss.str()); cout << "Serialized MultiNodeContainer:\n" << temp; istringstream iss(temp); mnc2.readObject (iss); cout << "Unserialized object2:\n"; mnc2.Print (cout); mnc1.merge (mnc2); cout << "==============\nObject1 after merging 1:\n"; mnc1.Print (cout); mnc1.merge (mnc2); cout << "==============\nObject1 after merging 2:\n"; mnc1.Print (cout); #endif // DEBUG #endif // PARALLEL return (0); } bool verbose = op.getOptionBool("verbose"); // The device option supports specifying more than one device // for now, just choose the first one. int platform = op.getOptionInt("platform"); #ifdef PARALLEL NodeInfo ni; int myNodeRank = ni.nodeRank(); if (verbose) cout << "Global rank "<<rank<<" is local rank "<<myNodeRank << endl; #else int myNodeRank = 0; #endif // If they haven't specified any devices, assume they // want the process with in-node rank N to use device N int deviceIdx = myNodeRank; // If they have, then round-robin the list of devices // among the processes on a node. vector<long long> deviceVec = op.getOptionVecInt("device"); if (deviceVec.size() > 0) { int len = deviceVec.size(); deviceIdx = deviceVec[myNodeRank % len]; } // Check for an erroneous device if (deviceIdx >= GetNumOclDevices(platform)) { cerr << "Warning: device index: " << deviceIdx << " out of range, defaulting to device 0.\n"; deviceIdx = 0; } // Initialization if (verbose) cout << ">> initializing\n"; cl_device_id devID = ListDevicesAndGetDevice(platform, deviceIdx); cl_int clErr; cl_context ctx = clCreateContext( NULL, // properties 1, // number of devices &devID, // device NULL, // notification function NULL, &clErr ); CL_CHECK_ERROR(clErr); cl_command_queue queue = clCreateCommandQueue( ctx, devID, CL_QUEUE_PROFILING_ENABLE, &clErr ); CL_CHECK_ERROR(clErr); ResultDatabase resultDB; // Run the benchmark RunBenchmark(devID, ctx, queue, resultDB, op); clReleaseCommandQueue( queue ); clReleaseContext( ctx ); #ifndef PARALLEL resultDB.DumpDetailed(cout); #else ParallelResultDatabase pardb; pardb.MergeSerialDatabases(resultDB,MPI_COMM_WORLD); if (rank==0) { pardb.DumpSummary(cout); pardb.DumpOutliers(cout); } #endif } catch( std::exception& e ) { std::cerr << e.what() << std::endl; ret = 1; } catch( ... ) { std::cerr << "unrecognized exception caught" << std::endl; ret = 1; } #ifdef PARALLEL MPI_Finalize(); #endif return ret; }