// validate stencil-independent values void CheckOptions( const OptionParser& opts ) { // check matrix dimensions - must be 2d, must be positive std::vector<long long> arrayDims = opts.getOptionVecInt( "customSize" ); if( arrayDims.size() != 2 ) { throw InvalidArgValue( "overall size must have two dimensions" ); } if( (arrayDims[0] < 0) || (arrayDims[1] < 0) ) { throw InvalidArgValue( "each size dimension must be positive" ); } // validation error threshold must be positive float valThreshold = opts.getOptionFloat( "val-threshold" ); if( valThreshold <= 0.0f ) { throw InvalidArgValue( "validation threshold must be positive" ); } // number of validation errors to print must be non-negative int nErrsToPrint = opts.getOptionInt( "val-print-limit" ); if( nErrsToPrint < 0 ) { throw InvalidArgValue( "number of validation errors to print must be non-negative" ); } int nWarmupPasses = opts.getOptionInt( "warmupPasses" ); if( nWarmupPasses < 0 ) { throw InvalidArgValue( "number of warmup passes must be non-negative" ); } }
// **************************************************************************** // Function: RunBenchmark // // Purpose: // Measures the floating point capability of the device for a variety of // combinations of arithmetic operations. // // Arguments: // op: the options parser / parameter database // // Returns: nothing // // Programmer: Zhi Ying([email protected]) // Jun Jin([email protected]) // // Creation: May 23, 2011 // // Modifications: // 12/12/12 - Kyle Spafford - Code style and minor integration updates // // **************************************************************************** void RunBenchmark(OptionParser &op, ResultDatabase &resultDB) { const bool verbose = op.getOptionBool("verbose"); // Quiet == no progress bar. const bool quiet = op.getOptionBool("quiet"); const unsigned int passes = op.getOptionInt("passes"); const int micdev = op.getOptionInt("target"); double repeatF = 3; cout << "Adjust repeat factor = " << repeatF << "\n"; // Initialize progress bar int totalRuns = 16*passes*2; ProgressBar pb(totalRuns); if (!verbose && !quiet) { pb.Show(stdout); } RunTest<float>(resultDB, passes, verbose, quiet, repeatF, pb, "-SP", micdev); RunTest<double>(resultDB, passes, verbose, quiet, repeatF, pb, "-DP", micdev); if (!verbose) cout << endl; }
// **************************************************************************** // Function: GPUSetup // // Purpose: // do the necessary OpenCL setup for GPU part of the test // // Arguments: // op: the options parser / parameter database // mympirank: for printing errors in case of failure // mynoderank: this is typically the device ID (the mapping done in main) // // Returns: success/failure // // Creation: 2009 // // Modifications: // // **************************************************************************** // int GPUSetup(OptionParser &op, int mympirank, int mynoderank) { addBenchmarkSpecOptions(op); if (op.getOptionBool("infoDevices")) { OpenCLNodePlatformContainer ndc1; ndc1.Print (cout); return (0); } // The device option supports specifying more than one device int platform = op.getOptionInt("platform"); int deviceIdx = mynoderank; if( deviceIdx >= op.getOptionVecInt( "device" ).size() ) { std::ostringstream estr; estr << "Warning: not enough devices specified with --device flag for task " << mympirank << " ( node rank " << mynoderank << ") to claim its own device; forcing to use first device "; std::cerr << estr.str() << std::endl; deviceIdx = 0; } int device = op.getOptionVecInt("device")[deviceIdx]; // Initialization _mpicontention_ocldev = new cl::Device( ListDevicesAndGetDevice(platform, device) ); std::vector<cl::Device> ctxDevices; ctxDevices.push_back( *_mpicontention_ocldev ); _mpicontention_ocldriver_ctx = new cl::Context( ctxDevices ); _mpicontention_ocldriver_queue = new cl::CommandQueue( *_mpicontention_ocldriver_ctx, *_mpicontention_ocldev, CL_QUEUE_PROFILING_ENABLE ); _mpicontention_gpuop = op; return 0; }
void RunBenchmark(ResultDatabase &resultDB, OptionParser &op) { // Test to see if this device supports double precision cudaGetDevice(&fftDevice); cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, fftDevice); bool has_dp = (deviceProp.major == 1 && deviceProp.minor >= 3) || (deviceProp.major >= 2); cout << "Running single precision test" << endl; runTest<float2>("SP-FFT", resultDB, op); if (has_dp) { cout << "Running double precision test" << endl; runTest<double2>("DP-FFT", resultDB, op); } else { cout << "Skipping double precision test" << endl; char atts[32] = "DP_Not_Supported"; // resultDB requires neg entry for every possible result int passes = op.getOptionInt("passes"); for (int k=0; k<passes; k++) { resultDB.AddResult("DP-FFT" , atts, "GB/s", FLT_MAX); resultDB.AddResult("DP-FFT_PCIe" , atts, "GB/s", FLT_MAX); resultDB.AddResult("DP-FFT_Parity" , atts, "GB/s", FLT_MAX); resultDB.AddResult("DP-FFT-INV" , atts, "GB/s", FLT_MAX); resultDB.AddResult("DP-FFT-INV_PCIe" , atts, "GB/s", FLT_MAX); resultDB.AddResult("DP-FFT-INV_Parity" , atts, "GB/s", FLT_MAX); } } }
void dump2D(OptionParser& op) { int i; void* work, *temp; T2* source, * result; unsigned long bytes = 0; int probSizes[7] = { 128, 256, 512, 1024, 2048, 4096, 8192}; int sizeIndex = op.getOptionInt("pts1")-1; int sizeIndey = op.getOptionInt("pts2")-1; if (sizeIndex < 0 || sizeIndex >= 7) { cerr << "Invalid size index specified\n"; exit(-1); } if (sizeIndey < 0 || sizeIndey >= 7) { cerr << "Invalid size index specified\n"; exit(-1); } int FFTN1=probSizes[sizeIndex],FFTN2=probSizes[sizeIndey]; //int FFTN1=8192,FFTN2=512; unsigned long used_bytes = FFTN1*FFTN2*sizeof(T2); bool do_dp = dp<T2>(); init2(op, do_dp, FFTN1, FFTN2); int n_ffts = 1; double N = FFTN1*FFTN2; // allocate host and device memory allocHostBuffer((void**)&source, used_bytes); allocHostBuffer((void**)&result, used_bytes); // init host memory... for (i = 0; i < N; i++) { source[i].x = (rand()/(float)RAND_MAX)*2-1; source[i].y = (rand()/(float)RAND_MAX)*2-1; } // alloc device memory allocDeviceBuffer(&work, used_bytes); allocDeviceBuffer(&temp, used_bytes); copyToDevice(work, source, used_bytes); forward2(work, temp, n_ffts, FFTN1, FFTN2); copyFromDevice(result, work, used_bytes); #ifdef PRINT_RESULT for (i = 0; i < N; i++) { fprintf(stdout, "data[%d] (%g, %g) \n",i, result[i].x, result[i].y); } #endif freeDeviceBuffer(work); freeDeviceBuffer(temp); freeHostBuffer(source); freeHostBuffer(result); }
void RunBenchmark(cl::Device& devcpp, cl::Context& ctxcpp, cl::CommandQueue& queuecpp, ResultDatabase &resultDB, OptionParser &op) { // Convert from C++ bindings to C bindings // TODO propagate use of C++ bindings cl_device_id dev = devcpp(); cl_context ctx = ctxcpp(); cl_command_queue queue = queuecpp(); // Collect basic MPI information int size, rank; MPI_Comm_size(MPI_COMM_WORLD, &size); MPI_Comm_rank(MPI_COMM_WORLD, &rank); // Always run single precision test // OpenCL doesn't support templated kernels, so we have to use macros string spMacros = "-DSINGLE_PRECISION"; runTest<float> ("TPScan-SP", dev, ctx, queue, resultDB, op, spMacros); // If double precision is supported, run the DP test if (checkExtension(dev, "cl_khr_fp64")) { cout << "DP Supported\n"; string dpMacros = "-DK_DOUBLE_PRECISION "; runTest<double> ("TPScan-DP", dev, ctx, queue, resultDB, op, dpMacros); } else if (checkExtension(dev, "cl_amd_fp64")) { cout << "DP Supported\n"; string dpMacros = "-DAMD_DOUBLE_PRECISION "; runTest<double> ("TPScan-DP", dev, ctx, queue, resultDB, op, dpMacros); } else { char atts[1024] = "DP_Not_Supported"; cout << "Warning, rank " << rank << "'s device does not support DP\n"; // ResultDB requires every rank to report something. If this rank // doesn't support DP, submit FLT_MAX (this is handled as no result by // ResultDB. int passes = op.getOptionInt("passes"); for (int k = 0; k < passes; k++) { resultDB.AddResult("TPScan-DP-Kernel" , atts, "GB/s", FLT_MAX); resultDB.AddResult("TPScan-DP-Kernel+PCIe" , atts, "GB/s", FLT_MAX); resultDB.AddResult("TPScan-DP-MPI_ExScan" , atts, "GB/s", FLT_MAX); resultDB.AddResult("TPScan-DP-Overall" , atts, "GB/s", FLT_MAX); } } }
void dump1D(OptionParser& op) { int i; int fftn; void* work, *temp; T2* source, * result; unsigned long bytes = 0; int probSizes[7] = { 128, 256, 512, 1024, 2048, 4096, 8192 }; int sizeIndex = op.getOptionInt("pts")-1; if (sizeIndex < 0 || sizeIndex >= 7) { cerr << "Invalid size index specified\n"; exit(-1); } fftn = probSizes[sizeIndex]; // Convert to MB unsigned long used_bytes = fftn * sizeof(T2); bool do_dp = dp<T2>(); init(op, do_dp, fftn); // now determine how much available memory will be used //int half_n_ffts = bytes / (fftn*sizeof(T2)*2); int n_ffts = 1; double N = fftn; fprintf(stdout, "used_bytes=%lu, N=%g\n", used_bytes, N); // allocate host and device memory allocHostBuffer((void**)&source, used_bytes); allocHostBuffer((void**)&result, used_bytes); // init host memory... for (i = 0; i < N; i++) { source[i].x = (rand()/(float)RAND_MAX)*2-1; source[i].y = (rand()/(float)RAND_MAX)*2-1; } // alloc device memory allocDeviceBuffer(&work, used_bytes); allocDeviceBuffer(&temp, used_bytes); copyToDevice(work, source, used_bytes); forward(work, temp, n_ffts, fftn); copyFromDevice(result, work, used_bytes); #ifdef PRINT_RESULT for (i = 0; i < N; i++) { fprintf(stdout, "data[%d] (%g, %g)\n", i, result[i].x, result[i].y); } #endif freeDeviceBuffer(work); freeDeviceBuffer(temp); freeHostBuffer(source); freeHostBuffer(result); }
void RunBenchmark(OptionParser& opts, ResultDatabase& resultDB ) { int device; #if defined(PARALLEL) int cwrank; MPI_Comm_rank( MPI_COMM_WORLD, &cwrank ); #endif // defined(PARALLEL) #if defined(PARALLEL) if( cwrank == 0 ) { #endif // defined(PARALLEL) std::cout << "Running single precision test" << std::endl; #if defined(PARALLEL) } #endif // defined(PARALLEL) //omp_set_num_threads(124); DoTest<float>( "SP_Sten2D", resultDB, opts ); // check if we can run double precision tests if( //deviceProps.major == 1) && (deviceProps.minor >= 3)) || //eviceProps.major >= 2)) 1) { #if defined(PARALLEL) if( cwrank == 0 ) { #endif // defined(PARALLEL) std::cout << "DP supported\n" << std::endl; #if defined(PARALLEL) } #endif // defined(PARALLEL) //omp_set_num_threads(93); DoTest<double>( "DP_Sten2D", resultDB, opts ); } else { #if defined(PARALLEL) if( cwrank == 0 ) { #endif // defined(PARALLEL) std::cout << "Double precision not supported - skipping" << std::endl; #if defined(PARALLEL) } #endif // defined(PARALLEL) // resultDB requires neg entry for every possible result int nPasses = (int)opts.getOptionInt( "passes" ); for( int p = 0; p < nPasses; p++ ) { resultDB.AddResult( (const char*)"DP_Sten2D", "N/A", "s", FLT_MAX); } } }
void MPICUDAStencilFactory<T>::CheckOptions( const OptionParser& opts ) const { // let base class check its options first CommonCUDAStencilFactory<T>::CheckOptions( opts ); // check our options std::vector<long long> shDims = opts.getOptionVecInt( "lsize" ); std::vector<long long> arrayDims = opts.getOptionVecInt( "customSize" ); if( arrayDims[0] == 0 ) { // custom size was not specified - we are using a standard size int sizeClass = opts.getOptionInt("size"); arrayDims = StencilFactory<T>::GetStandardProblemSize( sizeClass ); } assert( shDims.size() == 2 ); assert( arrayDims.size() == 2 ); size_t gRows = (size_t)arrayDims[0]; size_t gCols = (size_t)arrayDims[1]; size_t lRows = shDims[0]; size_t lCols = shDims[1]; unsigned int haloWidth = (unsigned int)opts.getOptionInt( "iters-per-exchange" ); // verify that MPI halo width will result in a matrix being passed // to the kernel that also has its global size as a multiple of // the local work size // // Because the MPI halo width is arbitrary, and the kernel halo width // is always 1, we have to ensure that: // ((size + 2*halo) - 2) % lsize == 0 if( (((gRows + 2*haloWidth) - 2) % lRows) != 0 ) { throw InvalidArgValue( "rows including halo must be even multiple of lsize (e.g., lsize rows evenly divides ((rows + 2*halo) - 2) )" ); } if( (((gCols + 2*haloWidth) - 2) % lCols) != 0 ) { throw InvalidArgValue( "columns including halo must be even multiple of lsize (e.g., lsize cols evenly divides ((cols + 2*halo) - 2) )" ); } }
void StencilFactory<T>::CheckOptions( const OptionParser& options ) const { // number of iterations must be positive unsigned int nIters = (unsigned int)options.getOptionInt( "num-iters" ); if( nIters == 0 ) { throw InvalidArgValue( "number of iterations must be positive" ); } // no restrictions on weight values, just that we have them }
static void fillResultDB(const string& name, const string& reason, OptionParser &op, ResultDatabase& resultDB) { // resultDB requires neg entry for every possible result int passes = op.getOptionInt("passes"); for (int k=0; k<passes; k++) { resultDB.AddResult(name , reason, "GB/s", FLT_MAX); resultDB.AddResult(name+"_PCIe" , reason, "GB/s", FLT_MAX); resultDB.AddResult(name+"_Parity" , reason, "GB/s", FLT_MAX); resultDB.AddResult(name+"-INV" , reason, "GB/s", FLT_MAX); resultDB.AddResult(name+"-INV_PCIe" , reason, "GB/s", FLT_MAX); resultDB.AddResult(name+"-INV_Parity" , reason, "GB/s", FLT_MAX); } }
void RunBenchmark(cl::Device& devcpp, cl::Context& ctxcpp, cl::CommandQueue& queuecpp, ResultDatabase &resultDB, OptionParser &op) { // convert from C++ bindings to C bindings // TODO propagate use of C++ bindings cl_device_id dev = devcpp(); cl_context ctx = ctxcpp(); cl_command_queue queue = queuecpp(); // Always run single precision test // OpenCL doesn't support templated kernels, so we have to use macros runTest<float>("SGEMM", dev, ctx, queue, resultDB, op, "-DSINGLE_PRECISION"); // If double precision is supported, run the DP test if (checkExtension(dev, "cl_khr_fp64")) { cout << "DP Supported\n"; runTest<double>("DGEMM", dev, ctx, queue, resultDB, op, "-DK_DOUBLE_PRECISION "); } else if (checkExtension(dev, "cl_amd_fp64")) { cout << "DP Supported\n"; runTest<double>("DGEMM", dev, ctx, queue, resultDB, op, "-DAMD_DOUBLE_PRECISION "); } else { cout << "DP Not Supported\n"; char atts[1024] = "DP_Not_Supported"; // resultDB requires neg entry for every possible result int passes = op.getOptionInt("passes"); for (; passes > 0; --passes) { for (int i = 0; i < 2; i++) { const char transb = i ? 'T' : 'N'; string testName="DGEMM"; resultDB.AddResult(testName+"-"+transb, atts, "GFlops", FLT_MAX); resultDB.AddResult(testName+"-"+transb+"_PCIe", atts, "GFlops", FLT_MAX); resultDB.AddResult(testName+"-"+transb+"_Parity", atts, "N", FLT_MAX); } } } }
void RunBenchmark(cl::Device& devcpp, cl::Context& ctxcpp, cl::CommandQueue& queuecpp, ResultDatabase &resultDB, OptionParser &op) { // convert from C++ bindings to C bindings // TODO propagate use of C++ bindings cl_device_id dev = devcpp(); cl_context ctx = ctxcpp(); cl_command_queue queue = queuecpp(); // Always run single precision test // OpenCL doesn't support templated kernels, so we have to use macros string spMacros = "-DSINGLE_PRECISION "; RunTest<float>("S3D-SP", dev, ctx, queue, resultDB, op, spMacros); // If double precision is supported, run the DP test if (checkExtension(dev, "cl_khr_fp64")) { cout << "DP Supported\n"; string dpMacros = "-DK_DOUBLE_PRECISION "; RunTest<double> ("S3D-DP", dev, ctx, queue, resultDB, op, dpMacros); } else if (checkExtension(dev, "cl_amd_fp64")) { cout << "DP Supported\n"; string dpMacros = "-DAMD_DOUBLE_PRECISION "; RunTest<double> ("S3D-DP", dev, ctx, queue, resultDB, op, dpMacros); } else { cout << "DP Not Supported\n"; char atts[1024] = "DP_Not_Supported"; // resultDB requires neg entry for every possible result int passes = op.getOptionInt("passes"); for (int k = 0; k < passes; k++) { resultDB.AddResult("S3D-DP" , atts, "GB/s", FLT_MAX); resultDB.AddResult("S3D-DP_PCIe" , atts, "GB/s", FLT_MAX); resultDB.AddResult("S3D-DP_Parity" , atts, "GB/s", FLT_MAX); } } }
void RunBenchmark(cl_device_id dev, cl_context ctx, cl_command_queue queue, ResultDatabase &resultDB, OptionParser &op) { // Always run single precision test // OpenCL doesn't support templated kernels, so we have to use macros string spMacros = "-DSINGLE_PRECISION"; runTest<float, float4, float4> ("MD-LJ", dev, ctx, queue, resultDB, op, spMacros); // If double precision is supported, run the DP test if (checkExtension(dev, "cl_khr_fp64")) { cout << "DP Supported\n"; string dpMacros = "-DK_DOUBLE_PRECISION "; runTest<double, double4, double4> ("MD-LJ-DP", dev, ctx, queue, resultDB, op, dpMacros); } else if (checkExtension(dev, "cl_amd_fp64")) { cout << "DP Supported\n"; string dpMacros = "-DAMD_DOUBLE_PRECISION "; runTest<double, double4, double4> ("MD-LJ-DP", dev, ctx, queue, resultDB, op, dpMacros); } else { cout << "DP Not Supported\n"; char atts[32] = "DP_Not_Supported"; // resultDB requires neg entry for every possible result int passes = op.getOptionInt("passes"); for (int i = 0; i < passes; i++) { resultDB.AddResult("MD-LJ-DP" , atts, "GB/s", FLT_MAX); resultDB.AddResult("MD-LJ-DP_PCIe" , atts, "GB/s", FLT_MAX); resultDB.AddResult("MD-LJ-DP-Bandwidth", atts, "GB/s", FLT_MAX); resultDB.AddResult("MD-LJ-DP-Bandwidth_PCIe", atts, "GB/s", FLT_MAX); resultDB.AddResult("MD-LJ-DP_Parity" , atts, "GB/s", FLT_MAX); } } }
// **************************************************************************** // Function: RunBenchmark // // Purpose: // Runs the stablity test. The algorithm for the parallel // version of the test, which enables testing of an entire GPU // cluster at the same time, is as follows. Each participating node // first allocates its data, while node zero additionally determines // start and finish times based on a user input parameter. All nodes // then enter the outermost loop, copying fresh data from the CPU // before entering the core of the test. In the core, each node // performs a loop consisting of the forward kernel, a potential // check, and then the inverse kernel. After performing a configurable // number of forward/inverse iterations, along with a configurable // number of checks, each node sends the number of failures it // encountered to node zero. Node zero collects and reports the error // counts, determines whether the test has run its course, and // broadcasts the decision. If the decision is to proceed, each node // begins the next iteration of the outer loop, copying fresh data and // then performing the kernels and checks of the core loop. // // Arguments: // resultDB: the benchmark stores its results in this ResultDatabase // op: the options parser / parameter database // // Returns: nothing // // Programmer: Collin McCurdy // Creation: September 08, 2009 // // Modifications: // // **************************************************************************** void RunBenchmark(ResultDatabase &resultDB, OptionParser& op) { int mpi_rank, mpi_size, node_rank; int i, j; float2* source, * result; void* work, * chk; #ifdef PARALLEL MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); NodeInfo NI; node_rank = NI.nodeRank(); cout << "MPI Task " << mpi_rank << " of " << mpi_size << " (noderank=" << node_rank << ") starting....\n"; #else mpi_rank = 0; mpi_size = 1; node_rank = 0; #endif // ensure chk buffer alloc succeeds before grabbing the // rest of available memory. allocDeviceBuffer(&chk, 1); unsigned long avail_bytes = findAvailBytes(); // unsigned long avail_bytes = 1024*1024*1024-1; // now determine how much available memory will be used (subject // to CUDA's constraint on the maximum block dimension size) int blocks = avail_bytes / (512*sizeof(float2)); int slices = 1; while (blocks/slices > 65535) { slices *= 2; } int half_n_ffts = ((blocks/slices)*slices)/2; int n_ffts = half_n_ffts * 2; fprintf(stderr, "avail_bytes=%ld, blocks=%d, n_ffts=%d\n", avail_bytes, blocks, n_ffts); int half_n_cmplx = half_n_ffts * 512; unsigned long used_bytes = half_n_cmplx * 2 * sizeof(float2); cout << mpi_rank << ": testing " << used_bytes/((double)1024*1024) << " MBs\n"; // allocate host memory source = (float2*)malloc(used_bytes); result = (float2*)malloc(used_bytes); // alloc device memory allocDeviceBuffer(&work, used_bytes); // alloc gather buffer int* recvbuf = (int*)malloc(mpi_size*sizeof(int)); // compute start and finish times time_t start = time(NULL); time_t finish = start + (time_t)(op.getOptionInt("time")*60); struct tm start_tm, finish_tm; localtime_r(&start, &start_tm); localtime_r(&finish, &finish_tm); if (mpi_rank == 0) { printf("start = %s", asctime(&start_tm)); printf("finish = %s", asctime(&finish_tm)); } for (int iter = 0; ; iter++) { bool failed = false; int errorCount = 0, stop = 0; // (re-)init host memory... for (i = 0; i < half_n_cmplx; i++) { source[i].x = (rand()/(float)RAND_MAX)*2-1; source[i].y = (rand()/(float)RAND_MAX)*2-1; source[i+half_n_cmplx].x = source[i].x; source[i+half_n_cmplx].y = source[i].y; } // copy to device copyToDevice(work, source, used_bytes); copyToDevice(chk, &errorCount, 1); forward(work, n_ffts); if (check(work, chk, half_n_ffts, half_n_cmplx)) { fprintf(stderr, "First check failed..."); failed = true; } if (!failed) { for (i = 1; i <= CHECKS; i++) { for (j = 1; j <= ITERS_PER_CHECK; j++) { inverse(work, n_ffts); forward(work, n_ffts); } if (check(work, chk, half_n_ffts, half_n_cmplx)) { failed = true; break; } } } // failing node is responsible for verifying failure, counting // errors and reporting count to node 0. if (failed) { fprintf(stderr, "Failure on node %d, iter %d:", mpi_rank, iter); // repeat check on CPU copyFromDevice(result, work, used_bytes); float2* result2 = result + half_n_cmplx; for (j = 0; j < half_n_cmplx; j++) { if (result[j].x != result2[j].x || result[j].y != result2[j].y) { errorCount++; } } if (!errorCount) { fprintf(stderr, "verification failed!\n"); } else { fprintf(stderr, "%d errors\n", errorCount); } } #ifdef PARALLEL MPI_Gather(&errorCount, 1, MPI_INT, recvbuf, 1, MPI_INT, 0, MPI_COMM_WORLD); #else recvbuf[0] = errorCount; #endif // node 0 collects and reports error counts, determines // whether test has run its course, and broadcasts decision if (mpi_rank == 0) { time_t curtime = time(NULL); struct tm curtm; localtime_r(&curtime, &curtm); fprintf(stderr, "iter=%d: %s", iter, asctime(&curtm)); for (i = 0; i < mpi_size; i++) { if (recvbuf[i]) { fprintf(stderr, "--> %d failures on node %d\n", recvbuf[i], i); } } if (curtime > finish) { stop = 1; } } #ifdef PARALLEL MPI_Bcast(&stop, 1, MPI_INT, 0, MPI_COMM_WORLD); #endif resultDB.AddResult("Check", "", "Failures", errorCount); if (stop) break; } freeDeviceBuffer(work); freeDeviceBuffer(chk); free(source); free(result); free(recvbuf); }
void DoTest( const char* timerDesc, ResultDatabase& resultDB, OptionParser& opts ) { StencilFactory<T>* stdStencilFactory = NULL; Stencil<T>* stdStencil = NULL; StencilFactory<T>* testStencilFactory = NULL; Stencil<T>* testStencil = NULL; try { #if defined(PARALLEL) stdStencilFactory = new MPIHostStencilFactory<T>; testStencilFactory = new MPICUDAStencilFactory<T>; #else stdStencilFactory = new HostStencilFactory<T>; testStencilFactory = new CUDAStencilFactory<T>; #endif // defined(PARALLEL) assert( (stdStencilFactory != NULL) && (testStencilFactory != NULL) ); // do a sanity check on option values CheckOptions( opts ); stdStencilFactory->CheckOptions( opts ); testStencilFactory->CheckOptions( opts ); // extract and validate options std::vector<long long> arrayDims = opts.getOptionVecInt( "customSize" ); if( arrayDims.size() != 2 ) { cerr << "Dim size: " << arrayDims.size() << "\n"; throw InvalidArgValue( "all overall dimensions must be positive" ); } if (arrayDims[0] == 0) // User has not specified a custom size { int sizeClass = opts.getOptionInt("size"); arrayDims = StencilFactory<T>::GetStandardProblemSize( sizeClass ); } long int seed = (long)opts.getOptionInt( "seed" ); bool beVerbose = opts.getOptionBool( "verbose" ); unsigned int nIters = (unsigned int)opts.getOptionInt( "num-iters" ); double valErrThreshold = (double)opts.getOptionFloat( "val-threshold" ); unsigned int nValErrsToPrint = (unsigned int)opts.getOptionInt( "val-print-limit" ); #if defined(PARALLEL) unsigned int haloWidth = (unsigned int)opts.getOptionInt( "iters-per-exchange" ); #else unsigned int haloWidth = 1; #endif // defined(PARALLEL) float haloVal = (float)opts.getOptionFloat( "haloVal" ); // build a description of this experiment std::vector<long long> lDims = opts.getOptionVecInt( "lsize" ); assert( lDims.size() == 2 ); std::ostringstream experimentDescriptionStr; experimentDescriptionStr << nIters << ':' << arrayDims[0] << 'x' << arrayDims[1] << ':' << lDims[0] << 'x' << lDims[1]; unsigned int nPasses = (unsigned int)opts.getOptionInt( "passes" ); unsigned int nWarmupPasses = (unsigned int)opts.getOptionInt( "warmupPasses" ); // compute the expected result on the host // or read it from a pre-existing file std::string matrixFilenameBase = (std::string)opts.getOptionString( "expMatrixFile" ); #if defined(PARALLEL) int cwrank; MPI_Comm_rank( MPI_COMM_WORLD, &cwrank ); if( cwrank == 0 ) { #endif // defined(PARALLEL) if( !matrixFilenameBase.empty() ) { std::cout << "\nReading expected stencil operation result from file for later comparison with CUDA output\n" << std::endl; } else { std::cout << "\nPerforming stencil operation on host for later comparison with CUDA output\n" << "Depending on host capabilities, this may take a while." << std::endl; } #if defined(PARALLEL) } #endif // defined(PARALLEL) Matrix2D<T> expected( arrayDims[0] + 2*haloWidth, arrayDims[1] + 2*haloWidth ); Initialize<T> init( seed, haloWidth, haloVal ); bool haveExpectedData = false; if( ! matrixFilenameBase.empty() ) { bool readOK = ReadMatrixFromFile( expected, GetMatrixFileName<T>( matrixFilenameBase ) ); if( readOK ) { if( (expected.GetNumRows() != arrayDims[0] + 2*haloWidth) || (expected.GetNumColumns() != arrayDims[1] + 2*haloWidth) ) { std::cerr << "The matrix read from file \'" << GetMatrixFileName<T>( matrixFilenameBase ) << "\' does not match the matrix size specified on the command line.\n"; expected.Reset( arrayDims[0] + 2*haloWidth, arrayDims[1] + 2*haloWidth ); } else { haveExpectedData = true; } } if( !haveExpectedData ) { std::cout << "\nSince we could not read the expected matrix values,\nperforming stencil operation on host for later comparison with CUDA output.\n" << "Depending on host capabilities, this may take a while." << std::endl; } } if( !haveExpectedData ) { init( expected ); haveExpectedData = true; if( beVerbose ) { std::cout << "initial state:\n" << expected << std::endl; } stdStencil = stdStencilFactory->BuildStencil( opts ); (*stdStencil)( expected, nIters ); } if( beVerbose ) { std::cout << "expected result:\n" << expected << std::endl; } // determine whether we are to save the expected matrix values to a file // to speed up future runs matrixFilenameBase = (std::string)opts.getOptionString( "saveExpMatrixFile" ); if( !matrixFilenameBase.empty() ) { SaveMatrixToFile( expected, GetMatrixFileName<T>( matrixFilenameBase ) ); } assert( haveExpectedData ); // compute the result on the CUDA device Matrix2D<T> data( arrayDims[0] + 2*haloWidth, arrayDims[1] + 2*haloWidth ); Stencil<T>* testStencil = testStencilFactory->BuildStencil( opts ); // Compute the number of floating point operations we will perform. // // Note: in the truly-parallel case, we count flops for redundant // work due to the need for a halo. // But we do not add to the count for the local 1-wide halo since // we aren't computing new values for those items. unsigned long npts = (arrayDims[0] + 2*haloWidth - 2) * (arrayDims[1] + 2*haloWidth - 2); #if defined(PARALLEL) MPICUDAStencil<T>* mpiTestStencil = static_cast<MPICUDAStencil<T>*>( testStencil ); assert( mpiTestStencil != NULL ); int participating = mpiTestStencil->ParticipatingInProgram() ? 1 : 0; int numParticipating = 0; MPI_Allreduce( &participating, // src &numParticipating, // dest 1, // count MPI_INT, // type MPI_SUM, // op MPI_COMM_WORLD ); // communicator npts *= numParticipating; #endif // defined(PARALLEL) // In our 9-point stencil, there are 11 floating point operations // per point (3 multiplies and 11 adds): // // newval = weight_center * centerval + // weight_cardinal * (northval + southval + eastval + westval) + // weight_diagnoal * (neval + nwval + seval + swval) // // we do this stencil operation 'nIters' times unsigned long nflops = npts * 11 * nIters; #if defined(PARALLEL) if( cwrank == 0 ) { #endif // defined(PARALLEL) std::cout << "Performing " << nWarmupPasses << " warmup passes..."; #if defined(PARALLEL) } #endif // defined(PARALLEL) for( unsigned int pass = 0; pass < nWarmupPasses; pass++ ) { init(data); (*testStencil)( data, nIters ); } #if defined(PARALLEL) if( cwrank == 0 ) { #endif // defined(PARALLEL) std::cout << "done." << std::endl; #if defined(PARALLEL) } #endif // defined(PARALLEL) #if defined(PARALLEL) MPI_Comm_rank( MPI_COMM_WORLD, &cwrank ); if( cwrank == 0 ) { #endif // defined(PARALLEL) std::cout << "\nPerforming stencil operation on chosen device, " << nPasses << " passes.\n" << "Depending on chosen device, this may take a while." << std::endl; #if defined(PARALLEL) } #endif // defined(PARALLEL) #if !defined(PARALLEL) std::cout << "At the end of each pass the number of validation\nerrors observed will be printed to the standard output." << std::endl; #endif // !defined(PARALLEL) for( unsigned int pass = 0; pass < nPasses; pass++ ) { #if !defined(PARALLEL) std::cout << "pass " << pass << ": "; #endif // !defined(PARALLEL) init( data ); int timerHandle = Timer::Start(); (*testStencil)( data, nIters ); double elapsedTime = Timer::Stop( timerHandle, "CUDA stencil" ); // find and report the computation rate double gflops = (nflops / elapsedTime) / 1e9; resultDB.AddResult( timerDesc, experimentDescriptionStr.str(), "GFLOPS", gflops ); if( beVerbose ) { std::cout << "observed result, pass " << pass << ":\n" << data << std::endl; } // validate the result #if defined(PARALLEL) StencilValidater<T>* validater = new MPIStencilValidater<T>; #else StencilValidater<T>* validater = new SerialStencilValidater<T>; #endif // defined(PARALLEL) validater->ValidateResult( expected, data, valErrThreshold, nValErrsToPrint ); } } catch( ... ) { // clean up - abnormal termination // wish we didn't have to do this, but C++ exceptions do not // support a try-catch-finally approach delete stdStencil; delete stdStencilFactory; delete testStencil; delete testStencilFactory; throw; } // clean up - normal termination delete stdStencil; delete stdStencilFactory; delete testStencil; delete testStencilFactory; }
void RunBenchmark( ResultDatabase& resultDB, OptionParser& opts ) { int device; #if defined(PARALLEL) int cwrank; MPI_Comm_rank( MPI_COMM_WORLD, &cwrank ); #endif // defined(PARALLEL) cudaGetDevice( &device ); cudaDeviceProp deviceProps; cudaGetDeviceProperties( &deviceProps, device ); // Configure to allocate performance-critical memory in // a programming model-specific way. Matrix2D<float>::SetAllocator( new CUDAPMSMemMgr<float> ); #if defined(PARALLEL) if( cwrank == 0 ) { #endif // defined(PARALLEL) std::cout << "Running single precision test" << std::endl; #if defined(PARALLEL) } #endif // defined(PARALLEL) DoTest<float>( "SP_Sten2D", resultDB, opts ); // check if we can run double precision tests if( ((deviceProps.major == 1) && (deviceProps.minor >= 3)) || (deviceProps.major >= 2) ) { // Configure to allocate performance-critical memory in // a programming model-specific way. Matrix2D<double>::SetAllocator( new CUDAPMSMemMgr<double> ); #if defined(PARALLEL) if( cwrank == 0 ) { #endif // defined(PARALLEL) std::cout << "\n\nDP supported" << std::endl; #if defined(PARALLEL) } #endif // defined(PARALLEL) DoTest<double>( "DP_Sten2D", resultDB, opts ); } else { #if defined(PARALLEL) if( cwrank == 0 ) { #endif // defined(PARALLEL) std::cout << "Double precision not supported - skipping" << std::endl; #if defined(PARALLEL) } #endif // defined(PARALLEL) // resultDB requires neg entry for every possible result int nPasses = (int)opts.getOptionInt( "passes" ); for( int p = 0; p < nPasses; p++ ) { resultDB.AddResult( (const char*)"DP_Sten2D", "N/A", "GFLOPS", FLT_MAX ); } } std::cout << "\n" << std::endl; }
void RunBenchmark( cl::Device& dev, cl::Context& ctx, cl::CommandQueue& queue, ResultDatabase& resultDB, OptionParser& op ) { #if defined(PARALLEL) int cwrank; #endif // defined(PARALLEL) // single precision DoTest<float>( "SP_Sten2D", dev, ctx, queue, resultDB, op, "-DSINGLE_PRECISION" ); // double precision - might not be supported if( checkExtension( dev, "cl_khr_fp64" )) { #if defined(PARALLEL) MPI_Comm_rank( MPI_COMM_WORLD, &cwrank ); if( cwrank == 0 ) { #endif // defined(PARALLEL) std::cout << "\nDP supported\n"; #if defined(PARALLEL) } #endif // defined(PARALLEL) DoTest<double>( "DP_Sten2D", dev, ctx, queue, resultDB, op, "-DK_DOUBLE_PRECISION" ); } else if( checkExtension( dev, "cl_amd_fp64" )) { #if defined(PARALLEL) MPI_Comm_rank( MPI_COMM_WORLD, &cwrank ); if( cwrank == 0 ) { #endif // defined(PARALLEL) std::cout << "\nDP supported\n"; #if defined(PARALLEL) } #endif // defined(PARALLEL) DoTest<double>( "DP_Sten2D", dev, ctx, queue, resultDB, op, "-DAMD_DOUBLE_PRECISION" ); } else { #if defined(PARALLEL) MPI_Comm_rank( MPI_COMM_WORLD, &cwrank ); if( cwrank == 0 ) { #endif // defined(PARALLEL) std::cout << "\nDP not supported\n"; #if defined(PARALLEL) } #endif // defined(PARALLEL) // resultDB requires an entry for every possible result int nPasses = (int)op.getOptionInt( "passes" ); for( unsigned int p = 0; p < nPasses; p++ ) { resultDB.AddResult( (const char*)"DP_Sten2D", "N/A", "GFLOPS", FLT_MAX ); } } std::cout << '\n' << std::endl; }
void runTest(const string& testName, cl_device_id dev, cl_context ctx, cl_command_queue queue, ResultDatabase& resultDB, OptionParser& op, const string& compileFlags) { // Collect basic MPI information int mpi_size, mpi_rank; MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); int err; int waitForEvents = 1; // Program Setup cl_program prog = clCreateProgramWithSource(ctx, 1, &cl_source_reduction, NULL, &err); CL_CHECK_ERROR(err); if (mpi_rank == 0) { cout << "Compiling reduction kernel." << endl; } err = clBuildProgram(prog, 1, &dev, compileFlags.c_str(), NULL, NULL); CL_CHECK_ERROR(err); if (err != 0) { char log[5000]; size_t retsize = 0; err = clGetProgramBuildInfo(prog, dev, CL_PROGRAM_BUILD_LOG, 5000*sizeof(char), log, &retsize); CL_CHECK_ERROR(err); cout << "Build error." << endl; cout << "Retsize: " << retsize << endl; cout << "Log: " << log << endl; return; } // Extract out the kernels cl_kernel reduce = clCreateKernel(prog, "reduce", &err); CL_CHECK_ERROR(err); cl_kernel cpureduce = clCreateKernel(prog, "reduceNoLocal", &err); CL_CHECK_ERROR(err); size_t localWorkSize = 256; bool nolocal = false; if (getMaxWorkGroupSize(ctx, reduce) == 1) { nolocal = true; localWorkSize = 1; } int probSizes[4] = { 1, 8, 64, 128 }; int size = probSizes[op.getOptionInt("size")-1]; size = (size * 1024 * 1024) / sizeof(T); unsigned int bytes = size * sizeof(T); // Allocate pinned host memory for input data cl_mem h_i = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, bytes, NULL, &err); CL_CHECK_ERROR(err); T* h_idata = (T*)clEnqueueMapBuffer(queue, h_i, true, CL_MAP_READ|CL_MAP_WRITE, 0, bytes, 0, NULL, NULL, &err); CL_CHECK_ERROR(err); // Initialize host memory if (mpi_rank == 0) { cout << "Initializing host memory." << endl; } for(int i=0; i<size; i++) { h_idata[i] = i % 2; //Fill with some pattern } // Allocate device memory for input data cl_mem d_idata = clCreateBuffer(ctx, CL_MEM_READ_WRITE, bytes, NULL, &err); CL_CHECK_ERROR(err); int num_blocks; if (!nolocal) { num_blocks = 64; } else { num_blocks = 1; // NB: This should only be the case on Apple's CPU // implementation, which is quite restrictive on // work group sizes. } // Allocate host memory for output cl_mem h_o = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(T)*num_blocks, NULL, &err); CL_CHECK_ERROR(err); T* h_odata = (T*)clEnqueueMapBuffer(queue, h_o, true, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(T) * num_blocks , 0, NULL, NULL, &err); CL_CHECK_ERROR(err); // Allocate device memory for output cl_mem d_odata = clCreateBuffer(ctx, CL_MEM_READ_WRITE, num_blocks * sizeof(T), NULL, &err); CL_CHECK_ERROR(err); // Copy data to GPU Event evTransfer("PCIe Transfer"); err = clEnqueueWriteBuffer(queue, d_idata, true, 0, bytes, h_idata, 0, NULL, &evTransfer.CLEvent()); CL_CHECK_ERROR(err); evTransfer.FillTimingInfo(); double inputTransfer = evTransfer.StartEndRuntime(); err = clSetKernelArg(reduce, 0, sizeof(cl_mem), (void*)&d_idata); CL_CHECK_ERROR(err); err = clSetKernelArg(reduce, 1, sizeof(cl_mem), (void*)&d_odata); CL_CHECK_ERROR(err); err = clSetKernelArg(reduce, 2, localWorkSize * sizeof(T), NULL); CL_CHECK_ERROR(err); err = clSetKernelArg(reduce, 3, sizeof(cl_int), (void*)&size); CL_CHECK_ERROR(err); err = clSetKernelArg(cpureduce, 0, sizeof(cl_mem), (void*)&d_idata); CL_CHECK_ERROR(err); err = clSetKernelArg(cpureduce, 1, sizeof(cl_mem), (void*)&d_odata); CL_CHECK_ERROR(err); err = clSetKernelArg(cpureduce, 2, sizeof(cl_int), (void*)&size); CL_CHECK_ERROR(err); size_t globalWorkSize; if (!nolocal) { globalWorkSize = localWorkSize * 64; // Use 64 work groups } else { globalWorkSize = 1; } int passes = op.getOptionInt("passes"); int iters = op.getOptionInt("iterations"); if (mpi_rank == 0) { cout << "Running benchmark." << endl; } for (int k = 0; k < passes; k++) { // Synch processes at the start of each test. MPI_Barrier(MPI_COMM_WORLD); double totalReduceTime = 0.0; Event evKernel("reduce kernel"); for (int m = 0; m < iters; m++) { if (nolocal) { err = clEnqueueNDRangeKernel(queue, cpureduce, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, &evKernel.CLEvent()); } else { err = clEnqueueNDRangeKernel(queue, reduce, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, &evKernel.CLEvent()); } CL_CHECK_ERROR(err); err = clFinish(queue); CL_CHECK_ERROR (err); evKernel.FillTimingInfo(); totalReduceTime += (evKernel.SubmitEndRuntime() / 1.e9); } err = clEnqueueReadBuffer(queue, d_odata, true, 0, num_blocks*sizeof(T), h_odata, 0, NULL, &evTransfer.CLEvent()); CL_CHECK_ERROR(err); evTransfer.FillTimingInfo(); double totalTransfer = (inputTransfer + evTransfer.StartEndRuntime()) / 1.e9; T local_result = 0.0f, global_result = 0.0f; // Start a wallclock timer for MPI int TH_global = Timer::Start(); // Perform reduction of block sums and MPI allreduce call for (int m = 0; m < iters; m++) { local_result = 0.0f; for (int i=0; i<num_blocks; i++) { local_result += h_odata[i]; } global_result = 0.0f; globalReduction(&local_result, &global_result); } double mpi_time = Timer::Stop(TH_global,"global all reduce") / iters; // Compute local reference solution T cpu_result = reduceCPU<T>(h_idata, size); // Use some error threshold for floating point rounding double threshold = 1.0e-6; T diff = fabs(local_result - cpu_result); if (diff > threshold) { cout << "Error in local reduction detected in rank " << mpi_rank << "\n"; cout << "Diff: " << diff << endl; } if (global_result != (mpi_size * local_result)) { cout << "Test Failed, error in global all reduce detected in rank " << mpi_rank << endl; } else { if (mpi_rank == 0) { cout << "Test Passed.\n"; } } // Calculate results char atts[1024]; sprintf(atts, "%d_itemsPerRank",size); double local_gbytes = (double)(size*sizeof(T))/(1000.*1000.*1000.); double global_gbytes = local_gbytes * mpi_size; totalReduceTime /= iters; // use average time over the iterations resultDB.AddResult(testName+"-Kernel", atts, "GB/s", global_gbytes / totalReduceTime); resultDB.AddResult(testName+"-Kernel+PCIe", atts, "GB/s", global_gbytes / (totalReduceTime + totalTransfer)); resultDB.AddResult(testName+"-MPI_Allreduce", atts, "GB/s", (sizeof(T)*mpi_size*1.e-9) / (mpi_time)); resultDB.AddResult(testName+"-Overall", atts, "GB/s", global_gbytes / (totalReduceTime + totalTransfer + mpi_time)); } err = clEnqueueUnmapMemObject(queue, h_i, h_idata, 0, NULL, NULL); CL_CHECK_ERROR(err); err = clEnqueueUnmapMemObject(queue, h_o, h_odata, 0, NULL, NULL); CL_CHECK_ERROR(err); err = clReleaseMemObject(h_i); CL_CHECK_ERROR(err); err = clReleaseMemObject(h_o); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_idata); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_odata); CL_CHECK_ERROR(err); err = clReleaseProgram(prog); CL_CHECK_ERROR(err); err = clReleaseKernel(reduce); CL_CHECK_ERROR(err); }
void dump(OptionParser& op) { int i, j; void* work; T2* source, * result; unsigned long bytes = 0; if (op.getOptionInt("MB") == 0) { int probSizes[4] = { 1, 8, 96, 256 }; int sizeIndex = op.getOptionInt("size")-1; if (sizeIndex < 0 || sizeIndex >= 4) { cerr << "Invalid size index specified\n"; exit(-1); } bytes = probSizes[sizeIndex]; } else { bytes = op.getOptionInt("MB"); } // Convert to MB bytes *= 1024 * 1024; bool do_dp = dp<T2>(); init(op, do_dp); // now determine how much available memory will be used int half_n_ffts = bytes / (512*sizeof(T2)*2); int n_ffts = half_n_ffts * 2; int half_n_cmplx = half_n_ffts * 512; unsigned long used_bytes = half_n_cmplx * 2 * sizeof(T2); double N = half_n_cmplx*2; fprintf(stderr, "used_bytes=%d, N=%g\n", used_bytes, N); // allocate host and device memory allocHostBuffer((void**)&source, used_bytes); allocHostBuffer((void**)&result, used_bytes); // init host memory... for (i = 0; i < half_n_cmplx; i++) { source[i].x = (rand()/(float)RAND_MAX)*2-1; source[i].y = (rand()/(float)RAND_MAX)*2-1; source[i+half_n_cmplx].x = source[i].x; source[i+half_n_cmplx].y = source[i].y; } // alloc device memory allocDeviceBuffer(&work, used_bytes); copyToDevice(work, source, used_bytes); fprintf(stdout, "INITIAL:\n"); for (i = 0; i < N; i++) { fprintf(stdout, "(%g, %g)\n", source[i].x, source[i].y); } forward(work, n_ffts); copyFromDevice(result, work, used_bytes); fprintf(stdout, "FORWARD:\n"); for (i = 0; i < N; i++) { fprintf(stdout, "(%g, %g)\n", result[i].x, result[i].y); } inverse(work, n_ffts); copyFromDevice(result, work, used_bytes); fprintf(stdout, "\nINVERSE:\n"); for (i = 0; i < N; i++) { fprintf(stdout, "(%g, %g)\n", result[i].x, result[i].y); } freeDeviceBuffer(work); freeHostBuffer(source); freeHostBuffer(result); }
void csrTest(cl_device_id dev, cl_context ctx, string compileFlags, cl_command_queue queue, ResultDatabase& resultDB, OptionParser& op, floatType* h_val, int* h_cols, int* h_rowDelimiters, floatType* h_vec, floatType* h_out, int numRows, int numNonZeroes, floatType* refOut, bool padded, const size_t maxImgWidth) { if (devSupportsImages) { char texflags[64]; sprintf(texflags," -DUSE_TEXTURE -DMAX_IMG_WIDTH=%ld", maxImgWidth); compileFlags+=string(texflags); } // Set up OpenCL Program Object int err = 0; cl_program prog = clCreateProgramWithSource(ctx, 1, &cl_source_spmv, NULL, &err); CL_CHECK_ERROR(err); // Build the openCL kernels err = clBuildProgram(prog, 1, &dev, compileFlags.c_str(), NULL, NULL); // CL_CHECK_ERROR(err); // if we check and fail here, we never get to see // the OpenCL compiler's build log // If there is a build error, print the output and return if (err != CL_SUCCESS) { char log[5000]; size_t retsize = 0; err = clGetProgramBuildInfo(prog, dev, CL_PROGRAM_BUILD_LOG, 5000 * sizeof(char), log, &retsize); CL_CHECK_ERROR(err); cout << "Retsize: " << retsize << endl; cout << "Log: " << log << endl; return; } // Device data structures cl_mem d_val, d_vec, d_out; cl_mem d_cols, d_rowDelimiters; // Allocate device memory d_val = clCreateBuffer(ctx, CL_MEM_READ_WRITE, numNonZeroes * sizeof(clFloatType), NULL, &err); CL_CHECK_ERROR(err); d_cols = clCreateBuffer(ctx, CL_MEM_READ_WRITE, numNonZeroes * sizeof(cl_int), NULL, &err); CL_CHECK_ERROR(err); int imgHeight = 0; if (devSupportsImages) { imgHeight=(numRows+maxImgWidth-1)/maxImgWidth; cl_image_format fmt; fmt.image_channel_data_type=CL_FLOAT; if(sizeof(floatType)==4) fmt.image_channel_order=CL_R; else fmt.image_channel_order=CL_RG; d_vec = clCreateImage2D( ctx, CL_MEM_READ_ONLY, &fmt, maxImgWidth, imgHeight, 0, NULL, &err); CL_CHECK_ERROR(err); } else { d_vec = clCreateBuffer(ctx, CL_MEM_READ_WRITE, numRows * sizeof(clFloatType), NULL, &err); CL_CHECK_ERROR(err); } d_out = clCreateBuffer(ctx, CL_MEM_READ_WRITE, numRows * sizeof(clFloatType), NULL, &err); CL_CHECK_ERROR(err); d_rowDelimiters = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (numRows+1) * sizeof(cl_int), NULL, &err); CL_CHECK_ERROR(err); // Setup events for timing Event valTransfer("transfer Val data over PCIe bus"); Event colsTransfer("transfer cols data over PCIe bus"); Event vecTransfer("transfer vec data over PCIe bus"); Event rowDelimitersTransfer("transfer rowDelimiters data over PCIe bus"); // Transfer data to device err = clEnqueueWriteBuffer(queue, d_val, true, 0, numNonZeroes * sizeof(floatType), h_val, 0, NULL, &valTransfer.CLEvent()); CL_CHECK_ERROR(err); err = clEnqueueWriteBuffer(queue, d_cols, true, 0, numNonZeroes * sizeof(int), h_cols, 0, NULL, &colsTransfer.CLEvent()); CL_CHECK_ERROR(err); if (devSupportsImages) { size_t offset[3]={0}; size_t size[3]={maxImgWidth,(size_t)imgHeight,1}; err = clEnqueueWriteImage(queue,d_vec, true, offset, size, 0, 0, h_vec, 0, NULL, &vecTransfer.CLEvent()); CL_CHECK_ERROR(err); } else { err = clEnqueueWriteBuffer(queue, d_vec, true, 0, numRows * sizeof(floatType), h_vec, 0, NULL, &vecTransfer.CLEvent()); CL_CHECK_ERROR(err); } err = clEnqueueWriteBuffer(queue, d_rowDelimiters, true, 0, (numRows+1) * sizeof(int), h_rowDelimiters, 0, NULL, &rowDelimitersTransfer.CLEvent()); CL_CHECK_ERROR(err); err = clFinish(queue); CL_CHECK_ERROR(err); valTransfer.FillTimingInfo(); colsTransfer.FillTimingInfo(); vecTransfer.FillTimingInfo(); rowDelimitersTransfer.FillTimingInfo(); double iTransferTime = valTransfer.StartEndRuntime() + colsTransfer.StartEndRuntime() + vecTransfer.StartEndRuntime() + rowDelimitersTransfer.StartEndRuntime(); int passes = op.getOptionInt("passes"); int iters = op.getOptionInt("iterations"); // Results description info char atts[TEMP_BUFFER_SIZE]; sprintf(atts, "%d_elements_%d_rows", numNonZeroes, numRows); string prefix = ""; prefix += (padded) ? "Padded_" : ""; double gflop = 2 * (double) numNonZeroes; cout << "CSR Scalar Kernel\n"; Event kernelExec("kernel Execution"); // Set up CSR Kernels cl_kernel csrScalar, csrVector; csrScalar = clCreateKernel(prog, "spmv_csr_scalar_kernel", &err); CL_CHECK_ERROR(err); err = clSetKernelArg(csrScalar, 0, sizeof(cl_mem), (void*) &d_val); CL_CHECK_ERROR(err); err = clSetKernelArg(csrScalar, 1, sizeof(cl_mem), (void*) &d_vec); CL_CHECK_ERROR(err); err = clSetKernelArg(csrScalar, 2, sizeof(cl_mem), (void*) &d_cols); CL_CHECK_ERROR(err); err = clSetKernelArg(csrScalar, 3, sizeof(cl_mem), (void*) &d_rowDelimiters); CL_CHECK_ERROR(err); err = clSetKernelArg(csrScalar, 4, sizeof(cl_int), (void*) &numRows); CL_CHECK_ERROR(err); err = clSetKernelArg(csrScalar, 5, sizeof(cl_mem), (void*) &d_out); CL_CHECK_ERROR(err); csrVector = clCreateKernel(prog, "spmv_csr_vector_kernel", &err); CL_CHECK_ERROR(err); err = clSetKernelArg(csrVector, 0, sizeof(cl_mem), (void*) &d_val); CL_CHECK_ERROR(err); err = clSetKernelArg(csrVector, 1, sizeof(cl_mem), (void*) &d_vec); CL_CHECK_ERROR(err); err = clSetKernelArg(csrVector, 2, sizeof(cl_mem), (void*) &d_cols); CL_CHECK_ERROR(err); err = clSetKernelArg(csrVector, 3, sizeof(cl_mem), (void*) &d_rowDelimiters); CL_CHECK_ERROR(err); err = clSetKernelArg(csrVector, 4, sizeof(cl_int), (void*) &numRows); CL_CHECK_ERROR(err); err = clSetKernelArg(csrVector, 5, sizeof(cl_mem), (void*) &d_out); CL_CHECK_ERROR(err); // Append correct suffix to resultsDB entry string suffix; if (sizeof(floatType) == sizeof(float)) { suffix = "-SP"; } else { suffix = "-DP"; } const size_t scalarGlobalWSize = numRows; size_t localWorkSize = BLOCK_SIZE; for (int k = 0; k < passes; k++) { double scalarKernelTime = 0.0; // Run Scalar Kernel for (int j = 0; j < iters; j++) { err = clEnqueueNDRangeKernel(queue, csrScalar, 1, NULL, &scalarGlobalWSize, &localWorkSize, 0, NULL, &kernelExec.CLEvent()); CL_CHECK_ERROR(err); err = clFinish(queue); CL_CHECK_ERROR(err); kernelExec.FillTimingInfo(); scalarKernelTime += kernelExec.StartEndRuntime(); } // Transfer data back to host Event outTransfer("d->h data transfer"); err = clEnqueueReadBuffer(queue, d_out, true, 0, numRows * sizeof(floatType), h_out, 0, NULL, &outTransfer.CLEvent()); CL_CHECK_ERROR(err); err = clFinish(queue); CL_CHECK_ERROR(err); outTransfer.FillTimingInfo(); double oTransferTime = outTransfer.StartEndRuntime(); // Compare reference solution to GPU result if (! verifyResults(refOut, h_out, numRows, k)) { return; // If results don't match, don't report performance } scalarKernelTime = scalarKernelTime / (double)iters; string testName = prefix+"CSR-Scalar"+suffix; double totalTransfer = iTransferTime + oTransferTime; resultDB.AddResult(testName, atts, "Gflop/s", gflop/(scalarKernelTime)); resultDB.AddResult(testName+"_PCIe", atts, "Gflop/s", gflop / (scalarKernelTime+totalTransfer)); } // Clobber correct answer, so we can be sure the vector kernel is correct err = clEnqueueWriteBuffer(queue, d_out, true, 0, numRows * sizeof(floatType), h_vec, 0, NULL, NULL); CL_CHECK_ERROR(err); cout << "CSR Vector Kernel\n"; // Verify Local work group size size_t maxLocal = getMaxWorkGroupSize(ctx, csrVector); if (maxLocal < 32) { cout << "Warning: CSRVector requires a work group size >= 32" << endl; cout << "Skipping this kernel." << endl; err = clReleaseMemObject(d_rowDelimiters); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_vec); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_out); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_val); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_cols); CL_CHECK_ERROR(err); err = clReleaseKernel(csrScalar); CL_CHECK_ERROR(err); err = clReleaseKernel(csrVector); CL_CHECK_ERROR(err); err = clReleaseProgram(prog); CL_CHECK_ERROR(err); return; } localWorkSize = VECTOR_SIZE; while (localWorkSize+VECTOR_SIZE <= maxLocal && localWorkSize+VECTOR_SIZE <= BLOCK_SIZE) { localWorkSize += VECTOR_SIZE; } const size_t vectorGlobalWSize = numRows * VECTOR_SIZE; // 1 warp per row for (int k = 0; k < passes; k++) { // Run Vector Kernel double vectorKernelTime = 0.0; for (int j = 0; j < iters; j++) { err = clEnqueueNDRangeKernel(queue, csrVector, 1, NULL, &vectorGlobalWSize, &localWorkSize, 0, NULL, &kernelExec.CLEvent()); CL_CHECK_ERROR(err); err = clFinish(queue); CL_CHECK_ERROR(err); kernelExec.FillTimingInfo(); vectorKernelTime += kernelExec.StartEndRuntime(); } Event outTransfer("d->h data transfer"); err = clEnqueueReadBuffer(queue, d_out, true, 0, numRows * sizeof(floatType), h_out, 0, NULL, &outTransfer.CLEvent()); CL_CHECK_ERROR(err); err = clFinish(queue); CL_CHECK_ERROR(err); outTransfer.FillTimingInfo(); double oTransferTime = outTransfer.StartEndRuntime(); // Compare reference solution to GPU result if (! verifyResults(refOut, h_out, numRows, k)) { return; // If results don't match, don't report performance } vectorKernelTime = vectorKernelTime / (double)iters; string testName = prefix+"CSR-Vector"+suffix; double totalTransfer = iTransferTime + oTransferTime; resultDB.AddResult(testName, atts, "Gflop/s", gflop/vectorKernelTime); resultDB.AddResult(testName+"_PCIe", atts, "Gflop/s", gflop/(vectorKernelTime+totalTransfer)); } // Free device memory err = clReleaseMemObject(d_rowDelimiters); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_vec); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_out); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_val); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_cols); CL_CHECK_ERROR(err); err = clReleaseKernel(csrScalar); CL_CHECK_ERROR(err); err = clReleaseKernel(csrVector); CL_CHECK_ERROR(err); err = clReleaseProgram(prog); CL_CHECK_ERROR(err); }
// **************************************************************************** // Method: main() // // Purpose: // serial and parallel main for OpenCL level0 benchmarks // // Arguments: // argc, argv // // Programmer: SHOC Team // Creation: The Epoch // // Modifications: // Jeremy Meredith, Tue Jan 12 15:09:33 EST 2010 // Changed the way device selection works. It now defaults to the device // index corresponding to the process's rank within a node if no devices // are specified on the command command line, and otherwise, round-robins // the list of devices among the tasks. // // Gabriel Marin, Tue Jun 01 15:38 EST 2010 // Check that we have valid (not NULL) context and queue objects before // running the benchmarks. Errors inside CreateContextFromSingleDevice or // CreateCommandQueueForContextAndDevice were not propagated out to the main // program. // // Jeremy Meredith, Wed Nov 10 14:20:47 EST 2010 // Split timing reports into detailed and summary. For serial code, we // report all trial values, and for parallel, skip the per-process vals. // Also detect and print outliers from parallel runs. // // **************************************************************************** int main(int argc, char *argv[]) { int ret = 0; try { #ifdef PARALLEL int rank, size; MPI_Init(&argc,&argv); MPI_Comm_size(MPI_COMM_WORLD, &size); MPI_Comm_rank(MPI_COMM_WORLD, &rank); cout << "MPI Task "<< rank << "/" << size - 1 << " starting....\n"; #endif OptionParser op; //Add shared options to the parser op.addOption("platform", OPT_INT, "0", "specify OpenCL platform to use", 'p'); op.addOption("device", OPT_VECINT, "", "specify device(s) to run on", 'd'); op.addOption("passes", OPT_INT, "10", "specify number of passes", 'n'); op.addOption("size", OPT_VECINT, "1", "specify problem size", 's'); op.addOption("infoDevices", OPT_BOOL, "", "show info for available platforms and devices", 'i'); op.addOption("verbose", OPT_BOOL, "", "enable verbose output", 'v'); op.addOption("quiet", OPT_BOOL, "", "write minimum necessary to standard output", 'q'); addBenchmarkSpecOptions(op); if (!op.parse(argc, argv)) { #ifdef PARALLEL if (rank == 0) op.usage(); MPI_Finalize(); #else op.usage(); #endif return (op.HelpRequested() ? 0 : 1 ); } if (op.getOptionBool("infoDevices")) { #define DEBUG_DEVICE_CONTAINER 0 #ifdef PARALLEL // execute following code only if I am the process of lowest // rank on this node NodeInfo NI; int mynoderank = NI.nodeRank(); if (mynoderank==0) { int nlrrank, nlrsize; MPI_Comm nlrcomm = NI.getNLRComm(); MPI_Comm_size(nlrcomm, &nlrsize); MPI_Comm_rank(nlrcomm, &nlrrank); OpenCLNodePlatformContainer ndc1; OpenCLMultiNodeContainer localMnc(ndc1); localMnc.doMerge (nlrrank, nlrsize, nlrcomm); if (rank==0) // I am the global rank 0, print all configurations localMnc.Print (cout); } #else OpenCLNodePlatformContainer ndc1; ndc1.Print (cout); #if DEBUG_DEVICE_CONTAINER OpenCLMultiNodeContainer mnc1(ndc1), mnc2; mnc1.Print (cout); ostringstream oss; mnc1.writeObject (oss); std::string temp(oss.str()); cout << "Serialized MultiNodeContainer:\n" << temp; istringstream iss(temp); mnc2.readObject (iss); cout << "Unserialized object2:\n"; mnc2.Print (cout); mnc1.merge (mnc2); cout << "==============\nObject1 after merging 1:\n"; mnc1.Print (cout); mnc1.merge (mnc2); cout << "==============\nObject1 after merging 2:\n"; mnc1.Print (cout); #endif // DEBUG #endif // PARALLEL return (0); } bool verbose = op.getOptionBool("verbose"); // The device option supports specifying more than one device // for now, just choose the first one. int platform = op.getOptionInt("platform"); #ifdef PARALLEL NodeInfo ni; int myNodeRank = ni.nodeRank(); if (verbose) cout << "Global rank "<<rank<<" is local rank "<<myNodeRank << endl; #else int myNodeRank = 0; #endif // If they haven't specified any devices, assume they // want the process with in-node rank N to use device N int deviceIdx = myNodeRank; // If they have, then round-robin the list of devices // among the processes on a node. vector<long long> deviceVec = op.getOptionVecInt("device"); if (deviceVec.size() > 0) { int len = deviceVec.size(); deviceIdx = deviceVec[myNodeRank % len]; } // Check for an erroneous device if (deviceIdx >= GetNumOclDevices(platform)) { cerr << "Warning: device index: " << deviceIdx << " out of range, defaulting to device 0.\n"; deviceIdx = 0; } // Initialization if (verbose) cout << ">> initializing\n"; cl_device_id devID = ListDevicesAndGetDevice(platform, deviceIdx); cl_int clErr; cl_context ctx = clCreateContext( NULL, // properties 1, // number of devices &devID, // device NULL, // notification function NULL, &clErr ); CL_CHECK_ERROR(clErr); cl_command_queue queue = clCreateCommandQueue( ctx, devID, CL_QUEUE_PROFILING_ENABLE, &clErr ); CL_CHECK_ERROR(clErr); ResultDatabase resultDB; // Run the benchmark RunBenchmark(devID, ctx, queue, resultDB, op); clReleaseCommandQueue( queue ); clReleaseContext( ctx ); #ifndef PARALLEL resultDB.DumpDetailed(cout); #else ParallelResultDatabase pardb; pardb.MergeSerialDatabases(resultDB,MPI_COMM_WORLD); if (rank==0) { pardb.DumpSummary(cout); pardb.DumpOutliers(cout); } #endif } catch( std::exception& e ) { std::cerr << e.what() << std::endl; ret = 1; } catch( ... ) { std::cerr << "unrecognized exception caught" << std::endl; ret = 1; } #ifdef PARALLEL MPI_Finalize(); #endif return ret; }
void RunTest(string testName, ResultDatabase &resultDB, OptionParser &op) { int probSizes[4] = { 1, 8, 48, 96 }; int size = probSizes[op.getOptionInt("size")-1]; // Convert to MiB size = (size*1024*1024)/sizeof(T); // Create input data on CPU unsigned int bytes = size * sizeof(T); // Allocate Host Memory __declspec(target(MIC)) static T *hkey, *outkey; __declspec(target(MIC)) static T *hvalue, *outvalue; hkey = (T*)_mm_malloc(bytes,ALIGN); hvalue = (T*)_mm_malloc(bytes,ALIGN); outkey = (T*)_mm_malloc(bytes,ALIGN); outvalue = (T*)_mm_malloc(bytes,ALIGN); // Initialize host memory cout << "Initializing host memory." << endl; srand(time(NULL)); for (int i = 0; i < size; i++) { hkey[i] = hvalue[i]= (i+255) % 1089; // Fill with some pattern } int micdev = op.getOptionInt("target"); int iters = op.getOptionInt("passes"); int numThreads = op.getOptionInt("nthreads"); cout << "nthreads = " <<numThreads<< endl; cout << "Running benchmark" << endl; for(int it=0;it<iters;it++) { // Allocating buffer on card #pragma offload target(mic:micdev) in(hkey:length(size) free_if(0)) \ in(hvalue:length(size) free_if(0))\ out(outkey:length(size) free_if(0))\ out(outvalue:length(size) free_if(0)) { } double start = curr_second(); // Get data transfer time #pragma offload target(mic:micdev) in(hkey:length(size) alloc_if(0) \ free_if(0)) in(hvalue:length(size) alloc_if(0) free_if(0)) { } float transferTime = curr_second()-start; double totalRunTime = 0.0f; start = curr_second(); #pragma offload target(mic:micdev) nocopy(hkey:length(size) \ alloc_if(0) free_if(0)) \ nocopy(hvalue:length(size) alloc_if(0) free_if(0)) \ nocopy(outkey:length(size) alloc_if(0) free_if(0)) \ nocopy(outvalue:length(size) alloc_if(0) free_if(0))\ in(numThreads) { sortKernel<T>(hkey, hvalue, outkey, outvalue, size, numThreads); } totalRunTime = curr_second()-start; #pragma offload target(mic:micdev) nocopy(hkey:length(size) \ alloc_if(0) free_if(1)) \ nocopy(hvalue:length(size) alloc_if(0) free_if(1)) \ out(outkey:length(size) alloc_if(0)) \ out(outvalue:length(size) alloc_if(0)) { } // If results aren't correct, don't report perf numbers if (!verifyResult<T>(outkey, outvalue, size)) { return; } char atts[1024]; double avgTime = (totalRunTime / (double) iters); sprintf(atts, "%d items", size); double gb = (double)(size * sizeof(T)) / (1000. * 1000. * 1000.); resultDB.AddResult(testName, atts, "GB/s", gb / avgTime); resultDB.AddResult(testName+"_PCIe", atts, "GB/s", gb / (avgTime + transferTime)); resultDB.AddResult(testName+"_Parity", atts, "N", transferTime / avgTime); } // Clean up _mm_free(hkey); _mm_free(hvalue); }
void RunTest(const string& testName, cl_device_id dev, cl_context ctx, cl_command_queue queue, ResultDatabase &resultDB, OptionParser &op, string& compileFlags) { int n_species = 22; int i, j, err; int probSizes_SP[4] = { 24, 32, 40, 48}; int probSizes_DP[4] = { 16, 24, 32, 40}; int *probSizes = (sizeof(T) == sizeof(double)) ? probSizes_DP : probSizes_SP; int size = probSizes[op.getOptionInt("size")-1]; // The number of grid points int n = size * size * size; // For now these conversion factors are just 1 T pconv = 1.0; // 1418365.88544; T tconv = 1.0; //120.0; T rateconv = 1.0; //11.0393507649917; // Host copies of data T* h_t = new T[n]; T* h_p = new T[n]; T* h_y = new T[n*n_species]; T* h_wdot = new T[n*n_species]; T* h_molwt = new T[n_species]; // Device data cl_mem d_t; // Temperatures array cl_mem d_p; // Pressures array cl_mem d_y; // Input variables cl_mem d_wdot; // Output variables // intermediate variables cl_mem d_rf, d_rb, d_rklow, d_c, d_a, d_eg, d_molwt; // Initialize host memory for (i=0; i<n; i++) { h_p[i] = 1.0132e6; h_t[i] = 1000.0; } for (j=0; j<22; j++) { for (i=0; i<n; i++) { h_y[(j*n)+i]= 0.0; if (j==14) h_y[(j*n)+i] = 0.064; if (j==3) h_y[(j*n)+i] = 0.218; if (j==21) h_y[(j*n)+i] = 0.718; } } for (int i=0; i<n_species; i++) { h_molwt[i] = 1.0f; } // // Initialize molecular weights // h_molwt[0]= 2.01594E-03; // h_molwt[1]= 1.00797E-03; // h_molwt[2]= 1.59994E-02; // h_molwt[3]= 3.19988E-02; // h_molwt[4]= 1.700737E-02; // h_molwt[5]= 1.801534E-02; // h_molwt[6]= 3.300677E-02; // h_molwt[7]= 3.401473999999999E-02; // h_molwt[8]= 1.503506E-02; // h_molwt[9] = 1.604303E-02; // h_molwt[10] = 2.801055E-02; // h_molwt[11] = 4.400995E-02; // h_molwt[12] = 3.002649E-02; // h_molwt[13] = 2.603824E-02; // h_molwt[14] = 2.805418E-02; // h_molwt[15] = 3.007012E-02; // h_molwt[16] = 4.102967E-02; // h_molwt[17] = 4.203764E-02; // h_molwt[18] = 4.405358E-02; // h_molwt[19] = 4.10733E-02; // h_molwt[20] = 4.208127E-02; // h_molwt[21] = 2.80134E-02; // Allocate device memory size_t base = n * sizeof(T); clMalloc(d_t, base); clMalloc(d_p, base); clMalloc(d_y, n_species*base); clMalloc(d_wdot, n_species*base); clMalloc(d_rf, 206*base); clMalloc(d_rb, 206*base); clMalloc(d_rklow, 21*base); clMalloc(d_c, C_SIZE*base); clMalloc(d_a, A_SIZE*base); clMalloc(d_eg, EG_SIZE*base); clMalloc(d_molwt, n_species*sizeof(T)); // Copy over input params long inputTransferTime = 0; Event evTransfer("PCIe Transfer"); clMemtoDevice(d_t, h_t, base); evTransfer.FillTimingInfo(); inputTransferTime += evTransfer.StartEndRuntime(); clMemtoDevice(d_p, h_p, base); evTransfer.FillTimingInfo(); inputTransferTime += evTransfer.StartEndRuntime(); clMemtoDevice(d_y, h_y, n_species*base); evTransfer.FillTimingInfo(); inputTransferTime += evTransfer.StartEndRuntime(); clMemtoDevice(d_molwt, h_molwt, n_species*sizeof(T)); evTransfer.FillTimingInfo(); inputTransferTime += evTransfer.StartEndRuntime(); // Set up macros compileFlags += "-DDIM=" + toString(size) + " " + "-DN_GP=" + toString(n) + " "; unsigned int passes = op.getOptionInt("passes"); for (unsigned int i = 0; i < passes; i++) { size_t globalWorkSize = n; size_t localWorkSize = 128; // -------------------- phase 1 ----------------- // Setup Program Objects (phase 1) clProg(gr_prog, cl_source_gr_base); clProg(rdsmh_prog, cl_source_rdsmh); clProg(ratt_prog, cl_source_ratt); clProg(ratt2_prog, cl_source_ratt2); clProg(ratt3_prog, cl_source_ratt3); clProg(ratt4_prog, cl_source_ratt4); clProg(ratt5_prog, cl_source_ratt5); clProg(ratt6_prog, cl_source_ratt6); clProg(ratt7_prog, cl_source_ratt7); clProg(ratt8_prog, cl_source_ratt8); clProg(ratt9_prog, cl_source_ratt9); clProg(ratt10_prog, cl_source_ratt10); clProg(ratx_prog, cl_source_ratx); clProg(ratxb_prog, cl_source_ratxb); clProg(ratx2_prog, cl_source_ratx2); clProg(ratx4_prog, cl_source_ratx4); // Build the kernels (phase 1) cout << "Compiling kernels (phase 1)..."; cout.flush(); clBuild(gr_prog); clBuild(rdsmh_prog); clBuild(ratt_prog); clBuild(ratt2_prog); clBuild(ratt3_prog); clBuild(ratt4_prog); clBuild(ratt5_prog); clBuild(ratt6_prog); clBuild(ratt7_prog); clBuild(ratt8_prog); clBuild(ratt9_prog); clBuild(ratt10_prog); clBuild(ratx_prog); clBuild(ratxb_prog); clBuild(ratx2_prog); clBuild(ratx4_prog); cout << "done." << endl; // Extract out kernel objects (phase 1) cout << "Generating OpenCL Kernel Objects (phase 1)..."; cout.flush(); // GR Base Kernels cl_kernel grBase_kernel = clCreateKernel(gr_prog, "gr_base", &err); CL_CHECK_ERROR(err); // RDSMH Kernels cl_kernel rdsmh_kernel = clCreateKernel(rdsmh_prog, "rdsmh_kernel", &err); CL_CHECK_ERROR(err); // RATT Kernels cl_kernel ratt_kernel = clCreateKernel(ratt_prog, "ratt_kernel", &err); CL_CHECK_ERROR(err); cl_kernel ratt2_kernel = clCreateKernel(ratt2_prog, "ratt2_kernel", &err); CL_CHECK_ERROR(err); cl_kernel ratt3_kernel = clCreateKernel(ratt3_prog, "ratt3_kernel", &err); CL_CHECK_ERROR(err); cl_kernel ratt4_kernel = clCreateKernel(ratt4_prog, "ratt4_kernel", &err); CL_CHECK_ERROR(err); cl_kernel ratt5_kernel = clCreateKernel(ratt5_prog, "ratt5_kernel", &err); CL_CHECK_ERROR(err); cl_kernel ratt6_kernel = clCreateKernel(ratt6_prog, "ratt6_kernel", &err); CL_CHECK_ERROR(err); cl_kernel ratt7_kernel = clCreateKernel(ratt7_prog, "ratt7_kernel", &err); CL_CHECK_ERROR(err); cl_kernel ratt8_kernel = clCreateKernel(ratt8_prog, "ratt8_kernel", &err); CL_CHECK_ERROR(err); cl_kernel ratt9_kernel = clCreateKernel(ratt9_prog, "ratt9_kernel", &err); CL_CHECK_ERROR(err); cl_kernel ratt10_kernel = clCreateKernel(ratt10_prog, "ratt10_kernel", &err); CL_CHECK_ERROR(err); // RATX Kernels cl_kernel ratx_kernel = clCreateKernel(ratx_prog, "ratx_kernel", &err); CL_CHECK_ERROR(err); cl_kernel ratxb_kernel = clCreateKernel(ratxb_prog, "ratxb_kernel", &err); CL_CHECK_ERROR(err); cl_kernel ratx2_kernel = clCreateKernel(ratx2_prog, "ratx2_kernel", &err); CL_CHECK_ERROR(err); cl_kernel ratx4_kernel = clCreateKernel(ratx4_prog, "ratx4_kernel", &err); CL_CHECK_ERROR(err); cout << "done." << endl; //Set kernel arguments (phase 1) err = clSetKernelArg(grBase_kernel, 0, sizeof(cl_mem), (void*)&d_p); CL_CHECK_ERROR(err); err = clSetKernelArg(grBase_kernel, 1, sizeof(cl_mem), (void*)&d_t); CL_CHECK_ERROR(err); err = clSetKernelArg(grBase_kernel, 2, sizeof(cl_mem), (void*)&d_y); CL_CHECK_ERROR(err); err = clSetKernelArg(grBase_kernel, 3, sizeof(cl_mem), (void*)&d_c); CL_CHECK_ERROR(err); err = clSetKernelArg(grBase_kernel, 4, sizeof(T), (void*)&tconv); CL_CHECK_ERROR(err); err = clSetKernelArg(grBase_kernel, 5, sizeof(T), (void*)&pconv); CL_CHECK_ERROR(err); err = clSetKernelArg(rdsmh_kernel, 0, sizeof(cl_mem), (void*)&d_t); CL_CHECK_ERROR(err); err = clSetKernelArg(rdsmh_kernel, 1, sizeof(cl_mem), (void*)&d_eg); CL_CHECK_ERROR(err); err = clSetKernelArg(rdsmh_kernel, 2, sizeof(T), (void*)&tconv); CL_CHECK_ERROR(err); err = clSetKernelArg(ratt_kernel, 0, sizeof(cl_mem), (void*)&d_t); CL_CHECK_ERROR(err); err = clSetKernelArg(ratt_kernel, 1, sizeof(cl_mem), (void*)&d_rf); CL_CHECK_ERROR(err); err = clSetKernelArg(ratt_kernel, 2, sizeof(T), (void*)&tconv); CL_CHECK_ERROR(err); clSetRattArg(ratt2_kernel); clSetRattArg(ratt3_kernel); clSetRattArg(ratt4_kernel); clSetRattArg(ratt5_kernel); clSetRattArg(ratt6_kernel); clSetRattArg(ratt7_kernel); clSetRattArg(ratt8_kernel); clSetRattArg(ratt9_kernel); err = clSetKernelArg(ratt10_kernel, 0, sizeof(cl_mem), (void*)&d_t); CL_CHECK_ERROR(err); err = clSetKernelArg(ratt10_kernel, 1, sizeof(cl_mem), (void*)&d_rklow); CL_CHECK_ERROR(err); err = clSetKernelArg(ratt10_kernel, 2, sizeof(T), (void*)&tconv); CL_CHECK_ERROR(err); clSetRatxArg(ratx_kernel); clSetRatxArg(ratxb_kernel); err = clSetKernelArg(ratx2_kernel, 0, sizeof(cl_mem), (void*)&d_c); CL_CHECK_ERROR(err); err = clSetKernelArg(ratx2_kernel, 1, sizeof(cl_mem), (void*)&d_rf); CL_CHECK_ERROR(err); err = clSetKernelArg(ratx4_kernel, 0, sizeof(cl_mem), (void*)&d_c); CL_CHECK_ERROR(err); err = clSetKernelArg(ratx4_kernel, 1, sizeof(cl_mem), (void*)&d_rb); CL_CHECK_ERROR(err); // Execute kernels (phase 1) cout << "Executing kernels (phase 1)..."; cout.flush(); Event evFirst_1("first kernel phase 1"); Event evLast_1("last kernel phase 1"); clLaunchKernelEv(grBase_kernel, evFirst_1.CLEvent()); clLaunchKernel(ratt_kernel); clLaunchKernel(rdsmh_kernel); clLaunchKernel(ratt2_kernel); clLaunchKernel(ratt3_kernel); clLaunchKernel(ratt4_kernel); clLaunchKernel(ratt5_kernel); clLaunchKernel(ratt6_kernel); clLaunchKernel(ratt7_kernel); clLaunchKernel(ratt8_kernel); clLaunchKernel(ratt9_kernel); clLaunchKernel(ratt10_kernel); clLaunchKernel(ratx_kernel); clLaunchKernel(ratxb_kernel); clLaunchKernel(ratx2_kernel); clLaunchKernelEv(ratx4_kernel, evLast_1.CLEvent()); err = clFinish(queue); CL_CHECK_ERROR(err); cout << "done. " << endl; evFirst_1.FillTimingInfo(); evLast_1.FillTimingInfo(); double total_phase1 = evLast_1.EndTime() - evFirst_1.StartTime(); // Release Kernels (phase 1) clReleaseKernel(grBase_kernel); clReleaseKernel(rdsmh_kernel); clReleaseKernel(ratt_kernel); clReleaseKernel(ratt2_kernel); clReleaseKernel(ratt3_kernel); clReleaseKernel(ratt4_kernel); clReleaseKernel(ratt5_kernel); clReleaseKernel(ratt6_kernel); clReleaseKernel(ratt7_kernel); clReleaseKernel(ratt8_kernel); clReleaseKernel(ratt9_kernel); clReleaseKernel(ratt10_kernel); clReleaseKernel(ratx_kernel); clReleaseKernel(ratxb_kernel); clReleaseKernel(ratx2_kernel); clReleaseKernel(ratx4_kernel); // Release Programs (phase 1) clReleaseProgram(gr_prog); clReleaseProgram(rdsmh_prog); clReleaseProgram(ratt_prog); clReleaseProgram(ratt2_prog); clReleaseProgram(ratt3_prog); clReleaseProgram(ratt4_prog); clReleaseProgram(ratt5_prog); clReleaseProgram(ratt6_prog); clReleaseProgram(ratt7_prog); clReleaseProgram(ratt8_prog); clReleaseProgram(ratt9_prog); clReleaseProgram(ratt10_prog); clReleaseProgram(ratx_prog); clReleaseProgram(ratxb_prog); clReleaseProgram(ratx2_prog); clReleaseProgram(ratx4_prog); // -------------------- phase 2 ----------------- // Setup Program Objects (phase 2) clProg(qssa_prog, cl_source_qssa); clProg(qssab_prog, cl_source_qssab); clProg(qssa2_prog, cl_source_qssa2); clProg(rdwdot_prog, cl_source_rdwdot); clProg(rdwdot2_prog, cl_source_rdwdot2); clProg(rdwdot3_prog, cl_source_rdwdot3); clProg(rdwdot6_prog, cl_source_rdwdot6); clProg(rdwdot7_prog, cl_source_rdwdot7); clProg(rdwdot8_prog, cl_source_rdwdot8); clProg(rdwdot9_prog, cl_source_rdwdot9); clProg(rdwdot10_prog, cl_source_rdwdot10); // Build the kernels (phase 2) cout << "Compiling kernels (phase 2)..."; cout.flush(); clBuild(qssa_prog); clBuild(qssab_prog); clBuild(qssa2_prog); clBuild(rdwdot_prog); clBuild(rdwdot2_prog); clBuild(rdwdot3_prog); clBuild(rdwdot6_prog); clBuild(rdwdot7_prog); clBuild(rdwdot8_prog); clBuild(rdwdot9_prog); clBuild(rdwdot10_prog); cout << "done." << endl; // Extract out kernel objects (phase 2) cout << "Generating OpenCL Kernel Objects (phase 2)..."; cout.flush(); // QSSA Kernels cl_kernel qssa_kernel = clCreateKernel(qssa_prog, "qssa_kernel", &err); CL_CHECK_ERROR(err); cl_kernel qssab_kernel = clCreateKernel(qssab_prog, "qssab_kernel", &err); CL_CHECK_ERROR(err); cl_kernel qssa2_kernel = clCreateKernel(qssa2_prog, "qssa2_kernel", &err); CL_CHECK_ERROR(err); // RDWDOT Kernels cl_kernel rdwdot_kernel = clCreateKernel(rdwdot_prog, "rdwdot_kernel", &err); CL_CHECK_ERROR(err); cl_kernel rdwdot2_kernel = clCreateKernel(rdwdot2_prog, "rdwdot2_kernel", &err); CL_CHECK_ERROR(err); cl_kernel rdwdot3_kernel = clCreateKernel(rdwdot3_prog, "rdwdot3_kernel", &err); CL_CHECK_ERROR(err); cl_kernel rdwdot6_kernel = clCreateKernel(rdwdot6_prog, "rdwdot6_kernel", &err); CL_CHECK_ERROR(err); cl_kernel rdwdot7_kernel = clCreateKernel(rdwdot7_prog, "rdwdot7_kernel", &err); CL_CHECK_ERROR(err); cl_kernel rdwdot8_kernel = clCreateKernel(rdwdot8_prog, "rdwdot8_kernel", &err); CL_CHECK_ERROR(err); cl_kernel rdwdot9_kernel = clCreateKernel(rdwdot9_prog, "rdwdot9_kernel", &err); CL_CHECK_ERROR(err); cl_kernel rdwdot10_kernel = clCreateKernel(rdwdot10_prog, "rdwdot10_kernel", &err); CL_CHECK_ERROR(err); cout << "done." << endl; //Set kernel arguments (phase 2) clSetQssaArg(qssa_kernel); clSetQssaArg(qssab_kernel); clSetQssaArg(qssa2_kernel); clSetRdwdotArg(rdwdot_kernel); clSetRdwdotArg(rdwdot2_kernel); clSetRdwdotArg(rdwdot3_kernel); clSetRdwdotArg(rdwdot6_kernel); clSetRdwdotArg(rdwdot7_kernel); clSetRdwdotArg(rdwdot8_kernel); clSetRdwdotArg(rdwdot9_kernel); clSetRdwdotArg(rdwdot10_kernel); // Execute kernels (phase 2) cout << "Executing kernels (phase 2)..."; cout.flush(); Event evFirst_2("first kernel phase 2"); Event evLast_2("last kernel phase 2"); clLaunchKernelEv(qssa_kernel, evFirst_2.CLEvent()); clLaunchKernel(qssab_kernel); clLaunchKernel(qssa2_kernel); clLaunchKernel(rdwdot_kernel); clLaunchKernel(rdwdot2_kernel); clLaunchKernel(rdwdot3_kernel); clLaunchKernel(rdwdot6_kernel); clLaunchKernel(rdwdot7_kernel); clLaunchKernel(rdwdot8_kernel); clLaunchKernel(rdwdot9_kernel); clLaunchKernelEv(rdwdot10_kernel, evLast_2.CLEvent()); err = clFinish(queue); CL_CHECK_ERROR(err); cout << "done. " << endl; evFirst_2.FillTimingInfo(); evLast_2.FillTimingInfo(); double total_phase2 = evLast_2.EndTime() - evFirst_2.StartTime(); // Release Kernels (phase 2) clReleaseKernel(qssa_kernel); clReleaseKernel(qssab_kernel); clReleaseKernel(qssa2_kernel); clReleaseKernel(rdwdot_kernel); clReleaseKernel(rdwdot2_kernel); clReleaseKernel(rdwdot3_kernel); clReleaseKernel(rdwdot6_kernel); clReleaseKernel(rdwdot7_kernel); clReleaseKernel(rdwdot8_kernel); clReleaseKernel(rdwdot9_kernel); clReleaseKernel(rdwdot10_kernel); // Release Programs (phase 2) clReleaseProgram(qssa_prog); clReleaseProgram(qssab_prog); clReleaseProgram(qssa2_prog); clReleaseProgram(rdwdot_prog); clReleaseProgram(rdwdot2_prog); clReleaseProgram(rdwdot3_prog); clReleaseProgram(rdwdot6_prog); clReleaseProgram(rdwdot7_prog); clReleaseProgram(rdwdot8_prog); clReleaseProgram(rdwdot9_prog); clReleaseProgram(rdwdot10_prog); // -------------------- timings ----------------- double total = total_phase1 + total_phase2; // Estimate GFLOPs (roughly 10k flops / point) double gflops = (n*10000.) / total; // Copy results back err = clEnqueueReadBuffer(queue, d_wdot, true, 0, n*n_species*sizeof(T), h_wdot, 0, NULL, &evTransfer.CLEvent()); CL_CHECK_ERROR(err); err = clFinish(queue); CL_CHECK_ERROR(err); evTransfer.FillTimingInfo(); double totalTransferTime = inputTransferTime + evTransfer.StartEndRuntime(); double gflops_pcie = (n*10000.) / (total + totalTransferTime); resultDB.AddResult(testName, "cubic", "GFLOPS", gflops); resultDB.AddResult(testName+"_PCIe", "cubic", "GFLOPS", gflops_pcie); resultDB.AddResult(testName+"_Parity", "cubic", "n", totalTransferTime / total ); } // Release Memory err = clReleaseMemObject(d_t); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_p); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_y); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_wdot); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_rf); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_rb); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_c); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_eg); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_rklow); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_a); CL_CHECK_ERROR(err); // Cleanup Host Memory Objects delete[] h_t; delete[] h_p; delete[] h_y; delete[] h_wdot; delete[] h_molwt; }
void runTest(const string& testName, cl_device_id dev, cl_context ctx, cl_command_queue queue, ResultDatabase& resultDB, OptionParser& op, const string& compileFlags) { int err = 0; // Program Setup cl_program prog = clCreateProgramWithSource(ctx, 1, &cl_source_sort, NULL, &err); CL_CHECK_ERROR(err); // Before proceeding, make sure the kernel code compiles and // all kernels are valid. cout << "Compiling sort kernels." << endl; err = clBuildProgram(prog, 1, &dev, compileFlags.c_str(), NULL, NULL); CL_CHECK_ERROR(err); if (err != CL_SUCCESS) { char log[5000]; size_t retsize = 0; err = clGetProgramBuildInfo(prog, dev, CL_PROGRAM_BUILD_LOG, 5000 * sizeof(char), log, &retsize); CL_CHECK_ERROR(err); cout << "Build error." << endl; cout << "Retsize: " << retsize << endl; cout << "Log: " << log << endl; return; } // Extract out the 3 kernels // Note that these kernels are analogs of those in use for // scan, but have had "visiting" logic added to them // as described by Merrill et al. See // http://www.cs.virginia.edu/~dgm4d/ cl_kernel reduce = clCreateKernel(prog, "reduce", &err); CL_CHECK_ERROR(err); cl_kernel top_scan = clCreateKernel(prog, "top_scan", &err); CL_CHECK_ERROR(err); cl_kernel bottom_scan = clCreateKernel(prog, "bottom_scan", &err); CL_CHECK_ERROR(err); // If the device doesn't support at least 256 work items in a // group, use a different kernel (TODO) if (getMaxWorkGroupSize(dev) < 256) { cout << "Scan requires work group size of at least 256" << endl; char atts[1024] = "GSize_Not_Supported"; // resultDB requires neg entry for every possible result int passes = op.getOptionInt("passes"); for (int k = 0; k < passes; k++) { resultDB.AddResult(testName , atts, "GB/s", FLT_MAX); resultDB.AddResult(testName+"_PCIe" , atts, "GB/s", FLT_MAX); resultDB.AddResult(testName+"_Parity" , atts, "GB/s", FLT_MAX); } return; } // Problem Sizes int probSizes[4] = { 1, 8, 32, 64 }; int size = probSizes[op.getOptionInt("size")-1]; // Convert to MiB size = (size * 1024 * 1024) / sizeof(T); // Create input data on CPU unsigned int bytes = size * sizeof(T); // Allocate pinned host memory for input data (h_idata) cl_mem h_i = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, bytes, NULL, &err); CL_CHECK_ERROR(err); T* h_idata = (T*)clEnqueueMapBuffer(queue, h_i, true, CL_MAP_READ|CL_MAP_WRITE, 0, bytes, 0, NULL, NULL, &err); CL_CHECK_ERROR(err); // Allocate pinned host memory for output data (h_odata) cl_mem h_o = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, bytes, NULL, &err); CL_CHECK_ERROR(err); T* h_odata = (T*)clEnqueueMapBuffer(queue, h_o, true, CL_MAP_READ|CL_MAP_WRITE, 0, bytes, 0, NULL, NULL, &err); CL_CHECK_ERROR(err); // Initialize host memory cout << "Initializing host memory." << endl; for (int i = 0; i < size; i++) { h_idata[i] = i % 16; // Fill with some pattern h_odata[i] = -1; } // The radix width in bits const int radix_width = 4; // Changing this requires major kernel updates const int num_digits = (int)pow((double)2, radix_width); // n possible digits // Allocate device memory for input array cl_mem d_idata = clCreateBuffer(ctx, CL_MEM_READ_WRITE, bytes, NULL, &err); CL_CHECK_ERROR(err); // Allocate device memory for output array cl_mem d_odata = clCreateBuffer(ctx, CL_MEM_READ_WRITE, bytes, NULL, &err); CL_CHECK_ERROR(err); // Number of local work items per group const size_t local_wsize = 256; // Number of global work items const size_t global_wsize = 16384; // i.e. 64 work groups const size_t num_work_groups = global_wsize / local_wsize; // Allocate device memory for local work group intermediate sums cl_mem d_isums = clCreateBuffer(ctx, CL_MEM_READ_WRITE, num_work_groups * num_digits * sizeof(T), NULL, &err); CL_CHECK_ERROR(err); // Set the kernel arguments for the reduction kernel err = clSetKernelArg(reduce, 0, sizeof(cl_mem), (void*)&d_idata); CL_CHECK_ERROR(err); err = clSetKernelArg(reduce, 1, sizeof(cl_mem), (void*)&d_isums); CL_CHECK_ERROR(err); err = clSetKernelArg(reduce, 2, sizeof(cl_int), (void*)&size); CL_CHECK_ERROR(err); err = clSetKernelArg(reduce, 3, local_wsize * sizeof(T), NULL); CL_CHECK_ERROR(err); // Set the kernel arguments for the top-level scan err = clSetKernelArg(top_scan, 0, sizeof(cl_mem), (void*)&d_isums); CL_CHECK_ERROR(err); err = clSetKernelArg(top_scan, 1, sizeof(cl_int), (void*)&num_work_groups); CL_CHECK_ERROR(err); err = clSetKernelArg(top_scan, 2, local_wsize * 2 * sizeof(T), NULL); CL_CHECK_ERROR(err); // Set the kernel arguments for the bottom-level scan err = clSetKernelArg(bottom_scan, 0, sizeof(cl_mem), (void*)&d_idata); CL_CHECK_ERROR(err); err = clSetKernelArg(bottom_scan, 1, sizeof(cl_mem), (void*)&d_isums); CL_CHECK_ERROR(err); err = clSetKernelArg(bottom_scan, 2, sizeof(cl_mem), (void*)&d_odata); CL_CHECK_ERROR(err); err = clSetKernelArg(bottom_scan, 3, sizeof(cl_int), (void*)&size); CL_CHECK_ERROR(err); err = clSetKernelArg(bottom_scan, 4, local_wsize * 2 * sizeof(T), NULL); CL_CHECK_ERROR(err); // Copy data to GPU cout << "Copying input data to device." << endl; Event evTransfer("PCIe transfer"); err = clEnqueueWriteBuffer(queue, d_idata, true, 0, bytes, h_idata, 0, NULL, &evTransfer.CLEvent()); CL_CHECK_ERROR(err); err = clFinish(queue); CL_CHECK_ERROR(err); evTransfer.FillTimingInfo(); double inTransferTime = evTransfer.StartEndRuntime(); // Repeat the test multiplie times to get a good measurement int passes = op.getOptionInt("passes"); cout << "Running benchmark with size " << size << endl; for (int k = 0; k < passes; k++) { int th = Timer::Start(); // Assuming an 8 bit byte. for (int shift = 0; shift < sizeof(T)*8; shift += radix_width) { // Like scan, we use a reduce-then-scan approach // But before proceeding, update the shift appropriately // for each kernel. This is how many bits to shift to the // right used in binning. err = clSetKernelArg(reduce, 4, sizeof(cl_int), (void*)&shift); CL_CHECK_ERROR(err); err = clSetKernelArg(bottom_scan, 5, sizeof(cl_int), (void*)&shift); CL_CHECK_ERROR(err); // Also, the sort is not in place, so swap the input and output // buffers on each pass. bool even = ((shift / radix_width) % 2 == 0) ? true : false; if (even) { // Set the kernel arguments for the reduction kernel err = clSetKernelArg(reduce, 0, sizeof(cl_mem), (void*)&d_idata); CL_CHECK_ERROR(err); // Set the kernel arguments for the bottom-level scan err = clSetKernelArg(bottom_scan, 0, sizeof(cl_mem), (void*)&d_idata); CL_CHECK_ERROR(err); err = clSetKernelArg(bottom_scan, 2, sizeof(cl_mem), (void*)&d_odata); CL_CHECK_ERROR(err); } else // i.e. odd pass { // Set the kernel arguments for the reduction kernel err = clSetKernelArg(reduce, 0, sizeof(cl_mem), (void*)&d_odata); CL_CHECK_ERROR(err); // Set the kernel arguments for the bottom-level scan err = clSetKernelArg(bottom_scan, 0, sizeof(cl_mem), (void*)&d_odata); CL_CHECK_ERROR(err); err = clSetKernelArg(bottom_scan, 2, sizeof(cl_mem), (void*)&d_idata); CL_CHECK_ERROR(err); } // Each thread block gets an equal portion of the // input array, and computes occurrences of each digit. err = clEnqueueNDRangeKernel(queue, reduce, 1, NULL, &global_wsize, &local_wsize, 0, NULL, NULL); // Next, a top-level exclusive scan is performed on the // per block histograms. This is done by a single // work group (note global size here is the same as local). err = clEnqueueNDRangeKernel(queue, top_scan, 1, NULL, &local_wsize, &local_wsize, 0, NULL, NULL); // Finally, a bottom-level scan is performed by each block // that is seeded with the scanned histograms which rebins, // locally scans, then scatters keys to global memory err = clEnqueueNDRangeKernel(queue, bottom_scan, 1, NULL, &global_wsize, &local_wsize, 0, NULL, NULL); } err = clFinish(queue); CL_CHECK_ERROR(err); double total_sort = Timer::Stop(th, "total sort time"); err = clEnqueueReadBuffer(queue, d_idata, true, 0, bytes, h_odata, 0, NULL, &evTransfer.CLEvent()); CL_CHECK_ERROR(err); err = clFinish(queue); CL_CHECK_ERROR(err); evTransfer.FillTimingInfo(); double totalTransfer = inTransferTime + evTransfer.StartEndRuntime(); totalTransfer /= 1.e9; // Convert to seconds // If answer is incorrect, stop test and do not report performance if (! verifySort(h_odata, size)) { return; } char atts[1024]; double avgTime = total_sort; double gbs = (double) (size * sizeof(T)) / (1000. * 1000. * 1000.); sprintf(atts, "%d_items", size); resultDB.AddResult(testName, atts, "GB/s", gbs / (avgTime)); resultDB.AddResult(testName+"_PCIe", atts, "GB/s", gbs / (avgTime + totalTransfer)); resultDB.AddResult(testName+"_Parity", atts, "N", totalTransfer / avgTime); } // Clean up device memory err = clReleaseMemObject(d_idata); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_odata); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_isums); CL_CHECK_ERROR(err); // Clean up pinned host memory err = clEnqueueUnmapMemObject(queue, h_i, h_idata, 0, NULL, NULL); CL_CHECK_ERROR(err); err = clEnqueueUnmapMemObject(queue, h_o, h_odata, 0, NULL, NULL); CL_CHECK_ERROR(err); err = clReleaseMemObject(h_i); CL_CHECK_ERROR(err); err = clReleaseMemObject(h_o); CL_CHECK_ERROR(err); // Clean up program and kernel objects err = clReleaseProgram(prog); CL_CHECK_ERROR(err); err = clReleaseKernel(reduce); CL_CHECK_ERROR(err); err = clReleaseKernel(top_scan); CL_CHECK_ERROR(err); err = clReleaseKernel(bottom_scan); CL_CHECK_ERROR(err); }
void runTest(const string& testName, cl_device_id dev, cl_context ctx, cl_command_queue queue, ResultDatabase& resultDB, OptionParser& op, const string& compileFlags) { int err = 0; // Collect basic MPI information int mpi_size, mpi_rank; MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); // Program Setup cl_program prog = clCreateProgramWithSource(ctx, 1, &cl_source_scan, NULL, &err); CL_CHECK_ERROR(err); // Before proceeding, make sure the kernel code compiles and // all kernels are valid. if (mpi_rank == 0) { cout << "Compiling scan kernels." << endl; } err = clBuildProgram(prog, 1, &dev, compileFlags.c_str(), NULL, NULL); CL_CHECK_ERROR(err); if (err != CL_SUCCESS) { char log[5000]; size_t retsize = 0; err = clGetProgramBuildInfo(prog, dev, CL_PROGRAM_BUILD_LOG, 5000 * sizeof(char), log, &retsize); CL_CHECK_ERROR(err); cout << "Build error." << endl; cout << "Retsize: " << retsize << endl; cout << "Log: " << log << endl; return; } // Extract out the 3 kernels cl_kernel reduce = clCreateKernel(prog, "reduce", &err); CL_CHECK_ERROR(err); cl_kernel top_scan = clCreateKernel(prog, "top_scan", &err); CL_CHECK_ERROR(err); cl_kernel bottom_scan = clCreateKernel(prog, "bottom_scan", &err); CL_CHECK_ERROR(err); // If the device doesn't support at least 256 work items in a // group, use a different kernel (TODO) if (getMaxWorkGroupSize(dev) < 256) { cout << "Scan requires work group size of at least 256" << endl; char atts[1024] = "GSize_Not_Supported"; // resultDB requires neg entry for every possible result int passes = op.getOptionInt("passes"); for (int k = 0; k < passes; k++) { resultDB.AddResult(testName+"-Kernel" , atts, "GB/s", FLT_MAX); resultDB.AddResult(testName+"-Kernel+PCIe" , atts, "GB/s", FLT_MAX); resultDB.AddResult(testName+"-MPI_ExScan" , atts, "GB/s", FLT_MAX); resultDB.AddResult(testName+"-Overall" , atts, "GB/s", FLT_MAX); } return; } // Problem Sizes int probSizes[4] = { 1, 8, 32, 64 }; int size = probSizes[op.getOptionInt("size")-1]; // Convert to MB size = (size * 1024 * 1024) / sizeof(T); // Create input data on CPU unsigned int bytes = size * sizeof(T); // Allocate pinned host memory for input data (h_idata) cl_mem h_i = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, bytes, NULL, &err); CL_CHECK_ERROR(err); T* h_idata = (T*)clEnqueueMapBuffer(queue, h_i, true, CL_MAP_READ|CL_MAP_WRITE, 0, bytes, 0, NULL, NULL, &err); CL_CHECK_ERROR(err); // Allocate pinned host memory for output data (h_odata) cl_mem h_o = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, bytes, NULL, &err); CL_CHECK_ERROR(err); T* h_odata = (T*)clEnqueueMapBuffer(queue, h_o, true, CL_MAP_READ|CL_MAP_WRITE, 0, bytes, 0, NULL, NULL, &err); CL_CHECK_ERROR(err); // Initialize host memory if (mpi_rank == 0) { cout << "Initializing host memory." << endl; } for (int i = 0; i < size; i++) { h_idata[i] = i % 2; //Fill with some pattern h_odata[i] = -1; } // Allocate device memory for input array cl_mem d_idata = clCreateBuffer(ctx, CL_MEM_READ_WRITE, bytes, NULL, &err); CL_CHECK_ERROR(err); // Allocate device memory for output array cl_mem d_odata = clCreateBuffer(ctx, CL_MEM_READ_WRITE, bytes, NULL, &err); CL_CHECK_ERROR(err); // Number of local work items per group const size_t local_wsize = 256; // Number of local work groups and total work items const size_t num_work_groups = 64; const size_t global_wsize = local_wsize * num_work_groups; // Allocate device memory for local work group intermediate sums cl_mem d_isums = clCreateBuffer(ctx, CL_MEM_READ_WRITE, num_work_groups * sizeof(T), NULL, &err); CL_CHECK_ERROR(err); // Allocate pinned host memory for intermediate block sums (h_isums) cl_mem h_b = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, num_work_groups * sizeof(T), NULL, &err); CL_CHECK_ERROR(err); T* h_isums = (T*)clEnqueueMapBuffer(queue, h_b, true, CL_MAP_READ|CL_MAP_WRITE, 0, num_work_groups * sizeof(T), 0, NULL, NULL, &err); CL_CHECK_ERROR(err); // Set the kernel arguments for the reduction kernel err = clSetKernelArg(reduce, 0, sizeof(cl_mem), (void*)&d_idata); CL_CHECK_ERROR(err); err = clSetKernelArg(reduce, 1, sizeof(cl_mem), (void*)&d_isums); CL_CHECK_ERROR(err); err = clSetKernelArg(reduce, 2, sizeof(cl_int), (void*)&size); CL_CHECK_ERROR(err); err = clSetKernelArg(reduce, 3, local_wsize * sizeof(T), NULL); CL_CHECK_ERROR(err); // Set the kernel arguments for the top-level scan err = clSetKernelArg(top_scan, 0, sizeof(cl_mem), (void*)&d_isums); CL_CHECK_ERROR(err); err = clSetKernelArg(top_scan, 1, sizeof(cl_int), (void*)&num_work_groups); CL_CHECK_ERROR(err); err = clSetKernelArg(top_scan, 2, local_wsize * 2 * sizeof(T), NULL); CL_CHECK_ERROR(err); // Set the kernel arguments for the bottom-level scan err = clSetKernelArg(bottom_scan, 0, sizeof(cl_mem), (void*)&d_idata); CL_CHECK_ERROR(err); err = clSetKernelArg(bottom_scan, 1, sizeof(cl_mem), (void*)&d_isums); CL_CHECK_ERROR(err); err = clSetKernelArg(bottom_scan, 2, sizeof(cl_mem), (void*)&d_odata); CL_CHECK_ERROR(err); err = clSetKernelArg(bottom_scan, 3, sizeof(cl_int), (void*)&size); CL_CHECK_ERROR(err); err = clSetKernelArg(bottom_scan, 4, local_wsize * 2 * sizeof(T), NULL); CL_CHECK_ERROR(err); // Repeat the test multiple times to get a good measurement int passes = op.getOptionInt("passes"); if (mpi_rank == 0) { cout << "Running benchmark with size " << size << endl; } for (int k = 0; k < passes; k++) { // Timing variables double pcie_time=0., kernel_time=0., mpi_time=0.; // Copy data to GPU Event evTransfer("PCIe transfer"); double time_temp = 0.; err = clEnqueueWriteBuffer(queue, d_idata, true, 0, bytes, h_idata, 0, NULL, &evTransfer.CLEvent()); CL_CHECK_ERROR(err); evTransfer.FillTimingInfo(); pcie_time += (double)evTransfer.StartEndRuntime() / 1e9; // This code uses a reduce-then-scan strategy. // The major steps of the algorithm are: // 1. Local reduction on a node // 2. Global exclusive scan of the reduction values // 3. Local inclusive scan, seeded with the node's result // from the global exclusive scan Event ev_reduce("Reduction Kernel"); err = clEnqueueNDRangeKernel(queue, reduce, 1, NULL, &global_wsize, &local_wsize, 0, NULL, &ev_reduce.CLEvent()); err = clFinish(queue); ev_reduce.FillTimingInfo(); kernel_time += (double)ev_reduce.StartEndRuntime() * 1e-9; // Next step is to copy the reduced blocks back to the host, // sum them, and perform the MPI exlcusive (top level) scan. err = clEnqueueReadBuffer(queue, d_isums, true, 0, num_work_groups*sizeof(T), h_isums, 0, NULL, &evTransfer.CLEvent()); CL_CHECK_ERROR(err); evTransfer.FillTimingInfo(); pcie_time += (double)evTransfer.StartEndRuntime() * 1e-9; // Start the timer for MPI Scan int globscan_th = Timer::Start(); T reduced=0., scanned=0.; // To get the true sum for this node, we have to add up // the block sums before MPI scanning. for (int i = 0; i < num_work_groups; i++) { reduced += h_isums[i]; } // Next step is an exclusive scan across MPI ranks. // Then a local scan seeded with the result from MPI. globalExscan(&reduced, &scanned); mpi_time += Timer::Stop(globscan_th, "Global Scan"); // Now, scanned contains all the information we need from other nodes // Next step is to perform the local top level (i.e. across blocks) scan, // but seed it with the "scanned", the sum of elems on all lower ranks. h_isums[0] += scanned; err = clEnqueueWriteBuffer(queue, d_isums, true, 0, sizeof(T), h_isums, 0, NULL, &evTransfer.CLEvent()); CL_CHECK_ERROR(err); evTransfer.FillTimingInfo(); pcie_time += (double)evTransfer.StartEndRuntime() * 1e-9; Event ev_scan("Scan Kernel"); err = clEnqueueNDRangeKernel(queue, top_scan, 1, NULL, &local_wsize, &local_wsize, 0, NULL, &ev_scan.CLEvent()); err = clFinish(queue); CL_CHECK_ERROR(err); ev_scan.FillTimingInfo(); kernel_time += ((double)ev_scan.StartEndRuntime() * 1.e-9); // Finally, a bottom-level scan is performed by each block // that is seeded with the scanned value in block sums err = clEnqueueNDRangeKernel(queue, bottom_scan, 1, NULL, &global_wsize, &local_wsize, 0, NULL, &ev_scan.CLEvent()); err = clFinish(queue); CL_CHECK_ERROR(err); ev_scan.FillTimingInfo(); kernel_time += ((double)ev_scan.StartEndRuntime() * 1.e-9); // Read data back for correctness check err = clEnqueueReadBuffer(queue, d_odata, true, 0, bytes, h_odata, 0, NULL, &evTransfer.CLEvent()); CL_CHECK_ERROR(err); // Lightweight correctness check -- won't apply // if data is not initialized to i%2 above if (mpi_rank == mpi_size-1) { if (h_odata[size-1] != (mpi_size * size) / 2) { cout << "Test Failed\n"; } else { cout << "Test Passed\n"; } } char atts[1024]; sprintf(atts, "%d items", size); double global_gb = (double)(mpi_size * size * sizeof(T)) / (1000. * 1000. * 1000.); resultDB.AddResult(testName+"-Kernel" , atts, "GB/s", global_gb / kernel_time); resultDB.AddResult(testName+"-Kernel+PCIe" , atts, "GB/s", global_gb / (kernel_time + pcie_time)); resultDB.AddResult(testName+"-MPI_ExScan" , atts, "GB/s", (mpi_size * sizeof(T) *1e-9) / mpi_time); resultDB.AddResult(testName+"-Overall" , atts, "GB/s", global_gb / (kernel_time + pcie_time + mpi_time)); } // Clean up device memory err = clReleaseMemObject(d_idata); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_odata); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_isums); CL_CHECK_ERROR(err); // Clean up pinned host memory err = clEnqueueUnmapMemObject(queue, h_i, h_idata, 0, NULL, NULL); CL_CHECK_ERROR(err); err = clEnqueueUnmapMemObject(queue, h_o, h_odata, 0, NULL, NULL); CL_CHECK_ERROR(err); err = clEnqueueUnmapMemObject(queue, h_b, h_isums, 0, NULL, NULL); CL_CHECK_ERROR(err); err = clReleaseMemObject(h_i); CL_CHECK_ERROR(err); err = clReleaseMemObject(h_o); CL_CHECK_ERROR(err); err = clReleaseMemObject(h_b); CL_CHECK_ERROR(err); err = clReleaseProgram(prog); CL_CHECK_ERROR(err); err = clReleaseKernel(reduce); CL_CHECK_ERROR(err); err = clReleaseKernel(top_scan); CL_CHECK_ERROR(err); err = clReleaseKernel(bottom_scan); CL_CHECK_ERROR(err); }
void ellPackTest(cl_device_id dev, cl_context ctx, string compileFlags, cl_command_queue queue, ResultDatabase& resultDB, OptionParser& op, floatType* h_val, int* h_cols, int* h_rowDelimiters, floatType* h_vec, floatType* h_out, int numRows, int numNonZeroes, floatType* refOut, bool padded, int paddedSize, const size_t maxImgWidth) { if (devSupportsImages) { char texflags[64]; sprintf(texflags," -DUSE_TEXTURE -DMAX_IMG_WIDTH=%ld", maxImgWidth); compileFlags+=string(texflags); } // Set up OpenCL Program Object int err = 0; cl_program prog = clCreateProgramWithSource(ctx, 1, &cl_source_spmv, NULL, &err); CL_CHECK_ERROR(err); // Build the openCL kernels err = clBuildProgram(prog, 1, &dev, compileFlags.c_str(), NULL, NULL); CL_CHECK_ERROR(err); // If there is a build error, print the output and return if (err != CL_SUCCESS) { char log[5000]; size_t retsize = 0; err = clGetProgramBuildInfo(prog, dev, CL_PROGRAM_BUILD_LOG, 50000 * sizeof(char), log, &retsize); CL_CHECK_ERROR(err); cout << "Retsize: " << retsize << endl; cout << "Log: " << log << endl; return; } int *h_rowLengths = new int[paddedSize]; int maxrl = 0; for (int k=0; k<numRows; k++) { h_rowLengths[k] = h_rowDelimiters[k+1] - h_rowDelimiters[k]; if (h_rowLengths[k] > maxrl) { maxrl = h_rowLengths[k]; } } for (int p=numRows; p < paddedSize; p++) { h_rowLengths[p] = 0; } // Column major format host data structures int cmSize = padded ? paddedSize : numRows; floatType *h_valcm = new floatType[maxrl * cmSize]; int *h_colscm = new int[maxrl * cmSize]; convertToColMajor(h_val, h_cols, numRows, h_rowDelimiters, h_valcm, h_colscm, h_rowLengths, maxrl, padded); // Device data structures cl_mem d_val, d_vec, d_out; // floating point cl_mem d_cols, d_rowLengths; // integer // Allocate device memory d_val = clCreateBuffer(ctx, CL_MEM_READ_WRITE, maxrl * cmSize * sizeof(clFloatType), NULL, &err); CL_CHECK_ERROR(err); d_cols = clCreateBuffer(ctx, CL_MEM_READ_WRITE, maxrl * cmSize * sizeof(int), NULL, &err); CL_CHECK_ERROR(err); int imgHeight = 0; if (devSupportsImages) { imgHeight=(numRows+maxImgWidth-1)/maxImgWidth; cl_image_format fmt; fmt.image_channel_data_type=CL_FLOAT; if(sizeof(floatType)==4) fmt.image_channel_order=CL_R; else fmt.image_channel_order=CL_RG; d_vec = clCreateImage2D( ctx, CL_MEM_READ_ONLY, &fmt, maxImgWidth, imgHeight, 0, NULL, &err); CL_CHECK_ERROR(err); } else { d_vec = clCreateBuffer(ctx, CL_MEM_READ_WRITE, numRows * sizeof(clFloatType), NULL, &err); CL_CHECK_ERROR(err); } d_out = clCreateBuffer(ctx, CL_MEM_READ_WRITE, paddedSize * sizeof(clFloatType), NULL, &err); CL_CHECK_ERROR(err); d_rowLengths = clCreateBuffer(ctx, CL_MEM_READ_WRITE, cmSize * sizeof(int), NULL, &err); CL_CHECK_ERROR(err); // Setup events for timing Event valTransfer("transfer Val data over PCIe bus"); Event colsTransfer("transfer cols data over PCIe bus"); Event vecTransfer("transfer vec data over PCIe bus"); Event rowLengthsTransfer("transfer rowLengths data over PCIe bus"); // Transfer data to device err = clEnqueueWriteBuffer(queue, d_val, true, 0, maxrl * cmSize * sizeof(clFloatType), h_valcm, 0, NULL, &valTransfer.CLEvent()); CL_CHECK_ERROR(err); err = clEnqueueWriteBuffer(queue, d_cols, true, 0, maxrl * cmSize * sizeof(cl_int), h_colscm, 0, NULL, &colsTransfer.CLEvent()); CL_CHECK_ERROR(err); if (devSupportsImages) { size_t offset[3]={0}; size_t size[3]={maxImgWidth,(size_t)imgHeight,1}; err = clEnqueueWriteImage(queue,d_vec, true, offset, size, 0, 0, h_vec, 0, NULL, &vecTransfer.CLEvent()); CL_CHECK_ERROR(err); } else { err = clEnqueueWriteBuffer(queue, d_vec, true, 0, numRows * sizeof(clFloatType), h_vec, 0, NULL, &vecTransfer.CLEvent()); CL_CHECK_ERROR(err); } err = clEnqueueWriteBuffer(queue, d_rowLengths, true, 0, cmSize * sizeof(int), h_rowLengths, 0, NULL, &rowLengthsTransfer.CLEvent()); CL_CHECK_ERROR(err); err = clFinish(queue); CL_CHECK_ERROR(err); valTransfer.FillTimingInfo(); colsTransfer.FillTimingInfo(); vecTransfer.FillTimingInfo(); rowLengthsTransfer.FillTimingInfo(); double iTransferTime = valTransfer.StartEndRuntime() + colsTransfer.StartEndRuntime() + vecTransfer.StartEndRuntime() + rowLengthsTransfer.StartEndRuntime(); // Set up kernel arguments cl_kernel ellpackr = clCreateKernel(prog, "spmv_ellpackr_kernel", &err); CL_CHECK_ERROR(err); err = clSetKernelArg(ellpackr, 0, sizeof(cl_mem), (void*) &d_val); CL_CHECK_ERROR(err); err = clSetKernelArg(ellpackr, 1, sizeof(cl_mem), (void*) &d_vec); CL_CHECK_ERROR(err); err = clSetKernelArg(ellpackr, 2, sizeof(cl_mem), (void*) &d_cols); CL_CHECK_ERROR(err); err = clSetKernelArg(ellpackr, 3, sizeof(cl_mem), (void*) &d_rowLengths); CL_CHECK_ERROR(err); err = clSetKernelArg(ellpackr, 4, sizeof(cl_int), (void*) &cmSize); CL_CHECK_ERROR(err); err = clSetKernelArg(ellpackr, 5, sizeof(cl_mem), (void*) &d_out); CL_CHECK_ERROR(err); const size_t globalWorkSize = cmSize; const size_t localWorkSize = BLOCK_SIZE; Event kernelExec("ELLPACKR Kernel Execution"); int passes = op.getOptionInt("passes"); int iters = op.getOptionInt("iterations"); for (int k = 0; k < passes; k++) { double totalKernelTime = 0.0; for (int j = 0; j < iters; j++) { err = clEnqueueNDRangeKernel(queue, ellpackr, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, &kernelExec.CLEvent()); CL_CHECK_ERROR(err); err = clFinish(queue); CL_CHECK_ERROR(err); kernelExec.FillTimingInfo(); totalKernelTime += kernelExec.StartEndRuntime(); } Event outTransfer("d->h data transfer"); err = clEnqueueReadBuffer(queue, d_out, true, 0, numRows * sizeof(clFloatType), h_out, 0, NULL, &outTransfer.CLEvent()); CL_CHECK_ERROR(err); err = clFinish(queue); CL_CHECK_ERROR(err); outTransfer.FillTimingInfo(); double oTransferTime = outTransfer.StartEndRuntime(); // Compare reference solution to GPU result if (! verifyResults(refOut, h_out, numRows, k)) { return; } char atts[TEMP_BUFFER_SIZE]; char benchName[TEMP_BUFFER_SIZE]; double avgTime = totalKernelTime / (double)iters; sprintf(atts, "%d_elements_%d_rows", numNonZeroes, cmSize); double gflop = 2 * (double) numNonZeroes; bool dpTest = (sizeof(floatType) == sizeof(double)); sprintf(benchName, "%sELLPACKR-%s", padded ? "Padded_":"", dpTest ? "DP":"SP"); resultDB.AddResult(benchName, atts, "Gflop/s", gflop/avgTime); sprintf(benchName, "%s_PCIe", benchName); resultDB.AddResult(benchName, atts, "Gflop/s", gflop / (avgTime + iTransferTime + oTransferTime)); } err = clReleaseProgram(prog); CL_CHECK_ERROR(err); err = clReleaseKernel(ellpackr); CL_CHECK_ERROR(err); // Free device memory err = clReleaseMemObject(d_rowLengths); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_vec); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_out); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_val); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_cols); CL_CHECK_ERROR(err); // Free host memory delete[] h_rowLengths; delete[] h_valcm; delete[] h_colscm; }
void runTest(const string& testName, cl_device_id dev, cl_context ctx, cl_command_queue queue, ResultDatabase& resultDB, OptionParser& op, string compileFlags) { // Problem Parameters const int probSizes[4] = { 12288, 24576, 36864, 73728 }; int sizeClass = op.getOptionInt("size"); assert(sizeClass >= 0 && sizeClass < 5); int nAtom = probSizes[sizeClass - 1]; // Allocate problem data on host cl_mem h_pos, h_force, h_neigh; posVecType* position; forceVecType* force; int* neighborList; int passes = op.getOptionInt("passes"); int iter = op.getOptionInt("iterations"); // Allocate and map pinned host memory int err = 0; // Position h_pos = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(posVecType)*nAtom, NULL, &err); CL_CHECK_ERROR(err); position = (posVecType*)clEnqueueMapBuffer(queue, h_pos, true, CL_MAP_READ|CL_MAP_WRITE, 0, sizeof(posVecType)*nAtom , 0, NULL, NULL, &err); CL_CHECK_ERROR(err); // Force h_force = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(forceVecType)*nAtom, NULL, &err); CL_CHECK_ERROR(err); force = (forceVecType*)clEnqueueMapBuffer(queue, h_force, true, CL_MAP_READ|CL_MAP_WRITE, 0, sizeof(forceVecType)*nAtom , 0, NULL, NULL, &err); CL_CHECK_ERROR(err); // Neighbor List h_neigh = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(int) * nAtom * maxNeighbors, NULL, &err); CL_CHECK_ERROR(err); neighborList = (int*)clEnqueueMapBuffer(queue, h_neigh, true, CL_MAP_READ|CL_MAP_WRITE, 0, sizeof(int) * nAtom * maxNeighbors, 0, NULL, NULL, &err); CL_CHECK_ERROR(err); // Allocate device memory cl_mem d_force = clCreateBuffer(ctx, CL_MEM_READ_WRITE, nAtom * sizeof(forceVecType), NULL, &err); CL_CHECK_ERROR(err); cl_mem d_position = clCreateBuffer(ctx, CL_MEM_READ_WRITE, nAtom * sizeof(posVecType), NULL, &err); CL_CHECK_ERROR(err); // Allocate device memory neighbor list cl_mem d_neighborList = clCreateBuffer(ctx, CL_MEM_READ_WRITE, maxNeighbors * nAtom * sizeof(int), NULL, &err); CL_CHECK_ERROR(err); size_t maxGroupSize = getMaxWorkGroupSize(dev); if (maxGroupSize < 128) { cout << "MD requires a work group size of at least 128" << endl; // Add special values to the results database char atts[1024]; sprintf(atts, "GSize_Not_Supported"); for (int i=0 ; i<passes ; ++i) { resultDB.AddResult(testName, atts, "GFLOPS", FLT_MAX); resultDB.AddResult(testName + "_PCIe", atts, "GFLOPS", FLT_MAX); resultDB.AddResult(testName+"-Bandwidth", atts, "GB/s", FLT_MAX); resultDB.AddResult(testName+"-Bandwidth_PCIe", atts, "GB/s", FLT_MAX); resultDB.AddResult(testName+"_Parity", atts, "N", FLT_MAX); } return; } size_t localSize = 128; size_t globalSize = nAtom; cout << "Initializing test problem (this can take several " "minutes for large problems).\n "; // Seed random number generator srand48(8650341L); // Initialize positions -- random distribution in cubic domain for (int i = 0; i < nAtom; i++) { position[i].x = (drand48() * domainEdge); position[i].y = (drand48() * domainEdge); position[i].z = (drand48() * domainEdge); } // Copy position to GPU Event evTransfer("h->d transfer"); err = clEnqueueWriteBuffer(queue, d_position, true, 0, nAtom * sizeof(posVecType), position, 0, NULL, &evTransfer.CLEvent()); CL_CHECK_ERROR(err); err = clFinish(queue); CL_CHECK_ERROR(err); evTransfer.FillTimingInfo(); long transferTime = evTransfer.StartEndRuntime(); // Keep track of how many atoms are within the cutoff distance to // accurately calculate FLOPS later int totalPairs = buildNeighborList<T, posVecType>(nAtom, position, neighborList); cout << "Finished.\n"; cout << totalPairs << " of " << nAtom*maxNeighbors << " pairs within cutoff distance = " << 100.0 * ((double)totalPairs / (nAtom*maxNeighbors)) << " %" << endl; // Copy data to GPU err = clEnqueueWriteBuffer(queue, d_neighborList, true, 0, maxNeighbors * nAtom * sizeof(int), neighborList, 0, NULL, &evTransfer.CLEvent()); CL_CHECK_ERROR(err); clFinish(queue); evTransfer.FillTimingInfo(); transferTime += evTransfer.StartEndRuntime(); // Build the openCL kernel cl_program prog = clCreateProgramWithSource(ctx, 1, &cl_source_md, NULL, &err); CL_CHECK_ERROR(err); err = clBuildProgram(prog, 1, &dev, compileFlags.c_str(), NULL, NULL); CL_CHECK_ERROR(err); // If there is a build error, print the output and return if (err != CL_SUCCESS) { char log[5000]; size_t retsize = 0; err = clGetProgramBuildInfo(prog, dev, CL_PROGRAM_BUILD_LOG, 50000 * sizeof(char), log, &retsize); CL_CHECK_ERROR(err); cout << "Retsize: " << retsize << endl; cout << "Log: " << log << endl; return; } // Extract out the kernels cl_kernel lj_kernel = clCreateKernel(prog, "compute_lj_force", &err); CL_CHECK_ERROR(err); T lj1_t = (T) lj1; T lj2_t = (T) lj2; T cutsq_t = (T) cutsq; // Set kernel arguments err = clSetKernelArg(lj_kernel, 0, sizeof(cl_mem), (void*) &d_force); CL_CHECK_ERROR(err); err = clSetKernelArg(lj_kernel, 1, sizeof(cl_mem), (void*) &d_position); CL_CHECK_ERROR(err); err = clSetKernelArg(lj_kernel, 2, sizeof(cl_int), (void*) &maxNeighbors); CL_CHECK_ERROR(err); err = clSetKernelArg(lj_kernel, 3, sizeof(cl_mem), (void*) &d_neighborList); CL_CHECK_ERROR(err); err = clSetKernelArg(lj_kernel, 4, sizeof(T), (void*) &cutsq_t); CL_CHECK_ERROR(err); err = clSetKernelArg(lj_kernel, 5, sizeof(T), (void*) &lj1_t); CL_CHECK_ERROR(err); err = clSetKernelArg(lj_kernel, 6, sizeof(T), (void*) &lj2_t); CL_CHECK_ERROR(err); err = clSetKernelArg(lj_kernel, 7, sizeof(cl_int), (void*) &nAtom); CL_CHECK_ERROR(err); Event evLJ("computeLJ"); // Warm up the kernel and check correctness err = clEnqueueNDRangeKernel(queue, lj_kernel, 1, NULL, &globalSize, &localSize, 0, NULL, &evLJ.CLEvent()); CL_CHECK_ERROR(err); err = clFinish(queue); CL_CHECK_ERROR(err); err = clEnqueueReadBuffer(queue, d_force, true, 0, nAtom * sizeof(forceVecType), force, 0, NULL, &evTransfer.CLEvent()); CL_CHECK_ERROR(err); err = clFinish(queue); CL_CHECK_ERROR(err); evTransfer.FillTimingInfo(); transferTime += evTransfer.StartEndRuntime(); cout << "Performing Correctness Check (can take several minutes)\n"; // If results are correct, skip the performance tests if (!checkResults<T, forceVecType, posVecType>(force, position, neighborList, nAtom)) { return; } for (int i = 0; i < passes; i++) { double total_time = 0.0; for (int j = 0; j < iter; j++) { //Launch Kernels err = clEnqueueNDRangeKernel(queue, lj_kernel, 1, NULL, &globalSize, &localSize, 0, NULL, &evLJ.CLEvent()); CL_CHECK_ERROR(err); err = clFinish(queue); CL_CHECK_ERROR(err); // Collect timing info from events evLJ.FillTimingInfo(); total_time += evLJ.SubmitEndRuntime(); } char atts[1024]; long int nflops = (8 * nAtom * maxNeighbors) + (totalPairs * 13); sprintf(atts, "%d_atoms", nAtom); total_time /= (double) iter; resultDB.AddResult(testName, atts, "GFLOPS", ((double) nflops) / total_time); resultDB.AddResult(testName + "_PCIe", atts, "GFLOPS", ((double) nflops) / (total_time + transferTime)); long int numPairs = nAtom * maxNeighbors; long int nbytes = (3 * sizeof(T) * (1+numPairs)) + // position data (3 * sizeof(T) * nAtom) + // force for each atom (sizeof(int) * numPairs); // neighbor list double gbytes = (double)nbytes / (1000. * 1000. * 1000.); double seconds = total_time / 1.e9; resultDB.AddResult(testName+"-Bandwidth", atts, "GB/s", gbytes / seconds); resultDB.AddResult(testName+"-Bandwidth_PCIe", atts, "GB/s", gbytes / (seconds + (transferTime / 1.e9))); resultDB.AddResult(testName+"_Parity", atts, "N", (transferTime / 1.e9) / seconds); } // Clean up // Host memory err = clEnqueueUnmapMemObject(queue, h_pos, position, 0, NULL, NULL); CL_CHECK_ERROR(err); err = clEnqueueUnmapMemObject(queue, h_force, force, 0, NULL, NULL); CL_CHECK_ERROR(err); err = clEnqueueUnmapMemObject(queue, h_neigh, neighborList, 0, NULL, NULL); CL_CHECK_ERROR(err); err = clReleaseMemObject(h_pos); CL_CHECK_ERROR(err); err = clReleaseMemObject(h_force); CL_CHECK_ERROR(err); err = clReleaseMemObject(h_neigh); CL_CHECK_ERROR(err); // Program Objects err = clReleaseProgram(prog); CL_CHECK_ERROR(err); err = clReleaseKernel(lj_kernel); CL_CHECK_ERROR(err); // Device Memory err = clReleaseMemObject(d_force); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_position); CL_CHECK_ERROR(err); err = clReleaseMemObject(d_neighborList); CL_CHECK_ERROR(err); }
// **************************************************************************** // Function: RunBenchmark // // Purpose: // Executes the sparse matrix - vector multiplication benchmark // // Arguments: // dev: the opencl device id to use for the benchmark // ctx: the opencl context to use for the benchmark // queue: the opencl command queue to issue commands to // resultDB: stores results from the benchmark // op: the options parser / parameter database // // Returns: nothing // Programmer: Lukasz Wesolowski // Creation: August 13, 2010 // // Modifications: // // **************************************************************************** void RunBenchmark(cl_device_id dev, cl_context ctx, cl_command_queue queue, ResultDatabase &resultDB, OptionParser &op) { //create list of problem sizes int probSizes[4] = {1024, 8192, 12288, 16384}; int sizeClass = op.getOptionInt("size") - 1; // Always run single precision test // OpenCL doesn't support templated kernels, so we have to use macros cout <<"Single precision tests:\n"; string spMacros = "-DSINGLE_PRECISION "; RunTest<float, cl_float> (dev, ctx, queue, resultDB, op, spMacros, probSizes[sizeClass]); // If double precision is supported, run the DP test if (checkExtension(dev, "cl_khr_fp64")) { cout << "Double precision tests\n"; string dpMacros = "-DK_DOUBLE_PRECISION "; RunTest<double, cl_double> (dev, ctx, queue, resultDB, op, dpMacros, probSizes[sizeClass]); } else if (checkExtension(dev, "cl_amd_fp64")) { cout << "Double precision tests\n"; string dpMacros = "-DAMD_DOUBLE_PRECISION "; RunTest<double, cl_double> (dev, ctx, queue, resultDB, op, dpMacros, probSizes[sizeClass]); } else { std::cout << "Double precision not supported by chosen device, skipping" << std::endl; // driver script still needs entries for all tests, even if not run int nPasses = (int)op.getOptionInt( "passes" ); for( unsigned int p = 0; p < nPasses; p++ ) { resultDB.AddResult( (const char*)"CSR-Scalar-DP", "N/A", "Gflop/s", FLT_MAX ); resultDB.AddResult( (const char*)"CSR-Scalar-DP_PCIe", "N/A", "Gflop/s", FLT_MAX ); resultDB.AddResult( (const char*)"CSR-Vector-DP", "N/A", "Gflop/s", FLT_MAX ); resultDB.AddResult( (const char*)"CSR-Vector-DP_PCIe", "N/A", "Gflop/s", FLT_MAX ); resultDB.AddResult( (const char*)"ELLPACKR-DP", "N/A", "Gflop/s", FLT_MAX ); resultDB.AddResult( (const char*)"ELLPACKR-DP_PCIe", "N/A", "Gflop/s", FLT_MAX ); resultDB.AddResult( (const char*)"Padded_CSR-Scalar-DP", "N/A", "Gflop/s", FLT_MAX ); resultDB.AddResult( (const char*)"Padded_CSR-Scalar-DP_PCIe", "N/A", "Gflop/s", FLT_MAX ); resultDB.AddResult( (const char*)"Padded_CSR-Vector-DP", "N/A", "Gflop/s", FLT_MAX ); resultDB.AddResult( (const char*)"Padded_CSR-Vector-DP_PCIe", "N/A", "Gflop/s", FLT_MAX ); } } }
void runTest(const string& name, ResultDatabase &resultDB, OptionParser& op) { int i, j; void* work, * chk; T2* source, * result; unsigned long bytes = 0; if (op.getOptionInt("MB") == 0) { int probSizes[4] = { 1, 8, 96, 256 }; int sizeIndex = op.getOptionInt("size")-1; if (sizeIndex < 0 || sizeIndex >= 4) { cerr << "Invalid size index specified\n"; exit(-1); } bytes = probSizes[sizeIndex]; } else { bytes = op.getOptionInt("MB"); } // Convert to MB bytes *= 1024 * 1024; bool do_dp = dp<T2>(); init(op, do_dp); int passes = op.getOptionInt("passes"); // now determine how much available memory will be used int half_n_ffts = bytes / (512*sizeof(T2)*2); int n_ffts = half_n_ffts * 2; int half_n_cmplx = half_n_ffts * 512; unsigned long used_bytes = half_n_cmplx * 2 * sizeof(T2); double N = half_n_cmplx*2; // allocate host and device memory allocHostBuffer((void**)&source, used_bytes); allocHostBuffer((void**)&result, used_bytes); // init host memory... for (i = 0; i < half_n_cmplx; i++) { source[i].x = (rand()/(float)RAND_MAX)*2-1; source[i].y = (rand()/(float)RAND_MAX)*2-1; source[i+half_n_cmplx].x = source[i].x; source[i+half_n_cmplx].y = source[i].y; } // alloc device memory allocDeviceBuffer(&work, used_bytes); allocDeviceBuffer(&chk, 1); // Copy to device, and record transfer time fprintf(stderr, "used_bytes=%d, N=%g\n", used_bytes, N); int pcie_TH = Timer::Start(); copyToDevice(work, source, used_bytes); double transfer_time = Timer::Stop(pcie_TH, "PCIe Transfer Time"); char chk_init = 0; copyToDevice(chk, &chk_init, 1); const char *sizeStr; stringstream ss; ss << "N=" << (long)N; sizeStr = strdup(ss.str().c_str()); for (int k=0; k<passes; k++) { // time fft kernel int TH = Timer::Start(); forward(work, n_ffts); double t = Timer::Stop(TH, "fft"); double fftsz = 512; double Gflops = n_ffts*(5*fftsz*log2(fftsz))/(t*1e9f); double gflopsPCIe = n_ffts*(5*fftsz*log2(fftsz)) / ((transfer_time+t)*1e9f); resultDB.AddResult(name, sizeStr, "GFLOPS", Gflops); resultDB.AddResult(name+"_PCIe", sizeStr, "GFLOPS", gflopsPCIe); resultDB.AddResult(name+"_Parity", sizeStr, "N", transfer_time / t); // time ifft kernel TH = Timer::Start(); inverse(work, n_ffts); t = Timer::Stop(TH, "ifft"); Gflops = n_ffts*(5*fftsz*log2(fftsz))/(t*1e9f); gflopsPCIe = n_ffts*(5*fftsz*log2(fftsz)) / ((transfer_time+t)*1e9f); resultDB.AddResult(name+"-INV", sizeStr, "GFLOPS", Gflops); resultDB.AddResult(name+"-INV_PCIe", sizeStr, "GFLOPS", gflopsPCIe); resultDB.AddResult(name+"-INV_Parity", sizeStr, "N", transfer_time / t); // time check kernel int failed = check(work, chk, half_n_ffts, half_n_cmplx); cout << "pass " << k << ((failed) ? ": failed\n" : ": passed\n"); } freeDeviceBuffer(work); freeDeviceBuffer(chk); freeHostBuffer(source); freeHostBuffer(result); }