// **************************************************************************** // Function: RunBenchmark // // Purpose: // Measures the floating point capability of the device for a variety of // combinations of arithmetic operations. // // Arguments: // op: the options parser / parameter database // // Returns: nothing // // Programmer: Zhi Ying([email protected]) // Jun Jin([email protected]) // // Creation: May 23, 2011 // // Modifications: // 12/12/12 - Kyle Spafford - Code style and minor integration updates // // **************************************************************************** void RunBenchmark(OptionParser &op, ResultDatabase &resultDB) { const bool verbose = op.getOptionBool("verbose"); // Quiet == no progress bar. const bool quiet = op.getOptionBool("quiet"); const unsigned int passes = op.getOptionInt("passes"); const int micdev = op.getOptionInt("target"); double repeatF = 3; cout << "Adjust repeat factor = " << repeatF << "\n"; // Initialize progress bar int totalRuns = 16*passes*2; ProgressBar pb(totalRuns); if (!verbose && !quiet) { pb.Show(stdout); } RunTest<float>(resultDB, passes, verbose, quiet, repeatF, pb, "-SP", micdev); RunTest<double>(resultDB, passes, verbose, quiet, repeatF, pb, "-DP", micdev); if (!verbose) cout << endl; }
void RunBenchmark(cl::Device& devcpp, cl::Context& ctxcpp, cl::CommandQueue& queuecpp, ResultDatabase &resultDB, OptionParser &op) { // convert from C++ bindings to C bindings // TODO propagate use of C++ bindings cl_device_id dev = devcpp(); cl_context ctx = ctxcpp(); cl_command_queue queue = queuecpp(); if (getMaxWorkGroupSize(dev) < 64) { cout << "FFT requires MaxWorkGroupSize of at least 64" << endl; fillResultDB("SP-FFT", "MaxWorkGroupSize<64", op, resultDB); fillResultDB("DP-FFT", "MaxWorkGroupSize<64", op, resultDB); return; } bool has_dp = checkExtension(dev, "cl_khr_fp64") || checkExtension(dev, "cl_amd_fp64"); if (op.getOptionBool("dump-sp")) { dump<cplxflt>(op); } else if (op.getOptionBool("dump-dp")) { if (!has_dp) { cout << "dump-dp: no double precision support!\n"; return; } dump<cplxdbl>(op); } else { // Always run single precision test runTest<cplxflt>("SP-FFT", dev, ctx, queue, resultDB, op); // If double precision is supported, run the DP test if (has_dp) { cout << "DP Supported\n"; runTest<cplxdbl>("DP-FFT", dev, ctx, queue, resultDB, op); } else { cout << "DP Not Supported\n"; fillResultDB("DP-FFT", "DP_Not_Supported", op, resultDB); } } }
// **************************************************************************** // Function: GPUSetup // // Purpose: // do the necessary OpenCL setup for GPU part of the test // // Arguments: // op: the options parser / parameter database // mympirank: for printing errors in case of failure // mynoderank: this is typically the device ID (the mapping done in main) // // Returns: success/failure // // Creation: 2009 // // Modifications: // // **************************************************************************** // int GPUSetup(OptionParser &op, int mympirank, int mynoderank) { addBenchmarkSpecOptions(op); if (op.getOptionBool("infoDevices")) { OpenCLNodePlatformContainer ndc1; ndc1.Print (cout); return (0); } // The device option supports specifying more than one device int platform = op.getOptionInt("platform"); int deviceIdx = mynoderank; if( deviceIdx >= op.getOptionVecInt( "device" ).size() ) { std::ostringstream estr; estr << "Warning: not enough devices specified with --device flag for task " << mympirank << " ( node rank " << mynoderank << ") to claim its own device; forcing to use first device "; std::cerr << estr.str() << std::endl; deviceIdx = 0; } int device = op.getOptionVecInt("device")[deviceIdx]; // Initialization _mpicontention_ocldev = new cl::Device( ListDevicesAndGetDevice(platform, device) ); std::vector<cl::Device> ctxDevices; ctxDevices.push_back( *_mpicontention_ocldev ); _mpicontention_ocldriver_ctx = new cl::Context( ctxDevices ); _mpicontention_ocldriver_queue = new cl::CommandQueue( *_mpicontention_ocldriver_ctx, *_mpicontention_ocldev, CL_QUEUE_PROFILING_ENABLE ); _mpicontention_gpuop = op; return 0; }
void RunBenchmark(ResultDatabase &resultDB, OptionParser &op) { // Test to see if this device supports double precision cudaGetDevice(&fftDevice); cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, fftDevice); bool has_dp = (deviceProp.major == 1 && deviceProp.minor >= 3) || (deviceProp.major >= 2); if (op.getOptionBool("dump-sp")) { dump<float2>(op); } else if (op.getOptionBool("dump-dp")) { if (!has_dp) { cout << "dump-dp: no double precision support!\n"; return; } dump<double2>(op); } else { cout << "Running single precision test" << endl; runTest<float2>("SP-FFT", resultDB, op); if (has_dp) { cout << "Running double precision test" << endl; runTest<double2>("DP-FFT", resultDB, op); } else { cout << "Skipping double precision test" << endl; char atts[32] = "DP_Not_Supported"; // resultDB requires neg entry for every possible result int passes = op.getOptionInt("passes"); for (int k=0; k<passes; k++) { resultDB.AddResult("DP-FFT" , atts, "GB/s", FLT_MAX); resultDB.AddResult("DP-FFT_PCIe" , atts, "GB/s", FLT_MAX); resultDB.AddResult("DP-FFT_Parity" , atts, "GB/s", FLT_MAX); resultDB.AddResult("DP-FFT-INV" , atts, "GB/s", FLT_MAX); resultDB.AddResult("DP-FFT-INV_PCIe" , atts, "GB/s", FLT_MAX); resultDB.AddResult("DP-FFT-INV_Parity" , atts, "GB/s", FLT_MAX); } } } }
void RunBenchmark(OptionParser &op) { // convert from C++ bindings to C bindings // TODO propagate use of C++ bindings if(op.getOptionBool("2D")) dump2D<cplxflt>(op); else dump1D<cplxflt>(op); }
void RunBenchmark(OptionParser &op, ResultDatabase &resultDB) { const bool verbose = op.getOptionBool("verbose"); if (verbose) // print MKL version info { static char mklver[200]; char *p; MKL_Get_Version_String(mklver,sizeof(mklver)); mklver[sizeof(mklver)-1] = 0; p = strrchr(mklver,' '); if (p) while (p[0]==' ' && p[1]==0) *p-- = 0; printf("SHOC FFT benchmark using MKL verison %s\n",mklver); } RunTest<cplxflt>("SP-FFT", resultDB, op); RunTest<cplxdbl>("DP-FFT", resultDB, op); }
void DoTest( const char* timerDesc, ResultDatabase& resultDB, OptionParser& opts ) { StencilFactory<T>* stdStencilFactory = NULL; Stencil<T>* stdStencil = NULL; StencilFactory<T>* testStencilFactory = NULL; Stencil<T>* testStencil = NULL; try { #if defined(PARALLEL) stdStencilFactory = new MPIHostStencilFactory<T>; testStencilFactory = new MPICUDAStencilFactory<T>; #else stdStencilFactory = new HostStencilFactory<T>; testStencilFactory = new CUDAStencilFactory<T>; #endif // defined(PARALLEL) assert( (stdStencilFactory != NULL) && (testStencilFactory != NULL) ); // do a sanity check on option values CheckOptions( opts ); stdStencilFactory->CheckOptions( opts ); testStencilFactory->CheckOptions( opts ); // extract and validate options std::vector<long long> arrayDims = opts.getOptionVecInt( "customSize" ); if( arrayDims.size() != 2 ) { cerr << "Dim size: " << arrayDims.size() << "\n"; throw InvalidArgValue( "all overall dimensions must be positive" ); } if (arrayDims[0] == 0) // User has not specified a custom size { int sizeClass = opts.getOptionInt("size"); arrayDims = StencilFactory<T>::GetStandardProblemSize( sizeClass ); } long int seed = (long)opts.getOptionInt( "seed" ); bool beVerbose = opts.getOptionBool( "verbose" ); unsigned int nIters = (unsigned int)opts.getOptionInt( "num-iters" ); double valErrThreshold = (double)opts.getOptionFloat( "val-threshold" ); unsigned int nValErrsToPrint = (unsigned int)opts.getOptionInt( "val-print-limit" ); #if defined(PARALLEL) unsigned int haloWidth = (unsigned int)opts.getOptionInt( "iters-per-exchange" ); #else unsigned int haloWidth = 1; #endif // defined(PARALLEL) float haloVal = (float)opts.getOptionFloat( "haloVal" ); // build a description of this experiment std::vector<long long> lDims = opts.getOptionVecInt( "lsize" ); assert( lDims.size() == 2 ); std::ostringstream experimentDescriptionStr; experimentDescriptionStr << nIters << ':' << arrayDims[0] << 'x' << arrayDims[1] << ':' << lDims[0] << 'x' << lDims[1]; unsigned int nPasses = (unsigned int)opts.getOptionInt( "passes" ); unsigned int nWarmupPasses = (unsigned int)opts.getOptionInt( "warmupPasses" ); // compute the expected result on the host // or read it from a pre-existing file std::string matrixFilenameBase = (std::string)opts.getOptionString( "expMatrixFile" ); #if defined(PARALLEL) int cwrank; MPI_Comm_rank( MPI_COMM_WORLD, &cwrank ); if( cwrank == 0 ) { #endif // defined(PARALLEL) if( !matrixFilenameBase.empty() ) { std::cout << "\nReading expected stencil operation result from file for later comparison with CUDA output\n" << std::endl; } else { std::cout << "\nPerforming stencil operation on host for later comparison with CUDA output\n" << "Depending on host capabilities, this may take a while." << std::endl; } #if defined(PARALLEL) } #endif // defined(PARALLEL) Matrix2D<T> expected( arrayDims[0] + 2*haloWidth, arrayDims[1] + 2*haloWidth ); Initialize<T> init( seed, haloWidth, haloVal ); bool haveExpectedData = false; if( ! matrixFilenameBase.empty() ) { bool readOK = ReadMatrixFromFile( expected, GetMatrixFileName<T>( matrixFilenameBase ) ); if( readOK ) { if( (expected.GetNumRows() != arrayDims[0] + 2*haloWidth) || (expected.GetNumColumns() != arrayDims[1] + 2*haloWidth) ) { std::cerr << "The matrix read from file \'" << GetMatrixFileName<T>( matrixFilenameBase ) << "\' does not match the matrix size specified on the command line.\n"; expected.Reset( arrayDims[0] + 2*haloWidth, arrayDims[1] + 2*haloWidth ); } else { haveExpectedData = true; } } if( !haveExpectedData ) { std::cout << "\nSince we could not read the expected matrix values,\nperforming stencil operation on host for later comparison with CUDA output.\n" << "Depending on host capabilities, this may take a while." << std::endl; } } if( !haveExpectedData ) { init( expected ); haveExpectedData = true; if( beVerbose ) { std::cout << "initial state:\n" << expected << std::endl; } stdStencil = stdStencilFactory->BuildStencil( opts ); (*stdStencil)( expected, nIters ); } if( beVerbose ) { std::cout << "expected result:\n" << expected << std::endl; } // determine whether we are to save the expected matrix values to a file // to speed up future runs matrixFilenameBase = (std::string)opts.getOptionString( "saveExpMatrixFile" ); if( !matrixFilenameBase.empty() ) { SaveMatrixToFile( expected, GetMatrixFileName<T>( matrixFilenameBase ) ); } assert( haveExpectedData ); // compute the result on the CUDA device Matrix2D<T> data( arrayDims[0] + 2*haloWidth, arrayDims[1] + 2*haloWidth ); Stencil<T>* testStencil = testStencilFactory->BuildStencil( opts ); // Compute the number of floating point operations we will perform. // // Note: in the truly-parallel case, we count flops for redundant // work due to the need for a halo. // But we do not add to the count for the local 1-wide halo since // we aren't computing new values for those items. unsigned long npts = (arrayDims[0] + 2*haloWidth - 2) * (arrayDims[1] + 2*haloWidth - 2); #if defined(PARALLEL) MPICUDAStencil<T>* mpiTestStencil = static_cast<MPICUDAStencil<T>*>( testStencil ); assert( mpiTestStencil != NULL ); int participating = mpiTestStencil->ParticipatingInProgram() ? 1 : 0; int numParticipating = 0; MPI_Allreduce( &participating, // src &numParticipating, // dest 1, // count MPI_INT, // type MPI_SUM, // op MPI_COMM_WORLD ); // communicator npts *= numParticipating; #endif // defined(PARALLEL) // In our 9-point stencil, there are 11 floating point operations // per point (3 multiplies and 11 adds): // // newval = weight_center * centerval + // weight_cardinal * (northval + southval + eastval + westval) + // weight_diagnoal * (neval + nwval + seval + swval) // // we do this stencil operation 'nIters' times unsigned long nflops = npts * 11 * nIters; #if defined(PARALLEL) if( cwrank == 0 ) { #endif // defined(PARALLEL) std::cout << "Performing " << nWarmupPasses << " warmup passes..."; #if defined(PARALLEL) } #endif // defined(PARALLEL) for( unsigned int pass = 0; pass < nWarmupPasses; pass++ ) { init(data); (*testStencil)( data, nIters ); } #if defined(PARALLEL) if( cwrank == 0 ) { #endif // defined(PARALLEL) std::cout << "done." << std::endl; #if defined(PARALLEL) } #endif // defined(PARALLEL) #if defined(PARALLEL) MPI_Comm_rank( MPI_COMM_WORLD, &cwrank ); if( cwrank == 0 ) { #endif // defined(PARALLEL) std::cout << "\nPerforming stencil operation on chosen device, " << nPasses << " passes.\n" << "Depending on chosen device, this may take a while." << std::endl; #if defined(PARALLEL) } #endif // defined(PARALLEL) #if !defined(PARALLEL) std::cout << "At the end of each pass the number of validation\nerrors observed will be printed to the standard output." << std::endl; #endif // !defined(PARALLEL) for( unsigned int pass = 0; pass < nPasses; pass++ ) { #if !defined(PARALLEL) std::cout << "pass " << pass << ": "; #endif // !defined(PARALLEL) init( data ); int timerHandle = Timer::Start(); (*testStencil)( data, nIters ); double elapsedTime = Timer::Stop( timerHandle, "CUDA stencil" ); // find and report the computation rate double gflops = (nflops / elapsedTime) / 1e9; resultDB.AddResult( timerDesc, experimentDescriptionStr.str(), "GFLOPS", gflops ); if( beVerbose ) { std::cout << "observed result, pass " << pass << ":\n" << data << std::endl; } // validate the result #if defined(PARALLEL) StencilValidater<T>* validater = new MPIStencilValidater<T>; #else StencilValidater<T>* validater = new SerialStencilValidater<T>; #endif // defined(PARALLEL) validater->ValidateResult( expected, data, valErrThreshold, nValErrsToPrint ); } } catch( ... ) { // clean up - abnormal termination // wish we didn't have to do this, but C++ exceptions do not // support a try-catch-finally approach delete stdStencil; delete stdStencilFactory; delete testStencil; delete testStencilFactory; throw; } // clean up - normal termination delete stdStencil; delete stdStencilFactory; delete testStencil; delete testStencilFactory; }
// **************************************************************************** // Method: main() // // Purpose: // serial and parallel main for OpenCL level0 benchmarks // // Arguments: // argc, argv // // Programmer: SHOC Team // Creation: The Epoch // // Modifications: // Jeremy Meredith, Tue Jan 12 15:09:33 EST 2010 // Changed the way device selection works. It now defaults to the device // index corresponding to the process's rank within a node if no devices // are specified on the command command line, and otherwise, round-robins // the list of devices among the tasks. // // Gabriel Marin, Tue Jun 01 15:38 EST 2010 // Check that we have valid (not NULL) context and queue objects before // running the benchmarks. Errors inside CreateContextFromSingleDevice or // CreateCommandQueueForContextAndDevice were not propagated out to the main // program. // // Jeremy Meredith, Wed Nov 10 14:20:47 EST 2010 // Split timing reports into detailed and summary. For serial code, we // report all trial values, and for parallel, skip the per-process vals. // Also detect and print outliers from parallel runs. // // **************************************************************************** int main(int argc, char *argv[]) { int ret = 0; try { #ifdef PARALLEL int rank, size; MPI_Init(&argc,&argv); MPI_Comm_size(MPI_COMM_WORLD, &size); MPI_Comm_rank(MPI_COMM_WORLD, &rank); cout << "MPI Task "<< rank << "/" << size - 1 << " starting....\n"; #endif OptionParser op; //Add shared options to the parser op.addOption("platform", OPT_INT, "0", "specify OpenCL platform to use", 'p'); op.addOption("device", OPT_VECINT, "", "specify device(s) to run on", 'd'); op.addOption("passes", OPT_INT, "10", "specify number of passes", 'n'); op.addOption("size", OPT_VECINT, "1", "specify problem size", 's'); op.addOption("infoDevices", OPT_BOOL, "", "show info for available platforms and devices", 'i'); op.addOption("verbose", OPT_BOOL, "", "enable verbose output", 'v'); op.addOption("quiet", OPT_BOOL, "", "write minimum necessary to standard output", 'q'); addBenchmarkSpecOptions(op); if (!op.parse(argc, argv)) { #ifdef PARALLEL if (rank == 0) op.usage(); MPI_Finalize(); #else op.usage(); #endif return (op.HelpRequested() ? 0 : 1 ); } if (op.getOptionBool("infoDevices")) { #define DEBUG_DEVICE_CONTAINER 0 #ifdef PARALLEL // execute following code only if I am the process of lowest // rank on this node NodeInfo NI; int mynoderank = NI.nodeRank(); if (mynoderank==0) { int nlrrank, nlrsize; MPI_Comm nlrcomm = NI.getNLRComm(); MPI_Comm_size(nlrcomm, &nlrsize); MPI_Comm_rank(nlrcomm, &nlrrank); OpenCLNodePlatformContainer ndc1; OpenCLMultiNodeContainer localMnc(ndc1); localMnc.doMerge (nlrrank, nlrsize, nlrcomm); if (rank==0) // I am the global rank 0, print all configurations localMnc.Print (cout); } #else OpenCLNodePlatformContainer ndc1; ndc1.Print (cout); #if DEBUG_DEVICE_CONTAINER OpenCLMultiNodeContainer mnc1(ndc1), mnc2; mnc1.Print (cout); ostringstream oss; mnc1.writeObject (oss); std::string temp(oss.str()); cout << "Serialized MultiNodeContainer:\n" << temp; istringstream iss(temp); mnc2.readObject (iss); cout << "Unserialized object2:\n"; mnc2.Print (cout); mnc1.merge (mnc2); cout << "==============\nObject1 after merging 1:\n"; mnc1.Print (cout); mnc1.merge (mnc2); cout << "==============\nObject1 after merging 2:\n"; mnc1.Print (cout); #endif // DEBUG #endif // PARALLEL return (0); } bool verbose = op.getOptionBool("verbose"); // The device option supports specifying more than one device // for now, just choose the first one. int platform = op.getOptionInt("platform"); #ifdef PARALLEL NodeInfo ni; int myNodeRank = ni.nodeRank(); if (verbose) cout << "Global rank "<<rank<<" is local rank "<<myNodeRank << endl; #else int myNodeRank = 0; #endif // If they haven't specified any devices, assume they // want the process with in-node rank N to use device N int deviceIdx = myNodeRank; // If they have, then round-robin the list of devices // among the processes on a node. vector<long long> deviceVec = op.getOptionVecInt("device"); if (deviceVec.size() > 0) { int len = deviceVec.size(); deviceIdx = deviceVec[myNodeRank % len]; } // Check for an erroneous device if (deviceIdx >= GetNumOclDevices(platform)) { cerr << "Warning: device index: " << deviceIdx << " out of range, defaulting to device 0.\n"; deviceIdx = 0; } // Initialization if (verbose) cout << ">> initializing\n"; cl_device_id devID = ListDevicesAndGetDevice(platform, deviceIdx); cl_int clErr; cl_context ctx = clCreateContext( NULL, // properties 1, // number of devices &devID, // device NULL, // notification function NULL, &clErr ); CL_CHECK_ERROR(clErr); cl_command_queue queue = clCreateCommandQueue( ctx, devID, CL_QUEUE_PROFILING_ENABLE, &clErr ); CL_CHECK_ERROR(clErr); ResultDatabase resultDB; // Run the benchmark RunBenchmark(devID, ctx, queue, resultDB, op); clReleaseCommandQueue( queue ); clReleaseContext( ctx ); #ifndef PARALLEL resultDB.DumpDetailed(cout); #else ParallelResultDatabase pardb; pardb.MergeSerialDatabases(resultDB,MPI_COMM_WORLD); if (rank==0) { pardb.DumpSummary(cout); pardb.DumpOutliers(cout); } #endif } catch( std::exception& e ) { std::cerr << e.what() << std::endl; ret = 1; } catch( ... ) { std::cerr << "unrecognized exception caught" << std::endl; ret = 1; } #ifdef PARALLEL MPI_Finalize(); #endif return ret; }
void RunBenchmark(OptionParser &op, ResultDatabase &resultDB) { const bool verbose = op.getOptionBool("verbose"); // Sizes are in kb const int nSizes = 17; int sizes[nSizes] = {1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384, 32768, 65536}; long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; // Create host memory hostMem = (float*)_mm_malloc(numMaxFloats*sizeof(float),ALIGN); if(hostMem==NULL) { cerr << "Couldn't allocate CPU memory! \n"; cerr << "Test failed." << endl; return; } // Initialize memory with some pattern. for (int i = 0; i < numMaxFloats; i++) { hostMem[i] = i % 77; } const unsigned int passes = op.getOptionInt("passes"); int micdev = op.getOptionInt("target"); // Allocate memory on the card #pragma offload target(mic:micdev) \ nocopy(hostMem:length(numMaxFloats) alloc_if(1) free_if(0) align(ALIGN) ) { } // Three passes, forward and backward both for (int pass = 0; pass < passes; pass++) { // Step through sizes forward on even passes and backward on odd for (int i = 0; i < nSizes; i++) { int sizeIndex; if ((pass % 2) == 0) { sizeIndex = i; } else { sizeIndex = (nSizes - 1) - i; } int nbytes = sizes[sizeIndex] * 1024; // D->H test double start = curr_second(); #pragma offload target(mic:micdev) \ out(hostMem:length((1024*sizes[sizeIndex]/4)) \ free_if(0) alloc_if(0) ) { } double t = curr_second()-start; if (verbose) { cerr << "Size " << sizes[sizeIndex] << "k took " << t << " sec\n"; } double speed = (double(sizes[sizeIndex]) * 1024 / (1000. * 1000. * 1000.)) / t; char sizeStr[256]; sprintf(sizeStr, "% 6dkB", sizes[sizeIndex]); resultDB.AddResult("ReadbackSpeed", sizeStr, "GB/sec", speed); resultDB.AddResult("ReadbackTime", sizeStr, "ms", t*1000); } } // Free memory allocated on the mic #pragma offload target(mic:micdev) \ in(hostMem:length(numMaxFloats) alloc_if(0) ) { } // Cleanup _mm_free(hostMem); }
void DoTest( const char* timerDesc, ResultDatabase& resultDB, OptionParser& opts ) { StencilFactory<T>* stdStencilFactory = NULL; Stencil<T>* stdStencil = NULL; StencilFactory<T>* testStencilFactory = NULL; Stencil<T>* testStencil = NULL; //try { stdStencilFactory = new HostStencilFactory<T>; testStencilFactory = new MICStencilFactory<T>; assert( (stdStencilFactory != NULL) && (testStencilFactory != NULL) ); // do a sanity check on option values CheckOptions( opts ); stdStencilFactory->CheckOptions( opts ); testStencilFactory->CheckOptions( opts ); // extract and validate options std::vector<long long> arrayDims = opts.getOptionVecInt( "customSize" ); if( arrayDims.size() != 2 ) { cerr << "Dim size: " << arrayDims.size() << "\n"; //throw InvalidArgValue( "all overall dimensions must be positive" ); } if (arrayDims[0] == 0) // User has not specified a custom size { const int probSizes[4] = { 768, 1408, 2048, 4096 }; int sizeClass = opts.getOptionInt("size"); if (!(sizeClass >= 0 && sizeClass < 5)) { //throw InvalidArgValue( "Size class must be between 1-4" ); } arrayDims[0] = arrayDims[1] =probSizes[sizeClass - 1]; } long int seed = (long)opts.getOptionInt( "seed" ); bool beVerbose = opts.getOptionBool( "verbose" ); unsigned int nIters = (unsigned int)opts.getOptionInt( "num-iters" ); double valErrThreshold = (double)opts.getOptionFloat( "val-threshold" ); unsigned int nValErrsToPrint = (unsigned int)opts.getOptionInt( "val-print-limit" ); #if defined(PARALLEL) unsigned int haloWidth = (unsigned int)opts.getOptionInt( "iters-per-exchange" ); #else unsigned int haloWidth = 1; #endif // defined(PARALLEL) float haloVal = (float)opts.getOptionFloat( "haloVal" ); // build a description of this experiment std::ostringstream experimentDescriptionStr; experimentDescriptionStr << nIters << ':' << arrayDims[0] << 'x' << arrayDims[1] << ':' << LROWS << 'x' << LCOLS; unsigned int nPasses =(unsigned int)opts.getOptionInt( "passes" ); unsigned long npts = (arrayDims[0] + 2*haloWidth - 2) * (arrayDims[1] + 2*haloWidth - 2); unsigned long nflops = npts * 11 * nIters; cout<<"flops are = "<<nflops<<endl; // compute the expected result on the host #if defined(PARALLEL) int cwrank; MPI_Comm_rank( MPI_COMM_WORLD, &cwrank ); if( cwrank == 0 ) { #endif // defined(PARALLEL) std::cout << "\nPerforming stencil operation on host for later comparison with MIC output\n" << "Depending on host capabilities, this may take a while." << std::endl; #if defined(PARALLEL) } #endif // defined(PARALLEL) Matrix2D<T> exp( arrayDims[0] + 2*haloWidth, arrayDims[1] + 2*haloWidth ); Initialize<T> init( seed, haloWidth, haloVal ); init( exp ); if( beVerbose ) { std::cout << "initial state:\n" << exp << std::endl; } Stencil<T>* stdStencil = stdStencilFactory->BuildStencil( opts ); (*stdStencil)( exp, nIters ); if( beVerbose ) { std::cout << "expected result:\n" << exp << std::endl; } // compute the result on the MIC device Matrix2D<T> data( arrayDims[0] + 2*haloWidth, arrayDims[1] + 2*haloWidth ); Stencil<T>* testStencil = testStencilFactory->BuildStencil( opts ); #if defined(PARALLEL) MPI_Comm_rank( MPI_COMM_WORLD, &cwrank ); if( cwrank == 0 ) { #endif // defined(PARALLEL) std::cout << "\nPerforming stencil operation on chosen device, " << nPasses << " passes.\n" << "Depending on chosen device, this may take a while." << std::endl; #if defined(PARALLEL) } #endif // defined(PARALLEL) #if !defined(PARALLEL) std::cout << "At the end of each pass the number of validation\nerrors observed will be printed to the standard output." << std::endl; #endif // !defined(PARALLEL) std::cout<<"Passes:"<<nPasses<<endl; for( unsigned int pass = 0; pass < nPasses; pass++ ) { init( data ); double start = curr_second(); (*testStencil)( data, nIters ); double elapsedTime = curr_second()-start; double gflops = (nflops / elapsedTime) / 1e9; resultDB.AddResult( timerDesc, experimentDescriptionStr.str(), "GFLOPS", gflops ); if( beVerbose ) { std::cout << "observed result, pass " << pass << ":\n" << data << std::endl; } // validate the result #if defined(PARALLEL) //StencilValidater<T>* validater = new MPIStencilValidater<T>; #else //StencilValidater<T>* validater = new SerialStencilValidater<T>; #endif // defined(PARALLEL) MICValidate(exp,data,valErrThreshold,nValErrsToPrint); /*validater->ValidateResult( exp, data, valErrThreshold, nValErrsToPrint );*/ } } /* catch( ... ) { // clean up - abnormal termination // wish we didn't have to do this, but C++ exceptions do not // support a try-catch-finally approach delete stdStencil; delete stdStencilFactory; delete testStencil; delete testStencilFactory; throw; }*/ // clean up - normal termination delete stdStencil; delete stdStencilFactory; delete testStencil; delete testStencilFactory; }
// **************************************************************************** // Function: RunBenchmark // // Purpose: // Executes a series of arithmetic benchmarks for OpenCL devices. // OpenCL kernels are auto-generated based on the values in the // _benchmark_type structures. // The benchmark tests throughput for add, multiply, multiply-add and // multiply+multiply-add series of operations, for 1, 2, 4 and 8 // independent streams.. // // Arguments: // ctx: the opencl context to use for the benchmark // queue: the opencl command queue to issue commands to // resultDB: results from the benchmark are stored in this db // op: the options parser (contains input parameters) // // Returns: nothing // // Programmer: Gabriel Marin // Creation: June 26, 2009 // // Modifications: // // **************************************************************************** void RunBenchmark(cl::Device& devcpp, cl::Context& ctxcpp, cl::CommandQueue& queuecpp, ResultDatabase &resultDB, OptionParser &op) { // convert from C++ bindings to C bindings // TODO propagate use of C++ bindings cl_device_id id = devcpp(); cl_context ctx = ctxcpp(); cl_command_queue queue = queuecpp(); int npasses = op.getOptionInt("passes"); bool verbose = op.getOptionBool("verbose"); bool quiet = op.getOptionBool("quiet"); int err; cl_mem mem1; float *hostMem, *hostMem2; size_t maxGroupSize = 1; size_t localWorkSize = 1; // Seed the random number generator srand48(8650341L); // To prevent this benchmark from taking too long to run, we // calibrate how many repetitions of each test to execute. To do this we // run one pass through a multiply-add benchmark and then adjust // the repeat factor based on runtime. Use MulMAdd4 for this. int aIdx = 0; float repeatF = 1.0f; // Find the index of the MAdd4 benchmark while ((aTests!=0) && (aTests[aIdx].name!=0) && strcmp(aTests[aIdx].name,"MAdd4")) { aIdx += 1; } if (aTests && aTests[aIdx].name) // we found a benchmark with that name { struct _benchmark_type temp = aTests[aIdx]; // Limit to one repetition temp.numRepeats = 10; // Kernel will be generated into this stream ostringstream oss; generateKernel (oss, temp, "float", ""); std::string kernelCode(oss.str()); // Allocate host memory int halfNumFloatsMax = temp.halfBufSizeMax*1024/4; int numFloatsMax = 2*halfNumFloatsMax; hostMem = new float[numFloatsMax]; hostMem2 = new float[numFloatsMax]; // Allocate device memory mem1 = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(float)*numFloatsMax, NULL, &err); CL_CHECK_ERROR(err); err = clEnqueueWriteBuffer(queue, mem1, true, 0, numFloatsMax*sizeof(float), hostMem, 0, NULL, NULL); CL_CHECK_ERROR(err); // Create the kernel program const char* progSource[] = {kernelCode.c_str()}; cl_program prog = clCreateProgramWithSource(ctx, 1, progSource, NULL, &err); CL_CHECK_ERROR(err); // Compile the kernel err = clBuildProgram(prog, 0, NULL, opts, NULL, NULL); // Compile the kernel CL_CHECK_ERROR(err); // Extract out madd kernel cl_kernel kernel_madd = clCreateKernel(prog, temp.name, &err); CL_CHECK_ERROR(err); // Set kernel arguments err = clSetKernelArg (kernel_madd, 0, sizeof(cl_mem), (void*)&mem1); CL_CHECK_ERROR (err); err = clSetKernelArg (kernel_madd, 1, sizeof(cl_int), (void*)&temp.numRepeats); CL_CHECK_ERROR (err); // Determine the maximum work group size for this kernel maxGroupSize = getMaxWorkGroupSize(id); // use min(maxWorkGroupSize, 256) localWorkSize = maxGroupSize<128?maxGroupSize:128; // Initialize host data, with the first half the same as the second for (int j=0; j<halfNumFloatsMax; ++j) { hostMem[j] = hostMem[numFloatsMax-j-1] = (float)(drand48()*5.0); } // Set global work size size_t globalWorkSize = numFloatsMax; Event evCopyMem("CopyMem"); err = clEnqueueWriteBuffer (queue, mem1, true, 0, numFloatsMax*sizeof(float), hostMem, 0, NULL, &evCopyMem.CLEvent()); CL_CHECK_ERROR (err); // Wait for transfer to finish err = clWaitForEvents (1, &evCopyMem.CLEvent()); CL_CHECK_ERROR (err); Event evKernel(temp.name); err = clEnqueueNDRangeKernel (queue, kernel_madd, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, &evKernel.CLEvent()); CL_CHECK_ERROR (err); // Wait for kernel to finish err = clWaitForEvents (1, &evKernel.CLEvent()); CL_CHECK_ERROR (err); evKernel.FillTimingInfo(); // Calculate repeat factor based on kernel runtime double tt = double(evKernel.SubmitEndRuntime()); repeatF = 1.1e07 / tt; cout << "Adjust repeat factor = " << repeatF << endl; // Clean up err = clReleaseKernel (kernel_madd); CL_CHECK_ERROR(err); err = clReleaseProgram (prog); CL_CHECK_ERROR(err); err = clReleaseMemObject(mem1); CL_CHECK_ERROR(err); delete[] hostMem; delete[] hostMem2; } // Compute total number of kernel runs int totalRuns = 0; aIdx = 0; while ((aTests!=0) && (aTests[aIdx].name!=0)) { for (int halfNumFloats=aTests[aIdx].halfBufSizeMin*1024 ; halfNumFloats<=aTests[aIdx].halfBufSizeMax*1024 ; halfNumFloats*=aTests[aIdx].halfBufSizeStride) { totalRuns += npasses; } aIdx += 1; } // Account for custom kernels totalRuns += 2 * npasses; // check for double precision support int hasDoubleFp = 0; string doublePragma = ""; if (checkExtension(id, "cl_khr_fp64")) { hasDoubleFp = 1; doublePragma = "#pragma OPENCL EXTENSION cl_khr_fp64: enable"; } else if (checkExtension(id, "cl_amd_fp64")) { hasDoubleFp = 1; doublePragma = "#pragma OPENCL EXTENSION cl_amd_fp64: enable"; } // Double the number of passes if double precision support found if (hasDoubleFp) { cout << "DP Supported" << endl; totalRuns <<= 1; } else cout << "DP Not Supported" << endl; ProgressBar pb(totalRuns); if (!verbose && !quiet) pb.Show(stdout); RunTest<float> (id, ctx, queue, resultDB, npasses, verbose, quiet, repeatF, localWorkSize, pb, "float", "-SP", ""); if (hasDoubleFp) RunTest<double> (id, ctx, queue, resultDB, npasses, verbose, quiet, repeatF, localWorkSize, pb, "double", "-DP", doublePragma.c_str()); else { aIdx = 0; const char atts[] = "DP_Not_Supported"; while ((aTests!=0) && (aTests[aIdx].name!=0)) { for (int pas=0 ; pas<npasses ; ++pas) { resultDB.AddResult(string(aTests[aIdx].name)+"-DP" , atts, "GFLOPS", FLT_MAX); } aIdx += 1; } for (int pas=0 ; pas<npasses ; ++pas) { resultDB.AddResult("MulMAddU-DP", atts, "GFLOPS", FLT_MAX); resultDB.AddResult("MAddU-DP", atts, "GFLOPS", FLT_MAX); } } if (!verbose) fprintf (stdout, "\n\n"); }
void RunTest(const string& name, ResultDatabase &resultDB, OptionParser &op) { static __declspec(target(mic)) T2 *source; int chk; unsigned long bytes = 0; const int micdev = op.getOptionInt("target"); const bool verbose = op.getOptionBool("verbose"); // Get problem size if (op.getOptionInt("MB") == 0) { int probSizes[4] = { 1, 8, 96, 256 }; int sizeIndex = op.getOptionInt("size")-1; if (sizeIndex < 0 || sizeIndex >= 4) { cerr << "Invalid size index specified\n"; exit(-1); } bytes = probSizes[sizeIndex]; } else { bytes = op.getOptionInt("MB"); } // Convert to MiB bytes *= 1024 * 1024; int passes = op.getOptionInt("passes"); // The size of the transform computed is fixed at 512 complex elements int fftsz = 512; int N = (bytes)/sizeof(T2); int n_ffts = N/fftsz; // Allocate space (aligned) source = (T2*) MKL_malloc(bytes, 4096); //allocate buffers and create FFT plans #pragma offload target(mic:micdev) in(fftsz, n_ffts) \ nocopy(source:length(N) \ align(4096) alloc_if(1) free_if(0)) { forward((T2*)NULL, fftsz, n_ffts); inverse((T2*)NULL, fftsz, n_ffts); } const char *sizeStr; stringstream ss; ss << "N=" << (long)N; sizeStr = strdup(ss.str().c_str()); for(int k = 0; k < passes; k++) { init<T2>( source, fftsz, n_ffts ); // Warmup if (k==0) { #pragma offload target(mic:micdev) in(fftsz, n_ffts) \ in(source:length(N) \ alloc_if(0) free_if(0)) { forward(source, fftsz, n_ffts); } } // Time forward fft with data transfer over PCIe double time_fwd_pcie = -curr_second(); // Using in rather than inout to be consistent with CUDA version. #pragma offload target(mic:micdev) in(fftsz, n_ffts) \ in(source:length(N) alloc_if(0) \ free_if(0)) { forward(source, fftsz, n_ffts); } time_fwd_pcie += curr_second(); #pragma offload target(mic:micdev) out(source:length(N) alloc_if(0) \ free_if(0)) { } // Time inverse fft with data transfer over PCIe double time_inv_pcie = -curr_second(); #pragma offload target(mic:micdev) in(fftsz, n_ffts) \ in(source:length(N) \ alloc_if(0) free_if(0)) { inverse(source, fftsz, n_ffts); } time_inv_pcie += curr_second(); #pragma offload target(mic:micdev) out(source:length(N) alloc_if(0) \ free_if(0)) {} // Check result #pragma offload target(mic:micdev) in(fftsz,n_ffts) nocopy(source) \ out(chk) { chk = checkDiff(source, fftsz, n_ffts); } if (verbose || chk) { cout << "Test " << k << ((chk) ? ": Failed\n" : ": Passed\n"); } // Time forward fft without data transfer double time_fwd_native = -curr_second(); #pragma offload target(mic:micdev) in(fftsz, n_ffts) nocopy(source) { forward(source, fftsz, n_ffts); } time_fwd_native += curr_second(); // Time inverse fft without data transfer double time_inv_native = -curr_second(); #pragma offload target(mic:micdev) in(fftsz, n_ffts) nocopy(source) { inverse(source, fftsz, n_ffts); } time_inv_native += curr_second(); // Calculate gflops double flop_count = n_ffts*(5*fftsz*log2(fftsz)); double GF_fwd_pcie = flop_count / (time_fwd_pcie * 1e9); double GF_fwd_native = flop_count / (time_fwd_native * 1e9); double GF_inv_pcie = flop_count / (time_inv_pcie * 1e9); double GF_inv_native = flop_count / (time_inv_native * 1e9); resultDB.AddResult(name, sizeStr, "GFLOPS", GF_fwd_native); resultDB.AddResult(name+"_PCIe", sizeStr, "GFLOPS", GF_fwd_pcie); resultDB.AddResult(name+"_Parity", sizeStr, "N", (time_fwd_pcie - time_fwd_native) / time_fwd_native); resultDB.AddResult(name+"-INV", sizeStr, "GFLOPS", GF_inv_native); resultDB.AddResult(name+"-INV_PCIe", sizeStr, "GFLOPS", GF_inv_pcie); resultDB.AddResult(name+"-INV_Parity", sizeStr, "N", (time_inv_pcie - time_inv_native) / time_inv_native); } // Cleanup FFT plans and buffers #pragma offload target(mic:micdev) nocopy(source:length(N) \ alloc_if(0) free_if(1)) { forward((T2*)NULL, 0, 0); inverse((T2*)NULL, 0, 0); } MKL_free(source); }
// **************************************************************************** // Function: main // // Purpose: // The main function takes care of initialization (device and MPI), then // performs the benchmark and prints results. // // Arguments: // // // Programmer: Jeremy Meredith // Creation: // // Modifications: // Jeremy Meredith, Wed Nov 10 14:20:47 EST 2010 // Split timing reports into detailed and summary. For serial code, we // report all trial values, and for parallel, skip the per-process vals. // Also detect and print outliers from parallel runs. // // **************************************************************************** int main(int argc, char *argv[]) { int ret = 0; bool noprompt = false; try { #ifdef PARALLEL int rank, size; MPI_Init(&argc,&argv); MPI_Comm_size(MPI_COMM_WORLD, &size); MPI_Comm_rank(MPI_COMM_WORLD, &rank); cerr << "MPI Task " << rank << "/" << size - 1 << " starting....\n"; #endif // Get args OptionParser op; //Add shared options to the parser op.addOption("device", OPT_VECINT, "0", "specify device(s) to run on", 'd'); op.addOption("verbose", OPT_BOOL, "", "enable verbose output", 'v'); op.addOption("passes", OPT_INT, "10", "specify number of passes", 'n'); op.addOption("size", OPT_INT, "1", "specify problem size", 's'); op.addOption("infoDevices", OPT_BOOL, "", "show info for available platforms and devices", 'i'); op.addOption("quiet", OPT_BOOL, "", "write minimum necessary to standard output", 'q'); #ifdef _WIN32 op.addOption("noprompt", OPT_BOOL, "", "don't wait for prompt at program exit"); #endif addBenchmarkSpecOptions(op); if (!op.parse(argc, argv)) { #ifdef PARALLEL if (rank == 0) op.usage(); MPI_Finalize(); #else op.usage(); #endif return (op.HelpRequested() ? 0 : 1); } bool verbose = op.getOptionBool("verbose"); bool infoDev = op.getOptionBool("infoDevices"); #ifdef _WIN32 noprompt = op.getOptionBool("noprompt"); #endif int device; #ifdef PARALLEL NodeInfo ni; int myNodeRank = ni.nodeRank(); vector<long long> deviceVec = op.getOptionVecInt("device"); if (myNodeRank >= deviceVec.size()) { // Default is for task i to test device i device = myNodeRank; } else { device = deviceVec[myNodeRank]; } #else device = op.getOptionVecInt("device")[0]; #endif int deviceCount; cudaGetDeviceCount(&deviceCount); if (device >= deviceCount) { cerr << "Warning: device index: " << device << " out of range, defaulting to device 0.\n"; device = 0; } // Initialization EnumerateDevicesAndChoose(device, infoDev); if( infoDev ) { return 0; } ResultDatabase resultDB; // Run the benchmark RunBenchmark(resultDB, op); #ifndef PARALLEL resultDB.DumpDetailed(cout); #else ParallelResultDatabase pardb; pardb.MergeSerialDatabases(resultDB,MPI_COMM_WORLD); if (rank==0) { pardb.DumpSummary(cout); pardb.DumpOutliers(cout); } #endif } catch( InvalidArgValue& e ) { std::cerr << e.what() << ": " << e.GetMessage() << std::endl; ret = 1; } catch( std::exception& e ) { std::cerr << e.what() << std::endl; ret = 1; } catch( ... ) { ret = 1; } #ifdef PARALLEL MPI_Finalize(); #endif #ifdef _WIN32 if (!noprompt) { cout << "Press return to exit\n"; cin.get(); } #endif return ret; }
void init(OptionParser& op, bool _do_dp) { cl_int err; do_dp = _do_dp; if (!fftCtx) { // first get the device int device, platform = op.getOptionInt("platform"); if (op.getOptionVecInt("device").size() > 0) { device = op.getOptionVecInt("device")[0]; } else { device = 0; } fftDev = ListDevicesAndGetDevice(platform, device); // now get the context fftCtx = clCreateContext(NULL, 1, &fftDev, NULL, NULL, &err); CL_CHECK_ERROR(err); } if (!fftQueue) { // get a queue fftQueue = clCreateCommandQueue(fftCtx, fftDev, CL_QUEUE_PROFILING_ENABLE, &err); CL_CHECK_ERROR(err); } // create the program... fftProg = clCreateProgramWithSource(fftCtx, 1, &cl_source_fft, NULL, &err); CL_CHECK_ERROR(err); // ...and build it string args = " -cl-mad-enable "; if (op.getOptionBool("use-native")) { args += " -cl-fast-relaxed-math "; } if (!do_dp) { args += " -DSINGLE_PRECISION "; } else if (checkExtension(fftDev, "cl_khr_fp64")) { args += " -DK_DOUBLE_PRECISION "; } else if (checkExtension(fftDev, "cl_amd_fp64")) { args += " -DAMD_DOUBLE_PRECISION "; } err = clBuildProgram(fftProg, 0, NULL, args.c_str(), NULL, NULL); { char* log = NULL; size_t bytesRequired = 0; err = clGetProgramBuildInfo(fftProg, fftDev, CL_PROGRAM_BUILD_LOG, 0, NULL, &bytesRequired ); log = (char*)malloc( bytesRequired + 1 ); err = clGetProgramBuildInfo(fftProg, fftDev, CL_PROGRAM_BUILD_LOG, bytesRequired, log, NULL ); std::cout << log << std::endl; free( log ); } if (err != CL_SUCCESS) { char log[50000]; size_t retsize = 0; err = clGetProgramBuildInfo(fftProg, fftDev, CL_PROGRAM_BUILD_LOG, 50000*sizeof(char), log, &retsize); CL_CHECK_ERROR(err); cout << "Retsize: " << retsize << endl; cout << "Log: " << log << endl; dumpPTXCode(fftCtx, fftProg, "oclFFT"); exit(-1); } else { // dumpPTXCode(fftCtx, fftProg, "oclFFT"); } // Create kernel for forward FFT fftKrnl = clCreateKernel(fftProg, "fft1D_512", &err); CL_CHECK_ERROR(err); // Create kernel for inverse FFT ifftKrnl = clCreateKernel(fftProg, "ifft1D_512", &err); CL_CHECK_ERROR(err); // Create kernel for check chkKrnl = clCreateKernel(fftProg, "chk1D_512", &err); CL_CHECK_ERROR(err); }
// Modifications: // Jeremy Meredith, Wed Dec 1 17:05:27 EST 2010 // Added calculation of latency estimate. // void RunBenchmark(cl::Device& devcpp, cl::Context& ctxcpp, cl::CommandQueue& queuecpp, ResultDatabase &resultDB, OptionParser &op) { // convert from C++ bindings to C bindings // TODO propagate use of C++ bindings cl_device_id id = devcpp(); cl_context ctx = ctxcpp(); cl_command_queue queue = queuecpp(); bool verbose = op.getOptionBool("verbose"); bool pinned = !op.getOptionBool("nopinned"); int npasses = op.getOptionInt("passes"); const bool waitForEvents = true; // Sizes are in kb int nSizes = 20; int sizes[20] = {1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384, 32768,65536,131072,262144,524288}; // Max sure we don't surpass the OpenCL limit. cl_long maxAllocSizeBytes = 0; clGetDeviceInfo(id, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_long), &maxAllocSizeBytes, NULL); while (sizes[nSizes-1]*1024 > 0.90 * maxAllocSizeBytes) { --nSizes; if (verbose) cout << " - dropping allocation size to keep under reported limit.\n"; if (nSizes < 1) { cerr << "Error: OpenCL reported a max allocation size less than 1kB.\b"; return; } } // Create some host memory pattern if (verbose) cout << ">> creating host mem pattern\n"; int err; float *hostMem; cl_mem hostMemObj; long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; if (pinned) { hostMemObj = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(float)*numMaxFloats, NULL, &err); if (err == CL_SUCCESS) { hostMem = (float*)clEnqueueMapBuffer(queue, hostMemObj, true, CL_MAP_READ|CL_MAP_WRITE, 0,sizeof(float)*numMaxFloats,0, NULL,NULL,&err); } while (err != CL_SUCCESS) { // drop the size and try again if (verbose) cout << " - dropping size allocating pinned mem\n"; --nSizes; if (nSizes < 1) { cerr << "Error: Couldn't allocated any pinned buffer\n"; return; } numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; hostMemObj = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(float)*numMaxFloats, NULL, &err); if (err == CL_SUCCESS) { hostMem = (float*)clEnqueueMapBuffer(queue, hostMemObj, true, CL_MAP_READ|CL_MAP_WRITE, 0,sizeof(float)*numMaxFloats,0, NULL,NULL,&err); } } } else { hostMem = new float[numMaxFloats]; } for (int i=0; i<numMaxFloats; i++) hostMem[i] = i % 77; // Allocate some device memory if (verbose) cout << ">> allocating device mem\n"; cl_mem mem1 = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(float)*numMaxFloats, NULL, &err); while (err != CL_SUCCESS) { // drop the size and try again if (verbose) cout << " - dropping size allocating device mem\n"; --nSizes; if (nSizes < 1) { cerr << "Error: Couldn't allocated any device buffer\n"; return; } numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; mem1 = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(float)*numMaxFloats, NULL, &err); } if (verbose) cout << ">> filling device mem to force allocation\n"; Event evDownloadPrime("DownloadPrime"); err = clEnqueueWriteBuffer(queue, mem1, false, 0, numMaxFloats*sizeof(float), hostMem, 0, NULL, &evDownloadPrime.CLEvent()); CL_CHECK_ERROR(err); if (verbose) cout << ">> waiting for download to finish\n"; err = clWaitForEvents(1, &evDownloadPrime.CLEvent()); CL_CHECK_ERROR(err); // Three passes, forward and backward both for (int pass = 0; pass < npasses; pass++) { // store the times temporarily to estimate latency //float times[nSizes]; // Step through sizes forward on even passes and backward on odd for (int i = 0; i < nSizes; i++) { int sizeIndex; if ((pass%2) == 0) sizeIndex = i; else sizeIndex = (nSizes-1) - i; // Copy input memory to the device if (verbose) cout << ">> copying to device "<<sizes[sizeIndex]<<"kB\n"; Event evDownload("Download"); err = clEnqueueWriteBuffer(queue, mem1, false, 0, sizes[sizeIndex]*1024, hostMem, 0, NULL, &evDownload.CLEvent()); CL_CHECK_ERROR(err); // Wait for event to finish if (verbose) cout << ">> waiting for download to finish\n"; err = clWaitForEvents(1, &evDownload.CLEvent()); CL_CHECK_ERROR(err); if (verbose) cout << ">> finish!"; if (verbose) cout << endl; // Get timings err = clFlush(queue); CL_CHECK_ERROR(err); evDownload.FillTimingInfo(); if (verbose) evDownload.Print(cerr); double t = evDownload.SubmitEndRuntime() / 1.e6; // in ms //times[sizeIndex] = t; // Add timings to database double speed = (double(sizes[sizeIndex] * 1024.) / (1000.*1000.)) / t; char sizeStr[256]; sprintf(sizeStr, "% 7dkB", sizes[sizeIndex]); resultDB.AddResult("DownloadSpeed", sizeStr, "GB/sec", speed); // Add timings to database double delay = evDownload.SubmitStartDelay() / 1.e6; resultDB.AddResult("DownloadDelay", sizeStr, "ms", delay); resultDB.AddResult("DownloadTime", sizeStr, "ms", t); } //resultDB.AddResult("DownloadLatencyEstimate", "1-2kb", "ms", times[0]-(times[1]-times[0])/1.); //resultDB.AddResult("DownloadLatencyEstimate", "1-4kb", "ms", times[0]-(times[2]-times[0])/3.); //resultDB.AddResult("DownloadLatencyEstimate", "2-4kb", "ms", times[1]-(times[2]-times[1])/1.); } // Cleanup err = clReleaseMemObject(mem1); CL_CHECK_ERROR(err); if (pinned) { err = clReleaseMemObject(hostMemObj); CL_CHECK_ERROR(err); } else { delete[] hostMem; } }