void StencilFactory<T>::ExtractOptions( const OptionParser& options, T& wCenter, T& wCardinal, T& wDiagonal )
{
    // Copy the three stencil weights out of the parsed options into the
    // caller's variables.  Each weight is read with getOptionFloat() and
    // converted to T by the assignment.  The options are read, and the
    // outputs written, in the order center, cardinal, diagonal.
    const char* const optionNames[3] =
    {
        "weight-center",
        "weight-cardinal",
        "weight-diagonal"
    };
    T* const destinations[3] = { &wCenter, &wCardinal, &wDiagonal };

    for( int which = 0; which < 3; ++which )
    {
        *destinations[which] = options.getOptionFloat( optionNames[which] );
    }
}
// validate stencil-independent values void CheckOptions( const OptionParser& opts ) { // check matrix dimensions - must be 2d, must be positive std::vector<long long> arrayDims = opts.getOptionVecInt( "customSize" ); if( arrayDims.size() != 2 ) { throw InvalidArgValue( "overall size must have two dimensions" ); } if( (arrayDims[0] < 0) || (arrayDims[1] < 0) ) { throw InvalidArgValue( "each size dimension must be positive" ); } // validation error threshold must be positive float valThreshold = opts.getOptionFloat( "val-threshold" ); if( valThreshold <= 0.0f ) { throw InvalidArgValue( "validation threshold must be positive" ); } // number of validation errors to print must be non-negative int nErrsToPrint = opts.getOptionInt( "val-print-limit" ); if( nErrsToPrint < 0 ) { throw InvalidArgValue( "number of validation errors to print must be non-negative" ); } int nWarmupPasses = opts.getOptionInt( "warmupPasses" ); if( nWarmupPasses < 0 ) { throw InvalidArgValue( "number of warmup passes must be non-negative" ); } }
void RunTest(cl_device_id dev, cl_context ctx, cl_command_queue queue, ResultDatabase &resultDB, OptionParser &op, string compileFlags, int nRows=0) { // Determine if the device is capable of using images in general cl_device_id device_id; cl_bool deviceSupportsImages; int err = 0; err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(device_id), &device_id, NULL); CL_CHECK_ERROR(err); err = clGetDeviceInfo(device_id, CL_DEVICE_IMAGE_SUPPORT, sizeof(deviceSupportsImages), &deviceSupportsImages, NULL); CL_CHECK_ERROR(err); size_t maxImgWidth = 0; err = clGetDeviceInfo(device_id, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &maxImgWidth, NULL); CL_CHECK_ERROR(err); // Make sure our sampler type is supported cl_sampler sampler; sampler = clCreateSampler(ctx, CL_FALSE, CL_ADDRESS_NONE, CL_FILTER_NEAREST, &err); if (err != CL_SUCCESS) { cout << "Warning: Device does not support required sampler type"; cout << " falling back to global memory\n"; deviceSupportsImages = false; } else { clReleaseSampler(sampler); } // Host data structures // array of values in the sparse matrix floatType *h_val, *h_valPad; // array of column indices for each value in h_val int *h_cols, *h_colsPad; // array of indices to the start of each row in h_val/valPad int *h_rowDelimiters, *h_rowDelimitersPad; // Dense vector of values floatType *h_vec; // Output vector floatType *h_out; // Reference solution computed by cpu floatType *refOut; int nItems; // number of non-zero elements in the matrix int nItemsPadded; int numRows; // number of rows in the matrix // This benchmark either reads in a matrix market input file or // generates a random matrix string inFileName = op.getOptionString("mm_filename"); if (inFileName == "random") { // If we're not opening a file, the dimension of the matrix // has been passed in as an argument numRows = nRows; nItems = numRows * numRows / 100; // 1% of entries will be non-zero float maxval = op.getOptionFloat("maxval"); h_val = new floatType[nItems]; h_cols = 
new int[nItems]; h_rowDelimiters = new int[nRows+1]; fill(h_val, nItems, maxval); initRandomMatrix(h_cols, h_rowDelimiters, nItems, numRows); } else { char filename[FIELD_LENGTH]; strcpy(filename, inFileName.c_str()); readMatrix(filename, &h_val, &h_cols, &h_rowDelimiters, &nItems, &numRows); } // Final Image Check -- Make sure the image format is supported. int imgHeight = (numRows+maxImgWidth-1)/maxImgWidth; cl_image_format fmt; fmt.image_channel_data_type = CL_FLOAT; if(sizeof(floatType)==4) { fmt.image_channel_order=CL_R; } else { fmt.image_channel_order=CL_RG; } cl_mem d_vec = clCreateImage2D(ctx, CL_MEM_READ_ONLY, &fmt, maxImgWidth, imgHeight, 0, NULL, &err); if (err != CL_SUCCESS) { deviceSupportsImages = false; } else { clReleaseMemObject(d_vec); } // Set up remaining host data h_vec = new floatType[numRows]; refOut = new floatType[numRows]; h_rowDelimitersPad = new int[numRows+1]; fill(h_vec, numRows, op.getOptionFloat("maxval")); // Set up the padded data structures int paddedSize = numRows + (PAD_FACTOR - numRows % PAD_FACTOR); h_out = new floatType[paddedSize]; convertToPadded(h_val, h_cols, numRows, h_rowDelimiters, &h_valPad, &h_colsPad, h_rowDelimitersPad, &nItemsPadded); // Compute reference solution spmvCpu(h_val, h_cols, h_rowDelimiters, h_vec, numRows, refOut); // Dispatch based on whether or not device supports OpenCL images if (deviceSupportsImages) { cout << "CSR Test\n"; csrTest<floatType, clFloatType, true> (dev, ctx, compileFlags, queue, resultDB, op, h_val, h_cols, h_rowDelimiters, h_vec, h_out, numRows, nItems, refOut, false, maxImgWidth); // Test CSR kernels on padded data cout << "CSR Test -- Padded Data\n"; csrTest<floatType,clFloatType, true> (dev, ctx, compileFlags, queue, resultDB, op, h_valPad, h_colsPad, h_rowDelimitersPad, h_vec, h_out, numRows, nItemsPadded, refOut, true, maxImgWidth); // Test ELLPACKR kernel cout << "ELLPACKR Test\n"; ellPackTest<floatType, clFloatType, true> (dev, ctx, compileFlags, queue, resultDB, op, h_val, 
h_cols, h_rowDelimiters, h_vec, h_out, numRows, nItems, refOut, false, paddedSize, maxImgWidth); } else { cout << "CSR Test\n"; csrTest<floatType, clFloatType, false> (dev, ctx, compileFlags, queue, resultDB, op, h_val, h_cols, h_rowDelimiters, h_vec, h_out, numRows, nItems, refOut, false, 0); // Test CSR kernels on padded data cout << "CSR Test -- Padded Data\n"; csrTest<floatType,clFloatType, false> (dev, ctx, compileFlags, queue, resultDB, op, h_valPad, h_colsPad, h_rowDelimitersPad, h_vec, h_out, numRows, nItemsPadded, refOut, true, 0); // Test ELLPACKR kernel cout << "ELLPACKR Test\n"; ellPackTest<floatType, clFloatType, false> (dev, ctx, compileFlags, queue, resultDB, op, h_val, h_cols, h_rowDelimiters, h_vec, h_out, numRows, nItems, refOut, false, paddedSize, 0); } delete[] h_val; delete[] h_cols; delete[] h_rowDelimiters; delete[] h_vec; delete[] h_out; delete[] h_valPad; delete[] h_colsPad; delete[] h_rowDelimitersPad; }
void DoTest( const char* timerDesc, ResultDatabase& resultDB, OptionParser& opts ) { StencilFactory<T>* stdStencilFactory = NULL; Stencil<T>* stdStencil = NULL; StencilFactory<T>* testStencilFactory = NULL; Stencil<T>* testStencil = NULL; try { #if defined(PARALLEL) stdStencilFactory = new MPIHostStencilFactory<T>; testStencilFactory = new MPICUDAStencilFactory<T>; #else stdStencilFactory = new HostStencilFactory<T>; testStencilFactory = new CUDAStencilFactory<T>; #endif // defined(PARALLEL) assert( (stdStencilFactory != NULL) && (testStencilFactory != NULL) ); // do a sanity check on option values CheckOptions( opts ); stdStencilFactory->CheckOptions( opts ); testStencilFactory->CheckOptions( opts ); // extract and validate options std::vector<long long> arrayDims = opts.getOptionVecInt( "customSize" ); if( arrayDims.size() != 2 ) { cerr << "Dim size: " << arrayDims.size() << "\n"; throw InvalidArgValue( "all overall dimensions must be positive" ); } if (arrayDims[0] == 0) // User has not specified a custom size { int sizeClass = opts.getOptionInt("size"); arrayDims = StencilFactory<T>::GetStandardProblemSize( sizeClass ); } long int seed = (long)opts.getOptionInt( "seed" ); bool beVerbose = opts.getOptionBool( "verbose" ); unsigned int nIters = (unsigned int)opts.getOptionInt( "num-iters" ); double valErrThreshold = (double)opts.getOptionFloat( "val-threshold" ); unsigned int nValErrsToPrint = (unsigned int)opts.getOptionInt( "val-print-limit" ); #if defined(PARALLEL) unsigned int haloWidth = (unsigned int)opts.getOptionInt( "iters-per-exchange" ); #else unsigned int haloWidth = 1; #endif // defined(PARALLEL) float haloVal = (float)opts.getOptionFloat( "haloVal" ); // build a description of this experiment std::vector<long long> lDims = opts.getOptionVecInt( "lsize" ); assert( lDims.size() == 2 ); std::ostringstream experimentDescriptionStr; experimentDescriptionStr << nIters << ':' << arrayDims[0] << 'x' << arrayDims[1] << ':' << lDims[0] << 'x' << lDims[1]; 
unsigned int nPasses = (unsigned int)opts.getOptionInt( "passes" ); unsigned int nWarmupPasses = (unsigned int)opts.getOptionInt( "warmupPasses" ); // compute the expected result on the host // or read it from a pre-existing file std::string matrixFilenameBase = (std::string)opts.getOptionString( "expMatrixFile" ); #if defined(PARALLEL) int cwrank; MPI_Comm_rank( MPI_COMM_WORLD, &cwrank ); if( cwrank == 0 ) { #endif // defined(PARALLEL) if( !matrixFilenameBase.empty() ) { std::cout << "\nReading expected stencil operation result from file for later comparison with CUDA output\n" << std::endl; } else { std::cout << "\nPerforming stencil operation on host for later comparison with CUDA output\n" << "Depending on host capabilities, this may take a while." << std::endl; } #if defined(PARALLEL) } #endif // defined(PARALLEL) Matrix2D<T> expected( arrayDims[0] + 2*haloWidth, arrayDims[1] + 2*haloWidth ); Initialize<T> init( seed, haloWidth, haloVal ); bool haveExpectedData = false; if( ! matrixFilenameBase.empty() ) { bool readOK = ReadMatrixFromFile( expected, GetMatrixFileName<T>( matrixFilenameBase ) ); if( readOK ) { if( (expected.GetNumRows() != arrayDims[0] + 2*haloWidth) || (expected.GetNumColumns() != arrayDims[1] + 2*haloWidth) ) { std::cerr << "The matrix read from file \'" << GetMatrixFileName<T>( matrixFilenameBase ) << "\' does not match the matrix size specified on the command line.\n"; expected.Reset( arrayDims[0] + 2*haloWidth, arrayDims[1] + 2*haloWidth ); } else { haveExpectedData = true; } } if( !haveExpectedData ) { std::cout << "\nSince we could not read the expected matrix values,\nperforming stencil operation on host for later comparison with CUDA output.\n" << "Depending on host capabilities, this may take a while." 
<< std::endl; } } if( !haveExpectedData ) { init( expected ); haveExpectedData = true; if( beVerbose ) { std::cout << "initial state:\n" << expected << std::endl; } stdStencil = stdStencilFactory->BuildStencil( opts ); (*stdStencil)( expected, nIters ); } if( beVerbose ) { std::cout << "expected result:\n" << expected << std::endl; } // determine whether we are to save the expected matrix values to a file // to speed up future runs matrixFilenameBase = (std::string)opts.getOptionString( "saveExpMatrixFile" ); if( !matrixFilenameBase.empty() ) { SaveMatrixToFile( expected, GetMatrixFileName<T>( matrixFilenameBase ) ); } assert( haveExpectedData ); // compute the result on the CUDA device Matrix2D<T> data( arrayDims[0] + 2*haloWidth, arrayDims[1] + 2*haloWidth ); Stencil<T>* testStencil = testStencilFactory->BuildStencil( opts ); // Compute the number of floating point operations we will perform. // // Note: in the truly-parallel case, we count flops for redundant // work due to the need for a halo. // But we do not add to the count for the local 1-wide halo since // we aren't computing new values for those items. unsigned long npts = (arrayDims[0] + 2*haloWidth - 2) * (arrayDims[1] + 2*haloWidth - 2); #if defined(PARALLEL) MPICUDAStencil<T>* mpiTestStencil = static_cast<MPICUDAStencil<T>*>( testStencil ); assert( mpiTestStencil != NULL ); int participating = mpiTestStencil->ParticipatingInProgram() ? 
1 : 0; int numParticipating = 0; MPI_Allreduce( &participating, // src &numParticipating, // dest 1, // count MPI_INT, // type MPI_SUM, // op MPI_COMM_WORLD ); // communicator npts *= numParticipating; #endif // defined(PARALLEL) // In our 9-point stencil, there are 11 floating point operations // per point (3 multiplies and 11 adds): // // newval = weight_center * centerval + // weight_cardinal * (northval + southval + eastval + westval) + // weight_diagnoal * (neval + nwval + seval + swval) // // we do this stencil operation 'nIters' times unsigned long nflops = npts * 11 * nIters; #if defined(PARALLEL) if( cwrank == 0 ) { #endif // defined(PARALLEL) std::cout << "Performing " << nWarmupPasses << " warmup passes..."; #if defined(PARALLEL) } #endif // defined(PARALLEL) for( unsigned int pass = 0; pass < nWarmupPasses; pass++ ) { init(data); (*testStencil)( data, nIters ); } #if defined(PARALLEL) if( cwrank == 0 ) { #endif // defined(PARALLEL) std::cout << "done." << std::endl; #if defined(PARALLEL) } #endif // defined(PARALLEL) #if defined(PARALLEL) MPI_Comm_rank( MPI_COMM_WORLD, &cwrank ); if( cwrank == 0 ) { #endif // defined(PARALLEL) std::cout << "\nPerforming stencil operation on chosen device, " << nPasses << " passes.\n" << "Depending on chosen device, this may take a while." << std::endl; #if defined(PARALLEL) } #endif // defined(PARALLEL) #if !defined(PARALLEL) std::cout << "At the end of each pass the number of validation\nerrors observed will be printed to the standard output." 
<< std::endl; #endif // !defined(PARALLEL) for( unsigned int pass = 0; pass < nPasses; pass++ ) { #if !defined(PARALLEL) std::cout << "pass " << pass << ": "; #endif // !defined(PARALLEL) init( data ); int timerHandle = Timer::Start(); (*testStencil)( data, nIters ); double elapsedTime = Timer::Stop( timerHandle, "CUDA stencil" ); // find and report the computation rate double gflops = (nflops / elapsedTime) / 1e9; resultDB.AddResult( timerDesc, experimentDescriptionStr.str(), "GFLOPS", gflops ); if( beVerbose ) { std::cout << "observed result, pass " << pass << ":\n" << data << std::endl; } // validate the result #if defined(PARALLEL) StencilValidater<T>* validater = new MPIStencilValidater<T>; #else StencilValidater<T>* validater = new SerialStencilValidater<T>; #endif // defined(PARALLEL) validater->ValidateResult( expected, data, valErrThreshold, nValErrsToPrint ); } } catch( ... ) { // clean up - abnormal termination // wish we didn't have to do this, but C++ exceptions do not // support a try-catch-finally approach delete stdStencil; delete stdStencilFactory; delete testStencil; delete testStencilFactory; throw; } // clean up - normal termination delete stdStencil; delete stdStencilFactory; delete testStencil; delete testStencilFactory; }
void DoTest( const char* timerDesc, ResultDatabase& resultDB, OptionParser& opts ) { StencilFactory<T>* stdStencilFactory = NULL; Stencil<T>* stdStencil = NULL; StencilFactory<T>* testStencilFactory = NULL; Stencil<T>* testStencil = NULL; //try { stdStencilFactory = new HostStencilFactory<T>; testStencilFactory = new MICStencilFactory<T>; assert( (stdStencilFactory != NULL) && (testStencilFactory != NULL) ); // do a sanity check on option values CheckOptions( opts ); stdStencilFactory->CheckOptions( opts ); testStencilFactory->CheckOptions( opts ); // extract and validate options std::vector<long long> arrayDims = opts.getOptionVecInt( "customSize" ); if( arrayDims.size() != 2 ) { cerr << "Dim size: " << arrayDims.size() << "\n"; //throw InvalidArgValue( "all overall dimensions must be positive" ); } if (arrayDims[0] == 0) // User has not specified a custom size { const int probSizes[4] = { 768, 1408, 2048, 4096 }; int sizeClass = opts.getOptionInt("size"); if (!(sizeClass >= 0 && sizeClass < 5)) { //throw InvalidArgValue( "Size class must be between 1-4" ); } arrayDims[0] = arrayDims[1] =probSizes[sizeClass - 1]; } long int seed = (long)opts.getOptionInt( "seed" ); bool beVerbose = opts.getOptionBool( "verbose" ); unsigned int nIters = (unsigned int)opts.getOptionInt( "num-iters" ); double valErrThreshold = (double)opts.getOptionFloat( "val-threshold" ); unsigned int nValErrsToPrint = (unsigned int)opts.getOptionInt( "val-print-limit" ); #if defined(PARALLEL) unsigned int haloWidth = (unsigned int)opts.getOptionInt( "iters-per-exchange" ); #else unsigned int haloWidth = 1; #endif // defined(PARALLEL) float haloVal = (float)opts.getOptionFloat( "haloVal" ); // build a description of this experiment std::ostringstream experimentDescriptionStr; experimentDescriptionStr << nIters << ':' << arrayDims[0] << 'x' << arrayDims[1] << ':' << LROWS << 'x' << LCOLS; unsigned int nPasses =(unsigned int)opts.getOptionInt( "passes" ); unsigned long npts = (arrayDims[0] + 
2*haloWidth - 2) * (arrayDims[1] + 2*haloWidth - 2); unsigned long nflops = npts * 11 * nIters; cout<<"flops are = "<<nflops<<endl; // compute the expected result on the host #if defined(PARALLEL) int cwrank; MPI_Comm_rank( MPI_COMM_WORLD, &cwrank ); if( cwrank == 0 ) { #endif // defined(PARALLEL) std::cout << "\nPerforming stencil operation on host for later comparison with MIC output\n" << "Depending on host capabilities, this may take a while." << std::endl; #if defined(PARALLEL) } #endif // defined(PARALLEL) Matrix2D<T> exp( arrayDims[0] + 2*haloWidth, arrayDims[1] + 2*haloWidth ); Initialize<T> init( seed, haloWidth, haloVal ); init( exp ); if( beVerbose ) { std::cout << "initial state:\n" << exp << std::endl; } Stencil<T>* stdStencil = stdStencilFactory->BuildStencil( opts ); (*stdStencil)( exp, nIters ); if( beVerbose ) { std::cout << "expected result:\n" << exp << std::endl; } // compute the result on the MIC device Matrix2D<T> data( arrayDims[0] + 2*haloWidth, arrayDims[1] + 2*haloWidth ); Stencil<T>* testStencil = testStencilFactory->BuildStencil( opts ); #if defined(PARALLEL) MPI_Comm_rank( MPI_COMM_WORLD, &cwrank ); if( cwrank == 0 ) { #endif // defined(PARALLEL) std::cout << "\nPerforming stencil operation on chosen device, " << nPasses << " passes.\n" << "Depending on chosen device, this may take a while." << std::endl; #if defined(PARALLEL) } #endif // defined(PARALLEL) #if !defined(PARALLEL) std::cout << "At the end of each pass the number of validation\nerrors observed will be printed to the standard output." 
<< std::endl; #endif // !defined(PARALLEL) std::cout<<"Passes:"<<nPasses<<endl; for( unsigned int pass = 0; pass < nPasses; pass++ ) { init( data ); double start = curr_second(); (*testStencil)( data, nIters ); double elapsedTime = curr_second()-start; double gflops = (nflops / elapsedTime) / 1e9; resultDB.AddResult( timerDesc, experimentDescriptionStr.str(), "GFLOPS", gflops ); if( beVerbose ) { std::cout << "observed result, pass " << pass << ":\n" << data << std::endl; } // validate the result #if defined(PARALLEL) //StencilValidater<T>* validater = new MPIStencilValidater<T>; #else //StencilValidater<T>* validater = new SerialStencilValidater<T>; #endif // defined(PARALLEL) MICValidate(exp,data,valErrThreshold,nValErrsToPrint); /*validater->ValidateResult( exp, data, valErrThreshold, nValErrsToPrint );*/ } } /* catch( ... ) { // clean up - abnormal termination // wish we didn't have to do this, but C++ exceptions do not // support a try-catch-finally approach delete stdStencil; delete stdStencilFactory; delete testStencil; delete testStencilFactory; throw; }*/ // clean up - normal termination delete stdStencil; delete stdStencilFactory; delete testStencil; delete testStencilFactory; }