// ****************************************************************************
// Function: GPUSetup
//
// Purpose:
//   Do the necessary OpenCL setup for the GPU part of the test.
//
// Arguments:
//   op: the options parser / parameter database
//   mympirank: for printing errors in case of failure
//   mynoderank: this is typically the device ID (the mapping is done in main)
//
// Returns:  success/failure
//
// Creation: 2009
//
// Modifications:
//
// ****************************************************************************
int GPUSetup(OptionParser &op, int mympirank, int mynoderank)
{
    addBenchmarkSpecOptions(op);

    if (op.getOptionBool("infoDevices"))
    {
        OpenCLNodePlatformContainer ndc1;
        ndc1.Print(cout);
        return 0;
    }

    // The device option supports specifying more than one device.
    int platform = op.getOptionInt("platform");
    int deviceIdx = mynoderank;
    if (deviceIdx >= (int)op.getOptionVecInt("device").size())
    {
        std::ostringstream estr;
        estr << "Warning: not enough devices specified with --device flag "
             << "for task " << mympirank << " (node rank " << mynoderank
             << ") to claim its own device; forcing to use first device.";
        std::cerr << estr.str() << std::endl;
        deviceIdx = 0;
    }
    int device = op.getOptionVecInt("device")[deviceIdx];

    // Initialization
    _mpicontention_ocldev = new cl::Device(
            ListDevicesAndGetDevice(platform, device));
    std::vector<cl::Device> ctxDevices;
    ctxDevices.push_back(*_mpicontention_ocldev);
    _mpicontention_ocldriver_ctx = new cl::Context(ctxDevices);
    _mpicontention_ocldriver_queue = new cl::CommandQueue(
            *_mpicontention_ocldriver_ctx, *_mpicontention_ocldev,
            CL_QUEUE_PROFILING_ENABLE);
    _mpicontention_gpuop = op;
    return 0;
}
// validate stencil-independent values
void
CheckOptions(const OptionParser& opts)
{
    // Check matrix dimensions - must be 2D, must be non-negative
    // (zero means "use a standard size class" rather than a custom size).
    std::vector<long long> arrayDims = opts.getOptionVecInt("customSize");
    if (arrayDims.size() != 2)
    {
        throw InvalidArgValue("overall size must have two dimensions");
    }
    if ((arrayDims[0] < 0) || (arrayDims[1] < 0))
    {
        throw InvalidArgValue("each size dimension must be non-negative");
    }

    // validation error threshold must be positive
    float valThreshold = opts.getOptionFloat("val-threshold");
    if (valThreshold <= 0.0f)
    {
        throw InvalidArgValue("validation threshold must be positive");
    }

    // number of validation errors to print must be non-negative
    int nErrsToPrint = opts.getOptionInt("val-print-limit");
    if (nErrsToPrint < 0)
    {
        throw InvalidArgValue("number of validation errors to print must be non-negative");
    }

    // number of warmup passes must be non-negative
    int nWarmupPasses = opts.getOptionInt("warmupPasses");
    if (nWarmupPasses < 0)
    {
        throw InvalidArgValue("number of warmup passes must be non-negative");
    }
}
template<class T>
void
CommonMICStencilFactory<T>::CheckOptions(const OptionParser& opts) const
{
    // let the base class check its options first
    StencilFactory<T>::CheckOptions(opts);

    // check our options
    std::vector<long long> arrayDims = opts.getOptionVecInt("customSize");
    assert(arrayDims.size() == 2);

    // If both dimensions are zero, we're using a non-custom (standard)
    // size; skip this test.
    if (arrayDims[0] == 0 && arrayDims[1] == 0)
    {
        return;
    }

    size_t gRows = (size_t)arrayDims[0];
    size_t gCols = (size_t)arrayDims[1];
    size_t lRows = LROWS;
    size_t lCols = LCOLS;

    // verify that local dimensions evenly divide global dimensions
    if (((gRows % lRows) != 0) || (lRows > gRows))
    {
        throw InvalidArgValue("number of rows must be an even multiple of lsize rows");
    }
    if (((gCols % lCols) != 0) || (lCols > gCols))
    {
        throw InvalidArgValue("number of columns must be an even multiple of lsize columns");
    }

    // TODO ensure local dims are smaller than CUDA implementation limits
}
template<class T>
void
MPICUDAStencilFactory<T>::CheckOptions(const OptionParser& opts) const
{
    // let the base class check its options first
    CommonCUDAStencilFactory<T>::CheckOptions(opts);

    // check our options
    std::vector<long long> shDims = opts.getOptionVecInt("lsize");
    std::vector<long long> arrayDims = opts.getOptionVecInt("customSize");
    if (arrayDims[0] == 0)
    {
        // custom size was not specified - we are using a standard size
        int sizeClass = opts.getOptionInt("size");
        arrayDims = StencilFactory<T>::GetStandardProblemSize(sizeClass);
    }
    assert(shDims.size() == 2);
    assert(arrayDims.size() == 2);

    size_t gRows = (size_t)arrayDims[0];
    size_t gCols = (size_t)arrayDims[1];
    size_t lRows = shDims[0];
    size_t lCols = shDims[1];

    unsigned int haloWidth = (unsigned int)opts.getOptionInt("iters-per-exchange");

    // Verify that the MPI halo width will result in a matrix being passed
    // to the kernel whose global size is also a multiple of the local
    // work size.
    //
    // Because the MPI halo width is arbitrary, and the kernel halo width
    // is always 1, we have to ensure that:
    //   ((size + 2*halo) - 2) % lsize == 0
    if ((((gRows + 2 * haloWidth) - 2) % lRows) != 0)
    {
        throw InvalidArgValue("rows including halo must be an even multiple of lsize (i.e., lsize rows evenly divides ((rows + 2*halo) - 2))");
    }
    if ((((gCols + 2 * haloWidth) - 2) % lCols) != 0)
    {
        throw InvalidArgValue("columns including halo must be an even multiple of lsize (i.e., lsize cols evenly divides ((cols + 2*halo) - 2))");
    }
}
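// Illustration of the halo constraint above (not part of the benchmark;
// the helper name is hypothetical). With MPI halo width h and a 1-wide
// kernel halo, the extent handed to the kernel is (size + 2*h) - 2, and
// that extent must be divisible by the local work size. For example, with
// rows = 1024, h = 4, lRows = 8: ((1024 + 2*4) - 2) % 8 == 1030 % 8 == 6,
// so the combination is rejected; rows = 1018 gives 1024 % 8 == 0 and
// passes.
static inline bool
HaloPaddedDimDivides(size_t globalDim, size_t localDim, unsigned int haloWidth)
{
    return (((globalDim + 2 * haloWidth) - 2) % localDim) == 0;
}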
template<class T>
void
CommonMICStencilFactory<T>::ExtractOptions(const OptionParser& options,
                                           T& wCenter,
                                           T& wCardinal,
                                           T& wDiagonal,
                                           std::vector<long long>& devices)
{
    // let the base class extract its options
    StencilFactory<T>::ExtractOptions(options, wCenter, wCardinal, wDiagonal);

    // extract our options
    // with the hardcoded lsize, we no longer have any of our own to extract

    // Determine which device to use.
    // We would really prefer this to be done in main(), but
    // since BuildStencil is a virtual function, we cannot change its
    // signature, and OptionParser provides no way to override an
    // option's value after it is set during parsing.
    devices = options.getOptionVecInt("device");
}
template<class T>
void
DoTest(const char* timerDesc, ResultDatabase& resultDB, OptionParser& opts)
{
    StencilFactory<T>* stdStencilFactory = NULL;
    Stencil<T>* stdStencil = NULL;
    StencilFactory<T>* testStencilFactory = NULL;
    Stencil<T>* testStencil = NULL;

    try
    {
#if defined(PARALLEL)
        stdStencilFactory = new MPIHostStencilFactory<T>;
        testStencilFactory = new MPICUDAStencilFactory<T>;
#else
        stdStencilFactory = new HostStencilFactory<T>;
        testStencilFactory = new CUDAStencilFactory<T>;
#endif // defined(PARALLEL)
        assert((stdStencilFactory != NULL) && (testStencilFactory != NULL));

        // do a sanity check on option values
        CheckOptions(opts);
        stdStencilFactory->CheckOptions(opts);
        testStencilFactory->CheckOptions(opts);

        // extract and validate options
        std::vector<long long> arrayDims = opts.getOptionVecInt("customSize");
        if (arrayDims.size() != 2)
        {
            cerr << "Dim size: " << arrayDims.size() << "\n";
            throw InvalidArgValue("overall size must have two dimensions");
        }
        if (arrayDims[0] == 0) // user has not specified a custom size
        {
            int sizeClass = opts.getOptionInt("size");
            arrayDims = StencilFactory<T>::GetStandardProblemSize(sizeClass);
        }

        long int seed = (long)opts.getOptionInt("seed");
        bool beVerbose = opts.getOptionBool("verbose");
        unsigned int nIters = (unsigned int)opts.getOptionInt("num-iters");
        double valErrThreshold = (double)opts.getOptionFloat("val-threshold");
        unsigned int nValErrsToPrint = (unsigned int)opts.getOptionInt("val-print-limit");

#if defined(PARALLEL)
        unsigned int haloWidth = (unsigned int)opts.getOptionInt("iters-per-exchange");
#else
        unsigned int haloWidth = 1;
#endif // defined(PARALLEL)
        float haloVal = (float)opts.getOptionFloat("haloVal");

        // build a description of this experiment
        std::vector<long long> lDims = opts.getOptionVecInt("lsize");
        assert(lDims.size() == 2);
        std::ostringstream experimentDescriptionStr;
        experimentDescriptionStr << nIters << ':'
                                 << arrayDims[0] << 'x' << arrayDims[1] << ':'
                                 << lDims[0] << 'x' << lDims[1];

        unsigned int nPasses = (unsigned int)opts.getOptionInt("passes");
        unsigned int nWarmupPasses = (unsigned int)opts.getOptionInt("warmupPasses");

        // compute the expected result on the host,
        // or read it from a pre-existing file
        std::string matrixFilenameBase = (std::string)opts.getOptionString("expMatrixFile");
#if defined(PARALLEL)
        int cwrank;
        MPI_Comm_rank(MPI_COMM_WORLD, &cwrank);
        if (cwrank == 0)
        {
#endif // defined(PARALLEL)
        if (!matrixFilenameBase.empty())
        {
            std::cout << "\nReading expected stencil operation result from file for later comparison with CUDA output\n"
                      << std::endl;
        }
        else
        {
            std::cout << "\nPerforming stencil operation on host for later comparison with CUDA output\n"
                      << "Depending on host capabilities, this may take a while."
                      << std::endl;
        }
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)

        Matrix2D<T> expected(arrayDims[0] + 2 * haloWidth,
                             arrayDims[1] + 2 * haloWidth);
        Initialize<T> init(seed, haloWidth, haloVal);

        bool haveExpectedData = false;
        if (!matrixFilenameBase.empty())
        {
            bool readOK = ReadMatrixFromFile(expected,
                                GetMatrixFileName<T>(matrixFilenameBase));
            if (readOK)
            {
                if ((expected.GetNumRows() != arrayDims[0] + 2 * haloWidth) ||
                    (expected.GetNumColumns() != arrayDims[1] + 2 * haloWidth))
                {
                    std::cerr << "The matrix read from file '"
                              << GetMatrixFileName<T>(matrixFilenameBase)
                              << "' does not match the matrix size specified on the command line.\n";
                    expected.Reset(arrayDims[0] + 2 * haloWidth,
                                   arrayDims[1] + 2 * haloWidth);
                }
                else
                {
                    haveExpectedData = true;
                }
            }

            if (!haveExpectedData)
            {
                std::cout << "\nSince we could not read the expected matrix values,\nperforming stencil operation on host for later comparison with CUDA output.\n"
                          << "Depending on host capabilities, this may take a while."
                          << std::endl;
            }
        }
        if (!haveExpectedData)
        {
            init(expected);
            haveExpectedData = true;
            if (beVerbose)
            {
                std::cout << "initial state:\n" << expected << std::endl;
            }
            stdStencil = stdStencilFactory->BuildStencil(opts);
            (*stdStencil)(expected, nIters);
        }
        if (beVerbose)
        {
            std::cout << "expected result:\n" << expected << std::endl;
        }

        // determine whether we are to save the expected matrix values
        // to a file, to speed up future runs
        matrixFilenameBase = (std::string)opts.getOptionString("saveExpMatrixFile");
        if (!matrixFilenameBase.empty())
        {
            SaveMatrixToFile(expected,
                             GetMatrixFileName<T>(matrixFilenameBase));
        }
        assert(haveExpectedData);

        // compute the result on the CUDA device
        Matrix2D<T> data(arrayDims[0] + 2 * haloWidth,
                         arrayDims[1] + 2 * haloWidth);
        // NB: assign to the function-scope pointer (do not redeclare it
        // here); a local shadow would leak and defeat the cleanup code.
        testStencil = testStencilFactory->BuildStencil(opts);

        // Compute the number of floating point operations we will perform.
        //
        // Note: in the truly-parallel case, we count flops for redundant
        // work due to the need for a halo.
        // But we do not add to the count for the local 1-wide halo since
        // we aren't computing new values for those items.
        unsigned long npts = (arrayDims[0] + 2 * haloWidth - 2) *
                             (arrayDims[1] + 2 * haloWidth - 2);
#if defined(PARALLEL)
        MPICUDAStencil<T>* mpiTestStencil = static_cast<MPICUDAStencil<T>*>(testStencil);
        assert(mpiTestStencil != NULL);
        int participating = mpiTestStencil->ParticipatingInProgram() ? 1 : 0;
        int numParticipating = 0;
        MPI_Allreduce(&participating,       // src
                      &numParticipating,    // dest
                      1,                    // count
                      MPI_INT,              // type
                      MPI_SUM,              // op
                      MPI_COMM_WORLD);      // communicator
        npts *= numParticipating;
#endif // defined(PARALLEL)

        // In our 9-point stencil, there are 11 floating point operations
        // per point (3 multiplies and 8 adds):
        //
        // newval = weight_center   *  centerval +
        //          weight_cardinal * (northval + southval + eastval + westval) +
        //          weight_diagonal * (neval + nwval + seval + swval)
        //
        // we do this stencil operation 'nIters' times
        unsigned long nflops = npts * 11 * nIters;

#if defined(PARALLEL)
        if (cwrank == 0)
        {
#endif // defined(PARALLEL)
        std::cout << "Performing " << nWarmupPasses << " warmup passes...";
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)
        for (unsigned int pass = 0; pass < nWarmupPasses; pass++)
        {
            init(data);
            (*testStencil)(data, nIters);
        }
#if defined(PARALLEL)
        if (cwrank == 0)
        {
#endif // defined(PARALLEL)
        std::cout << "done." << std::endl;
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)

#if defined(PARALLEL)
        MPI_Comm_rank(MPI_COMM_WORLD, &cwrank);
        if (cwrank == 0)
        {
#endif // defined(PARALLEL)
        std::cout << "\nPerforming stencil operation on chosen device, "
                  << nPasses << " passes.\n"
                  << "Depending on chosen device, this may take a while."
                  << std::endl;
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)
#if !defined(PARALLEL)
        std::cout << "At the end of each pass the number of validation\nerrors observed will be printed to the standard output."
                  << std::endl;
#endif // !defined(PARALLEL)

        for (unsigned int pass = 0; pass < nPasses; pass++)
        {
#if !defined(PARALLEL)
            std::cout << "pass " << pass << ": ";
#endif // !defined(PARALLEL)
            init(data);

            int timerHandle = Timer::Start();
            (*testStencil)(data, nIters);
            double elapsedTime = Timer::Stop(timerHandle, "CUDA stencil");

            // find and report the computation rate
            double gflops = (nflops / elapsedTime) / 1e9;
            resultDB.AddResult(timerDesc,
                               experimentDescriptionStr.str(),
                               "GFLOPS",
                               gflops);
            if (beVerbose)
            {
                std::cout << "observed result, pass " << pass << ":\n"
                          << data << std::endl;
            }

            // validate the result
#if defined(PARALLEL)
            StencilValidater<T>* validater = new MPIStencilValidater<T>;
#else
            StencilValidater<T>* validater = new SerialStencilValidater<T>;
#endif // defined(PARALLEL)
            validater->ValidateResult(expected,
                                      data,
                                      valErrThreshold,
                                      nValErrsToPrint);
            delete validater;   // release the validator each pass
        }
    }
    catch (...)
    {
        // clean up - abnormal termination
        // wish we didn't have to do this, but C++ exceptions do not
        // support a try-catch-finally approach
        delete stdStencil;
        delete stdStencilFactory;
        delete testStencil;
        delete testStencilFactory;
        throw;
    }

    // clean up - normal termination
    delete stdStencil;
    delete stdStencilFactory;
    delete testStencil;
    delete testStencilFactory;
}
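// Worked example of the flop count above (illustrative only): each interior
// point of the 9-point stencil costs 3 multiplies + 8 adds = 11 flops.
// With a 4096x4096 interior, halo width 1, and nIters = 1000:
//   npts   = (4096 + 2*1 - 2) * (4096 + 2*1 - 2) = 4096 * 4096 = 16777216
//   nflops = 16777216 * 11 * 1000 ~= 1.845e11
// so a pass that finishes in 1.0 s corresponds to roughly 184.5 GFLOPS.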
// ****************************************************************************
// Method: main()
//
// Purpose:
//   Serial and parallel main for OpenCL level0 benchmarks.
//
// Arguments:
//   argc, argv
//
// Programmer: SHOC Team
// Creation: The Epoch
//
// Modifications:
//   Jeremy Meredith, Tue Jan 12 15:09:33 EST 2010
//   Changed the way device selection works.  It now defaults to the device
//   index corresponding to the process's rank within a node if no devices
//   are specified on the command line, and otherwise round-robins the list
//   of devices among the tasks.
//
//   Gabriel Marin, Tue Jun 01 15:38 EST 2010
//   Check that we have valid (not NULL) context and queue objects before
//   running the benchmarks.  Errors inside CreateContextFromSingleDevice or
//   CreateCommandQueueForContextAndDevice were not propagated out to the
//   main program.
//
//   Jeremy Meredith, Wed Nov 10 14:20:47 EST 2010
//   Split timing reports into detailed and summary.  For serial code, we
//   report all trial values, and for parallel, skip the per-process vals.
//   Also detect and print outliers from parallel runs.
//
// ****************************************************************************
int main(int argc, char *argv[])
{
    int ret = 0;

    try
    {
#ifdef PARALLEL
        int rank, size;
        MPI_Init(&argc, &argv);
        MPI_Comm_size(MPI_COMM_WORLD, &size);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        cout << "MPI Task " << rank << "/" << size - 1 << " starting....\n";
#endif

        OptionParser op;

        // add shared options to the parser
        op.addOption("platform", OPT_INT, "0", "specify OpenCL platform to use", 'p');
        op.addOption("device", OPT_VECINT, "", "specify device(s) to run on", 'd');
        op.addOption("passes", OPT_INT, "10", "specify number of passes", 'n');
        op.addOption("size", OPT_VECINT, "1", "specify problem size", 's');
        op.addOption("infoDevices", OPT_BOOL, "", "show info for available platforms and devices", 'i');
        op.addOption("verbose", OPT_BOOL, "", "enable verbose output", 'v');
        op.addOption("quiet", OPT_BOOL, "", "write minimum necessary to standard output", 'q');

        addBenchmarkSpecOptions(op);

        if (!op.parse(argc, argv))
        {
#ifdef PARALLEL
            if (rank == 0)
                op.usage();
            MPI_Finalize();
#else
            op.usage();
#endif
            return (op.HelpRequested() ? 0 : 1);
        }

        if (op.getOptionBool("infoDevices"))
        {
#define DEBUG_DEVICE_CONTAINER 0
#ifdef PARALLEL
            // execute the following code only if I am the process of
            // lowest rank on this node
            NodeInfo NI;
            int mynoderank = NI.nodeRank();
            if (mynoderank == 0)
            {
                int nlrrank, nlrsize;
                MPI_Comm nlrcomm = NI.getNLRComm();
                MPI_Comm_size(nlrcomm, &nlrsize);
                MPI_Comm_rank(nlrcomm, &nlrrank);

                OpenCLNodePlatformContainer ndc1;
                OpenCLMultiNodeContainer localMnc(ndc1);
                localMnc.doMerge(nlrrank, nlrsize, nlrcomm);
                if (rank == 0) // I am global rank 0; print all configurations
                    localMnc.Print(cout);
            }
#else
            OpenCLNodePlatformContainer ndc1;
            ndc1.Print(cout);
#if DEBUG_DEVICE_CONTAINER
            OpenCLMultiNodeContainer mnc1(ndc1), mnc2;
            mnc1.Print(cout);
            ostringstream oss;
            mnc1.writeObject(oss);
            std::string temp(oss.str());
            cout << "Serialized MultiNodeContainer:\n" << temp;
            istringstream iss(temp);
            mnc2.readObject(iss);
            cout << "Unserialized object2:\n";
            mnc2.Print(cout);
            mnc1.merge(mnc2);
            cout << "==============\nObject1 after merging 1:\n";
            mnc1.Print(cout);
            mnc1.merge(mnc2);
            cout << "==============\nObject1 after merging 2:\n";
            mnc1.Print(cout);
#endif // DEBUG_DEVICE_CONTAINER
#endif // PARALLEL
            return 0;
        }

        bool verbose = op.getOptionBool("verbose");

        // The device option supports specifying more than one device;
        // the platform option is a single platform index.
        int platform = op.getOptionInt("platform");
#ifdef PARALLEL
        NodeInfo ni;
        int myNodeRank = ni.nodeRank();
        if (verbose)
            cout << "Global rank " << rank << " is local rank "
                 << myNodeRank << endl;
#else
        int myNodeRank = 0;
#endif

        // If no devices were specified, assume the user wants the process
        // with in-node rank N to use device N.
        int deviceIdx = myNodeRank;

        // If devices were specified, round-robin the list of devices
        // among the processes on a node.
        vector<long long> deviceVec = op.getOptionVecInt("device");
        if (deviceVec.size() > 0)
        {
            int len = deviceVec.size();
            deviceIdx = deviceVec[myNodeRank % len];
        }

        // check for an erroneous device
        if (deviceIdx >= GetNumOclDevices(platform))
        {
            cerr << "Warning: device index: " << deviceIdx
                 << " out of range, defaulting to device 0.\n";
            deviceIdx = 0;
        }

        // initialization
        if (verbose)
            cout << ">> initializing\n";
        cl_device_id devID = ListDevicesAndGetDevice(platform, deviceIdx);
        cl_int clErr;
        cl_context ctx = clCreateContext(NULL,     // properties
                                         1,        // number of devices
                                         &devID,   // device
                                         NULL,     // notification function
                                         NULL,
                                         &clErr);
        CL_CHECK_ERROR(clErr);
        cl_command_queue queue = clCreateCommandQueue(ctx,
                                                      devID,
                                                      CL_QUEUE_PROFILING_ENABLE,
                                                      &clErr);
        CL_CHECK_ERROR(clErr);

        ResultDatabase resultDB;

        // run the benchmark
        RunBenchmark(devID, ctx, queue, resultDB, op);

        clReleaseCommandQueue(queue);
        clReleaseContext(ctx);

#ifndef PARALLEL
        resultDB.DumpDetailed(cout);
#else
        ParallelResultDatabase pardb;
        pardb.MergeSerialDatabases(resultDB, MPI_COMM_WORLD);
        if (rank == 0)
        {
            pardb.DumpSummary(cout);
            pardb.DumpOutliers(cout);
        }
#endif
    }
    catch (std::exception& e)
    {
        std::cerr << e.what() << std::endl;
        ret = 1;
    }
    catch (...)
    {
        std::cerr << "unrecognized exception caught" << std::endl;
        ret = 1;
    }

#ifdef PARALLEL
    MPI_Finalize();
#endif

    return ret;
}
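// Illustration of the device-selection policy above (standalone sketch,
// not part of the benchmark; the helper name is ours). With --device 1,3
// on a node running four tasks, node ranks 0..3 map to devices 1, 3, 1, 3;
// with no --device flag, node rank N simply uses device N.
static int
ChooseDeviceIndex(int myNodeRank, const vector<long long>& deviceVec)
{
    if (deviceVec.empty())
        return myNodeRank;                                  // rank N -> device N
    return (int)deviceVec[myNodeRank % deviceVec.size()];   // round-robin
}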
int main(int argc, char *argv[])
{
    int numdev = 0, totalnumdev = 0, numtasks, mympirank,
        mypair = 0, mynoderank, myclusterrank, nodenprocs;
    int *grp1, *grp2;
    int mygrprank, grpnumtasks;
    MPI_Group orig_group, bmgrp;
    MPI_Comm bmcomm, nlrcomm;
    ResultDatabase resultDB, resultDBWU, resultDB1;
    OptionParser op;
    ParallelResultDatabase pardb, pardb1;
    bool amGPUTask = false;
    volatile unsigned long long *mpidone;
    int i, shmid;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
    MPI_Comm_rank(MPI_COMM_WORLD, &mympirank);
    MPI_Comm_group(MPI_COMM_WORLD, &orig_group);

    // add shared options to the parser
    op.addOption("device", OPT_VECINT, "0", "specify device(s) to run on", 'd');
    op.addOption("verbose", OPT_BOOL, "", "enable verbose output", 'v');
    op.addOption("quiet", OPT_BOOL, "", "write minimum necessary to standard output", 'q');
    op.addOption("passes", OPT_INT, "10", "specify number of passes", 'z');
    op.addOption("size", OPT_VECINT, "1", "specify problem size", 's');
    op.addOption("time", OPT_INT, "5", "specify running time in minutes", 't');
    op.addOption("outputFile", OPT_STRING, "output.txt", "specify output file", 'o');
    op.addOption("infoDevices", OPT_BOOL, "", "show summary info for available devices", 'i');
    op.addOption("fullInfoDevices", OPT_BOOL, "", "show full info for available devices");
    op.addOption("MPIminmsg", OPT_INT, "0", "specify minimum MPI message size");
    op.addOption("MPImaxmsg", OPT_INT, "16384", "specify maximum MPI message size");
    op.addOption("MPIiter", OPT_INT, "1000", "specify number of MPI benchmark iterations for each size");
    op.addOption("platform", OPT_INT, "0", "specify platform for device selection", 'y');

    if (!op.parse(argc, argv))
    {
        if (mympirank == 0)
            op.usage();
        MPI_Finalize();
        return 0;
    }

    int npasses = op.getOptionInt("passes");

    // our simple mapping
    NodeInfo NI;
    mynoderank = NI.nodeRank();        // rank of my process within the node
    myclusterrank = NI.clusterRank();  // cluster (essentially, node) id
    MPI_Comm smpcomm = NI.getSMPComm();

    // Allocate System V shared memory for the "MPI tests done" flag,
    // shared by all processes on a node.
    if (mynoderank == 0)
    {
        shmid = shmget(IPC_PRIVATE,
                       sizeof(unsigned long long),
                       (IPC_CREAT | 0600));
    }
    MPI_Bcast(&shmid, 1, MPI_INT, 0, NI.getSMPComm());
    mpidone = ((volatile unsigned long long*)shmat(shmid, 0, 0));
    if (mynoderank == 0)
        shmctl(shmid, IPC_RMID, 0);  // mark for removal once all detach
    *mpidone = 0;

    nlrcomm = NI.getNLRComm();  // communicator of the lowest-rank processes
                                // on all the nodes

    int numnodes = NI.numNodes();
    if (numnodes % 2 != 0)
    {
        if (mympirank == 0)
            printf("\nThis test needs an even number of nodes\n");
        MPI_Finalize();
        exit(0);
    }
    int nodealr = NI.nodeALR();
    nodenprocs = NI.nodeNprocs();

    // determine how many GPU devices we are to use
    int devsPerNode = op.getOptionVecInt("device").size();

    // If there are as many or more devices as nprocs, only use half of
    // the nprocs.
    if (devsPerNode >= nodenprocs)
        devsPerNode = nodenprocs / 2;

    numdev = (mynoderank == 0) ? devsPerNode : 0;
    MPI_Allreduce(&numdev, &totalnumdev, 1, MPI_INT, MPI_SUM,
                  MPI_COMM_WORLD);
    numdev = devsPerNode;

    // determine whether I am to be a GPU or a comm task
    if (mynoderank < numdev)
    {
        amGPUTask = true;
    }

    // divide tasks into two distinct groups based upon node rank
    grp1 = (int*)calloc(totalnumdev, sizeof(int));
    grp2 = (int*)calloc((numtasks - totalnumdev), sizeof(int));
    if (grp1 == NULL || grp2 == NULL)
    {
        printf("\n%d: calloc failed in %s", mympirank, __FUNCTION__);
        exit(1);
    }

    // compute the groups
    int beginoffset[2] = {0, 0};
    if (mynoderank == 0)
    {
        int tmp[2];
        tmp[0] = numdev;
        tmp[1] = nodenprocs - numdev;
        if (mympirank == 0)
            MPI_Send(tmp, 2 * sizeof(int), MPI_CHAR, 1, 112, nlrcomm);
        else
        {
            MPI_Status reqstat;
            MPI_Recv(beginoffset, 2 * sizeof(int), MPI_CHAR,
                     myclusterrank - 1, 112, nlrcomm, &reqstat);
            if (myclusterrank < numnodes - 1)
            {
                beginoffset[0] += numdev;
                beginoffset[1] += (nodenprocs - numdev);
                MPI_Send(beginoffset, 2 * sizeof(int), MPI_CHAR,
                         myclusterrank + 1, 112, nlrcomm);
                beginoffset[0] -= numdev;
                beginoffset[1] -= (nodenprocs - numdev);
            }
        }
    }
    MPI_Bcast(beginoffset, 2, MPI_INT, 0, smpcomm);

    if (amGPUTask)
    {
        // I am to do GPU work
        grp1[beginoffset[0] + mynoderank] = mympirank;
        grpnumtasks = totalnumdev;
    }
    else
    {
        // I am to do MPI communication work
        grp2[beginoffset[1] + (mynoderank - numdev)] = mympirank;
        grpnumtasks = numtasks - totalnumdev;
    }
    MPI_Allreduce(MPI_IN_PLACE, grp1, totalnumdev, MPI_INT, MPI_SUM,
                  MPI_COMM_WORLD);
    MPI_Allreduce(MPI_IN_PLACE, grp2, (numtasks - totalnumdev), MPI_INT,
                  MPI_SUM, MPI_COMM_WORLD);

    if (amGPUTask)
    {
        // I am to do GPU work, so will be part of the GPU communicator
        MPI_Group_incl(orig_group, totalnumdev, grp1, &bmgrp);
    }
    else
    {
        // I am to do MPI communication work, so will be part of the MPI
        // messaging traffic communicator
        MPI_Group_incl(orig_group, (numtasks - totalnumdev), grp2, &bmgrp);
    }

    MPI_Comm_create(MPI_COMM_WORLD, bmgrp, &bmcomm);
    MPI_Comm_rank(bmcomm, &mygrprank);
    NodeInfo* GRPNI = new NodeInfo(bmcomm);
    int mygrpnoderank = GRPNI->nodeRank();
    int grpnodealr = GRPNI->nodeALR();
    int grpnodenprocs = GRPNI->nodeNprocs();
    MPI_Comm grpnlrcomm = GRPNI->getNLRComm();
    // note that clusterrank and the number of nodes don't change for this
    // child group/comm

    // form node-random pairs (see README) among communication tasks
    if (amGPUTask)
    {
        // set up the GPU in GPU tasks
        GPUSetup(op, mympirank, mynoderank);
    }
    else
    {
        int* pairlist = new int[numnodes];
        for (i = 0; i < numnodes; i++)
            pairlist[i] = 0;

        if (mygrpnoderank == 0)
        {
            pairlist[myclusterrank] = grpnodealr;
            MPI_Allreduce(MPI_IN_PLACE, pairlist, numnodes, MPI_INT,
                          MPI_SUM, grpnlrcomm);
            mypair = RandomPairs(myclusterrank, numnodes, grpnlrcomm);
            mypair = pairlist[mypair];
        }
        for (i = 0; i < numnodes; i++)
            pairlist[i] = 0;
        if (mygrpnoderank == 0)
            pairlist[myclusterrank] = mypair;
        MPI_Allreduce(MPI_IN_PLACE, pairlist, numnodes, MPI_INT, MPI_SUM,
                      bmcomm);
        mypair = pairlist[myclusterrank] + mygrpnoderank;
        delete[] pairlist;
    }

    // ensure we are all synchronized before starting the test
    MPI_Barrier(MPI_COMM_WORLD);

    // warmup run
    if (amGPUTask)
    {
        GPUDriver(op, resultDBWU);
    }
    // first, individual runs for the device benchmark
    for (i = 0; i < npasses; i++)
    {
        if (amGPUTask)
        {
            GPUDriver(op, resultDB);
        }
    }
    MPI_Barrier(MPI_COMM_WORLD);

    // warmup run
    if (!amGPUTask)
    {
        MPITest(op, resultDBWU, grpnumtasks, mygrprank, mypair, bmcomm);
    }
    // next, individual runs for the MPI benchmark
    for (i = 0; i < npasses; i++)
    {
        if (!amGPUTask)
        {
            MPITest(op, resultDB, grpnumtasks, mygrprank, mypair, bmcomm);
        }
    }
    MPI_Barrier(MPI_COMM_WORLD);

    // merge and print
    pardb.MergeSerialDatabases(resultDB, bmcomm);
    if (mympirank == 0)
        cout << endl
             << "*****************************Sequential GPU and MPI runs****************************"
             << endl;
    DumpInSequence(pardb, mygrprank, mympirank);

    // simultaneous runs, to observe the impact of contention
    MPI_Barrier(MPI_COMM_WORLD);
    if (amGPUTask)
    {
        do
        {
            if (mympirank == 0)
                cout << ".";
            GPUDriver(op, resultDB1);
            flush(cout);
        } while (*mpidone == 0);
        if (mympirank == 0)
            cout << "*" << endl;
    }
    else
    {
        for (i = 0; i < npasses; i++)
        {
            MPITest(op, resultDB1, grpnumtasks, mygrprank, mypair, bmcomm);
        }
        *mpidone = 1;
    }
    MPI_Barrier(MPI_COMM_WORLD);

    // merge and print
    pardb1.MergeSerialDatabases(resultDB1, bmcomm);
    if (mympirank == 0)
        cout << endl
             << "*****************************Simultaneous GPU and MPI runs****************************"
             << endl;
    DumpInSequence(pardb1, mygrprank, mympirank);

    // print summary
    if (!amGPUTask && mygrprank == 0)
    {
        vector<ResultDatabase::Result> prelatency =
            pardb.GetResultsForTest("MPI Latency(mean)");
        vector<ResultDatabase::Result> postlatency =
            pardb1.GetResultsForTest("MPI Latency(mean)");

        cout << endl << "Summarized Mean(Mean) MPI Baseline Latency vs. Latency with Contention";
        cout << endl << "MSG SIZE(B)\t";
        int msgsize = 0;
        for (i = 0; i < prelatency.size(); i++)
        {
            cout << msgsize << "\t";
            msgsize = (msgsize ? msgsize * 2 : msgsize + 1);
        }
        cout << endl << "BASELATENCY\t";
        for (i = 0; i < prelatency.size(); i++)
            cout << setiosflags(ios::fixed) << setprecision(2)
                 << prelatency[i].GetMean() << "\t";
        cout << endl << "CONTLATENCY\t";
        for (i = 0; i < postlatency.size(); i++)
            cout << setiosflags(ios::fixed) << setprecision(2)
                 << postlatency[i].GetMean() << "\t";
        flush(cout);
        cout << endl;
    }
    MPI_Barrier(MPI_COMM_WORLD);

    if (amGPUTask && mympirank == 0)
    {
        vector<ResultDatabase::Result> prespeed =
            pardb.GetResultsForTest("DownloadSpeed(mean)");
        vector<ResultDatabase::Result> postspeed =
            pardb1.GetResultsForTest("DownloadSpeed(mean)");

        cout << endl << "Summarized Mean(Mean) GPU Baseline Download Speed vs. Download Speed with Contention";
        cout << endl << "MSG SIZE(KB)\t";
        int msgsize = 1;
        for (i = 0; i < prespeed.size(); ++i)
        {
            cout << msgsize << "\t";
            msgsize = (msgsize ? msgsize * 2 : msgsize + 1);
        }
        cout << endl << "BASESPEED\t";
        for (i = 0; i < prespeed.size(); ++i)
            cout << setiosflags(ios::fixed) << setprecision(4)
                 << prespeed[i].GetMean() << "\t";
        cout << endl << "CONTSPEED\t";
        for (i = 0; i < postspeed.size(); ++i)
            cout << setiosflags(ios::fixed) << setprecision(4)
                 << postspeed[i].GetMean() << "\t";
        cout << endl;
    }

    if (amGPUTask && mympirank == 0)
    {
        vector<ResultDatabase::Result> pregpulat =
            pardb.GetResultsForTest("DownloadLatencyEstimate(mean)");
        vector<ResultDatabase::Result> postgpulat =
            pardb1.GetResultsForTest("DownloadLatencyEstimate(mean)");

        cout << endl << "Summarized Mean(Mean) GPU Baseline Download Latency vs. Download Latency with Contention";
        cout << endl << "MSG SIZE\t";
        for (i = 0; i < pregpulat.size(); ++i)
        {
            cout << pregpulat[i].atts << "\t";
        }
        cout << endl << "BASEGPULAT\t";
        for (i = 0; i < pregpulat.size(); ++i)
            cout << setiosflags(ios::fixed) << setprecision(7)
                 << pregpulat[i].GetMean() << "\t";
        cout << endl << "CONTGPULAT\t";
        for (i = 0; i < postgpulat.size(); ++i)
            cout << setiosflags(ios::fixed) << setprecision(7)
                 << postgpulat[i].GetMean() << "\t";
        cout << endl;
    }

    // clean up the GPU
    if (amGPUTask)
    {
        GPUCleanup(op);
    }

    MPI_Finalize();
    return 0;
}
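// Minimal sketch of the cross-process "done" flag used above (assumes a
// POSIX system with System V shared memory; error handling omitted; the
// helper name is ours, not SHOC's). The node-rank-0 process creates the
// segment, the id is broadcast over the on-node communicator, everyone
// attaches, and IPC_RMID marks the segment for automatic removal once the
// last process detaches.
#include <mpi.h>
#include <sys/ipc.h>
#include <sys/shm.h>

static volatile unsigned long long*
AttachNodeDoneFlag(int mynoderank, MPI_Comm smpcomm)
{
    int shmid = 0;
    if (mynoderank == 0)
        shmid = shmget(IPC_PRIVATE, sizeof(unsigned long long),
                       IPC_CREAT | 0600);
    MPI_Bcast(&shmid, 1, MPI_INT, 0, smpcomm);
    volatile unsigned long long* flag =
        (volatile unsigned long long*)shmat(shmid, 0, 0);
    if (mynoderank == 0)
        shmctl(shmid, IPC_RMID, 0);  // auto-free after the last detach
    *flag = 0;
    return flag;
}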
template<class T>
void
DoTest(const char* timerDesc, ResultDatabase& resultDB, OptionParser& opts)
{
    StencilFactory<T>* stdStencilFactory = NULL;
    Stencil<T>* stdStencil = NULL;
    StencilFactory<T>* testStencilFactory = NULL;
    Stencil<T>* testStencil = NULL;

    try
    {
        stdStencilFactory = new HostStencilFactory<T>;
        testStencilFactory = new MICStencilFactory<T>;
        assert((stdStencilFactory != NULL) && (testStencilFactory != NULL));

        // do a sanity check on option values
        CheckOptions(opts);
        stdStencilFactory->CheckOptions(opts);
        testStencilFactory->CheckOptions(opts);

        // extract and validate options
        std::vector<long long> arrayDims = opts.getOptionVecInt("customSize");
        if (arrayDims.size() != 2)
        {
            cerr << "Dim size: " << arrayDims.size() << "\n";
            throw InvalidArgValue("overall size must have two dimensions");
        }
        if (arrayDims[0] == 0) // user has not specified a custom size
        {
            const int probSizes[4] = { 768, 1408, 2048, 4096 };
            int sizeClass = opts.getOptionInt("size");
            if (!(sizeClass >= 1 && sizeClass <= 4))
            {
                throw InvalidArgValue("size class must be between 1-4");
            }
            arrayDims[0] = arrayDims[1] = probSizes[sizeClass - 1];
        }

        long int seed = (long)opts.getOptionInt("seed");
        bool beVerbose = opts.getOptionBool("verbose");
        unsigned int nIters = (unsigned int)opts.getOptionInt("num-iters");
        double valErrThreshold = (double)opts.getOptionFloat("val-threshold");
        unsigned int nValErrsToPrint = (unsigned int)opts.getOptionInt("val-print-limit");

#if defined(PARALLEL)
        unsigned int haloWidth = (unsigned int)opts.getOptionInt("iters-per-exchange");
#else
        unsigned int haloWidth = 1;
#endif // defined(PARALLEL)
        float haloVal = (float)opts.getOptionFloat("haloVal");

        // build a description of this experiment
        std::ostringstream experimentDescriptionStr;
        experimentDescriptionStr << nIters << ':'
                                 << arrayDims[0] << 'x' << arrayDims[1] << ':'
                                 << LROWS << 'x' << LCOLS;

        unsigned int nPasses = (unsigned int)opts.getOptionInt("passes");

        // 9-point stencil: 11 flops per interior point per iteration
        unsigned long npts = (arrayDims[0] + 2 * haloWidth - 2) *
                             (arrayDims[1] + 2 * haloWidth - 2);
        unsigned long nflops = npts * 11 * nIters;
        cout << "flops are = " << nflops << endl;

        // compute the expected result on the host
#if defined(PARALLEL)
        int cwrank;
        MPI_Comm_rank(MPI_COMM_WORLD, &cwrank);
        if (cwrank == 0)
        {
#endif // defined(PARALLEL)
        std::cout << "\nPerforming stencil operation on host for later comparison with MIC output\n"
                  << "Depending on host capabilities, this may take a while."
                  << std::endl;
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)
        Matrix2D<T> exp(arrayDims[0] + 2 * haloWidth,
                        arrayDims[1] + 2 * haloWidth);
        Initialize<T> init(seed, haloWidth, haloVal);
        init(exp);
        if (beVerbose)
        {
            std::cout << "initial state:\n" << exp << std::endl;
        }

        // NB: assign to the function-scope pointers (do not redeclare them
        // here); local shadows would leak and leave the cleanup code
        // deleting NULL.
        stdStencil = stdStencilFactory->BuildStencil(opts);
        (*stdStencil)(exp, nIters);
        if (beVerbose)
        {
            std::cout << "expected result:\n" << exp << std::endl;
        }

        // compute the result on the MIC device
        Matrix2D<T> data(arrayDims[0] + 2 * haloWidth,
                         arrayDims[1] + 2 * haloWidth);
        testStencil = testStencilFactory->BuildStencil(opts);

#if defined(PARALLEL)
        MPI_Comm_rank(MPI_COMM_WORLD, &cwrank);
        if (cwrank == 0)
        {
#endif // defined(PARALLEL)
        std::cout << "\nPerforming stencil operation on chosen device, "
                  << nPasses << " passes.\n"
                  << "Depending on chosen device, this may take a while."
                  << std::endl;
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)
#if !defined(PARALLEL)
        std::cout << "At the end of each pass the number of validation\nerrors observed will be printed to the standard output."
                  << std::endl;
#endif // !defined(PARALLEL)
        std::cout << "Passes:" << nPasses << endl;

        for (unsigned int pass = 0; pass < nPasses; pass++)
        {
            init(data);

            double start = curr_second();
            (*testStencil)(data, nIters);
            double elapsedTime = curr_second() - start;

            // find and report the computation rate
            double gflops = (nflops / elapsedTime) / 1e9;
            resultDB.AddResult(timerDesc,
                               experimentDescriptionStr.str(),
                               "GFLOPS",
                               gflops);
            if (beVerbose)
            {
                std::cout << "observed result, pass " << pass << ":\n"
                          << data << std::endl;
            }

            // validate the result
            MICValidate(exp, data, valErrThreshold, nValErrsToPrint);
        }
    }
    catch (...)
    {
        // clean up - abnormal termination
        // wish we didn't have to do this, but C++ exceptions do not
        // support a try-catch-finally approach
        delete stdStencil;
        delete stdStencilFactory;
        delete testStencil;
        delete testStencilFactory;
        throw;
    }

    // clean up - normal termination
    delete stdStencil;
    delete stdStencilFactory;
    delete testStencil;
    delete testStencilFactory;
}
// ****************************************************************************
// Function: main
//
// Purpose:
//   The main function takes care of initialization (device and MPI), then
//   performs the benchmark and prints results.
//
// Arguments:
//
//
// Programmer: Jeremy Meredith
// Creation:
//
// Modifications:
//   Jeremy Meredith, Wed Nov 10 14:20:47 EST 2010
//   Split timing reports into detailed and summary.  For serial code, we
//   report all trial values, and for parallel, skip the per-process vals.
//   Also detect and print outliers from parallel runs.
//
// ****************************************************************************
int main(int argc, char *argv[])
{
    int ret = 0;
    bool noprompt = false;

    try
    {
#ifdef PARALLEL
        int rank, size;
        MPI_Init(&argc, &argv);
        MPI_Comm_size(MPI_COMM_WORLD, &size);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        cerr << "MPI Task " << rank << "/" << size - 1 << " starting....\n";
#endif

        // get args
        OptionParser op;

        // add shared options to the parser
        op.addOption("device", OPT_VECINT, "0", "specify device(s) to run on", 'd');
        op.addOption("verbose", OPT_BOOL, "", "enable verbose output", 'v');
        op.addOption("passes", OPT_INT, "10", "specify number of passes", 'n');
        op.addOption("size", OPT_INT, "1", "specify problem size", 's');
        op.addOption("infoDevices", OPT_BOOL, "", "show info for available platforms and devices", 'i');
        op.addOption("quiet", OPT_BOOL, "", "write minimum necessary to standard output", 'q');
#ifdef _WIN32
        op.addOption("noprompt", OPT_BOOL, "", "don't wait for prompt at program exit");
#endif

        addBenchmarkSpecOptions(op);

        if (!op.parse(argc, argv))
        {
#ifdef PARALLEL
            if (rank == 0)
                op.usage();
            MPI_Finalize();
#else
            op.usage();
#endif
            return (op.HelpRequested() ? 0 : 1);
        }

        bool verbose = op.getOptionBool("verbose");
        bool infoDev = op.getOptionBool("infoDevices");
#ifdef _WIN32
        noprompt = op.getOptionBool("noprompt");
#endif

        int device;
#ifdef PARALLEL
        NodeInfo ni;
        int myNodeRank = ni.nodeRank();
        vector<long long> deviceVec = op.getOptionVecInt("device");
        if (myNodeRank >= (int)deviceVec.size())
        {
            // default is for task i to test device i
            device = myNodeRank;
        }
        else
        {
            device = deviceVec[myNodeRank];
        }
#else
        device = op.getOptionVecInt("device")[0];
#endif
        int deviceCount;
        cudaGetDeviceCount(&deviceCount);
        if (device >= deviceCount)
        {
            cerr << "Warning: device index: " << device
                 << " out of range, defaulting to device 0.\n";
            device = 0;
        }

        // initialization
        EnumerateDevicesAndChoose(device, infoDev);
        if (infoDev)
        {
            return 0;
        }
        ResultDatabase resultDB;

        // run the benchmark
        RunBenchmark(resultDB, op);

#ifndef PARALLEL
        resultDB.DumpDetailed(cout);
#else
        ParallelResultDatabase pardb;
        pardb.MergeSerialDatabases(resultDB, MPI_COMM_WORLD);
        if (rank == 0)
        {
            pardb.DumpSummary(cout);
            pardb.DumpOutliers(cout);
        }
#endif
    }
    catch (InvalidArgValue& e)
    {
        std::cerr << e.what() << ": " << e.GetMessage() << std::endl;
        ret = 1;
    }
    catch (std::exception& e)
    {
        std::cerr << e.what() << std::endl;
        ret = 1;
    }
    catch (...)
    {
        ret = 1;
    }

#ifdef PARALLEL
    MPI_Finalize();
#endif

#ifdef _WIN32
    if (!noprompt)
    {
        cout << "Press return to exit\n";
        cin.get();
    }
#endif

    return ret;
}
void init(OptionParser& op, bool _do_dp)
{
    cl_int err;

    do_dp = _do_dp;

    if (!fftCtx)
    {
        // first, get the device
        int device, platform = op.getOptionInt("platform");
        if (op.getOptionVecInt("device").size() > 0)
        {
            device = op.getOptionVecInt("device")[0];
        }
        else
        {
            device = 0;
        }
        fftDev = ListDevicesAndGetDevice(platform, device);

        // now get the context
        fftCtx = clCreateContext(NULL, 1, &fftDev, NULL, NULL, &err);
        CL_CHECK_ERROR(err);
    }

    if (!fftQueue)
    {
        // get a queue
        fftQueue = clCreateCommandQueue(fftCtx, fftDev,
                                        CL_QUEUE_PROFILING_ENABLE,
                                        &err);
        CL_CHECK_ERROR(err);
    }

    // create the program...
    fftProg = clCreateProgramWithSource(fftCtx, 1, &cl_source_fft, NULL,
                                        &err);
    CL_CHECK_ERROR(err);

    // ...and build it
    string args = " -cl-mad-enable ";
    if (op.getOptionBool("use-native"))
    {
        args += " -cl-fast-relaxed-math ";
    }
    if (!do_dp)
    {
        args += " -DSINGLE_PRECISION ";
    }
    else if (checkExtension(fftDev, "cl_khr_fp64"))
    {
        args += " -DK_DOUBLE_PRECISION ";
    }
    else if (checkExtension(fftDev, "cl_amd_fp64"))
    {
        args += " -DAMD_DOUBLE_PRECISION ";
    }

    // NB: keep the build status in its own variable; the build-log queries
    // below would otherwise overwrite it before we test it.
    cl_int buildErr = clBuildProgram(fftProg, 0, NULL, args.c_str(), NULL,
                                     NULL);
    {
        char* log = NULL;
        size_t bytesRequired = 0;
        err = clGetProgramBuildInfo(fftProg, fftDev, CL_PROGRAM_BUILD_LOG,
                                    0, NULL, &bytesRequired);
        log = (char*)malloc(bytesRequired + 1);
        err = clGetProgramBuildInfo(fftProg, fftDev, CL_PROGRAM_BUILD_LOG,
                                    bytesRequired, log, NULL);
        std::cout << log << std::endl;
        free(log);
    }
    if (buildErr != CL_SUCCESS)
    {
        char log[50000];
        size_t retsize = 0;
        err = clGetProgramBuildInfo(fftProg, fftDev, CL_PROGRAM_BUILD_LOG,
                                    50000 * sizeof(char), log, &retsize);
        CL_CHECK_ERROR(err);
        cout << "Retsize: " << retsize << endl;
        cout << "Log: " << log << endl;
        dumpPTXCode(fftCtx, fftProg, "oclFFT");
        exit(-1);
    }
    else
    {
        // dumpPTXCode(fftCtx, fftProg, "oclFFT");
    }

    // create kernel for forward FFT
    fftKrnl = clCreateKernel(fftProg, "fft1D_512", &err);
    CL_CHECK_ERROR(err);

    // create kernel for inverse FFT
    ifftKrnl = clCreateKernel(fftProg, "ifft1D_512", &err);
    CL_CHECK_ERROR(err);

    // create kernel for check
    chkKrnl = clCreateKernel(fftProg, "chk1D_512", &err);
    CL_CHECK_ERROR(err);
}
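// A plausible implementation of the checkExtension() helper used above
// (a sketch under the assumption that it simply searches the device's
// extension string; the real SHOC helper may differ). CL_DEVICE_EXTENSIONS
// returns a space-separated list of extension names.
#include <CL/cl.h>
#include <string>

static bool checkExtensionSketch(cl_device_id dev, const std::string& ext)
{
    size_t len = 0;
    if (clGetDeviceInfo(dev, CL_DEVICE_EXTENSIONS, 0, NULL, &len) != CL_SUCCESS)
        return false;
    std::string exts(len, '\0');
    if (clGetDeviceInfo(dev, CL_DEVICE_EXTENSIONS, len, &exts[0], NULL) != CL_SUCCESS)
        return false;
    return exts.find(ext) != std::string::npos;
}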