bool runTest(int argc, const char **argv) { bool ok = true; float *host_output; float *device_output; float *input; float *coeff; int defaultDim; int dimx; int dimy; int dimz; int outerDimx; int outerDimy; int outerDimz; int radius; int timesteps; size_t volumeSize; memsize_t memsize; const float lowerBound = 0.0f; const float upperBound = 1.0f; // Determine default dimensions shrLog("Set-up, based upon target device GMEM size...\n"); if (ok) { // Get the memory size of the target device shrLog(" getTargetDeviceGlobalMemSize\n"); ok = getTargetDeviceGlobalMemSize(&memsize, argc, argv); } if (ok) { // We can never use all the memory so to keep things simple we aim to // use around half the total memory memsize /= 2; // Most of our memory use is taken up by the input and output buffers - // two buffers of equal size - and for simplicity the volume is a cube: // dim = floor( (N/2)^(1/3) ) defaultDim = (int)floor(pow((memsize / (2.0 * sizeof(float))), 1.0/3.0)); // By default, make the volume edge size an integer multiple of 128B to // improve performance by coalescing memory accesses, in a real // application it would make sense to pad the lines accordingly int roundTarget = 128 / sizeof(float); defaultDim = defaultDim / roundTarget * roundTarget; defaultDim -= k_radius_default * 2; // Check dimension is valid if (defaultDim < k_dim_min) { shrLogEx(LOGBOTH | ERRORMSG, -1000, STDERROR); shrLog("\tinsufficient device memory (maximum volume on device is %d, must be between %d and %d).\n", defaultDim, k_dim_min, k_dim_max); ok = false; } else if (defaultDim > k_dim_max) { defaultDim = k_dim_max; } } // For QA testing, override default volume size if (ok) { if (shrCheckCmdLineFlag(argc, argv, "qatest")) { defaultDim = MIN(defaultDim, k_dim_qa); } } // Parse command line arguments if (ok) { char *dim = 0; if (shrGetCmdLineArgumentstr(argc, argv, "dimx", &dim)) { dimx = (int)atoi(dim); if (dimx < k_dim_min || dimx > k_dim_max) { shrLogEx(LOGBOTH | ERRORMSG, -1001, STDERROR); shrLog("\tdimx out of range (%d requested, must be between %d and %d), see header files for details.\n", dimx, k_dim_min, k_dim_max); ok = false; } } else { dimx = defaultDim; } if (shrGetCmdLineArgumentstr(argc, argv, "dimy", &dim)) { dimy = (int)atoi(dim); if (dimy < k_dim_min || dimy > k_dim_max) { shrLogEx(LOGBOTH | ERRORMSG, -1002, STDERROR); shrLog("\tdimy out of range (%d requested, must be between %d and %d), see header files for details.\n", dimy, k_dim_min, k_dim_max); ok = false; } } else { dimy = defaultDim; } if (shrGetCmdLineArgumentstr(argc, argv, "dimz", &dim)) { dimz = (int)atoi(dim); if (dimz < k_dim_min || dimz > k_dim_max) { shrLogEx(LOGBOTH | ERRORMSG, -1003, STDERROR); shrLog("\tdimz out of range (%d requested, must be between %d and %d), see header files for details.\n", dimz, k_dim_min, k_dim_max); ok = false; } } else { dimz = defaultDim; } if (shrGetCmdLineArgumentstr(argc, argv, "radius", &dim)) { radius = (int)atoi(dim); if (radius < k_radius_min || radius >= k_radius_max) { shrLogEx(LOGBOTH | ERRORMSG, -1004, STDERROR); shrLog("\tradius out of range (%d requested, must be between %d and %d), see header files for details.\n", radius, k_radius_min, k_radius_max); ok = false; } } else { radius = k_radius_default; } if (shrGetCmdLineArgumentstr(argc, argv, "timesteps", &dim)) { timesteps = (int)atoi(dim); if (timesteps < k_timesteps_min || radius >= k_timesteps_max) { shrLogEx(LOGBOTH | ERRORMSG, -1005, STDERROR); shrLog("\ttimesteps out of range (%d requested, must be between %d and %d), see header files for details.\n", timesteps, k_timesteps_min, k_timesteps_max); ok = false; } } else { timesteps = k_timesteps_default; } if (dim) free(dim); } // Determine volume size if (ok) { outerDimx = dimx + 2 * radius; outerDimy = dimy + 2 * radius; outerDimz = dimz + 2 * radius; volumeSize = outerDimx * outerDimy * outerDimz; } // Allocate memory if (ok) { shrLog(" calloc host_output\n"); if ((host_output = (float *)calloc(volumeSize, sizeof(float))) == NULL) { shrLogEx(LOGBOTH | ERRORMSG, -1006, STDERROR); shrLog("\tInsufficient memory for host_output calloc, please try a smaller volume (use --help for syntax).\n"); ok = false; } } if (ok) { shrLog(" malloc input\n"); if ((input = (float *)malloc(volumeSize * sizeof(float))) == NULL) { shrLogEx(LOGBOTH | ERRORMSG, -1007, STDERROR); shrLog("\tInsufficient memory for input malloc, please try a smaller volume (use --help for syntax).\n"); ok = false; } } if (ok) { shrLog(" malloc coeff\n"); if ((coeff = (float *)malloc((radius + 1) * sizeof(float))) == NULL) { shrLogEx(LOGBOTH | ERRORMSG, -1008, STDERROR); shrLog("\tInsufficient memory for coeff malloc, please try a smaller volume (use --help for syntax).\n"); ok = false; } } // Create coefficients if (ok) { for (int i = 0 ; i <= radius ; i++) { coeff[i] = 0.1f; } } // Generate data if (ok) { shrLog(" generateRandomData\n\n"); generateRandomData(input, outerDimx, outerDimy, outerDimz, lowerBound, upperBound); } if (ok) { shrLog("FDTD on %d x %d x %d volume with symmetric filter radius %d for %d timesteps...\n\n", dimx, dimy, dimz, radius, timesteps); } // Execute on the host if (ok) { shrLog("fdtdReference...\n"); ok = fdtdReference(host_output, input, coeff, dimx, dimy, dimz, radius, timesteps); shrLog("fdtdReference complete\n"); } // Allocate memory if (ok) { shrLog(" calloc device_output\n"); if ((device_output = (float *)calloc(volumeSize, sizeof(float))) == NULL) { shrLogEx(LOGBOTH | ERRORMSG, -1009, STDERROR); shrLog("\tInsufficient memory for device output calloc, please try a smaller volume (use --help for syntax).\n"); ok = false; } } // Execute on the device if (ok) { shrLog("fdtdGPU...\n"); ok = fdtdGPU(device_output, input, coeff, dimx, dimy, dimz, radius, timesteps, argc, argv); shrLog("fdtdGPU complete\n"); } // Compare the results if (ok) { float tolerance = 0.0001f; shrLog("\nCompareData (tolerance %f)...\n", tolerance); ok = compareData(device_output, host_output, dimx, dimy, dimz, radius, tolerance); } return ok; }
// Main function // ********************************************************************* int main(int argc, char** argv) { shrQAStart(argc, argv); // get command line arg for quick test, if provided bNoPrompt = shrCheckCmdLineFlag(argc, (const char **)argv, "noprompt"); // start logs cExecutableName = argv[0]; shrSetLogFileName ("oclMatVecMul.txt"); shrLog("%s Starting...\n\n", argv[0]); // calculate matrix height given GPU memory shrLog("Determining Matrix height from available GPU mem...\n"); memsize_t memsize; getTargetDeviceGlobalMemSize(&memsize, argc, (const char **)argv); height = memsize/width/16; if (height > MAX_HEIGHT) height = MAX_HEIGHT; shrLog(" Matrix width\t= %u\n Matrix height\t= %u\n\n", width, height); // Allocate and initialize host arrays shrLog("Allocate and Init Host Mem...\n\n"); unsigned int size = width * height; unsigned int mem_size_M = size * sizeof(float); M = (float*)malloc(mem_size_M); unsigned int mem_size_V = width * sizeof(float); V = (float*)malloc(mem_size_V); unsigned int mem_size_W = height * sizeof(float); W = (float*)malloc(mem_size_W); shrFillArray(M, size); shrFillArray(V, width); Golden = (float*)malloc(mem_size_W); MatVecMulHost(M, V, width, height, Golden); //Get the NVIDIA platform shrLog("Get the Platform ID...\n\n"); ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); //Get all the devices shrLog("Get the Device info and select Device...\n"); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) ); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Set target device and Query number of compute units on targetDevice shrLog(" # of Devices Available = %u\n", uiNumDevices); if(shrGetCmdLineArgumentu(argc, (const char **)argv, "device", &targetDevice)== shrTRUE) { targetDevice = CLAMP(targetDevice, 0, (uiNumDevices - 1)); } shrLog(" Using Device %u: ", targetDevice); oclPrintDevName(LOGBOTH, cdDevices[targetDevice]); cl_uint num_compute_units; clGetDeviceInfo(cdDevices[targetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(num_compute_units), &num_compute_units, NULL); shrLog("\n # of Compute Units = %u\n\n", num_compute_units); //Create the context shrLog("clCreateContext...\n"); cxGPUContext = clCreateContext(0, uiNumDevsUsed, &cdDevices[targetDevice], NULL, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Create a command-queue shrLog("clCreateCommandQueue...\n"); cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[targetDevice], CL_QUEUE_PROFILING_ENABLE, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Allocate the OpenCL buffer memory objects for source and result on the device GMEM shrLog("clCreateBuffer (M, V and W in device global memory, mem_size_m = %u)...\n", mem_size_M); cmM = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, mem_size_M, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); cmV = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, mem_size_V, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); cmW = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, mem_size_W, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Read the OpenCL kernel in from source file shrLog("oclLoadProgSource (%s)...\n", cSourceFile); cPathAndName = shrFindFilePath(cSourceFile, argv[0]); oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup); cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength); oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup); // Create the program shrLog("clCreateProgramWithSource...\n"); cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum); // Build the program shrLog("clBuildProgram...\n"); ciErrNum = clBuildProgram(cpProgram, uiNumDevsUsed, &cdDevices[targetDevice], "-cl-fast-relaxed-math", NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then cleanup and exit shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclMatVecMul.ptx"); shrQAFinish(argc, (const char **)argv, QA_FAILED); Cleanup(EXIT_FAILURE); } // -------------------------------------------------------- // Core sequence... copy input data to GPU, compute, copy results back // Asynchronous write of data to GPU device shrLog("clEnqueueWriteBuffer (M and V)...\n\n"); ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmM, CL_FALSE, 0, mem_size_M, M, 0, NULL, NULL); ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue, cmV, CL_FALSE, 0, mem_size_V, V, 0, NULL, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Kernels const char* kernels[] = { "MatVecMulUncoalesced0", "MatVecMulUncoalesced1", "MatVecMulCoalesced0", "MatVecMulCoalesced1", "MatVecMulCoalesced2", "MatVecMulCoalesced3" }; for (int k = 0; k < (int)(sizeof(kernels)/sizeof(char*)); ++k) { shrLog("Running with Kernel %s...\n\n", kernels[k]); // Clear result shrLog(" Clear result with clEnqueueWriteBuffer (W)...\n"); memset(W, 0, mem_size_W); ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmW, CL_FALSE, 0, mem_size_W, W, 0, NULL, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Create the kernel shrLog(" clCreateKernel...\n"); if (ckKernel) { clReleaseKernel(ckKernel); ckKernel = 0; } ckKernel = clCreateKernel(cpProgram, kernels[k], &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Set and log Global and Local work size dimensions szLocalWorkSize = 256; if (k == 0) szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, height); // rounded up to the nearest multiple of the LocalWorkSize else // Some experiments should be done here for determining the best global work size for a given device // We will assume here that we can run 2 work-groups per compute unit szGlobalWorkSize = 2 * num_compute_units * szLocalWorkSize; shrLog(" Global Work Size \t\t= %u\n Local Work Size \t\t= %u\n # of Work Groups \t\t= %u\n", szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize)); // Set the Argument values shrLog(" clSetKernelArg...\n\n"); int n = 0; ciErrNum = clSetKernelArg(ckKernel, n++, sizeof(cl_mem), (void*)&cmM); ciErrNum |= clSetKernelArg(ckKernel, n++, sizeof(cl_mem), (void*)&cmV); ciErrNum |= clSetKernelArg(ckKernel, n++, sizeof(cl_int), (void*)&width); ciErrNum |= clSetKernelArg(ckKernel, n++, sizeof(cl_int), (void*)&height); ciErrNum |= clSetKernelArg(ckKernel, n++, sizeof(cl_mem), (void*)&cmW); if (k > 1) ciErrNum |= clSetKernelArg(ckKernel, n++, szLocalWorkSize * sizeof(float), 0); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Launch kernel shrLog(" clEnqueueNDRangeKernel (%s)...\n", kernels[k]); ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, &ceEvent); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Read back results and check accumulated errors shrLog(" clEnqueueReadBuffer (W)...\n"); ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmW, CL_TRUE, 0, mem_size_W, W, 0, NULL, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); #ifdef GPU_PROFILING // Execution time ciErrNum = clWaitForEvents(1, &ceEvent); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); cl_ulong start, end; ciErrNum = clGetEventProfilingInfo(ceEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); ciErrNum |= clGetEventProfilingInfo(ceEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); double dSeconds = 1.0e-9 * (double)(end - start); shrLog(" Kernel execution time: %.5f s\n\n", dSeconds); #endif // Compare results for golden-host and report errors and pass/fail shrLog(" Comparing against Host/C++ computation...\n\n"); shrBOOL res = shrCompareL2fe(Golden, W, height, 1e-6f); shrLog(" GPU Result %s CPU Result within allowable tolerance\n\n", (res == shrTRUE) ? "MATCHES" : "DOESN'T MATCH"); bPassFlag &= (res == shrTRUE); // Release event ciErrNum = clReleaseEvent(ceEvent); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); ceEvent = 0; } // Master status Pass/Fail (all tests) shrQAFinish(argc, (const char **)argv, (bPassFlag ? QA_PASSED : QA_FAILED) ); // Cleanup and leave Cleanup (EXIT_SUCCESS); }