bool runTest(int argc, const char **argv) { bool ok = true; float *host_output; float *device_output; float *input; float *coeff; int defaultDim; int dimx; int dimy; int dimz; int outerDimx; int outerDimy; int outerDimz; int radius; int timesteps; size_t volumeSize; memsize_t memsize; const float lowerBound = 0.0f; const float upperBound = 1.0f; // Determine default dimensions shrLog("Set-up, based upon target device GMEM size...\n"); if (ok) { // Get the memory size of the target device shrLog(" getTargetDeviceGlobalMemSize\n"); ok = getTargetDeviceGlobalMemSize(&memsize, argc, argv); } if (ok) { // We can never use all the memory so to keep things simple we aim to // use around half the total memory memsize /= 2; // Most of our memory use is taken up by the input and output buffers - // two buffers of equal size - and for simplicity the volume is a cube: // dim = floor( (N/2)^(1/3) ) defaultDim = (int)floor(pow((memsize / (2.0 * sizeof(float))), 1.0/3.0)); // By default, make the volume edge size an integer multiple of 128B to // improve performance by coalescing memory accesses, in a real // application it would make sense to pad the lines accordingly int roundTarget = 128 / sizeof(float); defaultDim = defaultDim / roundTarget * roundTarget; defaultDim -= k_radius_default * 2; // Check dimension is valid if (defaultDim < k_dim_min) { shrLogEx(LOGBOTH | ERRORMSG, -1000, STDERROR); shrLog("\tinsufficient device memory (maximum volume on device is %d, must be between %d and %d).\n", defaultDim, k_dim_min, k_dim_max); ok = false; } else if (defaultDim > k_dim_max) { defaultDim = k_dim_max; } } // For QA testing, override default volume size if (ok) { if (shrCheckCmdLineFlag(argc, argv, "qatest")) { defaultDim = MIN(defaultDim, k_dim_qa); } } // Parse command line arguments if (ok) { char *dim = 0; if (shrGetCmdLineArgumentstr(argc, argv, "dimx", &dim)) { dimx = (int)atoi(dim); if (dimx < k_dim_min || dimx > k_dim_max) { shrLogEx(LOGBOTH | ERRORMSG, -1001, STDERROR); shrLog("\tdimx out of range (%d requested, must be between %d and %d), see header files for details.\n", dimx, k_dim_min, k_dim_max); ok = false; } } else { dimx = defaultDim; } if (shrGetCmdLineArgumentstr(argc, argv, "dimy", &dim)) { dimy = (int)atoi(dim); if (dimy < k_dim_min || dimy > k_dim_max) { shrLogEx(LOGBOTH | ERRORMSG, -1002, STDERROR); shrLog("\tdimy out of range (%d requested, must be between %d and %d), see header files for details.\n", dimy, k_dim_min, k_dim_max); ok = false; } } else { dimy = defaultDim; } if (shrGetCmdLineArgumentstr(argc, argv, "dimz", &dim)) { dimz = (int)atoi(dim); if (dimz < k_dim_min || dimz > k_dim_max) { shrLogEx(LOGBOTH | ERRORMSG, -1003, STDERROR); shrLog("\tdimz out of range (%d requested, must be between %d and %d), see header files for details.\n", dimz, k_dim_min, k_dim_max); ok = false; } } else { dimz = defaultDim; } if (shrGetCmdLineArgumentstr(argc, argv, "radius", &dim)) { radius = (int)atoi(dim); if (radius < k_radius_min || radius >= k_radius_max) { shrLogEx(LOGBOTH | ERRORMSG, -1004, STDERROR); shrLog("\tradius out of range (%d requested, must be between %d and %d), see header files for details.\n", radius, k_radius_min, k_radius_max); ok = false; } } else { radius = k_radius_default; } if (shrGetCmdLineArgumentstr(argc, argv, "timesteps", &dim)) { timesteps = (int)atoi(dim); if (timesteps < k_timesteps_min || radius >= k_timesteps_max) { shrLogEx(LOGBOTH | ERRORMSG, -1005, STDERROR); shrLog("\ttimesteps out of range (%d requested, must be between %d and %d), see header files for details.\n", timesteps, k_timesteps_min, k_timesteps_max); ok = false; } } else { timesteps = k_timesteps_default; } if (dim) free(dim); } // Determine volume size if (ok) { outerDimx = dimx + 2 * radius; outerDimy = dimy + 2 * radius; outerDimz = dimz + 2 * radius; volumeSize = outerDimx * outerDimy * outerDimz; } // Allocate memory if (ok) { shrLog(" calloc host_output\n"); if ((host_output = (float *)calloc(volumeSize, sizeof(float))) == NULL) { shrLogEx(LOGBOTH | ERRORMSG, -1006, STDERROR); shrLog("\tInsufficient memory for host_output calloc, please try a smaller volume (use --help for syntax).\n"); ok = false; } } if (ok) { shrLog(" malloc input\n"); if ((input = (float *)malloc(volumeSize * sizeof(float))) == NULL) { shrLogEx(LOGBOTH | ERRORMSG, -1007, STDERROR); shrLog("\tInsufficient memory for input malloc, please try a smaller volume (use --help for syntax).\n"); ok = false; } } if (ok) { shrLog(" malloc coeff\n"); if ((coeff = (float *)malloc((radius + 1) * sizeof(float))) == NULL) { shrLogEx(LOGBOTH | ERRORMSG, -1008, STDERROR); shrLog("\tInsufficient memory for coeff malloc, please try a smaller volume (use --help for syntax).\n"); ok = false; } } // Create coefficients if (ok) { for (int i = 0 ; i <= radius ; i++) { coeff[i] = 0.1f; } } // Generate data if (ok) { shrLog(" generateRandomData\n\n"); generateRandomData(input, outerDimx, outerDimy, outerDimz, lowerBound, upperBound); } if (ok) { shrLog("FDTD on %d x %d x %d volume with symmetric filter radius %d for %d timesteps...\n\n", dimx, dimy, dimz, radius, timesteps); } // Execute on the host if (ok) { shrLog("fdtdReference...\n"); ok = fdtdReference(host_output, input, coeff, dimx, dimy, dimz, radius, timesteps); shrLog("fdtdReference complete\n"); } // Allocate memory if (ok) { shrLog(" calloc device_output\n"); if ((device_output = (float *)calloc(volumeSize, sizeof(float))) == NULL) { shrLogEx(LOGBOTH | ERRORMSG, -1009, STDERROR); shrLog("\tInsufficient memory for device output calloc, please try a smaller volume (use --help for syntax).\n"); ok = false; } } // Execute on the device if (ok) { shrLog("fdtdGPU...\n"); ok = fdtdGPU(device_output, input, coeff, dimx, dimy, dimz, radius, timesteps, argc, argv); shrLog("fdtdGPU complete\n"); } // Compare the results if (ok) { float tolerance = 0.0001f; shrLog("\nCompareData (tolerance %f)...\n", tolerance); ok = compareData(device_output, host_output, dimx, dimy, dimz, radius, tolerance); } return ok; }
//////////////////////////////////////////////////////////////////////////////// //! Run a simple test for //////////////////////////////////////////////////////////////////////////////// int runTest(int argc, const char** argv) { cl_platform_id cpPlatform = NULL; cl_uint ciDeviceCount = 0; cl_device_id *cdDevices = NULL; cl_int ciErrNum = CL_SUCCESS; //Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); if (ciErrNum != CL_SUCCESS) { shrLog("Error: Failed to create OpenCL context!\n"); return ciErrNum; } //Get the devices ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &ciDeviceCount); cdDevices = (cl_device_id *)malloc(ciDeviceCount * sizeof(cl_device_id) ); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, ciDeviceCount, cdDevices, NULL); if (ciErrNum != CL_SUCCESS) { shrLog("Error: Failed to create OpenCL context!\n"); return ciErrNum; } //Create the context cxGPUContext = clCreateContext(0, ciDeviceCount, cdDevices, NULL, NULL, &ciErrNum); if (ciErrNum != CL_SUCCESS) { shrLog("Error: Failed to create OpenCL context!\n"); return ciErrNum; } if(shrCheckCmdLineFlag(argc, (const char**)argv, "device")) { // User specified GPUs char* deviceList; char* deviceStr; char* next_token; shrGetCmdLineArgumentstr(argc, (const char**)argv, "device", &deviceList); #ifdef WIN32 deviceStr = strtok_s (deviceList," ,.-", &next_token); #else deviceStr = strtok (deviceList," ,.-"); #endif ciDeviceCount = 0; while(deviceStr != NULL) { // get and print the device for this queue cl_device_id device = oclGetDev(cxGPUContext, atoi(deviceStr)); if( device == (cl_device_id) -1 ) { shrLog(" Device %s does not exist!\n", deviceStr); return -1; } shrLog("Device %s: ", deviceStr); oclPrintDevName(LOGBOTH, device); shrLog("\n"); // create command queue commandQueue[ciDeviceCount] = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); if (ciErrNum != CL_SUCCESS) { shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum); return ciErrNum; } ++ciDeviceCount; #ifdef WIN32 deviceStr = strtok_s (NULL," ,.-", &next_token); #else deviceStr = strtok (NULL," ,.-"); #endif } free(deviceList); } else { // Find out how many GPU's to compute on all available GPUs size_t nDeviceBytes; ciErrNum |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes); ciDeviceCount = (cl_uint)nDeviceBytes/sizeof(cl_device_id); if (ciErrNum != CL_SUCCESS) { shrLog(" Error %i in clGetDeviceIDs call !!!\n\n", ciErrNum); return ciErrNum; } else if (ciDeviceCount == 0) { shrLog(" There are no devices supporting OpenCL (return code %i)\n\n", ciErrNum); return -1; } // create command-queues for(unsigned int i = 0; i < ciDeviceCount; ++i) { // get and print the device for this queue cl_device_id device = oclGetDev(cxGPUContext, i); shrLog("Device %d: ", i); oclPrintDevName(LOGBOTH, device); shrLog("\n"); // create command queue commandQueue[i] = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); if (ciErrNum != CL_SUCCESS) { shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum); return ciErrNum; } } } // Optional Command-line multiplier for matrix sizes shrGetCmdLineArgumenti(argc, (const char**)argv, "sizemult", &iSizeMultiple); iSizeMultiple = CLAMP(iSizeMultiple, 1, 10); uiWA = WA * iSizeMultiple; uiHA = HA * iSizeMultiple; uiWB = WB * iSizeMultiple; uiHB = HB * iSizeMultiple; uiWC = WC * iSizeMultiple; uiHC = HC * iSizeMultiple; shrLog("\nUsing Matrix Sizes: A(%u x %u), B(%u x %u), C(%u x %u)\n", uiWA, uiHA, uiWB, uiHB, uiWC, uiHC); // allocate host memory for matrices A and B unsigned int size_A = uiWA * uiHA; unsigned int mem_size_A = sizeof(float) * size_A; float* h_A_data = (float*)malloc(mem_size_A); unsigned int size_B = uiWB * uiHB; unsigned int mem_size_B = sizeof(float) * size_B; float* h_B_data = (float*)malloc(mem_size_B); // initialize host memory srand(2006); shrFillArray(h_A_data, size_A); shrFillArray(h_B_data, size_B); // allocate host memory for result unsigned int size_C = uiWC * uiHC; unsigned int mem_size_C = sizeof(float) * size_C; float* h_C = (float*) malloc(mem_size_C); // create OpenCL buffer pointing to the host memory cl_mem h_A = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, mem_size_A, h_A_data, &ciErrNum); if (ciErrNum != CL_SUCCESS) { shrLog("Error: clCreateBuffer\n"); return ciErrNum; } // Program Setup size_t program_length; const char* header_path = shrFindFilePath("matrixMul.h", argv[0]); oclCheckError(header_path != NULL, shrTRUE); char* header = oclLoadProgSource(header_path, "", &program_length); if(!header) { shrLog("Error: Failed to load the header %s!\n", header_path); return -1000; } const char* source_path = shrFindFilePath("matrixMul.cl", argv[0]); oclCheckError(source_path != NULL, shrTRUE); char *source = oclLoadProgSource(source_path, header, &program_length); if(!source) { shrLog("Error: Failed to load compute program %s!\n", source_path); return -2000; } // create the program cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&source, &program_length, &ciErrNum); if (ciErrNum != CL_SUCCESS) { shrLog("Error: Failed to create program\n"); return ciErrNum; } free(header); free(source); // build the program ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then return error shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclMatrixMul.ptx"); return ciErrNum; } // write out PTX if requested on the command line if(shrCheckCmdLineFlag(argc, argv, "dump-ptx") ) { oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclMatrixMul.ptx"); } // Create Kernel for(unsigned int i = 0; i < ciDeviceCount; ++i) { multiplicationKernel[i] = clCreateKernel(cpProgram, "matrixMul", &ciErrNum); if (ciErrNum != CL_SUCCESS) { shrLog("Error: Failed to create kernel\n"); return ciErrNum; } } // Run multiplication on 1..deviceCount GPUs to compare improvement shrLog("\nRunning Computations on 1 - %d GPU's...\n\n", ciDeviceCount); for(unsigned int k = 1; k <= ciDeviceCount; ++k) { matrixMulGPU(k, h_A, h_B_data, mem_size_B, h_C); } // compute reference solution shrLog("Comparing results with CPU computation... \n\n"); float* reference = (float*) malloc(mem_size_C); computeGold(reference, h_A_data, h_B_data, uiHA, uiWA, uiWB); // check result shrBOOL res = shrCompareL2fe(reference, h_C, size_C, 1.0e-6f); if (res != shrTRUE) { printDiff(reference, h_C, uiWC, uiHC, 100, 1.0e-5f); } // clean up OCL resources ciErrNum = clReleaseMemObject(h_A); for(unsigned int k = 0; k < ciDeviceCount; ++k) { ciErrNum |= clReleaseKernel( multiplicationKernel[k] ); ciErrNum |= clReleaseCommandQueue( commandQueue[k] ); } ciErrNum |= clReleaseProgram(cpProgram); ciErrNum |= clReleaseContext(cxGPUContext); if(ciErrNum != CL_SUCCESS) { shrLog("Error: Failure releasing OpenCL resources: %d\n", ciErrNum); return ciErrNum; } // clean up memory free(h_A_data); free(h_B_data); free(h_C); free(reference); return ((shrTRUE == res) ? CL_SUCCESS : -3000); }
bool getTargetDeviceGlobalMemSize(memsize_t *result, const int argc, const char **argv) { bool ok = true; cl_platform_id platform = 0; cl_context context = 0; cl_device_id *devices = 0; cl_uint deviceCount = 0; cl_uint targetDevice = 0; cl_ulong memsize = 0; cl_int errnum = 0; // Get the NVIDIA platform if (ok) { shrLog(" oclGetPlatformID\n"); errnum = oclGetPlatformID(&platform); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("oclGetPlatformID (no platforms found).\n"); ok = false; } } // Get the list of GPU devices associated with the platform if (ok) { shrLog(" clGetDeviceIDs\n"); errnum = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &deviceCount); devices = (cl_device_id *)malloc(deviceCount * sizeof(cl_device_id) ); errnum = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, deviceCount, devices, NULL); if ((deviceCount == 0) || (errnum != CL_SUCCESS)) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clGetDeviceIDs (returned error or no devices found).\n"); ok = false; } } // Create the OpenCL context if (ok) { shrLog(" clCreateContext\n"); context = clCreateContext(0, deviceCount, devices, NULL, NULL, &errnum); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clCreateContext (returned %d).\n", errnum); ok = false; } } // Select target device (device 0 by default) if (ok) { char *device = 0; if (shrGetCmdLineArgumentstr(argc, argv, "device", &device)) { targetDevice = (cl_uint)atoi(device); if (targetDevice >= deviceCount) { shrLogEx(LOGBOTH | ERRORMSG, -2000, STDERROR); shrLog("invalid target device specified on command line (device %d does not exist).\n", targetDevice); ok = false; } } else { targetDevice = 0; } if (device) { free(device); } } // Query target device for maximum memory allocation if (ok) { shrLog(" clGetDeviceInfo\n"); errnum = clGetDeviceInfo(devices[targetDevice], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &memsize, NULL); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clGetDeviceInfo (returned %d).\n", errnum); ok = false; } } // Save the result if (ok) { *result = (memsize_t)memsize; } // Cleanup if (devices) free(devices); if (context) clReleaseContext(context); return ok; }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { shrQAStart(argc, argv); // start logs shrSetLogFileName ("oclSimpleMultiGPU.txt"); shrLog("%s Starting, Array = %u float values...\n\n", argv[0], DATA_N); // OpenCL cl_platform_id cpPlatform; cl_uint ciDeviceCount; cl_device_id* cdDevices; cl_context cxGPUContext; cl_device_id cdDevice; // GPU device int deviceNr[MAX_GPU_COUNT]; cl_command_queue commandQueue[MAX_GPU_COUNT]; cl_mem d_Data[MAX_GPU_COUNT]; cl_mem d_Result[MAX_GPU_COUNT]; cl_program cpProgram; cl_kernel reduceKernel[MAX_GPU_COUNT]; cl_event GPUDone[MAX_GPU_COUNT]; cl_event GPUExecution[MAX_GPU_COUNT]; size_t programLength; cl_int ciErrNum; char cDeviceName [256]; cl_mem h_DataBuffer; // Vars for reduction results float h_SumGPU[MAX_GPU_COUNT * ACCUM_N]; float *h_Data; double sumGPU; double sumCPU, dRelError; // allocate and init host buffer with with some random generated input data h_Data = (float *)malloc(DATA_N * sizeof(float)); shrFillArray(h_Data, DATA_N); // start timer & logs shrLog("Setting up OpenCL on the Host...\n\n"); shrDeltaT(1); // Annotate profiling state #ifdef GPU_PROFILING shrLog("OpenCL Profiling is enabled...\n\n"); #endif //Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clGetPlatformID...\n"); //Get the devices ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &ciDeviceCount); oclCheckError(ciErrNum, CL_SUCCESS); cdDevices = (cl_device_id *)malloc(ciDeviceCount * sizeof(cl_device_id) ); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, ciDeviceCount, cdDevices, NULL); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clGetDeviceIDs...\n"); //Create the context cxGPUContext = clCreateContext(0, ciDeviceCount, cdDevices, NULL, NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateContext...\n"); // Set up command queue(s) for GPU's specified on the command line or all GPU's if(shrCheckCmdLineFlag(argc, (const char **)argv, "device")) { // User specified GPUs int ciMaxDeviceID = ciDeviceCount-1; ciDeviceCount = 0; char* deviceList; char* deviceStr; char* next_token; shrGetCmdLineArgumentstr(argc, (const char **)argv, "device", &deviceList); #ifdef WIN32 deviceStr = strtok_s (deviceList," ,.-", &next_token); #else deviceStr = strtok (deviceList," ,.-"); #endif // Create command queues for all Requested GPU's while(deviceStr != NULL) { // get & log device index # and name deviceNr[ciDeviceCount] = atoi(deviceStr); if( deviceNr[ciDeviceCount] > ciMaxDeviceID ) { shrLog(" Invalid user specified device ID: %d\n", deviceNr[ciDeviceCount]); return 1; } cdDevice = oclGetDev(cxGPUContext, deviceNr[ciDeviceCount]); ciErrNum = clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cDeviceName), cDeviceName, NULL); oclCheckError(ciErrNum, CL_SUCCESS); shrLog(" Device %i: %s\n\n", deviceNr[ciDeviceCount], cDeviceName); // create a command que commandQueue[ciDeviceCount] = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateCommandQueue\n"); ++ciDeviceCount; #ifdef WIN32 deviceStr = strtok_s (NULL," ,.-", &next_token); #else deviceStr = strtok (NULL," ,.-"); #endif } free(deviceList); } else { // Find out how many GPU's to compute on all available GPUs size_t nDeviceBytes; ciErrNum = clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes); oclCheckError(ciErrNum, CL_SUCCESS); ciDeviceCount = (cl_uint)nDeviceBytes/sizeof(cl_device_id); for(unsigned int i = 0; i < ciDeviceCount; ++i ) { // get & log device index # and name deviceNr[i] = i; cdDevice = oclGetDev(cxGPUContext, i); ciErrNum = clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cDeviceName), cDeviceName, NULL); oclCheckError(ciErrNum, CL_SUCCESS); shrLog(" Device %i: %s\n", i, cDeviceName); // create a command que commandQueue[i] = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateCommandQueue\n\n"); } } // Load the OpenCL source code from the .cl file const char* source_path = shrFindFilePath("simpleMultiGPU.cl", argv[0]); char *source = oclLoadProgSource(source_path, "", &programLength); oclCheckError(source != NULL, shrTRUE); shrLog("oclLoadProgSource\n"); // Create the program for all GPUs in the context cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&source, &programLength, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateProgramWithSource\n"); // build the program ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then cleanup and exit shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclSimpleMultiGPU.ptx"); oclCheckError(ciErrNum, CL_SUCCESS); } shrLog("clBuildProgram\n"); // Create host buffer with page-locked memory h_DataBuffer = clCreateBuffer(cxGPUContext, CL_MEM_COPY_HOST_PTR | CL_MEM_ALLOC_HOST_PTR, DATA_N * sizeof(float), h_Data, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateBuffer (Page-locked Host)\n\n"); // Create buffers for each GPU, with data divided evenly among GPU's int sizePerGPU = DATA_N / ciDeviceCount; int workOffset[MAX_GPU_COUNT]; int workSize[MAX_GPU_COUNT]; workOffset[0] = 0; for(unsigned int i = 0; i < ciDeviceCount; ++i ) { workSize[i] = (i != (ciDeviceCount - 1)) ? sizePerGPU : (DATA_N - workOffset[i]); // Input buffer d_Data[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, workSize[i] * sizeof(float), NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateBuffer (Input)\t\tDev %i\n", i); // Copy data from host to device ciErrNum = clEnqueueCopyBuffer(commandQueue[i], h_DataBuffer, d_Data[i], workOffset[i] * sizeof(float), 0, workSize[i] * sizeof(float), 0, NULL, NULL); shrLog("clEnqueueCopyBuffer (Input)\tDev %i\n", i); // Output buffer d_Result[i] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, ACCUM_N * sizeof(float), NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateBuffer (Output)\t\tDev %i\n", i); // Create kernel reduceKernel[i] = clCreateKernel(cpProgram, "reduce", &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateKernel\t\t\tDev %i\n", i); // Set the args values and check for errors ciErrNum |= clSetKernelArg(reduceKernel[i], 0, sizeof(cl_mem), &d_Result[i]); ciErrNum |= clSetKernelArg(reduceKernel[i], 1, sizeof(cl_mem), &d_Data[i]); ciErrNum |= clSetKernelArg(reduceKernel[i], 2, sizeof(int), &workSize[i]); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clSetKernelArg\t\t\tDev %i\n\n", i); workOffset[i + 1] = workOffset[i] + workSize[i]; } // Set # of work items in work group and total in 1 dimensional range size_t localWorkSize[] = {THREAD_N}; size_t globalWorkSize[] = {ACCUM_N}; // Start timer and launch reduction kernel on each GPU, with data split between them shrLog("Launching Kernels on GPU(s)...\n\n"); for(unsigned int i = 0; i < ciDeviceCount; i++) { ciErrNum = clEnqueueNDRangeKernel(commandQueue[i], reduceKernel[i], 1, 0, globalWorkSize, localWorkSize, 0, NULL, &GPUExecution[i]); oclCheckError(ciErrNum, CL_SUCCESS); } // Copy result from device to host for each device for(unsigned int i = 0; i < ciDeviceCount; i++) { ciErrNum = clEnqueueReadBuffer(commandQueue[i], d_Result[i], CL_FALSE, 0, ACCUM_N * sizeof(float), h_SumGPU + i * ACCUM_N, 0, NULL, &GPUDone[i]); oclCheckError(ciErrNum, CL_SUCCESS); } // Synchronize with the GPUs and do accumulated error check clWaitForEvents(ciDeviceCount, GPUDone); shrLog("clWaitForEvents complete...\n\n"); // Aggregate results for multiple GPU's and stop/log processing time sumGPU = 0; for(unsigned int i = 0; i < ciDeviceCount * ACCUM_N; i++) { sumGPU += h_SumGPU[i]; } // Print Execution Times for each GPU #ifdef GPU_PROFILING shrLog("Profiling Information for GPU Processing:\n\n"); for(unsigned int i = 0; i < ciDeviceCount; i++) { cdDevice = oclGetDev(cxGPUContext, deviceNr[i]); clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cDeviceName), cDeviceName, NULL); shrLog("Device %i : %s\n", deviceNr[i], cDeviceName); shrLog(" Reduce Kernel : %.5f s\n", executionTime(GPUExecution[i])); shrLog(" Copy Device->Host : %.5f s\n\n\n", executionTime(GPUDone[i])); } #endif // Run the computation on the Host CPU and log processing time shrLog("Launching Host/CPU C++ Computation...\n\n"); sumCPU = 0; for(unsigned int i = 0; i < DATA_N; i++) { sumCPU += h_Data[i]; } // Check GPU result against CPU result dRelError = 100.0 * fabs(sumCPU - sumGPU) / fabs(sumCPU); shrLog("Comparing against Host/C++ computation...\n"); shrLog(" GPU sum: %f\n CPU sum: %f\n", sumGPU, sumCPU); shrLog(" Relative Error (100.0 * Error / Golden) = %f \n\n", dRelError); // cleanup free(source); free(h_Data); for(unsigned int i = 0; i < ciDeviceCount; ++i ) { clReleaseKernel(reduceKernel[i]); clReleaseCommandQueue(commandQueue[i]); } clReleaseProgram(cpProgram); clReleaseContext(cxGPUContext); // finish shrQAFinishExit(argc, (const char **)argv, (dRelError < 1e-4) ? QA_PASSED : QA_FAILED); }
bool fdtdGPU(float *output, const float *input, const float *coeff, const int dimx, const int dimy, const int dimz, const int radius, const int timesteps, const int argc, const char **argv) { bool ok = true; const int outerDimx = dimx + 2 * radius; const int outerDimy = dimy + 2 * radius; const int outerDimz = dimz + 2 * radius; const size_t volumeSize = outerDimx * outerDimy * outerDimz; cl_context context = 0; cl_platform_id platform = 0; cl_device_id *devices = 0; cl_command_queue commandQueue = 0; cl_mem bufferOut = 0; cl_mem bufferIn = 0; cl_mem bufferCoeff = 0; cl_program program = 0; cl_kernel kernel = 0; cl_event *kernelEvents = 0; #ifdef GPU_PROFILING cl_ulong kernelEventStart; cl_ulong kernelEventEnd; #endif double hostElapsedTimeS; char *cPathAndName = 0; char *cSourceCL = 0; size_t szKernelLength; size_t globalWorkSize[2]; size_t localWorkSize[2]; cl_uint deviceCount = 0; cl_uint targetDevice = 0; cl_int errnum = 0; char buildOptions[128]; // Ensure that the inner data starts on a 128B boundary const int padding = (128 / sizeof(float)) - radius; const size_t paddedVolumeSize = volumeSize + padding; #ifdef GPU_PROFILING const int profileTimesteps = timesteps - 1; if (ok) { if (profileTimesteps < 1) { shrLog(" cannot profile with fewer than two timesteps (timesteps=%d), profiling is disabled.\n", timesteps); } } #endif // Get the NVIDIA platform if (ok) { shrLog(" oclGetPlatformID...\n"); errnum = oclGetPlatformID(&platform); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("oclGetPlatformID (returned %d).\n", errnum); ok = false; } } // Get the list of GPU devices associated with the platform if (ok) { shrLog(" clGetDeviceIDs"); errnum = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &deviceCount); devices = (cl_device_id *)malloc(deviceCount * sizeof(cl_device_id) ); errnum = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, deviceCount, devices, NULL); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clGetDeviceIDs (returned %d).\n", errnum); ok = false; } } // Create the OpenCL context if (ok) { shrLog(" clCreateContext...\n"); context = clCreateContext(0, deviceCount, devices, NULL, NULL, &errnum); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clCreateContext (returned %d).\n", errnum); ok = false; } } // Select target device (device 0 by default) if (ok) { char *device = 0; if (shrGetCmdLineArgumentstr(argc, argv, "device", &device)) { targetDevice = (cl_uint)atoi(device); if (targetDevice >= deviceCount) { shrLogEx(LOGBOTH | ERRORMSG, -2001, STDERROR); shrLog("invalid target device specified on command line (device %d does not exist).\n", targetDevice); ok = false; } } else { targetDevice = 0; } if (device) { free(device); } } // Create a command-queue if (ok) { shrLog(" clCreateCommandQueue\n"); commandQueue = clCreateCommandQueue(context, devices[targetDevice], CL_QUEUE_PROFILING_ENABLE, &errnum); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clCreateCommandQueue (returned %d).\n", errnum); ok = false; } } // Create memory buffer objects if (ok) { shrLog(" clCreateBuffer bufferOut\n"); bufferOut = clCreateBuffer(context, CL_MEM_READ_WRITE, paddedVolumeSize * sizeof(float), NULL, &errnum); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clCreateBuffer (returned %d).\n", errnum); ok = false; } } if (ok) { shrLog(" clCreateBuffer bufferIn\n"); bufferIn = clCreateBuffer(context, CL_MEM_READ_WRITE, paddedVolumeSize * sizeof(float), NULL, &errnum); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clCreateBuffer (returned %d).\n", errnum); ok = false; } } if (ok) { shrLog(" clCreateBuffer bufferCoeff\n"); bufferCoeff = clCreateBuffer(context, CL_MEM_READ_ONLY, (radius + 1) * sizeof(float), NULL, &errnum); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clCreateBuffer (returned %d).\n", errnum); ok = false; } } // Load the kernel from file if (ok) { shrLog(" shrFindFilePath\n"); cPathAndName = shrFindFilePath(clSourceFile, argv[0]); if (cPathAndName == NULL) { shrLogEx(LOGBOTH | ERRORMSG, -2002, STDERROR); shrLog("shrFindFilePath returned null.\n"); ok = false; } } if (ok) { shrLog(" oclLoadProgSource\n"); cSourceCL = oclLoadProgSource(cPathAndName, "// Preamble\n", &szKernelLength); if (cSourceCL == NULL) { shrLogEx(LOGBOTH | ERRORMSG, -2003, STDERROR); shrLog("oclLoadProgSource returned null.\n"); ok = false; } } // Create the program if (ok) { shrLog(" clCreateProgramWithSource\n"); program = clCreateProgramWithSource(context, 1, (const char **)&cSourceCL, &szKernelLength, &errnum); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clCreateProgramWithSource (returned %d).\n", errnum); ok = false; } } // Check for a command-line specified work group size size_t userWorkSize; int localWorkMaxY; if (ok) { int userWorkSizeInt; if (shrGetCmdLineArgumenti(argc, argv, "work-group-size", &userWorkSizeInt)) { // We can't clamp to CL_KERNEL_WORK_GROUP_SIZE yet since that is // dependent on the build. if (userWorkSizeInt < k_localWorkMin || userWorkSizeInt > k_localWorkMax) { shrLogEx(LOGBOTH | ERRORMSG, -2004, STDERROR); shrLog("invalid work group size specified on command line (must be between %d and %d).\n", k_localWorkMin, k_localWorkMax); ok = false; } // Constrain to a multiple of k_localWorkX userWorkSize = (userWorkSizeInt / k_localWorkX * k_localWorkX); } else { userWorkSize = k_localWorkY * k_localWorkX; } // Divide by k_localWorkX (integer division to clamp) localWorkMaxY = userWorkSize / k_localWorkX; } // Build the program if (ok) { #ifdef WIN32 if (sprintf_s(buildOptions, sizeof(buildOptions), "-DRADIUS=%d -DMAXWORKX=%d -DMAXWORKY=%d -cl-fast-relaxed-math", radius, k_localWorkX, localWorkMaxY) < 0) { shrLogEx(LOGBOTH | ERRORMSG, -2005, STDERROR); shrLog("sprintf_s (failed).\n"); ok = false; } #else if (snprintf(buildOptions, sizeof(buildOptions), "-DRADIUS=%d -DMAXWORKX=%d -DMAXWORKY=%d -cl-fast-relaxed-math", radius, k_localWorkX, localWorkMaxY) < 0) { shrLogEx(LOGBOTH | ERRORMSG, -2005, STDERROR); shrLog("snprintf (failed).\n"); ok = false; } #endif } if (ok) { shrLog(" clBuildProgram (%s)\n", buildOptions); errnum = clBuildProgram(program, 0, NULL, buildOptions, NULL, NULL); if (errnum != CL_SUCCESS) { char buildLog[10240]; clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, sizeof(buildLog), buildLog, NULL); shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clBuildProgram (returned %d).\n", errnum); shrLog("Log:\n%s\n", buildLog); ok = false; } } // Create the kernel if (ok) { shrLog(" clCreateKernel\n"); kernel = clCreateKernel(program, "FiniteDifferences", &errnum); if (kernel == (cl_kernel)NULL || errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clCreateKernel (returned %d).\n", errnum); ok = false; } } // Get the maximum work group size size_t maxWorkSize; if (ok) { shrLog(" clGetKernelWorkGroupInfo\n"); errnum = clGetKernelWorkGroupInfo(kernel, devices[targetDevice], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &maxWorkSize, NULL); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clGetKernelWorkGroupInfo (returned %d).\n", errnum); ok = false; } } // Set the work group size if (ok) { userWorkSize = CLAMP(userWorkSize, k_localWorkMin, maxWorkSize); localWorkSize[0] = k_localWorkX; localWorkSize[1] = userWorkSize / k_localWorkX; globalWorkSize[0] = localWorkSize[0] * (unsigned int)ceil((float)dimx / localWorkSize[0]); globalWorkSize[1] = localWorkSize[1] * (unsigned int)ceil((float)dimy / localWorkSize[1]); shrLog(" set local work group size to %dx%d\n", localWorkSize[0], localWorkSize[1]); shrLog(" set total work size to %dx%d\n", globalWorkSize[0], globalWorkSize[1]); } // Copy the input to the device input buffer if (ok) { shrLog(" clEnqueueWriteBuffer bufferIn\n"); errnum = clEnqueueWriteBuffer(commandQueue, bufferIn, CL_TRUE, padding * sizeof(float), volumeSize * sizeof(float), input, 0, NULL, NULL); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clEnqueueWriteBuffer bufferIn (returned %d).\n", errnum); ok = false; } } // Copy the input to the device output buffer (actually only need the halo) if (ok) { shrLog(" clEnqueueWriteBuffer bufferOut\n"); errnum = clEnqueueWriteBuffer(commandQueue, bufferOut, CL_TRUE, padding * sizeof(float), volumeSize * sizeof(float), input, 0, NULL, NULL); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clEnqueueWriteBuffer bufferOut (returned %d).\n", errnum); ok = false; } } // Copy the coefficients to the device coefficient buffer if (ok) { shrLog(" clEnqueueWriteBuffer bufferCoeff\n"); errnum = clEnqueueWriteBuffer(commandQueue, bufferCoeff, CL_TRUE, 0, (radius + 1) * sizeof(float), coeff, 0, NULL, NULL); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clEnqueueWriteBuffer bufferCoeff (returned %d).\n", errnum); ok = false; } } // Allocate the events if (ok) { shrLog(" calloc events\n"); if ((kernelEvents = (cl_event *)calloc(timesteps, sizeof(cl_event))) == NULL) { shrLogEx(LOGBOTH | ERRORMSG, -2006, STDERROR); shrLog("Insufficient memory for events calloc, please try a smaller volume (use --help for syntax).\n"); ok = false; } } // Start the clock shrDeltaT(0); // Set the constant arguments if (ok) { shrLog(" clSetKernelArg 2-6\n"); errnum = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&bufferCoeff); errnum |= clSetKernelArg(kernel, 3, sizeof(int), &dimx); errnum |= clSetKernelArg(kernel, 4, sizeof(int), &dimy); errnum |= clSetKernelArg(kernel, 5, sizeof(int), &dimz); errnum |= clSetKernelArg(kernel, 6, sizeof(int), &padding); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clSetKernelArg 2-6 (returned %d).\n", errnum); ok = false; } } // Execute the FDTD cl_mem bufferSrc = bufferIn; cl_mem bufferDst = bufferOut; if (ok) { shrLog(" GPU FDTD loop\n"); } for (int it = 0 ; ok && it < timesteps ; it++) { shrLog("\tt = %d ", it); // Set the dynamic arguments if (ok) { shrLog(" clSetKernelArg 0-1,"); errnum = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&bufferDst); errnum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&bufferSrc); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clSetKernelArg 0-1 (returned %d).\n", errnum); ok = false; } } // Launch the kernel if (ok) { shrLog(" clEnqueueNDRangeKernel\n"); errnum = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &kernelEvents[it]); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clEnqueueNDRangeKernel (returned %d).\n", errnum); ok = false; } } // Toggle the buffers cl_mem tmp = bufferSrc; bufferSrc = bufferDst; bufferDst = tmp; } if (ok) shrLog("\n"); // Wait for the kernel to complete if (ok) { shrLog(" clWaitForEvents\n"); errnum = clWaitForEvents(1, &kernelEvents[timesteps-1]); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clWaitForEvents (returned %d).\n", errnum); ok = false; } } // Stop the clock hostElapsedTimeS = shrDeltaT(0); // Read the result back, result is in bufferSrc (after final toggle) if (ok) { shrLog(" clEnqueueReadBuffer\n"); errnum = clEnqueueReadBuffer(commandQueue, bufferSrc, CL_TRUE, padding * sizeof(float), volumeSize * sizeof(float), output, 0, NULL, NULL); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clEnqueueReadBuffer bufferSrc (returned %d).\n", errnum); ok = false; } } // Report time #ifdef GPU_PROFILING double elapsedTime = 0.0; if (ok && profileTimesteps > 0) shrLog(" Collect profile information\n"); for (int it = 1 ; ok && it <= profileTimesteps ; it++) { shrLog("\tt = %d ", it); shrLog(" clGetEventProfilingInfo,", it); errnum = clGetEventProfilingInfo(kernelEvents[it], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernelEventStart, NULL); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clGetEventProfilingInfo (returned %d).\n", errnum); ok = false; } shrLog(" clGetEventProfilingInfo\n", it); errnum = clGetEventProfilingInfo(kernelEvents[it], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernelEventEnd, NULL); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clGetEventProfilingInfo (returned %d).\n", errnum); ok = false; } elapsedTime += (double)kernelEventEnd - (double)kernelEventStart; } if (ok && profileTimesteps > 0) { shrLog("\n"); // Convert nanoseconds to seconds elapsedTime *= 1.0e-9; double avgElapsedTime = elapsedTime / (double)profileTimesteps; // Determine number of computations per timestep size_t pointsComputed = dimx * dimy * dimz; // Determine throughput double throughputM = 1.0e-6 * (double)pointsComputed / avgElapsedTime; shrLogEx(LOGBOTH | MASTER, 0, "oclFDTD3d, Throughput = %.4f MPoints/s, Time = %.5f s, Size = %u Points, NumDevsUsed = %i, Workgroup = %u\n", throughputM, avgElapsedTime, pointsComputed, 1, localWorkSize[0] * localWorkSize[1]); } #endif // Cleanup if (kernelEvents) { for (int it = 0 ; it < timesteps ; it++) { if (kernelEvents[it]) clReleaseEvent(kernelEvents[it]); } free(kernelEvents); } if (kernel) clReleaseKernel(kernel); if (program) clReleaseProgram(program); if (cSourceCL) free(cSourceCL); if (cPathAndName) free(cPathAndName); if (bufferCoeff) clReleaseMemObject(bufferCoeff); if (bufferIn) clReleaseMemObject(bufferIn); if (bufferOut) clReleaseMemObject(bufferOut); if (commandQueue) clReleaseCommandQueue(commandQueue); if (devices) free(devices); if (context) clReleaseContext(context); return ok; }
// Main function // ********************************************************************* int main(int argc, char** argv) { shrQAStart(argc, argv); int use_gpu = 0; for(int i = 0; i < argc && argv; i++) { if(!argv[i]) continue; if(strstr(argv[i], "cpu")) use_gpu = 0; else if(strstr(argv[i], "gpu")) use_gpu = 1; } // start logs shrSetLogFileName ("oclDXTCompression.txt"); shrLog("%s Starting...\n\n", argv[0]); cl_platform_id cpPlatform = NULL; cl_uint uiNumDevices = 0; cl_device_id *cdDevices = NULL; cl_context cxGPUContext; cl_command_queue cqCommandQueue; cl_program cpProgram; cl_kernel ckKernel; cl_mem cmMemObjs[3]; cl_mem cmAlphaTable4, cmProds4; cl_mem cmAlphaTable3, cmProds3; size_t szGlobalWorkSize[1]; size_t szLocalWorkSize[1]; cl_int ciErrNum; // Get the path of the filename char *filename; if (shrGetCmdLineArgumentstr(argc, (const char **)argv, "image", &filename)) { image_filename = filename; } // load image const char* image_path = shrFindFilePath(image_filename, argv[0]); oclCheckError(image_path != NULL, shrTRUE); shrLoadPPM4ub(image_path, (unsigned char **)&h_img, &width, &height); oclCheckError(h_img != NULL, shrTRUE); shrLog("Loaded '%s', %d x %d pixels\n\n", image_path, width, height); // Convert linear image to block linear. const uint memSize = width * height * sizeof(cl_uint); uint* block_image = (uint*)malloc(memSize); // Convert linear image to block linear. for(uint by = 0; by < height/4; by++) { for(uint bx = 0; bx < width/4; bx++) { for (int i = 0; i < 16; i++) { const int x = i & 3; const int y = i / 4; block_image[(by * width/4 + bx) * 16 + i] = ((uint *)h_img)[(by * 4 + y) * 4 * (width/4) + bx * 4 + x]; } } } // Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckError(ciErrNum, CL_SUCCESS); // Get the platform's GPU devices ciErrNum = clGetDeviceIDs(cpPlatform, use_gpu?CL_DEVICE_TYPE_GPU:CL_DEVICE_TYPE_CPU, 0, NULL, &uiNumDevices); oclCheckError(ciErrNum, CL_SUCCESS); cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) ); ciErrNum = clGetDeviceIDs(cpPlatform, use_gpu?CL_DEVICE_TYPE_GPU:CL_DEVICE_TYPE_CPU, uiNumDevices, cdDevices, NULL); oclCheckError(ciErrNum, CL_SUCCESS); // Create the context cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); // get and log device cl_device_id device; if( shrCheckCmdLineFlag(argc, (const char **)argv, "device") ) { int device_nr = 0; shrGetCmdLineArgumenti(argc, (const char **)argv, "device", &device_nr); device = oclGetDev(cxGPUContext, device_nr); if( device == (cl_device_id)-1 ) { shrLog(" Invalid GPU Device: devID=%d. %d valid GPU devices detected\n\n", device_nr, uiNumDevices); shrLog(" exiting...\n"); return -1; } } else { device = oclGetMaxFlopsDev(cxGPUContext); } oclPrintDevName(LOGBOTH, device); shrLog("\n"); // create a command-queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, device, 0, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); // Memory Setup // Constants cmAlphaTable4 = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 4 * sizeof(cl_float), (void*)&alphaTable4[0], &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); cmProds4 = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 4 * sizeof(cl_int), (void*)&prods4[0], &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); cmAlphaTable3 = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 4 * sizeof(cl_float), (void*)&alphaTable3[0], &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); cmProds3 = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 4 * sizeof(cl_int), (void*)&prods3[0], &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); // Compute permutations. cl_uint permutations[1024]; computePermutations(permutations); // Upload permutations. cmMemObjs[0] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_uint) * 1024, permutations, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); // Image cmMemObjs[1] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, memSize, NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); // Result const uint compressedSize = (width / 4) * (height / 4) * 8; cmMemObjs[2] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, compressedSize, NULL , &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); unsigned int * h_result = (uint*)malloc(compressedSize); // Program Setup size_t program_length; const char* source_path = shrFindFilePath("DXTCompression.cl", argv[0]); oclCheckError(source_path != NULL, shrTRUE); char *source = oclLoadProgSource(source_path, "", &program_length); oclCheckError(source != NULL, shrTRUE); // create the program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **) &source, &program_length, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); // build the program ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then cleanup and exit shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclDXTCompression.ptx"); oclCheckError(ciErrNum, CL_SUCCESS); } // create the kernel ckKernel = clCreateKernel(cpProgram, "compress", &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); // set the args values ciErrNum = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void *) &cmMemObjs[0]); ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void *) &cmMemObjs[1]); ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void *) &cmMemObjs[2]); ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(cl_mem), (void*)&cmAlphaTable4); ciErrNum |= clSetKernelArg(ckKernel, 4, sizeof(cl_mem), (void*)&cmProds4); ciErrNum |= clSetKernelArg(ckKernel, 5, sizeof(cl_mem), (void*)&cmAlphaTable3); ciErrNum |= clSetKernelArg(ckKernel, 6, sizeof(cl_mem), (void*)&cmProds3); oclCheckError(ciErrNum, CL_SUCCESS); // Copy input data host to device clEnqueueWriteBuffer(cqCommandQueue, cmMemObjs[1], CL_FALSE, 0, sizeof(cl_uint) * width * height, block_image, 0,0,0); // Determine launch configuration and run timed computation numIterations times int blocks = ((width + 3) / 4) * ((height + 3) / 4); // rounds up by 1 block in each dim if %4 != 0 // Restrict the numbers of blocks to launch on low end GPUs to avoid kernel timeout cl_uint compute_units; clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL); int blocksPerLaunch = MIN(blocks, 768 * (int)compute_units); // set work-item dimensions szGlobalWorkSize[0] = blocksPerLaunch * NUM_THREADS; szLocalWorkSize[0]= NUM_THREADS; #ifdef GPU_PROFILING shrLog("\nRunning DXT Compression on %u x %u image...\n", width, height); shrLog("\n%u Workgroups, %u Work Items per Workgroup, %u Work Items in NDRange...\n\n", blocks, NUM_THREADS, blocks * NUM_THREADS); int numIterations = 50; for (int i = -1; i < numIterations; ++i) { if (i == 0) { // start timing only after the first warmup iteration clFinish(cqCommandQueue); // flush command queue shrDeltaT(0); // start timer } #endif // execute kernel for( int j=0; j<blocks; j+= blocksPerLaunch ) { clSetKernelArg(ckKernel, 7, sizeof(int), &j); szGlobalWorkSize[0] = MIN( blocksPerLaunch, blocks-j ) * NUM_THREADS; ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); } #ifdef GPU_PROFILING } clFinish(cqCommandQueue); double dAvgTime = shrDeltaT(0) / (double)numIterations; shrLogEx(LOGBOTH | MASTER, 0, "oclDXTCompression, Throughput = %.4f MPixels/s, Time = %.5f s, Size = %u Pixels, NumDevsUsed = %i, Workgroup = %d\n", (1.0e-6 * (double)(width * height)/ dAvgTime), dAvgTime, (width * height), 1, szLocalWorkSize[0]); #endif // blocking read output ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmMemObjs[2], CL_TRUE, 0, compressedSize, h_result, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); // Write DDS file. FILE* fp = NULL; char output_filename[1024]; #ifdef WIN32 strcpy_s(output_filename, 1024, image_path); strcpy_s(output_filename + strlen(image_path) - 3, 1024 - strlen(image_path) + 3, "dds"); fopen_s(&fp, output_filename, "wb"); #else strcpy(output_filename, image_path); strcpy(output_filename + strlen(image_path) - 3, "dds"); fp = fopen(output_filename, "wb"); #endif oclCheckError(fp != NULL, shrTRUE); DDSHeader header; header.fourcc = FOURCC_DDS; header.size = 124; header.flags = (DDSD_WIDTH|DDSD_HEIGHT|DDSD_CAPS|DDSD_PIXELFORMAT|DDSD_LINEARSIZE); header.height = height; header.width = width; header.pitch = compressedSize; header.depth = 0; header.mipmapcount = 0; memset(header.reserved, 0, sizeof(header.reserved)); header.pf.size = 32; header.pf.flags = DDPF_FOURCC; header.pf.fourcc = FOURCC_DXT1; header.pf.bitcount = 0; header.pf.rmask = 0; header.pf.gmask = 0; header.pf.bmask = 0; header.pf.amask = 0; header.caps.caps1 = DDSCAPS_TEXTURE; header.caps.caps2 = 0; header.caps.caps3 = 0; header.caps.caps4 = 0; header.notused = 0; fwrite(&header, sizeof(DDSHeader), 1, fp); fwrite(h_result, compressedSize, 1, fp); fclose(fp); // Make sure the generated image matches the reference image (regression check) shrLog("\nComparing against Host/C++ computation...\n"); const char* reference_image_path = shrFindFilePath(refimage_filename, argv[0]); oclCheckError(reference_image_path != NULL, shrTRUE); // read in the reference image from file #ifdef WIN32 fopen_s(&fp, reference_image_path, "rb"); #else fp = fopen(reference_image_path, "rb"); #endif oclCheckError(fp != NULL, shrTRUE); fseek(fp, sizeof(DDSHeader), SEEK_SET); uint referenceSize = (width / 4) * (height / 4) * 8; uint * reference = (uint *)malloc(referenceSize); fread(reference, referenceSize, 1, fp); fclose(fp); // compare the reference image data to the sample/generated image float rms = 0; for (uint y = 0; y < height; y += 4) { for (uint x = 0; x < width; x += 4) { // binary comparison of data uint referenceBlockIdx = ((y/4) * (width/4) + (x/4)); uint resultBlockIdx = ((y/4) * (width/4) + (x/4)); int cmp = compareBlock(((BlockDXT1 *)h_result) + resultBlockIdx, ((BlockDXT1 *)reference) + referenceBlockIdx); // log deviations, if any if (cmp != 0.0f) { compareBlock(((BlockDXT1 *)h_result) + resultBlockIdx, ((BlockDXT1 *)reference) + referenceBlockIdx); shrLog("Deviation at (%d, %d):\t%f rms\n", x/4, y/4, float(cmp)/16/3); } rms += cmp; } } rms /= width * height * 3; shrLog("RMS(reference, result) = %f\n\n", rms); // Free OpenCL resources oclDeleteMemObjs(cmMemObjs, 3); clReleaseMemObject(cmAlphaTable4); clReleaseMemObject(cmProds4); clReleaseMemObject(cmAlphaTable3); clReleaseMemObject(cmProds3); clReleaseKernel(ckKernel); clReleaseProgram(cpProgram); clReleaseCommandQueue(cqCommandQueue); clReleaseContext(cxGPUContext); // Free host memory free(source); free(h_img); // finish shrQAFinishExit(argc, (const char **)argv, (rms <= ERROR_THRESHOLD) ? QA_PASSED : QA_FAILED); }
// Main function // ********************************************************************* int main(const int argc, const char** argv) { // start logs shrSetLogFileName ("oclDXTCompression.txt"); shrLog(LOGBOTH, 0, "%s Starting...\n\n", argv[0]); cl_context cxGPUContext; cl_command_queue cqCommandQueue; cl_program cpProgram; cl_kernel ckKernel; cl_mem cmMemObjs[3]; size_t szGlobalWorkSize[1]; size_t szLocalWorkSize[1]; cl_int ciErrNum; // Get the path of the filename char *filename; if (shrGetCmdLineArgumentstr(argc, argv, "image", &filename)) { image_filename = filename; } // load image const char* image_path = shrFindFilePath(image_filename, argv[0]); shrCheckError(image_path != NULL, shrTRUE); shrLoadPPM4ub(image_path, (unsigned char **)&h_img, &width, &height); shrCheckError(h_img != NULL, shrTRUE); shrLog(LOGBOTH, 0, "Loaded '%s', %d x %d pixels\n", image_path, width, height); // Convert linear image to block linear. uint * block_image = (uint *) malloc(width * height * 4); // Convert linear image to block linear. for(uint by = 0; by < height/4; by++) { for(uint bx = 0; bx < width/4; bx++) { for (int i = 0; i < 16; i++) { const int x = i & 3; const int y = i / 4; block_image[(by * width/4 + bx) * 16 + i] = ((uint *)h_img)[(by * 4 + y) * 4 * (width/4) + bx * 4 + x]; } } } // create the OpenCL context on a GPU device cxGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); // get and log device cl_device_id device; if( shrCheckCmdLineFlag(argc, argv, "device") ) { int device_nr = 0; shrGetCmdLineArgumenti(argc, argv, "device", &device_nr); device = oclGetDev(cxGPUContext, device_nr); } else { device = oclGetMaxFlopsDev(cxGPUContext); } oclPrintDevInfo(LOGBOTH, device); // create a command-queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, device, 0, &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); // Memory Setup // Compute permutations. cl_uint permutations[1024]; computePermutations(permutations); // Upload permutations. cmMemObjs[0] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_uint) * 1024, permutations, &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); // Image cmMemObjs[1] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY , sizeof(cl_uint) * width * height, NULL, &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); // Result const uint compressedSize = (width / 4) * (height / 4) * 8; cmMemObjs[2] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, compressedSize, NULL , &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); unsigned int * h_result = (uint *)malloc(compressedSize); // Program Setup size_t program_length; const char* source_path = shrFindFilePath("DXTCompression.cl", argv[0]); shrCheckError(source_path != NULL, shrTRUE); char *source = oclLoadProgSource(source_path, "", &program_length); shrCheckError(source != NULL, shrTRUE); // create the program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **) &source, &program_length, &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); // build the program ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-mad-enable", NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then cleanup and exit shrLog(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclDXTCompression.ptx"); shrCheckError(ciErrNum, CL_SUCCESS); } // create the kernel ckKernel = clCreateKernel(cpProgram, "compress", &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); // set the args values ciErrNum = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void *) &cmMemObjs[0]); ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void *) &cmMemObjs[1]); ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void *) &cmMemObjs[2]); ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(float) * 4 * 16, NULL); ciErrNum |= clSetKernelArg(ckKernel, 4, sizeof(float) * 4 * 16, NULL); ciErrNum |= clSetKernelArg(ckKernel, 5, sizeof(int) * 64, NULL); ciErrNum |= clSetKernelArg(ckKernel, 6, sizeof(float) * 16 * 6, NULL); ciErrNum |= clSetKernelArg(ckKernel, 7, sizeof(unsigned int) * 160, NULL); ciErrNum |= clSetKernelArg(ckKernel, 8, sizeof(int) * 16, NULL); shrCheckError(ciErrNum, CL_SUCCESS); shrLog(LOGBOTH, 0, "Running DXT Compression on %u x %u image...\n\n", width, height); // Upload the image clEnqueueWriteBuffer(cqCommandQueue, cmMemObjs[1], CL_FALSE, 0, sizeof(cl_uint) * width * height, block_image, 0,0,0); // set work-item dimensions szGlobalWorkSize[0] = width * height * (NUM_THREADS/16); szLocalWorkSize[0]= NUM_THREADS; #ifdef GPU_PROFILING int numIterations = 100; for (int i = -1; i < numIterations; ++i) { if (i == 0) { // start timing only after the first warmup iteration clFinish(cqCommandQueue); // flush command queue shrDeltaT(0); // start timer } #endif // execute kernel ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL); shrCheckError(ciErrNum, CL_SUCCESS); #ifdef GPU_PROFILING } clFinish(cqCommandQueue); double dAvgTime = shrDeltaT(0) / (double)numIterations; shrLog(LOGBOTH | MASTER, 0, "oclDXTCompression, Throughput = %.4f, Time = %.5f, Size = %u, NumDevsUsed = %i\n", (1.0e-6 * (double)(width * height)/ dAvgTime), dAvgTime, (width * height), 1); #endif // blocking read output ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmMemObjs[2], CL_TRUE, 0, compressedSize, h_result, 0, NULL, NULL); shrCheckError(ciErrNum, CL_SUCCESS); // Write DDS file. FILE* fp = NULL; char output_filename[1024]; #ifdef WIN32 strcpy_s(output_filename, 1024, image_path); strcpy_s(output_filename + strlen(image_path) - 3, 1024 - strlen(image_path) + 3, "dds"); fopen_s(&fp, output_filename, "wb"); #else strcpy(output_filename, image_path); strcpy(output_filename + strlen(image_path) - 3, "dds"); fp = fopen(output_filename, "wb"); #endif shrCheckError(fp != NULL, shrTRUE); DDSHeader header; header.fourcc = FOURCC_DDS; header.size = 124; header.flags = (DDSD_WIDTH|DDSD_HEIGHT|DDSD_CAPS|DDSD_PIXELFORMAT|DDSD_LINEARSIZE); header.height = height; header.width = width; header.pitch = compressedSize; header.depth = 0; header.mipmapcount = 0; memset(header.reserved, 0, sizeof(header.reserved)); header.pf.size = 32; header.pf.flags = DDPF_FOURCC; header.pf.fourcc = FOURCC_DXT1; header.pf.bitcount = 0; header.pf.rmask = 0; header.pf.gmask = 0; header.pf.bmask = 0; header.pf.amask = 0; header.caps.caps1 = DDSCAPS_TEXTURE; header.caps.caps2 = 0; header.caps.caps3 = 0; header.caps.caps4 = 0; header.notused = 0; fwrite(&header, sizeof(DDSHeader), 1, fp); fwrite(h_result, compressedSize, 1, fp); fclose(fp); // Make sure the generated image matches the reference image (regression check) shrLog(LOGBOTH, 0, "\nComparing against Host/C++ computation...\n"); const char* reference_image_path = shrFindFilePath(refimage_filename, argv[0]); shrCheckError(reference_image_path != NULL, shrTRUE); // read in the reference image from file #ifdef WIN32 fopen_s(&fp, reference_image_path, "rb"); #else fp = fopen(reference_image_path, "rb"); #endif shrCheckError(fp != NULL, shrTRUE); fseek(fp, sizeof(DDSHeader), SEEK_SET); uint referenceSize = (width / 4) * (height / 4) * 8; uint * reference = (uint *)malloc(referenceSize); fread(reference, referenceSize, 1, fp); fclose(fp); // compare the reference image data to the sample/generated image float rms = 0; for (uint y = 0; y < height; y += 4) { for (uint x = 0; x < width; x += 4) { // binary comparison of data uint referenceBlockIdx = ((y/4) * (width/4) + (x/4)); uint resultBlockIdx = ((y/4) * (width/4) + (x/4)); int cmp = compareBlock(((BlockDXT1 *)h_result) + resultBlockIdx, ((BlockDXT1 *)reference) + referenceBlockIdx); // log deviations, if any if (cmp != 0.0f) { compareBlock(((BlockDXT1 *)h_result) + resultBlockIdx, ((BlockDXT1 *)reference) + referenceBlockIdx); shrLog(LOGBOTH, 0, "Deviation at (%d, %d):\t%f rms\n", x/4, y/4, float(cmp)/16/3); } rms += cmp; } } rms /= width * height * 3; shrLog(LOGBOTH, 0, "RMS(reference, result) = %f\n\n", rms); shrLog(LOGBOTH, 0, "TEST %s\n\n", (rms <= ERROR_THRESHOLD) ? "PASSED" : "FAILED !!!"); // Free OpenCL resources oclDeleteMemObjs(cmMemObjs, 3); clReleaseKernel(ckKernel); clReleaseProgram(cpProgram); clReleaseCommandQueue(cqCommandQueue); clReleaseContext(cxGPUContext); // Free host memory free(source); free(h_img); // finish shrEXIT(argc, argv); }
/////////////////////////////////////////////////////////////////////////////// //Parse args, run the appropriate tests /////////////////////////////////////////////////////////////////////////////// int runTest(const int argc, const char **argv) { int start = DEFAULT_SIZE; int end = DEFAULT_SIZE; int startDevice = 0; int endDevice = 0; int increment = DEFAULT_INCREMENT; testMode mode = QUICK_MODE; bool htod = false; bool dtoh = false; bool dtod = false; char *modeStr; char *device = NULL; printMode printmode = USER_READABLE; char *memModeStr = NULL; memoryMode memMode = PAGEABLE; accessMode accMode = DIRECT; //process command line args if(shrCheckCmdLineFlag( argc, argv, "help")) { printHelp(); return 0; } if(shrCheckCmdLineFlag( argc, argv, "csv")) { printmode = CSV; } // Get host memory mode type from command line if(shrGetCmdLineArgumentstr(argc, argv, "memory", &memModeStr)) { if(strcmp(memModeStr, "pageable") == 0 ) { memMode = PAGEABLE; } else if(strcmp(memModeStr, "pinned") == 0) { memMode = PINNED; } else { shrLog("Invalid memory mode - valid modes are pageable or pinned\n"); shrLog("See --help for more information\n"); return -1000; } } else { //default - pageable memory memMode = PAGEABLE; } // Access type from command line if(shrGetCmdLineArgumentstr(argc, argv, "access", &memModeStr)) { if(strcmp(memModeStr, "direct") == 0) { accMode = DIRECT; } else if(strcmp(memModeStr, "mapped") == 0) { accMode = MAPPED; } else { shrLog("Invalid access mode - valid modes are direct or mapped\n"); shrLog("See --help for more information\n"); return -2000; } } else { //default - direct accMode = DIRECT; } // Get OpenCL platform ID for NVIDIA if available, otherwise default cl_platform_id clSelectedPlatformID = NULL; cl_int ciErrNum = oclGetPlatformID (&clSelectedPlatformID); oclCheckError(ciErrNum, CL_SUCCESS); // Find out how many devices there are cl_uint ciDeviceCount; ciErrNum = clGetDeviceIDs (clSelectedPlatformID, CL_DEVICE_TYPE_GPU, 0, NULL, &ciDeviceCount); if (ciErrNum != CL_SUCCESS) { shrLog(" Error %i in clGetDeviceIDs call !!!\n\n", ciErrNum); return ciErrNum; } else if (ciDeviceCount == 0) { shrLog(" There are no devices supporting OpenCL (return code %i)\n\n", ciErrNum); return ciErrNum; } // Get command line device options and config accordingly if(shrGetCmdLineArgumentstr(argc, argv, "device", &device)) { if(strcmp (device, "all") == 0) { shrLog("\n!!!Cumulative Bandwidth to be computed from all the devices !!!\n\n"); startDevice = 0; endDevice = (int)(ciDeviceCount-1); } else { startDevice = endDevice = atoi(device); if(startDevice < 0 || ((size_t)startDevice) >= ciDeviceCount) { shrLog("\n!!!Invalid GPU number %d given hence default gpu %d will be used !!!\n", startDevice,0); startDevice = endDevice = 0; } } } // Get and log the device info shrLog("Running on...\n\n"); devices = (cl_device_id*) malloc(sizeof(cl_device_id) * ciDeviceCount); ciErrNum = clGetDeviceIDs (clSelectedPlatformID, CL_DEVICE_TYPE_GPU, ciDeviceCount, devices, &ciDeviceCount); for(int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++) { oclPrintDevName(LOGBOTH, devices[currentDevice]); shrLog("\n"); } shrLog("\n"); // Get command line mode(s) and config accordingly if(shrGetCmdLineArgumentstr(argc, argv, "mode", &modeStr)) { //figure out the mode if(strcmp(modeStr, "quick") == 0) { shrLog("Quick Mode\n\n"); mode = QUICK_MODE; } else if(strcmp(modeStr, "shmoo") == 0) { shrLog("Shmoo Mode\n\n"); mode = SHMOO_MODE; } else if(strcmp(modeStr, "range") == 0) { shrLog("Range Mode\n\n"); mode = RANGE_MODE; } else { shrLog("Invalid mode - valid modes are quick, range, or shmoo\n"); shrLog("See --help for more information\n\n"); return -3000; } } else { //default mode - quick shrLog("Quick Mode\n\n"); mode = QUICK_MODE; } if(shrCheckCmdLineFlag(argc, argv, "htod")) htod = true; if(shrCheckCmdLineFlag(argc, argv, "dtoh")) dtoh = true; if(shrCheckCmdLineFlag(argc, argv, "dtod")) dtod = true; if(!htod && !dtoh && !dtod) { //default: All htod = true; dtoh = true; dtod = true; } if(RANGE_MODE == mode) { if(shrGetCmdLineArgumenti( argc, argv, "start", &start)) { if( start <= 0 ) { shrLog("Illegal argument - start must be greater than zero\n"); return -4000; } } else { shrLog("Must specify a starting size in range mode\n"); shrLog("See --help for more information\n"); return -5000; } if(shrGetCmdLineArgumenti( argc, argv, "end", &end)) { if(end <= 0) { shrLog("Illegal argument - end must be greater than zero\n"); return -6000; } if(start > end) { shrLog("Illegal argument - start is greater than end\n"); return -7000; } } else { shrLog("Must specify an end size in range mode.\n"); shrLog("See --help for more information\n"); return -8000; } if(shrGetCmdLineArgumenti( argc, argv, "increment", &increment)) { if(increment <= 0) { shrLog("Illegal argument - increment must be greater than zero\n"); return -9000; } } else { shrLog("Must specify an increment in user mode\n"); shrLog("See --help for more information\n"); return -10000; } } // Create the OpenCL context cxGPUContext = clCreateContext(0, ciDeviceCount, devices, NULL, NULL, NULL); if (cxGPUContext == (cl_context)0) { shrLog("Failed to create OpenCL context!\n"); return -11000; } // Run tests if(htod) { testBandwidth((unsigned int)start, (unsigned int)end, (unsigned int)increment, mode, HOST_TO_DEVICE, printmode, accMode, memMode, startDevice, endDevice); } if(dtoh) { testBandwidth((unsigned int)start, (unsigned int)end, (unsigned int)increment, mode, DEVICE_TO_HOST, printmode, accMode, memMode, startDevice, endDevice); } if(dtod) { testBandwidth((unsigned int)start, (unsigned int)end, (unsigned int)increment, mode, DEVICE_TO_DEVICE, printmode, accMode, memMode, startDevice, endDevice); } // Clean up free(memModeStr); if(cqCommandQueue)clReleaseCommandQueue(cqCommandQueue); if(cxGPUContext)clReleaseContext(cxGPUContext); if(devices)free(devices); return 0; }
// Main function // ********************************************************************* int main( int argc, const char** argv) { shrQAStart(argc, (char **)argv); // start logs shrSetLogFileName ("oclReduction.txt"); shrLog("%s Starting...\n\n", argv[0]); char *typeChoice; shrGetCmdLineArgumentstr(argc, argv, "type", &typeChoice); // determine type of array from command line args if (0 == typeChoice) { typeChoice = (char*)malloc(7 * sizeof(char)); #ifdef WIN32 strcpy_s(typeChoice, 7 * sizeof(char) + 1, "int"); #else strcpy(typeChoice, "int"); #endif } ReduceType datatype = REDUCE_INT; #ifdef WIN32 if (!_strcmpi(typeChoice, "float")) datatype = REDUCE_FLOAT; else if (!_strcmpi(typeChoice, "double")) datatype = REDUCE_DOUBLE; else datatype = REDUCE_INT; #else if (!strcmp(typeChoice, "float")) datatype = REDUCE_FLOAT; else if (!strcmp(typeChoice, "double")) datatype = REDUCE_DOUBLE; else datatype = REDUCE_INT; #endif shrLog("Reducing array of type %s.\n", typeChoice); //Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckError(ciErrNum, CL_SUCCESS); //Get the devices ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices); oclCheckError(ciErrNum, CL_SUCCESS); cl_device_id *cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) ); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL); oclCheckError(ciErrNum, CL_SUCCESS); //Create the context cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); // get and log the device info if( shrCheckCmdLineFlag(argc, (const char**)argv, "device") ) { int device_nr = 0; shrGetCmdLineArgumenti(argc, (const char**)argv, "device", &device_nr); if( device_nr < uiNumDevices ) { device = oclGetDev(cxGPUContext, device_nr); } else { shrLog("Invalid Device %d Requested.\n", device_nr); shrExitEX(argc, argv, EXIT_FAILURE); } } else { device = oclGetMaxFlopsDev(cxGPUContext); } oclPrintDevName(LOGBOTH, device); shrLog("\n"); // create a command-queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, device, 0, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); source_path = shrFindFilePath("oclReduction_kernel.cl", argv[0]); bool bSuccess = false; switch (datatype) { default: case REDUCE_INT: bSuccess = runTest<int>( argc, argv, datatype); break; case REDUCE_FLOAT: bSuccess = runTest<float>( argc, argv, datatype); break; } // finish shrQAFinishExit(argc, (const char **)argv, bSuccess ? QA_PASSED : QA_FAILED); }