// Main function // ********************************************************************* int main(int argc, char **argv) { gp_argc = &argc; gp_argv = &argv; shrQAStart(argc, argv); // Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); shrLog("clGetPlatformID...\n"); // Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); shrLog("clGetPlatformID...\n"); //Get all the devices cl_uint uiNumDevices = 0; // Number of devices available cl_uint uiTargetDevice = 0; // Default Device to compute on cl_uint uiNumComputeUnits; // Number of compute units (SM's on NV GPU) shrLog("Get the Device info and select Device...\n"); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) ); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); // Get command line device options and config accordingly shrLog(" # of Devices Available = %u\n", uiNumDevices); if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice)== shrTRUE) { uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1)); } shrLog(" Using Device %u: ", uiTargetDevice); oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]); ciErrNum = clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); shrLog("\n # of Compute Units = %u\n", uiNumComputeUnits); // get command line arg for quick test, if provided bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt"); // start logs cExecutableName = argv[0]; shrSetLogFileName ("oclDotProduct.txt"); shrLog("%s Starting...\n\n# of float elements per Array \t= %u\n", argv[0], iNumElements); // set and log Global and Local work size dimensions szLocalWorkSize = 256; szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements); // rounded up to the nearest multiple of the LocalWorkSize shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n", szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize)); // Allocate and initialize host arrays shrLog( "Allocate and Init Host Mem...\n"); srcA = (void *)malloc(sizeof(cl_float4) * szGlobalWorkSize); srcB = (void *)malloc(sizeof(cl_float4) * szGlobalWorkSize); dst = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize); Golden = (void *)malloc(sizeof(cl_float) * iNumElements); shrFillArray((float*)srcA, 4 * iNumElements); shrFillArray((float*)srcB, 4 * iNumElements); // Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Get a GPU device ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevices[uiTargetDevice], NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Create the context cxGPUContext = clCreateContext(0, 1, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Create a command-queue shrLog("clCreateCommandQueue...\n"); cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Allocate the OpenCL buffer memory objects for source and result on the device GMEM shrLog("clCreateBuffer (SrcA, SrcB and Dst in Device GMEM)...\n"); cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize * 4, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize * 4, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Read the OpenCL kernel in from source file shrLog("oclLoadProgSource (%s)...\n", cSourceFile); cPathAndName = shrFindFilePath(cSourceFile, argv[0]); oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup); cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength); oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup); // Create the program shrLog("clCreateProgramWithSource...\n"); cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum); // Build the program with 'mad' Optimization option #ifdef MAC char* flags = "-cl-fast-relaxed-math -DMAC"; #else char* flags = "-cl-fast-relaxed-math"; #endif shrLog("clBuildProgram...\n"); ciErrNum = clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then cleanup and exit shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclDotProduct.ptx"); Cleanup(EXIT_FAILURE); } // Create the kernel shrLog("clCreateKernel (DotProduct)...\n"); ckKernel = clCreateKernel(cpProgram, "DotProduct", &ciErrNum); // Set the Argument values shrLog("clSetKernelArg 0 - 3...\n\n"); ciErrNum = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrcA); ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrcB); ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst); ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iNumElements); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // -------------------------------------------------------- // Core sequence... copy input data to GPU, compute, copy results back // Asynchronous write of data to GPU device shrLog("clEnqueueWriteBuffer (SrcA and SrcB)...\n"); ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize * 4, srcA, 0, NULL, NULL); ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcB, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize * 4, srcB, 0, NULL, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Launch kernel shrLog("clEnqueueNDRangeKernel (DotProduct)...\n"); ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Read back results and check accumulated errors shrLog("clEnqueueReadBuffer (Dst)...\n\n"); ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Compute and compare results for golden-host and report errors and pass/fail shrLog("Comparing against Host/C++ computation...\n\n"); DotProductHost ((const float*)srcA, (const float*)srcB, (float*)Golden, iNumElements); shrBOOL bMatch = shrComparefet((const float*)Golden, (const float*)dst, (unsigned int)iNumElements, 0.0f, 0); // Cleanup and leave Cleanup (EXIT_SUCCESS); }
//////////////////////////////////////////////////////////////////////////////// //! Run a simple test for //////////////////////////////////////////////////////////////////////////////// int runTest(int argc, const char** argv) { cl_platform_id cpPlatform = NULL; cl_uint ciDeviceCount = 0; cl_device_id *cdDevices = NULL; cl_int ciErrNum = CL_SUCCESS; //Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); if (ciErrNum != CL_SUCCESS) { shrLog("Error: Failed to create OpenCL context!\n"); return ciErrNum; } //Get the devices ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &ciDeviceCount); cdDevices = (cl_device_id *)malloc(ciDeviceCount * sizeof(cl_device_id) ); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, ciDeviceCount, cdDevices, NULL); if (ciErrNum != CL_SUCCESS) { shrLog("Error: Failed to create OpenCL context!\n"); return ciErrNum; } //Create the context cxGPUContext = clCreateContext(0, ciDeviceCount, cdDevices, NULL, NULL, &ciErrNum); if (ciErrNum != CL_SUCCESS) { shrLog("Error: Failed to create OpenCL context!\n"); return ciErrNum; } if(shrCheckCmdLineFlag(argc, (const char**)argv, "device")) { // User specified GPUs char* deviceList; char* deviceStr; char* next_token; shrGetCmdLineArgumentstr(argc, (const char**)argv, "device", &deviceList); #ifdef WIN32 deviceStr = strtok_s (deviceList," ,.-", &next_token); #else deviceStr = strtok (deviceList," ,.-"); #endif ciDeviceCount = 0; while(deviceStr != NULL) { // get and print the device for this queue cl_device_id device = oclGetDev(cxGPUContext, atoi(deviceStr)); if( device == (cl_device_id) -1 ) { shrLog(" Device %s does not exist!\n", deviceStr); return -1; } shrLog("Device %s: ", deviceStr); oclPrintDevName(LOGBOTH, device); shrLog("\n"); // create command queue commandQueue[ciDeviceCount] = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); if (ciErrNum != CL_SUCCESS) { shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum); return ciErrNum; } ++ciDeviceCount; #ifdef WIN32 deviceStr = strtok_s (NULL," ,.-", &next_token); #else deviceStr = strtok (NULL," ,.-"); #endif } free(deviceList); } else { // Find out how many GPU's to compute on all available GPUs size_t nDeviceBytes; ciErrNum |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes); ciDeviceCount = (cl_uint)nDeviceBytes/sizeof(cl_device_id); if (ciErrNum != CL_SUCCESS) { shrLog(" Error %i in clGetDeviceIDs call !!!\n\n", ciErrNum); return ciErrNum; } else if (ciDeviceCount == 0) { shrLog(" There are no devices supporting OpenCL (return code %i)\n\n", ciErrNum); return -1; } // create command-queues for(unsigned int i = 0; i < ciDeviceCount; ++i) { // get and print the device for this queue cl_device_id device = oclGetDev(cxGPUContext, i); shrLog("Device %d: ", i); oclPrintDevName(LOGBOTH, device); shrLog("\n"); // create command queue commandQueue[i] = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); if (ciErrNum != CL_SUCCESS) { shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum); return ciErrNum; } } } // Optional Command-line multiplier for matrix sizes shrGetCmdLineArgumenti(argc, (const char**)argv, "sizemult", &iSizeMultiple); iSizeMultiple = CLAMP(iSizeMultiple, 1, 10); uiWA = WA * iSizeMultiple; uiHA = HA * iSizeMultiple; uiWB = WB * iSizeMultiple; uiHB = HB * iSizeMultiple; uiWC = WC * iSizeMultiple; uiHC = HC * iSizeMultiple; shrLog("\nUsing Matrix Sizes: A(%u x %u), B(%u x %u), C(%u x %u)\n", uiWA, uiHA, uiWB, uiHB, uiWC, uiHC); // allocate host memory for matrices A and B unsigned int size_A = uiWA * uiHA; unsigned int mem_size_A = sizeof(float) * size_A; float* h_A_data = (float*)malloc(mem_size_A); unsigned int size_B = uiWB * uiHB; unsigned int mem_size_B = sizeof(float) * size_B; float* h_B_data = (float*)malloc(mem_size_B); // initialize host memory srand(2006); shrFillArray(h_A_data, size_A); shrFillArray(h_B_data, size_B); // allocate host memory for result unsigned int size_C = uiWC * uiHC; unsigned int mem_size_C = sizeof(float) * size_C; float* h_C = (float*) malloc(mem_size_C); // create OpenCL buffer pointing to the host memory cl_mem h_A = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, mem_size_A, h_A_data, &ciErrNum); if (ciErrNum != CL_SUCCESS) { shrLog("Error: clCreateBuffer\n"); return ciErrNum; } // Program Setup size_t program_length; const char* header_path = shrFindFilePath("matrixMul.h", argv[0]); oclCheckError(header_path != NULL, shrTRUE); char* header = oclLoadProgSource(header_path, "", &program_length); if(!header) { shrLog("Error: Failed to load the header %s!\n", header_path); return -1000; } const char* source_path = shrFindFilePath("matrixMul.cl", argv[0]); oclCheckError(source_path != NULL, shrTRUE); char *source = oclLoadProgSource(source_path, header, &program_length); if(!source) { shrLog("Error: Failed to load compute program %s!\n", source_path); return -2000; } // create the program cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&source, &program_length, &ciErrNum); if (ciErrNum != CL_SUCCESS) { shrLog("Error: Failed to create program\n"); return ciErrNum; } free(header); free(source); // build the program ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then return error shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclMatrixMul.ptx"); return ciErrNum; } // write out PTX if requested on the command line if(shrCheckCmdLineFlag(argc, argv, "dump-ptx") ) { oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclMatrixMul.ptx"); } // Create Kernel for(unsigned int i = 0; i < ciDeviceCount; ++i) { multiplicationKernel[i] = clCreateKernel(cpProgram, "matrixMul", &ciErrNum); if (ciErrNum != CL_SUCCESS) { shrLog("Error: Failed to create kernel\n"); return ciErrNum; } } // Run multiplication on 1..deviceCount GPUs to compare improvement shrLog("\nRunning Computations on 1 - %d GPU's...\n\n", ciDeviceCount); for(unsigned int k = 1; k <= ciDeviceCount; ++k) { matrixMulGPU(k, h_A, h_B_data, mem_size_B, h_C); } // compute reference solution shrLog("Comparing results with CPU computation... \n\n"); float* reference = (float*) malloc(mem_size_C); computeGold(reference, h_A_data, h_B_data, uiHA, uiWA, uiWB); // check result shrBOOL res = shrCompareL2fe(reference, h_C, size_C, 1.0e-6f); if (res != shrTRUE) { printDiff(reference, h_C, uiWC, uiHC, 100, 1.0e-5f); } // clean up OCL resources ciErrNum = clReleaseMemObject(h_A); for(unsigned int k = 0; k < ciDeviceCount; ++k) { ciErrNum |= clReleaseKernel( multiplicationKernel[k] ); ciErrNum |= clReleaseCommandQueue( commandQueue[k] ); } ciErrNum |= clReleaseProgram(cpProgram); ciErrNum |= clReleaseContext(cxGPUContext); if(ciErrNum != CL_SUCCESS) { shrLog("Error: Failure releasing OpenCL resources: %d\n", ciErrNum); return ciErrNum; } // clean up memory free(h_A_data); free(h_B_data); free(h_C); free(reference); return ((shrTRUE == res) ? CL_SUCCESS : -3000); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { shrQAStart(argc, argv); // start logs shrSetLogFileName ("oclSimpleMultiGPU.txt"); shrLog("%s Starting, Array = %u float values...\n\n", argv[0], DATA_N); // OpenCL cl_platform_id cpPlatform; cl_uint ciDeviceCount; cl_device_id* cdDevices; cl_context cxGPUContext; cl_device_id cdDevice; // GPU device int deviceNr[MAX_GPU_COUNT]; cl_command_queue commandQueue[MAX_GPU_COUNT]; cl_mem d_Data[MAX_GPU_COUNT]; cl_mem d_Result[MAX_GPU_COUNT]; cl_program cpProgram; cl_kernel reduceKernel[MAX_GPU_COUNT]; cl_event GPUDone[MAX_GPU_COUNT]; cl_event GPUExecution[MAX_GPU_COUNT]; size_t programLength; cl_int ciErrNum; char cDeviceName [256]; cl_mem h_DataBuffer; // Vars for reduction results float h_SumGPU[MAX_GPU_COUNT * ACCUM_N]; float *h_Data; double sumGPU; double sumCPU, dRelError; // allocate and init host buffer with with some random generated input data h_Data = (float *)malloc(DATA_N * sizeof(float)); shrFillArray(h_Data, DATA_N); // start timer & logs shrLog("Setting up OpenCL on the Host...\n\n"); shrDeltaT(1); // Annotate profiling state #ifdef GPU_PROFILING shrLog("OpenCL Profiling is enabled...\n\n"); #endif //Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clGetPlatformID...\n"); //Get the devices ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &ciDeviceCount); oclCheckError(ciErrNum, CL_SUCCESS); cdDevices = (cl_device_id *)malloc(ciDeviceCount * sizeof(cl_device_id) ); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, ciDeviceCount, cdDevices, NULL); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clGetDeviceIDs...\n"); //Create the context cxGPUContext = clCreateContext(0, ciDeviceCount, cdDevices, NULL, NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateContext...\n"); // Set up command queue(s) for GPU's specified on the command line or all GPU's if(shrCheckCmdLineFlag(argc, (const char **)argv, "device")) { // User specified GPUs int ciMaxDeviceID = ciDeviceCount-1; ciDeviceCount = 0; char* deviceList; char* deviceStr; char* next_token; shrGetCmdLineArgumentstr(argc, (const char **)argv, "device", &deviceList); #ifdef WIN32 deviceStr = strtok_s (deviceList," ,.-", &next_token); #else deviceStr = strtok (deviceList," ,.-"); #endif // Create command queues for all Requested GPU's while(deviceStr != NULL) { // get & log device index # and name deviceNr[ciDeviceCount] = atoi(deviceStr); if( deviceNr[ciDeviceCount] > ciMaxDeviceID ) { shrLog(" Invalid user specified device ID: %d\n", deviceNr[ciDeviceCount]); return 1; } cdDevice = oclGetDev(cxGPUContext, deviceNr[ciDeviceCount]); ciErrNum = clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cDeviceName), cDeviceName, NULL); oclCheckError(ciErrNum, CL_SUCCESS); shrLog(" Device %i: %s\n\n", deviceNr[ciDeviceCount], cDeviceName); // create a command que commandQueue[ciDeviceCount] = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateCommandQueue\n"); ++ciDeviceCount; #ifdef WIN32 deviceStr = strtok_s (NULL," ,.-", &next_token); #else deviceStr = strtok (NULL," ,.-"); #endif } free(deviceList); } else { // Find out how many GPU's to compute on all available GPUs size_t nDeviceBytes; ciErrNum = clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes); oclCheckError(ciErrNum, CL_SUCCESS); ciDeviceCount = (cl_uint)nDeviceBytes/sizeof(cl_device_id); for(unsigned int i = 0; i < ciDeviceCount; ++i ) { // get & log device index # and name deviceNr[i] = i; cdDevice = oclGetDev(cxGPUContext, i); ciErrNum = clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cDeviceName), cDeviceName, NULL); oclCheckError(ciErrNum, CL_SUCCESS); shrLog(" Device %i: %s\n", i, cDeviceName); // create a command que commandQueue[i] = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateCommandQueue\n\n"); } } // Load the OpenCL source code from the .cl file const char* source_path = shrFindFilePath("simpleMultiGPU.cl", argv[0]); char *source = oclLoadProgSource(source_path, "", &programLength); oclCheckError(source != NULL, shrTRUE); shrLog("oclLoadProgSource\n"); // Create the program for all GPUs in the context cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&source, &programLength, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateProgramWithSource\n"); // build the program ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then cleanup and exit shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclSimpleMultiGPU.ptx"); oclCheckError(ciErrNum, CL_SUCCESS); } shrLog("clBuildProgram\n"); // Create host buffer with page-locked memory h_DataBuffer = clCreateBuffer(cxGPUContext, CL_MEM_COPY_HOST_PTR | CL_MEM_ALLOC_HOST_PTR, DATA_N * sizeof(float), h_Data, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateBuffer (Page-locked Host)\n\n"); // Create buffers for each GPU, with data divided evenly among GPU's int sizePerGPU = DATA_N / ciDeviceCount; int workOffset[MAX_GPU_COUNT]; int workSize[MAX_GPU_COUNT]; workOffset[0] = 0; for(unsigned int i = 0; i < ciDeviceCount; ++i ) { workSize[i] = (i != (ciDeviceCount - 1)) ? sizePerGPU : (DATA_N - workOffset[i]); // Input buffer d_Data[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, workSize[i] * sizeof(float), NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateBuffer (Input)\t\tDev %i\n", i); // Copy data from host to device ciErrNum = clEnqueueCopyBuffer(commandQueue[i], h_DataBuffer, d_Data[i], workOffset[i] * sizeof(float), 0, workSize[i] * sizeof(float), 0, NULL, NULL); shrLog("clEnqueueCopyBuffer (Input)\tDev %i\n", i); // Output buffer d_Result[i] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, ACCUM_N * sizeof(float), NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateBuffer (Output)\t\tDev %i\n", i); // Create kernel reduceKernel[i] = clCreateKernel(cpProgram, "reduce", &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateKernel\t\t\tDev %i\n", i); // Set the args values and check for errors ciErrNum |= clSetKernelArg(reduceKernel[i], 0, sizeof(cl_mem), &d_Result[i]); ciErrNum |= clSetKernelArg(reduceKernel[i], 1, sizeof(cl_mem), &d_Data[i]); ciErrNum |= clSetKernelArg(reduceKernel[i], 2, sizeof(int), &workSize[i]); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clSetKernelArg\t\t\tDev %i\n\n", i); workOffset[i + 1] = workOffset[i] + workSize[i]; } // Set # of work items in work group and total in 1 dimensional range size_t localWorkSize[] = {THREAD_N}; size_t globalWorkSize[] = {ACCUM_N}; // Start timer and launch reduction kernel on each GPU, with data split between them shrLog("Launching Kernels on GPU(s)...\n\n"); for(unsigned int i = 0; i < ciDeviceCount; i++) { ciErrNum = clEnqueueNDRangeKernel(commandQueue[i], reduceKernel[i], 1, 0, globalWorkSize, localWorkSize, 0, NULL, &GPUExecution[i]); oclCheckError(ciErrNum, CL_SUCCESS); } // Copy result from device to host for each device for(unsigned int i = 0; i < ciDeviceCount; i++) { ciErrNum = clEnqueueReadBuffer(commandQueue[i], d_Result[i], CL_FALSE, 0, ACCUM_N * sizeof(float), h_SumGPU + i * ACCUM_N, 0, NULL, &GPUDone[i]); oclCheckError(ciErrNum, CL_SUCCESS); } // Synchronize with the GPUs and do accumulated error check clWaitForEvents(ciDeviceCount, GPUDone); shrLog("clWaitForEvents complete...\n\n"); // Aggregate results for multiple GPU's and stop/log processing time sumGPU = 0; for(unsigned int i = 0; i < ciDeviceCount * ACCUM_N; i++) { sumGPU += h_SumGPU[i]; } // Print Execution Times for each GPU #ifdef GPU_PROFILING shrLog("Profiling Information for GPU Processing:\n\n"); for(unsigned int i = 0; i < ciDeviceCount; i++) { cdDevice = oclGetDev(cxGPUContext, deviceNr[i]); clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cDeviceName), cDeviceName, NULL); shrLog("Device %i : %s\n", deviceNr[i], cDeviceName); shrLog(" Reduce Kernel : %.5f s\n", executionTime(GPUExecution[i])); shrLog(" Copy Device->Host : %.5f s\n\n\n", executionTime(GPUDone[i])); } #endif // Run the computation on the Host CPU and log processing time shrLog("Launching Host/CPU C++ Computation...\n\n"); sumCPU = 0; for(unsigned int i = 0; i < DATA_N; i++) { sumCPU += h_Data[i]; } // Check GPU result against CPU result dRelError = 100.0 * fabs(sumCPU - sumGPU) / fabs(sumCPU); shrLog("Comparing against Host/C++ computation...\n"); shrLog(" GPU sum: %f\n CPU sum: %f\n", sumGPU, sumCPU); shrLog(" Relative Error (100.0 * Error / Golden) = %f \n\n", dRelError); // cleanup free(source); free(h_Data); for(unsigned int i = 0; i < ciDeviceCount; ++i ) { clReleaseKernel(reduceKernel[i]); clReleaseCommandQueue(commandQueue[i]); } clReleaseProgram(cpProgram); clReleaseContext(cxGPUContext); // finish shrQAFinishExit(argc, (const char **)argv, (dRelError < 1e-4) ? QA_PASSED : QA_FAILED); }
// Main function // ********************************************************************* int ymain(int argc, char **argv) { shrQAStart(argc, argv); // get command line arg for quick test, if provided bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt"); // start logs cExecutableName = argv[0]; shrSetLogFileName ("oclVectorAdd2.txt"); shrLog("%s Starting...\n\n# of float elements per Array \t= %i\n", argv[0], iNumElements); // set and log Global and Local work size dimensions szLocalWorkSize = 256; szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements); // rounded up to the nearest multiple of the LocalWorkSize shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n", szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize)); // Allocate and initialize host arrays shrLog( "Allocate and Init Host Mem...\n"); srcA = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize); srcB = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize); dst = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize); Golden = (void *)malloc(sizeof(cl_float) * iNumElements); shrFillArray((float*)srcA, iNumElements); shrFillArray((float*)srcB, iNumElements); //Get an OpenCL platform ciErr1 = clGetPlatformIDs(1, &cpPlatform, NULL); shrLog("clGetPlatformID...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clGetPlatformID, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } //Get the devices ciErr1 = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL); shrLog("clGetDeviceIDs...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clGetDeviceIDs, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } //Create the context cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErr1); shrLog("clCreateContext...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateContext, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Create a command-queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &ciErr1); shrLog("clCreateCommandQueue...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateCommandQueue, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Allocate the OpenCL buffer memory objects for source and result on the device GMEM cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr1); cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2); ciErr1 |= ciErr2; cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2); ciErr1 |= ciErr2; shrLog("clCreateBuffer...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Read the OpenCL kernel in from source file shrLog("oclLoadProgSource (%s)...\n", cSourceFile); cPathAndName = shrFindFilePath(cSourceFile, argv[0]); shrLog("Looking for: %s in Path: %s\n", cSourceFile, argv[0]); cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength); // Create the program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErr1); shrLog("clCreateProgramWithSource...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateProgramWithSource, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Build the program with 'mad' Optimization option #ifdef MAC char* flags = "-cl-fast-relaxed-math -DMAC"; #else char* flags = "-cl-fast-relaxed-math"; #endif ciErr1 = clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL); shrLog("clBuildProgram...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Create the kernel ckKernel = clCreateKernel(cpProgram, "VectorAdd", &ciErr1); shrLog("clCreateKernel (VectorAdd)...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Set the Argument values ciErr1 = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrcA); ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrcB); ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst); ciErr1 |= clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iNumElements); shrLog("clSetKernelArg 0 - 3...\n\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clSetKernelArg, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // -------------------------------------------------------- // Start Core sequence... copy input data to GPU, compute, copy results back // Asynchronous write of data to GPU device ciErr1 = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, srcA, 0, NULL, NULL); ciErr1 |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcB, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, srcB, 0, NULL, NULL); shrLog("clEnqueueWriteBuffer (SrcA and SrcB)...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clEnqueueWriteBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Launch kernel ciErr1 = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL); shrLog("clEnqueueNDRangeKernel (VectorAdd)...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clEnqueueNDRangeKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Synchronous/blocking read of results, and check accumulated errors ciErr1 = clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL); shrLog("clEnqueueReadBuffer (Dst)...\n\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clEnqueueReadBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } //-------------------------------------------------------- // Compute and compare results for golden-host and report errors and pass/fail shrLog("Comparing against Host/C++ computation...\n\n"); VectorAddHost ((const float*)srcA, (const float*)srcB, (float*)Golden, iNumElements); shrBOOL bMatch = shrComparefet((const float*)Golden, (const float*)dst, (unsigned int)iNumElements, 0.0f, 0); // Cleanup and leave Cleanup (argc, argv, (bMatch == shrTRUE) ? EXIT_SUCCESS : EXIT_FAILURE); }
// Main function // ********************************************************************* int main(int argc, char** argv) { shrQAStart(argc, argv); // get command line arg for quick test, if provided bNoPrompt = shrCheckCmdLineFlag(argc, (const char **)argv, "noprompt"); // start logs cExecutableName = argv[0]; shrSetLogFileName ("oclMatVecMul.txt"); shrLog("%s Starting...\n\n", argv[0]); // calculate matrix height given GPU memory shrLog("Determining Matrix height from available GPU mem...\n"); memsize_t memsize; getTargetDeviceGlobalMemSize(&memsize, argc, (const char **)argv); height = memsize/width/16; if (height > MAX_HEIGHT) height = MAX_HEIGHT; shrLog(" Matrix width\t= %u\n Matrix height\t= %u\n\n", width, height); // Allocate and initialize host arrays shrLog("Allocate and Init Host Mem...\n\n"); unsigned int size = width * height; unsigned int mem_size_M = size * sizeof(float); M = (float*)malloc(mem_size_M); unsigned int mem_size_V = width * sizeof(float); V = (float*)malloc(mem_size_V); unsigned int mem_size_W = height * sizeof(float); W = (float*)malloc(mem_size_W); shrFillArray(M, size); shrFillArray(V, width); Golden = (float*)malloc(mem_size_W); MatVecMulHost(M, V, width, height, Golden); //Get the NVIDIA platform shrLog("Get the Platform ID...\n\n"); ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); //Get all the devices shrLog("Get the Device info and select Device...\n"); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) ); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Set target device and Query number of compute units on targetDevice shrLog(" # of Devices Available = %u\n", uiNumDevices); if(shrGetCmdLineArgumentu(argc, (const char **)argv, "device", &targetDevice)== shrTRUE) { targetDevice = CLAMP(targetDevice, 0, (uiNumDevices - 1)); } shrLog(" Using Device %u: ", targetDevice); oclPrintDevName(LOGBOTH, cdDevices[targetDevice]); cl_uint num_compute_units; clGetDeviceInfo(cdDevices[targetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(num_compute_units), &num_compute_units, NULL); shrLog("\n # of Compute Units = %u\n\n", num_compute_units); //Create the context shrLog("clCreateContext...\n"); cxGPUContext = clCreateContext(0, uiNumDevsUsed, &cdDevices[targetDevice], NULL, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Create a command-queue shrLog("clCreateCommandQueue...\n"); cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[targetDevice], CL_QUEUE_PROFILING_ENABLE, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Allocate the OpenCL buffer memory objects for source and result on the device GMEM shrLog("clCreateBuffer (M, V and W in device global memory, mem_size_m = %u)...\n", mem_size_M); cmM = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, mem_size_M, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); cmV = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, mem_size_V, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); cmW = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, mem_size_W, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Read the OpenCL kernel in from source file shrLog("oclLoadProgSource (%s)...\n", cSourceFile); cPathAndName = shrFindFilePath(cSourceFile, argv[0]); oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup); cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength); oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup); // Create the program shrLog("clCreateProgramWithSource...\n"); cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum); // Build the program shrLog("clBuildProgram...\n"); ciErrNum = clBuildProgram(cpProgram, uiNumDevsUsed, &cdDevices[targetDevice], "-cl-fast-relaxed-math", NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then cleanup and exit shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclMatVecMul.ptx"); shrQAFinish(argc, (const char **)argv, QA_FAILED); Cleanup(EXIT_FAILURE); } // -------------------------------------------------------- // Core sequence... copy input data to GPU, compute, copy results back // Asynchronous write of data to GPU device shrLog("clEnqueueWriteBuffer (M and V)...\n\n"); ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmM, CL_FALSE, 0, mem_size_M, M, 0, NULL, NULL); ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue, cmV, CL_FALSE, 0, mem_size_V, V, 0, NULL, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Kernels const char* kernels[] = { "MatVecMulUncoalesced0", "MatVecMulUncoalesced1", "MatVecMulCoalesced0", "MatVecMulCoalesced1", "MatVecMulCoalesced2", "MatVecMulCoalesced3" }; for (int k = 0; k < (int)(sizeof(kernels)/sizeof(char*)); ++k) { shrLog("Running with Kernel %s...\n\n", kernels[k]); // Clear result shrLog(" Clear result with clEnqueueWriteBuffer (W)...\n"); memset(W, 0, mem_size_W); ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmW, CL_FALSE, 0, mem_size_W, W, 0, NULL, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Create the kernel shrLog(" clCreateKernel...\n"); if (ckKernel) { clReleaseKernel(ckKernel); ckKernel = 0; } ckKernel = clCreateKernel(cpProgram, kernels[k], &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Set and log Global and Local work size dimensions szLocalWorkSize = 256; if (k == 0) szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, height); // rounded up to the nearest multiple of the LocalWorkSize else // Some experiments should be done here for determining the best global work size for a given device // We will assume here that we can run 2 work-groups per compute unit szGlobalWorkSize = 2 * num_compute_units * szLocalWorkSize; shrLog(" Global Work Size \t\t= %u\n Local Work Size \t\t= %u\n # of Work Groups \t\t= %u\n", szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize)); // Set the Argument values shrLog(" clSetKernelArg...\n\n"); int n = 0; ciErrNum = clSetKernelArg(ckKernel, n++, sizeof(cl_mem), (void*)&cmM); ciErrNum |= clSetKernelArg(ckKernel, n++, sizeof(cl_mem), (void*)&cmV); ciErrNum |= clSetKernelArg(ckKernel, n++, sizeof(cl_int), (void*)&width); ciErrNum |= clSetKernelArg(ckKernel, n++, sizeof(cl_int), (void*)&height); ciErrNum |= clSetKernelArg(ckKernel, n++, sizeof(cl_mem), (void*)&cmW); if (k > 1) ciErrNum |= clSetKernelArg(ckKernel, n++, szLocalWorkSize * sizeof(float), 0); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Launch kernel shrLog(" clEnqueueNDRangeKernel (%s)...\n", kernels[k]); ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, &ceEvent); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Read back results and check accumulated errors shrLog(" clEnqueueReadBuffer (W)...\n"); ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmW, CL_TRUE, 0, mem_size_W, W, 0, NULL, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); #ifdef GPU_PROFILING // Execution time ciErrNum = clWaitForEvents(1, &ceEvent); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); cl_ulong start, end; ciErrNum = clGetEventProfilingInfo(ceEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); ciErrNum |= clGetEventProfilingInfo(ceEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); double dSeconds = 1.0e-9 * (double)(end - start); shrLog(" Kernel execution time: %.5f s\n\n", dSeconds); #endif // Compare results for golden-host and report errors and pass/fail shrLog(" Comparing against Host/C++ computation...\n\n"); shrBOOL res = shrCompareL2fe(Golden, W, height, 1e-6f); shrLog(" GPU Result %s CPU Result within allowable tolerance\n\n", (res == shrTRUE) ? "MATCHES" : "DOESN'T MATCH"); bPassFlag &= (res == shrTRUE); // Release event ciErrNum = clReleaseEvent(ceEvent); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); ceEvent = 0; } // Master status Pass/Fail (all tests) shrQAFinish(argc, (const char **)argv, (bPassFlag ? QA_PASSED : QA_FAILED) ); // Cleanup and leave Cleanup (EXIT_SUCCESS); }
// Main function // ********************************************************************* int main(int argc, char **argv) { ////////////////////////////////////////////////////////////////////////// unsigned int count = iNumElements; int k = 8; unsigned int random_seed, random_seed2; srand( (unsigned)time( NULL ) ); random_seed = rand(); random_seed2 = rand(); ////////////////////////////////////////////////////////////////////////// // get command line arg for quick test, if provided bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt"); // start logs shrSetLogFileName ("oclVectorAdd.txt"); shrLog("%s Starting...\n\n# of float elements per Array \t= %i\n", argv[0], iNumElements); // set and log Global and Local work size dimensions szLocalWorkSize = 256; szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements); // rounded up to the nearest multiple of the LocalWorkSize shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n", szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize)); // Allocate and initialize host arrays shrLog( "Allocate and Init Host Mem...\n"); srcA = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize); srcB = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize); dst = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize); Golden = (void *)malloc(sizeof(cl_float) * iNumElements); shrFillArray((float*)srcA, iNumElements); shrFillArray((float*)srcB, iNumElements); ////////////////////////////////////////////////////////////////////////// float *scalar_value = new float[count]; float *gradient_magnitude = new float[count]; float *second_derivative_magnitude = new float[count]; unsigned char *label_ptr = new unsigned char[count]; shrFillArray(scalar_value, count); shrFillArray(gradient_magnitude, count); shrFillArray(second_derivative_magnitude, count); ////////////////////////////////////////////////////////////////////////// //Get an OpenCL platform ciErr1 = clGetPlatformIDs(1, &cpPlatform, NULL); shrLog("clGetPlatformID...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clGetPlatformID, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } //Get the devices ciErr1 = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL); shrLog("clGetDeviceIDs...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clGetDeviceIDs, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } //Create the context cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErr1); shrLog("clCreateContext...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateContext, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } // Create a command-queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &ciErr1); shrLog("clCreateCommandQueue...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateCommandQueue, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } // Allocate the OpenCL buffer memory objects for source and result on the device GMEM cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr1); cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2); ciErr1 |= ciErr2; cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2); ciErr1 |= ciErr2; ////////////////////////////////////////////////////////////////////////// cmDevSrc_scalar_value = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr1); cmDevSrc_gradient_magnitude = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2); ciErr1 |= ciErr2; cmDevSrc_second_derivative_magnitude = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2); ciErr1 |= ciErr2; cmDevDst_label_ptr = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2); ciErr1 |= ciErr2; ////////////////////////////////////////////////////////////////////////// shrLog("clCreateBuffer...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } // Read the OpenCL kernel in from source file shrLog("oclLoadProgSource (%s)...\n", cSourceFile); cPathAndName = shrFindFilePath(cSourceFile, argv[0]); cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength); printf("%s\n%s\n", cSourceFile, cPathAndName); // Create the program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErr1); shrLog("clCreateProgramWithSource...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateProgramWithSource, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } // Build the program with 'mad' Optimization option #ifdef MAC char* flags = "-cl-fast-relaxed-math -DMAC"; #else char* flags = "-cl-fast-relaxed-math"; #endif ciErr1 = clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL); shrLog("clBuildProgram...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } // Create the kernel ckKernel = clCreateKernel(cpProgram, "k_means", &ciErr1); shrLog("clCreateKernel (VectorAdd)...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } // Set the Argument values //ciErr1 = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrcA); //ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrcB); //ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst); //ciErr1 |= clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iNumElements); ////////////////////////////////////////////////////////////////////////// // __global const float *scalar_value, __global const float *gradient_magnitude, __global const float *second_derivative_magnitude, __global unsigned char *label_ptr, __global const unsigned int count, __global const int k, __global const unsigned int random_seed, __global const unsigned int random_seed2 ciErr1 = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrc_scalar_value); ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrc_gradient_magnitude); ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevSrc_second_derivative_magnitude); ciErr1 |= clSetKernelArg(ckKernel, 3, sizeof(cl_mem), (void*)&cmDevDst_label_ptr); ciErr1 |= clSetKernelArg(ckKernel, 4, sizeof(cl_uint), (void*)&count); ciErr1 |= clSetKernelArg(ckKernel, 5, sizeof(cl_uint), (void*)&k); ciErr1 |= clSetKernelArg(ckKernel, 6, sizeof(cl_uint), (void*)&random_seed); ciErr1 |= clSetKernelArg(ckKernel, 7, sizeof(cl_uint), (void*)&random_seed2); ////////////////////////////////////////////////////////////////////////// shrLog("clSetKernelArg 0 - 3...\n\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clSetKernelArg, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } // -------------------------------------------------------- // Start Core sequence... copy input data to GPU, compute, copy results back // Asynchronous write of data to GPU device //ciErr1 = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, srcA, 0, NULL, NULL); //ciErr1 |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcB, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, srcB, 0, NULL, NULL); ////////////////////////////////////////////////////////////////////////// ciErr1 = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrc_scalar_value, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, scalar_value, 0, NULL, NULL); ciErr1 |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrc_gradient_magnitude, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, gradient_magnitude, 0, NULL, NULL); ciErr1 |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrc_second_derivative_magnitude, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, second_derivative_magnitude, 0, NULL, NULL); ////////////////////////////////////////////////////////////////////////// shrLog("clEnqueueWriteBuffer (SrcA and SrcB)...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clEnqueueWriteBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } // Launch kernel ciErr1 = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL); shrLog("clEnqueueNDRangeKernel (VectorAdd)...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clEnqueueNDRangeKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } // Synchronous/blocking read of results, and check accumulated errors //ciErr1 = clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL); ////////////////////////////////////////////////////////////////////////// ciErr1 = clEnqueueReadBuffer(cqCommandQueue, cmDevDst_label_ptr, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, label_ptr, 0, NULL, NULL); ////////////////////////////////////////////////////////////////////////// shrLog("clEnqueueReadBuffer (Dst)...\n\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clEnqueueReadBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } //-------------------------------------------------------- // Compute and compare results for golden-host and report errors and pass/fail shrLog("Comparing against Host/C++ computation...\n\n"); VectorAddHost ((const float*)srcA, (const float*)srcB, (float*)Golden, iNumElements); shrBOOL bMatch = shrComparefet((const float*)Golden, (const float*)dst, (unsigned int)iNumElements, 0.0f, 0); shrLog("%s\n\n", (bMatch == shrTRUE) ? "PASSED" : "FAILED"); ////////////////////////////////////////////////////////////////////////// //float *a = (float *)srcA; //float *b = (float *)srcB; //float *c = (float *)dst; //float *d = (float *)Golden; //for (int i=0; i<iNumElements; i++) //{ // printf("%f+%f=%f=%f\t", a[i], b[i], c[i], a[i]+b[i]); // printf("%s\n", (a[i]+b[i]==c[i]?"equal":"not equal")); //} //for (int i=0; i<iNumElements; i++) //{ // printf("%f\n", ((float *)dst)[i]); //} ////////////////////////////////////////////////////////////////////////// // Cleanup and leave Cleanup (EXIT_SUCCESS); ////////////////////////////////////////////////////////////////////////// delete [] scalar_value; delete [] gradient_magnitude; delete [] second_derivative_magnitude; delete [] label_ptr; ////////////////////////////////////////////////////////////////////////// }
int main(int argc, char *argv[]) { cl_platform_id platform; cl_device_id device; cl_context context; cl_command_queue queue; cl_program program; cl_kernel kernel; cl_mem buff_A, buff_B, buff_C; int mult = 1; uint32_t uiWA, uiHA, uiWB, uiHB, uiWC, uiHC; uiWA = WA * mult; uiHA = HA * mult; uiWB = WB * mult; uiHB = HB * mult; uiWC = WC * mult; uiHC = HC * mult; printf("sizes WA %u HA %u WB %u HB %u WC %u HC %u\n", uiWA, uiHA, uiWB, uiHB, uiWC, uiHC); uint32_t size_A = uiWA * uiHA; uint32_t size_B = uiWB * uiHB; uint32_t size_C = uiWC * uiHC; size_t mem_size_A = size_A * sizeof(float); size_t mem_size_B = size_B * sizeof(float); size_t mem_size_C = size_C * sizeof(float); float *data_A = (float *)malloc(mem_size_A); float *data_B = (float *)malloc(mem_size_B); float *data_C = (float *)malloc(mem_size_C); srand(2012); shrFillArray(data_A, size_A); shrFillArray(data_B, size_B); size_t global_work_size[2]; size_t local_work_size[] = { BLOCK_SIZE, BLOCK_SIZE }; global_work_size[0] = shrRoundUp(BLOCK_SIZE, uiWC); global_work_size[1] = shrRoundUp(BLOCK_SIZE, uiHA); const char *source = load_program_source("MatrixMul.cl"); size_t source_len = strlen(source);; cl_uint err = 0; char *flags = "-cl-fast-relaxed-math"; clGetPlatformIDs(1, &platform, NULL); printf("platform %p err %d\n", platform, err); clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, &err); printf("device %p err %d\n", device, err); context = clCreateContext(0, 1, &device, NULL, NULL, &err); printf("context %p err %d\n", context, err); queue = clCreateCommandQueue(context, device, 0, &err); printf("queue %p err %d\n", queue, err); program = clCreateProgramWithSource(context, 1, &source, &source_len, &err); printf("program %p err %d\n", program, err); err = clBuildProgram(program, 0, NULL, flags, NULL, NULL); printf("err %d\n", err); kernel = clCreateKernel(program, "matrixMul", &err); printf("kernel %p err %d\n", kernel, err); buff_A = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size_A, data_A, NULL); printf("buff_A %p\n", buff_A); buff_B = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size_B, data_B, NULL); printf("buff_B %p\n", buff_B); buff_C = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size_C, NULL, NULL); printf("buff_C %p\n", buff_C); err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&buff_C); printf("err %d\n", err); err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&buff_A); printf("err %d\n", err); err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&buff_B); printf("err %d\n", err); err = clSetKernelArg(kernel, 3, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, NULL); printf("err %d\n", err); err = clSetKernelArg(kernel, 4, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, NULL); printf("err %d\n", err); err = clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&uiWA); printf("err %d\n", err); err = clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&uiWB); printf("err %d\n", err); err = clSetKernelArg(kernel, 7, sizeof(cl_int), (void*)&uiHA); printf("err %d\n", err); err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL); printf("err %d\n", err); err = clFlush(queue); printf("err %d\n", err); err = clFinish(queue); printf("err %d\n", err); err = clEnqueueReadBuffer(queue, buff_C, CL_TRUE, 0, mem_size_C, data_C, 0, NULL, NULL); printf("err %d\n", err); int i; for (i = 0; i < size_C; i++) { printf("%d %f\n", i, data_C[i]); } clReleaseMemObject(buff_A); clReleaseMemObject(buff_B); clReleaseMemObject(buff_C); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(queue); clReleaseProgram(program); }