// Main function
// *********************************************************************
int main(int argc, char** argv)
{
    shrQAStart(argc, argv);

    // get command line arg for quick test, if provided
    bNoPrompt = shrCheckCmdLineFlag(argc, (const char **)argv, "noprompt");

    // start logs
    cExecutableName = argv[0];
    shrSetLogFileName ("oclMatVecMul.txt");
    shrLog("%s Starting...\n\n", argv[0]);

    // calculate matrix height given GPU memory
    shrLog("Determining Matrix height from available GPU mem...\n");
    memsize_t memsize;
    getTargetDeviceGlobalMemSize(&memsize, argc, (const char **)argv);
    height = memsize/width/16;
    if (height > MAX_HEIGHT)
        height = MAX_HEIGHT;
    shrLog(" Matrix width\t= %u\n Matrix height\t= %u\n\n", width, height);

    // Allocate and initialize host arrays
    shrLog("Allocate and Init Host Mem...\n\n");
    unsigned int size = width * height;
    unsigned int mem_size_M = size * sizeof(float);
    M = (float*)malloc(mem_size_M);
    unsigned int mem_size_V = width * sizeof(float);
    V = (float*)malloc(mem_size_V);
    unsigned int mem_size_W = height * sizeof(float);
    W = (float*)malloc(mem_size_W);
    shrFillArray(M, size);
    shrFillArray(V, width);
    Golden = (float*)malloc(mem_size_W);
    MatVecMulHost(M, V, width, height, Golden);

    //Get the NVIDIA platform
    shrLog("Get the Platform ID...\n\n");
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    //Get all the devices
    shrLog("Get the Device info and select Device...\n");
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id));
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Set target device and Query number of compute units on targetDevice
    shrLog(" # of Devices Available = %u\n", uiNumDevices);
    if (shrGetCmdLineArgumentu(argc, (const char **)argv, "device", &targetDevice) == shrTRUE)
    {
        targetDevice = CLAMP(targetDevice, 0, (uiNumDevices - 1));
    }
    shrLog(" Using Device %u: ", targetDevice);
    oclPrintDevName(LOGBOTH, cdDevices[targetDevice]);
    cl_uint num_compute_units;
    clGetDeviceInfo(cdDevices[targetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(num_compute_units), &num_compute_units, NULL);
    shrLog("\n # of Compute Units = %u\n\n", num_compute_units);

    //Create the context
    shrLog("clCreateContext...\n");
    cxGPUContext = clCreateContext(0, uiNumDevsUsed, &cdDevices[targetDevice], NULL, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Create a command-queue
    shrLog("clCreateCommandQueue...\n");
    cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[targetDevice], CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Allocate the OpenCL buffer memory objects for source and result on the device GMEM
    shrLog("clCreateBuffer (M, V and W in device global memory, mem_size_m = %u)...\n", mem_size_M);
    cmM = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, mem_size_M, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    cmV = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, mem_size_V, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    cmW = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, mem_size_W, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Read the OpenCL kernel in from source file
    shrLog("oclLoadProgSource (%s)...\n", cSourceFile);
    cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
    oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup);
    cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);
    oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup);

    // Create the program
    shrLog("clCreateProgramWithSource...\n");
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum);

    // Build the program
    shrLog("clBuildProgram...\n");
    ciErrNum = clBuildProgram(cpProgram, uiNumDevsUsed, &cdDevices[targetDevice], "-cl-fast-relaxed-math", NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        // write out standard error, Build Log and PTX, then cleanup and exit
        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclMatVecMul.ptx");
        shrQAFinish(argc, (const char **)argv, QA_FAILED);
        Cleanup(EXIT_FAILURE);
    }

    // --------------------------------------------------------
    // Core sequence... copy input data to GPU, compute, copy results back

    // Asynchronous write of data to GPU device
    shrLog("clEnqueueWriteBuffer (M and V)...\n\n");
    ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmM, CL_FALSE, 0, mem_size_M, M, 0, NULL, NULL);
    ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue, cmV, CL_FALSE, 0, mem_size_V, V, 0, NULL, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Kernels
    const char* kernels[] = {
        "MatVecMulUncoalesced0",
        "MatVecMulUncoalesced1",
        "MatVecMulCoalesced0",
        "MatVecMulCoalesced1",
        "MatVecMulCoalesced2",
        "MatVecMulCoalesced3"
    };

    for (int k = 0; k < (int)(sizeof(kernels)/sizeof(char*)); ++k)
    {
        shrLog("Running with Kernel %s...\n\n", kernels[k]);

        // Clear result
        shrLog(" Clear result with clEnqueueWriteBuffer (W)...\n");
        memset(W, 0, mem_size_W);
        ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmW, CL_FALSE, 0, mem_size_W, W, 0, NULL, NULL);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // Create the kernel
        shrLog(" clCreateKernel...\n");
        if (ckKernel)
        {
            clReleaseKernel(ckKernel);
            ckKernel = 0;
        }
        ckKernel = clCreateKernel(cpProgram, kernels[k], &ciErrNum);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // Set and log Global and Local work size dimensions
        szLocalWorkSize = 256;
        if (k == 0)
            szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, height);  // rounded up to the nearest multiple of the LocalWorkSize
        else
            // Some experiments should be done here for determining the best global work size for a given device
            // We will assume here that we can run 2 work-groups per compute unit
            szGlobalWorkSize = 2 * num_compute_units * szLocalWorkSize;
        shrLog(" Global Work Size \t\t= %u\n Local Work Size \t\t= %u\n # of Work Groups \t\t= %u\n",
               szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize));

        // Set the Argument values
        shrLog(" clSetKernelArg...\n\n");
        int n = 0;
        ciErrNum  = clSetKernelArg(ckKernel, n++, sizeof(cl_mem), (void*)&cmM);
        ciErrNum |= clSetKernelArg(ckKernel, n++, sizeof(cl_mem), (void*)&cmV);
        ciErrNum |= clSetKernelArg(ckKernel, n++, sizeof(cl_int), (void*)&width);
        ciErrNum |= clSetKernelArg(ckKernel, n++, sizeof(cl_int), (void*)&height);
        ciErrNum |= clSetKernelArg(ckKernel, n++, sizeof(cl_mem), (void*)&cmW);
        if (k > 1)
            ciErrNum |= clSetKernelArg(ckKernel, n++, szLocalWorkSize * sizeof(float), 0);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // Launch kernel
        shrLog(" clEnqueueNDRangeKernel (%s)...\n", kernels[k]);
        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, &ceEvent);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // Read back results and check accumulated errors
        shrLog(" clEnqueueReadBuffer (W)...\n");
        ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmW, CL_TRUE, 0, mem_size_W, W, 0, NULL, NULL);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

#ifdef GPU_PROFILING
        // Execution time
        ciErrNum = clWaitForEvents(1, &ceEvent);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
        cl_ulong start, end;
        ciErrNum  = clGetEventProfilingInfo(ceEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
        ciErrNum |= clGetEventProfilingInfo(ceEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
        double dSeconds = 1.0e-9 * (double)(end - start);
        shrLog(" Kernel execution time: %.5f s\n\n", dSeconds);
#endif

        // Compare results for golden-host and report errors and pass/fail
        shrLog(" Comparing against Host/C++ computation...\n\n");
        shrBOOL res = shrCompareL2fe(Golden, W, height, 1e-6f);
        shrLog(" GPU Result %s CPU Result within allowable tolerance\n\n", (res == shrTRUE) ? "MATCHES" : "DOESN'T MATCH");
        bPassFlag &= (res == shrTRUE);

        // Release event
        ciErrNum = clReleaseEvent(ceEvent);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
        ceEvent = 0;
    }

    // Master status Pass/Fail (all tests)
    shrQAFinish(argc, (const char **)argv, (bPassFlag ? QA_PASSED : QA_FAILED));

    // Cleanup and leave
    Cleanup(EXIT_SUCCESS);
}
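// MatVecMulHost() computes the golden reference on the CPU but is not shown in this
// listing. A minimal sketch of the row-major matrix-vector product it is assumed to
// perform (W = M * V, matching the shrCompareL2fe() check above) is given below under
// the hypothetical name MatVecMulHostRef():

void MatVecMulHostRef(const float* M, const float* V,
                      unsigned int width, unsigned int height, float* W)
{
    for (unsigned int i = 0; i < height; ++i)   // one dot product per output element
    {
        double sum = 0.0;                       // accumulate in double to reduce rounding error
        for (unsigned int j = 0; j < width; ++j)
            sum += (double)M[i * width + j] * (double)V[j];
        W[i] = (float)sum;
    }
}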
////////////////////////////////////////////////////////////////////////////////
//! Run a simple matrix multiplication test using OpenCL
////////////////////////////////////////////////////////////////////////////////
int runTest(int argc, const char** argv)
{
    cl_platform_id cpPlatform = NULL;
    cl_uint ciDeviceCount = 0;
    cl_device_id *cdDevices = NULL;
    cl_int ciErrNum = CL_SUCCESS;

    //Get the NVIDIA platform
    ciErrNum = oclGetPlatformID(&cpPlatform);
    if (ciErrNum != CL_SUCCESS)
    {
        shrLog("Error: Failed to get OpenCL platform ID!\n");
        return ciErrNum;
    }

    //Get the devices
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &ciDeviceCount);
    cdDevices = (cl_device_id *)malloc(ciDeviceCount * sizeof(cl_device_id));
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, ciDeviceCount, cdDevices, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        shrLog("Error: Failed to get OpenCL device IDs!\n");
        return ciErrNum;
    }

    //Create the context
    cxGPUContext = clCreateContext(0, ciDeviceCount, cdDevices, NULL, NULL, &ciErrNum);
    if (ciErrNum != CL_SUCCESS)
    {
        shrLog("Error: Failed to create OpenCL context!\n");
        return ciErrNum;
    }

    if (shrCheckCmdLineFlag(argc, (const char**)argv, "device"))
    {
        // User specified GPUs
        char* deviceList;
        char* deviceStr;
        char* next_token;
        shrGetCmdLineArgumentstr(argc, (const char**)argv, "device", &deviceList);

#ifdef WIN32
        deviceStr = strtok_s(deviceList, " ,.-", &next_token);
#else
        deviceStr = strtok(deviceList, " ,.-");
#endif
        ciDeviceCount = 0;
        while (deviceStr != NULL)
        {
            // get and print the device for this queue
            cl_device_id device = oclGetDev(cxGPUContext, atoi(deviceStr));
            if (device == (cl_device_id)-1)
            {
                shrLog(" Device %s does not exist!\n", deviceStr);
                return -1;
            }
            shrLog("Device %s: ", deviceStr);
            oclPrintDevName(LOGBOTH, device);
            shrLog("\n");

            // create command queue
            commandQueue[ciDeviceCount] = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
            if (ciErrNum != CL_SUCCESS)
            {
                shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
                return ciErrNum;
            }
            ++ciDeviceCount;

#ifdef WIN32
            deviceStr = strtok_s(NULL, " ,.-", &next_token);
#else
            deviceStr = strtok(NULL, " ,.-");
#endif
        }
        free(deviceList);
    }
    else
    {
        // Otherwise, compute on all GPUs available in the context
        size_t nDeviceBytes;
        ciErrNum |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes);
        ciDeviceCount = (cl_uint)nDeviceBytes / sizeof(cl_device_id);
        if (ciErrNum != CL_SUCCESS)
        {
            shrLog(" Error %i in clGetDeviceIDs call !!!\n\n", ciErrNum);
            return ciErrNum;
        }
        else if (ciDeviceCount == 0)
        {
            shrLog(" There are no devices supporting OpenCL (return code %i)\n\n", ciErrNum);
            return -1;
        }

        // create command-queues
        for (unsigned int i = 0; i < ciDeviceCount; ++i)
        {
            // get and print the device for this queue
            cl_device_id device = oclGetDev(cxGPUContext, i);
            shrLog("Device %d: ", i);
            oclPrintDevName(LOGBOTH, device);
            shrLog("\n");

            // create command queue
            commandQueue[i] = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
            if (ciErrNum != CL_SUCCESS)
            {
                shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
                return ciErrNum;
            }
        }
    }

    // Optional Command-line multiplier for matrix sizes
    shrGetCmdLineArgumenti(argc, (const char**)argv, "sizemult", &iSizeMultiple);
    iSizeMultiple = CLAMP(iSizeMultiple, 1, 10);
    uiWA = WA * iSizeMultiple;
    uiHA = HA * iSizeMultiple;
    uiWB = WB * iSizeMultiple;
    uiHB = HB * iSizeMultiple;
    uiWC = WC * iSizeMultiple;
    uiHC = HC * iSizeMultiple;
    shrLog("\nUsing Matrix Sizes: A(%u x %u), B(%u x %u), C(%u x %u)\n",
           uiWA, uiHA,
           uiWB, uiHB, uiWC, uiHC);

    // allocate host memory for matrices A and B
    unsigned int size_A = uiWA * uiHA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float* h_A_data = (float*)malloc(mem_size_A);

    unsigned int size_B = uiWB * uiHB;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float* h_B_data = (float*)malloc(mem_size_B);

    // initialize host memory
    srand(2006);
    shrFillArray(h_A_data, size_A);
    shrFillArray(h_B_data, size_B);

    // allocate host memory for result
    unsigned int size_C = uiWC * uiHC;
    unsigned int mem_size_C = sizeof(float) * size_C;
    float* h_C = (float*)malloc(mem_size_C);

    // create OpenCL buffer pointing to the host memory
    cl_mem h_A = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                                mem_size_A, h_A_data, &ciErrNum);
    if (ciErrNum != CL_SUCCESS)
    {
        shrLog("Error: clCreateBuffer\n");
        return ciErrNum;
    }

    // Program Setup
    size_t program_length;
    const char* header_path = shrFindFilePath("matrixMul.h", argv[0]);
    oclCheckError(header_path != NULL, shrTRUE);
    char* header = oclLoadProgSource(header_path, "", &program_length);
    if (!header)
    {
        shrLog("Error: Failed to load the header %s!\n", header_path);
        return -1000;
    }
    const char* source_path = shrFindFilePath("matrixMul.cl", argv[0]);
    oclCheckError(source_path != NULL, shrTRUE);
    char* source = oclLoadProgSource(source_path, header, &program_length);
    if (!source)
    {
        shrLog("Error: Failed to load compute program %s!\n", source_path);
        return -2000;
    }

    // create the program
    cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&source,
                                                     &program_length, &ciErrNum);
    if (ciErrNum != CL_SUCCESS)
    {
        shrLog("Error: Failed to create program\n");
        return ciErrNum;
    }
    free(header);
    free(source);

    // build the program
    ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        // write out standard error, Build Log and PTX, then return error
        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclMatrixMul.ptx");
        return ciErrNum;
    }

    // write out PTX if requested on the command line
    if (shrCheckCmdLineFlag(argc, argv, "dump-ptx"))
    {
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclMatrixMul.ptx");
    }

    // Create Kernel
    for (unsigned int i = 0; i < ciDeviceCount; ++i)
    {
        multiplicationKernel[i] = clCreateKernel(cpProgram, "matrixMul", &ciErrNum);
        if (ciErrNum != CL_SUCCESS)
        {
            shrLog("Error: Failed to create kernel\n");
            return ciErrNum;
        }
    }

    // Run multiplication on 1..deviceCount GPUs to compare improvement
    shrLog("\nRunning Computations on 1 - %d GPU's...\n\n", ciDeviceCount);
    for (unsigned int k = 1; k <= ciDeviceCount; ++k)
    {
        matrixMulGPU(k, h_A, h_B_data, mem_size_B, h_C);
    }

    // compute reference solution
    shrLog("Comparing results with CPU computation...\n\n");
    float* reference = (float*)malloc(mem_size_C);
    computeGold(reference, h_A_data, h_B_data, uiHA, uiWA, uiWB);

    // check result
    shrBOOL res = shrCompareL2fe(reference, h_C, size_C, 1.0e-6f);
    if (res != shrTRUE)
    {
        printDiff(reference, h_C, uiWC, uiHC, 100, 1.0e-5f);
    }

    // clean up OCL resources
    ciErrNum = clReleaseMemObject(h_A);
    for (unsigned int k = 0; k < ciDeviceCount; ++k)
    {
        ciErrNum |= clReleaseKernel(multiplicationKernel[k]);
        ciErrNum |= clReleaseCommandQueue(commandQueue[k]);
    }
    ciErrNum |= clReleaseProgram(cpProgram);
    ciErrNum |= clReleaseContext(cxGPUContext);
    if (ciErrNum != CL_SUCCESS)
    {
        shrLog("Error: Failure releasing OpenCL resources: %d\n", ciErrNum);
        return ciErrNum;
    }

    // clean up memory
    free(h_A_data);
    free(h_B_data);
    free(h_C);
    free(reference);

    return ((shrTRUE == res) ? CL_SUCCESS : -3000);
}
int main(int argc, const char** argv)
{
    cl_uint platform_count;
    cl_platform_id platforms[5];
    cl_int err = CL_SUCCESS;
    unsigned int i, p;

    cl_device_type dev_type = CL_DEVICE_TYPE_ALL;

    void * ptrs[BLOCKS];
    cl_command_queue cqs[BLOCKS];
    cl_mem d_A[BLOCKS];
    cl_mem d_C[BLOCKS];
    cl_mem d_B[BLOCKS];

    cl_event GPUDone[BLOCKS];
    cl_event GPUExecution[BLOCKS];
    struct timeval start, end;

    int workOffset[BLOCKS];
    int workSize[BLOCKS];

    unsigned int sizePerGPU = HC / BLOCKS;
    unsigned int sizeMod = HC % BLOCKS;

    size_t A_size = WA * HA;
    size_t A_mem_size = sizeof(TYPE) * A_size;
    TYPE* A_data;

    size_t B_size = WB * HB;
    size_t B_mem_size = sizeof(TYPE) * B_size;
    TYPE* B_data;

    size_t C_size = WC * HC;
    size_t C_mem_size = sizeof(TYPE) * C_size;
    TYPE* C_data;

    parse_args(argc, argv);

    check(clGetPlatformIDs(5, platforms, &platform_count));
    if (platform_count == 0)
    {
        printf("No platform found\n");
        exit(77);
    }

    cl_uint device_count;
    cl_uint devs[platform_count];
    cl_device_id * devices[platform_count];
    cl_context ctx[platform_count];
    cl_command_queue * commandQueue[platform_count];

    device_count = 0;
    for (p = 0; p < platform_count; p++)
    {
        cl_platform_id platform = platforms[p];

        err = clGetDeviceIDs(platform, dev_type, 0, NULL, &devs[p]);
        if (err == CL_DEVICE_NOT_FOUND)
        {
            devs[p] = 0;
            continue;
        }
        if (devs[p] == 0)
        {
            printf("No OpenCL device found\n");
            exit(77);
        }
        if (err != CL_SUCCESS)
        {
            fprintf(stderr, "OpenCL Error (%d) in clGetDeviceIDs()\n", err);
            exit(EXIT_FAILURE);
        }
        if (devs[p] == 0)
            continue;

        devices[p] = (cl_device_id*)malloc(sizeof(cl_device_id) * devs[p]);
        commandQueue[p] = (cl_command_queue*)malloc(sizeof(cl_command_queue) * devs[p]);

        check(clGetDeviceIDs(platform, dev_type, devs[p], devices[p], NULL));

        cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0};
        check2(ctx[p] = clCreateContext(properties, devs[p], devices[p], NULL, NULL, &err));

        for (i = 0; i < devs[p]; ++i)
        {
            cl_device_id device = devices[p][i];
            char name[2048];
            name[0] = '\0';
            clGetDeviceInfo(device, CL_DEVICE_NAME, 2048, name, NULL);
            printf("Device %d: %s\n", i, name);

            check2(commandQueue[p][i] = clCreateCommandQueue(ctx[p], device,
                   CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err));
        }

        device_count += devs[p];
    }

    if (device_count == 0)
        error("No device found\n");

    cl_kernel multiplicationKernel[platform_count];

    printf("\nUsing Matrix Sizes: A(%lu x %lu), B(%lu x %lu), C(%lu x %lu)\n",
           (unsigned long)WA, (unsigned long)HA,
           (unsigned long)WB, (unsigned long)HB,
           (unsigned long)WC, (unsigned long)HC);

    // allocate host memory for matrices A, B and C
    A_data = (TYPE*)malloc(A_mem_size);
    if (A_data == NULL)
    {
        perror("malloc");
    }

    B_data = (TYPE*)malloc(B_mem_size);
    if (B_data == NULL)
    {
        perror("malloc");
    }

    C_data = (TYPE*)malloc(C_mem_size);
    if (C_data == NULL)
    {
        perror("malloc");
    }

    cl_program program[platform_count];

    for (p = 0; p < platform_count; p++)
    {
        if (devs[p] == 0)
            continue;

        check2(program[p] = clCreateProgramWithSource(ctx[p], 1, (const char **)&code, NULL, &err));
        check(clBuildProgram(program[p], 0, NULL, NULL, NULL, NULL));
        check2(multiplicationKernel[p] = clCreateKernel(program[p], "sgemmNN", &err));
    }

    printf("Initializing data...\n");
    srand(2008);
    fillArray(A_data, A_size);
    fillArray(B_data, B_size);
    memset(C_data, 0, C_mem_size);

    printf("Computing...\n");
    workOffset[0] = 0;
    gettimeofday(&start, NULL);

    size_t localWorkSize[] = {BLOCK_SIZE, BLOCK_SIZE};
    int c = 0;
    for (p = 0; p < platform_count; p++)
    {
        for (i = 0; i < devs[p]; i++)
        {
            check2(d_B[c] = clCreateBuffer(ctx[p], CL_MEM_READ_ONLY |
                   CL_MEM_USE_HOST_PTR, HB * WB * sizeof(TYPE), B_data, &err));
            c++;
        }
    }

    for (i = 0; i < BLOCKS; ++i)
    {
        int d = i % device_count;
        cl_uint platform = 0;

        // determine device platform
        int dev = d;
        for (platform = 0; platform < platform_count; platform++)
        {
            if ((cl_int)(dev - devs[platform]) < 0)
                break;
            dev -= devs[platform];
        }

        workSize[i] = (i < sizeMod) ? sizePerGPU + 1 : sizePerGPU;

        check2(d_A[i] = clCreateBuffer(ctx[platform], CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                                       workSize[i] * WA * sizeof(TYPE), &A_data[workOffset[i] * WA], &err));
        check2(d_C[i] = clCreateBuffer(ctx[platform], CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
                                       workSize[i] * WC * sizeof(TYPE), &C_data[workOffset[i] * WC], &err));

        check(clSetKernelArg(multiplicationKernel[platform], 0, sizeof(cl_int), &workSize[i]));
        check(clSetKernelArg(multiplicationKernel[platform], 1, sizeof(cl_int), &workSize[i]));
        check(clSetKernelArg(multiplicationKernel[platform], 2, sizeof(cl_int), &workSize[i]));
        check(clSetKernelArg(multiplicationKernel[platform], 3, sizeof(cl_mem), (void *) &d_A[i]));
        check(clSetKernelArg(multiplicationKernel[platform], 4, sizeof(cl_mem), (void *) &d_B[d]));
        check(clSetKernelArg(multiplicationKernel[platform], 5, sizeof(cl_mem), (void *) &d_C[i]));

        size_t globalWorkSize[] = {roundUp(BLOCK_SIZE, WC), roundUp(BLOCK_SIZE, workSize[i])};

        check(clEnqueueNDRangeKernel(commandQueue[platform][dev], multiplicationKernel[platform], 2,
                                     NULL, globalWorkSize, localWorkSize, 0, NULL, &GPUExecution[i]));

        // Non-blocking copy of result from device to host
        cqs[i] = commandQueue[platform][dev];
        check2(ptrs[i] = clEnqueueMapBuffer(cqs[i], d_C[i], CL_FALSE, CL_MAP_READ, 0,
                                            WC * sizeof(TYPE) * workSize[i], 1, &GPUExecution[i], &GPUDone[i], &err));

        if (i + 1 < BLOCKS)
            workOffset[i + 1] = workOffset[i] + workSize[i];
    }

    // CPU sync with GPU
    for (p = 0; p < platform_count; p++)
    {
        cl_uint dev;
        for (dev = 0; dev < devs[p]; dev++)
        {
            clFinish(commandQueue[p][dev]);
        }
    }

    gettimeofday(&end, NULL);
    double timing = (double)((end.tv_sec - start.tv_sec) * 1000000 + (end.tv_usec - start.tv_usec));

    double dSeconds = timing / 1000 / 1000;
    double dNumOps = 2.0 * (double)WA * (double)HA * (double)WB;
    double gflops = 1.0e-9 * dNumOps / dSeconds;

    printf("Throughput = %.4f GFlops/s, Time = %.5f s, Size = %.0f, NumDevsUsed = %d, Blocks = %ld, Workgroup = %zu\n",
           gflops, dSeconds, dNumOps, device_count, BLOCKS, localWorkSize[0] * localWorkSize[1]);

    // compute reference solution
    if (check)
    {
        printf("Comparing results with CPU computation... ");
        TYPE* reference = (TYPE*)malloc(C_mem_size);
        computeReference(reference, A_data, B_data, HA, WA, WB);

        // check result
        int res = shrCompareL2fe(reference, C_data, C_size, 1.0e-6f);
        if (res == 0)
        {
            printf("\n\n");
            printDiff(reference, C_data, WC, HC, 100, 1.0e-5f);
        }
        else
            printf("PASSED\n\n");
        free(reference);
    }

    for (i = 0; i < BLOCKS; i++)
    {
        clEnqueueUnmapMemObject(cqs[i], d_C[i], ptrs[i], 0, NULL, NULL);
    }

    for (i = 0; i < BLOCKS; i++)
    {
        clFinish(cqs[i]);
    }

    for (i = 0; i < device_count; i++)
    {
        clReleaseMemObject(d_B[i]);
    }

    for (i = 0; i < BLOCKS; i++)
    {
        clReleaseMemObject(d_A[i]);
        clReleaseMemObject(d_C[i]);
        clReleaseEvent(GPUExecution[i]);
        clReleaseEvent(GPUDone[i]);
    }

    for (p = 0; p < platform_count; p++)
    {
        if (devs[p] == 0)
            continue;

        check(clReleaseKernel(multiplicationKernel[p]));
        check(clReleaseProgram(program[p]));
        check(clReleaseContext(ctx[p]));

        cl_uint k;
        for (k = 0; k < devs[p]; ++k)
        {
            check(clReleaseCommandQueue(commandQueue[p][k]));
        }
    }

    free(A_data);
    free(B_data);
    free(C_data);

    return 0;
}
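// The check()/check2() helpers used throughout this program are not defined in this
// listing. A minimal sketch of the assumed error-handling macros (abort on any
// non-CL_SUCCESS status; check2() is for calls that return their status through the
// local err variable passed as the trailing &err argument) might look like this:

#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>

#define check(exp)                                                        \
    do {                                                                  \
        cl_int _status = (exp);                                           \
        if (_status != CL_SUCCESS) {                                      \
            fprintf(stderr, "OpenCL Error (%d) at %s:%d\n",               \
                    _status, __FILE__, __LINE__);                         \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

#define check2(exp)                                                       \
    do {                                                                  \
        (exp);                      /* err is set by the wrapped call */  \
        if (err != CL_SUCCESS) {                                          \
            fprintf(stderr, "OpenCL Error (%d) at %s:%d\n",               \
                    err, __FILE__, __LINE__);                             \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)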