// Launches the high-pass filter kernel on the transfer image.
// Binds the image buffer, both mask buffers, local-memory scratch space and
// the geometry scalars, then enqueues a 2D NDRange sized to cover the image.
// Accumulated OpenCL status is left in GPUError; returns false if any
// argument binding or the enqueue itself fails, true otherwise.
bool HighpassFilter::filter(cl_command_queue GPUCommandQueue)
{
    // Local-memory tile pitch: block width plus a one-pixel apron per side.
    int pitch = iBlockDimX + 2;

    GPUError  = clSetKernelArg(GPUFilter, 0, sizeof(cl_mem), (void*)&GPUTransfer->cmDevBuf);
    GPUError |= clSetKernelArg(GPUFilter, 1, sizeof(cl_mem), (void*)&cmDevBufMaskV);
    GPUError |= clSetKernelArg(GPUFilter, 2, sizeof(cl_mem), (void*)&cmDevBufMaskH);
    // Local-memory scratch tile: (pitch) x (blockDimY + 2) pixels, all channels.
    GPUError |= clSetKernelArg(GPUFilter, 3,
                               pitch * (iBlockDimY + 2) * GPUTransfer->nChannels * sizeof(cl_uchar),
                               NULL);
    // Two 9-element int scratch areas in local memory (mask staging).
    GPUError |= clSetKernelArg(GPUFilter, 4, 9 * sizeof(int), NULL);
    GPUError |= clSetKernelArg(GPUFilter, 5, 9 * sizeof(int), NULL);
    GPUError |= clSetKernelArg(GPUFilter, 6, sizeof(cl_int), (void*)&pitch);
    GPUError |= clSetKernelArg(GPUFilter, 7, sizeof(cl_uint), (void*)&GPUTransfer->ImageWidth);
    GPUError |= clSetKernelArg(GPUFilter, 8, sizeof(cl_uint), (void*)&GPUTransfer->ImageHeight);
    GPUError |= clSetKernelArg(GPUFilter, 9, sizeof(cl_int), (void*)&GPUTransfer->nChannels);
    if (GPUError)
        return false;

    // Work-group geometry: one block per group, global size rounded up so the
    // grid covers the whole image.
    size_t localSize[2];
    localSize[0] = iBlockDimX;
    localSize[1] = iBlockDimY;
    GPUGlobalWorkSize[0] = shrRoundUp((int)localSize[0], GPUTransfer->ImageWidth);
    GPUGlobalWorkSize[1] = shrRoundUp((int)localSize[1], (int)GPUTransfer->ImageHeight);

    // CL_SUCCESS is 0, so any non-zero enqueue status means failure.
    return clEnqueueNDRangeKernel(GPUCommandQueue, GPUFilter, 2, NULL,
                                  GPUGlobalWorkSize, localSize, 0, NULL, NULL) == 0;
}
// Kernel function //***************************************************************************** int executeKernel(cl_int radius) { // set global and local work item dimensions szLocalWorkSize[0] = 16; szLocalWorkSize[1] = 16; szGlobalWorkSize[0] = shrRoundUp((int)szLocalWorkSize[0], image_width); szGlobalWorkSize[1] = shrRoundUp((int)szLocalWorkSize[1], image_height); // set the args values cl_int tilew = (cl_int)szLocalWorkSize[0]+(2*radius); ciErrNum = clSetKernelArg(ckKernel, 4, sizeof(tilew), &tilew); ciErrNum |= clSetKernelArg(ckKernel, 5, sizeof(radius), &radius); cl_float threshold = 0.8f; ciErrNum |= clSetKernelArg(ckKernel, 6, sizeof(threshold), &threshold); cl_float highlight = 4.0f; ciErrNum |= clSetKernelArg(ckKernel, 7, sizeof(highlight), &highlight); // Local memory ciErrNum |= clSetKernelArg(ckKernel, 8, (szLocalWorkSize[0]+(2*16))*(szLocalWorkSize[1]+(2*16))*sizeof(int), NULL); // launch computation kernel #ifdef GPU_PROFILING int nIter = 30; for( int i=-1; i< nIter; ++i) { if( i ==0 ) shrDeltaT(0); #endif ciErrNum |= clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 2, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL); #ifdef GPU_PROFILING } clFinish(cqCommandQueue); double dSeconds = shrDeltaT(0)/(double)nIter; double dNumTexels = (double)image_width * (double)image_height; double mtexps = 1.0e-6 * dNumTexels/dSeconds; shrLogEx(LOGBOTH | MASTER, 0, "oclPostprocessGL, Throughput = %.4f MTexels/s, Time = %.5f s, Size = %.0f Texels, NumDevsUsed = %u, Workgroup = %u\n", mtexps, dSeconds, dNumTexels, uiNumDevsUsed, szLocalWorkSize[0] * szLocalWorkSize[1]); #endif oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); return 0; }
bool Subtract::Process() { size_t GPULocalWorkSize[2]; GPULocalWorkSize[0] = iBlockDimX; GPULocalWorkSize[1] = iBlockDimY; GPUGlobalWorkSize[0] = shrRoundUp((int)GPULocalWorkSize[0], (int)imageWidth); GPUGlobalWorkSize[1] = shrRoundUp((int)GPULocalWorkSize[1], (int)imageHeight); int iLocalPixPitch = iBlockDimX + 2; GPUError = clSetKernelArg(GPUKernel, 0, sizeof(cl_mem), (void*)&GPU::getInstance().buffersListIn[0]); GPUError = clSetKernelArg(GPUKernel, 1, sizeof(cl_mem), (void*)&GPU::getInstance().buffersListIn[1]); GPUError = clSetKernelArg(GPUKernel, 2, sizeof(cl_mem), (void*)&GPU::getInstance().buffersListOut[0]); GPUError |= clSetKernelArg(GPUKernel, 3, sizeof(cl_uint), (void*)&imageWidth); GPUError |= clSetKernelArg(GPUKernel, 4, sizeof(cl_uint), (void*)&imageHeight); if(GPUError) return false; if(clEnqueueNDRangeKernel( GPUCommandQueue, GPUKernel, 2, NULL, GPUGlobalWorkSize, GPULocalWorkSize, 0, NULL, NULL)) return false; return true; }
// Launches the RGB2HSV kernel (color-space conversion, per the class name --
// the kernel source is not visible here) on the transfer image buffer.
// Binds the image buffer and its dimensions, then enqueues a 2D NDRange
// covering the image. Returns false if argument setup or the enqueue fails.
// Accumulated status is left in GPUError.
bool RGB2HSV::filter(cl_command_queue GPUCommandQueue)
{
    // Removed unused local `iLocalPixPitch` -- unlike the other filters,
    // this kernel takes no local-pitch argument.
    GPUError  = clSetKernelArg(GPUFilter, 0, sizeof(cl_mem), (void*)&GPUTransfer->cmDevBuf);
    GPUError |= clSetKernelArg(GPUFilter, 1, sizeof(cl_uint), (void*)&GPUTransfer->ImageWidth);
    GPUError |= clSetKernelArg(GPUFilter, 2, sizeof(cl_uint), (void*)&GPUTransfer->ImageHeight);
    if (GPUError != 0)
        return false;

    // Work-group geometry: one block per group, global size rounded up to
    // cover the image.
    size_t GPULocalWorkSize[2];
    GPULocalWorkSize[0] = iBlockDimX;
    GPULocalWorkSize[1] = iBlockDimY;
    GPUGlobalWorkSize[0] = shrRoundUp((int)GPULocalWorkSize[0], GPUTransfer->ImageWidth);
    GPUGlobalWorkSize[1] = shrRoundUp((int)GPULocalWorkSize[1], (int)GPUTransfer->ImageHeight);

    if (clEnqueueNDRangeKernel(GPUCommandQueue, GPUFilter, 2, NULL,
                               GPUGlobalWorkSize, GPULocalWorkSize, 0, NULL, NULL))
        return false;
    return true;
}
// Multi-GPU matrix multiply: splits the uiHA rows of A (and thus of C) evenly
// across ciDeviceCount devices, runs the multiplication kernel on each, then
// gathers the row slices of C back into h_C.
//   ciDeviceCount : number of devices/queues/kernels to use
//   h_A           : device-side buffer holding the full A matrix (uiHA x uiWA)
//   h_B_data      : host pointer to the full B matrix, copied to every device
//   mem_size_B    : byte size of B
//   h_C           : host output for the full C matrix (uiHA x uiWC)
// NOTE(review): clCreateBuffer / clEnqueueCopyBuffer error codes are ignored
// (NULL error pointers) -- failures surface only as bad results downstream.
void matrixMulGPU(cl_uint ciDeviceCount, cl_mem h_A, float* h_B_data, unsigned int mem_size_B, float* h_C )
{
    cl_mem d_A[MAX_GPU_COUNT];
    cl_mem d_C[MAX_GPU_COUNT];
    cl_mem d_B[MAX_GPU_COUNT];
    cl_event GPUDone[MAX_GPU_COUNT];
    cl_event GPUExecution[MAX_GPU_COUNT];

    // Start the computation on each available GPU
    // Create buffers for each GPU
    // Each GPU will compute sizePerGPU rows of the result
    int sizePerGPU = uiHA / ciDeviceCount;
    int workOffset[MAX_GPU_COUNT];
    int workSize[MAX_GPU_COUNT];
    workOffset[0] = 0;
    for(unsigned int i=0; i < ciDeviceCount; ++i)
    {
        // Input buffer; the last device absorbs the remainder rows.
        workSize[i] = (i != (ciDeviceCount - 1)) ? sizePerGPU : (uiHA - workOffset[i]);
        d_A[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, workSize[i] * sizeof(float) * uiWA, NULL,NULL);

        // Copy only assigned rows from host to device
        clEnqueueCopyBuffer(commandQueue[i], h_A, d_A[i], workOffset[i] * sizeof(float) * uiWA,
                            0, workSize[i] * sizeof(float) * uiWA, 0, NULL, NULL);

        // create OpenCL buffer on device that will be initiatlize from the host memory on first use
        // on device
        d_B[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                mem_size_B, h_B_data, NULL);

        // Output buffer
        d_C[i] = clCreateBuffer(cxGPUContext,  CL_MEM_WRITE_ONLY, workSize[i] * uiWC * sizeof(float), NULL,NULL);

        // set the args values (args 3/4 are local-memory tiles, arg_value NULL)
        clSetKernelArg(multiplicationKernel[i], 0, sizeof(cl_mem), (void *) &d_C[i]);
        clSetKernelArg(multiplicationKernel[i], 1, sizeof(cl_mem), (void *) &d_A[i]);
        clSetKernelArg(multiplicationKernel[i], 2, sizeof(cl_mem), (void *) &d_B[i]);
        clSetKernelArg(multiplicationKernel[i], 3, sizeof(float) * BLOCK_SIZE *BLOCK_SIZE, 0 );
        clSetKernelArg(multiplicationKernel[i], 4, sizeof(float) * BLOCK_SIZE *BLOCK_SIZE, 0 );
        clSetKernelArg(multiplicationKernel[i], 5, sizeof(cl_int), (void *) &uiWA);
        clSetKernelArg(multiplicationKernel[i], 6, sizeof(cl_int), (void *) &uiWB);

        if(i+1 < ciDeviceCount)
            workOffset[i + 1] = workOffset[i] + workSize[i];
    }
    // Execute Multiplication on all GPUs in parallel
    size_t localWorkSize[] = {BLOCK_SIZE, BLOCK_SIZE};
    size_t globalWorkSize[] = {shrRoundUp(BLOCK_SIZE, uiWC), shrRoundUp(BLOCK_SIZE, workSize[0])};

    // Launch kernels on devices
#ifdef GPU_PROFILING
    int nIter = 30;
    // j == -1 is a warm-up iteration; the timer starts at j == 0 after all
    // queues are drained.
    for (int j = -1; j < nIter; j++)
    {
        // Sync all queues to host and start timer first time through loop
        if(j == 0){
            for(unsigned int i = 0; i < ciDeviceCount; i++)
            {
                clFinish(commandQueue[i]);
            }
            shrDeltaT(0);
        }
#endif
        for(unsigned int i = 0; i < ciDeviceCount; i++)
        {
            // Multiplication - non-blocking execution:  launch and push to device(s)
            globalWorkSize[1] = shrRoundUp(BLOCK_SIZE, workSize[i]);
            clEnqueueNDRangeKernel(commandQueue[i], multiplicationKernel[i], 2, 0,
                                   globalWorkSize, localWorkSize, 0, NULL, &GPUExecution[i]);
            clFlush(commandQueue[i]);
        }
#ifdef GPU_PROFILING
    }
#endif
    // sync all queues to host
    for(unsigned int i = 0; i < ciDeviceCount; i++)
    {
        clFinish(commandQueue[i]);
    }

#ifdef GPU_PROFILING
    // stop and log timer
    double dSeconds = shrDeltaT(0)/(double)nIter;
    double dNumOps = 2.0 * (double)uiWA * (double)uiHA * (double)uiWB;
    double gflops = 1.0e-9 * dNumOps/dSeconds;
    shrLogEx(LOGBOTH | MASTER, 0, "oclMatrixMul, Throughput = %.4f GFlops/s, Time = %.5f s, Size = %.0f, NumDevsUsed = %d, Workgroup = %u\n",
             gflops, dSeconds, dNumOps, ciDeviceCount, localWorkSize[0] * localWorkSize[1]);

    // Print kernel timing per GPU
    shrLog("\n");
    for(unsigned int i = 0; i < ciDeviceCount; i++)
    {
        shrLog("  Kernel execution time on GPU %d \t: %.5f s\n", i, executionTime(GPUExecution[i]));
    }
    shrLog("\n");
#endif

    for(unsigned int i = 0; i < ciDeviceCount; i++)
    {
        // Non-blocking copy of result from device to host; each slice lands at
        // its row offset in h_C.
        clEnqueueReadBuffer(commandQueue[i], d_C[i], CL_FALSE, 0, uiWC * sizeof(float) * workSize[i],
                            h_C + workOffset[i] * uiWC, 0, NULL, &GPUDone[i]);
    }

    // CPU sync with GPU
    clWaitForEvents(ciDeviceCount, GPUDone);

    // Release mem and event objects
    for(unsigned int i = 0; i < ciDeviceCount; i++)
    {
        clReleaseMemObject(d_A[i]);
        clReleaseMemObject(d_C[i]);
        clReleaseMemObject(d_B[i]);
        clReleaseEvent(GPUExecution[i]);
        clReleaseEvent(GPUDone[i]);
    }
}
//-----------------------------------------------------------------------------
//! Run the CL part of the computation
//-----------------------------------------------------------------------------
// Advances the animation time `t` and refreshes three GL-shared targets via
// OpenCL kernels: the 2D texture, the volume texture, and the six faces of
// the cube map. With USE_STAGING_BUFFER the kernels write into plain buffers
// which are then copied into the GL textures; the volume path always uses a
// staging copy.
// NOTE(review): ciErrNum is a file-scope status accumulated with '|=' and is
// never reset on entry here -- confirm it is CL_SUCCESS before this runs.
void RunKernels()
{
    // Animation time, persists across calls; advanced at the end.
    static float t = 0.0f;

    // ----------------------------------------------------------------
    // populate the 2d texture
    {
        // set global and local work item dimensions: 16x16 groups covering the texture
        szLocalWorkSize[0] = 16;
        szLocalWorkSize[1] = 16;
        szGlobalWorkSize[0] = shrRoundUp((int)szLocalWorkSize[0], g_texture_2d.width);
        szGlobalWorkSize[1] = shrRoundUp((int)szLocalWorkSize[1], g_texture_2d.height);

        // set the args values (arg 0 is the write target: staging buffer or GL texture)
#ifdef USE_STAGING_BUFFER
        ciErrNum |= clSetKernelArg(ckKernel_tex2d, 0, sizeof(g_texture_2d.clMem), (void *) &(g_texture_2d.clMem));
#else
        ciErrNum |= clSetKernelArg(ckKernel_tex2d, 0, sizeof(g_texture_2d.clTexture), (void *) &(g_texture_2d.clTexture));
#endif
        ciErrNum |= clSetKernelArg(ckKernel_tex2d, 1, sizeof(g_texture_2d.clTexture), (void *) &(g_texture_2d.clTexture));
        ciErrNum |= clSetKernelArg(ckKernel_tex2d, 2, sizeof(g_texture_2d.width), &g_texture_2d.width);
        ciErrNum |= clSetKernelArg(ckKernel_tex2d, 3, sizeof(g_texture_2d.height), &g_texture_2d.height);
        ciErrNum |= clSetKernelArg(ckKernel_tex2d, 4, sizeof(g_texture_2d.pitch), &g_texture_2d.pitch);
        ciErrNum |= clSetKernelArg(ckKernel_tex2d, 5, sizeof(t), &t);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // launch computation kernel
        ciErrNum |= clEnqueueNDRangeKernel(cqCommandQueue, ckKernel_tex2d, 2, NULL,
                                           szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

#ifdef USE_STAGING_BUFFER
        // Copy the staging buffer into the GL-shared image.
        size_t dst[3] = { 0, 0, 0};
        size_t region[3] = { g_texture_2d.width, g_texture_2d.height, 1};
        ciErrNum |= clEnqueueCopyBufferToImage(cqCommandQueue,
                                               g_texture_2d.clMem /* src_buffer */,
                                               g_texture_2d.clTexture /* dst_image */,
                                               0 /* src_offset */,
                                               dst /* dst_origin[3] */,
                                               region /* region[3] */,
                                               0 /* num_events_in_wait_list */,
                                               NULL /* event_wait_list */,
                                               NULL /* event */);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
#endif
    }

    // ----------------------------------------------------------------
    // populate the volume texture
    {
        // set global and local work item dimensions (2D launch over width x height)
        szLocalWorkSize[0] = 16;
        szLocalWorkSize[1] = 16;
        szGlobalWorkSize[0] = shrRoundUp((int)szLocalWorkSize[0], g_texture_vol.width);
        szGlobalWorkSize[1] = shrRoundUp((int)szLocalWorkSize[1], g_texture_vol.height);

        // set the args values
        ciErrNum |= clSetKernelArg(ckKernel_texvolume, 0, sizeof(g_texture_vol.clMem), (void *) &(g_texture_vol.clMem));
        ciErrNum |= clSetKernelArg(ckKernel_texvolume, 1, sizeof(g_texture_vol.width), &g_texture_vol.width);
        ciErrNum |= clSetKernelArg(ckKernel_texvolume, 2, sizeof(g_texture_vol.height), &g_texture_vol.height);
        ciErrNum |= clSetKernelArg(ckKernel_texvolume, 3, sizeof(g_texture_vol.depth), &g_texture_vol.depth);
        ciErrNum |= clSetKernelArg(ckKernel_texvolume, 4, sizeof(g_texture_vol.pitch), &g_texture_vol.pitch);
        ciErrNum |= clSetKernelArg(ckKernel_texvolume, 5, sizeof(g_texture_vol.pitchslice), &g_texture_vol.pitchslice);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // launch computation kernel
        ciErrNum |= clEnqueueNDRangeKernel(cqCommandQueue, ckKernel_texvolume, 2, NULL,
                                           szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // ONLY staging buffer works, for volume texture
        // do the copy here
        size_t dst[3] = { 0, 0, 0};
        size_t region[3] = { g_texture_vol.width, g_texture_vol.height, g_texture_vol.depth};
        ciErrNum |= clEnqueueCopyBufferToImage(cqCommandQueue,
                                               g_texture_vol.clMem /* src_buffer */,
                                               g_texture_vol.clTexture /* dst_image */,
                                               0 /* src_offset */,
                                               dst /* dst_origin[3] */,
                                               region /* region[3] */,
                                               0 /* num_events_in_wait_list */,
                                               NULL /* event_wait_list */,
                                               NULL /* event */);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    }

    // ----------------------------------------------------------------
    // populate the faces of the cube map
    for (int face = 0; face < 6; ++face)
    {
        // set global and local work item dimensions (square face: size x size)
        szLocalWorkSize[0] = 16;
        szLocalWorkSize[1] = 16;
        szGlobalWorkSize[0] = shrRoundUp((int)szLocalWorkSize[0], g_texture_cube.size);
        szGlobalWorkSize[1] = shrRoundUp((int)szLocalWorkSize[1], g_texture_cube.size);

        // set the args values (arg 0 is this face's write target)
#ifdef USE_STAGING_BUFFER
        ciErrNum |= clSetKernelArg(ckKernel_texcube, 0, sizeof(g_texture_cube.clMem[face]), (void *) &(g_texture_cube.clMem[face]));
#else
        ciErrNum |= clSetKernelArg(ckKernel_texcube, 0, sizeof(g_texture_cube.clTexture[face]), (void *) &(g_texture_cube.clTexture[face]));
#endif
        ciErrNum |= clSetKernelArg(ckKernel_texcube, 1, sizeof(g_texture_cube.size), &g_texture_cube.size);
        ciErrNum |= clSetKernelArg(ckKernel_texcube, 2, sizeof(g_texture_cube.pitch), &g_texture_cube.pitch);
        ciErrNum |= clSetKernelArg(ckKernel_texcube, 3, sizeof(int), &face);
        ciErrNum |= clSetKernelArg(ckKernel_texcube, 4, sizeof(t), &t);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // launch computation kernel
        ciErrNum |= clEnqueueNDRangeKernel(cqCommandQueue, ckKernel_texcube, 2, NULL,
                                           szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

#ifdef USE_STAGING_BUFFER
        // Copy this face's staging buffer into its GL-shared image.
        size_t dst[3] = { 0, 0, 0};
        size_t region[3] = { g_texture_cube.size, g_texture_cube.size, 1};
        ciErrNum |= clEnqueueCopyBufferToImage(cqCommandQueue,
                                               g_texture_cube.clMem[face]/* src_buffer */,
                                               g_texture_cube.clTexture[face]/* dst_image */,
                                               0 /* src_offset */,
                                               dst /* dst_origin[3] */,
                                               region /* region[3] */,
                                               0 /* num_events_in_wait_list */,
                                               NULL /* event_wait_list */,
                                               NULL /* event */);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
#endif
    }

    t += 0.1f;
}
// Main function // ********************************************************************* int main(int argc, char **argv) { gp_argc = &argc; gp_argv = &argv; shrQAStart(argc, argv); // Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); shrLog("clGetPlatformID...\n"); // Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); shrLog("clGetPlatformID...\n"); //Get all the devices cl_uint uiNumDevices = 0; // Number of devices available cl_uint uiTargetDevice = 0; // Default Device to compute on cl_uint uiNumComputeUnits; // Number of compute units (SM's on NV GPU) shrLog("Get the Device info and select Device...\n"); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) ); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); // Get command line device options and config accordingly shrLog(" # of Devices Available = %u\n", uiNumDevices); if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice)== shrTRUE) { uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1)); } shrLog(" Using Device %u: ", uiTargetDevice); oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]); ciErrNum = clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); shrLog("\n # of Compute Units = %u\n", uiNumComputeUnits); // get command line arg for quick test, if provided bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt"); // start logs cExecutableName = argv[0]; shrSetLogFileName ("oclDotProduct.txt"); shrLog("%s Starting...\n\n# of float elements per Array \t= %u\n", argv[0], iNumElements); // set and log 
Global and Local work size dimensions szLocalWorkSize = 256; szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements); // rounded up to the nearest multiple of the LocalWorkSize shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n", szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize)); // Allocate and initialize host arrays shrLog( "Allocate and Init Host Mem...\n"); srcA = (void *)malloc(sizeof(cl_float4) * szGlobalWorkSize); srcB = (void *)malloc(sizeof(cl_float4) * szGlobalWorkSize); dst = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize); Golden = (void *)malloc(sizeof(cl_float) * iNumElements); shrFillArray((float*)srcA, 4 * iNumElements); shrFillArray((float*)srcB, 4 * iNumElements); // Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Get a GPU device ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevices[uiTargetDevice], NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Create the context cxGPUContext = clCreateContext(0, 1, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Create a command-queue shrLog("clCreateCommandQueue...\n"); cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Allocate the OpenCL buffer memory objects for source and result on the device GMEM shrLog("clCreateBuffer (SrcA, SrcB and Dst in Device GMEM)...\n"); cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize * 4, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize * 4, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, 
sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Read the OpenCL kernel in from source file shrLog("oclLoadProgSource (%s)...\n", cSourceFile); cPathAndName = shrFindFilePath(cSourceFile, argv[0]); oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup); cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength); oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup); // Create the program shrLog("clCreateProgramWithSource...\n"); cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum); // Build the program with 'mad' Optimization option #ifdef MAC char* flags = "-cl-fast-relaxed-math -DMAC"; #else char* flags = "-cl-fast-relaxed-math"; #endif shrLog("clBuildProgram...\n"); ciErrNum = clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then cleanup and exit shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclDotProduct.ptx"); Cleanup(EXIT_FAILURE); } // Create the kernel shrLog("clCreateKernel (DotProduct)...\n"); ckKernel = clCreateKernel(cpProgram, "DotProduct", &ciErrNum); // Set the Argument values shrLog("clSetKernelArg 0 - 3...\n\n"); ciErrNum = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrcA); ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrcB); ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst); ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iNumElements); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // -------------------------------------------------------- // Core sequence... 
copy input data to GPU, compute, copy results back // Asynchronous write of data to GPU device shrLog("clEnqueueWriteBuffer (SrcA and SrcB)...\n"); ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize * 4, srcA, 0, NULL, NULL); ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcB, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize * 4, srcB, 0, NULL, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Launch kernel shrLog("clEnqueueNDRangeKernel (DotProduct)...\n"); ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Read back results and check accumulated errors shrLog("clEnqueueReadBuffer (Dst)...\n\n"); ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Compute and compare results for golden-host and report errors and pass/fail shrLog("Comparing against Host/C++ computation...\n\n"); DotProductHost ((const float*)srcA, (const float*)srcB, (float*)Golden, iNumElements); shrBOOL bMatch = shrComparefet((const float*)Golden, (const float*)dst, (unsigned int)iNumElements, 0.0f, 0); // Cleanup and leave Cleanup (EXIT_SUCCESS); }
// Main function // ********************************************************************* int ymain(int argc, char **argv) { shrQAStart(argc, argv); // get command line arg for quick test, if provided bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt"); // start logs cExecutableName = argv[0]; shrSetLogFileName ("oclVectorAdd2.txt"); shrLog("%s Starting...\n\n# of float elements per Array \t= %i\n", argv[0], iNumElements); // set and log Global and Local work size dimensions szLocalWorkSize = 256; szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements); // rounded up to the nearest multiple of the LocalWorkSize shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n", szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize)); // Allocate and initialize host arrays shrLog( "Allocate and Init Host Mem...\n"); srcA = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize); srcB = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize); dst = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize); Golden = (void *)malloc(sizeof(cl_float) * iNumElements); shrFillArray((float*)srcA, iNumElements); shrFillArray((float*)srcB, iNumElements); //Get an OpenCL platform ciErr1 = clGetPlatformIDs(1, &cpPlatform, NULL); shrLog("clGetPlatformID...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clGetPlatformID, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } //Get the devices ciErr1 = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL); shrLog("clGetDeviceIDs...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clGetDeviceIDs, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } //Create the context cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErr1); shrLog("clCreateContext...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateContext, Line %u in file %s !!!\n\n", 
__LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Create a command-queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &ciErr1); shrLog("clCreateCommandQueue...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateCommandQueue, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Allocate the OpenCL buffer memory objects for source and result on the device GMEM cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr1); cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2); ciErr1 |= ciErr2; cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2); ciErr1 |= ciErr2; shrLog("clCreateBuffer...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Read the OpenCL kernel in from source file shrLog("oclLoadProgSource (%s)...\n", cSourceFile); cPathAndName = shrFindFilePath(cSourceFile, argv[0]); shrLog("Looking for: %s in Path: %s\n", cSourceFile, argv[0]); cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength); // Create the program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErr1); shrLog("clCreateProgramWithSource...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateProgramWithSource, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Build the program with 'mad' Optimization option #ifdef MAC char* flags = "-cl-fast-relaxed-math -DMAC"; #else char* flags = "-cl-fast-relaxed-math"; #endif ciErr1 = clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL); shrLog("clBuildProgram...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); 
} // Create the kernel ckKernel = clCreateKernel(cpProgram, "VectorAdd", &ciErr1); shrLog("clCreateKernel (VectorAdd)...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Set the Argument values ciErr1 = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrcA); ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrcB); ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst); ciErr1 |= clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iNumElements); shrLog("clSetKernelArg 0 - 3...\n\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clSetKernelArg, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // -------------------------------------------------------- // Start Core sequence... copy input data to GPU, compute, copy results back // Asynchronous write of data to GPU device ciErr1 = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, srcA, 0, NULL, NULL); ciErr1 |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcB, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, srcB, 0, NULL, NULL); shrLog("clEnqueueWriteBuffer (SrcA and SrcB)...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clEnqueueWriteBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Launch kernel ciErr1 = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL); shrLog("clEnqueueNDRangeKernel (VectorAdd)...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clEnqueueNDRangeKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Synchronous/blocking read of results, and check accumulated errors ciErr1 = clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL); shrLog("clEnqueueReadBuffer 
(Dst)...\n\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clEnqueueReadBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } //-------------------------------------------------------- // Compute and compare results for golden-host and report errors and pass/fail shrLog("Comparing against Host/C++ computation...\n\n"); VectorAddHost ((const float*)srcA, (const float*)srcB, (float*)Golden, iNumElements); shrBOOL bMatch = shrComparefet((const float*)Golden, (const float*)dst, (unsigned int)iNumElements, 0.0f, 0); // Cleanup and leave Cleanup (argc, argv, (bMatch == shrTRUE) ? EXIT_SUCCESS : EXIT_FAILURE); }
// Entry point for the "gmix" OpenCL benchmark.
// Usage: prog nrepeat
// Sets up an OpenCL platform/context/queue on the first GPU device, builds the
// "gmix" kernel, then times nrepeat repetitions of nsteps kernel launches over
// a simulated nrow x ncol image with nwalkers walkers.
int main(int argc, char** argv) {
    if (argc < 2) {
        printf("%s nrepeat\n", argv[0]);
        exit(1);
    }
    // number of outer repetitions to time
    int nrepeat=atoi(argv[1]);
    // Storage for the arrays.
    static cl_mem output;
    // OpenCL state
    static cl_command_queue queue;
    //static cl_kernel kernel;
    static cl_device_id device_ids[NDEV];
    static cl_context context;
    static cl_platform_id platform_id;
    // Problem geometry and initial gaussian parameters (center, covariance).
    cl_int nrow=25;
    cl_int ncol=25;
    float cenrow0=12.;
    float cencol0=12.;
    float irr0=2.;
    float irc0=0.;
    float icc0=3.;
    int nelem=nrow*ncol;
    int nwalkers=20;
    // one work item per (pixel, walker) pair
    int ntot=nrow*ncol*nwalkers;
    cl_uint numPlatforms;
    cl_int err = CL_SUCCESS;
    clock_t t0,t1;
    int nsteps=600;          // kernel launches per repetition
    int device_type=0;
    // Hard-wired device selection; flip the constant to run on the CPU.
    if (1) {
        device_type=CL_DEVICE_TYPE_GPU;
    } else {
        device_type=CL_DEVICE_TYPE_CPU;
    }
    //SETUP PLATFORM
    err = clGetPlatformIDs(0, NULL, &numPlatforms);
    if (err != CL_SUCCESS) {
        fprintf(stderr,"could not get platform\n");
        exit(EXIT_FAILURE);
    }
    if(numPlatforms > 0) {
        //we have at least one
        //cl_platform_id* platforms = new cl_platform_id[numPlatforms];
        cl_platform_id* platforms = calloc(numPlatforms, sizeof(cl_platform_id));
        err = clGetPlatformIDs(numPlatforms, platforms, NULL);
        if (err != CL_SUCCESS) {
            fprintf(stderr,"could not get platform id\n");
            exit(EXIT_FAILURE);
        }
        fprintf(stderr,"Found %d platforms\n", numPlatforms);
        // always take the first platform reported
        platform_id = platforms[0];
        //delete[] platforms;
        free(platforms);
    } else exit(0);
    //END PLATFORM
    //SETUP CONTEXT
    cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id, 0 };
    context = clCreateContextFromType( cps, device_type, NULL, NULL, &err);
    //END CONTEXT
    int num_devices=0;
    // NOTE(review): &num_devices is an int*, but clGetDeviceIDs expects a
    // cl_uint* for its last parameter — verify this compiles cleanly everywhere.
    err = clGetDeviceIDs(platform_id, device_type, NDEV, device_ids, &num_devices);
    fprintf(stderr,"found %d devices\n", num_devices);
    if (err != CL_SUCCESS) {
        fprintf(stderr,"could not get device ids\n");
        exit(EXIT_FAILURE);
    }
    if (NDEV != num_devices) {
        printf("expected %d devices\n", NDEV);
        exit(1);
    }
    // Print availability and vendor id for every device found.
    for (int i=0; i<num_devices; i++) {
        size_t len=0;
        cl_uint avail=0;
        cl_uint id=0;
        clGetDeviceInfo(device_ids[i], CL_DEVICE_AVAILABLE, sizeof(cl_uint), &avail, &len);
        clGetDeviceInfo(device_ids[i], CL_DEVICE_VENDOR_ID, sizeof(cl_uint), &id, &len);
        printf("device #: %d id: %d avail: %d\n", i, id, avail);
    }
    int devnum=0;   // device used for all subsequent work
    printf("choosing device %d\n", devnum);
    cl_program program = clCreateProgramWithSource(context, 1, &kernel_source , NULL, &err);
    if (err != CL_SUCCESS) {
        fprintf(stderr,"could not create program\n");
        exit(EXIT_FAILURE);
    }
    size_t len;
    char buffer[2048];
    // NOTE(review): the build log is queried here, BEFORE clBuildProgram is
    // called below, so it cannot contain anything useful; it is also never
    // printed, and `buffer` is immediately overwritten by the device name.
    clGetProgramBuildInfo(program, device_ids[devnum], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
    clGetDeviceInfo(device_ids[devnum], CL_DEVICE_NAME, sizeof(buffer), buffer, &len);
    cl_ulong memsize;
    clGetDeviceInfo(device_ids[devnum], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &memsize, &len);
    cl_uint nunits;
    clGetDeviceInfo(device_ids[devnum], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &nunits, &len);
    cl_ulong max_work_group_size;
    // NOTE(review): CL_DEVICE_MAX_WORK_GROUP_SIZE is defined as size_t, not
    // cl_ulong — works where both are 8 bytes, but confirm for portability.
    clGetDeviceInfo(device_ids[devnum], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(cl_ulong), &max_work_group_size, &len);
    //cl_uint warp_size;
    //clGetDeviceInfo(device_ids[devnum], CL_NV_DEVICE_WARP_SIZE, sizeof(cl_uint), &warp_size, &len);
    printf("CL_DEVICE_NAME: '%s'\n", buffer);
    printf("CL_DEVICE_GLOBAL_MEM_SIZE: %lu\n", memsize);
    // compute unit is a lump of hardware that executes 'work groups'
    printf("CL_DEVICE_MAX_COMPUTE_UNITS: %u\n", nunits);
    // max number of items per work group
    printf("CL_DEVICE_MAX_WORK_GROUP_SIZE: %lu\n", max_work_group_size);
    //printf("CL_NV_DEVICE_WARP_SIZE: %u\n", warp_size);
    size_t szLocalWorkSize = nrow;
    //size_t szLocalWorkSize = 512;
    // make sure multiple of 32
    szLocalWorkSize=shrRoundUp((int)256, (int)szLocalWorkSize);
    // rounded up to the nearest multiple of the LocalWorkSize
    size_t szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)ntot);
    printf("nrow: %d\n", nrow);
    printf("ncol %d\n", ncol);
    printf("setting nelem: %d\n", nelem);
    printf("setting ntot: %d\n", ntot);
    printf("setting local work size: %lu\n", szLocalWorkSize);
    printf("setting global work size: %lu\n", szGlobalWorkSize);
    //queue = clCreateCommandQueue(context, device_ids, 0, &err);
    queue = clCreateCommandQueue(context, device_ids[devnum],
                                 //CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
                                 0, &err);
    if (err != CL_SUCCESS) {
        fprintf(stderr,"could not create command queue\n");
        exit(EXIT_FAILURE);
    }
    //OPTIMIZATION OPTIONS FOUND AT http://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/clBuildProgram.html
    err = clBuildProgram(program, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
    if (err != CL_SUCCESS) {
        fprintf(stderr,"could not build program\n");
        exit(EXIT_FAILURE);
    }
    //SETUP KERNEL
    cl_kernel kernel = clCreateKernel(program, "gmix", &err);
    if (err != CL_SUCCESS) {
        fprintf(stderr,"could not create kernel\n");
        exit(EXIT_FAILURE);
    }
    clReleaseProgram(program); // no longer needed
    printf("processing %dx%d image %d walkers %d steps nrepeat %d\n",
           nrow,ncol,nwalkers,nsteps,nrepeat);
    double tstandard=0;
    double topencl=0;
    cl_float *image=NULL;
    srand48(10);   // fixed seed so every run perturbs parameters identically
    t0=clock();
    // Outer timing loop: each repetition uploads a fresh image and runs
    // nsteps kernel launches with slightly perturbed gaussian parameters.
    for (int rep=0; rep<nrepeat; rep++) {
        // we can probably instead re-use rows so this
        // is overkill simulating overhead
        image=get_new_image(nrow,ncol);
        // NOTE(review): data_from_gpu is allocated but never filled — no
        // clEnqueueReadBuffer ever copies `output` back; confirm this is a
        // deliberate launch-overhead-only benchmark.
        cl_float *data_from_gpu = calloc(szGlobalWorkSize, sizeof(cl_float));
        cl_float *rows=calloc(szGlobalWorkSize,sizeof(cl_float));
        cl_float *cols=calloc(szGlobalWorkSize,sizeof(cl_float));
        fill_rows_cols(nwalkers, nrow, ncol, rows, cols);
        err=0;
        cl_mem image_in = clCreateBuffer(context,
                CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                sizeof(cl_float)*nrow*ncol, image, &err);
        if (err != CL_SUCCESS) {
            fprintf(stderr,"could not create image buffer\n");
            exit(EXIT_FAILURE);
        }
        cl_mem rows_in = clCreateBuffer(context,
                CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                sizeof(cl_float)*szGlobalWorkSize, rows, &err);
        if (err != CL_SUCCESS) {
            fprintf(stderr,"could not create rows buffer\n");
            exit(EXIT_FAILURE);
        }
        cl_mem cols_in = clCreateBuffer(context,
                CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                sizeof(cl_float)*szGlobalWorkSize, cols, &err);
        if (err != CL_SUCCESS) {
            fprintf(stderr,"could not create cols buffer\n");
            exit(EXIT_FAILURE);
        }
        output = clCreateBuffer(context, CL_MEM_READ_WRITE,
                sizeof(cl_float)*szGlobalWorkSize, NULL, &err);
        if (err != CL_SUCCESS) {
            fprintf(stderr,"could not create buffer\n");
            exit(EXIT_FAILURE);
        }
        // Arguments that stay fixed across all steps of this repetition.
        err = clSetKernelArg(kernel, 0, sizeof(cl_int), &ntot);
        err |= clSetKernelArg(kernel, 1, sizeof(cl_int), &ncol);
        err |= clSetKernelArg(kernel, 8, sizeof(cl_mem), &image_in);
        err |= clSetKernelArg(kernel, 9, sizeof(cl_mem), &rows_in);
        err |= clSetKernelArg(kernel, 10, sizeof(cl_mem), &cols_in);
        err |= clSetKernelArg(kernel, 11, sizeof(cl_mem), &output);
        for (int step=0; step<nsteps; step++) {
            // Small random perturbation of the gaussian each step.
            float cenrow = cenrow0 + 0.01*(drand48()-0.5);
            float cencol = cencol0 + 0.01*(drand48()-0.5);
            float irr = irr0+0.01*(drand48()-0.5);
            float irc = irc0+0.01*(drand48()-0.5);
            float icc = icc0+0.01*(drand48()-0.5);
            float det = irr*icc - irc*irc;
            float idet = 1./det;
            // a copy of the kernel is made each time, so we can add new arguments
            err |= clSetKernelArg(kernel, 2, sizeof(cl_float), (void*)&cenrow);
            err |= clSetKernelArg(kernel, 3, sizeof(cl_float), (void*)&cencol);
            err |= clSetKernelArg(kernel, 4, sizeof(cl_float), (void*)&idet);
            err |= clSetKernelArg(kernel, 5, sizeof(cl_float), (void*)&irr);
            err |= clSetKernelArg(kernel, 6, sizeof(cl_float), (void*)&irc);
            err |= clSetKernelArg(kernel, 7, sizeof(cl_float), (void*)&icc);
            if (err != CL_SUCCESS) {
                fprintf(stderr,"could not set step kernel args\n");
                exit(EXIT_FAILURE);
            }
            err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
                    &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
            if (err != CL_SUCCESS) {
                fprintf(stderr,"error executing kernel\n");
                exit(EXIT_FAILURE);
            }
        }
        clReleaseMemObject(image_in);
        clReleaseMemObject(rows_in);
        clReleaseMemObject(cols_in);
        clReleaseMemObject(output);
        free(image);
        free(rows);
        free(cols);
        free(data_from_gpu);
    }
    // NOTE(review): there is no clFinish(queue) before t1, so enqueued kernels
    // may still be executing here — the reported time likely measures enqueue
    // overhead rather than kernel execution; confirm intent.
    t1=clock();
    topencl = ((double)(t1-t0))/CLOCKS_PER_SEC;
    printf("time for GPU: %lf\n", topencl);
    printf("time per repeat: %lf\n", topencl/nrepeat);
    /*
    clReleaseKernel(kernel);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    */
    return 0;
}
// Main function
// *********************************************************************
// Host driver for the oclMatVecMul sample: sizes a width x height matrix to
// fit available GPU memory, computes a golden CPU result, then runs six
// matrix-vector-multiply kernel variants, comparing each against the golden
// result and (optionally) reporting profiled kernel execution time.
int main(int argc, char** argv)
{
    shrQAStart(argc, argv);

    // get command line arg for quick test, if provided
    bNoPrompt = shrCheckCmdLineFlag(argc, (const char **)argv, "noprompt");

    // start logs
    cExecutableName = argv[0];
    shrSetLogFileName ("oclMatVecMul.txt");
    shrLog("%s Starting...\n\n", argv[0]);

    // calculate matrix height given GPU memory
    shrLog("Determining Matrix height from available GPU mem...\n");
    memsize_t memsize;
    getTargetDeviceGlobalMemSize(&memsize, argc, (const char **)argv);
    // use 1/16 of global memory for the matrix, capped at MAX_HEIGHT rows
    height = memsize/width/16;
    if (height > MAX_HEIGHT) height = MAX_HEIGHT;
    shrLog(" Matrix width\t= %u\n Matrix height\t= %u\n\n", width, height);

    // Allocate and initialize host arrays
    shrLog("Allocate and Init Host Mem...\n\n");
    unsigned int size = width * height;
    unsigned int mem_size_M = size * sizeof(float);   // matrix M: width x height
    M = (float*)malloc(mem_size_M);
    unsigned int mem_size_V = width * sizeof(float);  // input vector V
    V = (float*)malloc(mem_size_V);
    unsigned int mem_size_W = height * sizeof(float); // output vector W = M*V
    W = (float*)malloc(mem_size_W);
    shrFillArray(M, size);
    shrFillArray(V, width);
    // Reference result computed on the host, used for pass/fail below.
    Golden = (float*)malloc(mem_size_W);
    MatVecMulHost(M, V, width, height, Golden);

    //Get the NVIDIA platform
    shrLog("Get the Platform ID...\n\n");
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    //Get all the devices
    shrLog("Get the Device info and select Device...\n");
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Set target device and Query number of compute units on targetDevice
    shrLog(" # of Devices Available = %u\n", uiNumDevices);
    if(shrGetCmdLineArgumentu(argc, (const char **)argv, "device", &targetDevice)== shrTRUE)
    {
        // clamp user-supplied device index into the valid range
        targetDevice = CLAMP(targetDevice, 0, (uiNumDevices - 1));
    }
    shrLog(" Using Device %u: ", targetDevice);
    oclPrintDevName(LOGBOTH, cdDevices[targetDevice]);
    cl_uint num_compute_units;
    clGetDeviceInfo(cdDevices[targetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(num_compute_units), &num_compute_units, NULL);
    shrLog("\n # of Compute Units = %u\n\n", num_compute_units);

    //Create the context
    shrLog("clCreateContext...\n");
    cxGPUContext = clCreateContext(0, uiNumDevsUsed, &cdDevices[targetDevice], NULL, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Create a command-queue (profiling enabled for the GPU_PROFILING path)
    shrLog("clCreateCommandQueue...\n");
    cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[targetDevice], CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Allocate the OpenCL buffer memory objects for source and result on the device GMEM
    shrLog("clCreateBuffer (M, V and W in device global memory, mem_size_m = %u)...\n", mem_size_M);
    cmM = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, mem_size_M, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    cmV = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, mem_size_V, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    cmW = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, mem_size_W, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Read the OpenCL kernel in from source file
    shrLog("oclLoadProgSource (%s)...\n", cSourceFile);
    cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
    oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup);
    cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);
    oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup);

    // Create the program
    shrLog("clCreateProgramWithSource...\n");
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum);

    // Build the program
    shrLog("clBuildProgram...\n");
    ciErrNum = clBuildProgram(cpProgram, uiNumDevsUsed, &cdDevices[targetDevice], "-cl-fast-relaxed-math", NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        // write out standard error, Build Log and PTX, then cleanup and exit
        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclMatVecMul.ptx");
        shrQAFinish(argc, (const char **)argv, QA_FAILED);
        Cleanup(EXIT_FAILURE);
    }

    // --------------------------------------------------------
    // Core sequence... copy input data to GPU, compute, copy results back

    // Asynchronous write of data to GPU device
    shrLog("clEnqueueWriteBuffer (M and V)...\n\n");
    ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmM, CL_FALSE, 0, mem_size_M, M, 0, NULL, NULL);
    ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue, cmV, CL_FALSE, 0, mem_size_V, V, 0, NULL, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Kernels: six variants of the same matrix-vector multiply, from fully
    // uncoalesced to progressively more optimized coalesced versions.
    const char* kernels[] = {
        "MatVecMulUncoalesced0",
        "MatVecMulUncoalesced1",
        "MatVecMulCoalesced0",
        "MatVecMulCoalesced1",
        "MatVecMulCoalesced2",
        "MatVecMulCoalesced3" };

    for (int k = 0; k < (int)(sizeof(kernels)/sizeof(char*)); ++k) {
        shrLog("Running with Kernel %s...\n\n", kernels[k]);

        // Clear result so stale data from the previous kernel can't mask errors
        shrLog("  Clear result with clEnqueueWriteBuffer (W)...\n");
        memset(W, 0, mem_size_W);
        ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmW, CL_FALSE, 0, mem_size_W, W, 0, NULL, NULL);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // Create the kernel (release the one from the previous iteration first)
        shrLog("  clCreateKernel...\n");
        if (ckKernel) {
            clReleaseKernel(ckKernel);
            ckKernel = 0;
        }
        ckKernel = clCreateKernel(cpProgram, kernels[k], &ciErrNum);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // Set and log Global and Local work size dimensions
        szLocalWorkSize = 256;
        if (k == 0)
            szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, height);  // rounded up to the nearest multiple of the LocalWorkSize
        else
            // Some experiments should be done here for determining the best global work size for a given device
            // We will assume here that we can run 2 work-groups per compute unit
            szGlobalWorkSize = 2 * num_compute_units * szLocalWorkSize;
        shrLog("  Global Work Size \t\t= %u\n  Local Work Size \t\t= %u\n  # of Work Groups \t\t= %u\n",
               szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize));

        // Set the Argument values
        shrLog("  clSetKernelArg...\n\n");
        int n = 0;
        ciErrNum = clSetKernelArg(ckKernel,  n++, sizeof(cl_mem), (void*)&cmM);
        ciErrNum |= clSetKernelArg(ckKernel, n++, sizeof(cl_mem), (void*)&cmV);
        ciErrNum |= clSetKernelArg(ckKernel, n++, sizeof(cl_int), (void*)&width);
        ciErrNum |= clSetKernelArg(ckKernel, n++, sizeof(cl_int), (void*)&height);
        ciErrNum |= clSetKernelArg(ckKernel, n++, sizeof(cl_mem), (void*)&cmW);
        // kernels 2..5 take an extra local-memory scratch argument
        if (k > 1)
            ciErrNum |= clSetKernelArg(ckKernel, n++, szLocalWorkSize * sizeof(float), 0);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // Launch kernel (ceEvent is used below for profiling)
        shrLog("  clEnqueueNDRangeKernel (%s)...\n", kernels[k]);
        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, &ceEvent);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // Read back results and check accumulated errors (blocking read also
        // synchronizes the queue before the comparison below)
        shrLog("  clEnqueueReadBuffer (W)...\n");
        ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmW, CL_TRUE, 0, mem_size_W, W, 0, NULL, NULL);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

#ifdef GPU_PROFILING
        // Execution time from the event's device-side timestamps
        ciErrNum = clWaitForEvents(1, &ceEvent);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
        cl_ulong start, end;
        ciErrNum = clGetEventProfilingInfo(ceEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
        ciErrNum |= clGetEventProfilingInfo(ceEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
        double dSeconds = 1.0e-9 * (double)(end - start);  // timestamps are in ns
        shrLog("  Kernel execution time: %.5f s\n\n", dSeconds);
#endif

        // Compare results for golden-host and report errors and pass/fail
        shrLog("  Comparing against Host/C++ computation...\n\n");
        shrBOOL res = shrCompareL2fe(Golden, W, height, 1e-6f);
        shrLog("  GPU Result %s CPU Result within allowable tolerance\n\n", (res == shrTRUE) ? "MATCHES" : "DOESN'T MATCH");
        bPassFlag &= (res == shrTRUE);

        // Release event
        ciErrNum = clReleaseEvent(ceEvent);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
        ceEvent = 0;
    }

    // Master status Pass/Fail (all tests)
    shrQAFinish(argc, (const char **)argv, (bPassFlag ? QA_PASSED : QA_FAILED) );

    // Cleanup and leave
    Cleanup (EXIT_SUCCESS);
}
// Main function // ********************************************************************* int main(int argc, char **argv) { //shrQAStart(argc, argv); // get command line arg for quick test, if provided bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt"); // start logs cExecutableName = argv[0]; shrSetLogFileName ("Barrier.txt"); printf("%s Starting...\n\n# of THREADS \t= %i\n", argv[0], iNumElements); // set and log Global and Local work size dimensions szLocalWorkSize = NUM_THREADS ; szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements); // rounded up to the nearest multiple of the LocalWorkSize printf("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n", szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize)); //Get an OpenCL platform ciErr1 = clGetPlatformIDs(1, &cpPlatform, NULL); printf("clGetPlatformID...\n"); if (ciErr1 != CL_SUCCESS) { printf("Error in clGetPlatformID, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } //Get the devices ciErr1 = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL); printf("clGetDeviceIDs...\n"); if (ciErr1 != CL_SUCCESS) { printf("Error in clGetDeviceIDs, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } //Create the context cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErr1); printf("clCreateContext...\n"); if (ciErr1 != CL_SUCCESS) { printf("Error in clCreateContext, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Create a command-queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErr1); printf("clCreateCommandQueue...\n"); if (ciErr1 != CL_SUCCESS) { printf("Error in clCreateCommandQueue, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Read the OpenCL kernel in from source file 
printf("oclLoadProgSource (%s)...\n", cSourceFile); cPathAndName = shrFindFilePath(cSourceFile, argv[0]); cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength); // Create the program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErr1); printf("clCreateProgramWithSource...\n"); if (ciErr1 != CL_SUCCESS) { printf("Error in clCreateProgramWithSource, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Build the program with 'mad' Optimization option #ifdef MAC char* flags = "-cl-fast-relaxed-math -DMAC"; #else char* flags = "-cl-fast-relaxed-math"; #endif ciErr1 = clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL); printf("clBuildProgram...\n"); if (ciErr1 != CL_SUCCESS) { printf("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Create the kernel ckKernel = clCreateKernel(cpProgram, "Barrier", &ciErr1); printf("clCreateKernel (Barrier)...\n"); if (ciErr1 != CL_SUCCESS) { printf("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Allocate and initialize host arrays printf( "Allocate and Init Host Mem...\n"); input = (int *)malloc(sizeof(int) * NUM_BLOCKS); for(int i =0; i<=NUM_BLOCKS; i++) { input[i]=0; } // Allocate the OpenCL buffer memory objects for source and result on the device GMEM array_in = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(int)* NUM_BLOCKS, NULL, &ciErr1); array_out = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(int)* NUM_BLOCKS, NULL, &ciErr1); if (ciErr1 != CL_SUCCESS) { printf("Error in clCreateBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Set the Argument values ciErr1 = clSetKernelArg(ckKernel, 0, sizeof(cl_int), (void*)&goal_val); ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&array_in); ciErr1 |= clSetKernelArg(ckKernel, 
2, sizeof(cl_mem), (void*)&array_out); // ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_int), (void*)&iNumElements); printf("clSetKernelArg 0 - 2...\n\n"); if (ciErr1 != CL_SUCCESS) { printf("Error in clSetKernelArg, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // -------------------------------------------------------- // Start Core sequence... copy input data to GPU, compute, copy results back ciErr1 = clEnqueueWriteBuffer(cqCommandQueue, array_in, CL_FALSE, 0, sizeof(int) * NUM_BLOCKS,(void*) input, 0, NULL, NULL); printf("clEnqueueWriteBuffer (SrcA and SrcB)...\n"); if (ciErr1 != CL_SUCCESS) { printf("Error in clEnqueueWriteBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Launch kernel ciErr1 = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, &ceEvent); printf("clEnqueueNDRangeKernel (Barrier)...\n"); if (ciErr1 != CL_SUCCESS) { printf("Error in clEnqueueNDRangeKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } /*ciErr1 = clEnqueueReadBuffer(cqCommandQueue, global_mutex, CL_TRUE, 0, sizeof(cl_int), &original_goal, 0, NULL, NULL); printf("clEnqueueReadBuffer (Dst)...%d \n\n", original_goal); if (ciErr1 != CL_SUCCESS) { printf("Error in clEnqueueReadBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); }*/ //GPU_PROFILING ciErr1=clWaitForEvents(1, &ceEvent); if (ciErr1 != CL_SUCCESS) { printf("Error 1 !\n\n"); Cleanup(argc, argv, EXIT_FAILURE); } cl_ulong start, end; ciErr1 = clGetEventProfilingInfo(ceEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); ciErr1 |= clGetEventProfilingInfo(ceEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); if (ciErr1 != CL_SUCCESS) { printf("Error 2 !\n\n"); Cleanup(argc, argv, EXIT_FAILURE); } double dSeconds = 1.0e-9 * 
(double)(end - start); printf("Done! time taken %llu \n",end - start ); // printf("Done! Kernel execution time: %.5f s\n\n", dSeconds); // Release event clReleaseEvent(ceEvent); ceEvent = 0; Cleanup (argc, argv, EXIT_SUCCESS); }
// Main function
// *********************************************************************
// Host driver adapted from the oclVectorAdd sample to run a "k_means" kernel:
// fills three float feature arrays (scalar value, gradient magnitude, second
// derivative magnitude), uploads them, launches k_means with k=8 and two
// random seeds, and reads cluster labels back into label_ptr.
// NOTE(review): several leftovers from the original VectorAdd sample remain
// (srcA/srcB/dst/Golden allocation and comparison) — see notes inline.
int main(int argc, char **argv)
{
    //////////////////////////////////////////////////////////////////////////
    unsigned int count = iNumElements;   // number of feature vectors to cluster
    int k = 8;                           // number of clusters
    unsigned int random_seed, random_seed2;
    srand( (unsigned)time( NULL ) );
    random_seed = rand();
    random_seed2 = rand();
    //////////////////////////////////////////////////////////////////////////

    // get command line arg for quick test, if provided
    bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt");

    // start logs
    shrSetLogFileName ("oclVectorAdd.txt");
    shrLog("%s Starting...\n\n# of float elements per Array \t= %i\n", argv[0], iNumElements);

    // set and log Global and Local work size dimensions
    szLocalWorkSize = 256;
    szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements);  // rounded up to the nearest multiple of the LocalWorkSize
    shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n",
           szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize));

    // Allocate and initialize host arrays
    // NOTE(review): srcA/srcB/dst/Golden are leftovers of the VectorAdd sample;
    // this kernel never writes dst, yet it is compared against Golden below.
    shrLog( "Allocate and Init Host Mem...\n");
    srcA = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize);
    srcB = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize);
    dst = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize);
    Golden = (void *)malloc(sizeof(cl_float) * iNumElements);
    shrFillArray((float*)srcA, iNumElements);
    shrFillArray((float*)srcB, iNumElements);

    //////////////////////////////////////////////////////////////////////////
    // Host-side feature arrays (count elements each) and the label output.
    float *scalar_value = new float[count];
    float *gradient_magnitude = new float[count];
    float *second_derivative_magnitude = new float[count];
    unsigned char *label_ptr = new unsigned char[count];
    shrFillArray(scalar_value, count);
    shrFillArray(gradient_magnitude, count);
    shrFillArray(second_derivative_magnitude, count);
    //////////////////////////////////////////////////////////////////////////

    //Get an OpenCL platform
    ciErr1 = clGetPlatformIDs(1, &cpPlatform, NULL);
    shrLog("clGetPlatformID...\n");
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clGetPlatformID, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(EXIT_FAILURE);
    }

    //Get the devices
    ciErr1 = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);
    shrLog("clGetDeviceIDs...\n");
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clGetDeviceIDs, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(EXIT_FAILURE);
    }

    //Create the context
    cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErr1);
    shrLog("clCreateContext...\n");
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clCreateContext, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(EXIT_FAILURE);
    }

    // Create a command-queue
    cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &ciErr1);
    shrLog("clCreateCommandQueue...\n");
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clCreateCommandQueue, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(EXIT_FAILURE);
    }

    // Allocate the OpenCL buffer memory objects for source and result on the device GMEM
    cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr1);
    cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2);
    ciErr1 |= ciErr2;
    cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2);
    ciErr1 |= ciErr2;
    //////////////////////////////////////////////////////////////////////////
    // NOTE(review): passing &ciErr1 here overwrites the error accumulated
    // from the three buffers above (the siblings use &ciErr2 then OR-in).
    cmDevSrc_scalar_value = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr1);
    cmDevSrc_gradient_magnitude = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2);
    ciErr1 |= ciErr2;
    cmDevSrc_second_derivative_magnitude = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2);
    ciErr1 |= ciErr2;
    // NOTE(review): the label buffer is sized sizeof(cl_float)*szGlobalWorkSize
    // although the host-side label_ptr holds `count` unsigned chars — verify
    // against the kernel's label element type.
    cmDevDst_label_ptr = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2);
    ciErr1 |= ciErr2;
    //////////////////////////////////////////////////////////////////////////
    shrLog("clCreateBuffer...\n");
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clCreateBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(EXIT_FAILURE);
    }

    // Read the OpenCL kernel in from source file
    shrLog("oclLoadProgSource (%s)...\n", cSourceFile);
    cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
    cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);
    printf("%s\n%s\n", cSourceFile, cPathAndName);

    // Create the program
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErr1);
    shrLog("clCreateProgramWithSource...\n");
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clCreateProgramWithSource, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(EXIT_FAILURE);
    }

    // Build the program with 'mad' Optimization option
    // NOTE(review): `flags` is computed but never passed to clBuildProgram
    // (NULL options are used below) — confirm which is intended.
#ifdef MAC
    char* flags = "-cl-fast-relaxed-math -DMAC";
#else
    char* flags = "-cl-fast-relaxed-math";
#endif
    ciErr1 = clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL);
    shrLog("clBuildProgram...\n");
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(EXIT_FAILURE);
    }

    // Create the kernel
    ckKernel = clCreateKernel(cpProgram, "k_means", &ciErr1);
    shrLog("clCreateKernel (VectorAdd)...\n");
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(EXIT_FAILURE);
    }

    // Set the Argument values
    //ciErr1 = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrcA);
    //ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrcB);
    //ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst);
    //ciErr1 |= clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iNumElements);
    //////////////////////////////////////////////////////////////////////////
    // __global const float *scalar_value, __global const float *gradient_magnitude, __global const float *second_derivative_magnitude, __global unsigned char *label_ptr, __global const unsigned int count, __global const int k, __global const unsigned int random_seed, __global const unsigned int random_seed2
    ciErr1 = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrc_scalar_value);
    ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrc_gradient_magnitude);
    ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevSrc_second_derivative_magnitude);
    ciErr1 |= clSetKernelArg(ckKernel, 3, sizeof(cl_mem), (void*)&cmDevDst_label_ptr);
    ciErr1 |= clSetKernelArg(ckKernel, 4, sizeof(cl_uint), (void*)&count);
    ciErr1 |= clSetKernelArg(ckKernel, 5, sizeof(cl_uint), (void*)&k);
    ciErr1 |= clSetKernelArg(ckKernel, 6, sizeof(cl_uint), (void*)&random_seed);
    ciErr1 |= clSetKernelArg(ckKernel, 7, sizeof(cl_uint), (void*)&random_seed2);
    //////////////////////////////////////////////////////////////////////////
    shrLog("clSetKernelArg 0 - 3...\n\n");
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clSetKernelArg, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(EXIT_FAILURE);
    }

    // --------------------------------------------------------
    // Start Core sequence... copy input data to GPU, compute, copy results back

    // Asynchronous write of data to GPU device
    //ciErr1 = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, srcA, 0, NULL, NULL);
    //ciErr1 |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcB, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, srcB, 0, NULL, NULL);
    //////////////////////////////////////////////////////////////////////////
    // NOTE(review): the host arrays hold `count` floats but szGlobalWorkSize
    // (rounded up to 256) bytes' worth are written — reads past the host
    // allocations when szGlobalWorkSize > count; verify sizing.
    ciErr1 = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrc_scalar_value, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, scalar_value, 0, NULL, NULL);
    ciErr1 |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrc_gradient_magnitude, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, gradient_magnitude, 0, NULL, NULL);
    ciErr1 |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrc_second_derivative_magnitude, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, second_derivative_magnitude, 0, NULL, NULL);
    //////////////////////////////////////////////////////////////////////////
    shrLog("clEnqueueWriteBuffer (SrcA and SrcB)...\n");
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clEnqueueWriteBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(EXIT_FAILURE);
    }

    // Launch kernel
    ciErr1 = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
    shrLog("clEnqueueNDRangeKernel (VectorAdd)...\n");
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clEnqueueNDRangeKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(EXIT_FAILURE);
    }

    // Synchronous/blocking read of results, and check accumulated errors
    //ciErr1 = clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL);
    //////////////////////////////////////////////////////////////////////////
    // NOTE(review): this reads sizeof(cl_float)*szGlobalWorkSize bytes into
    // label_ptr, which was allocated as only `count` unsigned chars — a heap
    // buffer overflow on the host; the read size must match label_ptr's
    // element type and length.
    ciErr1 = clEnqueueReadBuffer(cqCommandQueue, cmDevDst_label_ptr, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, label_ptr, 0, NULL, NULL);
    //////////////////////////////////////////////////////////////////////////
    shrLog("clEnqueueReadBuffer (Dst)...\n\n");
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clEnqueueReadBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(EXIT_FAILURE);
    }

    //--------------------------------------------------------
    // Compute and compare results for golden-host and report errors and pass/fail
    // NOTE(review): this still compares the VectorAdd golden result against
    // `dst`, which the k_means kernel never writes — the PASSED/FAILED verdict
    // is meaningless for this kernel.
    shrLog("Comparing against Host/C++ computation...\n\n");
    VectorAddHost ((const float*)srcA, (const float*)srcB, (float*)Golden, iNumElements);
    shrBOOL bMatch = shrComparefet((const float*)Golden, (const float*)dst, (unsigned int)iNumElements, 0.0f, 0);
    shrLog("%s\n\n", (bMatch == shrTRUE) ? "PASSED" : "FAILED");

    //////////////////////////////////////////////////////////////////////////
    //float *a = (float *)srcA;
    //float *b = (float *)srcB;
    //float *c = (float *)dst;
    //float *d = (float *)Golden;
    //for (int i=0; i<iNumElements; i++)
    //{
    //    printf("%f+%f=%f=%f\t", a[i], b[i], c[i], a[i]+b[i]);
    //    printf("%s\n", (a[i]+b[i]==c[i]?"equal":"not equal"));
    //}
    //for (int i=0; i<iNumElements; i++)
    //{
    //    printf("%f\n", ((float *)dst)[i]);
    //}
    //////////////////////////////////////////////////////////////////////////

    // Cleanup and leave
    // NOTE(review): if Cleanup() exits the process (as in the sibling samples),
    // the delete[] statements below never run.
    Cleanup (EXIT_SUCCESS);

    //////////////////////////////////////////////////////////////////////////
    delete [] scalar_value;
    delete [] gradient_magnitude;
    delete [] second_derivative_magnitude;
    delete [] label_ptr;
    //////////////////////////////////////////////////////////////////////////
}
// OpenCL computation function for 1 or more GPUs // Copies input data from pinned host buf to the device, runs kernel, copies output data back to pinned output host buf //***************************************************************************** double SobelFilterGPU(cl_uint* uiInputImage, cl_uint* uiOutputImage) { // If this is a video application, fresh data in pinned host buffer is needed beyond here // This line could be a sync point assuring that an asynchronous acqusition is complete. // That ascynchronous acquisition would do a map, update and unmap for the pinned input buffer // // Otherwise a synchronous acquisition call ('get next frame') could be placed here, but that would be less optimal. // For each device: copy fresh input H2D ciErrNum = CL_SUCCESS; for (cl_uint i = 0; i < GpuDevMngr->uiUsefulDevCt; i++) { // Nonblocking Write of input image data from host to device ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[i], cmDevBufIn[i], CL_FALSE, 0, szAllocDevBytes[i], (void*)&uiInputImage[uiInHostPixOffsets[i]], 0, NULL, NULL); } // Sync all queues to host and start computation timer on host to get computation elapsed wall clock time // (Only for timing... 
can be omitted in a production app) for (cl_uint j = 0; j < GpuDevMngr->uiUsefulDevCt; j++) { ciErrNum |= clFinish(cqCommandQueue[j]); } oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // For each device: Process shrDeltaT(0); for (cl_uint i = 0; i < GpuDevMngr->uiUsefulDevCt; i++) { // Determine configuration bytes, offsets and launch config, based on position of device region vertically in image if (GpuDevMngr->uiUsefulDevCt == 1) { // One device processes the whole image with no offset tricks needed szGlobalWorkSize[1] = shrRoundUp((int)szLocalWorkSize[1], (int)uiDevImageHeight[i]); } else if (i == 0) { // Multiple devices, top boundary tile: // Process whole device allocation, including extra row // No offset, but don't return the last row (dark/garbage row) D2H szGlobalWorkSize[1] = shrRoundUp((int)szLocalWorkSize[1], (int)uiDevImageHeight[i]); } else if (i < (GpuDevMngr->uiUsefulDevCt - 1)) { // Multiple devices, middle tile: // Process whole device allocation, including extra 2 rows // Offset down by 1 row, and don't return the first and last rows (dark/garbage rows) D2H szGlobalWorkSize[1] = shrRoundUp((int)szLocalWorkSize[1], (int)uiDevImageHeight[i]); } else { // Multiple devices, last boundary tile: // Process whole device allocation, including extra row // Offset down by 1 row, and don't return the first row (dark/garbage row) D2H szGlobalWorkSize[1] = shrRoundUp((int)szLocalWorkSize[1], (int)uiDevImageHeight[i]); } // Pass in dev image height (# of rows worked on) for this device ciErrNum |= clSetKernelArg(ckSobel[i], 5, sizeof(cl_uint), (void*)&uiDevImageHeight[i]); // Launch Sobel kernel(s) into queue(s) and push to device(s) ciErrNum |= clEnqueueNDRangeKernel(cqCommandQueue[i], ckSobel[i], 2, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL); // Push to device(s) so subsequent clFinish in queue 0 doesn't block driver from issuing enqueue command for higher queues ciErrNum |= clFlush(cqCommandQueue[i]); } // Sync all queues to host and get 
elapsed wall clock time for computation in all queues // (Only for timing... can be omitted in a production app) for (cl_uint j = 0; j < GpuDevMngr->uiUsefulDevCt; j++) { ciErrNum |= clFinish(cqCommandQueue[j]); } double dKernelTime = shrDeltaT(0); // Time from launch of first compute kernel to end of all compute kernels oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // For each device: copy fresh output D2H for (cl_uint i = 0; i < GpuDevMngr->uiUsefulDevCt; i++) { // Determine configuration bytes and offsets based on position of device region vertically in image size_t szReturnBytes; cl_uint uiOutDevByteOffset; if (GpuDevMngr->uiUsefulDevCt == 1) { // One device processes the whole image with no offset tricks needed szReturnBytes = szBuffBytes; uiOutDevByteOffset = 0; } else if (i == 0) { // Multiple devices, top boundary tile: // Process whole device allocation, including extra row // No offset, but don't return the last row (dark/garbage row) D2H szReturnBytes = szAllocDevBytes[i] - (uiImageWidth * sizeof(cl_uint)); uiOutDevByteOffset = 0; } else if (i < (GpuDevMngr->uiUsefulDevCt - 1)) { // Multiple devices, middle tile: // Process whole device allocation, including extra 2 rows // Offset down by 1 row, and don't return the first and last rows (dark/garbage rows) D2H szReturnBytes = szAllocDevBytes[i] - ((uiImageWidth * sizeof(cl_uint)) * 2); uiOutDevByteOffset = uiImageWidth * sizeof(cl_uint); } else { // Multiple devices, last boundary tile: // Process whole device allocation, including extra row // Offset down by 1 row, and don't return the first row (dark/garbage row) D2H szReturnBytes = szAllocDevBytes[i] - (uiImageWidth * sizeof(cl_uint)); uiOutDevByteOffset = uiImageWidth * sizeof(cl_uint); } // Non Blocking Read of output image data from device to host ciErrNum |= clEnqueueReadBuffer(cqCommandQueue[i], cmDevBufOut[i], CL_FALSE, uiOutDevByteOffset, szReturnBytes, (void*)&uiOutputImage[uiOutHostPixOffsets[i]], 0, NULL, NULL); } // Finish all queues and 
check for errors before returning // The block here assures valid output data for subsequent host processing for (cl_uint j = 0; j < GpuDevMngr->uiUsefulDevCt; j++) { ciErrNum |= clFinish(cqCommandQueue[j]); } oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); return dKernelTime; }
// Main program //***************************************************************************** int main(int argc, char** argv) { pArgc = &argc; pArgv = argv; shrQAStart(argc, argv); // Start logs cExecutableName = argv[0]; shrSetLogFileName ("oclSobelFilter.txt"); shrLog("%s Starting (Using %s)...\n\n", argv[0], clSourcefile); // Get command line args for quick test or QA test, if provided bNoPrompt = (bool)shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt"); bQATest = (bool)shrCheckCmdLineFlag(argc, (const char**)argv, "qatest"); // Menu items if (!(bQATest)) { ShowMenuItems(); } // Find the path from the exe to the image file cPathAndName = shrFindFilePath(cImageFile, argv[0]); oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup); shrLog("Image File\t = %s\nImage Dimensions = %u w x %u h x %u bpp\n\n", cPathAndName, uiImageWidth, uiImageHeight, sizeof(unsigned int)<<3); // Initialize OpenGL items (if not No-GL QA test) shrLog("%sInitGL...\n\n", bQATest ? "Skipping " : "Calling "); if (!(bQATest)) { InitGL(&argc, argv); } //Get the NVIDIA platform if available, otherwise use default char cBuffer[1024]; bool bNV = false; shrLog("Get Platform ID... 
"); ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); ciErrNum = clGetPlatformInfo (cpPlatform, CL_PLATFORM_NAME, sizeof(cBuffer), cBuffer, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); shrLog("%s\n\n", cBuffer); bNV = (strstr(cBuffer, "NVIDIA") != NULL); //Get the devices shrLog("Get Device Info...\n"); cl_uint uiNumAllDevs = 0; GpuDevMngr = new DeviceManager(cpPlatform, &uiNumAllDevs, pCleanup); // Get selected device if specified, otherwise examine avaiable ones and choose by perf cl_int iSelectedDevice = 0; if((shrGetCmdLineArgumenti(argc, (const char**)argv, "device", &iSelectedDevice)) || (uiNumAllDevs == 1)) { // Use 1 selected device GpuDevMngr->uiUsefulDevCt = 1; iSelectedDevice = CLAMP((cl_uint)iSelectedDevice, 0, (uiNumAllDevs - 1)); GpuDevMngr->uiUsefulDevs[0] = iSelectedDevice; GpuDevMngr->fLoadProportions[0] = 1.0f; shrLog(" Using 1 Selected Device for Sobel Filter Computation...\n"); } else { // Use available useful devices and Compute the device load proportions ciErrNum = GpuDevMngr->GetDevLoadProportions(bNV); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); if (GpuDevMngr->uiUsefulDevCt == 1) { iSelectedDevice = GpuDevMngr->uiUsefulDevs[0]; } shrLog(" Using %u Device(s) for Sobel Filter Computation\n", GpuDevMngr->uiUsefulDevCt); } //Create the context shrLog("\nclCreateContext...\n\n"); cxGPUContext = clCreateContext(0, uiNumAllDevs, GpuDevMngr->cdDevices, NULL, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Allocate per-device OpenCL objects for useful devices cqCommandQueue = new cl_command_queue[GpuDevMngr->uiUsefulDevCt]; ckSobel = new cl_kernel[GpuDevMngr->uiUsefulDevCt]; cmDevBufIn = new cl_mem[GpuDevMngr->uiUsefulDevCt]; cmDevBufOut = new cl_mem[GpuDevMngr->uiUsefulDevCt]; szAllocDevBytes = new size_t[GpuDevMngr->uiUsefulDevCt]; uiInHostPixOffsets = new cl_uint[GpuDevMngr->uiUsefulDevCt]; uiOutHostPixOffsets = new cl_uint[GpuDevMngr->uiUsefulDevCt]; uiDevImageHeight 
= new cl_uint[GpuDevMngr->uiUsefulDevCt]; // Create command queue(s) for device(s) shrLog("clCreateCommandQueue...\n"); for (cl_uint i = 0; i < GpuDevMngr->uiUsefulDevCt; i++) { cqCommandQueue[i] = clCreateCommandQueue(cxGPUContext, GpuDevMngr->cdDevices[GpuDevMngr->uiUsefulDevs[i]], 0, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); shrLog(" CommandQueue %u, Device %u, Device Load Proportion = %.2f, ", i, GpuDevMngr->uiUsefulDevs[i], GpuDevMngr->fLoadProportions[i]); oclPrintDevName(LOGBOTH, GpuDevMngr->cdDevices[GpuDevMngr->uiUsefulDevs[i]]); shrLog("\n"); } // Allocate pinned input and output host image buffers: mem copy operations to/from pinned memory is much faster than paged memory szBuffBytes = uiImageWidth * uiImageHeight * sizeof (unsigned int); cmPinnedBufIn = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); cmPinnedBufOut = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); shrLog("\nclCreateBuffer (Input and Output Pinned Host buffers)...\n"); // Get mapped pointers for writing to pinned input and output host image pointers uiInput = (cl_uint*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedBufIn, CL_TRUE, CL_MAP_WRITE, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); uiOutput = (cl_uint*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedBufOut, CL_TRUE, CL_MAP_READ, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); shrLog("clEnqueueMapBuffer (Pointer to Input and Output pinned host buffers)...\n"); // Load image data from file to pinned input host buffer ciErrNum = shrLoadPPM4ub(cPathAndName, (unsigned char **)&uiInput, &uiImageWidth, &uiImageHeight); oclCheckErrorEX(ciErrNum, shrTRUE, pCleanup); shrLog("Load Input Image to Input pinned host buffer...\n"); // 
Read the kernel in from file free(cPathAndName); cPathAndName = shrFindFilePath(clSourcefile, argv[0]); oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup); cSourceCL = oclLoadProgSource(cPathAndName, "// My comment\n", &szKernelLength); oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup); shrLog("Load OpenCL Prog Source from File...\n"); // Create the program object cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); shrLog("clCreateProgramWithSource...\n"); // Build the program with 'mad' Optimization option #ifdef MAC char *flags = "-cl-fast-relaxed-math -DMAC"; #else char *flags = "-cl-fast-relaxed-math"; #endif ciErrNum = clBuildProgram(cpProgram, 0, NULL, flags, NULL, NULL); if (ciErrNum != CL_SUCCESS) { // On error: write out standard error, Build Log and PTX, then cleanup and exit shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclSobelFilter.ptx"); Cleanup(EXIT_FAILURE); } shrLog("clBuildProgram...\n\n"); // Determine, the size/shape of the image portions for each dev and create the device buffers unsigned uiSumHeight = 0; for (cl_uint i = 0; i < GpuDevMngr->uiUsefulDevCt; i++) { // Create kernel instance ckSobel[i] = clCreateKernel(cpProgram, "ckSobel", &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); shrLog("clCreateKernel (ckSobel), Device %u...\n", i); // Allocations and offsets for the portion of the image worked on by each device if (GpuDevMngr->uiUsefulDevCt == 1) { // One device processes the whole image with no offset uiDevImageHeight[i] = uiImageHeight; uiInHostPixOffsets[i] = 0; uiOutHostPixOffsets[i] = 0; szAllocDevBytes[i] = uiDevImageHeight[i] * uiImageWidth * sizeof(cl_uint); } else if (i == 0) { // Multiple devices, top stripe zone including topmost row of image: // Over-allocate on device by 1 row // Set 
offset and size to copy extra 1 padding row H2D (below bottom of stripe) // Won't return the last row (dark/garbage row) D2H uiInHostPixOffsets[i] = 0; uiOutHostPixOffsets[i] = 0; uiDevImageHeight[i] = (cl_uint)(GpuDevMngr->fLoadProportions[GpuDevMngr->uiUsefulDevs[i]] * (float)uiImageHeight); // height is proportional to dev perf uiSumHeight += uiDevImageHeight[i]; uiDevImageHeight[i] += 1; szAllocDevBytes[i] = uiDevImageHeight[i] * uiImageWidth * sizeof(cl_uint); } else if (i < (GpuDevMngr->uiUsefulDevCt - 1)) { // Multiple devices, middle stripe zone: // Over-allocate on device by 2 rows // Set offset and size to copy extra 2 padding rows H2D (above top and below bottom of stripe) // Won't return the first and last rows (dark/garbage rows) D2H uiInHostPixOffsets[i] = (uiSumHeight - 1) * uiImageWidth; uiOutHostPixOffsets[i] = uiInHostPixOffsets[i] + uiImageWidth; uiDevImageHeight[i] = (cl_uint)(GpuDevMngr->fLoadProportions[GpuDevMngr->uiUsefulDevs[i]] * (float)uiImageHeight); // height is proportional to dev perf uiSumHeight += uiDevImageHeight[i]; uiDevImageHeight[i] += 2; szAllocDevBytes[i] = uiDevImageHeight[i] * uiImageWidth * sizeof(cl_uint); } else { // Multiple devices, last boundary tile: // Over-allocate on device by 1 row // Set offset and size to copy extra 1 padding row H2D (above top of stripe) // Won't return the first row (dark/garbage rows D2H uiInHostPixOffsets[i] = (uiSumHeight - 1) * uiImageWidth; uiOutHostPixOffsets[i] = uiInHostPixOffsets[i] + uiImageWidth; uiDevImageHeight[i] = uiImageHeight - uiSumHeight; // "leftover" rows uiSumHeight += uiDevImageHeight[i]; uiDevImageHeight[i] += 1; szAllocDevBytes[i] = uiDevImageHeight[i] * uiImageWidth * sizeof(cl_uint); } shrLog("Image Height (rows) for Device %u = %u...\n", i, uiDevImageHeight[i]); // Create the device buffers in GMEM on each device cmDevBufIn[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, szAllocDevBytes[i], NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); 
cmDevBufOut[i] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, szAllocDevBytes[i], NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); shrLog("clCreateBuffer (Input and Output GMEM buffers, Device %u)...\n", i); // Set the common argument values for the Median kernel instance for each device int iLocalPixPitch = iBlockDimX + 2; ciErrNum = clSetKernelArg(ckSobel[i], 0, sizeof(cl_mem), (void*)&cmDevBufIn[i]); ciErrNum |= clSetKernelArg(ckSobel[i], 1, sizeof(cl_mem), (void*)&cmDevBufOut[i]); ciErrNum |= clSetKernelArg(ckSobel[i], 2, (iLocalPixPitch * (iBlockDimY + 2) * sizeof(cl_uchar4)), NULL); ciErrNum |= clSetKernelArg(ckSobel[i], 3, sizeof(cl_int), (void*)&iLocalPixPitch); ciErrNum |= clSetKernelArg(ckSobel[i], 4, sizeof(cl_uint), (void*)&uiImageWidth); ciErrNum |= clSetKernelArg(ckSobel[i], 6, sizeof(cl_float), (void*)&fThresh); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); shrLog("clSetKernelArg (0-4), Device %u...\n\n", i); } // Set common global and local work sizes for Median kernel szLocalWorkSize[0] = iBlockDimX; szLocalWorkSize[1] = iBlockDimY; szGlobalWorkSize[0] = shrRoundUp((int)szLocalWorkSize[0], uiImageWidth); // init running timers shrDeltaT(0); // timer 0 used for computation timing shrDeltaT(1); // timer 1 used for fps computation // Start main GLUT rendering loop for processing and rendering, // or otherwise run No-GL Q/A test sequence if (!(bQATest)) { glutMainLoop(); } else { TestNoGL(); } Cleanup(EXIT_SUCCESS); }
int main(int argc, char *argv[]) { cl_platform_id platform; cl_device_id device; cl_context context; cl_command_queue queue; cl_program program; cl_kernel kernel; cl_mem buff_A, buff_B, buff_C; int mult = 1; uint32_t uiWA, uiHA, uiWB, uiHB, uiWC, uiHC; uiWA = WA * mult; uiHA = HA * mult; uiWB = WB * mult; uiHB = HB * mult; uiWC = WC * mult; uiHC = HC * mult; printf("sizes WA %u HA %u WB %u HB %u WC %u HC %u\n", uiWA, uiHA, uiWB, uiHB, uiWC, uiHC); uint32_t size_A = uiWA * uiHA; uint32_t size_B = uiWB * uiHB; uint32_t size_C = uiWC * uiHC; size_t mem_size_A = size_A * sizeof(float); size_t mem_size_B = size_B * sizeof(float); size_t mem_size_C = size_C * sizeof(float); float *data_A = (float *)malloc(mem_size_A); float *data_B = (float *)malloc(mem_size_B); float *data_C = (float *)malloc(mem_size_C); srand(2012); shrFillArray(data_A, size_A); shrFillArray(data_B, size_B); size_t global_work_size[2]; size_t local_work_size[] = { BLOCK_SIZE, BLOCK_SIZE }; global_work_size[0] = shrRoundUp(BLOCK_SIZE, uiWC); global_work_size[1] = shrRoundUp(BLOCK_SIZE, uiHA); const char *source = load_program_source("MatrixMul.cl"); size_t source_len = strlen(source);; cl_uint err = 0; char *flags = "-cl-fast-relaxed-math"; clGetPlatformIDs(1, &platform, NULL); printf("platform %p err %d\n", platform, err); clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, &err); printf("device %p err %d\n", device, err); context = clCreateContext(0, 1, &device, NULL, NULL, &err); printf("context %p err %d\n", context, err); queue = clCreateCommandQueue(context, device, 0, &err); printf("queue %p err %d\n", queue, err); program = clCreateProgramWithSource(context, 1, &source, &source_len, &err); printf("program %p err %d\n", program, err); err = clBuildProgram(program, 0, NULL, flags, NULL, NULL); printf("err %d\n", err); kernel = clCreateKernel(program, "matrixMul", &err); printf("kernel %p err %d\n", kernel, err); buff_A = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
mem_size_A, data_A, NULL); printf("buff_A %p\n", buff_A); buff_B = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size_B, data_B, NULL); printf("buff_B %p\n", buff_B); buff_C = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size_C, NULL, NULL); printf("buff_C %p\n", buff_C); err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&buff_C); printf("err %d\n", err); err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&buff_A); printf("err %d\n", err); err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&buff_B); printf("err %d\n", err); err = clSetKernelArg(kernel, 3, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, NULL); printf("err %d\n", err); err = clSetKernelArg(kernel, 4, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, NULL); printf("err %d\n", err); err = clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&uiWA); printf("err %d\n", err); err = clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&uiWB); printf("err %d\n", err); err = clSetKernelArg(kernel, 7, sizeof(cl_int), (void*)&uiHA); printf("err %d\n", err); err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL); printf("err %d\n", err); err = clFlush(queue); printf("err %d\n", err); err = clFinish(queue); printf("err %d\n", err); err = clEnqueueReadBuffer(queue, buff_C, CL_TRUE, 0, mem_size_C, data_C, 0, NULL, NULL); printf("err %d\n", err); int i; for (i = 0; i < size_C; i++) { printf("%d %f\n", i, data_C[i]); } clReleaseMemObject(buff_A); clReleaseMemObject(buff_B); clReleaseMemObject(buff_C); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(queue); clReleaseProgram(program); }
// Runs the Bluestein's-algorithm FFT stage across all available GPUs.
//   argv  - program arguments, forwarded to compileProgram for kernel build paths
//   n     - padded transform length (power of two; powM = log2(n) is passed to the kernel)
//   orign - original (unpadded) transform length
//   size  - total number of elements to process, split across deviceCount GPUs
// Uses file-scope state: deviceCount, kernel[], workSize[], workOffset[], blockSize,
// the d_* device buffers, the h_* host buffers, and gpuDone[] events.
// NOTE(review): workOffset[0] is read before this function writes any offset — assumes
// it was zero-initialized (e.g. a static/global array); verify at the definition site.
void bluesteinsFFTGpu(const char* const argv[], const unsigned n, const unsigned orign, const unsigned size)
{
    const unsigned powM = (unsigned) log2(n);
    printf("Compiling Bluesteins Program..\n");
    compileProgram(argv, "fft.h", "kernels/bluesteins.cl");
    printf("Creating Kernel\n");
    // One "bluesteins" kernel instance per device
    for (unsigned i = 0; i < deviceCount; ++i) {
        createKernel(i, "bluesteins");
    }
    // Split the work evenly; the last device absorbs the remainder
    const unsigned sizePerGPU = size / deviceCount;
    for (unsigned i = 0; i < deviceCount; ++i) {
        workSize[i] = (i != (deviceCount - 1)) ? sizePerGPU : (size - workOffset[i]);
        allocateDeviceMemoryBS(i , workSize[i], workOffset[i]);
        // Kernel signature: H (in/out?), Y, Z buffers (real/imag pairs), then scalars.
        // NOTE(review): clSetKernelArg return codes are ignored here; a bad arg index or
        // size would go unnoticed until the launch fails.
        clSetKernelArg(kernel[i], 0, sizeof(cl_mem), (void*) &d_Hreal[i]);
        clSetKernelArg(kernel[i], 1, sizeof(cl_mem), (void*) &d_Himag[i]);
        clSetKernelArg(kernel[i], 2, sizeof(cl_mem), (void*) &d_Yreal[i]);
        clSetKernelArg(kernel[i], 3, sizeof(cl_mem), (void*) &d_Yimag[i]);
        clSetKernelArg(kernel[i], 4, sizeof(cl_mem), (void*) &d_Zreal[i]);
        clSetKernelArg(kernel[i], 5, sizeof(cl_mem), (void*) &d_Zimag[i]);
        clSetKernelArg(kernel[i], 6, sizeof(unsigned), &n);
        clSetKernelArg(kernel[i], 7, sizeof(unsigned), &orign);
        clSetKernelArg(kernel[i], 8, sizeof(unsigned), &powM);
        clSetKernelArg(kernel[i], 9, sizeof(unsigned), &blockSize);
        // Next device's offset starts where this device's slice ends
        if ((i + 1) < deviceCount) {
            workOffset[i + 1] = workOffset[i] + workSize[i];
        }
    }
    // Launch on every device; global size is the per-device slice rounded up to blockSize
    size_t localWorkSize[] = {blockSize};
    for (unsigned i = 0; i < deviceCount; ++i) {
        size_t globalWorkSize[] = {shrRoundUp(blockSize, workSize[i])};
        // kernel non blocking execution
        runKernel(i, localWorkSize, globalWorkSize);
    }
    // Results are gathered back into the H host arrays, aliased as the R (result) pointers
    h_Rreal = h_Hreal;
    h_Rimag = h_Himag;
    for (unsigned i = 0; i < deviceCount; ++i) {
        copyFromDevice(i, d_Hreal[i], h_Rreal + workOffset[i], workSize[i]);
        copyFromDevice(i, d_Himag[i], h_Rimag + workOffset[i], workSize[i]);
    }
    // wait for copy event (one gpuDone event per device, presumably signaled by copyFromDevice)
    const cl_int ciErrNum = clWaitForEvents(deviceCount, gpuDone);
    checkError(ciErrNum, CL_SUCCESS, "clWaitForEvents");
    printGpuTime();
}