int AtomicCounters::setupCL(void) { cl_int status = 0; cl_device_type dType; if (sampleArgs->deviceType.compare("cpu") == 0) { dType = CL_DEVICE_TYPE_CPU; } else // deviceType = "gpu" { dType = CL_DEVICE_TYPE_GPU; if (sampleArgs->isThereGPU() == false) { std::cout << "GPU not found. Falling back to CPU" << std::endl; dType = CL_DEVICE_TYPE_CPU; } } cl_platform_id platform = NULL; int retValue = getPlatform(platform, sampleArgs->platformId, sampleArgs->isPlatformEnabled()); CHECK_ERROR(retValue, SDK_SUCCESS, "getPlatform() failed."); // Display available devices. retValue = displayDevices(platform, dType); CHECK_ERROR(retValue, SDK_SUCCESS, "displayDevices() failed."); cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0}; context = clCreateContextFromType(cps, dType, NULL, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateContextFromType failed."); // getting device on which to run the sample status = getDevices(context, &devices, sampleArgs->deviceId, sampleArgs->isDeviceIdEnabled()); CHECK_ERROR(status, SDK_SUCCESS, "getDevices() failed "); // Set device info of given cl_device_id retValue = deviceInfo.setDeviceInfo(devices[sampleArgs->deviceId]); CHECK_ERROR(retValue, SDK_SUCCESS, "SDKDeviceInfo::setDeviceInfo() failed"); // Check device extensions if (!strstr(deviceInfo.extensions, "cl_ext_atomic_counters_32")) { OPENCL_EXPECTED_ERROR( "Device does not support cl_ext_atomic_counters_32 extension!"); } if (!strstr(deviceInfo.extensions, "cl_khr_local_int32_base_atomics")) { OPENCL_EXPECTED_ERROR( "Device does not support cl_khr_local_int32_base_atomics extension!"); } // Get OpenCL device version std::string deviceVersionStr = std::string(deviceInfo.deviceVersion); size_t vStart = deviceVersionStr.find(" ", 0); size_t vEnd = deviceVersionStr.find(" ", vStart + 1); std::string vStrVal = deviceVersionStr.substr(vStart + 1, vEnd - vStart - 1); // Check of OPENCL_C_VERSION if device version is 1.1 or later #ifdef CL_VERSION_1_1 if (deviceInfo.openclCVersion) { // Exit if OpenCL C device version is 1.0 deviceVersionStr = std::string(deviceInfo.openclCVersion); vStart = deviceVersionStr.find(" ", 0); vStart = deviceVersionStr.find(" ", vStart + 1); vEnd = deviceVersionStr.find(" ", vStart + 1); vStrVal = deviceVersionStr.substr(vStart + 1, vEnd - vStart - 1); if (vStrVal.compare("1.0") <= 0) { OPENCL_EXPECTED_ERROR( "Unsupported device! Required CL_DEVICE_OPENCL_C_VERSION as 1.1"); } } else { OPENCL_EXPECTED_ERROR( "Unsupported device! Required CL_DEVICE_OPENCL_C_VERSION as 1.1"); } #else OPENCL_EXPECTED_ERROR( "Unsupported device! Required CL_DEVICE_OPENCL_C_VERSION as 1.1"); #endif // Setup application data if (setupAtomicCounters() != SDK_SUCCESS) { return SDK_FAILURE; } cl_command_queue_properties props = CL_QUEUE_PROFILING_ENABLE; commandQueue = clCreateCommandQueue(context, devices[sampleArgs->deviceId], props, &status); CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed(commandQueue)"); // Set Persistent memory only for AMD platform cl_mem_flags inMemFlags = CL_MEM_READ_ONLY; if (sampleArgs->isAmdPlatform()) { inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD; } // Create buffer for input array inBuf = clCreateBuffer(context, inMemFlags, length * sizeof(cl_uint), NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed.(inBuf)"); // Set up data for input array cl_event writeEvt; status = clEnqueueWriteBuffer(commandQueue, inBuf, CL_FALSE, 0, length * sizeof(cl_uint), input, 0, NULL, &writeEvt); CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer(inBuf) failed.."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush(commandQueue) failed."); counterOutBuf = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uint), NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed.(counterOutBuf)."); globalOutBuf = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uint), NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed.(globalOutBuf)."); // create a CL program using the kernel source buildProgramData buildData; buildData.kernelName = std::string("AtomicCounters_Kernels.cl"); buildData.devices = devices; buildData.deviceId = sampleArgs->deviceId; buildData.flagsStr = std::string(""); if (sampleArgs->isLoadBinaryEnabled()) { buildData.binaryName = std::string(sampleArgs->loadBinary.c_str()); } if (sampleArgs->isComplierFlagsSpecified()) { buildData.flagsFileName = std::string(sampleArgs->flags.c_str()); } retValue = buildOpenCLProgram(program, context, buildData); CHECK_ERROR(retValue, SDK_SUCCESS, "buildOpenCLProgram() failed"); // ConstantBuffer bandwidth from single access counterKernel = clCreateKernel(program, "atomicCounters", &status); CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(counterKernel)."); globalKernel = clCreateKernel(program, "globalAtomics", &status); CHECK_OPENCL_ERROR(status, "clCreateKernel(globalKernel) failed."); status = kernelInfoC.setKernelWorkGroupInfo(counterKernel, devices[sampleArgs->deviceId]); CHECK_OPENCL_ERROR(status, "kernelInfo.setKernelWorkGroupInfo failed"); status = kernelInfoG.setKernelWorkGroupInfo(globalKernel, devices[sampleArgs->deviceId]); CHECK_OPENCL_ERROR(status, "kernelInfo.setKernelWorkGroupInfo failed"); if (counterWorkGroupSize > kernelInfoC.kernelWorkGroupSize) { if (!sampleArgs->quiet) { std::cout << "Out of Resources!" << std::endl; std::cout << "Group Size specified : " << counterWorkGroupSize << std::endl; std::cout << "Max Group Size supported on the kernel(readKernel) : " << kernelInfoC.kernelWorkGroupSize << std::endl; std::cout << "Falling back to " << kernelInfoC.kernelWorkGroupSize << std::endl; } counterWorkGroupSize = kernelInfoC.kernelWorkGroupSize; } if (globalWorkGroupSize > kernelInfoG.kernelWorkGroupSize) { if (!sampleArgs->quiet) { std::cout << "Out of Resources!" << std::endl; std::cout << "Group Size specified : " << globalWorkGroupSize << std::endl; std::cout << "Max Group Size supported on the kernel(writeKernel) : " << kernelInfoG.kernelWorkGroupSize << std::endl; std::cout << "Falling back to " << kernelInfoG.kernelWorkGroupSize << std::endl; } globalWorkGroupSize = kernelInfoG.kernelWorkGroupSize; } // Wait for event and release event status = waitForEventAndRelease(&writeEvt); CHECK_OPENCL_ERROR(status, "waitForEventAndRelease(writeEvt) failed."); return SDK_SUCCESS; }
int DwtHaar1D::runDwtHaar1DKernel() { cl_int status; status = this->setWorkGroupSize(); CHECK_ERROR(status, SDK_SUCCESS, "setWorkGroupSize failed"); // Force write to inData Buf to update its values cl_event writeEvt; status = clEnqueueWriteBuffer( commandQueue, inDataBuf, CL_FALSE, 0, curSignalLength * sizeof(cl_float), inData, 0, NULL, &writeEvt); CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed. (inDataBuf)"); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush failed."); status = waitForEventAndRelease(&writeEvt); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(writeEvt1) Failed"); // Whether sort is to be in increasing order. CL_TRUE implies increasing status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&inDataBuf); CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (inDataBuf)"); status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&dOutDataBuf); CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (dOutDataBuf)"); status = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&dPartialOutDataBuf); CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (dPartialOutData)"); status = clSetKernelArg(kernel, 3, (localThreads * 2 * sizeof(cl_float)), NULL); CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (local memory)"); status = clSetKernelArg(kernel, 4, sizeof(cl_uint), (void*)&totalLevels); CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (totalLevels)"); status = clSetKernelArg(kernel, 5, sizeof(cl_uint), (void*)&curSignalLength); CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (curSignalLength)"); status = clSetKernelArg(kernel, 6, sizeof(cl_uint), (void*)&levelsDone); CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (levelsDone)"); status = clSetKernelArg(kernel, 7, sizeof(cl_uint), (void*)&maxLevelsOnDevice); CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (levelsDone)"); /* * Enqueue a kernel run call. */ cl_event ndrEvt; status = clEnqueueNDRangeKernel( commandQueue, kernel, 1, NULL, &globalThreads, &localThreads, 0, NULL, &ndrEvt); CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush failed."); status = waitForEventAndRelease(&ndrEvt); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(ndrEvt1) Failed"); // Enqueue the results to application pointer cl_event readEvt1; status = clEnqueueReadBuffer( commandQueue, dOutDataBuf, CL_FALSE, 0, signalLength * sizeof(cl_float), dOutData, 0, NULL, &readEvt1); CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer failed."); // Enqueue the results to application pointer cl_event readEvt2; status = clEnqueueReadBuffer( commandQueue, dPartialOutDataBuf, CL_FALSE, 0, signalLength * sizeof(cl_float), dPartialOutData, 0, NULL, &readEvt2); CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush failed."); status = waitForEventAndRelease(&readEvt1); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(readEvt1) Failed"); status = waitForEventAndRelease(&readEvt2); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(readEvt2) Failed"); return SDK_SUCCESS; }
int AtomicCounters::runGlobalAtomicKernel() { cl_int status = CL_SUCCESS; // Set Global and Local work items size_t globalWorkItems = length; size_t localWorkItems = globalWorkGroupSize; // Initialize the counter value cl_event writeEvt; status = clEnqueueWriteBuffer(commandQueue, globalOutBuf, CL_FALSE, 0, sizeof(cl_uint), &initValue, 0, NULL, &writeEvt); CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer(globalOutBuf) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); // Wait for event and release event status = waitForEventAndRelease(&writeEvt); CHECK_OPENCL_ERROR(status, "waitForEventAndRelease(writeEvt) failed."); // Set kernel arguments status = clSetKernelArg(globalKernel, 0, sizeof(cl_mem), &inBuf); CHECK_OPENCL_ERROR(status, "clSetKernelArg(inBuf) failed."); status = clSetKernelArg(globalKernel, 1, sizeof(cl_uint), &value); CHECK_OPENCL_ERROR(status, "clSetKernelArg(value) failed."); status = clSetKernelArg(globalKernel, 2, sizeof(cl_mem), &globalOutBuf); CHECK_OPENCL_ERROR(status, "clSetKernelArg(globalOutBuf) failed."); // Run Kernel cl_event ndrEvt; status = clEnqueueNDRangeKernel(commandQueue, globalKernel, 1, NULL, &globalWorkItems, &localWorkItems, 0, NULL, &ndrEvt); CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel(globalKernel) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush(commandQueue) failed."); cl_int eventStatus = CL_QUEUED; while (eventStatus != CL_COMPLETE) { status = clGetEventInfo(ndrEvt, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, NULL); CHECK_OPENCL_ERROR(status, "clGetEventInfo(ndrEvt) failed."); } cl_ulong startTime; cl_ulong endTime; // Get profiling information status = clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, NULL); CHECK_OPENCL_ERROR( status, "clGetEventProfilingInfo(CL_PROFILING_COMMAND_START) failed."); status = clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL); CHECK_OPENCL_ERROR( status, "clGetEventProfilingInfo(CL_PROFILING_COMMAND_END) failed."); double sec = 1e-9 * (endTime - startTime); kTimeAtomGlobal += sec; status = clReleaseEvent(ndrEvt); CHECK_OPENCL_ERROR(status, "clReleaseEvent(ndrEvt) failed."); // Get the occurrences of Value from atomicKernel cl_event readEvt; status = clEnqueueReadBuffer(commandQueue, globalOutBuf, CL_FALSE, 0, sizeof(cl_uint), &globalOut, 0, NULL, &readEvt); CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer(globalOutBuf) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); // Wait for event and release event status = waitForEventAndRelease(&readEvt); CHECK_OPENCL_ERROR(status, "waitForEventAndRelease(readEvt) failed."); return SDK_SUCCESS; }
int URNG::runCLKernels() { cl_int status; // Set input data cl_event writeEvt; status = clEnqueueWriteBuffer( commandQueue, inputImageBuffer, CL_FALSE, 0, width * height * sizeof(cl_uchar4), inputImageData, 0, NULL, &writeEvt); CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed. (inputImageBuffer)"); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush failed."); status = waitForEventAndRelease(&writeEvt); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(writeEvt) Failed"); // Set appropriate arguments to the kernel // input buffer image status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImageBuffer); CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (inputImageBuffer)"); // outBuffer imager status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImageBuffer); CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (outputImageBuffer)"); // input buffer image status = clSetKernelArg(kernel, 2, sizeof(factor), &factor); CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (factor)"); // Enqueue a kernel run call. size_t globalThreads[] = {width, height}; size_t localThreads[] = {blockSizeX, blockSizeY}; cl_event ndrEvt; status = clEnqueueNDRangeKernel( commandQueue, kernel, 2, NULL, globalThreads, localThreads, 0, NULL, &ndrEvt); CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush failed."); status = waitForEventAndRelease(&ndrEvt); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(ndrEvt) Failed"); // Enqueue readBuffer cl_event readEvt; status = clEnqueueReadBuffer( commandQueue, outputImageBuffer, CL_TRUE, 0, width * height * pixelSize, outputImageData, 0, NULL, &readEvt); CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush failed."); status = waitForEventAndRelease(&readEvt); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(readEvt) Failed"); return SDK_SUCCESS; }
bool SimpleConvolution::runCLKernels( const float* pInArray, const float* pMaskArray, float* pOutArray){ cl_mem inputBuffer = 0, maskBuffer = 0, outputBuffer = 0; //cl_int nArraySize = width * height; cl_int errNum = 0; cl_event events[2]; inputBuffer = clCreateBuffer(m_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, width*height*sizeof(float), (void*)pInArray, &errNum ); if(errNum != CL_SUCCESS){ printf( "ERROR: allocation of device input array.\n" ); return false; } maskBuffer = clCreateBuffer(m_context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, maskWidth*maskHeight*sizeof(float), (void*)pMaskArray, &errNum); if(errNum != CL_SUCCESS){ clReleaseMemObject(inputBuffer); printf( "ERROR: allocation of device input array.\n" ); return false; } outputBuffer = clCreateBuffer(m_context, CL_MEM_READ_WRITE, width*height*sizeof(float), NULL, &errNum ); if(errNum != CL_SUCCESS){ clReleaseMemObject(inputBuffer); clReleaseMemObject(maskBuffer); printf( "ERROR: allocation of device input array.\n" ); return false; } size_t gws[1] = { width * height }; size_t lws[1] = { 256 }; errNum |= clSetKernelArg( m_kernel, 0, sizeof(cl_mem), (void *)&outputBuffer ); errNum |= clSetKernelArg( m_kernel, 1, sizeof(cl_mem), (void *)&inputBuffer); errNum |= clSetKernelArg( m_kernel, 2, sizeof(cl_mem), (void *)&maskBuffer); cl_uint2 inputDimensions = {width, height}; cl_uint2 maskDimensions = {maskWidth, maskHeight}; errNum |= clSetKernelArg( m_kernel, 3, sizeof(cl_uint2), (void *)&inputDimensions ); errNum |= clSetKernelArg( m_kernel, 4, sizeof(cl_uint2), (void *)&maskDimensions ); if( errNum != CL_SUCCESS ) { FrmLogMessage( "Error setting kernel arguments" ); return false; } errNum = clEnqueueNDRangeKernel( m_commandQueue, m_kernel, 1, NULL, gws, lws, 0, NULL, &events[0] ); if( errNum != CL_SUCCESS ) { FrmLogMessage( "Error setting kernel arguments" ); return false; } errNum = clFlush(m_commandQueue); errNum = waitForEventAndRelease(&events[0]); errNum = clEnqueueReadBuffer( m_commandQueue, outputBuffer, CL_TRUE, 0, width * height * sizeof(float), pOutArray, 0, NULL, &events[1]); if(errNum != CL_SUCCESS) { return false; } errNum = clFlush(m_commandQueue); errNum = waitForEventAndRelease(&events[1]); clReleaseMemObject(inputBuffer); clReleaseMemObject(maskBuffer); clReleaseMemObject(outputBuffer); return true; }
int Histogram::runCLKernels(void) { cl_int status; cl_int eventStatus = CL_QUEUED; status = this->setWorkGroupSize(); CHECK_ERROR(status, SDK_SUCCESS, "setKernelWorkGroupSize() failed"); // whether sort is to be in increasing order. CL_TRUE implies increasing status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&dataBuf); CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (dataBuf)"); status = clSetKernelArg(kernel, 1, groupSize * binSize * sizeof(cl_uchar), NULL); CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (local memory)"); status = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&midDeviceBinBuf); CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (deviceBinBuf)"); // Enqueue a kernel run call. cl_event ndrEvt; status = clEnqueueNDRangeKernel( commandQueue, kernel, 1, NULL, &globalThreads, &localThreads, 0, NULL, &ndrEvt); CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush failed."); status = waitForEventAndRelease(&ndrEvt); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(ndrEvt1) Failed"); status = mapBuffer( midDeviceBinBuf, midDeviceBin, subHistgCnt * binSize * sizeof(cl_uint), CL_MAP_READ); CHECK_ERROR(status, SDK_SUCCESS, "Failed to map device buffer.(midDeviceBinBuf)"); // Clear deviceBin array memset(deviceBin, 0, binSize * sizeof(cl_uint)); // Calculate final histogram bin for(int i = 0; i < subHistgCnt; ++i) { for(int j = 0; j < binSize; ++j) { deviceBin[j] += midDeviceBin[i * binSize + j]; } } status = unmapBuffer( midDeviceBinBuf, midDeviceBin); CHECK_ERROR(status, SDK_SUCCESS, "Failed to unmap device buffer.(midDeviceBinBuf)"); return SDK_SUCCESS; }
int BoxFilterSeparable::runCLKernels() { cl_int status; cl_int eventStatus = CL_QUEUED; // Set input data cl_event writeEvt; status = clEnqueueWriteBuffer(commandQueue, inputImageBuffer, CL_FALSE, 0, width * height * pixelSize, inputImageData, 0, NULL, &writeEvt); CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed. (inputImageBuffer)"); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush failed."); status = waitForEventAndRelease(&writeEvt); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(writeEvt) Failed"); // Set appropriate arguments to the kernel // input buffer image status = clSetKernelArg(horizontalKernel, 0, sizeof(cl_mem), &inputImageBuffer); CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (inputImageBuffer)"); // outBuffer imager status = clSetKernelArg(horizontalKernel, 1, sizeof(cl_mem), &tempImageBuffer); CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (outputImageBuffer)"); // filter width status = clSetKernelArg(horizontalKernel, 2, sizeof(cl_int), &filterWidth); CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (filterWidth)"); #ifdef USE_LDS // shared memory status = clSetKernelArg(horizontalKernel, 3, (GROUP_SIZE + filterWidth - 1) * sizeof(cl_uchar4), 0); CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (local memory)"); #endif // Enqueue a kernel run call. size_t globalThreads[] = {width, height}; size_t localThreads[] = {blockSizeX, blockSizeY}; cl_event ndrEvt1; status = clEnqueueNDRangeKernel(commandQueue, horizontalKernel, 2, NULL, globalThreads, localThreads, 0, NULL, &ndrEvt1); CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush failed."); status = waitForEventAndRelease(&ndrEvt1); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(ndrEvt1) Failed"); // Do vertical pass // Set appropriate arguments to the kernel // input buffer image status = clSetKernelArg(verticalKernel, 0, sizeof(cl_mem), &tempImageBuffer); CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (inputImageBuffer)"); // outBuffer imager status = clSetKernelArg(verticalKernel, 1, sizeof(cl_mem), &outputImageBuffer); CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (outputImageBuffer)"); // filter width status = clSetKernelArg(verticalKernel, 2, sizeof(cl_int), &filterWidth); CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (filterWidth)"); // Enqueue a kernel run call. cl_event ndrEvt2; status = clEnqueueNDRangeKernel(commandQueue, verticalKernel, 2, NULL, globalThreads, localThreads, 0, NULL, &ndrEvt2); CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush failed."); status = waitForEventAndRelease(&ndrEvt2); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(ndrEvt2) Failed"); // Enqueue readBuffer cl_event readEvt; status = clEnqueueReadBuffer(commandQueue, outputImageBuffer, CL_FALSE, 0, width * height * pixelSize, outputImageData, 0, NULL, &readEvt); CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush failed."); status = waitForEventAndRelease(&readEvt); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(readEvt) Failed"); return SDK_SUCCESS; }
int MersenneTwister::setupCL(void) { cl_int status = 0; cl_device_type dType; if(sampleArgs->deviceType.compare("cpu") == 0) { dType = CL_DEVICE_TYPE_CPU; } else //deviceType = "gpu" { dType = CL_DEVICE_TYPE_GPU; if(sampleArgs->isThereGPU() == false) { std::cout << "GPU not found. Falling back to CPU device" << std::endl; dType = CL_DEVICE_TYPE_CPU; } } /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_platform_id platform = NULL; int retValue = getPlatform(platform, sampleArgs->platformId, sampleArgs->isPlatformEnabled()); CHECK_ERROR(retValue, SDK_SUCCESS, "getPlatform() failed"); retValue = displayDevices(platform, dType); CHECK_ERROR(retValue, SDK_SUCCESS, "displayDevices() failed"); /* * If we could find our platform, use it. Otherwise use just available platform. */ cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; context = clCreateContextFromType(cps, dType, NULL, NULL, &status); if(checkVal(status, CL_SUCCESS, "clCreateContextFromType failed.")) { return SDK_FAILURE; } // getting device on which to run the sample status = getDevices(context, &devices, sampleArgs->deviceId, sampleArgs->isDeviceIdEnabled()); CHECK_ERROR(status, 0, "getDevices() failed"); //Set device info of given cl_device_id retValue = deviceInfo.setDeviceInfo(devices[sampleArgs->deviceId]); CHECK_ERROR(retValue, 0, "SDKDeviceInfo::setDeviceInfo() failed"); { // The block is to move the declaration of prop closer to its use cl_command_queue_properties prop = 0; commandQueue = clCreateCommandQueue(context, devices[sampleArgs->deviceId], prop, &status); if(checkVal(status, 0, "clCreateCommandQueue failed.")) { return SDK_FAILURE; } } // Set Persistent memory only for AMD platform cl_mem_flags inMemFlags = CL_MEM_READ_ONLY; if(sampleArgs->isAmdPlatform()) { inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD; } seedsBuf = clCreateBuffer(context, inMemFlags, width * height * sizeof(cl_float4), 0, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (seedsBuf)"); resultBuf = clCreateBuffer(context, CL_MEM_WRITE_ONLY, width * height * sizeof(cl_float4) * mulFactor, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (resultBuf)"); cl_event writeEvt; // Enqueue write to seedsBuf status = clEnqueueWriteBuffer(commandQueue, seedsBuf, CL_FALSE, 0, width * height * sizeof(cl_float4), seeds, 0, NULL, &writeEvt); CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed. (seedsBuf)"); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush failed."); status = waitForEventAndRelease(&writeEvt); CHECK_ERROR(status,SDK_SUCCESS, "WaitForEventAndRelease(inMapEvt1) Failed"); // create a CL program using the kernel source buildProgramData buildData; buildData.kernelName = std::string("MersenneTwister_Kernels.cl"); buildData.devices = devices; buildData.deviceId = sampleArgs->deviceId; buildData.flagsStr = std::string("-x clc++ "); if(sampleArgs->isLoadBinaryEnabled()) { buildData.binaryName = std::string(sampleArgs->loadBinary.c_str()); } if(sampleArgs->isComplierFlagsSpecified()) { buildData.flagsFileName = std::string(sampleArgs->flags.c_str()); } retValue = buildOpenCLProgram(program, context, buildData); CHECK_ERROR(retValue, SDK_SUCCESS, "buildOpenCLProgram() failed"); // get a kernel object handle for a kernel with the given name kernel = clCreateKernel(program, "gaussianRand", &status); CHECK_OPENCL_ERROR(status, "clCreateKernel failed."); return SDK_SUCCESS; }
int MersenneTwister::runCLKernels(void) { cl_int status; status = kernelInfo.setKernelWorkGroupInfo(kernel, devices[sampleArgs->deviceId]); CHECK_ERROR(status, SDK_SUCCESS, " setKernelWorkGroupInfo() failed"); // Calculate 2D block size according to required work-group size by kernel kernelInfo.kernelWorkGroupSize = kernelInfo.kernelWorkGroupSize; kernelInfo.kernelWorkGroupSize = kernelInfo.kernelWorkGroupSize > GROUP_SIZE ? GROUP_SIZE : kernelInfo.kernelWorkGroupSize; while((blockSizeX * blockSizeY) < kernelInfo.kernelWorkGroupSize) { if(2 * blockSizeX * blockSizeY <= kernelInfo.kernelWorkGroupSize) { blockSizeX <<= 1; } if(2 * blockSizeX * blockSizeY <= kernelInfo.kernelWorkGroupSize) { blockSizeY <<= 1; } } size_t globalThreads[2] = {width, height}; size_t localThreads[2] = {blockSizeX, blockSizeY}; if(localThreads[0] > deviceInfo.maxWorkItemSizes[0] || localThreads[1] > deviceInfo.maxWorkItemSizes[1]|| (size_t)blockSizeX * blockSizeY > deviceInfo.maxWorkGroupSize) { std::cout<<"Unsupported: Device does not support" "requested number of work items."; return SDK_FAILURE; } // Set appropriate arguments to the kernel // Seeds array status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&seedsBuf); CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (seedsBuf)"); // width - i.e width of seeds array status = clSetKernelArg(kernel, 1, sizeof(cl_uint), (void*)&width); CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (width)"); // mulFactor - i.e each seed generates mulFactor random numbers status = clSetKernelArg(kernel, 2, sizeof(cl_uint), (void*)&mulFactor); CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (mulFactor)"); // resultBuf status = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&resultBuf); CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (resultBuf)"); /* * Enqueue a kernel run call. * Each thread generates mulFactor random numbers from given seed. */ cl_event ndrEvt; status = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalThreads, localThreads, 0, NULL, &ndrEvt); CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush failed.(commandQueue)"); status = waitForEventAndRelease(&ndrEvt); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(inMapEvt1) Failed"); cl_event readEvt; // Enqueue the results to application pointer status = clEnqueueReadBuffer(commandQueue, resultBuf, CL_FALSE, 0, width * height * mulFactor * sizeof(cl_float4), deviceResult, 0, NULL, &readEvt); CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush failed."); status = waitForEventAndRelease(&readEvt); CHECK_ERROR(status,SDK_SUCCESS, "WaitForEventAndRelease(inMapEvt1) Failed"); return SDK_SUCCESS; }
int FastWalshTransform::runCLKernels(void) { cl_int status; size_t globalThreads[1]; size_t localThreads[1]; // Enqueue write input to inputBuffer cl_event writeEvt; status = clEnqueueWriteBuffer( commandQueue, inputBuffer, CL_FALSE, 0, length * sizeof(cl_float), input, 0, NULL, &writeEvt); CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush failed.(commandQueue)"); status = waitForEventAndRelease(&writeEvt); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(writeEvt) Failed"); /* * The kernel performs a butterfly operation and it runs for half the * total number of input elements in the array. * In each pass of the kernel two corresponding elements are found using * the butterfly operation on an array of numbers and their sum and difference * is stored in the same locations as the numbers */ globalThreads[0] = length / 2; localThreads[0] = 256; // Check group size against kernelWorkGroupSize status = kernelInfo.setKernelWorkGroupInfo(kernel, devices[sampleArgs->deviceId]); CHECK_OPENCL_ERROR(status, "kernelInfo.setKernelWorkGroupInfo failed."); if((cl_uint)(localThreads[0]) > kernelInfo.kernelWorkGroupSize) { if(!sampleArgs->quiet) { std::cout << "Out of Resources!" << std::endl; std::cout << "Group Size specified : " << localThreads[0] << std::endl; std::cout << "Max Group Size supported on the kernel : " << kernelInfo.kernelWorkGroupSize << std::endl; std::cout<<"Changing the group size to " << kernelInfo.kernelWorkGroupSize << std::endl; } localThreads[0] = kernelInfo.kernelWorkGroupSize; } // Set appropriate arguments to the kernel // the input array - also acts as output status = clSetKernelArg( kernel, 0, sizeof(cl_mem), (void *)&inputBuffer); CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (inputBuffer)"); for(cl_int step = 1; step < length; step <<= 1) { // stage of the algorithm status = clSetKernelArg( kernel, 1, sizeof(cl_int), (void *)&step); CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (step)"); // Enqueue a kernel run call cl_event ndrEvt; status = clEnqueueNDRangeKernel( commandQueue, kernel, 1, NULL, globalThreads, localThreads, 0, NULL, &ndrEvt); CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush failed.(commandQueue)"); status = waitForEventAndRelease(&ndrEvt); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(ndrEvt) Failed"); } // Enqueue readBuffer cl_event readEvt; status = clEnqueueReadBuffer( commandQueue, inputBuffer, CL_FALSE, 0, length * sizeof(cl_float), output, 0, NULL, &readEvt); CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush failed.(commandQueue)"); status = waitForEventAndRelease(&readEvt); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(readEvt) Failed"); return SDK_SUCCESS; }