int AtomicCounters::setupCL(void) {
  cl_int status = 0;
  cl_device_type dType;
  if (sampleArgs->deviceType.compare("cpu") == 0) {
    dType = CL_DEVICE_TYPE_CPU;
  } else  // deviceType = "gpu"
  {
    dType = CL_DEVICE_TYPE_GPU;
    if (sampleArgs->isThereGPU() == false) {
      std::cout << "GPU not found. Falling back to CPU" << std::endl;
      dType = CL_DEVICE_TYPE_CPU;
    }
  }
  cl_platform_id platform = NULL;
  int retValue = getPlatform(platform, sampleArgs->platformId,
                             sampleArgs->isPlatformEnabled());
  CHECK_ERROR(retValue, SDK_SUCCESS, "getPlatform() failed.");
  // Display available devices.
  retValue = displayDevices(platform, dType);
  CHECK_ERROR(retValue, SDK_SUCCESS, "displayDevices() failed.");
  cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM,
                                  (cl_context_properties)platform, 0};
  context = clCreateContextFromType(cps, dType, NULL, NULL, &status);
  CHECK_OPENCL_ERROR(status, "clCreateContextFromType failed.");
  // getting device on which to run the sample
  status = getDevices(context, &devices, sampleArgs->deviceId,
                      sampleArgs->isDeviceIdEnabled());
  CHECK_ERROR(status, SDK_SUCCESS, "getDevices() failed ");
  // Set device info of given cl_device_id
  retValue = deviceInfo.setDeviceInfo(devices[sampleArgs->deviceId]);
  CHECK_ERROR(retValue, SDK_SUCCESS, "SDKDeviceInfo::setDeviceInfo() failed");
  // Check device extensions
  if (!strstr(deviceInfo.extensions, "cl_ext_atomic_counters_32")) {
    OPENCL_EXPECTED_ERROR(
        "Device does not support cl_ext_atomic_counters_32 extension!");
  }
  if (!strstr(deviceInfo.extensions, "cl_khr_local_int32_base_atomics")) {
    OPENCL_EXPECTED_ERROR(
        "Device does not support cl_khr_local_int32_base_atomics extension!");
  }
  // Get OpenCL device version
  std::string deviceVersionStr = std::string(deviceInfo.deviceVersion);
  size_t vStart = deviceVersionStr.find(" ", 0);
  size_t vEnd = deviceVersionStr.find(" ", vStart + 1);
  std::string vStrVal = deviceVersionStr.substr(vStart + 1, vEnd - vStart - 1);
// Check of OPENCL_C_VERSION if device version is 1.1 or later
#ifdef CL_VERSION_1_1
  if (deviceInfo.openclCVersion) {
    // Exit if OpenCL C device version is 1.0
    deviceVersionStr = std::string(deviceInfo.openclCVersion);
    vStart = deviceVersionStr.find(" ", 0);
    vStart = deviceVersionStr.find(" ", vStart + 1);
    vEnd = deviceVersionStr.find(" ", vStart + 1);
    vStrVal = deviceVersionStr.substr(vStart + 1, vEnd - vStart - 1);
    if (vStrVal.compare("1.0") <= 0) {
      OPENCL_EXPECTED_ERROR(
          "Unsupported device! Required CL_DEVICE_OPENCL_C_VERSION as 1.1");
    }
  } else {
    OPENCL_EXPECTED_ERROR(
        "Unsupported device! Required CL_DEVICE_OPENCL_C_VERSION as 1.1");
  }
#else
  OPENCL_EXPECTED_ERROR(
      "Unsupported device! Required CL_DEVICE_OPENCL_C_VERSION as 1.1");
#endif
  // Setup application data
  if (setupAtomicCounters() != SDK_SUCCESS) {
    return SDK_FAILURE;
  }
  cl_command_queue_properties props = CL_QUEUE_PROFILING_ENABLE;
  commandQueue = clCreateCommandQueue(context, devices[sampleArgs->deviceId],
                                      props, &status);
  CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed(commandQueue)");
  // Set Persistent memory only for AMD platform
  cl_mem_flags inMemFlags = CL_MEM_READ_ONLY;
  if (sampleArgs->isAmdPlatform()) {
    inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD;
  }
  // Create buffer for input array
  inBuf = clCreateBuffer(context, inMemFlags, length * sizeof(cl_uint), NULL,
                         &status);
  CHECK_OPENCL_ERROR(status, "clCreateBuffer failed.(inBuf)");
  // Set up data for input array
  cl_event writeEvt;
  status =
      clEnqueueWriteBuffer(commandQueue, inBuf, CL_FALSE, 0,
                           length * sizeof(cl_uint), input, 0, NULL, &writeEvt);
  CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer(inBuf) failed..");
  status = clFlush(commandQueue);
  CHECK_OPENCL_ERROR(status, "clFlush(commandQueue) failed.");
  counterOutBuf = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uint),
                                 NULL, &status);
  CHECK_OPENCL_ERROR(status, "clCreateBuffer failed.(counterOutBuf).");
  globalOutBuf = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uint),
                                NULL, &status);
  CHECK_OPENCL_ERROR(status, "clCreateBuffer failed.(globalOutBuf).");
  // create a CL program using the kernel source
  buildProgramData buildData;
  buildData.kernelName = std::string("AtomicCounters_Kernels.cl");
  buildData.devices = devices;
  buildData.deviceId = sampleArgs->deviceId;
  buildData.flagsStr = std::string("");
  if (sampleArgs->isLoadBinaryEnabled()) {
    buildData.binaryName = std::string(sampleArgs->loadBinary.c_str());
  }
  if (sampleArgs->isComplierFlagsSpecified()) {
    buildData.flagsFileName = std::string(sampleArgs->flags.c_str());
  }
  retValue = buildOpenCLProgram(program, context, buildData);
  CHECK_ERROR(retValue, SDK_SUCCESS, "buildOpenCLProgram() failed");
  // ConstantBuffer bandwidth from single access
  counterKernel = clCreateKernel(program, "atomicCounters", &status);
  CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(counterKernel).");
  globalKernel = clCreateKernel(program, "globalAtomics", &status);
  CHECK_OPENCL_ERROR(status, "clCreateKernel(globalKernel) failed.");
  status = kernelInfoC.setKernelWorkGroupInfo(counterKernel,
                                              devices[sampleArgs->deviceId]);
  CHECK_OPENCL_ERROR(status, "kernelInfo.setKernelWorkGroupInfo failed");
  status = kernelInfoG.setKernelWorkGroupInfo(globalKernel,
                                              devices[sampleArgs->deviceId]);
  CHECK_OPENCL_ERROR(status, "kernelInfo.setKernelWorkGroupInfo failed");
  if (counterWorkGroupSize > kernelInfoC.kernelWorkGroupSize) {
    if (!sampleArgs->quiet) {
      std::cout << "Out of Resources!" << std::endl;
      std::cout << "Group Size specified : " << counterWorkGroupSize
                << std::endl;
      std::cout << "Max Group Size supported on the kernel(readKernel) : "
                << kernelInfoC.kernelWorkGroupSize << std::endl;
      std::cout << "Falling back to " << kernelInfoC.kernelWorkGroupSize
                << std::endl;
    }
    counterWorkGroupSize = kernelInfoC.kernelWorkGroupSize;
  }
  if (globalWorkGroupSize > kernelInfoG.kernelWorkGroupSize) {
    if (!sampleArgs->quiet) {
      std::cout << "Out of Resources!" << std::endl;
      std::cout << "Group Size specified : " << globalWorkGroupSize
                << std::endl;
      std::cout << "Max Group Size supported on the kernel(writeKernel) : "
                << kernelInfoG.kernelWorkGroupSize << std::endl;
      std::cout << "Falling back to " << kernelInfoG.kernelWorkGroupSize
                << std::endl;
    }
    globalWorkGroupSize = kernelInfoG.kernelWorkGroupSize;
  }
  // Wait for event and release event
  status = waitForEventAndRelease(&writeEvt);
  CHECK_OPENCL_ERROR(status, "waitForEventAndRelease(writeEvt) failed.");
  return SDK_SUCCESS;
}
int DwtHaar1D::runDwtHaar1DKernel()
{
    cl_int status;

    status = this->setWorkGroupSize();
    CHECK_ERROR(status, SDK_SUCCESS, "setWorkGroupSize failed");

    // Force write to inData Buf to update its values
    cl_event writeEvt;
    status = clEnqueueWriteBuffer(
                 commandQueue,
                 inDataBuf,
                 CL_FALSE,
                 0,
                 curSignalLength * sizeof(cl_float),
                 inData,
                 0,
                 NULL,
                 &writeEvt);
    CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed. (inDataBuf)");

    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush failed.");

    status = waitForEventAndRelease(&writeEvt);
    CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(writeEvt1) Failed");

    // Whether sort is to be in increasing order. CL_TRUE implies increasing
    status = clSetKernelArg(kernel,
                            0,
                            sizeof(cl_mem),
                            (void*)&inDataBuf);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (inDataBuf)");

    status = clSetKernelArg(kernel,
                            1,
                            sizeof(cl_mem),
                            (void*)&dOutDataBuf);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (dOutDataBuf)");

    status = clSetKernelArg(kernel,
                            2,
                            sizeof(cl_mem),
                            (void*)&dPartialOutDataBuf);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (dPartialOutData)");

    status = clSetKernelArg(kernel,
                            3,
                            (localThreads * 2 * sizeof(cl_float)),
                            NULL);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (local memory)");

    status = clSetKernelArg(kernel,
                            4,
                            sizeof(cl_uint),
                            (void*)&totalLevels);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (totalLevels)");

    status = clSetKernelArg(kernel,
                            5,
                            sizeof(cl_uint),
                            (void*)&curSignalLength);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (curSignalLength)");

    status = clSetKernelArg(kernel,
                            6,
                            sizeof(cl_uint),
                            (void*)&levelsDone);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (levelsDone)");

    status = clSetKernelArg(kernel,
                            7,
                            sizeof(cl_uint),
                            (void*)&maxLevelsOnDevice);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (levelsDone)");

    /*
    * Enqueue a kernel run call.
    */
    cl_event ndrEvt;
    status = clEnqueueNDRangeKernel(
                 commandQueue,
                 kernel,
                 1,
                 NULL,
                 &globalThreads,
                 &localThreads,
                 0,
                 NULL,
                 &ndrEvt);
    CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel failed.");

    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush failed.");

    status = waitForEventAndRelease(&ndrEvt);
    CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(ndrEvt1) Failed");

    // Enqueue the results to application pointer
    cl_event readEvt1;
    status = clEnqueueReadBuffer(
                 commandQueue,
                 dOutDataBuf,
                 CL_FALSE,
                 0,
                 signalLength * sizeof(cl_float),
                 dOutData,
                 0,
                 NULL,
                 &readEvt1);
    CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer failed.");

    // Enqueue the results to application pointer
    cl_event readEvt2;
    status = clEnqueueReadBuffer(
                 commandQueue,
                 dPartialOutDataBuf,
                 CL_FALSE,
                 0,
                 signalLength * sizeof(cl_float),
                 dPartialOutData,
                 0,
                 NULL,
                 &readEvt2);
    CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer failed.");

    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush failed.");

    status = waitForEventAndRelease(&readEvt1);
    CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(readEvt1) Failed");

    status = waitForEventAndRelease(&readEvt2);
    CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(readEvt2) Failed");

    return SDK_SUCCESS;
}
int AtomicCounters::runGlobalAtomicKernel() {
  cl_int status = CL_SUCCESS;
  // Set Global and Local work items
  size_t globalWorkItems = length;
  size_t localWorkItems = globalWorkGroupSize;
  // Initialize the counter value
  cl_event writeEvt;
  status =
      clEnqueueWriteBuffer(commandQueue, globalOutBuf, CL_FALSE, 0,
                           sizeof(cl_uint), &initValue, 0, NULL, &writeEvt);
  CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer(globalOutBuf) failed.");
  status = clFlush(commandQueue);
  CHECK_OPENCL_ERROR(status, "clFlush() failed.");
  // Wait for event and release event
  status = waitForEventAndRelease(&writeEvt);
  CHECK_OPENCL_ERROR(status, "waitForEventAndRelease(writeEvt) failed.");
  // Set kernel arguments
  status = clSetKernelArg(globalKernel, 0, sizeof(cl_mem), &inBuf);
  CHECK_OPENCL_ERROR(status, "clSetKernelArg(inBuf) failed.");
  status = clSetKernelArg(globalKernel, 1, sizeof(cl_uint), &value);
  CHECK_OPENCL_ERROR(status, "clSetKernelArg(value) failed.");
  status = clSetKernelArg(globalKernel, 2, sizeof(cl_mem), &globalOutBuf);
  CHECK_OPENCL_ERROR(status, "clSetKernelArg(globalOutBuf) failed.");
  // Run Kernel
  cl_event ndrEvt;
  status = clEnqueueNDRangeKernel(commandQueue, globalKernel, 1, NULL,
                                  &globalWorkItems, &localWorkItems, 0, NULL,
                                  &ndrEvt);
  CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel(globalKernel) failed.");
  status = clFlush(commandQueue);
  CHECK_OPENCL_ERROR(status, "clFlush(commandQueue) failed.");
  cl_int eventStatus = CL_QUEUED;
  while (eventStatus != CL_COMPLETE) {
    status = clGetEventInfo(ndrEvt, CL_EVENT_COMMAND_EXECUTION_STATUS,
                            sizeof(cl_int), &eventStatus, NULL);
    CHECK_OPENCL_ERROR(status, "clGetEventInfo(ndrEvt) failed.");
  }
  cl_ulong startTime;
  cl_ulong endTime;
  // Get profiling information
  status = clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_START,
                                   sizeof(cl_ulong), &startTime, NULL);
  CHECK_OPENCL_ERROR(
      status, "clGetEventProfilingInfo(CL_PROFILING_COMMAND_START) failed.");
  status = clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_END,
                                   sizeof(cl_ulong), &endTime, NULL);
  CHECK_OPENCL_ERROR(
      status, "clGetEventProfilingInfo(CL_PROFILING_COMMAND_END) failed.");
  double sec = 1e-9 * (endTime - startTime);
  kTimeAtomGlobal += sec;
  status = clReleaseEvent(ndrEvt);
  CHECK_OPENCL_ERROR(status, "clReleaseEvent(ndrEvt) failed.");
  // Get the occurrences of Value from atomicKernel
  cl_event readEvt;
  status = clEnqueueReadBuffer(commandQueue, globalOutBuf, CL_FALSE, 0,
                               sizeof(cl_uint), &globalOut, 0, NULL, &readEvt);
  CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer(globalOutBuf) failed.");
  status = clFlush(commandQueue);
  CHECK_OPENCL_ERROR(status, "clFlush() failed.");
  // Wait for event and release event
  status = waitForEventAndRelease(&readEvt);
  CHECK_OPENCL_ERROR(status, "waitForEventAndRelease(readEvt) failed.");
  return SDK_SUCCESS;
}
Пример #4
0
int
URNG::runCLKernels()
{
    cl_int status;

    // Set input data
    cl_event writeEvt;
    status = clEnqueueWriteBuffer(
                 commandQueue,
                 inputImageBuffer,
                 CL_FALSE,
                 0,
                 width * height * sizeof(cl_uchar4),
                 inputImageData,
                 0,
                 NULL,
                 &writeEvt);
    CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed. (inputImageBuffer)");

    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush failed.");

    status = waitForEventAndRelease(&writeEvt);
    CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(writeEvt) Failed");

    // Set appropriate arguments to the kernel

    // input buffer image
    status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImageBuffer);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (inputImageBuffer)");

    // outBuffer imager
    status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImageBuffer);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (outputImageBuffer)");

    // input buffer image
    status = clSetKernelArg(kernel, 2, sizeof(factor), &factor);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (factor)");

    // Enqueue a kernel run call.
    size_t globalThreads[] = {width, height};
    size_t localThreads[] = {blockSizeX, blockSizeY};

    cl_event ndrEvt;
    status = clEnqueueNDRangeKernel(
                 commandQueue,
                 kernel,
                 2,
                 NULL,
                 globalThreads,
                 localThreads,
                 0,
                 NULL,
                 &ndrEvt);
    CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel failed.");

    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush failed.");

    status = waitForEventAndRelease(&ndrEvt);
    CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(ndrEvt) Failed");

    // Enqueue readBuffer
    cl_event readEvt;
    status = clEnqueueReadBuffer(
                 commandQueue,
                 outputImageBuffer,
                 CL_TRUE,
                 0,
                 width * height * pixelSize,
                 outputImageData,
                 0,
                 NULL,
                 &readEvt);
    CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer failed.");

    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush failed.");

    status = waitForEventAndRelease(&readEvt);
    CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(readEvt) Failed");

    return SDK_SUCCESS;
}
bool SimpleConvolution::runCLKernels( const float* pInArray, const float* pMaskArray, float* pOutArray){
	cl_mem inputBuffer = 0, maskBuffer = 0, outputBuffer = 0;
	//cl_int nArraySize = width * height;
	cl_int errNum = 0;
	cl_event events[2];
	inputBuffer = clCreateBuffer(m_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, width*height*sizeof(float), (void*)pInArray, &errNum );
	if(errNum != CL_SUCCESS){
		printf( "ERROR: allocation of device input array.\n" );
		return false;
	}
	maskBuffer = clCreateBuffer(m_context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, maskWidth*maskHeight*sizeof(float), (void*)pMaskArray, &errNum);
	if(errNum != CL_SUCCESS){
		clReleaseMemObject(inputBuffer);
		printf( "ERROR: allocation of device input array.\n" );
		return false;
	}
	outputBuffer = clCreateBuffer(m_context, CL_MEM_READ_WRITE, width*height*sizeof(float), NULL, &errNum );
	if(errNum != CL_SUCCESS){
		clReleaseMemObject(inputBuffer);
		clReleaseMemObject(maskBuffer);
		printf( "ERROR: allocation of device input array.\n" );
		return false;
	}

	size_t gws[1] = { width * height };
	size_t lws[1] = { 256 };
	errNum |= clSetKernelArg( m_kernel, 0, sizeof(cl_mem), (void *)&outputBuffer );
	errNum |= clSetKernelArg( m_kernel, 1, sizeof(cl_mem), (void *)&inputBuffer);
	errNum |= clSetKernelArg( m_kernel, 2, sizeof(cl_mem), (void *)&maskBuffer);
	cl_uint2 inputDimensions = {width, height};
	cl_uint2 maskDimensions  = {maskWidth, maskHeight};
	errNum |= clSetKernelArg( m_kernel, 3, sizeof(cl_uint2), (void *)&inputDimensions );
	errNum |= clSetKernelArg( m_kernel, 4, sizeof(cl_uint2), (void *)&maskDimensions );
	if( errNum != CL_SUCCESS )
	{
		FrmLogMessage( "Error setting kernel arguments" );
		return false;
	}

	errNum = clEnqueueNDRangeKernel( m_commandQueue, m_kernel, 1, NULL,
		gws, lws, 0, NULL, &events[0] );
	if( errNum != CL_SUCCESS )
	{
		FrmLogMessage( "Error setting kernel arguments" );
		return false;
	}
	errNum = clFlush(m_commandQueue);
	errNum = waitForEventAndRelease(&events[0]);

	errNum = clEnqueueReadBuffer( m_commandQueue, outputBuffer, CL_TRUE, 0, width * height * sizeof(float), pOutArray, 0, NULL, &events[1]);
	if(errNum != CL_SUCCESS)
	{
		return false;
	}
	errNum = clFlush(m_commandQueue);
	errNum = waitForEventAndRelease(&events[1]);

	clReleaseMemObject(inputBuffer);
	clReleaseMemObject(maskBuffer);
	clReleaseMemObject(outputBuffer);
	return true;
}
int
Histogram::runCLKernels(void)
{
    cl_int status;
    cl_int eventStatus = CL_QUEUED;

    status = this->setWorkGroupSize();
    CHECK_ERROR(status, SDK_SUCCESS, "setKernelWorkGroupSize() failed");

    // whether sort is to be in increasing order. CL_TRUE implies increasing
    status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&dataBuf);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (dataBuf)");

    status = clSetKernelArg(kernel, 1, groupSize * binSize * sizeof(cl_uchar),
                            NULL);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (local memory)");

    status = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&midDeviceBinBuf);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (deviceBinBuf)");

    // Enqueue a kernel run call.

    cl_event ndrEvt;
    status = clEnqueueNDRangeKernel(
                 commandQueue,
                 kernel,
                 1,
                 NULL,
                 &globalThreads,
                 &localThreads,
                 0,
                 NULL,
                 &ndrEvt);
    CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel failed.");

    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush failed.");

    status = waitForEventAndRelease(&ndrEvt);
    CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(ndrEvt1) Failed");

    status = mapBuffer( midDeviceBinBuf, midDeviceBin,
                        subHistgCnt * binSize * sizeof(cl_uint), CL_MAP_READ);
    CHECK_ERROR(status, SDK_SUCCESS,
                "Failed to map device buffer.(midDeviceBinBuf)");

    // Clear deviceBin array
    memset(deviceBin, 0, binSize * sizeof(cl_uint));

    // Calculate final histogram bin
    for(int i = 0; i < subHistgCnt; ++i)
    {
        for(int j = 0; j < binSize; ++j)
        {
            deviceBin[j] += midDeviceBin[i * binSize + j];
        }
    }

    status = unmapBuffer( midDeviceBinBuf, midDeviceBin);
    CHECK_ERROR(status, SDK_SUCCESS,
                "Failed to unmap device buffer.(midDeviceBinBuf)");

    return SDK_SUCCESS;
}
int BoxFilterSeparable::runCLKernels() {
  cl_int status;
  cl_int eventStatus = CL_QUEUED;

  // Set input data
  cl_event writeEvt;
  status = clEnqueueWriteBuffer(commandQueue, inputImageBuffer, CL_FALSE, 0,
                                width * height * pixelSize, inputImageData, 0,
                                NULL, &writeEvt);
  CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed. (inputImageBuffer)");

  status = clFlush(commandQueue);
  CHECK_OPENCL_ERROR(status, "clFlush failed.");

  status = waitForEventAndRelease(&writeEvt);
  CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(writeEvt) Failed");

  // Set appropriate arguments to the kernel

  // input buffer image
  status =
      clSetKernelArg(horizontalKernel, 0, sizeof(cl_mem), &inputImageBuffer);
  CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (inputImageBuffer)");

  // outBuffer imager
  status =
      clSetKernelArg(horizontalKernel, 1, sizeof(cl_mem), &tempImageBuffer);
  CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (outputImageBuffer)");

  // filter width
  status = clSetKernelArg(horizontalKernel, 2, sizeof(cl_int), &filterWidth);
  CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (filterWidth)");

#ifdef USE_LDS
  // shared memory
  status =
      clSetKernelArg(horizontalKernel, 3,
                     (GROUP_SIZE + filterWidth - 1) * sizeof(cl_uchar4), 0);
  CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (local memory)");
#endif
  // Enqueue a kernel run call.
  size_t globalThreads[] = {width, height};
  size_t localThreads[] = {blockSizeX, blockSizeY};

  cl_event ndrEvt1;
  status =
      clEnqueueNDRangeKernel(commandQueue, horizontalKernel, 2, NULL,
                             globalThreads, localThreads, 0, NULL, &ndrEvt1);
  CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel failed.");

  status = clFlush(commandQueue);
  CHECK_OPENCL_ERROR(status, "clFlush failed.");

  status = waitForEventAndRelease(&ndrEvt1);
  CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(ndrEvt1) Failed");

  // Do vertical pass

  // Set appropriate arguments to the kernel

  // input buffer image
  status = clSetKernelArg(verticalKernel, 0, sizeof(cl_mem), &tempImageBuffer);
  CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (inputImageBuffer)");

  // outBuffer imager
  status =
      clSetKernelArg(verticalKernel, 1, sizeof(cl_mem), &outputImageBuffer);
  CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (outputImageBuffer)");

  // filter width
  status = clSetKernelArg(verticalKernel, 2, sizeof(cl_int), &filterWidth);
  CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (filterWidth)");

  // Enqueue a kernel run call.
  cl_event ndrEvt2;
  status =
      clEnqueueNDRangeKernel(commandQueue, verticalKernel, 2, NULL,
                             globalThreads, localThreads, 0, NULL, &ndrEvt2);
  CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel failed.");

  status = clFlush(commandQueue);
  CHECK_OPENCL_ERROR(status, "clFlush failed.");

  status = waitForEventAndRelease(&ndrEvt2);
  CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(ndrEvt2) Failed");

  // Enqueue readBuffer
  cl_event readEvt;
  status = clEnqueueReadBuffer(commandQueue, outputImageBuffer, CL_FALSE, 0,
                               width * height * pixelSize, outputImageData, 0,
                               NULL, &readEvt);
  CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer failed.");

  status = clFlush(commandQueue);
  CHECK_OPENCL_ERROR(status, "clFlush failed.");

  status = waitForEventAndRelease(&readEvt);
  CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(readEvt) Failed");

  return SDK_SUCCESS;
}
int
MersenneTwister::setupCL(void)
{
    cl_int status = 0;
    cl_device_type dType;

    if(sampleArgs->deviceType.compare("cpu") == 0)
    {
        dType = CL_DEVICE_TYPE_CPU;
    }
    else //deviceType = "gpu"
    {
        dType = CL_DEVICE_TYPE_GPU;
        if(sampleArgs->isThereGPU() == false)
        {
            std::cout << "GPU not found. Falling back to CPU device" << std::endl;
            dType = CL_DEVICE_TYPE_CPU;
        }
    }

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */
    cl_platform_id platform = NULL;
    int retValue = getPlatform(platform, sampleArgs->platformId,
                               sampleArgs->isPlatformEnabled());
    CHECK_ERROR(retValue, SDK_SUCCESS, "getPlatform() failed");

    retValue = displayDevices(platform, dType);
    CHECK_ERROR(retValue, SDK_SUCCESS, "displayDevices() failed");

    /*
     * If we could find our platform, use it. Otherwise use just available platform.
     */

    cl_context_properties cps[3] =
    {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties)platform,
        0
    };

    context = clCreateContextFromType(cps,
                                      dType,
                                      NULL,
                                      NULL,
                                      &status);

    if(checkVal(status,
                CL_SUCCESS,
                "clCreateContextFromType failed."))
    {
        return SDK_FAILURE;
    }

    // getting device on which to run the sample
    status = getDevices(context, &devices, sampleArgs->deviceId,
                       sampleArgs->isDeviceIdEnabled());
    CHECK_ERROR(status, 0, "getDevices() failed");

    //Set device info of given cl_device_id
    retValue = deviceInfo.setDeviceInfo(devices[sampleArgs->deviceId]);
    CHECK_ERROR(retValue, 0, "SDKDeviceInfo::setDeviceInfo() failed");

    {
        // The block is to move the declaration of prop closer to its use
        cl_command_queue_properties prop = 0;
        commandQueue = clCreateCommandQueue(context,
                                            devices[sampleArgs->deviceId],
                                            prop,
                                            &status);
        if(checkVal(status,
                    0,
                    "clCreateCommandQueue failed."))
        {
            return SDK_FAILURE;
        }
    }

    // Set Persistent memory only for AMD platform
    cl_mem_flags inMemFlags = CL_MEM_READ_ONLY;
    if(sampleArgs->isAmdPlatform())
    {
        inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD;
    }

    seedsBuf = clCreateBuffer(context,
                              inMemFlags,
                              width * height * sizeof(cl_float4),
                              0,
                              &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (seedsBuf)");

    resultBuf = clCreateBuffer(context,
                               CL_MEM_WRITE_ONLY,
                               width * height * sizeof(cl_float4) * mulFactor,
                               NULL,
                               &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (resultBuf)");

    cl_event writeEvt;
    // Enqueue write to seedsBuf
    status = clEnqueueWriteBuffer(commandQueue,
                                  seedsBuf,
                                  CL_FALSE,
                                  0,
                                  width * height * sizeof(cl_float4),
                                  seeds,
                                  0,
                                  NULL,
                                  &writeEvt);
    CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed. (seedsBuf)");

    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush failed.");

    status = waitForEventAndRelease(&writeEvt);
    CHECK_ERROR(status,SDK_SUCCESS, "WaitForEventAndRelease(inMapEvt1) Failed");

    // create a CL program using the kernel source
    buildProgramData buildData;
    buildData.kernelName = std::string("MersenneTwister_Kernels.cl");
    buildData.devices = devices;
    buildData.deviceId = sampleArgs->deviceId;
    buildData.flagsStr = std::string("-x clc++ ");
    if(sampleArgs->isLoadBinaryEnabled())
    {
        buildData.binaryName = std::string(sampleArgs->loadBinary.c_str());
    }

    if(sampleArgs->isComplierFlagsSpecified())
    {
        buildData.flagsFileName = std::string(sampleArgs->flags.c_str());
    }

    retValue = buildOpenCLProgram(program, context, buildData);
    CHECK_ERROR(retValue, SDK_SUCCESS, "buildOpenCLProgram() failed");

    // get a kernel object handle for a kernel with the given name
    kernel = clCreateKernel(program, "gaussianRand", &status);
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.");

    return SDK_SUCCESS;
}
int
MersenneTwister::runCLKernels(void)
{
    cl_int   status;

    status = kernelInfo.setKernelWorkGroupInfo(kernel,
             devices[sampleArgs->deviceId]);
    CHECK_ERROR(status, SDK_SUCCESS, " setKernelWorkGroupInfo() failed");

    // Calculate 2D block size according to required work-group size by kernel
    kernelInfo.kernelWorkGroupSize = kernelInfo.kernelWorkGroupSize;

    kernelInfo.kernelWorkGroupSize = kernelInfo.kernelWorkGroupSize > GROUP_SIZE ?
                                     GROUP_SIZE : kernelInfo.kernelWorkGroupSize;
    while((blockSizeX * blockSizeY) < kernelInfo.kernelWorkGroupSize)
    {
        if(2 * blockSizeX * blockSizeY <= kernelInfo.kernelWorkGroupSize)
        {
            blockSizeX <<= 1;
        }
        if(2 * blockSizeX * blockSizeY <= kernelInfo.kernelWorkGroupSize)
        {
            blockSizeY <<= 1;
        }
    }

    size_t globalThreads[2] = {width, height};
    size_t localThreads[2] = {blockSizeX, blockSizeY};

    if(localThreads[0] > deviceInfo.maxWorkItemSizes[0] ||
            localThreads[1] > deviceInfo.maxWorkItemSizes[1]||
            (size_t)blockSizeX * blockSizeY > deviceInfo.maxWorkGroupSize)
    {
        std::cout<<"Unsupported: Device does not support"
                 "requested number of work items.";
        return SDK_FAILURE;
    }

    // Set appropriate arguments to the kernel

    // Seeds array
    status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&seedsBuf);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (seedsBuf)");

    // width - i.e width of seeds array
    status = clSetKernelArg(kernel, 1, sizeof(cl_uint), (void*)&width);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (width)");

    // mulFactor - i.e each seed generates mulFactor random numbers
    status = clSetKernelArg(kernel, 2, sizeof(cl_uint), (void*)&mulFactor);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (mulFactor)");

    // resultBuf
    status = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&resultBuf);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (resultBuf)");

    /*
     * Enqueue a kernel run call.
     * Each thread generates mulFactor random numbers from given seed.
     */
    cl_event ndrEvt;
    status = clEnqueueNDRangeKernel(commandQueue,
                                    kernel,
                                    2,
                                    NULL,
                                    globalThreads,
                                    localThreads,
                                    0,
                                    NULL,
                                    &ndrEvt);
    CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel failed.");

    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush failed.(commandQueue)");

    status = waitForEventAndRelease(&ndrEvt);
    CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(inMapEvt1) Failed");

    cl_event readEvt;
    // Enqueue the results to application pointer
    status = clEnqueueReadBuffer(commandQueue,
                                 resultBuf,
                                 CL_FALSE,
                                 0,
                                 width * height * mulFactor * sizeof(cl_float4),
                                 deviceResult,
                                 0,
                                 NULL,
                                 &readEvt);
    CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer failed.");

    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush failed.");

    status = waitForEventAndRelease(&readEvt);
    CHECK_ERROR(status,SDK_SUCCESS, "WaitForEventAndRelease(inMapEvt1) Failed");

    return SDK_SUCCESS;
}
int
FastWalshTransform::runCLKernels(void)
{
    cl_int   status;
    size_t globalThreads[1];
    size_t localThreads[1];

    // Enqueue write input to inputBuffer
    cl_event writeEvt;
    status = clEnqueueWriteBuffer(
                 commandQueue,
                 inputBuffer,
                 CL_FALSE,
                 0,
                 length * sizeof(cl_float),
                 input,
                 0,
                 NULL,
                 &writeEvt);
    CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed.");

    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush failed.(commandQueue)");

    status = waitForEventAndRelease(&writeEvt);
    CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(writeEvt) Failed");

    /*
     * The kernel performs a butterfly operation and it runs for half the
     * total number of input elements in the array.
     * In each pass of the kernel two corresponding elements are found using
     * the butterfly operation on an array of numbers and their sum and difference
     * is stored in the same locations as the numbers
     */
    globalThreads[0] = length / 2;
    localThreads[0]  = 256;

    // Check group size against kernelWorkGroupSize
    status = kernelInfo.setKernelWorkGroupInfo(kernel,
             devices[sampleArgs->deviceId]);
    CHECK_OPENCL_ERROR(status, "kernelInfo.setKernelWorkGroupInfo failed.");

    if((cl_uint)(localThreads[0]) > kernelInfo.kernelWorkGroupSize)
    {
        if(!sampleArgs->quiet)
        {
            std::cout << "Out of Resources!" << std::endl;
            std::cout << "Group Size specified : " << localThreads[0] << std::endl;
            std::cout << "Max Group Size supported on the kernel : "
                      << kernelInfo.kernelWorkGroupSize << std::endl;
            std::cout<<"Changing the group size to " << kernelInfo.kernelWorkGroupSize
                     << std::endl;
        }
        localThreads[0] = kernelInfo.kernelWorkGroupSize;
    }

    // Set appropriate arguments to the kernel

    // the input array - also acts as output
    status = clSetKernelArg(
                 kernel,
                 0,
                 sizeof(cl_mem),
                 (void *)&inputBuffer);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (inputBuffer)");

    for(cl_int step = 1; step < length; step <<= 1)
    {
        // stage of the algorithm
        status = clSetKernelArg(
                     kernel,
                     1,
                     sizeof(cl_int),
                     (void *)&step);
        CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (step)");

        // Enqueue a kernel run call
        cl_event ndrEvt;
        status = clEnqueueNDRangeKernel(
                     commandQueue,
                     kernel,
                     1,
                     NULL,
                     globalThreads,
                     localThreads,
                     0,
                     NULL,
                     &ndrEvt);
        CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel failed.");

        status = clFlush(commandQueue);
        CHECK_OPENCL_ERROR(status, "clFlush failed.(commandQueue)");

        status = waitForEventAndRelease(&ndrEvt);
        CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(ndrEvt) Failed");
    }


    // Enqueue readBuffer
    cl_event readEvt;
    status = clEnqueueReadBuffer(
                 commandQueue,
                 inputBuffer,
                 CL_FALSE,
                 0,
                 length *  sizeof(cl_float),
                 output,
                 0,
                 NULL,
                 &readEvt);
    CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer failed.");

    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush failed.(commandQueue)");

    status = waitForEventAndRelease(&readEvt);
    CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(readEvt) Failed");

    return SDK_SUCCESS;
}