int
MatrixMulDouble::setupCL(void)
{
    cl_int status = 0;
    cl_device_type dType;

    if(sampleArgs->deviceType.compare("cpu") == 0)
    {
        dType = CL_DEVICE_TYPE_CPU;
    }
    else //deviceType = "gpu"
    {
        dType = CL_DEVICE_TYPE_GPU;
        if(sampleArgs->isThereGPU() == false)
        {
            std::cout << "GPU not found. Falling back to CPU device" << std::endl;
            dType = CL_DEVICE_TYPE_CPU;
        }
    }

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */
    status = cl::Platform::get(&platforms);
    CHECK_OPENCL_ERROR(status, "Platform::get() failed.");

    std::vector<cl::Platform>::iterator i;
    if(platforms.size() > 0)
    {
        if(sampleArgs->isPlatformEnabled())
        {
            i = platforms.begin() + sampleArgs->platformId;
        }
        else
        {
            for(i = platforms.begin(); i != platforms.end(); ++i)
            {
                if(!strcmp((*i).getInfo<CL_PLATFORM_VENDOR>().c_str(),
                           "Advanced Micro Devices, Inc."))
                {
                    break;
                }
            }
        }
    }

    cl_context_properties cps[3] =
    {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties)(*i)(),
        0
    };


    if(NULL == (*i)())
    {
        error("NULL platform found so Exiting Application.");
        return SDK_FAILURE;
    }

    context = cl::Context(dType, cps, NULL, NULL, &status);
    CHECK_OPENCL_ERROR(status, "Context::Context() failed.");

    devices = context.getInfo<CL_CONTEXT_DEVICES>();
    CHECK_OPENCL_ERROR(status, "Context::getInfo() failed.");

    std::cout << "Platform :" << (*i).getInfo<CL_PLATFORM_VENDOR>().c_str() << "\n";
    int deviceCount = (int)devices.size();
    int j = 0;
    for (std::vector<cl::Device>::iterator i = devices.begin(); i != devices.end();
            ++i, ++j)
    {
        std::cout << "Device " << j << " : ";
        std::string deviceName = (*i).getInfo<CL_DEVICE_NAME>();
        std::cout << deviceName.c_str() << "\n";
    }
    std::cout << "\n";

    if (deviceCount == 0)
    {
        std::cerr << "No device available\n";
        return SDK_FAILURE;
    }

    if(validateDeviceId(sampleArgs->deviceId, deviceCount))
    {
        error("validateDeviceId() failed");
        return SDK_FAILURE;
    }

    std::string extensions =
        devices[sampleArgs->deviceId].getInfo<CL_DEVICE_EXTENSIONS>();

    std::string buildOptions = std::string("");
    // Check if cl_khr_fp64 extension is supported
    if(strstr(extensions.c_str(), "cl_khr_fp64"))
    {
        buildOptions.append("-D KHR_DP_EXTENSION");
    }
    else
    {
        // Check if cl_amd_fp64 extension is supported
        if(!strstr(extensions.c_str(), "cl_amd_fp64"))
        {
            OPENCL_EXPECTED_ERROR("Device does not support cl_amd_fp64 extension!");
        }
    }
    cl_uint localMemType;
    // Get device specific information
    status = devices[sampleArgs->deviceId].getInfo<cl_uint>(
                 CL_DEVICE_LOCAL_MEM_TYPE,
                 &localMemType);
    CHECK_OPENCL_ERROR(status, "Device::getInfo CL_DEVICE_LOCAL_MEM_TYPE) failed.");

    // If scratchpad is available then update the flag
    if(localMemType == CL_LOCAL)
    {
        lds = true;
    }

    // Get Device specific Information
    status = devices[sampleArgs->deviceId].getInfo<size_t>(
                 CL_DEVICE_MAX_WORK_GROUP_SIZE,
                 &maxWorkGroupSize);

    CHECK_OPENCL_ERROR(status,
                       "Device::getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE) failed.");

    status = devices[sampleArgs->deviceId].getInfo<cl_uint>(
                 CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
                 &maxDimensions);
    CHECK_OPENCL_ERROR(status,
                       "Device::getInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) failed.");


    maxWorkItemSizes = (size_t*)malloc(maxDimensions * sizeof(size_t));

    std::vector<size_t> workItems =
        devices[sampleArgs->deviceId].getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>();

    for(cl_uint i = 0; i < maxDimensions; ++i)
    {
        maxWorkItemSizes[i] = workItems[i];
    }

    status = devices[sampleArgs->deviceId].getInfo<cl_ulong>(
                 CL_DEVICE_LOCAL_MEM_SIZE,
                 &totalLocalMemory);
    CHECK_OPENCL_ERROR(status,
                       "Device::getInfo(CL_DEVICE_LOCAL_MEM_SIZES) failed.");

    // Set command queue properties
    cl_command_queue_properties prop = 0;
    if(!eAppGFLOPS)
    {
        prop |= CL_QUEUE_PROFILING_ENABLE;
    }

    commandQueue = cl::CommandQueue(context, devices[sampleArgs->deviceId], prop,
                                    &status);
    CHECK_OPENCL_ERROR(status, "CommandQueue::CommandQueue() failed.");

    // Set Persistent memory only for AMD platform
    cl_mem_flags inMemFlags = CL_MEM_READ_ONLY;
    if(sampleArgs->isAmdPlatform())
    {
        inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD;
    }

    // Create buffer for matrix A
    inputBufA = cl::Buffer(
                    context,
                    inMemFlags,
                    sizeof(cl_double) * widthA * heightA,
                    NULL,
                    &status);
    CHECK_OPENCL_ERROR(status, "cl::Buffer failed. (inputBufA)");

    // Create buffer for matrix B
    inputBufB = cl::Buffer(
                    context,
                    inMemFlags,
                    sizeof(cl_double) * widthB * heightB,
                    NULL,
                    &status);
    CHECK_OPENCL_ERROR(status, "cl::Buffer failed. (inputBufB)");

    outputBuf = cl::Buffer(
                    context,
                    CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
                    sizeof(cl_double) * heightA * widthB,
                    NULL,
                    &status);
    CHECK_OPENCL_ERROR(status, "cl::Buffer failed. (outputBuf)");

    device.push_back(devices[sampleArgs->deviceId]);

    // create a CL program using the kernel source
    SDKFile kernelFile;
    std::string kernelPath = getPath();

    if(sampleArgs->isLoadBinaryEnabled())
    {
        kernelPath.append(sampleArgs->loadBinary.c_str());
        if(kernelFile.readBinaryFromFile(kernelPath.c_str()) != SDK_SUCCESS)
        {
            std::cout << "Failed to load kernel file : " << kernelPath << std::endl;
            return SDK_FAILURE;
        }
        cl::Program::Binaries programBinary(1,std::make_pair(
                                                (const void*)kernelFile.source().data(),
                                                kernelFile.source().size()));

        program = cl::Program(context, device, programBinary, NULL, &status);
        CHECK_OPENCL_ERROR(status, "Program::Program(Binary) failed.");

    }
    else
    {
        kernelPath.append("MatrixMulDouble_Kernels.cl");
        if(!kernelFile.open(kernelPath.c_str()))
        {
            std::cout << "Failed to load kernel file : " << kernelPath << std::endl;
            return SDK_FAILURE;
        }
        cl::Program::Sources programSource(
            1,
            std::make_pair(kernelFile.source().data(),
                           kernelFile.source().size()));

        program = cl::Program(context, programSource, &status);
        CHECK_OPENCL_ERROR(status, "Program::Program(Source) failed.");

    }

    std::string flagsStr = std::string("");

    // Get build options if any
    flagsStr.append(buildOptions.c_str());

    // Get additional options
    if(sampleArgs->isComplierFlagsSpecified())
    {
        SDKFile flagsFile;
        std::string flagsPath = getPath();
        flagsPath.append(sampleArgs->flags.c_str());
        if(!flagsFile.open(flagsPath.c_str()))
        {
            std::cout << "Failed to load flags file: " << flagsPath << std::endl;
            return SDK_FAILURE;
        }
        flagsFile.replaceNewlineWithSpaces();
        const char * flags = flagsFile.source().c_str();
        flagsStr.append(flags);
    }

    if(flagsStr.size() != 0)
    {
        std::cout << "Build Options are : " << flagsStr.c_str() << std::endl;
    }

    status = program.build(device, flagsStr.c_str());

    if(status != CL_SUCCESS)
    {
        if(status == CL_BUILD_PROGRAM_FAILURE)
        {
            std::string str = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[sampleArgs->deviceId]);

            std::cout << " \n\t\t\tBUILD LOG\n";
            std::cout << " ************************************************\n";
            std::cout << str << std::endl;
            std::cout << " ************************************************\n";
        }
    }
    CHECK_OPENCL_ERROR(status, "Program::build() failed.");

    // Create kernel

    // If local memory is present then use the specific kernel
    if(lds)
    {
        kernel = cl::Kernel(program, "mmmKernel_local", &status);
    }
    else
    {
        kernel = cl::Kernel(program, "mmmKernel", &status);
    }

    CHECK_OPENCL_ERROR(status, "cl::Kernel failed.");
    status =  kernel.getWorkGroupInfo<cl_ulong>(

                  devices[sampleArgs->deviceId],
                  CL_KERNEL_LOCAL_MEM_SIZE,
                  &usedLocalMemory);
    CHECK_OPENCL_ERROR(status,
                       "Kernel::getWorkGroupInfo(CL_KERNEL_LOCAL_MEM_SIZE) failed"
                       ".(usedLocalMemory)");

    availableLocalMemory = totalLocalMemory - usedLocalMemory;
    if(lds)
    {
        neededLocalMemory = (blockSize * 4) * (blockSize * 4) * sizeof(cl_double);
    }
    else
    {
        neededLocalMemory = 0;
    }

    if(neededLocalMemory > availableLocalMemory)
    {
        std::cout << "Unsupported: Insufficient local memory on device." << std::endl;
        return SDK_FAILURE;
    }

    // Check group size against group size returned by kernel
    kernelWorkGroupSize = kernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>
                          (devices[sampleArgs->deviceId], &status);
    CHECK_OPENCL_ERROR(status, "Kernel::getWorkGroupInfo()  failed.");

    if((cl_uint)(blockSize * blockSize) > kernelWorkGroupSize)
    {
        if(kernelWorkGroupSize >= 64)
        {
            blockSize = 8;
        }
        else if(kernelWorkGroupSize >= 32)
        {
            blockSize = 4;
        }
        else
        {
            std::cout << "Out of Resources!" << std::endl;
            std::cout << "Group Size specified : " << blockSize * blockSize << std::endl;
            std::cout << "Max Group Size supported on the kernel : "
                      << kernelWorkGroupSize<<std::endl;
            return SDK_FAILURE;
        }
    }

    if(blockSize > maxWorkItemSizes[0] ||
            blockSize > maxWorkItemSizes[1] ||
            blockSize * blockSize > maxWorkGroupSize)
    {
        error("Unsupported: Device does not support requested number of work items.");
        return SDK_FAILURE;
    }

    return SDK_SUCCESS;
}
// Example #2
/** Initialize OpenCL entities.
 *
 * Sets up every OpenCL object (platform, device, context, command queue,
 * and program) required to run the solver's OpenCL code.
 *
 *  @param config
 *  Contains information about the simulation configuration.
 */
void owOpenCLSolver::initializeOpenCL(owConfigProperty * config)
{
	cl_int err;
	std::vector< cl::Platform > platformList;
	err = cl::Platform::get( &platformList );
	if( platformList.size() < 1 || err != CL_SUCCESS ){
		throw std::runtime_error( "No OpenCL platforms found" );
	}
	char cBuffer[1024];
	cl_platform_id cl_pl_id[10];
	cl_uint n_pl;
	clGetPlatformIDs(10,cl_pl_id,&n_pl);
	cl_int ciErrNum;
	int sz;
	// Print the OpenCL version string of every available platform.
	for(int i=0;i<(int)n_pl;i++)
	{
		ciErrNum = clGetPlatformInfo (cl_pl_id[i], CL_PLATFORM_VERSION, sz = sizeof(cBuffer), cBuffer, NULL);
		if (ciErrNum == CL_SUCCESS)
		{
			printf(" CL_PLATFORM_VERSION [%d]: \t%s\n", i, cBuffer);
		}
		else
		{
			printf(" Error %i in clGetPlatformInfo Call !!!\n\n", ciErrNum);
		}
	}
	//0-CPU, 1-GPU // depends on the time order of system OpenCL drivers installation on your local machine
	cl_device_type type;
	unsigned int device_type [] = {CL_DEVICE_TYPE_CPU,CL_DEVICE_TYPE_GPU, CL_DEVICE_TYPE_ALL};

	int plList = -1; //selected platform index in platformList array [choose CPU by default]
	cl_uint ciDeviceCount = 0;
	cl_device_id * devices_t;
	bool bPassed = true, findDevice = false;
	cl_int result;
	cl_uint device_coumpute_unit_num;
	cl_uint device_coumpute_unit_num_current = 0;
	unsigned int deviceNum = 0;
	// Select the device of the preferred type (CPU|GPU) with the most
	// compute units; if none matches, retry with ALL before giving up.
	while(!findDevice){
		for(int clSelectedPlatformID = 0;clSelectedPlatformID < (int)n_pl;clSelectedPlatformID++){
			clGetDeviceIDs (cl_pl_id[clSelectedPlatformID], device_type[config->getDeviceType()], 0, NULL, &ciDeviceCount);
			if((devices_t = static_cast<cl_device_id *>(malloc(sizeof(cl_device_id) * ciDeviceCount))) == NULL)
				bPassed = false;
			if(bPassed){
				result= clGetDeviceIDs (cl_pl_id[clSelectedPlatformID], device_type[config->getDeviceType()], ciDeviceCount, devices_t, &ciDeviceCount);
				if( result == CL_SUCCESS){
					for( cl_uint i =0; i < ciDeviceCount; ++i ){
						clGetDeviceInfo(devices_t[i], CL_DEVICE_TYPE, sizeof(type), &type, NULL);
						if( type & device_type[config->getDeviceType()]){
							clGetDeviceInfo(devices_t[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(device_coumpute_unit_num), &device_coumpute_unit_num, NULL);
							// Keep the device with the largest compute-unit count.
							if(device_coumpute_unit_num_current <= device_coumpute_unit_num){
								plList = clSelectedPlatformID;
								device_coumpute_unit_num_current = device_coumpute_unit_num;
								findDevice = true;
								deviceNum = i;
							}
						}
					}
				}
				free(devices_t);
			}
		}
		if(!findDevice){
			deviceNum = 0;
			std::string deviceTypeName = (config->getDeviceType() == ALL)? "ALL": (config->getDeviceType() == CPU)? "CPU":"GPU";
			std::cout << "Unfortunately OpenCL couldn't find device " << deviceTypeName << std::endl;
			std::cout << "OpenCL try to init existing device " << std::endl;
			if(config->getDeviceType() != ALL)
				config->setDeviceType(ALL);
			else
				throw std::runtime_error("Sibernetic can't find any OpenCL devices. Please check your environment configuration.");
		}
	}
	cl_context_properties cprops[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) (platformList[plList])(), 0 };
	context = cl::Context( device_type[config->getDeviceType()], cprops, NULL, NULL, &err );
	// FIX: the original never checked the context-creation status.
	if( err != CL_SUCCESS ){
		throw std::runtime_error( "Failed to create OpenCL context" );
	}
	devices = context.getInfo< CL_CONTEXT_DEVICES >();
	if( devices.size() < 1 ){
		throw std::runtime_error( "No OpenCL devices were found" );
	}
	//Print some information about chosen platform
	size_t compUnintsCount, memoryInfo, workGroupSize;
	result = devices[deviceNum].getInfo(CL_DEVICE_NAME,&cBuffer);// CL_INVALID_VALUE = -30;
	if(result == CL_SUCCESS){
		std::cout << "CL_CONTEXT_PLATFORM ["<< plList
		<< "]: CL_DEVICE_NAME [" << deviceNum
		<< "]:\t" << cBuffer << "\n" << std::endl;
	}
	if(strlen(cBuffer)<1000){
		config->setDeviceName(cBuffer);
	}
	result = devices[deviceNum].getInfo(CL_DEVICE_TYPE,&cBuffer);
	if(result == CL_SUCCESS){
		std::cout << "CL_CONTEXT_PLATFORM ["<< plList << "]: CL_DEVICE_TYPE ["
		<< deviceNum << "]:\t"
		<< (((int)cBuffer[0] == CL_DEVICE_TYPE_CPU)? "CPU" : "GPU") << std::endl;
	}
	result = devices[deviceNum].getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE,&workGroupSize);
	if(result == CL_SUCCESS){
		std::cout << "CL_CONTEXT_PLATFORM ["<< plList
		<< "]: CL_DEVICE_MAX_WORK_GROUP_SIZE [" <<  deviceNum
		<<"]: \t" << workGroupSize <<std::endl;
	}
	result = devices[deviceNum].getInfo(CL_DEVICE_MAX_COMPUTE_UNITS,&compUnintsCount);
	if(result == CL_SUCCESS){
		std::cout<<"CL_CONTEXT_PLATFORM [" << plList
		<< "]: CL_DEVICE_MAX_COMPUTE_UNITS [" << deviceNum
		<< "]: \t" << compUnintsCount  << std::endl;
	}
	result = devices[deviceNum].getInfo(CL_DEVICE_GLOBAL_MEM_SIZE,&memoryInfo);
	if(result == CL_SUCCESS){
		// FIX: the original printed deviceNum here instead of the queried
		// global-memory size.
		std::cout<<"CL_CONTEXT_PLATFORM [" << plList
		<<"]: CL_DEVICE_GLOBAL_MEM_SIZE ["<< deviceNum
		<<"]: \t" << memoryInfo <<std::endl;
	}
	result = devices[deviceNum].getInfo(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE,&memoryInfo);
	if(result == CL_SUCCESS){
		std::cout << "CL_CONTEXT_PLATFORM [" << plList
		<<"]: CL_DEVICE_GLOBAL_MEM_CACHE_SIZE ["
		<< deviceNum <<"]:\t" << memoryInfo <<std::endl;
	}
	result = devices[deviceNum].getInfo(CL_DEVICE_LOCAL_MEM_SIZE,&memoryInfo);
	if(result == CL_SUCCESS){
		std::cout << "CL_CONTEXT_PLATFORM "
		<< plList <<": CL_DEVICE_LOCAL_MEM_SIZE ["
		<< deviceNum <<"]:\t" << memoryInfo << std::endl;
	}
	queue = cl::CommandQueue( context, devices[ deviceNum ], 0, &err );
	if( err != CL_SUCCESS ){
		throw std::runtime_error( "Failed to create command queue" );
	}
	// Read the whole kernel source file into a single string.
	std::ifstream file( config->getSourceFileName().c_str() );
	if( !file.is_open() ){
		throw std::runtime_error( "Could not open file with OpenCL program check input arguments oclsourcepath: " + config->getSourceFileName() );
	}
	std::string programSource( std::istreambuf_iterator<char>( file ), ( std::istreambuf_iterator<char>() ));
	cl::Program::Sources source( 1, std::make_pair( programSource.c_str(), programSource.length()+1 ));
	program = cl::Program( context, source );
#if defined(__APPLE__)
	err = program.build( devices, "-g -cl-opt-disable" );
#else
	#if INTEL_OPENCL_DEBUG
		err = program.build( devices, OPENCL_DEBUG_PROGRAM_PATH +  "-g -cl-opt-disable");
	#else
		err = program.build( devices, "");
	#endif
#endif
	if( err != CL_SUCCESS ){
		std::string compilationErrors;
		compilationErrors = program.getBuildInfo< CL_PROGRAM_BUILD_LOG >( devices[ 0 ] );
		std::cerr << "Compilation failed: " << std::endl << compilationErrors << std::endl;
		throw std::runtime_error( "failed to build program" );
	}
	std::cout<<"OPENCL program was successfully build. Program file oclsourcepath: " << config->getSourceFileName() << std::endl;
	return;
}
// Example #3
int
Lucas::setupCL (void)
{
  cl_int status = 0;
  cl_device_type dType;

  if (deviceType.compare ("cpu") == 0)
    dType = CL_DEVICE_TYPE_CPU;
  else				//deviceType = "gpu" 
    {
      dType = CL_DEVICE_TYPE_GPU;
      if (isThereGPU () == false)
	{
	  std::cout << "GPU not found. Falling back to CPU device" << std::
	    endl;
	  dType = CL_DEVICE_TYPE_CPU;
	}
    }

  /*
   * Have a look at the available platforms and pick either
   * the AMD one if available or a reasonable default.
   */
  status = cl::Platform::get (&platforms);
  CHECK_OPENCL_ERROR (status, "Platform::get() failed.");

  std::vector < cl::Platform >::iterator i;
  if (platforms.size () > 0)
    {
      if (isPlatformEnabled ())
	{
	  i = platforms.begin () + platformId;
	}
      else
	{
	  for (i = platforms.begin (); i != platforms.end (); ++i)
	    {
	      if (!strcmp ((*i).getInfo < CL_PLATFORM_VENDOR > ().c_str (),
			   "Advanced Micro Devices, Inc."))
		{
		  break;
		}
	    }
	}
    }

  cl_context_properties cps[3] = {
    CL_CONTEXT_PLATFORM,
    (cl_context_properties) (*i) (),
    0
  };

  if (NULL == (*i) ())
    {
      sampleCommon->error ("NULL platform found so Exiting Application.");
      return SDK_FAILURE;
    }

  context = cl::Context (dType, cps, NULL, NULL, &status);
  CHECK_OPENCL_ERROR (status, "Context::Context() failed.");

  devices = context.getInfo < CL_CONTEXT_DEVICES > ();
  CHECK_OPENCL_ERROR (status, "Context::getInfo() failed.");

  std::cout << "Platform :" << (*i).getInfo < CL_PLATFORM_VENDOR >
    ().c_str () << "\n";
  int deviceCount = (int) devices.size ();
  int j = 0;
  for (std::vector < cl::Device >::iterator i = devices.begin ();
       i != devices.end (); ++i, ++j)
    {
      std::cout << "Device " << j << " : ";
      std::string deviceName = (*i).getInfo < CL_DEVICE_NAME > ();
      std::cout << deviceName.c_str () << "\n";
    }
  std::cout << "\n";

  if (deviceCount == 0)
    {
      std::cerr << "No device available\n";
      return SDK_FAILURE;
    }

  if (sampleCommon->validateDeviceId (deviceId, deviceCount))
    {
      sampleCommon->error ("sampleCommon::validateDeviceId() failed");
      return SDK_FAILURE;
    }

  std::string extensions =
    devices[deviceId].getInfo < CL_DEVICE_EXTENSIONS > ();

  std::string buildOptions = std::string ("");
  // Check if cl_khr_fp64 extension is supported 
  if (strstr (extensions.c_str (), "cl_khr_fp64"))
    {
      buildOptions.append ("-D KHR_DP_EXTENSION");
    }
  else
    {
      // Check if cl_amd_fp64 extension is supported 
      if (!strstr (extensions.c_str (), "cl_amd_fp64"))
	{
	  OPENCL_EXPECTED_ERROR
	    ("Device does not support cl_amd_fp64 extension!");
	}
    }
    cl_uint localMemType;
    // Get device specific information 
    status = devices[deviceId].getInfo<cl_uint>(
             CL_DEVICE_LOCAL_MEM_TYPE,
            &localMemType);
    CHECK_OPENCL_ERROR(status, "Device::getInfo CL_DEVICE_LOCAL_MEM_TYPE) failed.");
    
    // If scratchpad is available then update the flag 
    if(localMemType != CL_LOCAL)
	  OPENCL_EXPECTED_ERROR ("Device does not support local memory.");

    // Get Device specific Information 
    status = devices[deviceId].getInfo<size_t>(
              CL_DEVICE_MAX_WORK_GROUP_SIZE, 
              &maxWorkGroupSize);

    CHECK_OPENCL_ERROR(status, "Device::getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE) failed.");
    if(threads > maxWorkGroupSize)
	  OPENCL_EXPECTED_ERROR ("Device does not support threads.");
    
    status = devices[deviceId].getInfo<cl_uint>(
             CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
             &maxDimensions);
    CHECK_OPENCL_ERROR(status, "Device::getInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) failed.");
    

    maxWorkItemSizes = (size_t*)malloc(maxDimensions * sizeof(size_t));
    
    std::vector<size_t> workItems = devices[deviceId].getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>();

    for(cl_uint i = 0; i < maxDimensions; ++i)
        maxWorkItemSizes[i] = workItems[i];

    status = devices[deviceId].getInfo<cl_ulong>(
             CL_DEVICE_LOCAL_MEM_SIZE,
             &totalLocalMemory);
    CHECK_OPENCL_ERROR(status, "Device::getInfo(CL_DEVICE_LOCAL_MEM_SIZES) failed.");

  // Set command queue properties
  cl_command_queue_properties prop = 0;
  if (!eAppGFLOPS)
    prop |= CL_QUEUE_PROFILING_ENABLE;

  commandQueue = cl::CommandQueue (context, devices[deviceId], prop, &status);
  CHECK_OPENCL_ERROR (status, "CommandQueue::CommandQueue() failed.");

  // Set Presistent memory only for AMD platform
  cl_mem_flags inMemFlags = CL_MEM_READ_ONLY;
  if (isAmdPlatform ())
    inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD;

  device.push_back (devices[deviceId]);

  // create a CL program using the kernel source
  streamsdk::SDKFile kernelFile;
  std::string kernelPath = sampleCommon->getPath ();

  kernelPath.append ("Kernels.cl");
  if (!kernelFile.open (kernelPath.c_str ()))
  {
      std::cout << "Failed to load kernel file : " << kernelPath <<
      std::endl;
      return SDK_FAILURE;
  }
  cl::Program::Sources programSource (1,
					  std::make_pair (kernelFile.
							  source ().data (),
							  kernelFile.
							  source ().size ()));

  program = cl::Program (context, programSource, &status);
  CHECK_OPENCL_ERROR (status, "Program::Program(Source) failed.");

  std::string flagsStr = std::string ("");

  status = program.build (device, flagsStr.c_str ());

  if (status != CL_SUCCESS)
    {
      if (status == CL_BUILD_PROGRAM_FAILURE)
	{
	  std::string str =
	    program.getBuildInfo < CL_PROGRAM_BUILD_LOG > (devices[deviceId]);

	  std::cout << " \n\t\t\tBUILD LOG\n";
	  std::cout << " ************************************************\n";
	  std::cout << str << std::endl;
	  std::cout << " ************************************************\n";
	}
    }
  CHECK_OPENCL_ERROR (status, "Program::build() failed.");

  // Create kernel  

  // If local memory is present then use the specific kernel 
  mul_kernel = cl::Kernel (program, "mul_Kernel", &status);

  CHECK_OPENCL_ERROR (status, "cl::Kernel failed.");
  status = mul_kernel.getWorkGroupInfo < cl_ulong > (devices[deviceId],
						 CL_KERNEL_LOCAL_MEM_SIZE,
						 &usedLocalMemory);
  CHECK_OPENCL_ERROR (status,
		      "Kernel::getWorkGroupInfo(CL_KERNEL_LOCAL_MEM_SIZE) failed"
		      ".(usedLocalMemory)");

  // Create normalize_kernel  

  // If local memory is present then use the specific kernel 
  normalize_kernel = cl::Kernel (program, "normalize_Kernel", &status);

  CHECK_OPENCL_ERROR (status, "cl::Kernel failed.");

  // Create normalize2_kernel  

  // If local memory is present then use the specific kernel 
  normalize2_kernel = cl::Kernel (program, "normalize2_Kernel", &status);

  CHECK_OPENCL_ERROR (status, "cl::Kernel failed.");

  return SDK_SUCCESS;
}
int
UnsharpMask::setupCL()
{
    try
    {
        cl_int err = CL_SUCCESS;
        cl_device_type dType;

        if(sampleArgs->deviceType.compare("cpu") == 0)
        {
            dType = CL_DEVICE_TYPE_CPU;
        }
        else //deviceType == "gpu"
        {
            dType = CL_DEVICE_TYPE_GPU;
            if(sampleArgs->isThereGPU() == false)
            {
                std::cout << "GPU not found. Falling back to CPU device" << std::endl;
                dType = CL_DEVICE_TYPE_CPU;
            }
        }

        /*
        * Have a look at the available platforms and pick either
        * the AMD one if available or a reasonable default.
        */
        cl::Platform::get(&platforms);


        std::vector<cl::Platform>::iterator i;
        if(platforms.size() > 0)
        {
            if(sampleArgs->isPlatformEnabled())
            {
                i = platforms.begin() + sampleArgs->platformId;
            }
            else
            {
                for(i = platforms.begin(); i != platforms.end(); ++i)
                {
                    if(!strcmp((*i).getInfo<CL_PLATFORM_VENDOR>().c_str(),
                               "Advanced Micro Devices, Inc."))
                    {
                        break;
                    }
                }
            }
        }

        cl_context_properties cps[3] =
        {
            CL_CONTEXT_PLATFORM,
            (cl_context_properties)(*i)(),
            0
        };

        context = cl::Context(dType, cps, NULL, NULL);

        devices = context.getInfo<CL_CONTEXT_DEVICES>();

        std::cout << "Platform :" << (*i).getInfo<CL_PLATFORM_VENDOR>().c_str() << "\n";
        int deviceCount = (int)devices.size();
        int j = 0;
        for (std::vector<cl::Device>::iterator i = devices.begin(); i != devices.end();
                ++i, ++j)
        {
            std::cout << "Device " << j << " : ";
            std::string deviceName = (*i).getInfo<CL_DEVICE_NAME>();
            std::cout << deviceName.c_str() << "\n";
        }
        std::cout << "\n";

        if (deviceCount == 0)
        {
            std::cerr << "No device available\n";
            return SDK_FAILURE;
        }

        if(validateDeviceId(sampleArgs->deviceId, deviceCount))
        {
            error("validateDeviceId() failed");
            return SDK_FAILURE;
        }

        commandQueue = cl::CommandQueue(context, devices[sampleArgs->deviceId], 0);

        device.push_back(devices[sampleArgs->deviceId]);

        // create a CL program using the kernel source
        SDKFile kernelFile;
        std::string kernelPath = getPath();

        if(sampleArgs->isLoadBinaryEnabled())
        {
            kernelPath.append(sampleArgs->loadBinary.c_str());
            if(kernelFile.readBinaryFromFile(kernelPath.c_str()) != SDK_SUCCESS)
            {
                std::cout << "Failed to load kernel file : " << kernelPath << std::endl;
                return SDK_FAILURE;
            }
            cl::Program::Binaries programBinary(1,std::make_pair(
                                                    (const void*)kernelFile.source().data(),
                                                    kernelFile.source().size()));

            program = cl::Program(context, device, programBinary, NULL);
        }
        else
        {
            kernelPath.append("UnsharpMask_Kernels.cl");
            if(!kernelFile.open(kernelPath.c_str()))
            {
                std::cout << "Failed to load kernel file : " << kernelPath << std::endl;
                return SDK_FAILURE;
            }

            cl::Program::Sources programSource(1,
                                               std::make_pair(kernelFile.source().data(),
                                                       kernelFile.source().size()));

            program = cl::Program(context, programSource);
        }

        std::string flagsStr = std::string("");

        // Get additional options
        if(sampleArgs->isComplierFlagsSpecified())
        {
            SDKFile flagsFile;
            std::string flagsPath = getPath();
            flagsPath.append(sampleArgs->flags.c_str());
            if(!flagsFile.open(flagsPath.c_str()))
            {
                std::cout << "Failed to load flags file: " << flagsPath << std::endl;
                return SDK_FAILURE;
            }
            flagsFile.replaceNewlineWithSpaces();
            const char * flags = flagsFile.source().c_str();
            flagsStr.append(flags);
        }

        if(flagsStr.size() != 0)
        {
            std::cout << "Build Options are : " << flagsStr.c_str() << std::endl;
        }

        program.build(device, flagsStr.c_str());

        queue = cl::CommandQueue(context, devices[sampleArgs->deviceId], 0, &err);

        int dimen = 2*radius+1;
        cl::ImageFormat format(CL_BGRA,CL_UNSIGNED_INT8);

        if(loadInputImage()!=SDK_SUCCESS)
        {
            return SDK_FAILURE;
        }

        gaussian1DBuffer = cl::Buffer(context,CL_MEM_READ_ONLY, dimen*sizeof(float));

        // create the 1D Gaussian kernel
        if(dImageBuffer)
        {
            gaussian2DBuffer = cl::Buffer(context,CL_MEM_READ_ONLY,
                                          dimen*dimen*sizeof(float));
            inputBuffer = cl::Buffer(context,CL_MEM_READ_ONLY, imageSize);
            outputBuffer = cl::Buffer (context,CL_MEM_WRITE_ONLY, imageSize);
            unsharp_mask_filter = cl::Kernel(program, "unsharp_mask_filter");

        }
        else
        {
            inputImageObj = cl::Image2D(context, CL_MEM_READ_ONLY, format, width, height);
            sharpenImageObj = cl::Image2D(context, CL_MEM_WRITE_ONLY, format, width,
                                          height);
            tmpImageObj = cl::Buffer(context,CL_MEM_READ_WRITE,
                                     width*height*sizeof(cl_float4));

            // Create kernel
            unsharp_mask_pass1 = cl::Kernel(program, "unsharp_mask_pass1");
            unsharp_mask_pass2 = cl::Kernel(program, "unsharp_mask_pass2");
        }
    }
    catch (cl::Error e)
    {
        if(e.err() == CL_BUILD_PROGRAM_FAILURE)
        {
            std::string str = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[sampleArgs->deviceId]);

            std::cout << " \n\t\t\tBUILD LOG\n";
            std::cout << " ************************************************\n";
            std::cout << str << std::endl;
            std::cout << " ************************************************\n";
        }
        else
        {
            std::cout << e.what() << " failed!"<< std::endl;
            std::cout << "Error code: " << e.err() << std::endl;
        }
        return SDK_FAILURE;
    }
    return SDK_SUCCESS;
}
int SobelFilterImage::setupCL()
{
    cl_int err = CL_SUCCESS;
    cl_device_type dType;

    if(sampleArgs.deviceType.compare("cpu") == 0)
    {
        dType = CL_DEVICE_TYPE_CPU;
    }
    else //deviceType = "gpu"
    {
        dType = CL_DEVICE_TYPE_GPU;
        if(sampleArgs.isThereGPU)
        {
            std::cout << "GPU not found. Falling back to CPU device" << std::endl;
            dType = CL_DEVICE_TYPE_CPU;
        }
    }

    err = cl::Platform::get(&platforms);
    CHECK_OPENCL_ERROR(err, "Platform::get() failed.");

    std::vector<cl::Platform>::iterator i;

    int deviceId = -1;

    for (i=platforms.begin(); i!=platforms.end(); i++)
    {
        //std::cout << "Platform :" << (*i).getInfo<CL_PLATFORM_VENDOR>().c_str() << "\n";


        devices.clear();
        i->getDevices(dType, &devices);

        if (devices.size() < 0)
            break;

        deviceId = 0;
#if 0
        for (std::vector<cl::Device>::iterator i = devices.begin(); i != devices.end(); ++i)
        {
            std::cout << "Device " << " : ";
            std::string deviceName = (*i).getInfo<CL_DEVICE_NAME>();
            std::cout << deviceName.c_str() << "\n";
        }
#endif
        std::cout << "\n";

    }

    if (deviceId == -1) {
        std::cerr << "Cant find CL device" << std::endl;
        exit(-5);
    }

    device.push_back(devices[deviceId]);

    context = cl::Context( device );
    commandQueue = cl::CommandQueue(context, devices[deviceId], 0,
                                    &err);
    CHECK_OPENCL_ERROR(err, "CommandQueue::CommandQueue() failed.");

    cl::ImageFormat imageFormat(CL_RGBA, CL_UNSIGNED_INT8);
    /*
    * Create and initialize memory objects
    */
    inputImage2D = cl::Image2D(context,
                               CL_MEM_READ_ONLY,
                               imageFormat,
                               width,
                               height,
                               0,
                               NULL,
                               &err);
    CHECK_OPENCL_ERROR(err, "Image2D::Image2D() failed. (inputImage2D)");


    // Create memory objects for output Image
    outputImage2D = cl::Image2D(context,
                                CL_MEM_WRITE_ONLY,
                                imageFormat,
                                width,
                                height,
                                0,
                                0,
                                &err);
    CHECK_OPENCL_ERROR(err, "Image2D::Image2D() failed. (outputImage2D)");

    // create a CL program using the kernel source


    std::string clSourceData = readFile(sampleArgs.clKernelPath.c_str());

    // create program source
    cl::Program::Sources programSource(1, std::make_pair(clSourceData.c_str(), clSourceData.length()));

    // Create program object
    program = cl::Program(context, programSource, &err);
    CHECK_OPENCL_ERROR(err, "Program::Program() failed.");



    std::string flagsStr = std::string("");

    if(flagsStr.size() != 0)
    {
        std::cout << "Build Options are : " << flagsStr.c_str() << std::endl;
    }

    err = program.build( { device }, flagsStr.c_str());

    if(err != CL_SUCCESS)
    {
        if(err == CL_BUILD_PROGRAM_FAILURE)
        {
            std::string str = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[deviceId]);

            std::cout << " \n\t\t\tBUILD LOG\n";
            std::cout << " ************************************************\n";
            std::cout << str << std::endl;
            std::cout << " ************************************************\n";
        }
    }
    CHECK_OPENCL_ERROR(err, "Program::build() failed.");

    // Create kernel
    kernel = cl::Kernel(program, "sobel_filter",  &err);
    CHECK_OPENCL_ERROR(err, "Kernel::Kernel() failed.");

    // Check group size against group size returned by kernel
    kernelWorkGroupSize = kernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>
            (devices[deviceId], &err);
    CHECK_OPENCL_ERROR(err, "Kernel::getWorkGroupInfo()  failed.");

    if((blockSizeX * blockSizeY) > kernelWorkGroupSize)
    {
        std::cout << "Out of Resources!" << std::endl;
        std::cout << "Group Size specified : "
                  << blockSizeX * blockSizeY << std::endl;
        std::cout << "Max Group Size supported on the kernel : "
                  << kernelWorkGroupSize << std::endl;
        std::cout << "Falling back to " << kernelWorkGroupSize << std::endl;

        if(blockSizeX > kernelWorkGroupSize)
        {
            blockSizeX = kernelWorkGroupSize;
            blockSizeY = 1;
        }
    }

    return 0;
}
int 
GaussianNoise::setupCL()
{
    cl_int err = CL_SUCCESS;
    cl_device_type dType;

    if(deviceType.compare("cpu") == 0)
    {
        dType = CL_DEVICE_TYPE_CPU;
    }
    else //deviceType = "gpu" 
    {
        dType = CL_DEVICE_TYPE_GPU;
        if(isThereGPU() == false)
        {
            std::cout << "GPU not found. Falling back to CPU device" << std::endl;
            dType = CL_DEVICE_TYPE_CPU;
        }
    }

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */
    err = cl::Platform::get(&platforms);
    CHECK_OPENCL_ERROR(err, "Platform::get() failed.");

    std::vector<cl::Platform>::iterator i;
    if(platforms.size() > 0)
    {
        if(isPlatformEnabled())
        {
            i = platforms.begin() + platformId;
        }
        else
        {
            for(i = platforms.begin(); i != platforms.end(); ++i)
            {
                if(!strcmp((*i).getInfo<CL_PLATFORM_VENDOR>().c_str(), 
                    "Advanced Micro Devices, Inc."))
                {
                    break;
                }
            }
        }
    }

    cl_context_properties cps[3] = 
    { 
        CL_CONTEXT_PLATFORM, 
        (cl_context_properties)(*i)(),
        0 
    };

    context = cl::Context(dType, cps, NULL, NULL, &err);
    CHECK_OPENCL_ERROR(err, "Context::Context() failed.");

    devices = context.getInfo<CL_CONTEXT_DEVICES>();
    CHECK_OPENCL_ERROR(err, "Context::getInfo() failed.");

    std::cout << "Platform :" << (*i).getInfo<CL_PLATFORM_VENDOR>().c_str() << "\n";
    int deviceCount = (int)devices.size();
    int j = 0;
    for (std::vector<cl::Device>::iterator i = devices.begin(); i != devices.end(); ++i, ++j)
    {
        std::cout << "Device " << j << " : ";
        std::string deviceName = (*i).getInfo<CL_DEVICE_NAME>();
        std::cout << deviceName.c_str() << "\n";
    }
    std::cout << "\n";

    if (deviceCount == 0) 
    {
        std::cerr << "No device available\n";
        return SDK_FAILURE;
    }

    if(sampleCommon->validateDeviceId(deviceId, deviceCount))
    {
        sampleCommon->error("sampleCommon::validateDeviceId() failed");
        return SDK_FAILURE;
    }

    commandQueue = cl::CommandQueue(context, devices[deviceId], 0, &err);
    CHECK_OPENCL_ERROR(err, "CommandQueue::CommandQueue() failed.");

    /*
    * Create and initialize memory objects
    */

    // Set Presistent memory only for AMD platform
    cl_mem_flags inMemFlags = CL_MEM_READ_ONLY;
    if(isAmdPlatform())
        inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD;

    // Create memory object for input Image 
    inputImageBuffer = cl::Buffer(context, 
                                  inMemFlags, 
                                  width * height * pixelSize,
                                  0,
                                  &err);
    CHECK_OPENCL_ERROR(err, "Buffer::Buffer() failed. (inputImageBuffer)");

    // Create memory object for output Image 
    outputImageBuffer = cl::Buffer(context, 
                                   CL_MEM_WRITE_ONLY, 
                                   width * height * pixelSize,
                                   NULL,
                                   &err);
    CHECK_OPENCL_ERROR(err, "Buffer::Buffer() failed. (outputImageBuffer)");    

    device.push_back(devices[deviceId]);

    // create a CL program using the kernel source 
    streamsdk::SDKFile kernelFile;
    std::string kernelPath = sampleCommon->getPath();

    if(isLoadBinaryEnabled())
    {
        kernelPath.append(loadBinary.c_str());
        if(!kernelFile.readBinaryFromFile(kernelPath.c_str()))
        {
            std::cout << "Failed to load kernel file : " << kernelPath << std::endl;
            return SDK_FAILURE;
        }
        cl::Program::Binaries programBinary(1,std::make_pair(
                                              (const void*)kernelFile.source().data(), 
                                              kernelFile.source().size()));

        program = cl::Program(context, device, programBinary, NULL, &err);
        CHECK_OPENCL_ERROR(err, "Program::Program(Binary) failed.");
    }
    else
    {
        kernelPath.append("GaussianNoise_Kernels.cl");
        if(!kernelFile.open(kernelPath.c_str()))
        {
            std::cout << "Failed to load kernel file : " << kernelPath << std::endl;
            return SDK_FAILURE;
        }

        cl::Program::Sources programSource(1, 
            std::make_pair(kernelFile.source().data(), 
            kernelFile.source().size()));

        program = cl::Program(context, programSource, &err);
        CHECK_OPENCL_ERROR(err, "Program::Program(Source) failed.");
        
    }

    std::string flagsStr = std::string("");

    // Get additional options
    if(isComplierFlagsSpecified())
    {
        streamsdk::SDKFile flagsFile;
        std::string flagsPath = sampleCommon->getPath();
        flagsPath.append(flags.c_str());
        if(!flagsFile.open(flagsPath.c_str()))
        {
            std::cout << "Failed to load flags file: " << flagsPath << std::endl;
            return SDK_FAILURE;
        }
        flagsFile.replaceNewlineWithSpaces();
        const char * flags = flagsFile.source().c_str();
        flagsStr.append(flags);
    }

    if(flagsStr.size() != 0)
        std::cout << "Build Options are : " << flagsStr.c_str() << std::endl;

    err = program.build(device, flagsStr.c_str());
    if(err != CL_SUCCESS)
    {
        if(err == CL_BUILD_PROGRAM_FAILURE)
        {
            std::string str = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[deviceId]);

            std::cout << " \n\t\t\tBUILD LOG\n";
            std::cout << " ************************************************\n";
            std::cout << str << std::endl;
            std::cout << " ************************************************\n";
        }
    }
    CHECK_OPENCL_ERROR(err, "Program::build() failed.");

    // Create kernel 
    kernel = cl::Kernel(program, "gaussian_transform",  &err);
    CHECK_OPENCL_ERROR(err, "Kernel::Kernel() failed.");


    // Check group size against group size returned by kernel 
    kernelWorkGroupSize = kernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(devices[deviceId], &err);
    CHECK_OPENCL_ERROR(err, "Kernel::getWorkGroupInfo()  failed.");

    if((blockSizeX * blockSizeY) > kernelWorkGroupSize)
    {
        if(!quiet)
        {
            std::cout << "Out of Resources!" << std::endl;
            std::cout << "Group Size specified : "
                      << blockSizeX * blockSizeY << std::endl;
            std::cout << "Max Group Size supported on the kernel : "
                      << kernelWorkGroupSize << std::endl;
            std::cout << "Falling back to " << kernelWorkGroupSize << std::endl;
        }

        if(blockSizeX > kernelWorkGroupSize)
        {
            blockSizeX = kernelWorkGroupSize;
            blockSizeY = 1;
        }
    }

    return SDK_SUCCESS;
}
int 
HDRToneMapping::setupCL()
{
    cl_int err = CL_SUCCESS;
    cl_device_type dType;

    if(deviceType.compare("cpu") == 0)
    {
        dType = CL_DEVICE_TYPE_CPU;
    }
    else //deviceType = "gpu" 
    {
        dType = CL_DEVICE_TYPE_GPU;
        if(isThereGPU() == false)
        {
            std::cout << "GPU not found. Falling back to CPU device" << std::endl;
            dType = CL_DEVICE_TYPE_CPU;
        }
    }
    
    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */
    err = cl::Platform::get(&platforms);
    CHECK_OPENCL_ERROR(err, "Platform::get() failed.");

    std::vector<cl::Platform>::iterator i;
    if(platforms.size() > 0)
    {
        if(isPlatformEnabled())
        {
            i = platforms.begin() + platformId;
        }
        else
        {
            for(i = platforms.begin(); i != platforms.end(); ++i)
            {
                if(!strcmp((*i).getInfo<CL_PLATFORM_VENDOR>().c_str(), 
                    "Advanced Micro Devices, Inc."))
                {
                    break;
                }
            }
        }
    }

    cl_context_properties cps[3] = 
    { 
        CL_CONTEXT_PLATFORM, 
        (cl_context_properties)(*i)(),
        0 
    };

    context = cl::Context(dType, cps, NULL, NULL, &err);
    CHECK_OPENCL_ERROR(err, "Context::Context() failed.");

    devices = context.getInfo<CL_CONTEXT_DEVICES>();
    CHECK_OPENCL_ERROR(err, "Context::getInfo() failed.");

    std::cout << "Platform :" << (*i).getInfo<CL_PLATFORM_VENDOR>().c_str() << "\n";
    int deviceCount = (int)devices.size();
    int j = 0;
    for (std::vector<cl::Device>::iterator i = devices.begin(); i != devices.end(); ++i, ++j)
    {
        std::cout << "Device " << j << " : ";
        std::string deviceName = (*i).getInfo<CL_DEVICE_NAME>();
        std::cout << deviceName.c_str() << "\n";
    }
    
    std::cout << "\n";

    if (deviceCount == 0) 
    {
        std::cout << "No device available\n";
        return SDK_FAILURE;
    }

    if(sampleCommon->validateDeviceId(deviceId, deviceCount) != SDK_SUCCESS)
    {
        std::cout << "sampleCommon::validateDeviceId() failed";
        return SDK_FAILURE;
    }

    // Get Device specific Information 
    err = devices[deviceId].getInfo<size_t>(
              CL_DEVICE_MAX_WORK_GROUP_SIZE, 
              &maxWorkGroupSize);
    CHECK_OPENCL_ERROR(err, "Device::getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE) failed.");
    
    err = devices[deviceId].getInfo<cl_uint>(
             CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
             &maxDimensions);
    CHECK_OPENCL_ERROR(err, "Device::getInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) failed.");

    maxWorkItemSizes = (size_t*)malloc(maxDimensions * sizeof(size_t));
    
    std::vector<size_t> workItems = devices[deviceId].getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>();

    for(cl_uint i = 0; i < maxDimensions; ++i)
        maxWorkItemSizes[i] = workItems[i];

    err = devices[deviceId].getInfo<cl_ulong>(
             CL_DEVICE_LOCAL_MEM_SIZE,
             &totalLocalMemory);
    CHECK_OPENCL_ERROR(err, "Device::getInfo(CL_DEVICE_LOCAL_MEM_SIZES) failed.");

    commandQueue = cl::CommandQueue(context, devices[deviceId], 0, &err);
    CHECK_OPENCL_ERROR(err, "CommandQueue::CommandQueue() failed.");
    
    /*
    * Create and initialize memory objects
    */

    // Set Presistent memory only for AMD platform
    cl_mem_flags inMemFlags = CL_MEM_READ_ONLY;
    if(isAmdPlatform())
        inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD;

    // Create memory object for input Image 
    /**
    * We use CL_MEM_USE_HOST_PTR for CPU as the CPU device is running the kernel 
    * on the actual buffer provided by the application 
    */
    if (dType == CL_DEVICE_TYPE_CPU)
    {
        inputImageBuffer = cl::Buffer(context,
                                CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY,
                                width * height * numChannels * sizeof(cl_float),
                                input,
                                &err);
        CHECK_OPENCL_ERROR(err, "Buffer::Buffer() failed. (inputImageBuffer)");
        // Create memory object for output Image 
        outputImageBuffer = cl::Buffer(context, 
                                       CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, 
                                       width * height * numChannels * sizeof(cl_float),
                                       output,
                                       &err);
        CHECK_OPENCL_ERROR(err, "Buffer::Buffer() failed. (outputImageBuffer)");
    }
    else if (dType == CL_DEVICE_TYPE_GPU)
    {
        inputImageBuffer = cl::Buffer(context, 
                                      inMemFlags, 
                                      width * height * numChannels * sizeof(cl_float),
                                      0,
                                      &err);
        CHECK_OPENCL_ERROR(err, "Buffer::Buffer() failed. (inputImageBuffer)");
        // Create memory object for output Image 
        outputImageBuffer = cl::Buffer(context, 
                                       CL_MEM_WRITE_ONLY, 
                                       width * height * numChannels * sizeof(cl_float),
                                       NULL,
                                       &err);
        CHECK_OPENCL_ERROR(err, "Buffer::Buffer() failed. (outputImageBuffer)");
    }

    device.push_back(devices[deviceId]);

    // create a CL program using the kernel source 
    streamsdk::SDKFile kernelFile;
    std::string kernelPath = sampleCommon->getPath();

    if(isLoadBinaryEnabled())
    {
        kernelPath.append(loadBinary.c_str());
        if(kernelFile.readBinaryFromFile(kernelPath.c_str()))
        {
            std::cout << "Failed to load kernel file : " << kernelPath << std::endl;
            return SDK_FAILURE;
        }
        cl::Program::Binaries programBinary(1,std::make_pair(
                                              (const void*)kernelFile.source().data(), 
                                              kernelFile.source().size()));
        
        program = cl::Program(context, device, programBinary, NULL, &err);
        CHECK_OPENCL_ERROR(err, "Program::Program(Binary) failed.");
    }
    else
    {
        kernelPath.append("HDRToneMapping_Kernels.cl");
        if(!kernelFile.open(kernelPath.c_str()))
        {
            std::cout << "Failed to load kernel file : " << kernelPath << std::endl;
            return SDK_FAILURE;
        }

        cl::Program::Sources programSource(1, 
            std::make_pair(kernelFile.source().data(), 
            kernelFile.source().size()));

        program = cl::Program(context, programSource, &err);
        CHECK_OPENCL_ERROR(err, "Program::Program(Source) failed.");
    }

    std::string flagsStr = std::string("");

    // Get additional options
    if(isComplierFlagsSpecified())
    {
        streamsdk::SDKFile flagsFile;
        std::string flagsPath = sampleCommon->getPath();
        flagsPath.append(flags.c_str());
        if(!flagsFile.open(flagsPath.c_str()))
        {
            std::cout << "Failed to load flags file: " << flagsPath << std::endl;
            return SDK_FAILURE;
        }
        flagsFile.replaceNewlineWithSpaces();
        const char * flags = flagsFile.source().c_str();
        flagsStr.append(flags);
    }

    if(flagsStr.size() != 0)
        std::cout << "Build Options are : " << flagsStr.c_str() << std::endl;

    err = program.build(device, flagsStr.c_str());
    if(err != CL_SUCCESS)
    {
        if(err == CL_BUILD_PROGRAM_FAILURE)
        {
            std::string str = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[deviceId]);

            std::cout << " \n\t\t\tBUILD LOG\n";
            std::cout << " ************************************************\n";
            std::cout << str << std::endl;
            std::cout << " ************************************************\n";
        }
    }

    CHECK_OPENCL_ERROR(err, "Program::build() failed.");

    // Create kernel 
    kernel = cl::Kernel(program, "toneMappingPattanaik",  &err);
    CHECK_OPENCL_ERROR(err, "Kernel::Kernel() failed.");

    // Check group size against group size returned by kernel 
    kernelWorkGroupSize = kernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(devices[deviceId], &err);
    CHECK_OPENCL_ERROR(err, "Kernel::getWorkGroupInfo()  failed.");

    /**
    * For CPU device the kernel work group size is 1024.
    * Workgroup creation/replacement is an overhead - 
    * avoid workgroups with small number of workitems (we pay more for replacing a WG than running more WI in a for loop).
    */
    if (kernelWorkGroupSize >= 1024)
    {
        blockSizeX = 32;
        blockSizeY = 32;
    }

    if((cl_uint)(blockSizeX * blockSizeY) > kernelWorkGroupSize)
    {
        if(kernelWorkGroupSize >= 64)
        {
            blockSizeX = 8;
            blockSizeY = 8;
        }
        else if(kernelWorkGroupSize >= 32)
        {
            blockSizeX = 4;
            blockSizeY = 4;
        }
        else
        {
            std::cout << "Out of Resources!" << std::endl;
            std::cout << "Group Size specified : " << blockSizeX * blockSizeY << std::endl;
            std::cout << "Max Group Size supported on the kernel : " 
                      << kernelWorkGroupSize<<std::endl;
            return SDK_FAILURE;
        }
    }

    if(blockSizeX > maxWorkItemSizes[0] ||
       blockSizeY > maxWorkItemSizes[1] ||
       blockSizeX * blockSizeY > maxWorkGroupSize)
    {
        std::cout << "Unsupported: Device does not support requested number of work items." << std::endl;
        return SDK_FAILURE;
    }

    return SDK_SUCCESS;
}