int
GlobalMemoryBandwidth::setupCL(void)
{
    cl_int status = 0;
    cl_device_type dType;
    if(deviceType.compare("cpu") == 0)
    {
        dType = CL_DEVICE_TYPE_CPU;
    }
    else //deviceType = "gpu" 
    {
        dType = CL_DEVICE_TYPE_GPU;
        if(isThereGPU() == false)
        {
            std::cout << "GPU not found. Falling back to CPU device" << std::endl;
            dType = CL_DEVICE_TYPE_CPU;
        }
    }

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */
    cl_platform_id platform = NULL;
    int retValue = sampleCommon->getPlatform(platform, platformId, isPlatformEnabled());
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::getPlatform() failed");

    // Display available devices.
    retValue = sampleCommon->displayDevices(platform, dType);
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::displayDevices() failed");

    /*
     * If we could find our platform, use it. Otherwise use just available platform.
     */
    cl_context_properties cps[3] = 
    {
        CL_CONTEXT_PLATFORM, 
        (cl_context_properties)platform, 
        0
    };

    context = clCreateContextFromType(cps,
                                      dType,
                                      NULL,
                                      NULL,
                                      &status);
    CHECK_OPENCL_ERROR(status, "clCreateContextFromType failed.");

    // getting device on which to run the sample
    status = sampleCommon->getDevices(context, &devices, deviceId, isDeviceIdEnabled());
    CHECK_ERROR(status, SDK_SUCCESS, "sampleCommon::getDevices() failed");

    //Set device info of given cl_device_id
    retValue = deviceInfo.setDeviceInfo(devices[deviceId]);
    CHECK_ERROR(retValue, SDK_SUCCESS, "SDKDeviceInfo::setDeviceInfo() failed");

    std::string deviceStr(deviceInfo.deviceVersion);
    size_t vStart = deviceStr.find(" ", 0);
    size_t vEnd = deviceStr.find(" ", vStart + 1);
    std::string vStrVal = deviceStr.substr(vStart + 1, vEnd - vStart - 1);

#ifdef CL_VERSION_1_1
    if(vStrVal.compare("1.0") > 0)
    {
        char openclVersion[1024];
        status = clGetDeviceInfo(devices[deviceId],
                                 CL_DEVICE_OPENCL_C_VERSION,
                                 sizeof(openclVersion),
                                 openclVersion,
                                 0);
        CHECK_OPENCL_ERROR(status, "clGetDeviceInfo failed.");
        
        std::string tempStr(openclVersion);
        size_t dotPos = tempStr.find_first_of(".");
        size_t spacePos = tempStr.find_last_of(" ");
        tempStr = tempStr.substr(dotPos + 1, spacePos - dotPos);
        int minorVersion = atoi(tempStr.c_str());
        // OpenCL 1.1 has inbuilt support for vec3 data types
        if(minorVersion < 1 && vec3 == true)
        {
            OPENCL_EXPECTED_ERROR("Device doesn't support built-in 3 component vectors!");
        }
    }
    else
    {
        // OpenCL 1.1 has inbuilt support for vec3 data types
        if(vec3 == true)
        {
            OPENCL_EXPECTED_ERROR("Device doesn't support built-in 3 component vectors!");
        }
    }
#else
    // OpenCL 1.1 has inbuilt support for vec3 data types
    if(vec3 == true)
    {
        OPENCL_EXPECTED_ERROR("Device doesn't support built-in 3 component vectors!");
    }
#endif

    {
        // The block is to move the declaration of prop closer to its use 
        cl_command_queue_properties prop = 0;
        prop |= CL_QUEUE_PROFILING_ENABLE;

        commandQueue = clCreateCommandQueue(context, 
                                            devices[deviceId], 
                                            prop, 
                                            &status);
        CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed.");
    }

    cl_uint sizeElement = vectorSize * sizeof(cl_float);
    cl_uint readLength = length + (NUM_READS * 1024 / sizeElement) + EXTRA_BYTES;
    cl_uint size = readLength * vectorSize * sizeof(cl_float);

    // Create input buffer
    inputBuffer = clCreateBuffer(context, 
                                 CL_MEM_READ_ONLY,
                                 size,
                                 0, 
                                 &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (inputBuffer)");

    // Write data to buffer
    status = clEnqueueWriteBuffer(commandQueue,
                                  inputBuffer,
                                  1,
                                  0,
                                  size,
                                  input,
                                  0,
                                  0,
                                  0);
    CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed. (inputBuffer)");

    outputBufferReadSingle = clCreateBuffer(context, 
                                            CL_MEM_WRITE_ONLY,
                                            sizeof(cl_float) * vectorSize * length,
                                            0, 
                                            &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (outputBufferReadSingle)");

    // Write data to buffer
    status = clEnqueueWriteBuffer(commandQueue,
                                  outputBufferReadSingle,
                                  CL_TRUE,
                                  0,
                                  sizeof(cl_float) * vectorSize * length,
                                  outputReadSingle,
                                  0,
                                  NULL,
                                  NULL);
    CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed. (outputBufferReadSingle)");

    outputBufferReadLinear = clCreateBuffer(context, 
                                            CL_MEM_WRITE_ONLY,
                                            sizeof(cl_float) * vectorSize * length,
                                            0, 
                                            &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (outputBufferReadLinear)");

    // Write data to buffer
    status = clEnqueueWriteBuffer(commandQueue,
                                  outputBufferReadLinear,
                                  CL_TRUE,
                                  0,
                                  sizeof(cl_float) * vectorSize * length,
                                  outputReadLinear,
                                  0,
                                  NULL,
                                  NULL);
    CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed. (outputBufferReadLinear)");

    outputBufferReadLU = clCreateBuffer(context, 
                                        CL_MEM_WRITE_ONLY,
                                        sizeof(cl_float) * vectorSize * length,
                                        0, 
                                        &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (outputBufferReadLU)");

    // Write data to buffer 
    status = clEnqueueWriteBuffer(commandQueue,
                                  outputBufferReadLU,
                                  CL_TRUE,
                                  0,
                                  sizeof(cl_float) * vectorSize * length,
                                  outputReadLU,
                                  0,
                                  NULL,
                                  NULL);
    CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed. (outputBufferReadLU)");

     outputBufferWriteLinear = clCreateBuffer(context, 
                                              CL_MEM_WRITE_ONLY,
                                              size,
                                              0, 
                                              &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (outputBufferWriteLinear)");

    // Write data to buffer 
    status = clEnqueueWriteBuffer(commandQueue,
                                  outputBufferWriteLinear,
                                  CL_TRUE,
                                  0,
                                  size,
                                  outputWriteLinear,
                                  0,
                                  NULL,
                                  NULL);
    CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed. (outputBufferWriteLinear)");

    // create a CL program using the kernel source
    char buildOption[128];
    if(vectorSize == 1)
        sprintf(buildOption, "-D DATATYPE=float -D OFFSET=%d ", OFFSET);
    else
        sprintf(buildOption, "-D DATATYPE=float%d -D OFFSET=%d ", (vec3 == true) ? 3 : vectorSize, OFFSET);

    // create a CL program using the kernel source
    streamsdk::buildProgramData buildData;
    buildData.kernelName = std::string("GlobalMemoryBandwidth_Kernels.cl");
    buildData.devices = devices;
    buildData.deviceId = deviceId;
    buildData.flagsStr = std::string(buildOption);
    if(isLoadBinaryEnabled())
        buildData.binaryName = std::string(loadBinary.c_str());

    if(isComplierFlagsSpecified())
        buildData.flagsFileName = std::string(flags.c_str());

    retValue = sampleCommon->buildOpenCLProgram(program, context, buildData);
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::buildOpenCLProgram() failed");

    // Global memory bandwidth from read-single access
    kernel[0] = clCreateKernel(program, "read_single", &status);
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(read_single)");

    // Global memory  bandwidth from read-linear access
    kernel[1] = clCreateKernel(program, "read_linear", &status);
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(read_linear)");

    // Global memory  bandwidth from read-linear access
    kernel[2] = clCreateKernel(program, "read_linear_uncached", &status);
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(read_linear_uncached)");

    // Global memory  bandwidth from write-linear access
    kernel[3] = clCreateKernel(program, "write_linear", &status);
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(GlobalBandwidth_write_linear)");

    return SDK_SUCCESS;
}
int
FluidSimulation2D::setupCL()
{
    cl_int status = CL_SUCCESS;
    cl_device_type dType;

    if(deviceType.compare("cpu") == 0)
    {
        dType = CL_DEVICE_TYPE_CPU;
    }
    else //deviceType = "gpu" 
    {
        dType = CL_DEVICE_TYPE_GPU;
        if(isThereGPU() == false)
        {
            std::cout << "GPU not found. Falling back to CPU device" << std::endl;
            dType = CL_DEVICE_TYPE_CPU;
        }
    }

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */
    cl_platform_id platform = NULL;
    int retValue = sampleCommon->getPlatform(platform, platformId, isPlatformEnabled());
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::getPlatform() failed");

    // Display available devices.
    retValue = sampleCommon->displayDevices(platform, dType);
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::displayDevices() failed");


    // If we could find our platform, use it. Otherwise use just available platform.

    cl_context_properties cps[3] = 
    {
        CL_CONTEXT_PLATFORM, 
        (cl_context_properties)platform, 
        0
    };

    context = clCreateContextFromType(
                  cps,
                  dType,
                  NULL,
                  NULL,
                  &status);
    CHECK_OPENCL_ERROR( status, "clCreateContextFromType failed.");

    // getting device on which to run the sample
    status = sampleCommon->getDevices(context, &devices, deviceId, isDeviceIdEnabled());
    CHECK_ERROR(status, SDK_SUCCESS, "sampleCommon::getDevices() failed");

    {
        // The block is to move the declaration of prop closer to its use 
        cl_command_queue_properties prop = 0;
        commandQueue = clCreateCommandQueue(
                context, 
                devices[deviceId], 
                prop, 
                &status);
        CHECK_OPENCL_ERROR( status, "clCreateCommandQueue failed.");
    }

    //Set device info of given cl_device_id
    retValue = deviceInfo.setDeviceInfo(devices[deviceId]);
    CHECK_ERROR(retValue, 0, "SDKDeviceInfo::setDeviceInfo() failed");

    
    std::string buildOptions = std::string("");
    // Check if cl_khr_fp64 extension is supported 
    if(strstr(deviceInfo.extensions, "cl_khr_fp64"))
    {
        buildOptions.append("-D KHR_DP_EXTENSION");
    }
    else
    {
        // Check if cl_amd_fp64 extension is supported 
        if(!strstr(deviceInfo.extensions, "cl_amd_fp64"))
        {
            reqdExtSupport = false;
            OPENCL_EXPECTED_ERROR("Device does not support cl_amd_fp64 extension!");
        }
    }

    
    /*
    * Create and initialize memory objects
    */

    size_t temp = dims[0] * dims[1];
    d_if0 = clCreateBuffer(context, 
        CL_MEM_READ_WRITE, 
        sizeof(cl_double) * temp, 
        0, 
        &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (d_if0)");

    status = clEnqueueWriteBuffer(commandQueue,
        d_if0,
        1,
        0,
        sizeof(cl_double) * temp,
        h_if0,
        0, 0, 0);
    CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed. (d_if0)");

    d_if1234 = clCreateBuffer(context, 
        CL_MEM_READ_WRITE, 
        sizeof(cl_double4) * temp, 
        0, 
        &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (d_if1234)");

    status = clEnqueueWriteBuffer(commandQueue,
        d_if1234,
        1,
        0,
        sizeof(cl_double4) * temp,
        h_if1234,
        0, 0, 0);
    CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed. (d_if1234)");

    d_if5678 = clCreateBuffer(context, 
        CL_MEM_READ_WRITE, 
        sizeof(cl_double4) * temp, 
        0, 
        &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (d_if5678)");

    status = clEnqueueWriteBuffer(commandQueue,
        d_if5678,
        1,
        0,
        sizeof(cl_double4) * temp,
        h_if5678,
        0, 0, 0);
    CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed. (d_if5678)");

    d_of0 = clCreateBuffer(context, 
        CL_MEM_READ_WRITE, 
        sizeof(cl_double) * temp, 
        0, 
        &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (d_of0)");

    d_of1234 = clCreateBuffer(context, 
        CL_MEM_READ_WRITE, 
        sizeof(cl_double4) * temp, 
        0, 
        &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (d_of1234)");

    d_of5678 = clCreateBuffer(context, 
        CL_MEM_READ_WRITE, 
        sizeof(cl_double4) * temp, 
        0, 
        &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (d_of5678)");

    status = clEnqueueCopyBuffer(commandQueue,
        d_if0,
        d_of0,
        0, 0, sizeof(cl_double) * temp,
        0, 0, 0);
    CHECK_OPENCL_ERROR(status, "clEnqueueCopyBuffer failed. (d_if0->d_of0)");

    status = clEnqueueCopyBuffer(commandQueue,
        d_if1234,
        d_of1234,
        0, 0, sizeof(cl_double4) * temp,
        0, 0, 0);
    CHECK_OPENCL_ERROR(status, "clEnqueueCopyBuffer failed. (d_if1234->d_of1234)");

    status = clEnqueueCopyBuffer(commandQueue,
        d_if5678,
        d_of5678,
        0, 0, sizeof(cl_double4) * temp,
        0, 0, 0);
    CHECK_OPENCL_ERROR(status, "clEnqueueCopyBuffer failed. (d_if5678->d_of5678)");

    status = clFinish(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFinish failed.");

    // Set Presistent memory only for AMD platform
    cl_mem_flags inMemFlags = CL_MEM_READ_ONLY;
    if(isAmdPlatform())
        inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD;

    //Constant arrays
    type = clCreateBuffer(context, 
        inMemFlags, 
        sizeof(cl_bool) * temp,
        0,
        &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (type)");

    weight = clCreateBuffer(context,
        CL_MEM_READ_ONLY,
        sizeof(cl_double) * 9,
        0,
        &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (weight)");

    status = clEnqueueWriteBuffer(commandQueue,
        weight, 
        1, 0, sizeof(cl_double) * 9,
        w,
        0, 0, 0);
    CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed. (weight)");

    velocity = clCreateBuffer(context,
        CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
        sizeof(cl_double2) * temp,
        0, &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (velocity)");

    // create a CL program using the kernel source 
    streamsdk::buildProgramData buildData;
    buildData.kernelName = std::string("FluidSimulation2D_Kernels.cl");
    buildData.devices = devices;
    buildData.deviceId = deviceId;
    buildData.flagsStr = std::string("");
    if(isLoadBinaryEnabled())
        buildData.binaryName = std::string(loadBinary.c_str());

    if(isComplierFlagsSpecified())
        buildData.flagsFileName = std::string(flags.c_str());

    retValue = sampleCommon->buildOpenCLProgram(program, context, buildData);
    CHECK_ERROR(retValue, 0, "sampleCommon::buildOpenCLProgram() failed");

    // get a kernel object handle for a kernel with the given name 
    kernel = clCreateKernel(
        program,
        "lbm",
        &status);
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.");

    return SDK_SUCCESS;
}
示例#3
0
int
PrefixSum::setupCL(void)
{
    cl_int status = 0;
    cl_device_type dType;

    if(deviceType.compare("cpu") == 0)
    {
        dType = CL_DEVICE_TYPE_CPU;
    }
    else //deviceType = "gpu" 
    {
        dType = CL_DEVICE_TYPE_GPU;
        if(isThereGPU() == false)
        {
            std::cout << "GPU not found. Falling back to CPU device" << std::endl;
            dType = CL_DEVICE_TYPE_CPU;
        }
    }
     /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */
    cl_platform_id platform = NULL;
    int retValue = sampleCommon->getPlatform(platform, platformId, isPlatformEnabled());
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon->getPlatform() failed");

    // Display available devices.
    retValue = sampleCommon->displayDevices(platform, dType);
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::displayDevices() failed");

    /*
     * If we could find our platform, use it. Otherwise use just available platform.
     */

    cl_context_properties cps[3] = 
    {
        CL_CONTEXT_PLATFORM, 
        (cl_context_properties)platform, 
        0
    };

    context = clCreateContextFromType(
                  cps,
                  dType,
                  NULL,
                  NULL,
                  &status);
    CHECK_OPENCL_ERROR(status, "clCreateContextFromType failed.");

    status = sampleCommon->getDevices(context, &devices, deviceId, isDeviceIdEnabled());
    CHECK_ERROR(status, SDK_SUCCESS, "sampleCommon::getDevices() failed");

    //Set device info of given cl_device_id
    status = deviceInfo.setDeviceInfo(devices[deviceId]);
    CHECK_ERROR(status, SDK_SUCCESS, "SDKDeviceInfo::setDeviceInfo() failed"); 

    {
        // The block is to move the declaration of prop closer to its use
        cl_command_queue_properties prop = 0;
        commandQueue = clCreateCommandQueue(
                           context, 
                           devices[deviceId], 
                           prop, 
                           &status);
        CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed.");  
    }

    // Set Presistent memory only for AMD platform
    cl_mem_flags inMemFlags = CL_MEM_READ_ONLY;
    if(isAmdPlatform())
        inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD;

    inputBuffer = clCreateBuffer(
                      context, 
                      inMemFlags,
                      sizeof(cl_float) * length,
                      NULL, 
                      &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (inputBuffer)");  

    outputBuffer = clCreateBuffer(
                      context, 
                      CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
                      sizeof(cl_float) * length,
                      NULL, 
                      &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (outputBuffer)");  

   // create a CL program using the kernel source 
    streamsdk::buildProgramData buildData;
    buildData.kernelName = std::string("PrefixSum_Kernels.cl");
    buildData.devices = devices;
    buildData.deviceId = deviceId;
    buildData.flagsStr = std::string("");

    if(isLoadBinaryEnabled())
        buildData.binaryName = std::string(loadBinary.c_str());

    if(isComplierFlagsSpecified())
        buildData.flagsFileName = std::string(flags.c_str());

    retValue = sampleCommon->buildOpenCLProgram(program, context, buildData);
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::buildOpenCLProgram() failed");

    // get a kernel object handle for a kernel with the given name
    kernel = clCreateKernel(program, "prefixSum", &status);
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.");
    return SDK_SUCCESS;
}
int MathBenchmark::setupCL(void) {
	cl_int status = 0;
	cl_device_type dType;

	if (deviceType.compare("cpu") == 0) {
		dType = CL_DEVICE_TYPE_CPU;
	} else //deviceType = "gpu"
	{
		dType = CL_DEVICE_TYPE_GPU;
		if (isThereGPU() == false) {
			std::cout << "GPU not found. Falling back to CPU device"
					<< std::endl;
			dType = CL_DEVICE_TYPE_CPU;
		}
	}

	/*
	 * Have a look at the available platforms and pick either
	 * the AMD one if available or a reasonable default.
	 */

	cl_platform_id platform = NULL;
	int retValue = sampleCommon->getPlatform(platform, platformId,
			isPlatformEnabled());
	CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::getPlatform() failed");

	// Display available devices.
	retValue = sampleCommon->displayDevices(platform, dType);
	CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::displayDevices() failed");

	/*
	 * If we could find our platform, use it. Otherwise use just available platform.
	 */

	cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM,
			(cl_context_properties) platform, 0 };

	context = clCreateContextFromType(cps, dType, NULL, NULL, &status);
	CHECK_OPENCL_ERROR(status, "clCreateContextFromType failed.");
	
	// getting device on which to run the sample
	status = sampleCommon->getDevices(context, &devices, deviceId,
			isDeviceIdEnabled());
	CHECK_ERROR(status, SDK_SUCCESS, "sampleCommon::getDevices() failed");
	
	//Set device info of given cl_device_id
	retValue = deviceInfo.setDeviceInfo(devices[deviceId]);
	CHECK_ERROR(retValue, SDK_SUCCESS, "SDKDeviceInfo::setDeviceInfo() failed");

	maxWorkGroup = deviceInfo.maxWorkGroupSize;
	max_mem_alloc_size = deviceInfo.maxMemAllocSize;
	while (maxMemSize <= (unsigned int) (max_mem_alloc_size)) {
		maxMemSize *= 2;
	}
	maxMemSize /= 2;

	if (maxMemSize > 134217728 && dType == CL_DEVICE_TYPE_CPU) {
		maxMemSize = 134217728;
	}

	std::cout << "CL_DEVICE_MAX_WORK_GROUP_SIZE:\t" << maxWorkGroup
			<< std::endl;
	std::cout << "MaxMemSize:\t" << maxMemSize / (1024 * 1024) << "MB"
			<< std::endl;
	{
		// The block is to move the declaration of prop closer to its use
		cl_command_queue_properties prop = 0;
		prop |= CL_QUEUE_PROFILING_ENABLE;
		commandQueue = clCreateCommandQueue(context, devices[deviceId], prop,
				&status);
		CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed.");
	}
	// create a CL program using the kernel source
	streamsdk::buildProgramData buildData;
	buildData.kernelName = std::string("mathoper.cl");
	buildData.devices = devices;
	buildData.deviceId = deviceId;
	buildData.flagsStr = std::string("");
	if (isLoadBinaryEnabled())
		buildData.binaryName = std::string(loadBinary.c_str());

	if (isComplierFlagsSpecified())
		buildData.flagsFileName = std::string(flags.c_str());

	retValue = sampleCommon->buildOpenCLProgram(program, context, buildData);
	CHECK_ERROR(retValue, SDK_SUCCESS,
			"sampleCommon::buildOpenCLProgram() failed");
	
	
    	std::string s;
   	std::stringstream ss(s);
    	ss << "kernel_asinh_withDD";
   	ss << vectorSize;
	// Create the cKermel_kernel_asinh_withDD
	kernel[0]  = clCreateKernel(program, ss.str().c_str(), &status);
	CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(kernel_asinh_withDD)");
	
	std::stringstream asinh_withoutDD(s);
	asinh_withoutDD << "kernel_asinh_withoutDD";
	asinh_withoutDD << vectorSize;
	//dumpPTXCode(context,program,asinh_withoutDD.str().c_str());
	// Create the cKermel_kernel_asinh_withoutDD
	kernel[1]  = clCreateKernel(program, asinh_withoutDD.str().c_str(), &status);
	CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(kernel_asinh_withoutDD)");

	std::stringstream acosh_withDD(s);
	acosh_withDD << "kernel_acosh_withDD";
	acosh_withDD << vectorSize;
	//dumpPTXCode(context,program,acosh_withDD.str().c_str());
	// Create the cKermel_kernel_acosh_withDD
	kernel[2]  = clCreateKernel(program, acosh_withDD.str().c_str(), &status);
	CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(kernel_acosh_withDD)");

	std::stringstream acosh_withoutDD(s);
	acosh_withoutDD << "kernel_acosh_withoutDD";
	acosh_withoutDD << vectorSize;
	//dumpPTXCode(context,program,acosh_withoutDD.str().c_str());
	// Create the cKermel_kernel_acosh_withoutDD
	kernel[3]  = clCreateKernel(program, acosh_withoutDD.str().c_str(), &status);
	CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(kernel_acosh_withoutDD)");

	std::stringstream atanh_withDD(s);
	atanh_withDD << "kernel_atanh_withDD";
	atanh_withDD << vectorSize;
	//dumpPTXCode(context,program,atanh_withDD.str().c_str());
	// Create the cKermel_kernel_atanh_withDD
	kernel[4]  = clCreateKernel(program, atanh_withDD.str().c_str(), &status);
	CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(kernel_atanh_withDD)");

	std::stringstream atanh_withoutDD(s);
	atanh_withoutDD << "kernel_atanh_withoutDD";
	atanh_withoutDD << vectorSize;
	//dumpPTXCode(context,program,atanh_withoutDD.str().c_str());
	// Create the cKermel_kernel_atanh_withoutDD
	kernel[5]  = clCreateKernel(program, atanh_withoutDD.str().c_str(), &status);
	CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(kernel_atanh_withoutDD)");

	std::stringstream asinpi_withDD(s);
	asinpi_withDD << "kernel_asinpi_withDD";
	asinpi_withDD << vectorSize;
	//dumpPTXCode(context,program,asinpi_withDD.str().c_str());
	// Create the cKermel_kernel_asinpi_withDD
	kernel[6]  = clCreateKernel(program, asinpi_withDD.str().c_str(), &status);
	CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(kernel_asinpi_withDD)");

	std::stringstream asinpi_withoutDD(s);
	asinpi_withoutDD << "kernel_asinpi_withoutDD";
	asinpi_withoutDD << vectorSize;
	//dumpPTXCode(context,program,asinpi_withoutDD.str().c_str());
	// Create the cKermel_kernel_asinpi_withoutDD
	kernel[7]  = clCreateKernel(program, asinpi_withoutDD.str().c_str(), &status);
	CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(kernel_asinpi_withoutDD)");

	return SDK_SUCCESS;
}
int
FloydWarshall::setupCL(void)
{
    cl_int status = 0;
    cl_device_type dType;
    
    if(deviceType.compare("cpu") == 0)
    {
        dType = CL_DEVICE_TYPE_CPU;
    }
    else //deviceType = "gpu" 
    {
        dType = CL_DEVICE_TYPE_GPU;
        if(isThereGPU() == false)
        {
            std::cout << "GPU not found. Fall back to CPU device" << std::endl;
            dType = CL_DEVICE_TYPE_CPU;
        }
    }


    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */
    cl_platform_id platform = NULL;
    int retValue = sampleCommon->getPlatform(platform, platformId, isPlatformEnabled());
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::getPlatform() failed");

    // Display available devices.
    retValue = sampleCommon->displayDevices(platform, dType);
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::displayDevices() failed");


    /*
     * If we could find our platform, use it. Otherwise use just available platform.
     */
    cl_context_properties cps[3] = 
    {
        CL_CONTEXT_PLATFORM, 
        (cl_context_properties)platform, 
        0
    };
    context = clCreateContextFromType(cps,
                                      dType,
                                      NULL,
                                      NULL,
                                      &status);
    CHECK_OPENCL_ERROR(status, "clCreateContextFromType failed.");

    // getting device on which to run the sample
    status = sampleCommon->getDevices(context, &devices, deviceId, isDeviceIdEnabled());
    CHECK_ERROR(status, SDK_SUCCESS, "sampleCommon::getDevices() failed");

    {
        // The block is to move the declaration of prop closer to its use 
        cl_command_queue_properties prop = 0;
        commandQueue = clCreateCommandQueue(context, 
                                            devices[deviceId], 
                                            prop, 
                                            &status);
        CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed.");
    }

    pathDistanceBuffer = clCreateBuffer(context, 
                                        CL_MEM_READ_WRITE,
                                        sizeof(cl_uint) * numNodes * numNodes,
                                        NULL, 
                                        &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (pathDistanceBuffer)");

    pathBuffer = clCreateBuffer(context, 
                                CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
                                sizeof(cl_uint) * numNodes * numNodes,
                                NULL, 
                                &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (pathBuffer)");

    // create a CL program using the kernel source 
    streamsdk::buildProgramData buildData;
    buildData.kernelName = std::string("FloydWarshall_Kernels.cl");
    buildData.devices = devices;
    buildData.deviceId = deviceId;
    buildData.flagsStr = std::string("");
    if(isLoadBinaryEnabled())
        buildData.binaryName = std::string(loadBinary.c_str());

    if(isComplierFlagsSpecified())
        buildData.flagsFileName = std::string(flags.c_str());

    retValue = sampleCommon->buildOpenCLProgram(program, context, buildData);
    CHECK_ERROR(retValue, 0, "sampleCommon::buildOpenCLProgram() failed");


    // get a kernel object handle for a kernel with the given name
    kernel = clCreateKernel(program, "floydWarshallPass", &status);
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.");

    return SDK_SUCCESS;
}
int
MatrixTranspose::setupCL(void)
{
    cl_int status = 0;
    cl_device_type dType;

    if(deviceType.compare("cpu") == 0)
    {
        dType = CL_DEVICE_TYPE_CPU;
    }
    else //deviceType = "gpu" 
    {
        dType = CL_DEVICE_TYPE_ACCELERATOR;
        if(isThereGPU() == false)
        {
            std::cout << "GPU not found. Falling back to CPU device" << std::endl;
            dType = CL_DEVICE_TYPE_CPU;
        }
    }

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */

    cl_platform_id platform = NULL;
    int retValue = sampleCommon->getPlatform(platform, platformId, isPlatformEnabled());
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::getPlatform() failed");

     // Display available devices.
    retValue = sampleCommon->displayDevices(platform, dType);
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::displayDevices() failed");

    /*
     * If we could find our platform, use it. Otherwise use just available platform.
     */

    cl_context_properties cps[3] = 
    {
        CL_CONTEXT_PLATFORM, 
        (cl_context_properties)platform, 
        0
    };

    context = clCreateContextFromType(
                  cps,
                  dType,
                  NULL,
                  NULL,
                  &status);

    CHECK_OPENCL_ERROR(status, "clCreateContextFromType failed.");

    // getting device on which to run the sample
    status = sampleCommon->getDevices(context, &devices, deviceId, isDeviceIdEnabled());
    CHECK_ERROR(status, SDK_SUCCESS, "sampleCommon::getDevices() failed");

    // Get Device specific Information, Set device info of given cl_device_id
    retValue = deviceInfo.setDeviceInfo(devices[deviceId]);
    CHECK_ERROR(retValue, SDK_SUCCESS, "SDKDeviceInfo::setDeviceInfo() failed");

    {
        // The block is to move the declaration of prop closer to its use 
        cl_command_queue_properties prop = CL_QUEUE_PROFILING_ENABLE;
        commandQueue = clCreateCommandQueue(
                           context, 
                           devices[deviceId], 
                           prop, 
                           &status);
        CHECK_ERROR(status, 0, "clCreateCommandQueue failed.");
        }

    // Set Presistent memory only for AMD platform
    cl_mem_flags inMemFlags = CL_MEM_READ_ONLY;
    /*
    if(isAmdPlatform())
		// To achieve best performance, use persistent memory together with
		// clEnqueueMapBuffer (instead of clEnqeueRead/Write). 
		// At the same time, in general, the best performance is the function
		// of access pattern and size of the buffer.
        inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD;*/

    inputBuffer = clCreateBuffer(
                      context, 
                      inMemFlags,
                      sizeof(cl_float) * width * height,
                      NULL, 
                      &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (inputBuffer)");

    outputBuffer = clCreateBuffer(
                      context, 
					  CL_MEM_WRITE_ONLY,
                      sizeof(cl_float) * width * height,
                      NULL, 
                      &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (outputBuffer)");

    // create a CL program using the kernel source 
    streamsdk::buildProgramData buildData;
    buildData.kernelName = std::string("MatrixTranspose_Kernels.cl");
    buildData.devices = devices;
    buildData.deviceId = deviceId;
    buildData.flagsStr = std::string("");
    if(isLoadBinaryEnabled())
        buildData.binaryName = std::string(loadBinary.c_str());

    if(isComplierFlagsSpecified())
        buildData.flagsFileName = std::string(flags.c_str());

    retValue = sampleCommon->buildOpenCLProgram(program, context, buildData);
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::buildOpenCLProgram() failed");

    // get a kernel object handle for a kernel with the given name 
    kernel = clCreateKernel(program, "matrixTranspose", &status);
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.");		

    status =  kernelInfo.setKernelWorkGroupInfo(kernel, devices[deviceId]);
    CHECK_ERROR(status, SDK_SUCCESS, "setKErnelWorkGroupInfo() failed");

    availableLocalMemory = deviceInfo.localMemSize - kernelInfo.localMemoryUsed;

	// each work item is going to work on [elemsPerThread1Dim x elemsPerThread1Dim] matrix elements,
	// therefore the total size of needed local memory is calculated as
	// # of WIs in a group multiplied by # of matrix elements per a WI
    neededLocalMemory    = blockSize * blockSize * elemsPerThread1Dim * elemsPerThread1Dim * sizeof(cl_float);

    if(neededLocalMemory > availableLocalMemory)
    {
        std::cout << "Unsupported: Insufficient local memory on device." << std::endl;
        return SDK_FAILURE;
    }

    if((cl_uint)(blockSize * blockSize) > kernelInfo.kernelWorkGroupSize)
    {
        if(kernelInfo.kernelWorkGroupSize >= 64)
            blockSize = 8; 
        else if(kernelInfo.kernelWorkGroupSize >= 32)
            blockSize = 4; 
        else
        {
            std::cout << "Out of Resources!" << std::endl;
            std::cout << "Group Size specified : " << blockSize * blockSize << std::endl;
            std::cout << "Max Group Size supported on the kernel : " 
                      << kernelInfo.kernelWorkGroupSize << std::endl;
            return SDK_FAILURE;
        }
    }

    if(blockSize > deviceInfo.maxWorkItemSizes[0] ||
       blockSize > deviceInfo.maxWorkItemSizes[1] ||
       (size_t)blockSize * blockSize > deviceInfo.maxWorkGroupSize)
    {
        std::cout << "Unsupported: Device does not support requested number of work items." << std::endl;
        return SDK_FAILURE;
    }

    return SDK_SUCCESS;
}
int
EigenValue::setupCL(void)
{
    cl_int status = 0;

    cl_device_type dType;

    if(deviceType.compare("cpu") == 0)
    {
        dType = CL_DEVICE_TYPE_CPU;
    }
    else //deviceType = "gpu" 
    {
        dType = CL_DEVICE_TYPE_GPU;
        if(isThereGPU() == false)
        {
            std::cout << "GPU not found. Falling back to CPU device" << std::endl;
            dType = CL_DEVICE_TYPE_CPU;
        }
    }

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */
    cl_platform_id platform = NULL;
    int retValue = sampleCommon->getPlatform(platform, platformId, isPlatformEnabled());
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::getPlatform() failed");

    // Display available devices.
    retValue = sampleCommon->displayDevices(platform, dType);
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::displayDevices() failed");

    // If we could find our platform, use it. Otherwise use just available platform.
    cl_context_properties cps[3] = 
    {
        CL_CONTEXT_PLATFORM, 
        (cl_context_properties)platform,
        0
    };

    context = clCreateContextFromType(
                  cps,
                  dType,
                  NULL,
                  NULL,
                  &status);
    CHECK_OPENCL_ERROR(status, "clCreateContextFromType failed.");

    // getting device on which to run the sample
    status = sampleCommon->getDevices(context, &devices, deviceId, isDeviceIdEnabled());
    CHECK_ERROR(status, 0, "sampleCommon::getDevices() failed");


    {
        // The block is to move the declaration of prop closer to its use
        cl_command_queue_properties prop = 0;
        commandQueue = clCreateCommandQueue(
                           context, 
                           devices[deviceId], 
                           prop, 
                           &status);
        CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed.");
    }

    // Set Presistent memory only for AMD platform
    cl_mem_flags inMemFlags = CL_MEM_READ_ONLY;
    if(isAmdPlatform())
        inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD;

    // cl mem to store the diagonal elements of the matrix
    diagonalBuffer = clCreateBuffer(
                      context, 
                      inMemFlags,
                      sizeof(cl_float) * length,
                      NULL, 
                      &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (diagonalBuffer)");

    // cl mem to store the number of eigenvalues in each interval
    numEigenValuesIntervalBuffer = clCreateBuffer(
                                    context, 
                                    CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                                    sizeof(cl_uint) * length,
                                    NULL, 
                                    &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (diagonalBuffer)");

    // cl mem to store the offDiagonal elements of the matrix
    offDiagonalBuffer = clCreateBuffer(
                         context, 
                         inMemFlags,
                         sizeof(cl_float) * (length-1),
                         NULL, 
                         &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (offDiagonalBuffer)");

    // cl mem to store the eigenvalue intervals
    for(int i = 0 ; i < 2 ; ++ i)
    {
        eigenIntervalBuffer[i] = clCreateBuffer(
                                 context, 
                                 CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                                 sizeof(cl_uint) * length * 2,
                                 NULL, 
                                 &status);
       CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (eigenIntervalBuffer)");
    }

    // create a CL program using the kernel source 
    streamsdk::buildProgramData buildData;
    buildData.kernelName = std::string("EigenValue_Kernels.cl");
    buildData.devices = devices;
    buildData.deviceId = deviceId;
    buildData.flagsStr = std::string("-x clc++");
    if(isLoadBinaryEnabled())
        buildData.binaryName = std::string(loadBinary.c_str());

    if(isComplierFlagsSpecified())
        buildData.flagsFileName = std::string(flags.c_str());

    retValue = sampleCommon->buildOpenCLProgram(program, context, buildData);
    CHECK_ERROR(retValue, 0, "sampleCommon::buildOpenCLProgram() failed");

    // get a kernel object handle for a kernel with the given name
    kernel[0] = clCreateKernel(program, "calNumEigenValueInterval", &status);
    if(sampleCommon->checkVal(
            status,
            CL_SUCCESS,
            "clCreateKernel failed."))
        return SDK_FAILURE;

    // get a kernel object handle for a kernel with the given name
    kernel[1] = clCreateKernel(program, "recalculateEigenIntervals", &status);
    if(sampleCommon->checkVal(
            status,
            CL_SUCCESS,
            "clCreateKernel failed."))
        return SDK_FAILURE;

    return SDK_SUCCESS;
}
int
DeviceFission::setupCLPlatform()
{
    cl_int status = CL_SUCCESS;

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */
    cl_platform_id platform = NULL;
    int retValue = sampleCommon->getPlatform(platform, platformId, isPlatformEnabled());
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::getPlatform(rootplatform) failed");

    // Display available devices.
    retValue = sampleCommon->displayDevices(platform, CL_DEVICE_TYPE_ALL);
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::displayDevices(rootplatform) failed");

    /*
     * If we could find our platform, use it. Otherwise use just available platform.
     */
    cl_context_properties cps[3] = 
    {
        CL_CONTEXT_PLATFORM, 
        (cl_context_properties)platform, 
        0
    };

    rContext = clCreateContextFromType(platform ? cps : NULL,
									   CL_DEVICE_TYPE_ALL,
									   NULL,
									   NULL,
									   &status);
    CHECK_OPENCL_ERROR( status, "clCreateContextFromType failed.");

    // getting devices on which to run the sample
    status = sampleCommon->getDevices(rContext, &Devices, 0, isDeviceIdEnabled());
    CHECK_ERROR(status, SDK_SUCCESS, "sampleCommon::getDevices() failed");

    // Set deviceListSize from clGetContextInfo
	status = clGetContextInfo(rContext, CL_CONTEXT_DEVICES, 0, 0, &deviceListSize);
	CHECK_ERROR(status, SDK_SUCCESS, "clGetContextInfo failed. (deviceListSize)");

	// Get GPU device and CPU devices by the deviceInfo.
	for (cl_uint i = 0 ; i < deviceListSize / sizeof(cl_device_id) ; i++)
	{
		retValue = deviceInfo.setDeviceInfo(Devices[i]);
		CHECK_ERROR(retValue, 0, "SDKDeviceInfo::setDeviceInfo() failed");
		if (deviceInfo.dType == CL_DEVICE_TYPE_GPU)
		{
			gpuAvailable = CL_TRUE;
			gpuDevice = Devices[i];
			groupSize = deviceInfo.maxWorkGroupSize;
		}
		else if (deviceInfo.dType == CL_DEVICE_TYPE_CPU)
		{
			cpuDevice = Devices[i];
		}
	}

	// Using CPU to replace GPU if unable to find GPU.
	if(gpuAvailable == CL_FALSE)
	{
		std::cout << "\nUnable to find GPU, disable cpu2gpu mode."<< std::endl;
		gpuDevice = cpuDevice;
		cpu2gpu = CL_FALSE;
	}

	// Get allocate memory for subDevices
    subDevices = (cl_device_id*)malloc(numSubDevices * sizeof(cl_device_id));
    CHECK_ALLOCATION(subDevices, "Failed to allocate memory. (subDevices)");

	// Get allocate memory for subKernel
	subKernel = (cl_kernel*)malloc(numSubDevices * sizeof(cl_kernel));
	CHECK_ALLOCATION(subKernel, "Failed to allocate memory. (subKernel)");

	// Get allocate memory for gpuKernel
	gpuKernel = (cl_kernel*)malloc(numSubDevices * sizeof(cl_kernel));
	CHECK_ALLOCATION(gpuKernel, "Failed to allocate memory. (gpuKernel)");

	// Get maxSubDevices from clGetDeviceInfo
	cl_uint maxSubDevices;
	status = clGetDeviceInfo(cpuDevice, CL_DEVICE_PARTITION_MAX_SUB_DEVICES, sizeof(maxSubDevices), &maxSubDevices, NULL);
	CHECK_OPENCL_ERROR(status, "clGetDeviceInfo failed. (maxSubDevices)")

	if(maxSubDevices <= 1)
	{
		std::cout<<"Error: The CPU should have than one core to run this sample."<<std::endl;
		return SDK_FAILURE;
	}

	// Initialize required partition property
	cl_device_partition_property partitionPrty[5] =
	{
		CL_DEVICE_PARTITION_BY_COUNTS,  
		maxSubDevices / 2, maxSubDevices / 2,
		CL_DEVICE_PARTITION_BY_COUNTS_LIST_END, 
		0 };

	// Create sub-devices
	status = clCreateSubDevices(cpuDevice, partitionPrty, numSubDevices, subDevices, NULL);
	CHECK_OPENCL_ERROR( status, "clCreateSubDevices failed.");

    return SDK_SUCCESS;
}
int 
BoxFilterSeparable::setupCL()
{
    cl_int status = 0;
    cl_device_type dType;
    
    if(deviceType.compare("cpu") == 0)
    {
        dType = CL_DEVICE_TYPE_CPU;
    }
    else //deviceType = "gpu" 
    {
        dType = CL_DEVICE_TYPE_GPU;
        if(isThereGPU() == false)
        {
            std::cout << "GPU not found. Falling back to CPU device" << std::endl;
            dType = CL_DEVICE_TYPE_CPU;
        }
    }

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */
    cl_platform_id platform = NULL;
    int retValue = sampleCommon->getPlatform(platform, platformId, isPlatformEnabled());
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::getPlatform() failed");

    // Display available devices.
    retValue = sampleCommon->displayDevices(platform, dType);
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::displayDevices() failed");


    // If we could find our platform, use it. Otherwise use just available platform.

    cl_context_properties cps[3] = 
    {
        CL_CONTEXT_PLATFORM, 
        (cl_context_properties)platform, 
        0
    };

    context = clCreateContextFromType(
                  cps,
                  dType,
                  NULL,
                  NULL,
                  &status);
    CHECK_OPENCL_ERROR( status, "clCreateContextFromType failed.");

    // getting device on which to run the sample
    status = sampleCommon->getDevices(context, &devices, deviceId, isDeviceIdEnabled());
    CHECK_ERROR(status, SDK_SUCCESS, "sampleCommon::getDevices() failed");

    {
        // The block is to move the declaration of prop closer to its use
        cl_command_queue_properties prop = 0;
        commandQueue = clCreateCommandQueue(
                context, 
                devices[deviceId], 
                prop, 
                &status);
        CHECK_OPENCL_ERROR( status, "clCreateCommandQueue failed.");
    }

    //Set device info of given cl_device_id
    retValue = deviceInfo.setDeviceInfo(devices[deviceId]);
    CHECK_ERROR(retValue, 0, "SDKDeviceInfo::setDeviceInfo() failed");

    // Create and initialize memory objects

    // Set Presistent memory only for AMD platform
    cl_mem_flags inMemFlags = CL_MEM_READ_ONLY;
    if(isAmdPlatform())
        inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD;

    // Create memory object for input Image
    inputImageBuffer = clCreateBuffer(
        context,
        inMemFlags,
        width * height * pixelSize,
        NULL,
        &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (inputImageBuffer)");

    // Create memory object for temp Image
    tempImageBuffer = clCreateBuffer(
        context,
        CL_MEM_READ_WRITE,
        width * height * pixelSize,
        0,
        &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (tempImageBuffer)");

    // Create memory objects for output Image
    outputImageBuffer = clCreateBuffer(context,
        CL_MEM_WRITE_ONLY,
        width * height * pixelSize,
        NULL,
        &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (outputImageBuffer)");

    // create a CL program using the kernel source 
    streamsdk::buildProgramData buildData;
    buildData.kernelName = std::string("BoxFilter_Kernels.cl");
    buildData.devices = devices;
    buildData.deviceId = deviceId;
    buildData.flagsStr = std::string("");
    if(isLoadBinaryEnabled())
        buildData.binaryName = std::string(loadBinary.c_str());
    if(isComplierFlagsSpecified())
        buildData.flagsFileName = std::string(flags.c_str());

    retValue = sampleCommon->buildOpenCLProgram(program, context, buildData);
    CHECK_ERROR(retValue, 0, "sampleCommon::buildOpenCLProgram() failed");

    // get a kernel object handle for a kernel with the given name
    verticalKernel = clCreateKernel(program,
                                    "box_filter_vertical",
                                    &status);
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed. (vertical)");

#ifdef USE_LDS
    horizontalKernel = clCreateKernel(program,
                                      "box_filter_horizontal_local",
                                      &status);
#else
    horizontalKernel = clCreateKernel(program,
                                      "box_filter_horizontal",
                                      &status);
#endif
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed. (horizontal)");

    status =  kernelInfoH.setKernelWorkGroupInfo(horizontalKernel, devices[deviceId]);
    CHECK_ERROR(status, SDK_SUCCESS, "setKErnelWorkGroupInfo() failed");

    status =  kernelInfoV.setKernelWorkGroupInfo(verticalKernel, devices[deviceId]);
    CHECK_ERROR(status, SDK_SUCCESS, "setKErnelWorkGroupInfo() failed");

    if((blockSizeX * blockSizeY) > kernelInfoV.kernelWorkGroupSize)
    {
        if(!quiet)
        {
            std::cout << "Out of Resources!" << std::endl;
            std::cout << "Group Size specified : "
                      << blockSizeX * blockSizeY << std::endl;
            std::cout << "Max Group Size supported on the kernel : "
                      << kernelInfoV.kernelWorkGroupSize << std::endl;
            std::cout << "Falling back to " << kernelInfoV.kernelWorkGroupSize << std::endl;
        }

        // Three possible cases
        if(blockSizeX > kernelInfoV.kernelWorkGroupSize)
        {
            blockSizeX = kernelInfoV.kernelWorkGroupSize;
            blockSizeY = 1;
        }
    }
    return SDK_SUCCESS;
}
int 
ImageOverlap::setupCL()
{
    cl_int status = CL_SUCCESS;
    cl_device_type dType;

    if(deviceType.compare("cpu") == 0)
    {
        dType = CL_DEVICE_TYPE_CPU;
    }
    else //deviceType = "gpu" 
	{
		dType = CL_DEVICE_TYPE_GPU;
		if(isThereGPU() == false)
		{
			std::cout << "GPU not found. Falling back to CPU device" << std::endl;
			dType = CL_DEVICE_TYPE_CPU;
		}
	}

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */
    cl_platform_id platform = NULL;
    int retValue = sampleCommon->getPlatform(platform, platformId, isPlatformEnabled());
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::getPlatform() failed");

    // Display available devices.
    retValue = sampleCommon->displayDevices(platform, dType);
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::displayDevices() failed");

    // If we could find our platform, use it. Otherwise use just available platform.
    cl_context_properties cps[3] = 
    {
        CL_CONTEXT_PLATFORM, 
        (cl_context_properties)platform, 
        0
    };

    context = clCreateContextFromType(
        cps,
        dType,
        NULL,
        NULL,
        &status);
    CHECK_OPENCL_ERROR(status, "clCreateContextFromType failed.");

    // getting device on which to run the sample
    status = sampleCommon->getDevices(context, &devices, deviceId, isDeviceIdEnabled());
    CHECK_ERROR(status, SDK_SUCCESS, "sampleCommon::getDevices() failed");

    status = deviceInfo.setDeviceInfo(devices[deviceId]);
    CHECK_OPENCL_ERROR(status, "deviceInfo.setDeviceInfo failed");

    if(!deviceInfo.imageSupport)
    {
        OPENCL_EXPECTED_ERROR(" Expected Error: Device does not support Images");
    }
	 
	blockSizeX = deviceInfo.maxWorkGroupSize<GROUP_SIZE?deviceInfo.maxWorkGroupSize:GROUP_SIZE;

    // Create command queue
	cl_command_queue_properties prop = 0;
	for(int i=0;i<3;i++)
	{
		commandQueue[i] = clCreateCommandQueue(
			context,
			devices[deviceId],
			prop,
			&status);
		 CHECK_OPENCL_ERROR(status,"clCreateCommandQueuefailed.");
	}

    // Create and initialize image objects

	// Create map image
	mapImage = clCreateImage(context,
		CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
		&imageFormat,
		&image_desc,
		mapImageData,
		&status);
	CHECK_OPENCL_ERROR(status,"clCreateBuffer failed. (mapImage)");
	int color[4] = {0,0,80,255};
	size_t origin[3] = {300,300,0};
	size_t region[3] = {100,100,1};
	status = clEnqueueFillImage(commandQueue[0], mapImage, color, origin, region, NULL, NULL, &eventlist[0]);

    // Create fill image
	fillImage = clCreateImage(context,
		CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
		&imageFormat,
		&image_desc,
		fillImageData,
		&status);
	CHECK_OPENCL_ERROR(status,"clCreateBuffer failed. (fillImage)");

	color[0] = 80;
	color[1] = 0;
	color[2] = 0;
	color[3] = 0;
	origin[0] = 50;
	origin[1] = 50;
	status = clEnqueueFillImage(commandQueue[1], fillImage, color, origin, region, NULL, NULL, &eventlist[1]);
	
	//Create output image
	outputImage = clCreateImage(context,
		CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
		&imageFormat,
		&image_desc,
		NULL,
		&status);
	CHECK_OPENCL_ERROR(status,"clCreateBuffer failed. (outputImage)");

    // create a CL program using the kernel source 
    streamsdk::buildProgramData buildData;
    buildData.kernelName = std::string("ImageOverlap_Kernels.cl");
    buildData.devices = devices;
    buildData.deviceId = deviceId;
    buildData.flagsStr = std::string("");
    if(isLoadBinaryEnabled())
        buildData.binaryName = std::string(loadBinary.c_str());

    if(isComplierFlagsSpecified())
        buildData.flagsFileName = std::string(flags.c_str());

    retValue = sampleCommon->buildOpenCLProgram(program, context, buildData);
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::buildOpenCLProgram() failed");

    // get a kernel object handle for a kernel with the given name 
	kernelOverLap = clCreateKernel(program, "OverLap", &status);
	CHECK_OPENCL_ERROR(status,"clCreateKernel failed.(OverLap)");

    return SDK_SUCCESS;
}
int
BinomialOption::setupCL()
{
    cl_int status = CL_SUCCESS;
    cl_device_type dType;
    
    if(deviceType.compare("cpu") == 0)
    {
        dType = CL_DEVICE_TYPE_CPU;
    }
    else //deviceType = "gpu" 
    {
        dType = CL_DEVICE_TYPE_GPU;
        if(isThereGPU() == false)
        {
            std::cout << "GPU not found. Falling back to CPU device" << std::endl;
            dType = CL_DEVICE_TYPE_CPU;
        }
    }

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */
    cl_platform_id platform = NULL;
    int retValue = sampleCommon->getPlatform(platform, platformId, isPlatformEnabled());
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::getPlatform() failed");

    // Display available devices.
    retValue = sampleCommon->displayDevices(platform, dType);
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::displayDevices() failed");

    /*
     * If we could find our platform, use it. Otherwise use just available platform.
     */
    cl_context_properties cps[3] = 
    {
        CL_CONTEXT_PLATFORM, 
        (cl_context_properties)platform, 
        0
    };
    context = clCreateContextFromType(cps,
                                      dType,
                                      NULL,
                                      NULL,
                                      &status);
    CHECK_OPENCL_ERROR(status, "clCreateContextFromType failed.");

    // getting device on which to run the sample
    status = sampleCommon->getDevices(context, &devices, deviceId, isDeviceIdEnabled());
    CHECK_ERROR(status, SDK_SUCCESS, "sampleCommon::getDevices() failed");

    status = deviceInfo.setDeviceInfo(devices[deviceId]);
    CHECK_OPENCL_ERROR(status, "deviceInfo.setDeviceInfo failed");

    {
        // The block is to move the declaration of prop closer to its use
        cl_command_queue_properties prop = 0;
        commandQueue = clCreateCommandQueue(context, 
                                            devices[deviceId], 
                                            prop, 
                                            &status);
        CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed.");
    }

    // Create and initialize memory objects

    // Set Presistent memory only for AMD platform
    cl_mem_flags inMemFlags = CL_MEM_READ_ONLY;
    // if(isAmdPlatform())
    //     inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD;

    // Create memory object for stock price
    randBuffer = clCreateBuffer(context,
                                inMemFlags,
                                numSamples * sizeof(cl_float4),
                                NULL,
                                &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (randBuffer)");

    // Create memory object for output array
    outBuffer = clCreateBuffer(context,
                               CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
                               numSamples * sizeof(cl_float4),
                               NULL,
                               &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (outBuffer)");

    // create a CL program using the kernel source 
    streamsdk::buildProgramData buildData;
    buildData.kernelName = std::string("BinomialOption_Kernels.cl");
    buildData.devices = devices;
    buildData.deviceId = deviceId;
    buildData.flagsStr = std::string("");
    if(isLoadBinaryEnabled())
        buildData.binaryName = std::string(loadBinary.c_str());

    if(isComplierFlagsSpecified())
        buildData.flagsFileName = std::string(flags.c_str());

    retValue = sampleCommon->buildOpenCLProgram(program, context, buildData);
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::buildOpenCLProgram() failed");

    // get a kernel object handle for a kernel with the given name
    kernel = clCreateKernel(program,
                            "binomial_options",
                            &status);
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.");

    status = kernelInfo.setKernelWorkGroupInfo(kernel, devices[deviceId]);
    CHECK_OPENCL_ERROR(status, "kernelInfo.setKernelWorkGroupInfo failed");

    // If group-size is gerater than maximum supported on kernel
    if((size_t)(numSteps + 1) > kernelInfo.kernelWorkGroupSize)
    {
        if(!quiet)
        {
            std::cout << "Out of Resources!" << std::endl;
            std::cout << "Group Size specified : " << (numSteps + 1) << std::endl;
            std::cout << "Max Group Size supported on the kernel : " 
                      << kernelInfo.kernelWorkGroupSize << std::endl;
            std::cout << "Using appropiate group-size." << std::endl;
            std::cout << "-------------------------------------------" << std::endl;
        }
        numSteps = (cl_int)kernelInfo.kernelWorkGroupSize - 2;
    }

    return SDK_SUCCESS;
}
int
ScanLargeArrays::setupCL(void)
{
    cl_int status = 0;

    cl_device_type dType;

    if(deviceType.compare("cpu") == 0)
    {
        dType = CL_DEVICE_TYPE_CPU;
    }
    else //deviceType = "gpu" 
    {
        dType = CL_DEVICE_TYPE_GPU;
        if(isThereGPU() == false)
        {
            std::cout << "GPU not found. Falling back to CPU device" << std::endl;
            dType = CL_DEVICE_TYPE_CPU;
        }
    }

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */

    cl_platform_id platform = NULL;
    int retValue = sampleCommon->getPlatform(platform, platformId, isPlatformEnabled());
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::getPlatform() failed");

    // Display available devices.
    retValue = sampleCommon->displayDevices(platform, dType);
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::displayDevices() failed");


    /*
     * If we could find our platform, use it. Otherwise use just available platform.
     */
    cl_context_properties cps[3] = 
    {
        CL_CONTEXT_PLATFORM, 
        (cl_context_properties)platform, 
        0
    };

    context = clCreateContextFromType(
        cps,
        dType,
        NULL,
        NULL,
        &status);
    CHECK_OPENCL_ERROR(status,"clCreateContextFromType failed.");

    status = sampleCommon->getDevices(context, &devices, deviceId,  isDeviceIdEnabled());
    CHECK_ERROR(status, SDK_SUCCESS, "sampleCommon::getDevices() failed");

    {
        // The block is to move the declaration of prop closer to its use
        cl_command_queue_properties prop = 0;
        commandQueue = clCreateCommandQueue(
            context, 
            devices[deviceId], 
            prop, 
            &status);
        if(sampleCommon->checkVal(
            status,
            0,
            "clCreateCommandQueue failed."))
            return SDK_FAILURE;
    }

    // Get Device specific Information 

    //Set device info of given cl_device_id
    retValue = deviceInfo.setDeviceInfo(devices[deviceId]);
    CHECK_ERROR(retValue, SDK_SUCCESS, "SDKDeviceInfo::setDeviceInfo() failed");

    // create a CL program using the kernel source 
    streamsdk::buildProgramData buildData;
    buildData.kernelName = std::string("ScanLargeArrays_Kernels.cl");
    buildData.devices = devices;
    buildData.deviceId = deviceId;
    buildData.flagsStr = std::string("");
    if(isLoadBinaryEnabled())
        buildData.binaryName = std::string(loadBinary.c_str());

    if(isComplierFlagsSpecified())
        buildData.flagsFileName = std::string(flags.c_str());

    retValue = sampleCommon->buildOpenCLProgram(program, context, buildData);
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::buildOpenCLProgram() failed");

    // get a kernel object handle for a kernel with the given name
    bScanKernel = clCreateKernel(program, "ScanLargeArrays", &status);
    CHECK_OPENCL_ERROR(status,"clCreateKernel failed.(bScanKernel)");

    bAddKernel = clCreateKernel(program, "blockAddition", &status);
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(bAddKernel)");

    // get a kernel object handle for a kernel with the given name
    pScanKernel = clCreateKernel(program, "prefixSum", &status);
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(pScanKernel)");

    status = kernelInfoBScan.setKernelWorkGroupInfo(bScanKernel,devices[deviceId]);
    CHECK_ERROR(status, SDK_SUCCESS, " setKernelWorkGroupInfo() failed");

    status = kernelInfoBAdd.setKernelWorkGroupInfo(pScanKernel,devices[deviceId]);
    CHECK_ERROR(status, SDK_SUCCESS, " setKernelWorkGroupInfo() failed");

    status = kernelInfoPScan.setKernelWorkGroupInfo(bAddKernel,devices[deviceId]);
    CHECK_ERROR(status, SDK_SUCCESS, " setKernelWorkGroupInfo() failed");

    // Find munimum of all kernel's group-sizes
    size_t temp = min(kernelInfoBScan.kernelWorkGroupSize, kernelInfoPScan.kernelWorkGroupSize);
    temp = (temp > kernelInfoBAdd.kernelWorkGroupSize) ? kernelInfoBAdd.kernelWorkGroupSize : temp;

    if(blockSize > (cl_uint)temp)
    {
        if(!quiet)
        {
            std::cout << "Out of Resources!" << std::endl;
            std::cout << "Group Size specified : " << blockSize << std::endl;
            std::cout << "Max Group Size supported on the kernel : "
                      << temp << std::endl;
            std::cout << "Falling back to " << temp << std::endl;
        }
        blockSize = (cl_uint)temp;
    }
	
	blockSize = min(blockSize,length/2);
    // Calculate number of passes required
    float t = log((float)length) / log((float)blockSize);
    pass = (cl_uint)t;

    // If t is equal to pass
    if(fabs(t - (float)pass) < 1e-7)
    {
        pass--;
    }

    // Create input buffer on device
    inputBuffer = clCreateBuffer(
        context, 
        CL_MEM_READ_ONLY,
        sizeof(cl_float) * length,
        0, 
        &status);
    CHECK_OPENCL_ERROR(status,"clCreateBuffer failed.(inputBuffer)");

    // Allocate output buffers
    outputBuffer = (cl_mem*)malloc(pass * sizeof(cl_mem));

    for(int i = 0; i < (int)pass; i++)
    {
        int size = (int)(length / pow((float)blockSize,(float)i));
        outputBuffer[i] = clCreateBuffer(
            context, 
            CL_MEM_READ_WRITE,
            sizeof(cl_float) * size,
            0, 
            &status);
        CHECK_OPENCL_ERROR(status,"clCreateBuffer failed.(outputBuffer)");
    }

    // Allocate blockSumBuffers
    blockSumBuffer = (cl_mem*)malloc(pass * sizeof(cl_mem));

    for(int i = 0; i < (int)pass; i++)
    {
        int size = (int)(length / pow((float)blockSize,(float)(i + 1)));
        blockSumBuffer[i] = clCreateBuffer(
            context, 
            CL_MEM_READ_WRITE,
            sizeof(cl_float) * size,
            0, 
            &status);

    CHECK_OPENCL_ERROR(status,"clCreateBuffer failed.(blockSumBuffer)");       	
    }

    // Create a tempBuffer on device
    int tempLength = (int)(length / pow((float)blockSize, (float)pass));

    tempBuffer = clCreateBuffer(context,
        CL_MEM_READ_WRITE,
        sizeof(cl_float) * tempLength,
        0,
        &status);
    CHECK_OPENCL_ERROR(status,"clCreateBuffer failed.(tempBuffer)");

    return SDK_SUCCESS;
}