예제 #1
0
int
ComputeBench::setupCL(void)
{
    cl_int status = 0;
    cl_device_type dType;

    if (sampleArgs->deviceType.compare("cpu") == 0) {
        dType = CL_DEVICE_TYPE_CPU;
    } else //deviceType = "gpu"
    {
        dType = CL_DEVICE_TYPE_GPU;
        if (sampleArgs->isThereGPU() == false) {
            std::cout << "GPU not found. Falling back to CPU device" << std::endl;
            dType = CL_DEVICE_TYPE_CPU;
        }
    }

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */
    cl_platform_id platform = NULL;
    int retValue = getPlatform(platform, sampleArgs->platformId, sampleArgs->isPlatformEnabled());
    CHECK_ERROR(retValue, SDK_SUCCESS, "getPlatform() failed");

    // Display available devices.
    retValue = displayDevices(platform, dType);
    CHECK_ERROR(retValue, SDK_SUCCESS, "displayDevices() failed");

    /*
     * If we could find our platform, use it. Otherwise use just available platform.
     */
    cl_context_properties cps[3] = {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties) platform,
        0
    };

    context = clCreateContextFromType(cps,
            dType,
            NULL,
            NULL,
            &status);
    CHECK_OPENCL_ERROR(status, "clCreateContextFromType failed.");

    // getting device on which to run the sample
    status = getDevices(context, &devices, sampleArgs->deviceId,
            sampleArgs->isDeviceIdEnabled());
    CHECK_ERROR(status, SDK_SUCCESS, "getDevices() failed");

    //Set device info of given cl_device_id
    retValue = deviceInfo.setDeviceInfo(devices[sampleArgs->deviceId]);
    CHECK_ERROR(retValue, SDK_SUCCESS, "SDKDeviceInfo::setDeviceInfo() failed");

    std::string deviceStr(deviceInfo.deviceVersion);
    size_t vStart = deviceStr.find(" ", 0);
    size_t vEnd = deviceStr.find(" ", vStart + 1);
    std::string vStrVal = deviceStr.substr(vStart + 1, vEnd - vStart - 1);


    // OpenCL 1.1 has inbuilt support for vec3 data types
    if (vec3 == true) {
        OPENCL_EXPECTED_ERROR("Device doesn't support built-in 3 component vectors!");
    }
    // The block is to move the declaration of prop closer to its use
    /* Note: Using deprecated clCreateCommandQueue as CL_QUEUE_PROFILING_ENABLE flag not currently working 
     ***with clCreateCommandQueueWithProperties*/
    cl_command_queue_properties prop = 0;
    prop |= CL_QUEUE_PROFILING_ENABLE;

    commandQueue = clCreateCommandQueue(context,
            devices[sampleArgs->deviceId],
            prop,
            &status);
    CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed.");

    if (sampleArgs->isLoadBinaryEnabled()) {
        // Always assuming kernel was dumped for vector-width 1
        if (vectorSize != 0) {
            std::cout <<
                    "Ignoring specified vector-width. Assuming kernel was dumped for vector-width 1"
                    << std::endl;
        }
        vectorSize = 1;
    } else {
        // If vector-size is not specified in the command-line, choose the preferred size for the device
        if (vectorSize == 0) {
            vectorSize = deviceInfo.preferredFloatVecWidth;
        } else if (vectorSize == 3) {
            //Make vectorSize as 4 if -v option is 3.
            //This memory alignment is required as per OpenCL for type3 vectors
            vec3 = true;
            vectorSize = 4;
        } else if ((1 != vectorSize) && (2 != vectorSize) && (4 != vectorSize) &&
                (8 != vectorSize) && (16 != vectorSize)) {
            std::cout << "The vectorsize can only be one of 1,2,3(4),4,8,16!" << std::endl;
            return SDK_FAILURE;
        }
    }


    outputKadd = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof (cl_float) * vectorSize * length, 0, &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (outputKadd)");

    // create a CL program using the kernel source
    char buildOption[512];
    if (vectorSize == 1) {
        sprintf(buildOption, "-D DATATYPE=uint -D DATATYPE2=uint4 ");
        //sprintf(buildOption, "-D DATATYPE=float -D DATATYPE2=float4 ");
    } else {
        sprintf(buildOption, "-D DATATYPE=uint%d -D DATATYPE2=uint%d ", (vec3 == true) ? 3 : vectorSize, (vec3 == true) ? 3 : vectorSize);
        //sprintf(buildOption, "-D DATATYPE=float%d -D DATATYPE2=float%d ", (vec3 == true) ? 3 : vectorSize, (vec3 == true) ? 3 : vectorSize);
    }

    strcat(buildOption, "-D IDXTYPE=uint ");

    // create a CL program using the kernel source
    buildProgramData buildData;
    buildData.kernelName = std::string("ComputeBench.cl");
    buildData.devices = devices;
    buildData.deviceId = sampleArgs->deviceId;
    buildData.flagsStr = std::string(buildOption);
    if (sampleArgs->isLoadBinaryEnabled()) {
        buildData.binaryName = std::string(sampleArgs->loadBinary.c_str());
    }

    if (sampleArgs->isComplierFlagsSpecified()) {
        buildData.flagsFileName = std::string(sampleArgs->flags.c_str());
    }

    retValue = buildOpenCLProgram(program, context, buildData);
    CHECK_ERROR(retValue, SDK_SUCCESS, "buildOpenCLProgram() failed");

    // Global memory bandwidth from read-single access
    kernel[0] = clCreateKernel(program, "Kadd", &status);
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(Kadd)");

    return SDK_SUCCESS;
}
int AtomicCounters::setupCL(void) {
  cl_int status = 0;
  cl_device_type dType;
  if (sampleArgs->deviceType.compare("cpu") == 0) {
    dType = CL_DEVICE_TYPE_CPU;
  } else  // deviceType = "gpu"
  {
    dType = CL_DEVICE_TYPE_GPU;
    if (sampleArgs->isThereGPU() == false) {
      std::cout << "GPU not found. Falling back to CPU" << std::endl;
      dType = CL_DEVICE_TYPE_CPU;
    }
  }
  cl_platform_id platform = NULL;
  int retValue = getPlatform(platform, sampleArgs->platformId,
                             sampleArgs->isPlatformEnabled());
  CHECK_ERROR(retValue, SDK_SUCCESS, "getPlatform() failed.");
  // Display available devices.
  retValue = displayDevices(platform, dType);
  CHECK_ERROR(retValue, SDK_SUCCESS, "displayDevices() failed.");
  cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM,
                                  (cl_context_properties)platform, 0};
  context = clCreateContextFromType(cps, dType, NULL, NULL, &status);
  CHECK_OPENCL_ERROR(status, "clCreateContextFromType failed.");
  // getting device on which to run the sample
  status = getDevices(context, &devices, sampleArgs->deviceId,
                      sampleArgs->isDeviceIdEnabled());
  CHECK_ERROR(status, SDK_SUCCESS, "getDevices() failed ");
  // Set device info of given cl_device_id
  retValue = deviceInfo.setDeviceInfo(devices[sampleArgs->deviceId]);
  CHECK_ERROR(retValue, SDK_SUCCESS, "SDKDeviceInfo::setDeviceInfo() failed");
  // Check device extensions
  if (!strstr(deviceInfo.extensions, "cl_ext_atomic_counters_32")) {
    OPENCL_EXPECTED_ERROR(
        "Device does not support cl_ext_atomic_counters_32 extension!");
  }
  if (!strstr(deviceInfo.extensions, "cl_khr_local_int32_base_atomics")) {
    OPENCL_EXPECTED_ERROR(
        "Device does not support cl_khr_local_int32_base_atomics extension!");
  }
  // Get OpenCL device version
  std::string deviceVersionStr = std::string(deviceInfo.deviceVersion);
  size_t vStart = deviceVersionStr.find(" ", 0);
  size_t vEnd = deviceVersionStr.find(" ", vStart + 1);
  std::string vStrVal = deviceVersionStr.substr(vStart + 1, vEnd - vStart - 1);
// Check of OPENCL_C_VERSION if device version is 1.1 or later
#ifdef CL_VERSION_1_1
  if (deviceInfo.openclCVersion) {
    // Exit if OpenCL C device version is 1.0
    deviceVersionStr = std::string(deviceInfo.openclCVersion);
    vStart = deviceVersionStr.find(" ", 0);
    vStart = deviceVersionStr.find(" ", vStart + 1);
    vEnd = deviceVersionStr.find(" ", vStart + 1);
    vStrVal = deviceVersionStr.substr(vStart + 1, vEnd - vStart - 1);
    if (vStrVal.compare("1.0") <= 0) {
      OPENCL_EXPECTED_ERROR(
          "Unsupported device! Required CL_DEVICE_OPENCL_C_VERSION as 1.1");
    }
  } else {
    OPENCL_EXPECTED_ERROR(
        "Unsupported device! Required CL_DEVICE_OPENCL_C_VERSION as 1.1");
  }
#else
  OPENCL_EXPECTED_ERROR(
      "Unsupported device! Required CL_DEVICE_OPENCL_C_VERSION as 1.1");
#endif
  // Setup application data
  if (setupAtomicCounters() != SDK_SUCCESS) {
    return SDK_FAILURE;
  }
  cl_command_queue_properties props = CL_QUEUE_PROFILING_ENABLE;
  commandQueue = clCreateCommandQueue(context, devices[sampleArgs->deviceId],
                                      props, &status);
  CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed(commandQueue)");
  // Set Persistent memory only for AMD platform
  cl_mem_flags inMemFlags = CL_MEM_READ_ONLY;
  if (sampleArgs->isAmdPlatform()) {
    inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD;
  }
  // Create buffer for input array
  inBuf = clCreateBuffer(context, inMemFlags, length * sizeof(cl_uint), NULL,
                         &status);
  CHECK_OPENCL_ERROR(status, "clCreateBuffer failed.(inBuf)");
  // Set up data for input array
  cl_event writeEvt;
  status =
      clEnqueueWriteBuffer(commandQueue, inBuf, CL_FALSE, 0,
                           length * sizeof(cl_uint), input, 0, NULL, &writeEvt);
  CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer(inBuf) failed..");
  status = clFlush(commandQueue);
  CHECK_OPENCL_ERROR(status, "clFlush(commandQueue) failed.");
  counterOutBuf = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uint),
                                 NULL, &status);
  CHECK_OPENCL_ERROR(status, "clCreateBuffer failed.(counterOutBuf).");
  globalOutBuf = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uint),
                                NULL, &status);
  CHECK_OPENCL_ERROR(status, "clCreateBuffer failed.(globalOutBuf).");
  // create a CL program using the kernel source
  buildProgramData buildData;
  buildData.kernelName = std::string("AtomicCounters_Kernels.cl");
  buildData.devices = devices;
  buildData.deviceId = sampleArgs->deviceId;
  buildData.flagsStr = std::string("");
  if (sampleArgs->isLoadBinaryEnabled()) {
    buildData.binaryName = std::string(sampleArgs->loadBinary.c_str());
  }
  if (sampleArgs->isComplierFlagsSpecified()) {
    buildData.flagsFileName = std::string(sampleArgs->flags.c_str());
  }
  retValue = buildOpenCLProgram(program, context, buildData);
  CHECK_ERROR(retValue, SDK_SUCCESS, "buildOpenCLProgram() failed");
  // ConstantBuffer bandwidth from single access
  counterKernel = clCreateKernel(program, "atomicCounters", &status);
  CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(counterKernel).");
  globalKernel = clCreateKernel(program, "globalAtomics", &status);
  CHECK_OPENCL_ERROR(status, "clCreateKernel(globalKernel) failed.");
  status = kernelInfoC.setKernelWorkGroupInfo(counterKernel,
                                              devices[sampleArgs->deviceId]);
  CHECK_OPENCL_ERROR(status, "kernelInfo.setKernelWorkGroupInfo failed");
  status = kernelInfoG.setKernelWorkGroupInfo(globalKernel,
                                              devices[sampleArgs->deviceId]);
  CHECK_OPENCL_ERROR(status, "kernelInfo.setKernelWorkGroupInfo failed");
  if (counterWorkGroupSize > kernelInfoC.kernelWorkGroupSize) {
    if (!sampleArgs->quiet) {
      std::cout << "Out of Resources!" << std::endl;
      std::cout << "Group Size specified : " << counterWorkGroupSize
                << std::endl;
      std::cout << "Max Group Size supported on the kernel(readKernel) : "
                << kernelInfoC.kernelWorkGroupSize << std::endl;
      std::cout << "Falling back to " << kernelInfoC.kernelWorkGroupSize
                << std::endl;
    }
    counterWorkGroupSize = kernelInfoC.kernelWorkGroupSize;
  }
  if (globalWorkGroupSize > kernelInfoG.kernelWorkGroupSize) {
    if (!sampleArgs->quiet) {
      std::cout << "Out of Resources!" << std::endl;
      std::cout << "Group Size specified : " << globalWorkGroupSize
                << std::endl;
      std::cout << "Max Group Size supported on the kernel(writeKernel) : "
                << kernelInfoG.kernelWorkGroupSize << std::endl;
      std::cout << "Falling back to " << kernelInfoG.kernelWorkGroupSize
                << std::endl;
    }
    globalWorkGroupSize = kernelInfoG.kernelWorkGroupSize;
  }
  // Wait for event and release event
  status = waitForEventAndRelease(&writeEvt);
  CHECK_OPENCL_ERROR(status, "waitForEventAndRelease(writeEvt) failed.");
  return SDK_SUCCESS;
}
int
FluidSimulation2D::setupCL()
{
    cl_int status = CL_SUCCESS;
    cl_device_type dType;

    if(deviceType.compare("cpu") == 0)
    {
        dType = CL_DEVICE_TYPE_CPU;
    }
    else //deviceType = "gpu" 
    {
        dType = CL_DEVICE_TYPE_GPU;
        if(isThereGPU() == false)
        {
            std::cout << "GPU not found. Falling back to CPU device" << std::endl;
            dType = CL_DEVICE_TYPE_CPU;
        }
    }

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */
    cl_platform_id platform = NULL;
    int retValue = sampleCommon->getPlatform(platform, platformId, isPlatformEnabled());
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::getPlatform() failed");

    // Display available devices.
    retValue = sampleCommon->displayDevices(platform, dType);
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::displayDevices() failed");


    // If we could find our platform, use it. Otherwise use just available platform.

    cl_context_properties cps[3] = 
    {
        CL_CONTEXT_PLATFORM, 
        (cl_context_properties)platform, 
        0
    };

    context = clCreateContextFromType(
                  cps,
                  dType,
                  NULL,
                  NULL,
                  &status);
    CHECK_OPENCL_ERROR( status, "clCreateContextFromType failed.");

    // getting device on which to run the sample
    status = sampleCommon->getDevices(context, &devices, deviceId, isDeviceIdEnabled());
    CHECK_ERROR(status, SDK_SUCCESS, "sampleCommon::getDevices() failed");

    {
        // The block is to move the declaration of prop closer to its use 
        cl_command_queue_properties prop = 0;
        commandQueue = clCreateCommandQueue(
                context, 
                devices[deviceId], 
                prop, 
                &status);
        CHECK_OPENCL_ERROR( status, "clCreateCommandQueue failed.");
    }

    //Set device info of given cl_device_id
    retValue = deviceInfo.setDeviceInfo(devices[deviceId]);
    CHECK_ERROR(retValue, 0, "SDKDeviceInfo::setDeviceInfo() failed");

    
    std::string buildOptions = std::string("");
    // Check if cl_khr_fp64 extension is supported 
    if(strstr(deviceInfo.extensions, "cl_khr_fp64"))
    {
        buildOptions.append("-D KHR_DP_EXTENSION");
    }
    else
    {
        // Check if cl_amd_fp64 extension is supported 
        if(!strstr(deviceInfo.extensions, "cl_amd_fp64"))
        {
            reqdExtSupport = false;
            OPENCL_EXPECTED_ERROR("Device does not support cl_amd_fp64 extension!");
        }
    }

    
    /*
    * Create and initialize memory objects
    */

    size_t temp = dims[0] * dims[1];
    d_if0 = clCreateBuffer(context, 
        CL_MEM_READ_WRITE, 
        sizeof(cl_double) * temp, 
        0, 
        &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (d_if0)");

    status = clEnqueueWriteBuffer(commandQueue,
        d_if0,
        1,
        0,
        sizeof(cl_double) * temp,
        h_if0,
        0, 0, 0);
    CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed. (d_if0)");

    d_if1234 = clCreateBuffer(context, 
        CL_MEM_READ_WRITE, 
        sizeof(cl_double4) * temp, 
        0, 
        &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (d_if1234)");

    status = clEnqueueWriteBuffer(commandQueue,
        d_if1234,
        1,
        0,
        sizeof(cl_double4) * temp,
        h_if1234,
        0, 0, 0);
    CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed. (d_if1234)");

    d_if5678 = clCreateBuffer(context, 
        CL_MEM_READ_WRITE, 
        sizeof(cl_double4) * temp, 
        0, 
        &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (d_if5678)");

    status = clEnqueueWriteBuffer(commandQueue,
        d_if5678,
        1,
        0,
        sizeof(cl_double4) * temp,
        h_if5678,
        0, 0, 0);
    CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed. (d_if5678)");

    d_of0 = clCreateBuffer(context, 
        CL_MEM_READ_WRITE, 
        sizeof(cl_double) * temp, 
        0, 
        &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (d_of0)");

    d_of1234 = clCreateBuffer(context, 
        CL_MEM_READ_WRITE, 
        sizeof(cl_double4) * temp, 
        0, 
        &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (d_of1234)");

    d_of5678 = clCreateBuffer(context, 
        CL_MEM_READ_WRITE, 
        sizeof(cl_double4) * temp, 
        0, 
        &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (d_of5678)");

    status = clEnqueueCopyBuffer(commandQueue,
        d_if0,
        d_of0,
        0, 0, sizeof(cl_double) * temp,
        0, 0, 0);
    CHECK_OPENCL_ERROR(status, "clEnqueueCopyBuffer failed. (d_if0->d_of0)");

    status = clEnqueueCopyBuffer(commandQueue,
        d_if1234,
        d_of1234,
        0, 0, sizeof(cl_double4) * temp,
        0, 0, 0);
    CHECK_OPENCL_ERROR(status, "clEnqueueCopyBuffer failed. (d_if1234->d_of1234)");

    status = clEnqueueCopyBuffer(commandQueue,
        d_if5678,
        d_of5678,
        0, 0, sizeof(cl_double4) * temp,
        0, 0, 0);
    CHECK_OPENCL_ERROR(status, "clEnqueueCopyBuffer failed. (d_if5678->d_of5678)");

    status = clFinish(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFinish failed.");

    // Set Presistent memory only for AMD platform
    cl_mem_flags inMemFlags = CL_MEM_READ_ONLY;
    if(isAmdPlatform())
        inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD;

    //Constant arrays
    type = clCreateBuffer(context, 
        inMemFlags, 
        sizeof(cl_bool) * temp,
        0,
        &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (type)");

    weight = clCreateBuffer(context,
        CL_MEM_READ_ONLY,
        sizeof(cl_double) * 9,
        0,
        &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (weight)");

    status = clEnqueueWriteBuffer(commandQueue,
        weight, 
        1, 0, sizeof(cl_double) * 9,
        w,
        0, 0, 0);
    CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed. (weight)");

    velocity = clCreateBuffer(context,
        CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
        sizeof(cl_double2) * temp,
        0, &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (velocity)");

    // create a CL program using the kernel source 
    streamsdk::buildProgramData buildData;
    buildData.kernelName = std::string("FluidSimulation2D_Kernels.cl");
    buildData.devices = devices;
    buildData.deviceId = deviceId;
    buildData.flagsStr = std::string("");
    if(isLoadBinaryEnabled())
        buildData.binaryName = std::string(loadBinary.c_str());

    if(isComplierFlagsSpecified())
        buildData.flagsFileName = std::string(flags.c_str());

    retValue = sampleCommon->buildOpenCLProgram(program, context, buildData);
    CHECK_ERROR(retValue, 0, "sampleCommon::buildOpenCLProgram() failed");

    // get a kernel object handle for a kernel with the given name 
    kernel = clCreateKernel(
        program,
        "lbm",
        &status);
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.");

    return SDK_SUCCESS;
}
int
GlobalMemoryBandwidth::setupCL(void)
{
    cl_int status = 0;
    cl_device_type dType;
    if(deviceType.compare("cpu") == 0)
    {
        dType = CL_DEVICE_TYPE_CPU;
    }
    else //deviceType = "gpu" 
    {
        dType = CL_DEVICE_TYPE_GPU;
        if(isThereGPU() == false)
        {
            std::cout << "GPU not found. Falling back to CPU device" << std::endl;
            dType = CL_DEVICE_TYPE_CPU;
        }
    }

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */
    cl_platform_id platform = NULL;
    int retValue = sampleCommon->getPlatform(platform, platformId, isPlatformEnabled());
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::getPlatform() failed");

    // Display available devices.
    retValue = sampleCommon->displayDevices(platform, dType);
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::displayDevices() failed");

    /*
     * If we could find our platform, use it. Otherwise use just available platform.
     */
    cl_context_properties cps[3] = 
    {
        CL_CONTEXT_PLATFORM, 
        (cl_context_properties)platform, 
        0
    };

    context = clCreateContextFromType(cps,
                                      dType,
                                      NULL,
                                      NULL,
                                      &status);
    CHECK_OPENCL_ERROR(status, "clCreateContextFromType failed.");

    // getting device on which to run the sample
    status = sampleCommon->getDevices(context, &devices, deviceId, isDeviceIdEnabled());
    CHECK_ERROR(status, SDK_SUCCESS, "sampleCommon::getDevices() failed");

    //Set device info of given cl_device_id
    retValue = deviceInfo.setDeviceInfo(devices[deviceId]);
    CHECK_ERROR(retValue, SDK_SUCCESS, "SDKDeviceInfo::setDeviceInfo() failed");

    std::string deviceStr(deviceInfo.deviceVersion);
    size_t vStart = deviceStr.find(" ", 0);
    size_t vEnd = deviceStr.find(" ", vStart + 1);
    std::string vStrVal = deviceStr.substr(vStart + 1, vEnd - vStart - 1);

#ifdef CL_VERSION_1_1
    if(vStrVal.compare("1.0") > 0)
    {
        char openclVersion[1024];
        status = clGetDeviceInfo(devices[deviceId],
                                 CL_DEVICE_OPENCL_C_VERSION,
                                 sizeof(openclVersion),
                                 openclVersion,
                                 0);
        CHECK_OPENCL_ERROR(status, "clGetDeviceInfo failed.");
        
        std::string tempStr(openclVersion);
        size_t dotPos = tempStr.find_first_of(".");
        size_t spacePos = tempStr.find_last_of(" ");
        tempStr = tempStr.substr(dotPos + 1, spacePos - dotPos);
        int minorVersion = atoi(tempStr.c_str());
        // OpenCL 1.1 has inbuilt support for vec3 data types
        if(minorVersion < 1 && vec3 == true)
        {
            OPENCL_EXPECTED_ERROR("Device doesn't support built-in 3 component vectors!");
        }
    }
    else
    {
        // OpenCL 1.1 has inbuilt support for vec3 data types
        if(vec3 == true)
        {
            OPENCL_EXPECTED_ERROR("Device doesn't support built-in 3 component vectors!");
        }
    }
#else
    // OpenCL 1.1 has inbuilt support for vec3 data types
    if(vec3 == true)
    {
        OPENCL_EXPECTED_ERROR("Device doesn't support built-in 3 component vectors!");
    }
#endif

    {
        // The block is to move the declaration of prop closer to its use 
        cl_command_queue_properties prop = 0;
        prop |= CL_QUEUE_PROFILING_ENABLE;

        commandQueue = clCreateCommandQueue(context, 
                                            devices[deviceId], 
                                            prop, 
                                            &status);
        CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed.");
    }

    cl_uint sizeElement = vectorSize * sizeof(cl_float);
    cl_uint readLength = length + (NUM_READS * 1024 / sizeElement) + EXTRA_BYTES;
    cl_uint size = readLength * vectorSize * sizeof(cl_float);

    // Create input buffer
    inputBuffer = clCreateBuffer(context, 
                                 CL_MEM_READ_ONLY,
                                 size,
                                 0, 
                                 &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (inputBuffer)");

    // Write data to buffer
    status = clEnqueueWriteBuffer(commandQueue,
                                  inputBuffer,
                                  1,
                                  0,
                                  size,
                                  input,
                                  0,
                                  0,
                                  0);
    CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed. (inputBuffer)");

    outputBufferReadSingle = clCreateBuffer(context, 
                                            CL_MEM_WRITE_ONLY,
                                            sizeof(cl_float) * vectorSize * length,
                                            0, 
                                            &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (outputBufferReadSingle)");

    // Write data to buffer
    status = clEnqueueWriteBuffer(commandQueue,
                                  outputBufferReadSingle,
                                  CL_TRUE,
                                  0,
                                  sizeof(cl_float) * vectorSize * length,
                                  outputReadSingle,
                                  0,
                                  NULL,
                                  NULL);
    CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed. (outputBufferReadSingle)");

    outputBufferReadLinear = clCreateBuffer(context, 
                                            CL_MEM_WRITE_ONLY,
                                            sizeof(cl_float) * vectorSize * length,
                                            0, 
                                            &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (outputBufferReadLinear)");

    // Write data to buffer
    status = clEnqueueWriteBuffer(commandQueue,
                                  outputBufferReadLinear,
                                  CL_TRUE,
                                  0,
                                  sizeof(cl_float) * vectorSize * length,
                                  outputReadLinear,
                                  0,
                                  NULL,
                                  NULL);
    CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed. (outputBufferReadLinear)");

    outputBufferReadLU = clCreateBuffer(context, 
                                        CL_MEM_WRITE_ONLY,
                                        sizeof(cl_float) * vectorSize * length,
                                        0, 
                                        &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (outputBufferReadLU)");

    // Write data to buffer 
    status = clEnqueueWriteBuffer(commandQueue,
                                  outputBufferReadLU,
                                  CL_TRUE,
                                  0,
                                  sizeof(cl_float) * vectorSize * length,
                                  outputReadLU,
                                  0,
                                  NULL,
                                  NULL);
    CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed. (outputBufferReadLU)");

     outputBufferWriteLinear = clCreateBuffer(context, 
                                              CL_MEM_WRITE_ONLY,
                                              size,
                                              0, 
                                              &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (outputBufferWriteLinear)");

    // Write data to buffer 
    status = clEnqueueWriteBuffer(commandQueue,
                                  outputBufferWriteLinear,
                                  CL_TRUE,
                                  0,
                                  size,
                                  outputWriteLinear,
                                  0,
                                  NULL,
                                  NULL);
    CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed. (outputBufferWriteLinear)");

    // create a CL program using the kernel source
    char buildOption[128];
    if(vectorSize == 1)
        sprintf(buildOption, "-D DATATYPE=float -D OFFSET=%d ", OFFSET);
    else
        sprintf(buildOption, "-D DATATYPE=float%d -D OFFSET=%d ", (vec3 == true) ? 3 : vectorSize, OFFSET);

    // create a CL program using the kernel source
    streamsdk::buildProgramData buildData;
    buildData.kernelName = std::string("GlobalMemoryBandwidth_Kernels.cl");
    buildData.devices = devices;
    buildData.deviceId = deviceId;
    buildData.flagsStr = std::string(buildOption);
    if(isLoadBinaryEnabled())
        buildData.binaryName = std::string(loadBinary.c_str());

    if(isComplierFlagsSpecified())
        buildData.flagsFileName = std::string(flags.c_str());

    retValue = sampleCommon->buildOpenCLProgram(program, context, buildData);
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::buildOpenCLProgram() failed");

    // Global memory bandwidth from read-single access
    kernel[0] = clCreateKernel(program, "read_single", &status);
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(read_single)");

    // Global memory  bandwidth from read-linear access
    kernel[1] = clCreateKernel(program, "read_linear", &status);
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(read_linear)");

    // Global memory  bandwidth from read-linear access
    kernel[2] = clCreateKernel(program, "read_linear_uncached", &status);
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(read_linear_uncached)");

    // Global memory  bandwidth from write-linear access
    kernel[3] = clCreateKernel(program, "write_linear", &status);
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(GlobalBandwidth_write_linear)");

    return SDK_SUCCESS;
}
int
MatrixMulDouble::setupCL(void)
{
    cl_int status = 0;
    cl_device_type dType;

    if(sampleArgs->deviceType.compare("cpu") == 0)
    {
        dType = CL_DEVICE_TYPE_CPU;
    }
    else //deviceType = "gpu"
    {
        dType = CL_DEVICE_TYPE_GPU;
        if(sampleArgs->isThereGPU() == false)
        {
            std::cout << "GPU not found. Falling back to CPU device" << std::endl;
            dType = CL_DEVICE_TYPE_CPU;
        }
    }

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */
    status = cl::Platform::get(&platforms);
    CHECK_OPENCL_ERROR(status, "Platform::get() failed.");

    std::vector<cl::Platform>::iterator i;
    if(platforms.size() > 0)
    {
        if(sampleArgs->isPlatformEnabled())
        {
            i = platforms.begin() + sampleArgs->platformId;
        }
        else
        {
            for(i = platforms.begin(); i != platforms.end(); ++i)
            {
                if(!strcmp((*i).getInfo<CL_PLATFORM_VENDOR>().c_str(),
                           "Advanced Micro Devices, Inc."))
                {
                    break;
                }
            }
        }
    }

    cl_context_properties cps[3] =
    {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties)(*i)(),
        0
    };


    if(NULL == (*i)())
    {
        error("NULL platform found so Exiting Application.");
        return SDK_FAILURE;
    }

    context = cl::Context(dType, cps, NULL, NULL, &status);
    CHECK_OPENCL_ERROR(status, "Context::Context() failed.");

    devices = context.getInfo<CL_CONTEXT_DEVICES>();
    CHECK_OPENCL_ERROR(status, "Context::getInfo() failed.");

    std::cout << "Platform :" << (*i).getInfo<CL_PLATFORM_VENDOR>().c_str() << "\n";
    int deviceCount = (int)devices.size();
    int j = 0;
    for (std::vector<cl::Device>::iterator i = devices.begin(); i != devices.end();
            ++i, ++j)
    {
        std::cout << "Device " << j << " : ";
        std::string deviceName = (*i).getInfo<CL_DEVICE_NAME>();
        std::cout << deviceName.c_str() << "\n";
    }
    std::cout << "\n";

    if (deviceCount == 0)
    {
        std::cerr << "No device available\n";
        return SDK_FAILURE;
    }

    if(validateDeviceId(sampleArgs->deviceId, deviceCount))
    {
        error("validateDeviceId() failed");
        return SDK_FAILURE;
    }

    std::string extensions =
        devices[sampleArgs->deviceId].getInfo<CL_DEVICE_EXTENSIONS>();

    std::string buildOptions = std::string("");
    // Check if cl_khr_fp64 extension is supported
    if(strstr(extensions.c_str(), "cl_khr_fp64"))
    {
        buildOptions.append("-D KHR_DP_EXTENSION");
    }
    else
    {
        // Check if cl_amd_fp64 extension is supported
        if(!strstr(extensions.c_str(), "cl_amd_fp64"))
        {
            OPENCL_EXPECTED_ERROR("Device does not support cl_amd_fp64 extension!");
        }
    }
    cl_uint localMemType;
    // Get device specific information
    status = devices[sampleArgs->deviceId].getInfo<cl_uint>(
                 CL_DEVICE_LOCAL_MEM_TYPE,
                 &localMemType);
    CHECK_OPENCL_ERROR(status, "Device::getInfo CL_DEVICE_LOCAL_MEM_TYPE) failed.");

    // If scratchpad is available then update the flag
    if(localMemType == CL_LOCAL)
    {
        lds = true;
    }

    // Get Device specific Information
    status = devices[sampleArgs->deviceId].getInfo<size_t>(
                 CL_DEVICE_MAX_WORK_GROUP_SIZE,
                 &maxWorkGroupSize);

    CHECK_OPENCL_ERROR(status,
                       "Device::getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE) failed.");

    status = devices[sampleArgs->deviceId].getInfo<cl_uint>(
                 CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
                 &maxDimensions);
    CHECK_OPENCL_ERROR(status,
                       "Device::getInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) failed.");


    maxWorkItemSizes = (size_t*)malloc(maxDimensions * sizeof(size_t));

    std::vector<size_t> workItems =
        devices[sampleArgs->deviceId].getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>();

    for(cl_uint i = 0; i < maxDimensions; ++i)
    {
        maxWorkItemSizes[i] = workItems[i];
    }

    status = devices[sampleArgs->deviceId].getInfo<cl_ulong>(
                 CL_DEVICE_LOCAL_MEM_SIZE,
                 &totalLocalMemory);
    CHECK_OPENCL_ERROR(status,
                       "Device::getInfo(CL_DEVICE_LOCAL_MEM_SIZES) failed.");

    // Set command queue properties
    cl_command_queue_properties prop = 0;
    if(!eAppGFLOPS)
    {
        prop |= CL_QUEUE_PROFILING_ENABLE;
    }

    commandQueue = cl::CommandQueue(context, devices[sampleArgs->deviceId], prop,
                                    &status);
    CHECK_OPENCL_ERROR(status, "CommandQueue::CommandQueue() failed.");

    // Set Persistent memory only for AMD platform
    cl_mem_flags inMemFlags = CL_MEM_READ_ONLY;
    if(sampleArgs->isAmdPlatform())
    {
        inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD;
    }

    // Create buffer for matrix A
    inputBufA = cl::Buffer(
                    context,
                    inMemFlags,
                    sizeof(cl_double) * widthA * heightA,
                    NULL,
                    &status);
    CHECK_OPENCL_ERROR(status, "cl::Buffer failed. (inputBufA)");

    // Create buffer for matrix B
    inputBufB = cl::Buffer(
                    context,
                    inMemFlags,
                    sizeof(cl_double) * widthB * heightB,
                    NULL,
                    &status);
    CHECK_OPENCL_ERROR(status, "cl::Buffer failed. (inputBufB)");

    outputBuf = cl::Buffer(
                    context,
                    CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
                    sizeof(cl_double) * heightA * widthB,
                    NULL,
                    &status);
    CHECK_OPENCL_ERROR(status, "cl::Buffer failed. (outputBuf)");

    device.push_back(devices[sampleArgs->deviceId]);

    // create a CL program using the kernel source
    SDKFile kernelFile;
    std::string kernelPath = getPath();

    if(sampleArgs->isLoadBinaryEnabled())
    {
        kernelPath.append(sampleArgs->loadBinary.c_str());
        if(kernelFile.readBinaryFromFile(kernelPath.c_str()) != SDK_SUCCESS)
        {
            std::cout << "Failed to load kernel file : " << kernelPath << std::endl;
            return SDK_FAILURE;
        }
        cl::Program::Binaries programBinary(1,std::make_pair(
                                                (const void*)kernelFile.source().data(),
                                                kernelFile.source().size()));

        program = cl::Program(context, device, programBinary, NULL, &status);
        CHECK_OPENCL_ERROR(status, "Program::Program(Binary) failed.");

    }
    else
    {
        kernelPath.append("MatrixMulDouble_Kernels.cl");
        if(!kernelFile.open(kernelPath.c_str()))
        {
            std::cout << "Failed to load kernel file : " << kernelPath << std::endl;
            return SDK_FAILURE;
        }
        cl::Program::Sources programSource(
            1,
            std::make_pair(kernelFile.source().data(),
                           kernelFile.source().size()));

        program = cl::Program(context, programSource, &status);
        CHECK_OPENCL_ERROR(status, "Program::Program(Source) failed.");

    }

    std::string flagsStr = std::string("");

    // Get build options if any
    flagsStr.append(buildOptions.c_str());

    // Get additional options
    if(sampleArgs->isComplierFlagsSpecified())
    {
        SDKFile flagsFile;
        std::string flagsPath = getPath();
        flagsPath.append(sampleArgs->flags.c_str());
        if(!flagsFile.open(flagsPath.c_str()))
        {
            std::cout << "Failed to load flags file: " << flagsPath << std::endl;
            return SDK_FAILURE;
        }
        flagsFile.replaceNewlineWithSpaces();
        const char * flags = flagsFile.source().c_str();
        flagsStr.append(flags);
    }

    if(flagsStr.size() != 0)
    {
        std::cout << "Build Options are : " << flagsStr.c_str() << std::endl;
    }

    status = program.build(device, flagsStr.c_str());

    if(status != CL_SUCCESS)
    {
        if(status == CL_BUILD_PROGRAM_FAILURE)
        {
            std::string str = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[sampleArgs->deviceId]);

            std::cout << " \n\t\t\tBUILD LOG\n";
            std::cout << " ************************************************\n";
            std::cout << str << std::endl;
            std::cout << " ************************************************\n";
        }
    }
    CHECK_OPENCL_ERROR(status, "Program::build() failed.");

    // Create kernel

    // If local memory is present then use the specific kernel
    if(lds)
    {
        kernel = cl::Kernel(program, "mmmKernel_local", &status);
    }
    else
    {
        kernel = cl::Kernel(program, "mmmKernel", &status);
    }

    CHECK_OPENCL_ERROR(status, "cl::Kernel failed.");
    status =  kernel.getWorkGroupInfo<cl_ulong>(

                  devices[sampleArgs->deviceId],
                  CL_KERNEL_LOCAL_MEM_SIZE,
                  &usedLocalMemory);
    CHECK_OPENCL_ERROR(status,
                       "Kernel::getWorkGroupInfo(CL_KERNEL_LOCAL_MEM_SIZE) failed"
                       ".(usedLocalMemory)");

    availableLocalMemory = totalLocalMemory - usedLocalMemory;
    if(lds)
    {
        neededLocalMemory = (blockSize * 4) * (blockSize * 4) * sizeof(cl_double);
    }
    else
    {
        neededLocalMemory = 0;
    }

    if(neededLocalMemory > availableLocalMemory)
    {
        std::cout << "Unsupported: Insufficient local memory on device." << std::endl;
        return SDK_FAILURE;
    }

    // Check group size against group size returned by kernel
    kernelWorkGroupSize = kernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>
                          (devices[sampleArgs->deviceId], &status);
    CHECK_OPENCL_ERROR(status, "Kernel::getWorkGroupInfo()  failed.");

    if((cl_uint)(blockSize * blockSize) > kernelWorkGroupSize)
    {
        if(kernelWorkGroupSize >= 64)
        {
            blockSize = 8;
        }
        else if(kernelWorkGroupSize >= 32)
        {
            blockSize = 4;
        }
        else
        {
            std::cout << "Out of Resources!" << std::endl;
            std::cout << "Group Size specified : " << blockSize * blockSize << std::endl;
            std::cout << "Max Group Size supported on the kernel : "
                      << kernelWorkGroupSize<<std::endl;
            return SDK_FAILURE;
        }
    }

    if(blockSize > maxWorkItemSizes[0] ||
            blockSize > maxWorkItemSizes[1] ||
            blockSize * blockSize > maxWorkGroupSize)
    {
        error("Unsupported: Device does not support requested number of work items.");
        return SDK_FAILURE;
    }

    return SDK_SUCCESS;
}
int
Histogram::setupCL(void)
{
    cl_int status = 0;
    cl_device_type dType;

    if(sampleArgs->deviceType.compare("cpu") == 0)
    {
        dType = CL_DEVICE_TYPE_CPU;
    }
    else //sampleArgs->deviceType = "gpu"
    {
        dType = CL_DEVICE_TYPE_GPU;
        if(sampleArgs->isThereGPU() == false)
        {
            std::cout << "GPU not found. Falling back to CPU device" << std::endl;
            dType = CL_DEVICE_TYPE_CPU;
        }
    }

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */
    cl_platform_id platform = NULL;
    int retValue = getPlatform(platform, sampleArgs->platformId,
                               sampleArgs->isPlatformEnabled());
    CHECK_ERROR(retValue, SDK_SUCCESS, "getPlatform() failed");

    // Display available devices.
    retValue = displayDevices(platform, dType);
    CHECK_ERROR(retValue, SDK_SUCCESS, "displayDevices() failed");


    /*
     * If we could find our platform, use it. Otherwise use just available platform.
     */
    cl_context_properties cps[3] =
    {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties)platform,
        0
    };
    context = clCreateContextFromType(
                  cps,
                  dType,
                  NULL,
                  NULL,
                  &status);
    CHECK_OPENCL_ERROR(status, "clCreateContextFromType failed.");

    // getting device on which to run the sample
    status = getDevices(context, &devices, sampleArgs->deviceId,
                        sampleArgs->isDeviceIdEnabled());
    CHECK_ERROR(status, SDK_SUCCESS, "getDevices() failed");

    // Create command queue
    commandQueue = clCreateCommandQueue(context,
                                        devices[sampleArgs->deviceId],
                                        0,
                                        &status);
    CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed.");

    //Set device info of given cl_device_id
    retValue = deviceInfo.setDeviceInfo(devices[sampleArgs->deviceId]);
    CHECK_ERROR(retValue, SDK_SUCCESS, "SDKDeviceInfo::setDeviceInfo() failed");
    if(scalar && vector)//if both options are specified
    {
        std::cout<<"Ignoring --scalar and --vector option and using the default vector width of the device"<<std::endl;
        vectorWidth = deviceInfo.preferredFloatVecWidth;
    }

    else if(scalar)
    {
        vectorWidth = 1;
    }
    else if(vector)
    {
        vectorWidth = 4;
    }
    else //if no option is specified.
    {
        vectorWidth = deviceInfo.preferredFloatVecWidth;
    }

    if(!sampleArgs->quiet)
    {
        if(vectorWidth == 1)
        {
            std::cout<<"Selecting scalar kernel\n"<<std::endl;
        }
        else
        {
            std::cout<<"Selecting vector kernel\n"<<std::endl;
        }
    }

    subHistgCnt = (width * height) / (groupSize * groupIterations);

    // Check if byte-addressable store is supported
    if(!strstr(deviceInfo.extensions, "cl_khr_byte_addressable_store"))
    {
        byteRWSupport = false;
        OPENCL_EXPECTED_ERROR("Device does not support cl_khr_byte_addressable_store extension!");
    }

    dataBuf = clCreateBuffer(
                  context,
                  CL_MEM_READ_ONLY,
                  sizeof(cl_uint) * width  * height,
                  NULL,
                  &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (dataBuf)");

    midDeviceBinBuf = clCreateBuffer(
                          context,
                          CL_MEM_WRITE_ONLY,
                          sizeof(cl_uint) * binSize * subHistgCnt,
                          NULL,
                          &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (midDeviceBinBuf)");

    // create a CL program using the kernel source
    buildProgramData buildData;
    buildData.kernelName = std::string("Histogram_Kernels.cl");
    buildData.devices = devices;
    buildData.deviceId = sampleArgs->deviceId;
    buildData.flagsStr = std::string("");
    if(sampleArgs->isLoadBinaryEnabled())
    {
        buildData.binaryName = std::string(sampleArgs->loadBinary.c_str());
    }

    if(sampleArgs->isComplierFlagsSpecified())
    {
        buildData.flagsFileName = std::string(sampleArgs->flags.c_str());
    }

    retValue = buildOpenCLProgram(program, context, buildData);
    CHECK_ERROR(retValue, 0, "buildOpenCLProgram() failed");

    // get a kernel object handle for a kernel with the given name
    const char *kernelName = (vectorWidth == 4)? "histogram256_vector":
                             "histogram256_scalar";

    kernel = clCreateKernel(program, kernelName, &status);
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.");

    return SDK_SUCCESS;
}
예제 #7
0
파일: Kernels.cpp 프로젝트: sasaki3/clLucas
int
Lucas::setupCL (void)
{
  cl_int status = 0;
  cl_device_type dType;

  if (deviceType.compare ("cpu") == 0)
    dType = CL_DEVICE_TYPE_CPU;
  else				//deviceType = "gpu" 
    {
      dType = CL_DEVICE_TYPE_GPU;
      if (isThereGPU () == false)
	{
	  std::cout << "GPU not found. Falling back to CPU device" << std::
	    endl;
	  dType = CL_DEVICE_TYPE_CPU;
	}
    }

  /*
   * Have a look at the available platforms and pick either
   * the AMD one if available or a reasonable default.
   */
  status = cl::Platform::get (&platforms);
  CHECK_OPENCL_ERROR (status, "Platform::get() failed.");

  std::vector < cl::Platform >::iterator i;
  if (platforms.size () > 0)
    {
      if (isPlatformEnabled ())
	{
	  i = platforms.begin () + platformId;
	}
      else
	{
	  for (i = platforms.begin (); i != platforms.end (); ++i)
	    {
	      if (!strcmp ((*i).getInfo < CL_PLATFORM_VENDOR > ().c_str (),
			   "Advanced Micro Devices, Inc."))
		{
		  break;
		}
	    }
	}
    }

  cl_context_properties cps[3] = {
    CL_CONTEXT_PLATFORM,
    (cl_context_properties) (*i) (),
    0
  };

  if (NULL == (*i) ())
    {
      sampleCommon->error ("NULL platform found so Exiting Application.");
      return SDK_FAILURE;
    }

  context = cl::Context (dType, cps, NULL, NULL, &status);
  CHECK_OPENCL_ERROR (status, "Context::Context() failed.");

  devices = context.getInfo < CL_CONTEXT_DEVICES > ();
  CHECK_OPENCL_ERROR (status, "Context::getInfo() failed.");

  std::cout << "Platform :" << (*i).getInfo < CL_PLATFORM_VENDOR >
    ().c_str () << "\n";
  int deviceCount = (int) devices.size ();
  int j = 0;
  for (std::vector < cl::Device >::iterator i = devices.begin ();
       i != devices.end (); ++i, ++j)
    {
      std::cout << "Device " << j << " : ";
      std::string deviceName = (*i).getInfo < CL_DEVICE_NAME > ();
      std::cout << deviceName.c_str () << "\n";
    }
  std::cout << "\n";

  if (deviceCount == 0)
    {
      std::cerr << "No device available\n";
      return SDK_FAILURE;
    }

  if (sampleCommon->validateDeviceId (deviceId, deviceCount))
    {
      sampleCommon->error ("sampleCommon::validateDeviceId() failed");
      return SDK_FAILURE;
    }

  std::string extensions =
    devices[deviceId].getInfo < CL_DEVICE_EXTENSIONS > ();

  std::string buildOptions = std::string ("");
  // Check if cl_khr_fp64 extension is supported 
  if (strstr (extensions.c_str (), "cl_khr_fp64"))
    {
      buildOptions.append ("-D KHR_DP_EXTENSION");
    }
  else
    {
      // Check if cl_amd_fp64 extension is supported 
      if (!strstr (extensions.c_str (), "cl_amd_fp64"))
	{
	  OPENCL_EXPECTED_ERROR
	    ("Device does not support cl_amd_fp64 extension!");
	}
    }
    cl_uint localMemType;
    // Get device specific information 
    status = devices[deviceId].getInfo<cl_uint>(
             CL_DEVICE_LOCAL_MEM_TYPE,
            &localMemType);
    CHECK_OPENCL_ERROR(status, "Device::getInfo CL_DEVICE_LOCAL_MEM_TYPE) failed.");
    
    // If scratchpad is available then update the flag 
    if(localMemType != CL_LOCAL)
	  OPENCL_EXPECTED_ERROR ("Device does not support local memory.");

    // Get Device specific Information 
    status = devices[deviceId].getInfo<size_t>(
              CL_DEVICE_MAX_WORK_GROUP_SIZE, 
              &maxWorkGroupSize);

    CHECK_OPENCL_ERROR(status, "Device::getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE) failed.");
    if(threads > maxWorkGroupSize)
	  OPENCL_EXPECTED_ERROR ("Device does not support threads.");
    
    status = devices[deviceId].getInfo<cl_uint>(
             CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
             &maxDimensions);
    CHECK_OPENCL_ERROR(status, "Device::getInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) failed.");
    

    maxWorkItemSizes = (size_t*)malloc(maxDimensions * sizeof(size_t));
    
    std::vector<size_t> workItems = devices[deviceId].getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>();

    for(cl_uint i = 0; i < maxDimensions; ++i)
        maxWorkItemSizes[i] = workItems[i];

    status = devices[deviceId].getInfo<cl_ulong>(
             CL_DEVICE_LOCAL_MEM_SIZE,
             &totalLocalMemory);
    CHECK_OPENCL_ERROR(status, "Device::getInfo(CL_DEVICE_LOCAL_MEM_SIZES) failed.");

  // Set command queue properties
  cl_command_queue_properties prop = 0;
  if (!eAppGFLOPS)
    prop |= CL_QUEUE_PROFILING_ENABLE;

  commandQueue = cl::CommandQueue (context, devices[deviceId], prop, &status);
  CHECK_OPENCL_ERROR (status, "CommandQueue::CommandQueue() failed.");

  // Set Presistent memory only for AMD platform
  cl_mem_flags inMemFlags = CL_MEM_READ_ONLY;
  if (isAmdPlatform ())
    inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD;

  device.push_back (devices[deviceId]);

  // create a CL program using the kernel source
  streamsdk::SDKFile kernelFile;
  std::string kernelPath = sampleCommon->getPath ();

  kernelPath.append ("Kernels.cl");
  if (!kernelFile.open (kernelPath.c_str ()))
  {
      std::cout << "Failed to load kernel file : " << kernelPath <<
      std::endl;
      return SDK_FAILURE;
  }
  cl::Program::Sources programSource (1,
					  std::make_pair (kernelFile.
							  source ().data (),
							  kernelFile.
							  source ().size ()));

  program = cl::Program (context, programSource, &status);
  CHECK_OPENCL_ERROR (status, "Program::Program(Source) failed.");

  std::string flagsStr = std::string ("");

  status = program.build (device, flagsStr.c_str ());

  if (status != CL_SUCCESS)
    {
      if (status == CL_BUILD_PROGRAM_FAILURE)
	{
	  std::string str =
	    program.getBuildInfo < CL_PROGRAM_BUILD_LOG > (devices[deviceId]);

	  std::cout << " \n\t\t\tBUILD LOG\n";
	  std::cout << " ************************************************\n";
	  std::cout << str << std::endl;
	  std::cout << " ************************************************\n";
	}
    }
  CHECK_OPENCL_ERROR (status, "Program::build() failed.");

  // Create kernel  

  // If local memory is present then use the specific kernel 
  mul_kernel = cl::Kernel (program, "mul_Kernel", &status);

  CHECK_OPENCL_ERROR (status, "cl::Kernel failed.");
  status = mul_kernel.getWorkGroupInfo < cl_ulong > (devices[deviceId],
						 CL_KERNEL_LOCAL_MEM_SIZE,
						 &usedLocalMemory);
  CHECK_OPENCL_ERROR (status,
		      "Kernel::getWorkGroupInfo(CL_KERNEL_LOCAL_MEM_SIZE) failed"
		      ".(usedLocalMemory)");

  // Create normalize_kernel  

  // If local memory is present then use the specific kernel 
  normalize_kernel = cl::Kernel (program, "normalize_Kernel", &status);

  CHECK_OPENCL_ERROR (status, "cl::Kernel failed.");

  // Create normalize2_kernel  

  // If local memory is present then use the specific kernel 
  normalize2_kernel = cl::Kernel (program, "normalize2_Kernel", &status);

  CHECK_OPENCL_ERROR (status, "cl::Kernel failed.");

  return SDK_SUCCESS;
}
int
LDSBandwidth::setupCL(void)
{
    cl_int status = 0;
    cl_device_type dType;

    if(sampleArgs->deviceType.compare("cpu") == 0)
    {
        dType = CL_DEVICE_TYPE_CPU;
    }
    else //deviceType = "gpu"
    {
        dType = CL_DEVICE_TYPE_GPU;
        if(sampleArgs->isThereGPU() == false)
        {
            std::cout << "GPU not found. Falling back to CPU device" << std::endl;
            dType = CL_DEVICE_TYPE_CPU;
        }
    }

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */

    cl_platform_id platform = NULL;
    int retValue = getPlatform(platform, sampleArgs->platformId,
                               sampleArgs->isPlatformEnabled());
    CHECK_ERROR(retValue, SDK_SUCCESS, "getPlatform() failed");

    // Display available devices.
    retValue = displayDevices(platform, dType);
    CHECK_ERROR(retValue, SDK_SUCCESS, "displayDevices() failed");

    /*
     * If we could find our platform, use it. Otherwise use just available platform.
     */

    cl_context_properties cps[3] =
    {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties)platform,
        0
    };

    context = clCreateContextFromType(cps,
                                      dType,
                                      NULL,
                                      NULL,
                                      &status);
    CHECK_OPENCL_ERROR(status, "clCreateContextFromType failed.");

    // getting device on which to run the sample
    status = getDevices(context, &devices, sampleArgs->deviceId,
                        sampleArgs->isDeviceIdEnabled());
    CHECK_ERROR(status, SDK_SUCCESS, "getDevices() failed");

    //Set device info of given cl_device_id
    retValue = deviceInfo.setDeviceInfo(devices[sampleArgs->deviceId]);
    CHECK_ERROR(retValue, SDK_SUCCESS, "SDKDeviceInfo::setDeviceInfo() failed");

    std::string deviceStr(deviceInfo.deviceVersion);
    size_t vStart = deviceStr.find(" ", 0);
    size_t vEnd = deviceStr.find(" ", vStart + 1);
    std::string vStrVal = deviceStr.substr(vStart + 1, vEnd - vStart - 1);

#ifdef CL_VERSION_1_1
    if(vStrVal.compare("1.0") > 0)
    {
        char openclVersion[1024];
        status = clGetDeviceInfo(devices[sampleArgs->deviceId],
                                 CL_DEVICE_OPENCL_C_VERSION,
                                 sizeof(openclVersion),
                                 openclVersion,
                                 0);
        CHECK_OPENCL_ERROR(status, "clGetDeviceInfo failed.");

        std::string tempStr(openclVersion);
        size_t dotPos = tempStr.find_first_of(".");
        size_t spacePos = tempStr.find_last_of(" ");
        tempStr = tempStr.substr(dotPos + 1, spacePos - dotPos);
        int minorVersion = atoi(tempStr.c_str());
        // OpenCL 1.1 has inbuilt support for vec3 data types
        if(minorVersion < 1 && vec3 == true)
        {
            OPENCL_EXPECTED_ERROR("Device doesn't support built-in 3 component vectors!");
        }
    }
    else
    {
        // OpenCL 1.1 has inbuilt support for vec3 data types
        if(vec3 == true)
        {
            OPENCL_EXPECTED_ERROR("Device doesn't support built-in 3 component vectors!");
        }
    }
#else
    // OpenCL 1.1 has inbuilt support for vec3 data types
    if(vec3 == true)
    {
        OPENCL_EXPECTED_ERROR("Device doesn't support built-in 3 component vectors!");
    }
#endif

    {
        // The block is to move the declaration of prop closer to its use
        cl_command_queue_properties prop = 0;
        prop |= CL_QUEUE_PROFILING_ENABLE;

        commandQueue = clCreateCommandQueue(context,
                                            devices[sampleArgs->deviceId],
                                            prop,
                                            &status);
        CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed.");
    }

    outputBuffer = clCreateBuffer(context,
                                  CL_MEM_WRITE_ONLY,
                                  sizeof(cl_float) * vectorSize * length,
                                  0,
                                  &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (outputBuffer)");

    // create a CL program using the kernel source
    char buildOption[64];
    if(vectorSize == 1)
    {
        sprintf(buildOption, "-D DATATYPE=float ");
    }
    else
    {
        sprintf(buildOption, "-D DATATYPE=float%d ", vec3 == true ? 3 : vectorSize);
    }

    buildProgramData buildData;
    buildData.kernelName = std::string("LDSBandwidth_Kernels.cl");
    buildData.devices = devices;
    buildData.deviceId = sampleArgs->deviceId;
    buildData.flagsStr = std::string(buildOption);
    if(sampleArgs->isLoadBinaryEnabled())
    {
        buildData.binaryName = std::string(sampleArgs->loadBinary.c_str());
    }

    if(sampleArgs->isComplierFlagsSpecified())
    {
        buildData.flagsFileName = std::string(sampleArgs->flags.c_str());
    }

    retValue = buildOpenCLProgram(program, context, buildData);
    CHECK_ERROR(retValue, SDK_SUCCESS, "buildOpenCLProgram() failed");

    // ConstantBuffer bandwidth from single access
    kernel[0] = clCreateKernel(program, "LDSBandwidth_single", &status);
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(LDSBandwidth_single)");

    // ConstantBuffer bandwidth from linear access
    kernel[1] = clCreateKernel(program, "LDSBandwidth_linear", &status);
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(LDSBandwidth_linear)");

    kernel[2] = clCreateKernel(program, "LDSBandwidth_single_verify", &status);
    CHECK_OPENCL_ERROR(status,
                       "clCreateKernel failed.(LDSBandwidth_single_verify)");

    kernel[3] = clCreateKernel(program, "LDSBandwidth_linear_verify", &status);
    CHECK_OPENCL_ERROR(status,
                       "clCreateKernel failed.(LDSBandwidth_linear_verify)");

    kernel[4] = clCreateKernel(program, "LDSBandwidth_write_linear", &status);
    CHECK_OPENCL_ERROR(status,
                       "clCreateKernel failed.(LDSBandwidth_linear_verify)");

    kernel[5] = clCreateKernel(program, "LDSBandwidth_write_linear_verify",
                               &status);
    CHECK_OPENCL_ERROR(status,
                       "clCreateKernel failed.(LDSBandwidth_linear_verify)");

    return SDK_SUCCESS;
}
int 
ImageOverlap::setupCL()
{
    cl_int status = CL_SUCCESS;
    cl_device_type dType;

    if(deviceType.compare("cpu") == 0)
    {
        dType = CL_DEVICE_TYPE_CPU;
    }
    else //deviceType = "gpu" 
	{
		dType = CL_DEVICE_TYPE_GPU;
		if(isThereGPU() == false)
		{
			std::cout << "GPU not found. Falling back to CPU device" << std::endl;
			dType = CL_DEVICE_TYPE_CPU;
		}
	}

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */
    cl_platform_id platform = NULL;
    int retValue = sampleCommon->getPlatform(platform, platformId, isPlatformEnabled());
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::getPlatform() failed");

    // Display available devices.
    retValue = sampleCommon->displayDevices(platform, dType);
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::displayDevices() failed");

    // If we could find our platform, use it. Otherwise use just available platform.
    cl_context_properties cps[3] = 
    {
        CL_CONTEXT_PLATFORM, 
        (cl_context_properties)platform, 
        0
    };

    context = clCreateContextFromType(
        cps,
        dType,
        NULL,
        NULL,
        &status);
    CHECK_OPENCL_ERROR(status, "clCreateContextFromType failed.");

    // getting device on which to run the sample
    status = sampleCommon->getDevices(context, &devices, deviceId, isDeviceIdEnabled());
    CHECK_ERROR(status, SDK_SUCCESS, "sampleCommon::getDevices() failed");

    status = deviceInfo.setDeviceInfo(devices[deviceId]);
    CHECK_OPENCL_ERROR(status, "deviceInfo.setDeviceInfo failed");

    if(!deviceInfo.imageSupport)
    {
        OPENCL_EXPECTED_ERROR(" Expected Error: Device does not support Images");
    }
	 
	blockSizeX = deviceInfo.maxWorkGroupSize<GROUP_SIZE?deviceInfo.maxWorkGroupSize:GROUP_SIZE;

    // Create command queue
	cl_command_queue_properties prop = 0;
	for(int i=0;i<3;i++)
	{
		commandQueue[i] = clCreateCommandQueue(
			context,
			devices[deviceId],
			prop,
			&status);
		 CHECK_OPENCL_ERROR(status,"clCreateCommandQueuefailed.");
	}

    // Create and initialize image objects

	// Create map image
	mapImage = clCreateImage(context,
		CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
		&imageFormat,
		&image_desc,
		mapImageData,
		&status);
	CHECK_OPENCL_ERROR(status,"clCreateBuffer failed. (mapImage)");
	int color[4] = {0,0,80,255};
	size_t origin[3] = {300,300,0};
	size_t region[3] = {100,100,1};
	status = clEnqueueFillImage(commandQueue[0], mapImage, color, origin, region, NULL, NULL, &eventlist[0]);

    // Create fill image
	fillImage = clCreateImage(context,
		CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
		&imageFormat,
		&image_desc,
		fillImageData,
		&status);
	CHECK_OPENCL_ERROR(status,"clCreateBuffer failed. (fillImage)");

	color[0] = 80;
	color[1] = 0;
	color[2] = 0;
	color[3] = 0;
	origin[0] = 50;
	origin[1] = 50;
	status = clEnqueueFillImage(commandQueue[1], fillImage, color, origin, region, NULL, NULL, &eventlist[1]);
	
	//Create output image
	outputImage = clCreateImage(context,
		CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
		&imageFormat,
		&image_desc,
		NULL,
		&status);
	CHECK_OPENCL_ERROR(status,"clCreateBuffer failed. (outputImage)");

    // create a CL program using the kernel source 
    streamsdk::buildProgramData buildData;
    buildData.kernelName = std::string("ImageOverlap_Kernels.cl");
    buildData.devices = devices;
    buildData.deviceId = deviceId;
    buildData.flagsStr = std::string("");
    if(isLoadBinaryEnabled())
        buildData.binaryName = std::string(loadBinary.c_str());

    if(isComplierFlagsSpecified())
        buildData.flagsFileName = std::string(flags.c_str());

    retValue = sampleCommon->buildOpenCLProgram(program, context, buildData);
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::buildOpenCLProgram() failed");

    // get a kernel object handle for a kernel with the given name 
	kernelOverLap = clCreateKernel(program, "OverLap", &status);
	CHECK_OPENCL_ERROR(status,"clCreateKernel failed.(OverLap)");

    return SDK_SUCCESS;
}