int
UnsharpMask::setupCL()
{
    try
    {
        cl_int err = CL_SUCCESS;
        cl_device_type dType;

        if(sampleArgs->deviceType.compare("cpu") == 0)
        {
            dType = CL_DEVICE_TYPE_CPU;
        }
        else //deviceType == "gpu"
        {
            dType = CL_DEVICE_TYPE_GPU;
            if(sampleArgs->isThereGPU() == false)
            {
                std::cout << "GPU not found. Falling back to CPU device" << std::endl;
                dType = CL_DEVICE_TYPE_CPU;
            }
        }

        /*
        * Have a look at the available platforms and pick either
        * the AMD one if available or a reasonable default.
        */
        cl::Platform::get(&platforms);


        std::vector<cl::Platform>::iterator i;
        if(platforms.size() > 0)
        {
            if(sampleArgs->isPlatformEnabled())
            {
                i = platforms.begin() + sampleArgs->platformId;
            }
            else
            {
                for(i = platforms.begin(); i != platforms.end(); ++i)
                {
                    if(!strcmp((*i).getInfo<CL_PLATFORM_VENDOR>().c_str(),
                               "Advanced Micro Devices, Inc."))
                    {
                        break;
                    }
                }
            }
        }

        cl_context_properties cps[3] =
        {
            CL_CONTEXT_PLATFORM,
            (cl_context_properties)(*i)(),
            0
        };

        context = cl::Context(dType, cps, NULL, NULL);

        devices = context.getInfo<CL_CONTEXT_DEVICES>();

        std::cout << "Platform :" << (*i).getInfo<CL_PLATFORM_VENDOR>().c_str() << "\n";
        int deviceCount = (int)devices.size();
        int j = 0;
        for (std::vector<cl::Device>::iterator i = devices.begin(); i != devices.end();
                ++i, ++j)
        {
            std::cout << "Device " << j << " : ";
            std::string deviceName = (*i).getInfo<CL_DEVICE_NAME>();
            std::cout << deviceName.c_str() << "\n";
        }
        std::cout << "\n";

        if (deviceCount == 0)
        {
            std::cerr << "No device available\n";
            return SDK_FAILURE;
        }

        if(validateDeviceId(sampleArgs->deviceId, deviceCount))
        {
            error("validateDeviceId() failed");
            return SDK_FAILURE;
        }

        commandQueue = cl::CommandQueue(context, devices[sampleArgs->deviceId], 0);

        device.push_back(devices[sampleArgs->deviceId]);

        // create a CL program using the kernel source
        SDKFile kernelFile;
        std::string kernelPath = getPath();

        if(sampleArgs->isLoadBinaryEnabled())
        {
            kernelPath.append(sampleArgs->loadBinary.c_str());
            if(kernelFile.readBinaryFromFile(kernelPath.c_str()) != SDK_SUCCESS)
            {
                std::cout << "Failed to load kernel file : " << kernelPath << std::endl;
                return SDK_FAILURE;
            }
            cl::Program::Binaries programBinary(1,std::make_pair(
                                                    (const void*)kernelFile.source().data(),
                                                    kernelFile.source().size()));

            program = cl::Program(context, device, programBinary, NULL);
        }
        else
        {
            kernelPath.append("UnsharpMask_Kernels.cl");
            if(!kernelFile.open(kernelPath.c_str()))
            {
                std::cout << "Failed to load kernel file : " << kernelPath << std::endl;
                return SDK_FAILURE;
            }

            cl::Program::Sources programSource(1,
                                               std::make_pair(kernelFile.source().data(),
                                                       kernelFile.source().size()));

            program = cl::Program(context, programSource);
        }

        std::string flagsStr = std::string("");

        // Get additional options
        if(sampleArgs->isComplierFlagsSpecified())
        {
            SDKFile flagsFile;
            std::string flagsPath = getPath();
            flagsPath.append(sampleArgs->flags.c_str());
            if(!flagsFile.open(flagsPath.c_str()))
            {
                std::cout << "Failed to load flags file: " << flagsPath << std::endl;
                return SDK_FAILURE;
            }
            flagsFile.replaceNewlineWithSpaces();
            const char * flags = flagsFile.source().c_str();
            flagsStr.append(flags);
        }

        if(flagsStr.size() != 0)
        {
            std::cout << "Build Options are : " << flagsStr.c_str() << std::endl;
        }

        program.build(device, flagsStr.c_str());

        queue = cl::CommandQueue(context, devices[sampleArgs->deviceId], 0, &err);

        int dimen = 2*radius+1;
        cl::ImageFormat format(CL_BGRA,CL_UNSIGNED_INT8);

        if(loadInputImage()!=SDK_SUCCESS)
        {
            return SDK_FAILURE;
        }

        gaussian1DBuffer = cl::Buffer(context,CL_MEM_READ_ONLY, dimen*sizeof(float));

        // create the 1D Gaussian kernel
        if(dImageBuffer)
        {
            gaussian2DBuffer = cl::Buffer(context,CL_MEM_READ_ONLY,
                                          dimen*dimen*sizeof(float));
            inputBuffer = cl::Buffer(context,CL_MEM_READ_ONLY, imageSize);
            outputBuffer = cl::Buffer (context,CL_MEM_WRITE_ONLY, imageSize);
            unsharp_mask_filter = cl::Kernel(program, "unsharp_mask_filter");

        }
        else
        {
            inputImageObj = cl::Image2D(context, CL_MEM_READ_ONLY, format, width, height);
            sharpenImageObj = cl::Image2D(context, CL_MEM_WRITE_ONLY, format, width,
                                          height);
            tmpImageObj = cl::Buffer(context,CL_MEM_READ_WRITE,
                                     width*height*sizeof(cl_float4));

            // Create kernel
            unsharp_mask_pass1 = cl::Kernel(program, "unsharp_mask_pass1");
            unsharp_mask_pass2 = cl::Kernel(program, "unsharp_mask_pass2");
        }
    }
    catch (cl::Error e)
    {
        if(e.err() == CL_BUILD_PROGRAM_FAILURE)
        {
            std::string str = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[sampleArgs->deviceId]);

            std::cout << " \n\t\t\tBUILD LOG\n";
            std::cout << " ************************************************\n";
            std::cout << str << std::endl;
            std::cout << " ************************************************\n";
        }
        else
        {
            std::cout << e.what() << " failed!"<< std::endl;
            std::cout << "Error code: " << e.err() << std::endl;
        }
        return SDK_FAILURE;
    }
    return SDK_SUCCESS;
}
int
MatrixMulDouble::setupCL(void)
{
    cl_int status = 0;
    cl_device_type dType;

    if(sampleArgs->deviceType.compare("cpu") == 0)
    {
        dType = CL_DEVICE_TYPE_CPU;
    }
    else //deviceType = "gpu"
    {
        dType = CL_DEVICE_TYPE_GPU;
        if(sampleArgs->isThereGPU() == false)
        {
            std::cout << "GPU not found. Falling back to CPU device" << std::endl;
            dType = CL_DEVICE_TYPE_CPU;
        }
    }

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */
    status = cl::Platform::get(&platforms);
    CHECK_OPENCL_ERROR(status, "Platform::get() failed.");

    std::vector<cl::Platform>::iterator i;
    if(platforms.size() > 0)
    {
        if(sampleArgs->isPlatformEnabled())
        {
            i = platforms.begin() + sampleArgs->platformId;
        }
        else
        {
            for(i = platforms.begin(); i != platforms.end(); ++i)
            {
                if(!strcmp((*i).getInfo<CL_PLATFORM_VENDOR>().c_str(),
                           "Advanced Micro Devices, Inc."))
                {
                    break;
                }
            }
        }
    }

    cl_context_properties cps[3] =
    {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties)(*i)(),
        0
    };


    if(NULL == (*i)())
    {
        error("NULL platform found so Exiting Application.");
        return SDK_FAILURE;
    }

    context = cl::Context(dType, cps, NULL, NULL, &status);
    CHECK_OPENCL_ERROR(status, "Context::Context() failed.");

    devices = context.getInfo<CL_CONTEXT_DEVICES>();
    CHECK_OPENCL_ERROR(status, "Context::getInfo() failed.");

    std::cout << "Platform :" << (*i).getInfo<CL_PLATFORM_VENDOR>().c_str() << "\n";
    int deviceCount = (int)devices.size();
    int j = 0;
    for (std::vector<cl::Device>::iterator i = devices.begin(); i != devices.end();
            ++i, ++j)
    {
        std::cout << "Device " << j << " : ";
        std::string deviceName = (*i).getInfo<CL_DEVICE_NAME>();
        std::cout << deviceName.c_str() << "\n";
    }
    std::cout << "\n";

    if (deviceCount == 0)
    {
        std::cerr << "No device available\n";
        return SDK_FAILURE;
    }

    if(validateDeviceId(sampleArgs->deviceId, deviceCount))
    {
        error("validateDeviceId() failed");
        return SDK_FAILURE;
    }

    std::string extensions =
        devices[sampleArgs->deviceId].getInfo<CL_DEVICE_EXTENSIONS>();

    std::string buildOptions = std::string("");
    // Check if cl_khr_fp64 extension is supported
    if(strstr(extensions.c_str(), "cl_khr_fp64"))
    {
        buildOptions.append("-D KHR_DP_EXTENSION");
    }
    else
    {
        // Check if cl_amd_fp64 extension is supported
        if(!strstr(extensions.c_str(), "cl_amd_fp64"))
        {
            OPENCL_EXPECTED_ERROR("Device does not support cl_amd_fp64 extension!");
        }
    }
    cl_uint localMemType;
    // Get device specific information
    status = devices[sampleArgs->deviceId].getInfo<cl_uint>(
                 CL_DEVICE_LOCAL_MEM_TYPE,
                 &localMemType);
    CHECK_OPENCL_ERROR(status, "Device::getInfo CL_DEVICE_LOCAL_MEM_TYPE) failed.");

    // If scratchpad is available then update the flag
    if(localMemType == CL_LOCAL)
    {
        lds = true;
    }

    // Get Device specific Information
    status = devices[sampleArgs->deviceId].getInfo<size_t>(
                 CL_DEVICE_MAX_WORK_GROUP_SIZE,
                 &maxWorkGroupSize);

    CHECK_OPENCL_ERROR(status,
                       "Device::getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE) failed.");

    status = devices[sampleArgs->deviceId].getInfo<cl_uint>(
                 CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
                 &maxDimensions);
    CHECK_OPENCL_ERROR(status,
                       "Device::getInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) failed.");


    maxWorkItemSizes = (size_t*)malloc(maxDimensions * sizeof(size_t));

    std::vector<size_t> workItems =
        devices[sampleArgs->deviceId].getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>();

    for(cl_uint i = 0; i < maxDimensions; ++i)
    {
        maxWorkItemSizes[i] = workItems[i];
    }

    status = devices[sampleArgs->deviceId].getInfo<cl_ulong>(
                 CL_DEVICE_LOCAL_MEM_SIZE,
                 &totalLocalMemory);
    CHECK_OPENCL_ERROR(status,
                       "Device::getInfo(CL_DEVICE_LOCAL_MEM_SIZES) failed.");

    // Set command queue properties
    cl_command_queue_properties prop = 0;
    if(!eAppGFLOPS)
    {
        prop |= CL_QUEUE_PROFILING_ENABLE;
    }

    commandQueue = cl::CommandQueue(context, devices[sampleArgs->deviceId], prop,
                                    &status);
    CHECK_OPENCL_ERROR(status, "CommandQueue::CommandQueue() failed.");

    // Set Persistent memory only for AMD platform
    cl_mem_flags inMemFlags = CL_MEM_READ_ONLY;
    if(sampleArgs->isAmdPlatform())
    {
        inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD;
    }

    // Create buffer for matrix A
    inputBufA = cl::Buffer(
                    context,
                    inMemFlags,
                    sizeof(cl_double) * widthA * heightA,
                    NULL,
                    &status);
    CHECK_OPENCL_ERROR(status, "cl::Buffer failed. (inputBufA)");

    // Create buffer for matrix B
    inputBufB = cl::Buffer(
                    context,
                    inMemFlags,
                    sizeof(cl_double) * widthB * heightB,
                    NULL,
                    &status);
    CHECK_OPENCL_ERROR(status, "cl::Buffer failed. (inputBufB)");

    outputBuf = cl::Buffer(
                    context,
                    CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
                    sizeof(cl_double) * heightA * widthB,
                    NULL,
                    &status);
    CHECK_OPENCL_ERROR(status, "cl::Buffer failed. (outputBuf)");

    device.push_back(devices[sampleArgs->deviceId]);

    // create a CL program using the kernel source
    SDKFile kernelFile;
    std::string kernelPath = getPath();

    if(sampleArgs->isLoadBinaryEnabled())
    {
        kernelPath.append(sampleArgs->loadBinary.c_str());
        if(kernelFile.readBinaryFromFile(kernelPath.c_str()) != SDK_SUCCESS)
        {
            std::cout << "Failed to load kernel file : " << kernelPath << std::endl;
            return SDK_FAILURE;
        }
        cl::Program::Binaries programBinary(1,std::make_pair(
                                                (const void*)kernelFile.source().data(),
                                                kernelFile.source().size()));

        program = cl::Program(context, device, programBinary, NULL, &status);
        CHECK_OPENCL_ERROR(status, "Program::Program(Binary) failed.");

    }
    else
    {
        kernelPath.append("MatrixMulDouble_Kernels.cl");
        if(!kernelFile.open(kernelPath.c_str()))
        {
            std::cout << "Failed to load kernel file : " << kernelPath << std::endl;
            return SDK_FAILURE;
        }
        cl::Program::Sources programSource(
            1,
            std::make_pair(kernelFile.source().data(),
                           kernelFile.source().size()));

        program = cl::Program(context, programSource, &status);
        CHECK_OPENCL_ERROR(status, "Program::Program(Source) failed.");

    }

    std::string flagsStr = std::string("");

    // Get build options if any
    flagsStr.append(buildOptions.c_str());

    // Get additional options
    if(sampleArgs->isComplierFlagsSpecified())
    {
        SDKFile flagsFile;
        std::string flagsPath = getPath();
        flagsPath.append(sampleArgs->flags.c_str());
        if(!flagsFile.open(flagsPath.c_str()))
        {
            std::cout << "Failed to load flags file: " << flagsPath << std::endl;
            return SDK_FAILURE;
        }
        flagsFile.replaceNewlineWithSpaces();
        const char * flags = flagsFile.source().c_str();
        flagsStr.append(flags);
    }

    if(flagsStr.size() != 0)
    {
        std::cout << "Build Options are : " << flagsStr.c_str() << std::endl;
    }

    status = program.build(device, flagsStr.c_str());

    if(status != CL_SUCCESS)
    {
        if(status == CL_BUILD_PROGRAM_FAILURE)
        {
            std::string str = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[sampleArgs->deviceId]);

            std::cout << " \n\t\t\tBUILD LOG\n";
            std::cout << " ************************************************\n";
            std::cout << str << std::endl;
            std::cout << " ************************************************\n";
        }
    }
    CHECK_OPENCL_ERROR(status, "Program::build() failed.");

    // Create kernel

    // If local memory is present then use the specific kernel
    if(lds)
    {
        kernel = cl::Kernel(program, "mmmKernel_local", &status);
    }
    else
    {
        kernel = cl::Kernel(program, "mmmKernel", &status);
    }

    CHECK_OPENCL_ERROR(status, "cl::Kernel failed.");
    status =  kernel.getWorkGroupInfo<cl_ulong>(

                  devices[sampleArgs->deviceId],
                  CL_KERNEL_LOCAL_MEM_SIZE,
                  &usedLocalMemory);
    CHECK_OPENCL_ERROR(status,
                       "Kernel::getWorkGroupInfo(CL_KERNEL_LOCAL_MEM_SIZE) failed"
                       ".(usedLocalMemory)");

    availableLocalMemory = totalLocalMemory - usedLocalMemory;
    if(lds)
    {
        neededLocalMemory = (blockSize * 4) * (blockSize * 4) * sizeof(cl_double);
    }
    else
    {
        neededLocalMemory = 0;
    }

    if(neededLocalMemory > availableLocalMemory)
    {
        std::cout << "Unsupported: Insufficient local memory on device." << std::endl;
        return SDK_FAILURE;
    }

    // Check group size against group size returned by kernel
    kernelWorkGroupSize = kernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>
                          (devices[sampleArgs->deviceId], &status);
    CHECK_OPENCL_ERROR(status, "Kernel::getWorkGroupInfo()  failed.");

    if((cl_uint)(blockSize * blockSize) > kernelWorkGroupSize)
    {
        if(kernelWorkGroupSize >= 64)
        {
            blockSize = 8;
        }
        else if(kernelWorkGroupSize >= 32)
        {
            blockSize = 4;
        }
        else
        {
            std::cout << "Out of Resources!" << std::endl;
            std::cout << "Group Size specified : " << blockSize * blockSize << std::endl;
            std::cout << "Max Group Size supported on the kernel : "
                      << kernelWorkGroupSize<<std::endl;
            return SDK_FAILURE;
        }
    }

    if(blockSize > maxWorkItemSizes[0] ||
            blockSize > maxWorkItemSizes[1] ||
            blockSize * blockSize > maxWorkGroupSize)
    {
        error("Unsupported: Device does not support requested number of work items.");
        return SDK_FAILURE;
    }

    return SDK_SUCCESS;
}
// Example #3
// 0
int Parallel::setup() {
    /**
     * OpenCL initialization.
     */
    cl_int status = Simulator::setup();
    CheckStatus(status, "Simulator::setup() failed.");
    
    cl_uint numPlatforms;
    status = clGetPlatformIDs(0, NULL, &numPlatforms);
    CheckStatus(status, "clGetPlatformIDs, fetching number");
    DEBUG_STDOUT("Number of platforms: " << numPlatforms);
    
    cl_platform_id platform = NULL;
    if (numPlatforms > 0) {
        std::unique_ptr<cl_platform_id[]> platforms (new cl_platform_id[numPlatforms]);
        status = clGetPlatformIDs(numPlatforms, platforms.get(), NULL);
        CheckStatus(status, "clGetPlatformIDs, fetching platforms");
        
        for (unsigned i = 0; i < numPlatforms; ++i) {
            char pbuf[100];
            status = clGetPlatformInfo(platforms[i],
                                       CL_PLATFORM_VENDOR,
                                       sizeof(pbuf),
                                       pbuf,
                                       NULL);
            CheckStatus(status, "clGetPlatformInfo");
        }
        
        // Just grab the first platform.
        platform = platforms[0];
    }
    CheckConditional(platform != NULL, "platform == NULL");
    
    cl_uint numDevices;
    status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
    CheckStatus(status, "clGetDeviceIDs: fetching number");
    DEBUG_STDOUT("Number of devices: " << numDevices);
    
    cl_device_id *devices = new cl_device_id[numDevices];
    status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
    CheckStatus(status, "clGetDeviceIDs: fetching devices");
    
    int deviceIndex = 0;
    for (unsigned i = 0; i < numDevices; ++i) {
        char pbuf[100];
        status = clGetDeviceInfo(devices[i],
                                 CL_DEVICE_NAME,
                                 sizeof(pbuf),
                                 pbuf,
                                 NULL);
        if (!strncmp(pbuf, "ATI", 3)) {
            deviceIndex = i;
        }
    }
    
    /* Create the context. */
    context = clCreateContext(0, numDevices, devices, NULL, NULL, &status);
    CheckConditional(context != NULL, "clCreateContextFromType");
    
    /* Create command queue */
    cl_command_queue_properties prop = CL_QUEUE_PROFILING_ENABLE;
    commandQueue = clCreateCommandQueue(context, devices[deviceIndex], prop, &status);
    CheckStatus(status, "clCreateCommandQueue");
    
    /* Create a CL program using the kernel source */
    SDKFile kernelFile;
    std::string kernelPath = getenv("HOME") + std::string("/md-simulator/src/TestKernel.cl");
    if(!kernelFile.open(kernelPath.c_str())) {
        DEBUG_STDERR("Failed to load kernel file : " << kernelPath);
        return MD_FAILURE;
    }
    
    const char *source = kernelFile.source().c_str();
    size_t sourceSize[] = {strlen(source)};
    program = clCreateProgramWithSource(context,
                                        1,
                                        &source,
                                        sourceSize,
                                        &status);
    CheckStatus(status, "clCreateProgramWithSource");
    
    /* Create a cl program executable for all the devices specified */
    status = clBuildProgram(program,
                            numDevices,
                            devices,
                            NULL,
                            NULL,
                            NULL);
    
    if (status != CL_SUCCESS) {
        if (status == CL_BUILD_PROGRAM_FAILURE) {
            cl_int logStatus;
            std::unique_ptr<char[]> buildLog (nullptr);
            //char *buildLog = NULL;
            size_t buildLogSize = 0;
            logStatus = clGetProgramBuildInfo(program,
                                              devices[deviceIndex],
                                              CL_PROGRAM_BUILD_LOG,
                                              buildLogSize,
                                              buildLog.get(),
                                              &buildLogSize);
            CheckStatus(logStatus, "clGetProgramBuildInfo");
            
            buildLog = std::unique_ptr<char[]>(new char[buildLogSize]);
            if(!buildLog) {
                return MD_FAILURE;
            }
            std::fill_n(buildLog.get(), buildLogSize, 0);
            
            logStatus = clGetProgramBuildInfo(program,
                                              devices[deviceIndex],
                                              CL_PROGRAM_BUILD_LOG,
                                              buildLogSize,
                                              buildLog.get(),
                                              NULL);
            CheckStatus(logStatus, "clGetProgramBuildInfo (2)");
            
            DEBUG_STDERR("\n\t\t\tBUILD LOG\n");
            DEBUG_STDERR("************************************************\n");
            DEBUG_STDERR(buildLog.get());
            DEBUG_STDERR("************************************************\n");
        }
    }
    CheckStatus(status, "clBuildProgram");
    
    /* Get a kernel object handle for a kernel with the given name */
    kernel = clCreateKernel(program, "computeAccelerations", &status);
    CheckStatus(status, "clCreateKernel");
    
    /* Check group size against group size returned by kernel */
    status = clGetKernelWorkGroupInfo(kernel,
                                      devices[deviceIndex],
                                      CL_KERNEL_WORK_GROUP_SIZE,
                                      sizeof(size_t),
                                      &kernelWorkGroupSize,
                                      0);
    CheckStatus(status, "clGetKernelWorkGroupInfo");
    DEBUG_STDOUT("kernelWorkGroupSize: " << kernelWorkGroupSize);
    
    /**
     * Initialize some simulator data structures.
     */
    global = particleCount * particleCount;
    local = particleCount;
    
    if (global * local > kernelWorkGroupSize) {
        DEBUG_STDERR("WARNING - global * local > kernelWorkGroupSize; global: " << global << ", local: " << local << ", kernelWorkGroupSize: " << kernelWorkGroupSize);
        return MD_FAILURE;
    }
    
    // Data holds the molecule positions.
    data = std::unique_ptr<float[]> (new float[particleCount * 3]);
    
    // Constants holds simulator constants.
    constants = std::unique_ptr<float[]> (new float[NUM_CONSTANTS]);
    
    // Copy constants to buffer;
    constants[0] = epsilon;
    constants[1] = sigma;
    constants[2] = negForceCutoffMinusHalf;
    constants[3] = forceCutoffMinusHalf;
    constants[4] = wallStiffness;
    
    // Results holds pairwise forces.
    results = std::unique_ptr<float[]> (new float[particleCount * particleCount * 3]);
    
    return MD_SUCCESS;
}