// Initializes all OpenCL state for the double-precision matrix multiply
// sample: picks a platform/device, creates the context, command queue and
// the A/B/output buffers, builds the program and selects the kernel.
// Returns SDK_SUCCESS on success; SDK_FAILURE or an expected-error code
// otherwise (the CHECK_OPENCL_ERROR / OPENCL_EXPECTED_ERROR macros return
// from this function on failure).
int MatrixMulDouble::setupCL(void)
{
    cl_int status = 0;
    cl_device_type dType;

    // Honor the requested device type; fall back to CPU when a GPU was
    // requested but none is present.
    if(sampleArgs->deviceType.compare("cpu") == 0)
    {
        dType = CL_DEVICE_TYPE_CPU;
    }
    else //deviceType = "gpu"
    {
        dType = CL_DEVICE_TYPE_GPU;
        if(sampleArgs->isThereGPU() == false)
        {
            std::cout << "GPU not found. Falling back to CPU device" << std::endl;
            dType = CL_DEVICE_TYPE_CPU;
        }
    }

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */
    status = cl::Platform::get(&platforms);
    CHECK_OPENCL_ERROR(status, "Platform::get() failed.");

    std::vector<cl::Platform>::iterator i;
    // NOTE(review): if platforms is empty, or no AMD platform is found and
    // the loop below runs to end(), i is invalid when dereferenced later —
    // this SDK boilerplate assumes at least one usable platform. Verify.
    if(platforms.size() > 0)
    {
        if(sampleArgs->isPlatformEnabled())
        {
            // Explicit --platformId from the command line.
            i = platforms.begin() + sampleArgs->platformId;
        }
        else
        {
            // Prefer the AMD platform when present.
            for(i = platforms.begin(); i != platforms.end(); ++i)
            {
                if(!strcmp((*i).getInfo<CL_PLATFORM_VENDOR>().c_str(),
                           "Advanced Micro Devices, Inc."))
                {
                    break;
                }
            }
        }
    }

    cl_context_properties cps[3] =
    {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties)(*i)(),
        0
    };

    if(NULL == (*i)())
    {
        error("NULL platform found so Exiting Application.");
        return SDK_FAILURE;
    }

    context = cl::Context(dType, cps, NULL, NULL, &status);
    CHECK_OPENCL_ERROR(status, "Context::Context() failed.");

    devices = context.getInfo<CL_CONTEXT_DEVICES>();
    // NOTE(review): this getInfo overload does not update 'status'; the
    // check below re-tests the previous value.
    CHECK_OPENCL_ERROR(status, "Context::getInfo() failed.");

    std::cout << "Platform :" << (*i).getInfo<CL_PLATFORM_VENDOR>().c_str() << "\n";

    // Enumerate and print every device in the context.
    int deviceCount = (int)devices.size();
    int j = 0;
    for (std::vector<cl::Device>::iterator i = devices.begin();
         i != devices.end(); ++i, ++j)
    {
        std::cout << "Device " << j << " : ";
        std::string deviceName = (*i).getInfo<CL_DEVICE_NAME>();
        std::cout << deviceName.c_str() << "\n";
    }
    std::cout << "\n";

    if (deviceCount == 0)
    {
        std::cerr << "No device available\n";
        return SDK_FAILURE;
    }

    if(validateDeviceId(sampleArgs->deviceId, deviceCount))
    {
        error("validateDeviceId() failed");
        return SDK_FAILURE;
    }

    std::string extensions =
        devices[sampleArgs->deviceId].getInfo<CL_DEVICE_EXTENSIONS>();

    std::string buildOptions = std::string("");
    // Check if cl_khr_fp64 extension is supported
    if(strstr(extensions.c_str(), "cl_khr_fp64"))
    {
        buildOptions.append("-D KHR_DP_EXTENSION");
    }
    else
    {
        // Check if cl_amd_fp64 extension is supported; without either FP64
        // extension the double-precision kernels cannot run at all.
        if(!strstr(extensions.c_str(), "cl_amd_fp64"))
        {
            OPENCL_EXPECTED_ERROR("Device does not support cl_amd_fp64 extension!");
        }
    }

    cl_uint localMemType;
    // Get device specific information
    status = devices[sampleArgs->deviceId].getInfo<cl_uint>(
                 CL_DEVICE_LOCAL_MEM_TYPE,
                 &localMemType);
    CHECK_OPENCL_ERROR(status, "Device::getInfo CL_DEVICE_LOCAL_MEM_TYPE) failed.");

    // If scratchpad (dedicated local memory) is available then update the
    // flag; this selects the LDS-optimized kernel further below.
    if(localMemType == CL_LOCAL)
    {
        lds = true;
    }

    // Get Device specific Information
    status = devices[sampleArgs->deviceId].getInfo<size_t>(
                 CL_DEVICE_MAX_WORK_GROUP_SIZE,
                 &maxWorkGroupSize);
    CHECK_OPENCL_ERROR(status, "Device::getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE) failed.");

    status = devices[sampleArgs->deviceId].getInfo<cl_uint>(
                 CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
                 &maxDimensions);
    CHECK_OPENCL_ERROR(status, "Device::getInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) failed.");

    // Per-dimension work-item limits, copied into a malloc'd array owned by
    // this object (freed elsewhere — TODO confirm cleanup path).
    maxWorkItemSizes = (size_t*)malloc(maxDimensions * sizeof(size_t));

    std::vector<size_t> workItems =
        devices[sampleArgs->deviceId].getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>();
    for(cl_uint i = 0; i < maxDimensions; ++i)
    {
        maxWorkItemSizes[i] = workItems[i];
    }

    status = devices[sampleArgs->deviceId].getInfo<cl_ulong>(
                 CL_DEVICE_LOCAL_MEM_SIZE,
                 &totalLocalMemory);
    CHECK_OPENCL_ERROR(status, "Device::getInfo(CL_DEVICE_LOCAL_MEM_SIZES) failed.");

    // Set command queue properties: profiling is only enabled when the
    // sample is not measuring application-level GFLOPS.
    cl_command_queue_properties prop = 0;
    if(!eAppGFLOPS)
    {
        prop |= CL_QUEUE_PROFILING_ENABLE;
    }

    commandQueue = cl::CommandQueue(context,
                                    devices[sampleArgs->deviceId],
                                    prop,
                                    &status);
    CHECK_OPENCL_ERROR(status, "CommandQueue::CommandQueue() failed.");

    // Set Persistent memory only for AMD platform
    cl_mem_flags inMemFlags = CL_MEM_READ_ONLY;
    if(sampleArgs->isAmdPlatform())
    {
        inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD;
    }

    // Create buffer for matrix A
    inputBufA = cl::Buffer(
                    context,
                    inMemFlags,
                    sizeof(cl_double) * widthA * heightA,
                    NULL,
                    &status);
    CHECK_OPENCL_ERROR(status, "cl::Buffer failed. (inputBufA)");

    // Create buffer for matrix B
    inputBufB = cl::Buffer(
                    context,
                    inMemFlags,
                    sizeof(cl_double) * widthB * heightB,
                    NULL,
                    &status);
    CHECK_OPENCL_ERROR(status, "cl::Buffer failed. (inputBufB)");

    // Output is heightA x widthB, the shape of A*B.
    outputBuf = cl::Buffer(
                    context,
                    CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
                    sizeof(cl_double) * heightA * widthB,
                    NULL,
                    &status);
    CHECK_OPENCL_ERROR(status, "cl::Buffer failed. (outputBuf)");

    device.push_back(devices[sampleArgs->deviceId]);

    // create a CL program using the kernel source (prebuilt binary or .cl
    // source file, depending on command-line options)
    SDKFile kernelFile;
    std::string kernelPath = getPath();

    if(sampleArgs->isLoadBinaryEnabled())
    {
        kernelPath.append(sampleArgs->loadBinary.c_str());
        if(kernelFile.readBinaryFromFile(kernelPath.c_str()) != SDK_SUCCESS)
        {
            std::cout << "Failed to load kernel file : " << kernelPath << std::endl;
            return SDK_FAILURE;
        }
        cl::Program::Binaries programBinary(1,std::make_pair(
                                                (const void*)kernelFile.source().data(),
                                                kernelFile.source().size()));

        program = cl::Program(context, device, programBinary, NULL, &status);
        CHECK_OPENCL_ERROR(status, "Program::Program(Binary) failed.");
    }
    else
    {
        kernelPath.append("MatrixMulDouble_Kernels.cl");
        if(!kernelFile.open(kernelPath.c_str()))
        {
            std::cout << "Failed to load kernel file : " << kernelPath << std::endl;
            return SDK_FAILURE;
        }

        cl::Program::Sources programSource(
            1,
            std::make_pair(kernelFile.source().data(),
                           kernelFile.source().size()));

        program = cl::Program(context, programSource, &status);
        CHECK_OPENCL_ERROR(status, "Program::Program(Source) failed.");
    }

    std::string flagsStr = std::string("");

    // Get build options if any (the FP64 extension define chosen above)
    flagsStr.append(buildOptions.c_str());

    // Get additional options from an optional flags file
    if(sampleArgs->isComplierFlagsSpecified())
    {
        SDKFile flagsFile;
        std::string flagsPath = getPath();
        flagsPath.append(sampleArgs->flags.c_str());
        if(!flagsFile.open(flagsPath.c_str()))
        {
            std::cout << "Failed to load flags file: " << flagsPath << std::endl;
            return SDK_FAILURE;
        }
        flagsFile.replaceNewlineWithSpaces();
        const char * flags = flagsFile.source().c_str();
        flagsStr.append(flags);
    }

    if(flagsStr.size() != 0)
    {
        std::cout << "Build Options are : " << flagsStr.c_str() << std::endl;
    }

    status = program.build(device, flagsStr.c_str());

    if(status != CL_SUCCESS)
    {
        if(status == CL_BUILD_PROGRAM_FAILURE)
        {
            // Dump the compiler log before bailing out via the macro below.
            std::string str =
                program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[sampleArgs->deviceId]);

            std::cout << " \n\t\t\tBUILD LOG\n";
            std::cout << " ************************************************\n";
            std::cout << str << std::endl;
            std::cout << " ************************************************\n";
        }
    }
    CHECK_OPENCL_ERROR(status, "Program::build() failed.");

    // Create kernel
    // If local memory is present then use the specific kernel
    if(lds)
    {
        kernel = cl::Kernel(program, "mmmKernel_local", &status);
    }
    else
    {
        kernel = cl::Kernel(program, "mmmKernel", &status);
    }
    CHECK_OPENCL_ERROR(status, "cl::Kernel failed.");

    status = kernel.getWorkGroupInfo<cl_ulong>(
                 devices[sampleArgs->deviceId],
                 CL_KERNEL_LOCAL_MEM_SIZE,
                 &usedLocalMemory);
    CHECK_OPENCL_ERROR(status, "Kernel::getWorkGroupInfo(CL_KERNEL_LOCAL_MEM_SIZE) failed"
                       ".(usedLocalMemory)");

    availableLocalMemory = totalLocalMemory - usedLocalMemory;

    // The LDS kernel tiles 4x4 doubles per work-item, hence (blockSize*4)^2.
    if(lds)
    {
        neededLocalMemory = (blockSize * 4) * (blockSize * 4) * sizeof(cl_double);
    }
    else
    {
        neededLocalMemory = 0;
    }

    if(neededLocalMemory > availableLocalMemory)
    {
        std::cout << "Unsupported: Insufficient local memory on device." << std::endl;
        return SDK_FAILURE;
    }

    // Check group size against group size returned by kernel
    kernelWorkGroupSize = kernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>
                          (devices[sampleArgs->deviceId], &status);
    CHECK_OPENCL_ERROR(status, "Kernel::getWorkGroupInfo() failed.");

    // Shrink blockSize until the kernel's work-group limit is satisfied.
    if((cl_uint)(blockSize * blockSize) > kernelWorkGroupSize)
    {
        if(kernelWorkGroupSize >= 64)
        {
            blockSize = 8;
        }
        else if(kernelWorkGroupSize >= 32)
        {
            blockSize = 4;
        }
        else
        {
            std::cout << "Out of Resources!" << std::endl;
            std::cout << "Group Size specified : " << blockSize * blockSize << std::endl;
            std::cout << "Max Group Size supported on the kernel : "
                      << kernelWorkGroupSize<<std::endl;
            return SDK_FAILURE;
        }
    }

    // Final sanity check against the device-wide work-item limits.
    if(blockSize > maxWorkItemSizes[0] ||
       blockSize > maxWorkItemSizes[1] ||
       blockSize * blockSize > maxWorkGroupSize)
    {
        error("Unsupported: Device does not support requested number of work items.");
        return SDK_FAILURE;
    }

    return SDK_SUCCESS;
}
/** Initialization OpenCL entities * * Method inits all required entities for work with OpenCL code. * * @param config * Contain information about simulating configuration */ void owOpenCLSolver::initializeOpenCL(owConfigProperty * config) { cl_int err; std::vector< cl::Platform > platformList; err = cl::Platform::get( &platformList ); //TODO make check that returned value isn't error if( platformList.size() < 1 || err != CL_SUCCESS ){ throw std::runtime_error( "No OpenCL platforms found" ); } char cBuffer[1024]; cl_platform_id cl_pl_id[10]; cl_uint n_pl; clGetPlatformIDs(10,cl_pl_id,&n_pl); cl_int ciErrNum; int sz; for(int i=0;i<(int)n_pl;i++) { // Get OpenCL platform name and version ciErrNum = clGetPlatformInfo (cl_pl_id[i], CL_PLATFORM_VERSION, sz = sizeof(cBuffer), cBuffer, NULL); if (ciErrNum == CL_SUCCESS) { printf(" CL_PLATFORM_VERSION [%d]: \t%s\n", i, cBuffer); } else { printf(" Error %i in clGetPlatformInfo Call !!!\n\n", ciErrNum); } } //0-CPU, 1-GPU // depends on the time order of system OpenCL drivers installation on your local machine // CL_DEVICE_TYPE cl_device_type type; unsigned int device_type [] = {CL_DEVICE_TYPE_CPU,CL_DEVICE_TYPE_GPU, CL_DEVICE_TYPE_ALL}; int plList = -1;//selected platform index in platformList array [choose CPU by default] //added autodetection of device number corresonding to preferrable device type (CPU|GPU) | otherwise the choice will be made from list of existing devices cl_uint ciDeviceCount = 0; cl_device_id * devices_t; bool bPassed = true, findDevice = false; cl_int result; cl_uint device_coumpute_unit_num; cl_uint device_coumpute_unit_num_current = 0; unsigned int deviceNum = 0; //Selection of more appropriate device while(!findDevice){ for(int clSelectedPlatformID = 0;clSelectedPlatformID < (int)n_pl;clSelectedPlatformID++){ //if(findDevice) // break; clGetDeviceIDs (cl_pl_id[clSelectedPlatformID], device_type[config->getDeviceType()], 0, NULL, &ciDeviceCount); if((devices_t = static_cast<cl_device_id 
*>(malloc(sizeof(cl_device_id) * ciDeviceCount))) == NULL) bPassed = false; if(bPassed){ result= clGetDeviceIDs (cl_pl_id[clSelectedPlatformID], device_type[config->getDeviceType()], ciDeviceCount, devices_t, &ciDeviceCount); if( result == CL_SUCCESS){ for( cl_uint i =0; i < ciDeviceCount; ++i ){ clGetDeviceInfo(devices_t[i], CL_DEVICE_TYPE, sizeof(type), &type, NULL); if( type & device_type[config->getDeviceType()]){ clGetDeviceInfo(devices_t[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(device_coumpute_unit_num), &device_coumpute_unit_num, NULL); if(device_coumpute_unit_num_current <= device_coumpute_unit_num){ plList = clSelectedPlatformID; device_coumpute_unit_num_current = device_coumpute_unit_num; findDevice = true; deviceNum = i; } //break; } } } free(devices_t); } } if(!findDevice){ //plList = 0; deviceNum = 0; std::string deviceTypeName = (config->getDeviceType() == ALL)? "ALL": (config->getDeviceType() == CPU)? "CPU":"GPU"; std::cout << "Unfortunately OpenCL couldn't find device " << deviceTypeName << std::endl; std::cout << "OpenCL try to init existing device " << std::endl; if(config->getDeviceType() != ALL) config->setDeviceType(ALL); else throw std::runtime_error("Sibernetic can't find any OpenCL devices. 
Please check you're environment configuration."); } } cl_context_properties cprops[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) (platformList[plList])(), 0 }; context = cl::Context( device_type[config->getDeviceType()], cprops, NULL, NULL, &err ); devices = context.getInfo< CL_CONTEXT_DEVICES >(); if( devices.size() < 1 ){ throw std::runtime_error( "No OpenCL devices were found" ); } //Print some information about chosen platform size_t compUnintsCount, memoryInfo, workGroupSize; result = devices[deviceNum].getInfo(CL_DEVICE_NAME,&cBuffer);// CL_INVALID_VALUE = -30; if(result == CL_SUCCESS){ std::cout << "CL_CONTEXT_PLATFORM ["<< plList << "]: CL_DEVICE_NAME [" << deviceNum << "]:\t" << cBuffer << "\n" << std::endl; } if(strlen(cBuffer)<1000){ config->setDeviceName(cBuffer); } result = devices[deviceNum].getInfo(CL_DEVICE_TYPE,&cBuffer); if(result == CL_SUCCESS){ std::cout << "CL_CONTEXT_PLATFORM ["<< plList << "]: CL_DEVICE_TYPE [" << deviceNum << "]:\t" << (((int)cBuffer[0] == CL_DEVICE_TYPE_CPU)? 
"CPU" : "GPU") << std::endl; } result = devices[deviceNum].getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE,&workGroupSize); if(result == CL_SUCCESS){ std::cout << "CL_CONTEXT_PLATFORM ["<< plList << "]: CL_DEVICE_MAX_WORK_GROUP_SIZE [" << deviceNum <<"]: \t" << workGroupSize <<std::endl; } result = devices[deviceNum].getInfo(CL_DEVICE_MAX_COMPUTE_UNITS,&compUnintsCount); if(result == CL_SUCCESS){ std::cout<<"CL_CONTEXT_PLATFORM [" << plList << "]: CL_DEVICE_MAX_COMPUTE_UNITS [" << deviceNum << "]: \t" << compUnintsCount << std::endl; } result = devices[deviceNum].getInfo(CL_DEVICE_GLOBAL_MEM_SIZE,&memoryInfo); if(result == CL_SUCCESS){ std::cout<<"CL_CONTEXT_PLATFORM [" << plList <<"]: CL_DEVICE_GLOBAL_MEM_SIZE ["<< deviceNum <<"]: \t" << deviceNum <<std::endl; } result = devices[deviceNum].getInfo(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE,&memoryInfo); if(result == CL_SUCCESS){ std::cout << "CL_CONTEXT_PLATFORM [" << plList <<"]: CL_DEVICE_GLOBAL_MEM_CACHE_SIZE [" << deviceNum <<"]:\t" << memoryInfo <<std::endl; } result = devices[deviceNum].getInfo(CL_DEVICE_LOCAL_MEM_SIZE,&memoryInfo); if(result == CL_SUCCESS){ std::cout << "CL_CONTEXT_PLATFORM " << plList <<": CL_DEVICE_LOCAL_MEM_SIZE [" << deviceNum <<"]:\t" << memoryInfo << std::endl; } queue = cl::CommandQueue( context, devices[ deviceNum ], 0, &err ); if( err != CL_SUCCESS ){ throw std::runtime_error( "Failed to create command queue" ); } std::ifstream file( config->getSourceFileName().c_str() ); if( !file.is_open() ){ throw std::runtime_error( "Could not open file with OpenCL program check input arguments oclsourcepath: " + config->getSourceFileName() ); } std::string programSource( std::istreambuf_iterator<char>( file ), ( std::istreambuf_iterator<char>() )); cl::Program::Sources source( 1, std::make_pair( programSource.c_str(), programSource.length()+1 )); program = cl::Program( context, source ); #if defined(__APPLE__) err = program.build( devices, "-g -cl-opt-disable" ); #else #if INTEL_OPENCL_DEBUG err = program.build( 
devices, OPENCL_DEBUG_PROGRAM_PATH + "-g -cl-opt-disable"); #else err = program.build( devices, ""); #endif #endif if( err != CL_SUCCESS ){ std::string compilationErrors; compilationErrors = program.getBuildInfo< CL_PROGRAM_BUILD_LOG >( devices[ 0 ] ); std::cerr << "Compilation failed: " << std::endl << compilationErrors << std::endl; throw std::runtime_error( "failed to build program" ); } std::cout<<"OPENCL program was successfully build. Program file oclsourcepath: " << config->getSourceFileName() << std::endl; return; }
// Sets up all OpenCL state for the Lucas sample: platform/device selection,
// context, command queue, program build and the three kernels (mul,
// normalize, normalize2). Returns SDK_SUCCESS on success; the
// CHECK_OPENCL_ERROR / OPENCL_EXPECTED_ERROR macros return early with an
// error code on failure.
int Lucas::setupCL (void)
{
    cl_int status = 0;
    cl_device_type dType;

    // Honor the requested device type; fall back to CPU when a GPU was
    // requested but none is present.
    if (deviceType.compare ("cpu") == 0)
        dType = CL_DEVICE_TYPE_CPU;
    else //deviceType = "gpu"
    {
        dType = CL_DEVICE_TYPE_GPU;
        if (isThereGPU () == false)
        {
            std::cout << "GPU not found. Falling back to CPU device" << std::endl;
            dType = CL_DEVICE_TYPE_CPU;
        }
    }

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */
    status = cl::Platform::get (&platforms);
    CHECK_OPENCL_ERROR (status, "Platform::get() failed.");

    std::vector < cl::Platform >::iterator i;
    // NOTE(review): if platforms is empty, or no AMD platform matches,
    // i may be invalid when dereferenced below — SDK boilerplate assumes
    // at least one usable platform. Verify.
    if (platforms.size () > 0)
    {
        if (isPlatformEnabled ())
        {
            // Explicit --platformId from the command line.
            i = platforms.begin () + platformId;
        }
        else
        {
            // Prefer the AMD platform when present.
            for (i = platforms.begin (); i != platforms.end (); ++i)
            {
                if (!strcmp ((*i).getInfo < CL_PLATFORM_VENDOR > ().c_str (),
                             "Advanced Micro Devices, Inc."))
                {
                    break;
                }
            }
        }
    }

    cl_context_properties cps[3] =
    {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties) (*i) (),
        0
    };

    if (NULL == (*i) ())
    {
        sampleCommon->error ("NULL platform found so Exiting Application.");
        return SDK_FAILURE;
    }

    context = cl::Context (dType, cps, NULL, NULL, &status);
    CHECK_OPENCL_ERROR (status, "Context::Context() failed.");

    devices = context.getInfo < CL_CONTEXT_DEVICES > ();
    // NOTE(review): this getInfo overload does not update 'status'; the
    // check below re-tests the previous value.
    CHECK_OPENCL_ERROR (status, "Context::getInfo() failed.");

    std::cout << "Platform :" << (*i).getInfo < CL_PLATFORM_VENDOR > ().c_str () << "\n";

    // Enumerate and print every device in the context.
    int deviceCount = (int) devices.size ();
    int j = 0;
    for (std::vector < cl::Device >::iterator i = devices.begin ();
         i != devices.end (); ++i, ++j)
    {
        std::cout << "Device " << j << " : ";
        std::string deviceName = (*i).getInfo < CL_DEVICE_NAME > ();
        std::cout << deviceName.c_str () << "\n";
    }
    std::cout << "\n";

    if (deviceCount == 0)
    {
        std::cerr << "No device available\n";
        return SDK_FAILURE;
    }

    if (sampleCommon->validateDeviceId (deviceId, deviceCount))
    {
        sampleCommon->error ("sampleCommon::validateDeviceId() failed");
        return SDK_FAILURE;
    }

    std::string extensions = devices[deviceId].getInfo < CL_DEVICE_EXTENSIONS > ();

    std::string buildOptions = std::string ("");
    // Check if cl_khr_fp64 extension is supported
    if (strstr (extensions.c_str (), "cl_khr_fp64"))
    {
        buildOptions.append ("-D KHR_DP_EXTENSION");
    }
    else
    {
        // Check if cl_amd_fp64 extension is supported; without either FP64
        // extension this sample cannot run.
        if (!strstr (extensions.c_str (), "cl_amd_fp64"))
        {
            OPENCL_EXPECTED_ERROR ("Device does not support cl_amd_fp64 extension!");
        }
    }

    cl_uint localMemType;
    // Get device specific information
    status = devices[deviceId].getInfo<cl_uint>(
                 CL_DEVICE_LOCAL_MEM_TYPE,
                 &localMemType);
    CHECK_OPENCL_ERROR(status, "Device::getInfo CL_DEVICE_LOCAL_MEM_TYPE) failed.");

    // The kernels require dedicated (scratchpad) local memory.
    if(localMemType != CL_LOCAL)
        OPENCL_EXPECTED_ERROR ("Device does not support local memory.");

    // Get Device specific Information
    status = devices[deviceId].getInfo<size_t>(
                 CL_DEVICE_MAX_WORK_GROUP_SIZE,
                 &maxWorkGroupSize);
    CHECK_OPENCL_ERROR(status, "Device::getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE) failed.");

    // The configured work-group size must fit on this device.
    if(threads > maxWorkGroupSize)
        OPENCL_EXPECTED_ERROR ("Device does not support threads.");

    status = devices[deviceId].getInfo<cl_uint>(
                 CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
                 &maxDimensions);
    CHECK_OPENCL_ERROR(status, "Device::getInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) failed.");

    // Per-dimension work-item limits, copied into a malloc'd array owned by
    // this object (freed elsewhere — TODO confirm cleanup path).
    maxWorkItemSizes = (size_t*)malloc(maxDimensions * sizeof(size_t));

    std::vector<size_t> workItems =
        devices[deviceId].getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>();
    for(cl_uint i = 0; i < maxDimensions; ++i)
        maxWorkItemSizes[i] = workItems[i];

    status = devices[deviceId].getInfo<cl_ulong>(
                 CL_DEVICE_LOCAL_MEM_SIZE,
                 &totalLocalMemory);
    CHECK_OPENCL_ERROR(status, "Device::getInfo(CL_DEVICE_LOCAL_MEM_SIZES) failed.");

    // Set command queue properties: profiling only when not measuring
    // application-level GFLOPS.
    cl_command_queue_properties prop = 0;
    if (!eAppGFLOPS)
        prop |= CL_QUEUE_PROFILING_ENABLE;

    commandQueue = cl::CommandQueue (context, devices[deviceId], prop, &status);
    CHECK_OPENCL_ERROR (status, "CommandQueue::CommandQueue() failed.");

    // Set Presistent memory only for AMD platform
    // NOTE(review): inMemFlags is computed but no buffer is created with it
    // in this function — presumably used by a later allocation; verify.
    cl_mem_flags inMemFlags = CL_MEM_READ_ONLY;
    if (isAmdPlatform ())
        inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD;

    device.push_back (devices[deviceId]);

    // create a CL program using the kernel source
    streamsdk::SDKFile kernelFile;
    std::string kernelPath = sampleCommon->getPath ();
    kernelPath.append ("Kernels.cl");
    if (!kernelFile.open (kernelPath.c_str ()))
    {
        std::cout << "Failed to load kernel file : " << kernelPath << std::endl;
        return SDK_FAILURE;
    }

    cl::Program::Sources programSource (1,
                                        std::make_pair (kernelFile.
                                                        source ().data (),
                                                        kernelFile.
                                                        source ().size ()));

    program = cl::Program (context, programSource, &status);
    CHECK_OPENCL_ERROR (status, "Program::Program(Source) failed.");

    std::string flagsStr = std::string ("");

    status = program.build (device, flagsStr.c_str ());

    if (status != CL_SUCCESS)
    {
        if (status == CL_BUILD_PROGRAM_FAILURE)
        {
            // Dump the compiler log before bailing out via the macro below.
            std::string str =
                program.getBuildInfo < CL_PROGRAM_BUILD_LOG > (devices[deviceId]);

            std::cout << " \n\t\t\tBUILD LOG\n";
            std::cout << " ************************************************\n";
            std::cout << str << std::endl;
            std::cout << " ************************************************\n";
        }
    }
    CHECK_OPENCL_ERROR (status, "Program::build() failed.");

    // Create kernel
    // If local memory is present then use the specific kernel
    mul_kernel = cl::Kernel (program, "mul_Kernel", &status);
    CHECK_OPENCL_ERROR (status, "cl::Kernel failed.");

    status = mul_kernel.getWorkGroupInfo < cl_ulong > (devices[deviceId],
                                                       CL_KERNEL_LOCAL_MEM_SIZE,
                                                       &usedLocalMemory);
    CHECK_OPENCL_ERROR (status,
                        "Kernel::getWorkGroupInfo(CL_KERNEL_LOCAL_MEM_SIZE) failed"
                        ".(usedLocalMemory)");

    // Create normalize_kernel
    normalize_kernel = cl::Kernel (program, "normalize_Kernel", &status);
    CHECK_OPENCL_ERROR (status, "cl::Kernel failed.");

    // Create normalize2_kernel
    normalize2_kernel = cl::Kernel (program, "normalize2_Kernel", &status);
    CHECK_OPENCL_ERROR (status, "cl::Kernel failed.");

    return SDK_SUCCESS;
}
int UnsharpMask::setupCL() { try { cl_int err = CL_SUCCESS; cl_device_type dType; if(sampleArgs->deviceType.compare("cpu") == 0) { dType = CL_DEVICE_TYPE_CPU; } else //deviceType == "gpu" { dType = CL_DEVICE_TYPE_GPU; if(sampleArgs->isThereGPU() == false) { std::cout << "GPU not found. Falling back to CPU device" << std::endl; dType = CL_DEVICE_TYPE_CPU; } } /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl::Platform::get(&platforms); std::vector<cl::Platform>::iterator i; if(platforms.size() > 0) { if(sampleArgs->isPlatformEnabled()) { i = platforms.begin() + sampleArgs->platformId; } else { for(i = platforms.begin(); i != platforms.end(); ++i) { if(!strcmp((*i).getInfo<CL_PLATFORM_VENDOR>().c_str(), "Advanced Micro Devices, Inc.")) { break; } } } } cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(*i)(), 0 }; context = cl::Context(dType, cps, NULL, NULL); devices = context.getInfo<CL_CONTEXT_DEVICES>(); std::cout << "Platform :" << (*i).getInfo<CL_PLATFORM_VENDOR>().c_str() << "\n"; int deviceCount = (int)devices.size(); int j = 0; for (std::vector<cl::Device>::iterator i = devices.begin(); i != devices.end(); ++i, ++j) { std::cout << "Device " << j << " : "; std::string deviceName = (*i).getInfo<CL_DEVICE_NAME>(); std::cout << deviceName.c_str() << "\n"; } std::cout << "\n"; if (deviceCount == 0) { std::cerr << "No device available\n"; return SDK_FAILURE; } if(validateDeviceId(sampleArgs->deviceId, deviceCount)) { error("validateDeviceId() failed"); return SDK_FAILURE; } commandQueue = cl::CommandQueue(context, devices[sampleArgs->deviceId], 0); device.push_back(devices[sampleArgs->deviceId]); // create a CL program using the kernel source SDKFile kernelFile; std::string kernelPath = getPath(); if(sampleArgs->isLoadBinaryEnabled()) { kernelPath.append(sampleArgs->loadBinary.c_str()); if(kernelFile.readBinaryFromFile(kernelPath.c_str()) != SDK_SUCCESS) { std::cout << 
"Failed to load kernel file : " << kernelPath << std::endl; return SDK_FAILURE; } cl::Program::Binaries programBinary(1,std::make_pair( (const void*)kernelFile.source().data(), kernelFile.source().size())); program = cl::Program(context, device, programBinary, NULL); } else { kernelPath.append("UnsharpMask_Kernels.cl"); if(!kernelFile.open(kernelPath.c_str())) { std::cout << "Failed to load kernel file : " << kernelPath << std::endl; return SDK_FAILURE; } cl::Program::Sources programSource(1, std::make_pair(kernelFile.source().data(), kernelFile.source().size())); program = cl::Program(context, programSource); } std::string flagsStr = std::string(""); // Get additional options if(sampleArgs->isComplierFlagsSpecified()) { SDKFile flagsFile; std::string flagsPath = getPath(); flagsPath.append(sampleArgs->flags.c_str()); if(!flagsFile.open(flagsPath.c_str())) { std::cout << "Failed to load flags file: " << flagsPath << std::endl; return SDK_FAILURE; } flagsFile.replaceNewlineWithSpaces(); const char * flags = flagsFile.source().c_str(); flagsStr.append(flags); } if(flagsStr.size() != 0) { std::cout << "Build Options are : " << flagsStr.c_str() << std::endl; } program.build(device, flagsStr.c_str()); queue = cl::CommandQueue(context, devices[sampleArgs->deviceId], 0, &err); int dimen = 2*radius+1; cl::ImageFormat format(CL_BGRA,CL_UNSIGNED_INT8); if(loadInputImage()!=SDK_SUCCESS) { return SDK_FAILURE; } gaussian1DBuffer = cl::Buffer(context,CL_MEM_READ_ONLY, dimen*sizeof(float)); // create the 1D Gaussian kernel if(dImageBuffer) { gaussian2DBuffer = cl::Buffer(context,CL_MEM_READ_ONLY, dimen*dimen*sizeof(float)); inputBuffer = cl::Buffer(context,CL_MEM_READ_ONLY, imageSize); outputBuffer = cl::Buffer (context,CL_MEM_WRITE_ONLY, imageSize); unsharp_mask_filter = cl::Kernel(program, "unsharp_mask_filter"); } else { inputImageObj = cl::Image2D(context, CL_MEM_READ_ONLY, format, width, height); sharpenImageObj = cl::Image2D(context, CL_MEM_WRITE_ONLY, format, width, 
height); tmpImageObj = cl::Buffer(context,CL_MEM_READ_WRITE, width*height*sizeof(cl_float4)); // Create kernel unsharp_mask_pass1 = cl::Kernel(program, "unsharp_mask_pass1"); unsharp_mask_pass2 = cl::Kernel(program, "unsharp_mask_pass2"); } } catch (cl::Error e) { if(e.err() == CL_BUILD_PROGRAM_FAILURE) { std::string str = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[sampleArgs->deviceId]); std::cout << " \n\t\t\tBUILD LOG\n"; std::cout << " ************************************************\n"; std::cout << str << std::endl; std::cout << " ************************************************\n"; } else { std::cout << e.what() << " failed!"<< std::endl; std::cout << "Error code: " << e.err() << std::endl; } return SDK_FAILURE; } return SDK_SUCCESS; }
int SobelFilterImage::setupCL() { cl_int err = CL_SUCCESS; cl_device_type dType; if(sampleArgs.deviceType.compare("cpu") == 0) { dType = CL_DEVICE_TYPE_CPU; } else //deviceType = "gpu" { dType = CL_DEVICE_TYPE_GPU; if(sampleArgs.isThereGPU) { std::cout << "GPU not found. Falling back to CPU device" << std::endl; dType = CL_DEVICE_TYPE_CPU; } } err = cl::Platform::get(&platforms); CHECK_OPENCL_ERROR(err, "Platform::get() failed."); std::vector<cl::Platform>::iterator i; int deviceId = -1; for (i=platforms.begin(); i!=platforms.end(); i++) { //std::cout << "Platform :" << (*i).getInfo<CL_PLATFORM_VENDOR>().c_str() << "\n"; devices.clear(); i->getDevices(dType, &devices); if (devices.size() < 0) break; deviceId = 0; #if 0 for (std::vector<cl::Device>::iterator i = devices.begin(); i != devices.end(); ++i) { std::cout << "Device " << " : "; std::string deviceName = (*i).getInfo<CL_DEVICE_NAME>(); std::cout << deviceName.c_str() << "\n"; } #endif std::cout << "\n"; } if (deviceId == -1) { std::cerr << "Cant find CL device" << std::endl; exit(-5); } device.push_back(devices[deviceId]); context = cl::Context( device ); commandQueue = cl::CommandQueue(context, devices[deviceId], 0, &err); CHECK_OPENCL_ERROR(err, "CommandQueue::CommandQueue() failed."); cl::ImageFormat imageFormat(CL_RGBA, CL_UNSIGNED_INT8); /* * Create and initialize memory objects */ inputImage2D = cl::Image2D(context, CL_MEM_READ_ONLY, imageFormat, width, height, 0, NULL, &err); CHECK_OPENCL_ERROR(err, "Image2D::Image2D() failed. (inputImage2D)"); // Create memory objects for output Image outputImage2D = cl::Image2D(context, CL_MEM_WRITE_ONLY, imageFormat, width, height, 0, 0, &err); CHECK_OPENCL_ERROR(err, "Image2D::Image2D() failed. 
(outputImage2D)"); // create a CL program using the kernel source std::string clSourceData = readFile(sampleArgs.clKernelPath.c_str()); // create program source cl::Program::Sources programSource(1, std::make_pair(clSourceData.c_str(), clSourceData.length())); // Create program object program = cl::Program(context, programSource, &err); CHECK_OPENCL_ERROR(err, "Program::Program() failed."); std::string flagsStr = std::string(""); if(flagsStr.size() != 0) { std::cout << "Build Options are : " << flagsStr.c_str() << std::endl; } err = program.build( { device }, flagsStr.c_str()); if(err != CL_SUCCESS) { if(err == CL_BUILD_PROGRAM_FAILURE) { std::string str = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[deviceId]); std::cout << " \n\t\t\tBUILD LOG\n"; std::cout << " ************************************************\n"; std::cout << str << std::endl; std::cout << " ************************************************\n"; } } CHECK_OPENCL_ERROR(err, "Program::build() failed."); // Create kernel kernel = cl::Kernel(program, "sobel_filter", &err); CHECK_OPENCL_ERROR(err, "Kernel::Kernel() failed."); // Check group size against group size returned by kernel kernelWorkGroupSize = kernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE> (devices[deviceId], &err); CHECK_OPENCL_ERROR(err, "Kernel::getWorkGroupInfo() failed."); if((blockSizeX * blockSizeY) > kernelWorkGroupSize) { std::cout << "Out of Resources!" << std::endl; std::cout << "Group Size specified : " << blockSizeX * blockSizeY << std::endl; std::cout << "Max Group Size supported on the kernel : " << kernelWorkGroupSize << std::endl; std::cout << "Falling back to " << kernelWorkGroupSize << std::endl; if(blockSizeX > kernelWorkGroupSize) { blockSizeX = kernelWorkGroupSize; blockSizeY = 1; } } return 0; }
int GaussianNoise::setupCL() { cl_int err = CL_SUCCESS; cl_device_type dType; if(deviceType.compare("cpu") == 0) { dType = CL_DEVICE_TYPE_CPU; } else //deviceType = "gpu" { dType = CL_DEVICE_TYPE_GPU; if(isThereGPU() == false) { std::cout << "GPU not found. Falling back to CPU device" << std::endl; dType = CL_DEVICE_TYPE_CPU; } } /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ err = cl::Platform::get(&platforms); CHECK_OPENCL_ERROR(err, "Platform::get() failed."); std::vector<cl::Platform>::iterator i; if(platforms.size() > 0) { if(isPlatformEnabled()) { i = platforms.begin() + platformId; } else { for(i = platforms.begin(); i != platforms.end(); ++i) { if(!strcmp((*i).getInfo<CL_PLATFORM_VENDOR>().c_str(), "Advanced Micro Devices, Inc.")) { break; } } } } cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(*i)(), 0 }; context = cl::Context(dType, cps, NULL, NULL, &err); CHECK_OPENCL_ERROR(err, "Context::Context() failed."); devices = context.getInfo<CL_CONTEXT_DEVICES>(); CHECK_OPENCL_ERROR(err, "Context::getInfo() failed."); std::cout << "Platform :" << (*i).getInfo<CL_PLATFORM_VENDOR>().c_str() << "\n"; int deviceCount = (int)devices.size(); int j = 0; for (std::vector<cl::Device>::iterator i = devices.begin(); i != devices.end(); ++i, ++j) { std::cout << "Device " << j << " : "; std::string deviceName = (*i).getInfo<CL_DEVICE_NAME>(); std::cout << deviceName.c_str() << "\n"; } std::cout << "\n"; if (deviceCount == 0) { std::cerr << "No device available\n"; return SDK_FAILURE; } if(sampleCommon->validateDeviceId(deviceId, deviceCount)) { sampleCommon->error("sampleCommon::validateDeviceId() failed"); return SDK_FAILURE; } commandQueue = cl::CommandQueue(context, devices[deviceId], 0, &err); CHECK_OPENCL_ERROR(err, "CommandQueue::CommandQueue() failed."); /* * Create and initialize memory objects */ // Set Presistent memory only for AMD platform cl_mem_flags inMemFlags = 
CL_MEM_READ_ONLY; if(isAmdPlatform()) inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD; // Create memory object for input Image inputImageBuffer = cl::Buffer(context, inMemFlags, width * height * pixelSize, 0, &err); CHECK_OPENCL_ERROR(err, "Buffer::Buffer() failed. (inputImageBuffer)"); // Create memory object for output Image outputImageBuffer = cl::Buffer(context, CL_MEM_WRITE_ONLY, width * height * pixelSize, NULL, &err); CHECK_OPENCL_ERROR(err, "Buffer::Buffer() failed. (outputImageBuffer)"); device.push_back(devices[deviceId]); // create a CL program using the kernel source streamsdk::SDKFile kernelFile; std::string kernelPath = sampleCommon->getPath(); if(isLoadBinaryEnabled()) { kernelPath.append(loadBinary.c_str()); if(!kernelFile.readBinaryFromFile(kernelPath.c_str())) { std::cout << "Failed to load kernel file : " << kernelPath << std::endl; return SDK_FAILURE; } cl::Program::Binaries programBinary(1,std::make_pair( (const void*)kernelFile.source().data(), kernelFile.source().size())); program = cl::Program(context, device, programBinary, NULL, &err); CHECK_OPENCL_ERROR(err, "Program::Program(Binary) failed."); } else { kernelPath.append("GaussianNoise_Kernels.cl"); if(!kernelFile.open(kernelPath.c_str())) { std::cout << "Failed to load kernel file : " << kernelPath << std::endl; return SDK_FAILURE; } cl::Program::Sources programSource(1, std::make_pair(kernelFile.source().data(), kernelFile.source().size())); program = cl::Program(context, programSource, &err); CHECK_OPENCL_ERROR(err, "Program::Program(Source) failed."); } std::string flagsStr = std::string(""); // Get additional options if(isComplierFlagsSpecified()) { streamsdk::SDKFile flagsFile; std::string flagsPath = sampleCommon->getPath(); flagsPath.append(flags.c_str()); if(!flagsFile.open(flagsPath.c_str())) { std::cout << "Failed to load flags file: " << flagsPath << std::endl; return SDK_FAILURE; } flagsFile.replaceNewlineWithSpaces(); const char * flags = flagsFile.source().c_str(); 
flagsStr.append(flags); } if(flagsStr.size() != 0) std::cout << "Build Options are : " << flagsStr.c_str() << std::endl; err = program.build(device, flagsStr.c_str()); if(err != CL_SUCCESS) { if(err == CL_BUILD_PROGRAM_FAILURE) { std::string str = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[deviceId]); std::cout << " \n\t\t\tBUILD LOG\n"; std::cout << " ************************************************\n"; std::cout << str << std::endl; std::cout << " ************************************************\n"; } } CHECK_OPENCL_ERROR(err, "Program::build() failed."); // Create kernel kernel = cl::Kernel(program, "gaussian_transform", &err); CHECK_OPENCL_ERROR(err, "Kernel::Kernel() failed."); // Check group size against group size returned by kernel kernelWorkGroupSize = kernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(devices[deviceId], &err); CHECK_OPENCL_ERROR(err, "Kernel::getWorkGroupInfo() failed."); if((blockSizeX * blockSizeY) > kernelWorkGroupSize) { if(!quiet) { std::cout << "Out of Resources!" << std::endl; std::cout << "Group Size specified : " << blockSizeX * blockSizeY << std::endl; std::cout << "Max Group Size supported on the kernel : " << kernelWorkGroupSize << std::endl; std::cout << "Falling back to " << kernelWorkGroupSize << std::endl; } if(blockSizeX > kernelWorkGroupSize) { blockSizeX = kernelWorkGroupSize; blockSizeY = 1; } } return SDK_SUCCESS; }
int HDRToneMapping::setupCL() { cl_int err = CL_SUCCESS; cl_device_type dType; if(deviceType.compare("cpu") == 0) { dType = CL_DEVICE_TYPE_CPU; } else //deviceType = "gpu" { dType = CL_DEVICE_TYPE_GPU; if(isThereGPU() == false) { std::cout << "GPU not found. Falling back to CPU device" << std::endl; dType = CL_DEVICE_TYPE_CPU; } } /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ err = cl::Platform::get(&platforms); CHECK_OPENCL_ERROR(err, "Platform::get() failed."); std::vector<cl::Platform>::iterator i; if(platforms.size() > 0) { if(isPlatformEnabled()) { i = platforms.begin() + platformId; } else { for(i = platforms.begin(); i != platforms.end(); ++i) { if(!strcmp((*i).getInfo<CL_PLATFORM_VENDOR>().c_str(), "Advanced Micro Devices, Inc.")) { break; } } } } cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(*i)(), 0 }; context = cl::Context(dType, cps, NULL, NULL, &err); CHECK_OPENCL_ERROR(err, "Context::Context() failed."); devices = context.getInfo<CL_CONTEXT_DEVICES>(); CHECK_OPENCL_ERROR(err, "Context::getInfo() failed."); std::cout << "Platform :" << (*i).getInfo<CL_PLATFORM_VENDOR>().c_str() << "\n"; int deviceCount = (int)devices.size(); int j = 0; for (std::vector<cl::Device>::iterator i = devices.begin(); i != devices.end(); ++i, ++j) { std::cout << "Device " << j << " : "; std::string deviceName = (*i).getInfo<CL_DEVICE_NAME>(); std::cout << deviceName.c_str() << "\n"; } std::cout << "\n"; if (deviceCount == 0) { std::cout << "No device available\n"; return SDK_FAILURE; } if(sampleCommon->validateDeviceId(deviceId, deviceCount) != SDK_SUCCESS) { std::cout << "sampleCommon::validateDeviceId() failed"; return SDK_FAILURE; } // Get Device specific Information err = devices[deviceId].getInfo<size_t>( CL_DEVICE_MAX_WORK_GROUP_SIZE, &maxWorkGroupSize); CHECK_OPENCL_ERROR(err, "Device::getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE) failed."); err = 
devices[deviceId].getInfo<cl_uint>( CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, &maxDimensions); CHECK_OPENCL_ERROR(err, "Device::getInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) failed."); maxWorkItemSizes = (size_t*)malloc(maxDimensions * sizeof(size_t)); std::vector<size_t> workItems = devices[deviceId].getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>(); for(cl_uint i = 0; i < maxDimensions; ++i) maxWorkItemSizes[i] = workItems[i]; err = devices[deviceId].getInfo<cl_ulong>( CL_DEVICE_LOCAL_MEM_SIZE, &totalLocalMemory); CHECK_OPENCL_ERROR(err, "Device::getInfo(CL_DEVICE_LOCAL_MEM_SIZES) failed."); commandQueue = cl::CommandQueue(context, devices[deviceId], 0, &err); CHECK_OPENCL_ERROR(err, "CommandQueue::CommandQueue() failed."); /* * Create and initialize memory objects */ // Set Presistent memory only for AMD platform cl_mem_flags inMemFlags = CL_MEM_READ_ONLY; if(isAmdPlatform()) inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD; // Create memory object for input Image /** * We use CL_MEM_USE_HOST_PTR for CPU as the CPU device is running the kernel * on the actual buffer provided by the application */ if (dType == CL_DEVICE_TYPE_CPU) { inputImageBuffer = cl::Buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, width * height * numChannels * sizeof(cl_float), input, &err); CHECK_OPENCL_ERROR(err, "Buffer::Buffer() failed. (inputImageBuffer)"); // Create memory object for output Image outputImageBuffer = cl::Buffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, width * height * numChannels * sizeof(cl_float), output, &err); CHECK_OPENCL_ERROR(err, "Buffer::Buffer() failed. (outputImageBuffer)"); } else if (dType == CL_DEVICE_TYPE_GPU) { inputImageBuffer = cl::Buffer(context, inMemFlags, width * height * numChannels * sizeof(cl_float), 0, &err); CHECK_OPENCL_ERROR(err, "Buffer::Buffer() failed. 
(inputImageBuffer)"); // Create memory object for output Image outputImageBuffer = cl::Buffer(context, CL_MEM_WRITE_ONLY, width * height * numChannels * sizeof(cl_float), NULL, &err); CHECK_OPENCL_ERROR(err, "Buffer::Buffer() failed. (outputImageBuffer)"); } device.push_back(devices[deviceId]); // create a CL program using the kernel source streamsdk::SDKFile kernelFile; std::string kernelPath = sampleCommon->getPath(); if(isLoadBinaryEnabled()) { kernelPath.append(loadBinary.c_str()); if(kernelFile.readBinaryFromFile(kernelPath.c_str())) { std::cout << "Failed to load kernel file : " << kernelPath << std::endl; return SDK_FAILURE; } cl::Program::Binaries programBinary(1,std::make_pair( (const void*)kernelFile.source().data(), kernelFile.source().size())); program = cl::Program(context, device, programBinary, NULL, &err); CHECK_OPENCL_ERROR(err, "Program::Program(Binary) failed."); } else { kernelPath.append("HDRToneMapping_Kernels.cl"); if(!kernelFile.open(kernelPath.c_str())) { std::cout << "Failed to load kernel file : " << kernelPath << std::endl; return SDK_FAILURE; } cl::Program::Sources programSource(1, std::make_pair(kernelFile.source().data(), kernelFile.source().size())); program = cl::Program(context, programSource, &err); CHECK_OPENCL_ERROR(err, "Program::Program(Source) failed."); } std::string flagsStr = std::string(""); // Get additional options if(isComplierFlagsSpecified()) { streamsdk::SDKFile flagsFile; std::string flagsPath = sampleCommon->getPath(); flagsPath.append(flags.c_str()); if(!flagsFile.open(flagsPath.c_str())) { std::cout << "Failed to load flags file: " << flagsPath << std::endl; return SDK_FAILURE; } flagsFile.replaceNewlineWithSpaces(); const char * flags = flagsFile.source().c_str(); flagsStr.append(flags); } if(flagsStr.size() != 0) std::cout << "Build Options are : " << flagsStr.c_str() << std::endl; err = program.build(device, flagsStr.c_str()); if(err != CL_SUCCESS) { if(err == CL_BUILD_PROGRAM_FAILURE) { std::string str = 
program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[deviceId]); std::cout << " \n\t\t\tBUILD LOG\n"; std::cout << " ************************************************\n"; std::cout << str << std::endl; std::cout << " ************************************************\n"; } } CHECK_OPENCL_ERROR(err, "Program::build() failed."); // Create kernel kernel = cl::Kernel(program, "toneMappingPattanaik", &err); CHECK_OPENCL_ERROR(err, "Kernel::Kernel() failed."); // Check group size against group size returned by kernel kernelWorkGroupSize = kernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(devices[deviceId], &err); CHECK_OPENCL_ERROR(err, "Kernel::getWorkGroupInfo() failed."); /** * For CPU device the kernel work group size is 1024. * Workgroup creation/replacement is an overhead - * avoid workgroups with small number of workitems (we pay more for replacing a WG than running more WI in a for loop). */ if (kernelWorkGroupSize >= 1024) { blockSizeX = 32; blockSizeY = 32; } if((cl_uint)(blockSizeX * blockSizeY) > kernelWorkGroupSize) { if(kernelWorkGroupSize >= 64) { blockSizeX = 8; blockSizeY = 8; } else if(kernelWorkGroupSize >= 32) { blockSizeX = 4; blockSizeY = 4; } else { std::cout << "Out of Resources!" << std::endl; std::cout << "Group Size specified : " << blockSizeX * blockSizeY << std::endl; std::cout << "Max Group Size supported on the kernel : " << kernelWorkGroupSize<<std::endl; return SDK_FAILURE; } } if(blockSizeX > maxWorkItemSizes[0] || blockSizeY > maxWorkItemSizes[1] || blockSizeX * blockSizeY > maxWorkGroupSize) { std::cout << "Unsupported: Device does not support requested number of work items." << std::endl; return SDK_FAILURE; } return SDK_SUCCESS; }