/**
 * Compiles kernelSource for the given device without consulting any cache.
 *
 * @param context           OpenCL context the program is created in.
 * @param device            Device the program is built for.
 * @param kernelsParameters Build options handed to cl::Program::build().
 * @param kernelSource      OpenCL C source text.
 * @param error             Receives a description of the failure (plus the
 *                          build log when one can be retrieved); may be NULL.
 * @return A heap-allocated program owned by the caller, or NULL when a
 *         cl::Error was raised while creating or building the program.
 */
cl::Program *oclKernelCache::ForcedCompile(cl::Context &context, cl::Device &device,
		const std::string &kernelsParameters, const std::string &kernelSource,
		cl::STRING_CLASS *error) {
	cl::Program *program = NULL;

	try {
		cl::Program::Sources source(1, std::make_pair(kernelSource.c_str(), kernelSource.length()));
		program = new cl::Program(context, source);

		VECTOR_CLASS<cl::Device> buildDevice;
		buildDevice.push_back(device);
		program->build(buildDevice, kernelsParameters.c_str());
	} catch (const cl::Error &err) {
		// The program object does not exist yet when its creation is what
		// failed, so the build log can only be queried if it was allocated.
		std::string buildLog;
		if (program) {
			try {
				buildLog = program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(device);
			} catch (const cl::Error &) {
				// No log available: report the original error without it
				// rather than letting a second exception escape the handler.
			}
		}

		if (error) {
			std::stringstream ss;
			ss << "ERROR " << err.what() << "[" << luxrays::oclErrorString(err.err()) << "]:" <<
					std::endl << buildLog << std::endl;
			*error = ss.str();
		}

		delete program;
		program = NULL;
	}

	return program;
}
/**
 * Prints a header line, the number of devices reported by the context's
 * CL_CONTEXT_DEVICES query, and then calls printDeviceInfo() for each device.
 *
 * @param context Context whose devices are listed.
 */
void OpenCLPrinter::printContextInfo(cl::Context context) {
    print("--- ContextInfo ---", "");

    VECTOR_CLASS<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
    print("Number of devices", devices.size());

    // Index with the container's own unsigned size type; the previous `int`
    // index produced a signed/unsigned comparison against size().
    for (VECTOR_CLASS<cl::Device>::size_type i = 0; i < devices.size(); ++i)
        printDeviceInfo(devices[i]);
}
/**
 * Builds the BVH intersection program for the device, creates the
 * "Intersect" kernel and determines the work group size to use.
 *
 * @param dev Intersection device the kernel is compiled for.
 * @throws cl::Error when the program fails to build (the build log is
 *         written to the device context log first).
 */
OpenCLBVHKernel(OpenCLIntersectionDevice *dev) : OpenCLKernel(dev),
		vertsBuff(NULL), trisBuff(NULL), bvhBuff(NULL) {
	const Context *deviceContext = device->GetContext();
	cl::Context &oclContext = device->GetOpenCLContext();
	cl::Device &oclDevice = device->GetOpenCLDevice();
	const std::string &deviceName(device->GetName());

	// Compile sources: shared type definitions followed by the BVH kernel
	std::string code(
		_LUXRAYS_POINT_OCLDEFINE
		_LUXRAYS_VECTOR_OCLDEFINE
		_LUXRAYS_RAY_OCLDEFINE
		_LUXRAYS_RAYHIT_OCLDEFINE
		_LUXRAYS_TRIANGLE_OCLDEFINE
		_LUXRAYS_BBOX_OCLDEFINE);
	code += KernelSource_BVH;
	cl::Program::Sources source(1, std::make_pair(code.c_str(), code.length()));
	cl::Program program = cl::Program(oclContext, source);
	try {
		VECTOR_CLASS<cl::Device> buildDevice;
		buildDevice.push_back(oclDevice);
		program.build(buildDevice);
	} catch (const cl::Error &) {
		cl::STRING_CLASS strError = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(oclDevice);
		LR_LOG(deviceContext, "[OpenCL device::" << deviceName << "] BVH compilation error:\n" << strError.c_str());

		// Re-throw the original exception object instead of a copy
		throw;
	}

	// NOTE(review): presumably `kernel` was initialised by the OpenCLKernel
	// base constructor before this delete - confirm.
	delete kernel;
	kernel = new cl::Kernel(program, "Intersect");

	// One query is enough: both log lines report the same value
	kernel->getWorkGroupInfo<size_t>(oclDevice, CL_KERNEL_WORK_GROUP_SIZE, &workGroupSize);
	LR_LOG(deviceContext, "[OpenCL device::" << deviceName << "] BVH kernel work group size: " << workGroupSize);
	LR_LOG(deviceContext, "[OpenCL device::" << deviceName << "] Suggested work group size: " << workGroupSize);

	// A positive forced size on the device overrides the queried value
	if (device->GetForceWorkGroupSize() > 0) {
		workGroupSize = device->GetForceWorkGroupSize();
		LR_LOG(deviceContext, "[OpenCL device::" << deviceName << "] Forced work group size: " << workGroupSize);
	}
}
/**
 * Enumerates every OpenCL device on every platform and keeps the ones that
 * are both of an allowed type and enabled by selectString.
 *
 * @param selectString Empty to accept every allowed device; otherwise one
 *                     character per enumerated device (in enumeration order,
 *                     counting devices of disallowed types too), where '1'
 *                     selects the device.
 * @param allowGPU     Accept devices whose type includes CL_DEVICE_TYPE_GPU.
 * @param allowCPU     Accept devices whose type includes CL_DEVICE_TYPE_CPU.
 *
 * Calls exit(1) when selectString is too short for an allowed device or
 * when no device ends up selected.
 */
CLDeviceSelection::CLDeviceSelection(const QString selectString, bool allowGPU, bool allowCPU)
{
	VECTOR_CLASS<cl::Platform> platforms;
	CL_DETECT_ERROR(cl::Platform::get(&platforms));

	int selectIndex = 0;
	qDebug().nospace().noquote()<<"Making OpenCL device selection with selectstring='"<< selectString<< "', allowGPU="<<allowGPU<<", allowCPU="<<allowCPU<<"";

	for (size_t i = 0; i < platforms.size(); ++i) {
		qDebug().nospace().noquote()<<"Platform-" << i << ": " << QString::fromLocal8Bit(platforms[i].getInfo<CL_PLATFORM_VENDOR>().c_str());

		// Get the list of devices available on the platform
		VECTOR_CLASS<cl::Device> devices;
		platforms[i].getDevices(CL_DEVICE_TYPE_ALL, &devices);

		for (size_t j = 0; j < devices.size(); ++j) {
			bool selected = false;

			// CL_DEVICE_TYPE is a bitfield (a device may report e.g.
			// GPU | DEFAULT), so test the relevant bit instead of comparing
			// the whole value for equality.
			const auto type = devices[j].getInfo<CL_DEVICE_TYPE>();
			const bool typeAllowed =
					(allowGPU && ((type & CL_DEVICE_TYPE_GPU) != 0)) ||
					(allowCPU && ((type & CL_DEVICE_TYPE_CPU) != 0));

			if (typeAllowed) {
				if (selectString.length() == 0) {
					selected = true;
				} else {
					if (selectString.length() <= selectIndex) {
						qWarning()<< "OpenCL select devices string (opencl.devices.select) has the wrong length";
						exit(1);
					}
					if (selectString.at(selectIndex) == '1') {
						selected = true;
					}
				}
			}

			if (selected) {
				push_back(devices[j]);
			}
			qDebug().nospace()<< " |-- Device-"<<i <<"."<<j <<": "<< devices[j]<<(selected?" [SEL]":" [---]");

			// The index advances for every device, selected or not
			++selectIndex;
		}
	}

	if (size() == 0) {
		qWarning()<<"This program requires OpenCL enabled hardware. Unable to find any OpenCL GPU devices, so quitting";
		exit(1);
	}
}
void SetUpOpenCL() { //---------------------------------------------------------------------- // Compile kernel //---------------------------------------------------------------------- const std::string &kernelFileName = commandLineOpts["kernel"].as<std::string>(); OCLTOY_LOG("Compile OpenCL kernel: " << kernelFileName); // Read the kernel const std::string kernelSource = ReadSources(kernelFileName, "jugCLer"); // Create the kernel program cl::Device &oclDevice = selectedDevices[0]; cl::Context &oclContext = deviceContexts[0]; cl::Program program = cl::Program(oclContext, kernelSource); try { VECTOR_CLASS<cl::Device> buildDevice; buildDevice.push_back(oclDevice); program.build(buildDevice); } catch (cl::Error err) { cl::STRING_CLASS strError = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(oclDevice); OCLTOY_LOG("Kernel compilation error:\n" << strError.c_str()); throw err; } kernelsJugCLer = cl::Kernel(program, "render_gpu"); kernelsJugCLer.getWorkGroupInfo<size_t>(oclDevice, CL_KERNEL_WORK_GROUP_SIZE, &kernelsWorkGroupSize); if (commandLineOpts.count("workgroupsize")) kernelsWorkGroupSize = commandLineOpts["workgroupsize"].as<size_t>(); OCLTOY_LOG("Using workgroup size: " << kernelsWorkGroupSize); //---------------------------------------------------------------------- // Allocate buffer //---------------------------------------------------------------------- AllocateBuffers(); //---------------------------------------------------------------------- // Set kernel arguments //---------------------------------------------------------------------- kernelsJugCLer.setArg(0, *sceneBuff); kernelsJugCLer.setArg(1, *pixelsBuff); }
/**
 * Builds src for the device with the "-I." option and creates the named
 * kernel from the resulting program.
 *
 * @param ctx        OpenCL context the program is created in.
 * @param device     Device the program is built for.
 * @param src        OpenCL C source text.
 * @param kernelName Name of the kernel entry point to instantiate.
 * @param kernel     Receives a heap-allocated cl::Kernel.
 * @throws cl::Error when the program fails to build (the build log is
 *         logged first).
 */
void OpenCLPixelDevice::CompileKernel(cl::Context &ctx, cl::Device &device, const std::string &src,
		const char *kernelName, cl::Kernel **kernel) {
	// Compile sources
	cl::Program::Sources source(1, std::make_pair(src.c_str(), src.length()));
	cl::Program program = cl::Program(ctx, source);
	try {
		VECTOR_CLASS<cl::Device> buildDevice;
		buildDevice.push_back(device);
		program.build(buildDevice, "-I.");
	} catch (const cl::Error &) {
		cl::STRING_CLASS strError = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device);
		LR_LOG(deviceContext, "[OpenCL device::" << deviceName << "] " << kernelName << " compilation error:\n" << strError.c_str());

		// Re-throw the original exception object instead of a copy
		throw;
	}

	*kernel = new cl::Kernel(program, kernelName);
}
/**
 * Sets up the context state, collects the descriptions of the available
 * devices (native threads and, unless LUXRAYS_DISABLE_OPENCL is defined,
 * OpenCL devices) and logs a summary of each one.
 *
 * @param handler             Debug handler stored in the context.
 * @param openclPlatformIndex Negative to use every OpenCL platform, otherwise
 *                            the index of the single platform to use.
 * @param verb                Stored as the verbose flag.
 * @throws std::runtime_error when a non-negative platform index does not
 *                            refer to an existing platform.
 */
Context::Context(LuxRaysDebugHandler handler, const int openclPlatformIndex,
		const bool verb) {
	debugHandler = handler;
	currentDataSet = NULL;
	started = false;
	verbose = verb;

	// Native thread devices are always described
	NativeThreadDeviceDescription::AddDeviceDescs(deviceDescriptions);

#if !defined(LUXRAYS_DISABLE_OPENCL)
	// Enumerate and log the OpenCL platforms
	VECTOR_CLASS<cl::Platform> platforms;
	cl::Platform::get(&platforms);
	const size_t platformCount = platforms.size();
	for (size_t p = 0; p < platformCount; ++p) {
		LR_LOG(this, "OpenCL Platform " << p << ": " << platforms[p].getInfo<CL_PLATFORM_VENDOR>().c_str());
	}

	if (openclPlatformIndex >= 0) {
		// A specific platform was requested: it has to exist
		if ((platformCount == 0) || (openclPlatformIndex >= (int)platformCount))
			throw std::runtime_error("Unable to find an appropriate OpenCL platform");

		OpenCLDeviceDescription::AddDeviceDescs(
			platforms[openclPlatformIndex], DEVICE_TYPE_OPENCL_ALL,
			deviceDescriptions);
	} else if (platformCount == 0) {
		LR_LOG(this, "No OpenCL platform available");
	} else {
		// No preference: take the devices of every platform
		for (size_t p = 0; p < platformCount; ++p) {
			OpenCLDeviceDescription::AddDeviceDescs(
				platforms[p], DEVICE_TYPE_OPENCL_ALL,
				deviceDescriptions);
		}
	}
#endif

	// Log what was found
	const size_t deviceCount = deviceDescriptions.size();
	for (size_t d = 0; d < deviceCount; ++d) {
		DeviceDescription *desc = deviceDescriptions[d];

		LR_LOG(this, "Device " << d << " name: " << desc->GetName());
		LR_LOG(this, "Device " << d << " type: " << DeviceDescription::GetDeviceType(desc->GetType()));
		LR_LOG(this, "Device " << d << " compute units: " << desc->GetComputeUnits());
		LR_LOG(this, "Device " << d << " preferred float vector width: " << desc->GetNativeVectorWidthFloat());
		LR_LOG(this, "Device " << d << " max allocable memory: " << desc->GetMaxMemory() / (1024 * 1024) << "MBytes");
		LR_LOG(this, "Device " << d << " max allocable memory block size: " << desc->GetMaxMemoryAllocSize() / (1024 * 1024) << "MBytes");
	}
}
/**
 * Returns a program built for the device, reusing binaries kept in memory
 * from a previous compilation when available.
 *
 * The in-memory cache is looked up by kernelsParameters only.
 * NOTE(review): kernelSource is not part of the key, so presumably each
 * cache instance is used with a single source - confirm against callers.
 *
 * @param context           OpenCL context the program is created in.
 * @param device            Device the program is built for.
 * @param kernelsParameters Build options; also the cache key.
 * @param kernelSource      OpenCL C source, used on a cache miss.
 * @param cached            If not NULL, set to true when the program came
 *                          from cached binaries and false otherwise.
 * @param error             Forwarded to ForcedCompile() on a cache miss.
 * @return A heap-allocated program owned by the caller, or NULL when
 *         ForcedCompile() fails.
 * @throws cl::Error from the OpenCL calls made here; the program allocated
 *         by this call is released before the exception propagates.
 */
cl::Program *oclKernelVolatileCache::Compile(cl::Context &context, cl::Device& device,
		const std::string &kernelsParameters, const std::string &kernelSource,
		bool *cached, cl::STRING_CLASS *error) {
	// Check if the kernel is available in the cache
	std::map<std::string, cl::Program::Binaries>::iterator it = kernelCache.find(kernelsParameters);

	if (it == kernelCache.end()) {
		// It isn't available, compile the source
		cl::Program *program = ForcedCompile(
				context, device, kernelsParameters, kernelSource, error);
		if (!program)
			return NULL;

		try {
			// Obtain the binaries of the sources
			VECTOR_CLASS<char *> bins = program->getInfo<CL_PROGRAM_BINARIES>();
			assert (bins.size() == 1);
			VECTOR_CLASS<size_t> sizes = program->getInfo<CL_PROGRAM_BINARY_SIZES>();
			assert (sizes.size() == 1);

			if (sizes[0] > 0) {
				// Add the kernel to the cache; the copy is tracked in
				// `kernels` and referenced by the cached Binaries entry
				char *bin = new char[sizes[0]];
				memcpy(bin, bins[0], sizes[0]);
				kernels.push_back(bin);

				kernelCache[kernelsParameters] = cl::Program::Binaries(1, std::make_pair(bin, sizes[0]));
			}
		} catch (...) {
			// The caller never receives the program on this path
			delete program;
			throw;
		}

		if (cached)
			*cached = false;

		return program;
	} else {
		// Compile from the binaries
		VECTOR_CLASS<cl::Device> buildDevice;
		buildDevice.push_back(device);
		cl::Program *program = new cl::Program(context, buildDevice, it->second);
		try {
			program->build(buildDevice);
		} catch (...) {
			// Do not leak the program when the build fails
			delete program;
			throw;
		}

		if (cached)
			*cached = true;

		return program;
	}
}
void OpenCLPrinter::printPlatformAndDeviceInfo() { VECTOR_CLASS<cl::Platform> platforms; cl::Platform::get(&platforms); VECTOR_CLASS<cl::Device> devices; for(unsigned int i = 0; i < platforms.size(); i++) { printPlatformInfo(platforms[i]); platforms[i].getDevices(CL_DEVICE_TYPE_ALL, &devices); for(unsigned int j = 0; j < devices.size(); j++) { printDeviceInfo(devices[j]); } } print("Number of platforms", platforms.size()); print("Number of devices", devices.size()); }
/**
 * Returns a program built for the device, reusing a binary stored on disk
 * by a previous compilation when one exists.
 *
 * The cache file lives under
 * kernel_cache/<appName>/<platform vendor>/<device name>/<compute units>/
 * and is named after the hashes of kernelsParameters and kernelSource.
 *
 * @param context           OpenCL context the program is created in.
 * @param device            Device the program is built for.
 * @param kernelsParameters Build options (part of the cache file name).
 * @param kernelSource      OpenCL C source (part of the cache file name).
 * @param cached            If not NULL, set to true when the program was
 *                          created from the cached binary, false otherwise.
 * @param error             Forwarded to ForcedCompile() on a cache miss.
 * @return A heap-allocated program owned by the caller, or NULL when
 *         ForcedCompile() fails.
 * @throws std::runtime_error when the cache file cannot be written or read.
 * @throws cl::Error from the OpenCL calls made here. Resources allocated by
 *         this call are released before any exception propagates.
 */
cl::Program *oclKernelPersistentCache::Compile(cl::Context &context, cl::Device& device,
		const std::string &kernelsParameters, const std::string &kernelSource,
		bool *cached, cl::STRING_CLASS *error) {
	// Check if the kernel is available in the cache

	cl::Platform platform = device.getInfo<CL_DEVICE_PLATFORM>();
	std::string platformName = platform.getInfo<CL_PLATFORM_VENDOR>();
	std::string deviceName = device.getInfo<CL_DEVICE_NAME>();
	std::string deviceUnits = ToString(device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>());
	std::string kernelName = HashString(kernelsParameters) + "-" + HashString(kernelSource) + ".ocl";
	std::string dirName = "kernel_cache/" + appName + "/" + platformName + "/" + deviceName + "/" + deviceUnits;
	std::string fileName = dirName +"/" +kernelName;

	if (!boost::filesystem::exists(fileName)) {
		// It isn't available, compile the source
		cl::Program *program = ForcedCompile(
				context, device, kernelsParameters, kernelSource, error);
		if (!program)
			return NULL;

		try {
			// Obtain the binaries of the sources
			VECTOR_CLASS<char *> bins = program->getInfo<CL_PROGRAM_BINARIES>();
			assert (bins.size() == 1);
			VECTOR_CLASS<size_t> sizes = program->getInfo<CL_PROGRAM_BINARY_SIZES >();
			assert (sizes.size() == 1);

			// Create the file only if the binaries include something
			if (sizes[0] > 0) {
				// Add the kernel to the cache
				boost::filesystem::create_directories(dirName);
				BOOST_OFSTREAM file(fileName.c_str(), std::ios_base::out | std::ios_base::binary);
				file.write(bins[0], sizes[0]);

				// Check for errors. The message is built as a std::string:
				// the path has no fixed upper bound, so a fixed-size buffer
				// could be overrun.
				if (file.fail())
					throw std::runtime_error("Unable to write kernel file cache " + fileName);

				file.close();
			}
		} catch (...) {
			// The caller never receives the program on this path
			delete program;
			throw;
		}

		if (cached)
			*cached = false;

		return program;
	} else {
		const size_t kernelSize = boost::filesystem::file_size(fileName);

		if (kernelSize > 0) {
			char *kernelBin = new char[kernelSize];
			cl::Program *program = NULL;

			try {
				BOOST_IFSTREAM file(fileName.c_str(), std::ios_base::in | std::ios_base::binary);
				file.read(kernelBin, kernelSize);

				// Check for errors
				if (file.fail())
					throw std::runtime_error("Unable to read kernel file cache " + fileName);

				file.close();

				// Compile from the binaries
				VECTOR_CLASS<cl::Device> buildDevice;
				buildDevice.push_back(device);
				program = new cl::Program(context, buildDevice,
						cl::Program::Binaries(1, std::make_pair(kernelBin, kernelSize)));
				program->build(buildDevice);
			} catch (...) {
				// Release everything allocated here before propagating
				delete program;
				delete[] kernelBin;
				throw;
			}

			if (cached)
				*cached = true;

			delete[] kernelBin;

			return program;
		} else {
			// Something wrong in the file, remove the file and retry
			boost::filesystem::remove(fileName);
			return Compile(context, device, kernelsParameters, kernelSource, cached, error);
		}
	}
}
/**
 * Prints one device property as "<name>: <value>\n" on stdout.
 *
 * The property name and the way the value is decoded are both looked up in
 * clDeviceInfoMetaInfos[info]; property kinds that have no case below print
 * only the name.
 *
 * @param device Device to query.
 * @param info   Property to query and print.
 */
void printDeviceInfo(const cl::Device &device, cl_device_info info) {
    if (!initialized) {
        // NOTE(review): execution continues after this message, as before;
        // presumably the lookup table is not populated in this state - confirm.
        printf("not initialized. call initCLUtility().");
    }

    printf("%s: ", clDeviceInfoMetaInfos[info].name);

    switch (clDeviceInfoMetaInfos[info].infoType) {
        case CLDeviceInfoType_bool: {
            cl_bool val;
            device.getInfo(info, &val);
            // Never pass data as the format string itself
            printf("%s", val != 0 ? "YES" : "NO");
            break;
        }
        case CLDeviceInfoType_uint: {
            cl_uint val;
            device.getInfo(info, &val);
            printf("%u", val);
            break;
        }
        case CLDeviceInfoType_ulong: {
            cl_ulong val;
            device.getInfo(info, &val);
            // Cast so the argument matches %llu on every platform
            printf("%llu", static_cast<unsigned long long>(val));
            break;
        }
        case CLDeviceInfoType_size_t: {
            size_t val;
            device.getInfo(info, &val);
            // size_t is not `unsigned long` everywhere (e.g. 64-bit Windows)
            printf("%llu", static_cast<unsigned long long>(val));
            break;
        }
        case CLDeviceInfoType_string: {
            std::string val;
            device.getInfo(info, &val);
            printf("%s", val.c_str());
            break;
        }
        case CLDeviceInfoType_size_t_vec: {
            VECTOR_CLASS<size_t> val;
            device.getInfo(info, &val);
            // Comma-separated list. Writing the separator before every
            // element but the first is safe for an empty vector, unlike
            // `size() - 1` (unsigned wrap-around) and back().
            for (size_t i = 0; i < val.size(); ++i) {
                if (i > 0)
                    printf(", ");
                printf("%llu", static_cast<unsigned long long>(val[i]));
            }
            break;
        }
        case CLDeviceInfoType_device_id: {
            cl_device_id val;
            device.getInfo(info, &val);
            // Handles are printed as zero-padded hexadecimal numbers
            printf("%#018llx", static_cast<unsigned long long>(reinterpret_cast<uintptr_t>(val)));
            break;
        }
        case CLDeviceInfoType_platform_id: {
            cl_platform_id val;
            device.getInfo(info, &val);
            printf("%#018llx", static_cast<unsigned long long>(reinterpret_cast<uintptr_t>(val)));
            break;
        }
        case CLDeviceInfoType_device_type: {
            cl_device_type val;
            device.getInfo(info, &val);
            switch (val) {
                case CL_DEVICE_TYPE_CPU:
                    printf("CPU");
                    break;
                case CL_DEVICE_TYPE_GPU:
                    printf("GPU");
                    break;
                case CL_DEVICE_TYPE_ACCELERATOR:
                    printf("Accelerator");
                    break;
                case CL_DEVICE_TYPE_CUSTOM:
                    printf("Custom");
                    break;
                default:
                    break;
            }
            break;
        }
        case CLDeviceInfoType_device_fp_config: {
            cl_device_fp_config val;
            device.getInfo(info, &val);
            // Bitfield: print the name of every flag that is set
            if ((val & CL_FP_DENORM) != 0)
                printf("CL_FP_DENORM ");
            if ((val & CL_FP_INF_NAN) != 0)
                printf("CL_FP_INF_NAN ");
            if ((val & CL_FP_ROUND_TO_NEAREST) != 0)
                printf("CL_FP_ROUND_TO_NEAREST ");
            if ((val & CL_FP_ROUND_TO_ZERO) != 0)
                printf("CL_FP_ROUND_TO_ZERO ");
            if ((val & CL_FP_ROUND_TO_INF) != 0)
                printf("CL_FP_ROUND_TO_INF ");
            if ((val & CL_FP_FMA) != 0)
                printf("CL_FP_FMA ");
            if ((val & CL_FP_SOFT_FLOAT) != 0)
                printf("CL_FP_SOFT_FLOAT ");
            if ((val & CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT) != 0)
                printf("CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT ");
            break;
        }
        case CLDeviceInfoType_device_local_mem_type: {
            cl_device_local_mem_type val;
            device.getInfo(info, &val);
            switch (val) {
                case CL_LOCAL:
                    printf("Local");
                    break;
                case CL_GLOBAL:
                    printf("Global");
                    break;
                default:
                    break;
            }
            break;
        }
        case CLDeviceInfoType_device_mem_cache_type: {
            cl_device_mem_cache_type val;
            device.getInfo(info, &val);
            switch (val) {
                case CL_NONE:
                    printf("None");
                    break;
                case CL_READ_ONLY_CACHE:
                    printf("Read Only Cache");
                    break;
                case CL_READ_WRITE_CACHE:
                    printf("Read Write Cache");
                    break;
                default:
                    break;
            }
            break;
        }
        case CLDeviceInfoType_device_exec_capabilities: {
            cl_device_exec_capabilities val;
            device.getInfo(info, &val);
            if ((val & CL_EXEC_KERNEL) != 0)
                printf("CL_EXEC_KERNEL ");
            if ((val & CL_EXEC_NATIVE_KERNEL) != 0)
                printf("CL_EXEC_NATIVE_KERNEL ");
            break;
        }
        case CLDeviceInfoType_command_queue_properties: {
            cl_command_queue_properties val;
            device.getInfo(info, &val);
            if ((val & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) != 0)
                printf("CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE ");
            if ((val & CL_QUEUE_PROFILING_ENABLE) != 0)
                printf("CL_QUEUE_PROFILING_ENABLE ");
            break;
        }
        case CLDeviceInfoType_device_affinity_domain: {
            cl_device_affinity_domain val;
            device.getInfo(info, &val);
            if ((val & CL_DEVICE_AFFINITY_DOMAIN_NUMA) != 0)
                printf("CL_DEVICE_AFFINITY_DOMAIN_NUMA ");
            if ((val & CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE) != 0)
                printf("CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE ");
            if ((val & CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE) != 0)
                printf("CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE ");
            if ((val & CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE) != 0)
                printf("CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE ");
            if ((val & CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE) != 0)
                printf("CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE ");
            if ((val & CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE) != 0)
                printf("CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE ");
            break;
        }
        case CLDeviceInfoType_device_partition_property_vec: {
            VECTOR_CLASS<cl_device_partition_property> val;
            device.getInfo(info, &val);
            for (size_t i = 0; i < val.size(); ++i) {
                switch (val[i]) {
                    case CL_DEVICE_PARTITION_EQUALLY:
                        printf("CL_DEVICE_PARTITION_EQUALLY");
                        break;
                    case CL_DEVICE_PARTITION_BY_COUNTS:
                        printf("CL_DEVICE_PARTITION_BY_COUNTS");
                        break;
                    case CL_DEVICE_PARTITION_BY_COUNTS_LIST_END:
                        printf("CL_DEVICE_PARTITION_BY_COUNTS_LIST_END");
                        break;
                    case CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN:
                        printf("CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN");
                        break;
                    default:
                        break;
                }
                // Separator after every entry except the last one
                if (i + 1 < val.size())
                    printf(", ");
            }
            break;
        }
        default:
            break;
    }
    printf("\n");
}
/**
 * Sets up everything one rendering thread needs on its OpenCL device: the
 * context and command queue, the device buffers, the compiled program and
 * the kernels with their fixed arguments.
 *
 * The thread with index 0 additionally creates the context with OpenGL
 * sharing properties, the accumulation/tone-map frame buffers, the pixel
 * buffer object used for display and the kernels that operate on them.
 *
 * @param threadIndex Index of this thread (stored in `index`).
 * @param renderer    Owning renderer; the game level and compiled scene are
 *                    read through it.
 * @param device      OpenCL device used by this thread.
 */
OCLRendererThread::OCLRendererThread(const size_t threadIndex, OCLRenderer *renderer,
		cl::Device device) : index(threadIndex), renderer(renderer),
		dev(device), usedDeviceMemory(0) {
	const GameLevel &gameLevel(*(renderer->gameLevel));
	const unsigned int width = gameLevel.gameConfig->GetScreenWidth();
	const unsigned int height = gameLevel.gameConfig->GetScreenHeight();
	const CompiledScene &compiledScene(*(renderer->compiledScene));

	// A host-side frame buffer is only allocated when the renderer has more
	// than one render thread
	if (renderer->renderThread.size() > 1)
		cpuFrameBuffer = new FrameBuffer(width, height);
	else
		cpuFrameBuffer = NULL;

	//--------------------------------------------------------------------------
	// OpenCL setup
	//--------------------------------------------------------------------------

	// Allocate a context with the selected device
	VECTOR_CLASS<cl::Device> devices;
	devices.push_back(dev);
	cl::Platform platform = dev.getInfo<CL_DEVICE_PLATFORM>();

	// The first thread uses OpenCL/OpenGL interoperability
	if (index == 0) {
		// Platform specific context properties referring to the current GL
		// context
#if defined (__APPLE__)
		CGLContextObj kCGLContext = CGLGetCurrentContext();
		CGLShareGroupObj kCGLShareGroup = CGLGetShareGroup(kCGLContext);
		cl_context_properties cps[] = {
			CL_CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE, (cl_context_properties)kCGLShareGroup,
			0
		};
#else
#ifdef WIN32
		cl_context_properties cps[] = {
			CL_GL_CONTEXT_KHR, (intptr_t)wglGetCurrentContext(),
			CL_WGL_HDC_KHR, (intptr_t)wglGetCurrentDC(),
			CL_CONTEXT_PLATFORM, (cl_context_properties)platform(),
			0
		};
#else
		cl_context_properties cps[] = {
			CL_GL_CONTEXT_KHR, (intptr_t)glXGetCurrentContext(),
			CL_GLX_DISPLAY_KHR, (intptr_t)glXGetCurrentDisplay(),
			CL_CONTEXT_PLATFORM, (cl_context_properties)platform(),
			0
		};
#endif
#endif

		ctx = new cl::Context(devices, cps);
	} else
		ctx = new cl::Context(devices);

	// Allocate the queue for this device
	cmdQueue = new cl::CommandQueue(*ctx, dev);

	//--------------------------------------------------------------------------
	// Allocate the buffers
	//--------------------------------------------------------------------------

	// Start from a known state: every buffer pointer is NULL until allocated
	passFrameBuffer = NULL;
	tmpFrameBuffer = NULL;
	frameBuffer = NULL;
	toneMapFrameBuffer = NULL;
	bvhBuffer = NULL;
	gpuTaskBuffer = NULL;
	cameraBuffer = NULL;
	infiniteLightBuffer = NULL;
	matBuffer = NULL;
	matIndexBuffer = NULL;
	texMapBuffer = NULL;
	texMapRGBBuffer = NULL;
	texMapInstanceBuffer = NULL;
	bumpMapInstanceBuffer = NULL;

	AllocOCLBufferRW(&passFrameBuffer, sizeof(Pixel) * width * height, "Pass FrameBuffer");
	AllocOCLBufferRW(&tmpFrameBuffer, sizeof(Pixel) * width * height, "Temporary FrameBuffer");
	// Only the first thread owns the final and tone mapped frame buffers
	if (index == 0) {
		AllocOCLBufferRW(&frameBuffer, sizeof(Pixel) * width * height, "FrameBuffer");
		AllocOCLBufferRW(&toneMapFrameBuffer, sizeof(Pixel) * width * height, "ToneMap FrameBuffer");
	}

	AllocOCLBufferRW(&gpuTaskBuffer, sizeof(ocl_kernels::GPUTask) * width * height, "GPUTask");
	AllocOCLBufferRO(&cameraBuffer, sizeof(compiledscene::Camera), "Camera");
	AllocOCLBufferRO(&infiniteLightBuffer, (void *)(gameLevel.scene->infiniteLight->GetTexture()->GetTexMap()->GetPixels()),
			sizeof(Spectrum) * gameLevel.scene->infiniteLight->GetTexture()->GetTexMap()->GetWidth() *
			gameLevel.scene->infiniteLight->GetTexture()->GetTexMap()->GetHeight(), "Inifinite Light");

	AllocOCLBufferRO(&matBuffer, (void *)(&compiledScene.mats[0]),
			sizeof(compiledscene::Material) * compiledScene.mats.size(), "Materials");
	AllocOCLBufferRO(&matIndexBuffer, (void *)(&compiledScene.sphereMats[0]),
			sizeof(unsigned int) * compiledScene.sphereMats.size(), "Material Indices");

	// Texture related buffers exist only when the scene has texture maps;
	// texMapBuffer staying NULL is used further down as "no texture maps"
	if (compiledScene.texMaps.size() > 0) {
		AllocOCLBufferRO(&texMapBuffer, (void *)(&compiledScene.texMaps[0]),
				sizeof(compiledscene::TexMap) * compiledScene.texMaps.size(), "Texture Maps");
		AllocOCLBufferRO(&texMapRGBBuffer, (void *)(compiledScene.rgbTexMem),
				sizeof(Spectrum) * compiledScene.totRGBTexMem, "Texture Map Images");
		AllocOCLBufferRO(&texMapInstanceBuffer, (void *)(&compiledScene.sphereTexs[0]),
				sizeof(compiledscene::TexMapInstance) * compiledScene.sphereTexs.size(), "Texture Map Instances");

		if (compiledScene.sphereBumps.size() > 0)
			AllocOCLBufferRO(&bumpMapInstanceBuffer, (void *)(&compiledScene.sphereBumps[0]),
					sizeof(compiledscene::BumpMapInstance) * compiledScene.sphereBumps.size(), "Bump Map Instances");
	}

	SFERA_LOG("[OCLRenderer] Total OpenCL device memory used: " << fixed << setprecision(2) << usedDeviceMemory / (1024 * 1024) << "Mbytes");

	if (index == 0) {
		//--------------------------------------------------------------------------
		// Create pixel buffer object for display
		//--------------------------------------------------------------------------

		// 4 bytes per pixel; the GL buffer is then wrapped as an OpenCL buffer
		glGenBuffersARB(1, &pbo);
		glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
		glBufferDataARB(GL_PIXEL_UNPACK_BUFFER_ARB, width * height *
				sizeof(GLubyte) * 4, 0, GL_STREAM_DRAW_ARB);
		glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
		pboBuff = new cl::BufferGL(*ctx, CL_MEM_READ_WRITE, pbo);
	}

	//--------------------------------------------------------------------------
	// Compile the kernel source
	//--------------------------------------------------------------------------

	// Set #define symbols: the scene/configuration values are passed to the
	// kernels as -D build options
	stringstream ss;
	ss.precision(6);
	ss << scientific <<
			" -D PARAM_SCREEN_WIDTH=" << width <<
			" -D PARAM_SCREEN_HEIGHT=" << height <<
			// NOTE(review): the text between the next two string literals
			// appears to have been masked ("******") in this copy of the
			// file, so the expression that supplied the
			// PARAM_SCREEN_SAMPLEPERPASS value is not visible here. It has to
			// be restored from the upstream source before this compiles.
			" -D PARAM_SCREEN_SAMPLEPERPASS="******" -D PARAM_RAY_EPSILON=" << EPSILON << "f" <<
			" -D PARAM_MAX_DIFFUSE_BOUNCE=" << gameLevel.maxPathDiffuseBounces <<
			" -D PARAM_MAX_SPECULARGLOSSY_BOUNCE=" << gameLevel.maxPathSpecularGlossyBounces <<
			" -D PARAM_IL_SHIFT_U=" << gameLevel.scene->infiniteLight->GetShiftU() << "f" <<
			" -D PARAM_IL_SHIFT_V=" << gameLevel.scene->infiniteLight->GetShiftV() << "f" <<
			" -D PARAM_IL_GAIN_R=" << gameLevel.scene->infiniteLight->GetGain().r << "f" <<
			" -D PARAM_IL_GAIN_G=" << gameLevel.scene->infiniteLight->GetGain().g << "f" <<
			" -D PARAM_IL_GAIN_B=" << gameLevel.scene->infiniteLight->GetGain().b << "f" <<
			" -D PARAM_IL_MAP_WIDTH=" << gameLevel.scene->infiniteLight->GetTexture()->GetTexMap()->GetWidth() <<
			" -D PARAM_IL_MAP_HEIGHT=" << gameLevel.scene->infiniteLight->GetTexture()->GetTexMap()->GetHeight() <<
			" -D PARAM_GAMMA=" << gameLevel.toneMap->GetGamma() << "f" <<
			" -D PARAM_MEM_TYPE=" << gameLevel.gameConfig->GetOpenCLMemType();

	// One symbol per material type flagged as enabled in the compiled scene
	if (compiledScene.enable_MAT_MATTE)
		ss << " -D PARAM_ENABLE_MAT_MATTE";
	if (compiledScene.enable_MAT_MIRROR)
		ss << " -D PARAM_ENABLE_MAT_MIRROR";
	if (compiledScene.enable_MAT_GLASS)
		ss << " -D PARAM_ENABLE_MAT_GLASS";
	if (compiledScene.enable_MAT_METAL)
		ss << " -D PARAM_ENABLE_MAT_METAL";
	if (compiledScene.enable_MAT_ALLOY)
		ss << " -D PARAM_ENABLE_MAT_ALLOY";

	if (texMapBuffer) {
		ss << " -D PARAM_HAS_TEXTUREMAPS";

		if (compiledScene.sphereBumps.size() > 0)
			ss << " -D PARAM_HAS_BUMPMAPS";
	}

	// Both tone map types define PARAM_TM_LINEAR_SCALE; REINHARD02 uses a
	// fixed scale of 1.0
	switch (gameLevel.toneMap->GetType()) {
		case TONEMAP_REINHARD02:
			ss << " -D PARAM_TM_LINEAR_SCALE=1.0f";
			break;
		case TONEMAP_LINEAR: {
			LinearToneMap *tm = (LinearToneMap *)gameLevel.toneMap;
			ss << " -D PARAM_TM_LINEAR_SCALE=" << tm->scale << "f";
			break;
		}
		default:
			assert (false);
	}

#if defined(__APPLE__)
	ss << " -D __APPLE__";
#endif

	SFERA_LOG("[OCLRenderer] Defined symbols: " << ss.str());
	SFERA_LOG("[OCLRenderer] Compiling kernels");

	cl::Program::Sources source(1, std::make_pair(KernelSource_kernel_core.c_str(), KernelSource_kernel_core.length()));
	cl::Program program = cl::Program(*ctx, source);
	try {
		VECTOR_CLASS<cl::Device> buildDevice;
		buildDevice.push_back(dev);
		program.build(buildDevice, ss.str().c_str());
	} catch (cl::Error err) {
		// Log the build log, then propagate the failure to the caller
		cl::STRING_CLASS strError = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(dev);
		SFERA_LOG("[OCLRenderer] Kernel compilation error:\n" << strError.c_str());

		throw err;
	}

	// "Init" is enqueued right away over the whole task buffer
	kernelInit = new cl::Kernel(program, "Init");
	kernelInit->setArg(0, *gpuTaskBuffer);
	cmdQueue->enqueueNDRangeKernel(*kernelInit, cl::NullRange,
			cl::NDRange(RoundUp<unsigned int>(width * height, WORKGROUP_SIZE)),
			cl::NDRange(WORKGROUP_SIZE));

	// "InitFB" is first run on the final frame buffer (thread 0 only) and is
	// then left bound to the pass frame buffer
	kernelInitFrameBuffer = new cl::Kernel(program, "InitFB");
	if (index == 0) {
		kernelInitFrameBuffer->setArg(0, *frameBuffer);
		cmdQueue->enqueueNDRangeKernel(*kernelInitFrameBuffer, cl::NullRange,
			cl::NDRange(RoundUp<unsigned int>(width * height, WORKGROUP_SIZE)),
			cl::NDRange(WORKGROUP_SIZE));
	}
	kernelInitFrameBuffer->setArg(0, *passFrameBuffer);

	kernelPathTracing = new cl::Kernel(program, "PathTracing");
	unsigned int argIndex = 0;
	kernelPathTracing->setArg(argIndex++, *gpuTaskBuffer);
	// Argument 1 is skipped here - presumably it is set elsewhere before the
	// kernel is launched (TODO confirm)
	argIndex++;
	kernelPathTracing->setArg(argIndex++, *cameraBuffer);
	kernelPathTracing->setArg(argIndex++, *infiniteLightBuffer);
	kernelPathTracing->setArg(argIndex++, *passFrameBuffer);
	kernelPathTracing->setArg(argIndex++, *matBuffer);
	kernelPathTracing->setArg(argIndex++, *matIndexBuffer);
	// The texture arguments are bound under the same conditions that define
	// PARAM_HAS_TEXTUREMAPS / PARAM_HAS_BUMPMAPS above
	if (texMapBuffer) {
		kernelPathTracing->setArg(argIndex++, *texMapBuffer);
		kernelPathTracing->setArg(argIndex++, *texMapRGBBuffer);
		kernelPathTracing->setArg(argIndex++, *texMapInstanceBuffer);
		if (compiledScene.sphereBumps.size() > 0)
			kernelPathTracing->setArg(argIndex++, *bumpMapInstanceBuffer);
	}

	// Filter kernels come in X/Y pairs: the X kernel is bound pass -> tmp and
	// the Y kernel tmp -> pass
	kernelApplyBlurLightFilterXR1 = new cl::Kernel(program, "ApplyBlurLightFilterXR1");
	kernelApplyBlurLightFilterXR1->setArg(0, *passFrameBuffer);
	kernelApplyBlurLightFilterXR1->setArg(1, *tmpFrameBuffer);

	kernelApplyBlurLightFilterYR1 = new cl::Kernel(program, "ApplyBlurLightFilterYR1");
	kernelApplyBlurLightFilterYR1->setArg(0, *tmpFrameBuffer);
	kernelApplyBlurLightFilterYR1->setArg(1, *passFrameBuffer);

	kernelApplyBlurHeavyFilterXR1 = new cl::Kernel(program, "ApplyBlurHeavyFilterXR1");
	kernelApplyBlurHeavyFilterXR1->setArg(0, *passFrameBuffer);
	kernelApplyBlurHeavyFilterXR1->setArg(1, *tmpFrameBuffer);

	kernelApplyBlurHeavyFilterYR1 = new cl::Kernel(program, "ApplyBlurHeavyFilterYR1");
	kernelApplyBlurHeavyFilterYR1->setArg(0, *tmpFrameBuffer);
	kernelApplyBlurHeavyFilterYR1->setArg(1, *passFrameBuffer);

	kernelApplyBoxFilterXR1 = new cl::Kernel(program, "ApplyBoxFilterXR1");
	kernelApplyBoxFilterXR1->setArg(0, *passFrameBuffer);
	kernelApplyBoxFilterXR1->setArg(1, *tmpFrameBuffer);

	kernelApplyBoxFilterYR1 = new cl::Kernel(program, "ApplyBoxFilterYR1");
	kernelApplyBoxFilterYR1->setArg(0, *tmpFrameBuffer);
	kernelApplyBoxFilterYR1->setArg(1, *passFrameBuffer);

	// Blend, tone map and display kernels only exist on the first thread
	if (index == 0) {
		kernelBlendFrame = new cl::Kernel(program, "BlendFrame");
		kernelBlendFrame->setArg(0, *passFrameBuffer);
		kernelBlendFrame->setArg(1, *frameBuffer);

		kernelToneMapLinear = new cl::Kernel(program, "ToneMapLinear");
		kernelToneMapLinear->setArg(0, *frameBuffer);
		kernelToneMapLinear->setArg(1, *toneMapFrameBuffer);

		kernelUpdatePixelBuffer = new cl::Kernel(program, "UpdatePixelBuffer");
		kernelUpdatePixelBuffer->setArg(0, *toneMapFrameBuffer);
		kernelUpdatePixelBuffer->setArg(1, *pboBuff);
	} else {
		kernelBlendFrame = NULL;
		kernelToneMapLinear = NULL;
		kernelUpdatePixelBuffer = NULL;
	}
}
/**
 * Creates the OpenCL context and profiling-enabled command queue for the
 * device, builds the kernel file, creates the "RadianceGPU" kernel, picks
 * the work group size and allocates the camera and scene buffers.
 *
 * @param device           OpenCL device to render on.
 * @param kernelFileName   File the kernel source is read from.
 * @param forceGPUWorkSize When > 0 and the device type is GPU, overrides the
 *                         queried work group size.
 * @param camera           Host camera data backing the camera buffer.
 * @param spheres          Host sphere array backing the scene buffer.
 * @param sceneSphereCount Number of entries in `spheres`.
 * @throws cl::Error when the program fails to build (the build log is
 *         written to stderr first).
 */
RenderDevice::RenderDevice(const cl::Device &device, const string &kernelFileName,
		const unsigned int forceGPUWorkSize,
		Camera *camera, Sphere *spheres, const unsigned int sceneSphereCount/*,
		boost::barrier *startBarrier, boost::barrier *endBarrier*/) :
	/*renderThread(NULL), threadStartBarrier(startBarrier), threadEndBarrier(endBarrier),*/
	sphereCount(sceneSphereCount), colorBuffer(NULL), pixelBuffer(NULL), seedBuffer(NULL),
	pixels(NULL), colors(NULL), seeds(NULL), exeUnitCount(0.0), exeTime(0.0) {
	deviceName = "anonymouse";//device.getInfo<CL_DEVICE_NAME > ().c_str();

	// Allocate a context with the selected device
	cl::Platform platform = device.getInfo<CL_DEVICE_PLATFORM>();
	VECTOR_CLASS<cl::Device> devices;
	devices.push_back(device);
	cl_context_properties cps[3] = {
		CL_CONTEXT_PLATFORM, (cl_context_properties)platform(), 0
	};
	context = new cl::Context(devices, cps);

	// Allocate the queue for this device
	cl_command_queue_properties prop = CL_QUEUE_PROFILING_ENABLE;
	queue = new cl::CommandQueue(*context, device, prop);

	// Create the kernel
	string src = ReadSources(kernelFileName);

	// Compile sources
	cl::Program::Sources source(1, make_pair(src.c_str(), src.length()));
	cl::Program program = cl::Program(*context, source);
	try {
		VECTOR_CLASS<cl::Device> buildDevice;
		buildDevice.push_back(device);
#if defined(__EMSCRIPTEN__)
		program.build(buildDevice, "");
#elif defined(__APPLE__)
		program.build(buildDevice, "-D__APPLE__");
#else
		program.build(buildDevice, "");
#endif
		cl::string result = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device);
		cerr << "[Device::" << deviceName << "]" << " Compilation result: " << result.c_str() << endl;
	} catch (const cl::Error &) {
		cl::string strError = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device);
		cerr << "[Device::" << deviceName << "]" << " Compilation error:" << endl << strError.c_str() << endl;

		// The destructor does not run for an object whose constructor
		// throws, so release what was allocated above before propagating.
		delete queue;
		queue = NULL;
		delete context;
		context = NULL;

		// Re-throw the original exception object instead of a copy
		throw;
	}

	kernel = new cl::Kernel(program, "RadianceGPU");

	kernel->getWorkGroupInfo<size_t>(device, CL_KERNEL_WORK_GROUP_SIZE, &workGroupSize);
	cerr << "[Device::" << deviceName << "]" << " Suggested work group size: " << workGroupSize << endl;

	// Force workgroup size if applicable and required
	if ((forceGPUWorkSize > 0) && (device.getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_GPU)) {
		workGroupSize = forceGPUWorkSize;
		cerr << "[Device::" << deviceName << "]" << " Forced work group size: " << workGroupSize << endl;
	}

	// Create the thread for the rendering
	//renderThread = new boost::thread(boost::bind(RenderDevice::RenderThread, this));

	// Create camera buffer
	// NOTE(review): in the __APPLE__ branches a host pointer is passed
	// without CL_MEM_USE_HOST_PTR/CL_MEM_COPY_HOST_PTR; presumably the data
	// is uploaded separately elsewhere - confirm.
	cameraBuffer = new cl::Buffer(*context,
#if defined (__APPLE__)
			CL_MEM_READ_ONLY, // CL_MEM_USE_HOST_PTR is very slow with Apple's OpenCL
#else
			CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
#endif
			sizeof(Camera),
			camera);
	cerr << "[Device::" << deviceName << "] Camera buffer size: " << (sizeof(Camera) / 1024) << "Kb" << endl;

	sphereBuffer = new cl::Buffer(*context,
#if defined (__APPLE__)
			CL_MEM_READ_ONLY, // CL_MEM_USE_HOST_PTR is very slow with Apple's OpenCL
#else
			CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
#endif
			sizeof(Sphere) * sphereCount,
			spheres);
	cerr << "[Device::" << deviceName << "] Scene buffer size: " << (sizeof(Sphere) * sphereCount / 1024) << "Kb" << endl;
}
int main(int argc, char* argv[]) { cl_int retCode; // get platforms const cl_platform_info PLATFORM_INFOS[] = { CL_PLATFORM_NAME, CL_PLATFORM_VENDOR, CL_PLATFORM_PROFILE, CL_PLATFORM_VERSION }; VECTOR_CLASS<Platform> platforms; retCode = Platform::get(&platforms); if (retCode != CL_SUCCESS) die("failed to get platforms"); std::cout << "platforms found: " << platforms.size() << std::endl; for (uint i = 0; i < platforms.size(); i++) { std::cout << "platform " << i+1 << ": "; for (uint j = 0; j < (sizeof(PLATFORM_INFOS) / sizeof(cl_platform_info)); j++) { STRING_CLASS info; retCode = platforms[i].getInfo(PLATFORM_INFOS[j], &info); if (retCode != CL_SUCCESS) die("failed to get platform info"); std::cout << info << " "; } std::cout << std::endl; // get devices const cl_device_info DEVICE_INFOS[] = { CL_DEVICE_NAME, CL_DEVICE_VENDOR, CL_DEVICE_PROFILE, CL_DEVICE_VERSION, CL_DRIVER_VERSION, CL_DEVICE_OPENCL_C_VERSION }; VECTOR_CLASS<Device> devices; retCode = platforms[i].getDevices(CL_DEVICE_TYPE_ALL, &devices); if (retCode != CL_SUCCESS) die(" failed to get devices"); std::cout << " devices found: " << devices.size() << std::endl; for (uint j = 0; j < devices.size(); j++) { std::cout << " device " << j+1 << ": "; for (uint k = 0; k < (sizeof(DEVICE_INFOS) / sizeof(cl_device_info)); k++) { STRING_CLASS info; retCode = devices[j].getInfo(DEVICE_INFOS[k], &info); if (retCode != CL_SUCCESS) die(" failed to get device info"); std::cout << info << " "; } cl_ulong memSize; retCode = devices[j].getInfo(CL_DEVICE_GLOBAL_MEM_SIZE, &memSize); if (retCode != CL_SUCCESS) die(" failed to get device info"); std::cout << "GlobalMemSize:" << memSize / 1024 / 1024 << "MB "; retCode = devices[j].getInfo(CL_DEVICE_LOCAL_MEM_SIZE, &memSize); if (retCode != CL_SUCCESS) die(" failed to get device info"); std::cout << "LocalMemSize:" << memSize / 1024 << "KB "; std::cout << std::endl; } } }
void OCLRendererThread::DrawFrame() { const GameLevel &gameLevel(*(renderer->gameLevel)); const GameConfig &gameConfig(*(gameLevel.gameConfig)); const unsigned int width = gameConfig.GetScreenWidth(); const unsigned int height = gameConfig.GetScreenHeight(); //-------------------------------------------------------------------------- // Merge all the framebuffers if required //-------------------------------------------------------------------------- const size_t threadCount = renderer->renderThread.size(); if (threadCount > 1) { vector<Pixel *> cpuFrameBuffers(threadCount); for (size_t i = 0; i < threadCount; ++i) cpuFrameBuffers[i] = renderer->renderThread[i]->cpuFrameBuffer->GetPixels(); Pixel *dst = cpuFrameBuffers[0]; for (size_t i = 0; i < width * height; ++i) { float r = dst->r; float g = dst->g; float b = dst->b; for (size_t j = 1; j < threadCount; ++j) { r += cpuFrameBuffers[j]->r; g += cpuFrameBuffers[j]->g; b += cpuFrameBuffers[j]->b; cpuFrameBuffers[j] += 1; } dst->r = r; dst->g = g; dst->b = b; ++dst; } cmdQueue->enqueueWriteBuffer(*passFrameBuffer, CL_FALSE, 0, sizeof(Pixel) * width * height, cpuFrameBuffers[0]); } //-------------------------------------------------------------------------- // Blend the new frame with the old one //-------------------------------------------------------------------------- kernelBlendFrame->setArg(2, renderer->blendFactor); cmdQueue->enqueueNDRangeKernel(*kernelBlendFrame, cl::NullRange, cl::NDRange(RoundUp<unsigned int>(width * height, WORKGROUP_SIZE)), cl::NDRange(WORKGROUP_SIZE)); //-------------------------------------------------------------------------- // Tone mapping //-------------------------------------------------------------------------- switch (gameLevel.toneMap->GetType()) { case TONEMAP_REINHARD02: case TONEMAP_LINEAR: cmdQueue->enqueueNDRangeKernel(*kernelToneMapLinear, cl::NullRange, cl::NDRange(RoundUp<unsigned int>(width * height, WORKGROUP_SIZE)), cl::NDRange(WORKGROUP_SIZE)); break; default: 
assert (false); } //-------------------------------------------------------------------------- // Copy the OpenCL frame buffer to OpenGL one //-------------------------------------------------------------------------- VECTOR_CLASS<cl::Memory> buffs; buffs.push_back(*pboBuff); cmdQueue->enqueueAcquireGLObjects(&buffs); cmdQueue->enqueueNDRangeKernel(*kernelUpdatePixelBuffer, cl::NullRange, cl::NDRange(RoundUp<unsigned int>(width * height, WORKGROUP_SIZE)), cl::NDRange(WORKGROUP_SIZE)); cmdQueue->enqueueReleaseGLObjects(&buffs); cmdQueue->finish(); // Draw the image on the screen glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); glDrawPixels(width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0); glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0); }
/**
 * Builds the compiled scene, enumerates all OpenCL platforms/devices (logging
 * their properties), selects devices according to the game configuration and
 * creates and starts one OCLRendererThread per selected device.
 *
 * Selection: a device is a candidate when GPU-only mode is off or the device
 * is a GPU. With an empty select string every candidate is taken; otherwise
 * the character at the device's running index (counted across all platforms)
 * must be '1'.
 *
 * Throws runtime_error when the select string is shorter than the index of a
 * candidate device, or when no device ends up selected.
 */
OCLRenderer::OCLRenderer(GameLevel *level) : LevelRenderer(level) {
	compiledScene = new CompiledScene(level);

	timeSinceLastCameraEdit = WallClockTime();
	timeSinceLastNoCameraEdit = timeSinceLastCameraEdit;

	//--------------------------------------------------------------------------
	// OpenCL setup
	//--------------------------------------------------------------------------

	vector<cl::Device> selectedDevices;

	// Scan all platforms and devices available
	VECTOR_CLASS<cl::Platform> platforms;
	cl::Platform::get(&platforms);

	const string &selectString = gameLevel->gameConfig->GetOpenCLDeviceSelect();
	// Position in selectString; advances once per device, selected or not
	size_t selectIndex = 0;

	for (size_t platformIdx = 0; platformIdx < platforms.size(); ++platformIdx) {
		cl::Platform &platform = platforms[platformIdx];
		SFERA_LOG("[OCLRenderer] OpenCL Platform " << platformIdx << ": " << platform.getInfo<CL_PLATFORM_VENDOR>());

		// Get the list of devices available on the platform
		VECTOR_CLASS<cl::Device> devices;
		platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);

		for (size_t deviceIdx = 0; deviceIdx < devices.size(); ++deviceIdx, ++selectIndex) {
			cl::Device &dev = devices[deviceIdx];

			SFERA_LOG("[OCLRenderer] OpenCL device " << deviceIdx << ": " << dev.getInfo<CL_DEVICE_NAME>());
			SFERA_LOG("[OCLRenderer] Type: " << OCLDeviceTypeString(dev.getInfo<CL_DEVICE_TYPE>()));
			SFERA_LOG("[OCLRenderer] Units: " << dev.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>());
			SFERA_LOG("[OCLRenderer] Global memory: " << dev.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>() / 1024 << "Kbytes");
			SFERA_LOG("[OCLRenderer] Local memory: " << dev.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>() / 1024 << "Kbytes");
			SFERA_LOG("[OCLRenderer] Local memory type: " << OCLLocalMemoryTypeString(dev.getInfo<CL_DEVICE_LOCAL_MEM_TYPE>()));
			SFERA_LOG("[OCLRenderer] Constant memory: " << dev.getInfo<CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE>() / 1024 << "Kbytes");

			// Device type filter first, then the optional per-device select string
			const bool candidate = !gameLevel->gameConfig->GetOpenCLUseOnlyGPUs() ||
					(dev.getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_GPU);

			bool selected = false;
			if (candidate) {
				if (selectString.length() == 0) {
					selected = true;
				} else {
					if (selectIndex >= selectString.length())
						throw runtime_error("OpenCL select devices string (opencl.devices.select) has the wrong length");

					selected = (selectString.at(selectIndex) == '1');
				}
			}

			if (!selected) {
				SFERA_LOG("[OCLRenderer] NOT SELECTED");
			} else {
				selectedDevices.push_back(dev);
				SFERA_LOG("[OCLRenderer] SELECTED");
			}
		}
	}

	const size_t deviceCount = selectedDevices.size();
	if (deviceCount == 0)
		throw runtime_error("Unable to find a OpenCL GPU device");

	// Create synchronization barrier: one participant per device plus one more
	barrier = new boost::barrier(deviceCount + 1);

	// Total samples per pass is the sum of the per-device configuration values
	totSamplePerPass = 0;
	for (size_t i = 0; i < deviceCount; ++i)
		totSamplePerPass += gameLevel->gameConfig->GetOpenCLDeviceSamplePerPass(i);

	// Create every thread first, start them only once all of them exist
	renderThread.resize(deviceCount, NULL);
	for (size_t i = 0; i < deviceCount; ++i)
		renderThread[i] = new OCLRendererThread(i, this, selectedDevices[i]);

	for (size_t i = 0; i < renderThread.size(); ++i)
		renderThread[i]->Start();
}
bool VNNclAlgorithm::reconstruct(ProcessedUSInputDataPtr input, vtkImageDataPtr outputData, float radius, int nClosePlanes) { mMeasurementNames.clear(); int numBlocks = 10; // FIXME? needs to be the same as the number of input bscans to the voxel_method kernel // Split input US into blocks // Splits and copies data from the processed input in the way the kernel will processes it, which is per frameBlock frameBlock_t* inputBlocks = new frameBlock_t[numBlocks]; size_t nPlanes_numberOfInputImages = input->getDimensions()[2]; this->initializeFrameBlocks(inputBlocks, numBlocks, input); // Allocate CL memory for each frame block VECTOR_CLASS<cl::Buffer> clBlocks; report("Allocating OpenCL input block buffers"); for (int i = 0; i < numBlocks; i++) { //TODO why does the context suddenly contain a "dummy" device? cl::Buffer buffer = mOulContex->createBuffer(mOulContex->getContext(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, inputBlocks[i].length, inputBlocks[i].data, "block buffer "+QString::number(i).toStdString()); clBlocks.push_back(buffer); } // Allocate output memory int *outputDims = outputData->GetDimensions(); size_t outputVolumeSize = outputDims[0] * outputDims[1] * outputDims[2] * sizeof(unsigned char); report(QString("Allocating CL output buffer, size %1").arg(outputVolumeSize)); cl_ulong globalMemUse = 10 * inputBlocks[0].length + outputVolumeSize + sizeof(float) * 16 * nPlanes_numberOfInputImages + sizeof(cl_uchar) * input->getDimensions()[0] * input->getDimensions()[1]; if(isUsingTooMuchMemory(outputVolumeSize, inputBlocks[0].length, globalMemUse)) return false; cl::Buffer outputBuffer = mOulContex->createBuffer(mOulContex->getContext(), CL_MEM_WRITE_ONLY, outputVolumeSize, NULL, "output volume buffer"); // Fill the plane matrices float *planeMatrices = new float[16 * nPlanes_numberOfInputImages]; //4x4 (matrix) = 16 this->fillPlaneMatrices(planeMatrices, input); cl::Buffer clPlaneMatrices = mOulContex->createBuffer(mOulContex->getContext(), CL_MEM_READ_ONLY | 
CL_MEM_COPY_HOST_PTR, nPlanes_numberOfInputImages * sizeof(float) * 16, planeMatrices, "plane matrices buffer"); // US Probe mask cl::Buffer clMask = mOulContex->createBuffer(mOulContex->getContext(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_uchar) * input->getMask()->GetDimensions()[0] * input->getMask()->GetDimensions()[1], input->getMask()->GetScalarPointer(), "mask buffer"); double *out_spacing = outputData->GetSpacing(); float spacings[2]; float f_out_spacings[3]; f_out_spacings[0] = out_spacing[0]; f_out_spacings[1] = out_spacing[1]; f_out_spacings[2] = out_spacing[2]; spacings[0] = input->getSpacing()[0]; spacings[1] = input->getSpacing()[1]; //TODO why 4? because float4 is used?? size_t planes_eqs_size = sizeof(cl_float)*4*nPlanes_numberOfInputImages; // Find the optimal local work size size_t local_work_size; unsigned int deviceNumber = 0; cl::Device device = mOulContex->getDevice(deviceNumber); mKernel.getWorkGroupInfo(device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, &local_work_size); size_t close_planes_size = this->calculateSpaceNeededForClosePlanes(mKernel, device, local_work_size, nPlanes_numberOfInputImages, nClosePlanes); this->setKernelArguments( mKernel, outputDims[0], outputDims[1], outputDims[2], f_out_spacings[0], f_out_spacings[1], f_out_spacings[2], input->getDimensions()[0], input->getDimensions()[1], spacings[0], spacings[1], clBlocks, outputBuffer, clPlaneMatrices, clMask, planes_eqs_size, close_planes_size, radius); report(QString("Using %1 as local workgroup size").arg(local_work_size)); // We will divide the work into cubes of CUBE_DIM^3 voxels. The global work size is the total number of voxels divided by that. 
int cube_dim = 4; int cube_dim_pow3 = cube_dim * cube_dim * cube_dim; // Global work items: size_t global_work_size = (((outputDims[0] + cube_dim) * (outputDims[1] + cube_dim) * (outputDims[2] + cube_dim)) / cube_dim_pow3); // = number of cubes = number of kernels to run // Round global_work_size up to nearest multiple of local_work_size if (global_work_size % local_work_size) global_work_size = ((global_work_size / local_work_size) + 1) * local_work_size; // ceil(...) unsigned int queueNumber = 0; cl::CommandQueue queue = mOulContex->getQueue(queueNumber); this->measureAndExecuteKernel(queue, mKernel, global_work_size, local_work_size, mKernelMeasurementName); this->measureAndReadBuffer(queue, outputBuffer, outputVolumeSize, outputData->GetScalarPointer(), "vnncl_read_buffer"); setDeepModified(outputData); // Cleaning up report(QString("Done, freeing GPU memory")); this->freeFrameBlocks(inputBlocks, numBlocks); delete[] inputBlocks; inputBlocks = NULL; return true; }
/**
 * Sets up an OpenCL context, command queue and a list of kernels.
 *
 * - Picks the first platform that reports at least one device of `type`.
 * - Creates a context for that platform and device type.
 * - Reads the program source from file `arq` and builds it with
 *   `buildOptions` for every device in the context.
 * - Creates the command queue on the device with the most compute units.
 * - Creates `n` kernels whose names are taken from the comma-separated list
 *   `nomeRot` and appends them to `rotina`.
 *
 * Failures are reported on std::cout. When no usable platform or no device is
 * found the constructor returns early, leaving context/queue/rotina as they
 * were default-constructed, instead of indexing out of bounds.
 *
 * @param type          Device type used for platform selection and context creation.
 * @param arq           Path of the OpenCL source file.
 * @param buildOptions  Options string passed to the program build.
 * @param nomeRot       Comma-separated kernel names.
 * @param n             Number of kernels to create from nomeRot.
 */
OCLutil::OCLutil(cl_device_type type,std::string arq,std::string buildOptions,std::string nomeRot,int n)
{
	VECTOR_CLASS<cl::Platform> platforms;
	cl::Platform::get(&platforms);

	if(platforms.size() == 0){
		std::cout<<"No OpenCL platforms were found"<<std::endl;
	}

	// Select the first platform exposing a device of the requested type.
	// Both wrapper error modes are handled: thrown exception or returned status.
	int platformID = -1;
	for(unsigned int i = 0; i < platforms.size(); i++)
	{
		try {
			VECTOR_CLASS<cl::Device> devices;
			const cl_int status = platforms[i].getDevices(type, &devices);
			if(status != CL_SUCCESS || devices.size() == 0)
				continue;
			platformID = i;
			break;
		} catch(const std::exception &) {
			std::cout<<"Error ao ler plataforma: "<<std::endl;
			continue;
		}
	}

	if(platformID == -1){
		std::cout<<"No compatible OpenCL platform found"<<std::endl;
		// Nothing to work with; platforms[-1] must not be touched
		return;
	}

	cl::Platform platform = platforms[platformID];
	std::cout << "Using platform vendor: " << platform.getInfo<CL_PLATFORM_VENDOR>() << std::endl;

	// Use the preferred platform and create a context
	cl_context_properties cps[] = {
		CL_CONTEXT_PLATFORM,
		(cl_context_properties)(platform)(),
		0
	};

	try {
		context = cl::Context(type, cps);
	} catch(const std::exception &) {
		std::cout<<"Failed to create an OpenCL context!"<<std::endl;
	}

	// Read the whole source file into memory
	std::ifstream sourceFile(arq.c_str());
	if(sourceFile.fail())
		std::cout<<"Failed to open OpenCL source file"<<std::endl;

	std::string sourceCode(
			std::istreambuf_iterator<char>(sourceFile),
			(std::istreambuf_iterator<char>()));
	// length()+1 is kept as in the original: the terminating NUL is passed too
	cl::Program::Sources source(1, std::make_pair(sourceCode.c_str(), sourceCode.length()+1));

	// Make program of the source code in the context
	cl::Program program = cl::Program(context, source);

	VECTOR_CLASS<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
	if(devices.size() == 0){
		// devices[0] / devices[indexDev] below would be out of bounds
		std::cout<<"No OpenCL devices found in the context"<<std::endl;
		return;
	}

	std::string deviceInfo;
	cl_ulong memInfo;
	size_t tam;
	cl_uint clUnit;
	int indexDev = 0;
	int maxU = 0;

	// Print the properties of every device and remember the one with the most compute units
	for (size_t i = 0; i < devices.size(); ++i) {
		devices[i].getInfo(CL_DEVICE_NAME, &deviceInfo);
		std::cout << "Device info: " << deviceInfo << std::endl;
		devices[i].getInfo(CL_DEVICE_VERSION, &deviceInfo);
		std::cout << "Versão CL: " << deviceInfo << std::endl;
		devices[i].getInfo(CL_DRIVER_VERSION, &deviceInfo);
		std::cout << "Versão Driver: " << deviceInfo << std::endl;
		devices[i].getInfo(CL_DEVICE_GLOBAL_MEM_SIZE, &memInfo);
		std::cout << "Memoria Global: " << memInfo << std::endl;
		devices[i].getInfo(CL_DEVICE_LOCAL_MEM_SIZE, &memInfo);
		std::cout << "Memoria Local: " << memInfo << std::endl;
		// Was querying CL_DEVICE_LOCAL_MEM_SIZE again; the label asks for the work-group limit
		devices[i].getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE, &tam);
		std::cout << "Max tamanho Work-group: " << tam << std::endl;
		devices[i].getInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, &clUnit);
		std::cout << "Max dimensao: " << clUnit << std::endl;
		devices[i].getInfo(CL_DEVICE_MAX_COMPUTE_UNITS, &clUnit);
		std::cout << "Unidades CL: " << clUnit << std::endl;
		std::cout << "*********************************" << std::endl;

		if(static_cast<int>(clUnit) > maxU){
			indexDev = static_cast<int>(i);
			maxU = static_cast<int>(clUnit);
		}
	}

	// Build program for these specific devices. The build log is printed in
	// both wrapper error modes (returned status, or thrown exception which is
	// then re-thrown to the caller unchanged).
	cl_int error = CL_SUCCESS;
	try {
		error = program.build(devices, buildOptions.c_str());
	} catch(const std::exception &) {
		std::cout << "Build log:" << std::endl << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]) << std::endl;
		throw;
	}

	if(error != CL_SUCCESS) {
		std::cout << "Build log:" << std::endl << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]) << std::endl;
	}

	std::cout << "Index Dispositino selecionado: " << indexDev << std::endl;
	queue = cl::CommandQueue(context, devices[indexDev]);

	// Split nomeRot on ',' and create one kernel per name.
	// As before, once the last name has been consumed the scan restarts from
	// the beginning of the list if n asks for more kernels than names.
	std::string::size_type start = 0;
	for(int i = 0; i < n; i++){
		const std::string::size_type comma = nomeRot.find(',', start);

		std::string nomeRoti;
		if(comma != std::string::npos){
			nomeRoti = nomeRot.substr(start, comma - start);
		}else{
			nomeRoti = nomeRot.substr(start);
		}

		std::cout<<"Nome rotina["<<i<<"]: "<<nomeRoti.c_str()<<std::endl;
		rotina.push_back(cl::Kernel(program, nomeRoti.c_str()));

		start = (comma != std::string::npos) ? comma + 1 : 0;
	}
}