/// Enqueues the "julia" OpenCL kernel on `queue`, with `devOut` as its output argument.
///
/// The program and kernel objects are function-local statics that are created
/// exactly once (guarded by std::call_once) from the context owning `queue`;
/// every later call reuses them and only rebinds the arguments.
///
/// \param devOut  Device buffer bound to kernel argument 0.
/// \param queue   Command queue the launch is enqueued on.
void kernel(cl::Buffer& devOut, cl::CommandQueue& queue)
{
    static std::once_flag buildOnce;
    static cl::Program    juliaProgram;
    static cl::Kernel     juliaKernel;

    std::call_once(buildOnce, [queue]() {
        juliaProgram = cl::Program(queue.getInfo<CL_QUEUE_CONTEXT>(), fractal_ocl_kernel, true);
        juliaKernel  = cl::Kernel(juliaProgram, "julia");
    });

    // 8x8 work-groups; each global dimension is the group size times
    // divup(DIM, group size).
    static const NDRange workGroup(8, 8);
    const NDRange grid(workGroup[0] * divup(DIMX, workGroup[0]),
                       workGroup[1] * divup(DIMY, workGroup[1]));

    juliaKernel.setArg(0, devOut);
    juliaKernel.setArg(1, DIMX);
    juliaKernel.setArg(2, DIMY);

    queue.enqueueNDRangeKernel(juliaKernel, cl::NullRange, grid, workGroup);
}
/// Binds one input image and the output image to `kernel` (arguments 0 and 1),
/// hands kernel and output to Enqueue(), and returns the output buffer.
inline ImageBuffer DoBasicOp(cl::Kernel& kernel, const ImageBuffer& i1, ImageBuffer o)
{
    unsigned int slot = 0;
    kernel.setArg(slot++, i1.mem());  // argument 0: input
    kernel.setArg(slot++, o.mem());   // argument 1: output

    Enqueue(kernel, o);
    return o;
}
void initSimulation() { // source: http://stackoverflow.com/questions/26517114/how-to-compile-opencl-project-with-kernels try { std::vector<cl::Platform> platforms; cl::Platform::get(&platforms); std::vector<cl::Device> devices; platforms[PLATFORM_ID].getDevices(CL_DEVICE_TYPE_CPU | CL_DEVICE_TYPE_GPU, &devices); context = cl::Context(devices); queue = cl::CommandQueue(context, devices[DEVICE_ID]); std::ifstream sourceFile{"kernels/programs.cl"}; std::string sourceCode(std::istreambuf_iterator<char>(sourceFile), (std::istreambuf_iterator<char>())); cl::Program::Sources source(1, std::make_pair(sourceCode.c_str(), sourceCode.length())); simulationProgram = cl::Program(context, source); simulationProgram.build(devices); visualizationBufferGPU = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(unsigned char) * 4 * fieldWidth * fieldHeight, nullptr, nullptr); randomizeField(); stepKernel = cl::Kernel(simulationProgram, "tick"); stepKernel.setArg(0, fieldWidth); stepKernel.setArg(1, fieldHeight); stepKernel.setArg(3, visualizationBufferGPU); } catch (cl::Error err) { std::cout << "Error: " << err.what() << "(" << err.err() << ")" << std::endl; exit(2); } }
void CoreSersicProfile::add_kernel_parameters(unsigned int index, cl::Kernel &kernel) const { kernel.setArg(index++, static_cast<FT>(re)); kernel.setArg(index++, static_cast<FT>(rb)); kernel.setArg(index++, static_cast<FT>(nser)); kernel.setArg(index++, static_cast<FT>(a)); kernel.setArg(index++, static_cast<FT>(b)); kernel.setArg(index++, static_cast<FT>(_bn)); }
void initParticles(uint32_t num_particles) { m_geom = gl::Geometry::create(); m_geom->setPrimitiveType(GL_POINTS); m_mesh = gl::Mesh::create(m_geom, m_pointMaterial); m_numParticles = num_particles; GLsizei numBytes = m_numParticles * sizeof(vec4); m_geom->vertices().resize(m_numParticles, vec3(0)); m_geom->colors().resize(m_numParticles, vec4(1)); m_geom->point_sizes().resize(m_numParticles, 9.f); m_geom->createGLBuffers(); m_mesh->material()->setPointSize(2.f); scene().addObject(m_mesh); try { // shared position buffer for OpenGL / OpenCL m_positions = cl::BufferGL(m_context, CL_MEM_READ_WRITE, m_geom->vertexBuffer().id()); m_colors = cl::BufferGL(m_context, CL_MEM_READ_WRITE, m_geom->colorBuffer().id()); //create the OpenCL only arrays m_velocities = cl::Buffer( m_context, CL_MEM_WRITE_ONLY, numBytes ); m_positionGen = cl::Buffer( m_context, CL_MEM_WRITE_ONLY, numBytes ); m_velocityGen = cl::Buffer( m_context, CL_MEM_WRITE_ONLY, numBytes ); vector<vec4> posGen, velGen; for (int i = 0; i < m_numParticles; i++) { posGen.push_back( vec4(glm::ballRand(20.0f), 1.f) ); vec2 tmp = glm::linearRand(vec2(-100), vec2(100)); float life = kinski::random(2.f, 5.f); float yVel = kinski::random<float>(5, 15); velGen.push_back(vec4(tmp.x, yVel, tmp.y, life)); m_geom->point_sizes()[i] = kinski::random(5.f, 15.f); } m_geom->createGLBuffers(); m_queue.enqueueWriteBuffer(m_velocities, CL_TRUE, 0, numBytes, &velGen[0]); m_queue.enqueueWriteBuffer(m_positionGen, CL_TRUE, 0, numBytes, &posGen[0]); m_queue.enqueueWriteBuffer(m_velocityGen, CL_TRUE, 0, numBytes, &velGen[0]); m_particleKernel.setArg(0, m_positions); m_particleKernel.setArg(1, m_colors); m_particleKernel.setArg(2, m_velocities); m_particleKernel.setArg(3, m_positionGen); m_particleKernel.setArg(4, m_velocityGen); } catch(cl::Error &error) { LOG_ERROR << error.what() << "(" << oclErrorString(error.err()) << ")"; } }
/// Binds four input images (arguments 0-3) and the output image (argument 4)
/// to `kernel`, hands kernel and output to Enqueue(), and returns the output.
inline ImageBuffer DoBasicOp(cl::Kernel& kernel, const ImageBuffer& i1, const ImageBuffer& i2, const ImageBuffer& i3, const ImageBuffer& i4, ImageBuffer o)
{
    unsigned int slot = 0;

    // inputs, in declaration order
    kernel.setArg(slot++, i1.mem());
    kernel.setArg(slot++, i2.mem());
    kernel.setArg(slot++, i3.mem());
    kernel.setArg(slot++, i4.mem());

    // output goes last
    kernel.setArg(slot++, o.mem());

    Enqueue(kernel, o);
    return o;
}
void MetaBallsApp::updateParticles() { int random = rand(); float time = 1.0f / 60.0f; mClParticleUpdate.setArg( 3, sizeof(float), &time ); mClParticleUpdate.setArg( 4, sizeof(int32_t), &random ); // Queue the kernel up for execution across the array mClCommandQueue.enqueueNDRangeKernel( mClParticleUpdate, cl::NullRange, cl::NDRange( NUM_PARTICLES ) ); }
/// Builds the program from the "kernels/particles.cl" asset, creates the
/// "particle_update" kernel and binds the arguments that are set only here
/// (0: particle buffer, 1: max life, 2: squared minimum velocity,
/// 5: particle count). Arguments 3 and 4 are not bound here.
void MetaBallsApp::setupParticleKernel()
{
    auto particleProgram = ocl::createProgram( mClContext, loadAsset( "kernels/particles.cl" ), true );
    mClParticleUpdate = ocl::Kernel( particleProgram, "particle_update" );

    // Locals stay non-const: they are passed by address to setArg.
    float lifeLimit       = 60.0f;
    float minSpeedSquared = 0.25f;   // 0.5 * 0.5

    mClParticleUpdate.setArg( 0, mClParticleBuf );
    mClParticleUpdate.setArg( 1, sizeof(float), &lifeLimit );
    mClParticleUpdate.setArg( 2, sizeof(float), &minSpeedSquared );
    mClParticleUpdate.setArg( 5, sizeof(cl_int), &NUM_PARTICLES );
}
/// Enqueues `randomUniform3DKernel` over a size.x * size.y * size.z range.
///
/// Kernel arguments are bound in order: the target image, a two-component
/// seed drawn from `rng`, and `range`.
///
/// \param image3D                 Image bound as kernel argument 0.
/// \param cs                      Compute system whose queue receives the launch.
/// \param randomUniform3DKernel   Kernel to configure and enqueue.
/// \param size                    Global work size per dimension.
/// \param range                   Passed through unchanged as kernel argument 2.
/// \param rng                     Host generator; advanced by two draws.
void neo::randomUniform(cl::Image3D &image3D, sys::ComputeSystem &cs, cl::Kernel &randomUniform3DKernel, cl_int3 size, cl_float2 range, std::mt19937 &rng) {
	int argIndex = 0;

	std::uniform_int_distribution<int> seedDist;

	// Draw in a fixed order and convert explicitly: putting the int results
	// straight into the braced initializer is a narrowing conversion.
	const cl_uint seedX = static_cast<cl_uint>(seedDist(rng));
	const cl_uint seedY = static_cast<cl_uint>(seedDist(rng));
	cl_uint2 seed = { seedX, seedY };

	randomUniform3DKernel.setArg(argIndex++, image3D);
	randomUniform3DKernel.setArg(argIndex++, seed);
	randomUniform3DKernel.setArg(argIndex++, range);

	cs.getQueue().enqueueNDRangeKernel(randomUniform3DKernel, cl::NullRange, cl::NDRange(size.x, size.y, size.z));
}
void updateLevelSetFunction( OpenCL &ocl, cl::Kernel &kernel, cl::Image3D &input, cl::Buffer &positions, int activeVoxels, int numberOfThreads, int groupSize, cl::Memory * phi_read, cl::Memory * phi_write, float threshold, float epsilon, float alpha ) { kernel.setArg(0, input); kernel.setArg(1, positions); kernel.setArg(2, activeVoxels); kernel.setArg(3, *phi_read); kernel.setArg(4, *phi_write); kernel.setArg(5, threshold); kernel.setArg(6, epsilon); kernel.setArg(7, alpha); ocl.queue.enqueueNDRangeKernel( kernel, cl::NullRange, cl::NDRange(numberOfThreads), cl::NDRange(groupSize) ); }
void simulationStep() { try { // copy auto buffer = cl::Buffer(context, CL_MEM_READ_ONLY, sizeof(unsigned char) * 4 * fieldWidth * fieldHeight, nullptr, nullptr); queue.enqueueWriteBuffer(buffer, CL_TRUE, 0, sizeof(unsigned char) * 4 * fieldWidth * fieldHeight, visualizationBufferCPU, NULL, NULL); // enque stepKernel.setArg(2, buffer); cl::NDRange global((size_t) (fieldWidth * fieldHeight)); queue.enqueueNDRangeKernel(stepKernel, cl::NullRange, global, cl::NullRange); // read back queue.enqueueReadBuffer(visualizationBufferGPU, CL_TRUE, 0, sizeof(unsigned char) * 4 * fieldWidth * fieldHeight, visualizationBufferCPU, NULL, NULL); // finish queue.finish(); } catch (cl::Error err) { std::cout << "Error: " << err.what() << "(" << err.err() << ")" << std::endl; exit(3); } glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, fieldWidth, fieldHeight, 0, GL_RGBA, GL_UNSIGNED_BYTE, visualizationBufferCPU); }
inline void OpenCL::addkernelarg(std::size_t i, T const & arg, cl::Kernel & kernel,std::vector<cl::Buffer> &outputbuffer,cl::CommandQueue &quene) const { kernel.setArg(i,arg); // Push back a dummy since we actually dont need to allocate anything for a scalar outputbuffer.push_back(cl::Buffer()); }
void updateParticles(float timeDelta) { try { vector<cl::Memory> glBuffers; glBuffers.push_back(m_positions); glBuffers.push_back(m_colors); //this will update our system by calculating new velocity and updating the positions of our particles //Make sure OpenGL is done using our VBOs glFinish(); // map OpenGL buffer object for writing from OpenCL // this passes in the vector of VBO buffer objects (position and color) m_queue.enqueueAcquireGLObjects(&glBuffers); m_particleKernel.setArg(5, timeDelta); //pass in the timestep //execute the kernel m_queue.enqueueNDRangeKernel(m_particleKernel, cl::NullRange, cl::NDRange(m_numParticles), cl::NullRange); //Release the VBOs so OpenGL can play with them m_queue.enqueueReleaseGLObjects(&glBuffers, NULL); m_queue.finish(); } catch(cl::Error &error) { LOG_ERROR << error.what() << "(" << oclErrorString(error.err()) << ")"; } }
/// Binds the border buffer for a `dimx` x `dimy` domain as argument `arg` of
/// `ker`, calling genBuffer(dimx, dimy) first if no buffer is stored for that
/// size yet.
void RectangularBorderHandler::setarg(int arg,cl::Kernel& ker, int dimx, int dimy)
{
    const auto key = std::make_pair(dimx, dimy);

    if (m_bufferMap.count(key) == 0) {
        genBuffer(dimx, dimy);
    }

    // at() throws if genBuffer did not store an entry under this key
    ker.setArg(arg, m_bufferMap.at(key)());
}
/// Binds the border buffer for a `dimx` x `dimy` x `dimz` domain as argument
/// `arg` of `ker`, calling genBuffer(dimx, dimy, dimz) first if no buffer is
/// stored for that size yet.
void ParallelepipedalBorderHandler::setarg(int arg,cl::Kernel& ker, int dimx, int dimy,int dimz)
{
    const auto key = tri(dimx, dimy, dimz);

    if (m_bufferMap.count(key) == 0) {
        genBuffer(dimx, dimy, dimz);
    }

    // at() throws if genBuffer did not store an entry under this key
    ker.setArg(arg, m_bufferMap.at(key)());
}
/// Copies the fixed-size array `arg` into a new read/write device buffer and
/// binds that buffer as kernel argument `i`.
///
/// NOTE(review): the upload is enqueued non-blocking (CL_FALSE), so `arg`
/// presumably has to stay alive until the queue has processed it -- confirm
/// against callers.
/// NOTE(review): the cl::Buffer handle is local to this function; verify that
/// the device allocation stays valid until the kernel actually runs.
inline void OpenCL::addkernelarg(std::size_t i, T const (& arg)[N], cl::Kernel & kernel,cl::CommandQueue &quene) const
{
    const std::size_t byteCount = sizeof(T) * N;

    cl::Buffer deviceCopy(this->context, CL_MEM_READ_WRITE, byteCount);
    quene.enqueueWriteBuffer(deviceCopy, CL_FALSE, 0, byteCount, &arg);

    kernel.setArg(i, deviceCopy);
}
inline void OpenCL::addkernelarg(std::size_t i, std::vector<T> const & arg, cl::Kernel & kernel,cl::CommandQueue &quene) const { cl::Buffer buffer(this->context,CL_MEM_READ_WRITE,arg.size()*sizeof(T)); // std::cout << "enqeue\n"; quene.enqueueWriteBuffer(buffer,CL_FALSE,0,sizeof(T)*arg.size(),&(arg[0])); kernel.setArg(i,buffer); }
/// Recursively applies the reduction kernel `ker` until one element is left
/// and returns the buffer holding it.
///
/// Each level allocates a buffer of max(1, size / 4) `real` values, binds
/// (input, output, size, 4) as kernel arguments 0-3 and enqueues one
/// work-item per output element on `q`. When `size` is 1, `in` is returned
/// as-is.
///
/// NOTE(review): size / 4 truncates; whether the kernel covers the remainder
/// when `size` is not a multiple of 4 is not visible here.
cl::Buffer performReduction(const cl::Buffer & in,cl::Kernel & ker,cl::CommandQueue & q,int size)
{
    if (size == 1) {
        return in;
    }

    const int fanIn = 4;
    const int reducedSize = std::max(1, size / fanIn);

    cl::Buffer partial(CLContextLoader::getContext(), CL_MEM_READ_WRITE, sizeof(real) * reducedSize);

    ker.setArg(0, in());
    ker.setArg(1, partial());
    ker.setArg(2, size);
    ker.setArg(3, fanIn);

    q.enqueueNDRangeKernel(ker,
                           cl::NDRange(0),
                           cl::NDRange(reducedSize),
                           getBestWorkspaceDim(cl::NDRange(reducedSize)));

    return performReduction(partial, ker, q, reducedSize);
}
void findMinSeamVert(cl::Context &ctx, cl::CommandQueue &cmdQueue, cl::Event &event, std::vector<cl::Event> &deps, cl::Buffer &energyMatrix, cl::Buffer &vertMinEnergy, cl::Buffer &vertMinIdx, int width, int height, int pitch, int colsRemoved) { cl_int errNum; errNum = findMinSeamVertKernel.setArg(0, energyMatrix); errNum |= findMinSeamVertKernel.setArg(1, vertMinEnergy); errNum |= findMinSeamVertKernel.setArg(2, vertMinIdx); errNum |= findMinSeamVertKernel.setArg(3, cl::__local(256 * sizeof(float))); errNum |= findMinSeamVertKernel.setArg(4, cl::__local(256 * sizeof(float))); errNum |= findMinSeamVertKernel.setArg(5, width); errNum |= findMinSeamVertKernel.setArg(6, height); errNum |= findMinSeamVertKernel.setArg(7, pitch); errNum |= findMinSeamVertKernel.setArg(8, colsRemoved); if (errNum != CL_SUCCESS) { std::cerr << "Error setting findMinSeamVert arguments." << std::endl; exit(-1); } // This kernel could be written to use more than one work group, but its probably not worth it. cl::NDRange offset = cl::NDRange(0); cl::NDRange localWorkSize = cl::NDRange(256); cl::NDRange globalWorkSize = cl::NDRange(256); errNum = cmdQueue.enqueueNDRangeKernel(findMinSeamVertKernel, offset, globalWorkSize, localWorkSize, &deps, &event); if (errNum != CL_SUCCESS) { std::cerr << "Error enqueuing computeSeams kernel for execution." << std::endl; exit(-1); } /** DEBUG **/ // int deviceResultIdx[1]; // float deviceResultEnergy[1]; // mem::read(ctx, cmdQueue, deviceResultIdx, vertMinIdx); // mem::read(ctx, cmdQueue, deviceResultEnergy, vertMinEnergy); // std::cout << "deviceResultIdx = " << deviceResultIdx[0] << std::endl; // std::cout << "deviceResultEnergy = " << deviceResultEnergy[0] << std::endl; }
void helper(uint32_t* out, int osize, uint8_t* in, int isize, int w, int h, int choice) { int set_size=8; try { cl::Buffer bufferIn = cl::Buffer(gContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, isize*sizeof(cl_uchar), in, NULL); cl::Buffer bufferOut = cl::Buffer(gContext, CL_MEM_READ_WRITE, osize*sizeof(cl_uchar4)); cl::Buffer bufferOut2= cl::Buffer(gContext, CL_MEM_READ_WRITE, osize*sizeof(cl_uchar4)); gNV21Kernel.setArg(2,w); gNV21Kernel.setArg(3,h); gNV21Kernel.setArg(1,bufferIn); gNV21Kernel.setArg(0,bufferOut); gQueue.enqueueNDRangeKernel(gNV21Kernel, cl::NullRange, cl::NDRange( (int)ceil((float)w/16.0f)*16,(int)ceil((float)h/16.0f)*16), cl::NDRange(set_size,set_size), NULL, NULL); if (choice==1) { gLaplacianK.setArg(2,w); gLaplacianK.setArg(3,h); gLaplacianK.setArg(1,bufferOut); gLaplacianK.setArg(0,bufferOut2); gQueue.enqueueNDRangeKernel(gLaplacianK, cl::NullRange, cl::NDRange( (int)ceil((float)w/16.0f)*16,(int)ceil((float)h/16.0f)*16), cl::NDRange(set_size,set_size), NULL, NULL); } else if (choice>1) { gNegative.setArg(2,w); gNegative.setArg(3,h); gNegative.setArg(1,bufferOut); gNegative.setArg(0,bufferOut2); gQueue.enqueueNDRangeKernel(gNegative, cl::NullRange, cl::NDRange( (int)ceil((float)w/16.0f)*16,(int)ceil((float)h/16.0f)*16), cl::NDRange(set_size,set_size), NULL, NULL); } gQueue.enqueueReadBuffer(bufferOut2, CL_TRUE, 0, osize*sizeof(cl_uchar4), out); } catch (cl::Error e) { LOGI("@oclDecoder: %s %d \n",e.what(),e.err()); } }
/// \brief Query the preferred factor of local size /// \ingroup OpenCL /// /// \param kern An OpenCL kernel /// \param dev An OpenCL device /// \param factor Multiplier factor of local size for optimzied performance /// \param lmax Maximum of the local size /// \param mmax Maximum of the multiplier of the factor inline void cl_minmax_local_size ( const ::cl::Kernel &kern, const ::cl::Device &dev, std::size_t &factor, std::size_t &lmax, std::size_t &mmax) { try { kern.getWorkGroupInfo(dev, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, &factor); kern.getWorkGroupInfo(dev, CL_KERNEL_WORK_GROUP_SIZE, &lmax); if (factor == 0 || factor > lmax) { factor = lmax = mmax = 0; return; } mmax = lmax / factor; } catch (const ::cl::Error &) { factor = lmax = mmax = 0; } }
/// Enqueues the "sinf" OpenCL kernel on `queue`, with `devOut` as its output argument.
///
/// The program and kernel objects are function-local statics that are created
/// exactly once (guarded by std::call_once) from the context owning `queue`.
/// The launch is 1-D with SIZE * 2 work-items.
///
/// \param devOut  Device buffer bound to kernel argument 0.
/// \param queue   Command queue the launch is enqueued on.
void kernel(cl::Buffer& devOut, cl::CommandQueue& queue)
{
    static std::once_flag buildOnce;
    static cl::Program    sinProgram;
    static cl::Kernel     sinKernel;

    std::call_once(buildOnce, [queue]() {
        sinProgram = cl::Program(queue.getInfo<CL_QUEUE_CONTEXT>(), sinf_ocl_kernel, true);
        sinKernel  = cl::Kernel(sinProgram, "sinf");
    });

    static const NDRange workItems(SIZE * 2);

    sinKernel.setArg(0, devOut);
    sinKernel.setArg(1, dx);
    sinKernel.setArg(2, SIZE);

    queue.enqueueNDRangeKernel(sinKernel, cl::NullRange, workItems);
}
/** * Applies a gaussian blur filter to an image using openCL * @param ctx An openCL context object. * @param cmdQueue An openCL command queue. * @param sampler An openCL image sampler object. * @param height The height of the input image. * @param width The width of the input image. * @return An Image2D object containing the resulting image data. */ void blur(cl::Context &ctx, cl::CommandQueue &cmdQueue, cl::Event &blurEvent, cl::Buffer &inputImage, cl::Buffer &outputImage, int height, int width, int colsRemoved) { // Set kernel arguments cl_int errNum; errNum = blurKernel.setArg(0, inputImage); errNum |= blurKernel.setArg(1, outputImage); errNum |= blurKernel.setArg(2, width); errNum |= blurKernel.setArg(3, height); errNum |= blurKernel.setArg(4, colsRemoved); if (errNum != CL_SUCCESS) { std::cerr << "Error setting blurKernel arguments." << std::endl; std::cerr << errNum << std::endl; exit(-1); } // Determine local and global work size cl::NDRange offset = cl::NDRange(0, 0); cl::NDRange localWorkSize = cl::NDRange(16, 16); cl::NDRange globalWorkSize = cl::NDRange(math::roundUp(localWorkSize[0], width), math::roundUp(localWorkSize[1], height)); // Run blurKernel errNum = cmdQueue.enqueueNDRangeKernel(blurKernel, offset, globalWorkSize, localWorkSize, NULL, &blurEvent); if (errNum != CL_SUCCESS) { std::cerr << "Error enqueuing blur kernel for execution." << std::endl; exit(-1); } }
/// Runs m_kernel once and leaves the N*N results in `buf`.
///
/// `buf` backs the device buffer directly (CL_MEM_USE_HOST_PTR) and is also
/// the destination of the final blocking read. Each OpenCL status code is
/// passed to checkErr() together with a label naming the failed call.
///
/// \param buf  Host array of at least N*N elements of type T.
void run(T* buf)
{
    cl_int status;

    cl::Buffer resultBuffer(
        m_context,
        CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
        N*N*sizeof(T),
        buf,
        &status);
    checkErr(status, "Buffer::Buffer()");

    // kernel arguments: output buffer, N, depth, escape2
    status = m_kernel.setArg(0, resultBuffer);
    checkErr(status, "Kernel::setArg(0)");
    status = m_kernel.setArg(1, N);
    checkErr(status, "Kernel::setArg(1)");
    status = m_kernel.setArg(2, depth);
    checkErr(status, "Kernel::setArg(2)");
    status = m_kernel.setArg(3, escape2);
    checkErr(status, "Kernel::setArg(3)");

    // launch and wait for completion
    cl::Event done;
    status = m_cmdq.enqueueNDRangeKernel(
        m_kernel,
        cl::NullRange,
        cl::NDRange(N*N),
        cl::NDRange(N, 1),
        NULL,
        &done);
    checkErr(status, "ComamndQueue::enqueueNDRangeKernel()");
    done.wait();

    // blocking read back into the caller's array
    status = m_cmdq.enqueueReadBuffer(
        resultBuffer,
        CL_TRUE,
        0,
        N*N*sizeof(T),
        buf);
    checkErr(status, "ComamndQueue::enqueueReadBuffer()");
}
void maskUnreachable(cl::Context &ctx, cl::CommandQueue &cmdQueue, cl::Event &event, std::vector<cl::Event> &deps, cl::Buffer &energyMatrix, int width, int height, int pitch, int colsRemoved) { cl_int errNum; errNum = maskUnreachableKernel.setArg(0, energyMatrix); errNum |= maskUnreachableKernel.setArg(1, width); errNum |= maskUnreachableKernel.setArg(2, height); errNum |= maskUnreachableKernel.setArg(3, pitch); errNum |= maskUnreachableKernel.setArg(4, colsRemoved); if (errNum != CL_SUCCESS) { std::cerr << "Error setting maskUnreachable kernel arguments." << std::endl; exit(-1); } cl::NDRange offset = cl::NDRange(0, 0); cl::NDRange localWorkSize = cl::NDRange(16, 16); cl::NDRange globalWorkSize = cl::NDRange(math::roundUp(localWorkSize[0], width), math::roundUp(localWorkSize[1], height)); errNum = cmdQueue.enqueueNDRangeKernel(maskUnreachableKernel, offset, globalWorkSize, localWorkSize, &deps, &event); if (errNum != CL_SUCCESS) { std::cerr << "Error enqueueing maskUnreachable kernel." << std::endl; std::cerr << errNum << std::endl; exit(-1); } }
void OpenCL::addkernelarg(std::size_t i, std::vector<T> const & arg, cl::Kernel & kernel,std::vector<cl::Buffer> &outputbuffer,cl::CommandQueue &quene)const { cl::Buffer buffer(this->context,CL_MEM_READ_WRITE,arg.size()*sizeof(T)); outputbuffer.push_back(buffer); // std::cout << "enqeue\n"; cl_int err = quene.enqueueWriteBuffer(buffer,CL_FALSE,0,sizeof(T)*arg.size(),&(arg[0])); if (err){ std::cerr << "Error while pushing Vector. Errorcode: " << err << std::endl; } kernel.setArg(i,buffer); }
void backtrack(cl::Context &ctx, cl::CommandQueue &cmdQueue, cl::Event &event, std::vector<cl::Event> &deps, cl::Buffer &energyMatrix, cl::Buffer &vertSeamPath, cl::Buffer &vertMinIdx, int width, int height, int pitch, int colsRemoved) { cl_int errNum; // Set kernel arguments errNum = backtrackKernel.setArg(0, energyMatrix); errNum |= backtrackKernel.setArg(1, vertSeamPath); errNum |= backtrackKernel.setArg(2, vertMinIdx); errNum |= backtrackKernel.setArg(3, width); errNum |= backtrackKernel.setArg(4, height); errNum |= backtrackKernel.setArg(5, pitch); errNum |= backtrackKernel.setArg(6, colsRemoved); if (errNum != CL_SUCCESS) { std::cerr << "Error setting backtrack kernel arguments." << std::endl; exit(-1); } cl::NDRange offset = cl::NDRange(0); cl::NDRange localWorkSize = cl::NDRange(1); cl::NDRange globalWorkSize = cl::NDRange(256); errNum = cmdQueue.enqueueNDRangeKernel(backtrackKernel, offset, globalWorkSize, localWorkSize, &deps, &event); if (errNum != CL_SUCCESS) { std::cerr << "Error enqueueing backTrack kernel for execution." << std::endl; exit(-1); } // /** DEBUGGING **/ // int deviceResult[height]; // mem::read(ctx, cmdQueue, deviceResult, vertSeamPath, height); // for (int i = height - 5; i < height; ++i) { // std::cout << "deviceResult[" << i << "]=\t" << deviceResult[i] << std::endl; // } }
int main(int argc, char *argv[]) { cl_int err = CL_SUCCESS; cl::Event evt; std::vector<cl::Platform> platforms; cl::Platform::get(&platforms); if (platforms.size() == 0) { return false; } platform_ = platforms[0]; cl_context_properties properties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0}; context_ = cl::Context(CL_DEVICE_TYPE_GPU, properties, NULL, NULL, &err); CHECK_CL_ERROR(err, "cl::Context"); std::vector<cl::Device> devices = context_.getInfo<CL_CONTEXT_DEVICES>(); if (devices.size() == 0) { return false; } device_ = devices[0]; sources_.push_back(std::make_pair(source_str.c_str(), source_str.size())); program_ = cl::Program(context_, sources_); err = program_.build(devices); if (err != CL_SUCCESS) { std::string log = program_.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]); std::cout << "program.build() ERROR: " << log.c_str() << std::endl; return false; } kernel_ = cl::Kernel(program_, "hello", &err); CHECK_CL_ERROR(err, "cl::Kernel"); buf_ = cl::Buffer(context_, CL_MEM_READ_ONLY, 1024, NULL, &err); queue_ = cl::CommandQueue(context_, device_, 0, &err); CHECK_CL_ERROR(err, "cl::CommandQueue"); kernel_.setArg(0, buf_); err = queue_.enqueueNDRangeKernel(kernel_, cl::NullRange, cl::NDRange(10, 10), cl::NullRange, NULL, &evt); evt.wait(); CHECK_CL_ERROR(err, "queue.enqueueNDRangeKernel()"); return 0; }
/// Binds this buffer's device memory as argument `argIndex` of `kernel`.
///
/// The device buffer is created through the resource manager on first use.
/// If binding fails, the error is written to stderr and the original
/// cl::Error is rethrown to the caller.
///
/// \param kernel    Kernel receiving the argument.
/// \param argIndex  Zero-based argument position.
void Buffer::addToKernel(cl::Kernel& kernel, unsigned int argIndex)
{
    // allocate lazily, on first use
    if (clBuffer == NULL) {
        clBuffer = resourceManager->createBuffer(size);
    }

    try {
        kernel.setArg(argIndex, *clBuffer);
    } catch (const cl::Error& err) {
        std::cerr << "ERROR Setting Buffer kernel arg(" << argIndex << "): "
                  << err.what() << "(" << err.err() << ")" << std::endl;
        // bare throw rethrows the original exception object instead of a copy
        throw;
    }
}
/// \brief The preferred global and local size /// \ingroup OpenCL /// /// \return The difference between the preferred global size and the N inline std::size_t cl_preferred_work_size (std::size_t N, const ::cl::Kernel &kern, const ::cl::Device &dev, std::size_t &global_size, std::size_t &local_size) { cl::size_t<3> reqd_size; try { kern.getWorkGroupInfo(dev, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, &reqd_size); } catch (const ::cl::Error &) { reqd_size[0] = 0; } if (reqd_size[0] != 0) { local_size = reqd_size[0]; global_size = cl_min_global_size(N, local_size); return global_size - N; } std::size_t factor; std::size_t lmax; std::size_t mmax; cl_minmax_local_size(kern, dev, factor, lmax, mmax); if (lmax == 0) { global_size = N; local_size = 0; return global_size - N; } local_size = lmax; global_size = cl_min_global_size(N, local_size); std::size_t diff_size = global_size - N; for (std::size_t m = mmax; m >= 1; --m) { std::size_t l = m * factor; std::size_t g = cl_min_global_size(N, l); std::size_t d = g - N; if (d < diff_size) { local_size = l; global_size = g; diff_size = d; } } return diff_size; }