void simulationStep() { try { // copy auto buffer = cl::Buffer(context, CL_MEM_READ_ONLY, sizeof(unsigned char) * 4 * fieldWidth * fieldHeight, nullptr, nullptr); queue.enqueueWriteBuffer(buffer, CL_TRUE, 0, sizeof(unsigned char) * 4 * fieldWidth * fieldHeight, visualizationBufferCPU, NULL, NULL); // enque stepKernel.setArg(2, buffer); cl::NDRange global((size_t) (fieldWidth * fieldHeight)); queue.enqueueNDRangeKernel(stepKernel, cl::NullRange, global, cl::NullRange); // read back queue.enqueueReadBuffer(visualizationBufferGPU, CL_TRUE, 0, sizeof(unsigned char) * 4 * fieldWidth * fieldHeight, visualizationBufferCPU, NULL, NULL); // finish queue.finish(); } catch (cl::Error err) { std::cout << "Error: " << err.what() << "(" << err.err() << ")" << std::endl; exit(3); } glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, fieldWidth, fieldHeight, 0, GL_RGBA, GL_UNSIGNED_BYTE, visualizationBufferCPU); }
/**
 * Enqueues each of the "advance paths" micro-kernels once, in a fixed order,
 * on the given command queue.
 *
 * Every launch uses a global size of engine->taskCount work-items and a
 * work-group size of advancePathsWorkGroupSize.
 *
 * @param oclQueue queue that receives the kernel launches
 */
void PathOCLRenderThread::EnqueueAdvancePathsKernel(cl::CommandQueue &oclQueue) {
	PathOCLRenderEngine *engine = (PathOCLRenderEngine *)renderEngine;
	const u_int taskCount = engine->taskCount;

	// All micro-kernels share the same launch geometry.
	const auto launch = [&](const cl::Kernel &microKernel) {
		oclQueue.enqueueNDRangeKernel(microKernel, cl::NullRange,
				cl::NDRange(taskCount), cl::NDRange(advancePathsWorkGroupSize));
	};

	// Micro kernels version
	launch(*advancePathsKernel_MK_RT_NEXT_VERTEX);
	launch(*advancePathsKernel_MK_HIT_NOTHING);
	launch(*advancePathsKernel_MK_HIT_OBJECT);
	launch(*advancePathsKernel_MK_RT_DL);
	launch(*advancePathsKernel_MK_DL_ILLUMINATE);
	launch(*advancePathsKernel_MK_DL_SAMPLE_BSDF);
	launch(*advancePathsKernel_MK_GENERATE_NEXT_VERTEX_RAY);
	launch(*advancePathsKernel_MK_SPLAT_SAMPLE);
	launch(*advancePathsKernel_MK_NEXT_SAMPLE);
	launch(*advancePathsKernel_MK_GENERATE_CAMERA_RAY);
}
void updateParticles(float timeDelta) { try { vector<cl::Memory> glBuffers; glBuffers.push_back(m_positions); glBuffers.push_back(m_colors); //this will update our system by calculating new velocity and updating the positions of our particles //Make sure OpenGL is done using our VBOs glFinish(); // map OpenGL buffer object for writing from OpenCL // this passes in the vector of VBO buffer objects (position and color) m_queue.enqueueAcquireGLObjects(&glBuffers); m_particleKernel.setArg(5, timeDelta); //pass in the timestep //execute the kernel m_queue.enqueueNDRangeKernel(m_particleKernel, cl::NullRange, cl::NDRange(m_numParticles), cl::NullRange); //Release the VBOs so OpenGL can play with them m_queue.enqueueReleaseGLObjects(&glBuffers, NULL); m_queue.finish(); } catch(cl::Error &error) { LOG_ERROR << error.what() << "(" << oclErrorString(error.err()) << ")"; } }
void procOCL_OCV(int tex, int w, int h) { int64_t t = getTimeMs(); cl::ImageGL imgIn (theContext, CL_MEM_READ_ONLY, GL_TEXTURE_2D, 0, tex); std::vector < cl::Memory > images(1, imgIn); theQueue.enqueueAcquireGLObjects(&images); theQueue.finish(); cv::UMat uIn, uOut, uTmp; cv::ocl::convertFromImage(imgIn(), uIn); LOGD("loading texture data to OpenCV UMat costs %d ms", getTimeInterval(t)); theQueue.enqueueReleaseGLObjects(&images); t = getTimeMs(); //cv::blur(uIn, uOut, cv::Size(5, 5)); cv::Laplacian(uIn, uTmp, CV_8U); cv:multiply(uTmp, 10, uOut); cv::ocl::finish(); LOGD("OpenCV processing costs %d ms", getTimeInterval(t)); t = getTimeMs(); cl::ImageGL imgOut(theContext, CL_MEM_WRITE_ONLY, GL_TEXTURE_2D, 0, tex); images.clear(); images.push_back(imgOut); theQueue.enqueueAcquireGLObjects(&images); cl_mem clBuffer = (cl_mem)uOut.handle(cv::ACCESS_READ); cl_command_queue q = (cl_command_queue)cv::ocl::Queue::getDefault().ptr(); size_t offset = 0; size_t origin[3] = { 0, 0, 0 }; size_t region[3] = { w, h, 1 }; CV_Assert(clEnqueueCopyBufferToImage (q, clBuffer, imgOut(), offset, origin, region, 0, NULL, NULL) == CL_SUCCESS); theQueue.enqueueReleaseGLObjects(&images); cv::ocl::finish(); LOGD("uploading results to texture costs %d ms", getTimeInterval(t)); }
void MetaBallsApp::update() { std::vector<cl::Memory> acquire( { mClParticleBuf, mClMarchingRenderBuffer, mClMarchingDebugBuffer } ); mClCommandQueue.enqueueAcquireGLObjects( &acquire ); updateParticles(); updateMarching(); mClCommandQueue.enqueueReleaseGLObjects( &acquire ); }
void PTWeekend::draw() { /* * BEGIN - each frame part */ /* Enqueue kernel for execution */ glm::vec3 origin,lower_left, hor, ver; float theta = camera.getFov() * M_PI / 180.0f; float half_height = tan(theta / 2.0f); float half_width = camera.getAspectRatio() * half_height; origin = camera.getEyePoint(); glm::vec3 u, v, w; w = -glm::normalize(camera.getViewDirection()); //odd... u = glm::normalize(glm::cross(glm::vec3(0,1,0), w)); v = glm::cross(w, u); lower_left = origin - half_width * u - half_height * v - w; hor = 2.0f * half_width * u; ver = 2.0f * half_height * v; pt_assert(cl_set_pinhole_cam_arg(origin, lower_left, hor, ver, cam_buffer, cmd_queue), "Could not fill camera buffer"); clStatus = cmd_queue.enqueueAcquireGLObjects(&img_buffer, NULL, NULL); pt_assert(clStatus, "Could not acquire gl objects"); cl::Event profiling_evt; clStatus = cmd_queue.enqueueNDRangeKernel(kernel, cl::NDRange(0,0), cl::NDRange(img_width, img_height), cl::NDRange(local_width,local_height), NULL, &profiling_evt); profiling_evt.wait(); pt_assert(clStatus, "Could not enqueue the kernel"); clStatus = cmd_queue.enqueueReleaseGLObjects(&img_buffer, NULL, NULL); pt_assert(clStatus, "Could not release gl objects"); cmd_queue.finish(); cl_ulong time_start = profiling_evt.getProfilingInfo<CL_PROFILING_COMMAND_START>(); cl_ulong time_end = profiling_evt.getProfilingInfo<CL_PROFILING_COMMAND_END>(); cl_ulong total_time = time_end - time_start; std::cout << "Total time: " << total_time * 0.001 * 0.001 << " ms \n"; /* * END - each frame part */ gl::draw(imgTex, Rectf(0, 0, getWindowWidth(), getWindowHeight())); }
int clPeak::runKernelLatency(cl::CommandQueue &queue, cl::Program &prog, device_info_t &devInfo) { if(!isKernelLatency) return 0; cl::Context ctx = queue.getInfo<CL_QUEUE_CONTEXT>(); cl_uint numItems = (devInfo.maxWGSize) * (devInfo.numCUs) * FETCH_PER_WI; cl::NDRange globalSize = (numItems / FETCH_PER_WI); cl::NDRange localSize = devInfo.maxWGSize; int iters = devInfo.kernelLatencyIters; float latency; try { log->print(NEWLINE TAB TAB "Kernel launch latency : "); log->xmlOpenTag("kernel_launch_latency"); log->xmlAppendAttribs("unit", "us"); cl::Buffer inputBuf = cl::Buffer(ctx, CL_MEM_READ_ONLY, (numItems * sizeof(float))); cl::Buffer outputBuf = cl::Buffer(ctx, CL_MEM_WRITE_ONLY, (numItems * sizeof(float))); cl::Kernel kernel_v1(prog, "global_bandwidth_v1_local_offset"); kernel_v1.setArg(0, inputBuf), kernel_v1.setArg(1, outputBuf); // Dummy calls queue.enqueueNDRangeKernel(kernel_v1, cl::NullRange, globalSize, localSize); queue.enqueueNDRangeKernel(kernel_v1, cl::NullRange, globalSize, localSize); queue.finish(); latency = 0; for(int i=0; i<iters; i++) { cl::Event timeEvent; queue.enqueueNDRangeKernel(kernel_v1, cl::NullRange, globalSize, localSize, NULL, &timeEvent); queue.finish(); cl_ulong start = timeEvent.getProfilingInfo<CL_PROFILING_COMMAND_QUEUED>() / 1000; cl_ulong end = timeEvent.getProfilingInfo<CL_PROFILING_COMMAND_START>() / 1000; latency += (float)((int)end - (int)start); } latency /= iters; log->print(latency); log->print(" us" NEWLINE); log->xmlSetContent(latency); log->xmlCloseTag(); } catch(cl::Error error) { log->print(error.err() + NEWLINE); log->print(TAB TAB "Tests skipped" NEWLINE); return -1; } return 0; }
/**
 * Enqueues a marker on the given queue, waits for the queue to drain and
 * returns the marker's event.
 *
 * Uses enqueueMarker when OpenCL 1.2 is unavailable or the deprecated 1.1
 * APIs are explicitly enabled, and enqueueMarkerWithWaitList otherwise.
 *
 * @param queue command queue to place the marker on
 * @return event associated with the marker
 */
cl::Event RuntimeMeasurementsManager::enqueueNewMarker(cl::CommandQueue queue)
{
    cl::Event event;
#if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
    // Use deprecated API
    queue.enqueueMarker(&event);
#else
    // The terminating semicolon was missing here, breaking OpenCL 1.2 builds.
    queue.enqueueMarkerWithWaitList(NULL, &event);
#endif
    queue.finish();
    return event;
}
void initParticles(uint32_t num_particles) { m_geom = gl::Geometry::create(); m_geom->setPrimitiveType(GL_POINTS); m_mesh = gl::Mesh::create(m_geom, m_pointMaterial); m_numParticles = num_particles; GLsizei numBytes = m_numParticles * sizeof(vec4); m_geom->vertices().resize(m_numParticles, vec3(0)); m_geom->colors().resize(m_numParticles, vec4(1)); m_geom->point_sizes().resize(m_numParticles, 9.f); m_geom->createGLBuffers(); m_mesh->material()->setPointSize(2.f); scene().addObject(m_mesh); try { // shared position buffer for OpenGL / OpenCL m_positions = cl::BufferGL(m_context, CL_MEM_READ_WRITE, m_geom->vertexBuffer().id()); m_colors = cl::BufferGL(m_context, CL_MEM_READ_WRITE, m_geom->colorBuffer().id()); //create the OpenCL only arrays m_velocities = cl::Buffer( m_context, CL_MEM_WRITE_ONLY, numBytes ); m_positionGen = cl::Buffer( m_context, CL_MEM_WRITE_ONLY, numBytes ); m_velocityGen = cl::Buffer( m_context, CL_MEM_WRITE_ONLY, numBytes ); vector<vec4> posGen, velGen; for (int i = 0; i < m_numParticles; i++) { posGen.push_back( vec4(glm::ballRand(20.0f), 1.f) ); vec2 tmp = glm::linearRand(vec2(-100), vec2(100)); float life = kinski::random(2.f, 5.f); float yVel = kinski::random<float>(5, 15); velGen.push_back(vec4(tmp.x, yVel, tmp.y, life)); m_geom->point_sizes()[i] = kinski::random(5.f, 15.f); } m_geom->createGLBuffers(); m_queue.enqueueWriteBuffer(m_velocities, CL_TRUE, 0, numBytes, &velGen[0]); m_queue.enqueueWriteBuffer(m_positionGen, CL_TRUE, 0, numBytes, &posGen[0]); m_queue.enqueueWriteBuffer(m_velocityGen, CL_TRUE, 0, numBytes, &velGen[0]); m_particleKernel.setArg(0, m_positions); m_particleKernel.setArg(1, m_colors); m_particleKernel.setArg(2, m_velocities); m_particleKernel.setArg(3, m_positionGen); m_particleKernel.setArg(4, m_velocityGen); } catch(cl::Error &error) { LOG_ERROR << error.what() << "(" << oclErrorString(error.err()) << ")"; } }
void helper(uint32_t* out, int osize, uint8_t* in, int isize, int w, int h, int choice) { int set_size=8; try { cl::Buffer bufferIn = cl::Buffer(gContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, isize*sizeof(cl_uchar), in, NULL); cl::Buffer bufferOut = cl::Buffer(gContext, CL_MEM_READ_WRITE, osize*sizeof(cl_uchar4)); cl::Buffer bufferOut2= cl::Buffer(gContext, CL_MEM_READ_WRITE, osize*sizeof(cl_uchar4)); gNV21Kernel.setArg(2,w); gNV21Kernel.setArg(3,h); gNV21Kernel.setArg(1,bufferIn); gNV21Kernel.setArg(0,bufferOut); gQueue.enqueueNDRangeKernel(gNV21Kernel, cl::NullRange, cl::NDRange( (int)ceil((float)w/16.0f)*16,(int)ceil((float)h/16.0f)*16), cl::NDRange(set_size,set_size), NULL, NULL); if (choice==1) { gLaplacianK.setArg(2,w); gLaplacianK.setArg(3,h); gLaplacianK.setArg(1,bufferOut); gLaplacianK.setArg(0,bufferOut2); gQueue.enqueueNDRangeKernel(gLaplacianK, cl::NullRange, cl::NDRange( (int)ceil((float)w/16.0f)*16,(int)ceil((float)h/16.0f)*16), cl::NDRange(set_size,set_size), NULL, NULL); } else if (choice>1) { gNegative.setArg(2,w); gNegative.setArg(3,h); gNegative.setArg(1,bufferOut); gNegative.setArg(0,bufferOut2); gQueue.enqueueNDRangeKernel(gNegative, cl::NullRange, cl::NDRange( (int)ceil((float)w/16.0f)*16,(int)ceil((float)h/16.0f)*16), cl::NDRange(set_size,set_size), NULL, NULL); } gQueue.enqueueReadBuffer(bufferOut2, CL_TRUE, 0, osize*sizeof(cl_uchar4), out); } catch (cl::Error e) { LOGI("@oclDecoder: %s %d \n",e.what(),e.err()); } }
/** * generate 64 bit unsigned random numbers in device global memory *@param tinymt_status device global memories *@param total_num total number of work items *@param local_num number of local work items *@param data_size number of data to generate */ static void generate_uint64(Buffer& tinymt_status, int total_num, int local_num, int data_size) { #if defined(DEBUG) cout << "generate_uint64 start" << endl; #endif int min_size = total_num; if (data_size % min_size != 0) { data_size = (data_size / min_size + 1) * min_size; } Kernel uint_kernel(program, "tinymt_uint64_kernel"); Buffer output_buffer(context, CL_MEM_READ_WRITE, data_size * sizeof(uint64_t)); uint_kernel.setArg(0, tinymt_status); uint_kernel.setArg(1, output_buffer); uint_kernel.setArg(2, data_size / total_num); NDRange global(total_num); NDRange local(local_num); Event generate_event; #if defined(DEBUG) cout << "generate_uint64 enque kernel start" << endl; #endif queue.enqueueNDRangeKernel(uint_kernel, NullRange, global, local, NULL, &generate_event); uint64_t * output = new uint64_t[data_size]; generate_event.wait(); queue.enqueueReadBuffer(output_buffer, CL_TRUE, 0, data_size * sizeof(uint64_t), output); check_data(output, data_size, total_num); #if defined(DEBUG) print_uint64(output, data_size, total_num); #endif double time = get_time(generate_event); cout << "generate time:" << time * 1000 << "ms" << endl; delete[] output; #if defined(DEBUG) cout << "generate_uint64 end" << endl; #endif }
/**
 * Copies the device buffer back into the host storage with a blocking read
 * of m_cb bytes.
 *
 * @param queue command queue used for the transfer
 * @throws cl::Error (CL_INVALID_MEM_OBJECT) if the host storage is NULL
 */
void copyFromDevice(cl::CommandQueue &queue)
{
    const bool hostStorageMissing = (m_pElts==NULL);
    if (hostStorageMissing)
    {
        throw cl::Error(CL_INVALID_MEM_OBJECT, "copyFromDevice - Buffer is not initialised.");
    }

    // CL_TRUE: the call returns only after the data has arrived in m_pElts.
    queue.enqueueReadBuffer(m_buffer, CL_TRUE, 0, m_cb, m_pElts);
}
/**
 * Enqueues the "julia" kernel so that it covers a DIMX x DIMY domain and
 * writes into devOut.
 *
 * The program is built and the kernel object created on the first call only;
 * later calls reuse them.
 *
 * @param devOut device buffer bound as kernel argument 0
 * @param queue  command queue to launch on; its context is used for the build
 */
void kernel(cl::Buffer& devOut, cl::CommandQueue& queue)
{
    static std::once_flag buildOnce;
    static cl::Program    program;
    static cl::Kernel     juliaKernel;

    // One-time build of the program and lookup of the kernel.
    std::call_once(buildOnce, [&queue]() {
        const cl::Context ctx = queue.getInfo<CL_QUEUE_CONTEXT>();
        program     = cl::Program(ctx, fractal_ocl_kernel, true);
        juliaKernel = cl::Kernel(program, "julia");
    });

    // 8x8 work-groups; the global size is rounded up to whole groups.
    static const NDRange groupSize(8, 8);
    NDRange gridSize(groupSize[0] * divup(DIMX, groupSize[0]),
                     groupSize[1] * divup(DIMY, groupSize[1]));

    juliaKernel.setArg(0, devOut);
    juliaKernel.setArg(1, DIMX);
    juliaKernel.setArg(2, DIMY);

    queue.enqueueNDRangeKernel(juliaKernel, cl::NullRange, gridSize, groupSize);
}
/**
 * Runs the scan functor in place on a buffer holding 1, 2, 3, ... and checks
 * the result with testOutput.
 *
 * @param context OpenCL context used for the buffer and the functor build
 * @param queue   command queue used for the functor build and the read-back
 * @return true if the test FAILED, false if it passed
 */
bool runTestType(cl::Context context, cl::CommandQueue queue)
{
  const cl_uint elementCount = 1024 * 2 + 15;

  std::vector<T> input(elementCount);

  std::cout << "##Testing scan for " << input.size();
  std::cout << " elements and type " << magnet::CL::detail::traits<T>::kernel_type();

  // Fill with 1..N.
  for (size_t idx = 0; idx < input.size(); ++idx)
    {
      input[idx] = idx+1;
    }

  // create input buffer using pinned memory
  const cl_mem_flags flags = CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE;
  cl::Buffer deviceData(context, flags, sizeof(T) * input.size(), &input[0]);

  magnet::CL::scan<T> scanFunctor;
  scanFunctor.build(queue, context);
  scanFunctor(deviceData, deviceData);

  std::vector<T> output(elementCount);
  queue.enqueueReadBuffer(deviceData, CL_TRUE, 0, input.size() * sizeof(T), &output[0]);

  const bool passed = testOutput(input, output);
  std::cout << (passed ? " PASSED" : " FAILED") << std::endl;
  return !passed;
}
/**
 * Enqueues a kernel that waits on every event currently in `events`, then
 * appends the launch's own event to that list.
 *
 * @param queue      command queue to launch on
 * @param kernel     kernel to run
 * @param globalSize global work size
 * @param groupSize  work-group size
 * @param events     in/out: wait list for this launch; the new event is
 *                   pushed onto it
 * @return the event of the enqueued kernel
 */
cl::Event runKernel(const cl::CommandQueue& queue,
                    const cl::Kernel& kernel,
                    const cl::NDRange& globalSize,
                    const cl::NDRange& groupSize,
                    std::vector<cl::Event>& events)
{
    cl::Event completion;

    queue.enqueueNDRangeKernel(kernel,
                               cl::NullRange,
                               globalSize,
                               groupSize,
                               &events,
                               &completion);

    // Record the launch so later launches can wait on it.
    events.push_back(completion);
    return completion;
}
/**
 * Sorts a descending sequence (N-1 ... 0) with the bitonic sort functor and
 * checks the result with testOutput.
 *
 * @param context OpenCL context used for the buffer and the functor build
 * @param queue   command queue used for the functor build and the read-back
 * @throws via M_throw() when testOutput reports a mismatch
 */
void runTestType(cl::Context context, cl::CommandQueue queue)
{
  const cl_uint elementCount = 2 << 10;

  std::vector<T> input(elementCount);

  std::cout << "##Testing bitonic sort for " << input.size()
	    << " elements and type "
	    << magnet::CL::detail::traits<T>::kernel_type()
	    << std::endl;

  // Fill in reverse order so the sort has work to do.
  for (size_t idx = 0; idx < input.size(); ++idx)
    {
      input[idx] = input.size() - idx - 1;
    }

  // create input buffer using pinned memory
  const cl_mem_flags flags = CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE;
  cl::Buffer deviceData(context, flags, sizeof(T) * input.size(), &input[0]);

  magnet::CL::bitonicSort<T> bitonicSortFunctor;
  bitonicSortFunctor.build(queue, context);
  bitonicSortFunctor(deviceData);

  std::vector<T> output(elementCount);
  queue.enqueueReadBuffer(deviceData, CL_TRUE, 0, input.size() * sizeof(T), &output[0]);

  const bool sortedCorrectly = testOutput(input, output);
  if (!sortedCorrectly)
    {
      M_throw() << "Incorrect output for size "
		<< input.size()
		<< " and type "
		<< magnet::CL::detail::traits<T>::kernel_type();
    }
}
inline void OpenCL::addkernelarg(std::size_t i, std::vector<T> const & arg, cl::Kernel & kernel,cl::CommandQueue &quene) const { cl::Buffer buffer(this->context,CL_MEM_READ_WRITE,arg.size()*sizeof(T)); // std::cout << "enqeue\n"; quene.enqueueWriteBuffer(buffer,CL_FALSE,0,sizeof(T)*arg.size(),&(arg[0])); kernel.setArg(i,buffer); }
/**
 * Uploads a fixed-size array into a new device buffer and binds that buffer
 * as kernel argument `i`.
 *
 * The upload is enqueued with CL_FALSE (non-blocking), so `arg` must remain
 * valid and unmodified until the write has completed on `quene`.
 *
 * @param i      index of the kernel argument to set
 * @param arg    host array of N elements to upload
 * @param kernel kernel whose argument is set
 * @param quene  command queue used for the upload
 */
inline void OpenCL::addkernelarg(std::size_t i,
                                 T const (& arg)[N],
                                 cl::Kernel & kernel,
                                 cl::CommandQueue &quene) const
{
    const std::size_t byteCount = N*sizeof(T);

    cl::Buffer deviceCopy(this->context, CL_MEM_READ_WRITE, byteCount);

    // Non-blocking host-to-device copy of the whole array.
    quene.enqueueWriteBuffer(deviceCopy, CL_FALSE, 0, sizeof(T)*N, &arg);

    kernel.setArg(i, deviceCopy);
}
/**
 * Hands the framebuffer back after OpenCL has finished with it.
 *
 * When the buffer is shared, a GL release is enqueued and its event is
 * returned. Otherwise the device buffer is read into the local staging
 * memory, the call waits for that read, the texture buffer is loaded from
 * the staging memory and a default-constructed event is returned.
 *
 * @param queue command queue used for the release or read-back
 * @param evt   event the enqueued operation depends on
 * @return event of the release, or an empty event on the non-shared path
 */
CL::Event OGLSharedFramebuffer::release(CL::CommandQueue& queue, const CL::Event& evt)
{
    if (!_shared) {
        assert(_local);

        // Non-shared path: copy through host memory and block until done.
        CL::Event readDone = queue.enq_read_buffer(*_cl_buffer, _local, _tex_buffer.get_size(),
                                                   "read framebuffer", evt);
        queue.wait_for_events(readDone);
        _tex_buffer.load(_local);

        return CL::Event();
    }

    CL::Event released = queue.enq_GL_release(_cl_buffer->get(), "release framebuffer", evt);
    return released;
}
void procOCL_I2I(int texIn, int texOut, int w, int h) { if(!haveOpenCL) return; LOGD("procOCL_I2I(%d, %d, %d, %d)", texIn, texOut, w, h); cl::ImageGL imgIn (theContext, CL_MEM_READ_ONLY, GL_TEXTURE_2D, 0, texIn); cl::ImageGL imgOut(theContext, CL_MEM_WRITE_ONLY, GL_TEXTURE_2D, 0, texOut); std::vector < cl::Memory > images; images.push_back(imgIn); images.push_back(imgOut); int64_t t = getTimeMs(); theQueue.enqueueAcquireGLObjects(&images); theQueue.finish(); LOGD("enqueueAcquireGLObjects() costs %d ms", getTimeInterval(t)); t = getTimeMs(); cl::Kernel Laplacian(theProgI2I, "Laplacian"); //TODO: may be done once Laplacian.setArg(0, imgIn); Laplacian.setArg(1, imgOut); theQueue.finish(); LOGD("Kernel() costs %d ms", getTimeInterval(t)); t = getTimeMs(); theQueue.enqueueNDRangeKernel(Laplacian, cl::NullRange, cl::NDRange(w, h), cl::NullRange); theQueue.finish(); LOGD("enqueueNDRangeKernel() costs %d ms", getTimeInterval(t)); t = getTimeMs(); theQueue.enqueueReleaseGLObjects(&images); theQueue.finish(); LOGD("enqueueReleaseGLObjects() costs %d ms", getTimeInterval(t)); }
/**
 * Reads `size` elements starting at element `offset` from the device buffer
 * into host memory. A zero `size` is a no-op.
 *
 * @param q        command queue used for the transfer
 * @param offset   first element to read (in elements, not bytes)
 * @param size     number of elements to read
 * @param host     destination host memory
 * @param blocking when true the call returns only after the read completed;
 *                 when false the read is merely enqueued
 */
void read(const cl::CommandQueue &q, size_t offset, size_t size, T *host, bool blocking = false) const
{
    if (!size)
        return;

    const cl_bool waitForCompletion = blocking ? CL_TRUE : CL_FALSE;
    const size_t  byteOffset        = sizeof(T) * offset;
    const size_t  byteCount         = sizeof(T) * size;

    q.enqueueReadBuffer(buffer, waitForCompletion, byteOffset, byteCount, host);
}
/**
 * Computes the square root of a device-side reduction over a 3D buffer.
 *
 * The kernel returned by CLContextLoader::getRedL2NormKer() is run with one
 * work-item per cell, taking `in` as argument 0 and a scratch buffer as
 * argument 1. The scratch buffer is then reduced with performReduction using
 * CLContextLoader::getRedSumAllKer(), and the first `real` of the result is
 * read back with a blocking read.
 *
 * @param in input buffer
 * @param q  command queue used for the kernel, the reduction and the read
 * @return sqrt of the reduced value
 */
real L2Norm(const Buffer3D & in,cl::CommandQueue & q)
{
    // Total number of cells; used as launch size and reduction length.
    const auto cellCount = in.width()*in.height()*in.depth();

    cl::Buffer scratch(CLContextLoader::getContext(),
                       CL_MEM_READ_WRITE,
                       sizeof(real)*in.width()*in.height()*in.depth());

    auto && normKernel = CLContextLoader::getRedL2NormKer();
    normKernel.setArg(0, in());
    normKernel.setArg(1, scratch());

    q.enqueueNDRangeKernel(normKernel,
                           cl::NDRange(0),
                           cl::NDRange(cellCount),
                           getBestWorkspaceDim(cl::NDRange(cellCount)));

    scratch = performReduction(scratch, CLContextLoader::getRedSumAllKer(), q, cellCount);

    real reduced;
    q.enqueueReadBuffer(scratch, true, 0, sizeof(real), &reduced);
    return sqrt(reduced);
}
/**
 * Starts a non-blocking copy of m_cb bytes from the host storage to the
 * device buffer.
 *
 * The host storage must stay valid until the returned event has completed.
 *
 * @param queue command queue used for the transfer
 * @return event that signals completion of the write
 * @throws cl::Error (CL_INVALID_MEM_OBJECT) if the host storage is NULL
 */
cl::Event copyToDeviceAsync(cl::CommandQueue &queue)
{
    const bool hostStorageMissing = (m_pElts==NULL);
    if (hostStorageMissing)
    {
        throw cl::Error(CL_INVALID_MEM_OBJECT, "copyToDevice - Buffer is not initialised.");
    }

    cl::Event writeDone;
    // CL_FALSE: only enqueue the write; completion is reported via the event.
    queue.enqueueWriteBuffer(m_buffer, CL_FALSE, 0, m_cb, m_pElts, NULL, &writeDone);
    return writeDone;
}
/**
 * Makes the framebuffer available to OpenCL.
 *
 * When the buffer is shared, a GL acquire is enqueued and its event is
 * returned; otherwise nothing is enqueued and the incoming event is passed
 * through unchanged.
 *
 * @param queue command queue used for the acquire
 * @param e     event the acquire depends on
 * @return event to wait on before using the framebuffer
 */
CL::Event OGLSharedFramebuffer::acquire(CL::CommandQueue& queue, const CL::Event& e)
{
    // Nothing to acquire when the buffer is not shared.
    if (!_shared) {
        return e;
    }

    return queue.enq_GL_acquire(_cl_buffer->get(), "acquire framebuffer", e);
}
/** * initialize tinymt status in device global memory * using 1 parameter for 1 generator. *@param tinymt_status internal state of kernel side tinymt *@param total total number of work items *@param local_item number of local work items *@param seed seed for initialization */ static void initialize_by_seed(Buffer& tinymt_status, int total, int local_item, uint32_t seed) { #if defined(DEBUG) cout << "initialize_by_seed start" << endl; #endif Kernel init_kernel(program, "tinymt_init_seed_kernel"); init_kernel.setArg(0, tinymt_status); init_kernel.setArg(1, seed); NDRange global(total); NDRange local(local_item); Event event; #if defined(DEBUG) cout << "global:" << dec << total << endl; cout << "group:" << dec << (total / local_item) << endl; cout << "local:" << dec << local_item << endl; #endif queue.enqueueNDRangeKernel(init_kernel, NullRange, global, local, NULL, &event); double time = get_time(event); tinymt32j_t status[total]; queue.enqueueReadBuffer(tinymt_status, CL_TRUE, 0, sizeof(tinymt32j_t) * total, status); cout << "initializing time = " << time * 1000 << "ms" << endl; #if defined(DEBUG) cout << "status[0].s0:" << hex << status[0].s0 << endl; cout << "status[0].s1:" << hex << status[0].s1 << endl; cout << "status[0].s2:" << hex << status[0].s2 << endl; cout << "status[0].s3:" << hex << status[0].s3 << endl; #endif check_status(status, total); #if defined(DEBUG) cout << "initialize_by_seed end" << endl; #endif }
void sumTest(cl::Buffer queue_data, cl::Buffer queue_metadata, cl::Buffer& device_result, int iterations, ProgramCache& cache, cl::CommandQueue& queue) { cl::Context context = queue.getInfo<CL_QUEUE_CONTEXT>(); std::vector<std::string> sources; sources.push_back("ParallelQueue"); sources.push_back("ParallelQueueTests"); cl::Program& program = cache.getProgram(sources); cl::Kernel sum_test_kernel(program, "sum_test"); cl::Device device = queue.getInfo<CL_QUEUE_DEVICE>(); int warp_size = sum_test_kernel .getWorkGroupInfo<CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE>(device); std::cout << "warp size: " << warp_size << std::endl; int max_group_size = device.getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0]; int queue_num_threads = 512; if(queue_num_threads > max_group_size) queue_num_threads = max_group_size; cl::LocalSpaceArg local_queue = cl::__local(sizeof(int) * queue_num_threads * 2); cl::LocalSpaceArg reduction_buffer = cl::__local(sizeof(int) * queue_num_threads); cl::LocalSpaceArg got_work = cl::__local(sizeof(int)); cl::LocalSpaceArg prefix_sum_input = cl::__local(sizeof(int) * queue_num_threads); cl::LocalSpaceArg prefix_sum_output = cl::__local(sizeof(int) * queue_num_threads); sum_test_kernel.setArg(0, queue_data); sum_test_kernel.setArg(1, queue_metadata); sum_test_kernel.setArg(2, device_result); sum_test_kernel.setArg(3, iterations); sum_test_kernel.setArg(4, local_queue); sum_test_kernel.setArg(5, reduction_buffer); sum_test_kernel.setArg(6, got_work); sum_test_kernel.setArg(7, prefix_sum_input); sum_test_kernel.setArg(8, prefix_sum_output); cl::NDRange nullRange; cl::NDRange global(queue_num_threads, 1); cl::NDRange local(queue_num_threads, 1); cl_int status = queue.enqueueNDRangeKernel(sum_test_kernel, nullRange, global, local); }
/**
 * Enqueues the clear kernel over the whole framebuffer.
 *
 * The configured clear color has each of its x/y/z components raised to the
 * power 2.2 before being passed to the kernel; the fourth component passed
 * is the constant 1000.
 *
 * @param queue command queue to launch on
 * @param e     event the launch depends on
 * @return event of the enqueued clear kernel
 */
CL::Event Framebuffer::clear(CL::CommandQueue& queue, const CL::Event& e)
{
    _clear_kernel.set_arg(0, _cl_buffer->get());

    const vec4 configured = config.clear_color();
    const float exponent = 2.2f;
    const float r = powf(configured.x, exponent);
    const float g = powf(configured.y, exponent);
    const float b = powf(configured.z, exponent);
    _clear_kernel.set_arg(1, vec4(r, g, b, 1000));

    // One work-item per pixel, in groups of 256.
    return queue.enq_kernel(_clear_kernel, _size.x * _size.y, 256, "clear framebuffer", e);
}
/** * initialize tinymt status in device global memory * using 1 parameter for all generators. *@param tinymt_status device global memories *@param total total number of work items *@param local_item number of local work items *@param seed_array seeds for initialization *@param seed_size size of seed_array */ static void initialize_by_array(Buffer& tinymt_status, int total, int local_item, uint64_t seed_array[], int seed_size) { #if defined(DEBUG) cout << "initialize_by_array start" << endl; #endif Buffer seed_array_buffer(context, CL_MEM_READ_WRITE, seed_size * sizeof(uint64_t)); queue.enqueueWriteBuffer(seed_array_buffer, CL_TRUE, 0, seed_size * sizeof(uint64_t), seed_array); Kernel init_kernel(program, "tinymt_init_array_kernel"); init_kernel.setArg(0, tinymt_status); init_kernel.setArg(1, seed_array_buffer); init_kernel.setArg(2, seed_size); NDRange global(total); NDRange local(local_item); Event event; queue.enqueueNDRangeKernel(init_kernel, NullRange, global, local, NULL, &event); double time = get_time(event); tinymt64j_t status[total]; queue.enqueueReadBuffer(tinymt_status, CL_TRUE, 0, sizeof(tinymt64j_t) * total, status); cout << "initializing time = " << time * 1000 << "ms" << endl; check_status(status, total); #if defined(DEBUG) cout << "initialize_by_array end" << endl; #endif }
void findMinSeamVert(cl::Context &ctx, cl::CommandQueue &cmdQueue, cl::Event &event, std::vector<cl::Event> &deps, cl::Buffer &energyMatrix, cl::Buffer &vertMinEnergy, cl::Buffer &vertMinIdx, int width, int height, int pitch, int colsRemoved) { cl_int errNum; errNum = findMinSeamVertKernel.setArg(0, energyMatrix); errNum |= findMinSeamVertKernel.setArg(1, vertMinEnergy); errNum |= findMinSeamVertKernel.setArg(2, vertMinIdx); errNum |= findMinSeamVertKernel.setArg(3, cl::__local(256 * sizeof(float))); errNum |= findMinSeamVertKernel.setArg(4, cl::__local(256 * sizeof(float))); errNum |= findMinSeamVertKernel.setArg(5, width); errNum |= findMinSeamVertKernel.setArg(6, height); errNum |= findMinSeamVertKernel.setArg(7, pitch); errNum |= findMinSeamVertKernel.setArg(8, colsRemoved); if (errNum != CL_SUCCESS) { std::cerr << "Error setting findMinSeamVert arguments." << std::endl; exit(-1); } // This kernel could be written to use more than one work group, but its probably not worth it. cl::NDRange offset = cl::NDRange(0); cl::NDRange localWorkSize = cl::NDRange(256); cl::NDRange globalWorkSize = cl::NDRange(256); errNum = cmdQueue.enqueueNDRangeKernel(findMinSeamVertKernel, offset, globalWorkSize, localWorkSize, &deps, &event); if (errNum != CL_SUCCESS) { std::cerr << "Error enqueuing computeSeams kernel for execution." << std::endl; exit(-1); } /** DEBUG **/ // int deviceResultIdx[1]; // float deviceResultEnergy[1]; // mem::read(ctx, cmdQueue, deviceResultIdx, vertMinIdx); // mem::read(ctx, cmdQueue, deviceResultEnergy, vertMinEnergy); // std::cout << "deviceResultIdx = " << deviceResultIdx[0] << std::endl; // std::cout << "deviceResultEnergy = " << deviceResultEnergy[0] << std::endl; }
/**
 * Runs the kernel over an N x N output and leaves the result in `buf`.
 *
 * The device buffer is created over `buf` with CL_MEM_USE_HOST_PTR. The
 * kernel is launched with N*N work-items, the call waits for it, and the
 * buffer is then read back into `buf` with a blocking read. Every OpenCL
 * status is passed to checkErr.
 *
 * @param buf host memory for N*N elements of T; used as backing store and as
 *            destination of the read-back
 */
void run(T* buf)
{
    cl_int status;
    const size_t imageBytes = N*N*sizeof(T);

    cl::Buffer deviceImage(m_context,
                           CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
                           imageBytes,
                           buf,
                           &status);
    checkErr(status, "Buffer::Buffer()");

    status = m_kernel.setArg(0, deviceImage);
    checkErr(status, "Kernel::setArg(0)");

    status = m_kernel.setArg(1, N);
    checkErr(status, "Kernel::setArg(1)");

    status = m_kernel.setArg(2, depth);
    checkErr(status, "Kernel::setArg(2)");

    status = m_kernel.setArg(3, escape2);
    checkErr(status, "Kernel::setArg(3)");

    cl::Event kernelDone;
    status = m_cmdq.enqueueNDRangeKernel(m_kernel,
                                         cl::NullRange,
                                         cl::NDRange(N*N),
                                         cl::NDRange(N, 1),
                                         NULL,
                                         &kernelDone);
    checkErr(status, "ComamndQueue::enqueueNDRangeKernel()");

    // The kernel must have finished before the buffer is read back.
    kernelDone.wait();

    status = m_cmdq.enqueueReadBuffer(deviceImage, CL_TRUE, 0, imageBytes, buf);
    checkErr(status, "ComamndQueue::enqueueReadBuffer()");
}