void PathOCLRenderThread::EnqueueAdvancePathsKernel(cl::CommandQueue &oclQueue) {
    PathOCLRenderEngine *engine = (PathOCLRenderEngine *)renderEngine;
    const u_int taskCount = engine->taskCount;

    // Micro kernels version
    oclQueue.enqueueNDRangeKernel(*advancePathsKernel_MK_RT_NEXT_VERTEX, cl::NullRange,
            cl::NDRange(taskCount), cl::NDRange(advancePathsWorkGroupSize));
    oclQueue.enqueueNDRangeKernel(*advancePathsKernel_MK_HIT_NOTHING, cl::NullRange,
            cl::NDRange(taskCount), cl::NDRange(advancePathsWorkGroupSize));
    oclQueue.enqueueNDRangeKernel(*advancePathsKernel_MK_HIT_OBJECT, cl::NullRange,
            cl::NDRange(taskCount), cl::NDRange(advancePathsWorkGroupSize));
    oclQueue.enqueueNDRangeKernel(*advancePathsKernel_MK_RT_DL, cl::NullRange,
            cl::NDRange(taskCount), cl::NDRange(advancePathsWorkGroupSize));
    oclQueue.enqueueNDRangeKernel(*advancePathsKernel_MK_DL_ILLUMINATE, cl::NullRange,
            cl::NDRange(taskCount), cl::NDRange(advancePathsWorkGroupSize));
    oclQueue.enqueueNDRangeKernel(*advancePathsKernel_MK_DL_SAMPLE_BSDF, cl::NullRange,
            cl::NDRange(taskCount), cl::NDRange(advancePathsWorkGroupSize));
    oclQueue.enqueueNDRangeKernel(*advancePathsKernel_MK_GENERATE_NEXT_VERTEX_RAY, cl::NullRange,
            cl::NDRange(taskCount), cl::NDRange(advancePathsWorkGroupSize));
    oclQueue.enqueueNDRangeKernel(*advancePathsKernel_MK_SPLAT_SAMPLE, cl::NullRange,
            cl::NDRange(taskCount), cl::NDRange(advancePathsWorkGroupSize));
    oclQueue.enqueueNDRangeKernel(*advancePathsKernel_MK_NEXT_SAMPLE, cl::NullRange,
            cl::NDRange(taskCount), cl::NDRange(advancePathsWorkGroupSize));
    oclQueue.enqueueNDRangeKernel(*advancePathsKernel_MK_GENERATE_CAMERA_RAY, cl::NullRange,
            cl::NDRange(taskCount), cl::NDRange(advancePathsWorkGroupSize));
}
float clPeak::run_kernel(cl::CommandQueue &queue, cl::Kernel &kernel,
                         cl::NDRange &globalSize, cl::NDRange &localSize, int iters)
{
    float timed = 0;

    // Dummy calls to warm up the device before timing
    queue.enqueueNDRangeKernel(kernel, cl::NullRange, globalSize, localSize);
    queue.enqueueNDRangeKernel(kernel, cl::NullRange, globalSize, localSize);
    queue.finish();

    if (useEventTimer)
    {
        for (int i = 0; i < iters; i++)
        {
            cl::Event timeEvent;
            queue.enqueueNDRangeKernel(kernel, cl::NullRange, globalSize, localSize, NULL, &timeEvent);
            queue.finish();
            timed += timeInUS(timeEvent);
        }
    }
    else    // std timer
    {
        Timer timer;
        timer.start();
        for (int i = 0; i < iters; i++)
        {
            queue.enqueueNDRangeKernel(kernel, cl::NullRange, globalSize, localSize);
        }
        queue.finish();
        timed = timer.stopAndTime();
    }

    return (timed / iters);
}
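timeInUS() is a clPeak helper not shown here; a minimal sketch of what it presumably computes, assuming the queue was created with CL_QUEUE_PROFILING_ENABLE (the helper name below is hypothetical):

// Sketch only: derive microseconds from an event's profiling counters.
// Assumes a profiling-enabled command queue; counters are in nanoseconds.
static float eventTimeInUS(cl::Event &ev)
{
    cl_ulong start = ev.getProfilingInfo<CL_PROFILING_COMMAND_START>();
    cl_ulong end   = ev.getProfilingInfo<CL_PROFILING_COMMAND_END>();
    return (float)((end - start) / 1000.0);   // ns -> us
}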
int clPeak::runKernelLatency(cl::CommandQueue &queue, cl::Program &prog, device_info_t &devInfo)
{
    if (!isKernelLatency)
        return 0;

    cl::Context ctx = queue.getInfo<CL_QUEUE_CONTEXT>();
    cl_uint numItems = (devInfo.maxWGSize) * (devInfo.numCUs) * FETCH_PER_WI;
    cl::NDRange globalSize = (numItems / FETCH_PER_WI);
    cl::NDRange localSize = devInfo.maxWGSize;
    int iters = devInfo.kernelLatencyIters;
    float latency;

    try
    {
        log->print(NEWLINE TAB TAB "Kernel launch latency : ");
        log->xmlOpenTag("kernel_launch_latency");
        log->xmlAppendAttribs("unit", "us");

        cl::Buffer inputBuf(ctx, CL_MEM_READ_ONLY, (numItems * sizeof(float)));
        cl::Buffer outputBuf(ctx, CL_MEM_WRITE_ONLY, (numItems * sizeof(float)));

        cl::Kernel kernel_v1(prog, "global_bandwidth_v1_local_offset");
        kernel_v1.setArg(0, inputBuf);
        kernel_v1.setArg(1, outputBuf);

        // Dummy calls to warm up the device before timing
        queue.enqueueNDRangeKernel(kernel_v1, cl::NullRange, globalSize, localSize);
        queue.enqueueNDRangeKernel(kernel_v1, cl::NullRange, globalSize, localSize);
        queue.finish();

        latency = 0;
        for (int i = 0; i < iters; i++)
        {
            cl::Event timeEvent;
            queue.enqueueNDRangeKernel(kernel_v1, cl::NullRange, globalSize, localSize, NULL, &timeEvent);
            queue.finish();

            // Launch latency is the gap between the command being queued and it
            // actually starting on the device (profiling counters are in ns).
            cl_ulong start = timeEvent.getProfilingInfo<CL_PROFILING_COMMAND_QUEUED>() / 1000;
            cl_ulong end = timeEvent.getProfilingInfo<CL_PROFILING_COMMAND_START>() / 1000;
            latency += (float)((int)end - (int)start);
        }
        latency /= iters;

        log->print(latency);
        log->print(" us" NEWLINE);
        log->xmlSetContent(latency);
        log->xmlCloseTag();
    }
    catch (cl::Error &error)
    {
        log->print(error.what());
        log->print(NEWLINE);
        log->print(TAB TAB "Tests skipped" NEWLINE);
        return -1;
    }
    return 0;
}
void helper(uint32_t* out, int osize, uint8_t* in, int isize, int w, int h, int choice)
{
    int set_size = 8;
    try
    {
        cl::Buffer bufferIn(gContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                            isize * sizeof(cl_uchar), in, NULL);
        cl::Buffer bufferOut(gContext, CL_MEM_READ_WRITE, osize * sizeof(cl_uchar4));
        cl::Buffer bufferOut2(gContext, CL_MEM_READ_WRITE, osize * sizeof(cl_uchar4));

        // NV21 -> RGBA conversion; each global dimension is rounded up to a
        // multiple of 16, which is also divisible by the 8x8 work-group size.
        gNV21Kernel.setArg(2, w);
        gNV21Kernel.setArg(3, h);
        gNV21Kernel.setArg(1, bufferIn);
        gNV21Kernel.setArg(0, bufferOut);
        gQueue.enqueueNDRangeKernel(gNV21Kernel,
                                    cl::NullRange,
                                    cl::NDRange((int)ceil((float)w / 16.0f) * 16,
                                                (int)ceil((float)h / 16.0f) * 16),
                                    cl::NDRange(set_size, set_size),
                                    NULL, NULL);

        if (choice == 1)
        {
            gLaplacianK.setArg(2, w);
            gLaplacianK.setArg(3, h);
            gLaplacianK.setArg(1, bufferOut);
            gLaplacianK.setArg(0, bufferOut2);
            gQueue.enqueueNDRangeKernel(gLaplacianK,
                                        cl::NullRange,
                                        cl::NDRange((int)ceil((float)w / 16.0f) * 16,
                                                    (int)ceil((float)h / 16.0f) * 16),
                                        cl::NDRange(set_size, set_size),
                                        NULL, NULL);
        }
        else if (choice > 1)
        {
            gNegative.setArg(2, w);
            gNegative.setArg(3, h);
            gNegative.setArg(1, bufferOut);
            gNegative.setArg(0, bufferOut2);
            gQueue.enqueueNDRangeKernel(gNegative,
                                        cl::NullRange,
                                        cl::NDRange((int)ceil((float)w / 16.0f) * 16,
                                                    (int)ceil((float)h / 16.0f) * 16),
                                        cl::NDRange(set_size, set_size),
                                        NULL, NULL);
        }

        // Note: when choice == 0 nothing writes bufferOut2, so this blocking
        // read returns uninitialized data; callers are expected to pass choice >= 1.
        gQueue.enqueueReadBuffer(bufferOut2, CL_TRUE, 0, osize * sizeof(cl_uchar4), out);
    }
    catch (cl::Error e)
    {
        LOGI("@oclDecoder: %s %d \n", e.what(), e.err());
    }
}
void updateParticles(float timeDelta)
{
    try
    {
        vector<cl::Memory> glBuffers;
        glBuffers.push_back(m_positions);
        glBuffers.push_back(m_colors);

        // This will update our system by calculating new velocities and
        // updating the positions of our particles.

        // Make sure OpenGL is done using our VBOs
        glFinish();

        // Map the OpenGL buffer objects for writing from OpenCL;
        // this passes in the vector of VBO buffer objects (position and color)
        m_queue.enqueueAcquireGLObjects(&glBuffers);

        m_particleKernel.setArg(5, timeDelta); // pass in the timestep

        // Execute the kernel
        m_queue.enqueueNDRangeKernel(m_particleKernel, cl::NullRange,
                                     cl::NDRange(m_numParticles), cl::NullRange);

        // Release the VBOs so OpenGL can play with them
        m_queue.enqueueReleaseGLObjects(&glBuffers, NULL);
        m_queue.finish();
    }
    catch (cl::Error &error)
    {
        LOG_ERROR << error.what() << "(" << oclErrorString(error.err()) << ")";
    }
}
void simulationStep()
{
    try
    {
        // Copy the current visualization to a device buffer
        auto buffer = cl::Buffer(context, CL_MEM_READ_ONLY,
                                 sizeof(unsigned char) * 4 * fieldWidth * fieldHeight,
                                 nullptr, nullptr);
        queue.enqueueWriteBuffer(buffer, CL_TRUE, 0,
                                 sizeof(unsigned char) * 4 * fieldWidth * fieldHeight,
                                 visualizationBufferCPU, NULL, NULL);

        // Enqueue one work item per cell
        stepKernel.setArg(2, buffer);
        cl::NDRange global((size_t)(fieldWidth * fieldHeight));
        queue.enqueueNDRangeKernel(stepKernel, cl::NullRange, global, cl::NullRange);

        // Read back
        queue.enqueueReadBuffer(visualizationBufferGPU, CL_TRUE, 0,
                                sizeof(unsigned char) * 4 * fieldWidth * fieldHeight,
                                visualizationBufferCPU, NULL, NULL);

        // Finish
        queue.finish();
    }
    catch (cl::Error err)
    {
        std::cout << "Error: " << err.what() << "(" << err.err() << ")" << std::endl;
        exit(3);
    }

    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, fieldWidth, fieldHeight, 0,
                 GL_RGBA, GL_UNSIGNED_BYTE, visualizationBufferCPU);
}
void procOCL_I2I(int texIn, int texOut, int w, int h)
{
    if (!haveOpenCL)
        return;

    LOGD("procOCL_I2I(%d, %d, %d, %d)", texIn, texOut, w, h);
    cl::ImageGL imgIn (theContext, CL_MEM_READ_ONLY,  GL_TEXTURE_2D, 0, texIn);
    cl::ImageGL imgOut(theContext, CL_MEM_WRITE_ONLY, GL_TEXTURE_2D, 0, texOut);
    std::vector<cl::Memory> images;
    images.push_back(imgIn);
    images.push_back(imgOut);

    int64_t t = getTimeMs();
    theQueue.enqueueAcquireGLObjects(&images);
    theQueue.finish();
    LOGD("enqueueAcquireGLObjects() costs %d ms", getTimeInterval(t));

    t = getTimeMs();
    cl::Kernel Laplacian(theProgI2I, "Laplacian"); // TODO: may be done once
    Laplacian.setArg(0, imgIn);
    Laplacian.setArg(1, imgOut);
    theQueue.finish();
    LOGD("Kernel() costs %d ms", getTimeInterval(t));

    t = getTimeMs();
    theQueue.enqueueNDRangeKernel(Laplacian, cl::NullRange, cl::NDRange(w, h), cl::NullRange);
    theQueue.finish();
    LOGD("enqueueNDRangeKernel() costs %d ms", getTimeInterval(t));

    t = getTimeMs();
    theQueue.enqueueReleaseGLObjects(&images);
    theQueue.finish();
    LOGD("enqueueReleaseGLObjects() costs %d ms", getTimeInterval(t));
}
cl::Event runKernel(const cl::CommandQueue& queue, const cl::Kernel& kernel,
                    const cl::NDRange& globalSize, const cl::NDRange& groupSize,
                    std::vector<cl::Event>& events)
{
    cl::Event event;
    // Wait on everything enqueued so far, then record this launch's event
    // so later launches can depend on it too.
    queue.enqueueNDRangeKernel(kernel, cl::NullRange, globalSize, groupSize, &events, &event);
    events.push_back(event);
    return event;
}
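A minimal usage sketch of this helper: because each call appends its event to the shared vector, every launch waits on all earlier ones. The queue, kernels, and sizes below are hypothetical.

// Sketch only: chain three dependent launches through the shared event list.
// queue, kernelA/kernelB/kernelC, and the ranges are hypothetical.
std::vector<cl::Event> events;
runKernel(queue, kernelA, cl::NDRange(1024), cl::NDRange(64), events);
runKernel(queue, kernelB, cl::NDRange(1024), cl::NDRange(64), events); // waits on A
cl::Event last = runKernel(queue, kernelC, cl::NDRange(1024), cl::NDRange(64), events);
last.wait(); // block until the whole chain has finished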
void kernel(cl::Buffer& devOut, cl::CommandQueue& queue)
{
    static std::once_flag compileFlag;
    static cl::Program prog;
    static cl::Kernel kern;

    // Compile the program and create the kernel exactly once, on first call
    std::call_once(compileFlag, [queue]() {
        prog = cl::Program(queue.getInfo<CL_QUEUE_CONTEXT>(), fractal_ocl_kernel, true);
        kern = cl::Kernel(prog, "julia");
    });
    //auto juliaOp = cl::make_kernel<Buffer, unsigned, unsigned>(kern);

    // Round the global size up to a multiple of the 8x8 work-group size.
    // (NDRange is unqualified here; the surrounding file evidently brings
    // the cl namespace into scope.)
    static const NDRange local(8, 8);
    NDRange global(local[0] * divup(DIMX, local[0]),
                   local[1] * divup(DIMY, local[1]));

    kern.setArg(0, devOut);
    kern.setArg(1, DIMX);
    kern.setArg(2, DIMY);
    queue.enqueueNDRangeKernel(kern, cl::NullRange, global, local);
    //juliaOp(EnqueueArgs(queue, global, local), devOut, DIMX, DIMY);
}
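divup is a project helper not shown here; a minimal sketch of the ceiling-division idiom it presumably implements, so that local * divup(n, local) is the smallest multiple of local that covers n:

// Sketch of the assumed divup helper: ceiling division for positive values.
static inline size_t divup(size_t n, size_t d)
{
    return (n + d - 1) / d;
}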
void PTWeekend::draw()
{
    /*
     * BEGIN - each frame part
     */

    /* Enqueue kernel for execution */

    glm::vec3 origin, lower_left, hor, ver;

    float theta = camera.getFov() * M_PI / 180.0f;
    float half_height = tan(theta / 2.0f);
    float half_width = camera.getAspectRatio() * half_height;

    origin = camera.getEyePoint();

    glm::vec3 u, v, w;
    w = -glm::normalize(camera.getViewDirection()); // odd...
    u = glm::normalize(glm::cross(glm::vec3(0, 1, 0), w));
    v = glm::cross(w, u);

    lower_left = origin - half_width * u - half_height * v - w;
    hor = 2.0f * half_width * u;
    ver = 2.0f * half_height * v;

    pt_assert(cl_set_pinhole_cam_arg(origin, lower_left, hor, ver, cam_buffer, cmd_queue),
              "Could not fill camera buffer");

    clStatus = cmd_queue.enqueueAcquireGLObjects(&img_buffer, NULL, NULL);
    pt_assert(clStatus, "Could not acquire gl objects");

    cl::Event profiling_evt;
    clStatus = cmd_queue.enqueueNDRangeKernel(kernel,
                                              cl::NDRange(0, 0),
                                              cl::NDRange(img_width, img_height),
                                              cl::NDRange(local_width, local_height),
                                              NULL,
                                              &profiling_evt);
    // Check that the enqueue succeeded before waiting on its event
    pt_assert(clStatus, "Could not enqueue the kernel");
    profiling_evt.wait();

    clStatus = cmd_queue.enqueueReleaseGLObjects(&img_buffer, NULL, NULL);
    pt_assert(clStatus, "Could not release gl objects");

    cmd_queue.finish();

    cl_ulong time_start = profiling_evt.getProfilingInfo<CL_PROFILING_COMMAND_START>();
    cl_ulong time_end = profiling_evt.getProfilingInfo<CL_PROFILING_COMMAND_END>();
    cl_ulong total_time = time_end - time_start;
    std::cout << "Total time: " << total_time * 0.001 * 0.001 << " ms \n";

    /*
     * END - each frame part
     */

    gl::draw(imgTex, Rectf(0, 0, getWindowWidth(), getWindowHeight()));
}
void sumTest(cl::Buffer queue_data, cl::Buffer queue_metadata, cl::Buffer& device_result,
             int iterations, ProgramCache& cache, cl::CommandQueue& queue)
{
    cl::Context context = queue.getInfo<CL_QUEUE_CONTEXT>();

    std::vector<std::string> sources;
    sources.push_back("ParallelQueue");
    sources.push_back("ParallelQueueTests");

    cl::Program& program = cache.getProgram(sources);
    cl::Kernel sum_test_kernel(program, "sum_test");

    cl::Device device = queue.getInfo<CL_QUEUE_DEVICE>();
    int warp_size = sum_test_kernel
            .getWorkGroupInfo<CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE>(device);
    std::cout << "warp size: " << warp_size << std::endl;

    // Clamp the work-group size to what the device allows in dimension 0
    int max_group_size = device.getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0];
    int queue_num_threads = 512;
    if (queue_num_threads > max_group_size)
        queue_num_threads = max_group_size;

    cl::LocalSpaceArg local_queue = cl::__local(sizeof(int) * queue_num_threads * 2);
    cl::LocalSpaceArg reduction_buffer = cl::__local(sizeof(int) * queue_num_threads);
    cl::LocalSpaceArg got_work = cl::__local(sizeof(int));
    cl::LocalSpaceArg prefix_sum_input = cl::__local(sizeof(int) * queue_num_threads);
    cl::LocalSpaceArg prefix_sum_output = cl::__local(sizeof(int) * queue_num_threads);

    sum_test_kernel.setArg(0, queue_data);
    sum_test_kernel.setArg(1, queue_metadata);
    sum_test_kernel.setArg(2, device_result);
    sum_test_kernel.setArg(3, iterations);
    sum_test_kernel.setArg(4, local_queue);
    sum_test_kernel.setArg(5, reduction_buffer);
    sum_test_kernel.setArg(6, got_work);
    sum_test_kernel.setArg(7, prefix_sum_input);
    sum_test_kernel.setArg(8, prefix_sum_output);

    // Single work group: global size equals local size
    cl::NDRange nullRange;
    cl::NDRange global(queue_num_threads, 1);
    cl::NDRange local(queue_num_threads, 1);

    cl_int status = queue.enqueueNDRangeKernel(sum_test_kernel, nullRange, global, local);
}
void findMinSeamVert(cl::Context &ctx, cl::CommandQueue &cmdQueue,
                     cl::Event &event, std::vector<cl::Event> &deps,
                     cl::Buffer &energyMatrix, cl::Buffer &vertMinEnergy, cl::Buffer &vertMinIdx,
                     int width, int height, int pitch, int colsRemoved)
{
    cl_int errNum;
    errNum  = findMinSeamVertKernel.setArg(0, energyMatrix);
    errNum |= findMinSeamVertKernel.setArg(1, vertMinEnergy);
    errNum |= findMinSeamVertKernel.setArg(2, vertMinIdx);
    errNum |= findMinSeamVertKernel.setArg(3, cl::__local(256 * sizeof(float)));
    errNum |= findMinSeamVertKernel.setArg(4, cl::__local(256 * sizeof(float)));
    errNum |= findMinSeamVertKernel.setArg(5, width);
    errNum |= findMinSeamVertKernel.setArg(6, height);
    errNum |= findMinSeamVertKernel.setArg(7, pitch);
    errNum |= findMinSeamVertKernel.setArg(8, colsRemoved);

    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error setting findMinSeamVert arguments." << std::endl;
        exit(-1);
    }

    // This kernel could be written to use more than one work group,
    // but it's probably not worth it.
    cl::NDRange offset = cl::NDRange(0);
    cl::NDRange localWorkSize = cl::NDRange(256);
    cl::NDRange globalWorkSize = cl::NDRange(256);

    errNum = cmdQueue.enqueueNDRangeKernel(findMinSeamVertKernel,
                                           offset, globalWorkSize, localWorkSize,
                                           &deps, &event);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error enqueuing findMinSeamVert kernel for execution." << std::endl;
        exit(-1);
    }

    /** DEBUG **/
    // int deviceResultIdx[1];
    // float deviceResultEnergy[1];
    // mem::read(ctx, cmdQueue, deviceResultIdx, vertMinIdx);
    // mem::read(ctx, cmdQueue, deviceResultEnergy, vertMinEnergy);
    // std::cout << "deviceResultIdx = " << deviceResultIdx[0] << std::endl;
    // std::cout << "deviceResultEnergy = " << deviceResultEnergy[0] << std::endl;
}
void Reduce::enqueue(const cl::CommandQueue &commandQueue,
                     const cl::Buffer &inBuffer, const cl::Buffer &outBuffer,
                     ::size_t first, ::size_t elements, ::size_t outPosition,
                     const VECTOR_CLASS<cl::Event> *events, cl::Event *event)
{
    /* Validate parameters */
    if (first + elements < first)
    {
        // Only happens if first + elements overflows. size_t is unsigned so
        // behaviour is well-defined.
        throw cl::Error(CL_INVALID_VALUE, "clogs::Reduce::enqueue: range out of input buffer bounds");
    }
    if (inBuffer.getInfo<CL_MEM_SIZE>() / elementSize < first + elements)
        throw cl::Error(CL_INVALID_VALUE, "clogs::Reduce::enqueue: range out of input buffer bounds");
    if (outBuffer.getInfo<CL_MEM_SIZE>() / elementSize <= outPosition)
        throw cl::Error(CL_INVALID_VALUE, "clogs::Reduce::enqueue: output position out of buffer bounds");
    if (!(inBuffer.getInfo<CL_MEM_FLAGS>() & (CL_MEM_READ_WRITE | CL_MEM_READ_ONLY)))
    {
        throw cl::Error(CL_INVALID_VALUE, "clogs::Reduce::enqueue: input buffer is not readable");
    }
    if (!(outBuffer.getInfo<CL_MEM_FLAGS>() & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY)))
    {
        throw cl::Error(CL_INVALID_VALUE, "clogs::Reduce::enqueue: output buffer is not writable");
    }
    if (elements == 0)
        throw cl::Error(CL_INVALID_GLOBAL_WORK_SIZE, "clogs::Reduce::enqueue: elements is zero");

    // Each of the reduceBlocks work groups reduces a contiguous block of
    // roughly elements / reduceBlocks items.
    const ::size_t blockSize = roundUp(elements, reduceWorkGroupSize * reduceBlocks) / reduceBlocks;

    reduceKernel.setArg(1, outBuffer);
    reduceKernel.setArg(2, (cl_uint) outPosition);
    reduceKernel.setArg(3, inBuffer);
    reduceKernel.setArg(4, (cl_uint) first);
    reduceKernel.setArg(5, (cl_uint) elements);
    reduceKernel.setArg(7, (cl_uint) blockSize);

    cl::Event reduceEvent;
    commandQueue.enqueueNDRangeKernel(reduceKernel,
                                      cl::NullRange,
                                      cl::NDRange(reduceWorkGroupSize * reduceBlocks),
                                      cl::NDRange(reduceWorkGroupSize),
                                      events, &reduceEvent);
    doEventCallback(reduceEvent);

    if (event != NULL)
        *event = reduceEvent;
}
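roundUp here is a clogs-internal helper; a plausible sketch of the round-to-multiple idiom the blockSize computation assumes:

// Sketch of the assumed roundUp helper: smallest multiple of factor >= x.
static ::size_t roundUp(::size_t x, ::size_t factor)
{
    return (x + factor - 1) / factor * factor;
}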
void zeroBuffer(::cl::Context &context, ::cl::CommandQueue &commandQueue,
                ::cl::Buffer &buffer, size_t size,
                std::vector<::cl::Event> *events, ::cl::Event &event)
{
    cl_int status;
    ::cl::Kernel kernel = getKernel(context, "zeroMemory", "utils.cl", utils_cl);

    status = kernel.setArg(0, buffer);

    // One work item per element; let the runtime pick the work-group size
    ::cl::NDRange globalThreads(size);
    status = commandQueue.enqueueNDRangeKernel(kernel, ::cl::NullRange, globalThreads,
                                               ::cl::NullRange, events, &event);
}
void backtrack(cl::Context &ctx, cl::CommandQueue &cmdQueue,
               cl::Event &event, std::vector<cl::Event> &deps,
               cl::Buffer &energyMatrix, cl::Buffer &vertSeamPath, cl::Buffer &vertMinIdx,
               int width, int height, int pitch, int colsRemoved)
{
    cl_int errNum;

    // Set kernel arguments
    errNum  = backtrackKernel.setArg(0, energyMatrix);
    errNum |= backtrackKernel.setArg(1, vertSeamPath);
    errNum |= backtrackKernel.setArg(2, vertMinIdx);
    errNum |= backtrackKernel.setArg(3, width);
    errNum |= backtrackKernel.setArg(4, height);
    errNum |= backtrackKernel.setArg(5, pitch);
    errNum |= backtrackKernel.setArg(6, colsRemoved);

    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error setting backtrack kernel arguments." << std::endl;
        exit(-1);
    }

    cl::NDRange offset = cl::NDRange(0);
    cl::NDRange localWorkSize = cl::NDRange(1);
    cl::NDRange globalWorkSize = cl::NDRange(256);

    errNum = cmdQueue.enqueueNDRangeKernel(backtrackKernel,
                                           offset, globalWorkSize, localWorkSize,
                                           &deps, &event);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error enqueueing backtrack kernel for execution." << std::endl;
        exit(-1);
    }

    // /** DEBUGGING **/
    // int deviceResult[height];
    // mem::read(ctx, cmdQueue, deviceResult, vertSeamPath, height);
    // for (int i = height - 5; i < height; ++i) {
    //     std::cout << "deviceResult[" << i << "]=\t" << deviceResult[i] << std::endl;
    // }
}
void MetaBallsApp::updateParticles()
{
    int random = rand();
    float time = 1.0f / 60.0f;
    mClParticleUpdate.setArg(3, sizeof(float), &time);
    mClParticleUpdate.setArg(4, sizeof(int32_t), &random);

    // Queue the kernel up for execution across the array
    mClCommandQueue.enqueueNDRangeKernel(mClParticleUpdate, cl::NullRange,
                                         cl::NDRange(NUM_PARTICLES));
}
void kernel(cl::Buffer& devOut, cl::Buffer& histOut, cl::CommandQueue& queue)
{
    static std::once_flag compileFlag;
    static cl::Program prog;
    static cl::Kernel kern_img, kern_hist, kern_zero;

    // Compile the program and create the kernels exactly once, on first call
    std::call_once(compileFlag, [queue]() {
        prog = cl::Program(queue.getInfo<CL_QUEUE_CONTEXT>(), fractal_ocl_kernel, true);
        kern_img  = cl::Kernel(prog, "image_gen");
        kern_hist = cl::Kernel(prog, "hist_freq");
        kern_zero = cl::Kernel(prog, "zero_buffer");
    });

    static const NDRange local(16, 16);
    NDRange global(local[0] * divup(DIMX, local[0]),
                   local[1] * divup(DIMY, local[1]));

    // tileSize and persistance evolve across calls to animate the output
    static int tileSize = 32;
    tileSize++;
    persistance += 0.01;

    kern_img.setArg(0, devOut);
    kern_img.setArg(1, DIMX);
    kern_img.setArg(2, DIMY);
    kern_img.setArg(3, persistance);
    kern_img.setArg(4, tileSize);
    queue.enqueueNDRangeKernel(kern_img, cl::NullRange, global, local);

    // Zero the histogram bins before accumulating
    static const NDRange global_hist(NBINS);
    kern_zero.setArg(0, histOut);
    kern_zero.setArg(1, NBINS);
    queue.enqueueNDRangeKernel(kern_zero, cl::NullRange, global_hist);

    kern_hist.setArg(0, devOut);
    kern_hist.setArg(1, histOut);
    kern_hist.setArg(2, DIMX);
    kern_hist.setArg(3, DIMY);
    kern_hist.setArg(4, NBINS);
    queue.enqueueNDRangeKernel(kern_hist, cl::NullRange, global, local);
}
void zeroImage(::cl::Context &context, ::cl::CommandQueue &commandQueue,
               ::cl::Image2D &image,
               std::vector<::cl::Event> *events, ::cl::Event &event)
{
    cl_int status;
    size_t width, height;
    ::cl::Kernel kernel = getKernel(context, "zeroFloatImage", "utils.cl", utils_cl);

    image.getImageInfo(CL_IMAGE_WIDTH, &width);
    image.getImageInfo(CL_IMAGE_HEIGHT, &height);

    status = kernel.setArg(0, image);

    // One work item per pixel
    ::cl::NDRange globalThreads(width, height);
    status = commandQueue.enqueueNDRangeKernel(kernel, ::cl::NullRange, globalThreads,
                                               ::cl::NullRange, events, &event);
}
void bigLocalQueuesTest(cl::Buffer queue_data, cl::Buffer queue_metadata,
                        int iterations, ProgramCache& cache, cl::CommandQueue& queue)
{
    cl::Context context = queue.getInfo<CL_QUEUE_CONTEXT>();

    std::vector<std::string> sources;
    sources.push_back("ParallelQueue");
    sources.push_back("ParallelQueueTests");

    cl::Program& program = cache.getProgram(sources);
    cl::Kernel big_local_queues_test(program, "big_local_queues_test");

    cl::Device device = queue.getInfo<CL_QUEUE_DEVICE>();
    int max_group_size = device.getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0];
    int queue_num_threads = 512;
    if (queue_num_threads > max_group_size)
        queue_num_threads = max_group_size;

    cl::LocalSpaceArg local_queue = cl::__local(sizeof(int) * queue_num_threads * 5);
    cl::LocalSpaceArg got_work = cl::__local(sizeof(int));
    cl::LocalSpaceArg prefix_sum_input = cl::__local(sizeof(int) * queue_num_threads);
    cl::LocalSpaceArg prefix_sum_output = cl::__local(sizeof(int) * queue_num_threads);

    big_local_queues_test.setArg(0, queue_data);
    big_local_queues_test.setArg(1, queue_metadata);
    big_local_queues_test.setArg(2, iterations);
    big_local_queues_test.setArg(3, local_queue);
    big_local_queues_test.setArg(4, got_work);
    big_local_queues_test.setArg(5, prefix_sum_input);
    big_local_queues_test.setArg(6, prefix_sum_output);

    // Single work group: global size equals local size
    cl::NDRange nullRange;
    cl::NDRange global(queue_num_threads, 1);
    cl::NDRange local(queue_num_threads, 1);

    cl_int status = queue.enqueueNDRangeKernel(big_local_queues_test, nullRange, global, local);
}
void computeInterpolation(cl::CommandQueue &cmd, cl::Program &prog,
                          const int nValues, const int nGrids, const int nPoints,
                          cl::Buffer &distances, cl::Buffer &weightSum,
                          cl::Buffer &knownValues, cl::Buffer &gridValues)
{
    // Kernel signature:
    // void computeInterpolation(const int nValues, const int nGrids,
    //     const int nPoints, const __global float *distances,
    //     const __global float *weightSum, const __global float *knownValues,
    //     __global float *gridValues)
    cl::Kernel kernel(prog, "computeInterpolation_AOS");
    kernel.setArg(0, (cl_int) nValues);
    kernel.setArg(1, (cl_int) nGrids);
    kernel.setArg(2, (cl_int) nPoints);
    kernel.setArg(3, distances);
    kernel.setArg(4, weightSum);
    kernel.setArg(5, knownValues);
    kernel.setArg(6, gridValues);

    // Round each global dimension up to a multiple of the 16x16 work-group size
    size_t lSize = 16;
    cl::NDRange local(lSize, lSize);
    cl::NDRange global(((cl_int) nGrids + lSize - 1) / lSize * lSize,
                       ((cl_int) nValues + lSize - 1) / lSize * lSize);

    cl::Event event;
    cmd.enqueueNDRangeKernel(kernel, cl::NullRange, global, local, 0, &event);
    event.wait();
}
/**
 * generate 64 bit unsigned random numbers in device global memory
 * @param tinymt_status device global memories
 * @param total_num total number of work items
 * @param local_num number of local work items
 * @param data_size number of data to generate
 */
static void generate_uint64(Buffer& tinymt_status, int total_num, int local_num, int data_size)
{
#if defined(DEBUG)
    cout << "generate_uint64 start" << endl;
#endif
    // Round data_size up to a multiple of the number of work items
    int min_size = total_num;
    if (data_size % min_size != 0) {
        data_size = (data_size / min_size + 1) * min_size;
    }

    Kernel uint_kernel(program, "tinymt_uint64_kernel");
    Buffer output_buffer(context, CL_MEM_READ_WRITE, data_size * sizeof(uint64_t));
    uint_kernel.setArg(0, tinymt_status);
    uint_kernel.setArg(1, output_buffer);
    uint_kernel.setArg(2, data_size / total_num); // numbers generated per work item

    NDRange global(total_num);
    NDRange local(local_num);
    Event generate_event;
#if defined(DEBUG)
    cout << "generate_uint64 enqueue kernel start" << endl;
#endif
    queue.enqueueNDRangeKernel(uint_kernel, NullRange, global, local, NULL, &generate_event);

    uint64_t *output = new uint64_t[data_size];
    generate_event.wait();
    queue.enqueueReadBuffer(output_buffer, CL_TRUE, 0, data_size * sizeof(uint64_t), output);
    check_data(output, data_size, total_num);
#if defined(DEBUG)
    print_uint64(output, data_size, total_num);
#endif
    double time = get_time(generate_event);
    cout << "generate time:" << time * 1000 << "ms" << endl;
    delete[] output;
#if defined(DEBUG)
    cout << "generate_uint64 end" << endl;
#endif
}
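get_time() is a helper from the surrounding tinymt sample code; a minimal sketch, assuming a profiling-enabled queue, returning elapsed device-side seconds from the event's counters:

// Sketch of the assumed get_time helper: seconds between the command
// starting and finishing on the device (profiling counters are in ns).
static double get_time(Event& event)
{
    cl_ulong start = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
    cl_ulong end   = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
    return (end - start) * 1.0e-9;
}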
int main(int argc, char *argv[])
{
    cl_int err = CL_SUCCESS;
    cl::Event evt;

    std::vector<cl::Platform> platforms;
    cl::Platform::get(&platforms);
    if (platforms.size() == 0) {
        return 1;
    }
    platform_ = platforms[0];

    cl_context_properties properties[] = {
        CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0};
    context_ = cl::Context(CL_DEVICE_TYPE_GPU, properties, NULL, NULL, &err);
    CHECK_CL_ERROR(err, "cl::Context");

    std::vector<cl::Device> devices = context_.getInfo<CL_CONTEXT_DEVICES>();
    if (devices.size() == 0) {
        return 1;
    }
    device_ = devices[0];

    sources_.push_back(std::make_pair(source_str.c_str(), source_str.size()));
    program_ = cl::Program(context_, sources_);
    err = program_.build(devices);
    if (err != CL_SUCCESS) {
        std::string log = program_.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]);
        std::cout << "program.build() ERROR: " << log.c_str() << std::endl;
        return 1;
    }

    kernel_ = cl::Kernel(program_, "hello", &err);
    CHECK_CL_ERROR(err, "cl::Kernel");
    buf_ = cl::Buffer(context_, CL_MEM_READ_ONLY, 1024, NULL, &err);
    queue_ = cl::CommandQueue(context_, device_, 0, &err);
    CHECK_CL_ERROR(err, "cl::CommandQueue");

    kernel_.setArg(0, buf_);
    err = queue_.enqueueNDRangeKernel(kernel_, cl::NullRange, cl::NDRange(10, 10),
                                      cl::NullRange, NULL, &evt);
    CHECK_CL_ERROR(err, "queue.enqueueNDRangeKernel()");
    evt.wait();
    return 0;
}
/* OpenCL version */
void computeDistances(cl::CommandQueue &cmd, cl::Program &prog,
                      const int DIM, const int nPoints, cl::Buffer &knownCoords,
                      const int nGrids, cl::Buffer &gridCoords, cl::Buffer &distances)
{
    // Kernel signature:
    // void computeDistances(int DIM, int nPoints,
    //     __global float *knownCoords, int nGrids,
    //     __global float *gridCoords, __global float *distances)
    cl::Kernel kernel(prog, "computeDistances_AOS");
    kernel.setArg(0, (cl_int) DIM);
    kernel.setArg(1, (cl_int) nPoints);
    kernel.setArg(2, knownCoords);
    kernel.setArg(3, (cl_int) nGrids);
    kernel.setArg(4, gridCoords);
    kernel.setArg(5, distances);

    // Round each global dimension up to a multiple of the 16x16 work-group size
    size_t lSize = 16;
    cl::NDRange local(lSize, lSize);
    cl::NDRange global(((cl_int) nGrids + lSize - 1) / lSize * lSize,
                       ((cl_int) nPoints + lSize - 1) / lSize * lSize);

    cl::Event event;
    cmd.enqueueNDRangeKernel(kernel, cl::NullRange, global, local, 0, &event);
    event.wait();
}
cl::Buffer performReduction(const cl::Buffer &in, cl::Kernel &ker, cl::CommandQueue &q, int size)
{
    if (size == 1)
        return in;

    // Each pass shrinks the data by a factor of 4 until one element remains
    int newsize = std::max(1, size / 4);
    cl::Buffer tmp(CLContextLoader::getContext(), CL_MEM_READ_WRITE, sizeof(real) * newsize);

    ker.setArg(0, in());
    ker.setArg(1, tmp());
    ker.setArg(2, size);
    ker.setArg(3, 4);

    q.enqueueNDRangeKernel(ker, cl::NDRange(0), cl::NDRange(newsize),
                           getBestWorkspaceDim(cl::NDRange(newsize)));
    return performReduction(tmp, ker, q, newsize);
}
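getBestWorkspaceDim is project-specific and its actual policy isn't shown in this snippet; purely as a hypothetical sketch, one common heuristic is to pick the largest power-of-two work-group size that divides the global size, capped at a device-friendly bound (the name and cap below are assumptions):

// Hypothetical sketch only: pick a local size that evenly divides the
// global size. The project's real heuristic may differ.
static cl::NDRange getBestWorkspaceDimSketch(const cl::NDRange &global)
{
    size_t g = global[0];
    size_t local = 256;                 // assumed device-friendly cap
    while (local > 1 && g % local != 0)
        local /= 2;                     // fall back to a power-of-two divisor
    return cl::NDRange(local);
}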
real L2Norm(const Buffer3D &in, cl::CommandQueue &q)
{
    cl::Buffer ans(CLContextLoader::getContext(), CL_MEM_READ_WRITE,
                   sizeof(real) * in.width() * in.height() * in.depth());

    // Square every element, then sum-reduce and take the square root
    CLContextLoader::getRedL2NormKer().setArg(0, in());
    CLContextLoader::getRedL2NormKer().setArg(1, ans());
    q.enqueueNDRangeKernel(CLContextLoader::getRedL2NormKer(),
                           cl::NDRange(0),
                           cl::NDRange(in.width() * in.height() * in.depth()),
                           getBestWorkspaceDim(cl::NDRange(in.width() * in.height() * in.depth())));

    ans = performReduction(ans, CLContextLoader::getRedSumAllKer(), q,
                           in.width() * in.height() * in.depth());

    real res;
    q.enqueueReadBuffer(ans, true, 0, sizeof(real), &res);
    return sqrt(res);
}
/**
 * initialize tinymt status in device global memory
 * using 1 parameter for 1 generator.
 * @param tinymt_status internal state of kernel side tinymt
 * @param total total number of work items
 * @param local_item number of local work items
 * @param seed seed for initialization
 */
static void initialize_by_seed(Buffer& tinymt_status, int total, int local_item, uint32_t seed)
{
#if defined(DEBUG)
    cout << "initialize_by_seed start" << endl;
#endif
    Kernel init_kernel(program, "tinymt_init_seed_kernel");
    init_kernel.setArg(0, tinymt_status);
    init_kernel.setArg(1, seed);
    NDRange global(total);
    NDRange local(local_item);
    Event event;
#if defined(DEBUG)
    cout << "global:" << dec << total << endl;
    cout << "group:" << dec << (total / local_item) << endl;
    cout << "local:" << dec << local_item << endl;
#endif
    queue.enqueueNDRangeKernel(init_kernel, NullRange, global, local, NULL, &event);
    double time = get_time(event);

    // std::vector instead of a variable-length array, which is not standard C++
    std::vector<tinymt32j_t> status(total);
    queue.enqueueReadBuffer(tinymt_status, CL_TRUE, 0,
                            sizeof(tinymt32j_t) * total, status.data());
    cout << "initializing time = " << time * 1000 << "ms" << endl;
#if defined(DEBUG)
    cout << "status[0].s0:" << hex << status[0].s0 << endl;
    cout << "status[0].s1:" << hex << status[0].s1 << endl;
    cout << "status[0].s2:" << hex << status[0].s2 << endl;
    cout << "status[0].s3:" << hex << status[0].s3 << endl;
#endif
    check_status(status.data(), total);
#if defined(DEBUG)
    cout << "initialize_by_seed end" << endl;
#endif
}
/**
 * initialize tinymt status in device global memory
 * using 1 parameter for all generators.
 * @param tinymt_status device global memories
 * @param total total number of work items
 * @param local_item number of local work items
 * @param seed_array seeds for initialization
 * @param seed_size size of seed_array
 */
static void initialize_by_array(Buffer& tinymt_status, int total, int local_item,
                                uint64_t seed_array[], int seed_size)
{
#if defined(DEBUG)
    cout << "initialize_by_array start" << endl;
#endif
    Buffer seed_array_buffer(context, CL_MEM_READ_WRITE, seed_size * sizeof(uint64_t));
    queue.enqueueWriteBuffer(seed_array_buffer, CL_TRUE, 0,
                             seed_size * sizeof(uint64_t), seed_array);

    Kernel init_kernel(program, "tinymt_init_array_kernel");
    init_kernel.setArg(0, tinymt_status);
    init_kernel.setArg(1, seed_array_buffer);
    init_kernel.setArg(2, seed_size);
    NDRange global(total);
    NDRange local(local_item);
    Event event;
    queue.enqueueNDRangeKernel(init_kernel, NullRange, global, local, NULL, &event);
    double time = get_time(event);

    // std::vector instead of a variable-length array, which is not standard C++
    std::vector<tinymt64j_t> status(total);
    queue.enqueueReadBuffer(tinymt_status, CL_TRUE, 0,
                            sizeof(tinymt64j_t) * total, status.data());
    cout << "initializing time = " << time * 1000 << "ms" << endl;
    check_status(status.data(), total);
#if defined(DEBUG)
    cout << "initialize_by_array end" << endl;
#endif
}
void kernel(cl::Buffer& devOut, cl::CommandQueue& queue)
{
    static std::once_flag compileFlag;
    static cl::Program prog;
    static cl::Kernel kern;

    // Compile the program and create the kernel exactly once, on first call
    std::call_once(compileFlag, [queue]() {
        prog = cl::Program(queue.getInfo<CL_QUEUE_CONTEXT>(), sinf_ocl_kernel, true);
        kern = cl::Kernel(prog, "sinf");
    });

    static const NDRange global(SIZE * 2);
    kern.setArg(0, devOut);
    kern.setArg(1, dx);
    kern.setArg(2, SIZE);
    queue.enqueueNDRangeKernel(kern, cl::NullRange, global);
}
void genOffsets(cl::Context& context, cl::CommandQueue& queue, cl::Program& program,
                cl::Buffer* cm_buffer, cl::Buffer* offsets_buffer)
{
    // Make kernel
    cl::Kernel kernel(program, "offsets");

    // Set arguments to kernel
    kernel.setArg(0, *cm_buffer);
    kernel.setArg(1, *offsets_buffer);

    // Run the kernel on a specific ND range, one work item per bin
    cl::NDRange global(BINS);
    queue.enqueueNDRangeKernel(kernel, cl::NullRange, global, cl::NullRange);
}
void RTBiasPathOCLRenderThread::EnqueueRenderSampleKernel(cl::CommandQueue &oclQueue) {
    RTBiasPathOCLRenderEngine *engine = (RTBiasPathOCLRenderEngine *)renderEngine;

    // Check the maximum number of tasks to execute. I have to
    // consider the preview, normal and long run phases.
    const u_int tileWidth = engine->tileRepository->tileWidth;
    const u_int tileHeight = engine->tileRepository->tileHeight;
    const u_int threadFilmPixelCount = tileWidth * tileHeight;
    u_int taskCount = threadFilmPixelCount /
            (engine->previewResolutionReduction * engine->previewResolutionReduction);
    taskCount = Max(taskCount, threadFilmPixelCount /
            (engine->resolutionReduction * engine->resolutionReduction));
    if (engine->longRunResolutionReductionStep > 0)
        taskCount = Max(taskCount, threadFilmPixelCount /
                (engine->longRunResolutionReduction * engine->longRunResolutionReduction));

    // Micro kernels version
    oclQueue.enqueueNDRangeKernel(*renderSampleKernel_MK_GENERATE_CAMERA_RAY, cl::NullRange,
            cl::NDRange(RoundUp<u_int>(taskCount, renderSampleWorkGroupSize)),
            cl::NDRange(renderSampleWorkGroupSize));
    oclQueue.enqueueNDRangeKernel(*renderSampleKernel_MK_TRACE_EYE_RAY, cl::NullRange,
            cl::NDRange(RoundUp<u_int>(taskCount, renderSampleWorkGroupSize)),
            cl::NDRange(renderSampleWorkGroupSize));
    oclQueue.enqueueNDRangeKernel(*renderSampleKernel_MK_ILLUMINATE_EYE_MISS, cl::NullRange,
            cl::NDRange(RoundUp<u_int>(taskCount, renderSampleWorkGroupSize)),
            cl::NDRange(renderSampleWorkGroupSize));
    oclQueue.enqueueNDRangeKernel(*renderSampleKernel_MK_ILLUMINATE_EYE_HIT, cl::NullRange,
            cl::NDRange(RoundUp<u_int>(taskCount, renderSampleWorkGroupSize)),
            cl::NDRange(renderSampleWorkGroupSize));
    oclQueue.enqueueNDRangeKernel(*renderSampleKernel_MK_DL_VERTEX_1, cl::NullRange,
            cl::NDRange(RoundUp<u_int>(taskCount, renderSampleWorkGroupSize)),
            cl::NDRange(renderSampleWorkGroupSize));
    oclQueue.enqueueNDRangeKernel(*renderSampleKernel_MK_BSDF_SAMPLE_DIFFUSE, cl::NullRange,
            cl::NDRange(RoundUp<u_int>(taskCount, renderSampleWorkGroupSize)),
            cl::NDRange(renderSampleWorkGroupSize));
    oclQueue.enqueueNDRangeKernel(*renderSampleKernel_MK_BSDF_SAMPLE_GLOSSY, cl::NullRange,
            cl::NDRange(RoundUp<u_int>(taskCount, renderSampleWorkGroupSize)),
            cl::NDRange(renderSampleWorkGroupSize));
    oclQueue.enqueueNDRangeKernel(*renderSampleKernel_MK_BSDF_SAMPLE_SPECULAR, cl::NullRange,
            cl::NDRange(RoundUp<u_int>(taskCount, renderSampleWorkGroupSize)),
            cl::NDRange(renderSampleWorkGroupSize));
}