Exemplo n.º 1
0
void simulationStep() {
    try {
        // copy
        auto buffer = cl::Buffer(context, CL_MEM_READ_ONLY,
                                 sizeof(unsigned char) * 4 * fieldWidth * fieldHeight,
                                 nullptr, nullptr);
        queue.enqueueWriteBuffer(buffer, CL_TRUE, 0,
                                 sizeof(unsigned char) * 4 * fieldWidth * fieldHeight,
                                 visualizationBufferCPU, NULL, NULL);

        // enque
        stepKernel.setArg(2, buffer);
        cl::NDRange global((size_t) (fieldWidth * fieldHeight));
        queue.enqueueNDRangeKernel(stepKernel, cl::NullRange, global, cl::NullRange);

        // read back
        queue.enqueueReadBuffer(visualizationBufferGPU, CL_TRUE, 0,
                                sizeof(unsigned char) * 4 * fieldWidth * fieldHeight,
                                visualizationBufferCPU, NULL, NULL);

        // finish
        queue.finish();
    } catch (cl::Error err) {
        std::cout << "Error: " << err.what() << "(" << err.err() << ")" << std::endl;
        exit(3);
    }

    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, fieldWidth, fieldHeight, 0, GL_RGBA, GL_UNSIGNED_BYTE,
                 visualizationBufferCPU);
}
Exemplo n.º 2
0
void PathOCLRenderThread::EnqueueAdvancePathsKernel(cl::CommandQueue &oclQueue) {
	PathOCLRenderEngine *engine = (PathOCLRenderEngine *)renderEngine;
	const u_int taskCount = engine->taskCount;

	// Micro kernels version
	oclQueue.enqueueNDRangeKernel(*advancePathsKernel_MK_RT_NEXT_VERTEX, cl::NullRange,
			cl::NDRange(taskCount), cl::NDRange(advancePathsWorkGroupSize));
	oclQueue.enqueueNDRangeKernel(*advancePathsKernel_MK_HIT_NOTHING, cl::NullRange,
			cl::NDRange(taskCount), cl::NDRange(advancePathsWorkGroupSize));
	oclQueue.enqueueNDRangeKernel(*advancePathsKernel_MK_HIT_OBJECT, cl::NullRange,
			cl::NDRange(taskCount), cl::NDRange(advancePathsWorkGroupSize));
	oclQueue.enqueueNDRangeKernel(*advancePathsKernel_MK_RT_DL, cl::NullRange,
			cl::NDRange(taskCount), cl::NDRange(advancePathsWorkGroupSize));
	oclQueue.enqueueNDRangeKernel(*advancePathsKernel_MK_DL_ILLUMINATE, cl::NullRange,
			cl::NDRange(taskCount), cl::NDRange(advancePathsWorkGroupSize));
	oclQueue.enqueueNDRangeKernel(*advancePathsKernel_MK_DL_SAMPLE_BSDF, cl::NullRange,
			cl::NDRange(taskCount), cl::NDRange(advancePathsWorkGroupSize));
	oclQueue.enqueueNDRangeKernel(*advancePathsKernel_MK_GENERATE_NEXT_VERTEX_RAY, cl::NullRange,
			cl::NDRange(taskCount), cl::NDRange(advancePathsWorkGroupSize));
	oclQueue.enqueueNDRangeKernel(*advancePathsKernel_MK_SPLAT_SAMPLE, cl::NullRange,
			cl::NDRange(taskCount), cl::NDRange(advancePathsWorkGroupSize));
	oclQueue.enqueueNDRangeKernel(*advancePathsKernel_MK_NEXT_SAMPLE, cl::NullRange,
			cl::NDRange(taskCount), cl::NDRange(advancePathsWorkGroupSize));
	oclQueue.enqueueNDRangeKernel(*advancePathsKernel_MK_GENERATE_CAMERA_RAY, cl::NullRange,
			cl::NDRange(taskCount), cl::NDRange(advancePathsWorkGroupSize));
}
Exemplo n.º 3
0
    void updateParticles(float timeDelta)
    {
        try
        {
            vector<cl::Memory> glBuffers;
            glBuffers.push_back(m_positions);
            glBuffers.push_back(m_colors);
            
            //this will update our system by calculating new velocity and updating the positions of our particles
            //Make sure OpenGL is done using our VBOs
            glFinish();
            
            // map OpenGL buffer object for writing from OpenCL
            // this passes in the vector of VBO buffer objects (position and color)
            m_queue.enqueueAcquireGLObjects(&glBuffers);
            
            m_particleKernel.setArg(5, timeDelta); //pass in the timestep
            
            //execute the kernel
            m_queue.enqueueNDRangeKernel(m_particleKernel, cl::NullRange, cl::NDRange(m_numParticles),
                                         cl::NullRange);
            //Release the VBOs so OpenGL can play with them
            m_queue.enqueueReleaseGLObjects(&glBuffers, NULL);

            m_queue.finish();
        }
        catch(cl::Error &error)
        {
            LOG_ERROR << error.what() << "(" << oclErrorString(error.err()) << ")";
        }
    }
Exemplo n.º 4
0
void procOCL_OCV(int tex, int w, int h)
{
    int64_t t = getTimeMs();
    cl::ImageGL imgIn (theContext, CL_MEM_READ_ONLY,  GL_TEXTURE_2D, 0, tex);
    std::vector < cl::Memory > images(1, imgIn);
    theQueue.enqueueAcquireGLObjects(&images);
    theQueue.finish();
    cv::UMat uIn, uOut, uTmp;
    cv::ocl::convertFromImage(imgIn(), uIn);
    LOGD("loading texture data to OpenCV UMat costs %d ms", getTimeInterval(t));
    theQueue.enqueueReleaseGLObjects(&images);

    t = getTimeMs();
    //cv::blur(uIn, uOut, cv::Size(5, 5));
    cv::Laplacian(uIn, uTmp, CV_8U);
    cv:multiply(uTmp, 10, uOut);
    cv::ocl::finish();
    LOGD("OpenCV processing costs %d ms", getTimeInterval(t));

    t = getTimeMs();
    cl::ImageGL imgOut(theContext, CL_MEM_WRITE_ONLY, GL_TEXTURE_2D, 0, tex);
    images.clear();
    images.push_back(imgOut);
    theQueue.enqueueAcquireGLObjects(&images);
    cl_mem clBuffer = (cl_mem)uOut.handle(cv::ACCESS_READ);
    cl_command_queue q = (cl_command_queue)cv::ocl::Queue::getDefault().ptr();
    size_t offset = 0;
    size_t origin[3] = { 0, 0, 0 };
    size_t region[3] = { w, h, 1 };
    CV_Assert(clEnqueueCopyBufferToImage (q, clBuffer, imgOut(), offset, origin, region, 0, NULL, NULL) == CL_SUCCESS);
    theQueue.enqueueReleaseGLObjects(&images);
    cv::ocl::finish();
    LOGD("uploading results to texture costs %d ms", getTimeInterval(t));
}
Exemplo n.º 5
0
void MetaBallsApp::update()
{
    std::vector<cl::Memory> acquire( { mClParticleBuf, mClMarchingRenderBuffer, mClMarchingDebugBuffer } );
    mClCommandQueue.enqueueAcquireGLObjects( &acquire );
    updateParticles();
    updateMarching();
    mClCommandQueue.enqueueReleaseGLObjects( &acquire );
}
Exemplo n.º 6
0
void PTWeekend::draw()
{
    /*
     * BEGIN - each frame part
     */
    
    /* Enqueue kernel for execution */
    
    glm::vec3 origin,lower_left, hor, ver;
    
    float theta = camera.getFov() * M_PI / 180.0f;
    float half_height = tan(theta / 2.0f);
    float half_width = camera.getAspectRatio() * half_height;
    
    origin = camera.getEyePoint();
    glm::vec3 u, v, w;
    
    w = -glm::normalize(camera.getViewDirection()); //odd...
    u = glm::normalize(glm::cross(glm::vec3(0,1,0), w));
    v = glm::cross(w, u);
    
    lower_left = origin - half_width * u - half_height * v - w;
    hor = 2.0f * half_width * u;
    ver = 2.0f * half_height * v;
    
    pt_assert(cl_set_pinhole_cam_arg(origin, lower_left, hor, ver, cam_buffer, cmd_queue), "Could not fill camera buffer");
    
    clStatus = cmd_queue.enqueueAcquireGLObjects(&img_buffer, NULL, NULL);
    pt_assert(clStatus, "Could not acquire gl objects");
    
    cl::Event profiling_evt;
    
    
    
    clStatus = cmd_queue.enqueueNDRangeKernel(kernel,
                                              cl::NDRange(0,0),
                                              cl::NDRange(img_width, img_height),
                                              cl::NDRange(local_width,local_height),
                                              NULL,
                                              &profiling_evt);
    profiling_evt.wait();
    
    pt_assert(clStatus, "Could not enqueue the kernel");
    clStatus = cmd_queue.enqueueReleaseGLObjects(&img_buffer, NULL, NULL);
    pt_assert(clStatus, "Could not release gl objects");
    cmd_queue.finish();
    
    cl_ulong time_start = profiling_evt.getProfilingInfo<CL_PROFILING_COMMAND_START>();
    cl_ulong time_end = profiling_evt.getProfilingInfo<CL_PROFILING_COMMAND_END>();
    cl_ulong total_time = time_end - time_start;
    std::cout << "Total time: " << total_time * 0.001 * 0.001 << " ms \n";
    
    /*
     * END - each frame part
     */
    
    gl::draw(imgTex, Rectf(0, 0, getWindowWidth(), getWindowHeight()));
}
Exemplo n.º 7
0
int clPeak::runKernelLatency(cl::CommandQueue &queue, cl::Program &prog, device_info_t &devInfo)
{
    if(!isKernelLatency)
        return 0;

    cl::Context ctx = queue.getInfo<CL_QUEUE_CONTEXT>();
    cl_uint numItems = (devInfo.maxWGSize) * (devInfo.numCUs) * FETCH_PER_WI;
    cl::NDRange globalSize = (numItems / FETCH_PER_WI);
    cl::NDRange localSize = devInfo.maxWGSize;
    int iters = devInfo.kernelLatencyIters;
    float latency;

    try
    {
        log->print(NEWLINE TAB TAB "Kernel launch latency : ");
        log->xmlOpenTag("kernel_launch_latency");
        log->xmlAppendAttribs("unit", "us");

        cl::Buffer inputBuf = cl::Buffer(ctx, CL_MEM_READ_ONLY, (numItems * sizeof(float)));
        cl::Buffer outputBuf = cl::Buffer(ctx, CL_MEM_WRITE_ONLY, (numItems * sizeof(float)));

        cl::Kernel kernel_v1(prog, "global_bandwidth_v1_local_offset");
        kernel_v1.setArg(0, inputBuf), kernel_v1.setArg(1, outputBuf);

        // Dummy calls
        queue.enqueueNDRangeKernel(kernel_v1, cl::NullRange, globalSize, localSize);
        queue.enqueueNDRangeKernel(kernel_v1, cl::NullRange, globalSize, localSize);
        queue.finish();

        latency = 0;
        for(int i=0; i<iters; i++)
        {
            cl::Event timeEvent;
            queue.enqueueNDRangeKernel(kernel_v1, cl::NullRange, globalSize, localSize, NULL, &timeEvent);
            queue.finish();
            cl_ulong start = timeEvent.getProfilingInfo<CL_PROFILING_COMMAND_QUEUED>() / 1000;
            cl_ulong end = timeEvent.getProfilingInfo<CL_PROFILING_COMMAND_START>() / 1000;
            latency += (float)((int)end - (int)start);
        }
        latency /= iters;

        log->print(latency);    log->print(" us" NEWLINE);
        log->xmlSetContent(latency);
        log->xmlCloseTag();
    }
    catch(cl::Error error)
    {
        log->print(error.err() + NEWLINE);
        log->print(TAB TAB "Tests skipped" NEWLINE);
        return -1;
    }

    return 0;
}
cl::Event RuntimeMeasurementsManager::enqueueNewMarker(cl::CommandQueue queue) {
    cl::Event event;
#if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
    // Use deprecated API
    queue.enqueueMarker(&event);
#else
    queue.enqueueMarkerWithWaitList(NULL, &event)
#endif
    queue.finish();

    return event;
}
Exemplo n.º 9
0
 void initParticles(uint32_t num_particles)
 {
     m_geom = gl::Geometry::create();
     m_geom->setPrimitiveType(GL_POINTS);
     m_mesh = gl::Mesh::create(m_geom, m_pointMaterial);
     
     m_numParticles = num_particles;
     GLsizei numBytes = m_numParticles * sizeof(vec4);
     
     m_geom->vertices().resize(m_numParticles, vec3(0));
     m_geom->colors().resize(m_numParticles, vec4(1));
     m_geom->point_sizes().resize(m_numParticles, 9.f);
     m_geom->createGLBuffers();
     m_mesh->material()->setPointSize(2.f);
     scene().addObject(m_mesh);
     try
     {
         // shared position buffer for OpenGL / OpenCL
         m_positions = cl::BufferGL(m_context, CL_MEM_READ_WRITE, m_geom->vertexBuffer().id());
         m_colors = cl::BufferGL(m_context, CL_MEM_READ_WRITE, m_geom->colorBuffer().id());
         
         //create the OpenCL only arrays
         m_velocities = cl::Buffer( m_context, CL_MEM_WRITE_ONLY, numBytes );
         m_positionGen = cl::Buffer( m_context, CL_MEM_WRITE_ONLY, numBytes );
         m_velocityGen = cl::Buffer( m_context, CL_MEM_WRITE_ONLY, numBytes );
         
         vector<vec4> posGen, velGen;
         for (int i = 0; i < m_numParticles; i++)
         {
             posGen.push_back( vec4(glm::ballRand(20.0f), 1.f) );
             vec2 tmp = glm::linearRand(vec2(-100), vec2(100));
             float life = kinski::random(2.f, 5.f);
             float yVel = kinski::random<float>(5, 15);
             velGen.push_back(vec4(tmp.x, yVel, tmp.y, life));
             m_geom->point_sizes()[i] = kinski::random(5.f, 15.f);
         }
         m_geom->createGLBuffers();
         
         m_queue.enqueueWriteBuffer(m_velocities, CL_TRUE, 0, numBytes, &velGen[0]);
         m_queue.enqueueWriteBuffer(m_positionGen, CL_TRUE, 0, numBytes, &posGen[0]);
         m_queue.enqueueWriteBuffer(m_velocityGen, CL_TRUE, 0, numBytes, &velGen[0]);
         
         m_particleKernel.setArg(0, m_positions);
         m_particleKernel.setArg(1, m_colors);
         m_particleKernel.setArg(2, m_velocities);
         m_particleKernel.setArg(3, m_positionGen);
         m_particleKernel.setArg(4, m_velocityGen);
     }
     catch(cl::Error &error)
     {
         LOG_ERROR << error.what() << "(" << oclErrorString(error.err()) << ")";
     }
 }
Exemplo n.º 10
0
void helper(uint32_t* out, int osize, uint8_t* in, int isize, int w, int h, int choice)
{
	int set_size=8;
    try {
        cl::Buffer bufferIn = cl::Buffer(gContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                isize*sizeof(cl_uchar), in, NULL);
        cl::Buffer bufferOut = cl::Buffer(gContext, CL_MEM_READ_WRITE, osize*sizeof(cl_uchar4));
        cl::Buffer bufferOut2= cl::Buffer(gContext, CL_MEM_READ_WRITE, osize*sizeof(cl_uchar4));
        gNV21Kernel.setArg(2,w);
        gNV21Kernel.setArg(3,h);
        gNV21Kernel.setArg(1,bufferIn);
        gNV21Kernel.setArg(0,bufferOut);
        gQueue.enqueueNDRangeKernel(gNV21Kernel,
                cl::NullRange,
                cl::NDRange( (int)ceil((float)w/16.0f)*16,(int)ceil((float)h/16.0f)*16),
                cl::NDRange(set_size,set_size),
                NULL,
                NULL);
        if (choice==1) {
            gLaplacianK.setArg(2,w);
            gLaplacianK.setArg(3,h);
            gLaplacianK.setArg(1,bufferOut);
            gLaplacianK.setArg(0,bufferOut2);
            gQueue.enqueueNDRangeKernel(gLaplacianK,
                    cl::NullRange,
                    cl::NDRange( (int)ceil((float)w/16.0f)*16,(int)ceil((float)h/16.0f)*16),
                    cl::NDRange(set_size,set_size),
                    NULL,
                    NULL);
        }
        else if (choice>1) {
        	gNegative.setArg(2,w);
        	gNegative.setArg(3,h);
        	gNegative.setArg(1,bufferOut);
        	gNegative.setArg(0,bufferOut2);
        	gQueue.enqueueNDRangeKernel(gNegative,
        	                    cl::NullRange,
        	                    cl::NDRange( (int)ceil((float)w/16.0f)*16,(int)ceil((float)h/16.0f)*16),
        	                    cl::NDRange(set_size,set_size),
        	                    NULL,
        	                    NULL);

        }

        gQueue.enqueueReadBuffer(bufferOut2, CL_TRUE, 0, osize*sizeof(cl_uchar4), out);
    }
    catch (cl::Error e) {
        LOGI("@oclDecoder: %s %d \n",e.what(),e.err());
    }
}
Exemplo n.º 11
0
/**
 * generate 64 bit unsigned random numbers in device global memory
 *@param tinymt_status device global memories
 *@param total_num total number of work items
 *@param local_num number of local work items
 *@param data_size number of data to generate
 */
static void generate_uint64(Buffer& tinymt_status,
                            int total_num,
                            int local_num,
                            int data_size)
{
#if defined(DEBUG)
    cout << "generate_uint64 start" << endl;
#endif
    int min_size = total_num;
    if (data_size % min_size != 0) {
        data_size = (data_size / min_size + 1) * min_size;
    }
    Kernel uint_kernel(program, "tinymt_uint64_kernel");
    Buffer output_buffer(context,
                         CL_MEM_READ_WRITE,
                         data_size * sizeof(uint64_t));
    uint_kernel.setArg(0, tinymt_status);
    uint_kernel.setArg(1, output_buffer);
    uint_kernel.setArg(2, data_size / total_num);
    NDRange global(total_num);
    NDRange local(local_num);
    Event generate_event;
#if defined(DEBUG)
    cout << "generate_uint64 enque kernel start" << endl;
#endif
    queue.enqueueNDRangeKernel(uint_kernel,
                               NullRange,
                               global,
                               local,
                               NULL,
                               &generate_event);
    uint64_t * output = new uint64_t[data_size];
    generate_event.wait();
    queue.enqueueReadBuffer(output_buffer,
                            CL_TRUE,
                            0,
                            data_size * sizeof(uint64_t),
                            output);
    check_data(output, data_size, total_num);
#if defined(DEBUG)
    print_uint64(output, data_size, total_num);
#endif
    double time = get_time(generate_event);
    cout << "generate time:" << time * 1000 << "ms" << endl;
    delete[] output;
#if defined(DEBUG)
    cout << "generate_uint64 end" << endl;
#endif
}
Exemplo n.º 12
0
	void copyFromDevice(cl::CommandQueue &queue)
	{
		if(m_pElts==NULL)
			throw cl::Error(CL_INVALID_MEM_OBJECT, "copyFromDevice - Buffer is not initialised.");
		
		queue.enqueueReadBuffer(m_buffer, CL_TRUE, 0, m_cb, m_pElts);
	}
Exemplo n.º 13
0
void kernel(cl::Buffer& devOut, cl::CommandQueue& queue)
{
    static std::once_flag   compileFlag;
    static cl::Program      prog;
    static cl::Kernel       kern;

    std::call_once(compileFlag,
        [queue]() {
        prog = cl::Program(queue.getInfo<CL_QUEUE_CONTEXT>(), fractal_ocl_kernel, true);
            kern = cl::Kernel(prog, "julia");
        });

    //auto juliaOp = cl::make_kernel<Buffer, unsigned, unsigned>(kern);

    static const NDRange local(8, 8);
    NDRange global(local[0] * divup(DIMX, local[0]),
                   local[1] * divup(DIMY, local[1]));

    kern.setArg(0, devOut);
    kern.setArg(1, DIMX);
    kern.setArg(2, DIMY);
    queue.enqueueNDRangeKernel(kern, cl::NullRange, global, local);

    //juliaOp(EnqueueArgs(queue, global, local), devOut, DIMX, DIMY);
}
Exemplo n.º 14
0
bool runTestType(cl::Context context, cl::CommandQueue queue)
{
  cl_uint size = 1024 * 2 + 15;
  
  std::vector<T> input(size);
  
  std::cout << "##Testing scan for " << input.size() << " elements and type " 
	    << magnet::CL::detail::traits<T>::kernel_type();
  
  for(size_t i = 0; i < input.size(); ++i)
    input[i] = i+1;
  
  // create input buffer using pinned memory
  cl::Buffer bufferIn(context, CL_MEM_ALLOC_HOST_PTR |
		      CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, 
		      sizeof(T) * input.size(), &input[0])
    ;
  
  magnet::CL::scan<T> scanFunctor;
  scanFunctor.build(queue, context);
  
  scanFunctor(bufferIn, bufferIn);
  
  std::vector<T> output(size);
  
  queue.enqueueReadBuffer(bufferIn, CL_TRUE, 0, input.size() *
			  sizeof(T), &output[0]);
  bool failed = !testOutput(input, output);

  std::cout << (failed ? " FAILED" : " PASSED") << std::endl; 
  return failed;
}
Exemplo n.º 15
0
cl::Event runKernel(const cl::CommandQueue& queue, const cl::Kernel& kernel, const cl::NDRange& globalSize, const cl::NDRange& groupSize, std::vector<cl::Event>& events)
{
	cl::Event event;
	queue.enqueueNDRangeKernel(kernel, cl::NullRange, globalSize, groupSize, &events, &event);
	events.push_back(event);
	return event;
}
Exemplo n.º 16
0
void runTestType(cl::Context context, cl::CommandQueue queue)
{
  cl_uint size = 2 << 10;

  std::vector<T> input(size);

  std::cout << "##Testing bitonic sort for " << input.size() << " elements and type " 
	    << magnet::CL::detail::traits<T>::kernel_type()
	    << std::endl;
  
  for(size_t i = 0; i < input.size(); ++i)
    input[i] = input.size() - i - 1;
  
  // create input buffer using pinned memory
  cl::Buffer bufferIn(context, CL_MEM_ALLOC_HOST_PTR |
		      CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, 
		      sizeof(T) * input.size(), &input[0])
    ;
  
  magnet::CL::bitonicSort<T> bitonicSortFunctor;
  bitonicSortFunctor.build(queue, context);
  bitonicSortFunctor(bufferIn);

  std::vector<T> output(size);
 
  queue.enqueueReadBuffer(bufferIn, CL_TRUE, 0, input.size() *
			  sizeof(T), &output[0]);

  if (!testOutput(input, output))
    M_throw() << "Incorrect output for size " 
	      << input.size()
	      << " and type "
	      << magnet::CL::detail::traits<T>::kernel_type();
}
inline void OpenCL::addkernelarg(std::size_t i, std::vector<T> const & arg, cl::Kernel & kernel,cl::CommandQueue &quene) const
{
    cl::Buffer buffer(this->context,CL_MEM_READ_WRITE,arg.size()*sizeof(T));
//    std::cout << "enqeue\n";
    quene.enqueueWriteBuffer(buffer,CL_FALSE,0,sizeof(T)*arg.size(),&(arg[0]));
    kernel.setArg(i,buffer);

}
inline void OpenCL::addkernelarg(std::size_t i, T const (& arg)[N], cl::Kernel & kernel,cl::CommandQueue &quene) const
{
    cl::Buffer buffer(this->context,CL_MEM_READ_WRITE,N*sizeof(T));
//    std::cout << "enqeue\n";
    quene.enqueueWriteBuffer(buffer,CL_FALSE,0,sizeof(T)*N,&arg);
    kernel.setArg(i,buffer);

}
Exemplo n.º 19
0
    CL::Event OGLSharedFramebuffer::release(CL::CommandQueue& queue, const CL::Event& evt)
    {
        if (_shared) {
            CL::Event e = queue.enq_GL_release(_cl_buffer->get(),
                                               "release framebuffer", evt);
            return e;
        } else {

            assert(_local);
            CL::Event e = queue.enq_read_buffer(*_cl_buffer, _local, _tex_buffer.get_size(),
                                                "read framebuffer", evt);
            queue.wait_for_events(e);

            _tex_buffer.load(_local);
            return CL::Event();
        }
    }
Exemplo n.º 20
0
void procOCL_I2I(int texIn, int texOut, int w, int h)
{
    if(!haveOpenCL) return;

    LOGD("procOCL_I2I(%d, %d, %d, %d)", texIn, texOut, w, h);
    cl::ImageGL imgIn (theContext, CL_MEM_READ_ONLY,  GL_TEXTURE_2D, 0, texIn);
    cl::ImageGL imgOut(theContext, CL_MEM_WRITE_ONLY, GL_TEXTURE_2D, 0, texOut);
    std::vector < cl::Memory > images;
    images.push_back(imgIn);
    images.push_back(imgOut);

    int64_t t = getTimeMs();
    theQueue.enqueueAcquireGLObjects(&images);
    theQueue.finish();
    LOGD("enqueueAcquireGLObjects() costs %d ms", getTimeInterval(t));

    t = getTimeMs();
    cl::Kernel Laplacian(theProgI2I, "Laplacian"); //TODO: may be done once
    Laplacian.setArg(0, imgIn);
    Laplacian.setArg(1, imgOut);
    theQueue.finish();
    LOGD("Kernel() costs %d ms", getTimeInterval(t));

    t = getTimeMs();
    theQueue.enqueueNDRangeKernel(Laplacian, cl::NullRange, cl::NDRange(w, h), cl::NullRange);
    theQueue.finish();
    LOGD("enqueueNDRangeKernel() costs %d ms", getTimeInterval(t));

    t = getTimeMs();
    theQueue.enqueueReleaseGLObjects(&images);
    theQueue.finish();
    LOGD("enqueueReleaseGLObjects() costs %d ms", getTimeInterval(t));
}
Exemplo n.º 21
0
 void read(const cl::CommandQueue &q, size_t offset, size_t size, T *host,
         bool blocking = false) const
 {
     if (size)
         q.enqueueReadBuffer(
                 buffer, blocking ? CL_TRUE : CL_FALSE,
                 sizeof(T) * offset, sizeof(T) * size, host
                 );
 }
Exemplo n.º 22
0
real L2Norm(const Buffer3D & in,cl::CommandQueue & q)
{
	cl::Buffer ans (CLContextLoader::getContext(),CL_MEM_READ_WRITE,sizeof(real)*in.width()*in.height()*in.depth());

	CLContextLoader::getRedL2NormKer().setArg(0,in());
	CLContextLoader::getRedL2NormKer().setArg(1,ans());

	q.enqueueNDRangeKernel(CLContextLoader::getRedL2NormKer(),
					cl::NDRange(0),
					cl::NDRange(in.width()*in.height()*in.depth()),
					getBestWorkspaceDim(cl::NDRange(in.width()*in.height()*in.depth())));

	ans = performReduction(ans,CLContextLoader::getRedSumAllKer(),q,in.width()*in.height()*in.depth());

	real res;
	q.enqueueReadBuffer(ans,true,0,sizeof(real),&res);
	return sqrt(res);
}
Exemplo n.º 23
0
	cl::Event copyToDeviceAsync(cl::CommandQueue &queue)
	{
		if(m_pElts==NULL)
			throw cl::Error(CL_INVALID_MEM_OBJECT, "copyToDevice - Buffer is not initialised.");
		
		cl::Event complete;
		queue.enqueueWriteBuffer(m_buffer, CL_FALSE, 0, m_cb, m_pElts, NULL, &complete);
		return complete;
	}
Exemplo n.º 24
0
 CL::Event OGLSharedFramebuffer::acquire(CL::CommandQueue& queue, const CL::Event& e)
 {
     if (_shared) {
         return queue.enq_GL_acquire(_cl_buffer->get(),
                                     "acquire framebuffer", e);
     } else {
         return e;
     }
 }
Exemplo n.º 25
0
/**
 * initialize tinymt status in device global memory
 * using 1 parameter for 1 generator.
 *@param tinymt_status internal state of kernel side tinymt
 *@param total total number of work items
 *@param local_item number of local work items
 *@param seed seed for initialization
 */
static void initialize_by_seed(Buffer& tinymt_status,
                               int total,
                               int local_item,
                               uint32_t seed)
{
#if defined(DEBUG)
    cout << "initialize_by_seed start" << endl;
#endif
    Kernel init_kernel(program, "tinymt_init_seed_kernel");
    init_kernel.setArg(0, tinymt_status);
    init_kernel.setArg(1, seed);
    NDRange global(total);
    NDRange local(local_item);
    Event event;
#if defined(DEBUG)
    cout << "global:" << dec << total << endl;
    cout << "group:" << dec << (total / local_item) << endl;
    cout << "local:" << dec << local_item << endl;
#endif
    queue.enqueueNDRangeKernel(init_kernel,
                               NullRange,
                               global,
                               local,
                               NULL,
                               &event);
    double time = get_time(event);
    tinymt32j_t status[total];
    queue.enqueueReadBuffer(tinymt_status,
                            CL_TRUE,
                            0,
                            sizeof(tinymt32j_t) * total,
                            status);
    cout << "initializing time = " << time * 1000 << "ms" << endl;
#if defined(DEBUG)
    cout << "status[0].s0:" << hex << status[0].s0 << endl;
    cout << "status[0].s1:" << hex << status[0].s1 << endl;
    cout << "status[0].s2:" << hex << status[0].s2 << endl;
    cout << "status[0].s3:" << hex << status[0].s3 << endl;
#endif
    check_status(status, total);
#if defined(DEBUG)
    cout << "initialize_by_seed end" << endl;
#endif
}
void sumTest(cl::Buffer queue_data, cl::Buffer queue_metadata,
             cl::Buffer& device_result, int iterations,
             ProgramCache& cache,
             cl::CommandQueue& queue)
{
    cl::Context context = queue.getInfo<CL_QUEUE_CONTEXT>();

    std::vector<std::string> sources;
    sources.push_back("ParallelQueue");
    sources.push_back("ParallelQueueTests");

    cl::Program& program = cache.getProgram(sources);

    cl::Kernel sum_test_kernel(program, "sum_test");

    cl::Device device = queue.getInfo<CL_QUEUE_DEVICE>();

    int warp_size = sum_test_kernel
        .getWorkGroupInfo<CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE>(device);

    std::cout << "warp size: " << warp_size << std::endl;

    int max_group_size = device.getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0];
    int queue_num_threads = 512;

    if(queue_num_threads > max_group_size)
        queue_num_threads = max_group_size;

    cl::LocalSpaceArg local_queue
            = cl::__local(sizeof(int) * queue_num_threads * 2);
    cl::LocalSpaceArg reduction_buffer
            = cl::__local(sizeof(int) * queue_num_threads);
    cl::LocalSpaceArg got_work
            = cl::__local(sizeof(int));
    cl::LocalSpaceArg prefix_sum_input
            = cl::__local(sizeof(int) * queue_num_threads);
    cl::LocalSpaceArg prefix_sum_output
            = cl::__local(sizeof(int) * queue_num_threads);

    sum_test_kernel.setArg(0, queue_data);
    sum_test_kernel.setArg(1, queue_metadata);
    sum_test_kernel.setArg(2, device_result);
    sum_test_kernel.setArg(3, iterations);
    sum_test_kernel.setArg(4, local_queue);
    sum_test_kernel.setArg(5, reduction_buffer);
    sum_test_kernel.setArg(6, got_work);
    sum_test_kernel.setArg(7, prefix_sum_input);
    sum_test_kernel.setArg(8, prefix_sum_output);

    cl::NDRange nullRange;
    cl::NDRange global(queue_num_threads, 1);
    cl::NDRange local(queue_num_threads, 1);

    cl_int status = queue.enqueueNDRangeKernel(sum_test_kernel,
                                               nullRange, global, local);
}
Exemplo n.º 27
0
 CL::Event Framebuffer::clear(CL::CommandQueue& queue, const CL::Event& e)
 {
     _clear_kernel.set_arg(0, _cl_buffer->get());
     vec4 color = config.clear_color();
     _clear_kernel.set_arg(1, vec4(powf(color.x, 2.2), 
                                   powf(color.y, 2.2),
                                   powf(color.z, 2.2), 1000));
     return queue.enq_kernel(_clear_kernel, _size.x * _size.y, 256,
                             "clear framebuffer", e);
 }
Exemplo n.º 28
0
/**
 * initialize tinymt status in device global memory
 * using 1 parameter for all generators.
 *@param tinymt_status device global memories
 *@param total total number of work items
 *@param local_item number of local work items
 *@param seed_array seeds for initialization
 *@param seed_size size of seed_array
 */
static void initialize_by_array(Buffer& tinymt_status,
                                int total,
                                int local_item,
                                uint64_t seed_array[],
                                int seed_size)
{
#if defined(DEBUG)
    cout << "initialize_by_array start" << endl;
#endif
    Buffer seed_array_buffer(context,
                             CL_MEM_READ_WRITE,
                             seed_size * sizeof(uint64_t));
    queue.enqueueWriteBuffer(seed_array_buffer,
                             CL_TRUE,
                             0,
                             seed_size * sizeof(uint64_t),
                             seed_array);
    Kernel init_kernel(program, "tinymt_init_array_kernel");
    init_kernel.setArg(0, tinymt_status);
    init_kernel.setArg(1, seed_array_buffer);
    init_kernel.setArg(2, seed_size);
    NDRange global(total);
    NDRange local(local_item);
    Event event;
    queue.enqueueNDRangeKernel(init_kernel,
                               NullRange,
                               global,
                               local,
                               NULL,
                               &event);
    double time = get_time(event);
    tinymt64j_t status[total];
    queue.enqueueReadBuffer(tinymt_status,
                            CL_TRUE,
                            0,
                            sizeof(tinymt64j_t) * total,
                            status);
    cout << "initializing time = " << time * 1000 << "ms" << endl;
    check_status(status, total);
#if defined(DEBUG)
    cout << "initialize_by_array end" << endl;
#endif
}
Exemplo n.º 29
0
    void findMinSeamVert(cl::Context &ctx,
                         cl::CommandQueue &cmdQueue,
                         cl::Event &event,
                         std::vector<cl::Event> &deps,
                         cl::Buffer &energyMatrix,
                         cl::Buffer &vertMinEnergy,
                         cl::Buffer &vertMinIdx,
                         int width,
                         int height,
                         int pitch,
                         int colsRemoved) {

        cl_int errNum;
        errNum = findMinSeamVertKernel.setArg(0, energyMatrix);
        errNum |= findMinSeamVertKernel.setArg(1, vertMinEnergy);
        errNum |= findMinSeamVertKernel.setArg(2, vertMinIdx);
        errNum |= findMinSeamVertKernel.setArg(3, cl::__local(256 * sizeof(float)));
        errNum |= findMinSeamVertKernel.setArg(4, cl::__local(256 * sizeof(float)));
        errNum |= findMinSeamVertKernel.setArg(5, width);
        errNum |= findMinSeamVertKernel.setArg(6, height);
        errNum |= findMinSeamVertKernel.setArg(7, pitch);
        errNum |= findMinSeamVertKernel.setArg(8, colsRemoved);

        if (errNum != CL_SUCCESS) {
            std::cerr << "Error setting findMinSeamVert arguments." << std::endl;
            exit(-1);
        }

        // This kernel could be written to use more than one work group, but its probably not worth it.

        cl::NDRange offset = cl::NDRange(0);
        cl::NDRange localWorkSize = cl::NDRange(256);
        cl::NDRange globalWorkSize = cl::NDRange(256);

        errNum = cmdQueue.enqueueNDRangeKernel(findMinSeamVertKernel,
                                               offset,
                                               globalWorkSize,
                                               localWorkSize,
                                               &deps,
                                               &event);
        if (errNum != CL_SUCCESS) {
            std::cerr << "Error enqueuing computeSeams kernel for execution." << std::endl;
            exit(-1);
        }

        /** DEBUG **/
        // int deviceResultIdx[1];
        // float deviceResultEnergy[1];

        // mem::read(ctx, cmdQueue, deviceResultIdx, vertMinIdx);
        // mem::read(ctx, cmdQueue, deviceResultEnergy, vertMinEnergy);

        // std::cout << "deviceResultIdx = " << deviceResultIdx[0] << std::endl;
        // std::cout << "deviceResultEnergy = " << deviceResultEnergy[0] << std::endl;
    }
Exemplo n.º 30
0
    void run(T* buf)
    {
        cl_int err;
        cl::Buffer outbuf(
            m_context,
            CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
            N*N*sizeof(T),
            buf,
            &err);
        checkErr(err, "Buffer::Buffer()");

        err = m_kernel.setArg(0, outbuf);
        checkErr(err, "Kernel::setArg(0)");

        err = m_kernel.setArg(1, N);
        checkErr(err, "Kernel::setArg(1)");

        err = m_kernel.setArg(2, depth);
        checkErr(err, "Kernel::setArg(2)");

        err = m_kernel.setArg(3, escape2);
        checkErr(err, "Kernel::setArg(3)");

        cl::Event event;
        err = m_cmdq.enqueueNDRangeKernel(
            m_kernel,
            cl::NullRange,
            cl::NDRange(N*N),
            cl::NDRange(N, 1),
            NULL,
            &event);
        checkErr(err, "ComamndQueue::enqueueNDRangeKernel()");

        event.wait();
        err = m_cmdq.enqueueReadBuffer(
            outbuf,
            CL_TRUE,
            0,
            N*N*sizeof(T),
            buf);
        checkErr(err, "ComamndQueue::enqueueReadBuffer()");
    }