/**
 * Enqueues one launch of every "advance paths" micro kernel on oclQueue.
 *
 * All micro kernels use the same 1D launch geometry: a global size of
 * engine->taskCount and a work-group size of advancePathsWorkGroupSize.
 * They are enqueued in the fixed order listed below.
 */
void PathOCLRenderThread::EnqueueAdvancePathsKernel(cl::CommandQueue &oclQueue) {
	PathOCLRenderEngine *engine = (PathOCLRenderEngine *)renderEngine;
	const u_int taskCount = engine->taskCount;

	// Launch geometry shared by every micro kernel
	const cl::NDRange globalRange(taskCount);
	const cl::NDRange localRange(advancePathsWorkGroupSize);

	// Micro kernels version
	oclQueue.enqueueNDRangeKernel(*advancePathsKernel_MK_RT_NEXT_VERTEX,
			cl::NullRange, globalRange, localRange);
	oclQueue.enqueueNDRangeKernel(*advancePathsKernel_MK_HIT_NOTHING,
			cl::NullRange, globalRange, localRange);
	oclQueue.enqueueNDRangeKernel(*advancePathsKernel_MK_HIT_OBJECT,
			cl::NullRange, globalRange, localRange);
	oclQueue.enqueueNDRangeKernel(*advancePathsKernel_MK_RT_DL,
			cl::NullRange, globalRange, localRange);
	oclQueue.enqueueNDRangeKernel(*advancePathsKernel_MK_DL_ILLUMINATE,
			cl::NullRange, globalRange, localRange);
	oclQueue.enqueueNDRangeKernel(*advancePathsKernel_MK_DL_SAMPLE_BSDF,
			cl::NullRange, globalRange, localRange);
	oclQueue.enqueueNDRangeKernel(*advancePathsKernel_MK_GENERATE_NEXT_VERTEX_RAY,
			cl::NullRange, globalRange, localRange);
	oclQueue.enqueueNDRangeKernel(*advancePathsKernel_MK_SPLAT_SAMPLE,
			cl::NullRange, globalRange, localRange);
	oclQueue.enqueueNDRangeKernel(*advancePathsKernel_MK_NEXT_SAMPLE,
			cl::NullRange, globalRange, localRange);
	oclQueue.enqueueNDRangeKernel(*advancePathsKernel_MK_GENERATE_CAMERA_RAY,
			cl::NullRange, globalRange, localRange);
}
Example #2
0
// Launches `kernel` `iters` times on `queue` and returns the measured time
// divided by `iters`.
//
// Two untimed launches are issued first. When useEventTimer is set, each
// launch is finished individually and its time is taken from its event via
// timeInUS(); otherwise a single Timer brackets the whole batch of launches.
float clPeak::run_kernel(cl::CommandQueue &queue, cl::Kernel &kernel, cl::NDRange &globalSize, cl::NDRange &localSize, int iters)
{
    // Untimed warm-up launches
    for(int warm=0; warm<2; warm++)
    {
        queue.enqueueNDRangeKernel(kernel, cl::NullRange, globalSize, localSize);
    }
    queue.finish();

    float elapsed = 0;

    if(!useEventTimer)
    {
        // Host timer around all launches at once
        Timer hostTimer;

        hostTimer.start();
        for(int launch=0; launch<iters; launch++)
        {
            queue.enqueueNDRangeKernel(kernel, cl::NullRange, globalSize, localSize);
        }
        queue.finish();
        elapsed = hostTimer.stopAndTime();
    }
    else
    {
        // Accumulate the per-launch time reported through each event
        for(int launch=0; launch<iters; launch++)
        {
            cl::Event launchEvent;

            queue.enqueueNDRangeKernel(kernel, cl::NullRange, globalSize, localSize, NULL, &launchEvent);
            queue.finish();
            elapsed += timeInUS(launchEvent);
        }
    }

    return (elapsed / iters);
}
Example #3
0
// Measures the average kernel launch latency and reports it through `log`.
//
// The latency of one launch is the difference between the event's
// CL_PROFILING_COMMAND_QUEUED and CL_PROFILING_COMMAND_START timestamps
// (each divided by 1000 before subtracting), averaged over
// devInfo.kernelLatencyIters launches of "global_bandwidth_v1_local_offset".
//
// Returns 0 on success or when the test is disabled (isKernelLatency false),
// and -1 when an OpenCL error is caught.
int clPeak::runKernelLatency(cl::CommandQueue &queue, cl::Program &prog, device_info_t &devInfo)
{
    if(!isKernelLatency)
        return 0;

    cl::Context ctx = queue.getInfo<CL_QUEUE_CONTEXT>();
    cl_uint numItems = (devInfo.maxWGSize) * (devInfo.numCUs) * FETCH_PER_WI;
    cl::NDRange globalSize = (numItems / FETCH_PER_WI);
    cl::NDRange localSize = devInfo.maxWGSize;
    int iters = devInfo.kernelLatencyIters;
    float latency = 0;

    try
    {
        log->print(NEWLINE TAB TAB "Kernel launch latency : ");
        log->xmlOpenTag("kernel_launch_latency");
        log->xmlAppendAttribs("unit", "us");

        cl::Buffer inputBuf = cl::Buffer(ctx, CL_MEM_READ_ONLY, (numItems * sizeof(float)));
        cl::Buffer outputBuf = cl::Buffer(ctx, CL_MEM_WRITE_ONLY, (numItems * sizeof(float)));

        cl::Kernel kernel_v1(prog, "global_bandwidth_v1_local_offset");
        kernel_v1.setArg(0, inputBuf);
        kernel_v1.setArg(1, outputBuf);

        // Dummy calls
        queue.enqueueNDRangeKernel(kernel_v1, cl::NullRange, globalSize, localSize);
        queue.enqueueNDRangeKernel(kernel_v1, cl::NullRange, globalSize, localSize);
        queue.finish();

        for(int i=0; i<iters; i++)
        {
            cl::Event timeEvent;
            queue.enqueueNDRangeKernel(kernel_v1, cl::NullRange, globalSize, localSize, NULL, &timeEvent);
            queue.finish();
            cl_ulong start = timeEvent.getProfilingInfo<CL_PROFILING_COMMAND_QUEUED>() / 1000;
            cl_ulong end = timeEvent.getProfilingInfo<CL_PROFILING_COMMAND_START>() / 1000;
            // Subtract in 64-bit signed arithmetic: narrowing the timestamps to
            // int first can truncate them and overflow the subtraction.
            latency += (float)((cl_long)end - (cl_long)start);
        }
        latency /= iters;

        log->print(latency);    log->print(" us" NEWLINE);
        log->xmlSetContent(latency);
        log->xmlCloseTag();
    }
    catch(const cl::Error &error)
    {
        // err() is an integer code; adding it to the NEWLINE literal would be
        // pointer arithmetic, so format it as text before appending.
        log->print((std::to_string(error.err()) + NEWLINE).c_str());
        log->print(TAB TAB "Tests skipped" NEWLINE);
        return -1;
    }

    return 0;
}
void helper(uint32_t* out, int osize, uint8_t* in, int isize, int w, int h, int choice)
{
	int set_size=8;
    try {
        cl::Buffer bufferIn = cl::Buffer(gContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                isize*sizeof(cl_uchar), in, NULL);
        cl::Buffer bufferOut = cl::Buffer(gContext, CL_MEM_READ_WRITE, osize*sizeof(cl_uchar4));
        cl::Buffer bufferOut2= cl::Buffer(gContext, CL_MEM_READ_WRITE, osize*sizeof(cl_uchar4));
        gNV21Kernel.setArg(2,w);
        gNV21Kernel.setArg(3,h);
        gNV21Kernel.setArg(1,bufferIn);
        gNV21Kernel.setArg(0,bufferOut);
        gQueue.enqueueNDRangeKernel(gNV21Kernel,
                cl::NullRange,
                cl::NDRange( (int)ceil((float)w/16.0f)*16,(int)ceil((float)h/16.0f)*16),
                cl::NDRange(set_size,set_size),
                NULL,
                NULL);
        if (choice==1) {
            gLaplacianK.setArg(2,w);
            gLaplacianK.setArg(3,h);
            gLaplacianK.setArg(1,bufferOut);
            gLaplacianK.setArg(0,bufferOut2);
            gQueue.enqueueNDRangeKernel(gLaplacianK,
                    cl::NullRange,
                    cl::NDRange( (int)ceil((float)w/16.0f)*16,(int)ceil((float)h/16.0f)*16),
                    cl::NDRange(set_size,set_size),
                    NULL,
                    NULL);
        }
        else if (choice>1) {
        	gNegative.setArg(2,w);
        	gNegative.setArg(3,h);
        	gNegative.setArg(1,bufferOut);
        	gNegative.setArg(0,bufferOut2);
        	gQueue.enqueueNDRangeKernel(gNegative,
        	                    cl::NullRange,
        	                    cl::NDRange( (int)ceil((float)w/16.0f)*16,(int)ceil((float)h/16.0f)*16),
        	                    cl::NDRange(set_size,set_size),
        	                    NULL,
        	                    NULL);

        }

        gQueue.enqueueReadBuffer(bufferOut2, CL_TRUE, 0, osize*sizeof(cl_uchar4), out);
    }
    catch (cl::Error e) {
        LOGI("@oclDecoder: %s %d \n",e.what(),e.err());
    }
}
Example #5
0
    void updateParticles(float timeDelta)
    {
        try
        {
            vector<cl::Memory> glBuffers;
            glBuffers.push_back(m_positions);
            glBuffers.push_back(m_colors);
            
            //this will update our system by calculating new velocity and updating the positions of our particles
            //Make sure OpenGL is done using our VBOs
            glFinish();
            
            // map OpenGL buffer object for writing from OpenCL
            // this passes in the vector of VBO buffer objects (position and color)
            m_queue.enqueueAcquireGLObjects(&glBuffers);
            
            m_particleKernel.setArg(5, timeDelta); //pass in the timestep
            
            //execute the kernel
            m_queue.enqueueNDRangeKernel(m_particleKernel, cl::NullRange, cl::NDRange(m_numParticles),
                                         cl::NullRange);
            //Release the VBOs so OpenGL can play with them
            m_queue.enqueueReleaseGLObjects(&glBuffers, NULL);

            m_queue.finish();
        }
        catch(cl::Error &error)
        {
            LOG_ERROR << error.what() << "(" << oclErrorString(error.err()) << ")";
        }
    }
void simulationStep() {
    try {
        // copy
        auto buffer = cl::Buffer(context, CL_MEM_READ_ONLY,
                                 sizeof(unsigned char) * 4 * fieldWidth * fieldHeight,
                                 nullptr, nullptr);
        queue.enqueueWriteBuffer(buffer, CL_TRUE, 0,
                                 sizeof(unsigned char) * 4 * fieldWidth * fieldHeight,
                                 visualizationBufferCPU, NULL, NULL);

        // enque
        stepKernel.setArg(2, buffer);
        cl::NDRange global((size_t) (fieldWidth * fieldHeight));
        queue.enqueueNDRangeKernel(stepKernel, cl::NullRange, global, cl::NullRange);

        // read back
        queue.enqueueReadBuffer(visualizationBufferGPU, CL_TRUE, 0,
                                sizeof(unsigned char) * 4 * fieldWidth * fieldHeight,
                                visualizationBufferCPU, NULL, NULL);

        // finish
        queue.finish();
    } catch (cl::Error err) {
        std::cout << "Error: " << err.what() << "(" << err.err() << ")" << std::endl;
        exit(3);
    }

    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, fieldWidth, fieldHeight, 0, GL_RGBA, GL_UNSIGNED_BYTE,
                 visualizationBufferCPU);
}
Example #7
0
void procOCL_I2I(int texIn, int texOut, int w, int h)
{
    if(!haveOpenCL) return;

    LOGD("procOCL_I2I(%d, %d, %d, %d)", texIn, texOut, w, h);
    cl::ImageGL imgIn (theContext, CL_MEM_READ_ONLY,  GL_TEXTURE_2D, 0, texIn);
    cl::ImageGL imgOut(theContext, CL_MEM_WRITE_ONLY, GL_TEXTURE_2D, 0, texOut);
    std::vector < cl::Memory > images;
    images.push_back(imgIn);
    images.push_back(imgOut);

    int64_t t = getTimeMs();
    theQueue.enqueueAcquireGLObjects(&images);
    theQueue.finish();
    LOGD("enqueueAcquireGLObjects() costs %d ms", getTimeInterval(t));

    t = getTimeMs();
    cl::Kernel Laplacian(theProgI2I, "Laplacian"); //TODO: may be done once
    Laplacian.setArg(0, imgIn);
    Laplacian.setArg(1, imgOut);
    theQueue.finish();
    LOGD("Kernel() costs %d ms", getTimeInterval(t));

    t = getTimeMs();
    theQueue.enqueueNDRangeKernel(Laplacian, cl::NullRange, cl::NDRange(w, h), cl::NullRange);
    theQueue.finish();
    LOGD("enqueueNDRangeKernel() costs %d ms", getTimeInterval(t));

    t = getTimeMs();
    theQueue.enqueueReleaseGLObjects(&images);
    theQueue.finish();
    LOGD("enqueueReleaseGLObjects() costs %d ms", getTimeInterval(t));
}
Example #8
0
// Enqueues `kernel` on `queue` with the given global and work-group sizes,
// using every event currently in `events` as its wait list. The event of the
// new launch is appended to `events` and also returned.
cl::Event runKernel(const cl::CommandQueue& queue, const cl::Kernel& kernel, const cl::NDRange& globalSize, const cl::NDRange& groupSize, std::vector<cl::Event>& events)
{
	cl::Event completion;
	queue.enqueueNDRangeKernel(kernel,
	                           cl::NullRange,
	                           globalSize,
	                           groupSize,
	                           &events,
	                           &completion);
	events.push_back(completion);
	return completion;
}
Example #9
0
// Enqueues the "julia" kernel on `queue`, writing into devOut.
// The program is built from fractal_ocl_kernel on the first call only, using
// the context of the queue passed to that first call. The launch uses 8x8
// work-groups over a range that covers DIMX x DIMY rounded up with divup().
void kernel(cl::Buffer& devOut, cl::CommandQueue& queue)
{
    static std::once_flag   buildOnce;
    static cl::Program      program;
    static cl::Kernel       juliaKernel;

    // One-time build of the program and kernel object
    std::call_once(buildOnce,
        [queue]() {
            program     = cl::Program(queue.getInfo<CL_QUEUE_CONTEXT>(), fractal_ocl_kernel, true);
            juliaKernel = cl::Kernel(program, "julia");
        });

    // Work-group shape and the global range rounded up to whole groups
    static const NDRange workGroup(8, 8);
    NDRange grid(workGroup[0] * divup(DIMX, workGroup[0]),
                 workGroup[1] * divup(DIMY, workGroup[1]));

    juliaKernel.setArg(0, devOut);
    juliaKernel.setArg(1, DIMX);
    juliaKernel.setArg(2, DIMY);

    queue.enqueueNDRangeKernel(juliaKernel, cl::NullRange, grid, workGroup);
}
// Renders one frame: derives the pinhole camera frame from `camera`, uploads
// it to cam_buffer, runs `kernel` over img_width x img_height with the GL
// image buffer acquired for OpenCL, prints the kernel execution time taken
// from the profiling event, and finally draws imgTex to the window.
void PTWeekend::draw()
{
    /*
     * BEGIN - each frame part
     */
    
    /* Camera frame: image plane at unit distance along the view direction */
    
    glm::vec3 origin,lower_left, hor, ver;
    
    float theta = camera.getFov() * M_PI / 180.0f;   // field of view in radians
    float half_height = tan(theta / 2.0f);
    float half_width = camera.getAspectRatio() * half_height;
    
    origin = camera.getEyePoint();
    glm::vec3 u, v, w;
    
    w = -glm::normalize(camera.getViewDirection()); //odd...
    u = glm::normalize(glm::cross(glm::vec3(0,1,0), w));
    v = glm::cross(w, u);
    
    lower_left = origin - half_width * u - half_height * v - w;
    hor = 2.0f * half_width * u;
    ver = 2.0f * half_height * v;
    
    pt_assert(cl_set_pinhole_cam_arg(origin, lower_left, hor, ver, cam_buffer, cmd_queue), "Could not fill camera buffer");
    
    clStatus = cmd_queue.enqueueAcquireGLObjects(&img_buffer, NULL, NULL);
    pt_assert(clStatus, "Could not acquire gl objects");
    
    /* Enqueue kernel for execution */
    
    cl::Event profiling_evt;
    
    clStatus = cmd_queue.enqueueNDRangeKernel(kernel,
                                              cl::NDRange(0,0),
                                              cl::NDRange(img_width, img_height),
                                              cl::NDRange(local_width,local_height),
                                              NULL,
                                              &profiling_evt);
    // Check the enqueue result before touching the event: when the enqueue
    // fails the event is never set and must not be waited on.
    pt_assert(clStatus, "Could not enqueue the kernel");
    profiling_evt.wait();
    
    clStatus = cmd_queue.enqueueReleaseGLObjects(&img_buffer, NULL, NULL);
    pt_assert(clStatus, "Could not release gl objects");
    cmd_queue.finish();
    
    // Profiling timestamps are in nanoseconds; convert the difference to ms
    cl_ulong time_start = profiling_evt.getProfilingInfo<CL_PROFILING_COMMAND_START>();
    cl_ulong time_end = profiling_evt.getProfilingInfo<CL_PROFILING_COMMAND_END>();
    cl_ulong total_time = time_end - time_start;
    std::cout << "Total time: " << total_time * 0.001 * 0.001 << " ms \n";
    
    /*
     * END - each frame part
     */
    
    gl::draw(imgTex, Rectf(0, 0, getWindowWidth(), getWindowHeight()));
}
// Enqueues the "sum_test" kernel (from the ParallelQueue / ParallelQueueTests
// program in `cache`) as a single work-group on `queue`.
//
// The work-group size is 512, capped at the device's first
// CL_DEVICE_MAX_WORK_ITEM_SIZES entry; the local-memory arguments are sized
// from that group size. The kernel's preferred work-group size multiple is
// printed for information. A failing enqueue status is reported on std::cerr.
// The call does not wait for the kernel to finish.
void sumTest(cl::Buffer queue_data, cl::Buffer queue_metadata,
             cl::Buffer& device_result, int iterations,
             ProgramCache& cache,
             cl::CommandQueue& queue)
{
    std::vector<std::string> sources;
    sources.push_back("ParallelQueue");
    sources.push_back("ParallelQueueTests");

    cl::Program& program = cache.getProgram(sources);

    cl::Kernel sum_test_kernel(program, "sum_test");

    cl::Device device = queue.getInfo<CL_QUEUE_DEVICE>();

    int warp_size = sum_test_kernel
        .getWorkGroupInfo<CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE>(device);

    std::cout << "warp size: " << warp_size << std::endl;

    int max_group_size = device.getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0];
    int queue_num_threads = 512;

    if(queue_num_threads > max_group_size)
        queue_num_threads = max_group_size;

    // Local-memory scratch areas, all sized from the work-group size
    cl::LocalSpaceArg local_queue
            = cl::__local(sizeof(int) * queue_num_threads * 2);
    cl::LocalSpaceArg reduction_buffer
            = cl::__local(sizeof(int) * queue_num_threads);
    cl::LocalSpaceArg got_work
            = cl::__local(sizeof(int));
    cl::LocalSpaceArg prefix_sum_input
            = cl::__local(sizeof(int) * queue_num_threads);
    cl::LocalSpaceArg prefix_sum_output
            = cl::__local(sizeof(int) * queue_num_threads);

    sum_test_kernel.setArg(0, queue_data);
    sum_test_kernel.setArg(1, queue_metadata);
    sum_test_kernel.setArg(2, device_result);
    sum_test_kernel.setArg(3, iterations);
    sum_test_kernel.setArg(4, local_queue);
    sum_test_kernel.setArg(5, reduction_buffer);
    sum_test_kernel.setArg(6, got_work);
    sum_test_kernel.setArg(7, prefix_sum_input);
    sum_test_kernel.setArg(8, prefix_sum_output);

    // Global size equals the local size: exactly one work-group
    cl::NDRange nullRange;
    cl::NDRange global(queue_num_threads, 1);
    cl::NDRange local(queue_num_threads, 1);

    const cl_int status = queue.enqueueNDRangeKernel(sum_test_kernel,
                                                     nullRange, global, local);
    // Do not drop the status silently (relevant when the wrapper is built
    // without exceptions).
    if(status != CL_SUCCESS)
        std::cerr << "sum_test: enqueueNDRangeKernel failed (" << status << ")" << std::endl;
}
Example #12
0
    // Enqueues findMinSeamVertKernel as a single work-group of 256 work items.
    //
    // Kernel arguments: energyMatrix, vertMinEnergy, vertMinIdx, two local
    // scratch arrays of one float per work item, then width, height, pitch
    // and colsRemoved. The launch waits on `deps` and its completion is
    // reported through `event`. Any failure while setting arguments or
    // enqueuing prints a message and terminates the process.
    void findMinSeamVert(cl::Context &ctx,
                         cl::CommandQueue &cmdQueue,
                         cl::Event &event,
                         std::vector<cl::Event> &deps,
                         cl::Buffer &energyMatrix,
                         cl::Buffer &vertMinEnergy,
                         cl::Buffer &vertMinIdx,
                         int width,
                         int height,
                         int pitch,
                         int colsRemoved) {

        // Work-group size; the local scratch arrays hold one float per work item.
        const size_t kGroupSize = 256;

        cl_int errNum;
        errNum = findMinSeamVertKernel.setArg(0, energyMatrix);
        errNum |= findMinSeamVertKernel.setArg(1, vertMinEnergy);
        errNum |= findMinSeamVertKernel.setArg(2, vertMinIdx);
        errNum |= findMinSeamVertKernel.setArg(3, cl::__local(kGroupSize * sizeof(float)));
        errNum |= findMinSeamVertKernel.setArg(4, cl::__local(kGroupSize * sizeof(float)));
        errNum |= findMinSeamVertKernel.setArg(5, width);
        errNum |= findMinSeamVertKernel.setArg(6, height);
        errNum |= findMinSeamVertKernel.setArg(7, pitch);
        errNum |= findMinSeamVertKernel.setArg(8, colsRemoved);

        if (errNum != CL_SUCCESS) {
            std::cerr << "Error setting findMinSeamVert arguments." << std::endl;
            exit(-1);
        }

        // This kernel could be written to use more than one work group, but its probably not worth it.

        cl::NDRange offset = cl::NDRange(0);
        cl::NDRange localWorkSize = cl::NDRange(kGroupSize);
        cl::NDRange globalWorkSize = cl::NDRange(kGroupSize);

        errNum = cmdQueue.enqueueNDRangeKernel(findMinSeamVertKernel,
                                               offset,
                                               globalWorkSize,
                                               localWorkSize,
                                               &deps,
                                               &event);
        if (errNum != CL_SUCCESS) {
            // Name the kernel that actually failed
            std::cerr << "Error enqueuing findMinSeamVert kernel for execution." << std::endl;
            exit(-1);
        }

        /** DEBUG **/
        // int deviceResultIdx[1];
        // float deviceResultEnergy[1];

        // mem::read(ctx, cmdQueue, deviceResultIdx, vertMinIdx);
        // mem::read(ctx, cmdQueue, deviceResultEnergy, vertMinEnergy);

        // std::cout << "deviceResultIdx = " << deviceResultIdx[0] << std::endl;
        // std::cout << "deviceResultEnergy = " << deviceResultEnergy[0] << std::endl;
    }
/**
 * Enqueues the reduction kernel for @a elements items of @a inBuffer starting
 * at index @a first, with @a outBuffer / @a outPosition as the output target.
 *
 * The launch uses reduceBlocks work-groups of reduceWorkGroupSize work items
 * and waits on @a events. If @a event is non-NULL it receives the event of
 * the launch.
 *
 * @throws cl::Error with CL_INVALID_VALUE if the input range or output
 *         position lies outside its buffer, or a buffer lacks the required
 *         read/write flags; with CL_INVALID_GLOBAL_WORK_SIZE if @a elements
 *         is zero.
 */
void Reduce::enqueue(
    const cl::CommandQueue &commandQueue,
    const cl::Buffer &inBuffer,
    const cl::Buffer &outBuffer,
    ::size_t first,
    ::size_t elements,
    ::size_t outPosition,
    const VECTOR_CLASS<cl::Event> *events,
    cl::Event *event)
{
    /* Validate parameters */

    // The first test catches unsigned wrap-around of first + elements (which
    // is well-defined for size_t); the second compares against the number of
    // elements the input buffer can hold.
    if (first + elements < first
        || inBuffer.getInfo<CL_MEM_SIZE>() / elementSize < first + elements)
    {
        throw cl::Error(CL_INVALID_VALUE, "clogs::Reduce::enqueue: range out of input buffer bounds");
    }

    if (outBuffer.getInfo<CL_MEM_SIZE>() / elementSize <= outPosition)
    {
        throw cl::Error(CL_INVALID_VALUE, "clogs::Reduce::enqueue: output position out of buffer bounds");
    }

    const bool inputReadable =
        (inBuffer.getInfo<CL_MEM_FLAGS>() & (CL_MEM_READ_WRITE | CL_MEM_READ_ONLY)) != 0;
    if (!inputReadable)
    {
        throw cl::Error(CL_INVALID_VALUE, "clogs::Reduce::enqueue: input buffer is not readable");
    }

    const bool outputWritable =
        (outBuffer.getInfo<CL_MEM_FLAGS>() & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY)) != 0;
    if (!outputWritable)
    {
        throw cl::Error(CL_INVALID_VALUE, "clogs::Reduce::enqueue: output buffer is not writable");
    }

    if (elements == 0)
    {
        throw cl::Error(CL_INVALID_GLOBAL_WORK_SIZE, "clogs::Reduce::enqueue: elements is zero");
    }

    /* Bind arguments */

    // Elements per work-group, after rounding the total up to a multiple of
    // reduceWorkGroupSize * reduceBlocks.
    const ::size_t perBlock = roundUp(elements, reduceWorkGroupSize * reduceBlocks) / reduceBlocks;

    reduceKernel.setArg(1, outBuffer);
    reduceKernel.setArg(2, (cl_uint) outPosition);
    reduceKernel.setArg(3, inBuffer);
    reduceKernel.setArg(4, (cl_uint) first);
    reduceKernel.setArg(5, (cl_uint) elements);
    reduceKernel.setArg(7, (cl_uint) perBlock);

    /* Launch */

    cl::Event launched;
    commandQueue.enqueueNDRangeKernel(
        reduceKernel,
        cl::NullRange,
        cl::NDRange(reduceWorkGroupSize * reduceBlocks),
        cl::NDRange(reduceWorkGroupSize),
        events, &launched);
    doEventCallback(launched);

    if (event != NULL)
    {
        *event = launched;
    }
}
// Enqueues the "zeroMemory" kernel from utils.cl on `buffer` with a global
// range of `size` work items (work-group size chosen by the runtime).
// The launch waits on `events`; its completion is reported through `event`.
void zeroBuffer(::cl::Context &context, ::cl::CommandQueue &commandQueue, ::cl::Buffer &buffer, size_t size, std::vector<::cl::Event> *events, ::cl::Event &event)
{
    ::cl::Kernel zeroKernel=getKernel(context, "zeroMemory", "utils.cl", utils_cl);

    cl_int status=zeroKernel.setArg(0, buffer);

    status=commandQueue.enqueueNDRangeKernel(zeroKernel,
                                             ::cl::NullRange,
                                             ::cl::NDRange(size),
                                             ::cl::NullRange,
                                             events,
                                             &event);
}
Example #15
0
    // Enqueues backtrackKernel with a global size of 256 work items in
    // work-groups of one work item each.
    //
    // Kernel arguments, in order: energyMatrix, vertSeamPath, vertMinIdx,
    // width, height, pitch, colsRemoved. The launch waits on `deps` and its
    // completion is reported through `event`. Any failure prints a message
    // and terminates the process.
    void backtrack(cl::Context &ctx,
                   cl::CommandQueue &cmdQueue,
                   cl::Event &event,
                   std::vector<cl::Event> &deps,
                   cl::Buffer &energyMatrix,
                   cl::Buffer &vertSeamPath,
                   cl::Buffer &vertMinIdx,
                   int width,
                   int height,
                   int pitch,
                   int colsRemoved) {

        // Bind the arguments, OR-ing the status codes so that a single test
        // covers all of them.
        cl_int argStatus = backtrackKernel.setArg(0, energyMatrix);
        argStatus |= backtrackKernel.setArg(1, vertSeamPath);
        argStatus |= backtrackKernel.setArg(2, vertMinIdx);
        argStatus |= backtrackKernel.setArg(3, width);
        argStatus |= backtrackKernel.setArg(4, height);
        argStatus |= backtrackKernel.setArg(5, pitch);
        argStatus |= backtrackKernel.setArg(6, colsRemoved);

        if (argStatus != CL_SUCCESS) {
            std::cerr << "Error setting backtrack kernel arguments." << std::endl;
            exit(-1);
        }

        // Launch geometry
        const cl::NDRange noOffset(0);
        const cl::NDRange groupSize(1);
        const cl::NDRange totalSize(256);

        const cl_int launchStatus = cmdQueue.enqueueNDRangeKernel(backtrackKernel,
                                                                  noOffset,
                                                                  totalSize,
                                                                  groupSize,
                                                                  &deps,
                                                                  &event);

        if (launchStatus != CL_SUCCESS) {
            std::cerr << "Error enqueueing backTrack kernel for execution." << std::endl;
            exit(-1);
        }

        // /** DEBUGGING **/
        // int deviceResult[height];

        // mem::read(ctx, cmdQueue, deviceResult, vertSeamPath, height);
        // for (int i = height - 5; i < height; ++i) {
        //     std::cout << "deviceResult[" << i << "]=\t" << deviceResult[i] << std::endl;
        // }

    }
void MetaBallsApp::updateParticles()
{
    int random = rand();
    float time = 1.0f / 60.0f;

    mClParticleUpdate.setArg( 3, sizeof(float), &time );
    mClParticleUpdate.setArg( 4, sizeof(int32_t), &random );

    // Queue the kernel up for execution across the array
    mClCommandQueue.enqueueNDRangeKernel( mClParticleUpdate,
                                          cl::NullRange,
                                          cl::NDRange( NUM_PARTICLES ) );
}
Example #17
0
// Enqueues three kernels on `queue`, in order:
//   1. "image_gen"   writes into devOut (16x16 work-groups over DIMX x DIMY
//                    rounded up with divup()),
//   2. "zero_buffer" over NBINS work items on histOut,
//   3. "hist_freq"   with devOut and histOut, same range as step 1.
// The program is built from fractal_ocl_kernel on the first call only.
// Every call increments the static tile size and adds 0.01 to `persistance`
// before passing them to "image_gen".
void kernel(cl::Buffer& devOut, cl::Buffer& histOut, cl::CommandQueue& queue)
{
    static std::once_flag   buildOnce;
    static cl::Program      program;
    static cl::Kernel       imageKernel, histKernel, zeroKernel;

    // One-time build of the program and its three kernel objects
    std::call_once(buildOnce,
        [queue]() {
            program     = cl::Program(queue.getInfo<CL_QUEUE_CONTEXT>(), fractal_ocl_kernel, true);
            imageKernel = cl::Kernel(program, "image_gen");
            histKernel  = cl::Kernel(program, "hist_freq");
            zeroKernel  = cl::Kernel(program, "zero_buffer");
        });

    // 2D launch geometry shared by the image and histogram kernels
    static const NDRange workGroup(16, 16);
    NDRange imageRange(workGroup[0] * divup(DIMX, workGroup[0]),
                       workGroup[1] * divup(DIMY, workGroup[1]));

    // Per-call animation state
    static int tileSize = 32;
    tileSize++;
    persistance += 0.01;

    // 1. Generate the image
    imageKernel.setArg(0, devOut);
    imageKernel.setArg(1, DIMX);
    imageKernel.setArg(2, DIMY);
    imageKernel.setArg(3, persistance);
    imageKernel.setArg(4, tileSize);
    queue.enqueueNDRangeKernel(imageKernel, cl::NullRange, imageRange, workGroup);

    // 2. Clear the histogram
    static const NDRange histRange(NBINS);
    zeroKernel.setArg(0, histOut);
    zeroKernel.setArg(1, NBINS);
    queue.enqueueNDRangeKernel(zeroKernel, cl::NullRange, histRange);

    // 3. Accumulate the histogram from the image
    histKernel.setArg(0, devOut);
    histKernel.setArg(1, histOut);
    histKernel.setArg(2, DIMX);
    histKernel.setArg(3, DIMY);
    histKernel.setArg(4, NBINS);
    queue.enqueueNDRangeKernel(histKernel, cl::NullRange, imageRange, workGroup);
}
// Enqueues the "zeroFloatImage" kernel from utils.cl on `image`, with one
// work item per pixel (global range = image width x height, work-group size
// chosen by the runtime). The launch waits on `events`; its completion is
// reported through `event`.
void zeroImage(::cl::Context &context, ::cl::CommandQueue &commandQueue, ::cl::Image2D &image, std::vector<::cl::Event> *events, ::cl::Event &event)
{
    ::cl::Kernel zeroKernel=getKernel(context, "zeroFloatImage", "utils.cl", utils_cl);

    // Query the image dimensions to size the launch
    size_t imageWidth, imageHeight;
    image.getImageInfo(CL_IMAGE_WIDTH, &imageWidth);
    image.getImageInfo(CL_IMAGE_HEIGHT, &imageHeight);

    cl_int status=zeroKernel.setArg(0, image);

    status=commandQueue.enqueueNDRangeKernel(zeroKernel,
                                             ::cl::NullRange,
                                             ::cl::NDRange(imageWidth, imageHeight),
                                             ::cl::NullRange,
                                             events,
                                             &event);
}
// Enqueues the "big_local_queues_test" kernel (from the ParallelQueue /
// ParallelQueueTests program in `cache`) as a single work-group on `queue`.
//
// The work-group size is 512, capped at the device's first
// CL_DEVICE_MAX_WORK_ITEM_SIZES entry; the local-memory arguments are sized
// from that group size. A failing enqueue status is reported on std::cerr.
// The call does not wait for the kernel to finish.
void bigLocalQueuesTest(cl::Buffer queue_data, cl::Buffer queue_metadata,
                        int iterations, ProgramCache& cache,
                        cl::CommandQueue& queue)
{
    std::vector<std::string> sources;
    sources.push_back("ParallelQueue");
    sources.push_back("ParallelQueueTests");

    cl::Program& program = cache.getProgram(sources);

    cl::Kernel big_local_queues_test(program, "big_local_queues_test");

    cl::Device device = queue.getInfo<CL_QUEUE_DEVICE>();
    int max_group_size = device.getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0];

    int queue_num_threads = 512;

    if(queue_num_threads > max_group_size)
        queue_num_threads = max_group_size;

    // Local-memory scratch areas, all sized from the work-group size
    cl::LocalSpaceArg local_queue
            = cl::__local(sizeof(int) * queue_num_threads * 5);
    cl::LocalSpaceArg got_work
            = cl::__local(sizeof(int));
    cl::LocalSpaceArg prefix_sum_input
            = cl::__local(sizeof(int) * queue_num_threads);
    cl::LocalSpaceArg prefix_sum_output
            = cl::__local(sizeof(int) * queue_num_threads);

    big_local_queues_test.setArg(0, queue_data);
    big_local_queues_test.setArg(1, queue_metadata);
    big_local_queues_test.setArg(2, iterations);
    big_local_queues_test.setArg(3, local_queue);
    big_local_queues_test.setArg(4, got_work);
    big_local_queues_test.setArg(5, prefix_sum_input);
    big_local_queues_test.setArg(6, prefix_sum_output);

    // Global size equals the local size: exactly one work-group
    cl::NDRange nullRange;
    cl::NDRange global(queue_num_threads, 1);
    cl::NDRange local(queue_num_threads, 1);

    const cl_int status = queue.enqueueNDRangeKernel(big_local_queues_test,
                                                     nullRange, global, local);
    // Do not drop the status silently (relevant when the wrapper is built
    // without exceptions).
    if(status != CL_SUCCESS)
        std::cerr << "big_local_queues_test: enqueueNDRangeKernel failed (" << status << ")" << std::endl;
}
Example #20
0
// Runs the "computeInterpolation_AOS" kernel from `prog` on `cmd` and blocks
// until it has finished.
//
// Kernel arguments, in order: nValues, nGrids, nPoints, distances, weightSum,
// knownValues, gridValues. The launch uses 16x16 work-groups over a 2D range
// of nGrids x nValues, each rounded up to a multiple of 16.
void computeInterpolation(cl::CommandQueue &cmd, cl::Program &prog, 
	const int nValues, const int nGrids, const int nPoints, 
	cl::Buffer &distances, cl::Buffer &weightSum, cl::Buffer &knownValues, 
	cl::Buffer &gridValues){

	cl::Kernel interpKernel( prog, "computeInterpolation_AOS");

	// Scalars first, then the buffers
	interpKernel.setArg(0, (cl_int) nValues);
	interpKernel.setArg(1, (cl_int) nGrids);
	interpKernel.setArg(2, (cl_int) nPoints);
	interpKernel.setArg(3, distances);
	interpKernel.setArg(4, weightSum);
	interpKernel.setArg(5, knownValues);
	interpKernel.setArg(6, gridValues);

	// Launch geometry: round both dimensions up to whole tiles
	const size_t tile = 16;
	const size_t globalX = ((cl_int)nGrids + tile-1) / tile * tile;
	const size_t globalY = ((cl_int)nValues + tile-1) / tile * tile;
	cl::NDRange local( tile, tile );
	cl::NDRange global( globalX, globalY );

	// Enqueue and wait for completion
	cl::Event done;
	cmd.enqueueNDRangeKernel(interpKernel, cl::NullRange, global, local, NULL, &done);
	done.wait();
}
Example #21
0
/**
 * Generates 64-bit unsigned random numbers in device global memory with the
 * "tinymt_uint64_kernel" kernel, reads them back, passes them to check_data()
 * and prints the kernel time.
 *
 * data_size is rounded up to the next multiple of total_num, so that every
 * work item produces the same number of values.
 *
 *@param tinymt_status device global memories
 *@param total_num total number of work items
 *@param local_num number of local work items
 *@param data_size number of data to generate
 */
static void generate_uint64(Buffer& tinymt_status,
                            int total_num,
                            int local_num,
                            int data_size)
{
#if defined(DEBUG)
    cout << "generate_uint64 start" << endl;
#endif
    int min_size = total_num;
    if (data_size % min_size != 0) {
        data_size = (data_size / min_size + 1) * min_size;
    }
    Kernel uint_kernel(program, "tinymt_uint64_kernel");
    Buffer output_buffer(context,
                         CL_MEM_READ_WRITE,
                         data_size * sizeof(uint64_t));
    uint_kernel.setArg(0, tinymt_status);
    uint_kernel.setArg(1, output_buffer);
    uint_kernel.setArg(2, data_size / total_num);
    NDRange global(total_num);
    NDRange local(local_num);
    Event generate_event;
#if defined(DEBUG)
    cout << "generate_uint64 enque kernel start" << endl;
#endif
    queue.enqueueNDRangeKernel(uint_kernel,
                               NullRange,
                               global,
                               local,
                               NULL,
                               &generate_event);
    // Host copy of the results. A vector releases its storage on every exit
    // path, including an exception from the read-back or from check_data().
    std::vector<uint64_t> output(data_size);
    generate_event.wait();
    queue.enqueueReadBuffer(output_buffer,
                            CL_TRUE,
                            0,
                            data_size * sizeof(uint64_t),
                            output.data());
    check_data(output.data(), data_size, total_num);
#if defined(DEBUG)
    print_uint64(output.data(), data_size, total_num);
#endif
    double time = get_time(generate_event);
    cout << "generate time:" << time * 1000 << "ms" << endl;
#if defined(DEBUG)
    cout << "generate_uint64 end" << endl;
#endif
}
Example #22
0
int main(int argc, char *argv[])
{
	cl_int err = CL_SUCCESS;
	cl::Event evt;

	std::vector<cl::Platform> platforms;
	cl::Platform::get(&platforms);
	if (platforms.size() == 0) {
		return false;
	}
	platform_ = platforms[0];

	cl_context_properties properties[] = 
		{ CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0};
	context_ = cl::Context(CL_DEVICE_TYPE_GPU, properties, NULL, NULL, &err); 
	CHECK_CL_ERROR(err, "cl::Context");

	std::vector<cl::Device> devices = context_.getInfo<CL_CONTEXT_DEVICES>();
	if (devices.size() == 0) {
		return false;
	}
	device_ = devices[0];

	sources_.push_back(std::make_pair(source_str.c_str(), source_str.size()));
	program_ = cl::Program(context_, sources_);
	err = program_.build(devices);
	if (err != CL_SUCCESS) {
		std::string log = program_.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]);
		std::cout << "program.build() ERROR: " << log.c_str() << std::endl;
		return false;
	}

	kernel_ = cl::Kernel(program_, "hello", &err); 
	CHECK_CL_ERROR(err, "cl::Kernel");

	buf_ = cl::Buffer(context_, CL_MEM_READ_ONLY, 1024, NULL, &err);

	queue_ = cl::CommandQueue(context_, device_, 0, &err);
	CHECK_CL_ERROR(err, "cl::CommandQueue");

	kernel_.setArg(0, buf_);

	err = queue_.enqueueNDRangeKernel(kernel_, cl::NullRange, cl::NDRange(10, 10), cl::NullRange, NULL, &evt); 
	evt.wait();
	CHECK_CL_ERROR(err, "queue.enqueueNDRangeKernel()");

	return 0;
}
Example #23
0
/*
	OpenCL version.

	Creates the "computeDistances_AOS" kernel from prog, binds its six
	arguments in the order (DIM, nPoints, knownCoords, nGrids, gridCoords,
	distances), launches it on a 2D range and blocks until it has finished.

	The work-group is 16x16. The global range is nGrids in the first
	dimension and nPoints in the second, each rounded up to the next multiple
	of 16 so that it is divisible by the work-group size.
*/
void computeDistances(cl::CommandQueue & cmd, cl::Program &prog, const int DIM, 
	const int nPoints, cl::Buffer &knownCoords, const int nGrids, 
	cl::Buffer &gridCoords, cl::Buffer &distances){

	cl::Kernel distanceKernel( prog, "computeDistances_AOS");

	// Scalars are passed as cl_int to match the kernel's int parameters.
	distanceKernel.setArg(0, (cl_int) DIM);
	distanceKernel.setArg(1, (cl_int) nPoints);
	distanceKernel.setArg(2, knownCoords);
	distanceKernel.setArg(3, (cl_int) nGrids);
	distanceKernel.setArg(4, gridCoords);
	distanceKernel.setArg(5, distances);

	// Edge length of the square work-group.
	const size_t tile = 16;

	// Round each extent up to a whole number of tiles.
	const size_t gridExtent = ((cl_int)nGrids + tile-1) / tile * tile;
	const size_t pointExtent = ((cl_int)nPoints + tile-1) / tile * tile;

	const cl::NDRange localRange( tile, tile );
	const cl::NDRange globalRange( gridExtent, pointExtent );

	// Launch and wait for completion before returning to the caller.
	cl::Event finished;
	cmd.enqueueNDRangeKernel(distanceKernel, cl::NullRange, globalRange, localRange, 0, &finished);
	finished.wait();
}
Example #24
0
// Repeatedly applies the reduction kernel `ker` until a single element is left.
//
// Each pass allocates an output buffer of max(1, size/4) elements, binds
// (input, output, current size, 4) as kernel arguments 0..3 and enqueues the
// kernel on `q` with one work-item per output element. The output of a pass
// is the input of the next one.
//
// Returns `in` itself when size is already 1, otherwise the buffer produced
// by the last pass. No kernel launch is waited on here.
cl::Buffer performReduction(const cl::Buffer & in,cl::Kernel & ker,cl::CommandQueue & q,int size)
{
	cl::Buffer current = in;
	int remaining = size;

	while (remaining != 1)
	{
		const int reduced = std::max(1,remaining/4);
		cl::Buffer next (CLContextLoader::getContext(),CL_MEM_READ_WRITE,sizeof(real)*reduced);

		ker.setArg(0,current());
		ker.setArg(1,next());
		ker.setArg(2,remaining);
		ker.setArg(3,4);

		const cl::NDRange passRange(reduced);
		q.enqueueNDRangeKernel(ker,cl::NDRange(0),passRange,
													 getBestWorkspaceDim(cl::NDRange(reduced)));

		// Feed this pass' output into the next one.
		current = next;
		remaining = reduced;
	}

	return current;
}
Example #25
0
// Computes the square root of the fully reduced output of the "RedL2Norm"
// kernel applied to `in`.
//
// Steps: run the RedL2Norm kernel with one work-item per element of `in`
// (width*height*depth) writing into a scratch buffer of the same element
// count, collapse that buffer with performReduction using the "RedSumAll"
// kernel, then read back the first element with a blocking read.
real L2Norm(const Buffer3D & in,cl::CommandQueue & q)
{
	cl::Buffer scratch (CLContextLoader::getContext(),CL_MEM_READ_WRITE,sizeof(real)*in.width()*in.height()*in.depth());

	CLContextLoader::getRedL2NormKer().setArg(0,in());
	CLContextLoader::getRedL2NormKer().setArg(1,scratch());

	// One work-item per element of the 3D buffer.
	const cl::NDRange elementRange(in.width()*in.height()*in.depth());
	q.enqueueNDRangeKernel(CLContextLoader::getRedL2NormKer(),
					cl::NDRange(0),
					elementRange,
					getBestWorkspaceDim(cl::NDRange(in.width()*in.height()*in.depth())));

	cl::Buffer total = performReduction(scratch,CLContextLoader::getRedSumAllKer(),q,in.width()*in.height()*in.depth());

	// Blocking read: the host value is valid as soon as the call returns.
	real hostSum;
	q.enqueueReadBuffer(total,true,0,sizeof(real),&hostSum);
	return sqrt(hostSum);
}
Example #26
0
/**
 * initialize tinymt status in device global memory
 * using 1 parameter for 1 generator.
 *
 * Runs "tinymt_init_seed_kernel" with (tinymt_status, seed) as arguments,
 * prints the time reported by get_time() for the launch, reads the states
 * back to the host and passes them to check_status().
 *
 *@param tinymt_status internal state of kernel side tinymt
 *@param total total number of work items (assumed > 0)
 *@param local_item number of local work items
 *@param seed seed for initialization
 */
static void initialize_by_seed(Buffer& tinymt_status,
                               int total,
                               int local_item,
                               uint32_t seed)
{
#if defined(DEBUG)
    cout << "initialize_by_seed start" << endl;
#endif
    Kernel init_kernel(program, "tinymt_init_seed_kernel");
    init_kernel.setArg(0, tinymt_status);
    init_kernel.setArg(1, seed);
    NDRange global(total);
    NDRange local(local_item);
    Event event;
#if defined(DEBUG)
    cout << "global:" << dec << total << endl;
    cout << "group:" << dec << (total / local_item) << endl;
    cout << "local:" << dec << local_item << endl;
#endif
    queue.enqueueNDRangeKernel(init_kernel,
                               NullRange,
                               global,
                               local,
                               NULL,
                               &event);
    // Make sure the kernel has completed before its event is inspected for
    // timing (get_time() is defined elsewhere; presumably it reads profiling
    // data, which is only available for finished commands).
    event.wait();
    double time = get_time(event);
    // Heap storage instead of a variable-length array: VLAs are not standard
    // C++ and a large `total` would overflow the stack.
    std::vector<tinymt32j_t> status(static_cast<size_t>(total));
    queue.enqueueReadBuffer(tinymt_status,
                            CL_TRUE,
                            0,
                            sizeof(tinymt32j_t) * status.size(),
                            status.data());
    cout << "initializing time = " << time * 1000 << "ms" << endl;
#if defined(DEBUG)
    cout << "status[0].s0:" << hex << status[0].s0 << endl;
    cout << "status[0].s1:" << hex << status[0].s1 << endl;
    cout << "status[0].s2:" << hex << status[0].s2 << endl;
    cout << "status[0].s3:" << hex << status[0].s3 << endl;
#endif
    check_status(status.data(), total);
#if defined(DEBUG)
    cout << "initialize_by_seed end" << endl;
#endif
}
Example #27
0
/**
 * initialize tinymt status in device global memory
 * using 1 parameter for all generators.
 *
 * Uploads seed_array to a device buffer, runs "tinymt_init_array_kernel"
 * with (tinymt_status, seed buffer, seed_size) as arguments, prints the time
 * reported by get_time() for the launch, reads the states back to the host
 * and passes them to check_status().
 *
 *@param tinymt_status device global memories
 *@param total total number of work items (assumed > 0)
 *@param local_item number of local work items
 *@param seed_array seeds for initialization
 *@param seed_size size of seed_array
 */
static void initialize_by_array(Buffer& tinymt_status,
                                int total,
                                int local_item,
                                uint64_t seed_array[],
                                int seed_size)
{
#if defined(DEBUG)
    cout << "initialize_by_array start" << endl;
#endif
    Buffer seed_array_buffer(context,
                             CL_MEM_READ_WRITE,
                             seed_size * sizeof(uint64_t));
    // Blocking write: seed_array may be reused by the caller after this call.
    queue.enqueueWriteBuffer(seed_array_buffer,
                             CL_TRUE,
                             0,
                             seed_size * sizeof(uint64_t),
                             seed_array);
    Kernel init_kernel(program, "tinymt_init_array_kernel");
    init_kernel.setArg(0, tinymt_status);
    init_kernel.setArg(1, seed_array_buffer);
    init_kernel.setArg(2, seed_size);
    NDRange global(total);
    NDRange local(local_item);
    Event event;
    queue.enqueueNDRangeKernel(init_kernel,
                               NullRange,
                               global,
                               local,
                               NULL,
                               &event);
    // Make sure the kernel has completed before its event is inspected for
    // timing (get_time() is defined elsewhere; presumably it reads profiling
    // data, which is only available for finished commands).
    event.wait();
    double time = get_time(event);
    // Heap storage instead of a variable-length array: VLAs are not standard
    // C++ and a large `total` would overflow the stack.
    std::vector<tinymt64j_t> status(static_cast<size_t>(total));
    queue.enqueueReadBuffer(tinymt_status,
                            CL_TRUE,
                            0,
                            sizeof(tinymt64j_t) * status.size(),
                            status.data());
    cout << "initializing time = " << time * 1000 << "ms" << endl;
    check_status(status.data(), total);
#if defined(DEBUG)
    cout << "initialize_by_array end" << endl;
#endif
}
Example #28
0
/**
 * Enqueues the "sinf" kernel on `queue`, writing into `devOut`.
 *
 * On the first call only, the program is created (with the build flag set)
 * from sinf_ocl_kernel in the context that `queue` belongs to, and the
 * kernel object is created from it. Both are kept in function-local statics
 * and reused by later calls.
 *
 * Kernel arguments are (devOut, dx, SIZE); the global range is SIZE * 2
 * work-items and the work-group size is left to the implementation.
 */
void kernel(cl::Buffer& devOut, cl::CommandQueue& queue)
{
    static std::once_flag   buildOnce;
    static cl::Program      sinfProgram;
    static cl::Kernel       sinfKernel;

    // One-time compilation step, executed by the first caller only.
    const auto buildSinfKernel = [queue]() {
        sinfProgram = cl::Program(queue.getInfo<CL_QUEUE_CONTEXT>(), sinf_ocl_kernel, true);
        sinfKernel = cl::Kernel(sinfProgram, "sinf");
    };
    std::call_once(buildOnce, buildSinfKernel);

    static const NDRange launchRange(SIZE * 2);

    sinfKernel.setArg(0, devOut);
    sinfKernel.setArg(1, dx);
    sinfKernel.setArg(2, SIZE);
    queue.enqueueNDRangeKernel(sinfKernel, cl::NullRange, launchRange);
}
Example #29
0
/**
 * Enqueues the "offsets" kernel from `program` on `queue`.
 *
 * The kernel receives *cm_buffer as argument 0 and *offsets_buffer as
 * argument 1 and runs over a 1D global range of BINS work-items; the
 * work-group size is left to the OpenCL implementation. The call does not
 * wait for the kernel to finish.
 *
 * `context` is not used by this function. Both buffer pointers are
 * dereferenced unconditionally and must not be null.
 */
void genOffsets(cl::Context& context,
                cl::CommandQueue& queue,
                cl::Program& program,
                cl::Buffer* cm_buffer,
                cl::Buffer* offsets_buffer)
{
    cl::Buffer& cmInput = *cm_buffer;
    cl::Buffer& offsetsOutput = *offsets_buffer;

    cl::Kernel offsetsKernel(program, "offsets");
    offsetsKernel.setArg(0, cmInput);
    offsetsKernel.setArg(1, offsetsOutput);

    // One work-item per bin.
    const cl::NDRange binRange(BINS);
    queue.enqueueNDRangeKernel(offsetsKernel, cl::NullRange, binRange, cl::NullRange);
}
// Enqueues the eight render-sample micro kernels, in order, on oclQueue.
//
// All kernels share the same 1D launch configuration: the work-group size is
// renderSampleWorkGroupSize and the global size is the task count rounded up
// to a multiple of it. The task count is the largest number of pixels any
// rendering phase (preview, normal, and long run when enabled) needs for one
// tile.
void RTBiasPathOCLRenderThread::EnqueueRenderSampleKernel(cl::CommandQueue &oclQueue) {
	RTBiasPathOCLRenderEngine *engine = (RTBiasPathOCLRenderEngine *)renderEngine;

	const u_int tileWidth = engine->tileRepository->tileWidth;
	const u_int tileHeight = engine->tileRepository->tileHeight;
	const u_int tilePixelCount = tileWidth * tileHeight;

	// Pixels to process in the preview and in the normal phase
	const u_int previewTasks = tilePixelCount / (engine->previewResolutionReduction * engine->previewResolutionReduction);
	u_int taskCount = Max(previewTasks, tilePixelCount / (engine->resolutionReduction * engine->resolutionReduction));

	// The long run phase counts only when it is enabled
	if (engine->longRunResolutionReductionStep > 0)
		taskCount = Max(taskCount, tilePixelCount / (engine->longRunResolutionReduction * engine->longRunResolutionReduction));

	// Launch configuration shared by every micro kernel
	const cl::NDRange globalRange(RoundUp<u_int>(taskCount, renderSampleWorkGroupSize));
	const cl::NDRange localRange(renderSampleWorkGroupSize);

	// Micro kernels version
	oclQueue.enqueueNDRangeKernel(*renderSampleKernel_MK_GENERATE_CAMERA_RAY, cl::NullRange, globalRange, localRange);
	oclQueue.enqueueNDRangeKernel(*renderSampleKernel_MK_TRACE_EYE_RAY, cl::NullRange, globalRange, localRange);
	oclQueue.enqueueNDRangeKernel(*renderSampleKernel_MK_ILLUMINATE_EYE_MISS, cl::NullRange, globalRange, localRange);
	oclQueue.enqueueNDRangeKernel(*renderSampleKernel_MK_ILLUMINATE_EYE_HIT, cl::NullRange, globalRange, localRange);
	oclQueue.enqueueNDRangeKernel(*renderSampleKernel_MK_DL_VERTEX_1, cl::NullRange, globalRange, localRange);
	oclQueue.enqueueNDRangeKernel(*renderSampleKernel_MK_BSDF_SAMPLE_DIFFUSE, cl::NullRange, globalRange, localRange);
	oclQueue.enqueueNDRangeKernel(*renderSampleKernel_MK_BSDF_SAMPLE_GLOSSY, cl::NullRange, globalRange, localRange);
	oclQueue.enqueueNDRangeKernel(*renderSampleKernel_MK_BSDF_SAMPLE_SPECULAR, cl::NullRange, globalRange, localRange);
}