int GFXCreateProgram( const char *vprogram, const char *fprogram, const char *extra_defines )
{
    ProgramCache::key_type key = cacheKey( vprogram, fprogram, extra_defines );
    ProgramCache::const_iterator it = programCache.find( key );
    if ( it != programCache.end() )
        return it->second;
    int rv = programCache[key] = GFXCreateProgramNoCache( vprogram, fprogram, extra_defines );
    programICache[rv] = key;
    return rv;
}
Exemplo n.º 2
0
    LRESULT CALLBACK WndProc(HWND hwnd, UINT msg, WPARAM wParam, LPARAM lParam)
    {
        switch(msg)
        {
            case WM_CLOSE:
                DestroyWindow(hwnd);
            break;
            case WM_DESTROY:
                PostQuitMessage(0);
            break;
            case WM_KEYDOWN:
		    switch ( wParam )
		    {
                case VK_ESCAPE:
                    PostQuitMessage(0);
                    break;
		        case VK_SPACE:
                {
                    programCache.NotifySourceChange();
                    break;
                }
		    }
		    return 0;
            default:
                return DefWindowProc(hwnd, msg, wParam, lParam);
        }
        return 0;
    }
void sumTest(cl::Buffer queue_data, cl::Buffer queue_metadata,
             cl::Buffer& device_result, int iterations,
             ProgramCache& cache,
             cl::CommandQueue& queue)
{
    cl::Context context = queue.getInfo<CL_QUEUE_CONTEXT>();

    std::vector<std::string> sources;
    sources.push_back("ParallelQueue");
    sources.push_back("ParallelQueueTests");

    cl::Program& program = cache.getProgram(sources);

    cl::Kernel sum_test_kernel(program, "sum_test");

    cl::Device device = queue.getInfo<CL_QUEUE_DEVICE>();

    int warp_size = sum_test_kernel
        .getWorkGroupInfo<CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE>(device);

    std::cout << "warp size: " << warp_size << std::endl;

    int max_group_size = device.getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0];
    int queue_num_threads = 512;

    if(queue_num_threads > max_group_size)
        queue_num_threads = max_group_size;

    cl::LocalSpaceArg local_queue
            = cl::__local(sizeof(int) * queue_num_threads * 2);
    cl::LocalSpaceArg reduction_buffer
            = cl::__local(sizeof(int) * queue_num_threads);
    cl::LocalSpaceArg got_work
            = cl::__local(sizeof(int));
    cl::LocalSpaceArg prefix_sum_input
            = cl::__local(sizeof(int) * queue_num_threads);
    cl::LocalSpaceArg prefix_sum_output
            = cl::__local(sizeof(int) * queue_num_threads);

    sum_test_kernel.setArg(0, queue_data);
    sum_test_kernel.setArg(1, queue_metadata);
    sum_test_kernel.setArg(2, device_result);
    sum_test_kernel.setArg(3, iterations);
    sum_test_kernel.setArg(4, local_queue);
    sum_test_kernel.setArg(5, reduction_buffer);
    sum_test_kernel.setArg(6, got_work);
    sum_test_kernel.setArg(7, prefix_sum_input);
    sum_test_kernel.setArg(8, prefix_sum_output);

    cl::NDRange nullRange;
    cl::NDRange global(queue_num_threads, 1);
    cl::NDRange local(queue_num_threads, 1);

    cl_int status = queue.enqueueNDRangeKernel(sum_test_kernel,
                                               nullRange, global, local);
}
void bigLocalQueuesTest(cl::Buffer queue_data, cl::Buffer queue_metadata,
                        int iterations, ProgramCache& cache,
                        cl::CommandQueue& queue)
{
    cl::Context context = queue.getInfo<CL_QUEUE_CONTEXT>();

    std::vector<std::string> sources;
    sources.push_back("ParallelQueue");
    sources.push_back("ParallelQueueTests");

    cl::Program& program = cache.getProgram(sources);

    cl::Kernel big_local_queues_test(program, "big_local_queues_test");

    cl::Device device = queue.getInfo<CL_QUEUE_DEVICE>();
    int max_group_size = device.getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0];

    int queue_num_threads = 512;

    if(queue_num_threads > max_group_size)
        queue_num_threads = max_group_size;

    cl::LocalSpaceArg local_queue
            = cl::__local(sizeof(int) * queue_num_threads * 5);
    cl::LocalSpaceArg got_work
            = cl::__local(sizeof(int));
    cl::LocalSpaceArg prefix_sum_input
            = cl::__local(sizeof(int) * queue_num_threads);
    cl::LocalSpaceArg prefix_sum_output
            = cl::__local(sizeof(int) * queue_num_threads);

    big_local_queues_test.setArg(0, queue_data);
    big_local_queues_test.setArg(1, queue_metadata);
    big_local_queues_test.setArg(2, iterations);
    big_local_queues_test.setArg(3, local_queue);
    big_local_queues_test.setArg(4, got_work);
    big_local_queues_test.setArg(5, prefix_sum_input);
    big_local_queues_test.setArg(6, prefix_sum_output);

    cl::NDRange nullRange;
    cl::NDRange global(queue_num_threads, 1);
    cl::NDRange local(queue_num_threads, 1);

   // std::cout << "enqueuing big_local_queues_test kernel" << std::endl;

    cl_int status = queue.enqueueNDRangeKernel(big_local_queues_test,
                                               nullRange, global, local);
    //fflush(stdout);
}
void GFXDestroyProgram( int program )
{
    // Find program
    ProgramICache::iterator it = programICache.find( program );
    if (it != programICache.end()) {
        /*
        if (glDeleteProgram_p)
            glDeleteProgram_p( program );
        */
        // FIXME: Real problem here with program leakage, 
        //      but cannot destroy like this, brings all kind of issues
        //      since the caller may not hold the only reference.
        programCache.erase(it->second);
        programICache.erase(it);
    }
}
Exemplo n.º 6
0
        void Initialize()
        {
            U32 programId   = ProgramCache::CreateId( "level3" );
            m_program       = programCache.GetProgram( programId );

            m_geom.m_primitive = PT_TRIANGLES;

            VertexDeclaration& vDecl            = m_geom.m_vertexDecl;
            vDecl.m_attributes[0].m_semantic    = VS_POSITION;
            vDecl.m_attributes[0].m_type        = DT_F32;
            vDecl.m_attributes[0].m_size        = 4;
            vDecl.m_attributes[0].m_normalized  = false;
            vDecl.m_attributes[0].m_offset      = 0;
            vDecl.m_size                        = 16;
            vDecl.m_count                       = 1;

            m_geom.m_vertexCount = 4;
            const F32 vb[4][4] =
            {
                { -1.0f, -1.0f, 0.0f, 1.0f },
                { +1.0f, -1.0f, 1.0f, 1.0f },
                { -1.0f, +1.0f, 0.0f, 0.0f },
                { +1.0f, +1.0f, 1.0f, 0.0f }
            };

            m_geom.m_vertexBuffer = RenderDevice::CreateVertexBuffer( sizeof(vb), vb, BU_STATIC );

            m_geom.m_indexType = DT_U8;

            const U8 ib[2][3] =
            {
                0, 1, 2,
                2, 1, 3,
            };

            m_geom.m_indexBuffer = RenderDevice::CreateIndexBuffer( sizeof(ib), ib, BU_STATIC );
            m_geom.m_vertexArray = RenderDevice::CreateVertexArray( vDecl, m_geom.m_vertexBuffer );
            m_geom.m_indexCount  = 6;
        }
void dequeueTest(cl::Buffer queue_data, cl::Buffer queue_metadata,
                 cl::Buffer& device_result, ProgramCache& cache,
                 cl::CommandQueue& queue)
{
    cl::Context context = queue.getInfo<CL_QUEUE_CONTEXT>();

    std::vector<std::string> sources;
    sources.push_back("ParallelQueue");
    sources.push_back("ParallelQueueTests");

    cl::Program& program = cache.getProgram(sources);

    cl::Kernel dequeue_test_kernel(program, "dequeue_test");

    cl::LocalSpaceArg local_mem = cl::__local(sizeof(int));

    dequeue_test_kernel.setArg(0, queue_data);
    dequeue_test_kernel.setArg(1, queue_metadata);
    dequeue_test_kernel.setArg(2, device_result);
    dequeue_test_kernel.setArg(3, local_mem);

    cl::Device device = queue.getInfo<CL_QUEUE_DEVICE>();
    int max_group_size = device.getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0];

    int queue_num_threads = 512;

    if(queue_num_threads > max_group_size)
        queue_num_threads = max_group_size;

    cl::NDRange nullRange;
    cl::NDRange global(queue_num_threads, 1);
    cl::NDRange local(queue_num_threads, 1);

    cl_int status = queue.enqueueNDRangeKernel(dequeue_test_kernel,
                                               nullRange, global, local);
}
Exemplo n.º 8
0
		//! Return whether program source is cached
		bool contains(const std::string& source)
		{
			return cachedPrograms.find(source) != cachedPrograms.end();
		}
Exemplo n.º 9
0
		//! Destroy the cache, programs will be released automatically
		~SourceCacher()
		{
			cerr << "Destroying source cacher containing " << cachedPrograms.size() << " cached programs" << endl;
		}
Exemplo n.º 10
0
namespace Level3_NS
{
    const U32 FRAME_MAX_COUNT = 60;
    U64 clockTicks = 0;
    U32 frameCount = 0;

    const SizeT frameAllocatorSize = 10 * 1024;

    RenderDevice device3d;
    ProgramCache programCache;
    RenderList renderList;


    class RenderSimple : public RenderGeometry
    {
    public:
        void Draw()
        {
            RenderDevice::BeginGeometry( m_vertexDecl, m_vertexArray, m_indexBuffer );
            RenderDevice::DrawIndexed( m_primitive, m_indexCount, m_indexType );
            RenderDevice::EndGeometry( m_vertexDecl );
        }

        PrimitiveType       m_primitive;
        Handle              m_vertexBuffer;
        VertexDeclaration   m_vertexDecl;
        SizeT               m_vertexCount;
        DataType            m_indexType;
        Handle              m_vertexArray;
        Handle              m_indexBuffer;
        SizeT               m_indexCount;
    };

    class FullScreenQuadRenderer
    {
    public:
        FullScreenQuadRenderer()
        {
        }

        void Initialize()
        {
            U32 programId   = ProgramCache::CreateId( "level3" );
            m_program       = programCache.GetProgram( programId );

            m_geom.m_primitive = PT_TRIANGLES;

            VertexDeclaration& vDecl            = m_geom.m_vertexDecl;
            vDecl.m_attributes[0].m_semantic    = VS_POSITION;
            vDecl.m_attributes[0].m_type        = DT_F32;
            vDecl.m_attributes[0].m_size        = 4;
            vDecl.m_attributes[0].m_normalized  = false;
            vDecl.m_attributes[0].m_offset      = 0;
            vDecl.m_size                        = 16;
            vDecl.m_count                       = 1;

            m_geom.m_vertexCount = 4;
            const F32 vb[4][4] =
            {
                { -1.0f, -1.0f, 0.0f, 1.0f },
                { +1.0f, -1.0f, 1.0f, 1.0f },
                { -1.0f, +1.0f, 0.0f, 0.0f },
                { +1.0f, +1.0f, 1.0f, 0.0f }
            };

            m_geom.m_vertexBuffer = RenderDevice::CreateVertexBuffer( sizeof(vb), vb, BU_STATIC );

            m_geom.m_indexType = DT_U8;

            const U8 ib[2][3] =
            {
                0, 1, 2,
                2, 1, 3,
            };

            m_geom.m_indexBuffer = RenderDevice::CreateIndexBuffer( sizeof(ib), ib, BU_STATIC );
            m_geom.m_vertexArray = RenderDevice::CreateVertexArray( vDecl, m_geom.m_vertexBuffer );
            m_geom.m_indexCount  = 6;
        }

        void Render()
        {
            RenderElement element;

            element.m_program               = m_program;
            element.m_textureCount          = 0;
            element.m_uniformBufferCount    = 0;
            element.m_geometry              = &m_geom;

            renderList.Push( element );
        }

        void Destroy()
        {
            RenderDevice::DestroyVertexArray( m_geom.m_vertexArray );
            RenderDevice::DestroyBuffer( m_geom.m_indexBuffer );
            RenderDevice::DestroyBuffer( m_geom.m_vertexBuffer );
        }

    private:
        ProgramHandle   m_program;
        RenderSimple    m_geom;
    };



    LRESULT CALLBACK WndProc(HWND hwnd, UINT msg, WPARAM wParam, LPARAM lParam)
    {
        switch(msg)
        {
            case WM_CLOSE:
                DestroyWindow(hwnd);
            break;
            case WM_DESTROY:
                PostQuitMessage(0);
            break;
            case WM_KEYDOWN:
		    switch ( wParam )
		    {
                case VK_ESCAPE:
                    PostQuitMessage(0);
                    break;
		        case VK_SPACE:
                {
                    programCache.NotifySourceChange();
                    break;
                }
		    }
		    return 0;
            default:
                return DefWindowProc(hwnd, msg, wParam, lParam);
        }
        return 0;
    }

    Bool MessagePump( MSG * msg )
    {
        // check for messages
        while ( PeekMessage( msg, NULL, 0, 0, PM_REMOVE ) )
        {
            // handle or dispatch messages
            if ( msg->message == WM_QUIT )
            {
                return false;
            }
            else
            {
                TranslateMessage( msg );
                DispatchMessage( msg );
            }
        }

        return true;
    }

    void DisplayFramerate( HWND hwnd )
    {
        ++frameCount;
        if ( frameCount == FRAME_MAX_COUNT )
        {
            U64 currentTicks    = Core::TimeUtils::ClockTime();
            F64 fps             = FRAME_MAX_COUNT * Core::TimeUtils::ClockFrequency() / ( currentTicks - clockTicks );
            clockTicks          = currentTicks;
            frameCount          = 0;

            Char text[ 32 ];
            SetWindowText( hwnd, Core::StringUtils::FormatString( text, 32, "Level3 - [ %0.0f fps ]", fps ) );
        }
    }

}
void watershed(int width, int height,
               cl::Buffer& src,
               cl::Buffer& labeled,
               ProgramCache& cache,
               cl::CommandQueue& queue)
{

#ifdef OPENCL_PROFILE
    watershed_descent_kernel_time = 0;
    watershed_increment_kernel_time = 0;
    watershed_minima_kernel_time = 0;
    watershed_plateau_kernel_time = 0;
    watershed_flood_kernel_time = 0;
#endif

    cl::Context context = queue.getInfo<CL_QUEUE_CONTEXT>();

    std::stringstream params_stream;
    params_stream << "-DBLOCK_SIZE=";
    params_stream << BLOCK_SIZE;

    std::string program_params = params_stream.str();

    cl::Program& program = cache.getProgram("Watershed", program_params);
    
    cl::Kernel descent_kernel(program, "descent_kernel");
    cl::Kernel increment_kernel(program, "increment_kernel");
    cl::Kernel minima_kernel(program, "minima_kernel");
    cl::Kernel plateau_kernel(program, "plateau_kernel");
    cl::Kernel flood_kernel(program, "flood_kernel");

    //setting constant memory with neigbourhood
    cl::Buffer cl_neighbourhood_x = cl::Buffer(context,CL_MEM_READ_ONLY,
                                               sizeof(neighbourhood_x));
    cl::Buffer cl_neighbourhood_y = cl::Buffer(context, CL_MEM_READ_ONLY,
                                               sizeof(neighbourhood_y));

#ifdef OPENCL_PROFILE
    cl::Event first_event;
    queue.enqueueWriteBuffer(cl_neighbourhood_x, CL_TRUE, 0,
                             sizeof(neighbourhood_x),
                             neighbourhood_x, __null, &first_event);
#else
    queue.enqueueWriteBuffer(cl_neighbourhood_x, CL_TRUE, 0,
                             sizeof(neighbourhood_x), neighbourhood_x);
#endif

    queue.enqueueWriteBuffer(cl_neighbourhood_y, CL_TRUE, 0,
                             sizeof(neighbourhood_y), neighbourhood_y);

    //const size_t block_size = 6;
    //cl::LocalSpaceArg local_mem = cl::__local(block_size * block_size * sizeof(float));
    cl::LocalSpaceArg local_mem = cl::__local(BLOCK_SIZE * BLOCK_SIZE * sizeof(float));

    //setting args for descent_kernel
    descent_kernel.setArg(0, src);
    descent_kernel.setArg(1, labeled);
    descent_kernel.setArg(2, cl_neighbourhood_x);
    descent_kernel.setArg(3, cl_neighbourhood_y);
    descent_kernel.setArg(4, local_mem);
    descent_kernel.setArg(5, width);
    descent_kernel.setArg(6, height);

    size_t global_width = (width / (BLOCK_SIZE - 2) + 1) * BLOCK_SIZE;
    size_t global_height = (height / (BLOCK_SIZE - 2) + 1) * BLOCK_SIZE;

#ifdef DEBUG_PRINT
    std::cout << "global width=" << global_width
              << " global height=" << global_height << std::endl;
#endif

    cl::NDRange global(global_width, global_height);
    cl::NDRange local(BLOCK_SIZE, BLOCK_SIZE);

    cl_int status;

#ifdef OPENCL_PROFILE
    {
        VECTOR_CLASS<cl::Event> events_vector(1);
        status = queue.enqueueNDRangeKernel(descent_kernel, cl::NullRange,
                                            global, local, __null,
                                            &events_vector[0]);

        cl::WaitForEvents(events_vector);

        cl::Event& event = events_vector[0];

        cl_ulong start = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
        cl_ulong end = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
        cl_ulong total_time = end - start;

        watershed_descent_kernel_time = total_time;
    }
#else
    status = queue.enqueueNDRangeKernel(descent_kernel, cl::NullRange,
                                        global, local);
#endif

#ifdef DEBUG_PRINT
    std::cout << "kernel execution " << status << std::endl;
#endif

   // queue.flush();
   // queue.enqueueBarrier();

    /* PREPARING INCREMENT KERNEL */

    increment_kernel.setArg(0, labeled);
    increment_kernel.setArg(1, width);
    increment_kernel.setArg(2, height);

#ifdef OPENCL_PROFILE
    {
        VECTOR_CLASS<cl::Event> events_vector(1);
        status = queue.enqueueNDRangeKernel(increment_kernel, cl::NullRange,
                                            global, local, __null,
                                            &events_vector[0]);

        cl::WaitForEvents(events_vector);

        cl::Event& event = events_vector[0];

        cl_ulong start = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
        cl_ulong end = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
        cl_ulong total_time = end - start;

        watershed_increment_kernel_time = total_time;
    }
#else
    status = queue.enqueueNDRangeKernel(increment_kernel, cl::NullRange,
                                        global, local);
#endif

//    queue.enqueueBarrier();


    /* PREPARING MINIMA KERNEL */

    int counter_tmp = 0;
    cl::Buffer counter(context, CL_MEM_READ_WRITE, sizeof(int));
    queue.enqueueWriteBuffer(counter, CL_TRUE, 0, sizeof(int), &counter_tmp);

    queue.enqueueBarrier();

    minima_kernel.setArg(0, counter);
    minima_kernel.setArg(1, labeled);
    minima_kernel.setArg(2, cl_neighbourhood_x);
    minima_kernel.setArg(3, cl_neighbourhood_y);
    minima_kernel.setArg(4, local_mem);
    minima_kernel.setArg(5, width);
    minima_kernel.setArg(6, height);

    int old_val = -1;
    int new_val = -2;
    int c = 0;


    while(old_val != new_val)
    {
        old_val = new_val;

#ifdef OPENCL_PROFILE
        {
            VECTOR_CLASS<cl::Event> events_vector(1);
            status = queue.enqueueNDRangeKernel(minima_kernel, cl::NullRange,
                                                global, local, __null,
                                                &events_vector[0]);

            cl::WaitForEvents(events_vector);
            cl::Event& event = events_vector[0];
            cl_ulong start = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
            cl_ulong end = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
            cl_ulong total_time = end - start;

            watershed_minima_kernel_time += total_time;
        }
#else
        status = queue.enqueueNDRangeKernel(minima_kernel, cl::NullRange,
                                            global, local);
#endif
        queue.enqueueReadBuffer(counter, CL_TRUE, 0, sizeof(int), &new_val);
        c++;
    }

#ifdef DEBUG_PRINT
    std::cout << "step 2: " << c << " iterations" << std::endl;
#endif

    /* PREPARING PLATEAU KERNEL */

    queue.enqueueWriteBuffer(counter, CL_TRUE, 0, sizeof(int), &counter_tmp);

    queue.enqueueBarrier();

    plateau_kernel.setArg(0, counter);
    plateau_kernel.setArg(1, src);
    plateau_kernel.setArg(2, labeled);
    plateau_kernel.setArg(3, cl_neighbourhood_x);
    plateau_kernel.setArg(4, cl_neighbourhood_y);
    plateau_kernel.setArg(5, local_mem);
    plateau_kernel.setArg(6, width);
    plateau_kernel.setArg(7, height);

    old_val = -1;
    new_val = -2;
    c = 0;

#ifdef OPENCL_PROFILE
    watershed_plateau_kernel_time = 0;
#endif

    while(old_val != new_val)
    {
        old_val = new_val;

#ifdef OPENCL_PROFILE
        {
            VECTOR_CLASS<cl::Event> events_vector(1);
            status = queue.enqueueNDRangeKernel(plateau_kernel, cl::NullRange,
                                                global, local, __null,
                                                &events_vector[0]);

            cl::WaitForEvents(events_vector);
            cl::Event& event = events_vector[0];
            cl_ulong start = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
            cl_ulong end = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
            cl_ulong total_time = end - start;

            watershed_plateau_kernel_time += total_time;
        }
#else
        status = queue.enqueueNDRangeKernel(plateau_kernel, cl::NullRange,
                                            global, local);
#endif
        queue.enqueueReadBuffer(counter, CL_TRUE, 0, sizeof(int), &new_val);
        c++;
    }

#ifdef DEBUG_PRINT
    std::cout << "step 3: " << c << " iterations" << std::endl;
#endif

    //preparing flood kernel

    queue.enqueueWriteBuffer(counter, CL_TRUE, 0, sizeof(int), &counter_tmp);
    queue.enqueueBarrier();

    flood_kernel.setArg(0, counter);
    flood_kernel.setArg(1, labeled);
    flood_kernel.setArg(2, width);
    flood_kernel.setArg(3, height);

    old_val = -1;
    new_val = -2;
    c = 0;

    int new_block_size = 16;
    local = cl::NDRange(new_block_size, new_block_size);

    int n_width = ((width - 1) / new_block_size + 2) * new_block_size;
    int n_height = ((height - 1) / new_block_size + 2) * new_block_size;

    global = cl::NDRange(n_width, n_height);

#ifdef DEBUG_PRINT
    std::cout << "flood kernel invocation params:" << std::endl;
    std::cout << "local: " << local[0] << ", " << local[1] << std::endl;
    std::cout << "global: " << global[0] << ", " << global[1] << std::endl;
#endif

#ifdef OPENCL_PROFILE
    cl::Event last_event;
#endif


#ifdef OPENCL_PROFILE
    watershed_flood_kernel_time = 0;
#endif

    while(old_val != new_val)
    {
        old_val = new_val;

#ifdef OPENCL_PROFILE
        {
            VECTOR_CLASS<cl::Event> events_vector(1);
            status = queue.enqueueNDRangeKernel(flood_kernel, cl::NullRange,
                                                global, local, __null,
                                                &events_vector[0]);

            cl::WaitForEvents(events_vector);
            cl::Event& event = events_vector[0];
            cl_ulong start = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
            cl_ulong end = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
            cl_ulong total_time = end - start;

            watershed_flood_kernel_time += total_time;
        }
#else
        status = queue.enqueueNDRangeKernel(flood_kernel, cl::NullRange,
                                            global, local);
#endif

#ifdef OPENCL_PROFILE
        queue.enqueueReadBuffer(counter, CL_TRUE, 0, sizeof(int),
                                &new_val, __null, &last_event);
#else
        queue.enqueueReadBuffer(counter, CL_TRUE, 0, sizeof(int), &new_val);
#endif
        c++;
    }

#ifdef OPENCL_PROFILE
    watershed_descent_kernel_time /= TIME_DIVISOR;
    watershed_increment_kernel_time /= TIME_DIVISOR;
    watershed_minima_kernel_time /= TIME_DIVISOR;
    watershed_plateau_kernel_time /= TIME_DIVISOR;
    watershed_flood_kernel_time /= TIME_DIVISOR;
#endif

#ifdef DEBUG_PRINT
    std::cout << "step 4: " << c << " iterations" << std::endl;
#endif

#ifdef OPENCL_PROFILE
    last_event.wait();

    cl_ulong start = first_event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
    cl_ulong end = last_event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
    cl_ulong total_time = end - start;

    setLastExecutionTime(total_time/TIME_DIVISOR);
#endif
}